app.py (CHANGED)
@@ -27,29 +27,27 @@ def get_quantization_recipe(method, model_architecture):
             f"AWQ quantization is only supported for LlamaForCausalLM and Qwen2_5_VLForConditionalGeneration architectures, got {model_architecture}"
         )
 
-    #
-
-        AWQMapping(
-            "re:.*input_layernorm", ["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"]
-        ),
-        AWQMapping("re:.*v_proj", ["re:.*o_proj"]),
-        AWQMapping(
-            "re:.*post_attention_layernorm", ["re:.*gate_proj", "re:.*up_proj"]
-        ),
-        AWQMapping("re:.*up_proj", ["re:.*down_proj"]),
-    ]
-
+    # AWQ is fundamentally incompatible with Qwen2.5-VL models due to conflicts with
+    # the complex 3D rotary positional embedding system used for multimodal processing
     if model_architecture == "Qwen2_5_VLForConditionalGeneration":
-        [seven removed lines not rendered in the original diff view]
+        raise ValueError(
+            f"AWQ quantization is not compatible with {model_architecture} architecture "
+            "due to fundamental conflicts with complex 3D rotary positional embeddings. "
+            "This quantization method modifies weights in a way that breaks the multimodal "
+            "positional encoding system. Please use GPTQ, W4A16, W8A16, W8A8_INT8, W8A8_FP8, or FP8 methods instead."
+        )
+    else:  # LlamaForCausalLM and other supported architectures
+        # Create AWQ mappings for Llama models
+        mappings = [
+            AWQMapping(
+                "re:.*input_layernorm", ["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"]
             ),
+            AWQMapping("re:.*v_proj", ["re:.*o_proj"]),
+            AWQMapping(
+                "re:.*post_attention_layernorm", ["re:.*gate_proj", "re:.*up_proj"]
+            ),
+            AWQMapping("re:.*up_proj", ["re:.*down_proj"]),
         ]
-    else: # LlamaForCausalLM
     return [
         AWQModifier(
             ignore=["lm_head"],
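The hunk above swaps the old Qwen-specific AWQ mapping list for an early architecture guard: Qwen2.5-VL requests now fail fast with an actionable error, and only Llama-style models get the layernorm-to-projection mappings. The sketch below shows the resulting control flow in one place. It assumes the AWQModifier/AWQMapping imports from llm-compressor that app.py already uses; the scheme and targets arguments are illustrative assumptions, not values taken from this diff.

from llmcompressor.modifiers.awq import AWQMapping, AWQModifier

def get_awq_recipe(model_architecture: str) -> list:
    # Fail fast for Qwen2.5-VL: AWQ's weight rescaling conflicts with its
    # 3D multimodal rotary positional embeddings (see the comment in the diff).
    if model_architecture == "Qwen2_5_VLForConditionalGeneration":
        raise ValueError(
            f"AWQ quantization is not compatible with {model_architecture}; "
            "use GPTQ, W4A16, W8A16, W8A8_INT8, W8A8_FP8, or FP8 instead."
        )
    # Llama-style mappings: each source layer is balanced against the
    # projections it feeds, mirroring the list in the diff above.
    mappings = [
        AWQMapping("re:.*input_layernorm", ["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"]),
        AWQMapping("re:.*v_proj", ["re:.*o_proj"]),
        AWQMapping("re:.*post_attention_layernorm", ["re:.*gate_proj", "re:.*up_proj"]),
        AWQMapping("re:.*up_proj", ["re:.*down_proj"]),
    ]
    # scheme="W4A16" and targets=["Linear"] are assumed defaults for illustration.
    return [AWQModifier(ignore=["lm_head"], mappings=mappings, scheme="W4A16", targets=["Linear"])]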
@@ -359,12 +357,12 @@ def compress_and_upload(
             )
         except Exception as e:
             print(f"Could not load multimodal dataset, falling back to text-only: {e}")
-            # Fall back to text-only dataset
+            # Fall back to text-only dataset - load it properly and pass as dataset
+            from datasets import load_dataset
+            fallback_ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:1%]")
             oneshot(
                 model=model,
-                dataset=
-                dataset_config_name="wikitext-2-raw-v1",
-                split="train[:1%]",
+                dataset=fallback_ds,
                 recipe=recipe,
                 save_compressed=True,
                 output_dir=output_dir,
@@ -373,11 +371,11 @@ def compress_and_upload(
         )
     else:
         # For non-multimodal models, use the original approach
+        from datasets import load_dataset
+        ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:1%]")
         oneshot(
             model=model,
-            dataset=
-            dataset_config_name="wikitext-2-raw-v1",
-            split="train[:1%]",
+            dataset=ds,
             recipe=recipe,
             save_compressed=True,
             output_dir=output_dir,
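Together, the two hunks above fix the same bug in both calibration paths: the old calls passed a dangling dataset= keyword plus dataset_config_name/split arguments, while the new code loads the wikitext-2 calibration split explicitly and hands the resulting Dataset object to oneshot via dataset=. A standalone check of what the fallback actually loads (only the datasets library is required):

from datasets import load_dataset

# 1% of the raw wikitext-2 training split, exactly as in the hunks above.
ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:1%]")
print(ds)             # Dataset({features: ['text'], num_rows: ...})
print(ds[0]["text"])  # first calibration record (wikitext rows can be empty lines)

Loading the split once and passing the object keeps both call sites explicit about what the calibration data is, instead of relying on oneshot's internal dataset resolution.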
@@ -450,7 +448,7 @@ def build_gradio_app():
 
         gr.Markdown("### 2. Choose a Quantization Method")
         quant_method_dropdown = gr.Dropdown(
-            ["W4A16", "W8A16", "W8A8_INT8", "W8A8_FP8", "
+            ["W4A16", "W8A16", "W8A8_INT8", "W8A8_FP8", "GPTQ", "FP8", "AWQ", "SmoothQuant", "SparseGPT"],
             label="Quantization Method",
             value="W4A16"
         )
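The final hunk widens the method dropdown to expose GPTQ, FP8, AWQ, SmoothQuant, and SparseGPT alongside the original four schemes. A minimal standalone sketch of the widget follows; the surrounding gr.Blocks context is a placeholder for the real build_gradio_app layout.

import gradio as gr

with gr.Blocks() as demo:
    gr.Markdown("### 2. Choose a Quantization Method")
    quant_method_dropdown = gr.Dropdown(
        ["W4A16", "W8A16", "W8A8_INT8", "W8A8_FP8",
         "GPTQ", "FP8", "AWQ", "SmoothQuant", "SparseGPT"],
        label="Quantization Method",
        value="W4A16",  # default stays on the original method
    )

# demo.launch()  # uncomment to preview the dropdown locally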