app.py (CHANGED)
@@ -27,29 +27,27 @@ def get_quantization_recipe(method, model_architecture):
             f"AWQ quantization is only supported for LlamaForCausalLM and Qwen2_5_VLForConditionalGeneration architectures, got {model_architecture}"
         )
 
-    #
-
-        AWQMapping(
-            "re:.*input_layernorm", ["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"]
-        ),
-        AWQMapping("re:.*v_proj", ["re:.*o_proj"]),
-        AWQMapping(
-            "re:.*post_attention_layernorm", ["re:.*gate_proj", "re:.*up_proj"]
-        ),
-        AWQMapping("re:.*up_proj", ["re:.*down_proj"]),
-    ]
-
+    # AWQ is fundamentally incompatible with Qwen2.5-VL models due to conflicts with
+    # the complex 3D rotary positional embedding system used for multimodal processing
     if model_architecture == "Qwen2_5_VLForConditionalGeneration":
-        [seven removed lines not rendered in the original diff view]
+        raise ValueError(
+            f"AWQ quantization is not compatible with {model_architecture} architecture "
+            "due to fundamental conflicts with complex 3D rotary positional embeddings. "
+            "This quantization method modifies weights in a way that breaks the multimodal "
+            "positional encoding system. Please use GPTQ, W4A16, W8A16, W8A8_INT8, W8A8_FP8, or FP8 methods instead."
+        )
+    else:  # LlamaForCausalLM and other supported architectures
+        # Create AWQ mappings for Llama models
+        mappings = [
+            AWQMapping(
+                "re:.*input_layernorm", ["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"]
             ),
+            AWQMapping("re:.*v_proj", ["re:.*o_proj"]),
+            AWQMapping(
+                "re:.*post_attention_layernorm", ["re:.*gate_proj", "re:.*up_proj"]
+            ),
+            AWQMapping("re:.*up_proj", ["re:.*down_proj"]),
         ]
-    else: # LlamaForCausalLM
     return [
         AWQModifier(
             ignore=["lm_head"],
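The hunk above swaps the old Qwen-specific AWQ mapping list for an early architecture guard: Qwen2.5-VL requests now fail fast with an actionable error, and only Llama-style models get the layernorm-to-projection mappings. The sketch below shows the resulting control flow in one place. It assumes the AWQModifier/AWQMapping imports from llm-compressor that app.py already uses; the scheme and targets arguments are illustrative assumptions, not values taken from this diff.

from llmcompressor.modifiers.awq import AWQMapping, AWQModifier

def get_awq_recipe(model_architecture: str) -> list:
    # Fail fast for Qwen2.5-VL: AWQ's weight rescaling conflicts with its
    # 3D multimodal rotary positional embeddings (see the comment in the diff).
    if model_architecture == "Qwen2_5_VLForConditionalGeneration":
        raise ValueError(
            f"AWQ quantization is not compatible with {model_architecture}; "
            "use GPTQ, W4A16, W8A16, W8A8_INT8, W8A8_FP8, or FP8 instead."
        )
    # Llama-style mappings: each source layer is balanced against the
    # projections it feeds, mirroring the list in the diff above.
    mappings = [
        AWQMapping("re:.*input_layernorm", ["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"]),
        AWQMapping("re:.*v_proj", ["re:.*o_proj"]),
        AWQMapping("re:.*post_attention_layernorm", ["re:.*gate_proj", "re:.*up_proj"]),
        AWQMapping("re:.*up_proj", ["re:.*down_proj"]),
    ]
    # scheme="W4A16" and targets=["Linear"] are assumed defaults for illustration.
    return [AWQModifier(ignore=["lm_head"], mappings=mappings, scheme="W4A16", targets=["Linear"])]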
@@ -359,12 +357,12 @@ def compress_and_upload(
             )
         except Exception as e:
             print(f"Could not load multimodal dataset, falling back to text-only: {e}")
-            # Fall back to text-only dataset
+            # Fall back to text-only dataset - load it properly and pass as dataset
+            from datasets import load_dataset
+            fallback_ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:1%]")
             oneshot(
                 model=model,
-                dataset=
-                dataset_config_name="wikitext-2-raw-v1",
-                split="train[:1%]",
+                dataset=fallback_ds,
                 recipe=recipe,
                 save_compressed=True,
                 output_dir=output_dir,
@@ -373,11 +371,11 @@ def compress_and_upload(
         )
     else:
         # For non-multimodal models, use the original approach
+        from datasets import load_dataset
+        ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:1%]")
         oneshot(
             model=model,
-            dataset=
-            dataset_config_name="wikitext-2-raw-v1",
-            split="train[:1%]",
+            dataset=ds,
             recipe=recipe,
             save_compressed=True,
             output_dir=output_dir,
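Together, the two hunks above fix the same bug in both calibration paths: the old calls passed a dangling dataset= keyword plus dataset_config_name/split arguments, while the new code loads the wikitext-2 calibration split explicitly and hands the resulting Dataset object to oneshot via dataset=. A standalone check of what the fallback actually loads (only the datasets library is required):

from datasets import load_dataset

# 1% of the raw wikitext-2 training split, exactly as in the hunks above.
ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:1%]")
print(ds)             # Dataset({features: ['text'], num_rows: ...})
print(ds[0]["text"])  # first calibration record (wikitext rows can be empty lines)

Loading the split once and passing the object keeps both call sites explicit about what the calibration data is, instead of relying on oneshot's internal dataset resolution.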
@@ -450,7 +448,7 @@ def build_gradio_app():
 
         gr.Markdown("### 2. Choose a Quantization Method")
         quant_method_dropdown = gr.Dropdown(
-            ["W4A16", "W8A16", "W8A8_INT8", "W8A8_FP8", "
+            ["W4A16", "W8A16", "W8A8_INT8", "W8A8_FP8", "GPTQ", "FP8", "AWQ", "SmoothQuant", "SparseGPT"],
             label="Quantization Method",
             value="W4A16"
         )
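The final hunk widens the method dropdown to expose GPTQ, FP8, AWQ, SmoothQuant, and SparseGPT alongside the original four schemes. A minimal standalone sketch of the widget follows; the surrounding gr.Blocks context is a placeholder for the real build_gradio_app layout.

import gradio as gr

with gr.Blocks() as demo:
    gr.Markdown("### 2. Choose a Quantization Method")
    quant_method_dropdown = gr.Dropdown(
        ["W4A16", "W8A16", "W8A8_INT8", "W8A8_FP8",
         "GPTQ", "FP8", "AWQ", "SmoothQuant", "SparseGPT"],
        label="Quantization Method",
        value="W4A16",  # default stays on the original method
    )

# demo.launch()  # uncomment to preview the dropdown locally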