n00b001 committed
Commit 240d878 · unverified · 1 Parent(s): 00936a3
Files changed (1)
  1. app.py +26 -28
app.py CHANGED
@@ -27,29 +27,27 @@ def get_quantization_recipe(method, model_architecture):
             f"AWQ quantization is only supported for LlamaForCausalLM and Qwen2_5_VLForConditionalGeneration architectures, got {model_architecture}"
         )
 
-    # Create AWQ mappings for both architectures
-    mappings = [
-        AWQMapping(
-            "re:.*input_layernorm", ["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"]
-        ),
-        AWQMapping("re:.*v_proj", ["re:.*o_proj"]),
-        AWQMapping(
-            "re:.*post_attention_layernorm", ["re:.*gate_proj", "re:.*up_proj"]
-        ),
-        AWQMapping("re:.*up_proj", ["re:.*down_proj"]),
-    ]
-
+    # AWQ is fundamentally incompatible with Qwen2.5-VL models due to conflicts with
+    # the complex 3D rotary positional embedding system used for multimodal processing
     if model_architecture == "Qwen2_5_VLForConditionalGeneration":
-        return [
-            AWQModifier(
-                ignore=["lm_head", "re:visual.*", "re:model.visual.*"],
-                scheme="W4A16_ASYM",
-                targets=["Linear"],
-                mappings=mappings,
-                sequential_targets=["Qwen2_5_VLDecoderLayer"],  # Sequential onloading for Qwen2.5-VL
+        raise ValueError(
+            f"AWQ quantization is not compatible with {model_architecture} architecture "
+            "due to fundamental conflicts with complex 3D rotary positional embeddings. "
+            "This quantization method modifies weights in a way that breaks the multimodal "
+            "positional encoding system. Please use GPTQ, W4A16, W8A16, W8A8_INT8, W8A8_FP8, or FP8 methods instead."
+        )
+    else:  # LlamaForCausalLM and other supported architectures
+        # Create AWQ mappings for Llama models
+        mappings = [
+            AWQMapping(
+                "re:.*input_layernorm", ["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"]
            ),
+            AWQMapping("re:.*v_proj", ["re:.*o_proj"]),
+            AWQMapping(
+                "re:.*post_attention_layernorm", ["re:.*gate_proj", "re:.*up_proj"]
+            ),
+            AWQMapping("re:.*up_proj", ["re:.*down_proj"]),
         ]
         return [
             AWQModifier(
                 ignore=["lm_head"],
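Taken together, this first hunk replaces the Qwen2.5-VL AWQ recipe with a hard error and scopes the AWQ mappings to the Llama branch. Below is a condensed sketch of the patched control flow, not the full app.py function: the AWQModifier/AWQMapping import path is assumed from llm-compressor and may vary by version, and the Llama branch's remaining kwargs are cut off in the hunk, so the scheme/targets values below are assumptions filled in by analogy with the removed Qwen branch.

# Condensed sketch of the patched recipe logic (simplified stand-in for
# get_quantization_recipe). Assumptions: llm-compressor import path;
# scheme/targets in the Llama branch mirror the removed Qwen branch.
from llmcompressor.modifiers.awq import AWQMapping, AWQModifier

def get_awq_recipe(model_architecture):
    if model_architecture == "Qwen2_5_VLForConditionalGeneration":
        # New behavior: fail fast instead of returning a recipe that breaks
        # the model's 3D rotary positional embeddings.
        raise ValueError(
            "AWQ is not compatible with Qwen2.5-VL; "
            "use GPTQ, W4A16, W8A16, W8A8_INT8, W8A8_FP8, or FP8 instead."
        )
    # Llama path: smooth each normalization/projection output into the
    # Linear layers that consume it, per the regex mappings below.
    mappings = [
        AWQMapping("re:.*input_layernorm", ["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"]),
        AWQMapping("re:.*v_proj", ["re:.*o_proj"]),
        AWQMapping("re:.*post_attention_layernorm", ["re:.*gate_proj", "re:.*up_proj"]),
        AWQMapping("re:.*up_proj", ["re:.*down_proj"]),
    ]
    return [
        AWQModifier(
            ignore=["lm_head"],
            scheme="W4A16_ASYM",  # assumption: mirrors the removed Qwen branch
            targets=["Linear"],   # assumption: mirrors the removed Qwen branch
            mappings=mappings,
        )
    ]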
@@ -359,12 +357,12 @@ def compress_and_upload(
             )
         except Exception as e:
             print(f"Could not load multimodal dataset, falling back to text-only: {e}")
-            # Fall back to text-only dataset
+            # Fall back to text-only dataset - load it properly and pass as dataset
+            from datasets import load_dataset
+            fallback_ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:1%]")
             oneshot(
                 model=model,
-                dataset="wikitext",
-                dataset_config_name="wikitext-2-raw-v1",
-                split="train[:1%]",
+                dataset=fallback_ds,
                 recipe=recipe,
                 save_compressed=True,
                 output_dir=output_dir,
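This hunk fixes the text-only fallback: instead of handing oneshot the dataset/dataset_config_name/split kwarg trio, the calibration split is pre-loaded with datasets.load_dataset and passed as a single Dataset object. A minimal sketch of the pattern follows; oneshot is imported from llm-compressor (exact signature may vary by version), and load_multimodal_ds is a hypothetical placeholder for the app's own multimodal loading code. The next hunk applies the same pre-load-and-pass change to the non-multimodal branch.

# Minimal sketch of the fallback pattern; load_multimodal_ds is a
# hypothetical placeholder, and oneshot's signature may vary by version.
from datasets import load_dataset
from llmcompressor import oneshot

def calibrate_with_fallback(model, recipe, output_dir, load_multimodal_ds):
    try:
        calib_ds = load_multimodal_ds()
    except Exception as e:
        print(f"Could not load multimodal dataset, falling back to text-only: {e}")
        # Pre-load 1% of wikitext-2 so oneshot receives one ready Dataset
        # object instead of the dataset/dataset_config_name/split kwarg trio.
        calib_ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:1%]")
    oneshot(
        model=model,
        dataset=calib_ds,
        recipe=recipe,
        save_compressed=True,
        output_dir=output_dir,
    )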
@@ -373,11 +371,11 @@ def compress_and_upload(
             )
     else:
         # For non-multimodal models, use the original approach
+        from datasets import load_dataset
+        ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:1%]")
         oneshot(
             model=model,
-            dataset="wikitext",
-            dataset_config_name="wikitext-2-raw-v1",
-            split="train[:1%]",
+            dataset=ds,
             recipe=recipe,
             save_compressed=True,
             output_dir=output_dir,
@@ -450,7 +448,7 @@ def build_gradio_app():
 
         gr.Markdown("### 2. Choose a Quantization Method")
         quant_method_dropdown = gr.Dropdown(
-            ["W4A16", "W8A16", "W8A8_INT8", "W8A8_FP8", "AWQ", "GPTQ", "FP8", "SmoothQuant", "SparseGPT"],
+            ["W4A16", "W8A16", "W8A8_INT8", "W8A8_FP8", "GPTQ", "FP8", "AWQ", "SmoothQuant", "SparseGPT"],
             label="Quantization Method",
             value="W4A16"
         )
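The final hunk only reorders the dropdown choices, moving AWQ from before GPTQ to after FP8, consistent with the AWQ restriction above. For reference, a self-contained sketch of the reordered widget; the change handler is a placeholder, not the app's real compress_and_upload wiring.

# Self-contained sketch of the reordered dropdown; the handler below is a
# placeholder and does not reflect the app's real event wiring.
import gradio as gr

METHODS = ["W4A16", "W8A16", "W8A8_INT8", "W8A8_FP8", "GPTQ", "FP8", "AWQ", "SmoothQuant", "SparseGPT"]

with gr.Blocks() as demo:
    gr.Markdown("### 2. Choose a Quantization Method")
    quant_method_dropdown = gr.Dropdown(METHODS, label="Quantization Method", value="W4A16")
    chosen = gr.Textbox(label="Selected method")
    # Echo the selection so the sketch is runnable end to end.
    quant_method_dropdown.change(lambda m: m, quant_method_dropdown, chosen)

if __name__ == "__main__":
    demo.launch()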
 