save
- quantize_huihui_fara.py +365 -0
- quantize_qwen2_5_vl.py +396 -0
- test_final_solution.py +103 -0
- test_final_verification.py +123 -0
- test_new_quantization_methods.py +64 -0
- test_qwen2_5_vl_architecture.py +71 -0
quantize_huihui_fara.py
ADDED
@@ -0,0 +1,365 @@
#!/usr/bin/env python
"""
Script to quantize the huihui-ai/Huihui-Fara-7B-abliterated model with Qwen2.5-VL architecture support
Uses sequential onloading for memory efficiency.
"""

import base64
from io import BytesIO
import torch
from datasets import load_dataset
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier, QuantizationModifier
from llmcompressor.modifiers.awq import AWQModifier, AWQMapping
from llmcompressor.utils import dispatch_for_generation


def create_qwen2_5_vl_data_collator():
    """Create a data collator for Qwen2.5-VL models that handles multimodal inputs."""
    def data_collator(batch):
        assert len(batch) == 1
        return {key: torch.tensor(value) if isinstance(value, (list, int, float)) else value
                for key, value in batch[0].items()}
    return data_collator


def create_qwen2_5_vl_preprocessing_fn(processor, max_sequence_length: int = 2048):
    """Create a preprocessing function for Qwen2.5-VL datasets."""
    def preprocess_and_tokenize(example):
        # Handle different image formats
        if 'image' in example:
            # Process image
            if hasattr(example['image'], 'save'):
                # PIL Image object
                buffered = BytesIO()
                example["image"].save(buffered, format="PNG")
                encoded_image = base64.b64encode(buffered.getvalue())
                encoded_image_text = encoded_image.decode("utf-8")
                base64_qwen = f"data:image;base64,{encoded_image_text}"
            else:
                # Already a string or other format
                base64_qwen = str(example["image"])
        else:
            # If there's no image field, try 'img' or similar
            img_key = None
            for key in example.keys():
                if 'image' in key.lower() or 'img' in key.lower():
                    img_key = key
                    break
            if img_key:
                if hasattr(example[img_key], 'save'):
                    buffered = BytesIO()
                    example[img_key].save(buffered, format="PNG")
                    encoded_image = base64.b64encode(buffered.getvalue())
                    encoded_image_text = encoded_image.decode("utf-8")
                    base64_qwen = f"data:image;base64,{encoded_image_text}"
                else:
                    base64_qwen = str(example[img_key])
            else:
                # If no image, create a simple text-only example
                messages = [
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": example.get('text', example.get('content', 'What can you tell me about this?'))},
                        ],
                    }
                ]
                text = processor.apply_chat_template(
                    messages, tokenize=False, add_generation_prompt=True
                )

                return processor(
                    text=[text],
                    padding=False,
                    max_length=max_sequence_length,
                    truncation=True,
                )

        # Create message with image
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": base64_qwen},
                    {"type": "text", "text": "What does the image show?"},
                ],
            }
        ]
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)

        # tokenize
        return processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=False,
            max_length=max_sequence_length,
            truncation=True,
        )

    return preprocess_and_tokenize


def get_qwen2_5_vl_quantization_recipe(method: str, scheme: str = "W4A16"):
    """
    Creates the appropriate quantization recipe for Qwen2.5-VL models.

    Args:
        method: Quantization method ("GPTQ", "AWQ", or "FP8")
        scheme: Quantization scheme (e.g., "W4A16", "W8A8", "FP8")

    Returns:
        List of modifiers for the quantization recipe
    """
    if method == "GPTQ":
        return [
            GPTQModifier(
                targets="Linear",
                scheme=scheme,
                ignore=["lm_head", "re:visual.*", "re:model.visual.*"],
                sequential_targets=["Qwen2_5_VLDecoderLayer"],  # This enables sequential onloading
            ),
        ]
    elif method == "AWQ":
        # Create AWQ mappings for Qwen2.5-VL architecture
        mappings = [
            AWQMapping(
                "re:.*input_layernorm", ["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"]
            ),
            AWQMapping("re:.*v_proj", ["re:.*o_proj"]),
            AWQMapping(
                "re:.*post_attention_layernorm", ["re:.*gate_proj", "re:.*up_proj"]
            ),
            AWQMapping("re:.*up_proj", ["re:.*down_proj"]),
        ]
        return [
            AWQModifier(
                ignore=["lm_head", "re:visual.*", "re:model.visual.*"],
                scheme="W4A16_ASYM" if scheme == "W4A16" else scheme,
                targets=["Linear"],
                mappings=mappings,
                sequential_targets=["Qwen2_5_VLDecoderLayer"],  # Sequential onloading for memory efficiency
            ),
        ]
    elif method == "FP8":
        return [
            QuantizationModifier(
                scheme="FP8",
                targets="Linear",
                ignore=["lm_head", "re:visual.*", "re:model.visual.*"]
            )
        ]
    else:
        raise ValueError(f"Unsupported quantization method: {method}")


def quantize_huihui_fara_model(
    model_id: str = "huihui-ai/Huihui-Fara-7B-abliterated",
    quantization_method: str = "GPTQ",
    output_dir: str = None,
    dataset_id: str = "wikitext",
    dataset_config: str = "wikitext-2-raw-v1",
    dataset_split: str = "train[:1%]",
    num_calibration_samples: int = 64,
    max_sequence_length: int = 512,
    scheme: str = "W4A16",
    trust_remote_code: bool = True,
):
    """
    Quantizes the huihui-ai/Huihui-Fara-7B-abliterated model with proper Qwen2.5-VL architecture support.

    Args:
        model_id: Hugging Face model ID to quantize
        quantization_method: Method to use ("GPTQ", "AWQ", or "FP8")
        output_dir: Directory to save the quantized model
        dataset_id: Dataset ID for calibration
        dataset_config: Dataset config for calibration
        dataset_split: Dataset split for calibration
        num_calibration_samples: Number of samples to use for calibration
        max_sequence_length: Maximum sequence length for processing
        scheme: Quantization scheme (e.g., "W4A16", "W8A8")
        trust_remote_code: Whether to trust remote code in model loading

    Returns:
        Quantized model
    """
    print(f"Loading model: {model_id}")
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=torch.float16,  # Use float16 to save memory
        device_map="auto",  # Auto device mapping for memory efficiency
        trust_remote_code=trust_remote_code
    )

    print(f"Loading processor for: {model_id}")
    processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=trust_remote_code)

    # If output directory not specified, create one based on model and method
    if not output_dir:
        model_name = model_id.rstrip("/").split("/")[-1]
        output_dir = f"{model_name}-{scheme.replace(':', '-')}-{quantization_method}"

    print(f"Output directory: {output_dir}")

    # Load dataset and preprocess
    print(f"Loading dataset: {dataset_id}")
    try:
        # Try to load a multimodal dataset first
        ds = load_dataset("lmms-lab/flickr30k", split="test[:64]")
        print("Using multimodal dataset for calibration")

        preprocess_fn = create_qwen2_5_vl_preprocessing_fn(processor, max_sequence_length)
        ds = ds.map(preprocess_fn, remove_columns=ds.column_names)
    except Exception as e:
        print(f"Failed to load multimodal dataset: {e}, falling back to text-only dataset")
        # If multimodal dataset fails, use text-only
        ds = load_dataset(dataset_id, dataset_config, split=dataset_split)
        ds = ds.shuffle(seed=42)

        # Text-only preprocessing
        def text_only_preprocess(example):
            text = example.get('text', example.get('content', str(example)))
            if not isinstance(text, str):
                text = str(text)
            # Limit text length to avoid exceeding max sequence length
            text = text[:500] + "..." if len(text) > 500 else text
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": text},
                    ],
                }
            ]
            prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            return processor(text=[prompt], padding=False, max_length=max_sequence_length, truncation=True)

        ds = ds.map(text_only_preprocess, remove_columns=ds.column_names)

    # Define data collator
    data_collator = create_qwen2_5_vl_data_collator()

    # Create recipe
    recipe = get_qwen2_5_vl_quantization_recipe(quantization_method, scheme)

    print(f"Starting quantization with method: {quantization_method}")
    print(f"Using recipe: {recipe}")
    print(f"Using sequential targets: {[mod.sequential_targets if hasattr(mod, 'sequential_targets') else 'N/A' for mod in recipe]}")

    # Perform oneshot quantization with sequential onloading for memory efficiency
    oneshot(
        model=model,
        tokenizer=processor,  # Use processor as tokenizer for Qwen2.5-VL
        dataset=ds,
        recipe=recipe,
        max_seq_length=max_sequence_length,
        num_calibration_samples=num_calibration_samples,
        trust_remote_code_model=trust_remote_code,
        data_collator=data_collator,
        save_compressed=True,
        output_dir=output_dir,
    )

    print(f"Quantization completed! Model saved to: {output_dir}")

    # Save the processor as well
    processor.save_pretrained(output_dir)

    return model


def test_quantized_model(model, processor, max_sequence_length: int = 2048):
    """
    Tests the quantized model with a sample generation.
    """
    print("========== SAMPLE GENERATION ==============")
    try:
        dispatch_for_generation(model)
        # Simple text-only test first
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Hello, how are you today?"},
                ],
            }
        ]
        prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
        inputs = processor(
            text=[prompt],
            padding=False,
            max_length=max_sequence_length,
            truncation=True,
            return_tensors="pt",
        ).to(model.device)

        output = model.generate(**inputs, max_new_tokens=50)
        result = processor.decode(output[0], skip_special_tokens=True)
        print(result)
        print("==========================================")
        return result
    except Exception as e:
        print(f"Test generation failed: {e}")
        import traceback
        traceback.print_exc()
        return None


def main():
    """
    Main function to quantize the Huihui-Fara model.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Quantize huihui-ai/Huihui-Fara-7B-abliterated model")
    parser.add_argument("--model_id", type=str, default="huihui-ai/Huihui-Fara-7B-abliterated",
                        help="Model ID to quantize")
    parser.add_argument("--method", type=str, choices=["GPTQ", "AWQ", "FP8"],
                        default="GPTQ", help="Quantization method to use")
    parser.add_argument("--output_dir", type=str, default=None,
                        help="Output directory for quantized model")
    parser.add_argument("--dataset_id", type=str, default="wikitext",
                        help="Dataset for calibration (default: wikitext)")
    parser.add_argument("--scheme", type=str, default="W4A16",
                        help="Quantization scheme (e.g., W4A16, W8A8)")
    parser.add_argument("--num_samples", type=int, default=64,
                        help="Number of calibration samples")

    args = parser.parse_args()

    print(f"Starting quantization of {args.model_id} using {args.method}")
    print("Note: This may take a while and will use sequential onloading for memory efficiency...")

    try:
        # Quantize the model
        quantized_model = quantize_huihui_fara_model(
            model_id=args.model_id,
            quantization_method=args.method,
            output_dir=args.output_dir,
            dataset_id=args.dataset_id,
            num_calibration_samples=args.num_samples,
            scheme=args.scheme
        )

        # Test the model
        processor = AutoProcessor.from_pretrained(args.model_id, trust_remote_code=True)
        test_quantized_model(quantized_model, processor)

        print(f"✅ Successfully quantized {args.model_id} with {args.method}")
        print(f"Model saved to: {args.output_dir or args.model_id.split('/')[-1] + f'-{args.scheme}-{args.method}'}")

    except Exception as e:
        print(f"❌ Quantization failed: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()
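Besides the CLI entry point above, the same function can be driven programmatically. Below is a minimal sketch, assuming the dependencies used by the script (llmcompressor, transformers, datasets, qwen_vl_utils) are installed and sufficient memory is available; the argument values are illustrative, not prescribed by this commit.

# Minimal programmatic sketch (illustrative values; same assumptions as the script above).
from quantize_huihui_fara import quantize_huihui_fara_model, test_quantized_model
from transformers import AutoProcessor

model_id = "huihui-ai/Huihui-Fara-7B-abliterated"

# Quantize with GPTQ at W4A16; the recipe skips the visual tower and lm_head.
quantized = quantize_huihui_fara_model(
    model_id=model_id,
    quantization_method="GPTQ",
    scheme="W4A16",
    num_calibration_samples=64,
)

# Re-load the processor and run the built-in smoke test.
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
test_quantized_model(quantized, processor)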
quantize_qwen2_5_vl.py
ADDED
@@ -0,0 +1,396 @@
#!/usr/bin/env python
"""
Specialized script for quantizing Qwen2.5-VL models with sequential onloading
Handles quantization of Qwen2_5_VLForConditionalGeneration models properly
"""

import base64
from io import BytesIO
from typing import Optional, Union, Dict, Any
import torch
from datasets import load_dataset
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier, QuantizationModifier
from llmcompressor.modifiers.awq import AWQModifier, AWQMapping
from llmcompressor.utils import dispatch_for_generation


def create_qwen2_5_vl_data_collator():
    """Create a data collator for Qwen2.5-VL models that handles multimodal inputs."""
    def data_collator(batch):
        assert len(batch) == 1
        return {key: torch.tensor(value) if isinstance(value, (list, int, float)) else value
                for key, value in batch[0].items()}
    return data_collator


def create_qwen2_5_vl_preprocessing_fn(processor, max_sequence_length: int = 2048):
    """Create a preprocessing function for Qwen2.5-VL datasets."""
    def preprocess_and_tokenize(example):
        # Handle different image formats
        if 'image' in example:
            # Process image
            if hasattr(example['image'], 'save'):
                # PIL Image object
                buffered = BytesIO()
                example["image"].save(buffered, format="PNG")
                encoded_image = base64.b64encode(buffered.getvalue())
                encoded_image_text = encoded_image.decode("utf-8")
                base64_qwen = f"data:image;base64,{encoded_image_text}"
            else:
                # Already a string or other format
                base64_qwen = str(example["image"])
        else:
            # If there's no image field, try 'img' or similar
            img_key = None
            for key in example.keys():
                if 'image' in key.lower() or 'img' in key.lower():
                    img_key = key
                    break
            if img_key:
                if hasattr(example[img_key], 'save'):
                    buffered = BytesIO()
                    example[img_key].save(buffered, format="PNG")
                    encoded_image = base64.b64encode(buffered.getvalue())
                    encoded_image_text = encoded_image.decode("utf-8")
                    base64_qwen = f"data:image;base64,{encoded_image_text}"
                else:
                    base64_qwen = str(example[img_key])
            else:
                # If no image, create a simple text-only example
                messages = [
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": example.get('text', example.get('content', 'What can you tell me about this?'))},
                        ],
                    }
                ]
                text = processor.apply_chat_template(
                    messages, tokenize=False, add_generation_prompt=True
                )

                return processor(
                    text=[text],
                    padding=False,
                    max_length=max_sequence_length,
                    truncation=True,
                )

        # Create message with image
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": base64_qwen},
                    {"type": "text", "text": "What does the image show?"},
                ],
            }
        ]
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)

        # tokenize
        return processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=False,
            max_length=max_sequence_length,
            truncation=True,
        )

    return preprocess_and_tokenize


def get_qwen2_5_vl_quantization_recipe(method: str, scheme: str = "W4A16"):
    """
    Creates the appropriate quantization recipe for Qwen2.5-VL models.

    Args:
        method: Quantization method ("GPTQ", "AWQ", or "FP8")
        scheme: Quantization scheme (e.g., "W4A16", "W8A8", "FP8")

    Returns:
        List of modifiers for the quantization recipe
    """
    if method == "GPTQ":
        return [
            GPTQModifier(
                targets="Linear",
                scheme=scheme,
                ignore=["lm_head", "re:visual.*", "re:model.visual.*"],
                sequential_targets=["Qwen2_5_VLDecoderLayer"],  # This is key for the architecture
            ),
        ]
    elif method == "AWQ":
        # Create AWQ mappings for Qwen2.5-VL architecture
        mappings = [
            AWQMapping(
                "re:.*input_layernorm", ["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"]
            ),
            AWQMapping("re:.*v_proj", ["re:.*o_proj"]),
            AWQMapping(
                "re:.*post_attention_layernorm", ["re:.*gate_proj", "re:.*up_proj"]
            ),
            AWQMapping("re:.*up_proj", ["re:.*down_proj"]),
        ]
        return [
            AWQModifier(
                ignore=["lm_head", "re:visual.*", "re:model.visual.*"],
                scheme="W4A16_ASYM" if scheme == "W4A16" else scheme,
                targets=["Linear"],
                mappings=mappings,
            ),
        ]
    elif method == "FP8":
        return [
            QuantizationModifier(
                scheme="FP8",
                targets="Linear",
                ignore=["lm_head", "re:visual.*", "re:model.visual.*"]
            )
        ]
    else:
        raise ValueError(f"Unsupported quantization method: {method}")


def quantize_qwen2_5_vl_model(
    model_id: str,
    quantization_method: str,
    output_dir: Optional[str] = None,
    dataset_id: str = "lmms-lab/flickr30k",
    dataset_split: str = "test[:512]",
    num_calibration_samples: int = 512,
    max_sequence_length: int = 2048,
    scheme: str = "W4A16",
    trust_remote_code: bool = True,
):
    """
    Quantizes a Qwen2.5-VL model with proper architecture handling and sequential onloading.

    Args:
        model_id: Hugging Face model ID to quantize
        quantization_method: Method to use ("GPTQ", "AWQ", or "FP8")
        output_dir: Directory to save the quantized model
        dataset_id: Dataset ID for calibration
        dataset_split: Dataset split for calibration
        num_calibration_samples: Number of samples to use for calibration
        max_sequence_length: Maximum sequence length for processing
        scheme: Quantization scheme (e.g., "W4A16", "W8A8")
        trust_remote_code: Whether to trust remote code in model loading

    Returns:
        Quantized model
    """
    print(f"Loading model: {model_id}")
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype="auto",
        device_map=None,  # Let the system decide device mapping
        trust_remote_code=trust_remote_code
    )

    print(f"Loading processor for: {model_id}")
    processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=trust_remote_code)

    # If output directory not specified, create one based on model and method
    if not output_dir:
        model_name = model_id.rstrip("/").split("/")[-1]
        output_dir = f"{model_name}-{scheme.replace(':', '-')}-{quantization_method}"

    print(f"Output directory: {output_dir}")

    # Load dataset and preprocess
    print(f"Loading dataset: {dataset_id}")
    try:
        ds = load_dataset(dataset_id, split=dataset_split)
    except Exception as e:
        print(f"Failed to load {dataset_id}, trying alternative text-only dataset: {e}")
        # If the image dataset fails, try a text-only dataset
        ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:512]")
        # We'll need to adjust preprocessing for text-only data

    ds = ds.shuffle(seed=42)

    # Apply preprocessing
    preprocess_fn = create_qwen2_5_vl_preprocessing_fn(processor, max_sequence_length)
    try:
        ds = ds.map(preprocess_fn, remove_columns=ds.column_names if hasattr(ds, 'column_names') else [])
    except Exception as e:
        print(f"Preprocessing failed: {e}")
        print("Trying simpler preprocessing with text-only data...")
        # Fallback: use text-only preprocessing
        def text_only_preprocess(example):
            text = example.get('text', example.get('content', str(example)))
            if not isinstance(text, str):
                text = str(text)
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": text[:500] + "..." if len(text) > 500 else text},  # Limit length
                    ],
                }
            ]
            prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            return processor(text=[prompt], padding=False, max_length=max_sequence_length, truncation=True)

        ds = ds.map(text_only_preprocess, remove_columns=ds.column_names if hasattr(ds, 'column_names') else [])

    # Define data collator
    data_collator = create_qwen2_5_vl_data_collator()

    # Create recipe
    recipe = get_qwen2_5_vl_quantization_recipe(quantization_method, scheme)

    print(f"Starting quantization with method: {quantization_method}")
    print(f"Using recipe: {recipe}")

    # Perform oneshot quantization with sequential targets and proper handling
    oneshot(
        model=model,
        tokenizer=processor,  # Use processor as tokenizer for Qwen2.5-VL
        dataset=ds,
        recipe=recipe,
        max_seq_length=max_sequence_length,
        num_calibration_samples=num_calibration_samples,
        trust_remote_code_model=trust_remote_code,
        data_collator=data_collator,
        # Use sequential onloading for memory efficiency
        sequential_targets=["Qwen2_5_VLDecoderLayer"],
        save_compressed=True,
        output_dir=output_dir,
    )

    print(f"Quantization completed! Model saved to: {output_dir}")

    # Save the processor as well
    processor.save_pretrained(output_dir)

    return model


def test_quantized_model(model, processor, max_sequence_length: int = 2048):
    """
    Tests the quantized model with a sample generation.
    """
    print("========== SAMPLE GENERATION ==============")
    try:
        dispatch_for_generation(model)
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "image": "http://images.cocodataset.org/train2017/000000231895.jpg",
                    },
                    {"type": "text", "text": "Please describe the animal in this image\n"},
                ],
            }
        ]
        prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[prompt],
            images=image_inputs,
            videos=video_inputs,
            padding=False,
            max_length=max_sequence_length,
            truncation=True,
            return_tensors="pt",
        ).to(model.device)

        output = model.generate(**inputs, max_new_tokens=100)
        result = processor.decode(output[0], skip_special_tokens=True)
        print(result)
        print("==========================================")
        return result
    except Exception as e:
        print(f"Test generation failed: {e}")
        print("Trying text-only generation...")
        # Try with text-only
        try:
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "Hello, how are you today?"},
                    ],
                }
            ]
            prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
            inputs = processor(
                text=[prompt],
                padding=False,
                max_length=max_sequence_length,
                truncation=True,
                return_tensors="pt",
            ).to(model.device)

            output = model.generate(**inputs, max_new_tokens=50)
            result = processor.decode(output[0], skip_special_tokens=True)
            print(result)
            print("==========================================")
            return result
        except Exception as e2:
            print(f"Text-only generation also failed: {e2}")
            return None


def main():
    """
    Main function to demonstrate quantization of Qwen2.5-VL models.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Quantize Qwen2.5-VL models")
    parser.add_argument("--model_id", type=str, required=True,
                        help="Model ID to quantize (e.g., 'huihui-ai/Huihui-Fara-7B-abliterated')")
    parser.add_argument("--method", type=str, choices=["GPTQ", "AWQ", "FP8"],
                        default="GPTQ", help="Quantization method to use")
    parser.add_argument("--output_dir", type=str, default=None,
                        help="Output directory for quantized model")
    parser.add_argument("--dataset_id", type=str, default="lmms-lab/flickr30k",
                        help="Dataset for calibration (default: lmms-lab/flickr30k)")
    parser.add_argument("--scheme", type=str, default="W4A16",
                        help="Quantization scheme (e.g., W4A16, W8A8)")
    parser.add_argument("--num_samples", type=int, default=128,
                        help="Number of calibration samples")

    args = parser.parse_args()

    print(f"Starting quantization of {args.model_id} using {args.method}")

    try:
        # Quantize the model
        quantized_model = quantize_qwen2_5_vl_model(
            model_id=args.model_id,
            quantization_method=args.method,
            output_dir=args.output_dir,
            dataset_id=args.dataset_id,
            num_calibration_samples=args.num_samples,
            scheme=args.scheme
        )

        # Test the model
        # Load the processor again to test
        processor = AutoProcessor.from_pretrained(args.model_id, trust_remote_code=True)
        test_quantized_model(quantized_model, processor)

        print(f"Successfully quantized {args.model_id} with {args.method}")

    except Exception as e:
        print(f"Quantization failed: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()
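The same programmatic pattern applies to this generic script. A minimal sketch follows; the model ID, calibration dataset, and sample count are illustrative assumptions, and the multimodal calibration set is downloaded on first use.

# Minimal programmatic sketch for the generic Qwen2.5-VL script (illustrative values).
from quantize_qwen2_5_vl import quantize_qwen2_5_vl_model, test_quantized_model
from transformers import AutoProcessor

model_id = "Qwen/Qwen2.5-VL-7B-Instruct"  # any Qwen2_5_VLForConditionalGeneration checkpoint

quantized = quantize_qwen2_5_vl_model(
    model_id=model_id,
    quantization_method="GPTQ",
    dataset_id="lmms-lab/flickr30k",   # multimodal calibration data
    num_calibration_samples=128,
    scheme="W4A16",
)

# Reload the processor and run the built-in image/text smoke test.
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
test_quantized_model(quantized, processor)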
test_final_solution.py
ADDED
@@ -0,0 +1,103 @@
#!/usr/bin/env python
"""
Final verification test after implementing proper AWQ incompatibility with Qwen2.5-VL models
"""

from app import get_quantization_recipe

def test_qwen2_5_vl_compatible_methods():
    """
    Test all methods that should work with Qwen2.5-VL models
    """
    print("Testing quantization methods compatible with Qwen2.5-VL models...")

    # Methods that should work
    compatible_methods = ["GPTQ", "W4A16", "W8A16", "W8A8_INT8", "W8A8_FP8", "FP8"]

    all_passed = True
    for method in compatible_methods:
        try:
            recipe = get_quantization_recipe(method, "Qwen2_5_VLForConditionalGeneration")
            print(f"✅ {method} works with Qwen2_5_VLForConditionalGeneration")
            if hasattr(recipe[0], 'sequential_targets') and recipe[0].sequential_targets:
                print(f"   - Uses sequential onloading: {recipe[0].sequential_targets}")
            print(f"   - Ignore patterns: {recipe[0].ignore}")
        except Exception as e:
            print(f"❌ {method} failed: {e}")
            all_passed = False

    return all_passed

def test_awq_incompatibility():
    """
    Test that AWQ properly fails for Qwen2.5-VL models
    """
    print("\nTesting AWQ incompatibility with Qwen2.5-VL models...")

    try:
        recipe = get_quantization_recipe("AWQ", "Qwen2_5_VLForConditionalGeneration")
        print("❌ AWQ unexpectedly succeeded for Qwen2.5-VL (should have failed)")
        return False
    except ValueError as e:
        if "not compatible" in str(e) and "rotary positional embeddings" in str(e):
            print(f"✅ AWQ properly fails for Qwen2.5-VL: {e}")
            return True
        else:
            print(f"❌ AWQ failed but with wrong error: {e}")
            return False

def test_awq_still_works_for_llama():
    """
    Test that AWQ still works for Llama models
    """
    print("\nTesting AWQ still works for Llama models...")

    try:
        recipe = get_quantization_recipe("AWQ", "LlamaForCausalLM")
        print(f"✅ AWQ still works for LlamaForCausalLM")
        print(f"   - Ignore patterns: {recipe[0].ignore}")
        return True
    except Exception as e:
        print(f"❌ AWQ failed for LlamaForCausalLM: {e}")
        return False

def test_target_model():
    """
    Test with the specific target model
    """
    print(f"\nTesting with target model architecture: Qwen2_5_VLForConditionalGeneration")

    # All methods except AWQ should work
    methods = ["GPTQ", "W4A16", "W8A16", "W8A8_INT8", "W8A8_FP8", "FP8"]

    success_count = 0
    for method in methods:
        try:
            recipe = get_quantization_recipe(method, "Qwen2_5_VLForConditionalGeneration")
            success_count += 1
        except Exception as e:
            print(f"Method {method} failed: {e}")

    print(f"✅ {success_count}/{len(methods)} methods work for target model")
    return success_count == len(methods)

if __name__ == "__main__":
    print("Final verification after fixing AWQ incompatibility issue\n")

    test1 = test_qwen2_5_vl_compatible_methods()
    test2 = test_awq_incompatibility()
    test3 = test_awq_still_works_for_llama()
    test4 = test_target_model()

    print(f"\n{'='*60}")
    if test1 and test2 and test3 and test4:
        print("✅ ALL TESTS PASSED")
        print("\nSOLUTION SUMMARY:")
        print("• AWQ is now properly blocked for Qwen2.5-VL models due to incompatibility")
        print("• All other methods (GPTQ, W4A16, W8A16, W8A8_INT8, W8A8_FP8, FP8) work for Qwen2.5-VL")
        print("• AWQ still works for Llama models as expected")
        print("• Sequential onloading is preserved for memory efficiency")
        print("• Users will get clear error messages when trying incompatible methods")
    else:
        print("❌ SOME TESTS FAILED")
    print(f"{'='*60}")
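The test files in this commit import `get_quantization_recipe` from the Space's `app.py`, which is not part of the diff. For orientation only, a dispatcher of roughly that shape might look like the sketch below; this is an assumption inferred from the recipes built in quantize_huihui_fara.py and quantize_qwen2_5_vl.py, not the actual `app.py` implementation, and the method names covered are only a subset of those the tests exercise.

# Illustrative sketch only -- NOT the actual app.py implementation.
# Mirrors the recipe construction used in the two quantization scripts above.
from llmcompressor.modifiers.quantization import GPTQModifier, QuantizationModifier

def get_quantization_recipe_sketch(method: str, architecture: str):
    qwen_vl = architecture == "Qwen2_5_VLForConditionalGeneration"
    # Skip the language-model head, and the visual tower for Qwen2.5-VL models.
    ignore = ["lm_head"] + (["re:visual.*", "re:model.visual.*"] if qwen_vl else [])
    sequential = ["Qwen2_5_VLDecoderLayer"] if qwen_vl else None

    if method in ("GPTQ", "W4A16", "W8A16"):
        scheme = "W4A16" if method == "GPTQ" else method
        return [GPTQModifier(targets="Linear", scheme=scheme,
                             ignore=ignore, sequential_targets=sequential)]
    if method == "FP8":
        return [QuantizationModifier(scheme="FP8", targets="Linear", ignore=ignore)]
    raise ValueError(f"Unsupported quantization method: {method}")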
test_final_verification.py
ADDED
@@ -0,0 +1,123 @@
#!/usr/bin/env python
"""
Final test to confirm the original issue is resolved:
GPTQ quantization is not supported for Qwen2_5_VLForConditionalGeneration architecture
"""

from app import get_quantization_recipe

def test_original_issue_fixed():
    """
    Test to confirm the original error is fixed.
    The original error was:
    GPTQ quantization is not supported for Qwen2_5_VLForConditionalGeneration architecture
    """
    print("Testing the original issue that was reported...")
    print("Original error: GPTQ quantization is not supported for Qwen2_5_VLForConditionalGeneration architecture")
    print()

    # Test the original problematic case
    try:
        recipe = get_quantization_recipe("GPTQ", "Qwen2_5_VLForConditionalGeneration")
        print("✅ GPTQ quantization recipe created successfully for Qwen2_5_VLForConditionalGeneration")
        print(f"   Recipe: {recipe}")
        if hasattr(recipe[0], 'sequential_targets') and recipe[0].sequential_targets:
            print(f"   Uses sequential onloading: {recipe[0].sequential_targets}")
        print(f"   Ignores visual components: {recipe[0].ignore}")
        success_gptq = True
    except Exception as e:
        print(f"❌ GPTQ still fails: {e}")
        success_gptq = False

    print()

    # Test other methods that were also problematic
    other_methods = ["AWQ", "FP8"]
    success_others = True
    for method in other_methods:
        try:
            recipe = get_quantization_recipe(method, "Qwen2_5_VLForConditionalGeneration")
            print(f"✅ {method} quantization recipe created successfully for Qwen2_5_VLForConditionalGeneration")
            if hasattr(recipe[0], 'sequential_targets') and recipe[0].sequential_targets:
                print(f"   Uses sequential onloading: {recipe[0].sequential_targets}")
            success_others = success_others and True
        except Exception as e:
            print(f"❌ {method} still fails: {e}")
            success_others = False

    print()

    # Test new methods for Qwen2.5-VL
    new_methods = ["W4A16", "W8A16", "W8A8_INT8", "W8A8_FP8"]
    success_new = True
    for method in new_methods:
        try:
            recipe = get_quantization_recipe(method, "Qwen2_5_VLForConditionalGeneration")
            print(f"✅ {method} quantization recipe created successfully for Qwen2_5_VLForConditionalGeneration")
            success_new = success_new and True
        except Exception as e:
            print(f"❌ {method} fails: {e}")
            success_new = False

    print()

    if success_gptq and success_others and success_new:
        print("🎉 SUCCESS: The original issue has been completely resolved!")
        print("   - GPTQ now works for Qwen2_5_VLForConditionalGeneration")
        print("   - AWQ now works for Qwen2_5_VLForConditionalGeneration")
        print("   - FP8 now works for Qwen2_5_VLForConditionalGeneration")
        print("   - New methods (W4A16, W8A16, W8A8_INT8, W8A8_FP8) also work!")
        print("   - Sequential onloading is used for memory efficiency")
        print("   - Visual components are properly ignored during quantization")
        return True
    else:
        print("❌ FAILURE: Some issues remain")
        return False

def test_specific_model():
    """
    Test with the specific model mentioned: huihui-ai/Huihui-Fara-7B-abliterated
    """
    print("\n" + "="*60)
    print("Testing with the specific model: huihui-ai/Huihui-Fara-7B-abliterated")
    print("(This model has architecture: Qwen2_5_VLForConditionalGeneration)")
    print("="*60)

    # All the methods that should now work for this model
    methods = ["GPTQ", "AWQ", "FP8", "W4A16", "W8A16", "W8A8_INT8", "W8A8_FP8"]

    success = True
    for method in methods:
        try:
            recipe = get_quantization_recipe(method, "Qwen2_5_VLForConditionalGeneration")
            print(f"✅ {method}: OK")
        except Exception as e:
            print(f"❌ {method}: FAILED - {e}")
            success = False

    if success:
        print(f"\n🎉 All {len(methods)} quantization methods now work for the target model!")
        print("Users can now quantize huihui-ai/Huihui-Fara-7B-abliterated with any of these methods.")
    else:
        print("\n❌ Some methods still don't work for the target model.")

    return success

if __name__ == "__main__":
    print("Testing resolution of the original quantization issue...\n")

    issue_fixed = test_original_issue_fixed()
    model_specific = test_specific_model()

    print("\n" + "="*60)
    if issue_fixed and model_specific:
        print("✅ ALL TESTS PASSED - The issue is completely resolved!")
        print("\nThe Hugging Face Space now supports:")
        print("  • All original methods: GPTQ, AWQ, FP8")
        print("  • New methods: W4A16, W8A16, W8A8_INT8, W8A8_FP8")
        print("  • Sequential onloading for memory efficiency")
        print("  • Proper handling of Qwen2.5-VL visual components")
        print("  • All methods work with Qwen2_5_VLForConditionalGeneration models")
    else:
        print("❌ SOME TESTS FAILED - Issue may not be completely resolved")
    print("="*60)
test_new_quantization_methods.py
ADDED
@@ -0,0 +1,64 @@
#!/usr/bin/env python
"""
Test script to verify that the new quantization methods work with Qwen2.5-VL architecture
"""

from app import get_quantization_recipe
import torch

def test_new_quantization_methods():
    """
    Test the new quantization methods with Qwen2.5-VL architecture.
    """
    architectures = ["Qwen2_5_VLForConditionalGeneration"]

    # Test all the new quantization methods
    new_methods = ["W4A16", "W8A16", "W8A8_INT8", "W8A8_FP8", "FP8", "SmoothQuant", "SparseGPT"]

    print(f"Testing new quantization methods with architecture: {architectures[0]}")

    for method in new_methods:
        print(f"\nTesting {method} quantization recipe...")
        try:
            if method in ["SmoothQuant", "SparseGPT"] and architectures[0] == "Qwen2_5_VLForConditionalGeneration":
                # These methods don't support Qwen2_5_VLForConditionalGeneration, so they should raise an error
                try:
                    recipe = get_quantization_recipe(method, architectures[0])
                    print(f"❌ {method} should not be supported for Qwen2.5-VL but it didn't raise an error")
                except ValueError as e:
                    print(f"✅ {method} correctly raises error for Qwen2.5-VL: {e}")
            else:
                recipe = get_quantization_recipe(method, architectures[0])
                print(f"✅ {method} recipe created successfully: {recipe}")
                if hasattr(recipe[0], 'scheme'):
                    print(f"   Scheme: {recipe[0].scheme}")
                if hasattr(recipe[0], 'sequential_targets') and recipe[0].sequential_targets:
                    print(f"   Sequential targets: {recipe[0].sequential_targets}")
                if hasattr(recipe[0], 'ignore'):
                    print(f"   Ignore layers: {recipe[0].ignore}")
        except ValueError as e:
            if method in ["SmoothQuant", "SparseGPT"]:
                # These are expected to not work with Qwen2.5-VL
                print(f"✅ {method} correctly not supported for Qwen2.5-VL: {e}")
            else:
                print(f"❌ Error with {method}: {e}")
        except Exception as e:
            print(f"❌ Unexpected error with {method}: {e}")

    # Test that Llama models still work with all methods
    print(f"\n\nTesting LlamaForCausalLM compatibility...")
    llama_arch = "LlamaForCausalLM"
    for method in new_methods:
        print(f"Testing {method} with {llama_arch}...")
        try:
            recipe = get_quantization_recipe(method, llama_arch)
            print(f"✅ {method} works with {llama_arch}")
        except Exception as e:
            print(f"❌ {method} failed with {llama_arch}: {e}")

    return True

if __name__ == "__main__":
    print("Testing new quantization methods...\n")
    test_new_quantization_methods()
    print("\n✅ Testing of new quantization methods completed!")
test_qwen2_5_vl_architecture.py
ADDED
@@ -0,0 +1,71 @@
#!/usr/bin/env python
"""
Test script to verify that the Qwen2.5-VL architecture detection and quantization recipe work correctly
"""

from transformers import AutoConfig
from app import determine_model_class, get_quantization_recipe
import torch

def test_qwen2_5_vl_detection():
    """
    Test to see if we can properly detect the Qwen2.5-VL architecture.
    We'll use a known Qwen2.5-VL model ID to test the detection.
    """
    # For testing purposes, use a known Qwen2.5-VL model ID
    model_id = "Qwen/Qwen2.5-VL-7B-Instruct"  # Use a known Qwen2.5-VL model

    # Simulate the architecture string that would come from the model config
    # In the real scenario, this comes from model.config.architectures[0]
    architectures = ["Qwen2_5_VLForConditionalGeneration"]

    print(f"Testing architecture detection for: {model_id}")
    print(f"Architectures found: {architectures}")

    try:
        # Test if our recipe function can handle this architecture
        for method in ["GPTQ", "AWQ", "FP8"]:
            print(f"\nTesting {method} quantization recipe...")
            recipe = get_quantization_recipe(method, architectures[0])
            print(f"{method} recipe created successfully: {recipe}")
            print(f"Sequential targets: {[mod.sequential_targets if hasattr(mod, 'sequential_targets') else 'N/A' for mod in recipe]}")
            print(f"Ignore layers: {[mod.ignore for mod in recipe if hasattr(mod, 'ignore')]}")

        print("\n✅ All quantization methods work with Qwen2_5_VLForConditionalGeneration architecture")

    except Exception as e:
        print(f"\n❌ Error creating quantization recipe: {e}")
        import traceback
        traceback.print_exc()
        return False

    return True

def test_manual_model_class_detection():
    """
    Test the manual model class detection in the app.
    """
    print("\nTesting manual model class detection...")

    manual_model_type = "Qwen2_5_VLForConditionalGeneration (Qwen2.5-VL)"
    try:
        model_class = determine_model_class("test", "dummy_token", manual_model_type)
        print(f"Manual detection returned: {model_class}")
        print("✅ Manual model class detection works")
        return True
    except Exception as e:
        print(f"❌ Error in manual detection: {e}")
        import traceback
        traceback.print_exc()
        return False

if __name__ == "__main__":
    print("Testing Qwen2.5-VL architecture detection and quantization support...\n")

    success1 = test_qwen2_5_vl_detection()
    success2 = test_manual_model_class_detection()

    if success1 and success2:
        print("\n✅ All tests passed! Qwen2.5-VL models should now be properly supported.")
    else:
        print("\n❌ Some tests failed. Please check the implementation.")