save
- quantize_huihui_fara.py +365 -0
- quantize_qwen2_5_vl.py +396 -0
- test_final_solution.py +103 -0
- test_final_verification.py +123 -0
- test_new_quantization_methods.py +64 -0
- test_qwen2_5_vl_architecture.py +71 -0
quantize_huihui_fara.py
ADDED
@@ -0,0 +1,365 @@
#!/usr/bin/env python
"""
Script to quantize the huihui-ai/Huihui-Fara-7B-abliterated model with Qwen2.5-VL architecture support
Uses sequential onloading for memory efficiency.
"""

import base64
from io import BytesIO
import torch
from datasets import load_dataset
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier, QuantizationModifier
from llmcompressor.modifiers.awq import AWQModifier, AWQMapping
from llmcompressor.utils import dispatch_for_generation


def create_qwen2_5_vl_data_collator():
    """Create a data collator for Qwen2.5-VL models that handles multimodal inputs."""
    def data_collator(batch):
        assert len(batch) == 1
        return {key: torch.tensor(value) if isinstance(value, (list, int, float)) else value
                for key, value in batch[0].items()}
    return data_collator


def create_qwen2_5_vl_preprocessing_fn(processor, max_sequence_length: int = 2048):
    """Create a preprocessing function for Qwen2.5-VL datasets."""
    def preprocess_and_tokenize(example):
        # Handle different image formats
        if 'image' in example:
            # Process image
            if hasattr(example['image'], 'save'):
                # PIL Image object
                buffered = BytesIO()
                example["image"].save(buffered, format="PNG")
                encoded_image = base64.b64encode(buffered.getvalue())
                encoded_image_text = encoded_image.decode("utf-8")
                base64_qwen = f"data:image;base64,{encoded_image_text}"
            else:
                # Already a string or other format
                base64_qwen = str(example["image"])
        else:
            # If there's no image field, try 'img' or similar
            img_key = None
            for key in example.keys():
                if 'image' in key.lower() or 'img' in key.lower():
                    img_key = key
                    break
            if img_key:
                if hasattr(example[img_key], 'save'):
                    buffered = BytesIO()
                    example[img_key].save(buffered, format="PNG")
                    encoded_image = base64.b64encode(buffered.getvalue())
                    encoded_image_text = encoded_image.decode("utf-8")
                    base64_qwen = f"data:image;base64,{encoded_image_text}"
                else:
                    base64_qwen = str(example[img_key])
            else:
                # If no image, create a simple text-only example
                messages = [
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": example.get('text', example.get('content', 'What can you tell me about this?'))},
                        ],
                    }
                ]
                text = processor.apply_chat_template(
                    messages, tokenize=False, add_generation_prompt=True
                )

                return processor(
                    text=[text],
                    padding=False,
                    max_length=max_sequence_length,
                    truncation=True,
                )

        # Create message with image
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": base64_qwen},
                    {"type": "text", "text": "What does the image show?"},
                ],
            }
        ]
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)

        # tokenize
        return processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=False,
            max_length=max_sequence_length,
            truncation=True,
        )

    return preprocess_and_tokenize


def get_qwen2_5_vl_quantization_recipe(method: str, scheme: str = "W4A16"):
    """
    Creates the appropriate quantization recipe for Qwen2.5-VL models.

    Args:
        method: Quantization method ("GPTQ", "AWQ", or "FP8")
        scheme: Quantization scheme (e.g., "W4A16", "W8A8", "FP8")

    Returns:
        List of modifiers for the quantization recipe
    """
    if method == "GPTQ":
        return [
            GPTQModifier(
                targets="Linear",
                scheme=scheme,
                ignore=["lm_head", "re:visual.*", "re:model.visual.*"],
                sequential_targets=["Qwen2_5_VLDecoderLayer"],  # This enables sequential onloading
            ),
        ]
    elif method == "AWQ":
        # Create AWQ mappings for Qwen2.5-VL architecture
        mappings = [
            AWQMapping(
                "re:.*input_layernorm", ["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"]
            ),
            AWQMapping("re:.*v_proj", ["re:.*o_proj"]),
            AWQMapping(
                "re:.*post_attention_layernorm", ["re:.*gate_proj", "re:.*up_proj"]
            ),
            AWQMapping("re:.*up_proj", ["re:.*down_proj"]),
        ]
        return [
            AWQModifier(
                ignore=["lm_head", "re:visual.*", "re:model.visual.*"],
                scheme="W4A16_ASYM" if scheme == "W4A16" else scheme,
                targets=["Linear"],
                mappings=mappings,
                sequential_targets=["Qwen2_5_VLDecoderLayer"],  # Sequential onloading for memory efficiency
            ),
        ]
    elif method == "FP8":
        return [
            QuantizationModifier(
                scheme="FP8",
                targets="Linear",
                ignore=["lm_head", "re:visual.*", "re:model.visual.*"]
            )
        ]
    else:
        raise ValueError(f"Unsupported quantization method: {method}")


def quantize_huihui_fara_model(
    model_id: str = "huihui-ai/Huihui-Fara-7B-abliterated",
    quantization_method: str = "GPTQ",
    output_dir: str = None,
    dataset_id: str = "wikitext",
    dataset_config: str = "wikitext-2-raw-v1",
    dataset_split: str = "train[:1%]",
    num_calibration_samples: int = 64,
    max_sequence_length: int = 512,
    scheme: str = "W4A16",
    trust_remote_code: bool = True,
):
    """
    Quantizes the huihui-ai/Huihui-Fara-7B-abliterated model with proper Qwen2.5-VL architecture support.

    Args:
        model_id: Hugging Face model ID to quantize
        quantization_method: Method to use ("GPTQ", "AWQ", or "FP8")
        output_dir: Directory to save the quantized model
        dataset_id: Dataset ID for calibration
        dataset_config: Dataset config for calibration
        dataset_split: Dataset split for calibration
        num_calibration_samples: Number of samples to use for calibration
        max_sequence_length: Maximum sequence length for processing
        scheme: Quantization scheme (e.g., "W4A16", "W8A8")
        trust_remote_code: Whether to trust remote code in model loading

    Returns:
        Quantized model
    """
    print(f"Loading model: {model_id}")
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=torch.float16,  # Use float16 to save memory
        device_map="auto",  # Auto device mapping for memory efficiency
        trust_remote_code=trust_remote_code
    )

    print(f"Loading processor for: {model_id}")
    processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=trust_remote_code)

    # If output directory not specified, create one based on model and method
    if not output_dir:
        model_name = model_id.rstrip("/").split("/")[-1]
        output_dir = f"{model_name}-{scheme.replace(':', '-')}-{quantization_method}"

    print(f"Output directory: {output_dir}")

    # Load dataset and preprocess
    print(f"Loading dataset: {dataset_id}")
    try:
        # Try to load a multimodal dataset first
        ds = load_dataset("lmms-lab/flickr30k", split="test[:64]")
        print("Using multimodal dataset for calibration")

        preprocess_fn = create_qwen2_5_vl_preprocessing_fn(processor, max_sequence_length)
        ds = ds.map(preprocess_fn, remove_columns=ds.column_names)
    except Exception as e:
        print(f"Failed to load multimodal dataset: {e}, falling back to text-only dataset")
        # If multimodal dataset fails, use text-only
        ds = load_dataset(dataset_id, dataset_config, split=dataset_split)
        ds = ds.shuffle(seed=42)

        # Text-only preprocessing
        def text_only_preprocess(example):
            text = example.get('text', example.get('content', str(example)))
            if not isinstance(text, str):
                text = str(text)
            # Limit text length to avoid exceeding max sequence length
            text = text[:500] + "..." if len(text) > 500 else text
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": text},
                    ],
                }
            ]
            prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            return processor(text=[prompt], padding=False, max_length=max_sequence_length, truncation=True)

        ds = ds.map(text_only_preprocess, remove_columns=ds.column_names)

    # Define data collator
    data_collator = create_qwen2_5_vl_data_collator()

    # Create recipe
    recipe = get_qwen2_5_vl_quantization_recipe(quantization_method, scheme)

    print(f"Starting quantization with method: {quantization_method}")
    print(f"Using recipe: {recipe}")
    print(f"Using sequential targets: {[mod.sequential_targets if hasattr(mod, 'sequential_targets') else 'N/A' for mod in recipe]}")

    # Perform oneshot quantization with sequential onloading for memory efficiency
    oneshot(
        model=model,
        tokenizer=processor,  # Use processor as tokenizer for Qwen2.5-VL
        dataset=ds,
        recipe=recipe,
        max_seq_length=max_sequence_length,
        num_calibration_samples=num_calibration_samples,
        trust_remote_code_model=trust_remote_code,
        data_collator=data_collator,
        save_compressed=True,
        output_dir=output_dir,
    )

    print(f"Quantization completed! Model saved to: {output_dir}")

    # Save the processor as well
    processor.save_pretrained(output_dir)

    return model


def test_quantized_model(model, processor, max_sequence_length: int = 2048):
    """
    Tests the quantized model with a sample generation.
    """
    print("========== SAMPLE GENERATION ==============")
    try:
        dispatch_for_generation(model)
        # Simple text-only test first
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Hello, how are you today?"},
                ],
            }
        ]
        prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
        inputs = processor(
            text=[prompt],
            padding=False,
            max_length=max_sequence_length,
            truncation=True,
            return_tensors="pt",
        ).to(model.device)

        output = model.generate(**inputs, max_new_tokens=50)
        result = processor.decode(output[0], skip_special_tokens=True)
        print(result)
        print("==========================================")
        return result
    except Exception as e:
        print(f"Test generation failed: {e}")
        import traceback
        traceback.print_exc()
        return None


def main():
    """
    Main function to quantize the Huihui-Fara model.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Quantize huihui-ai/Huihui-Fara-7B-abliterated model")
    parser.add_argument("--model_id", type=str, default="huihui-ai/Huihui-Fara-7B-abliterated",
                        help="Model ID to quantize")
    parser.add_argument("--method", type=str, choices=["GPTQ", "AWQ", "FP8"],
                        default="GPTQ", help="Quantization method to use")
    parser.add_argument("--output_dir", type=str, default=None,
                        help="Output directory for quantized model")
    parser.add_argument("--dataset_id", type=str, default="wikitext",
                        help="Dataset for calibration (default: wikitext)")
    parser.add_argument("--scheme", type=str, default="W4A16",
                        help="Quantization scheme (e.g., W4A16, W8A8)")
    parser.add_argument("--num_samples", type=int, default=64,
                        help="Number of calibration samples")

    args = parser.parse_args()

    print(f"Starting quantization of {args.model_id} using {args.method}")
    print("Note: This may take a while and will use sequential onloading for memory efficiency...")

    try:
        # Quantize the model
        quantized_model = quantize_huihui_fara_model(
            model_id=args.model_id,
            quantization_method=args.method,
            output_dir=args.output_dir,
            dataset_id=args.dataset_id,
            num_calibration_samples=args.num_samples,
            scheme=args.scheme
        )

        # Test the model
        processor = AutoProcessor.from_pretrained(args.model_id, trust_remote_code=True)
        test_quantized_model(quantized_model, processor)

        print(f"✅ Successfully quantized {args.model_id} with {args.method}")
        print(f"Model saved to: {args.output_dir or args.model_id.split('/')[-1] + f'-{args.scheme}-{args.method}'}")

    except Exception as e:
        print(f"❌ Quantization failed: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()
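Besides the CLI entry point above, the same function can be driven programmatically. Below is a minimal sketch, assuming the dependencies used by the script (llmcompressor, transformers, datasets, qwen_vl_utils) are installed and sufficient memory is available; the argument values are illustrative, not prescribed by this commit.

# Minimal programmatic sketch (illustrative values; same assumptions as the script above).
from quantize_huihui_fara import quantize_huihui_fara_model, test_quantized_model
from transformers import AutoProcessor

model_id = "huihui-ai/Huihui-Fara-7B-abliterated"

# Quantize with GPTQ at W4A16; the recipe skips the visual tower and lm_head.
quantized = quantize_huihui_fara_model(
    model_id=model_id,
    quantization_method="GPTQ",
    scheme="W4A16",
    num_calibration_samples=64,
)

# Re-load the processor and run the built-in smoke test.
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
test_quantized_model(quantized, processor)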
quantize_qwen2_5_vl.py
ADDED
@@ -0,0 +1,396 @@
#!/usr/bin/env python
"""
Specialized script for quantizing Qwen2.5-VL models with sequential onloading
Handles quantization of Qwen2_5_VLForConditionalGeneration models properly
"""

import base64
from io import BytesIO
from typing import Optional, Union, Dict, Any
import torch
from datasets import load_dataset
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier, QuantizationModifier
from llmcompressor.modifiers.awq import AWQModifier, AWQMapping
from llmcompressor.utils import dispatch_for_generation


def create_qwen2_5_vl_data_collator():
    """Create a data collator for Qwen2.5-VL models that handles multimodal inputs."""
    def data_collator(batch):
        assert len(batch) == 1
        return {key: torch.tensor(value) if isinstance(value, (list, int, float)) else value
                for key, value in batch[0].items()}
    return data_collator


def create_qwen2_5_vl_preprocessing_fn(processor, max_sequence_length: int = 2048):
    """Create a preprocessing function for Qwen2.5-VL datasets."""
    def preprocess_and_tokenize(example):
        # Handle different image formats
        if 'image' in example:
            # Process image
            if hasattr(example['image'], 'save'):
                # PIL Image object
                buffered = BytesIO()
                example["image"].save(buffered, format="PNG")
                encoded_image = base64.b64encode(buffered.getvalue())
                encoded_image_text = encoded_image.decode("utf-8")
                base64_qwen = f"data:image;base64,{encoded_image_text}"
            else:
                # Already a string or other format
                base64_qwen = str(example["image"])
        else:
            # If there's no image field, try 'img' or similar
            img_key = None
            for key in example.keys():
                if 'image' in key.lower() or 'img' in key.lower():
                    img_key = key
                    break
            if img_key:
                if hasattr(example[img_key], 'save'):
                    buffered = BytesIO()
                    example[img_key].save(buffered, format="PNG")
                    encoded_image = base64.b64encode(buffered.getvalue())
                    encoded_image_text = encoded_image.decode("utf-8")
                    base64_qwen = f"data:image;base64,{encoded_image_text}"
                else:
                    base64_qwen = str(example[img_key])
            else:
                # If no image, create a simple text-only example
                messages = [
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": example.get('text', example.get('content', 'What can you tell me about this?'))},
                        ],
                    }
                ]
                text = processor.apply_chat_template(
                    messages, tokenize=False, add_generation_prompt=True
                )

                return processor(
                    text=[text],
                    padding=False,
                    max_length=max_sequence_length,
                    truncation=True,
                )

        # Create message with image
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": base64_qwen},
                    {"type": "text", "text": "What does the image show?"},
                ],
            }
        ]
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)

        # tokenize
        return processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=False,
            max_length=max_sequence_length,
            truncation=True,
        )

    return preprocess_and_tokenize


def get_qwen2_5_vl_quantization_recipe(method: str, scheme: str = "W4A16"):
    """
    Creates the appropriate quantization recipe for Qwen2.5-VL models.

    Args:
        method: Quantization method ("GPTQ", "AWQ", or "FP8")
        scheme: Quantization scheme (e.g., "W4A16", "W8A8", "FP8")

    Returns:
        List of modifiers for the quantization recipe
    """
    if method == "GPTQ":
        return [
            GPTQModifier(
                targets="Linear",
                scheme=scheme,
                ignore=["lm_head", "re:visual.*", "re:model.visual.*"],
                sequential_targets=["Qwen2_5_VLDecoderLayer"],  # This is key for the architecture
            ),
        ]
    elif method == "AWQ":
        # Create AWQ mappings for Qwen2.5-VL architecture
        mappings = [
            AWQMapping(
                "re:.*input_layernorm", ["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"]
            ),
            AWQMapping("re:.*v_proj", ["re:.*o_proj"]),
            AWQMapping(
                "re:.*post_attention_layernorm", ["re:.*gate_proj", "re:.*up_proj"]
            ),
            AWQMapping("re:.*up_proj", ["re:.*down_proj"]),
        ]
        return [
            AWQModifier(
                ignore=["lm_head", "re:visual.*", "re:model.visual.*"],
                scheme="W4A16_ASYM" if scheme == "W4A16" else scheme,
                targets=["Linear"],
                mappings=mappings,
            ),
        ]
    elif method == "FP8":
        return [
            QuantizationModifier(
                scheme="FP8",
                targets="Linear",
                ignore=["lm_head", "re:visual.*", "re:model.visual.*"]
            )
        ]
    else:
        raise ValueError(f"Unsupported quantization method: {method}")


def quantize_qwen2_5_vl_model(
    model_id: str,
    quantization_method: str,
    output_dir: Optional[str] = None,
    dataset_id: str = "lmms-lab/flickr30k",
    dataset_split: str = "test[:512]",
    num_calibration_samples: int = 512,
    max_sequence_length: int = 2048,
    scheme: str = "W4A16",
    trust_remote_code: bool = True,
):
    """
    Quantizes a Qwen2.5-VL model with proper architecture handling and sequential onloading.

    Args:
        model_id: Hugging Face model ID to quantize
        quantization_method: Method to use ("GPTQ", "AWQ", or "FP8")
        output_dir: Directory to save the quantized model
        dataset_id: Dataset ID for calibration
        dataset_split: Dataset split for calibration
        num_calibration_samples: Number of samples to use for calibration
        max_sequence_length: Maximum sequence length for processing
        scheme: Quantization scheme (e.g., "W4A16", "W8A8")
        trust_remote_code: Whether to trust remote code in model loading

    Returns:
        Quantized model
    """
    print(f"Loading model: {model_id}")
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype="auto",
        device_map=None,  # Let the system decide device mapping
        trust_remote_code=trust_remote_code
    )

    print(f"Loading processor for: {model_id}")
    processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=trust_remote_code)

    # If output directory not specified, create one based on model and method
    if not output_dir:
        model_name = model_id.rstrip("/").split("/")[-1]
        output_dir = f"{model_name}-{scheme.replace(':', '-')}-{quantization_method}"

    print(f"Output directory: {output_dir}")

    # Load dataset and preprocess
    print(f"Loading dataset: {dataset_id}")
    try:
        ds = load_dataset(dataset_id, split=dataset_split)
    except Exception as e:
        print(f"Failed to load {dataset_id}, trying alternative text-only dataset: {e}")
        # If the image dataset fails, try a text-only dataset
        ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:512]")
        # We'll need to adjust preprocessing for text-only data

    ds = ds.shuffle(seed=42)

    # Apply preprocessing
    preprocess_fn = create_qwen2_5_vl_preprocessing_fn(processor, max_sequence_length)
    try:
        ds = ds.map(preprocess_fn, remove_columns=ds.column_names if hasattr(ds, 'column_names') else [])
    except Exception as e:
        print(f"Preprocessing failed: {e}")
        print("Trying simpler preprocessing with text-only data...")
        # Fallback: use text-only preprocessing
        def text_only_preprocess(example):
            text = example.get('text', example.get('content', str(example)))
            if not isinstance(text, str):
                text = str(text)
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": text[:500] + "..." if len(text) > 500 else text},  # Limit length
                    ],
                }
            ]
            prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            return processor(text=[prompt], padding=False, max_length=max_sequence_length, truncation=True)

        ds = ds.map(text_only_preprocess, remove_columns=ds.column_names if hasattr(ds, 'column_names') else [])

    # Define data collator
    data_collator = create_qwen2_5_vl_data_collator()

    # Create recipe
    recipe = get_qwen2_5_vl_quantization_recipe(quantization_method, scheme)

    print(f"Starting quantization with method: {quantization_method}")
    print(f"Using recipe: {recipe}")

    # Perform oneshot quantization with sequential targets and proper handling
    oneshot(
        model=model,
        tokenizer=processor,  # Use processor as tokenizer for Qwen2.5-VL
        dataset=ds,
        recipe=recipe,
        max_seq_length=max_sequence_length,
        num_calibration_samples=num_calibration_samples,
        trust_remote_code_model=trust_remote_code,
        data_collator=data_collator,
        # Use sequential onloading for memory efficiency
        sequential_targets=["Qwen2_5_VLDecoderLayer"],
        save_compressed=True,
        output_dir=output_dir,
    )

    print(f"Quantization completed! Model saved to: {output_dir}")

    # Save the processor as well
    processor.save_pretrained(output_dir)

    return model


def test_quantized_model(model, processor, max_sequence_length: int = 2048):
    """
    Tests the quantized model with a sample generation.
    """
    print("========== SAMPLE GENERATION ==============")
    try:
        dispatch_for_generation(model)
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "image": "http://images.cocodataset.org/train2017/000000231895.jpg",
                    },
                    {"type": "text", "text": "Please describe the animal in this image\n"},
                ],
            }
        ]
        prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[prompt],
            images=image_inputs,
            videos=video_inputs,
            padding=False,
            max_length=max_sequence_length,
            truncation=True,
            return_tensors="pt",
        ).to(model.device)

        output = model.generate(**inputs, max_new_tokens=100)
        result = processor.decode(output[0], skip_special_tokens=True)
        print(result)
        print("==========================================")
        return result
    except Exception as e:
        print(f"Test generation failed: {e}")
        print("Trying text-only generation...")
        # Try with text-only
        try:
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "Hello, how are you today?"},
                    ],
                }
            ]
            prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
            inputs = processor(
                text=[prompt],
                padding=False,
                max_length=max_sequence_length,
                truncation=True,
                return_tensors="pt",
            ).to(model.device)

            output = model.generate(**inputs, max_new_tokens=50)
            result = processor.decode(output[0], skip_special_tokens=True)
            print(result)
            print("==========================================")
            return result
        except Exception as e2:
            print(f"Text-only generation also failed: {e2}")
            return None


def main():
    """
    Main function to demonstrate quantization of Qwen2.5-VL models.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Quantize Qwen2.5-VL models")
    parser.add_argument("--model_id", type=str, required=True,
                        help="Model ID to quantize (e.g., 'huihui-ai/Huihui-Fara-7B-abliterated')")
    parser.add_argument("--method", type=str, choices=["GPTQ", "AWQ", "FP8"],
                        default="GPTQ", help="Quantization method to use")
    parser.add_argument("--output_dir", type=str, default=None,
                        help="Output directory for quantized model")
    parser.add_argument("--dataset_id", type=str, default="lmms-lab/flickr30k",
                        help="Dataset for calibration (default: lmms-lab/flickr30k)")
    parser.add_argument("--scheme", type=str, default="W4A16",
                        help="Quantization scheme (e.g., W4A16, W8A8)")
    parser.add_argument("--num_samples", type=int, default=128,
                        help="Number of calibration samples")

    args = parser.parse_args()

    print(f"Starting quantization of {args.model_id} using {args.method}")

    try:
        # Quantize the model
        quantized_model = quantize_qwen2_5_vl_model(
            model_id=args.model_id,
            quantization_method=args.method,
            output_dir=args.output_dir,
            dataset_id=args.dataset_id,
            num_calibration_samples=args.num_samples,
            scheme=args.scheme
        )

        # Test the model
        # Load the processor again to test
        processor = AutoProcessor.from_pretrained(args.model_id, trust_remote_code=True)
        test_quantized_model(quantized_model, processor)

        print(f"Successfully quantized {args.model_id} with {args.method}")

    except Exception as e:
        print(f"Quantization failed: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()
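The same programmatic pattern applies to this generic script. A minimal sketch follows; the model ID, calibration dataset, and sample count are illustrative assumptions, and the multimodal calibration set is downloaded on first use.

# Minimal programmatic sketch for the generic Qwen2.5-VL script (illustrative values).
from quantize_qwen2_5_vl import quantize_qwen2_5_vl_model, test_quantized_model
from transformers import AutoProcessor

model_id = "Qwen/Qwen2.5-VL-7B-Instruct"  # any Qwen2_5_VLForConditionalGeneration checkpoint

quantized = quantize_qwen2_5_vl_model(
    model_id=model_id,
    quantization_method="GPTQ",
    dataset_id="lmms-lab/flickr30k",   # multimodal calibration data
    num_calibration_samples=128,
    scheme="W4A16",
)

# Reload the processor and run the built-in image/text smoke test.
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
test_quantized_model(quantized, processor)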
test_final_solution.py
ADDED
@@ -0,0 +1,103 @@
#!/usr/bin/env python
"""
Final verification test after implementing proper AWQ incompatibility with Qwen2.5-VL models
"""

from app import get_quantization_recipe

def test_qwen2_5_vl_compatible_methods():
    """
    Test all methods that should work with Qwen2.5-VL models
    """
    print("Testing quantization methods compatible with Qwen2.5-VL models...")

    # Methods that should work
    compatible_methods = ["GPTQ", "W4A16", "W8A16", "W8A8_INT8", "W8A8_FP8", "FP8"]

    all_passed = True
    for method in compatible_methods:
        try:
            recipe = get_quantization_recipe(method, "Qwen2_5_VLForConditionalGeneration")
            print(f"✅ {method} works with Qwen2_5_VLForConditionalGeneration")
            if hasattr(recipe[0], 'sequential_targets') and recipe[0].sequential_targets:
                print(f"   - Uses sequential onloading: {recipe[0].sequential_targets}")
            print(f"   - Ignore patterns: {recipe[0].ignore}")
        except Exception as e:
            print(f"❌ {method} failed: {e}")
            all_passed = False

    return all_passed

def test_awq_incompatibility():
    """
    Test that AWQ properly fails for Qwen2.5-VL models
    """
    print("\nTesting AWQ incompatibility with Qwen2.5-VL models...")

    try:
        recipe = get_quantization_recipe("AWQ", "Qwen2_5_VLForConditionalGeneration")
        print("❌ AWQ unexpectedly succeeded for Qwen2.5-VL (should have failed)")
        return False
    except ValueError as e:
        if "not compatible" in str(e) and "rotary positional embeddings" in str(e):
            print(f"✅ AWQ properly fails for Qwen2.5-VL: {e}")
            return True
        else:
            print(f"❌ AWQ failed but with wrong error: {e}")
            return False

def test_awq_still_works_for_llama():
    """
    Test that AWQ still works for Llama models
    """
    print("\nTesting AWQ still works for Llama models...")

    try:
        recipe = get_quantization_recipe("AWQ", "LlamaForCausalLM")
        print(f"✅ AWQ still works for LlamaForCausalLM")
        print(f"   - Ignore patterns: {recipe[0].ignore}")
        return True
    except Exception as e:
        print(f"❌ AWQ failed for LlamaForCausalLM: {e}")
        return False

def test_target_model():
    """
    Test with the specific target model
    """
    print(f"\nTesting with target model architecture: Qwen2_5_VLForConditionalGeneration")

    # All methods except AWQ should work
    methods = ["GPTQ", "W4A16", "W8A16", "W8A8_INT8", "W8A8_FP8", "FP8"]

    success_count = 0
    for method in methods:
        try:
            recipe = get_quantization_recipe(method, "Qwen2_5_VLForConditionalGeneration")
            success_count += 1
        except Exception as e:
            print(f"Method {method} failed: {e}")

    print(f"✅ {success_count}/{len(methods)} methods work for target model")
    return success_count == len(methods)

if __name__ == "__main__":
    print("Final verification after fixing AWQ incompatibility issue\n")

    test1 = test_qwen2_5_vl_compatible_methods()
    test2 = test_awq_incompatibility()
    test3 = test_awq_still_works_for_llama()
    test4 = test_target_model()

    print(f"\n{'='*60}")
    if test1 and test2 and test3 and test4:
        print("✅ ALL TESTS PASSED")
        print("\nSOLUTION SUMMARY:")
        print("• AWQ is now properly blocked for Qwen2.5-VL models due to incompatibility")
        print("• All other methods (GPTQ, W4A16, W8A16, W8A8_INT8, W8A8_FP8, FP8) work for Qwen2.5-VL")
        print("• AWQ still works for Llama models as expected")
        print("• Sequential onloading is preserved for memory efficiency")
        print("• Users will get clear error messages when trying incompatible methods")
    else:
        print("❌ SOME TESTS FAILED")
    print(f"{'='*60}")
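The test files in this commit import `get_quantization_recipe` from the Space's `app.py`, which is not part of the diff. For orientation only, a dispatcher of roughly that shape might look like the sketch below; this is an assumption inferred from the recipes built in quantize_huihui_fara.py and quantize_qwen2_5_vl.py, not the actual `app.py` implementation, and the method names covered are only a subset of those the tests exercise.

# Illustrative sketch only -- NOT the actual app.py implementation.
# Mirrors the recipe construction used in the two quantization scripts above.
from llmcompressor.modifiers.quantization import GPTQModifier, QuantizationModifier

def get_quantization_recipe_sketch(method: str, architecture: str):
    qwen_vl = architecture == "Qwen2_5_VLForConditionalGeneration"
    # Skip the language-model head, and the visual tower for Qwen2.5-VL models.
    ignore = ["lm_head"] + (["re:visual.*", "re:model.visual.*"] if qwen_vl else [])
    sequential = ["Qwen2_5_VLDecoderLayer"] if qwen_vl else None

    if method in ("GPTQ", "W4A16", "W8A16"):
        scheme = "W4A16" if method == "GPTQ" else method
        return [GPTQModifier(targets="Linear", scheme=scheme,
                             ignore=ignore, sequential_targets=sequential)]
    if method == "FP8":
        return [QuantizationModifier(scheme="FP8", targets="Linear", ignore=ignore)]
    raise ValueError(f"Unsupported quantization method: {method}")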
test_final_verification.py
ADDED
@@ -0,0 +1,123 @@
#!/usr/bin/env python
"""
Final test to confirm the original issue is resolved:
GPTQ quantization is not supported for Qwen2_5_VLForConditionalGeneration architecture
"""

from app import get_quantization_recipe

def test_original_issue_fixed():
    """
    Test to confirm the original error is fixed.
    The original error was:
    GPTQ quantization is not supported for Qwen2_5_VLForConditionalGeneration architecture
    """
    print("Testing the original issue that was reported...")
    print("Original error: GPTQ quantization is not supported for Qwen2_5_VLForConditionalGeneration architecture")
    print()

    # Test the original problematic case
    try:
        recipe = get_quantization_recipe("GPTQ", "Qwen2_5_VLForConditionalGeneration")
        print("✅ GPTQ quantization recipe created successfully for Qwen2_5_VLForConditionalGeneration")
        print(f"   Recipe: {recipe}")
        if hasattr(recipe[0], 'sequential_targets') and recipe[0].sequential_targets:
            print(f"   Uses sequential onloading: {recipe[0].sequential_targets}")
        print(f"   Ignores visual components: {recipe[0].ignore}")
        success_gptq = True
    except Exception as e:
        print(f"❌ GPTQ still fails: {e}")
        success_gptq = False

    print()

    # Test other methods that were also problematic
    other_methods = ["AWQ", "FP8"]
    success_others = True
    for method in other_methods:
        try:
            recipe = get_quantization_recipe(method, "Qwen2_5_VLForConditionalGeneration")
            print(f"✅ {method} quantization recipe created successfully for Qwen2_5_VLForConditionalGeneration")
            if hasattr(recipe[0], 'sequential_targets') and recipe[0].sequential_targets:
                print(f"   Uses sequential onloading: {recipe[0].sequential_targets}")
            success_others = success_others and True
        except Exception as e:
            print(f"❌ {method} still fails: {e}")
            success_others = False

    print()

    # Test new methods for Qwen2.5-VL
    new_methods = ["W4A16", "W8A16", "W8A8_INT8", "W8A8_FP8"]
    success_new = True
    for method in new_methods:
        try:
            recipe = get_quantization_recipe(method, "Qwen2_5_VLForConditionalGeneration")
            print(f"✅ {method} quantization recipe created successfully for Qwen2_5_VLForConditionalGeneration")
            success_new = success_new and True
        except Exception as e:
            print(f"❌ {method} fails: {e}")
            success_new = False

    print()

    if success_gptq and success_others and success_new:
        print("🎉 SUCCESS: The original issue has been completely resolved!")
        print("   - GPTQ now works for Qwen2_5_VLForConditionalGeneration")
        print("   - AWQ now works for Qwen2_5_VLForConditionalGeneration")
        print("   - FP8 now works for Qwen2_5_VLForConditionalGeneration")
        print("   - New methods (W4A16, W8A16, W8A8_INT8, W8A8_FP8) also work!")
        print("   - Sequential onloading is used for memory efficiency")
        print("   - Visual components are properly ignored during quantization")
        return True
    else:
        print("❌ FAILURE: Some issues remain")
        return False

def test_specific_model():
    """
    Test with the specific model mentioned: huihui-ai/Huihui-Fara-7B-abliterated
    """
    print("\n" + "="*60)
    print("Testing with the specific model: huihui-ai/Huihui-Fara-7B-abliterated")
    print("(This model has architecture: Qwen2_5_VLForConditionalGeneration)")
    print("="*60)

    # All the methods that should now work for this model
    methods = ["GPTQ", "AWQ", "FP8", "W4A16", "W8A16", "W8A8_INT8", "W8A8_FP8"]

    success = True
    for method in methods:
        try:
            recipe = get_quantization_recipe(method, "Qwen2_5_VLForConditionalGeneration")
            print(f"✅ {method}: OK")
        except Exception as e:
            print(f"❌ {method}: FAILED - {e}")
            success = False

    if success:
        print(f"\n🎉 All {len(methods)} quantization methods now work for the target model!")
        print("Users can now quantize huihui-ai/Huihui-Fara-7B-abliterated with any of these methods.")
    else:
        print("\n❌ Some methods still don't work for the target model.")

    return success

if __name__ == "__main__":
    print("Testing resolution of the original quantization issue...\n")

    issue_fixed = test_original_issue_fixed()
    model_specific = test_specific_model()

    print("\n" + "="*60)
    if issue_fixed and model_specific:
        print("✅ ALL TESTS PASSED - The issue is completely resolved!")
        print("\nThe Hugging Face Space now supports:")
        print("  • All original methods: GPTQ, AWQ, FP8")
        print("  • New methods: W4A16, W8A16, W8A8_INT8, W8A8_FP8")
        print("  • Sequential onloading for memory efficiency")
        print("  • Proper handling of Qwen2.5-VL visual components")
        print("  • All methods work with Qwen2_5_VLForConditionalGeneration models")
    else:
        print("❌ SOME TESTS FAILED - Issue may not be completely resolved")
    print("="*60)
test_new_quantization_methods.py
ADDED
@@ -0,0 +1,64 @@
#!/usr/bin/env python
"""
Test script to verify that the new quantization methods work with Qwen2.5-VL architecture
"""

from app import get_quantization_recipe
import torch

def test_new_quantization_methods():
    """
    Test the new quantization methods with Qwen2.5-VL architecture.
    """
    architectures = ["Qwen2_5_VLForConditionalGeneration"]

    # Test all the new quantization methods
    new_methods = ["W4A16", "W8A16", "W8A8_INT8", "W8A8_FP8", "FP8", "SmoothQuant", "SparseGPT"]

    print(f"Testing new quantization methods with architecture: {architectures[0]}")

    for method in new_methods:
        print(f"\nTesting {method} quantization recipe...")
        try:
            if method in ["SmoothQuant", "SparseGPT"] and architectures[0] == "Qwen2_5_VLForConditionalGeneration":
                # These methods don't support Qwen2_5_VLForConditionalGeneration, so they should raise an error
                try:
                    recipe = get_quantization_recipe(method, architectures[0])
                    print(f"❌ {method} should not be supported for Qwen2.5-VL but it didn't raise an error")
                except ValueError as e:
                    print(f"✅ {method} correctly raises error for Qwen2.5-VL: {e}")
            else:
                recipe = get_quantization_recipe(method, architectures[0])
                print(f"✅ {method} recipe created successfully: {recipe}")
                if hasattr(recipe[0], 'scheme'):
                    print(f"   Scheme: {recipe[0].scheme}")
                if hasattr(recipe[0], 'sequential_targets') and recipe[0].sequential_targets:
                    print(f"   Sequential targets: {recipe[0].sequential_targets}")
                if hasattr(recipe[0], 'ignore'):
                    print(f"   Ignore layers: {recipe[0].ignore}")
        except ValueError as e:
            if method in ["SmoothQuant", "SparseGPT"]:
                # These are expected to not work with Qwen2.5-VL
                print(f"✅ {method} correctly not supported for Qwen2.5-VL: {e}")
            else:
                print(f"❌ Error with {method}: {e}")
        except Exception as e:
            print(f"❌ Unexpected error with {method}: {e}")

    # Test that Llama models still work with all methods
    print(f"\n\nTesting LlamaForCausalLM compatibility...")
    llama_arch = "LlamaForCausalLM"
    for method in new_methods:
        print(f"Testing {method} with {llama_arch}...")
        try:
            recipe = get_quantization_recipe(method, llama_arch)
            print(f"✅ {method} works with {llama_arch}")
        except Exception as e:
            print(f"❌ {method} failed with {llama_arch}: {e}")

    return True

if __name__ == "__main__":
    print("Testing new quantization methods...\n")
    test_new_quantization_methods()
    print("\n✅ Testing of new quantization methods completed!")
test_qwen2_5_vl_architecture.py
ADDED
@@ -0,0 +1,71 @@
#!/usr/bin/env python
"""
Test script to verify that the Qwen2.5-VL architecture detection and quantization recipe work correctly
"""

from transformers import AutoConfig
from app import determine_model_class, get_quantization_recipe
import torch

def test_qwen2_5_vl_detection():
    """
    Test to see if we can properly detect the Qwen2.5-VL architecture.
    We'll use a known Qwen2.5-VL model ID to test the detection.
    """
    # For testing purposes, use a known Qwen2.5-VL model ID
    model_id = "Qwen/Qwen2.5-VL-7B-Instruct"  # Use a known Qwen2.5-VL model

    # Simulate the architecture string that would come from the model config
    # In the real scenario, this comes from model.config.architectures[0]
    architectures = ["Qwen2_5_VLForConditionalGeneration"]

    print(f"Testing architecture detection for: {model_id}")
    print(f"Architectures found: {architectures}")

    try:
        # Test if our recipe function can handle this architecture
        for method in ["GPTQ", "AWQ", "FP8"]:
            print(f"\nTesting {method} quantization recipe...")
            recipe = get_quantization_recipe(method, architectures[0])
            print(f"{method} recipe created successfully: {recipe}")
            print(f"Sequential targets: {[mod.sequential_targets if hasattr(mod, 'sequential_targets') else 'N/A' for mod in recipe]}")
            print(f"Ignore layers: {[mod.ignore for mod in recipe if hasattr(mod, 'ignore')]}")

        print("\n✅ All quantization methods work with Qwen2_5_VLForConditionalGeneration architecture")

    except Exception as e:
        print(f"\n❌ Error creating quantization recipe: {e}")
        import traceback
        traceback.print_exc()
        return False

    return True

def test_manual_model_class_detection():
    """
    Test the manual model class detection in the app.
    """
    print("\nTesting manual model class detection...")

    manual_model_type = "Qwen2_5_VLForConditionalGeneration (Qwen2.5-VL)"
    try:
        model_class = determine_model_class("test", "dummy_token", manual_model_type)
        print(f"Manual detection returned: {model_class}")
        print("✅ Manual model class detection works")
        return True
    except Exception as e:
        print(f"❌ Error in manual detection: {e}")
        import traceback
        traceback.print_exc()
        return False

if __name__ == "__main__":
    print("Testing Qwen2.5-VL architecture detection and quantization support...\n")

    success1 = test_qwen2_5_vl_detection()
    success2 = test_manual_model_class_detection()

    if success1 and success2:
        print("\n✅ All tests passed! Qwen2.5-VL models should now be properly supported.")
    else:
        print("\n❌ Some tests failed. Please check the implementation.")