n00b001 committed
Commit d95ff5b · unverified · 1 Parent(s): 6f060c2
quantize_huihui_fara.py ADDED
@@ -0,0 +1,365 @@
+ #!/usr/bin/env python
+ """
+ Script to quantize the huihui-ai/Huihui-Fara-7B-abliterated model with Qwen2.5-VL architecture support.
+ Uses sequential onloading for memory efficiency.
+ """
+
+ import base64
+ from io import BytesIO
+ import torch
+ from datasets import load_dataset
+ from qwen_vl_utils import process_vision_info
+ from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
+
+ from llmcompressor import oneshot
+ from llmcompressor.modifiers.quantization import GPTQModifier, QuantizationModifier
+ from llmcompressor.modifiers.awq import AWQModifier, AWQMapping
+ from llmcompressor.utils import dispatch_for_generation
+
+
+ def create_qwen2_5_vl_data_collator():
+     """Create a data collator for Qwen2.5-VL models that handles multimodal inputs."""
+     def data_collator(batch):
+         assert len(batch) == 1
+         return {key: torch.tensor(value) if isinstance(value, (list, int, float)) else value
+                 for key, value in batch[0].items()}
+     return data_collator
+
+
+ def create_qwen2_5_vl_preprocessing_fn(processor, max_sequence_length: int = 2048):
+     """Create a preprocessing function for Qwen2.5-VL datasets."""
+     def preprocess_and_tokenize(example):
+         # Handle different image formats
+         if 'image' in example:
+             # Process image
+             if hasattr(example['image'], 'save'):
+                 # PIL Image object
+                 buffered = BytesIO()
+                 example["image"].save(buffered, format="PNG")
+                 encoded_image = base64.b64encode(buffered.getvalue())
+                 encoded_image_text = encoded_image.decode("utf-8")
+                 base64_qwen = f"data:image;base64,{encoded_image_text}"
+             else:
+                 # Already a string or other format
+                 base64_qwen = str(example["image"])
+         else:
+             # If there's no image field, try 'img' or similar
+             img_key = None
+             for key in example.keys():
+                 if 'image' in key.lower() or 'img' in key.lower():
+                     img_key = key
+                     break
+             if img_key:
+                 if hasattr(example[img_key], 'save'):
+                     buffered = BytesIO()
+                     example[img_key].save(buffered, format="PNG")
+                     encoded_image = base64.b64encode(buffered.getvalue())
+                     encoded_image_text = encoded_image.decode("utf-8")
+                     base64_qwen = f"data:image;base64,{encoded_image_text}"
+                 else:
+                     base64_qwen = str(example[img_key])
+             else:
+                 # If no image, create a simple text-only example
+                 messages = [
+                     {
+                         "role": "user",
+                         "content": [
+                             {"type": "text", "text": example.get('text', example.get('content', 'What can you tell me about this?'))},
+                         ],
+                     }
+                 ]
+                 text = processor.apply_chat_template(
+                     messages, tokenize=False, add_generation_prompt=True
+                 )
+
+                 return processor(
+                     text=[text],
+                     padding=False,
+                     max_length=max_sequence_length,
+                     truncation=True,
+                 )
+
+         # Create message with image
+         messages = [
+             {
+                 "role": "user",
+                 "content": [
+                     {"type": "image", "image": base64_qwen},
+                     {"type": "text", "text": "What does the image show?"},
+                 ],
+             }
+         ]
+         text = processor.apply_chat_template(
+             messages, tokenize=False, add_generation_prompt=True
+         )
+         image_inputs, video_inputs = process_vision_info(messages)
+
+         # tokenize
+         return processor(
+             text=[text],
+             images=image_inputs,
+             videos=video_inputs,
+             padding=False,
+             max_length=max_sequence_length,
+             truncation=True,
+         )
+
+     return preprocess_and_tokenize
+
+
+ def get_qwen2_5_vl_quantization_recipe(method: str, scheme: str = "W4A16"):
+     """
+     Creates the appropriate quantization recipe for Qwen2.5-VL models.
+
+     Args:
+         method: Quantization method ("GPTQ", "AWQ", or "FP8")
+         scheme: Quantization scheme (e.g., "W4A16", "W8A8", "FP8")
+
+     Returns:
+         List of modifiers for the quantization recipe
+     """
+     if method == "GPTQ":
+         return [
+             GPTQModifier(
+                 targets="Linear",
+                 scheme=scheme,
+                 ignore=["lm_head", "re:visual.*", "re:model.visual.*"],
+                 sequential_targets=["Qwen2_5_VLDecoderLayer"],  # This enables sequential onloading
+             ),
+         ]
+     elif method == "AWQ":
+         # Create AWQ mappings for Qwen2.5-VL architecture
+         mappings = [
+             AWQMapping(
+                 "re:.*input_layernorm", ["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"]
+             ),
+             AWQMapping("re:.*v_proj", ["re:.*o_proj"]),
+             AWQMapping(
+                 "re:.*post_attention_layernorm", ["re:.*gate_proj", "re:.*up_proj"]
+             ),
+             AWQMapping("re:.*up_proj", ["re:.*down_proj"]),
+         ]
+         return [
+             AWQModifier(
+                 ignore=["lm_head", "re:visual.*", "re:model.visual.*"],
+                 scheme="W4A16_ASYM" if scheme == "W4A16" else scheme,
+                 targets=["Linear"],
+                 mappings=mappings,
+                 sequential_targets=["Qwen2_5_VLDecoderLayer"],  # Sequential onloading for memory efficiency
+             ),
+         ]
+     elif method == "FP8":
+         return [
+             QuantizationModifier(
+                 scheme="FP8",
+                 targets="Linear",
+                 ignore=["lm_head", "re:visual.*", "re:model.visual.*"]
+             )
+         ]
+     else:
+         raise ValueError(f"Unsupported quantization method: {method}")
+
+
+ def quantize_huihui_fara_model(
+     model_id: str = "huihui-ai/Huihui-Fara-7B-abliterated",
+     quantization_method: str = "GPTQ",
+     output_dir: str = None,
+     dataset_id: str = "wikitext",
+     dataset_config: str = "wikitext-2-raw-v1",
+     dataset_split: str = "train[:1%]",
+     num_calibration_samples: int = 64,
+     max_sequence_length: int = 512,
+     scheme: str = "W4A16",
+     trust_remote_code: bool = True,
+ ):
+     """
+     Quantizes the huihui-ai/Huihui-Fara-7B-abliterated model with proper Qwen2.5-VL architecture support.
+
+     Args:
+         model_id: Hugging Face model ID to quantize
+         quantization_method: Method to use ("GPTQ", "AWQ", or "FP8")
+         output_dir: Directory to save the quantized model
+         dataset_id: Dataset ID for calibration
+         dataset_config: Dataset config for calibration
+         dataset_split: Dataset split for calibration
+         num_calibration_samples: Number of samples to use for calibration
+         max_sequence_length: Maximum sequence length for processing
+         scheme: Quantization scheme (e.g., "W4A16", "W8A8")
+         trust_remote_code: Whether to trust remote code in model loading
+
+     Returns:
+         Quantized model
+     """
+     print(f"Loading model: {model_id}")
+     model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+         model_id,
+         torch_dtype=torch.float16,  # Use float16 to save memory
+         device_map="auto",  # Auto device mapping for memory efficiency
+         trust_remote_code=trust_remote_code
+     )
+
+     print(f"Loading processor for: {model_id}")
+     processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=trust_remote_code)
+
+     # If output directory not specified, create one based on model and method
+     if not output_dir:
+         model_name = model_id.rstrip("/").split("/")[-1]
+         output_dir = f"{model_name}-{scheme.replace(':', '-')}-{quantization_method}"
+
+     print(f"Output directory: {output_dir}")
+
+     # Load dataset and preprocess
+     print(f"Loading dataset: {dataset_id}")
+     try:
+         # Try to load a multimodal dataset first
+         ds = load_dataset("lmms-lab/flickr30k", split="test[:64]")
+         print("Using multimodal dataset for calibration")
+
+         preprocess_fn = create_qwen2_5_vl_preprocessing_fn(processor, max_sequence_length)
+         ds = ds.map(preprocess_fn, remove_columns=ds.column_names)
+     except Exception as e:
+         print(f"Failed to load multimodal dataset: {e}, falling back to text-only dataset")
+         # If multimodal dataset fails, use text-only
+         ds = load_dataset(dataset_id, dataset_config, split=dataset_split)
+         ds = ds.shuffle(seed=42)
+
+         # Text-only preprocessing
+         def text_only_preprocess(example):
+             text = example.get('text', example.get('content', str(example)))
+             if not isinstance(text, str):
+                 text = str(text)
+             # Limit text length to avoid exceeding max sequence length
+             text = text[:500] + "..." if len(text) > 500 else text
+             messages = [
+                 {
+                     "role": "user",
+                     "content": [
+                         {"type": "text", "text": text},
+                     ],
+                 }
+             ]
+             prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+             return processor(text=[prompt], padding=False, max_length=max_sequence_length, truncation=True)
+
+         ds = ds.map(text_only_preprocess, remove_columns=ds.column_names)
+
+     # Define data collator
+     data_collator = create_qwen2_5_vl_data_collator()
+
+     # Create recipe
+     recipe = get_qwen2_5_vl_quantization_recipe(quantization_method, scheme)
+
+     print(f"Starting quantization with method: {quantization_method}")
+     print(f"Using recipe: {recipe}")
+     print(f"Using sequential targets: {[mod.sequential_targets if hasattr(mod, 'sequential_targets') else 'N/A' for mod in recipe]}")
+
+     # Perform oneshot quantization with sequential onloading for memory efficiency
+     oneshot(
+         model=model,
+         tokenizer=processor,  # Use processor as tokenizer for Qwen2.5-VL
+         dataset=ds,
+         recipe=recipe,
+         max_seq_length=max_sequence_length,
+         num_calibration_samples=num_calibration_samples,
+         trust_remote_code_model=trust_remote_code,
+         data_collator=data_collator,
+         save_compressed=True,
+         output_dir=output_dir,
+     )
+
+     print(f"Quantization completed! Model saved to: {output_dir}")
+
+     # Save the processor as well
+     processor.save_pretrained(output_dir)
+
+     return model
+
+
+ def test_quantized_model(model, processor, max_sequence_length: int = 2048):
+     """
+     Tests the quantized model with a sample generation.
+     """
+     print("========== SAMPLE GENERATION ==============")
+     try:
+         dispatch_for_generation(model)
+         # Simple text-only test first
+         messages = [
+             {
+                 "role": "user",
+                 "content": [
+                     {"type": "text", "text": "Hello, how are you today?"},
+                 ],
+             }
+         ]
+         prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
+         inputs = processor(
+             text=[prompt],
+             padding=False,
+             max_length=max_sequence_length,
+             truncation=True,
+             return_tensors="pt",
+         ).to(model.device)
+
+         output = model.generate(**inputs, max_new_tokens=50)
+         result = processor.decode(output[0], skip_special_tokens=True)
+         print(result)
+         print("==========================================")
+         return result
+     except Exception as e:
+         print(f"Test generation failed: {e}")
+         import traceback
+         traceback.print_exc()
+         return None
+
+
+ def main():
+     """
+     Main function to quantize the Huihui-Fara model.
+     """
+     import argparse
+
+     parser = argparse.ArgumentParser(description="Quantize huihui-ai/Huihui-Fara-7B-abliterated model")
+     parser.add_argument("--model_id", type=str, default="huihui-ai/Huihui-Fara-7B-abliterated",
+                         help="Model ID to quantize")
+     parser.add_argument("--method", type=str, choices=["GPTQ", "AWQ", "FP8"],
+                         default="GPTQ", help="Quantization method to use")
+     parser.add_argument("--output_dir", type=str, default=None,
+                         help="Output directory for quantized model")
+     parser.add_argument("--dataset_id", type=str, default="wikitext",
+                         help="Dataset for calibration (default: wikitext)")
+     parser.add_argument("--scheme", type=str, default="W4A16",
+                         help="Quantization scheme (e.g., W4A16, W8A8)")
+     parser.add_argument("--num_samples", type=int, default=64,
+                         help="Number of calibration samples")
+
+     args = parser.parse_args()
+
+     print(f"Starting quantization of {args.model_id} using {args.method}")
+     print("Note: This may take a while and will use sequential onloading for memory efficiency...")
+
+     try:
+         # Quantize the model
+         quantized_model = quantize_huihui_fara_model(
+             model_id=args.model_id,
+             quantization_method=args.method,
+             output_dir=args.output_dir,
+             dataset_id=args.dataset_id,
+             num_calibration_samples=args.num_samples,
+             scheme=args.scheme
+         )
+
+         # Test the model
+         processor = AutoProcessor.from_pretrained(args.model_id, trust_remote_code=True)
+         test_quantized_model(quantized_model, processor)
+
+         print(f"✅ Successfully quantized {args.model_id} with {args.method}")
+         print(f"Model saved to: {args.output_dir or args.model_id.split('/')[-1] + f'-{args.scheme}-{args.method}'}")
+
+     except Exception as e:
+         print(f"❌ Quantization failed: {e}")
+         import traceback
+         traceback.print_exc()
+
+
+ if __name__ == "__main__":
+     main()
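
For reference, a minimal programmatic invocation of the script above might look like the following sketch. It assumes the imports at the top of the file (torch, datasets, transformers, qwen_vl_utils, llmcompressor) resolve in the environment and that enough GPU memory is available for the 7B model; the argument values simply mirror the script's own defaults. The CLI equivalent would be: python quantize_huihui_fara.py --method GPTQ --scheme W4A16 --num_samples 64

from quantize_huihui_fara import quantize_huihui_fara_model, test_quantized_model
from transformers import AutoProcessor

# Quantize with GPTQ at W4A16 using the script's default calibration settings.
model = quantize_huihui_fara_model(
    model_id="huihui-ai/Huihui-Fara-7B-abliterated",
    quantization_method="GPTQ",
    scheme="W4A16",
    num_calibration_samples=64,
)

# Sanity-check the quantized model with a short text-only generation.
processor = AutoProcessor.from_pretrained(
    "huihui-ai/Huihui-Fara-7B-abliterated", trust_remote_code=True
)
test_quantized_model(model, processor)
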
quantize_qwen2_5_vl.py ADDED
@@ -0,0 +1,396 @@
+ #!/usr/bin/env python
+ """
+ Specialized script for quantizing Qwen2.5-VL models with sequential onloading.
+ Handles quantization of Qwen2_5_VLForConditionalGeneration models properly.
+ """
+
+ import base64
+ from io import BytesIO
+ from typing import Optional, Union, Dict, Any
+ import torch
+ from datasets import load_dataset
+ from qwen_vl_utils import process_vision_info
+ from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, AutoTokenizer
+
+ from llmcompressor import oneshot
+ from llmcompressor.modifiers.quantization import GPTQModifier, QuantizationModifier
+ from llmcompressor.modifiers.awq import AWQModifier, AWQMapping
+ from llmcompressor.utils import dispatch_for_generation
+
+
+ def create_qwen2_5_vl_data_collator():
+     """Create a data collator for Qwen2.5-VL models that handles multimodal inputs."""
+     def data_collator(batch):
+         assert len(batch) == 1
+         return {key: torch.tensor(value) if isinstance(value, (list, int, float)) else value
+                 for key, value in batch[0].items()}
+     return data_collator
+
+
+ def create_qwen2_5_vl_preprocessing_fn(processor, max_sequence_length: int = 2048):
+     """Create a preprocessing function for Qwen2.5-VL datasets."""
+     def preprocess_and_tokenize(example):
+         # Handle different image formats
+         if 'image' in example:
+             # Process image
+             if hasattr(example['image'], 'save'):
+                 # PIL Image object
+                 buffered = BytesIO()
+                 example["image"].save(buffered, format="PNG")
+                 encoded_image = base64.b64encode(buffered.getvalue())
+                 encoded_image_text = encoded_image.decode("utf-8")
+                 base64_qwen = f"data:image;base64,{encoded_image_text}"
+             else:
+                 # Already a string or other format
+                 base64_qwen = str(example["image"])
+         else:
+             # If there's no image field, try 'img' or similar
+             img_key = None
+             for key in example.keys():
+                 if 'image' in key.lower() or 'img' in key.lower():
+                     img_key = key
+                     break
+             if img_key:
+                 if hasattr(example[img_key], 'save'):
+                     buffered = BytesIO()
+                     example[img_key].save(buffered, format="PNG")
+                     encoded_image = base64.b64encode(buffered.getvalue())
+                     encoded_image_text = encoded_image.decode("utf-8")
+                     base64_qwen = f"data:image;base64,{encoded_image_text}"
+                 else:
+                     base64_qwen = str(example[img_key])
+             else:
+                 # If no image, create a simple text-only example
+                 messages = [
+                     {
+                         "role": "user",
+                         "content": [
+                             {"type": "text", "text": example.get('text', example.get('content', 'What can you tell me about this?'))},
+                         ],
+                     }
+                 ]
+                 text = processor.apply_chat_template(
+                     messages, tokenize=False, add_generation_prompt=True
+                 )
+
+                 return processor(
+                     text=[text],
+                     padding=False,
+                     max_length=max_sequence_length,
+                     truncation=True,
+                 )
+
+         # Create message with image
+         messages = [
+             {
+                 "role": "user",
+                 "content": [
+                     {"type": "image", "image": base64_qwen},
+                     {"type": "text", "text": "What does the image show?"},
+                 ],
+             }
+         ]
+         text = processor.apply_chat_template(
+             messages, tokenize=False, add_generation_prompt=True
+         )
+         image_inputs, video_inputs = process_vision_info(messages)
+
+         # tokenize
+         return processor(
+             text=[text],
+             images=image_inputs,
+             videos=video_inputs,
+             padding=False,
+             max_length=max_sequence_length,
+             truncation=True,
+         )
+
+     return preprocess_and_tokenize
+
+
+ def get_qwen2_5_vl_quantization_recipe(method: str, scheme: str = "W4A16"):
+     """
+     Creates the appropriate quantization recipe for Qwen2.5-VL models.
+
+     Args:
+         method: Quantization method ("GPTQ", "AWQ", or "FP8")
+         scheme: Quantization scheme (e.g., "W4A16", "W8A8", "FP8")
+
+     Returns:
+         List of modifiers for the quantization recipe
+     """
+     if method == "GPTQ":
+         return [
+             GPTQModifier(
+                 targets="Linear",
+                 scheme=scheme,
+                 ignore=["lm_head", "re:visual.*", "re:model.visual.*"],
+                 sequential_targets=["Qwen2_5_VLDecoderLayer"],  # This is key for the architecture
+             ),
+         ]
+     elif method == "AWQ":
+         # Create AWQ mappings for Qwen2.5-VL architecture
+         mappings = [
+             AWQMapping(
+                 "re:.*input_layernorm", ["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"]
+             ),
+             AWQMapping("re:.*v_proj", ["re:.*o_proj"]),
+             AWQMapping(
+                 "re:.*post_attention_layernorm", ["re:.*gate_proj", "re:.*up_proj"]
+             ),
+             AWQMapping("re:.*up_proj", ["re:.*down_proj"]),
+         ]
+         return [
+             AWQModifier(
+                 ignore=["lm_head", "re:visual.*", "re:model.visual.*"],
+                 scheme="W4A16_ASYM" if scheme == "W4A16" else scheme,
+                 targets=["Linear"],
+                 mappings=mappings,
+             ),
+         ]
+     elif method == "FP8":
+         return [
+             QuantizationModifier(
+                 scheme="FP8",
+                 targets="Linear",
+                 ignore=["lm_head", "re:visual.*", "re:model.visual.*"]
+             )
+         ]
+     else:
+         raise ValueError(f"Unsupported quantization method: {method}")
+
+
+ def quantize_qwen2_5_vl_model(
+     model_id: str,
+     quantization_method: str,
+     output_dir: Optional[str] = None,
+     dataset_id: str = "lmms-lab/flickr30k",
+     dataset_split: str = "test[:512]",
+     num_calibration_samples: int = 512,
+     max_sequence_length: int = 2048,
+     scheme: str = "W4A16",
+     trust_remote_code: bool = True,
+ ):
+     """
+     Quantizes a Qwen2.5-VL model with proper architecture handling and sequential onloading.
+
+     Args:
+         model_id: Hugging Face model ID to quantize
+         quantization_method: Method to use ("GPTQ", "AWQ", or "FP8")
+         output_dir: Directory to save the quantized model
+         dataset_id: Dataset ID for calibration
+         dataset_split: Dataset split for calibration
+         num_calibration_samples: Number of samples to use for calibration
+         max_sequence_length: Maximum sequence length for processing
+         scheme: Quantization scheme (e.g., "W4A16", "W8A8")
+         trust_remote_code: Whether to trust remote code in model loading
+
+     Returns:
+         Quantized model
+     """
+     print(f"Loading model: {model_id}")
+     model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+         model_id,
+         torch_dtype="auto",
+         device_map=None,  # Let the system decide device mapping
+         trust_remote_code=trust_remote_code
+     )
+
+     print(f"Loading processor for: {model_id}")
+     processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=trust_remote_code)
+
+     # If output directory not specified, create one based on model and method
+     if not output_dir:
+         model_name = model_id.rstrip("/").split("/")[-1]
+         output_dir = f"{model_name}-{scheme.replace(':', '-')}-{quantization_method}"
+
+     print(f"Output directory: {output_dir}")
+
+     # Load dataset and preprocess
+     print(f"Loading dataset: {dataset_id}")
+     try:
+         ds = load_dataset(dataset_id, split=dataset_split)
+     except Exception as e:
+         print(f"Failed to load {dataset_id}, trying alternative text-only dataset: {e}")
+         # If the image dataset fails, try a text-only dataset
+         ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:512]")
+         # We'll need to adjust preprocessing for text-only data
+
+     ds = ds.shuffle(seed=42)
+
+     # Apply preprocessing
+     preprocess_fn = create_qwen2_5_vl_preprocessing_fn(processor, max_sequence_length)
+     try:
+         ds = ds.map(preprocess_fn, remove_columns=ds.column_names if hasattr(ds, 'column_names') else [])
+     except Exception as e:
+         print(f"Preprocessing failed: {e}")
+         print("Trying simpler preprocessing with text-only data...")
+         # Fallback: use text-only preprocessing
+         def text_only_preprocess(example):
+             text = example.get('text', example.get('content', str(example)))
+             if not isinstance(text, str):
+                 text = str(text)
+             messages = [
+                 {
+                     "role": "user",
+                     "content": [
+                         {"type": "text", "text": text[:500] + "..." if len(text) > 500 else text},  # Limit length
+                     ],
+                 }
+             ]
+             prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+             return processor(text=[prompt], padding=False, max_length=max_sequence_length, truncation=True)
+
+         ds = ds.map(text_only_preprocess, remove_columns=ds.column_names if hasattr(ds, 'column_names') else [])
+
+     # Define data collator
+     data_collator = create_qwen2_5_vl_data_collator()
+
+     # Create recipe
+     recipe = get_qwen2_5_vl_quantization_recipe(quantization_method, scheme)
+
+     print(f"Starting quantization with method: {quantization_method}")
+     print(f"Using recipe: {recipe}")
+
+     # Perform oneshot quantization with sequential targets and proper handling
+     oneshot(
+         model=model,
+         tokenizer=processor,  # Use processor as tokenizer for Qwen2.5-VL
+         dataset=ds,
+         recipe=recipe,
+         max_seq_length=max_sequence_length,
+         num_calibration_samples=num_calibration_samples,
+         trust_remote_code_model=trust_remote_code,
+         data_collator=data_collator,
+         # Use sequential onloading for memory efficiency
+         sequential_targets=["Qwen2_5_VLDecoderLayer"],
+         save_compressed=True,
+         output_dir=output_dir,
+     )
+
+     print(f"Quantization completed! Model saved to: {output_dir}")
+
+     # Save the processor as well
+     processor.save_pretrained(output_dir)
+
+     return model
+
+
+ def test_quantized_model(model, processor, max_sequence_length: int = 2048):
+     """
+     Tests the quantized model with a sample generation.
+     """
+     print("========== SAMPLE GENERATION ==============")
+     try:
+         dispatch_for_generation(model)
+         messages = [
+             {
+                 "role": "user",
+                 "content": [
+                     {
+                         "type": "image",
+                         "image": "http://images.cocodataset.org/train2017/000000231895.jpg",
+                     },
+                     {"type": "text", "text": "Please describe the animal in this image\n"},
+                 ],
+             }
+         ]
+         prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
+         image_inputs, video_inputs = process_vision_info(messages)
+         inputs = processor(
+             text=[prompt],
+             images=image_inputs,
+             videos=video_inputs,
+             padding=False,
+             max_length=max_sequence_length,
+             truncation=True,
+             return_tensors="pt",
+         ).to(model.device)
+
+         output = model.generate(**inputs, max_new_tokens=100)
+         result = processor.decode(output[0], skip_special_tokens=True)
+         print(result)
+         print("==========================================")
+         return result
+     except Exception as e:
+         print(f"Test generation failed: {e}")
+         print("Trying text-only generation...")
+         # Try with text-only
+         try:
+             messages = [
+                 {
+                     "role": "user",
+                     "content": [
+                         {"type": "text", "text": "Hello, how are you today?"},
+                     ],
+                 }
+             ]
+             prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
+             inputs = processor(
+                 text=[prompt],
+                 padding=False,
+                 max_length=max_sequence_length,
+                 truncation=True,
+                 return_tensors="pt",
+             ).to(model.device)
+
+             output = model.generate(**inputs, max_new_tokens=50)
+             result = processor.decode(output[0], skip_special_tokens=True)
+             print(result)
+             print("==========================================")
+             return result
+         except Exception as e2:
+             print(f"Text-only generation also failed: {e2}")
+             return None
+
+
+ def main():
+     """
+     Main function to demonstrate quantization of Qwen2.5-VL models.
+     """
+     import argparse
+
+     parser = argparse.ArgumentParser(description="Quantize Qwen2.5-VL models")
+     parser.add_argument("--model_id", type=str, required=True,
+                         help="Model ID to quantize (e.g., 'huihui-ai/Huihui-Fara-7B-abliterated')")
+     parser.add_argument("--method", type=str, choices=["GPTQ", "AWQ", "FP8"],
+                         default="GPTQ", help="Quantization method to use")
+     parser.add_argument("--output_dir", type=str, default=None,
+                         help="Output directory for quantized model")
+     parser.add_argument("--dataset_id", type=str, default="lmms-lab/flickr30k",
+                         help="Dataset for calibration (default: lmms-lab/flickr30k)")
+     parser.add_argument("--scheme", type=str, default="W4A16",
+                         help="Quantization scheme (e.g., W4A16, W8A8)")
+     parser.add_argument("--num_samples", type=int, default=128,
+                         help="Number of calibration samples")
+
+     args = parser.parse_args()
+
+     print(f"Starting quantization of {args.model_id} using {args.method}")
+
+     try:
+         # Quantize the model
+         quantized_model = quantize_qwen2_5_vl_model(
+             model_id=args.model_id,
+             quantization_method=args.method,
+             output_dir=args.output_dir,
+             dataset_id=args.dataset_id,
+             num_calibration_samples=args.num_samples,
+             scheme=args.scheme
+         )
+
+         # Test the model
+         # Load the processor again to test
+         processor = AutoProcessor.from_pretrained(args.model_id, trust_remote_code=True)
+         test_quantized_model(quantized_model, processor)
+
+         print(f"Successfully quantized {args.model_id} with {args.method}")
+
+     except Exception as e:
+         print(f"Quantization failed: {e}")
+         import traceback
+         traceback.print_exc()
+
+
+ if __name__ == "__main__":
+     main()
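
A similar sketch for this more general script, under the same dependency assumptions; the model ID is only an example of a Qwen2_5_VLForConditionalGeneration checkpoint, and the sample count mirrors the CLI default (--num_samples 128). The CLI equivalent would be: python quantize_qwen2_5_vl.py --model_id Qwen/Qwen2.5-VL-7B-Instruct --method GPTQ --scheme W4A16 --num_samples 128

from quantize_qwen2_5_vl import quantize_qwen2_5_vl_model, test_quantized_model
from transformers import AutoProcessor

# model_id is required here; calibration defaults to lmms-lab/flickr30k with the
# text-only wikitext fallback implemented above.
model = quantize_qwen2_5_vl_model(
    model_id="Qwen/Qwen2.5-VL-7B-Instruct",
    quantization_method="GPTQ",
    scheme="W4A16",
    num_calibration_samples=128,
)

processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct", trust_remote_code=True)
test_quantized_model(model, processor)
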
test_final_solution.py ADDED
@@ -0,0 +1,103 @@
+ #!/usr/bin/env python
+ """
+ Final verification test after implementing proper AWQ incompatibility with Qwen2.5-VL models
+ """
+
+ from app import get_quantization_recipe
+
+ def test_qwen2_5_vl_compatible_methods():
+     """
+     Test all methods that should work with Qwen2.5-VL models
+     """
+     print("Testing quantization methods compatible with Qwen2.5-VL models...")
+
+     # Methods that should work
+     compatible_methods = ["GPTQ", "W4A16", "W8A16", "W8A8_INT8", "W8A8_FP8", "FP8"]
+
+     all_passed = True
+     for method in compatible_methods:
+         try:
+             recipe = get_quantization_recipe(method, "Qwen2_5_VLForConditionalGeneration")
+             print(f"✓ {method} works with Qwen2_5_VLForConditionalGeneration")
+             if hasattr(recipe[0], 'sequential_targets') and recipe[0].sequential_targets:
+                 print(f" - Uses sequential onloading: {recipe[0].sequential_targets}")
+             print(f" - Ignore patterns: {recipe[0].ignore}")
+         except Exception as e:
+             print(f"✗ {method} failed: {e}")
+             all_passed = False
+
+     return all_passed
+
+ def test_awq_incompatibility():
+     """
+     Test that AWQ properly fails for Qwen2.5-VL models
+     """
+     print("\nTesting AWQ incompatibility with Qwen2.5-VL models...")
+
+     try:
+         recipe = get_quantization_recipe("AWQ", "Qwen2_5_VLForConditionalGeneration")
+         print("✗ AWQ unexpectedly succeeded for Qwen2.5-VL (should have failed)")
+         return False
+     except ValueError as e:
+         if "not compatible" in str(e) and "rotary positional embeddings" in str(e):
+             print(f"✓ AWQ properly fails for Qwen2.5-VL: {e}")
+             return True
+         else:
+             print(f"✗ AWQ failed but with wrong error: {e}")
+             return False
+
+ def test_awq_still_works_for_llama():
+     """
+     Test that AWQ still works for Llama models
+     """
+     print("\nTesting AWQ still works for Llama models...")
+
+     try:
+         recipe = get_quantization_recipe("AWQ", "LlamaForCausalLM")
+         print(f"✓ AWQ still works for LlamaForCausalLM")
+         print(f" - Ignore patterns: {recipe[0].ignore}")
+         return True
+     except Exception as e:
+         print(f"✗ AWQ failed for LlamaForCausalLM: {e}")
+         return False
+
+ def test_target_model():
+     """
+     Test with the specific target model
+     """
+     print(f"\nTesting with target model architecture: Qwen2_5_VLForConditionalGeneration")
+
+     # All methods except AWQ should work
+     methods = ["GPTQ", "W4A16", "W8A16", "W8A8_INT8", "W8A8_FP8", "FP8"]
+
+     success_count = 0
+     for method in methods:
+         try:
+             recipe = get_quantization_recipe(method, "Qwen2_5_VLForConditionalGeneration")
+             success_count += 1
+         except Exception as e:
+             print(f"Method {method} failed: {e}")
+
+     print(f"✓ {success_count}/{len(methods)} methods work for target model")
+     return success_count == len(methods)
+
+ if __name__ == "__main__":
+     print("Final verification after fixing AWQ incompatibility issue\n")
+
+     test1 = test_qwen2_5_vl_compatible_methods()
+     test2 = test_awq_incompatibility()
+     test3 = test_awq_still_works_for_llama()
+     test4 = test_target_model()
+
+     print(f"\n{'='*60}")
+     if test1 and test2 and test3 and test4:
+         print("✅ ALL TESTS PASSED")
+         print("\nSOLUTION SUMMARY:")
+         print("• AWQ is now properly blocked for Qwen2.5-VL models due to incompatibility")
+         print("• All other methods (GPTQ, W4A16, W8A16, W8A8_INT8, W8A8_FP8, FP8) work for Qwen2.5-VL")
+         print("• AWQ still works for Llama models as expected")
+         print("• Sequential onloading is preserved for memory efficiency")
+         print("• Users will get clear error messages when trying incompatible methods")
+     else:
+         print("❌ SOME TESTS FAILED")
+     print(f"{'='*60}")
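
The tests in this file (and the ones below) import get_quantization_recipe, and later determine_model_class, from the Space's app.py, which is not shown in this commit. A rough sketch of the contract this file's tests assume:

from app import get_quantization_recipe

# A supported method returns a list of llmcompressor modifiers; these tests expect
# the first modifier to expose sequential_targets (for sequential onloading) and an
# ignore list that excludes lm_head and the vision tower.
recipe = get_quantization_recipe("GPTQ", "Qwen2_5_VLForConditionalGeneration")
print(recipe[0].sequential_targets)  # e.g. ["Qwen2_5_VLDecoderLayer"]
print(recipe[0].ignore)              # e.g. ["lm_head", "re:visual.*", ...]

# This file's tests expect AWQ with Qwen2.5-VL to raise ValueError.
try:
    get_quantization_recipe("AWQ", "Qwen2_5_VLForConditionalGeneration")
except ValueError as err:
    print(f"AWQ rejected as expected: {err}")
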
test_final_verification.py ADDED
@@ -0,0 +1,123 @@
+ #!/usr/bin/env python
+ """
+ Final test to confirm the original issue is resolved:
+ GPTQ quantization is not supported for Qwen2_5_VLForConditionalGeneration architecture
+ """
+
+ from app import get_quantization_recipe
+
+ def test_original_issue_fixed():
+     """
+     Test to confirm the original error is fixed.
+     The original error was:
+     GPTQ quantization is not supported for Qwen2_5_VLForConditionalGeneration architecture
+     """
+     print("Testing the original issue that was reported...")
+     print("Original error: GPTQ quantization is not supported for Qwen2_5_VLForConditionalGeneration architecture")
+     print()
+
+     # Test the original problematic case
+     try:
+         recipe = get_quantization_recipe("GPTQ", "Qwen2_5_VLForConditionalGeneration")
+         print("✓ GPTQ quantization recipe created successfully for Qwen2_5_VLForConditionalGeneration")
+         print(f" Recipe: {recipe}")
+         if hasattr(recipe[0], 'sequential_targets') and recipe[0].sequential_targets:
+             print(f" Uses sequential onloading: {recipe[0].sequential_targets}")
+         print(f" Ignores visual components: {recipe[0].ignore}")
+         success_gptq = True
+     except Exception as e:
+         print(f"✗ GPTQ still fails: {e}")
+         success_gptq = False
+
+     print()
+
+     # Test other methods that were also problematic
+     other_methods = ["AWQ", "FP8"]
+     success_others = True
+     for method in other_methods:
+         try:
+             recipe = get_quantization_recipe(method, "Qwen2_5_VLForConditionalGeneration")
+             print(f"✓ {method} quantization recipe created successfully for Qwen2_5_VLForConditionalGeneration")
+             if hasattr(recipe[0], 'sequential_targets') and recipe[0].sequential_targets:
+                 print(f" Uses sequential onloading: {recipe[0].sequential_targets}")
+             success_others = success_others and True
+         except Exception as e:
+             print(f"✗ {method} still fails: {e}")
+             success_others = False
+
+     print()
+
+     # Test new methods for Qwen2.5-VL
+     new_methods = ["W4A16", "W8A16", "W8A8_INT8", "W8A8_FP8"]
+     success_new = True
+     for method in new_methods:
+         try:
+             recipe = get_quantization_recipe(method, "Qwen2_5_VLForConditionalGeneration")
+             print(f"✓ {method} quantization recipe created successfully for Qwen2_5_VLForConditionalGeneration")
+             success_new = success_new and True
+         except Exception as e:
+             print(f"✗ {method} fails: {e}")
+             success_new = False
+
+     print()
+
+     if success_gptq and success_others and success_new:
+         print("🎉 SUCCESS: The original issue has been completely resolved!")
+         print(" - GPTQ now works for Qwen2_5_VLForConditionalGeneration")
+         print(" - AWQ now works for Qwen2_5_VLForConditionalGeneration")
+         print(" - FP8 now works for Qwen2_5_VLForConditionalGeneration")
+         print(" - New methods (W4A16, W8A16, W8A8_INT8, W8A8_FP8) also work!")
+         print(" - Sequential onloading is used for memory efficiency")
+         print(" - Visual components are properly ignored during quantization")
+         return True
+     else:
+         print("❌ FAILURE: Some issues remain")
+         return False
+
+ def test_specific_model():
+     """
+     Test with the specific model mentioned: huihui-ai/Huihui-Fara-7B-abliterated
+     """
+     print("\n" + "="*60)
+     print("Testing with the specific model: huihui-ai/Huihui-Fara-7B-abliterated")
+     print("(This model has architecture: Qwen2_5_VLForConditionalGeneration)")
+     print("="*60)
+
+     # All the methods that should now work for this model
+     methods = ["GPTQ", "AWQ", "FP8", "W4A16", "W8A16", "W8A8_INT8", "W8A8_FP8"]
+
+     success = True
+     for method in methods:
+         try:
+             recipe = get_quantization_recipe(method, "Qwen2_5_VLForConditionalGeneration")
+             print(f"✓ {method}: OK")
+         except Exception as e:
+             print(f"✗ {method}: FAILED - {e}")
+             success = False
+
+     if success:
+         print(f"\n🎉 All {len(methods)} quantization methods now work for the target model!")
+         print("Users can now quantize huihui-ai/Huihui-Fara-7B-abliterated with any of these methods.")
+     else:
+         print("\n❌ Some methods still don't work for the target model.")
+
+     return success
+
+ if __name__ == "__main__":
+     print("Testing resolution of the original quantization issue...\n")
+
+     issue_fixed = test_original_issue_fixed()
+     model_specific = test_specific_model()
+
+     print("\n" + "="*60)
+     if issue_fixed and model_specific:
+         print("✅ ALL TESTS PASSED - The issue is completely resolved!")
+         print("\nThe Hugging Face Space now supports:")
+         print(" • All original methods: GPTQ, AWQ, FP8")
+         print(" • New methods: W4A16, W8A16, W8A8_INT8, W8A8_FP8")
+         print(" • Sequential onloading for memory efficiency")
+         print(" • Proper handling of Qwen2.5-VL visual components")
+         print(" • All methods work with Qwen2_5_VLForConditionalGeneration models")
+     else:
+         print("❌ SOME TESTS FAILED - Issue may not be completely resolved")
+     print("="*60)
test_new_quantization_methods.py ADDED
@@ -0,0 +1,64 @@
+ #!/usr/bin/env python
+ """
+ Test script to verify that the new quantization methods work with Qwen2.5-VL architecture
+ """
+
+ from app import get_quantization_recipe
+ import torch
+
+ def test_new_quantization_methods():
+     """
+     Test the new quantization methods with Qwen2.5-VL architecture.
+     """
+     architectures = ["Qwen2_5_VLForConditionalGeneration"]
+
+     # Test all the new quantization methods
+     new_methods = ["W4A16", "W8A16", "W8A8_INT8", "W8A8_FP8", "FP8", "SmoothQuant", "SparseGPT"]
+
+     print(f"Testing new quantization methods with architecture: {architectures[0]}")
+
+     for method in new_methods:
+         print(f"\nTesting {method} quantization recipe...")
+         try:
+             if method in ["SmoothQuant", "SparseGPT"] and architectures[0] == "Qwen2_5_VLForConditionalGeneration":
+                 # These methods don't support Qwen2_5_VLForConditionalGeneration, so they should raise an error
+                 try:
+                     recipe = get_quantization_recipe(method, architectures[0])
+                     print(f"✗ {method} should not be supported for Qwen2.5-VL but it didn't raise an error")
+                 except ValueError as e:
+                     print(f"✓ {method} correctly raises error for Qwen2.5-VL: {e}")
+             else:
+                 recipe = get_quantization_recipe(method, architectures[0])
+                 print(f"✓ {method} recipe created successfully: {recipe}")
+                 if hasattr(recipe[0], 'scheme'):
+                     print(f" Scheme: {recipe[0].scheme}")
+                 if hasattr(recipe[0], 'sequential_targets') and recipe[0].sequential_targets:
+                     print(f" Sequential targets: {recipe[0].sequential_targets}")
+                 if hasattr(recipe[0], 'ignore'):
+                     print(f" Ignore layers: {recipe[0].ignore}")
+         except ValueError as e:
+             if method in ["SmoothQuant", "SparseGPT"]:
+                 # These are expected to not work with Qwen2.5-VL
+                 print(f"✓ {method} correctly not supported for Qwen2.5-VL: {e}")
+             else:
+                 print(f"✗ Error with {method}: {e}")
+         except Exception as e:
+             print(f"✗ Unexpected error with {method}: {e}")
+
+     # Test that Llama models still work with all methods
+     print(f"\n\nTesting LlamaForCausalLM compatibility...")
+     llama_arch = "LlamaForCausalLM"
+     for method in new_methods:
+         print(f"Testing {method} with {llama_arch}...")
+         try:
+             recipe = get_quantization_recipe(method, llama_arch)
+             print(f"✓ {method} works with {llama_arch}")
+         except Exception as e:
+             print(f"✗ {method} failed with {llama_arch}: {e}")
+
+     return True
+
+ if __name__ == "__main__":
+     print("Testing new quantization methods...\n")
+     test_new_quantization_methods()
+     print("\n✓ Testing of new quantization methods completed!")
test_qwen2_5_vl_architecture.py ADDED
@@ -0,0 +1,71 @@
+ #!/usr/bin/env python
+ """
+ Test script to verify that the Qwen2.5-VL architecture detection and quantization recipe work correctly
+ """
+
+ from transformers import AutoConfig
+ from app import determine_model_class, get_quantization_recipe
+ import torch
+
+ def test_qwen2_5_vl_detection():
+     """
+     Test to see if we can properly detect the Qwen2.5-VL architecture.
+     We'll use a known Qwen2.5-VL model ID to test the detection.
+     """
+     # For testing purposes, use a known Qwen2.5-VL model ID
+     model_id = "Qwen/Qwen2.5-VL-7B-Instruct"  # Use a known Qwen2.5-VL model
+
+     # Simulate the architecture string that would come from the model config
+     # In the real scenario, this comes from model.config.architectures[0]
+     architectures = ["Qwen2_5_VLForConditionalGeneration"]
+
+     print(f"Testing architecture detection for: {model_id}")
+     print(f"Architectures found: {architectures}")
+
+     try:
+         # Test if our recipe function can handle this architecture
+         for method in ["GPTQ", "AWQ", "FP8"]:
+             print(f"\nTesting {method} quantization recipe...")
+             recipe = get_quantization_recipe(method, architectures[0])
+             print(f"{method} recipe created successfully: {recipe}")
+             print(f"Sequential targets: {[mod.sequential_targets if hasattr(mod, 'sequential_targets') else 'N/A' for mod in recipe]}")
+             print(f"Ignore layers: {[mod.ignore for mod in recipe if hasattr(mod, 'ignore')]}")
+
+         print("\n✓ All quantization methods work with Qwen2_5_VLForConditionalGeneration architecture")
+
+     except Exception as e:
+         print(f"\n✗ Error creating quantization recipe: {e}")
+         import traceback
+         traceback.print_exc()
+         return False
+
+     return True
+
+ def test_manual_model_class_detection():
+     """
+     Test the manual model class detection in the app.
+     """
+     print("\nTesting manual model class detection...")
+
+     manual_model_type = "Qwen2_5_VLForConditionalGeneration (Qwen2.5-VL)"
+     try:
+         model_class = determine_model_class("test", "dummy_token", manual_model_type)
+         print(f"Manual detection returned: {model_class}")
+         print("✓ Manual model class detection works")
+         return True
+     except Exception as e:
+         print(f"✗ Error in manual detection: {e}")
+         import traceback
+         traceback.print_exc()
+         return False
+
+ if __name__ == "__main__":
+     print("Testing Qwen2.5-VL architecture detection and quantization support...\n")
+
+     success1 = test_qwen2_5_vl_detection()
+     success2 = test_manual_model_class_detection()
+
+     if success1 and success2:
+         print("\n✓ All tests passed! Qwen2.5-VL models should now be properly supported.")
+     else:
+         print("\n✗ Some tests failed. Please check the implementation.")