File size: 5,232 Bytes
d95ff5b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/env python
"""
Final test to confirm the original issue is resolved:
GPTQ quantization is not supported for Qwen2_5_VLForConditionalGeneration architecture
"""

from app import get_quantization_recipe

def test_original_issue_fixed():
    """
    Test to confirm the original error is fixed.

    The original error was:
    GPTQ quantization is not supported for Qwen2_5_VLForConditionalGeneration architecture

    Returns:
        bool: True when GPTQ, the other original methods, and the new
        methods all produce a recipe without raising; False otherwise.
    """
    print("Testing the original issue that was reported...")
    print("Original error: GPTQ quantization is not supported for Qwen2_5_VLForConditionalGeneration architecture")
    print()

    # Test the original problematic case
    try:
        recipe = get_quantization_recipe("GPTQ", "Qwen2_5_VLForConditionalGeneration")
        print("βœ“ GPTQ quantization recipe created successfully for Qwen2_5_VLForConditionalGeneration")
        print(f"  Recipe: {recipe}")
        # Only the first modifier in the recipe is inspected for sequential
        # onloading / ignore settings — presumably the quantization modifier;
        # TODO(review): confirm recipe ordering against app.get_quantization_recipe.
        if hasattr(recipe[0], 'sequential_targets') and recipe[0].sequential_targets:
            print(f"  Uses sequential onloading: {recipe[0].sequential_targets}")
        print(f"  Ignores visual components: {recipe[0].ignore}")
        success_gptq = True
    except Exception as e:
        print(f"βœ— GPTQ still fails: {e}")
        success_gptq = False

    print()

    # Test other methods that were also problematic
    other_methods = ["AWQ", "FP8"]
    success_others = True
    for method in other_methods:
        try:
            recipe = get_quantization_recipe(method, "Qwen2_5_VLForConditionalGeneration")
            print(f"βœ“ {method} quantization recipe created successfully for Qwen2_5_VLForConditionalGeneration")
            if hasattr(recipe[0], 'sequential_targets') and recipe[0].sequential_targets:
                print(f"  Uses sequential onloading: {recipe[0].sequential_targets}")
            # No flag update needed on success: success_others starts True and
            # only flips to False on failure (was a redundant `and True` no-op).
        except Exception as e:
            print(f"βœ— {method} still fails: {e}")
            success_others = False

    print()

    # Test new methods for Qwen2.5-VL
    new_methods = ["W4A16", "W8A16", "W8A8_INT8", "W8A8_FP8"]
    success_new = True
    for method in new_methods:
        try:
            recipe = get_quantization_recipe(method, "Qwen2_5_VLForConditionalGeneration")
            print(f"βœ“ {method} quantization recipe created successfully for Qwen2_5_VLForConditionalGeneration")
            # Same pattern: the flag only changes on failure.
        except Exception as e:
            print(f"βœ— {method} fails: {e}")
            success_new = False

    print()

    if success_gptq and success_others and success_new:
        print("πŸŽ‰ SUCCESS: The original issue has been completely resolved!")
        print("   - GPTQ now works for Qwen2_5_VLForConditionalGeneration")
        print("   - AWQ now works for Qwen2_5_VLForConditionalGeneration") 
        print("   - FP8 now works for Qwen2_5_VLForConditionalGeneration")
        print("   - New methods (W4A16, W8A16, W8A8_INT8, W8A8_FP8) also work!")
        print("   - Sequential onloading is used for memory efficiency")
        print("   - Visual components are properly ignored during quantization")
        return True
    else:
        print("❌ FAILURE: Some issues remain")
        return False

def test_specific_model():
    """
    Test with the specific model mentioned: huihui-ai/Huihui-Fara-7B-abliterated

    Attempts every supported quantization method against the model's
    architecture and reports per-method success.

    Returns:
        bool: True when every method produced a recipe without raising.
    """
    separator = "=" * 60
    print("\n" + separator)
    print("Testing with the specific model: huihui-ai/Huihui-Fara-7B-abliterated")
    print("(This model has architecture: Qwen2_5_VLForConditionalGeneration)")
    print(separator)

    # All the methods that should now work for this model.
    methods = ["GPTQ", "AWQ", "FP8", "W4A16", "W8A16", "W8A8_INT8", "W8A8_FP8"]

    all_ok = True
    for quant_method in methods:
        try:
            get_quantization_recipe(quant_method, "Qwen2_5_VLForConditionalGeneration")
        except Exception as err:
            print(f"βœ— {quant_method}: FAILED - {err}")
            all_ok = False
        else:
            print(f"βœ“ {quant_method}: OK")

    if not all_ok:
        print("\n❌ Some methods still don't work for the target model.")
    else:
        print(f"\nπŸŽ‰ All {len(methods)} quantization methods now work for the target model!")
        print("Users can now quantize huihui-ai/Huihui-Fara-7B-abliterated with any of these methods.")

    return all_ok

if __name__ == "__main__":
    print("Testing resolution of the original quantization issue...\n")
    
    issue_fixed = test_original_issue_fixed()
    model_specific = test_specific_model()
    
    print("\n" + "="*60)
    if issue_fixed and model_specific:
        print("βœ… ALL TESTS PASSED - The issue is completely resolved!")
        print("\nThe Hugging Face Space now supports:")
        print("  β€’ All original methods: GPTQ, AWQ, FP8")
        print("  β€’ New methods: W4A16, W8A16, W8A8_INT8, W8A8_FP8")
        print("  β€’ Sequential onloading for memory efficiency")
        print("  β€’ Proper handling of Qwen2.5-VL visual components")
        print("  β€’ All methods work with Qwen2_5_VLForConditionalGeneration models")
    else:
        print("❌ SOME TESTS FAILED - Issue may not be completely resolved")
    print("="*60)