n00b001 committed
Commit 411ab9e · verified · 1 Parent(s): 8043cea

Create app.py

Files changed (1):
  app.py +183 -0
app.py ADDED
@@ -0,0 +1,183 @@
+ import gradio as gr
+ from huggingface_hub import HfApi, ModelCard, whoami
+ from gradio_huggingfacehub_search import HuggingfaceHubSearch
+ import os
+ from llmcompressor import oneshot
+ from llmcompressor.modifiers.quantization import QuantizationModifier, GPTQModifier
+ from llmcompressor.modifiers.awq import AWQModifier, AWQMapping
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
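+ # NOTE: besides gradio itself, this Space assumes gradio_huggingfacehub_search, llmcompressor,
+ # transformers, and huggingface_hub are available in the environment (e.g. listed in the
+ # Space's requirements.txt).
+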
+ # --- Helper Functions ---
+
+ def get_quantization_recipe(method, model_architecture):
+     """
+     Returns the appropriate llm-compressor recipe based on the selected method.
+     """
+     if method == "AWQ":
+         # Mappings for Llama-like architectures. This may need to be expanded
+         # for other model types.
+         mappings = [
+             AWQMapping("re:.*input_layernorm", ["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"]),
+             AWQMapping("re:.*v_proj", ["re:.*o_proj"]),
+             AWQMapping("re:.*post_attention_layernorm", ["re:.*gate_proj", "re:.*up_proj"]),
+             AWQMapping("re:.*up_proj", ["re:.*down_proj"]),
+         ]
+         return [
+             AWQModifier(
+                 ignore=["lm_head"],
+                 scheme="W4A16_ASYM",
+                 targets=["Linear"],
+                 mappings=mappings,
+             ),
+         ]
+     elif method == "GPTQ":
+         # Sequential targets need to be identified based on the model architecture.
+         # This is a common pattern for Llama-like models.
+         sequential_target_map = {
+             "LlamaForCausalLM": "LlamaDecoderLayer",
+             "MistralForCausalLM": "MistralDecoderLayer",
+             "MixtralForCausalLM": "MixtralDecoderLayer",
+         }
+         sequential_target = sequential_target_map.get(model_architecture, "LlamaDecoderLayer")
+
+         return [
+             GPTQModifier(
+                 targets="Linear",
+                 scheme="W4A16",
+                 sequential_targets=[sequential_target],
+                 ignore=["re:.*lm_head"],
+             ),
+         ]
+     elif method == "FP8":
+         # For MoE models, it's common to ignore the gate layers.
+         ignore_layers = ["lm_head"]
+         if "Mixtral" in model_architecture:
+             ignore_layers.append("re:.*block_sparse_moe.gate")
+
+         return QuantizationModifier(
+             scheme="FP8",
+             targets="Linear",
+             ignore=ignore_layers,
+         )
+     else:
+         raise ValueError(f"Unsupported quantization method: {method}")
+
+
+ def compress_and_upload(model_id: str, quant_method: str, oauth_token: gr.OAuthToken | None):
+     """
+     Compresses a model using llm-compressor and uploads it to a new HF repo.
+     """
+     if not model_id:
+         raise gr.Error("Please select a model from the search bar.")
+
+     if oauth_token is None:
+         raise gr.Error("Please log in with your Hugging Face account to continue.")
+
+     try:
+         # --- 1. Load Model and Tokenizer ---
+         # Load the model on CPU first to allow for sequential onloading.
+         model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map=None)
+         tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+         output_dir = f"{model_id.split('/')[-1]}-{quant_method}"
+
+         # --- 2. Get Recipe ---
+         recipe = get_quantization_recipe(quant_method, model.config.architectures[0])
+
+         # --- 3. Run Compression ---
+         # Using a small slice of a common dataset for calibration.
+         oneshot(
+             model=model,
+             dataset="wikitext",
+             dataset_config_name="wikitext-2-raw-v1",
+             split="train[:1%]",  # Using a small part of the dataset for calibration
+             recipe=recipe,
+             save_compressed=True,
+             output_dir=output_dir,
+             max_seq_length=512,
+             num_calibration_samples=64,  # A small number of samples for speed
+         )
+         # Save the tokenizer alongside the compressed weights so it is uploaded with the model.
+         tokenizer.save_pretrained(output_dir)
+
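+         # At this point output_dir should also contain the recipe that llm-compressor writes
+         # out alongside the weights (recipe.yaml), which the model card below points readers to.
+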
+         # --- 4. Create Repo and Upload ---
+         api = HfApi(token=oauth_token.token)
+         username = whoami(token=oauth_token.token)["name"]
+         repo_id = f"{username}/{output_dir}"
+
+         repo_url = api.create_repo(repo_id=repo_id, exist_ok=True)
+
+         api.upload_folder(
+             folder_path=output_dir,
+             repo_id=repo_id,
+             commit_message=f"Upload {quant_method} compressed model",
+         )
+
+         # --- 5. Create Model Card ---
+         card_content = f"""
+ ---
+ license: apache-2.0
+ base_model: {model_id}
+ tags:
+ - llm-compressor
+ - quantization
+ - {quant_method.lower()}
+ ---
+
+ # {quant_method} Compressed Model: {repo_id}
+
+ This model was compressed from [`{model_id}`](https://huggingface.co/{model_id}) using the [vLLM LLM-Compressor](https://github.com/vllm-project/llm-compressor) library.
+
+ This conversion was performed by the `llm-compressor-my-repo` Hugging Face Space.
+
+ ## Quantization Method: {quant_method}
+
+ For more details on the recipe used, refer to the `recipe.yaml` file in this repository.
+ """
+         card = ModelCard(card_content)
+         card.push_to_hub(repo_id, token=oauth_token.token)
+
+         return f'<h1>✅ Success!</h1><br/>Model compressed and saved to your new repo: <a href="{repo_url}" target="_blank" style="text-decoration:underline">{repo_id}</a>'
+
+     except Exception as e:
+         error_message = str(e).replace("\n", "<br/>")
+         return f'<h1>❌ ERROR</h1><br/><pre style="white-space:pre-wrap;">{error_message}</pre>'
+
+ # --- Gradio Interface ---
+ with gr.Blocks(css="footer {display: none !important;}") as demo:
+     gr.Markdown("# LLM-Compressor My Repo")
+     gr.Markdown(
+         "Log in, choose a model, select a quantization method, and this Space will create a new compressed model repository on your Hugging Face profile."
+     )
+     with gr.Row():
+         login_button = gr.LoginButton(min_width=250)
+
+     gr.Markdown("### 1. Select a Model from the Hugging Face Hub")
+     model_input = HuggingfaceHubSearch(
+         label="Search for a Model",
+         search_type="model",
+     )
+
+     gr.Markdown("### 2. Choose a Quantization Method")
+     quant_method_dropdown = gr.Dropdown(
+         ["AWQ", "GPTQ", "FP8"],
+         label="Quantization Method",
+         value="AWQ",
+     )
+
+     compress_button = gr.Button("Compress and Create Repo", variant="primary")
+     output_html = gr.HTML(label="Result")
+
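+     # Note: compress_and_upload's oauth_token parameter is typed gr.OAuthToken | None, so
+     # Gradio injects the logged-in user's token automatically; it is not listed in `inputs`.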
+     compress_button.click(
+         fn=compress_and_upload,
+         inputs=[model_input, quant_method_dropdown],
+         outputs=output_html,
+     )
+
+     gr.Examples(
+         examples=[
+             ["mistralai/Mistral-7B-Instruct-v0.2", "AWQ"],
+             ["meta-llama/Llama-2-7b-chat-hf", "GPTQ"],
+         ],
+         inputs=[model_input, quant_method_dropdown],
+     )
+
+ demo.queue(max_size=5).launch()