Commit 3717818 (verified) by n00b001 · Parent: 9e06206

Update app.py

Files changed (1):
  1. app.py +15 -15

app.py CHANGED
@@ -14,8 +14,6 @@ def get_quantization_recipe(method, model_architecture):
     Returns the appropriate llm-compressor recipe based on the selected method.
     """
     if method == "AWQ":
-        # Mappings for Llama-like architectures. This may need to be expanded
-        # for other model types.
         mappings = [
             AWQMapping("re:.*input_layernorm", ["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"]),
             AWQMapping("re:.*v_proj", ["re:.*o_proj"]),
@@ -31,8 +29,6 @@ def get_quantization_recipe(method, model_architecture):
             ),
         ]
     elif method == "GPTQ":
-        # Sequential targets need to be identified based on the model architecture.
-        # This is a common pattern for Llama-like models.
         sequential_target_map = {
             "LlamaForCausalLM": "LlamaDecoderLayer",
             "MistralForCausalLM": "MistralDecoderLayer",
@@ -49,7 +45,6 @@ def get_quantization_recipe(method, model_architecture):
             ),
         ]
     elif method == "FP8":
-        # For MoE models, it's common to ignore the gate layers.
         ignore_layers = ["lm_head"]
         if "Mixtral" in model_architecture:
             ignore_layers.append("re:.*block_sparse_moe.gate")
@@ -62,11 +57,15 @@ def get_quantization_recipe(method, model_architecture):
     else:
         raise ValueError(f"Unsupported quantization method: {method}")
 
-
-def compress_and_upload(model_id: str, quant_method: str, oauth_token: gr.OAuthToken | None):
+# --------------------------------------------------------------------------------
+# CHANGE #1: Modified function signature to use gr.Request
+# --------------------------------------------------------------------------------
+def compress_and_upload(model_id: str, quant_method: str, request: gr.Request):
     """
     Compresses a model using llm-compressor and uploads it to a new HF repo.
     """
+    oauth_token = request.token  # Get the token from the request object
+
     if not model_id:
         raise gr.Error("Please select a model from the search bar.")
 
@@ -75,7 +74,6 @@ def compress_and_upload(model_id: str, quant_method: str, oauth_token: gr.OAuthToken | None):
 
     try:
         # --- 1. Load Model and Tokenizer ---
-        # Load model on CPU first to allow for sequential onloading
         model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map=None)
         tokenizer = AutoTokenizer.from_pretrained(model_id)
 
@@ -85,22 +83,21 @@ def compress_and_upload(model_id: str, quant_method: str, oauth_token: gr.OAuthToken | None):
         recipe = get_quantization_recipe(quant_method, model.config.architectures[0])
 
         # --- 3. Run Compression ---
-        # Using a small slice of a common dataset for calibration
         oneshot(
             model=model,
             dataset="wikitext",
             dataset_config_name="wikitext-2-raw-v1",
-            split="train[:1%]",  # Using a small part of the dataset for calibration
+            split="train[:1%]",
             recipe=recipe,
             save_compressed=True,
             output_dir=output_dir,
             max_seq_length=512,
-            num_calibration_samples=64,  # A small number of samples for speed
+            num_calibration_samples=64,
         )
 
         # --- 4. Create Repo and Upload ---
-        api = HfApi(token=oauth_token.token)
-        username = whoami(token=oauth_token.token)["name"]
+        api = HfApi(token=oauth_token)
+        username = whoami(token=oauth_token)["name"]
         repo_id = f"{username}/{output_dir}"
 
         repo_url = api.create_repo(repo_id=repo_id, exist_ok=True)
@@ -133,7 +130,7 @@ This conversion was performed by the `llm-compressor-my-repo` Hugging Face Space
 For more details on the recipe used, refer to the `recipe.yaml` file in this repository.
 """
         card = ModelCard(card_content)
-        card.push_to_hub(repo_id, token=oauth_token.token)
+        card.push_to_hub(repo_id, token=oauth_token)
 
         return f'<h1>✅ Success!</h1><br/>Model compressed and saved to your new repo: <a href="{repo_url}" target="_blank" style="text-decoration:underline">{repo_id}</a>'
 
@@ -166,9 +163,12 @@ with gr.Blocks(css="footer {display: none !important;}") as demo:
     compress_button = gr.Button("Compress and Create Repo", variant="primary")
     output_html = gr.HTML(label="Result")
 
+    # --------------------------------------------------------------------------------
+    # CHANGE #2: Removed `login_button` from the inputs list
+    # --------------------------------------------------------------------------------
     compress_button.click(
         fn=compress_and_upload,
-        inputs=[model_input, quant_method_dropdown, login_button],
+        inputs=[model_input, quant_method_dropdown],  # login_button removed
         outputs=output_html
     )
 
 
 
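A note on the recipe branches: the hunks above only show explanatory comments being deleted, not the full branch bodies. For readers unfamiliar with llm-compressor, the sketch below shows what the GPTQ and FP8 branches plausibly return. `GPTQModifier` and `QuantizationModifier` are real llm-compressor modifiers, but the exact arguments used in app.py are not visible in this diff and should be treated as assumptions.

```python
# Hypothetical reconstruction of two recipe branches; the diff shows only the
# surrounding lines, so the argument choices here are illustrative assumptions.
from llmcompressor.modifiers.quantization import GPTQModifier, QuantizationModifier

def sketch_recipe(method: str, model_architecture: str) -> list:
    if method == "GPTQ":
        # Decoder-layer lookup mirroring sequential_target_map in the diff.
        sequential_target_map = {
            "LlamaForCausalLM": "LlamaDecoderLayer",
            "MistralForCausalLM": "MistralDecoderLayer",
        }
        target = sequential_target_map.get(model_architecture, "LlamaDecoderLayer")
        return [
            GPTQModifier(
                targets="Linear",
                scheme="W4A16",
                ignore=["lm_head"],
                # Assumption: where sequential targets are passed (modifier vs.
                # oneshot kwarg) varies across llm-compressor versions.
                sequential_targets=[target],
            )
        ]
    if method == "FP8":
        ignore_layers = ["lm_head"]
        if "Mixtral" in model_architecture:
            # MoE router (gate) layers stay in higher precision, as in the diff.
            ignore_layers.append("re:.*block_sparse_moe.gate")
        return [
            QuantizationModifier(targets="Linear", scheme="FP8_DYNAMIC", ignore=ignore_layers)
        ]
    raise ValueError(f"Unsupported quantization method: {method}")
```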
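On CHANGE #1 and CHANGE #2 together: when a handler parameter is annotated with `gr.Request`, Gradio injects the incoming request automatically, so nothing should be passed for it through `inputs`; that is why the third entry is dropped from the `inputs` list. A minimal, self-contained sketch of this wiring (component names mirror the diff; the handler body is a placeholder):

```python
import gradio as gr

def compress_and_upload(model_id: str, quant_method: str, request: gr.Request) -> str:
    # `request` is injected by Gradio because of the gr.Request annotation;
    # it never appears in the `inputs` list of the click() wiring below.
    user = request.username or "anonymous"  # populated when the user is authenticated
    return f"<p>{user} requested {quant_method} compression of {model_id}</p>"

with gr.Blocks() as demo:
    gr.LoginButton()  # renders the HF OAuth flow; it is not a value-carrying input
    model_input = gr.Textbox(label="Model ID")
    quant_method_dropdown = gr.Dropdown(["AWQ", "GPTQ", "FP8"], label="Method")
    compress_button = gr.Button("Compress and Create Repo", variant="primary")
    output_html = gr.HTML(label="Result")
    compress_button.click(
        fn=compress_and_upload,
        inputs=[model_input, quant_method_dropdown],  # request is injected, not listed
        outputs=output_html,
    )

if __name__ == "__main__":
    demo.launch()
```

Whether the OAuth token itself is exposed on the request object, as the commit's `request.token` assumes, depends on the Gradio version; the sketch reads `request.username`, which is a documented attribute.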
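The step-4 hunk ends at `create_repo`; the actual file upload sits outside the diff context. With `huggingface_hub`, pushing the directory written by `oneshot()` typically looks like the sketch below (`upload_compressed` is a hypothetical helper; the real call in app.py is not shown):

```python
from huggingface_hub import HfApi

def upload_compressed(output_dir: str, repo_id: str, token: str) -> str:
    # Assumed upload step (not visible in the diff): create the repo, then push
    # the compressed weights, config, and recipe.yaml that oneshot() wrote.
    api = HfApi(token=token)
    repo_url = api.create_repo(repo_id=repo_id, exist_ok=True)
    api.upload_folder(folder_path=output_dir, repo_id=repo_id, repo_type="model")
    return str(repo_url)
```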
 
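For comparison, the signature this commit removed followed Gradio's documented Spaces OAuth pattern: a parameter annotated `gr.OAuthToken | None` is filled in automatically after login and, like `gr.Request`, is never listed in `inputs`. A minimal sketch, assuming a Space with `hf_oauth: true` in its README metadata:

```python
import gradio as gr
from huggingface_hub import whoami

def show_user(oauth_token: gr.OAuthToken | None = None) -> str:
    # Gradio fills `oauth_token` once the user has logged in via gr.LoginButton;
    # oauth_token.token is the raw string accepted by huggingface_hub clients.
    if oauth_token is None:
        return "Please log in first."
    return f"Logged in as {whoami(token=oauth_token.token)['name']}"

with gr.Blocks() as demo:
    gr.LoginButton()
    btn = gr.Button("Who am I?")
    out = gr.Textbox(label="Result")
    btn.click(fn=show_user, inputs=None, outputs=out)

if __name__ == "__main__":
    demo.launch()
```

Under that pattern, the original bug was passing `login_button` through `inputs`, which hands the function the button's value rather than a token; both the old annotation and the new `gr.Request` approach rely on injection instead.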