Spaces:

n00b001
/

llm-compressor-my-repo

Running

App Files Files Community

n00b001 committed on Sep 22

Commit

3717818

verified ·

1 Parent(s): 9e06206

Update app.py

Browse files

Files changed (1) hide show

app.py +15 -15

app.py CHANGED Viewed

@@ -14,8 +14,6 @@ def get_quantization_recipe(method, model_architecture):
     Returns the appropriate llm-compressor recipe based on the selected method.
     """
     if method == "AWQ":
-        # Mappings for Llama-like architectures. This may need to be expanded
-        # for other model types.
         mappings = [
             AWQMapping("re:.*input_layernorm", ["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"]),
             AWQMapping("re:.*v_proj", ["re:.*o_proj"]),
@@ -31,8 +29,6 @@ def get_quantization_recipe(method, model_architecture):
             ),
         ]
     elif method == "GPTQ":
-        # Sequential targets need to be identified based on the model architecture.
-        # This is a common pattern for Llama-like models.
         sequential_target_map = {
             "LlamaForCausalLM": "LlamaDecoderLayer",
             "MistralForCausalLM": "MistralDecoderLayer",
@@ -49,7 +45,6 @@ def get_quantization_recipe(method, model_architecture):
             ),
         ]
     elif method == "FP8":
-        # For MoE models, it's common to ignore the gate layers.
         ignore_layers = ["lm_head"]
         if "Mixtral" in model_architecture:
             ignore_layers.append("re:.*block_sparse_moe.gate")
@@ -62,11 +57,15 @@ def get_quantization_recipe(method, model_architecture):
     else:
         raise ValueError(f"Unsupported quantization method: {method}")
-def compress_and_upload(model_id: str, quant_method: str, oauth_token: gr.OAuthToken | None):
     """
     Compresses a model using llm-compressor and uploads it to a new HF repo.
     """
     if not model_id:
         raise gr.Error("Please select a model from the search bar.")
@@ -75,7 +74,6 @@ def compress_and_upload(model_id: str, quant_method: str, oauth_token: gr.OAuthT
     try:
         # --- 1. Load Model and Tokenizer ---
-        # Load model on CPU first to allow for sequential onloading
         model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map=None)
         tokenizer = AutoTokenizer.from_pretrained(model_id)
@@ -85,22 +83,21 @@ def compress_and_upload(model_id: str, quant_method: str, oauth_token: gr.OAuthT
         recipe = get_quantization_recipe(quant_method, model.config.architectures[0])
         # --- 3. Run Compression ---
-        # Using a small slice of a common dataset for calibration
         oneshot(
             model=model,
             dataset="wikitext",
             dataset_config_name="wikitext-2-raw-v1",
-            split="train[:1%]", # Using a small part of the dataset for calibration
             recipe=recipe,
             save_compressed=True,
             output_dir=output_dir,
             max_seq_length=512,
-            num_calibration_samples=64, # A small number of samples for speed
         )
         # --- 4. Create Repo and Upload ---
-        api = HfApi(token=oauth_token.token)
-        username = whoami(token=oauth_token.token)["name"]
         repo_id = f"{username}/{output_dir}"
         repo_url = api.create_repo(repo_id=repo_id, exist_ok=True)
@@ -133,7 +130,7 @@ This conversion was performed by the `llm-compressor-my-repo` Hugging Face Space
 For more details on the recipe used, refer to the `recipe.yaml` file in this repository.
 """
         card = ModelCard(card_content)
-        card.push_to_hub(repo_id, token=oauth_token.token)
         return f'<h1>✅ Success!</h1><br/>Model compressed and saved to your new repo: <a href="{repo_url}" target="_blank" style="text-decoration:underline">{repo_id}</a>'
@@ -166,9 +163,12 @@ with gr.Blocks(css="footer {display: none !important;}") as demo:
     compress_button = gr.Button("Compress and Create Repo", variant="primary")
     output_html = gr.HTML(label="Result")
     compress_button.click(
         fn=compress_and_upload,
-        inputs=[model_input, quant_method_dropdown, login_button],
         outputs=output_html
     )

     Returns the appropriate llm-compressor recipe based on the selected method.
     """
     if method == "AWQ":
         mappings = [
             AWQMapping("re:.*input_layernorm", ["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"]),
             AWQMapping("re:.*v_proj", ["re:.*o_proj"]),
             ),
         ]
     elif method == "GPTQ":
         sequential_target_map = {
             "LlamaForCausalLM": "LlamaDecoderLayer",
             "MistralForCausalLM": "MistralDecoderLayer",
             ),
         ]
     elif method == "FP8":
         ignore_layers = ["lm_head"]
         if "Mixtral" in model_architecture:
             ignore_layers.append("re:.*block_sparse_moe.gate")
     else:
         raise ValueError(f"Unsupported quantization method: {method}")
+# --------------------------------------------------------------------------------
+# CHANGE #1: Modified function signature to use gr.Request
+# --------------------------------------------------------------------------------
+def compress_and_upload(model_id: str, quant_method: str, request: gr.Request):
     """
     Compresses a model using llm-compressor and uploads it to a new HF repo.
     """
+    oauth_token = request.token  # Get the token from the request object
     if not model_id:
         raise gr.Error("Please select a model from the search bar.")
     try:
         # --- 1. Load Model and Tokenizer ---
         model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map=None)
         tokenizer = AutoTokenizer.from_pretrained(model_id)
         recipe = get_quantization_recipe(quant_method, model.config.architectures[0])
         # --- 3. Run Compression ---
         oneshot(
             model=model,
             dataset="wikitext",
             dataset_config_name="wikitext-2-raw-v1",
+            split="train[:1%]",
             recipe=recipe,
             save_compressed=True,
             output_dir=output_dir,
             max_seq_length=512,
+            num_calibration_samples=64,
         )
         # --- 4. Create Repo and Upload ---
+        api = HfApi(token=oauth_token)
+        username = whoami(token=oauth_token)["name"]
         repo_id = f"{username}/{output_dir}"
         repo_url = api.create_repo(repo_id=repo_id, exist_ok=True)
 For more details on the recipe used, refer to the `recipe.yaml` file in this repository.
 """
         card = ModelCard(card_content)
+        card.push_to_hub(repo_id, token=oauth_token)
         return f'<h1>✅ Success!</h1><br/>Model compressed and saved to your new repo: <a href="{repo_url}" target="_blank" style="text-decoration:underline">{repo_id}</a>'
     compress_button = gr.Button("Compress and Create Repo", variant="primary")
     output_html = gr.HTML(label="Result")
+    # --------------------------------------------------------------------------------
+    # CHANGE #2: Removed `login_button` from the inputs list
+    # --------------------------------------------------------------------------------
     compress_button.click(
         fn=compress_and_upload,
+        inputs=[model_input, quant_method_dropdown], # login_button removed
         outputs=output_html
     )