import gradio as gr
from huggingface_hub import HfApi, ModelCard, whoami
from gradio_huggingfacehub_search import HuggingfaceHubSearch
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier, GPTQModifier
from llmcompressor.modifiers.awq import AWQModifier, AWQMapping
from transformers import AutoModelForCausalLM, Qwen2_5_VLForConditionalGeneration


# --- Helper Functions ---
def get_quantization_recipe(method, model_architecture):
    """
    Returns the appropriate llm-compressor recipe based on the selected method.
    """
    if method == "AWQ":
        if model_architecture != "LlamaForCausalLM":
            raise ValueError(
                f"AWQ quantization is only supported for LlamaForCausalLM architectures, got {model_architecture}"
            )
        mappings = [
            AWQMapping(
                "re:.*input_layernorm", ["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"]
            ),
            AWQMapping("re:.*v_proj", ["re:.*o_proj"]),
            AWQMapping(
                "re:.*post_attention_layernorm", ["re:.*gate_proj", "re:.*up_proj"]
            ),
            AWQMapping("re:.*up_proj", ["re:.*down_proj"]),
        ]
        return [
            AWQModifier(
                ignore=["lm_head"],
                scheme="W4A16_ASYM",
                targets=["Linear"],
                mappings=mappings,
            ),
        ]
    elif method == "GPTQ":
        sequential_target_map = {
            "LlamaForCausalLM": "LlamaDecoderLayer",
            "MistralForCausalLM": "MistralDecoderLayer",
            "MixtralForCausalLM": "MixtralDecoderLayer",
        }
        sequential_target = sequential_target_map.get(model_architecture)
        if sequential_target is None:
            raise ValueError(
                f"GPTQ quantization is not supported for {model_architecture} architecture. "
                "Supported architectures are: "
                f"{', '.join(sequential_target_map.keys())}"
            )
        return [
            GPTQModifier(
                targets="Linear",
                scheme="W4A16",
                sequential_targets=[sequential_target],
                ignore=["re:.*lm_head"],
            ),
        ]
    elif method == "FP8":
        if model_architecture not in ["LlamaForCausalLM", "MixtralForCausalLM"]:
            raise ValueError(
                f"FP8 quantization is only supported for LlamaForCausalLM and MixtralForCausalLM architectures, got {model_architecture}"
            )
        ignore_layers = ["lm_head"]
        if "Mixtral" in model_architecture:
            ignore_layers.append("re:.*block_sparse_moe.gate")
        return [
            QuantizationModifier(
                scheme="FP8", targets="Linear", ignore=ignore_layers
            )
        ]
    else:
        raise ValueError(f"Unsupported quantization method: {method}")


def compress_and_upload(
    model_id: str,
    quant_method: str,
    oauth_token: gr.OAuthToken | None,
):
    """
    Compresses a model using llm-compressor and uploads it to a new HF repo.
    """
    if not model_id:
        raise gr.Error("Please select a model from the search bar.")
    if oauth_token is None:
        raise gr.Error("Authentication error. Please log in to continue.")
    token = oauth_token.token

    try:
        # Use the provided token for all hub interactions
        username = whoami(token=token)["name"]

        # --- 1. Load Model ---
        try:
            model = AutoModelForCausalLM.from_pretrained(
                model_id, torch_dtype="auto", device_map=None, token=token, trust_remote_code=True
            )
        except ValueError as e:
            if "Unrecognized configuration class" in str(e) and "qwen" in model_id.lower():
                print(f"AutoModelForCausalLM failed, trying Qwen2_5_VLForConditionalGeneration for {model_id}")
                model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
                    model_id, torch_dtype="auto", device_map=None, token=token, trust_remote_code=True
                )
            else:
                raise

        output_dir = f"{model_id.split('/')[-1]}-{quant_method}"

        # --- 2. Get Recipe ---
        if not model.config.architectures:
            raise gr.Error("Could not determine model architecture.")
        recipe = get_quantization_recipe(quant_method, model.config.architectures[0])

        # --- 3. Run Compression ---
        oneshot(
            model=model,
            dataset="wikitext",
            dataset_config_name="wikitext-2-raw-v1",
            split="train[:1%]",
            recipe=recipe,
            save_compressed=True,
            output_dir=output_dir,
            max_seq_length=512,
            num_calibration_samples=64,
        )
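
        # NOTE (hedged): with save_compressed=True, output_dir should now hold the
        # quantized weights in compressed-tensors format plus the serialized recipe
        # (the recipe.yaml referenced in the model card below); the exact files written
        # depend on the installed llm-compressor version.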

        # --- 4. Create Repo and Upload ---
        api = HfApi(token=token)
        repo_id = f"{username}/{output_dir}"
        repo_url = api.create_repo(repo_id=repo_id, exist_ok=True)
        api.upload_folder(
            folder_path=output_dir,
            repo_id=repo_id,
            commit_message=f"Upload {quant_method} compressed model",
        )

        # --- 5. Create Model Card ---
        card_content = f"""
---
license: apache-2.0
base_model: {model_id}
tags:
- llm-compressor
- quantization
- {quant_method.lower()}
---

# {quant_method} Compressed Model: {repo_id}

This model was compressed from [`{model_id}`](https://huggingface.co/{model_id}) using the [vLLM LLM-Compressor](https://github.com/vllm-project/llm-compressor) library.

This conversion was performed by the `llm-compressor-my-repo` Hugging Face Space.

## Quantization Method: {quant_method}

For more details on the recipe used, refer to the `recipe.yaml` file in this repository.
"""
        card = ModelCard(card_content)
        card.push_to_hub(repo_id, token=token)
| return f'<h1>✅ Success!</h1><br/>Model compressed and saved to your new repo: <a href="{repo_url}" target="_blank" style="text-decoration:underline">{repo_id}</a>' | |
| except gr.Error as e: | |
| raise e | |
| except Exception as e: | |
| error_message = str(e).replace("\n", "<br/>") | |
| return f'<h1>❌ ERROR</h1><br/><pre style="white-space:pre-wrap;">{error_message}</pre>' | |


# --- Gradio Interface ---
def build_gradio_app():
    with gr.Blocks(css="footer {display: none !important;}") as demo:
        gr.Markdown("# LLM-Compressor My Repo")
        gr.Markdown(
            "Log in, choose a model, select a quantization method, and this Space will create a new compressed model repository on your Hugging Face profile."
        )
        with gr.Row():
            login_button = gr.LoginButton(min_width=250)  # noqa: F841

        gr.Markdown("### 1. Select a Model from the Hugging Face Hub")
        model_input = HuggingfaceHubSearch(
            label="Search for a Model",
            search_type="model",
        )

        gr.Markdown("### 2. Choose a Quantization Method")
        quant_method_dropdown = gr.Dropdown(
            ["AWQ", "GPTQ", "FP8"], label="Quantization Method", value="AWQ"
        )

        compress_button = gr.Button("Compress and Create Repo", variant="primary")
        output_html = gr.HTML(label="Result")

        compress_button.click(
            fn=compress_and_upload,
            inputs=[model_input, quant_method_dropdown],
            outputs=output_html,
        )
    return demo


def main():
    demo = build_gradio_app()
    demo.queue(max_size=5).launch()


if __name__ == "__main__":
    main()
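
# Running locally (assumptions, not part of the original Space files): save this module
# as app.py (the usual Hugging Face Spaces entry point) and install the imported
# dependencies, for example:
#   pip install gradio huggingface_hub gradio_huggingfacehub_search llmcompressor transformers
# then launch with `python app.py`. On Spaces the same packages would go in
# requirements.txt; check exact package names and versions against PyPI.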