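"""Gradio Space: compress a Hugging Face Hub model with llm-compressor
(AWQ, GPTQ, or FP8) and upload the result to a new repo on the user's
profile."""
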
import gradio as gr
from huggingface_hub import HfApi, ModelCard, whoami
from gradio_huggingfacehub_search import HuggingfaceHubSearch

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier, GPTQModifier
from llmcompressor.modifiers.awq import AWQModifier, AWQMapping
from transformers import AutoModelForCausalLM, Qwen2_5_VLForConditionalGeneration

# --- Helper Functions ---


def get_quantization_recipe(method, model_architecture):
    """
    Returns the appropriate llm-compressor recipe based on the selected method.
    """
    if method == "AWQ":
        if model_architecture != "LlamaForCausalLM":
            raise ValueError(
                f"AWQ quantization is only supported for LlamaForCausalLM architectures, got {model_architecture}"
            )
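        # Each AWQMapping pairs a "smooth" layer with the linear layers that
        # consume its output, mirroring the Llama decoder block: layernorms
        # feed the attention/MLP projections, v_proj feeds o_proj, and
        # up_proj feeds down_proj.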
        mappings = [
            AWQMapping(
                "re:.*input_layernorm", ["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"]
            ),
            AWQMapping("re:.*v_proj", ["re:.*o_proj"]),
            AWQMapping(
                "re:.*post_attention_layernorm", ["re:.*gate_proj", "re:.*up_proj"]
            ),
            AWQMapping("re:.*up_proj", ["re:.*down_proj"]),
        ]
        return [
            AWQModifier(
                ignore=["lm_head"],
                scheme="W4A16_ASYM",
                targets=["Linear"],
                mappings=mappings,
            ),
        ]
    elif method == "GPTQ":
        sequential_target_map = {
            "LlamaForCausalLM": "LlamaDecoderLayer",
            "MistralForCausalLM": "MistralDecoderLayer",
            "MixtralForCausalLM": "MixtralDecoderLayer",
        }
        sequential_target = sequential_target_map.get(model_architecture)
        if sequential_target is None:
            raise ValueError(
                f"GPTQ quantization is not supported for {model_architecture} architecture. "
                "Supported architectures are: "
                f"{', '.join(sequential_target_map.keys())}"
            )

        return [
            GPTQModifier(
                targets="Linear",
                scheme="W4A16",
                sequential_targets=[sequential_target],
                ignore=["re:.*lm_head"],
            ),
        ]
    elif method == "FP8":
        if model_architecture not in ["LlamaForCausalLM", "MixtralForCausalLM"]:
            raise ValueError(
                f"FP8 quantization is only supported for LlamaForCausalLM and MixtralForCausalLM architectures, got {model_architecture}"
            )
        ignore_layers = ["lm_head"]
        if "Mixtral" in model_architecture:
            ignore_layers.append("re:.*block_sparse_moe.gate")

        return [QuantizationModifier(
            scheme="FP8", targets="Linear", ignore=ignore_layers
        )]
    else:
        raise ValueError(f"Unsupported quantization method: {method}")

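# A minimal sketch of using a recipe outside the UI (hypothetical model id;
# assumes the checkpoint fits in memory and that the wikitext calibration
# set used below is appropriate for the model):
#
#   from transformers import AutoModelForCausalLM
#
#   model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
#   recipe = get_quantization_recipe("GPTQ", model.config.architectures[0])
#   oneshot(model=model, dataset="wikitext",
#           dataset_config_name="wikitext-2-raw-v1", split="train[:1%]",
#           recipe=recipe, save_compressed=True,
#           output_dir="Llama-2-7b-hf-GPTQ")
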

def compress_and_upload(
    model_id: str,
    quant_method: str,
    oauth_token: gr.OAuthToken | None,
):
    """
    Compresses a model using llm-compressor and uploads it to a new HF repo.
    """
    if not model_id:
        raise gr.Error("Please select a model from the search bar.")

    if oauth_token is None:
        raise gr.Error("Authentication error. Please log in to continue.")

    token = oauth_token.token

    try:
        # Use the provided token for all hub interactions
        username = whoami(token=token)["name"]

        # --- 1. Load Model ---
        try:
            model = AutoModelForCausalLM.from_pretrained(
                model_id, torch_dtype="auto", device_map=None,
                token=token, trust_remote_code=True,
            )
        except ValueError as e:
            # AutoModelForCausalLM cannot dispatch Qwen2.5-VL checkpoints;
            # fall back to the dedicated class for those.
            if "Unrecognized configuration class" in str(e) and "qwen" in model_id.lower():
                print(
                    "AutoModelForCausalLM failed, trying "
                    f"Qwen2_5_VLForConditionalGeneration for {model_id}"
                )
                model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
                    model_id, torch_dtype="auto", device_map=None,
                    token=token, trust_remote_code=True,
                )
            else:
                raise

        output_dir = f"{model_id.split('/')[-1]}-{quant_method}"

        # --- 2. Get Recipe ---
        if not model.config.architectures:
            raise gr.Error("Could not determine model architecture.")
        recipe = get_quantization_recipe(quant_method, model.config.architectures[0])

        # --- 3. Run Compression ---
        oneshot(
            model=model,
            dataset="wikitext",
            dataset_config_name="wikitext-2-raw-v1",
            split="train[:1%]",
            recipe=recipe,
            save_compressed=True,
            output_dir=output_dir,
            max_seq_length=512,
            num_calibration_samples=64,
        )
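
        # oneshot saves the compressed weights together with the applied
        # recipe (recipe.yaml) in output_dir; the model card below points
        # users at that file.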

        # --- 4. Create Repo and Upload ---
        api = HfApi(token=token)
        repo_id = f"{username}/{output_dir}"

        repo_url = api.create_repo(repo_id=repo_id, exist_ok=True)

        api.upload_folder(
            folder_path=output_dir,
            repo_id=repo_id,
            commit_message=f"Upload {quant_method} compressed model",
        )

        # --- 5. Create Model Card ---
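        # NOTE: the license below is hardcoded to apache-2.0; a more faithful
        # card would inherit the license of the base model.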
        card_content = f"""
---
license: apache-2.0
base_model: {model_id}
tags:
- llm-compressor
- quantization
- {quant_method.lower()}
---

# {quant_method} Compressed Model: {repo_id}

This model was compressed from [`{model_id}`](https://huggingface.co/{model_id}) using the [vLLM LLM-Compressor](https://github.com/vllm-project/llm-compressor) library.

This conversion was performed by the `llm-compressor-my-repo` Hugging Face Space.

## Quantization Method: {quant_method}

For more details on the recipe used, refer to the `recipe.yaml` file in this repository.
"""
        card = ModelCard(card_content)
        card.push_to_hub(repo_id, token=token)

        return f'<h1>✅ Success!</h1><br/>Model compressed and saved to your new repo: <a href="{repo_url}" target="_blank" style="text-decoration:underline">{repo_id}</a>'

    except gr.Error:
        # gr.Error already carries a user-facing message; re-raise unchanged.
        raise
    except Exception as e:
        error_message = str(e).replace("\n", "<br/>")
        return f'<h1>❌ ERROR</h1><br/><pre style="white-space:pre-wrap;">{error_message}</pre>'


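# A hedged sketch of driving the pipeline without the UI. gr.OAuthToken is
# normally injected by Gradio's login flow, so a stand-in object with a
# `.token` attribute is used here (hypothetical values):
#
#   from types import SimpleNamespace
#
#   fake_oauth = SimpleNamespace(token="hf_...")  # a real user access token
#   html = compress_and_upload("meta-llama/Llama-2-7b-hf", "FP8", fake_oauth)
#   print(html)
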
# --- Gradio Interface ---
def build_gradio_app():
    with gr.Blocks(css="footer {display: none !important;}") as demo:
        gr.Markdown("# LLM-Compressor My Repo")
        gr.Markdown(
            "Log in, choose a model, select a quantization method, and this Space will create a new compressed model repository on your Hugging Face profile."
        )

        with gr.Row():
            login_button = gr.LoginButton(min_width=250)  # noqa: F841

        gr.Markdown("### 1. Select a Model from the Hugging Face Hub")
        model_input = HuggingfaceHubSearch(
            label="Search for a Model",
            search_type="model",
        )

        gr.Markdown("### 2. Choose a Quantization Method")
        quant_method_dropdown = gr.Dropdown(
            ["AWQ", "GPTQ", "FP8"], label="Quantization Method", value="AWQ"
        )

        compress_button = gr.Button("Compress and Create Repo", variant="primary")
        output_html = gr.HTML(label="Result")

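        # Gradio injects the oauth_token argument automatically because of
        # its gr.OAuthToken type hint, so it is not listed in `inputs`.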
        compress_button.click(
            fn=compress_and_upload,
            inputs=[model_input, quant_method_dropdown],
            outputs=output_html,
        )
    return demo

def main():
    demo = build_gradio_app()
    demo.queue(max_size=5).launch()

if __name__ == "__main__":
    main()