Spaces:

joaogante
/

assisted_generation_demo

Running on Zero

joaogante committed on Mar 6

Commit

4b1ae14

verified ·

1 Parent(s): 35c2ab9

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -8,8 +8,8 @@ import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
-model_id = "google/gemma-2-27b-it"
-assistant_id = "google/gemma-2-2b-it"
 model = AutoModelForCausalLM.from_pretrained(model_id, load_in_4bit=True, device_map="auto")
 assistant_model = AutoModelForCausalLM.from_pretrained(assistant_id).to(device=model.device, dtype=torch.float16)
@@ -61,9 +61,9 @@ def reset_textbox():
 with gr.Blocks() as demo:
     gr.Markdown(
         "# 🤗 Assisted Generation Demo\n"
-        f"- Model: {model_id} (4-bit quant, 14B params, GPU memory = ~7GB)\n"
-        f"- Assistant Model: {assistant_id} (FP16, 0.5B params, GPU memory = ~1GB)\n"
-        "- Recipe for speedup: a) >10x model size difference in parameters; b) assistant trained similarly; c) CPU is not a bottleneck"
     )
     with gr.Row():

 from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
+model_id = "meta-llama/Llama-3.1-8B"
+assistant_id = "meta-llama/Llama-3.2-1B"
 model = AutoModelForCausalLM.from_pretrained(model_id, load_in_4bit=True, device_map="auto")
 assistant_model = AutoModelForCausalLM.from_pretrained(assistant_id).to(device=model.device, dtype=torch.float16)
 with gr.Blocks() as demo:
     gr.Markdown(
         "# 🤗 Assisted Generation Demo\n"
+        f"- Model: {model_id}\n"
+        f"- Assistant Model: {assistant_id}\n"
+        "- Recipe for good speedup: a) >10x model size difference in parameters; b) assistant trained similarly; c) CPU is not a bottleneck"
     )
     with gr.Row():