import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# Base model and fine-tuned adapter
base_model = "FreedomIntelligence/apollo-7b"  # Apollo 7B base
adapter_path = "Afsin-maahi/model"  # replace with your adapter repo name

# Load tokenizer and base model
tokenizer = AutoTokenizer.from_pretrained(base_model)
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Load the LoRA adapter on top of the base model
model = PeftModel.from_pretrained(model, adapter_path)

# Chat function: tokenize the prompt, generate, and decode the response
def chat(prompt):
    # Use model.device instead of hardcoding "cuda" so the inputs land
    # wherever device_map="auto" placed the model
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=200)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Gradio interface
iface = gr.Interface(fn=chat, inputs="text", outputs="text", title="Apollo 7B Proxy Chat")
iface.launch()