import os
import gc
import llama_cpp
from llama_cpp import Llama

class MairaBrain:
    def __init__(self, repo_id, filename):
        self.repo_id = repo_id
        self.filename = filename
        self.llm = None

    def load(self):
        """Wakes the core with Turbo settings"""
        if self.llm is None:
            print(f"🚀 TURBO LOADING: {self.filename}")
            model_path = os.path.join("/app", self.filename)
            self.llm = Llama(
                model_path=model_path,
                # 🏎️ SPEED TRICK 1: A small context window (512) makes the response start almost instantly
                n_ctx=512,
                # 🏎️ SPEED TRICK 2: Match the Space's physical CPU cores (usually 4)
                n_threads=4,
                # 🏎️ SPEED TRICK 3: Batch size for prompt processing
                n_batch=512,
                # 🏎️ SPEED TRICK 4: Quantize the KV cache to Q8_0 (moves roughly half as much data through RAM)
                type_k=llama_cpp.GGML_TYPE_Q8_0,
                type_v=llama_cpp.GGML_TYPE_Q8_0,
                # 🏎️ SPEED TRICK 5: Flash Attention (if supported by the specific model)
                flash_attn=True,
                use_mmap=True,
                use_mlock=False,
                verbose=False
            )

    def unload(self):
        """Clears the tracks for the next runner"""
        if self.llm is not None:
            print(f"🧹 CLEARING CACHE: {self.filename}")
            try:
                self.llm.close()
            except Exception:
                pass
            del self.llm
            self.llm = None
            gc.collect()
    def get_response(self, user_id, user_input):
        self.load()
        # Keep the prompt short: long prompts slow down the time to first token
        prompt = f"Maira: I am a high-speed AI core.\nUser: {user_input}\nMaira:"
        # Generate tokens
        output = self.llm(
            prompt,
            max_tokens=128,  # Short responses feel faster
            stop=["User:", "\n"],
            temperature=0.7,
            repeat_penalty=1.1
        )
        return output["choices"][0]["text"].strip()
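For reference, here is a minimal usage sketch. It assumes llama-cpp-python and huggingface_hub are installed, and the repo_id and filename shown are hypothetical placeholders, not a real model; load() expects the GGUF file to already sit at /app/<filename>, so the sketch downloads it into /app first.

# Minimal usage sketch (hypothetical REPO_ID / FILENAME; point these at your own GGUF model).
from huggingface_hub import hf_hub_download

REPO_ID = "your-username/your-model-gguf"   # hypothetical placeholder
FILENAME = "your-model.Q4_K_M.gguf"         # hypothetical placeholder

# Fetch the GGUF file into /app so load() can find it at /app/<filename>.
hf_hub_download(repo_id=REPO_ID, filename=FILENAME, local_dir="/app")

brain = MairaBrain(repo_id=REPO_ID, filename=FILENAME)
print(brain.get_response(user_id="demo", user_input="Hello, Maira!"))
brain.unload()  # free RAM before loading a different model

Calling unload() between models matters on a small Space: it closes the llama.cpp handle and runs gc.collect() so the next load() starts from a clean slate.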