# maira-chaty/brain.py
import os
import gc
import llama_cpp
from llama_cpp import Llama

class MairaBrain:
    def __init__(self, repo_id, filename):
        self.repo_id = repo_id
        self.filename = filename
        self.llm = None
    def load(self):
        """Wakes the core with Turbo settings."""
        if self.llm is None:
            print(f"🚀 TURBO LOADING: {self.filename}")
            model_path = os.path.join("/app", self.filename)
            self.llm = Llama(
                model_path=model_path,
                # 🏎️ SPEED TRICK 1: A small context window (512) lets responses start almost instantly
                n_ctx=512,
                # 🏎️ SPEED TRICK 2: Match the host's physical CPU cores (usually 4 on HF Spaces)
                n_threads=4,
                # 🏎️ SPEED TRICK 3: Prompt batch size
                n_batch=512,
                # 🏎️ SPEED TRICK 4: Quantize the KV cache to Q8_0 (roughly halves the data moved through RAM)
                type_k=llama_cpp.GGML_TYPE_Q8_0,
                type_v=llama_cpp.GGML_TYPE_Q8_0,
                # 🏎️ SPEED TRICK 5: Flash Attention (only if supported by the specific model/backend)
                flash_attn=True,
                use_mmap=True,
                use_mlock=False,
                verbose=False,
            )
    def unload(self):
        """Clears the tracks for the next runner."""
        if self.llm is not None:
            print(f"🧹 CLEARING CACHE: {self.filename}")
            try:
                self.llm.close()
            except Exception:
                pass
            del self.llm
            self.llm = None
            gc.collect()
    def get_response(self, user_id, user_input):
        self.load()
        # Keep the prompt short: long prompts slow down the time to first token
        prompt = f"Maira: I am a high-speed AI core.\nUser: {user_input}\nMaira:"
        # Generate tokens
        output = self.llm(
            prompt,
            max_tokens=128,        # Short responses feel faster
            stop=["User:", "\n"],  # Stop before the model starts writing the user's turn
            temperature=0.7,
            repeat_penalty=1.1,
        )
        return output["choices"][0]["text"].strip()
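

# Minimal usage sketch (an assumption: a GGUF file has already been copied to /app,
# as load() expects). The repo id and filename below are hypothetical placeholders,
# not the ones this Space actually uses.
if __name__ == "__main__":
    brain = MairaBrain(
        repo_id="example-org/example-model-GGUF",  # hypothetical repo id
        filename="example-model.Q4_K_M.gguf",      # hypothetical GGUF filename
    )
    print(brain.get_response(user_id="demo", user_input="Hello, Maira!"))
    brain.unload()  # free RAM before loading a different model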