import os
import gc
import llama_cpp
from llama_cpp import Llama


class MairaBrain:
    def __init__(self, repo_id, filename):
        self.repo_id = repo_id
        self.filename = filename
        self.llm = None

    def load(self):
        """Wakes the core with Turbo settings."""
        if self.llm is None:
            print(f"TURBO LOADING: {self.filename}")
            model_path = os.path.join("/app", self.filename)
            self.llm = Llama(
                model_path=model_path,
                # SPEED TRICK 1: A small context window (512) shortens prompt
                # processing, so the first token arrives sooner.
                n_ctx=512,
                # SPEED TRICK 2: Match the number of physical CPU cores
                # (Hugging Face Spaces CPU hardware typically exposes 4).
                n_threads=4,
                # SPEED TRICK 3: Prompt batch size for ingestion.
                n_batch=512,
                # SPEED TRICK 4: Quantize the KV cache to Q8_0, roughly halving
                # its memory footprint (and RAM traffic) versus f16.
                type_k=llama_cpp.GGML_TYPE_Q8_0,
                type_v=llama_cpp.GGML_TYPE_Q8_0,
                # SPEED TRICK 5: Flash Attention (if supported by the model/build).
                flash_attn=True,
                use_mmap=True,
                use_mlock=False,
                verbose=False,
            )

    def unload(self):
        """Clears the tracks for the next runner."""
        if self.llm is not None:
            print(f"CLEARING CACHE: {self.filename}")
            try:
                self.llm.close()
            except Exception:
                pass
            del self.llm
            self.llm = None
            gc.collect()

    def get_response(self, user_id, user_input):
        self.load()
        # Keep the prompt short: long prompts increase the time to first token.
        prompt = f"Maira: I am a high-speed AI core.\nUser: {user_input}\nMaira:"
        # Generate tokens.
        output = self.llm(
            prompt,
            max_tokens=128,  # short responses feel faster
            stop=["User:", "\n"],
            temperature=0.7,
            repeat_penalty=1.1,
        )
        return output["choices"][0]["text"].strip()
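

# --- Usage sketch (illustrative, not part of the original Space code) ---
# Shows the intended load -> respond -> unload cycle. The repo_id and filename
# values below are placeholders; the real GGUF file is assumed to already sit
# in /app as load() expects.
if __name__ == "__main__":
    brain = MairaBrain(
        repo_id="example-org/example-model-gguf",  # placeholder
        filename="example-model.Q4_K_M.gguf",      # placeholder
    )
    print(brain.get_response(user_id="demo", user_input="Hello, Maira!"))
    brain.unload()  # frees RAM so a different model can be loaded next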