# maira-chaty/brain.py
import os
import gc
import llama_cpp
from llama_cpp import Llama

class MairaBrain:
    def __init__(self, repo_id, filename):
        self.repo_id = repo_id
        self.filename = filename
        self.llm = None
    def load(self):
        """Wakes the core with Turbo settings."""
        if self.llm is None:
            print(f"🚀 TURBO LOADING: {self.filename}")
            model_path = os.path.join("/app", self.filename)
            self.llm = Llama(
                model_path=model_path,
                # 🏎️ SPEED TRICK 1: A small context window (512) lets responses start almost instantly
                n_ctx=512,
                # 🏎️ SPEED TRICK 2: Match the host's physical CPU cores (usually 4 on HF Spaces)
                n_threads=4,
                # 🏎️ SPEED TRICK 3: Prompt batch size
                n_batch=512,
                # 🏎️ SPEED TRICK 4: Quantize the KV cache to Q8_0 (roughly halves the data moved through RAM)
                type_k=llama_cpp.GGML_TYPE_Q8_0,
                type_v=llama_cpp.GGML_TYPE_Q8_0,
                # 🏎️ SPEED TRICK 5: Flash Attention (only if supported by the specific model/backend)
                flash_attn=True,
                use_mmap=True,
                use_mlock=False,
                verbose=False,
            )
    def unload(self):
        """Clears the tracks for the next runner."""
        if self.llm is not None:
            print(f"🧹 CLEARING CACHE: {self.filename}")
            try:
                self.llm.close()
            except Exception:
                pass
            del self.llm
            self.llm = None
            gc.collect()
    def get_response(self, user_id, user_input):
        self.load()
        # Keep the prompt short: long prompts slow down the time to first token
        prompt = f"Maira: I am a high-speed AI core.\nUser: {user_input}\nMaira:"
        # Generate tokens
        output = self.llm(
            prompt,
            max_tokens=128,        # Short responses feel faster
            stop=["User:", "\n"],  # Stop before the model starts writing the user's turn
            temperature=0.7,
            repeat_penalty=1.1,
        )
        return output["choices"][0]["text"].strip()
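

# Minimal usage sketch (an assumption: a GGUF file has already been copied to /app,
# as load() expects). The repo id and filename below are hypothetical placeholders,
# not the ones this Space actually uses.
if __name__ == "__main__":
    brain = MairaBrain(
        repo_id="example-org/example-model-GGUF",  # hypothetical repo id
        filename="example-model.Q4_K_M.gguf",      # hypothetical GGUF filename
    )
    print(brain.get_response(user_id="demo", user_input="Hello, Maira!"))
    brain.unload()  # free RAM before loading a different model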