import os
import gc
import llama_cpp
from llama_cpp import Llama

class MairaBrain:
    def __init__(self, repo_id, filename):
        self.repo_id = repo_id
        self.filename = filename
        self.llm = None

    def load(self):
        """Wakes the core with Turbo settings"""
        if self.llm is None:
            print(f"๐Ÿš€ TURBO LOADING: {self.filename}")
            model_path = os.path.join("/app", self.filename)
            
            self.llm = Llama(
                model_path=model_path,
                # 🏎️ SPEED TRICK 1: A small context window (512 tokens) cuts prompt
                # processing time and memory use, so the first token arrives sooner
                n_ctx=512,
                # 🏎️ SPEED TRICK 2: Match the host's physical CPU core count (4 assumed here)
                n_threads=4,
                # 🏎️ SPEED TRICK 3: Batch size for prompt processing
                n_batch=512,
                # 🏎️ SPEED TRICK 4: Quantize the KV cache to Q8_0, roughly halving its
                # memory traffic compared with the default f16 cache
                type_k=llama_cpp.GGML_TYPE_Q8_0,
                type_v=llama_cpp.GGML_TYPE_Q8_0,
                # 🏎️ SPEED TRICK 5: Flash Attention (if supported by the backend and model)
                flash_attn=True,
                use_mmap=True,
                use_mlock=False,
                verbose=False
            )

    def unload(self):
        """Clears the tracks for the next runner"""
        if self.llm is not None:
            print(f"๐Ÿงน CLEARING CACHE: {self.filename}")
            try:
                self.llm.close()
            except Exception:
                pass
            del self.llm
            self.llm = None
            gc.collect()

    def get_response(self, user_id, user_input):
        self.load()
        
        # Keep the prompt short. Long prompts slow down the "Time to First Token"
        prompt = f"Maira: I am a high-speed AI core.\\nUser: {user_input}\\nMaira:"
        
        # generate tokens
        output = self.llm(
            prompt,
            max_tokens=128, # Short responses feel faster
            stop=["User:", "\\n"],
            temperature=0.7,
            repeat_penalty=1.1
        )
        
        return output["choices"][0]["text"].strip()