""" Model management for STT, TTS, and LLM Optimized for Hugging Face Zero GPU (H200) """ import os import torch import spaces from transformers import ( AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, AutoModelForCausalLM, AutoTokenizer ) from parler_tts import ParlerTTSForConditionalGeneration from transformers import AutoTokenizer as ParlerTokenizer import tempfile from typing import List, Dict import numpy as np from scipy.io import wavfile import soundfile as sf class ModelManager: def __init__(self): self.device = "cuda" if torch.cuda.is_available() else "cpu" self.torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32 # Models will be loaded lazily self.whisper_pipe = None self.tts_model = None self.tts_tokenizer = None self.llm_model = None self.llm_tokenizer = None def load_whisper(self): """Load Whisper model for STT""" if self.whisper_pipe is None: print("Loading Whisper model...") # Using medium model for better speed/accuracy balance model_id = "openai/whisper-medium" model = AutoModelForSpeechSeq2Seq.from_pretrained( model_id, torch_dtype=self.torch_dtype, low_cpu_mem_usage=True, use_safetensors=True ) model.to(self.device) processor = AutoProcessor.from_pretrained(model_id) self.whisper_pipe = pipeline( "automatic-speech-recognition", model=model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor, torch_dtype=self.torch_dtype, device=self.device, chunk_length_s=30, batch_size=16, ) print("Whisper model loaded successfully!") def load_tts(self): """Load TTS model for text-to-speech""" if self.tts_model is None: print("Loading TTS model...") # Using smaller, faster TTS model model_id = "parler-tts/parler-tts-tiny-v1" self.tts_model = ParlerTTSForConditionalGeneration.from_pretrained( model_id, torch_dtype=self.torch_dtype ).to(self.device) self.tts_tokenizer = ParlerTokenizer.from_pretrained(model_id) print("TTS model loaded successfully!") def load_llm(self): """Load LLM for conversation generation""" if self.llm_model is None: print("Loading LLM...") # Using Llama 3.2 3B - smaller and faster than 7B models model_id = "meta-llama/Llama-3.2-3B-Instruct" self.llm_tokenizer = AutoTokenizer.from_pretrained(model_id) self.llm_model = AutoModelForCausalLM.from_pretrained( model_id, torch_dtype=self.torch_dtype, device_map="auto", low_cpu_mem_usage=True ) print("LLM loaded successfully!") @spaces.GPU def speech_to_text(self, audio_path: str) -> str: """Convert speech to text using Whisper - optimized for speed""" try: self.load_whisper() # Validate audio file exists and has correct format if not audio_path or not os.path.exists(audio_path): print(f"Audio file not found: {audio_path}") return "" # Check file extension if not audio_path.lower().endswith(('.wav', '.mp3', '.flac', '.m4a', '.ogg')): print(f"Invalid audio format: {audio_path}") return "" result = self.whisper_pipe( audio_path, return_timestamps=False, generate_kwargs={ "language": "english", "task": "transcribe", "num_beams": 1, # Faster "temperature": 0.0 # More deterministic } ) return result["text"].strip() except Exception as e: print(f"Error in STT: {e}") import traceback traceback.print_exc() return "" @spaces.GPU def text_to_speech(self, text: str, accent: str = "American", speaker_name: str = None) -> str: """Convert text to speech - optimized for speed with American accent""" try: self.load_tts() # Simplified: Just use one clear American voice for speed description = "A clear American male voice speaks at moderate pace with good enunciation." 
            # Limit text length for faster generation
            if len(text) > 200:
                text = text[:200] + "..."

            # Generate audio with optimized settings
            input_ids = self.tts_tokenizer(description, return_tensors="pt").input_ids.to(self.device)
            prompt_input_ids = self.tts_tokenizer(text, return_tensors="pt").input_ids.to(self.device)

            generation = self.tts_model.generate(
                input_ids=input_ids,
                prompt_input_ids=prompt_input_ids,
                attention_mask=torch.ones_like(input_ids),
                do_sample=False,  # Faster, deterministic
                num_beams=1       # Faster generation
            )

            audio_arr = generation.cpu().numpy().squeeze()

            # Save to temporary file using scipy
            temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
            temp_file.close()  # Only the path is needed; avoid leaking the handle

            # Normalize audio to int16 range
            audio_int16 = (audio_arr * 32767).astype(np.int16)

            # Save using scipy.io.wavfile
            wavfile.write(
                temp_file.name,
                self.tts_model.config.sampling_rate,
                audio_int16
            )

            return temp_file.name
        except Exception as e:
            print(f"Error in TTS: {e}")
            # No audio could be generated; return None so the caller can skip playback
            return None

    @spaces.GPU
    def generate_response(
        self,
        system_prompt: str,
        conversation_history: List[Dict],
        bot_name: str
    ) -> str:
        """Generate conversational response using LLM"""
        try:
            self.load_llm()

            # Format conversation for the model
            messages = [{"role": "system", "content": system_prompt}]

            # Add conversation history (keep last 6 messages for context)
            for msg in conversation_history[-6:]:
                messages.append({
                    "role": msg["role"],
                    "content": msg["content"]
                })

            # Format conversation for Llama
            inputs = self.llm_tokenizer.apply_chat_template(
                messages,
                return_tensors="pt",
                add_generation_prompt=True
            ).to(self.device)

            outputs = self.llm_model.generate(
                inputs,
                max_new_tokens=200,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                pad_token_id=self.llm_tokenizer.eos_token_id
            )

            response = self.llm_tokenizer.decode(
                outputs[0][inputs.shape[1]:],
                skip_special_tokens=True
            )

            return response.strip()
        except Exception as e:
            print(f"Error in LLM generation: {e}")
            return "I understand. Could you tell me more about that?"

    @spaces.GPU
    def generate_feedback(self, prompt: str) -> str:
        """Generate detailed feedback using LLM"""
        try:
            self.load_llm()

            # Format feedback prompt for Llama
            messages = [
                {
                    "role": "system",
                    "content": "You are an expert communication coach specializing in sales and professional communication. Provide specific, actionable feedback."
                },
                {
                    "role": "user",
                    "content": prompt
                }
            ]

            inputs = self.llm_tokenizer.apply_chat_template(
                messages,
                return_tensors="pt",
                add_generation_prompt=True
            ).to(self.device)

            outputs = self.llm_model.generate(
                inputs,
                max_new_tokens=500,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                pad_token_id=self.llm_tokenizer.eos_token_id
            )

            feedback = self.llm_tokenizer.decode(
                outputs[0][inputs.shape[1]:],
                skip_special_tokens=True
            )

            return feedback.strip()
        except Exception as e:
            print(f"Error in feedback generation: {e}")
            return "Unable to generate feedback at this time."
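

# Usage sketch (illustrative only, not part of the original module): a minimal
# example of driving the manager end to end. The audio path "sample.wav" is a
# hypothetical placeholder, and this assumes the models above can be downloaded
# and that a GPU (or CPU fallback) is available.
if __name__ == "__main__":
    manager = ModelManager()

    # Transcribe a recorded answer (placeholder path).
    transcript = manager.speech_to_text("sample.wav")
    print("Transcript:", transcript)

    # Generate a conversational reply from the transcript.
    reply = manager.generate_response(
        system_prompt="You are a friendly sales prospect named Alex.",
        conversation_history=[{"role": "user", "content": transcript}],
        bot_name="Alex",
    )
    print("Reply:", reply)

    # Synthesize the reply; returns a path to a temporary WAV file or None.
    audio_path = manager.text_to_speech(reply)
    print("Audio written to:", audio_path)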