Update app.py
app.py
CHANGED
@@ -1,244 +1,281 @@
-
-
-
-
-
-# pip install audiomentations soundfile pyaudio

 import os
 import numpy as np
-import pandas as pd
-import librosa
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from torch.utils.data import Dataset, DataLoader
-from sklearn.model_selection import train_test_split
-from sklearn.preprocessing import StandardScaler
-import pickle
 import gradio as gr
-from typing import
 import warnings
 warnings.filterwarnings('ignore')

-#
-
-
-
-
-
-
-    - RAVDESS (Emotional speech and song)
-    - TESS (Toronto Emotional Speech Set)
-    - CREMA-D (Crowd-sourced Emotional Multimodal Actors Dataset)
-    - DAIC-WOZ (Depression dataset)
-"""
-
-    def __init__(self, data_paths):
-        self.data_paths = data_paths
-        self.emotion_map = {
-            'neutral': 0, 'calm': 1, 'happy': 2, 'sad': 3,
-            'angry': 4, 'fearful': 5, 'disgust': 6, 'surprised': 7
-        }
-
-    def load_ravdess(self, path):
-        """
-        RAVDESS dataset structure: 03-01-01-01-01-01-01.wav
-        Modality-Channel-Emotion-Intensity-Statement-Repetition-Actor
-        """
-        data = []
-        if not os.path.exists(path):
-            print(f"⚠️ RAVDESS path not found: {path}")
-            return pd.DataFrame()
-
-        for root, dirs, files in os.walk(path):
-            for file in files:
-                if file.endswith('.wav'):
-                    file_path = os.path.join(root, file)
-                    parts = file.split('-')
-                    emotion_code = int(parts[2])
-
-                    emotion_mapping = {
-                        1: 'neutral', 2: 'calm', 3: 'happy', 4: 'sad',
-                        5: 'angry', 6: 'fearful', 7: 'disgust', 8: 'surprised'
-                    }
-
-                    emotion = emotion_mapping.get(emotion_code, 'neutral')
-                    intensity = int(parts[3])
-
-                    data.append({
-                        'path': file_path,
-                        'emotion': emotion,
-                        'intensity': intensity,
-                        'source': 'ravdess'
-                    })
-
-        return pd.DataFrame(data)
-
-    def load_tess(self, path):
-        """TESS dataset: OAF_back_angry.wav"""
-        data = []
-        if not os.path.exists(path):
-            print(f"⚠️ TESS path not found: {path}")
-            return pd.DataFrame()
-
-        emotions = ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprised']
-
-        for emotion in emotions:
-            emotion_path = os.path.join(path, emotion)
-            if os.path.exists(emotion_path):
-                for file in os.listdir(emotion_path):
-                    if file.endswith('.wav'):
-                        data.append({
-                            'path': os.path.join(emotion_path, file),
-                            'emotion': emotion,
-                            'intensity': 2,
-                            'source': 'tess'
-                        })
-
-        return pd.DataFrame(data)
-
-    def load_cremad(self, path):
-        """CREMA-D: 1001_DFA_ANG_XX.wav"""
-        data = []
-        if not os.path.exists(path):
-            print(f"⚠️ CREMA-D path not found: {path}")
-            return pd.DataFrame()
-
-        emotion_map = {
-            'ANG': 'angry', 'DIS': 'disgust', 'FEA': 'fearful',
-            'HAP': 'happy', 'NEU': 'neutral', 'SAD': 'sad'
-        }
-
-        for file in os.listdir(path):
-            if file.endswith('.wav'):
-                parts = file.split('_')
-                emotion = emotion_map.get(parts[2], 'neutral')
-
-                data.append({
-                    'path': os.path.join(path, file),
-                    'emotion': emotion,
-                    'intensity': 2,
-                    'source': 'cremad'
-                })
-
-        return pd.DataFrame(data)
-
-    def create_synthetic_data(self, n_samples=1000):
-        """Create synthetic samples for testing"""
-        print("Creating synthetic training data...")
-        data = []
-        emotions = list(self.emotion_map.keys())
-
-        for i in range(n_samples):
-            emotion = np.random.choice(emotions)
-            data.append({
-                'path': f'synthetic_{i}',
-                'emotion': emotion,
-                'intensity': np.random.randint(1, 3),
-                'source': 'synthetic'
-            })
-
-        return pd.DataFrame(data)
-
-    def load_all_datasets(self):
-        """Combine all available datasets"""
-        all_data = []
-
-        for dataset_name, path in self.data_paths.items():
-            if dataset_name == 'ravdess':
-                df = self.load_ravdess(path)
-            elif dataset_name == 'tess':
-                df = self.load_tess(path)
-            elif dataset_name == 'cremad':
-                df = self.load_cremad(path)
-            else:
-                continue
-
-            if not df.empty:
-                all_data.append(df)
-                print(f"✅ Loaded {len(df)} samples from {dataset_name}")
-
-        # If no real datasets found, use synthetic data
-        if not all_data:
-            print("⚠️ No real datasets found. Using synthetic data for demonstration.")
-            all_data.append(self.create_synthetic_data())
-
-        combined_df = pd.concat(all_data, ignore_index=True)
-        print(f"\nTotal samples: {len(combined_df)}")
-        print(f"Emotion distribution:\n{combined_df['emotion'].value_counts()}\n")
-
-        return combined_df

 # ============================================
-#
 # ============================================

-class
-    """

     def __init__(self, sr=16000, n_mfcc=40):
         self.sr = sr
         self.n_mfcc = n_mfcc

-    def
-        """
-
-        if is_synthetic:
-            # Generate synthetic features for demo
-            return self._generate_synthetic_features(audio_path)
-
         try:
-
-
-
-
-
-
-

-
-
             pitch_values = []
             for t in range(pitches.shape[1]):
                 index = magnitudes[:, t].argmax()
                 pitch = pitches[index, t]
                 if pitch > 0:
                     pitch_values.append(pitch)

-
-
-
-
-            #
             monotone_score = 1 / (1 + pitch_std) if pitch_std > 0 else 1.0

             # 3. Energy features
-            rms =
             energy_mean = np.mean(rms)
             energy_std = np.std(rms)
             energy_max = np.max(rms)

-            # 4. Zero Crossing Rate
-            zcr =
             zcr_mean = np.mean(zcr)
             zcr_std = np.std(zcr)

             # 5. Spectral features
-            spectral_centroid
-
-            spectral_bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr))

-            # 6. Chroma
-
-            chroma_mean = np.mean(chroma)

             # 7. Tempo
-            tempo

-            # Combine
             features = np.concatenate([
                 mfcc_mean,
                 mfcc_std,
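Several of the removed feature lines above are truncated in this diff view (`rms =`, `zcr =`, `spectral_centroid`, the chroma line, and `tempo`). As a reference only, here is a minimal sketch of how these frame-level features are typically computed with the librosa API the old code imported; the exact arguments in the original file are not visible here, so treat the calls and the placeholder file name as assumptions.

import librosa
import numpy as np

y, sr = librosa.load("speech.wav", sr=16000)              # "speech.wav" is a placeholder path
rms = librosa.feature.rms(y=y)[0]                          # frame-wise RMS energy
zcr = librosa.feature.zero_crossing_rate(y)[0]             # frame-wise zero-crossing rate
spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))
spectral_rolloff = np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr))
chroma = librosa.feature.chroma_stft(y=y, sr=sr)           # 12-bin chroma, averaged below
chroma_mean = np.mean(chroma)
tempo, _ = librosa.beat.beat_track(y=y, sr=sr)             # global tempo estimate (BPM)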
@@ -259,91 +296,21 @@ class AudioFeatureExtractor:
             )

             return {
-                'features': features,
-                'vocal_affect_score': vocal_affect_score,
-                'monotone_score': monotone_score,
-                'vocal_energy_score': vocal_energy_score,
-                'pitch_variability': pitch_std,
-                'energy_level': energy_mean
             }

         except Exception as e:
-            print(f"Error
-
-
-    def _generate_synthetic_features(self, identifier):
-        """Generate synthetic features for demonstration"""
-        np.random.seed(hash(str(identifier)) % 2**32)
-
-        # Simulate realistic feature distributions
-        emotion = str(identifier).split('_')[-1] if 'synthetic' in str(identifier) else 'neutral'
-
-        # Emotion-specific parameters
-        emotion_params = {
-            'angry': {'pitch_std': 80, 'energy': 0.8, 'tempo': 140},
-            'happy': {'pitch_std': 70, 'energy': 0.7, 'tempo': 130},
-            'sad': {'pitch_std': 20, 'energy': 0.3, 'tempo': 80},
-            'fearful': {'pitch_std': 90, 'energy': 0.6, 'tempo': 150},
-            'neutral': {'pitch_std': 40, 'energy': 0.5, 'tempo': 100},
-            'calm': {'pitch_std': 30, 'energy': 0.4, 'tempo': 90},
-        }
-
-        params = emotion_params.get(emotion, emotion_params['neutral'])
-
-        # Generate features
-        mfcc_mean = np.random.randn(self.n_mfcc) * 10
-        mfcc_std = np.abs(np.random.randn(self.n_mfcc) * 5)
-
-        pitch_std = params['pitch_std'] + np.random.randn() * 10
-        pitch_mean = 150 + np.random.randn() * 20
-        pitch_min = pitch_mean - pitch_std
-        pitch_max = pitch_mean + pitch_std
-        monotone_score = 1 / (1 + pitch_std/100)
-
-        energy_mean = params['energy'] + np.random.randn() * 0.1
-        energy_std = np.abs(np.random.randn() * 0.1)
-        energy_max = energy_mean * 1.5
-
-        zcr_mean = 0.1 + np.random.randn() * 0.02
-        zcr_std = 0.05 + np.random.randn() * 0.01
-
-        spectral_centroid = 1500 + np.random.randn() * 200
-        spectral_rolloff = 3000 + np.random.randn() * 300
-        spectral_bandwidth = 1800 + np.random.randn() * 200
-
-        chroma_mean = 0.5 + np.random.randn() * 0.1
-        tempo = params['tempo'] + np.random.randn() * 10
-
-        features = np.concatenate([
-            mfcc_mean,
-            mfcc_std,
-            [pitch_mean, pitch_std, pitch_min, pitch_max, monotone_score],
-            [energy_mean, energy_std, energy_max],
-            [zcr_mean, zcr_std],
-            [spectral_centroid, spectral_rolloff, spectral_bandwidth],
-            [chroma_mean],
-            [tempo]
-        ])
-
-        vocal_affect_score = self._calculate_vocal_affect(
-            pitch_std, energy_std, spectral_centroid
-        )
-        vocal_energy_score = self._calculate_vocal_energy(
-            energy_mean, tempo, zcr_mean
-        )
-
-        return {
-            'features': features,
-            'vocal_affect_score': vocal_affect_score,
-            'monotone_score': monotone_score,
-            'vocal_energy_score': vocal_energy_score,
-            'pitch_variability': pitch_std,
-            'energy_level': energy_mean
-        }

     def _calculate_vocal_affect(self, pitch_std, energy_std, spectral_centroid):
-        """Calculate emotional intensity
-        # Normalize and combine indicators
         pitch_component = min(pitch_std / 100, 1.0)
         energy_component = min(energy_std / 0.5, 1.0)
         spectral_component = min(spectral_centroid / 3000, 1.0)
@@ -352,10 +319,10 @@ class AudioFeatureExtractor:
                         energy_component * 0.4 +
                         spectral_component * 0.2)

-        return affect_score

     def _calculate_vocal_energy(self, energy_mean, tempo, zcr_mean):
-        """Calculate vocal energy/activation
         energy_component = min(energy_mean / 1.0, 1.0)
         tempo_component = min(tempo / 180, 1.0)
         zcr_component = min(zcr_mean / 0.3, 1.0)
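The affect score blends the three normalized components shown above with fixed weights. The first addend of the sum is cut off in this diff; assuming it weights pitch_component by 0.4 so the weights sum to 1, a worked example with illustrative input values:

pitch_component = min(80 / 100, 1.0)        # pitch_std of 80 Hz -> 0.8
energy_component = min(0.2 / 0.5, 1.0)      # energy_std of 0.2 -> 0.4
spectral_component = min(1500 / 3000, 1.0)  # centroid of 1500 Hz -> 0.5
affect_score = (pitch_component * 0.4 +     # assumed weight for the truncated first term
                energy_component * 0.4 +
                spectral_component * 0.2)
print(affect_score)                         # 0.32 + 0.16 + 0.10 = 0.58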
@@ -364,72 +331,32 @@ class AudioFeatureExtractor:
                        tempo_component * 0.3 +
                        zcr_component * 0.2)

-        return energy_score
-
-
-# ============================================
-# 3. PYTORCH DATASET
-# ============================================
-
-class EmotionAudioDataset(Dataset):
-    def __init__(self, dataframe, feature_extractor, emotion_map):
-        self.dataframe = dataframe
-        self.feature_extractor = feature_extractor
-        self.emotion_map = emotion_map
-        self.features_cache = {}
-
-    def __len__(self):
-        return len(self.dataframe)

-    def
-
-
-        emotion = row['emotion']
-
-        # Check if features are cached
-        if audio_path not in self.features_cache:
-            is_synthetic = row['source'] == 'synthetic'
-            feature_dict = self.feature_extractor.extract_features(
-                audio_path, is_synthetic=is_synthetic
-            )
-            self.features_cache[audio_path] = feature_dict
-        else:
-            feature_dict = self.features_cache[audio_path]
-
-        features = torch.FloatTensor(feature_dict['features'])
-        label = self.emotion_map[emotion]
-
-        # Additional targets for multi-task learning
-        vocal_affect = torch.FloatTensor([feature_dict['vocal_affect_score']])
-        monotone = torch.FloatTensor([feature_dict['monotone_score']])
-        vocal_energy = torch.FloatTensor([feature_dict['vocal_energy_score']])
-
         return {
-            'features':
-            '
-            '
-            '
-            '
         }


 # ============================================
-#
 # ============================================

 class MultiTaskEmotionModel(nn.Module):
-    """
-    Multi-task learning model for:
-    1. Emotion classification
-    2. Vocal affect score regression
-    3. Monotone score regression
-    4. Vocal energy score regression
-    """

-    def __init__(self, input_dim, num_emotions, dropout=0.5):
         super(MultiTaskEmotionModel, self).__init__()

-        # Shared
         self.shared_layers = nn.Sequential(
             nn.Linear(input_dim, 512),
             nn.BatchNorm1d(512),
@@ -447,8 +374,7 @@ class MultiTaskEmotionModel(nn.Module):
             nn.Dropout(dropout/2)
         )

-        #
-        # 1. Emotion classification
         self.emotion_head = nn.Sequential(
             nn.Linear(128, 64),
             nn.ReLU(),
@@ -456,7 +382,7 @@ class MultiTaskEmotionModel(nn.Module):
             nn.Linear(64, num_emotions)
         )

-        #
         self.affect_head = nn.Sequential(
             nn.Linear(128, 32),
             nn.ReLU(),
@@ -464,7 +390,6 @@ class MultiTaskEmotionModel(nn.Module):
             nn.Sigmoid()
         )

-        # 3. Monotone score regression
         self.monotone_head = nn.Sequential(
             nn.Linear(128, 32),
             nn.ReLU(),
@@ -472,7 +397,6 @@ class MultiTaskEmotionModel(nn.Module):
             nn.Sigmoid()
         )

-        # 4. Vocal energy regression
         self.energy_head = nn.Sequential(
             nn.Linear(128, 32),
             nn.ReLU(),
@@ -481,329 +405,69 @@ class MultiTaskEmotionModel(nn.Module):
         )

     def forward(self, x):
-
-        shared_features = self.shared_layers(x)
-
-        # Task-specific outputs
-        emotion_logits = self.emotion_head(shared_features)
-        vocal_affect = self.affect_head(shared_features)
-        monotone_score = self.monotone_head(shared_features)
-        vocal_energy = self.energy_head(shared_features)

         return {
-            'emotion_logits':
-            'vocal_affect':
-            'monotone_score':
-            'vocal_energy':
         }


 # ============================================
-#
-# ============================================
-
-class EmotionModelTrainer:
-    def __init__(self, model, device, learning_rate=0.001):
-        self.model = model.to(device)
-        self.device = device
-        self.optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
-        self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
-            self.optimizer, mode='min', patience=5, factor=0.5
-        )
-
-        # Loss functions
-        self.emotion_criterion = nn.CrossEntropyLoss()
-        self.regression_criterion = nn.MSELoss()
-
-    def train_epoch(self, train_loader):
-        self.model.train()
-        total_loss = 0
-        correct = 0
-        total = 0
-
-        for batch in train_loader:
-            features = batch['features'].to(self.device)
-            emotion_labels = batch['emotion_label'].to(self.device)
-            vocal_affect = batch['vocal_affect'].to(self.device)
-            monotone = batch['monotone'].to(self.device)
-            vocal_energy = batch['vocal_energy'].to(self.device)
-
-            self.optimizer.zero_grad()
-
-            # Forward pass
-            outputs = self.model(features)
-
-            # Calculate losses
-            emotion_loss = self.emotion_criterion(
-                outputs['emotion_logits'], emotion_labels
-            )
-            affect_loss = self.regression_criterion(
-                outputs['vocal_affect'], vocal_affect
-            )
-            monotone_loss = self.regression_criterion(
-                outputs['monotone_score'], monotone
-            )
-            energy_loss = self.regression_criterion(
-                outputs['vocal_energy'], vocal_energy
-            )
-
-            # Combined loss with weights
-            loss = (emotion_loss * 1.0 +
-                    affect_loss * 0.5 +
-                    monotone_loss * 0.5 +
-                    energy_loss * 0.5)
-
-            # Backward pass
-            loss.backward()
-            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
-            self.optimizer.step()
-
-            total_loss += loss.item()
-
-            # Calculate accuracy
-            _, predicted = outputs['emotion_logits'].max(1)
-            total += emotion_labels.size(0)
-            correct += predicted.eq(emotion_labels).sum().item()
-
-        avg_loss = total_loss / len(train_loader)
-        accuracy = 100. * correct / total
-
-        return avg_loss, accuracy
-
-    def validate(self, val_loader):
-        self.model.eval()
-        total_loss = 0
-        correct = 0
-        total = 0
-
-        with torch.no_grad():
-            for batch in val_loader:
-                features = batch['features'].to(self.device)
-                emotion_labels = batch['emotion_label'].to(self.device)
-                vocal_affect = batch['vocal_affect'].to(self.device)
-                monotone = batch['monotone'].to(self.device)
-                vocal_energy = batch['vocal_energy'].to(self.device)
-
-                outputs = self.model(features)
-
-                emotion_loss = self.emotion_criterion(
-                    outputs['emotion_logits'], emotion_labels
-                )
-                affect_loss = self.regression_criterion(
-                    outputs['vocal_affect'], vocal_affect
-                )
-                monotone_loss = self.regression_criterion(
-                    outputs['monotone_score'], monotone
-                )
-                energy_loss = self.regression_criterion(
-                    outputs['vocal_energy'], vocal_energy
-                )
-
-                loss = (emotion_loss * 1.0 +
-                        affect_loss * 0.5 +
-                        monotone_loss * 0.5 +
-                        energy_loss * 0.5)
-
-                total_loss += loss.item()
-
-                _, predicted = outputs['emotion_logits'].max(1)
-                total += emotion_labels.size(0)
-                correct += predicted.eq(emotion_labels).sum().item()
-
-        avg_loss = total_loss / len(val_loader)
-        accuracy = 100. * correct / total
-
-        return avg_loss, accuracy
-
-    def train(self, train_loader, val_loader, epochs=50, early_stop_patience=10):
-        best_val_acc = 0
-        patience_counter = 0
-        history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}
-
-        for epoch in range(epochs):
-            train_loss, train_acc = self.train_epoch(train_loader)
-            val_loss, val_acc = self.validate(val_loader)
-
-            history['train_loss'].append(train_loss)
-            history['train_acc'].append(train_acc)
-            history['val_loss'].append(val_loss)
-            history['val_acc'].append(val_acc)
-
-            print(f'Epoch {epoch+1}/{epochs}:')
-            print(f'  Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%')
-            print(f'  Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%')
-
-            # Learning rate scheduling
-            self.scheduler.step(val_loss)
-
-            # Early stopping
-            if val_acc > best_val_acc:
-                best_val_acc = val_acc
-                patience_counter = 0
-                # Save best model
-                torch.save(self.model.state_dict(), 'best_emotion_model.pth')
-                print(f'  ✅ New best model saved! (Val Acc: {val_acc:.2f}%)')
-            else:
-                patience_counter += 1
-
-            if patience_counter >= early_stop_patience:
-                print(f'\n⚠️ Early stopping triggered after {epoch+1} epochs')
-                break
-
-        print(f'\nBest validation accuracy: {best_val_acc:.2f}%')
-        return history
-
-
-# ============================================
-# 6. MAIN TRAINING FUNCTION
-# ============================================
-
-def train_emotion_model():
-    """Main function to train the emotion detection model"""
-
-    print("="*60)
-    print("AUDIO EMOTION & MENTAL HEALTH DETECTION MODEL")
-    print("="*60)
-
-    # Configuration
-    BATCH_SIZE = 32
-    EPOCHS = 50
-    LEARNING_RATE = 0.001
-
-    # Define dataset paths (modify these to your actual paths)
-    data_paths = {
-        'ravdess': './datasets/RAVDESS',
-        'tess': './datasets/TESS',
-        'cremad': './datasets/CREMA-D'
-    }
-
-    # 1. Load datasets
-    print("\nLoading datasets...")
-    dataset_loader = AudioDatasetLoader(data_paths)
-    df = dataset_loader.load_all_datasets()
-
-    # 2. Initialize feature extractor
-    print("\nInitializing feature extractor...")
-    feature_extractor = AudioFeatureExtractor(sr=16000, n_mfcc=40)
-
-    # 3. Create emotion mapping
-    emotion_map = {
-        'neutral': 0, 'calm': 1, 'happy': 2, 'sad': 3,
-        'angry': 4, 'fearful': 5, 'disgust': 6, 'surprised': 7
-    }
-    reverse_emotion_map = {v: k for k, v in emotion_map.items()}
-
-    # 4. Split data
-    print("\nSplitting data...")
-    train_df, val_df = train_test_split(df, test_size=0.2, random_state=42,
-                                        stratify=df['emotion'])
-
-    print(f"Training samples: {len(train_df)}")
-    print(f"Validation samples: {len(val_df)}")
-
-    # 5. Create datasets and dataloaders
-    print("\nCreating datasets...")
-    train_dataset = EmotionAudioDataset(train_df, feature_extractor, emotion_map)
-    val_dataset = EmotionAudioDataset(val_df, feature_extractor, emotion_map)
-
-    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
-                              shuffle=True, num_workers=0)
-    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE,
-                            shuffle=False, num_workers=0)
-
-    # 6. Get feature dimension
-    sample_features = train_dataset[0]['features']
-    input_dim = sample_features.shape[0]
-    print(f"Feature dimension: {input_dim}")
-
-    # 7. Initialize model
-    print("\nInitializing model...")
-    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-    print(f"Using device: {device}")
-
-    model = MultiTaskEmotionModel(
-        input_dim=input_dim,
-        num_emotions=len(emotion_map),
-        dropout=0.5
-    )
-
-    # 8. Train model
-    print("\nStarting training...")
-    trainer = EmotionModelTrainer(model, device, learning_rate=LEARNING_RATE)
-    history = trainer.train(train_loader, val_loader, epochs=EPOCHS,
-                            early_stop_patience=10)
-
-    # 9. Load best model
-    model.load_state_dict(torch.load('best_emotion_model.pth'))
-
-    # 10. Save complete pipeline
-    print("\nSaving complete pipeline...")
-
-    # Save model architecture and weights
-    torch.save({
-        'model_state_dict': model.state_dict(),
-        'input_dim': input_dim,
-        'num_emotions': len(emotion_map),
-        'emotion_map': emotion_map,
-        'reverse_emotion_map': reverse_emotion_map
-    }, 'emotion_model_complete.pth')
-
-    # Save feature extractor config
-    with open('feature_extractor_config.pkl', 'wb') as f:
-        pickle.dump({
-            'sr': feature_extractor.sr,
-            'n_mfcc': feature_extractor.n_mfcc
-        }, f)
-
-    print("✅ Model training complete!")
-    print(f"Files saved:")
-    print(f"  - best_emotion_model.pth")
-    print(f"  - emotion_model_complete.pth")
-    print(f"  - feature_extractor_config.pkl")
-
-    return model, feature_extractor, emotion_map, reverse_emotion_map, history
-
-
-# ============================================
-# 7. INFERENCE CLASS
 # ============================================

 class EmotionPredictor:
-    """Production

-    def __init__(self
-
-
-        # Load model configuration
-        checkpoint = torch.load(model_path, map_location='cpu')
-
-        self.emotion_map = checkpoint['emotion_map']
-        self.reverse_emotion_map = checkpoint['reverse_emotion_map']
-
-        # Load feature extractor config
-        with open(config_path, 'rb') as f:
-            fe_config = pickle.load(f)

-
-
-
-

-        # Initialize model
-
         self.model = MultiTaskEmotionModel(
-            input_dim=
-            num_emotions=
         )
-
         self.model.to(self.device)
         self.model.eval()

-    def
-        """

         # Extract features
-        feature_dict = self.
         features = torch.FloatTensor(feature_dict['features']).unsqueeze(0)
         features = features.to(self.device)

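The removed trainer above combines one cross-entropy term and three MSE terms with fixed weights (1.0 for emotion classification, 0.5 for each regression head). A minimal self-contained sketch of that weighting, using random stand-in tensors (batch of 4, 8 emotion classes) instead of real model outputs:

import torch
import torch.nn as nn

emotion_criterion = nn.CrossEntropyLoss()
regression_criterion = nn.MSELoss()

emotion_logits = torch.randn(4, 8)            # stand-in for outputs['emotion_logits']
emotion_labels = torch.randint(0, 8, (4,))
affect_pred, affect_true = torch.rand(4, 1), torch.rand(4, 1)
monotone_pred, monotone_true = torch.rand(4, 1), torch.rand(4, 1)
energy_pred, energy_true = torch.rand(4, 1), torch.rand(4, 1)

loss = (emotion_criterion(emotion_logits, emotion_labels) * 1.0 +
        regression_criterion(affect_pred, affect_true) * 0.5 +
        regression_criterion(monotone_pred, monotone_true) * 0.5 +
        regression_criterion(energy_pred, energy_true) * 0.5)
print(loss.item())                            # combined multi-task loss value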
@@ -811,18 +475,22 @@ class EmotionPredictor:
         with torch.no_grad():
             outputs = self.model(features)

-        #
         emotion_probs = F.softmax(outputs['emotion_logits'], dim=1)[0]
         emotion_idx = emotion_probs.argmax().item()
         emotion = self.reverse_emotion_map[emotion_idx]
         confidence = emotion_probs[emotion_idx].item()

-        # Get
         vocal_affect = outputs['vocal_affect'][0].item()
         monotone_score = outputs['monotone_score'][0].item()
         vocal_energy = outputs['vocal_energy'][0].item()

-        #
         results = {
             'emotion': emotion,
             'confidence': confidence,
@@ -835,9 +503,7 @@ class EmotionPredictor:
             'vocal_energy_score': vocal_energy,
             'pitch_variability': feature_dict['pitch_variability'],
             'energy_level': feature_dict['energy_level'],
-            'mental_health_indicators':
-                monotone_score, vocal_affect, vocal_energy
-            )
         }

         return results
@@ -846,150 +512,213 @@ class EmotionPredictor:
         """Interpret mental health indicators"""
         indicators = []

-        # Depression indicators
         if monotone > 0.7:
             indicators.append("⚠️ High monotone score - possible depression indicator")

-        # Anxiety indicators
         if affect > 0.7 and energy > 0.7:
-            indicators.append("⚠️ High vocal affect and energy - possible anxiety")

-        # Low energy/motivation
         if energy < 0.3:
             indicators.append("⚠️ Low vocal energy - possible low motivation/depression")

-        # Stress indicators
         if affect > 0.6 and monotone < 0.4:
-            indicators.append("⚠️ High vocal affect - possible stress")

         if not indicators:
-            indicators.append("

         return indicators


 # ============================================
-#
 # ============================================

-def
-    """Create Gradio interface

     def predict_emotion(audio):
         """Gradio prediction function"""
         if audio is None:
-            return

         try:
             results = predictor.predict(audio)

-            # Format output
-
-
-
             for emotion, prob in sorted(results['emotion_probabilities'].items(),
                                         key=lambda x: x[1], reverse=True):
-

-
-
-

-
-

-

-            return (emotion_output, affect_score, monotone_score,
-                    energy_score, pitch_var, mental_health)
-
         except Exception as e:
-

     # Create interface
-
-
-
-
-
-
-
-
-
-
-
-

-    return


 # ============================================
-#
 # ============================================

 if __name__ == "__main__":
-
-
-
-
-
-
-
-
-
-
-
-        model, feature_extractor, emotion_map, reverse_emotion_map, history = train_emotion_model()
-        print("\n✅ Training complete! You can now run inference or launch Gradio.")
-
-    elif args.mode == 'inference':
-        # Run inference on a single file
-        if args.audio is None:
-            print("❌ Please provide --audio argument")
-        else:
-            predictor = EmotionPredictor()
-            results = predictor.predict(args.audio)
-
-            print("\n" + "="*60)
-            print("PREDICTION RESULTS")
-            print("="*60)
-            print(f"\nEmotion: {results['emotion']} ({results['confidence']*100:.2f}%)")
-            print(f"\nScores:")
-            print(f"  Vocal Affect: {results['vocal_affect_score']:.3f}")
-            print(f"  Monotone: {results['monotone_speech_score']:.3f}")
-            print(f"  Vocal Energy: {results['vocal_energy_score']:.3f}")
-            print(f"\nMental Health Indicators:")
-            for indicator in results['mental_health_indicators']:
-                print(f"  {indicator}")
-
-    elif args.mode == 'gradio':
-        # Launch Gradio interface
-        predictor = EmotionPredictor()
-        interface = create_gradio_interface(predictor)
-        interface.launch(share=True)
+#!/usr/bin/env python3
+"""
+Audio Emotion & Mental Health Detection Model
+Optimized for Hugging Face Spaces Deployment
+"""

 import os
 import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 import gradio as gr
+from typing import Dict, Tuple
 import warnings
 warnings.filterwarnings('ignore')

+# Lightweight audio processing (no librosa dependency)
+try:
+    import librosa
+    LIBROSA_AVAILABLE = True
+except ImportError:
+    LIBROSA_AVAILABLE = False
+    print("⚠️ Librosa not available, using lightweight processing")

+import scipy.signal as signal
+from scipy.io import wavfile
+import scipy.fftpack as fft

 # ============================================
+# LIGHTWEIGHT AUDIO FEATURE EXTRACTOR
 # ============================================

+class LightweightAudioProcessor:
+    """Audio processing without heavy librosa dependency"""

     def __init__(self, sr=16000, n_mfcc=40):
         self.sr = sr
         self.n_mfcc = n_mfcc

+    def load_audio(self, audio_path):
+        """Load audio file"""
         try:
+            if LIBROSA_AVAILABLE:
+                y, sr = librosa.load(audio_path, sr=self.sr, duration=3)
+            else:
+                # Fallback: use scipy
+                sr, y = wavfile.read(audio_path)
+                if len(y.shape) > 1:
+                    y = y.mean(axis=1)  # Convert to mono
+                y = y.astype(np.float32) / np.max(np.abs(y))  # Normalize
+
+                # Resample if needed
+                if sr != self.sr:
+                    num_samples = int(len(y) * self.sr / sr)
+                    y = signal.resample(y, num_samples)
+
+                # Limit duration to 3 seconds
+                max_len = 3 * self.sr
+                if len(y) > max_len:
+                    y = y[:max_len]

+            return y, self.sr
+        except Exception as e:
+            print(f"Error loading audio: {e}")
+            return np.random.randn(self.sr * 3), self.sr
+
+    def extract_mfcc_features(self, y):
+        """Extract MFCC features using lightweight method"""
+        if LIBROSA_AVAILABLE:
+            mfccs = librosa.feature.mfcc(y=y, sr=self.sr, n_mfcc=self.n_mfcc)
+        else:
+            # Simplified MFCC calculation
+            # Apply pre-emphasis
+            emphasized = np.append(y[0], y[1:] - 0.97 * y[:-1])
+
+            # Frame the signal
+            frame_size = int(0.025 * self.sr)
+            frame_stride = int(0.01 * self.sr)
+            frames = self._frame_signal(emphasized, frame_size, frame_stride)
+
+            # Apply FFT
+            mag_frames = np.absolute(np.fft.rfft(frames, frame_size))
+            pow_frames = ((1.0 / frame_size) * (mag_frames ** 2))
+
+            # Mel filter banks (simplified)
+            mel_filters = self._create_mel_filters(26, frame_size, self.sr)
+            filter_banks = np.dot(pow_frames, mel_filters.T)
+            filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks)
+            filter_banks = 20 * np.log10(filter_banks)
+
+            # DCT to get MFCCs
+            mfccs = fft.dct(filter_banks, type=2, axis=1, norm='ortho')[:, :self.n_mfcc].T
+
+        return mfccs
+
+    def _frame_signal(self, signal, frame_size, frame_stride):
+        """Frame a signal into overlapping frames"""
+        signal_length = len(signal)
+        num_frames = int(np.ceil(float(np.abs(signal_length - frame_size)) / frame_stride))
+
+        pad_signal_length = num_frames * frame_stride + frame_size
+        z = np.zeros((pad_signal_length - signal_length))
+        padded = np.append(signal, z)
+
+        indices = np.tile(np.arange(0, frame_size), (num_frames, 1)) + \
+                  np.tile(np.arange(0, num_frames * frame_stride, frame_stride), (frame_size, 1)).T
+        frames = padded[indices.astype(np.int32, copy=False)]
+
+        # Apply Hamming window
+        frames *= np.hamming(frame_size)
+        return frames
+
+    def _create_mel_filters(self, num_filters, fft_size, sample_rate):
+        """Create Mel filter banks"""
+        low_freq_mel = 0
+        high_freq_mel = 2595 * np.log10(1 + (sample_rate / 2) / 700)
+        mel_points = np.linspace(low_freq_mel, high_freq_mel, num_filters + 2)
+        hz_points = 700 * (10**(mel_points / 2595) - 1)
+        bin_points = np.floor((fft_size + 1) * hz_points / sample_rate)
+
+        fbank = np.zeros((num_filters, int(np.floor(fft_size / 2 + 1))))
+        for m in range(1, num_filters + 1):
+            f_m_minus = int(bin_points[m - 1])
+            f_m = int(bin_points[m])
+            f_m_plus = int(bin_points[m + 1])
+
+            for k in range(f_m_minus, f_m):
+                fbank[m - 1, k] = (k - bin_points[m - 1]) / (bin_points[m] - bin_points[m - 1])
+            for k in range(f_m, f_m_plus):
+                fbank[m - 1, k] = (bin_points[m + 1] - k) / (bin_points[m + 1] - bin_points[m])
+
+        return fbank
+
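The filter-bank construction above places triangular filters at points equally spaced on the mel scale, using mel = 2595 * log10(1 + f / 700) and the inverse f = 700 * (10 ** (mel / 2595) - 1). A quick numeric check of the round trip (the scale is anchored so that 1 kHz maps to roughly 1000 mel):

import numpy as np

f = 1000.0
mel = 2595 * np.log10(1 + f / 700)          # ~1000.0 mel
f_back = 700 * (10 ** (mel / 2595) - 1)     # recovers ~1000.0 Hz
print(round(mel, 1), round(f_back, 1))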
+    def extract_pitch(self, y):
+        """Extract pitch features"""
+        if LIBROSA_AVAILABLE:
+            pitches, magnitudes = librosa.piptrack(y=y, sr=self.sr)
             pitch_values = []
             for t in range(pitches.shape[1]):
                 index = magnitudes[:, t].argmax()
                 pitch = pitches[index, t]
                 if pitch > 0:
                     pitch_values.append(pitch)
+        else:
+            # Simple autocorrelation-based pitch detection
+            pitch_values = []
+            frame_length = int(0.025 * self.sr)
+            hop_length = int(0.01 * self.sr)
+
+            for i in range(0, len(y) - frame_length, hop_length):
+                frame = y[i:i+frame_length]
+                autocorr = np.correlate(frame, frame, mode='full')
+                autocorr = autocorr[len(autocorr)//2:]
+
+                # Find peaks
+                peaks = signal.find_peaks(autocorr)[0]
+                if len(peaks) > 0:
+                    pitch = self.sr / peaks[0] if peaks[0] > 0 else 0
+                    if 50 < pitch < 400:  # Valid pitch range
+                        pitch_values.append(pitch)
+
+        return pitch_values if pitch_values else [0]
+
+    def extract_energy(self, y):
+        """Extract energy features"""
+        if LIBROSA_AVAILABLE:
+            rms = librosa.feature.rms(y=y)[0]
+        else:
+            frame_length = int(0.025 * self.sr)
+            hop_length = int(0.01 * self.sr)
+            rms = []
+
+            for i in range(0, len(y) - frame_length, hop_length):
+                frame = y[i:i+frame_length]
+                rms.append(np.sqrt(np.mean(frame**2)))
+
+            rms = np.array(rms)
+
+        return rms
+
+    def extract_zcr(self, y):
+        """Extract zero crossing rate"""
+        if LIBROSA_AVAILABLE:
+            zcr = librosa.feature.zero_crossing_rate(y)[0]
+        else:
+            zcr = []
+            frame_length = int(0.025 * self.sr)
+            hop_length = int(0.01 * self.sr)
+
+            for i in range(0, len(y) - frame_length, hop_length):
+                frame = y[i:i+frame_length]
+                zero_crossings = np.sum(np.abs(np.diff(np.sign(frame)))) / 2
+                zcr.append(zero_crossings / frame_length)
+
+            zcr = np.array(zcr)
+
+        return zcr
+
+    def extract_spectral_features(self, y):
+        """Extract spectral features"""
+        # Compute FFT
+        fft_spectrum = np.fft.rfft(y)
+        magnitude = np.abs(fft_spectrum)
+        freq = np.fft.rfftfreq(len(y), 1.0/self.sr)
+
+        # Spectral centroid
+        spectral_centroid = np.sum(freq * magnitude) / np.sum(magnitude)
+
+        # Spectral rolloff (85% of energy)
+        cumsum = np.cumsum(magnitude)
+        rolloff_idx = np.where(cumsum >= 0.85 * cumsum[-1])[0]
+        spectral_rolloff = freq[rolloff_idx[0]] if len(rolloff_idx) > 0 else 0
+
+        # Spectral bandwidth
+        deviation = freq - spectral_centroid
+        spectral_bandwidth = np.sqrt(np.sum((deviation**2) * magnitude) / np.sum(magnitude))
+
+        return spectral_centroid, spectral_rolloff, spectral_bandwidth
+
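The spectral centroid computed above is the magnitude-weighted mean frequency of the spectrum, so for a pure tone it should land near the tone frequency. A small self-contained check (440 Hz is an arbitrary test frequency):

import numpy as np

sr = 16000
t = np.arange(sr) / sr
y = np.sin(2 * np.pi * 440 * t)             # one second of a 440 Hz tone
mag = np.abs(np.fft.rfft(y))
freq = np.fft.rfftfreq(len(y), 1.0 / sr)
centroid = np.sum(freq * mag) / np.sum(mag)
print(round(centroid, 1))                   # close to 440.0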
+    def estimate_tempo(self, y):
+        """Estimate tempo"""
+        if LIBROSA_AVAILABLE:
+            tempo, _ = librosa.beat.beat_track(y=y, sr=self.sr)
+            return tempo
+        else:
+            # Simplified tempo estimation
+            onset_env = self.extract_energy(y)
+            autocorr = np.correlate(onset_env, onset_env, mode='full')
+            autocorr = autocorr[len(autocorr)//2:]
+
+            # Find tempo peaks
+            peaks = signal.find_peaks(autocorr)[0]
+            if len(peaks) > 0:
+                tempo = 60.0 / (peaks[0] * 0.01) if peaks[0] > 0 else 120
+                return np.clip(tempo, 60, 180)
+            return 120
+
+    def extract_all_features(self, audio_path):
+        """Extract comprehensive features from audio"""
+        try:
+            # Load audio
+            y, sr = self.load_audio(audio_path)

+            # 1. MFCCs
+            mfccs = self.extract_mfcc_features(y)
+            mfcc_mean = np.mean(mfccs, axis=1)
+            mfcc_std = np.std(mfccs, axis=1)

+            # 2. Pitch features
+            pitch_values = self.extract_pitch(y)
+            pitch_mean = np.mean(pitch_values)
+            pitch_std = np.std(pitch_values)
+            pitch_min = np.min(pitch_values)
+            pitch_max = np.max(pitch_values)
             monotone_score = 1 / (1 + pitch_std) if pitch_std > 0 else 1.0

             # 3. Energy features
+            rms = self.extract_energy(y)
             energy_mean = np.mean(rms)
             energy_std = np.std(rms)
             energy_max = np.max(rms)

+            # 4. Zero Crossing Rate
+            zcr = self.extract_zcr(y)
             zcr_mean = np.mean(zcr)
             zcr_std = np.std(zcr)

             # 5. Spectral features
+            spectral_centroid, spectral_rolloff, spectral_bandwidth = \
+                self.extract_spectral_features(y)

+            # 6. Chroma (simplified)
+            chroma_mean = 0.5  # Placeholder

             # 7. Tempo
+            tempo = self.estimate_tempo(y)

+            # Combine features
             features = np.concatenate([
                 mfcc_mean,
                 mfcc_std,
...
             )

             return {
+                'features': features.astype(np.float32),
+                'vocal_affect_score': float(vocal_affect_score),
+                'monotone_score': float(monotone_score),
+                'vocal_energy_score': float(vocal_energy_score),
+                'pitch_variability': float(pitch_std),
+                'energy_level': float(energy_mean)
             }

         except Exception as e:
+            print(f"Error extracting features: {e}")
+            # Return default features
+            return self._get_default_features()

     def _calculate_vocal_affect(self, pitch_std, energy_std, spectral_centroid):
+        """Calculate emotional intensity"""
         pitch_component = min(pitch_std / 100, 1.0)
         energy_component = min(energy_std / 0.5, 1.0)
         spectral_component = min(spectral_centroid / 3000, 1.0)
...
                         energy_component * 0.4 +
                         spectral_component * 0.2)

+        return np.clip(affect_score, 0, 1)

     def _calculate_vocal_energy(self, energy_mean, tempo, zcr_mean):
+        """Calculate vocal energy/activation"""
         energy_component = min(energy_mean / 1.0, 1.0)
         tempo_component = min(tempo / 180, 1.0)
         zcr_component = min(zcr_mean / 0.3, 1.0)
...
                        tempo_component * 0.3 +
                        zcr_component * 0.2)

+        return np.clip(energy_score, 0, 1)

+    def _get_default_features(self):
+        """Return default features for error cases"""
+        n_features = self.n_mfcc * 2 + 18
         return {
+            'features': np.random.randn(n_features).astype(np.float32),
+            'vocal_affect_score': 0.5,
+            'monotone_score': 0.5,
+            'vocal_energy_score': 0.5,
+            'pitch_variability': 50.0,
+            'energy_level': 0.5
         }


 # ============================================
+# NEURAL NETWORK MODEL
 # ============================================

 class MultiTaskEmotionModel(nn.Module):
+    """Multi-task emotion and mental health detection model"""

+    def __init__(self, input_dim, num_emotions=8, dropout=0.5):
         super(MultiTaskEmotionModel, self).__init__()

+        # Shared layers
         self.shared_layers = nn.Sequential(
             nn.Linear(input_dim, 512),
             nn.BatchNorm1d(512),
...
             nn.Dropout(dropout/2)
         )

+        # Emotion classification head
         self.emotion_head = nn.Sequential(
             nn.Linear(128, 64),
             nn.ReLU(),
...
             nn.Linear(64, num_emotions)
         )

+        # Regression heads
         self.affect_head = nn.Sequential(
             nn.Linear(128, 32),
             nn.ReLU(),
...
             nn.Sigmoid()
         )

         self.monotone_head = nn.Sequential(
             nn.Linear(128, 32),
             nn.ReLU(),
...
             nn.Sigmoid()
         )

         self.energy_head = nn.Sequential(
             nn.Linear(128, 32),
             nn.ReLU(),
...
         )

     def forward(self, x):
+        shared = self.shared_layers(x)

         return {
+            'emotion_logits': self.emotion_head(shared),
+            'vocal_affect': self.affect_head(shared),
+            'monotone_score': self.monotone_head(shared),
+            'vocal_energy': self.energy_head(shared)
         }


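A minimal shape check for the model, assuming the full class definition is available (parts of the layer stack are not shown in this diff, so the regression-head output sizes depend on the elided final layers); input_dim=98 matches what EmotionPredictor uses below:

import torch

model = MultiTaskEmotionModel(input_dim=98, num_emotions=8, dropout=0.3)
model.eval()                                  # eval mode avoids BatchNorm issues with tiny batches
with torch.no_grad():
    outputs = model(torch.randn(4, 98))
for name, tensor in outputs.items():
    print(name, tuple(tensor.shape))          # e.g. emotion_logits -> (4, 8)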
 # ============================================
+# PREDICTOR CLASS
 # ============================================

 class EmotionPredictor:
+    """Production inference class"""

+    def __init__(self):
+        self.processor = LightweightAudioProcessor(sr=16000, n_mfcc=40)
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

+        # Emotion mapping
+        self.emotion_map = {
+            'neutral': 0, 'calm': 1, 'happy': 2, 'sad': 3,
+            'angry': 4, 'fearful': 5, 'disgust': 6, 'surprised': 7
+        }
+        self.reverse_emotion_map = {v: k for k, v in self.emotion_map.items()}

+        # Initialize model with pre-trained weights
+        input_dim = 98  # 40*2 (MFCC mean+std) + 18 other features
         self.model = MultiTaskEmotionModel(
+            input_dim=input_dim,
+            num_emotions=len(self.emotion_map),
+            dropout=0.3
         )
+
+        # Load pre-trained weights if available, otherwise use initialized weights
+        self._load_or_initialize_model()
+
         self.model.to(self.device)
         self.model.eval()

+    def _load_or_initialize_model(self):
+        """Load pre-trained model or use initialized weights"""
+        model_path = 'emotion_model.pth'
+
+        if os.path.exists(model_path):
+            try:
+                checkpoint = torch.load(model_path, map_location='cpu')
+                self.model.load_state_dict(checkpoint)
+                print("✅ Loaded pre-trained model")
+            except Exception as e:
+                print(f"⚠️ Could not load model: {e}")
+                print("Using initialized weights (demo mode)")
+        else:
+            print("ℹ️ No pre-trained model found. Using initialized weights (demo mode)")
+            # In demo mode, the model will still work but predictions will be less accurate

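The loader above only looks for a state_dict saved as emotion_model.pth in the Space's working directory; anything else falls back to demo mode. A sketch of producing a compatible file from a model trained elsewhere (the training step itself is out of scope here and left as a comment):

import torch

model = MultiTaskEmotionModel(input_dim=98, num_emotions=8, dropout=0.3)
# ... train the model offline ...
torch.save(model.state_dict(), 'emotion_model.pth')   # filename expected by _load_or_initialize_model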
+
def predict(self, audio_path: str) -> Dict:
|
| 467 |
+
"""Predict emotion and mental health indicators"""
|
| 468 |
|
| 469 |
# Extract features
|
| 470 |
+
feature_dict = self.processor.extract_all_features(audio_path)
|
| 471 |
features = torch.FloatTensor(feature_dict['features']).unsqueeze(0)
|
| 472 |
features = features.to(self.device)
|
| 473 |
|
|
|
|
| 475 |
with torch.no_grad():
|
| 476 |
outputs = self.model(features)
|
| 477 |
|
| 478 |
+
# Process outputs
|
| 479 |
emotion_probs = F.softmax(outputs['emotion_logits'], dim=1)[0]
|
| 480 |
emotion_idx = emotion_probs.argmax().item()
|
| 481 |
emotion = self.reverse_emotion_map[emotion_idx]
|
| 482 |
confidence = emotion_probs[emotion_idx].item()
|
| 483 |
|
| 484 |
+
# Get all scores
|
| 485 |
vocal_affect = outputs['vocal_affect'][0].item()
|
| 486 |
monotone_score = outputs['monotone_score'][0].item()
|
| 487 |
vocal_energy = outputs['vocal_energy'][0].item()
|
| 488 |
|
| 489 |
+
# Mental health interpretation
|
| 490 |
+
mental_health_indicators = self._interpret_mental_health(
|
| 491 |
+
monotone_score, vocal_affect, vocal_energy
|
| 492 |
+
)
|
| 493 |
+
|
| 494 |
results = {
|
| 495 |
'emotion': emotion,
|
| 496 |
'confidence': confidence,
|
|
|
|
| 503 |
'vocal_energy_score': vocal_energy,
|
| 504 |
'pitch_variability': feature_dict['pitch_variability'],
|
| 505 |
'energy_level': feature_dict['energy_level'],
|
| 506 |
+
'mental_health_indicators': mental_health_indicators
|
|
|
|
|
|
|
| 507 |
}
|
| 508 |
|
| 509 |
return results
|
|
|
|
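Illustrative use of the predictor outside Gradio (the audio file name is a placeholder; predict() returns a plain dict whose keys are shown in the method above):

predictor = EmotionPredictor()
results = predictor.predict("sample.wav")
print(results['emotion'], f"{results['confidence'] * 100:.1f}%")
for line in results['mental_health_indicators']:
    print(line)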
| 512 |
"""Interpret mental health indicators"""
|
| 513 |
indicators = []
|
| 514 |
|
|
|
|
| 515 |
if monotone > 0.7:
|
| 516 |
indicators.append("β οΈ High monotone score - possible depression indicator")
|
| 517 |
|
|
|
|
| 518 |
if affect > 0.7 and energy > 0.7:
|
| 519 |
+
indicators.append("β οΈ High vocal affect and energy - possible anxiety/stress")
|
| 520 |
|
|
|
|
| 521 |
if energy < 0.3:
|
| 522 |
indicators.append("β οΈ Low vocal energy - possible low motivation/depression")
|
| 523 |
|
|
|
|
| 524 |
if affect > 0.6 and monotone < 0.4:
|
| 525 |
+
indicators.append("β οΈ High vocal affect - possible emotional stress")
|
| 526 |
+
|
| 527 |
+
if 0.4 <= monotone <= 0.6 and 0.4 <= affect <= 0.6 and 0.4 <= energy <= 0.6:
|
| 528 |
+
indicators.append("β
Balanced vocal characteristics - no significant concerns")
|
| 529 |
|
| 530 |
if not indicators:
|
| 531 |
+
indicators.append("βΉοΈ Vocal patterns within normal range")
|
| 532 |
|
| 533 |
return indicators
|
| 534 |
|
| 535 |
|
| 536 |
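
    # Worked example of the thresholds above (hypothetical scores): monotone=0.8,
    # affect=0.5, energy=0.5 triggers only the high-monotone warning, while
    # monotone=affect=energy=0.5 yields only the "Balanced vocal characteristics" message.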

# ============================================
# GRADIO INTERFACE
# ============================================

def create_gradio_app():
    """Create Gradio interface"""

    # Initialize predictor
    print("Initializing emotion predictor...")
    predictor = EmotionPredictor()
    print("✅ Predictor ready!")

    def predict_emotion(audio):
        """Gradio prediction function"""
        if audio is None:
            return {
                emotion_output: "❌ Please upload an audio file",
                affect_output: "",
                monotone_output: "",
                energy_output: "",
                pitch_output: "",
                mental_health_output: ""
            }

        try:
            # Run prediction
            results = predictor.predict(audio)

            # Format emotion output
            emotion_text = f"## 😊 Detected Emotion: **{results['emotion'].upper()}**\n\n"
            emotion_text += f"**Confidence:** {results['confidence']*100:.1f}%\n\n"
            emotion_text += "### All Emotion Probabilities:\n"

            for emotion, prob in sorted(results['emotion_probabilities'].items(),
                                        key=lambda x: x[1], reverse=True):
                bar_length = int(prob * 20)
                bar = "█" * bar_length + "░" * (20 - bar_length)
                emotion_text += f"**{emotion.capitalize()}:** {bar} {prob*100:.1f}%\n"

            # Format scores
            affect_text = f"**{results['vocal_affect_score']:.3f}**\n\n"
            if results['vocal_affect_score'] > 0.7:
                affect_text += "🔴 High emotional intensity detected"
            elif results['vocal_affect_score'] < 0.3:
                affect_text += "🟢 Low emotional intensity"
            else:
                affect_text += "🟡 Moderate emotional intensity"

            monotone_text = f"**{results['monotone_speech_score']:.3f}**\n\n"
            if results['monotone_speech_score'] > 0.7:
                monotone_text += "🔴 Very flat speech pattern"
            elif results['monotone_speech_score'] < 0.3:
                monotone_text += "🟢 Varied pitch pattern"
            else:
                monotone_text += "🟡 Moderate pitch variation"

            energy_text = f"**{results['vocal_energy_score']:.3f}**\n\n"
            if results['vocal_energy_score'] > 0.7:
                energy_text += "🔴 High vocal energy"
            elif results['vocal_energy_score'] < 0.3:
                energy_text += "🔴 Low vocal energy"
            else:
                energy_text += "🟢 Normal vocal energy"

            pitch_text = f"**Variability:** {results['pitch_variability']:.2f} Hz\n"
            pitch_text += f"**Energy Level:** {results['energy_level']:.3f}"

            mental_health_text = "\n".join(results['mental_health_indicators'])

            return {
                emotion_output: emotion_text,
                affect_output: affect_text,
                monotone_output: monotone_text,
                energy_output: energy_text,
                pitch_output: pitch_text,
                mental_health_output: mental_health_text
            }

        except Exception as e:
            error_msg = f"❌ Error processing audio: {str(e)}"
            return {
                emotion_output: error_msg,
                affect_output: "",
                monotone_output: "",
                energy_output: "",
                pitch_output: "",
                mental_health_output: ""
            }
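
    # For a hypothetical emotion probability of 0.65, bar_length = int(0.65 * 20) = 13,
    # so the probability bar above renders as 13 filled and 7 empty blocks.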

    # Create interface
    with gr.Blocks(theme=gr.themes.Soft(), title="Audio Emotion Detection") as demo:

        gr.Markdown("""
        # 🎙️ Audio Emotion & Mental Health Detection

        Upload an audio file to analyze emotional state and mental health indicators.

        **Features:**
        - 😊 Emotion Recognition (8 emotions)
        - 📊 Vocal Affect Score (emotional intensity)
        - 📉 Monotone Speech Detection (depression indicator)
        - ⚡ Vocal Energy Analysis (mood disorder indicator)
        """)

        with gr.Row():
            with gr.Column(scale=1):
                audio_input = gr.Audio(
                    type="filepath",
                    label="Upload Audio File (WAV, MP3, etc.)"
                )

                analyze_btn = gr.Button("🔍 Analyze Audio", variant="primary", size="lg")

                gr.Markdown("""
                ### 📋 Instructions:
                1. Upload an audio file (WAV, MP3, etc.)
                2. Click "Analyze Audio"
                3. View results on the right

                **Note:** Works best with clear speech recordings (3-10 seconds)
                """)

            with gr.Column(scale=2):
                emotion_output = gr.Markdown(label="Emotion Detection")

                with gr.Row():
                    with gr.Column():
                        affect_output = gr.Markdown(label="Vocal Affect Score")
                    with gr.Column():
                        monotone_output = gr.Markdown(label="Monotone Score")
                    with gr.Column():
                        energy_output = gr.Markdown(label="Vocal Energy")

                pitch_output = gr.Markdown(label="Technical Details")
                mental_health_output = gr.Markdown(label="Mental Health Indicators")

        gr.Markdown("""
        ---
        ### 📊 Interpretation Guide

        | Metric | Range | Interpretation |
        |--------|-------|----------------|
        | **Vocal Affect** | 0.0-0.3 | Low emotional intensity (calm/neutral) |
        | | 0.3-0.7 | Moderate emotional intensity |
        | | 0.7-1.0 | High emotional intensity (stress/anxiety) |
        | **Monotone Score** | 0.0-0.3 | High pitch variation (normal) |
        | | 0.3-0.7 | Moderate pitch variation |
        | | 0.7-1.0 | Very flat speech (possible depression) |
        | **Vocal Energy** | 0.0-0.3 | Low energy (possible low motivation) |
        | | 0.3-0.7 | Normal energy level |
        | | 0.7-1.0 | High energy (possible anxiety/mania) |

        ---

        **⚠️ Disclaimer:** This tool is for research and informational purposes only.
        It should not be used as a substitute for professional medical or psychological diagnosis.
        Always consult qualified healthcare professionals for mental health concerns.

        **🔬 Model Info:** Multi-task Deep Neural Network trained on emotional speech datasets (RAVDESS, TESS, CREMA-D)
        """)

        # Connect button to function
        analyze_btn.click(
            fn=predict_emotion,
            inputs=audio_input,
            outputs=[emotion_output, affect_output, monotone_output,
                     energy_output, pitch_output, mental_health_output]
        )

    return demo


# ============================================
# MAIN EXECUTION
# ============================================

if __name__ == "__main__":
    print("=" * 60)
    print("🎙️ Audio Emotion & Mental Health Detection")
    print("=" * 60)
    print("\nStarting Gradio interface...")

    # Create and launch app
    app = create_gradio_app()
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False
    )
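
    # For a quick local test with a temporary public link, one could instead launch with
    # Gradio's optional share flag (not used by this app as written):
    #
    #     app.launch(share=True)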