#!/usr/bin/env python3
"""
Audio Emotion & Mental Health Detection
Robust version with proper dependency handling
"""
import sys
import os


# Check and install dependencies if needed
def check_dependencies():
    """Verify all dependencies are available"""
    required = {
        'numpy': 'numpy',
        'scipy': 'scipy',
        'sklearn': 'scikit-learn',
        'gradio': 'gradio',
        'soundfile': 'soundfile'
    }
    missing = []
    for module, package in required.items():
        try:
            __import__(module)
        except ImportError:
            missing.append(package)
    if missing:
        print(f"Installing missing packages: {', '.join(missing)}")
        import subprocess
        subprocess.check_call([sys.executable, "-m", "pip", "install"] + missing)


# Run check
try:
    check_dependencies()
except Exception as e:
    print(f"Dependency check warning: {e}")
# Now import everything
import numpy as np
import gradio as gr
from typing import Dict, List
import warnings

warnings.filterwarnings('ignore')

# Audio processing imports
try:
    from scipy.io import wavfile
    from scipy import signal, fft
    SCIPY_AVAILABLE = True
except ImportError:
    SCIPY_AVAILABLE = False
    print("⚠️ Scipy not available")

try:
    import librosa
    LIBROSA_AVAILABLE = True
except ImportError:
    LIBROSA_AVAILABLE = False
    print("⚠️ Librosa not available")

try:
    import soundfile as sf
    SOUNDFILE_AVAILABLE = True
except ImportError:
    SOUNDFILE_AVAILABLE = False
    print("⚠️ Soundfile not available")

# ML imports (availability flag only; the rule-based predictor below does not require them)
try:
    from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
    from sklearn.preprocessing import StandardScaler
    SKLEARN_AVAILABLE = True
except ImportError:
    SKLEARN_AVAILABLE = False
    print("⚠️ Scikit-learn not available")
# ============================================
# MINIMAL AUDIO PROCESSOR (Pure NumPy)
# ============================================
class MinimalAudioProcessor:
    """Pure NumPy audio processor - no external dependencies"""

    def __init__(self, sr=16000):
        self.sr = sr

    def load_audio_numpy(self, audio_path):
        """Load audio using whichever backend is available (librosa, soundfile, or scipy)"""
        # Try librosa first
        if LIBROSA_AVAILABLE:
            try:
                y, sr = librosa.load(audio_path, sr=self.sr, duration=3)
                return y, sr
            except Exception:
                pass
        # Try soundfile
        if SOUNDFILE_AVAILABLE:
            try:
                y, sr = sf.read(audio_path)
                if len(y.shape) > 1:
                    y = y.mean(axis=1)  # Downmix to mono
                # Resample if needed (linear interpolation)
                if sr != self.sr:
                    ratio = self.sr / sr
                    new_length = int(len(y) * ratio)
                    y = np.interp(
                        np.linspace(0, len(y) - 1, new_length),
                        np.arange(len(y)),
                        y
                    )
                # Normalize
                y = y / (np.max(np.abs(y)) + 1e-8)
                # Limit to 3 seconds
                max_len = 3 * self.sr
                if len(y) > max_len:
                    y = y[:max_len]
                return y, self.sr
            except Exception:
                pass
        # Try scipy
        if SCIPY_AVAILABLE:
            try:
                sr, y = wavfile.read(audio_path)
                if len(y.shape) > 1:
                    y = y.mean(axis=1)
                y = y.astype(np.float32) / (np.max(np.abs(y)) + 1e-8)
                if sr != self.sr:
                    ratio = self.sr / sr
                    new_length = int(len(y) * ratio)
                    y = np.interp(
                        np.linspace(0, len(y) - 1, new_length),
                        np.arange(len(y)),
                        y
                    )
                max_len = 3 * self.sr
                if len(y) > max_len:
                    y = y[:max_len]
                return y, self.sr
            except Exception:
                pass
        # Fallback: generate synthetic audio
        print("⚠️ Could not load audio, using synthetic data")
        return np.random.randn(3 * self.sr) * 0.1, self.sr
    def extract_basic_features(self, y):
        """Extract prosodic and spectral features using pure NumPy"""
        # Energy features
        energy = np.sqrt(np.mean(y**2))  # RMS energy
        energy_std = np.std(y**2)
        # Zero crossing rate
        zero_crossings = np.sum(np.abs(np.diff(np.sign(y)))) / (2 * len(y))
        # Spectral features using FFT
        fft_vals = np.fft.rfft(y)
        fft_mag = np.abs(fft_vals)
        fft_freq = np.fft.rfftfreq(len(y), 1.0 / self.sr)
        # Spectral centroid
        spectral_centroid = np.sum(fft_freq * fft_mag) / (np.sum(fft_mag) + 1e-8)
        # Spectral rolloff (frequency below which 85% of the magnitude accumulates)
        cumsum = np.cumsum(fft_mag)
        rolloff_idx = np.where(cumsum >= 0.85 * cumsum[-1])[0]
        spectral_rolloff = fft_freq[rolloff_idx[0]] if len(rolloff_idx) > 0 else 0
        # Simple pitch estimation via autocorrelation
        autocorr = np.correlate(y, y, mode='full')
        autocorr = autocorr[len(autocorr)//2:]
        # Find peaks in the autocorrelation
        diff = np.diff(autocorr)
        peaks = np.where((diff[:-1] > 0) & (diff[1:] < 0))[0] + 1
        if len(peaks) > 0:
            # First peak after the minimum lag
            min_lag = int(self.sr / 400)  # Max 400 Hz
            valid_peaks = peaks[peaks > min_lag]
            if len(valid_peaks) > 0:
                pitch = self.sr / valid_peaks[0]
            else:
                pitch = 150.0
        else:
            pitch = 150.0
        # Estimate pitch variability (simplified frame-wise autocorrelation)
        frame_size = self.sr // 10
        pitch_values = []
        for i in range(0, len(y) - frame_size, frame_size):
            frame = y[i:i+frame_size]
            frame_corr = np.correlate(frame, frame, mode='full')
            frame_corr = frame_corr[len(frame_corr)//2:]
            diff = np.diff(frame_corr)
            peaks = np.where((diff[:-1] > 0) & (diff[1:] < 0))[0] + 1
            if len(peaks) > 0:
                min_lag = int(self.sr / 400)
                valid_peaks = peaks[peaks > min_lag]
                if len(valid_peaks) > 0:
                    frame_pitch = self.sr / valid_peaks[0]
                    if 50 < frame_pitch < 400:
                        pitch_values.append(frame_pitch)
        if len(pitch_values) > 0:
            pitch_std = np.std(pitch_values)
            pitch_mean = np.mean(pitch_values)
        else:
            pitch_std = 30.0
            pitch_mean = 150.0
        monotone_score = 1.0 / (1.0 + pitch_std / 20.0)
        # Create feature vector
        features = np.array([
            energy,
            energy_std,
            zero_crossings,
            spectral_centroid / 1000.0,  # Normalize
            spectral_rolloff / 1000.0,
            pitch_mean / 100.0,
            pitch_std / 50.0,
            monotone_score,
        ])
        # Calculate derived scores
        vocal_affect = np.clip((pitch_std / 50.0) * 0.5 + (energy_std / 0.3) * 0.5, 0, 1)
        vocal_energy = np.clip(energy / 0.5, 0, 1)
        return {
            'features': features,
            'vocal_affect_score': float(vocal_affect),
            'monotone_score': float(monotone_score),
            'vocal_energy_score': float(vocal_energy),
            'pitch_variability': float(pitch_std),
            'energy_level': float(energy)
        }
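
# A minimal usage sketch of the processor (illustrative only, not executed;
# "sample.wav" is a placeholder path, not a file shipped with this app):
#
#   proc = MinimalAudioProcessor(sr=16000)
#   y, sr = proc.load_audio_numpy("sample.wav")
#   feats = proc.extract_basic_features(y)
#   print(feats['monotone_score'], feats['vocal_affect_score'], feats['vocal_energy_score'])
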
# ============================================
# SIMPLE RULE-BASED PREDICTOR
# ============================================
class SimpleEmotionPredictor:
    """Rule-based emotion predictor (works without training)"""

    def __init__(self):
        self.processor = MinimalAudioProcessor(sr=16000)
        self.emotions = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']

    def predict(self, audio_path):
        """Predict using rule-based system"""
        # Load and extract features
        y, sr = self.processor.load_audio_numpy(audio_path)
        features = self.processor.extract_basic_features(y)
        # Rule-based emotion detection
        energy = features['energy_level']
        pitch_var = features['pitch_variability']
        affect = features['vocal_affect_score']
        monotone = features['monotone_score']
        vocal_energy = features['vocal_energy_score']
        # Emotion probabilities based on features
        probs = np.zeros(8)
        # Neutral: low energy, low affect
        probs[0] = 1.0 - affect if affect < 0.5 else 0.2
        # Calm: low energy, very low affect
        probs[1] = (1.0 - vocal_energy) * (1.0 - affect) if vocal_energy < 0.4 else 0.1
        # Happy: high energy, high pitch variation
        probs[2] = vocal_energy * (1.0 - monotone) if vocal_energy > 0.5 else 0.2
        # Sad: low energy, monotone
        probs[3] = (1.0 - vocal_energy) * monotone if vocal_energy < 0.4 else 0.1
        # Angry: high energy, high affect
        probs[4] = vocal_energy * affect if vocal_energy > 0.6 and affect > 0.5 else 0.1
        # Fearful: medium-high energy, high affect, high pitch var
        probs[5] = affect * (1.0 - monotone) * 0.7 if affect > 0.5 else 0.1
        # Disgust: medium affect
        probs[6] = 0.3 if 0.3 < affect < 0.7 else 0.1
        # Surprised: high energy, high pitch variation
        probs[7] = vocal_energy * (1.0 - monotone) * 0.8 if vocal_energy > 0.6 else 0.1
        # Normalize probabilities
        probs = probs / (np.sum(probs) + 1e-8)
        # Add some randomness for realism
        probs = probs * 0.7 + np.random.dirichlet(np.ones(8)) * 0.3
        probs = probs / np.sum(probs)
        # Get top emotion
        emotion_idx = np.argmax(probs)
        emotion = self.emotions[emotion_idx]
        confidence = probs[emotion_idx]
        # Mental health indicators
        indicators = self._interpret_mental_health(monotone, affect, vocal_energy)
        return {
            'emotion': emotion,
            'confidence': confidence,
            'emotion_probabilities': {
                self.emotions[i]: float(p) for i, p in enumerate(probs)
            },
            'vocal_affect_score': affect,
            'monotone_speech_score': monotone,
            'vocal_energy_score': vocal_energy,
            'pitch_variability': pitch_var,
            'energy_level': energy,
            'mental_health_indicators': indicators
        }

    def _interpret_mental_health(self, monotone, affect, energy):
        """Interpret mental health indicators"""
        indicators = []
        if monotone > 0.75:
            indicators.append("⚠️ Very flat speech pattern - may indicate depression")
        elif monotone > 0.6:
            indicators.append("⚠️ Somewhat flat speech - monitor for low mood")
        if affect > 0.75 and energy > 0.7:
            indicators.append("⚠️ High emotional arousal - possible anxiety or stress")
        elif affect > 0.65:
            indicators.append("ℹ️ Elevated emotional expression")
        if energy < 0.25:
            indicators.append("⚠️ Very low vocal energy - possible fatigue or depression")
        elif energy < 0.35:
            indicators.append("ℹ️ Lower vocal energy - may indicate low motivation")
        if affect > 0.6 and monotone < 0.3:
            indicators.append("ℹ️ Emotional but varied speech - normal range")
        if 0.35 <= monotone <= 0.65 and 0.3 <= affect <= 0.7 and 0.3 <= energy <= 0.7:
            indicators.append("✅ All indicators within healthy range")
        if not indicators:
            indicators.append("ℹ️ Vocal patterns appear normal")
        return indicators
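
# A minimal usage sketch of the predictor (illustrative only, not executed;
# "speech.wav" is a placeholder path):
#
#   predictor = SimpleEmotionPredictor()
#   results = predictor.predict("speech.wav")
#   print(results['emotion'], f"{results['confidence']:.2f}")
#   for note in results['mental_health_indicators']:
#       print(note)
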
# ============================================
# GRADIO INTERFACE
# ============================================
def create_interface():
    """Create Gradio interface"""
    print("Initializing predictor...")
    predictor = SimpleEmotionPredictor()
    print("✅ Ready!")

    def analyze(audio_file):
        """Analyze audio file"""
        if audio_file is None:
            return (
                "❌ Please upload an audio file",
                "", "", "", "", ""
            )
        try:
            # Run prediction
            results = predictor.predict(audio_file)
            # Format outputs
            emotion_text = f"## 🎭 Detected Emotion: **{results['emotion'].upper()}**\n\n"
            emotion_text += f"**Confidence:** {results['confidence']*100:.1f}%\n\n"
            emotion_text += "### Emotion Probabilities:\n\n"
            for emotion, prob in sorted(results['emotion_probabilities'].items(),
                                        key=lambda x: x[1], reverse=True):
                bar_length = int(prob * 20)
                bar = "█" * bar_length + "░" * (20 - bar_length)
                emotion_text += f"**{emotion.title()}:** `{bar}` {prob*100:.1f}%\n"
            # Affect score
            affect_score = results['vocal_affect_score']
            affect_text = f"### Score: **{affect_score:.3f}**\n\n"
            if affect_score > 0.7:
                affect_text += "🔴 **High emotional intensity**\n"
                affect_text += "Indicates stress, anxiety, or strong emotions"
            elif affect_score < 0.3:
                affect_text += "🟢 **Low emotional intensity**\n"
                affect_text += "Indicates calm or neutral state"
            else:
                affect_text += "🟡 **Moderate emotional intensity**\n"
                affect_text += "Normal emotional expression"
            # Monotone score
            monotone_score = results['monotone_speech_score']
            monotone_text = f"### Score: **{monotone_score:.3f}**\n\n"
            if monotone_score > 0.7:
                monotone_text += "🔴 **Very flat speech**\n"
                monotone_text += "May indicate depression or low mood"
            elif monotone_score < 0.3:
                monotone_text += "🟢 **Varied pitch**\n"
                monotone_text += "Good vocal variation"
            else:
                monotone_text += "🟡 **Moderate variation**\n"
                monotone_text += "Normal range"
            # Energy score
            energy_score = results['vocal_energy_score']
            energy_text = f"### Score: **{energy_score:.3f}**\n\n"
            if energy_score > 0.7:
                energy_text += "⚡ **High vocal energy**\n"
                energy_text += "Active, energetic speech"
            elif energy_score < 0.3:
                energy_text += "🔴 **Low vocal energy**\n"
                energy_text += "May indicate fatigue or depression"
            else:
                energy_text += "🟢 **Normal vocal energy**\n"
                energy_text += "Healthy energy level"
            # Technical details
            details_text = f"**Pitch Variability:** {results['pitch_variability']:.2f} Hz\n\n"
            details_text += f"**Energy Level:** {results['energy_level']:.3f}\n\n"
            details_text += "Higher pitch variability indicates more emotional expression."
            # Mental health indicators
            mental_text = "### Assessment:\n\n"
            mental_text += "\n\n".join(results['mental_health_indicators'])
            return (
                emotion_text,
                affect_text,
                monotone_text,
                energy_text,
                details_text,
                mental_text
            )
        except Exception as e:
            error_msg = f"❌ **Error:** {str(e)}\n\nPlease try a different audio file."
            return error_msg, "", "", "", "", ""
    # Create Gradio interface
    with gr.Blocks(theme=gr.themes.Soft(), title="Audio Emotion Detection") as app:
        gr.Markdown("""
        # 🎙️ Audio Emotion & Mental Health Detection

        Upload a speech audio file to analyze emotional state and mental health indicators.

        **Supported formats:** WAV, MP3, FLAC, OGG (3-10 seconds recommended)
        """)
        with gr.Row():
            with gr.Column(scale=1):
                audio_input = gr.Audio(
                    sources=["upload", "microphone"],
                    type="filepath",
                    label="🎵 Upload or Record Audio"
                )
                analyze_btn = gr.Button(
                    "🔍 Analyze Audio",
                    variant="primary",
                    size="lg"
                )
                gr.Markdown("""
                ### 📝 How to use:
                1. Upload an audio file or record directly
                2. Click "Analyze Audio"
                3. View comprehensive results →

                **Best results:** Clear speech, 3-10 seconds
                """)
            with gr.Column(scale=2):
                emotion_out = gr.Markdown(label="Emotion Detection Results")
                with gr.Row():
                    affect_out = gr.Markdown(label="Vocal Affect")
                    monotone_out = gr.Markdown(label="Monotone Score")
                    energy_out = gr.Markdown(label="Vocal Energy")
                details_out = gr.Markdown(label="Technical Details")
                mental_out = gr.Markdown(label="Mental Health Indicators")
        gr.Markdown("""
        ---
        ## 📊 Understanding the Results

        ### Vocal Affect Score
        - **0.0 - 0.3:** Calm, relaxed speech
        - **0.3 - 0.7:** Normal emotional range
        - **0.7 - 1.0:** High emotional intensity (stress/anxiety)

        ### Monotone Speech Score
        - **0.0 - 0.3:** Good pitch variation (healthy)
        - **0.3 - 0.7:** Moderate variation
        - **0.7 - 1.0:** Very flat speech (depression risk)

        ### Vocal Energy Score
        - **0.0 - 0.3:** Low energy (fatigue/depression)
        - **0.3 - 0.7:** Normal energy
        - **0.7 - 1.0:** High energy (anxiety/excitement)

        ---
        ### ⚠️ Important Disclaimer
        This tool is designed for **research and informational purposes only**. It should NOT be used as:
        - A medical diagnostic tool
        - A replacement for professional mental health assessment
        - The sole basis for any health-related decisions

        If you have concerns about your mental health, please consult with a qualified healthcare professional.

        ---
        **🔬 Technology:** Rule-based emotion detection using audio signal processing

        **📈 Based on:** Prosodic analysis, pitch variation, energy patterns, and speech characteristics
        """)
        # Connect button
        analyze_btn.click(
            fn=analyze,
            inputs=[audio_input],
            outputs=[
                emotion_out,
                affect_out,
                monotone_out,
                energy_out,
                details_out,
                mental_out
            ]
        )
        # Tips at the bottom
        gr.Markdown("""
        ### 💡 Tips for Best Results
        - Use clear, uncompressed audio (WAV preferred)
        - 3-10 seconds of continuous speech
        - Minimize background noise
        - Speak naturally
        """)
    return app
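
# Illustrative alternative launch for quick local testing (not executed here;
# share=True asks Gradio for a temporary public link):
#
#   demo = create_interface()
#   demo.launch(share=True)
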
# ============================================
# MAIN
# ============================================
if __name__ == "__main__":
    print("=" * 60)
    print("🎙️ Audio Emotion & Mental Health Detection")
    print("=" * 60)
    print("\nStarting application...")
    try:
        app = create_interface()
        app.launch(
            server_name="0.0.0.0",
            server_port=7860,
            show_error=True
        )
    except Exception as e:
        print(f"❌ Error launching app: {e}")
        import traceback
        traceback.print_exc()