akku09090 committed
Commit b3e17d3 · verified · 1 Parent(s): 185ab9d

Update app.py

Files changed (1):
  1. app.py +303 -481

app.py CHANGED
@@ -1,39 +1,43 @@
 #!/usr/bin/env python3
 """
 Audio Emotion & Mental Health Detection Model
-Optimized for Hugging Face Spaces Deployment
 """

 import os
 import numpy as np
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
 import gradio as gr
-from typing import Dict, Tuple
 import warnings
 warnings.filterwarnings('ignore')

-# Lightweight audio processing (no librosa dependency)
 try:
     import librosa
     LIBROSA_AVAILABLE = True
 except ImportError:
     LIBROSA_AVAILABLE = False
-    print("⚠️ Librosa not available, using lightweight processing")

-import scipy.signal as signal
 from scipy.io import wavfile
-import scipy.fftpack as fft

 # ============================================
-# LIGHTWEIGHT AUDIO FEATURE EXTRACTOR
 # ============================================

-class LightweightAudioProcessor:
-    """Audio processing without heavy librosa dependency"""

-    def __init__(self, sr=16000, n_mfcc=40):
         self.sr = sr
         self.n_mfcc = n_mfcc
@@ -42,238 +46,200 @@ class LightweightAudioProcessor:
42
  try:
43
  if LIBROSA_AVAILABLE:
44
  y, sr = librosa.load(audio_path, sr=self.sr, duration=3)
 
45
  else:
46
- # Fallback: use scipy
47
  sr, y = wavfile.read(audio_path)
48
  if len(y.shape) > 1:
49
- y = y.mean(axis=1) # Convert to mono
50
- y = y.astype(np.float32) / np.max(np.abs(y)) # Normalize
51
 
52
  # Resample if needed
53
  if sr != self.sr:
54
  num_samples = int(len(y) * self.sr / sr)
55
  y = signal.resample(y, num_samples)
56
 
57
- # Limit duration to 3 seconds
58
  max_len = 3 * self.sr
59
  if len(y) > max_len:
60
  y = y[:max_len]
61
-
62
- return y, self.sr
63
  except Exception as e:
64
  print(f"Error loading audio: {e}")
65
- return np.random.randn(self.sr * 3), self.sr
66
 
67
- def extract_mfcc_features(self, y):
68
- """Extract MFCC features using lightweight method"""
69
- if LIBROSA_AVAILABLE:
70
- mfccs = librosa.feature.mfcc(y=y, sr=self.sr, n_mfcc=self.n_mfcc)
71
- else:
72
- # Simplified MFCC calculation
73
- # Apply pre-emphasis
74
- emphasized = np.append(y[0], y[1:] - 0.97 * y[:-1])
75
-
76
- # Frame the signal
77
- frame_size = int(0.025 * self.sr)
78
- frame_stride = int(0.01 * self.sr)
79
- frames = self._frame_signal(emphasized, frame_size, frame_stride)
80
-
81
- # Apply FFT
82
- mag_frames = np.absolute(np.fft.rfft(frames, frame_size))
83
- pow_frames = ((1.0 / frame_size) * (mag_frames ** 2))
84
-
85
- # Mel filter banks (simplified)
86
- mel_filters = self._create_mel_filters(26, frame_size, self.sr)
87
- filter_banks = np.dot(pow_frames, mel_filters.T)
88
- filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks)
89
- filter_banks = 20 * np.log10(filter_banks)
90
-
91
- # DCT to get MFCCs
92
- mfccs = fft.dct(filter_banks, type=2, axis=1, norm='ortho')[:, :self.n_mfcc].T
93
 
94
- return mfccs
95
-
96
- def _frame_signal(self, signal, frame_size, frame_stride):
97
- """Frame a signal into overlapping frames"""
98
- signal_length = len(signal)
99
- num_frames = int(np.ceil(float(np.abs(signal_length - frame_size)) / frame_stride))
100
-
101
- pad_signal_length = num_frames * frame_stride + frame_size
102
- z = np.zeros((pad_signal_length - signal_length))
103
- padded = np.append(signal, z)
104
-
105
- indices = np.tile(np.arange(0, frame_size), (num_frames, 1)) + \
106
- np.tile(np.arange(0, num_frames * frame_stride, frame_stride), (frame_size, 1)).T
107
- frames = padded[indices.astype(np.int32, copy=False)]
108
-
109
- # Apply Hamming window
110
- frames *= np.hamming(frame_size)
111
- return frames
112
-
113
- def _create_mel_filters(self, num_filters, fft_size, sample_rate):
114
- """Create Mel filter banks"""
115
  low_freq_mel = 0
116
- high_freq_mel = 2595 * np.log10(1 + (sample_rate / 2) / 700)
117
- mel_points = np.linspace(low_freq_mel, high_freq_mel, num_filters + 2)
118
  hz_points = 700 * (10**(mel_points / 2595) - 1)
119
- bin_points = np.floor((fft_size + 1) * hz_points / sample_rate)
120
 
121
- fbank = np.zeros((num_filters, int(np.floor(fft_size / 2 + 1))))
122
- for m in range(1, num_filters + 1):
123
- f_m_minus = int(bin_points[m - 1])
124
- f_m = int(bin_points[m])
125
- f_m_plus = int(bin_points[m + 1])
126
 
127
  for k in range(f_m_minus, f_m):
128
  fbank[m - 1, k] = (k - bin_points[m - 1]) / (bin_points[m] - bin_points[m - 1])
129
  for k in range(f_m, f_m_plus):
130
  fbank[m - 1, k] = (bin_points[m + 1] - k) / (bin_points[m + 1] - bin_points[m])
131
 
132
- return fbank
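For reference, a standalone sketch of the triangular mel filterbank these formulas construct, using the same defaults as the surrounding code (sr=16000, a 25 ms frame used as the FFT size, 26 filters); each row rises linearly to 1 at its centre bin and falls back to 0:

```python
import numpy as np

sr = 16000
fft_size = int(0.025 * sr)   # 400, as in _create_mel_filters(26, frame_size, sr)
nfilt = 26

high_mel = 2595 * np.log10(1 + (sr / 2) / 700)
mel_points = np.linspace(0, high_mel, nfilt + 2)
hz_points = 700 * (10 ** (mel_points / 2595) - 1)
bins = np.floor((fft_size + 1) * hz_points / sr).astype(int)

fbank = np.zeros((nfilt, fft_size // 2 + 1))
for m in range(1, nfilt + 1):
    for k in range(bins[m - 1], bins[m]):
        fbank[m - 1, k] = (k - bins[m - 1]) / (bins[m] - bins[m - 1])
    for k in range(bins[m], bins[m + 1]):
        fbank[m - 1, k] = (bins[m + 1] - k) / (bins[m + 1] - bins[m])

print(fbank.shape)            # (26, 201)
print(fbank.max(axis=1)[:5])  # each filter peaks at 1.0
```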
133
 
134
  def extract_pitch(self, y):
135
- """Extract pitch features"""
136
- if LIBROSA_AVAILABLE:
137
- pitches, magnitudes = librosa.piptrack(y=y, sr=self.sr)
138
- pitch_values = []
139
- for t in range(pitches.shape[1]):
140
- index = magnitudes[:, t].argmax()
141
- pitch = pitches[index, t]
142
- if pitch > 0:
143
- pitch_values.append(pitch)
144
- else:
145
- # Simple autocorrelation-based pitch detection
146
- pitch_values = []
147
- frame_length = int(0.025 * self.sr)
148
- hop_length = int(0.01 * self.sr)
149
-
150
- for i in range(0, len(y) - frame_length, hop_length):
151
- frame = y[i:i+frame_length]
152
- autocorr = np.correlate(frame, frame, mode='full')
153
- autocorr = autocorr[len(autocorr)//2:]
154
-
155
- # Find peaks
156
- peaks = signal.find_peaks(autocorr)[0]
157
- if len(peaks) > 0:
158
- pitch = self.sr / peaks[0] if peaks[0] > 0 else 0
159
- if 50 < pitch < 400: # Valid pitch range
160
- pitch_values.append(pitch)
161
-
162
- return pitch_values if pitch_values else [0]
163
 
164
  def extract_energy(self, y):
165
- """Extract energy features"""
166
- if LIBROSA_AVAILABLE:
167
- rms = librosa.feature.rms(y=y)[0]
168
- else:
169
- frame_length = int(0.025 * self.sr)
170
- hop_length = int(0.01 * self.sr)
171
- rms = []
172
-
173
- for i in range(0, len(y) - frame_length, hop_length):
174
- frame = y[i:i+frame_length]
175
- rms.append(np.sqrt(np.mean(frame**2)))
176
-
177
- rms = np.array(rms)
178
 
179
- return rms
180
 
181
  def extract_zcr(self, y):
182
- """Extract zero crossing rate"""
183
- if LIBROSA_AVAILABLE:
184
- zcr = librosa.feature.zero_crossing_rate(y)[0]
185
- else:
186
- zcr = []
187
- frame_length = int(0.025 * self.sr)
188
- hop_length = int(0.01 * self.sr)
189
-
190
- for i in range(0, len(y) - frame_length, hop_length):
191
- frame = y[i:i+frame_length]
192
- zero_crossings = np.sum(np.abs(np.diff(np.sign(frame)))) / 2
193
- zcr.append(zero_crossings / frame_length)
194
-
195
- zcr = np.array(zcr)
196
 
197
- return zcr
198
 
199
  def extract_spectral_features(self, y):
200
- """Extract spectral features"""
201
- # Compute FFT
202
- fft_spectrum = np.fft.rfft(y)
203
- magnitude = np.abs(fft_spectrum)
204
  freq = np.fft.rfftfreq(len(y), 1.0/self.sr)
205
 
206
  # Spectral centroid
207
- spectral_centroid = np.sum(freq * magnitude) / np.sum(magnitude)
208
 
209
- # Spectral rolloff (85% of energy)
210
  cumsum = np.cumsum(magnitude)
211
  rolloff_idx = np.where(cumsum >= 0.85 * cumsum[-1])[0]
212
- spectral_rolloff = freq[rolloff_idx[0]] if len(rolloff_idx) > 0 else 0
213
 
214
  # Spectral bandwidth
215
- deviation = freq - spectral_centroid
216
- spectral_bandwidth = np.sqrt(np.sum((deviation**2) * magnitude) / np.sum(magnitude))
217
 
218
- return spectral_centroid, spectral_rolloff, spectral_bandwidth
219
-
220
- def estimate_tempo(self, y):
221
- """Estimate tempo"""
222
- if LIBROSA_AVAILABLE:
223
- tempo, _ = librosa.beat.beat_track(y=y, sr=self.sr)
224
- return tempo
225
- else:
226
- # Simplified tempo estimation
227
- onset_env = self.extract_energy(y)
228
- autocorr = np.correlate(onset_env, onset_env, mode='full')
229
- autocorr = autocorr[len(autocorr)//2:]
230
-
231
- # Find tempo peaks
232
- peaks = signal.find_peaks(autocorr)[0]
233
- if len(peaks) > 0:
234
- tempo = 60.0 / (peaks[0] * 0.01) if peaks[0] > 0 else 120
235
- return np.clip(tempo, 60, 180)
236
- return 120
237
 
238
  def extract_all_features(self, audio_path):
239
- """Extract comprehensive features from audio"""
240
  try:
241
- # Load audio
242
  y, sr = self.load_audio(audio_path)
243
 
244
- # 1. MFCCs
245
- mfccs = self.extract_mfcc_features(y)
246
- mfcc_mean = np.mean(mfccs, axis=1)
247
- mfcc_std = np.std(mfccs, axis=1)
248
 
249
- # 2. Pitch features
250
  pitch_values = self.extract_pitch(y)
251
  pitch_mean = np.mean(pitch_values)
252
  pitch_std = np.std(pitch_values)
253
  pitch_min = np.min(pitch_values)
254
  pitch_max = np.max(pitch_values)
255
- monotone_score = 1 / (1 + pitch_std) if pitch_std > 0 else 1.0
256
 
257
- # 3. Energy features
258
  rms = self.extract_energy(y)
259
  energy_mean = np.mean(rms)
260
  energy_std = np.std(rms)
261
  energy_max = np.max(rms)
262
 
263
- # 4. Zero Crossing Rate
264
  zcr = self.extract_zcr(y)
265
  zcr_mean = np.mean(zcr)
266
  zcr_std = np.std(zcr)
267
 
268
- # 5. Spectral features
269
- spectral_centroid, spectral_rolloff, spectral_bandwidth = \
270
- self.extract_spectral_features(y)
271
 
272
- # 6. Chroma (simplified)
273
- chroma_mean = 0.5 # Placeholder
274
-
275
- # 7. Tempo
276
- tempo = self.estimate_tempo(y)
277
 
278
  # Combine features
279
  features = np.concatenate([
@@ -282,231 +248,164 @@ class LightweightAudioProcessor:
282
  [pitch_mean, pitch_std, pitch_min, pitch_max, monotone_score],
283
  [energy_mean, energy_std, energy_max],
284
  [zcr_mean, zcr_std],
285
- [spectral_centroid, spectral_rolloff, spectral_bandwidth],
286
- [chroma_mean],
287
  [tempo]
288
  ])
289
 
290
- # Calculate derived scores
291
- vocal_affect_score = self._calculate_vocal_affect(
292
- pitch_std, energy_std, spectral_centroid
293
- )
294
- vocal_energy_score = self._calculate_vocal_energy(
295
- energy_mean, tempo, zcr_mean
296
- )
297
 
298
  return {
299
  'features': features.astype(np.float32),
300
- 'vocal_affect_score': float(vocal_affect_score),
301
  'monotone_score': float(monotone_score),
302
- 'vocal_energy_score': float(vocal_energy_score),
303
  'pitch_variability': float(pitch_std),
304
  'energy_level': float(energy_mean)
305
  }
306
 
307
  except Exception as e:
308
- print(f"Error extracting features: {e}")
309
- # Return default features
310
- return self._get_default_features()
311
 
312
- def _calculate_vocal_affect(self, pitch_std, energy_std, spectral_centroid):
313
- """Calculate emotional intensity"""
314
- pitch_component = min(pitch_std / 100, 1.0)
315
- energy_component = min(energy_std / 0.5, 1.0)
316
- spectral_component = min(spectral_centroid / 3000, 1.0)
317
-
318
- affect_score = (pitch_component * 0.4 +
319
- energy_component * 0.4 +
320
- spectral_component * 0.2)
321
-
322
- return np.clip(affect_score, 0, 1)
323
 
324
- def _calculate_vocal_energy(self, energy_mean, tempo, zcr_mean):
325
- """Calculate vocal energy/activation"""
326
- energy_component = min(energy_mean / 1.0, 1.0)
327
- tempo_component = min(tempo / 180, 1.0)
328
- zcr_component = min(zcr_mean / 0.3, 1.0)
329
-
330
- energy_score = (energy_component * 0.5 +
331
- tempo_component * 0.3 +
332
- zcr_component * 0.2)
333
-
334
- return np.clip(energy_score, 0, 1)
335
 
336
- def _get_default_features(self):
337
- """Return default features for error cases"""
338
- n_features = self.n_mfcc * 2 + 18
339
  return {
340
- 'features': np.random.randn(n_features).astype(np.float32),
341
  'vocal_affect_score': 0.5,
342
  'monotone_score': 0.5,
343
  'vocal_energy_score': 0.5,
344
- 'pitch_variability': 50.0,
345
- 'energy_level': 0.5
346
  }
347
 
348
 
349
  # ============================================
350
- # NEURAL NETWORK MODEL
351
- # ============================================
352
-
353
- class MultiTaskEmotionModel(nn.Module):
354
- """Multi-task emotion and mental health detection model"""
355
-
356
- def __init__(self, input_dim, num_emotions=8, dropout=0.5):
357
- super(MultiTaskEmotionModel, self).__init__()
358
-
359
- # Shared layers
360
- self.shared_layers = nn.Sequential(
361
- nn.Linear(input_dim, 512),
362
- nn.BatchNorm1d(512),
363
- nn.ReLU(),
364
- nn.Dropout(dropout),
365
-
366
- nn.Linear(512, 256),
367
- nn.BatchNorm1d(256),
368
- nn.ReLU(),
369
- nn.Dropout(dropout),
370
-
371
- nn.Linear(256, 128),
372
- nn.BatchNorm1d(128),
373
- nn.ReLU(),
374
- nn.Dropout(dropout/2)
375
- )
376
-
377
- # Emotion classification head
378
- self.emotion_head = nn.Sequential(
379
- nn.Linear(128, 64),
380
- nn.ReLU(),
381
- nn.Dropout(dropout/2),
382
- nn.Linear(64, num_emotions)
383
- )
384
-
385
- # Regression heads
386
- self.affect_head = nn.Sequential(
387
- nn.Linear(128, 32),
388
- nn.ReLU(),
389
- nn.Linear(32, 1),
390
- nn.Sigmoid()
391
- )
392
-
393
- self.monotone_head = nn.Sequential(
394
- nn.Linear(128, 32),
395
- nn.ReLU(),
396
- nn.Linear(32, 1),
397
- nn.Sigmoid()
398
- )
399
-
400
- self.energy_head = nn.Sequential(
401
- nn.Linear(128, 32),
402
- nn.ReLU(),
403
- nn.Linear(32, 1),
404
- nn.Sigmoid()
405
- )
406
-
407
- def forward(self, x):
408
- shared = self.shared_layers(x)
409
-
410
- return {
411
- 'emotion_logits': self.emotion_head(shared),
412
- 'vocal_affect': self.affect_head(shared),
413
- 'monotone_score': self.monotone_head(shared),
414
- 'vocal_energy': self.energy_head(shared)
415
- }
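A quick shape check for the multi-task network being removed here, assuming PyTorch is installed and the class is still in scope; it feeds two random feature vectors of the expected width 98:

```python
import torch

model = MultiTaskEmotionModel(input_dim=98, num_emotions=8, dropout=0.3).eval()
x = torch.randn(2, 98)                 # 2 feature vectors (40*2 MFCC stats + 18 extras)
with torch.no_grad():
    out = model(x)
print(out['emotion_logits'].shape)     # torch.Size([2, 8])
print(out['vocal_affect'].shape)       # torch.Size([2, 1]), sigmoid-bounded
```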
416
-
417
-
418
- # ============================================
419
- # PREDICTOR CLASS
420
  # ============================================
421
 
422
  class EmotionPredictor:
423
- """Production inference class"""
424
 
425
  def __init__(self):
426
- self.processor = LightweightAudioProcessor(sr=16000, n_mfcc=40)
427
- self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
428
 
429
  # Emotion mapping
430
- self.emotion_map = {
431
- 'neutral': 0, 'calm': 1, 'happy': 2, 'sad': 3,
432
- 'angry': 4, 'fearful': 5, 'disgust': 6, 'surprised': 7
433
- }
434
- self.reverse_emotion_map = {v: k for k, v in self.emotion_map.items()}
435
-
436
- # Initialize model with pre-trained weights
437
- input_dim = 98 # 40*2 (MFCC mean+std) + 18 other features
438
- self.model = MultiTaskEmotionModel(
439
- input_dim=input_dim,
440
- num_emotions=len(self.emotion_map),
441
- dropout=0.3
442
- )
443
-
444
- # Load pre-trained weights if available, otherwise use initialized weights
445
- self._load_or_initialize_model()
446
 
447
- self.model.to(self.device)
448
- self.model.eval()
449
 
450
- def _load_or_initialize_model(self):
451
- """Load pre-trained model or use initialized weights"""
452
- model_path = 'emotion_model.pth'
453
 
454
- if os.path.exists(model_path):
 
455
  try:
456
- checkpoint = torch.load(model_path, map_location='cpu')
457
- self.model.load_state_dict(checkpoint)
458
- print("βœ… Loaded pre-trained model")
459
- except Exception as e:
460
- print(f"⚠️ Could not load model: {e}")
461
- print("Using initialized weights (demo mode)")
462
- else:
463
- print("ℹ️ No pre-trained model found. Using initialized weights (demo mode)")
464
- # In demo mode, the model will still work but predictions will be less accurate
465
 
466
- def predict(self, audio_path: str) -> Dict:
467
  """Predict emotion and mental health indicators"""
468
 
469
  # Extract features
470
- feature_dict = self.processor.extract_all_features(audio_path)
471
- features = torch.FloatTensor(feature_dict['features']).unsqueeze(0)
472
- features = features.to(self.device)
473
 
474
- # Predict
475
- with torch.no_grad():
476
- outputs = self.model(features)
477
 
478
- # Process outputs
479
- emotion_probs = F.softmax(outputs['emotion_logits'], dim=1)[0]
480
- emotion_idx = emotion_probs.argmax().item()
481
- emotion = self.reverse_emotion_map[emotion_idx]
482
- confidence = emotion_probs[emotion_idx].item()
483
 
484
- # Get all scores
485
- vocal_affect = outputs['vocal_affect'][0].item()
486
- monotone_score = outputs['monotone_score'][0].item()
487
- vocal_energy = outputs['vocal_energy'][0].item()
488
 
489
  # Mental health interpretation
490
- mental_health_indicators = self._interpret_mental_health(
491
- monotone_score, vocal_affect, vocal_energy
492
- )
493
 
494
- results = {
495
  'emotion': emotion,
496
  'confidence': confidence,
497
  'emotion_probabilities': {
498
- self.reverse_emotion_map[i]: prob.item()
499
- for i, prob in enumerate(emotion_probs)
500
  },
501
  'vocal_affect_score': vocal_affect,
502
  'monotone_speech_score': monotone_score,
503
  'vocal_energy_score': vocal_energy,
504
  'pitch_variability': feature_dict['pitch_variability'],
505
  'energy_level': feature_dict['energy_level'],
506
- 'mental_health_indicators': mental_health_indicators
507
  }
508
-
509
- return results
510
 
511
  def _interpret_mental_health(self, monotone, affect, energy):
512
  """Interpret mental health indicators"""
@@ -524,8 +423,8 @@ class EmotionPredictor:
         if affect > 0.6 and monotone < 0.4:
             indicators.append("⚠️ High vocal affect - possible emotional stress")

-        if 0.4 <= monotone <= 0.6 and 0.4 <= affect <= 0.6 and 0.4 <= energy <= 0.6:
-            indicators.append("✅ Balanced vocal characteristics - no significant concerns")

         if not indicators:
             indicators.append("ℹ️ Vocal patterns within normal range")
@@ -537,188 +436,111 @@ class EmotionPredictor:
537
  # GRADIO INTERFACE
538
  # ============================================
539
 
540
- def create_gradio_app():
541
- """Create Gradio interface"""
542
 
543
- # Initialize predictor
544
- print("Initializing emotion predictor...")
545
  predictor = EmotionPredictor()
546
- print("βœ… Predictor ready!")
547
 
548
- def predict_emotion(audio):
549
- """Gradio prediction function"""
550
  if audio is None:
551
- return {
552
- emotion_output: "❌ Please upload an audio file",
553
- affect_output: "",
554
- monotone_output: "",
555
- energy_output: "",
556
- pitch_output: "",
557
- mental_health_output: ""
558
- }
559
 
560
  try:
561
- # Run prediction
562
  results = predictor.predict(audio)
563
 
564
  # Format emotion output
565
- emotion_text = f"## 🎭 Detected Emotion: **{results['emotion'].upper()}**\n\n"
566
  emotion_text += f"**Confidence:** {results['confidence']*100:.1f}%\n\n"
567
- emotion_text += "### All Emotion Probabilities:\n"
568
 
569
  for emotion, prob in sorted(results['emotion_probabilities'].items(),
570
  key=lambda x: x[1], reverse=True):
571
- bar_length = int(prob * 20)
572
- bar = "β–ˆ" * bar_length + "β–‘" * (20 - bar_length)
573
- emotion_text += f"**{emotion.capitalize()}:** {bar} {prob*100:.1f}%\n"
574
 
575
  # Format scores
576
- affect_text = f"**{results['vocal_affect_score']:.3f}**\n\n"
577
  if results['vocal_affect_score'] > 0.7:
578
- affect_text += "πŸ”΄ High emotional intensity detected"
579
  elif results['vocal_affect_score'] < 0.3:
580
- affect_text += "🟒 Low emotional intensity"
581
  else:
582
- affect_text += "🟑 Moderate emotional intensity"
583
 
584
- monotone_text = f"**{results['monotone_speech_score']:.3f}**\n\n"
585
  if results['monotone_speech_score'] > 0.7:
586
- monotone_text += "πŸ”΄ Very flat speech pattern"
587
  elif results['monotone_speech_score'] < 0.3:
588
- monotone_text += "🟒 Varied pitch pattern"
589
  else:
590
- monotone_text += "🟑 Moderate pitch variation"
591
 
592
- energy_text = f"**{results['vocal_energy_score']:.3f}**\n\n"
593
  if results['vocal_energy_score'] > 0.7:
594
- energy_text += "πŸ”΄ High vocal energy"
595
  elif results['vocal_energy_score'] < 0.3:
596
- energy_text += "πŸ”΄ Low vocal energy"
597
  else:
598
- energy_text += "🟒 Normal vocal energy"
599
 
600
- pitch_text = f"**Variability:** {results['pitch_variability']:.2f} Hz\n"
601
- pitch_text += f"**Energy Level:** {results['energy_level']:.3f}"
602
 
603
- mental_health_text = "\n".join(results['mental_health_indicators'])
604
 
605
- return {
606
- emotion_output: emotion_text,
607
- affect_output: affect_text,
608
- monotone_output: monotone_text,
609
- energy_output: energy_text,
610
- pitch_output: pitch_text,
611
- mental_health_output: mental_health_text
612
- }
613
 
614
  except Exception as e:
615
- error_msg = f"❌ Error processing audio: {str(e)}"
616
- return {
617
- emotion_output: error_msg,
618
- affect_output: "",
619
- monotone_output: "",
620
- energy_output: "",
621
- pitch_output: "",
622
- mental_health_output: ""
623
- }
624
 
625
  # Create interface
626
- with gr.Blocks(theme=gr.themes.Soft(), title="Audio Emotion Detection") as demo:
627
-
628
  gr.Markdown("""
629
  # πŸŽ™οΈ Audio Emotion & Mental Health Detection
630
 
631
- Upload an audio file to analyze emotional state and mental health indicators.
632
-
633
- **Features:**
634
- - 🎭 Emotion Recognition (8 emotions)
635
- - πŸ“Š Vocal Affect Score (emotional intensity)
636
- - πŸ“‰ Monotone Speech Detection (depression indicator)
637
- - ⚑ Vocal Energy Analysis (mood disorder indicator)
638
  """)
639
 
640
  with gr.Row():
641
- with gr.Column(scale=1):
642
- audio_input = gr.Audio(
643
- type="filepath",
644
- label="Upload Audio File (WAV, MP3, etc.)"
645
- )
646
-
647
- analyze_btn = gr.Button("πŸ” Analyze Audio", variant="primary", size="lg")
648
-
649
- gr.Markdown("""
650
- ### πŸ“ Instructions:
651
- 1. Upload an audio file (WAV, MP3, etc.)
652
- 2. Click "Analyze Audio"
653
- 3. View results on the right
654
-
655
- **Note:** Works best with clear speech recordings (3-10 seconds)
656
- """)
657
 
658
- with gr.Column(scale=2):
659
- emotion_output = gr.Markdown(label="Emotion Detection")
660
 
661
  with gr.Row():
662
- with gr.Column():
663
- affect_output = gr.Markdown(label="Vocal Affect Score")
664
- with gr.Column():
665
- monotone_output = gr.Markdown(label="Monotone Score")
666
- with gr.Column():
667
- energy_output = gr.Markdown(label="Vocal Energy")
668
 
669
- pitch_output = gr.Markdown(label="Technical Details")
670
- mental_health_output = gr.Markdown(label="Mental Health Indicators")
671
 
672
  gr.Markdown("""
673
- ---
674
- ### πŸ“Š Interpretation Guide
675
-
676
- | Metric | Range | Interpretation |
677
- |--------|-------|----------------|
678
- | **Vocal Affect** | 0.0-0.3 | Low emotional intensity (calm/neutral) |
679
- | | 0.3-0.7 | Moderate emotional intensity |
680
- | | 0.7-1.0 | High emotional intensity (stress/anxiety) |
681
- | **Monotone Score** | 0.0-0.3 | High pitch variation (normal) |
682
- | | 0.3-0.7 | Moderate pitch variation |
683
- | | 0.7-1.0 | Very flat speech (possible depression) |
684
- | **Vocal Energy** | 0.0-0.3 | Low energy (possible low motivation) |
685
- | | 0.3-0.7 | Normal energy level |
686
- | | 0.7-1.0 | High energy (possible anxiety/mania) |
687
-
688
- ---
689
-
690
- **⚠️ Disclaimer:** This tool is for research and informational purposes only.
691
- It should not be used as a substitute for professional medical or psychological diagnosis.
692
- Always consult qualified healthcare professionals for mental health concerns.
693
-
694
- **πŸ”¬ Model Info:** Multi-task Deep Neural Network trained on emotional speech datasets (RAVDESS, TESS, CREMA-D)
695
  """)
696
 
697
- # Connect button to function
698
- analyze_btn.click(
699
- fn=predict_emotion,
700
- inputs=audio_input,
701
- outputs=[emotion_output, affect_output, monotone_output,
702
- energy_output, pitch_output, mental_health_output]
703
  )
704
 
705
  return demo
706
 
707
 
708
  # ============================================
709
- # MAIN EXECUTION
710
  # ============================================
711
 
712
  if __name__ == "__main__":
713
- print("="*60)
714
- print("πŸŽ™οΈ Audio Emotion & Mental Health Detection")
715
- print("="*60)
716
- print("\nStarting Gradio interface...")
717
-
718
- # Create and launch app
719
- app = create_gradio_app()
720
- app.launch(
721
- server_name="0.0.0.0",
722
- server_port=7860,
723
- share=False
724
- )
 
 #!/usr/bin/env python3
 """
 Audio Emotion & Mental Health Detection Model
+Lightweight version for Hugging Face Spaces
+Using scikit-learn instead of PyTorch
 """

 import os
 import numpy as np
 import gradio as gr
+from typing import Dict
 import warnings
+import pickle
 warnings.filterwarnings('ignore')

+# Audio processing
 try:
     import librosa
     LIBROSA_AVAILABLE = True
 except ImportError:
     LIBROSA_AVAILABLE = False
+    print("⚠️ Librosa not available, using scipy")

 from scipy.io import wavfile
+import scipy.signal as signal
+from scipy import fft
+
+# Machine Learning
+from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
+from sklearn.preprocessing import StandardScaler
+from sklearn.neural_network import MLPClassifier, MLPRegressor
32
 
33
  # ============================================
34
+ # AUDIO PROCESSING
35
  # ============================================
36
 
37
+ class AudioFeatureExtractor:
38
+ """Extract audio features without heavy dependencies"""
39
 
40
+ def __init__(self, sr=16000, n_mfcc=20):
41
  self.sr = sr
42
  self.n_mfcc = n_mfcc
43
 
 
46
  try:
47
  if LIBROSA_AVAILABLE:
48
  y, sr = librosa.load(audio_path, sr=self.sr, duration=3)
49
+ return y, sr
50
  else:
51
+ # Use scipy
52
  sr, y = wavfile.read(audio_path)
53
+
54
+ # Convert to mono
55
  if len(y.shape) > 1:
56
+ y = y.mean(axis=1)
57
+
58
+ # Normalize
59
+ y = y.astype(np.float32)
60
+ if np.max(np.abs(y)) > 0:
61
+ y = y / np.max(np.abs(y))
62
 
63
  # Resample if needed
64
  if sr != self.sr:
65
  num_samples = int(len(y) * self.sr / sr)
66
  y = signal.resample(y, num_samples)
67
 
68
+ # Limit to 3 seconds
69
  max_len = 3 * self.sr
70
  if len(y) > max_len:
71
  y = y[:max_len]
72
+
73
+ return y, self.sr
74
  except Exception as e:
75
  print(f"Error loading audio: {e}")
76
+ return np.random.randn(self.sr * 3) * 0.1, self.sr
77
 
78
+ def get_mfcc_simple(self, y):
79
+ """Simplified MFCC extraction"""
80
+ # Pre-emphasis
81
+ y_emphasized = np.append(y[0], y[1:] - 0.97 * y[:-1])
 
82
 
83
+ # Framing
84
+ frame_length = int(0.025 * self.sr)
85
+ frame_step = int(0.01 * self.sr)
86
+
87
+ num_frames = 1 + int((len(y_emphasized) - frame_length) / frame_step)
88
+ frames = np.zeros((num_frames, frame_length))
89
+
90
+ for i in range(num_frames):
91
+ start = i * frame_step
92
+ frames[i] = y_emphasized[start:start + frame_length]
93
+
94
+ # Apply window
95
+ frames *= np.hamming(frame_length)
96
+
97
+ # FFT
98
+ mag_frames = np.absolute(np.fft.rfft(frames, frame_length))
99
+ pow_frames = ((1.0 / frame_length) * (mag_frames ** 2))
100
+
101
+ # Mel filterbank
102
+ nfft = frame_length
103
+ nfilt = 26
104
  low_freq_mel = 0
105
+ high_freq_mel = 2595 * np.log10(1 + (self.sr / 2) / 700)
106
+ mel_points = np.linspace(low_freq_mel, high_freq_mel, nfilt + 2)
107
  hz_points = 700 * (10**(mel_points / 2595) - 1)
108
+ bin_points = np.floor((nfft + 1) * hz_points / self.sr).astype(int)
109
 
110
+ fbank = np.zeros((nfilt, int(nfft / 2 + 1)))
111
+ for m in range(1, nfilt + 1):
112
+ f_m_minus = bin_points[m - 1]
113
+ f_m = bin_points[m]
114
+ f_m_plus = bin_points[m + 1]
115
 
116
  for k in range(f_m_minus, f_m):
117
  fbank[m - 1, k] = (k - bin_points[m - 1]) / (bin_points[m] - bin_points[m - 1])
118
  for k in range(f_m, f_m_plus):
119
  fbank[m - 1, k] = (bin_points[m + 1] - k) / (bin_points[m + 1] - bin_points[m])
120
 
121
+ filter_banks = np.dot(pow_frames, fbank.T)
122
+ filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks)
123
+ filter_banks = 20 * np.log10(filter_banks)
124
+
125
+ # DCT
126
+ mfcc = fft.dct(filter_banks, type=2, axis=1, norm='ortho')[:, :self.n_mfcc]
127
+
128
+ return mfcc.T
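A quick shape check for the simplified MFCC path, assuming app.py's dependencies are installed and the class above is in scope; 3 s of audio at 16 kHz framed with 25 ms windows and a 10 ms hop gives 298 frames:

```python
import numpy as np

ext = AudioFeatureExtractor(sr=16000, n_mfcc=20)
y = (np.random.randn(3 * 16000) * 0.1).astype(np.float32)   # stand-in for a 3 s clip
mfcc = ext.get_mfcc_simple(y)
print(mfcc.shape)   # (20, 298): n_mfcc coefficients x frames
```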
129
 
130
  def extract_pitch(self, y):
131
+ """Extract pitch using autocorrelation"""
132
+ pitch_values = []
133
+ frame_length = int(0.03 * self.sr)
134
+ hop_length = int(0.01 * self.sr)
135
+
136
+ for i in range(0, len(y) - frame_length, hop_length):
137
+ frame = y[i:i+frame_length]
138
+
139
+ # Autocorrelation
140
+ corr = np.correlate(frame, frame, mode='full')
141
+ corr = corr[len(corr)//2:]
142
+
143
+ # Find first peak after lag 0
144
+ d = np.diff(corr)
145
+ start = int(self.sr / 400) # Min 400 Hz
146
+ peak = np.where(d[start:] < 0)[0]
147
+
148
+ if len(peak) > 0:
149
+ peak_idx = peak[0] + start
150
+ if peak_idx > 0:
151
+ freq = self.sr / peak_idx
152
+ if 50 < freq < 400:
153
+ pitch_values.append(freq)
154
+
155
+ return pitch_values if pitch_values else [150.0]
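For comparison only (not what this commit does): a common alternative is to pick the lag of the autocorrelation maximum inside the 50-400 Hz band rather than the first negative slope. A self-contained sketch on a synthetic 150 Hz tone:

```python
import numpy as np

sr = 16000
t = np.arange(int(0.03 * sr)) / sr
frame = np.sin(2 * np.pi * 150 * t)                 # synthetic 150 Hz frame

corr = np.correlate(frame, frame, mode='full')[len(frame) - 1:]
lo, hi = int(sr / 400), int(sr / 50)                # lags for 400 Hz .. 50 Hz
lag = lo + np.argmax(corr[lo:hi])
print(sr / lag)                                     # ~150 Hz
```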
156
 
157
  def extract_energy(self, y):
158
+ """Extract RMS energy"""
159
+ frame_length = int(0.025 * self.sr)
160
+ hop_length = int(0.01 * self.sr)
161
+
162
+ rms = []
163
+ for i in range(0, len(y) - frame_length, hop_length):
164
+ frame = y[i:i+frame_length]
165
+ rms.append(np.sqrt(np.mean(frame**2)))
166
 
167
+ return np.array(rms)
168
 
169
  def extract_zcr(self, y):
170
+ """Zero crossing rate"""
171
+ frame_length = int(0.025 * self.sr)
172
+ hop_length = int(0.01 * self.sr)
173
+
174
+ zcr = []
175
+ for i in range(0, len(y) - frame_length, hop_length):
176
+ frame = y[i:i+frame_length]
177
+ crossings = np.sum(np.abs(np.diff(np.sign(frame)))) / 2
178
+ zcr.append(crossings / frame_length)
179
 
180
+ return np.array(zcr)
181
 
182
  def extract_spectral_features(self, y):
183
+ """Spectral features"""
184
+ spectrum = np.fft.rfft(y)
185
+ magnitude = np.abs(spectrum)
 
186
  freq = np.fft.rfftfreq(len(y), 1.0/self.sr)
187
 
188
  # Spectral centroid
189
+ centroid = np.sum(freq * magnitude) / (np.sum(magnitude) + 1e-6)
190
 
191
+ # Spectral rolloff
192
  cumsum = np.cumsum(magnitude)
193
  rolloff_idx = np.where(cumsum >= 0.85 * cumsum[-1])[0]
194
+ rolloff = freq[rolloff_idx[0]] if len(rolloff_idx) > 0 else 0
195
 
196
  # Spectral bandwidth
197
+ bandwidth = np.sqrt(np.sum(((freq - centroid)**2) * magnitude) / (np.sum(magnitude) + 1e-6))
 
198
 
199
+ return centroid, rolloff, bandwidth
200
 
201
  def extract_all_features(self, audio_path):
202
+ """Extract all features"""
203
  try:
 
204
  y, sr = self.load_audio(audio_path)
205
 
206
+ # MFCCs
207
+ mfcc = self.get_mfcc_simple(y)
208
+ mfcc_mean = np.mean(mfcc, axis=1)
209
+ mfcc_std = np.std(mfcc, axis=1)
210
 
211
+ # Pitch
212
  pitch_values = self.extract_pitch(y)
213
  pitch_mean = np.mean(pitch_values)
214
  pitch_std = np.std(pitch_values)
215
  pitch_min = np.min(pitch_values)
216
  pitch_max = np.max(pitch_values)
217
+ monotone_score = 1.0 / (1.0 + pitch_std/10.0)
218
 
219
+ # Energy
220
  rms = self.extract_energy(y)
221
  energy_mean = np.mean(rms)
222
  energy_std = np.std(rms)
223
  energy_max = np.max(rms)
224
 
225
+ # ZCR
226
  zcr = self.extract_zcr(y)
227
  zcr_mean = np.mean(zcr)
228
  zcr_std = np.std(zcr)
229
 
230
+ # Spectral
231
+ spec_centroid, spec_rolloff, spec_bandwidth = self.extract_spectral_features(y)
 
232
 
233
+ # Tempo estimation
234
+ onset_env = rms
235
+ tempo = 120.0 # Default
236
+ if len(onset_env) > 10:
237
+ autocorr = np.correlate(onset_env, onset_env, mode='full')
238
+ autocorr = autocorr[len(autocorr)//2:]
239
+ peaks = signal.find_peaks(autocorr)[0]
240
+ if len(peaks) > 0 and peaks[0] > 0:
241
+ tempo = 60.0 / (peaks[0] * 0.01)
242
+ tempo = np.clip(tempo, 60, 180)
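Worked example of the tempo heuristic above: RMS frames are 10 ms apart, so the first peak of the energy autocorrelation gives the beat period directly.

```python
lag_frames = 50                        # first autocorrelation peak at 50 frames = 0.5 s
tempo = 60.0 / (lag_frames * 0.01)     # 120.0 BPM, then clipped to [60, 180]
print(tempo)
```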
243
 
244
  # Combine features
245
  features = np.concatenate([
 
248
  [pitch_mean, pitch_std, pitch_min, pitch_max, monotone_score],
249
  [energy_mean, energy_std, energy_max],
250
  [zcr_mean, zcr_std],
251
+ [spec_centroid, spec_rolloff, spec_bandwidth],
 
252
  [tempo]
253
  ])
254
 
255
+ # Derived scores
256
+ vocal_affect = self._calc_affect(pitch_std, energy_std, spec_centroid)
257
+ vocal_energy = self._calc_energy(energy_mean, tempo, zcr_mean)
258
 
259
  return {
260
  'features': features.astype(np.float32),
261
+ 'vocal_affect_score': float(vocal_affect),
262
  'monotone_score': float(monotone_score),
263
+ 'vocal_energy_score': float(vocal_energy),
264
  'pitch_variability': float(pitch_std),
265
  'energy_level': float(energy_mean)
266
  }
267
 
268
  except Exception as e:
269
+ print(f"Error: {e}")
270
+ return self._default_features()
 
271
 
272
+ def _calc_affect(self, pitch_std, energy_std, spec_centroid):
273
+ """Calculate vocal affect score"""
274
+ pitch_comp = min(pitch_std / 50.0, 1.0)
275
+ energy_comp = min(energy_std / 0.3, 1.0)
276
+ spec_comp = min(spec_centroid / 2000.0, 1.0)
277
+ return np.clip(pitch_comp * 0.4 + energy_comp * 0.4 + spec_comp * 0.2, 0, 1)
278
 
279
+ def _calc_energy(self, energy_mean, tempo, zcr_mean):
280
+ """Calculate vocal energy score"""
281
+ energy_comp = min(energy_mean / 0.5, 1.0)
282
+ tempo_comp = min(tempo / 150.0, 1.0)
283
+ zcr_comp = min(zcr_mean / 0.15, 1.0)
284
+ return np.clip(energy_comp * 0.5 + tempo_comp * 0.3 + zcr_comp * 0.2, 0, 1)
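Worked example of the two heuristics above, with illustrative values chosen for the arithmetic rather than taken from any recording:

```python
import numpy as np

pitch_std, energy_std, spec_centroid = 25.0, 0.15, 1000.0
affect = np.clip(min(pitch_std / 50.0, 1.0) * 0.4
                 + min(energy_std / 0.3, 1.0) * 0.4
                 + min(spec_centroid / 2000.0, 1.0) * 0.2, 0, 1)
print(affect)   # 0.5*0.4 + 0.5*0.4 + 0.5*0.2 = 0.50

energy_mean, tempo, zcr_mean = 0.25, 120.0, 0.075
energy = np.clip(min(energy_mean / 0.5, 1.0) * 0.5
                 + min(tempo / 150.0, 1.0) * 0.3
                 + min(zcr_mean / 0.15, 1.0) * 0.2, 0, 1)
print(energy)   # 0.5*0.5 + 0.8*0.3 + 0.5*0.2 = 0.59
```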
285
 
286
+ def _default_features(self):
287
+ """Default features for errors"""
288
+ n_features = self.n_mfcc * 2 + 14
289
  return {
290
+ 'features': np.random.randn(n_features).astype(np.float32) * 0.1,
291
  'vocal_affect_score': 0.5,
292
  'monotone_score': 0.5,
293
  'vocal_energy_score': 0.5,
294
+ 'pitch_variability': 30.0,
295
+ 'energy_level': 0.3
296
  }
297
 
298
 
299
  # ============================================
300
+ # EMOTION PREDICTOR
301
  # ============================================
302
 
303
  class EmotionPredictor:
304
+ """Lightweight emotion predictor using sklearn"""
305
 
306
  def __init__(self):
307
+ self.extractor = AudioFeatureExtractor(sr=16000, n_mfcc=20)
 
308
 
309
  # Emotion mapping
310
+ self.emotions = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']
311
 
312
+ # Initialize models
313
+ self._initialize_models()
314
 
315
+ def _initialize_models(self):
316
+ """Initialize pre-trained or demo models"""
 
317
 
318
+ # Try to load pre-trained models
319
+ if os.path.exists('emotion_classifier.pkl'):
320
  try:
321
+ with open('emotion_classifier.pkl', 'rb') as f:
322
+ self.emotion_model = pickle.load(f)
323
+ with open('affect_model.pkl', 'rb') as f:
324
+ self.affect_model = pickle.load(f)
325
+ with open('monotone_model.pkl', 'rb') as f:
326
+ self.monotone_model = pickle.load(f)
327
+ with open('energy_model.pkl', 'rb') as f:
328
+ self.energy_model = pickle.load(f)
329
+ with open('scaler.pkl', 'rb') as f:
330
+ self.scaler = pickle.load(f)
331
+ print("βœ… Loaded pre-trained models")
332
+ return
333
+ except:
334
+ pass
335
+
336
+ # Create demo models (for demonstration without training)
337
+ print("ℹ️ Creating demo models (for demonstration)")
338
+
339
+ n_features = 54 # 20*2 MFCC + 14 other features
340
+
341
+ # Emotion classifier
342
+ self.emotion_model = RandomForestClassifier(
343
+ n_estimators=100,
344
+ max_depth=10,
345
+ random_state=42
346
+ )
347
+
348
+ # Regression models
349
+ self.affect_model = GradientBoostingRegressor(n_estimators=50, random_state=42)
350
+ self.monotone_model = GradientBoostingRegressor(n_estimators=50, random_state=42)
351
+ self.energy_model = GradientBoostingRegressor(n_estimators=50, random_state=42)
352
+
353
+ # Scaler
354
+ self.scaler = StandardScaler()
355
+
356
+ # Fit with dummy data (for demo purposes)
357
+ X_dummy = np.random.randn(100, n_features)
358
+ y_emotion_dummy = np.random.randint(0, 8, 100)
359
+ y_reg_dummy = np.random.rand(100)
360
+
361
+ self.scaler.fit(X_dummy)
362
+ self.emotion_model.fit(X_dummy, y_emotion_dummy)
363
+ self.affect_model.fit(X_dummy, y_reg_dummy)
364
+ self.monotone_model.fit(X_dummy, y_reg_dummy)
365
+ self.energy_model.fit(X_dummy, y_reg_dummy)
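The demo fit above uses random data, so predictions are placeholders. A hypothetical offline training script (file names and label arrays are assumptions, not part of this repo) could produce the pickle files `_initialize_models()` looks for:

```python
import pickle
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler

# Assumed inputs: (n_samples, 54) features from AudioFeatureExtractor.extract_all_features,
# integer emotion labels 0..7, and affect targets in [0, 1].
X = np.load("features.npy")
y_emotion = np.load("emotion_labels.npy")
y_affect = np.load("affect_targets.npy")

scaler = StandardScaler().fit(X)
Xs = scaler.transform(X)

emotion_model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42).fit(Xs, y_emotion)
affect_model = GradientBoostingRegressor(n_estimators=50, random_state=42).fit(Xs, y_affect)
# monotone_model.pkl and energy_model.pkl would be fitted the same way on their own targets.

for name, obj in [("scaler.pkl", scaler),
                  ("emotion_classifier.pkl", emotion_model),
                  ("affect_model.pkl", affect_model)]:
    with open(name, "wb") as f:
        pickle.dump(obj, f)
```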
366
 
367
+ def predict(self, audio_path):
368
  """Predict emotion and mental health indicators"""
369
 
370
  # Extract features
371
+ feature_dict = self.extractor.extract_all_features(audio_path)
372
+ features = feature_dict['features'].reshape(1, -1)
 
373
 
374
+ # Scale features
375
+ features_scaled = self.scaler.transform(features)
 
376
 
377
+ # Predict emotion
378
+ emotion_probs = self.emotion_model.predict_proba(features_scaled)[0]
379
+ emotion_idx = np.argmax(emotion_probs)
380
+ emotion = self.emotions[emotion_idx]
381
+ confidence = emotion_probs[emotion_idx]
382
 
383
+ # Predict regression outputs
384
+ vocal_affect = np.clip(self.affect_model.predict(features_scaled)[0], 0, 1)
385
+ monotone_score = np.clip(self.monotone_model.predict(features_scaled)[0], 0, 1)
386
+ vocal_energy = np.clip(self.energy_model.predict(features_scaled)[0], 0, 1)
387
+
388
+ # Adjust with extracted features for better estimates
389
+ vocal_affect = (vocal_affect + feature_dict['vocal_affect_score']) / 2
390
+ monotone_score = (monotone_score + feature_dict['monotone_score']) / 2
391
+ vocal_energy = (vocal_energy + feature_dict['vocal_energy_score']) / 2
392
 
393
  # Mental health interpretation
394
+ indicators = self._interpret_mental_health(monotone_score, vocal_affect, vocal_energy)
395
 
396
+ return {
397
  'emotion': emotion,
398
  'confidence': confidence,
399
  'emotion_probabilities': {
400
+ self.emotions[i]: prob for i, prob in enumerate(emotion_probs)
 
401
  },
402
  'vocal_affect_score': vocal_affect,
403
  'monotone_speech_score': monotone_score,
404
  'vocal_energy_score': vocal_energy,
405
  'pitch_variability': feature_dict['pitch_variability'],
406
  'energy_level': feature_dict['energy_level'],
407
+ 'mental_health_indicators': indicators
408
  }
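A minimal way to exercise the predictor outside Gradio, assuming a short speech clip saved locally as `sample.wav` (hypothetical file name):

```python
predictor = EmotionPredictor()
result = predictor.predict("sample.wav")
print(result['emotion'], f"{float(result['confidence']):.2f}")
for line in result['mental_health_indicators']:
    print(line)
```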
409
 
410
  def _interpret_mental_health(self, monotone, affect, energy):
411
  """Interpret mental health indicators"""
 
423
  if affect > 0.6 and monotone < 0.4:
424
  indicators.append("⚠️ High vocal affect - possible emotional stress")
425
 
426
+ if 0.35 <= monotone <= 0.65 and 0.35 <= affect <= 0.65 and 0.35 <= energy <= 0.65:
427
+ indicators.append("βœ… Balanced vocal characteristics")
428
 
429
  if not indicators:
430
  indicators.append("ℹ️ Vocal patterns within normal range")
 
436
  # GRADIO INTERFACE
437
  # ============================================
438
 
439
+ def create_app():
440
+ """Create Gradio app"""
441
 
442
  predictor = EmotionPredictor()
 
443
 
444
+ def analyze_audio(audio):
445
+ """Analysis function"""
446
  if audio is None:
447
+ return "❌ Please upload an audio file", "", "", "", "", ""
448
 
449
  try:
 
450
  results = predictor.predict(audio)
451
 
452
  # Format emotion output
453
+ emotion_text = f"## 🎭 **{results['emotion'].upper()}**\n\n"
454
  emotion_text += f"**Confidence:** {results['confidence']*100:.1f}%\n\n"
455
+ emotion_text += "### Probability Distribution:\n"
456
 
457
  for emotion, prob in sorted(results['emotion_probabilities'].items(),
458
  key=lambda x: x[1], reverse=True):
459
+ bar = "β–ˆ" * int(prob * 20) + "β–‘" * (20 - int(prob * 20))
460
+ emotion_text += f"**{emotion.title()}:** {bar} {prob*100:.1f}%\n"
 
461
 
462
  # Format scores
463
+ affect = f"**Score:** {results['vocal_affect_score']:.3f}\n\n"
464
  if results['vocal_affect_score'] > 0.7:
465
+ affect += "πŸ”΄ High intensity"
466
  elif results['vocal_affect_score'] < 0.3:
467
+ affect += "🟒 Low intensity"
468
  else:
469
+ affect += "🟑 Moderate"
470
 
471
+ monotone = f"**Score:** {results['monotone_speech_score']:.3f}\n\n"
472
  if results['monotone_speech_score'] > 0.7:
473
+ monotone += "πŸ”΄ Very flat speech"
474
  elif results['monotone_speech_score'] < 0.3:
475
+ monotone += "🟒 Varied pitch"
476
  else:
477
+ monotone += "🟑 Moderate variation"
478
 
479
+ energy = f"**Score:** {results['vocal_energy_score']:.3f}\n\n"
480
  if results['vocal_energy_score'] > 0.7:
481
+ energy += "πŸ”΄ High energy"
482
  elif results['vocal_energy_score'] < 0.3:
483
+ energy += "πŸ”΄ Low energy"
484
  else:
485
+ energy += "🟒 Normal energy"
486
 
487
+ details = f"**Pitch Variability:** {results['pitch_variability']:.2f} Hz\n"
488
+ details += f"**Energy Level:** {results['energy_level']:.3f}"
489
 
490
+ mental = "\n".join(results['mental_health_indicators'])
491
 
492
+ return emotion_text, affect, monotone, energy, details, mental
493
 
494
  except Exception as e:
495
+ return f"❌ Error: {str(e)}", "", "", "", "", ""
496
 
497
  # Create interface
498
+ with gr.Blocks(theme=gr.themes.Soft()) as demo:
 
499
  gr.Markdown("""
500
  # πŸŽ™οΈ Audio Emotion & Mental Health Detection
501
 
502
+ Analyze emotional state and mental health indicators from speech audio.
503
  """)
504
 
505
  with gr.Row():
506
+ with gr.Column():
507
+ audio = gr.Audio(type="filepath", label="Upload Audio")
508
+ btn = gr.Button("πŸ” Analyze", variant="primary", size="lg")
509
 
510
+ with gr.Column():
511
+ emotion_out = gr.Markdown()
512
 
513
  with gr.Row():
514
+ affect_out = gr.Markdown()
515
+ monotone_out = gr.Markdown()
516
+ energy_out = gr.Markdown()
517
 
518
+ details_out = gr.Markdown()
519
+ mental_out = gr.Markdown()
520
 
521
  gr.Markdown("""
522
+ ### πŸ“Š Interpretation
523
+
524
+ - **Vocal Affect:** Emotional intensity (0=calm, 1=intense)
525
+ - **Monotone Score:** Pitch flatness (high=depression risk)
526
+ - **Vocal Energy:** Speaking energy (low=low motivation)
527
+
528
+ ⚠️ **Disclaimer:** For research only, not medical diagnosis.
529
  """)
530
 
531
+ btn.click(
532
+ analyze_audio,
533
+ inputs=audio,
534
+ outputs=[emotion_out, affect_out, monotone_out, energy_out, details_out, mental_out]
535
  )
536
 
537
  return demo
538
 
539
 
540
 # ============================================
+# MAIN
 # ============================================

 if __name__ == "__main__":
+    app = create_app()
+    app.launch()
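The removed version bound the server explicitly to 0.0.0.0:7860; if that turns out to be needed on Spaces, `launch()` accepts the same arguments (a sketch mirroring the old call, not part of this commit):

```python
if __name__ == "__main__":
    app = create_app()
    # Optional explicit binding, as the previous version did.
    app.launch(server_name="0.0.0.0", server_port=7860, share=False)
```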