Diggz10 committed
Commit deaaabb · verified · 1 Parent(s): 203bd74

Update app.py

Files changed (1)
  1. app.py +148 -21
app.py CHANGED
@@ -1,13 +1,10 @@
-# app.py — Voice Clarity Booster with clear A/B comparison & loudness match
-# - Modes: MetricGAN+ (denoise), SepFormer (dereverb+denoise), Bypass
-# - Dry/Wet, Presence, Low-cut
-# - Loudness Match (optional)
-# - Outputs: Enhanced, A/B alternating (2s O/E flip), Delta (Original−Enhanced), Metrics
+# app.py — Voice Clarity Booster with Presets, Dual-Stage "Ultimate Clean Voice",
+# A/B alternating, Delta (Original−Enhanced), and Loudness Match.

 import os
 import io
 import tempfile
-from typing import Tuple, Optional
+from typing import Tuple, Optional, Dict, Any

 # --- Quiet noisy deprecation warnings (optional) ---
 import warnings
@@ -27,7 +24,7 @@ import soundfile as sf
 import torch
 import torchaudio

-# Optional: pyloudnorm for true LUFS matching; fallback to RMS if not available
+# Optional: pyloudnorm for LUFS match; fallback to RMS if not available
 try:
     import pyloudnorm as pyln
     _HAVE_PYLN = True
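Note: the loudness-matching helper itself sits outside the changed hunks, so only the comment changes here. For context, a minimal sketch of LUFS matching with an RMS fallback, assuming a hypothetical helper named `_match_loudness` (this is not the code in app.py):

```python
import numpy as np

try:
    import pyloudnorm as pyln  # ITU-R BS.1770 loudness meter
    _HAVE_PYLN = True
except Exception:
    _HAVE_PYLN = False


def _match_loudness(enhanced: np.ndarray, reference: np.ndarray, sr: int) -> np.ndarray:
    """Scale `enhanced` toward the loudness of `reference` (hypothetical helper)."""
    eps = 1e-9
    if _HAVE_PYLN:
        meter = pyln.Meter(sr)
        gain_db = (meter.integrated_loudness(reference.astype(np.float64))
                   - meter.integrated_loudness(enhanced.astype(np.float64)))
    else:
        ref_rms = np.sqrt(np.mean(reference ** 2) + eps)
        enh_rms = np.sqrt(np.mean(enhanced ** 2) + eps)
        gain_db = 20.0 * np.log10(ref_rms / (enh_rms + eps))
    out = enhanced * (10.0 ** (gain_db / 20.0))
    return np.clip(out, -1.0, 1.0).astype(np.float32)  # keep the result within full scale
```

Matching in LUFS tracks perceived loudness more closely than plain RMS, which is why pyloudnorm is preferred when it is installed.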
@@ -96,6 +93,7 @@ def _to_mono(wav: np.ndarray) -> np.ndarray:
         if t <= 8:  # [C, T]
             return wav.mean(axis=0).astype(np.float32)
         return wav.mean(axis=1).astype(np.float32)
+    # Higher dims: flatten
     return wav.reshape(-1).astype(np.float32)


@@ -184,7 +182,7 @@ def _make_ab_alternating(orig: np.ndarray, enh: np.ndarray, sr: int, seg_sec: fl


 # -----------------------------
-# Core pipeline
+# Model runners
 # -----------------------------
 def _run_metricgan(path_16k: str) -> torch.Tensor:
     enh = _get_metricgan()
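`_get_metricgan()` is defined earlier in app.py and is not part of this diff. Assuming it wraps SpeechBrain's pretrained MetricGAN+ interface (the `speechbrain.inference` namespace in SpeechBrain ≥ 1.0; older releases expose the same class as `speechbrain.pretrained.SpectralMaskEnhancement`), a loader along these lines would satisfy the calls in the hunk; the repo id and cache directory are illustrative:

```python
import torch
import torchaudio
from speechbrain.inference.enhancement import SpectralMaskEnhancement

_metricgan = None  # lazily-loaded singleton


def _get_metricgan() -> SpectralMaskEnhancement:
    # Download/load the MetricGAN+ checkpoint once and reuse it across calls.
    global _metricgan
    if _metricgan is None:
        _metricgan = SpectralMaskEnhancement.from_hparams(
            source="speechbrain/metricgan-plus-voicebank",
            savedir="pretrained_models/metricgan-plus-voicebank",
        )
    return _metricgan


def run_metricgan_sketch(path_16k: str) -> torch.Tensor:
    # Mirrors the shape contract used in the diff: 16 kHz mono in, [1, T] tensor out.
    noisy, sr = torchaudio.load(path_16k)   # [1, T]
    lengths = torch.tensor([1.0])           # relative lengths for a batch of one
    return _get_metricgan().enhance_batch(noisy, lengths=lengths)
```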
@@ -218,6 +216,31 @@ def _run_sepformer(path_16k: str) -> torch.Tensor:
     raise RuntimeError("Unexpected SepFormer output type")


+def _run_dual_stage(path_16k: str) -> torch.Tensor:
+    """
+    Ultimate Clean: SepFormer (dereverb/denoise) -> MetricGAN+ (denoise polish).
+    Both at 16 kHz mono.
+    """
+    # Stage 1: SepFormer
+    stage1 = _run_sepformer(path_16k)  # [1, T]
+    # Save Stage 1 to temp wav, then Stage 2 MetricGAN+
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_mid:
+        sf.write(tmp_mid.name, stage1.squeeze(0).numpy(), 16000, subtype="PCM_16")
+        tmp_mid.flush()
+        mid_path = tmp_mid.name
+    try:
+        stage2 = _run_metricgan(mid_path)  # [1, T]
+    finally:
+        try:
+            os.remove(mid_path)
+        except Exception:
+            pass
+    return stage2
+
+
+# -----------------------------
+# Core pipeline
+# -----------------------------
 def _enhance_numpy_audio(
     audio: Tuple[int, np.ndarray],
     mode: str = "MetricGAN+ (denoise)",
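The new `_run_dual_stage` above expects a path to a 16 kHz mono wav, which the core pipeline (next hunks) prepares before calling it. A simplified, self-contained sketch of that hand-off; `run_dual_stage_on_array`, its mono handling, and the direct use of `torchaudio.functional.resample` are stand-ins for the app's own `_resample_torch` path:

```python
import os
import tempfile

import numpy as np
import soundfile as sf
import torch
import torchaudio


def run_dual_stage_on_array(wav: np.ndarray, sr: int) -> np.ndarray:
    # Collapse to mono (assumes a [T, C] layout) and resample to the 16 kHz the models expect.
    mono = wav.mean(axis=1) if wav.ndim == 2 else wav
    t16 = torchaudio.functional.resample(
        torch.from_numpy(mono.astype(np.float32)).unsqueeze(0), sr, 16000
    )
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        sf.write(tmp.name, t16.squeeze(0).numpy(), 16000, subtype="PCM_16")
        path_16k = tmp.name
    try:
        out = _run_dual_stage(path_16k)  # [1, T@16k]
    finally:
        os.remove(path_16k)
    # Back to the caller's sample rate for playback.
    return torchaudio.functional.resample(out, 16000, sr).squeeze(0).numpy()
```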
@@ -256,6 +279,8 @@ def _enhance_numpy_audio(
             proc = _run_metricgan(path_16k)   # [1, T@16k]
         elif mode.startswith("SepFormer"):
             proc = _run_sepformer(path_16k)   # [1, T@16k]
+        elif mode.startswith("Dual-Stage"):
+            proc = _run_dual_stage(path_16k)  # [1, T@16k]
         else:  # Bypass (EQ only)
             proc = wav_16k
     finally:
@@ -274,7 +299,7 @@ def _enhance_numpy_audio(
     proc_out = _resample_torch(proc, 16000, sr_out).squeeze(0).numpy().astype(np.float32)
     dry_out = _resample_torch(dry_t, sr_in, sr_out).squeeze(0).numpy().astype(np.float32)

-    # Align and mix
+    # Align and mix (dry/wet)
     proc_out, dry_out = _align_lengths(proc_out, dry_out)
     dry_wet = float(np.clip(dry_wet, 0.0, 1.0))
     enhanced = proc_out * dry_wet + dry_out * (1.0 - dry_wet)
@@ -299,13 +324,93 @@ def _enhance_numpy_audio(
     metrics = (
         f"Mode: {mode} | Dry/Wet: {dry_wet*100:.0f}% | Presence: {presence_db:+.1f} dB | "
         f"Low-cut: {lowcut_hz:.0f} Hz | Loudness match: {loud_text}\n"
-        f"Dur: {len(enhanced)/sr_out:.2f}s | Δ (original−enhanced) RMS: {20*np.log10(rms_delta+eps):+.2f} dBFS | "
+        f"Dur: {len(enhanced)/sr_out:.2f}s | Δ RMS: {20*np.log10(rms_delta+eps):+.2f} dBFS | "
         f'Approx. "noise removed" ratio: {change_db:.2f} dB'
     )

     return sr_out, enhanced, delta, metrics


+# -----------------------------
+# Presets
+# -----------------------------
+PRESETS: Dict[str, Dict[str, Any]] = {
+    # Maximum cleanup: dereverb + denoise chain, high dry/wet, subtle presence, mild HPF
+    "Ultimate Clean Voice": {
+        "mode": "Dual-Stage (SepFormer → MetricGAN+)",
+        "dry_wet": 0.92,
+        "presence_db": 1.5,
+        "lowcut_hz": 80.0,
+        "loudness_match": True,
+    },
+    # Natural cleanup for most cases
+    "Natural Speech": {
+        "mode": "MetricGAN+ (denoise)",
+        "dry_wet": 0.85,
+        "presence_db": 1.0,
+        "lowcut_hz": 50.0,
+        "loudness_match": True,
+    },
+    # Studio-ish clarity
+    "Podcast Studio": {
+        "mode": "MetricGAN+ (denoise)",
+        "dry_wet": 0.9,
+        "presence_db": 2.0,
+        "lowcut_hz": 75.0,
+        "loudness_match": True,
+    },
+    # Strong dereverb, blend to avoid artifacts
+    "Room Dereverb": {
+        "mode": "SepFormer (dereverb+denoise)",
+        "dry_wet": 0.7,
+        "presence_db": 0.5,
+        "lowcut_hz": 60.0,
+        "loudness_match": True,
+    },
+    # When music bed is under voice—be gentle
+    "Music + Voice Safe": {
+        "mode": "MetricGAN+ (denoise)",
+        "dry_wet": 0.6,
+        "presence_db": 0.0,
+        "lowcut_hz": 40.0,
+        "loudness_match": True,
+    },
+    # Harsh phone/zoom recordings
+    "Phone Call Rescue": {
+        "mode": "MetricGAN+ (denoise)",
+        "dry_wet": 0.88,
+        "presence_db": 2.0,
+        "lowcut_hz": 100.0,
+        "loudness_match": True,
+    },
+    # Light touch
+    "Gentle Denoise": {
+        "mode": "MetricGAN+ (denoise)",
+        "dry_wet": 0.65,
+        "presence_db": 0.0,
+        "lowcut_hz": 0.0,
+        "loudness_match": True,
+    },
+    "Custom": {}  # no-op, keeps current settings
+}
+
+
+def _apply_preset(preset_name: str):
+    cfg = PRESETS.get(preset_name, {})
+    # Return gr.update() for each adjustable control
+    def upd(val=None):
+        return gr.update(value=val) if val is not None else gr.update()
+    if not cfg or preset_name == "Custom":
+        return upd(), upd(), upd(), upd(), upd()
+    return (
+        upd(cfg["mode"]),
+        upd(int(round(cfg["dry_wet"] * 100))),
+        upd(float(cfg["presence_db"])),
+        upd(float(cfg["lowcut_hz"])),
+        upd(bool(cfg["loudness_match"])),
+    )
+
+
 # -----------------------------
 # Gradio UI
 # -----------------------------
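The `rms_delta` and `change_db` values in the metrics string are computed outside this hunk. A rough sketch of one way to derive them, assuming the Delta output is simply the length-aligned original minus the enhanced mix (the exact definitions in app.py may differ):

```python
import numpy as np


def delta_and_metrics(original: np.ndarray, enhanced: np.ndarray):
    """Hypothetical reconstruction of the Delta signal and its summary numbers."""
    eps = 1e-9
    n = min(len(original), len(enhanced))
    delta = (original[:n] - enhanced[:n]).astype(np.float32)  # what processing removed/changed

    def rms(x: np.ndarray) -> float:
        return float(np.sqrt(np.mean(x.astype(np.float64) ** 2) + eps))

    rms_delta = rms(delta)
    delta_dbfs = 20.0 * np.log10(rms_delta + eps)  # matches the "Δ RMS ... dBFS" field
    # Rough "noise removed" ratio: removed energy relative to the original, in dB.
    change_db = 20.0 * np.log10(rms_delta / (rms(original[:n]) + eps) + eps)
    return delta, delta_dbfs, change_db
```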
@@ -342,28 +447,41 @@ def gradio_enhance(


 with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("## Voice Clarity Booster — with A/B and Delta listening")
+    gr.Markdown("## Voice Clarity Booster — Presets, A/B, Delta, Loudness Match")
+
     with gr.Row():
-        with gr.Column():
+        with gr.Column(scale=1):
             in_audio = gr.Audio(
                 sources=["upload", "microphone"],
                 type="numpy",
                 label="Input",
             )
+            preset = gr.Dropdown(
+                choices=list(PRESETS.keys()),
+                value="Ultimate Clean Voice",
+                label="Preset",
+            )
+
+            # Controls that presets will adjust
             mode = gr.Radio(
-                choices=["MetricGAN+ (denoise)", "SepFormer (dereverb+denoise)", "Bypass (EQ only)"],
-                value="MetricGAN+ (denoise)",
+                choices=[
+                    "MetricGAN+ (denoise)",
+                    "SepFormer (dereverb+denoise)",
+                    "Dual-Stage (SepFormer → MetricGAN+)",
+                    "Bypass (EQ only)"
+                ],
+                value="Dual-Stage (SepFormer → MetricGAN+)",
                 label="Mode",
             )
             dry_wet = gr.Slider(
-                minimum=0, maximum=100, value=85, step=1,
+                minimum=0, maximum=100, value=92, step=1,
                 label="Dry/Wet Mix (%) — lower to reduce artifacts"
             )
             presence = gr.Slider(
-                minimum=-12, maximum=12, value=0, step=0.5, label="Presence Boost (dB)"
+                minimum=-12, maximum=12, value=1.5, step=0.5, label="Presence Boost (dB)"
             )
             lowcut = gr.Slider(
-                minimum=0, maximum=200, value=0, step=5, label="Low-Cut (Hz)"
+                minimum=0, maximum=200, value=80, step=5, label="Low-Cut (Hz)"
             )
             loudmatch = gr.Checkbox(value=True, label="Loudness-match enhanced to original")
             out_sr = gr.Radio(
@@ -371,11 +489,20 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
                 value="Original",
                 label="Output Sample Rate",
             )
-            btn = gr.Button("Enhance")
-        with gr.Column():
+
+            # Apply preset on change
+            preset.change(
+                _apply_preset,
+                inputs=[preset],
+                outputs=[mode, dry_wet, presence, lowcut, loudmatch],
+            )
+
+            btn = gr.Button("Enhance", variant="primary")
+
+        with gr.Column(scale=1):
             out_audio = gr.Audio(type="numpy", label="Enhanced (autoplay)", autoplay=True)
-            ab_audio = gr.Audio(type="numpy", label="A/B Alternating (2s O → 2s E)", autoplay=False)
-            delta_audio = gr.Audio(type="numpy", label="Delta: Original − Enhanced", autoplay=False)
+            ab_audio = gr.Audio(type="numpy", label="A/B Alternating (2s O → 2s E)")
+            delta_audio = gr.Audio(type="numpy", label="Delta: Original − Enhanced")
             metrics = gr.Markdown("")

     btn.click(
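The hunk ends on the opening `btn.click(` line, so the call's arguments fall outside the diff. A hypothetical reconstruction of the wiring, inferred only from the components visible above (names and order may differ from the real app.py):

```python
# Inside the `with gr.Blocks(...) as demo:` context, after the components are defined.
btn.click(
    gradio_enhance,
    inputs=[in_audio, mode, dry_wet, presence, lowcut, loudmatch, out_sr],
    outputs=[out_audio, ab_audio, delta_audio, metrics],
)
```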