Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,13 +1,10 @@
|
|
| 1 |
-
# app.py — Voice Clarity Booster with
|
| 2 |
-
#
|
| 3 |
-
# - Dry/Wet, Presence, Low-cut
|
| 4 |
-
# - Loudness Match (optional)
|
| 5 |
-
# - Outputs: Enhanced, A/B alternating (2s O/E flip), Delta (Original−Enhanced), Metrics
|
| 6 |
|
| 7 |
import os
|
| 8 |
import io
|
| 9 |
import tempfile
|
| 10 |
-
from typing import Tuple, Optional
|
| 11 |
|
| 12 |
# --- Quiet noisy deprecation warnings (optional) ---
|
| 13 |
import warnings
|
|
@@ -27,7 +24,7 @@ import soundfile as sf
|
|
| 27 |
import torch
|
| 28 |
import torchaudio
|
| 29 |
|
| 30 |
-
# Optional: pyloudnorm for
|
| 31 |
try:
|
| 32 |
import pyloudnorm as pyln
|
| 33 |
_HAVE_PYLN = True
|
|
@@ -96,6 +93,7 @@ def _to_mono(wav: np.ndarray) -> np.ndarray:
|
|
| 96 |
if t <= 8: # [C, T]
|
| 97 |
return wav.mean(axis=0).astype(np.float32)
|
| 98 |
return wav.mean(axis=1).astype(np.float32)
|
|
|
|
| 99 |
return wav.reshape(-1).astype(np.float32)
|
| 100 |
|
| 101 |
|
|
@@ -184,7 +182,7 @@ def _make_ab_alternating(orig: np.ndarray, enh: np.ndarray, sr: int, seg_sec: fl
|
|
| 184 |
|
| 185 |
|
| 186 |
# -----------------------------
|
| 187 |
-
#
|
| 188 |
# -----------------------------
|
| 189 |
def _run_metricgan(path_16k: str) -> torch.Tensor:
|
| 190 |
enh = _get_metricgan()
|
|
@@ -218,6 +216,31 @@ def _run_sepformer(path_16k: str) -> torch.Tensor:
|
|
| 218 |
raise RuntimeError("Unexpected SepFormer output type")
|
| 219 |
|
| 220 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 221 |
def _enhance_numpy_audio(
|
| 222 |
audio: Tuple[int, np.ndarray],
|
| 223 |
mode: str = "MetricGAN+ (denoise)",
|
|
@@ -256,6 +279,8 @@ def _enhance_numpy_audio(
|
|
| 256 |
proc = _run_metricgan(path_16k) # [1, T@16k]
|
| 257 |
elif mode.startswith("SepFormer"):
|
| 258 |
proc = _run_sepformer(path_16k) # [1, T@16k]
|
|
|
|
|
|
|
| 259 |
else: # Bypass (EQ only)
|
| 260 |
proc = wav_16k
|
| 261 |
finally:
|
|
@@ -274,7 +299,7 @@ def _enhance_numpy_audio(
|
|
| 274 |
proc_out = _resample_torch(proc, 16000, sr_out).squeeze(0).numpy().astype(np.float32)
|
| 275 |
dry_out = _resample_torch(dry_t, sr_in, sr_out).squeeze(0).numpy().astype(np.float32)
|
| 276 |
|
| 277 |
-
# Align and mix
|
| 278 |
proc_out, dry_out = _align_lengths(proc_out, dry_out)
|
| 279 |
dry_wet = float(np.clip(dry_wet, 0.0, 1.0))
|
| 280 |
enhanced = proc_out * dry_wet + dry_out * (1.0 - dry_wet)
|
|
@@ -299,13 +324,93 @@ def _enhance_numpy_audio(
|
|
| 299 |
metrics = (
|
| 300 |
f"Mode: {mode} | Dry/Wet: {dry_wet*100:.0f}% | Presence: {presence_db:+.1f} dB | "
|
| 301 |
f"Low-cut: {lowcut_hz:.0f} Hz | Loudness match: {loud_text}\n"
|
| 302 |
-
f"Dur: {len(enhanced)/sr_out:.2f}s | Δ
|
| 303 |
f'Approx. "noise removed" ratio: {change_db:.2f} dB'
|
| 304 |
)
|
| 305 |
|
| 306 |
return sr_out, enhanced, delta, metrics
|
| 307 |
|
| 308 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 309 |
# -----------------------------
|
| 310 |
# Gradio UI
|
| 311 |
# -----------------------------
|
|
@@ -342,28 +447,41 @@ def gradio_enhance(
|
|
| 342 |
|
| 343 |
|
| 344 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
| 345 |
-
gr.Markdown("## Voice Clarity Booster —
|
|
|
|
| 346 |
with gr.Row():
|
| 347 |
-
with gr.Column():
|
| 348 |
in_audio = gr.Audio(
|
| 349 |
sources=["upload", "microphone"],
|
| 350 |
type="numpy",
|
| 351 |
label="Input",
|
| 352 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 353 |
mode = gr.Radio(
|
| 354 |
-
choices=[
|
| 355 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 356 |
label="Mode",
|
| 357 |
)
|
| 358 |
dry_wet = gr.Slider(
|
| 359 |
-
minimum=0, maximum=100, value=
|
| 360 |
label="Dry/Wet Mix (%) — lower to reduce artifacts"
|
| 361 |
)
|
| 362 |
presence = gr.Slider(
|
| 363 |
-
minimum=-12, maximum=12, value=
|
| 364 |
)
|
| 365 |
lowcut = gr.Slider(
|
| 366 |
-
minimum=0, maximum=200, value=
|
| 367 |
)
|
| 368 |
loudmatch = gr.Checkbox(value=True, label="Loudness-match enhanced to original")
|
| 369 |
out_sr = gr.Radio(
|
|
@@ -371,11 +489,20 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
| 371 |
value="Original",
|
| 372 |
label="Output Sample Rate",
|
| 373 |
)
|
| 374 |
-
|
| 375 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 376 |
out_audio = gr.Audio(type="numpy", label="Enhanced (autoplay)", autoplay=True)
|
| 377 |
-
ab_audio = gr.Audio(type="numpy", label="A/B Alternating (2s O → 2s E)"
|
| 378 |
-
delta_audio = gr.Audio(type="numpy", label="Delta: Original − Enhanced"
|
| 379 |
metrics = gr.Markdown("")
|
| 380 |
|
| 381 |
btn.click(
|
|
|
|
| 1 |
+
# app.py — Voice Clarity Booster with Presets, Dual-Stage "Ultimate Clean Voice",
|
| 2 |
+
# A/B alternating, Delta (Original−Enhanced), and Loudness Match.
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
import os
|
| 5 |
import io
|
| 6 |
import tempfile
|
| 7 |
+
from typing import Tuple, Optional, Dict, Any
|
| 8 |
|
| 9 |
# --- Quiet noisy deprecation warnings (optional) ---
|
| 10 |
import warnings
|
|
|
|
| 24 |
import torch
|
| 25 |
import torchaudio
|
| 26 |
|
| 27 |
+
# Optional: pyloudnorm for LUFS match; fallback to RMS if not available
|
| 28 |
try:
|
| 29 |
import pyloudnorm as pyln
|
| 30 |
_HAVE_PYLN = True
|
|
|
|
| 93 |
if t <= 8: # [C, T]
|
| 94 |
return wav.mean(axis=0).astype(np.float32)
|
| 95 |
return wav.mean(axis=1).astype(np.float32)
|
| 96 |
+
# Higher dims: flatten
|
| 97 |
return wav.reshape(-1).astype(np.float32)
|
| 98 |
|
| 99 |
|
|
|
|
| 182 |
|
| 183 |
|
| 184 |
# -----------------------------
|
| 185 |
+
# Model runners
|
| 186 |
# -----------------------------
|
| 187 |
def _run_metricgan(path_16k: str) -> torch.Tensor:
|
| 188 |
enh = _get_metricgan()
|
|
|
|
| 216 |
raise RuntimeError("Unexpected SepFormer output type")
|
| 217 |
|
| 218 |
|
| 219 |
+
def _run_dual_stage(path_16k: str) -> torch.Tensor:
    """
    Ultimate Clean: SepFormer (dereverb/denoise) -> MetricGAN+ (denoise polish).
    Both at 16 kHz mono.

    The SepFormer result is written to a temporary 16 kHz PCM_16 WAV because
    ``_run_metricgan`` takes a file path; that file is removed before returning,
    whether or not either stage fails.

    Args:
        path_16k: Path of the WAV file handed to ``_run_sepformer``.

    Returns:
        Whatever ``_run_metricgan`` returns for the intermediate file
        (annotated as ``[1, T]`` elsewhere in this file).

    Raises:
        Any exception raised by ``_run_sepformer``, ``sf.write`` or
        ``_run_metricgan`` propagates to the caller.
    """
    # Stage 1: SepFormer
    stage1 = _run_sepformer(path_16k)  # [1, T]

    # Reserve a path for the intermediate file and close our own handle right
    # away, so soundfile is the only writer of that path.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_mid:
        mid_path = tmp_mid.name

    try:
        # detach()/cpu() are no-ops for a plain CPU tensor, but keep .numpy()
        # from raising if the model output requires grad or sits on another device.
        mid_samples = stage1.detach().cpu().squeeze(0).numpy()
        sf.write(mid_path, mid_samples, 16000, subtype="PCM_16")

        # Stage 2: MetricGAN+ on the Stage 1 output
        stage2 = _run_metricgan(mid_path)  # [1, T]
    finally:
        # Best-effort cleanup; the write sits inside the try so a failed write
        # cannot leave the temp file behind.
        try:
            os.remove(mid_path)
        except OSError:
            pass
    return stage2
|
| 239 |
+
|
| 240 |
+
|
| 241 |
+
# -----------------------------
|
| 242 |
+
# Core pipeline
|
| 243 |
+
# -----------------------------
|
| 244 |
def _enhance_numpy_audio(
|
| 245 |
audio: Tuple[int, np.ndarray],
|
| 246 |
mode: str = "MetricGAN+ (denoise)",
|
|
|
|
| 279 |
proc = _run_metricgan(path_16k) # [1, T@16k]
|
| 280 |
elif mode.startswith("SepFormer"):
|
| 281 |
proc = _run_sepformer(path_16k) # [1, T@16k]
|
| 282 |
+
elif mode.startswith("Dual-Stage"):
|
| 283 |
+
proc = _run_dual_stage(path_16k) # [1, T@16k]
|
| 284 |
else: # Bypass (EQ only)
|
| 285 |
proc = wav_16k
|
| 286 |
finally:
|
|
|
|
| 299 |
proc_out = _resample_torch(proc, 16000, sr_out).squeeze(0).numpy().astype(np.float32)
|
| 300 |
dry_out = _resample_torch(dry_t, sr_in, sr_out).squeeze(0).numpy().astype(np.float32)
|
| 301 |
|
| 302 |
+
# Align and mix (dry/wet)
|
| 303 |
proc_out, dry_out = _align_lengths(proc_out, dry_out)
|
| 304 |
dry_wet = float(np.clip(dry_wet, 0.0, 1.0))
|
| 305 |
enhanced = proc_out * dry_wet + dry_out * (1.0 - dry_wet)
|
|
|
|
| 324 |
metrics = (
|
| 325 |
f"Mode: {mode} | Dry/Wet: {dry_wet*100:.0f}% | Presence: {presence_db:+.1f} dB | "
|
| 326 |
f"Low-cut: {lowcut_hz:.0f} Hz | Loudness match: {loud_text}\n"
|
| 327 |
+
f"Dur: {len(enhanced)/sr_out:.2f}s | Δ RMS: {20*np.log10(rms_delta+eps):+.2f} dBFS | "
|
| 328 |
f'Approx. "noise removed" ratio: {change_db:.2f} dB'
|
| 329 |
)
|
| 330 |
|
| 331 |
return sr_out, enhanced, delta, metrics
|
| 332 |
|
| 333 |
|
| 334 |
+
# -----------------------------
|
| 335 |
+
# Presets
|
| 336 |
+
# -----------------------------
|
| 337 |
+
PRESETS: Dict[str, Dict[str, Any]] = {
    # Heaviest processing: two-stage chain, mostly wet, slight presence lift, mild high-pass.
    "Ultimate Clean Voice": dict(
        mode="Dual-Stage (SepFormer → MetricGAN+)",
        dry_wet=0.92,
        presence_db=1.5,
        lowcut_hz=80.0,
        loudness_match=True,
    ),
    # General-purpose cleanup that keeps the voice natural.
    "Natural Speech": dict(
        mode="MetricGAN+ (denoise)",
        dry_wet=0.85,
        presence_db=1.0,
        lowcut_hz=50.0,
        loudness_match=True,
    ),
    # Studio-style clarity.
    "Podcast Studio": dict(
        mode="MetricGAN+ (denoise)",
        dry_wet=0.9,
        presence_db=2.0,
        lowcut_hz=75.0,
        loudness_match=True,
    ),
    # Strong dereverb, blended with the dry signal to limit artifacts.
    "Room Dereverb": dict(
        mode="SepFormer (dereverb+denoise)",
        dry_wet=0.7,
        presence_db=0.5,
        lowcut_hz=60.0,
        loudness_match=True,
    ),
    # Gentle settings for a voice over a music bed.
    "Music + Voice Safe": dict(
        mode="MetricGAN+ (denoise)",
        dry_wet=0.6,
        presence_db=0.0,
        lowcut_hz=40.0,
        loudness_match=True,
    ),
    # Harsh phone / video-call recordings.
    "Phone Call Rescue": dict(
        mode="MetricGAN+ (denoise)",
        dry_wet=0.88,
        presence_db=2.0,
        lowcut_hz=100.0,
        loudness_match=True,
    ),
    # Light touch: no EQ, moderate blend.
    "Gentle Denoise": dict(
        mode="MetricGAN+ (denoise)",
        dry_wet=0.65,
        presence_db=0.0,
        lowcut_hz=0.0,
        loudness_match=True,
    ),
    # Empty on purpose: selecting it leaves the current control values untouched.
    "Custom": dict(),
}
|
| 396 |
+
|
| 397 |
+
|
| 398 |
+
def _apply_preset(preset_name: str):
    """
    Translate a preset name into updates for the five preset-driven controls.

    Returns a 5-tuple of ``gr.update(...)`` objects ordered as
    (mode, dry/wet percent, presence dB, low-cut Hz, loudness match).
    For "Custom", an unknown name, or an empty preset entry, every element
    is a bare ``gr.update()`` so the current control values are kept.
    """
    def _as_update(value=None):
        # A bare gr.update() leaves the control unchanged.
        if value is None:
            return gr.update()
        return gr.update(value=value)

    settings = PRESETS.get(preset_name, {})
    if preset_name == "Custom" or not settings:
        return tuple(_as_update() for _ in range(5))

    # Presets store dry/wet as a 0..1 fraction; it is converted to a whole percent here.
    mix_percent = int(round(settings["dry_wet"] * 100))
    return (
        _as_update(settings["mode"]),
        _as_update(mix_percent),
        _as_update(float(settings["presence_db"])),
        _as_update(float(settings["lowcut_hz"])),
        _as_update(bool(settings["loudness_match"])),
    )
|
| 412 |
+
|
| 413 |
+
|
| 414 |
# -----------------------------
|
| 415 |
# Gradio UI
|
| 416 |
# -----------------------------
|
|
|
|
| 447 |
|
| 448 |
|
| 449 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
| 450 |
+
gr.Markdown("## Voice Clarity Booster — Presets, A/B, Delta, Loudness Match")
|
| 451 |
+
|
| 452 |
with gr.Row():
|
| 453 |
+
with gr.Column(scale=1):
|
| 454 |
in_audio = gr.Audio(
|
| 455 |
sources=["upload", "microphone"],
|
| 456 |
type="numpy",
|
| 457 |
label="Input",
|
| 458 |
)
|
| 459 |
+
preset = gr.Dropdown(
|
| 460 |
+
choices=list(PRESETS.keys()),
|
| 461 |
+
value="Ultimate Clean Voice",
|
| 462 |
+
label="Preset",
|
| 463 |
+
)
|
| 464 |
+
|
| 465 |
+
# Controls that presets will adjust
|
| 466 |
mode = gr.Radio(
|
| 467 |
+
choices=[
|
| 468 |
+
"MetricGAN+ (denoise)",
|
| 469 |
+
"SepFormer (dereverb+denoise)",
|
| 470 |
+
"Dual-Stage (SepFormer → MetricGAN+)",
|
| 471 |
+
"Bypass (EQ only)"
|
| 472 |
+
],
|
| 473 |
+
value="Dual-Stage (SepFormer → MetricGAN+)",
|
| 474 |
label="Mode",
|
| 475 |
)
|
| 476 |
dry_wet = gr.Slider(
|
| 477 |
+
minimum=0, maximum=100, value=92, step=1,
|
| 478 |
label="Dry/Wet Mix (%) — lower to reduce artifacts"
|
| 479 |
)
|
| 480 |
presence = gr.Slider(
|
| 481 |
+
minimum=-12, maximum=12, value=1.5, step=0.5, label="Presence Boost (dB)"
|
| 482 |
)
|
| 483 |
lowcut = gr.Slider(
|
| 484 |
+
minimum=0, maximum=200, value=80, step=5, label="Low-Cut (Hz)"
|
| 485 |
)
|
| 486 |
loudmatch = gr.Checkbox(value=True, label="Loudness-match enhanced to original")
|
| 487 |
out_sr = gr.Radio(
|
|
|
|
| 489 |
value="Original",
|
| 490 |
label="Output Sample Rate",
|
| 491 |
)
|
| 492 |
+
|
| 493 |
+
# Apply preset on change
|
| 494 |
+
preset.change(
|
| 495 |
+
_apply_preset,
|
| 496 |
+
inputs=[preset],
|
| 497 |
+
outputs=[mode, dry_wet, presence, lowcut, loudmatch],
|
| 498 |
+
)
|
| 499 |
+
|
| 500 |
+
btn = gr.Button("Enhance", variant="primary")
|
| 501 |
+
|
| 502 |
+
with gr.Column(scale=1):
|
| 503 |
out_audio = gr.Audio(type="numpy", label="Enhanced (autoplay)", autoplay=True)
|
| 504 |
+
ab_audio = gr.Audio(type="numpy", label="A/B Alternating (2s O → 2s E)")
|
| 505 |
+
delta_audio = gr.Audio(type="numpy", label="Delta: Original − Enhanced")
|
| 506 |
metrics = gr.Markdown("")
|
| 507 |
|
| 508 |
btn.click(
|