Diggz10 committed on
Commit 09dbb5c · verified · 1 Parent(s): bcdc929

Update app.py

Files changed (1)
app.py +153 -88
app.py CHANGED
@@ -1,14 +1,12 @@
-# app.py — Voice Clarity Booster (MetricGAN+) for Hugging Face Spaces
-# Fixes:
-# - Robust mono conversion (handles [T], [T,C], [C,T]) to prevent 50-byte WAVs.
-# - Output autoplay, NaN/Inf sanitization, tiny-output fallback.
+# app.py — Voice Clarity Booster with mode switch + dry/wet mix
+# Modes: MetricGAN+ (denoise) | SepFormer (dereverb+denoise) | Bypass (EQ only)
 
-import io
 import os
+import io
 import tempfile
 from typing import Tuple, Optional
 
-# ---- Quiet noisy deprecation warnings (optional) ----
+# --- Quiet noisy deprecation warnings (optional) ---
 import warnings
 warnings.filterwarnings(
     "ignore",
@@ -26,32 +24,47 @@ import soundfile as sf
 import torch
 import torchaudio
 
-# ---- SpeechBrain import: prefer new API, fall back if older version ----
+# Prefer new SpeechBrain API; fall back for older versions
 try:
-    # SpeechBrain >= 1.0
     from speechbrain.inference import SpectralMaskEnhancement
-except Exception:  # pragma: no cover
-    # Older SpeechBrain (<1.0)
+except Exception:  # < 1.0
     from speechbrain.pretrained import SpectralMaskEnhancement  # type: ignore
 
+try:
+    # SepFormer enhancement model (WHAMR) via separation interface
+    from speechbrain.inference import SepformerSeparation
+except Exception:
+    from speechbrain.pretrained import SepformerSeparation  # type: ignore
+
 
 # -----------------------------
-# Model: SpeechBrain MetricGAN+
+# Cached models
 # -----------------------------
-_ENHANCER: Optional[SpectralMaskEnhancement] = None
 _DEVICE = "cpu"
+_ENHANCER_METRICGAN: Optional[SpectralMaskEnhancement] = None
+_ENHANCER_SEPFORMER: Optional[SepformerSeparation] = None
 
 
-def _get_enhancer() -> SpectralMaskEnhancement:
-    """Lazily load the enhancer and cache it."""
-    global _ENHANCER
-    if _ENHANCER is None:
-        _ENHANCER = SpectralMaskEnhancement.from_hparams(
+def _get_metricgan() -> SpectralMaskEnhancement:
+    global _ENHANCER_METRICGAN
+    if _ENHANCER_METRICGAN is None:
+        _ENHANCER_METRICGAN = SpectralMaskEnhancement.from_hparams(
             source="speechbrain/metricgan-plus-voicebank",
             savedir="pretrained/metricgan_plus_voicebank",
             run_opts={"device": _DEVICE},
         )
-    return _ENHANCER
+    return _ENHANCER_METRICGAN
+
+
+def _get_sepformer() -> SepformerSeparation:
+    global _ENHANCER_SEPFORMER
+    if _ENHANCER_SEPFORMER is None:
+        _ENHANCER_SEPFORMER = SepformerSeparation.from_hparams(
+            source="speechbrain/sepformer-whamr-enhancement",
+            savedir="pretrained/sepformer_whamr_enh",
+            run_opts={"device": _DEVICE},
+        )
+    return _ENHANCER_SEPFORMER
 
 
 # -----------------------------
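
Note: the two loaders above cache each model in a module-level global, so only the first request pays the download and load cost. A minimal smoke test of the same pattern outside the app (a sketch, assuming speechbrain and soundfile are installed; "noisy_16k.wav" is a placeholder for any 16 kHz mono WAV):

import soundfile as sf
from speechbrain.inference import SpectralMaskEnhancement  # >= 1.0 layout

enhancer = SpectralMaskEnhancement.from_hparams(
    source="speechbrain/metricgan-plus-voicebank",
    savedir="pretrained/metricgan_plus_voicebank",
    run_opts={"device": "cpu"},
)
# enhance_file reads the WAV, applies the spectral mask, and returns a waveform tensor
enhanced = enhancer.enhance_file("noisy_16k.wav")  # placeholder path
sf.write("enhanced_16k.wav", enhanced.squeeze().cpu().numpy(), 16000)
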
@@ -60,40 +73,28 @@ def _get_enhancer() -> SpectralMaskEnhancement:
 def _to_mono(wav: np.ndarray) -> np.ndarray:
     """
     Ensure mono [T] float32 robustly.
-
-    Accepts:
-    - [T] (mono)
-    - [T, C] (samples, channels)
-    - [C, T] (channels, samples)
-    - Any 2D shape where a dimension <= 8 is 'channels'
+    Accepts [T], [T,C], [C,T]; picks the 'channels' axis if <=8.
     """
     wav = np.asarray(wav, dtype=np.float32)
-
     if wav.ndim == 1:
         return wav
-
     if wav.ndim == 2:
-        T, U = wav.shape
-
-        # If one dimension is 1, just squeeze
-        if 1 in (T, U):
+        t, u = wav.shape
+        if 1 in (t, u):
             return wav.reshape(-1).astype(np.float32)
-
-        # Heuristic: if the last dim is small (<= 8), treat it as channels -> [T, C]
-        if U <= 8:
-            return wav.mean(axis=1).astype(np.float32)  # average across channel axis
-
-        # If the first dim is small (<= 8), treat it as channels -> [C, T]
-        if T <= 8:
+        if u <= 8:  # [T, C]
+            return wav.mean(axis=1).astype(np.float32)
+        if t <= 8:  # [C, T]
             return wav.mean(axis=0).astype(np.float32)
-
-        # Fallback: assume [T, C]
         return wav.mean(axis=1).astype(np.float32)
-
-    # Higher dims: flatten channels, keep time last if possible
+    # higher dims: fall back
     return wav.reshape(-1).astype(np.float32)
 
 
+def _sanitize(mono: np.ndarray) -> np.ndarray:
+    return np.nan_to_num(mono, nan=0.0, posinf=0.0, neginf=0.0).astype(np.float32)
+
+
 def _resample_torch(wav: torch.Tensor, sr_in: int, sr_out: int) -> torch.Tensor:
     if sr_in == sr_out:
         return wav
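
Note: for the two 2-D branches above, averaging over whichever axis holds the channels yields the same mono signal; a self-contained check of that equivalence:

import numpy as np

stereo_tc = np.random.rand(48000, 2).astype(np.float32)  # [T, C]
stereo_ct = stereo_tc.T                                   # [C, T]
mono_tc = stereo_tc.mean(axis=1)  # what the u <= 8 branch computes
mono_ct = stereo_ct.mean(axis=0)  # what the t <= 8 branch computes
assert mono_tc.shape == (48000,)
assert np.allclose(mono_tc, mono_ct)
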
@@ -107,7 +108,6 @@ def _highpass(wav: torch.Tensor, sr: int, cutoff_hz: float) -> torch.Tensor:
 
 
 def _presence_boost(wav: torch.Tensor, sr: int, gain_db: float) -> torch.Tensor:
-    """Simple presence EQ around ~4.5 kHz."""
     if abs(gain_db) < 1e-6:
         return wav
     center = 4500.0
@@ -116,74 +116,119 @@ def _presence_boost(wav: torch.Tensor, sr: int, gain_db: float) -> torch.Tensor:
 
 
 def _limit_peak(wav: torch.Tensor, target_dbfs: float = -1.0) -> torch.Tensor:
-    """Peak-normalize to target dBFS and hard-limit to [-1, 1]."""
     target_amp = 10.0 ** (target_dbfs / 20.0)
     peak = torch.max(torch.abs(wav)).item()
     if peak > 0:
-        scale = min(1.0, target_amp / peak)
-        wav = wav * scale
+        wav = wav * min(1.0, target_amp / peak)
     return torch.clamp(wav, -1.0, 1.0)
 
 
-def _sanitize(mono: np.ndarray) -> np.ndarray:
-    """Replace NaN/Inf with 0 to keep encoders happy."""
-    return np.nan_to_num(mono, nan=0.0, posinf=0.0, neginf=0.0).astype(np.float32)
+def _align_lengths(a: np.ndarray, b: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+    """Crop both to the shorter length so we can mix dry/wet safely."""
+    n = min(len(a), len(b))
+    return a[:n], b[:n]
+
+
+# -----------------------------
+# Core pipeline
+# -----------------------------
+def _run_metricgan(clean_16k_path: str) -> torch.Tensor:
+    enh = _get_metricgan()
+    with torch.no_grad():
+        out = enh.enhance_file(clean_16k_path)  # [1, T] float32 -1..1
+    return out
+
+
+def _run_sepformer(clean_16k_path: str) -> torch.Tensor:
+    sep = _get_sepformer()
+    with torch.no_grad():
+        # Some SB versions return [n_src, T]; others [1, T]
+        out = sep.separate_file(path=clean_16k_path)
+    # Normalize shape to [1, T]
+    if isinstance(out, torch.Tensor):
+        if out.dim() == 1:
+            out = out.unsqueeze(0)
+        elif out.dim() == 2 and out.shape[0] > 1:
+            out = out[:1, :]  # pick primary enhanced speech
+        return out
+    # If older API returns numpy or list, convert:
+    if hasattr(out, "numpy"):
+        t = torch.from_numpy(out)
+        if t.dim() == 1:
+            t = t.unsqueeze(0)
+        elif t.dim() == 2 and t.shape[0] > 1:
+            t = t[:1, :]
+        return t
+    if isinstance(out, (list, tuple)):
+        t = torch.tensor(out[0] if isinstance(out[0], (np.ndarray, list)) else out, dtype=torch.float32)
+        if t.dim() == 1:
+            t = t.unsqueeze(0)
+        return t
+    raise RuntimeError("Unexpected SepFormer output type")
 
 
 def _enhance_numpy_audio(
     audio: Tuple[int, np.ndarray],
-    presence_db: float = 3.0,
-    lowcut_hz: float = 75.0,
+    mode: str = "MetricGAN+ (denoise)",
+    dry_wet: float = 1.0,      # 0..1 (1 = fully processed)
+    presence_db: float = 0.0,  # default 0 for safer tone
+    lowcut_hz: float = 0.0,    # default 0 (off)
     out_sr: Optional[int] = None,
 ) -> Tuple[int, np.ndarray]:
     """
-    Core pipeline used by the Gradio UI.
     Input: (sr, np.float32 [T] or [T,C])
     Returns: (sr_out, np.float32 [T])
     """
     sr_in, wav_np = audio
-    wav_mono = _to_mono(wav_np)
+    wav_mono = _sanitize(_to_mono(wav_np))
 
-    # Guard: empty input
-    if wav_mono.size < 16:
-        # Return a short silent buffer at original SR to avoid empty files
+    # Guard: tiny input
+    if wav_mono.size < 32:
         return sr_in, np.zeros(1600 if sr_in else 1600, dtype=np.float32)
 
-    wav_t = torch.from_numpy(wav_mono).unsqueeze(0)  # [1, T]
-
-    # MetricGAN+ expects 16 kHz mono
-    enh = _get_enhancer()
-    wav_16k = _resample_torch(wav_t, sr_in, 16000)
+    dry_t = torch.from_numpy(wav_mono).unsqueeze(0)  # [1, T @ sr_in]
+    # Prepare 16k mono file for models
+    wav_16k = _resample_torch(dry_t, sr_in, 16000)
 
-    # Enhance via file path API for broad codec compatibility
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_in:
         sf.write(tmp_in.name, wav_16k.squeeze(0).numpy(), 16000, subtype="PCM_16")
         tmp_in.flush()
-        clean = enh.enhance_file(tmp_in.name)  # torch.Tensor [1, T]
-    try:
-        os.remove(tmp_in.name)
-    except Exception:
-        pass
-
-    # Optional polish: high-pass & presence EQ + peak limit
-    clean = _highpass(clean, 16000, lowcut_hz)
-    clean = _presence_boost(clean, 16000, presence_db)
-    clean = _limit_peak(clean, target_dbfs=-1.0)
+        path_16k = tmp_in.name
 
-    # Resample to requested output rate (or original)
+    try:
+        if mode.startswith("MetricGAN"):
+            proc = _run_metricgan(path_16k)  # [1, T @ 16k]
+        elif mode.startswith("SepFormer"):
+            proc = _run_sepformer(path_16k)  # [1, T @ 16k]
+        else:  # Bypass (EQ only)
+            proc = wav_16k
+    finally:
+        try:
+            os.remove(path_16k)
+        except Exception:
+            pass
+
+    # Subtle polish (applied to processed only)
+    proc = _highpass(proc, 16000, lowcut_hz)
+    proc = _presence_boost(proc, 16000, presence_db)
+    proc = _limit_peak(proc, target_dbfs=-1.0)
+
+    # Resample both to output rate for mixing & export
     sr_out = sr_in if (out_sr is None or out_sr <= 0) else int(out_sr)
-    clean_out = _resample_torch(clean, 16000, sr_out).squeeze(0).numpy().astype(np.float32)
+    proc_out = _resample_torch(proc, 16000, sr_out).squeeze(0).numpy().astype(np.float32)
+    dry_out = _resample_torch(dry_t, sr_in, sr_out).squeeze(0).numpy().astype(np.float32)
 
-    # Sanitize
-    clean_out = _sanitize(clean_out)
+    # Align and mix
+    proc_out, dry_out = _align_lengths(proc_out, dry_out)
+    dry_wet = float(np.clip(dry_wet, 0.0, 1.0))
+    mixed = dry_wet * proc_out + (1.0 - dry_wet) * dry_out  # dry*(1-dw) + proc*dw
+    mixed = _sanitize(mixed)
 
-    # Tiny-output fallback: if somehow too short, return processed original instead
-    if clean_out.size < 160:  # ~10 ms @16k
-        fallback = _sanitize(wav_16k.squeeze(0).numpy())
-        fallback = _resample_torch(torch.from_numpy(fallback).unsqueeze(0), 16000, sr_out).squeeze(0).numpy().astype(np.float32)
-        return sr_out, fallback
+    # Safety: if somehow too tiny, fall back to dry
+    if mixed.size < 160:
+        return sr_out, dry_out
 
-    return sr_out, clean_out
+    return sr_out, mixed
 
 
 # -----------------------------
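
Note: the mix line above is a plain linear crossfade: with the slider at 85%, the output is 0.85 x processed + 0.15 x dry, so a little of the untouched signal always bleeds through to mask artifacts. A self-contained check with illustrative arrays:

import numpy as np

dry = np.ones(3, dtype=np.float32)    # untouched input
proc = np.zeros(3, dtype=np.float32)  # pretend the model removed everything
dry_wet = 0.85
mixed = dry_wet * proc + (1.0 - dry_wet) * dry
assert np.allclose(mixed, 0.15)       # 15% of the dry signal remains
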
@@ -191,6 +236,8 @@ def _enhance_numpy_audio(
 # -----------------------------
 def gradio_enhance(
     audio: Tuple[int, np.ndarray],
+    mode: str,
+    dry_wet_pct: float,
     presence_db: float,
     lowcut_hz: float,
     output_sr: str,
@@ -201,25 +248,39 @@
     if output_sr in {"44100", "48000"}:
         out_sr = int(output_sr)
     sr_out, enhanced = _enhance_numpy_audio(
-        audio, presence_db=float(presence_db), lowcut_hz=float(lowcut_hz), out_sr=out_sr
+        audio,
+        mode=mode,
+        dry_wet=dry_wet_pct / 100.0,
+        presence_db=float(presence_db),
+        lowcut_hz=float(lowcut_hz),
+        out_sr=out_sr,
     )
     return (sr_out, enhanced)
 
 
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("## Voice Clarity Booster (MetricGAN+)")
+    gr.Markdown("## Voice Clarity Booster")
     with gr.Row():
         with gr.Column():
             in_audio = gr.Audio(
                 sources=["upload", "microphone"],
                 type="numpy",
-                label="Input (noisy speech)",
+                label="Input",
+            )
+            mode = gr.Radio(
+                choices=["MetricGAN+ (denoise)", "SepFormer (dereverb+denoise)", "Bypass (EQ only)"],
+                value="MetricGAN+ (denoise)",
+                label="Mode",
+            )
+            dry_wet = gr.Slider(
+                minimum=0, maximum=100, value=85, step=1,
+                label="Dry/Wet Mix (%) — lower to reduce artifacts"
             )
             presence = gr.Slider(
-                minimum=-12, maximum=12, value=3, step=0.5, label="Presence Boost (dB)"
+                minimum=-12, maximum=12, value=0, step=0.5, label="Presence Boost (dB)"
             )
             lowcut = gr.Slider(
-                minimum=0, maximum=200, value=75, step=5, label="Low-Cut (Hz)"
+                minimum=0, maximum=200, value=0, step=5, label="Low-Cut (Hz)"
             )
             out_sr = gr.Radio(
                 choices=["Original", "44100", "48000"],
@@ -230,7 +291,11 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         with gr.Column():
             out_audio = gr.Audio(type="numpy", label="Enhanced", autoplay=True)
 
-    btn.click(gradio_enhance, inputs=[in_audio, presence, lowcut, out_sr], outputs=[out_audio])
+    btn.click(
+        gradio_enhance,
+        inputs=[in_audio, mode, dry_wet, presence, lowcut, out_sr],
+        outputs=[out_audio],
+    )
 
-# IMPORTANT for Hugging Face Spaces: call launch() unguarded so the app starts.
+# Start server (Hugging Face Spaces expects this unguarded)
 demo.launch()
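
Note: the pipeline can also be driven without the UI. A hypothetical local driver (not part of the commit; it assumes app.py is importable, and since the module calls demo.launch() at import time, that line would need to be commented out first):

import numpy as np
import soundfile as sf
from app import _enhance_numpy_audio  # assumes demo.launch() is disabled

sr = 44100
t = np.arange(sr, dtype=np.float32) / sr
tone = 0.3 * np.sin(2.0 * np.pi * 220.0 * t)           # 1 s test tone
noise = 0.05 * np.random.randn(sr).astype(np.float32)  # synthetic hiss
noisy = (tone + noise).astype(np.float32)

sr_out, enhanced = _enhance_numpy_audio(
    (sr, noisy),
    mode="MetricGAN+ (denoise)",
    dry_wet=0.85,
    out_sr=48000,
)
sf.write("enhanced.wav", enhanced, sr_out)
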
 