Cyber-Blacat committed on
Commit 3d78f04 · verified · 1 Parent(s): 9179741

Update app.py

Files changed (1):
  1. app.py +101 -67
app.py CHANGED
@@ -3,6 +3,7 @@ from transformers import AutoModelForCTC, AutoFeatureExtractor, AutoTokenizer
 import torch
 import numpy as np
 import warnings
+import librosa
 
 warnings.filterwarnings("ignore")
 
@@ -12,6 +13,27 @@ model = None
 feature_extractor = None
 tokenizer = None
 
+def normalize_audio(audio):
+    """RMS normalization."""
+    rms = np.sqrt(np.mean(audio ** 2))
+    if rms > 0:
+        audio = audio / rms
+    audio = np.clip(audio, -1.0, 1.0)
+    return audio
+
+def remove_silence(audio, sample_rate, threshold=0.01):
+    """Trim leading and trailing silence."""
+    energy = np.abs(audio)
+    above_threshold = energy > threshold
+    if not np.any(above_threshold):
+        return audio
+    start = np.where(above_threshold)[0][0]
+    end = np.where(above_threshold)[0][-1]
+    buffer = int(0.1 * sample_rate)
+    start = max(0, start - buffer)
+    end = min(len(audio), end + buffer)
+    return audio[start:end]
+
 def load_model_with_token(hf_token):
     global model, feature_extractor, tokenizer
 
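The two preprocessing helpers this hunk introduces are easy to sanity-check in isolation. A minimal sketch, with the helper bodies condensed slightly from the diff and a synthetic test tone that is purely illustrative:

```python
import numpy as np

# Helper bodies condensed from this hunk.
def normalize_audio(audio):
    rms = np.sqrt(np.mean(audio ** 2))
    if rms > 0:
        audio = audio / rms
    return np.clip(audio, -1.0, 1.0)

def remove_silence(audio, sample_rate, threshold=0.01):
    above = np.abs(audio) > threshold
    if not np.any(above):
        return audio
    idx = np.where(above)[0]
    pad = int(0.1 * sample_rate)  # keep a 100 ms buffer on each side
    return audio[max(0, idx[0] - pad):min(len(audio), idx[-1] + pad)]

# 1 s silence + 1 s quiet 440 Hz tone + 1 s silence, at 16 kHz.
sr = 16000
t = np.linspace(0, 1, sr, endpoint=False)
tone = (0.05 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)
clip = np.concatenate([np.zeros(sr, np.float32), tone, np.zeros(sr, np.float32)])

trimmed = remove_silence(normalize_audio(clip), sr)
print(len(clip) / sr, "->", len(trimmed) / sr)  # 3.0 -> ~1.2 s
```

One property worth noting: after scaling to unit RMS, peaks routinely exceed 1.0 (a unit-RMS sine already peaks at about 1.41), so the clip step flattens the loudest samples.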
@@ -21,6 +43,7 @@ def load_model_with_token(hf_token):
     try:
         device = "cuda" if torch.cuda.is_available() else "cpu"
         print("🔄 Loading model components...")
+        print(f"📱 Device: {device}")
 
         feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_ID, token=hf_token.strip())
         tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=hf_token.strip())
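For reference, the load flow here follows the standard transformers pattern for gated checkpoints, where token= authenticates against the gated repo. A condensed sketch; the load_components wrapper, the .to(device) move, and the .eval() call are assumptions, and MODEL_ID's actual value is defined elsewhere in app.py:

```python
import torch
from transformers import AutoModelForCTC, AutoFeatureExtractor, AutoTokenizer

def load_components(model_id: str, hf_token: str):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    token = hf_token.strip()  # pasted tokens often carry stray whitespace
    feature_extractor = AutoFeatureExtractor.from_pretrained(model_id, token=token)
    tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
    model = AutoModelForCTC.from_pretrained(model_id, token=token).to(device)
    model.eval()  # inference only
    return feature_extractor, tokenizer, model
```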
@@ -29,22 +52,15 @@ def load_model_with_token(hf_token):
 
         print(f"✅ Loaded: {type(feature_extractor)}, {type(tokenizer)}, {type(model)}")
 
-        # On success: the button reads "✅ Loaded" and the transcribe button is enabled
         return gr.update(interactive=False, value="✅ Model Loaded Successfully!"), gr.update(interactive=True)
     except Exception as e:
         print(f"Error loading model: {e}")
         import traceback
         traceback.print_exc()
-
-        # On failure: the button shows the error message and the transcribe button stays disabled
         return gr.update(interactive=True, value=f"❌ Error: {str(e)}"), gr.update(interactive=False)
 
 
 def transcribe_audio(audio_input):
-    """
-    Note: audio_input is no longer a file path; it is the numpy audio Gradio passes
-    directly, as a tuple: (sample_rate: int, waveform: np.ndarray)
-    """
     global model, feature_extractor, tokenizer
 
     if audio_input is None:
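The two-tuple of gr.update(...) return values is what drives the button state machine: each element maps positionally onto one component in the click handler's outputs= list. A minimal sketch of the same pattern (load_stub and its strings are hypothetical):

```python
import gradio as gr

def load_stub(token):
    if token.strip():
        # Success: freeze the load button, unlock the action button.
        return gr.update(interactive=False, value="✅ Loaded"), gr.update(interactive=True)
    # Failure: keep the load button usable, leave the action button disabled.
    return gr.update(interactive=True, value="❌ Error: empty token"), gr.update(interactive=False)

with gr.Blocks() as demo:
    token_box = gr.Textbox(type="password")
    load_btn = gr.Button("Load")
    action_btn = gr.Button("Run", interactive=False)
    # First returned update -> load_btn, second -> action_btn.
    load_btn.click(fn=load_stub, inputs=[token_box], outputs=[load_btn, action_btn])
```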
@@ -54,27 +70,38 @@ def transcribe_audio(audio_input):
         return "❌ Please load the model first!"
 
     try:
-        # 1. Unpack the audio Gradio passes in: sample rate + waveform
+        # 1. Unpack the audio
         sample_rate, waveform = audio_input
 
-        # If multi-channel (stereo), keep only the first channel
+        # 2. Convert to mono
         if waveform.ndim == 2:
-            waveform = waveform[:, 0]  # keep a single channel
+            waveform = waveform[:, 0]
 
-        # 2. Convert to float32, normalized to [-1, 1] (if not already)
-        # Gradio defaults to int16 in [-32768, 32767]; dividing by 32768 gives float32
+        # 3. Convert to float32 and normalize
         if waveform.dtype == np.int16:
            waveform = waveform.astype(np.float32) / 32768.0
         elif waveform.dtype != np.float32:
             waveform = waveform.astype(np.float32)
 
-        # 3. If the sample rate is not 16 kHz, resample with librosa (optional)
+        # 4. RMS normalization
+        waveform = normalize_audio(waveform)
+
+        # 5. Remove silence
+        waveform = remove_silence(waveform, sample_rate)
+
+        # 6. Check the length
+        duration = len(waveform) / sample_rate
+        if duration < 0.1:
+            return "⚠️ Audio is too short."
+        if duration > 60:
+            return "⚠️ Audio is too long."
+
+        # 7. Resample
         if sample_rate != 16000:
-            import librosa
             waveform = librosa.resample(waveform, orig_sr=sample_rate, target_sr=16000)
             sample_rate = 16000
 
-        # 4. Process the audio with the feature extractor
+        # 8. Feature extraction
         inputs = feature_extractor(
             waveform,
             sampling_rate=sample_rate,
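The dtype and resampling steps are worth a worked example: Gradio's numpy audio arrives as (sample_rate, int16 array), dividing by 32768 maps it into [-1, 1), and librosa then resamples to the model's 16 kHz. A self-contained sketch, where the 44.1 kHz random buffer is purely illustrative:

```python
import numpy as np
import librosa

# Fake a Gradio microphone payload: 1 s of int16 samples at 44.1 kHz.
sr = 44100
pcm = (np.random.randn(sr) * 3000).astype(np.int16)

wav = pcm.astype(np.float32) / 32768.0                       # int16 -> float32 in [-1, 1)
wav16k = librosa.resample(wav, orig_sr=sr, target_sr=16000)  # 44100 samples -> 16000

print(wav16k.dtype, wav16k.shape)  # float32 (16000,)
```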
@@ -82,19 +109,45 @@ def transcribe_audio(audio_input):
         )
         inputs = {k: v.to(model.device) for k, v in inputs.items()}
 
-        # 5. Model generation; decode with beam search.
+        # [Fix]
+        # Auto-detect the key that holds the feature data
+        # (it may be 'input_features' or 'input_values');
+        # skip 'attention_mask' and find the actual input tensor
+        input_tensor = None
+        for key, val in inputs.items():
+            if isinstance(val, torch.Tensor) and val.ndim > 1:
+                input_tensor = val
+                break
+
+        if input_tensor is None:
+            return "❌ Error: Could not extract audio features."
+
+        # Read the stride safely; default to 4 if the attribute is missing
+        stride = 4
+        if hasattr(feature_extractor, 'stride'):
+            s = feature_extractor.stride
+            stride = s[0] if isinstance(s, (list, tuple)) else s
+
+        # Compute max_length dynamically
+        max_length = input_tensor.shape[1] // stride + 50
+
+        # 9. Beam-search decoding
         with torch.no_grad():
             outputs = model.generate(
                 **inputs,
-                max_length=inputs["input_values"].shape[1] // feature_extractor.stride[0] + 50,
-                num_beams=8,  # beam search
+                max_length=max_length,
+                num_beams=8,  # beam search improves accuracy
                 temperature=1.0,
             )
 
-        # 6. Decode
+        # 10. Decode
         transcription = tokenizer.batch_decode(outputs.tolist(), skip_special_tokens=True)[0]
 
-        return transcription
+        # 11. Post-process
+        transcription = transcription.strip()
+        import re
+        transcription = re.sub(r'\s+', ' ', transcription)
+
+        return transcription if transcription else "⚠️ No speech detected."
 
     except Exception as e:
         import traceback
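The core of the fix: the old code hard-coded inputs["input_values"], which raises KeyError when the extractor emits input_features instead, and read feature_extractor.stride[0] without checking the attribute exists. The max_length arithmetic itself is simple; a worked sketch, with an assumed 1200-frame input and illustrative key names and shapes:

```python
import torch

# Assumed extractor output; key names and shapes are illustrative.
inputs = {
    "input_features": torch.zeros(1, 1200, 80),              # (batch, frames, mel bins)
    "attention_mask": torch.ones(1, 1200, dtype=torch.long),
}

# Same detection idea as the diff: first tensor with more than one dim wins.
input_tensor = next(v for v in inputs.values()
                    if isinstance(v, torch.Tensor) and v.ndim > 1)

stride = 4  # fallback used when the extractor exposes no .stride
max_length = input_tensor.shape[1] // stride + 50
print(max_length)  # 1200 // 4 + 50 = 350 tokens
```

Note that a 2-D attention_mask would also pass the ndim > 1 test, so the detection loop implicitly relies on dict insertion order putting the feature key first.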
@@ -104,74 +157,55 @@ def transcribe_audio(audio_input):
 
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("# 🏥 MedASR - Medical Speech Recognition")
-    gr.Markdown("AI-powered medical dictation system. Upload or record audio to transcribe.")
+    gr.Markdown("Optimized for medical dictation with Beam Search decoding.")
 
     gr.Markdown("---")
 
     with gr.Row():
         with gr.Column(scale=1):
             gr.Markdown("## 🔑 Authentication")
-            hf_token = gr.Textbox(
-                label="HuggingFace Token",
-                type="password",
-                placeholder="hf_...",
-                info="Required to access the gated MedASR model"
-            )
+            hf_token = gr.Textbox(label="HuggingFace Token", type="password", placeholder="hf_...")
+            load_model_btn = gr.Button("🔑 Load Model", variant="primary", size="lg")
 
-            # Use the button itself to show status; no separate status textbox needed
-            load_model_btn = gr.Button(
-                "🔑 Load Model",
-                variant="primary",
-                size="lg"
-            )
-
-        with gr.Column(scale=2):
             gr.Markdown("## 📝 Tips")
             gr.Markdown("""
-            - **Speak clearly** in English
-            - **Short phrases** work best (3-10 seconds)
-            - **Quiet environment** improves accuracy
-            - Try medical terms: *patient, diagnosis, treatment, medication*
+            - Speak **clearly and slowly**
+            - Use **medical terms**
+            - Short audio (3-10s) is best
+            - Quiet environment
             """)
-
-    gr.Markdown("---")
 
-    with gr.Row():
-        with gr.Column():
-            gr.Markdown("## 🎙️ Audio Input")
-            audio_input = gr.Audio(
-                sources=["microphone", "upload"],
-                type="numpy",
-                label="Record or upload audio"
-            )
-            transcribe_btn = gr.Button(
-                "🔄 Transcribe",
-                variant="secondary",
-                size="lg",
-                interactive=False
-            )
+        with gr.Column(scale=2):
+            gr.Markdown("## 🎙️ Input & Result")
+            audio_input = gr.Audio(sources=["microphone", "upload"], type="numpy")
 
-        with gr.Column():
-            gr.Markdown("## 📄 Transcription Result")
-            output_text = gr.Textbox(
-                label="",
-                lines=12,
-                placeholder="Transcription will appear here...",
-                show_label=False
-            )
+            with gr.Row():
+                transcribe_btn = gr.Button("🔄 Transcribe", variant="secondary", size="lg", interactive=False)
+                clear_btn = gr.Button("🗑️ Clear", variant="stop", size="lg")
+
+            output_text = gr.Textbox(label="Result", lines=12, placeholder="...")
+            audio_info = gr.Textbox(label="Info", lines=2, interactive=False)
+
+            def transcribe_wrapper(audio_in):
+                res = transcribe_audio(audio_in)
+                info = "Status: Success" if "❌" not in res and "⚠️" not in res else "Status: Check result"
+                return res, info
 
-    # Event bindings
     load_model_btn.click(
         fn=load_model_with_token,
         inputs=[hf_token],
-        # The first return value updates the load button; the second toggles the transcribe button
         outputs=[load_model_btn, transcribe_btn]
     )
 
     transcribe_btn.click(
-        fn=transcribe_audio,
+        fn=transcribe_wrapper,
         inputs=[audio_input],
-        outputs=[output_text]
+        outputs=[output_text, audio_info]
+    )
+
+    clear_btn.click(
+        fn=lambda: ("", "Ready"),
+        outputs=[output_text, audio_info]
     )
 
 if __name__ == "__main__":
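The wiring at the end follows the standard Blocks event pattern: transcribe_wrapper fans one result out to two components, and the clear button resets both via a lambda that takes no inputs. A freestanding sketch of the same idea (run_wrapper and its strings are hypothetical):

```python
import gradio as gr

def run_wrapper(text):
    # Stand-in for transcribe_audio: one result, fanned out to two boxes.
    res = text.upper() if text else "⚠️ No input."
    info = "Status: Success" if "⚠️" not in res else "Status: Check result"
    return res, info

with gr.Blocks() as demo:
    inp = gr.Textbox(label="Input")
    with gr.Row():
        run_btn = gr.Button("Run")
        clear_btn = gr.Button("Clear", variant="stop")
    out = gr.Textbox(label="Result")
    info = gr.Textbox(label="Info", interactive=False)

    run_btn.click(fn=run_wrapper, inputs=[inp], outputs=[out, info])
    # No inputs= needed: the lambda takes no arguments and returns one value per output.
    clear_btn.click(fn=lambda: ("", "Ready"), outputs=[out, info])

if __name__ == "__main__":
    demo.launch()
```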
 