Spaces:

Emmanuel08
/

CCI_Realtime_Transcribing_model

Sleeping

App Files Files Community

Emmanuel08 committed on Mar 5

Commit

f33e6ad

verified ·

1 Parent(s): 0c444f9

Update app.py

Browse files

Files changed (1) hide show

app.py +31 -28

app.py CHANGED Viewed

@@ -1,37 +1,42 @@
 import torch
-import torchaudio  # ✅ Added torchaudio for resampling
 import gradio as gr
-import time
 import numpy as np
 import scipy.io.wavfile
-from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
-# ✅ 1️⃣ Force Model to Run on CPU
 device = "cpu"
 torch_dtype = torch.float32  # Use CPU-friendly float type
-MODEL_NAME = "openai/whisper-tiny"  # ✅ Switched to smallest model for fastest performance
-# ✅ 2️⃣ Load Whisper Tiny Model on CPU
 model = AutoModelForSpeechSeq2Seq.from_pretrained(
-    MODEL_NAME, torch_dtype=torch_dtype, use_safetensors=True
 )
 model.to(device)
-# ✅ 3️⃣ Load Processor & Pipeline
 processor = AutoProcessor.from_pretrained(MODEL_NAME)
-processor.feature_extractor.sampling_rate = 16000  # ✅ Set correct sampling rate here
 pipe = pipeline(
     task="automatic-speech-recognition",
     model=model,
     tokenizer=processor.tokenizer,
     feature_extractor=processor.feature_extractor,
-    chunk_length_s=2,  # ✅ Process in 2-second chunks for ultra-low latency
     torch_dtype=torch_dtype,
     device=device,
 )
-# ✅ 4️⃣ Real-Time Streaming Transcription (Microphone)
 def stream_transcribe(stream, new_chunk):
     start_time = time.time()
     try:
@@ -44,18 +49,17 @@ def stream_transcribe(stream, new_chunk):
         y = y.astype(np.float32)
         y /= np.max(np.abs(y))
-        # ✅ Resample audio to 16kHz using torchaudio
         y_tensor = torch.tensor(y)
-        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
-        y_resampled = resampler(y_tensor).numpy()
         # ✅ Append to Stream
         if stream is not None:
             stream = np.concatenate([stream, y_resampled])
         else:
             stream = y_resampled
-        # ✅ Run Transcription
         transcription = pipe({"sampling_rate": 16000, "raw": stream})["text"]
         latency = time.time() - start_time
@@ -65,17 +69,16 @@ def stream_transcribe(stream, new_chunk):
         print(f"Error: {e}")
         return stream, str(e), "Error"
-# ✅ 5️⃣ Transcription for File Upload
 def transcribe(inputs, previous_transcription):
     start_time = time.time()
     try:
         # ✅ Convert file input to correct format
         sample_rate, audio_data = inputs
-        # ✅ Resample to 16kHz using torchaudio
         audio_tensor = torch.tensor(audio_data)
-        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
-        resampled_audio = resampler(audio_tensor).numpy()
         transcription = pipe({"sampling_rate": 16000, "raw": resampled_audio})["text"]
@@ -88,14 +91,14 @@ def transcribe(inputs, previous_transcription):
         print(f"Error: {e}")
         return previous_transcription, "Error"
-# ✅ 6️⃣ Clear Function
 def clear():
     """Return an empty string; the clear button uses it to blank the output."""
     blank_text = ""
     return blank_text
-# ✅ 7️⃣ Gradio Interface (Microphone Streaming)
 with gr.Blocks() as microphone:
-    gr.Markdown(f"# Whisper Tiny - Real-Time Transcription (CPU) 🎙️")
-    gr.Markdown(f"Using [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) for ultra-fast speech-to-text.")
     with gr.Row():
         input_audio_microphone = gr.Audio(sources=["microphone"], type="numpy", streaming=True)
@@ -112,10 +115,10 @@ with gr.Blocks() as microphone:
     )
     clear_button.click(clear, outputs=[output])
-# ✅ 8️⃣ Gradio Interface (File Upload)
 with gr.Blocks() as file:
     gr.Markdown(f"# Upload Audio File for Transcription 🎵")
-    gr.Markdown(f"Using [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) for speech-to-text.")
     with gr.Row():
         input_audio = gr.Audio(sources=["upload"], type="numpy")
@@ -129,10 +132,10 @@ with gr.Blocks() as file:
     submit_button.click(transcribe, [input_audio, output], [output, latency_textbox])
     clear_button.click(clear, outputs=[output])
-# ✅ 9️⃣ Final Gradio App (Supports Microphone & File Upload)
 with gr.Blocks(theme=gr.themes.Ocean()) as demo:
     gr.TabbedInterface([microphone, file], ["Microphone", "Upload Audio"])
-# ✅ 1️⃣0️⃣ Run Gradio Locally
 if __name__ == "__main__":
     demo.launch()

 import torch
+import torchaudio
 import gradio as gr
+import time
 import numpy as np
 import scipy.io.wavfile
+from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, BitsAndBytesConfig
+# ✅ 1️⃣ Optimize Model Selection
 device = "cpu"
 torch_dtype = torch.float32  # Use CPU-friendly float type
+MODEL_NAME = "openai/whisper-small"  # ✅ Switched to "small" for better accuracy
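+# ✅ 2️⃣ Enable Quantization (Reduces Memory Usage, Speeds Up Inference) — NOTE(review): bitsandbytes 8-bit loading is normally GPU-backed, while this script pins device = "cpu" and calls model.to(device); confirm this combination actually loads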
+quantization_config = BitsAndBytesConfig(load_in_8bit=True)
+# ✅ 3️⃣ Load Whisper Model on CPU with Optimized Settings
 model = AutoModelForSpeechSeq2Seq.from_pretrained(
+    MODEL_NAME, quantization_config=quantization_config, torch_dtype=torch_dtype, use_safetensors=True
 )
 model.to(device)
+# ✅ 4️⃣ Load Processor & Set Default Sampling Rate
 processor = AutoProcessor.from_pretrained(MODEL_NAME)
+processor.feature_extractor.sampling_rate = 16000  # ✅ Set correct sampling rate
+# ✅ 5️⃣ Optimized Pipeline with Beam Search for Better Accuracy
 pipe = pipeline(
     task="automatic-speech-recognition",
     model=model,
     tokenizer=processor.tokenizer,
     feature_extractor=processor.feature_extractor,
+    chunk_length_s=5,  # ✅ Increase chunk size for better performance
     torch_dtype=torch_dtype,
     device=device,
+    generate_kwargs={"num_beams": 5, "language": "en"},  # ✅ Beam search for better accuracy
 )
+# ✅ 6️⃣ Real-Time Streaming Transcription (Microphone)
 def stream_transcribe(stream, new_chunk):
     start_time = time.time()
     try:
         y = y.astype(np.float32)
         y /= np.max(np.abs(y))
+        # ✅ Resample audio to 16kHz using optimized torchaudio method
         y_tensor = torch.tensor(y)
+        y_resampled = torchaudio.functional.resample(y_tensor, orig_freq=sr, new_freq=16000).numpy()
         # ✅ Append to Stream
         if stream is not None:
             stream = np.concatenate([stream, y_resampled])
         else:
             stream = y_resampled
+        # ✅ Run Transcription with Optimized Parameters
         transcription = pipe({"sampling_rate": 16000, "raw": stream})["text"]
         latency = time.time() - start_time
         print(f"Error: {e}")
         return stream, str(e), "Error"
+# ✅ 7️⃣ Transcription for File Upload
 def transcribe(inputs, previous_transcription):
     start_time = time.time()
     try:
         # ✅ Convert file input to correct format
         sample_rate, audio_data = inputs
+        # ✅ Resample using torchaudio (optimized)
         audio_tensor = torch.tensor(audio_data)
+        resampled_audio = torchaudio.functional.resample(audio_tensor, orig_freq=sample_rate, new_freq=16000).numpy()
         transcription = pipe({"sampling_rate": 16000, "raw": resampled_audio})["text"]
         print(f"Error: {e}")
         return previous_transcription, "Error"
+# ✅ 8️⃣ Clear Function
 def clear():
     """Return an empty string; the clear button uses it to blank the output."""
     blank_text = ""
     return blank_text
+# ✅ 9️⃣ Gradio Interface (Microphone Streaming)
 with gr.Blocks() as microphone:
+    gr.Markdown(f"# Whisper Small - Real-Time Transcription (Optimized CPU) 🎙️")
+    gr.Markdown(f"Using [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) for ultra-fast speech-to-text with better accuracy.")
     with gr.Row():
         input_audio_microphone = gr.Audio(sources=["microphone"], type="numpy", streaming=True)
     )
     clear_button.click(clear, outputs=[output])
+# ✅ 🔟 Gradio Interface (File Upload)
 with gr.Blocks() as file:
     gr.Markdown(f"# Upload Audio File for Transcription 🎵")
+    gr.Markdown(f"Using [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) for better transcription accuracy.")
     with gr.Row():
         input_audio = gr.Audio(sources=["upload"], type="numpy")
     submit_button.click(transcribe, [input_audio, output], [output, latency_textbox])
     clear_button.click(clear, outputs=[output])
+# ✅ 1️⃣1️⃣ Final Gradio App
 with gr.Blocks(theme=gr.themes.Ocean()) as demo:
     gr.TabbedInterface([microphone, file], ["Microphone", "Upload Audio"])
+# ✅ 1️⃣2️⃣ Run Gradio Locally
 if __name__ == "__main__":
     demo.launch()