# app.py
import os

import gradio as gr
import numpy as np
import torch
from scipy.signal import resample
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Load the model and processor (pass an access token for private models)
hf_token = os.getenv("HF_TOKEN")
model = WhisperForConditionalGeneration.from_pretrained("GiftMark/akan-whisper-model", token=hf_token)
processor = WhisperProcessor.from_pretrained("GiftMark/akan-whisper-model", token=hf_token)
def transcribe(audio):
    try:
        if audio is None:
            return "No audio provided."
        sampling_rate, data = audio
        data = np.array(data).astype(np.float32)
        # Ensure mono
        if len(data.shape) > 1:
            data = data[:, 0]
        # Resample to 16 kHz if needed (Whisper expects 16 kHz input)
        target_sr = 16000
        if sampling_rate != target_sr:
            # Number of samples after resampling
            duration = data.shape[0] / sampling_rate
            new_length = int(duration * target_sr)
            data = resample(data, new_length)
            sampling_rate = target_sr
        inputs = processor(
            data, sampling_rate=sampling_rate, return_tensors="pt"
        ).input_features
        with torch.no_grad():
            predicted_ids = model.generate(inputs)
        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
        return transcription
    except Exception as e:
        print("Error during transcription:", e)
        return f"Error: {e}"
demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources=["microphone", "upload"], type="numpy", label="Record or upload Akan audio"),
    outputs=gr.Textbox(label="Transcription"),
    title="Akan Speech-to-Text Demo",
    description="Record or upload Akan audio to test the Whisper ASR model.",
)

demo.launch()
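
A note on the resampling step: scipy.signal.resample is FFT-based, which is fine for a demo but assumes a roughly periodic signal. For speech, a polyphase resampler is often preferred. Below is a minimal sketch comparing the call used in transcribe() with scipy.signal.resample_poly for the same 44.1 kHz to 16 kHz conversion; the random buffer is only a placeholder for real audio.

import numpy as np
from scipy.signal import resample, resample_poly

sr, target_sr = 44100, 16000
data = np.random.randn(sr * 2).astype(np.float32)  # 2 s of placeholder noise, not real speech

# FFT-based resampling, as done inside transcribe()
fft_resampled = resample(data, int(data.shape[0] / sr * target_sr))

# Polyphase resampling: up/down factors of 160/441 map 44100 Hz to exactly 16000 Hz
poly_resampled = resample_poly(data, 160, 441)

print(fft_resampled.shape, poly_resampled.shape)  # both (32000,)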
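
To run this as a Hugging Face Space, the dependencies need to be listed in a requirements.txt next to app.py, and HF_TOKEN (read via os.getenv above) should be added as a Space secret so it is exposed as an environment variable. Based only on the imports above, a minimal, unpinned requirements.txt sketch would be:

gradio
transformers
torch
numpy
scipy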