# app.py
import os

import gradio as gr
import numpy as np
import torch
from scipy.signal import resample
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Load the model. Pass an HF token for private repos
# (the `token` argument replaced the deprecated `use_auth_token`).
hf_token = os.getenv("HF_TOKEN")
model = WhisperForConditionalGeneration.from_pretrained(
    "GiftMark/akan-whisper-model", token=hf_token
)
processor = WhisperProcessor.from_pretrained(
    "GiftMark/akan-whisper-model", token=hf_token
)
model.eval()


def transcribe(audio):
    try:
        if audio is None:
            return "No audio provided."

        sampling_rate, data = audio
        data = np.asarray(data).astype(np.float32)

        # Gradio delivers 16-bit PCM as int16; rescale to [-1, 1],
        # the range Whisper's feature extractor expects.
        if np.abs(data).max() > 1.0:
            data /= 32768.0

        # Downmix to mono by keeping the first channel.
        if data.ndim > 1:
            data = data[:, 0]

        # Whisper is trained on 16 kHz audio; resample if needed.
        target_sr = 16000
        if sampling_rate != target_sr:
            # Number of samples after resampling = duration * target rate.
            new_length = int(data.shape[0] / sampling_rate * target_sr)
            data = resample(data, new_length)
            sampling_rate = target_sr

        # Convert the waveform to log-mel input features.
        inputs = processor(
            data, sampling_rate=sampling_rate, return_tensors="pt"
        ).input_features

        with torch.no_grad():
            predicted_ids = model.generate(inputs)

        transcription = processor.batch_decode(
            predicted_ids, skip_special_tokens=True
        )[0]
        return transcription
    except Exception as e:
        print("Error during transcription:", e)
        return f"Error: {e}"


demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(
        sources=["microphone", "upload"],
        type="numpy",
        label="Record or upload Akan audio",
    ),
    outputs=gr.Textbox(label="Transcription"),
    title="Akan Speech-to-Text Demo",
    description="Record or upload Akan audio to test the Whisper ASR model.",
)

demo.launch()
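
# Running it locally (a sketch; the exact dependency versions are an
# assumption, and HF_TOKEN is only required if the model repo is private):
#
#   pip install gradio transformers torch scipy numpy
#   HF_TOKEN=<your token> python app.py        # macOS/Linux shell syntax
#
# Gradio then prints a local URL (http://127.0.0.1:7860 by default).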