# app.py
import os

import gradio as gr
import numpy as np
import torch
from scipy.signal import resample
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Load the model. Pass an HF token for private repos
# (the `token` argument replaced the deprecated `use_auth_token`).
hf_token = os.getenv("HF_TOKEN")
model = WhisperForConditionalGeneration.from_pretrained(
    "GiftMark/akan-whisper-model", token=hf_token
)
processor = WhisperProcessor.from_pretrained(
    "GiftMark/akan-whisper-model", token=hf_token
)
model.eval()


def transcribe(audio):
    try:
        if audio is None:
            return "No audio provided."

        sampling_rate, data = audio
        data = np.asarray(data).astype(np.float32)

        # Gradio delivers 16-bit PCM as int16; rescale to [-1, 1],
        # the range Whisper's feature extractor expects.
        if np.abs(data).max() > 1.0:
            data /= 32768.0

        # Downmix to mono by keeping the first channel.
        if data.ndim > 1:
            data = data[:, 0]

        # Whisper is trained on 16 kHz audio; resample if needed.
        target_sr = 16000
        if sampling_rate != target_sr:
            # Number of samples after resampling = duration * target rate.
            new_length = int(data.shape[0] / sampling_rate * target_sr)
            data = resample(data, new_length)
            sampling_rate = target_sr

        # Convert the waveform to log-mel input features.
        inputs = processor(
            data, sampling_rate=sampling_rate, return_tensors="pt"
        ).input_features

        with torch.no_grad():
            predicted_ids = model.generate(inputs)

        transcription = processor.batch_decode(
            predicted_ids, skip_special_tokens=True
        )[0]
        return transcription
    except Exception as e:
        print("Error during transcription:", e)
        return f"Error: {e}"


demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(
        sources=["microphone", "upload"],
        type="numpy",
        label="Record or upload Akan audio",
    ),
    outputs=gr.Textbox(label="Transcription"),
    title="Akan Speech-to-Text Demo",
    description="Record or upload Akan audio to test the Whisper ASR model.",
)

demo.launch()
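
# Running it locally (a sketch; the exact dependency versions are an
# assumption, and HF_TOKEN is only required if the model repo is private):
#
#   pip install gradio transformers torch scipy numpy
#   HF_TOKEN=<your token> python app.py        # macOS/Linux shell syntax
#
# Gradio then prints a local URL (http://127.0.0.1:7860 by default).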