"""Gradio app for Moroccan Darija speech-to-text transcription.

Loads a speech-to-text model from the Hugging Face Hub, transcribes recorded or
uploaded audio, and optionally appends the result to an evaluation dataset on
the Hub.
"""

import base64
import os
from datetime import datetime

import gradio as gr
import librosa
import numpy as np
import spaces
import torch
from datasets import (
    Audio,
    Dataset,
    DatasetDict,
    Features,
    Value,
    concatenate_datasets,
    load_dataset,
)
from transformers import (
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    pipeline,
)

# Hugging Face dataset that collects user-contributed evaluation samples
HF_DATASET_NAME = "BounharAbdelaziz/Moroccan-STT-Eval-Dataset"

# Available Moroccan Darija STT checkpoints on the Hugging Face Hub
MODEL_PATHS = {
    "MEDIUM": "BounharAbdelaziz/Morocco-Darija-STT-medium",
    "LARGE": "BounharAbdelaziz/Morocco-Darija-STT-large",
}

# Access tokens for the model and the evaluation dataset
# (both currently read the same TOKEN environment variable)
STT_MODEL_TOKEN = os.environ.get("TOKEN")
STT_EVAL_DATASET_TOKEN = os.environ.get("TOKEN")


def encode_image_to_base64(image_path):
    """Read an image file and return its base64-encoded contents as a string."""
    with open(image_path, "rb") as image_file:
        encoded_string = base64.b64encode(image_file.read()).decode()
    return encoded_string


def create_html_image(image_path):
    """Return an HTML snippet that displays the image, embedded as a base64 data URI."""
    img_base64 = encode_image_to_base64(image_path)
    html_string = f"""
    <div style="display: flex; justify-content: center; align-items: center; width: 100%; text-align: center;">
        <div style="max-width: 800px; margin: auto;">
            <img src="data:image/jpeg;base64,{img_base64}"
                 style="max-width: 75%; height: auto; display: block; margin: 0 auto; margin-top: 50px;"
                 alt="Displayed Image">
        </div>
    </div>
    """
    return html_string


def save_to_hf_dataset(audio_signal, model_choice, transcription):
    """Append a transcribed sample to the evaluation dataset and push it to the Hub."""
    print("[INFO] Loading dataset...")
    dataset = load_dataset(HF_DATASET_NAME, token=STT_EVAL_DATASET_TOKEN)
    print("[INFO] Dataset loaded successfully.")

    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    new_entry = {
        "audio": [{"array": audio_signal, "sampling_rate": 16000}],
        "transcription": [transcription],
        "model_used": [model_choice],
        "timestamp": [timestamp],
    }

    new_dataset = Dataset.from_dict(
        new_entry,
        features=Features({
            "audio": Audio(sampling_rate=16000),
            "transcription": Value("string"),
            "model_used": Value("string"),
            "timestamp": Value("string"),
        }),
    )

    print("[INFO] Adding the new entry to the dataset...")
    train_dataset = dataset["train"]
    updated_train_dataset = concatenate_datasets([train_dataset, new_dataset])
    dataset["train"] = updated_train_dataset

    print("[INFO] Pushing the updated dataset...")
    dataset.push_to_hub(HF_DATASET_NAME, token=STT_EVAL_DATASET_TOKEN)
    print("[INFO] Dataset updated and pushed successfully.")


def load_model(model_name):
    """Load the selected STT checkpoint and wrap it in an ASR pipeline."""
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    model_id = MODEL_PATHS[model_name.upper()]

    print("[INFO] Loading processor and model...")
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id,
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
        use_safetensors=True,
        token=STT_MODEL_TOKEN,
    )
    model.to(device)

    processor = AutoProcessor.from_pretrained(model_id, token=STT_MODEL_TOKEN)

    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        max_new_tokens=128,
        chunk_length_s=30,
        return_timestamps=False,
        torch_dtype=torch_dtype,
        device=device,
        generate_kwargs={"task": "transcribe"},
    )

    return pipe


@spaces.GPU
def process_audio(audio, model_choice, save_data):
    """Transcribe a Gradio audio input and optionally log it to the eval dataset."""
    pipe = load_model(model_choice)

    # Gradio's "numpy" audio type yields a (sample_rate, data) tuple
    sample_rate = audio[0]
    audio_signal = audio[1].astype(np.float32)

    # Normalize 16-bit integer PCM to the [-1, 1] float range
    if np.abs(audio_signal).max() > 1.0:
        audio_signal = audio_signal / 32768.0

    # The models expect 16 kHz audio; resample if needed
    if sample_rate != 16000:
        print(f"[INFO] Resampling audio from {sample_rate}Hz to 16000Hz")
        audio_signal = librosa.resample(
            y=audio_signal,
            orig_sr=sample_rate,
            target_sr=16000,
        )

    result = pipe(audio_signal)
    transcription = result["text"]

    if save_data:
        print("[INFO] Saving data to eval dataset...")
        save_to_hf_dataset(audio_signal, model_choice, transcription)

    return transcription


def create_interface():
    """Build the Gradio Blocks UI for the transcription app."""
    with gr.Blocks(css="footer{display:none !important}") as app:
        base_path = os.path.dirname(__file__)
        local_image_path = os.path.join(base_path, 'logo_image.png')
        gr.HTML(create_html_image(local_image_path))

        gr.Markdown("# 🇲🇦 Morocco Fast Speech-to-Text Transcription")

        gr.Markdown("⚠️ **Nota bene**: make sure to click **Stop** before hitting the **Transcribe** button.")
        gr.Markdown("The **Large** model is now available! 🔥")

        with gr.Row():
            model_choice = gr.Dropdown(
                choices=["Medium", "Large"],
                value="Large",
                label="Select one of the models",
            )

        with gr.Row():
            audio_input = gr.Audio(
                sources=["microphone", "upload"],
                type="numpy",
                label="Record Audio",
            )

        with gr.Row():
            save_data = gr.Checkbox(
                label="Contribute to the evaluation benchmark",
                value=True,
            )

        submit_btn = gr.Button("Transcribe 🔥")
        output_text = gr.Textbox(label="Transcription", text_align="right")

        gr.Markdown("""
        ### Notice to our dearest users 🤗 (coming soon)
        - By transcribing your audio, you're actively contributing to the development of a benchmark evaluation dataset for Moroccan speech-to-text models.
        - Your transcriptions will be logged into a dedicated Hugging Face dataset, playing a crucial role in advancing research and innovation in speech recognition for Moroccan dialects and languages.
        - Together, we're building tools that better understand and serve the unique linguistic landscape of Morocco.
        - We count on your **thoughtfulness and responsibility** when using the app. Thank you for your contribution! 🙏
        """)

        submit_btn.click(
            fn=process_audio,
            inputs=[audio_input, model_choice, save_data],
            outputs=output_text,
        )

        gr.Markdown("<br/>")

    return app
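

# NOTE: the launch block below is an assumption, not part of the original snippet;
# it sketches the usual entry point for running the app (locally or on a Space).
if __name__ == "__main__":
    app = create_interface()
    app.launch()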