"""Gradio app for Moroccan Darija speech-to-text transcription.

Loads a speech-to-text model from the Hugging Face Hub, transcribes recorded or
uploaded audio, and optionally appends the result to an evaluation dataset on
the Hub.
"""

import base64
import os
from datetime import datetime

import gradio as gr
import librosa
import numpy as np
import spaces
import torch
from datasets import (
    Audio,
    Dataset,
    DatasetDict,
    Features,
    Value,
    concatenate_datasets,
    load_dataset,
)
from transformers import (
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    pipeline,
)

# Hugging Face dataset that collects user-contributed evaluation samples
HF_DATASET_NAME = "BounharAbdelaziz/Moroccan-STT-Eval-Dataset"

# Available Moroccan Darija STT checkpoints on the Hugging Face Hub
MODEL_PATHS = {
    "MEDIUM": "BounharAbdelaziz/Morocco-Darija-STT-medium",
    "LARGE": "BounharAbdelaziz/Morocco-Darija-STT-large",
}

# Access tokens for the model and the evaluation dataset
# (both currently read the same TOKEN environment variable)
STT_MODEL_TOKEN = os.environ.get("TOKEN")
STT_EVAL_DATASET_TOKEN = os.environ.get("TOKEN")


def encode_image_to_base64(image_path):
    """Read an image file and return its base64-encoded contents as a string."""
    with open(image_path, "rb") as image_file:
        encoded_string = base64.b64encode(image_file.read()).decode()
    return encoded_string


def create_html_image(image_path):
    """Return an HTML snippet that displays the image, embedded as a base64 data URI."""
    img_base64 = encode_image_to_base64(image_path)
    html_string = f"""
    <div style="display: flex; justify-content: center; align-items: center; width: 100%; text-align: center;">
        <div style="max-width: 800px; margin: auto;">
            <img src="data:image/jpeg;base64,{img_base64}"
                 style="max-width: 75%; height: auto; display: block; margin: 0 auto; margin-top: 50px;"
                 alt="Displayed Image">
        </div>
    </div>
    """
    return html_string


def save_to_hf_dataset(audio_signal, model_choice, transcription):
    """Append a transcribed sample to the evaluation dataset and push it to the Hub."""
    print("[INFO] Loading dataset...")
    dataset = load_dataset(HF_DATASET_NAME, token=STT_EVAL_DATASET_TOKEN)
    print("[INFO] Dataset loaded successfully.")

    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    new_entry = {
        "audio": [{"array": audio_signal, "sampling_rate": 16000}],
        "transcription": [transcription],
        "model_used": [model_choice],
        "timestamp": [timestamp],
    }

    new_dataset = Dataset.from_dict(
        new_entry,
        features=Features({
            "audio": Audio(sampling_rate=16000),
            "transcription": Value("string"),
            "model_used": Value("string"),
            "timestamp": Value("string"),
        }),
    )

    print("[INFO] Adding the new entry to the dataset...")
    train_dataset = dataset["train"]
    updated_train_dataset = concatenate_datasets([train_dataset, new_dataset])
    dataset["train"] = updated_train_dataset

    print("[INFO] Pushing the updated dataset...")
    dataset.push_to_hub(HF_DATASET_NAME, token=STT_EVAL_DATASET_TOKEN)
    print("[INFO] Dataset updated and pushed successfully.")


def load_model(model_name):
    """Load the selected STT checkpoint and wrap it in an ASR pipeline."""
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    model_id = MODEL_PATHS[model_name.upper()]

    print("[INFO] Loading processor and model...")
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id,
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
        use_safetensors=True,
        token=STT_MODEL_TOKEN,
    )
    model.to(device)

    processor = AutoProcessor.from_pretrained(model_id, token=STT_MODEL_TOKEN)

    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        max_new_tokens=128,
        chunk_length_s=30,
        return_timestamps=False,
        torch_dtype=torch_dtype,
        device=device,
        generate_kwargs={"task": "transcribe"},
    )

    return pipe


@spaces.GPU
def process_audio(audio, model_choice, save_data):
    """Transcribe a Gradio audio input and optionally log it to the eval dataset."""
    pipe = load_model(model_choice)

    # Gradio's "numpy" audio type yields a (sample_rate, data) tuple
    sample_rate = audio[0]
    audio_signal = audio[1].astype(np.float32)

    # Normalize 16-bit integer PCM to the [-1, 1] float range
    if np.abs(audio_signal).max() > 1.0:
        audio_signal = audio_signal / 32768.0

    # The models expect 16 kHz audio; resample if needed
    if sample_rate != 16000:
        print(f"[INFO] Resampling audio from {sample_rate}Hz to 16000Hz")
        audio_signal = librosa.resample(
            y=audio_signal,
            orig_sr=sample_rate,
            target_sr=16000,
        )

    result = pipe(audio_signal)
    transcription = result["text"]

    if save_data:
        print("[INFO] Saving data to eval dataset...")
        save_to_hf_dataset(audio_signal, model_choice, transcription)

    return transcription


def create_interface():
    """Build the Gradio Blocks UI for the transcription app."""
    with gr.Blocks(css="footer{display:none !important}") as app:
        base_path = os.path.dirname(__file__)
        local_image_path = os.path.join(base_path, 'logo_image.png')
        gr.HTML(create_html_image(local_image_path))

        gr.Markdown("# 🇲🇦 Morocco Fast Speech-to-Text Transcription")

        gr.Markdown("⚠️ **Nota bene**: make sure to click **Stop** before hitting the **Transcribe** button.")
        gr.Markdown("The **Large** model is now available! 🔥")

        with gr.Row():
            model_choice = gr.Dropdown(
                choices=["Medium", "Large"],
                value="Large",
                label="Select one of the models",
            )

        with gr.Row():
            audio_input = gr.Audio(
                sources=["microphone", "upload"],
                type="numpy",
                label="Record Audio",
            )

        with gr.Row():
            save_data = gr.Checkbox(
                label="Contribute to the evaluation benchmark",
                value=True,
            )

        submit_btn = gr.Button("Transcribe 🔥")
        output_text = gr.Textbox(label="Transcription", text_align="right")

        gr.Markdown("""
        ### Notice to our dearest users 🤗 (coming soon)
        - By transcribing your audio, you're actively contributing to the development of a benchmark evaluation dataset for Moroccan speech-to-text models.
        - Your transcriptions will be logged into a dedicated Hugging Face dataset, playing a crucial role in advancing research and innovation in speech recognition for Moroccan dialects and languages.
        - Together, we're building tools that better understand and serve the unique linguistic landscape of Morocco.
        - We count on your **thoughtfulness and responsibility** when using the app. Thank you for your contribution! 🙏
        """)

        submit_btn.click(
            fn=process_audio,
            inputs=[audio_input, model_choice, save_data],
            outputs=output_text,
        )

        gr.Markdown("<br/>")

    return app
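

# NOTE: the launch block below is an assumption, not part of the original snippet;
# it sketches the usual entry point for running the app (locally or on a Space).
if __name__ == "__main__":
    app = create_interface()
    app.launch()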