knowledge-scribe

Runtime error

App Files Files Community

knowledge-scribe / app.py

dwb2023

Update app.py

68bab0c verified over 1 year ago

raw

history blame

4.21 kB

	import os
	import json
	import time
	from datetime import datetime
	from pathlib import Path
	from uuid import uuid4
	import tempfile

	import gradio as gr
	import yt_dlp as youtube_dl
	from huggingface_hub import CommitScheduler
	from transformers import (
	BitsAndBytesConfig,
	AutoModelForSpeechSeq2Seq,
	AutoTokenizer,
	AutoFeatureExtractor,
	pipeline,
	)
	from transformers.pipelines.audio_utils import ffmpeg_read

	# import torch # If you're using PyTorch
	import spaces

	# Ask huggingface_hub to use the hf_transfer backend.
	# NOTE(review): huggingface_hub is already imported further up in this file;
	# confirm the library still picks this flag up when it is set after import.
	os.environ.update(HF_HUB_ENABLE_HF_TRANSFER="1")

	# Checkpoint id handed to every `from_pretrained` call below.
	MODEL_NAME = "openai/whisper-large-v3"
	# Batch size forwarded to the ASR pipeline on each transcription request.
	BATCH_SIZE = 8
	# Longest YouTube video accepted, in seconds (80 minutes = 1 hour 20 minutes).
	YT_LENGTH_LIMIT_S = 80 * 60

	# Load the checkpoint with a 4-bit bitsandbytes quantization config and let
	# `device_map="auto"` decide where the weights are placed.
	bnb_config = BitsAndBytesConfig(load_in_4bit=True)
	model = AutoModelForSpeechSeq2Seq.from_pretrained(
	    MODEL_NAME, quantization_config=bnb_config, device_map="auto"
	)

	# Text and audio pre/post-processing components from the same checkpoint.
	feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)
	tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

	# Speech-recognition pipeline built around the already-loaded model; no
	# explicit device is passed because the model was placed via `device_map`.
	# Audio is processed in 30-second chunks.
	pipe = pipeline(
	    "automatic-speech-recognition",
	    model=model,
	    tokenizer=tokenizer,
	    feature_extractor=feature_extractor,
	    chunk_length_s=30,
	)

	# Local folder that collects transcription records; created on import if it
	# does not exist yet.
	JSON_DATASET_DIR = Path("json_dataset")
	JSON_DATASET_DIR.mkdir(parents=True, exist_ok=True)

	# The random uuid4 in the file name is generated once per import of this
	# module, so each run of the app appends to its own file.
	JSON_DATASET_PATH = JSON_DATASET_DIR.joinpath(f"transcriptions-{uuid4()}.json")

	# CommitScheduler that saves the contents of JSON_DATASET_DIR to the "data"
	# folder of a Hugging Face dataset repo. Its `lock` is taken by
	# save_transcription while a record is being appended.
	scheduler = CommitScheduler(
	    folder_path=JSON_DATASET_DIR,
	    path_in_repo="data",
	    repo_id="transcript-dataset-repo",
	    repo_type="dataset",
	)

	def download_yt_audio(yt_url, filename):
	    """Download the best available audio of a YouTube video to ``filename``.

	    The video's metadata is fetched first (without downloading media) so that
	    videos longer than ``YT_LENGTH_LIMIT_S`` are rejected before any download
	    starts.

	    Args:
	        yt_url: URL of the video to fetch.
	        filename: Output path, passed to yt-dlp as its ``outtmpl``.

	    Raises:
	        gr.Error: If yt-dlp fails to resolve or download the URL, if the
	            video reports no duration, or if its duration exceeds
	            ``YT_LENGTH_LIMIT_S``.
	    """
	    # Use the context manager so the metadata-only instance is closed too.
	    with youtube_dl.YoutubeDL() as info_loader:
	        try:
	            info = info_loader.extract_info(yt_url, download=False)
	        except youtube_dl.utils.DownloadError as err:
	            raise gr.Error(str(err)) from err

	    # Some entries (e.g. live streams) carry no usable duration. Indexing
	    # info["duration"] would then fail with KeyError, or the comparison
	    # below with TypeError, so report it as a user-facing error instead.
	    file_length = info.get("duration") if info else None
	    if file_length is None:
	        raise gr.Error("Could not determine the length of this YouTube video.")

	    if file_length > YT_LENGTH_LIMIT_S:
	        yt_length_limit_hms = time.strftime("%H:%M:%S", time.gmtime(YT_LENGTH_LIMIT_S))
	        file_length_hms = time.strftime("%H:%M:%S", time.gmtime(file_length))
	        raise gr.Error(
	            f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms} YouTube video."
	        )

	    ydl_opts = {"outtmpl": filename, "format": "bestaudio/best"}
	    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
	        # Surface download failures the same way as metadata failures.
	        try:
	            ydl.download([yt_url])
	        except youtube_dl.utils.DownloadError as err:
	            raise gr.Error(str(err)) from err


	@spaces.GPU
	def yt_transcribe(yt_url, task):
	    """Transcribe (or translate) the audio of a YouTube video.

	    Args:
	        yt_url: URL of the video to process.
	        task: Value forwarded to generation as ``generate_kwargs["task"]``;
	            the UI offers "transcribe" and "translate".

	    Returns:
	        The ``"text"`` field of the pipeline output. The same text is also
	        stored through ``save_transcription``.
	    """
	    # The downloaded file only needs to exist long enough to be read into
	    # memory; the temporary directory is removed when the block exits.
	    with tempfile.TemporaryDirectory() as workdir:
	        audio_path = os.path.join(workdir, "video.mp4")
	        download_yt_audio(yt_url, audio_path)
	        with open(audio_path, "rb") as audio_file:
	            raw_bytes = audio_file.read()

	    # Decode the raw bytes at the sampling rate of the pipeline's feature
	    # extractor and pass that rate along with the decoded array.
	    sampling_rate = pipe.feature_extractor.sampling_rate
	    waveform = ffmpeg_read(raw_bytes, sampling_rate)
	    model_input = {"array": waveform, "sampling_rate": pipe.feature_extractor.sampling_rate}

	    result = pipe(
	        model_input,
	        batch_size=BATCH_SIZE,
	        generate_kwargs={"task": task},
	        return_timestamps=True,
	    )
	    text = result["text"]

	    save_transcription(yt_url, text)
	    return text


	def save_transcription(yt_url, transcription):
	    """Append one transcription record to ``JSON_DATASET_PATH``.

	    Each record is written as a single JSON object followed by a newline,
	    holding the source URL, the transcription text and the current local
	    timestamp in ISO format.

	    Args:
	        yt_url: URL the transcription was produced from.
	        transcription: Transcribed text to store.
	    """
	    # Hold the scheduler's lock for the whole append.
	    with scheduler.lock:
	        record = {
	            "url": yt_url,
	            "transcription": transcription,
	            "datetime": datetime.now().isoformat(),
	        }
	        with JSON_DATASET_PATH.open("a") as out:
	            out.write(json.dumps(record) + "\n")

	# Top-level container for the app.
	demo = gr.Blocks()

	# Input widgets of the YouTube tab: the video URL and the Whisper task.
	url_input = gr.Textbox(
	    lines=1,
	    placeholder="Paste the URL to a YouTube video here",
	    label="YouTube URL",
	)
	task_input = gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")

	# Description shown on the interface; links to the checkpoint in use.
	interface_description = (
	    "Transcribe long-form YouTube videos with the click of a button! Demo uses the checkpoint"
	    f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe video files of"
	    " arbitrary length."
	)

	# Interface that calls yt_transcribe with (url, task) and shows the
	# returned text; flagging is turned off.
	yt_transcribe_interface = gr.Interface(
	    fn=yt_transcribe,
	    inputs=[url_input, task_input],
	    outputs="text",
	    title="Whisper Large V3: Transcribe YouTube",
	    description=interface_description,
	    allow_flagging="never",
	)

	# Mount the interface as the single "YouTube" tab of the app.
	with demo:
	    gr.TabbedInterface([yt_transcribe_interface], ["YouTube"])

	# Enable the request queue, then start the app.
	demo.queue().launch()