Spaces:
Running
Running
import os
import time
import gradio as gr
from pydub import AudioSegment
from transformers import pipeline

# True when running on Hugging Face Spaces (the platform sets SYSTEM=spaces);
# used below to pick the hosted vs. local path for the WIP model.
is_hf = os.getenv("SYSTEM") == "spaces"
# Decoding options shared by every transcription call: force Japanese,
# greedy decoding (no sampling, single beam) with a small repetition guard.
generate_kwargs = dict(
    language="Japanese",
    do_sample=False,
    num_beams=1,
    no_repeat_ngram_size=3,
)
# Display name -> Hugging Face repo id of each model offered in the demo.
model_dict = {
    "whisper-large-v2": "openai/whisper-large-v2",
    "whisper-large-v3": "openai/whisper-large-v3",
    "whisper-large-v3-turbo": "openai/whisper-large-v3-turbo",
    "kotoba-whisper-v1.0": "kotoba-tech/kotoba-whisper-v1.0",
    "kotoba-whisper-v2.0": "kotoba-tech/kotoba-whisper-v2.0",
}
# The WIP model lives on the Hub when running on Spaces, otherwise it is
# loaded from a sibling local checkout.
if is_hf:
    model_dict["galgame-whisper-wip"] = "litagin/galgame-whisper-wip"
else:
    model_dict["galgame-whisper-wip"] = "../whisper_finetune/galgame-whisper"
# Instantiate every pipeline once up front so the model weights are
# downloaded/cached before the UI starts serving requests.
for repo_id in model_dict.values():
    pipeline("automatic-speech-recognition", model=repo_id)
def transcribe_common(audio: str, model: str) -> tuple[str, float]:
    """Transcribe an audio file with the given model and time the inference.

    Args:
        audio: Path to the input audio file.
        model: Hugging Face repo id (or local path) of the ASR model.

    Returns:
        A ``(text, seconds)`` tuple. When the clip exceeds 15 seconds the
        text is an error message and the time is 0.
    """
    # Reject clips longer than the demo limit before loading any model.
    duration = AudioSegment.from_file(audio).duration_seconds
    if duration > 15:
        return "Audio too long, limit is 15 seconds", 0
    pipe = pipeline("automatic-speech-recognition", model=model)
    # BUG FIX: the timer previously wrapped only the pipeline construction,
    # so the reported "time taken" excluded the transcription itself.
    # Time the actual inference call instead.
    start_time = time.time()
    text = pipe(audio, generate_kwargs=generate_kwargs)["text"]
    elapsed = time.time() - start_time
    return text, elapsed
def transcribe_large_v2(audio) -> tuple[str, float]:
    """Transcribe with OpenAI Whisper large-v2."""
    repo_id = model_dict["whisper-large-v2"]
    return transcribe_common(audio, repo_id)
def transcribe_large_v3(audio) -> tuple[str, float]:
    """Transcribe with OpenAI Whisper large-v3."""
    repo_id = model_dict["whisper-large-v3"]
    return transcribe_common(audio, repo_id)
def transcribe_large_v3_turbo(audio) -> tuple[str, float]:
    """Transcribe with OpenAI Whisper large-v3-turbo."""
    repo_id = model_dict["whisper-large-v3-turbo"]
    return transcribe_common(audio, repo_id)
def transcribe_kotoba_v1(audio) -> tuple[str, float]:
    """Transcribe with Kotoba-Whisper v1.0."""
    repo_id = model_dict["kotoba-whisper-v1.0"]
    return transcribe_common(audio, repo_id)
def transcribe_kotoba_v2(audio) -> tuple[str, float]:
    """Transcribe with Kotoba-Whisper v2.0."""
    repo_id = model_dict["kotoba-whisper-v2.0"]
    return transcribe_common(audio, repo_id)
def transcribe_galgame_whisper(audio) -> tuple[str, float]:
    """Transcribe with the work-in-progress Galgame-Whisper model."""
    repo_id = model_dict["galgame-whisper-wip"]
    return transcribe_common(audio, repo_id)
# Intro markdown for the top of the demo page (Japanese; rendered at runtime,
# so the text itself is left untranslated).
initial_md = """
# Galgame-Whisper (WIP) Demo
- 日本語のみ対応
- 他の書き起こしとついでに比較できるようにいろいろ入れた
- 現在0.1エポックくらい
- 速度はCPUです
- 音声は15秒まで
"""
# Build the demo UI: one shared audio input, then one column per model with
# a transcribe button, an output box, and a timer box.
with gr.Blocks() as app:
    # BUG FIX: `initial_md` was defined but never rendered, while a stray
    # duplicate "### Kotoba-Whisper-V1.0" heading sat here instead — show
    # the intro markdown.
    gr.Markdown(initial_md)
    audio = gr.Audio(type="filepath")
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Whisper-Large-V2")
            button_v2 = gr.Button("Transcribe with Whisper-Large-V2")
            output_v2 = gr.Textbox()
            time_v2 = gr.Textbox("Time taken")
        with gr.Column():
            gr.Markdown("### Whisper-Large-V3")
            button_v3 = gr.Button("Transcribe with Whisper-Large-V3")
            output_v3 = gr.Textbox()
            time_v3 = gr.Textbox("Time taken")
        with gr.Column():
            gr.Markdown("### Whisper-Large-V3-Turbo")
            button_v3_turbo = gr.Button("Transcribe with Whisper-Large-V3-Turbo")
            output_v3_turbo = gr.Textbox()
            # Consistency fix: this was the only timer box created empty;
            # give it the same initial value as every other column.
            time_v3_turbo = gr.Textbox("Time taken")
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Kotoba-Whisper-V1.0")
            button_kotoba_v1 = gr.Button("Transcribe with Kotoba-Whisper-V1.0")
            output_kotoba_v1 = gr.Textbox()
            time_kotoba_v1 = gr.Textbox("Time taken")
        with gr.Column():
            gr.Markdown("### Kotoba-Whisper-V2.0")
            button_kotoba_v2 = gr.Button("Transcribe with Kotoba-Whisper-V2.0")
            output_kotoba_v2 = gr.Textbox()
            time_kotoba_v2 = gr.Textbox("Time taken")
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Galgame-Whisper (WIP)")
            button_galgame = gr.Button("Transcribe with Galgame-Whisper (WIP)")
            output_galgame = gr.Textbox()
            time_galgame = gr.Textbox("Time taken")
    # Wire each button to its model-specific transcription function.
    button_v2.click(transcribe_large_v2, inputs=audio, outputs=[output_v2, time_v2])
    button_v3.click(transcribe_large_v3, inputs=audio, outputs=[output_v3, time_v3])
    button_v3_turbo.click(
        transcribe_large_v3_turbo,
        inputs=audio,
        outputs=[output_v3_turbo, time_v3_turbo],
    )
    button_kotoba_v1.click(
        transcribe_kotoba_v1, inputs=audio, outputs=[output_kotoba_v1, time_kotoba_v1]
    )
    button_kotoba_v2.click(
        transcribe_kotoba_v2, inputs=audio, outputs=[output_kotoba_v2, time_kotoba_v2]
    )
    button_galgame.click(
        transcribe_galgame_whisper,
        inputs=audio,
        outputs=[output_galgame, time_galgame],
    )

app.launch(inbrowser=True)