Spaces:
Sleeping
Sleeping
| from faster_whisper import WhisperModel | |
| #import whisper | |
| import pandas as pd | |
| import gradio as gr | |
| import psutil | |
| import time | |
| import whisperx | |
# Build the faster-whisper model once at import time so every request handled
# by speech_to_text() reuses the same instance.
model = WhisperModel(
    'large-v2',
    device="cpu",
    compute_type="float32",
)
#model = whisper.load_model('large-v2')
def speech_to_text(mic=None, file=None, lang=None, task='transcribe'):
    """Run the module-level Whisper model on a recording and tabulate the result.

    Args:
        mic: Audio input from the microphone component. Takes precedence over
            ``file`` when both are provided.
        file: Audio input from the upload component, used when ``mic`` is
            ``None``.
        lang: Forwarded to ``model.transcribe`` as ``language``.
        task: Forwarded to ``model.transcribe`` as ``task``; the UI calls this
            function with ``'transcribe'`` and ``'translate'``.

    Returns:
        A ``(df_results, system_info)`` tuple: a DataFrame with one row per
        returned segment (minus the ``seek``, ``tokens`` and ``avg_logprob``
        columns) and a Markdown string reporting the processing time.

    Raises:
        gr.Error: If neither ``mic`` nor ``file`` is provided.
    """
    if mic is None and file is None:
        raise gr.Error("You must either provide a mic recording or a file")
    audio = mic if mic is not None else file

    print(lang, task)

    time_start = time.time()
    segments, info = model.transcribe(audio, task=task, language=lang, beam_size=5)
    # Materialise the segments inside the timed region: per the faster-whisper
    # docs `segments` is a generator, so the decoding work happens here.
    objects = [s._asdict() for s in segments]
    print(objects)
    time_diff = time.time() - time_start

    system_info = f"\n*Processing time: {time_diff:.5} seconds.*\n"

    df_results = pd.DataFrame(objects)
    # errors="ignore": when no segment was produced the frame has no columns
    # at all, and a strict drop would raise KeyError instead of returning an
    # empty table.
    df_results = df_results.drop(
        columns=['seek', 'tokens', 'avg_logprob'],
        errors='ignore',
    )
    return df_results, system_info
# Two colours drive the whole theme: the accent itself and the shade used
# while a primary button is hovered (--telekom-color-primary-hovered).
_ACCENT = "#e20074"
_ACCENT_HOVERED = "#c00063"

# Default Gradio theme with the accent applied to primary buttons and stat
# backgrounds, in both the light and the dark variant.
theme = gr.themes.Default().set(
    color_accent=_ACCENT,
    stat_background_fill=_ACCENT,
    stat_background_fill_dark=_ACCENT,
    # Primary button text
    button_primary_text_color='white',
    button_primary_text_color_hover='black',
    # Primary button, light mode
    button_primary_background_fill=_ACCENT,
    button_primary_background_fill_hover=_ACCENT_HOVERED,
    button_primary_border_color=_ACCENT,
    button_primary_border_color_hover=_ACCENT_HOVERED,
    # Primary button, dark mode
    button_primary_background_fill_dark=_ACCENT,
    button_primary_background_fill_hover_dark=_ACCENT_HOVERED,
    button_primary_border_color_dark=_ACCENT,
    button_primary_border_color_hover_dark=_ACCENT_HOVERED,
)
# Assemble the UI: two audio inputs (microphone and upload), one button per
# task, and a table plus a Markdown line for the results.
with gr.Blocks(title='Whisper Demo', theme=theme) as demo:
    gr.Markdown(
        "\n"
        "<div>\n"
        "<h1 style='text-align: center'>Simple Whisper Demo</h1>\n"
        "A simple Whisper demo using local CPU Inference of the largest-v2 Model\n"
        "</div>\n"
    )

    audio_in = gr.Audio(label="Record", source='microphone', type="filepath")
    file_in = gr.Audio(label="Upload", source='upload', type="filepath")

    transcribe_btn = gr.Button("Transcribe audio", variant="primary")
    translate_btn = gr.Button("Translate to English")

    trans_df = gr.DataFrame(
        label="Transcription dataframe",
        row_count=(0, "dynamic"),
        max_rows=10,
        wrap=True,
        overflow_row_behaviour='paginate',
    )
    sys_info = gr.Markdown("")

    # Both handlers receive exactly the two values listed as inputs, so each
    # lambda must take two parameters. The translate handler previously
    # declared a third one and failed on every click.
    transcribe_btn.click(
        lambda mic, file: speech_to_text(mic, file, task='transcribe'),
        [audio_in, file_in],
        [trans_df, sys_info],
    )
    translate_btn.click(
        lambda mic, file: speech_to_text(mic, file, task='translate'),
        [audio_in, file_in],
        [trans_df, sys_info],
    )

demo.launch()