Spaces:

alakxender
/

dhivehi-tts-demos

Running on Zero

App Files Files Community

alakxender committed on Jul 3

Commit

11240c1

1 Parent(s): 7a0937a

t

Browse files

Files changed (1) hide show

app.py +30 -16

app.py CHANGED Viewed

@@ -410,22 +410,27 @@ def generate_conversation(speaker_a, speaker_b, speaker_a_audio, speaker_a_text,
 def change_model_and_update_ui(model_name):
     """Change model and update UI components"""
-    # We can't load the model here in the main process, so we just update the UI
-    # The model will be loaded on the first GPU function call
-    global current_model_id
-    # Reset current model id to force reload
-    current_model_id = None
     choices = get_speaker_choices()
     # Generate new speaker display choices for the new model
     current_prompts = get_current_speaker_prompts()
     new_speaker_choices = [(f"{prompt_data['speaker_id']}: {prompt_data['name']}", prompt_data['speaker_id'])
                           for prompt_key, prompt_data in current_prompts.items()]
     display_choices = [choice[0] for choice in new_speaker_choices]
     return (
-        gr.update(value=f"Model will load on first generation: {model_name}"),
         gr.update(choices=choices, value=choices[0]),
         gr.update(choices=choices, value=choices[1] if len(choices) > 1 else choices[0]),
         # Update simple generation speaker radio
@@ -434,8 +439,16 @@ def change_model_and_update_ui(model_name):
         gr.update(choices=display_choices, value=display_choices[0] if display_choices else "0: Speaker")
     )
 with gr.Blocks(
-    title="CSM Dhivehi Speech Generator",
     css="""
     .dhivehi-text textarea {
         font-size: 18px !important;
@@ -452,9 +465,9 @@ with gr.Blocks(
     }
     """
 ) as app:
-    with gr.Tab("🎙️ CSM Speech Generator"):
-        gr.Markdown("# 🎙️  CSM Dhivehi Speech Generator")
         # Model selection
         with gr.Row():
             model_dropdown = gr.Dropdown(
@@ -463,7 +476,7 @@ with gr.Blocks(
                 label="🤖 Select Model"
             )
             model_info = gr.Textbox(
-                value="Model will load on first generation",
                 label="Model Status",
                 interactive=False
             )
@@ -517,7 +530,7 @@ with gr.Blocks(
                         context_text = gr.Textbox(
                             label="Speaker prompt",
                             placeholder="މަންމަ ކިހާއިރެއް ދަރިފުޅުގެ އިންތިޒާރުގަ އިންނަތާ",
-                            value="އައިލީޝް އޭނާއަތަށް ލިބުނީ އެންމެ ނުވަ މަހުގަ",
                             lines=2,
                             elem_classes=["dhivehi-text"]
                         )
@@ -656,8 +669,9 @@ with gr.Blocks(
         - Long sentences: Generated long sentences seems sped up.
         - Repeating words: Generated text sometimes repeats words.
         """)
-    with gr.Tab("🎙️ Dia-1.6B Text-to-Speech Synthesis"):
-        gr.Markdown("# 🎙️  Dia-1.6B Text-to-Speech Synthesis + Dhivehi")
         gr.Markdown("Check back later...")
 if __name__ == "__main__":

 def change_model_and_update_ui(model_name):
     """Change model and update UI components"""
+    # Load the model if not already loaded
+    success = load_model(model_name)
+    if not success:
+        return (
+            gr.update(value="Error loading model"),
+            gr.update(),
+            gr.update(),
+            gr.update(),
+            gr.update()
+        )
     choices = get_speaker_choices()
     # Generate new speaker display choices for the new model
     current_prompts = get_current_speaker_prompts()
     new_speaker_choices = [(f"{prompt_data['speaker_id']}: {prompt_data['name']}", prompt_data['speaker_id'])
                           for prompt_key, prompt_data in current_prompts.items()]
     display_choices = [choice[0] for choice in new_speaker_choices]
     return (
+        gr.update(value=get_model_info()),
         gr.update(choices=choices, value=choices[0]),
         gr.update(choices=choices, value=choices[1] if len(choices) > 1 else choices[0]),
         # Update simple generation speaker radio
         gr.update(choices=display_choices, value=display_choices[0] if display_choices else "0: Speaker")
     )
+# Load the first model by default before building the UI
+try:
+    first_model = list(MODELS.keys())[0]
+    print(f"Loading default model: {first_model}")
+    load_model(first_model)
+except Exception as e:
+    print(f"Failed to load default model: {e}")
 with gr.Blocks(
+    title="Dhivehi (Thaana) Text-to-Speech",
     css="""
     .dhivehi-text textarea {
         font-size: 18px !important;
     }
     """
 ) as app:
+    with gr.Tab("🎙️ CSM-1B"):
+        gr.Markdown("# 🎙️  CSM-1B Text-to-Speech Synthesis")
+        gr.Markdown("**CSM (Conversational Speech Model)** is a speech generation model from [Sesame](sesame.com) that generates **RVQ audio codes** from text and audio inputs. The model architecture employs a [Llama](https://www.llama.com/) backbone and a smaller audio decoder that produces [Mimi](https://huggingface.co/kyutai/mimi) audio codes. This demo uses a **fine-tuned version** of the model for **Dhivehi speech synthesis**.")
         # Model selection
         with gr.Row():
             model_dropdown = gr.Dropdown(
                 label="🤖 Select Model"
             )
             model_info = gr.Textbox(
+                value=get_model_info(),
                 label="Model Status",
                 interactive=False
             )
                         context_text = gr.Textbox(
                             label="Speaker prompt",
                             placeholder="މަންމަ ކިހާއިރެއް ދަރިފުޅުގެ އިންތިޒާރުގަ އިންނަތާ",
+                            value="",
                             lines=2,
                             elem_classes=["dhivehi-text"]
                         )
         - Long sentences: Generated long sentences seems sped up.
         - Repeating words: Generated text sometimes repeats words.
         """)
+    with gr.Tab("🎙️ Dia-1.6B"):
+        gr.Markdown("# 🎙️  Dia-1.6B Text-to-Speech Synthesis")
+        gr.Markdown("Dia is a 1.6B parameter text to speech model created by [Nari Labs](https://huggingface.co/nari-labs/Dia-1.6B). This demo uses a fine-tuned version of the model with **Dhivehi (Thaana) voices**. Also supports mixed languages and can generate non-verbal, dialogue and voice-clone.")
         gr.Markdown("Check back later...")
 if __name__ == "__main__":