alakxender committed on
Commit
11240c1
·
1 Parent(s): 7a0937a
Files changed (1) hide show
  1. app.py +30 -16
app.py CHANGED
@@ -410,22 +410,27 @@ def generate_conversation(speaker_a, speaker_b, speaker_a_audio, speaker_a_text,
410
 
411
  def change_model_and_update_ui(model_name):
412
  """Change model and update UI components"""
413
- # We can't load the model here in the main process, so we just update the UI
414
- # The model will be loaded on the first GPU function call
415
- global current_model_id
416
-
417
- # Reset current model id to force reload
418
- current_model_id = None
419
 
 
 
 
 
 
 
 
 
 
420
  choices = get_speaker_choices()
421
  # Generate new speaker display choices for the new model
422
  current_prompts = get_current_speaker_prompts()
423
  new_speaker_choices = [(f"{prompt_data['speaker_id']}: {prompt_data['name']}", prompt_data['speaker_id'])
424
  for prompt_key, prompt_data in current_prompts.items()]
425
  display_choices = [choice[0] for choice in new_speaker_choices]
426
-
427
  return (
428
- gr.update(value=f"Model will load on first generation: {model_name}"),
429
  gr.update(choices=choices, value=choices[0]),
430
  gr.update(choices=choices, value=choices[1] if len(choices) > 1 else choices[0]),
431
  # Update simple generation speaker radio
@@ -434,8 +439,16 @@ def change_model_and_update_ui(model_name):
434
  gr.update(choices=display_choices, value=display_choices[0] if display_choices else "0: Speaker")
435
  )
436
 
 
 
 
 
 
 
 
 
437
  with gr.Blocks(
438
- title="CSM Dhivehi Speech Generator",
439
  css="""
440
  .dhivehi-text textarea {
441
  font-size: 18px !important;
@@ -452,9 +465,9 @@ with gr.Blocks(
452
  }
453
  """
454
  ) as app:
455
- with gr.Tab("🎙️ CSM Speech Generator"):
456
- gr.Markdown("# 🎙️ CSM Dhivehi Speech Generator")
457
-
458
  # Model selection
459
  with gr.Row():
460
  model_dropdown = gr.Dropdown(
@@ -463,7 +476,7 @@ with gr.Blocks(
463
  label="🤖 Select Model"
464
  )
465
  model_info = gr.Textbox(
466
- value="Model will load on first generation",
467
  label="Model Status",
468
  interactive=False
469
  )
@@ -517,7 +530,7 @@ with gr.Blocks(
517
  context_text = gr.Textbox(
518
  label="Speaker prompt",
519
  placeholder="މަންމަ ކިހާއިރެއް ދަރިފުޅުގެ އިންތިޒާރުގަ އިންނަތާ",
520
- value="އައިލީޝް އޭނާއަތަށް ލިބުނީ އެންމެ ނުވަ މަހުގަ",
521
  lines=2,
522
  elem_classes=["dhivehi-text"]
523
  )
@@ -656,8 +669,9 @@ with gr.Blocks(
656
  - Long sentences: Generated long sentences seems sped up.
657
  - Repeating words: Generated text sometimes repeats words.
658
  """)
659
- with gr.Tab("🎙️ Dia-1.6B Text-to-Speech Synthesis"):
660
- gr.Markdown("# 🎙️ Dia-1.6B Text-to-Speech Synthesis + Dhivehi")
 
661
  gr.Markdown("Check back later...")
662
 
663
  if __name__ == "__main__":
 
410
 
411
  def change_model_and_update_ui(model_name):
412
  """Change model and update UI components"""
413
+ # Load the model if not already loaded
414
+ success = load_model(model_name)
 
 
 
 
415
 
416
+ if not success:
417
+ return (
418
+ gr.update(value="Error loading model"),
419
+ gr.update(),
420
+ gr.update(),
421
+ gr.update(),
422
+ gr.update()
423
+ )
424
+
425
  choices = get_speaker_choices()
426
  # Generate new speaker display choices for the new model
427
  current_prompts = get_current_speaker_prompts()
428
  new_speaker_choices = [(f"{prompt_data['speaker_id']}: {prompt_data['name']}", prompt_data['speaker_id'])
429
  for prompt_key, prompt_data in current_prompts.items()]
430
  display_choices = [choice[0] for choice in new_speaker_choices]
431
+
432
  return (
433
+ gr.update(value=get_model_info()),
434
  gr.update(choices=choices, value=choices[0]),
435
  gr.update(choices=choices, value=choices[1] if len(choices) > 1 else choices[0]),
436
  # Update simple generation speaker radio
 
439
  gr.update(choices=display_choices, value=display_choices[0] if display_choices else "0: Speaker")
440
  )
441
 
442
+ # Load the first model by default before building the UI
443
+ try:
444
+ first_model = list(MODELS.keys())[0]
445
+ print(f"Loading default model: {first_model}")
446
+ load_model(first_model)
447
+ except Exception as e:
448
+ print(f"Failed to load default model: {e}")
449
+
450
  with gr.Blocks(
451
+ title="Dhivehi (Thaana) Text-to-Speech",
452
  css="""
453
  .dhivehi-text textarea {
454
  font-size: 18px !important;
 
465
  }
466
  """
467
  ) as app:
468
+ with gr.Tab("🎙️ CSM-1B"):
469
+ gr.Markdown("# 🎙️ CSM-1B Text-to-Speech Synthesis")
470
+ gr.Markdown("**CSM (Conversational Speech Model)** is a speech generation model from [Sesame](sesame.com) that generates **RVQ audio codes** from text and audio inputs. The model architecture employs a [Llama](https://www.llama.com/) backbone and a smaller audio decoder that produces [Mimi](https://huggingface.co/kyutai/mimi) audio codes. This demo uses a **fine-tuned version** of the model for **Dhivehi speech synthesis**.")
471
  # Model selection
472
  with gr.Row():
473
  model_dropdown = gr.Dropdown(
 
476
  label="🤖 Select Model"
477
  )
478
  model_info = gr.Textbox(
479
+ value=get_model_info(),
480
  label="Model Status",
481
  interactive=False
482
  )
 
530
  context_text = gr.Textbox(
531
  label="Speaker prompt",
532
  placeholder="މަންމަ ކިހާއިރެއް ދަރިފުޅުގެ އިންތިޒާރުގަ އިންނަތާ",
533
+ value="",
534
  lines=2,
535
  elem_classes=["dhivehi-text"]
536
  )
 
669
  - Long sentences: Generated long sentences seems sped up.
670
  - Repeating words: Generated text sometimes repeats words.
671
  """)
672
+ with gr.Tab("🎙️ Dia-1.6B"):
673
+ gr.Markdown("# 🎙️ Dia-1.6B Text-to-Speech Synthesis")
674
+ gr.Markdown("Dia is a 1.6B parameter text to speech model created by [Nari Labs](https://huggingface.co/nari-labs/Dia-1.6B). This demo uses a fine-tuned version of the model with **Dhivehi (Thaana) voices**. Also supports mixed languages and can generate non-verbal, dialogue and voice-clone.")
675
  gr.Markdown("Check back later...")
676
 
677
  if __name__ == "__main__":