alakxender committed on
Commit
7e668b3
·
1 Parent(s): 6cde794
Files changed (1) hide show
  1. dia_1_6B_dv.py +17 -17
dia_1_6B_dv.py CHANGED
@@ -370,21 +370,21 @@ def get_dia_1_6B_tab():
370
  )
371
  else:
372
  gr.Markdown("_(No examples configured or example prompt file missing)_")
373
- gr.Markdown(
374
- "---\n"
375
- "**General Guidelines:**\n"
376
- "- Keep input text length moderate\n"
377
- " - Short input (corresponding to under 5s of audio) will sound unnatural\n"
378
- " - Very long input (corresponding to over 20s of audio) will make the speech unnaturally fast\n\n"
379
- "- Use non-verbal tags sparingly, from the list in the README. Overusing or using unlisted non-verbals may cause weird artifacts\n\n"
380
- "- Always begin input text with [S1], and always alternate between [S1] and [S2] (i.e. [S1]... [S1]... is not good)\n\n"
381
- "**When using audio prompts (voice cloning):**\n"
382
- "- Provide the transcript of the to-be cloned audio before the generation text\n"
383
- "- Transcript must use [S1], [S2] speaker tags correctly:\n"
384
- " - Single speaker: [S1]...\n"
385
- " - Two speakers: [S1]... [S2]...\n"
386
- "- Duration of the to-be cloned audio should be 5~10 seconds for the best results\n"
387
- " - (Keep in mind: 1 second ≈ 86 tokens)\n"
388
- "- Put [S1] or [S2] (the second-to-last speaker's tag) at the end of the audio to improve audio quality at the end"
389
- )
390
  # No explicit return needed for context manager pattern
 
370
  )
371
  else:
372
  gr.Markdown("_(No examples configured or example prompt file missing)_")
373
+ gr.Markdown(
374
+ "---\n"
375
+ "**General Guidelines:**\n"
376
+ "- Keep input text length moderate\n"
377
+ " - Short input (corresponding to under 5s of audio) will sound unnatural\n"
378
+ " - Very long input (corresponding to over 20s of audio) will make the speech unnaturally fast\n\n"
379
+ "- Use non-verbal tags sparingly, from the list in the README. Overusing or using unlisted non-verbals may cause weird artifacts\n\n"
380
+ "- Always begin input text with [S1], and always alternate between [S1] and [S2] (i.e. [S1]... [S1]... is not good)\n\n"
381
+ "**When using audio prompts (voice cloning):**\n"
382
+ "- Provide the transcript of the to-be cloned audio before the generation text\n"
383
+ "- Transcript must use [S1], [S2] speaker tags correctly:\n"
384
+ " - Single speaker: [S1]...\n"
385
+ " - Two speakers: [S1]... [S2]...\n"
386
+ "- Duration of the to-be cloned audio should be 5~10 seconds for the best results\n"
387
+ " - (Keep in mind: 1 second ≈ 86 tokens)\n"
388
+ "- Put [S1] or [S2] (the second-to-last speaker's tag) at the end of the audio to improve audio quality at the end"
389
+ )
390
  # No explicit return needed for context manager pattern