meg-huggingface commited on
Commit
e8d021a
·
1 Parent(s): 9c20baa

Fixing some of the errors

Browse files
Files changed (2) hide show
  1. app.py +8 -6
  2. src/generate.py +27 -28
app.py CHANGED
@@ -179,6 +179,7 @@ with gr.Blocks(title="Voice Consent Gate") as demo:
179
  1. A way of generating novel consent sentences for the person whose voice will be cloned – the “speaker” – to say, making sure the sentence isn’t part of a previous recording but instead uniquely references the current consent context.
180
  2. An _automatic speech recognition (ASR) system_ that recognizes the sentence conveying consent.
181
  3. A _voice-cloning text-to-speech (TTS) system_ that takes as input text and the voice clonee’s speech snippets to generate speech.
 
182
  Some voice-cloning TTS systems can now generate speech similar to a speaker’s voice using _just one sentence_. This means that a sentence used for consent can **also** be used for voice cloning. We demonstrate one way to do that here.
183
  """)
184
  with gr.Row():
@@ -196,11 +197,11 @@ with gr.Blocks(title="Voice Consent Gate") as demo:
196
  )
197
  with gr.Column():
198
  consent_method = gr.Dropdown(
199
- label="Sentence generation method",
200
  choices=["Llama 3.2 3B Instruct"],
201
  value="Llama 3.2 3B Instruct"
202
  )
203
- asr_model = gr.Dropdown(label="Speech recognition model",
204
  choices=["openai/whisper-tiny.en", # fastest (CPU-friendly)
205
  "openai/whisper-base.en", # better accuracy, a bit slower
206
  "distil-whisper/distil-small.en"
@@ -209,7 +210,7 @@ with gr.Blocks(title="Voice Consent Gate") as demo:
209
  value="openai/whisper-tiny.en",
210
  )
211
  voice_clone_model = gr.Dropdown(
212
- label="Voice cloning model",
213
  choices=["Chatterbox", ], value="Chatterbox")
214
  #with gr.Column():
215
  # pass # Just for spacing
@@ -231,6 +232,7 @@ with gr.Blocks(title="Voice Consent Gate") as demo:
231
  value="auto",
232
  label="Device preference"
233
  )
 
234
  pass_threshold = gr.Slider(0.50, 1.00, value=0.85, step=0.01,
235
  label="Match threshold")
236
 
@@ -257,8 +259,8 @@ with gr.Blocks(title="Voice Consent Gate") as demo:
257
  with gr.Column():
258
  gr.Markdown("## Audio input")
259
  # Prepopulating with the consent audio.
260
- # Set interactive=True to be able to change.
261
- tts_audio = gr.Audio(audio_input, type="filepath")
262
  with gr.Row():
263
  with gr.Column():
264
  gr.Markdown("## Text input")
@@ -281,7 +283,7 @@ with gr.Blocks(title="Voice Consent Gate") as demo:
281
  label="Temperature", value=.8)
282
  with gr.Row():
283
  clone_btn = gr.Button("Clone!")
284
- cloned_audio = gr.Audio()
285
  clone_btn.click(fn=clone_voice,
286
  inputs=[tts_audio, tts_text, exaggeration,
287
  cfg_weight, seed_num, temp],
 
179
  1. A way of generating novel consent sentences for the person whose voice will be cloned – the “speaker” – to say, making sure the sentence isn’t part of a previous recording but instead uniquely references the current consent context.
180
  2. An _automatic speech recognition (ASR) system_ that recognizes the sentence conveying consent.
181
  3. A _voice-cloning text-to-speech (TTS) system_ that takes as input text and the voice clonee’s speech snippets to generate speech.
182
+
183
  Some voice-cloning TTS systems can now generate speech similar to a speaker’s voice using _just one sentence_. This means that a sentence used for consent can **also** be used for voice cloning. We demonstrate one way to do that here.
184
  """)
185
  with gr.Row():
 
197
  )
198
  with gr.Column():
199
  consent_method = gr.Dropdown(
200
+ label="Sentence generation method (currently limited to Llama 3.2 3B Instruct)",
201
  choices=["Llama 3.2 3B Instruct"],
202
  value="Llama 3.2 3B Instruct"
203
  )
204
+ asr_model = gr.Dropdown(label="Speech recognition model (currently limited to Whisper)",
205
  choices=["openai/whisper-tiny.en", # fastest (CPU-friendly)
206
  "openai/whisper-base.en", # better accuracy, a bit slower
207
  "distil-whisper/distil-small.en"
 
210
  value="openai/whisper-tiny.en",
211
  )
212
  voice_clone_model = gr.Dropdown(
213
+ label="Voice cloning model (currently limited to Chatterbox)",
214
  choices=["Chatterbox", ], value="Chatterbox")
215
  #with gr.Column():
216
  # pass # Just for spacing
 
232
  value="auto",
233
  label="Device preference"
234
  )
235
+ # In your own code, do not provide users with the option to change this: Set it yourself.
236
  pass_threshold = gr.Slider(0.50, 1.00, value=0.85, step=0.01,
237
  label="Match threshold")
238
 
 
259
  with gr.Column():
260
  gr.Markdown("## Audio input")
261
  # Prepopulating with the consent audio.
262
+ # Setting interactive=False keeps it from being possible to upload something else.
263
+ tts_audio = gr.Audio(audio_input, type="filepath", interactive=False)
264
  with gr.Row():
265
  with gr.Column():
266
  gr.Markdown("## Text input")
 
283
  label="Temperature", value=.8)
284
  with gr.Row():
285
  clone_btn = gr.Button("Clone!")
286
+ cloned_audio = gr.Audio(show_download_button=True)
287
  clone_btn.click(fn=clone_voice,
288
  inputs=[tts_audio, tts_text, exaggeration,
289
  cfg_weight, seed_num, temp],
src/generate.py CHANGED
@@ -72,45 +72,44 @@ def _extract_llama_text(result: Any) -> str:
72
  return ""
73
 
74
 
75
- def gen_sentence(audio_model_name="Chatterbox"):
76
  """
77
  Always generate a sentence via the LLM.
 
78
  """
79
  try:
80
- return gen_sentence_llm(audio_model_name=audio_model_name)
81
  except Exception as e:
82
  # Show a helpful message directly in the Target sentence box
83
  return f"[ERROR calling LLM] {type(e).__name__}: {e}"
84
 
85
  # TODO: Support more than just Llama 3.2 3B Instruct
86
- def gen_sentence_llm(
87
- sentence_method: str = "Llama 3.2 3B Instruct",
88
- audio_model_name: str = "Chatterbox",
89
- *
90
- ) -> str:
91
- """
92
- Generate a consent sentence using the Llama 3.2 3B Instruct demo Space.
93
-
94
- This function constructs a prompt describing the linguistic and ethical
95
- requirements for a consent sentence (via `get_consent_generation_prompt`)
96
- and sends it to the Llama demo hosted on Hugging Face Spaces.
97
-
98
- The response is normalized into a single English sentence suitable
99
- for reading aloud.
100
-
101
- Parameters
102
- ----------
103
- audio_model_name : str, optional
104
- The name of the voice-cloning model to mention in the sentence.
105
- Defaults to "Chatterbox".
106
-
107
- Returns
108
- -------
109
- str
110
- A clean, human-readable consent sentence.
111
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  # Generate the full natural-language prompt that the LLM will receive
113
- prompt = get_consent_generation_prompt(audio_model_name)
114
 
115
  try:
116
  # Initialize Gradio client for the Llama demo Space
 
72
  return ""
73
 
74
 
75
+ def gen_sentence(consent_method="Llama 3.2 3B Instruct", voice_clone_model="Chatterbox"):
76
  """
77
  Always generate a sentence via the LLM.
78
+ :param consent_method: the sentence-generation method to use (currently only "Llama 3.2 3B Instruct" is supported).
79
  """
80
  try:
81
+ return gen_sentence_llm(consent_method, voice_clone_model)
82
  except Exception as e:
83
  # Show a helpful message directly in the Target sentence box
84
  return f"[ERROR calling LLM] {type(e).__name__}: {e}"
85
 
86
  # TODO: Support more than just Llama 3.2 3B Instruct
87
+ def gen_sentence_llm(consent_method="Llama 3.2 3B Instruct", voice_clone_model="Chatterbox") -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  """
89
+ Generate a consent sentence using the Llama 3.2 3B Instruct demo Space.
90
+
91
+ This function constructs a prompt describing the linguistic and ethical
92
+ requirements for a consent sentence (via `get_consent_generation_prompt`)
93
+ and sends it to the Llama demo hosted on Hugging Face Spaces.
94
+
95
+ The response is normalized into a single English sentence suitable
96
+ for reading aloud.
97
+
98
+ Parameters
99
+ ----------
100
+ consent_method : str, optional
101
+ The sentence-generation method to use. Currently only
102
+ "Llama 3.2 3B Instruct" is supported; defaults to it.
103
+ voice_clone_model : str, optional
104
+ The voice-cloning model to mention in the sentence. Defaults to "Chatterbox".
105
+
106
+ Returns
107
+ -------
108
+ str
109
+ A clean, human-readable consent sentence.
110
+ """
111
  # Generate the full natural-language prompt that the LLM will receive
112
+ prompt = get_consent_generation_prompt(voice_clone_model)
113
 
114
  try:
115
  # Initialize Gradio client for the Llama demo Space