import gradio as gr

from gradio_client import Client, handle_file

import src.generate as generate
import src.process as process

# Shared handle to the external Chatterbox Space. It is created in show_tts()
# once consent audio has been recorded, then used by clone_voice().
client = None

GATE_IMAGE_PATH = "./assets/voice_consent_gate_50.png"

# TODO: Ideally, instead of the Client method we're using for an external
# voice-cloning app, we use the .load() function and pass in arguments to it
# directly while displaying the developer's desired UI, e.g.:
# chatterbox_space = gr.load("spaces/ResembleAI/Chatterbox")
# ------------------- UI printing functions -------------------
def clear_all():
    # target, user_transcript, score_html, result_html, diff_html, tts_ui
    return "", "", "", "", "", gr.Row.update(visible=False)


def make_result_html(pass_threshold, passed, ratio):
    """Returns HTML summarizing results.
    Parameters:
        pass_threshold: Minimum percentage of match between target and
        recognized user utterance that counts as passing.
        passed: Whether the recognized user utterance is >= `pass_threshold`.
        ratio: Sequence match ratio.
    """
    summary = (
        f"✅ Correct (≥ {int(pass_threshold * 100)}%)"
        if passed else
        f"❌ Not a match (need ≥ {int(pass_threshold * 100)}%)"
    )
    score = f"Similarity: {ratio * 100:.1f}%"
    return summary, score
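
# Example of the values produced above, given pass_threshold=0.85 and ratio=0.912:
#   make_result_html(0.85, True, 0.912)
#   -> ("✅ Correct (≥ 85%)", "Similarity: 91.2%")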


def make_alignment_html(ref_tokens, hyp_tokens, alignments):
    """Returns HTML showing alignment between the target and recognized user audio.
    Parameters:
        ref_tokens: Target sentence for the user to say, tokenized.
        hyp_tokens: Recognized utterance from the user, tokenized.
        alignments: Tuples of alignment pattern (equal, delete, insert) and corresponding indices in `hyp_tokens`.
    """
    out = []
    no_match_html = ' <span style="background:#ffe0e0;text-decoration:line-through;">'
    match_html = ' <span style="background:#e0ffe0;">'
    for span in alignments:
        op, i1, i2, j1, j2 = span
        ref_string = " ".join(ref_tokens[i1:i2])
        hyp_string = " ".join(hyp_tokens[j1:j2])
        if op == "equal":
            out.append(" " + ref_string)
        elif op == "delete":
            out.append(no_match_html + ref_string + "</span>")
        elif op == "insert":
            out.append(match_html + hyp_string + "</span>")
        elif op == "replace":
            out.append(no_match_html + ref_string + "</span>")
            out.append(match_html + hyp_string + "</span>")
    html = ('<div style="line-height:1.6;font-size:1rem;">'
            + "".join(out).strip() + "</div>")
    return html
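
# For example, with difflib-style opcodes, a target of "the quick brown fox"
# versus a recognized "the brown fox" yields a ("delete", 1, 2, 1, 1) span, so
# "quick" is wrapped in the red strikethrough style while the rest is equal.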


def make_html(sentence_match):
    """Creates the HTML written out to the UI based on the results.
    Parameters:
        sentence_match: Class that stores the features of the target - user utterance alignment
    Returns:
        diff_html: An HTML string showing how the target sentence and recognized user utterance matches.
        result_html: An HTML string summarizing the results of the match between target and user utterance.
    """
    diff_html = make_alignment_html(sentence_match.target_tokens,
                                    sentence_match.user_tokens,
                                    sentence_match.alignments)
    result_html, score_html = make_result_html(sentence_match.pass_threshold,
                                               sentence_match.passed,
                                               sentence_match.ratio)

    return score_html, result_html, diff_html


# ------------------- Core Check (Currently English-only) -------------------
# @spaces.GPU
def get_user_transcript(audio_path: str, target_sentence: str,
                        asr_model_id: str, device_pref: str) -> tuple[str, str]:
    """ASR for the input audio and basic validation.
    Uses the selected ASR model `asr_model_id` to recognize words in the input `audio_path`.
    Parameters:
        audio_path: Local file path to the recorded audio, as returned by the
            gradio Audio component (None if nothing has been recorded).
        target_sentence: Sentence the user needs to say.
        asr_model_id: Desired ASR model.
        device_pref: Preferred ASR processing device. Can be "auto", "cpu", "cuda".
    Returns:
        error_msg: If there's an error, a string describing what happened.
        user_transcript: The recognized user utterance.
    """
    # Handles user interaction errors.
    if not target_sentence:
        return "Please generate a sentence first.", ""
    # TODO: Automatically stop the recording if someone presses the Transcribe & Check button.
    if audio_path is None:
        return "Please start, record, then stop the audio recording before trying to transcribe.", ""

    # Runs the automatic speech recognition
    user_transcript = process.run_asr(audio_path, asr_model_id, device_pref)

    # Handles processing errors.
    if isinstance(user_transcript, Exception):
        return f"Transcription failed: {user_transcript}", ""
    return "", user_transcript


def transcribe_check(audio_path, target_sentence, asr_model_id, device_pref,
                     pass_threshold):
    """Transcribe user, calculate match to target sentence, create results HTML.
    Parameters:
        audio_path: Local path to recorded audio.
        target_sentence: Sentence the user needs to say.
        asr_model_id: Desired ASR model.
        device_pref: Preferred ASR processing device. Can be "auto", "cpu", "cuda".
    Returns:
        user_transcript: The recognized user utterance
        score_html: HTML string to display the score
        diff_html: HTML string for displaying the differences between target and user utterance
        result_html: HTML string describing the results, or an error message
        clone_audio: Bool for whether to allow audio cloning: This makes the audio cloning components visible
    """
    clone_audio = False
    # Transcribe user input
    error_msg, user_transcript = get_user_transcript(audio_path,
                                                     target_sentence,
                                                     asr_model_id,
                                                     device_pref)
    if error_msg:
        score_html = ""
        diff_html = ""
        result_html = error_msg
    else:
        # Calculate match details between the target and recognized user input
        sentence_match = process.SentenceMatcher(target_sentence,
                                                 user_transcript,
                                                 pass_threshold)
        if sentence_match.passed:
            clone_audio = True
        # Create the output to print out
        score_html, result_html, diff_html = make_html(sentence_match)

    return (user_transcript, score_html, result_html, diff_html,
            gr.Row(visible=clone_audio))


def clone_voice(audio_input, text_input, exaggeration_input, cfgw_input,
                seed_num_input, temperature_input):
    global client
    # Additional specifications for Chatterbox include:
    # exaggeration_input=0.5,
    # temperature_input=0.8,
    # seed_num_input=0,
    # cfgw_input=0.5,
    # api_name="/generate_tts_audio"
    return client.predict(text_input=text_input,
                          audio_prompt_path_input=handle_file(audio_input),
                          exaggeration_input=exaggeration_input,
                          cfgw_input=cfgw_input,
                          seed_num_input=seed_num_input,
                          temperature_input=temperature_input)
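
# Note: client.predict() blocks until the Space responds. For the audio output
# of the endpoint above, gradio_client typically downloads the result and
# returns a local file path, which the gr.Audio output component can render.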


# ------------------- UI -------------------
with gr.Blocks(title="Voice Consent Gate") as demo:
    gr.Markdown("# Voice Consent Gate: Demo")
    with gr.Row():
        with gr.Column():
            gr.Image(GATE_IMAGE_PATH, interactive=False, show_download_button=False)
        with gr.Column():
            with gr.Accordion(
                    label="Click for further information on this demo",
                    open=False):
                gr.Markdown("""
                    To create a basic voice cloning system with a voice consent gate, you need three parts:
                    1. A way of generating novel consent sentences for the person whose voice will be cloned – the “speaker” – to say, uniquely referencing the current consent context.
                    2. An _automatic speech recognition (ASR) system_ that recognizes the sentence conveying consent.
                    3. A _voice-cloning text-to-speech (TTS) system_ that takes as input text and the speaker's speech snippets to generate speech.
                    
                    Since some voice-cloning TTS systems can now generate speech similar to a speaker’s voice using _just one sentence_, a sentence used for consent can **also** be used for voice cloning. 
                    """)
    with gr.Row():
        with gr.Column(scale=2):
            gr.Markdown(
                """# 🎤 Say the Sentence (English)"""
            )
            gr.Markdown(
            """
            ## 1) Generate a sentence.  
            ## 2) Record yourself reading it.  
            ## 3) Transcribe & check your accuracy.  
            ## 4) If matched, clone your voice to speak any sentence you enter.
            """
            )
        with gr.Column():
            consent_method = gr.Dropdown(
                label="Sentence generation method (currently limited to Llama 3.2 3B Instruct)",
                choices=["Llama 3.2 3B Instruct"],
                value="Llama 3.2 3B Instruct"
            )
            asr_model = gr.Dropdown(
                label="Speech recognition model (currently limited to Whisper)",
                choices=["openai/whisper-tiny.en",  # fastest (CPU-friendly)
                         "openai/whisper-base.en",  # better accuracy, a bit slower
                         "distil-whisper/distil-small.en",  # optional distil English model
                         ],
                value="openai/whisper-tiny.en",
            )
            voice_clone_model = gr.Dropdown(
                label="Voice cloning model (currently limited to Chatterbox)",
                choices=["Chatterbox", ], value="Chatterbox")
    with gr.Row():
        target = gr.Textbox(label="Target sentence", interactive=False,
                            placeholder="Click 'Generate sentence'")
    with gr.Row():
        btn_gen = gr.Button("🎲 Generate sentence", variant="primary")
        btn_clear = gr.Button("🧹 Clear")
    with gr.Row():
        consent_audio = gr.Audio(sources=["microphone"], type="filepath",
                                 label="Record your voice", key='consent_audio')
    with gr.Accordion("Advanced ASR settings", open=False):
        device_pref = gr.Radio(
            choices=["auto", "cpu", "cuda"],
            value="auto",
            label="Device preference"
        )
        # In your own code, do not provide users with the option to change this: Set it yourself.
        pass_threshold = gr.Slider(0.50, 1.00, value=0.85, step=0.01,
                                   label="Match threshold")
    with gr.Row():
        btn_check = gr.Button("✅ Transcribe & Check", variant="primary")
    with gr.Row():
        user_transcript = gr.Textbox(label="Transcription", interactive=False)
    with gr.Row():
        score_html = gr.Label(label="Score")
        result_html = gr.Label(label="Result")
    diff_html = gr.HTML(
        label="Word-level diff (red = expected but missing / green = extra or replacement)")

    gr.Markdown("## 🔁 Voice Consent Gate (opens upon consent)")
    # TODO: Ideally this is gr.Blocks, but that seems to have a visibility-change bug.
    with gr.Row(visible=False) as tts_ui:
        # Using the render decorator so that we can access consent audio after it's recorded.
        @gr.render(inputs=consent_audio)
        def show_tts(audio_input):
            global client
            if audio_input:
                client = Client("ResembleAI/Chatterbox")
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("## Audio input")
                        # Prepopulating with the consent audio.
                        # Setting interactive=False prevents a different file from being uploaded.
                        tts_audio = gr.Audio(audio_input, type="filepath", interactive=False)
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("## Text input")
                        tts_text = gr.Textbox(
                            "Now let's make my mum's favourite. So three mars bars into the pan. Then we add the tuna and just stir for a bit, just let the chocolate and fish infuse. A sprinkle of olive oil and some tomato ketchup. Now smell that. Oh boy this is going to be incredible.", interactive=True)
                with gr.Row():
                    # TODO: Ideally, these options aren't hardcoded -- e.g., using .load(), where they're imported, allowing for different options depending on the client.
                    with gr.Accordion("More options", open=False):
                        exaggeration = gr.Slider(
                            0.25, 2, step=.05,
                            label="Exaggeration (Neutral = 0.5, extreme values can be unstable)",
                            value=.5
                        )
                        cfg_weight = gr.Slider(
                            0.2, 1, step=.05, label="CFG/Pace", value=0.5
                        )
                        seed_num = gr.Number(value=0,
                                             label="Random seed (0 for random)")
                        temp = gr.Slider(0.05, 5, step=.05,
                                         label="Temperature", value=.8)
                with gr.Row():
                    clone_btn = gr.Button("Clone!")
                    cloned_audio = gr.Audio(show_download_button=True)
                    clone_btn.click(fn=clone_voice,
                                    inputs=[tts_audio, tts_text, exaggeration,
                                            cfg_weight, seed_num, temp],
                                    outputs=[cloned_audio])

    # -------- Events --------
    # Generate sentence: including model name + detailed prompt
    btn_gen.click(
        fn=generate.gen_sentence,
        inputs=[consent_method, voice_clone_model],
        outputs=target
    )

    btn_clear.click(
        fn=clear_all,
        outputs=[target, user_transcript, score_html, result_html, diff_html,
                 tts_ui]
    )

    btn_check.click(
        fn=transcribe_check,
        inputs=[consent_audio, target, asr_model, device_pref, pass_threshold],
        outputs=[user_transcript, score_html, result_html, diff_html, tts_ui]
    )


if __name__ == "__main__":
    demo.launch(show_error=True)