Build error
Update app.py
app.py
CHANGED
@@ -155,109 +155,7 @@ def audio_function():
     global speech
     speech = transcript
     return transcript,asr_outputs["chunks"],asr_outputs["text"]
-    return {
-        "speakers": transcript,
-        "chunks": asr_outputs["chunks"],
-        "text": asr_outputs["text"],
-    }
-    a=time.time()
-    DOMAIN_TYPE = "meeting" # Can be meeting or telephonic based on domain type of the audio file
-    CONFIG_FILE_NAME = f"diar_infer_{DOMAIN_TYPE}.yaml"
-
-    CONFIG_URL = f"https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/speaker_tasks/diarization/conf/inference/{CONFIG_FILE_NAME}"
-
-
-    CONFIG = wget.download(CONFIG_URL,"./")
-    cfg = OmegaConf.load(CONFIG)
-    # print(OmegaConf.to_yaml(cfg))
-
-
-    # Create a manifest file for input with below format.
-    # {"audio_filepath": "/path/to/audio_file", "offset": 0, "duration": null, "label": "infer", "text": "-",
-    # "num_speakers": null, "rttm_filepath": "/path/to/rttm/file", "uem_filepath"="/path/to/uem/filepath"}
-    import json
-    meta = {
-        'audio_filepath': "current_out.wav",
-        'offset': 0,
-        'duration': None,
-        'label': 'infer',
-        'text': '-',
-        'num_speakers': None,
-        'rttm_filepath': None,
-        'uem_filepath': None
-    }
-    with open(os.path.join('input_manifest.json'),'w') as fp:
-        json.dump(meta,fp)
-        fp.write('\n')
-
-    cfg.diarizer.manifest_filepath = 'input_manifest.json'
-    cfg.diarizer.out_dir = "./" # Directory to store intermediate files and prediction outputs
-    pretrained_speaker_model = 'titanet_large'
-    cfg.diarizer.speaker_embeddings.model_path = pretrained_speaker_model
-    cfg.diarizer.speaker_embeddings.parameters.window_length_in_sec = [1.5,1.25,1.0,0.75,0.5]
-    cfg.diarizer.speaker_embeddings.parameters.shift_length_in_sec = [0.75,0.625,0.5,0.375,0.1]
-    cfg.diarizer.speaker_embeddings.parameters.multiscale_weights = [1,1,1,1,1]
-    cfg.diarizer.oracle_vad = True # ----> ORACLE VAD
-    cfg.diarizer.clustering.parameters.oracle_num_speakers = False
-    # cfg.diarizer.manifest_filepath = 'input_manifest.json'
-    # # !cat {cfg.diarizer.manifest_filepath}
-    # pretrained_speaker_model='titanet_large'
-    # cfg.diarizer.manifest_filepath = cfg.diarizer.manifest_filepath
-    # cfg.diarizer.out_dir = "./" # Directory to store intermediate files and prediction outputs
-    # cfg.diarizer.speaker_embeddings.model_path = pretrained_speaker_model
-    # cfg.diarizer.clustering.parameters.oracle_num_speakers=False
-
-    # Using Neural VAD and Conformer ASR
-    cfg.diarizer.vad.model_path = 'vad_multilingual_marblenet'
-    cfg.diarizer.asr.model_path = 'stt_en_conformer_ctc_large'
-    cfg.diarizer.oracle_vad = False # ----> Not using oracle VAD
-    cfg.diarizer.asr.parameters.asr_based_vad = False
-
-
-    asr_decoder_ts = ASRDecoderTimeStamps(cfg.diarizer)
-    asr_model = asr_decoder_ts.set_asr_model()
-    print(asr_model)
-    word_hyp, word_ts_hyp = asr_decoder_ts.run_ASR(asr_model)
-
-    print("Decoded word output dictionary: \n", word_hyp)
-    print("Word-level timestamps dictionary: \n", word_ts_hyp)
-
-
-    asr_diar_offline = OfflineDiarWithASR(cfg.diarizer)
-    asr_diar_offline.word_ts_anchor_offset = asr_decoder_ts.word_ts_anchor_offset
-
-    diar_hyp, diar_score = asr_diar_offline.run_diarization(cfg, word_ts_hyp)
-    print("Diarization hypothesis output: \n", diar_hyp)
-    trans_info_dict = asr_diar_offline.get_transcript_with_speaker_labels(diar_hyp, word_hyp, word_ts_hyp)
-    # print(trans_info_dict)
-
-    # with open(os.path.join('output_diarization.json'),'w') as fp1:
-    #     json.dump(trans_info_dict,fp1)
-    #     fp1.write('\n')
-    # b = time.time()
-    # print(b-a,"seconds diartization time for 50 min audio")
-
-
-    import json
-    context = ""
-    context_2 = ""
-    # global context_2
-    # with open("output.json","r") as fli:
-    #     json_dict = json.load(fli)
-    # for lst in sorted(json_dict["speakers"], key=lambda x: x['timestamp'][0], reverse=False):
-    #     context = context + str(lst["timestamp"][0])+" : "+str(lst["timestamp"][1]) + " = " + lst["text"]+"\n"
-    #     context = context + str(lst["timestamp"][0])+" : "+str(lst["timestamp"][1]) + " = " + lst["speaker"]+" ; "+ lst["text"]+"\n"
-    for dct in trans_info_dict["current_out"]["sentences"]:
-        # context = context + "start_time : {} ".format(dct["start_time"]) + "end_time : {} ".format(dct["end_time"]) + "speaker : {} ".format(dct["speaker"]) + "\n"
-        context = context + str(dct["start_time"])+" : "+str(dct["end_time"]) + " = " + dct["speaker"]+" ; "+ dct["text"]+"\n"
-        context_2 = context_2 + str(dct["start_time"])+" : "+str(dct["end_time"]) + " = "+ dct["text"]+"\n"
-    global speech
-    speech = trans_info_dict["current_out"]["transcription"]
-
-    time_2 = time.time()
-
-    return context,context_2,str(int(time_2-time_1)) + " seconds"
-
+
 def audio_function2():
     # Call the function and return its result to be displayed
 