Build error
Update app.py
app.py
CHANGED
@@ -155,109 +155,7 @@ def audio_function():
     global speech
     speech = transcript
     return transcript,asr_outputs["chunks"],asr_outputs["text"]
-    return {
-        "speakers": transcript,
-        "chunks": asr_outputs["chunks"],
-        "text": asr_outputs["text"],
-    }
-    a=time.time()
-    DOMAIN_TYPE = "meeting" # Can be meeting or telephonic based on domain type of the audio file
-    CONFIG_FILE_NAME = f"diar_infer_{DOMAIN_TYPE}.yaml"
-
-    CONFIG_URL = f"https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/speaker_tasks/diarization/conf/inference/{CONFIG_FILE_NAME}"
-
-
-    CONFIG = wget.download(CONFIG_URL,"./")
-    cfg = OmegaConf.load(CONFIG)
-    # print(OmegaConf.to_yaml(cfg))
-
-
-    # Create a manifest file for input with below format.
-    # {"audio_filepath": "/path/to/audio_file", "offset": 0, "duration": null, "label": "infer", "text": "-",
-    # "num_speakers": null, "rttm_filepath": "/path/to/rttm/file", "uem_filepath"="/path/to/uem/filepath"}
-    import json
-    meta = {
-        'audio_filepath': "current_out.wav",
-        'offset': 0,
-        'duration': None,
-        'label': 'infer',
-        'text': '-',
-        'num_speakers': None,
-        'rttm_filepath': None,
-        'uem_filepath': None
-    }
-    with open(os.path.join('input_manifest.json'),'w') as fp:
-        json.dump(meta,fp)
-        fp.write('\n')
-
-    cfg.diarizer.manifest_filepath = 'input_manifest.json'
-    cfg.diarizer.out_dir = "./" # Directory to store intermediate files and prediction outputs
-    pretrained_speaker_model = 'titanet_large'
-    cfg.diarizer.speaker_embeddings.model_path = pretrained_speaker_model
-    cfg.diarizer.speaker_embeddings.parameters.window_length_in_sec = [1.5,1.25,1.0,0.75,0.5]
-    cfg.diarizer.speaker_embeddings.parameters.shift_length_in_sec = [0.75,0.625,0.5,0.375,0.1]
-    cfg.diarizer.speaker_embeddings.parameters.multiscale_weights = [1,1,1,1,1]
-    cfg.diarizer.oracle_vad = True # ----> ORACLE VAD
-    cfg.diarizer.clustering.parameters.oracle_num_speakers = False
-    # cfg.diarizer.manifest_filepath = 'input_manifest.json'
-    # # !cat {cfg.diarizer.manifest_filepath}
-    # pretrained_speaker_model='titanet_large'
-    # cfg.diarizer.manifest_filepath = cfg.diarizer.manifest_filepath
-    # cfg.diarizer.out_dir = "./" # Directory to store intermediate files and prediction outputs
-    # cfg.diarizer.speaker_embeddings.model_path = pretrained_speaker_model
-    # cfg.diarizer.clustering.parameters.oracle_num_speakers=False
-
-    # Using Neural VAD and Conformer ASR
-    cfg.diarizer.vad.model_path = 'vad_multilingual_marblenet'
-    cfg.diarizer.asr.model_path = 'stt_en_conformer_ctc_large'
-    cfg.diarizer.oracle_vad = False # ----> Not using oracle VAD
-    cfg.diarizer.asr.parameters.asr_based_vad = False
-
-
-    asr_decoder_ts = ASRDecoderTimeStamps(cfg.diarizer)
-    asr_model = asr_decoder_ts.set_asr_model()
-    print(asr_model)
-    word_hyp, word_ts_hyp = asr_decoder_ts.run_ASR(asr_model)
-
-    print("Decoded word output dictionary: \n", word_hyp)
-    print("Word-level timestamps dictionary: \n", word_ts_hyp)
-
-
-    asr_diar_offline = OfflineDiarWithASR(cfg.diarizer)
-    asr_diar_offline.word_ts_anchor_offset = asr_decoder_ts.word_ts_anchor_offset
-
-    diar_hyp, diar_score = asr_diar_offline.run_diarization(cfg, word_ts_hyp)
-    print("Diarization hypothesis output: \n", diar_hyp)
-    trans_info_dict = asr_diar_offline.get_transcript_with_speaker_labels(diar_hyp, word_hyp, word_ts_hyp)
-    # print(trans_info_dict)
-
-    # with open(os.path.join('output_diarization.json'),'w') as fp1:
-    #     json.dump(trans_info_dict,fp1)
-    #     fp1.write('\n')
-    # b = time.time()
-    # print(b-a,"seconds diartization time for 50 min audio")
-
-
-    import json
-    context = ""
-    context_2 = ""
-    # global context_2
-    # with open("output.json","r") as fli:
-    #     json_dict = json.load(fli)
-    # for lst in sorted(json_dict["speakers"], key=lambda x: x['timestamp'][0], reverse=False):
-    #     context = context + str(lst["timestamp"][0])+" : "+str(lst["timestamp"][1]) + " = " + lst["text"]+"\n"
-    #     context = context + str(lst["timestamp"][0])+" : "+str(lst["timestamp"][1]) + " = " + lst["speaker"]+" ; "+ lst["text"]+"\n"
-    for dct in trans_info_dict["current_out"]["sentences"]:
-        # context = context + "start_time : {} ".format(dct["start_time"]) + "end_time : {} ".format(dct["end_time"]) + "speaker : {} ".format(dct["speaker"]) + "\n"
-        context = context + str(dct["start_time"])+" : "+str(dct["end_time"]) + " = " + dct["speaker"]+" ; "+ dct["text"]+"\n"
-        context_2 = context_2 + str(dct["start_time"])+" : "+str(dct["end_time"]) + " = "+ dct["text"]+"\n"
-    global speech
-    speech = trans_info_dict["current_out"]["transcription"]
-
-    time_2 = time.time()
-
-    return context,context_2,str(int(time_2-time_1)) + " seconds"
-
+
 def audio_function2():
     # Call the function and return its result to be displayed
 