Spaces:
Build error
Build error
File size: 4,797 Bytes
d9f919a d68b1ee 81d7107 e1aefcd d68b1ee 81d7107 feba9aa d68b1ee 81d7107 d68b1ee 9b7eef6 feba9aa d68b1ee feba9aa d68b1ee feba9aa d68b1ee 81d7107 1ac7d5c 81d7107 1a28529 fb5c3ca 81d7107 a0314cc 81d7107 a0314cc 81d7107 a0314cc 81d7107 d68b1ee 81d7107 a0314cc d68b1ee a0314cc d68b1ee |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 |
from nemo.collections.asr.models.msdd_models import NeuralDiarizer
from nemo.collections.asr.models import EncDecRNNTBPEModel
from nemo.collections.asr.models import EncDecSpeakerLabelModel
import gradio as gr
import pandas as pd
import torch
import json
from omegaconf import OmegaConf
import uuid
# Run on the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Diarization model invoked by run_diarization(); loaded once at import time.
model = NeuralDiarizer.from_pretrained("diar_msdd_telephonic")
model = model.to(device)
model.eval()

# NOTE(review): speaker_model is loaded here but not referenced anywhere in
# the visible code -- confirm whether it is still needed.
speaker_model = EncDecSpeakerLabelModel.from_pretrained("nvidia/speakerverification_en_titanet_large")
speaker_model = speaker_model.to(device)
def _merge_rttm_segments(rttm_lines):
    """Turn RTTM lines into ``[start_time, end_time, speaker, text]`` rows.

    Consecutive lines that carry the same speaker label are collapsed into a
    single row whose end time is the end of the last line in the run.
    """
    rows = []
    for line in rttm_lines:
        fields = line.split()
        # Whitespace-separated fields 3, 4 and 7 are read as the segment
        # start, its duration and the speaker label.
        start_time, duration, speaker = float(fields[3]), float(fields[4]), fields[7]
        end_time = start_time + duration
        if rows and rows[-1][2] == speaker:
            # Same speaker as the previous row: extend it instead of adding one.
            rows[-1][1] = end_time
        else:
            rows.append([start_time, end_time, speaker, ''])
    return rows


def run_diarization(path1):
    """Diarize the audio at ``path1`` and transcribe every speaker turn.

    Args:
        path1: Audio file path handed to the module-level diarization model.

    Returns:
        pandas.DataFrame with columns ``start_time``, ``end_time``,
        ``speaker`` and ``text``: one row per run of consecutive segments from
        the same speaker. When the model output contains no segments, a single
        ``0, 0, 'No speaker found', ''`` row is returned.

    Raises:
        RuntimeError: if the number of transcripts returned by
            ``get_transcripts`` differs from the number of rows.
    """
    print(path1)
    annotation = model(path1, num_workers=0, batch_size=16)
    columns = ['start_time', 'end_time', 'speaker', 'text']

    rows = _merge_rttm_segments(annotation.to_rttm().splitlines())
    if not rows:
        # The placeholder row must supply a value for every column; a
        # three-value row makes pandas raise instead of returning the message.
        return pd.DataFrame([[0, 0, 'No speaker found', '']], columns=columns)

    df = pd.DataFrame(rows, columns=columns)
    hyp = get_transcripts(df, path1)
    # Explicit check instead of ``assert`` so it still runs under ``python -O``.
    if len(hyp) != len(df):
        raise RuntimeError(
            f"expected {len(df)} transcripts, got {len(hyp)}"
        )
    df['text'] = list(hyp)
    return df
def create_manifest(df, audio_path):
    """Write a JSON-lines manifest describing every row of ``df``.

    Each line is a JSON object with the keys ``audio_filepath`` (always
    ``audio_path``), ``duration`` (``end_time - start_time``), ``label`` (the
    row's ``speaker``) and ``offset`` (the row's ``start_time``).

    Args:
        df: DataFrame providing ``start_time``, ``end_time`` and ``speaker``
            columns.
        audio_path: Path stored verbatim in every manifest entry.

    Returns:
        Path of the newly created file, ``/tmp/<uuid4>.json``. The caller is
        responsible for deleting it.
    """
    filename = '/tmp/' + str(uuid.uuid4()) + '.json'
    segments = zip(df['start_time'], df['end_time'], df['speaker'])
    with open(filename, 'w', encoding='utf-8') as f:
        for start_time, end_time, speaker in segments:
            # Coerce to built-in floats: NumPy integer scalars (possible with
            # integer-typed columns) are not JSON serialisable.
            offset = float(start_time)
            entry = {
                "audio_filepath": audio_path,
                "duration": float(end_time) - offset,
                "label": speaker,
                "offset": offset,
            }
            f.write(json.dumps(entry) + '\n')
    return filename
def get_transcripts(df, audio_path):
    """Transcribe every segment listed in ``df`` from the audio at ``audio_path``.

    A temporary manifest is built with ``create_manifest``, fed through the
    ASR model in batches of 4, and removed again before returning.

    Args:
        df: DataFrame of segments (``start_time``, ``end_time``, ``speaker``).
        audio_path: Path of the audio file the segments refer to.

    Returns:
        List of the best hypotheses accumulated over all batches
        (presumably one per manifest row, in manifest order -- the caller
        verifies the length).
    """
    import os  # local import so this block needs no change to the file header

    filename = create_manifest(df, audio_path)
    try:
        # NOTE(review): the ASR model is re-loaded on every call; caching it
        # would be faster but keeps it resident in memory -- decide per host.
        asr_model = EncDecRNNTBPEModel.from_pretrained(
            model_name="nvidia/stt_en_fastconformer_transducer_large"
        ).to(device)
        asr_model.eval()

        config = OmegaConf.create({"manifest_filepath": filename, 'batch_size': 4})
        dataloader = asr_model._setup_transcribe_dataloader(config)

        hypotheses = []
        # Inference only: skip autograd bookkeeping to reduce memory use.
        with torch.no_grad():
            for test_batch in dataloader:
                encoded, encoded_len = asr_model.forward(
                    input_signal=test_batch[0].to(device),
                    input_signal_length=test_batch[1].to(device),
                )
                best_hyp, _ = asr_model.decoding.rnnt_decoder_predictions_tensor(
                    encoded,
                    encoded_len,
                    return_hypotheses=False,
                    partial_hypotheses=None,
                )
                hypotheses += best_hyp
                # Drop batch tensors eagerly before the next batch is loaded.
                del encoded
                del test_batch
        return hypotheses
    finally:
        # Best-effort cleanup so manifests do not pile up in /tmp; a failure
        # to delete must not mask the transcription result or error.
        try:
            os.remove(filename)
        except OSError:
            pass
# HTML snippet passed as the ``article`` argument of both interfaces.
# The link icons are written as ASCII-only escapes so they survive any
# re-encoding of this file:
#   U+1F399 U+FE0F          studio microphone
#   U+1F4C4                 page facing up
#   U+1F9D1 U+200D U+1F4BB  technologist
# NOTE(review): the icon of the paper link could not be recovered from the
# garbled text it replaces; the page icon is a stand-in -- confirm.
article = (
    "<p style='text-align: center'>"
    "<a href='https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/diar_msdd_telephonic' target='_blank'>\U0001F399\uFE0F Learn more about MSDD model</a> | "
    "<a href='https://arxiv.org/abs/2203.15974' target='_blank'>\U0001F4C4 MSDD paper</a> | "
    "<a href='https://github.com/NVIDIA/NeMo' target='_blank'>\U0001F9D1\u200D\U0001F4BB Repository</a>"
    "</p>"
)
# Sample inputs passed as ``examples=`` to both interfaces; every entry is a
# one-element list holding the audio path for the single input component.
examples = [
    [wav_path]
    for wav_path in (
        "data/conversation.wav",
        "data/id10270_5r0dWxy17C8-00001.wav",
    )
]
# Interface fed from a microphone recording; the audio component passes a
# file path to run_diarization and the resulting DataFrame is shown as a table.
_mic_audio = gr.Audio(
    label="Mic Audio",
    source="microphone",
    type="filepath",
    optional=True,
)
microphone_interface = gr.Interface(
    title="Offline Speaker Diarization with NeMo",
    description="This demonstration will perform offline speaker diarization on an audio file using nemo",
    article=article,
    examples=examples,
    fn=run_diarization,
    inputs=[_mic_audio],
    outputs=[gr.components.Dataframe()],
    layout="vertical",
    theme="huggingface",
    live=False,
    allow_flagging=False,
)
# Interface fed from an uploaded file; the audio component passes a file path
# to run_diarization and the resulting DataFrame is shown as a table.
_upload_audio = gr.Audio(
    label='Upload File',
    source="upload",
    type='filepath',
    optional=True,
)
upload_interface = gr.Interface(
    title="Offline Speaker Diarization with NeMo",
    description="This demonstration will perform offline speaker diarization on an audio file using nemo",
    article=article,
    examples=examples,
    fn=run_diarization,
    inputs=[_upload_audio],
    outputs=[gr.components.Dataframe()],
    layout="vertical",
    theme="huggingface",
    live=False,
    allow_flagging=False,
)
# Combine the two interfaces into one tabbed app (tab titles are paired with
# the interfaces by position) and start it with queueing enabled.
_tab_interfaces = [microphone_interface, upload_interface]
_tab_titles = ["Microphone", "Upload File"]
demo = gr.TabbedInterface(_tab_interfaces, _tab_titles)
demo.launch(enable_queue=True)
|