from nemo.collections.asr.models.msdd_models import NeuralDiarizer
from nemo.collections.asr.models import EncDecRNNTBPEModel
from nemo.collections.asr.models import EncDecSpeakerLabelModel
import gradio as gr
import pandas as pd 
import torch
import json
from omegaconf import OmegaConf
import uuid
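# This app runs NeMo MSDD speaker diarization on an input recording, transcribes each
# speaker turn with a FastConformer RNNT model, and shows the result in a Gradio UI.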

device = "cuda" if torch.cuda.is_available() else "cpu"

model = NeuralDiarizer.from_pretrained("diar_msdd_telephonic").to(device)
speaker_model = EncDecSpeakerLabelModel.from_pretrained("nvidia/speakerverification_en_titanet_large").to(device)
model.eval()

def run_diarization(path1):
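    """Diarize the audio at ``path1`` with the MSDD model, merge consecutive
    segments from the same speaker, transcribe each turn, and return a
    DataFrame with columns (start_time, end_time, speaker, text)."""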
    print(path1)
    annotation = model(path1, num_workers=0, batch_size=16)
    rttm = annotation.to_rttm()
    df = pd.DataFrame(columns=['start_time', 'end_time', 'speaker', 'text'])
    lines = rttm.splitlines()
    if len(lines) == 0:
        df.loc[0] = 0, 0, 'No speaker found', ''
        return df
    # RTTM fields: column 3 is the segment start, column 4 the duration, column 7 the speaker label.
    start_time, duration, prev_speaker = float(lines[0].split()[3]), float(lines[0].split()[4]), lines[0].split()[7]
    end_time = start_time + duration
    df.loc[0] = start_time, end_time, prev_speaker, ''

    # Merge consecutive segments from the same speaker into a single turn.
    for line in lines[1:]:
        split = line.split()
        start_time, duration, cur_speaker = float(split[3]), float(split[4]), split[7]
        end_time = start_time + duration
        if cur_speaker == prev_speaker:
            df.loc[df.index[-1], 'end_time'] = end_time
        else:
            df.loc[len(df)] = start_time, end_time, cur_speaker, ''
        prev_speaker = cur_speaker
    
    hyp = get_transcripts(df, path1)

    assert len(hyp) == len(df)

    for i in range(len(df)):
        df.loc[i, 'text'] = hyp[i]

    return df

def create_manifest(df, audio_path):
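    """Write a NeMo-style manifest (one JSON object per line) describing the
    diarized segments and return its path.

    Each line looks like (values are illustrative):
        {"audio_filepath": "/path/audio.wav", "duration": 2.5, "label": "speaker_0", "offset": 0.0}
    """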

    filename = '/tmp/' + str(uuid.uuid4()) + '.json'
    with open(filename, 'w') as f:
        for i in range(len(df)):
            start_time = df.iloc[i]['start_time']
            end_time = df.iloc[i]['end_time']
            speaker = df.iloc[i]['speaker']
            dic = {"audio_filepath": audio_path, "duration": end_time-start_time, "label": speaker, "offset": start_time}
            json.dump(dic, f)
            f.write('\n')

    return filename

def get_transcripts(df, audio_path):
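    """Transcribe each diarized segment with a pretrained FastConformer RNNT
    model and return one hypothesis string per row of ``df``."""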
    
    filename = create_manifest(df, audio_path)
    # Load the ASR model for transcription (a separate name avoids shadowing the global diarization model).
    asr_model = EncDecRNNTBPEModel.from_pretrained(model_name="nvidia/stt_en_fastconformer_transducer_large").to(device)
    asr_model.eval()
    config = OmegaConf.create({"manifest_filepath": filename, "batch_size": 4})
    dataloader = asr_model._setup_transcribe_dataloader(config)

    hypotheses = []
    all_hypotheses = []

    for test_batch in dataloader:
        encoded, encoded_len = asr_model.forward(
            input_signal=test_batch[0].to(device), input_signal_length=test_batch[1].to(device)
        )
        best_hyp, all_hyp = asr_model.decoding.rnnt_decoder_predictions_tensor(
            encoded,
            encoded_len,
            return_hypotheses=False,
            partial_hypotheses=None,
        )

        hypotheses += best_hyp
        if all_hyp is not None:
            all_hypotheses += all_hyp
        else:
            all_hypotheses += best_hyp

        del encoded
        del test_batch

    return hypotheses
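
# Gradio UI: header links, example audio files, and two tabs (microphone and file
# upload) that both call run_diarization.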

article = (
    "<p style='text-align: center'>"
    "<a href='https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/diar_msdd_telephonic' target='_blank'>πŸŽ™οΈ Learn more about MSDD model</a> | "
    "<a href='https://arxiv.org/abs/2203.15974' target='_blank'>πŸ“š MSDD paper</a> | "
    "<a href='https://github.com/NVIDIA/NeMo' target='_blank'>πŸ§‘β€πŸ’» Repository</a>"
    "</p>"
)
examples = [
    ["data/conversation.wav"],
    ["data/id10270_5r0dWxy17C8-00001.wav"],
]

microphone_interface = gr.Interface(
    fn=run_diarization,
    inputs=[gr.Audio(source="microphone", type="filepath", optional=True, label="Mic Audio")],
    outputs=[gr.components.Dataframe()],
    title="Offline Speaker Diarization with NeMo",
    description="This demonstration will perform offline speaker diarization on an audio file using nemo",
    article=article,
    layout="vertical",
    theme="huggingface",
    allow_flagging=False,
    live=False,
    examples=examples,
)

upload_interface = gr.Interface(
    fn=run_diarization,
    inputs=[gr.Audio(source="upload", type='filepath',optional=True, label='Upload File')],
    outputs=[gr.components.Dataframe()],
    title="Offline Speaker Diarization with NeMo",
    description="This demonstration will perform offline speaker diarization on an audio file using nemo",
    article=article,
    layout="vertical",
    theme="huggingface",
    allow_flagging=False,
    live=False,
    examples=examples,
)

demo = gr.TabbedInterface([microphone_interface, upload_interface], ["Microphone", "Upload File"])

demo.launch(enable_queue=True)