from nemo.collections.asr.models.msdd_models import NeuralDiarizer
from nemo.collections.asr.models import EncDecRNNTBPEModel
from nemo.collections.asr.models import EncDecSpeakerLabelModel
import gradio as gr
import pandas as pd 
import torch
import json
from omegaconf import OmegaConf
import uuid
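# This app runs NeMo MSDD speaker diarization on an input recording, transcribes each
# speaker turn with a FastConformer RNNT model, and shows the result in a Gradio UI.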

device = "cuda" if torch.cuda.is_available() else "cpu"

model = NeuralDiarizer.from_pretrained("diar_msdd_telephonic").to(device)
speaker_model = EncDecSpeakerLabelModel.from_pretrained("nvidia/speakerverification_en_titanet_large").to(device)
model.eval()

def run_diarization(path1):
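    """Diarize the audio at ``path1`` with the MSDD model, merge consecutive
    segments from the same speaker, transcribe each turn, and return a
    DataFrame with columns (start_time, end_time, speaker, text)."""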
    print(path1)
    annotation = model(path1, num_workers=0, batch_size=16)
    rttm = annotation.to_rttm()
    df = pd.DataFrame(columns=['start_time', 'end_time', 'speaker', 'text'])
    lines = rttm.splitlines()
    if len(lines) == 0:
        df.loc[0] = 0, 0, 'No speaker found', ''
        return df
    # RTTM fields: column 3 is the segment start, column 4 the duration, column 7 the speaker label.
    start_time, duration, prev_speaker = float(lines[0].split()[3]), float(lines[0].split()[4]), lines[0].split()[7]
    end_time = start_time + duration
    df.loc[0] = start_time, end_time, prev_speaker, ''

    # Merge consecutive segments from the same speaker into a single turn.
    for line in lines[1:]:
        split = line.split()
        start_time, duration, cur_speaker = float(split[3]), float(split[4]), split[7]
        end_time = start_time + duration
        if cur_speaker == prev_speaker:
            df.loc[df.index[-1], 'end_time'] = end_time
        else:
            df.loc[len(df)] = start_time, end_time, cur_speaker, ''
        prev_speaker = cur_speaker
    
    hyp = get_transcripts(df, path1)

    assert len(hyp) == len(df)

    for i in range(len(df)):
        df.loc[i, 'text'] = hyp[i]

    return df

def create_manifest(df, audio_path):
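    """Write a NeMo-style manifest (one JSON object per line) describing the
    diarized segments and return its path.

    Each line looks like (values are illustrative):
        {"audio_filepath": "/path/audio.wav", "duration": 2.5, "label": "speaker_0", "offset": 0.0}
    """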

    filename = '/tmp/' + str(uuid.uuid4()) + '.json'
    with open(filename, 'w') as f:
        for i in range(len(df)):
            start_time = df.iloc[i]['start_time']
            end_time = df.iloc[i]['end_time']
            speaker = df.iloc[i]['speaker']
            dic = {"audio_filepath": audio_path, "duration": end_time-start_time, "label": speaker, "offset": start_time}
            json.dump(dic, f)
            f.write('\n')

    return filename

def get_transcripts(df, audio_path):
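    """Transcribe each diarized segment with a pretrained FastConformer RNNT
    model and return one hypothesis string per row of ``df``."""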
    
    filename = create_manifest(df, audio_path)
    # Load the ASR model for transcription (a separate name avoids shadowing the global diarization model).
    asr_model = EncDecRNNTBPEModel.from_pretrained(model_name="nvidia/stt_en_fastconformer_transducer_large").to(device)
    asr_model.eval()
    config = OmegaConf.create({"manifest_filepath": filename, "batch_size": 4})
    dataloader = asr_model._setup_transcribe_dataloader(config)

    hypotheses = []
    all_hypotheses = []

    for test_batch in dataloader:
        encoded, encoded_len = asr_model.forward(
            input_signal=test_batch[0].to(device), input_signal_length=test_batch[1].to(device)
        )
        best_hyp, all_hyp = asr_model.decoding.rnnt_decoder_predictions_tensor(
            encoded,
            encoded_len,
            return_hypotheses=False,
            partial_hypotheses=None,
        )

        hypotheses += best_hyp
        if all_hyp is not None:
            all_hypotheses += all_hyp
        else:
            all_hypotheses += best_hyp

        del encoded
        del test_batch

    return hypotheses
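
# Gradio UI: header links, example audio files, and two tabs (microphone and file
# upload) that both call run_diarization.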

article = (
    "<p style='text-align: center'>"
    "<a href='https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/diar_msdd_telephonic' target='_blank'>πŸŽ™οΈ Learn more about MSDD model</a> | "
    "<a href='https://arxiv.org/abs/2203.15974' target='_blank'>πŸ“š MSDD paper</a> | "
    "<a href='https://github.com/NVIDIA/NeMo' target='_blank'>πŸ§‘β€πŸ’» Repository</a>"
    "</p>"
)
examples = [
    ["data/conversation.wav"],
    ["data/id10270_5r0dWxy17C8-00001.wav"],
]

microphone_interface = gr.Interface(
    fn=run_diarization,
    inputs=[gr.Audio(source="microphone", type="filepath", optional=True, label="Mic Audio")],
    outputs=[gr.components.Dataframe()],
    title="Offline Speaker Diarization with NeMo",
    description="This demonstration will perform offline speaker diarization on an audio file using nemo",
    article=article,
    layout="vertical",
    theme="huggingface",
    allow_flagging=False,
    live=False,
    examples=examples,
)

upload_interface = gr.Interface(
    fn=run_diarization,
    inputs=[gr.Audio(source="upload", type='filepath',optional=True, label='Upload File')],
    outputs=[gr.components.Dataframe()],
    title="Offline Speaker Diarization with NeMo",
    description="This demonstration will perform offline speaker diarization on an audio file using nemo",
    article=article,
    layout="vertical",
    theme="huggingface",
    allow_flagging=False,
    live=False,
    examples=examples,
)

demo = gr.TabbedInterface([microphone_interface, upload_interface], ["Microphone", "Upload File"])

demo.launch(enable_queue=True)