Alexa-NLU-Clone

Running

File size: 3,390 Bytes

fb4e25f
 
d66e935
59da3de
 
f9e3936
59da3de
 
0ed6760
 
 
 
 
 
 
 
 
bcf29d2
0ed6760
 
 
 
 
bcf29d2
0ed6760
 
 
 
 
bcf29d2
0ed6760
d66e935
cb25b1b
 
63deeee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1bcd824
 
 
53c197c
1bcd824
 
 
 
 
 
 
 
 
 
 
 
138be94
 
43bb9e7
138be94
 
 
 
 
 
 
 
 
 
 
43bb9e7
138be94
 
cf263be
138be94
fb4e25f

import gradio as gr

import os
import torch
import librosa
from glob import glob
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline, AutoModelForTokenClassification, TokenClassificationPipeline, Wav2Vec2ForCTC, Wav2Vec2Processor, Wav2Vec2ProcessorWithLM

# Model setup. Everything in this section runs once at import time.
# `model_name` is a scratch variable that is reassigned for each checkpoint,
# so the statements inside each group must stay in this order.

# ASR: Wav2Vec2 CTC checkpoint plus its processor, used by `transcribe`.
model_name = "jonatasgrosman/wav2vec2-large-xlsr-53-english"
processor_asr = Wav2Vec2Processor.from_pretrained(model_name)
model_asr = Wav2Vec2ForCTC.from_pretrained(model_name)

# Intent classifier: sequence-classification pipeline applied to the transcript.
model_name = 'qanastek/XLMRoberta-Alexa-Intents-Classification'
tokenizer_intent = AutoTokenizer.from_pretrained(model_name)
model_intent = AutoModelForSequenceClassification.from_pretrained(model_name)
classifier_intent = TextClassificationPipeline(model=model_intent, tokenizer=tokenizer_intent)

# Language classifier: sequence-classification pipeline applied to the transcript.
model_name = 'qanastek/51-languages-classifier'
tokenizer_langs = AutoTokenizer.from_pretrained(model_name)
model_langs = AutoModelForSequenceClassification.from_pretrained(model_name)
classifier_language = TextClassificationPipeline(model=model_langs, tokenizer=tokenizer_langs)

# NER extractor: token-classification pipeline whose per-token output is
# regrouped by `getUniform`.
model_name = 'qanastek/XLMRoberta-Alexa-Intents-NER-NLU'
tokenizer_ner = AutoTokenizer.from_pretrained(model_name)
model_ner = AutoModelForTokenClassification.from_pretrained(model_name)
predict_ner = TokenClassificationPipeline(model=model_ner, tokenizer=tokenizer_ner)

# Example clips offered in the UI: every *.wav file directly inside
# EXAMPLE_DIR (relative to the current working directory), sorted by path.
EXAMPLE_DIR = './'
examples = sorted(glob(os.path.join(EXAMPLE_DIR, '*.wav')))

def transcribe(audio_path):
    """Return the ASR transcript of the audio file at ``audio_path``.

    The file is loaded with librosa at a 16 kHz sampling rate, passed through
    the module-level Wav2Vec2 processor and CTC model, and decoded by taking
    the arg-max token at every frame.

    Args:
        audio_path: Path to an audio file readable by ``librosa.load``.

    Returns:
        The decoded text for the (single) input clip.
    """
    target_rate = 16_000

    # The returned sampling rate is always `target_rate`, so it is not needed.
    waveform, _ = librosa.load(audio_path, sr=target_rate)

    features = processor_asr(
        waveform,
        sampling_rate=target_rate,
        return_tensors="pt",
        padding=True,
    )

    # Inference only: skip gradient bookkeeping.
    with torch.no_grad():
        output = model_asr(
            features.input_values,
            attention_mask=features.attention_mask,
        )

    token_ids = torch.argmax(output.logits, dim=-1)
    transcripts = processor_asr.batch_decode(token_ids)

    # A single clip was submitted, so the batch holds exactly one transcript.
    return transcripts[0]

def getUniform(text):
    """Group token-level NER predictions into per-entity word lists.

    Args:
        text: Iterable of prediction dicts in sentence order. Each dict must
            provide an ``"entity"`` tag (``"B-<label>"`` to open an entity,
            ``"I-<label>"`` to continue one) and a ``"word"`` token piece.

    Returns:
        A list of ``(label, words)`` tuples in order of appearance, where
        ``label`` is the tag without its ``B-``/``I-`` prefix and ``words`` is
        the list of token pieces (SentencePiece ``▁`` markers removed) that
        belong to that entity. An empty input yields an empty list.

    Raises:
        KeyError: If a prediction dict lacks ``"entity"`` or ``"word"``.
    """
    # Entities are kept in an ordered list so that a continuation token always
    # targets the most recently opened entity. Keying groups by a running
    # counter made continuation tokens look up a group that did not exist.
    groups = []

    for t in text:
        tag = t["entity"]
        opens_entity = tag.startswith("B-")
        label = tag[2:] if tag.startswith(("B-", "I-")) else tag
        word = t["word"].replace("▁", "")

        continues_last = (
            not opens_entity
            and bool(groups)
            and groups[-1][0] == label
        )

        if continues_last:
            groups[-1][1].append(word)
        else:
            # Either an explicit "B-" tag, or a continuation with nothing
            # matching to attach to (e.g. a leading "I-" tag): open a new
            # entity instead of failing.
            groups.append((label, [word]))

    return groups


def process(path):
    """Run the full speech-to-NLU pipeline on one audio file.

    The clip is transcribed, the transcript is patched for a known ASR
    merge ("apizza" -> "a pizza"), and the patched text is fed to the intent
    classifier, the language classifier and the NER extractor.

    Args:
        path: Path to the audio file to analyse.

    Returns:
        A dict with the keys ``"text"``, ``"language"``, ``"intent_class"``
        and ``"named_entities"``.
    """
    transcript = transcribe(path)
    transcript = transcript.replace("apizza","a pizza")

    # Each classifier returns a list of predictions; only the first is used.
    intent_prediction = classifier_intent(transcript)[0]
    language_prediction = classifier_language(transcript)[0]
    entities = getUniform(predict_ner(transcript))

    result = {}
    result["text"] = transcript
    result["language"] = language_prediction["label"]
    result["intent_class"] = intent_prediction["label"]
    result["named_entities"] = entities
    return result

def predict(wav_file):
    """Gradio callback: analyse ``wav_file`` and return the result of ``process``."""
    return process(wav_file)

# Disabled text-in/text-out variant of the interface, kept for reference:
# iface = gr.Interface(fn=predict, inputs="text", outputs="text")

# Web UI: one audio input (handed to `predict` as a file path) and one JSON
# output showing the dict returned by `predict`.
# NOTE(review): the description says "Upload your wav file" while the audio
# source is set to 'microphone' — confirm which one is intended.
# NOTE(review): `gr.inputs.*` / `gr.outputs.*` look like the legacy Gradio
# component namespaces — confirm they exist in the pinned Gradio version.
iface = gr.Interface(
    predict,
    title='Alexa NLU Clone',
    description='Upload your wav file to test the model',
    inputs=[
        gr.inputs.Audio(label='wav file', source='microphone', type='filepath')
    ],
    outputs=[
        gr.outputs.JSON(label='Slot Recognition + Intent Classification + Language Classification + ASR'),
    ],
    examples=examples,
    article='Made with ❤️ by <a href="https://www.linkedin.com/in/yanis-labrak-8a7412145/" target="_blank">Yanis Labrak</a> thanks to 🤗',
)

# Starts the app as soon as this module is executed (no __main__ guard).
iface.launch()