# MyAlexa / app.py
# camanalo1's picture
# Update app.py
# 15736b9 verified
# raw
# history blame
# 1.56 kB
import gradio as gr
from transformers import pipeline, VitsTokenizer, VitsModel, set_seed
import numpy as np
import torch
import io
import soundfile as sf
# Hugging Face Hub checkpoints used by the three stages of the voice pipeline.
ASR_CHECKPOINT = "facebook/s2t-small-librispeech-asr"
LLM_CHECKPOINT = "gpt2"
TTS_CHECKPOINT = "facebook/mms-tts-eng"

# Stage 1 - speech-to-text.
transcriber = pipeline("automatic-speech-recognition", model=ASR_CHECKPOINT)

# Stage 2 - free-form text continuation of the transcript.
generator = pipeline("text-generation", model=LLM_CHECKPOINT)

# Stage 3 - text-to-speech; the tokenizer and the model come from the same checkpoint.
tokenizer_tts = VitsTokenizer.from_pretrained(TTS_CHECKPOINT)
model_tts = VitsModel.from_pretrained(TTS_CHECKPOINT)

# Startup diagnostic: show which TTS tokenizer was loaded.
print("TTS Tokenizer:", tokenizer_tts)
def _to_asr_input(audio):
    """Convert a Gradio audio value into an input the ASR pipeline accepts.

    Values that are not tuples (e.g. a file path) are returned unchanged.
    A ``(sample_rate, samples)`` tuple is converted to the pipeline's dict
    form with mono, float32, peak-normalised samples.
    """
    if not isinstance(audio, tuple):
        return audio

    sampling_rate, samples = audio
    samples = np.asarray(samples, dtype=np.float32)
    # Multi-channel recordings are averaged down to a single channel.
    # NOTE(review): assumes a (samples, channels) layout as described in the
    # Gradio Audio docs - confirm if another layout is ever passed in.
    if samples.ndim > 1:
        samples = samples.mean(axis=1)
    # Scale to [-1, 1]; skip silent/empty clips to avoid dividing by zero.
    peak = np.max(np.abs(samples)) if samples.size else 0.0
    if peak > 0:
        samples = samples / peak
    return {"sampling_rate": sampling_rate, "raw": samples}


def transcribe_and_generate_audio(audio):
    """Run the ASR -> LLM -> TTS chain on one recording.

    Args:
        audio: The value delivered by the Gradio audio input: either a
            ``(sample_rate, samples)`` tuple, a value the ASR pipeline can
            consume directly (such as a file path), or ``None`` when nothing
            was recorded.

    Returns:
        The path of the WAV file containing the synthesized speech
        (``"output.wav"``), or ``None`` when ``audio`` is ``None``.
    """
    # Nothing was recorded: there is nothing to transcribe or synthesize.
    if audio is None:
        return None

    # Transcribe audio
    asr_output = transcriber(_to_asr_input(audio))["text"]

    # Generate text based on ASR output. The transcript is the prompt; the
    # original code referenced an undefined name here.
    generated_text = generator(
        asr_output, max_length=100, num_return_sequences=1
    )[0]['generated_text']

    # Generate audio from text using TTS model
    inputs = tokenizer_tts(text=generated_text, return_tensors="pt")
    set_seed(555)  # fixed seed so repeated runs use the same random state
    with torch.no_grad():
        outputs = model_tts(**inputs)
    waveform = outputs.waveform[0]

    # Write at the rate the TTS model declares instead of a hard-coded value.
    waveform_path = "output.wav"
    sf.write(
        waveform_path,
        waveform.numpy(),
        model_tts.config.sampling_rate,
        format='wav',
    )
    return waveform_path
# Build the web UI: a microphone recorder feeding the speech pipeline, with
# the resulting audio played back to the user.
audio_input = gr.Interface(
    fn=transcribe_and_generate_audio,
    inputs=gr.Audio(sources=["microphone"], label="Speak Here"),
    outputs="audio",
    title="ASR -> LLM -> TTS",
    description="Speak into the microphone and hear the generated audio.",
)

# Start serving the interface.
audio_input.launch()