"""Gradio demo: speech recognition -> GPT-2 text generation -> VITS speech synthesis."""

import io
import tempfile

import gradio as gr
import numpy as np
import soundfile as sf
import torch
from transformers import VitsModel, VitsTokenizer, pipeline, set_seed

# Initialize ASR pipeline
transcriber = pipeline("automatic-speech-recognition", model="facebook/s2t-small-librispeech-asr")

# Initialize LLM pipeline
generator = pipeline("text-generation", model="gpt2")

# Initialize TTS tokenizer and model
tokenizer_tts = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
model_tts = VitsModel.from_pretrained("facebook/mms-tts-eng")

# Print the tokenizer for the TTS model
print("TTS Tokenizer:", tokenizer_tts)


def _to_asr_input(audio):
    """Convert a Gradio audio value into something the ASR pipeline accepts.

    A file path (or any other non-tuple value) is passed through unchanged.
    A ``(sample_rate, samples)`` pair is turned into the
    ``{"sampling_rate": ..., "raw": ...}`` dict form with mono float32 samples.
    """
    if not isinstance(audio, (tuple, list)):
        return audio

    sample_rate, samples = audio
    samples = np.asarray(samples)
    if np.issubdtype(samples.dtype, np.integer):
        # Scale integer PCM into [-1.0, 1.0] before handing it to the model.
        samples = samples.astype(np.float32) / np.iinfo(samples.dtype).max
    else:
        samples = samples.astype(np.float32)
    if samples.ndim > 1:
        # assumes (num_samples, num_channels) layout — TODO confirm against
        # the Gradio version in use; the pipeline requires a 1-D array.
        samples = samples.mean(axis=1)
    return {"sampling_rate": int(sample_rate), "raw": samples}


def transcribe_and_generate_audio(audio):
    """Transcribe speech, continue the transcript with GPT-2, and speak it.

    Args:
        audio: Recording from the Gradio ``Audio`` component: a file path,
            a ``(sample_rate, samples)`` pair, or ``None`` when nothing
            was recorded.

    Returns:
        Path to a WAV file containing the synthesized speech, or ``None``
        when there was no recording or the transcript came back empty.
    """
    if audio is None:
        return None

    # Transcribe audio
    asr_output = transcriber(_to_asr_input(audio))["text"].strip()
    if not asr_output:
        # Nothing recognised: there is no prompt to continue or to speak.
        return None

    # Generate text based on ASR output (the transcript is the prompt).
    # NOTE: max_length bounds prompt and continuation together.
    generated_text = generator(asr_output, max_length=100, num_return_sequences=1)[0]["generated_text"]

    # Generate audio from text using TTS model
    inputs = tokenizer_tts(text=generated_text, return_tensors="pt")
    set_seed(555)  # fixed seed so repeated runs on the same text match
    with torch.no_grad():
        outputs = model_tts(**inputs)
    waveform = outputs.waveform[0]

    # Write to a unique file per request so simultaneous users cannot
    # overwrite each other's output. The file is intentionally left on disk
    # (delete=False) because its path is returned to the caller.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        waveform_path = tmp.name
    sf.write(
        waveform_path,
        waveform.cpu().numpy(),
        model_tts.config.sampling_rate,  # use the model's rate, not a guess
        format='wav',
    )
    return waveform_path


# Define Gradio interface
audio_input = gr.Interface(
    transcribe_and_generate_audio,
    # type="filepath" hands the callback a path the ASR pipeline can read
    # directly instead of a (sample_rate, ndarray) tuple.
    gr.Audio(sources=["microphone"], type="filepath", label="Speak Here"),
    "audio",
    title="ASR -> LLM -> TTS",
    description="Speak into the microphone and hear the generated audio.",
)

# Launch the interface
audio_input.launch()