Spaces:

camanalo1
/

MyAlexa

Sleeping

MyAlexa / app.py

Update app.py

15736b9 verified over 1 year ago

1.56 kB

	import gradio as gr
	from transformers import pipeline, VitsTokenizer, VitsModel, set_seed
	import numpy as np
	import torch
	import io
	import soundfile as sf

	# Speech-to-text stage: ASR pipeline backed by the
	# facebook/s2t-small-librispeech-asr checkpoint.
	transcriber = pipeline(
	    task="automatic-speech-recognition",
	    model="facebook/s2t-small-librispeech-asr",
	)

	# Text stage: GPT-2 behind the text-generation pipeline.
	generator = pipeline(task="text-generation", model="gpt2")

	# Speech stage: the tokenizer and the VITS model are loaded from the same
	# checkpoint, so the identifier is spelled once.
	_TTS_CHECKPOINT = "facebook/mms-tts-eng"
	tokenizer_tts = VitsTokenizer.from_pretrained(_TTS_CHECKPOINT)
	model_tts = VitsModel.from_pretrained(_TTS_CHECKPOINT)

	# Startup diagnostic: print the TTS tokenizer that was just loaded.
	print("TTS Tokenizer:", tokenizer_tts)

	def _prepare_asr_input(audio):
	    """Convert a Gradio ``(sample_rate, samples)`` tuple into ASR pipeline input.

	    Anything that is not a tuple (e.g. a file path or an already prepared
	    dict/array) is returned unchanged so existing callers keep working.
	    """
	    if not isinstance(audio, tuple):
	        return audio

	    sample_rate, samples = audio
	    samples = np.asarray(samples)
	    # The ASR pipeline expects single-channel audio; average the channels
	    # of a (num_samples, num_channels) recording.
	    if samples.ndim > 1:
	        samples = samples.mean(axis=1)
	    samples = samples.astype(np.float32)
	    # Scale to [-1, 1]; skip the division for empty or all-zero (silent)
	    # input to avoid dividing by zero.
	    peak = np.max(np.abs(samples)) if samples.size else 0.0
	    if peak > 0:
	        samples /= peak
	    # NOTE(review): when sample_rate differs from the ASR model's expected
	    # rate the pipeline resamples internally, which presumably requires
	    # torchaudio to be installed -- confirm for the deployment environment.
	    return {"sampling_rate": sample_rate, "raw": samples}


	def transcribe_and_generate_audio(audio):
	    """Run the ASR -> LLM -> TTS chain on one recording.

	    Args:
	        audio: The recording to process. Either the ``(sample_rate, samples)``
	            tuple produced by ``gr.Audio``, any input the ASR pipeline
	            accepts directly (such as a file path), or ``None`` when
	            nothing was recorded.

	    Returns:
	        The path of the WAV file containing the synthesized speech, or
	        ``None`` if ``audio`` is ``None``.
	    """
	    # Nothing was recorded: there is nothing to transcribe or synthesize.
	    if audio is None:
	        return None

	    # Transcribe audio
	    asr_output = transcriber(_prepare_asr_input(audio))["text"]

	    # Generate text based on ASR output. The transcript is the prompt
	    # (the original referenced an undefined name here).
	    generated_text = generator(
	        asr_output, max_length=100, num_return_sequences=1
	    )[0]['generated_text']

	    # Generate audio from text using TTS model
	    inputs = tokenizer_tts(text=generated_text, return_tensors="pt")
	    set_seed(555)  # fixed seed so the synthesized speech is reproducible
	    with torch.no_grad():
	        outputs = model_tts(**inputs)
	    waveform = outputs.waveform[0]

	    waveform_path = "output.wav"
	    # Write with the rate the TTS model was configured for instead of a
	    # hard-coded value, so the file plays back at the correct speed.
	    sf.write(
	        waveform_path,
	        waveform.numpy(),
	        model_tts.config.sampling_rate,
	        format='wav',
	    )

	    return waveform_path

	# Build the Gradio UI: a microphone recording goes in, and the audio
	# returned by transcribe_and_generate_audio comes out.
	audio_input = gr.Interface(
	    fn=transcribe_and_generate_audio,
	    inputs=gr.Audio(sources=["microphone"], label="Speak Here"),
	    outputs="audio",
	    title="ASR -> LLM -> TTS",
	    description="Speak into the microphone and hear the generated audio.",
	)

	# Start serving the app.
	audio_input.launch()