File size: 1,616 Bytes
35f8a26
0f4f655
4ae34f2
 
 
0f4f655
 
9a710b0
 
 
 
 
 
 
 
 
 
0f4f655
f23d60c
0f4f655
 
4ae34f2
 
3fdc3cc
4ae34f2
 
5d7b200
4ae34f2
 
 
 
 
 
 
 
 
 
 
0f4f655
 
 
 
 
3fdc3cc
0f4f655
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import gradio as gr
from transformers import pipeline, VitsTokenizer, VitsModel, set_seed
import numpy as np
import torch
import io
import soundfile as sf

# Initialize ASR pipeline (speech -> text).
transcriber = pipeline("automatic-speech-recognition", model="facebook/s2t-small-librispeech-asr")

# Initialize LLM pipeline (text -> text); Phi-3 ships custom code, hence trust_remote_code.
generator = pipeline("text-generation", model="microsoft/Phi-3-mini-4k-instruct", trust_remote_code=True)

# Initialize TTS tokenizer and model (text -> speech, English MMS-TTS / VITS).
tokenizer_tts = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
model_tts = VitsModel.from_pretrained("facebook/mms-tts-eng")

# Debug output: show the loaded TTS tokenizer configuration at startup.
print("TTS Tokenizer:", tokenizer_tts)  # Print the tokenizer for the TTS model

def transcribe_and_generate_audio(audio):
    """Run the full ASR -> LLM -> TTS pipeline on one audio clip.

    Args:
        audio: Microphone recording as delivered by the Gradio ``Audio``
            component (a format the transformers ASR pipeline accepts).

    Returns:
        str: Path to a WAV file containing the synthesized speech.
    """
    # 1) Speech -> text.
    asr_output = transcriber(audio)["text"]

    # 2) Text -> text: continue the transcription with the LLM.
    # Fix: the original passed an undefined name `prompt` here, which raised
    # NameError on every call; the transcription is the intended prompt.
    generated_text = generator(asr_output, max_length=100, num_return_sequences=1)[0]['generated_text']

    # 3) Text -> speech with the VITS TTS model.
    inputs = tokenizer_tts(text=generated_text, return_tensors="pt")
    set_seed(555)  # VITS sampling is stochastic; seed for reproducible audio
    with torch.no_grad():
        outputs = model_tts(**inputs)
    waveform = outputs.waveform[0]
    waveform_path = "output.wav"
    # NOTE(review): 16000 Hz assumed for MMS-TTS output — confirm against
    # model_tts.config.sampling_rate.
    sf.write(waveform_path, waveform.numpy(), 16000, format='wav')

    return waveform_path

# Build the Gradio app: microphone audio in, synthesized speech out.
demo = gr.Interface(
    fn=transcribe_and_generate_audio,
    inputs=gr.Audio(sources=["microphone"], label="Speak Here"),
    outputs="audio",
    title="ASR -> LLM -> TTS",
    description="Speak into the microphone and hear the generated audio.",
)

# Start the local web server for the demo.
demo.launch()