"""Gradio demo: SpeechT5 text-to-speech conditioned on a reference speaker.

At import time the script loads the SpeechT5 processor, acoustic model and
HiFi-GAN vocoder, and derives one speaker embedding from a reference WAV
file. ``tts_fn`` then turns input text into a WAV file, and a Gradio
interface exposes it as a small web app.
"""

import gradio as gr
import torch
import torchaudio
from speechbrain.pretrained import EncoderClassifier
from transformers import (
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
    SpeechT5Processor,
)

# Sample rate used for the reference audio and for the saved output.
SAMPLE_RATE = 16000
# Reference recording the speaker embedding is computed from.
SPEAKER_WAV = "speaker_vie.wav"
# File every synthesis result is written to (overwritten on each call).
OUTPUT_WAV = "output.wav"

# Load model & processor.
# NOTE(review): the interface title says Vietnamese, but this is the stock
# "microsoft/speecht5_tts" checkpoint -- confirm it handles Vietnamese text.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Load the speaker embedding from a WAV file.
classifier = EncoderClassifier.from_hparams(source="speechbrain/spkrec-xvect-voxceleb")
signal, fs = torchaudio.load(SPEAKER_WAV)
if signal.size(0) > 1:
    # torchaudio.load yields (channels, frames). Mix down to mono so that a
    # stereo file is not encoded as a batch of several utterances, which
    # would leave the embedding with the wrong shape after squeeze(0).
    signal = signal.mean(dim=0, keepdim=True)
if fs != SAMPLE_RATE:
    signal = torchaudio.functional.resample(signal, fs, SAMPLE_RATE)
with torch.no_grad():  # inference only; no autograd graph needed
    embeddings = classifier.encode_batch(signal)
    # L2-normalize along the feature axis, following the Hugging Face
    # SpeechT5 recipe for building x-vector speaker embeddings.
    embeddings = torch.nn.functional.normalize(embeddings, dim=-1)
speaker_embeddings = embeddings.squeeze(0)


def tts_fn(text: str) -> str:
    """Synthesize *text* to speech and return the path of the WAV file.

    Args:
        text: The text to speak.

    Returns:
        Path of the written audio file (``OUTPUT_WAV``). The same file is
        overwritten on every call.

    Raises:
        gr.Error: If *text* is empty or contains only whitespace.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    inputs = processor(text=text, return_tensors="pt")
    with torch.no_grad():
        speech = model.generate_speech(
            inputs["input_ids"], speaker_embeddings, vocoder=vocoder
        )
    # Add a leading axis before saving; presumably `speech` is a 1-D
    # waveform and torchaudio.save wants (channels, frames) -- TODO confirm.
    torchaudio.save(OUTPUT_WAV, speech.unsqueeze(0), SAMPLE_RATE)
    return OUTPUT_WAV


# Gradio interface.
demo = gr.Interface(
    fn=tts_fn,
    inputs="text",
    outputs="audio",
    title="Vietnamese TTS with SpeechT5",
)

if __name__ == "__main__":
    demo.launch()