"""Gradio demo for a fine-tuned SpeechT5 text-to-speech checkpoint.

The app exposes two tabs (file upload and microphone).  Both currently ignore
the submitted audio and play back speech synthesised from the fixed prompt in
``sample_text``; no speech-recognition or translation step is implemented here.
"""

import logging
from typing import Optional, Tuple

import gradio as gr
import numpy as np
import torch
from datasets import load_dataset
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor

# This file is the application entry point, so it configures logging itself.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Checkpoints and dataset used by the demo.
TTS_CHECKPOINT = "navodit17/speecht5_finetuned_indic_tts_hi"
VOCODER_CHECKPOINT = "microsoft/speecht5_hifigan"
XVECTOR_DATASET = "Matthijs/cmu-arctic-xvectors"
# Row of the x-vector dataset used as the speaker identity.
XVECTOR_INDEX = 7306

# Sample rate (Hz) reported to Gradio for the generated waveform.
SAMPLE_RATE = 16000
# Largest int16 value; used to scale float audio to 16-bit PCM.
INT16_MAX = np.iinfo(np.int16).max

# Fixed prompt that is synthesised on every request (appears to be romanised
# Hindi, matching the "hi" checkpoint -- confirm with the model card).
sample_text = (
    "namaste kya apa mujhe bata sakate haim ki kala apaki phlaita kisa samaya hai "
    "kya apa taiksi lemge"
)

# Text-to-speech model, its processor and the vocoder that produces the waveform.
tts_processor = SpeechT5Processor.from_pretrained(TTS_CHECKPOINT)
tts_model = SpeechT5ForTextToSpeech.from_pretrained(TTS_CHECKPOINT)
vocoder = SpeechT5HifiGan.from_pretrained(VOCODER_CHECKPOINT)

# Speaker embedding: one x-vector from the dataset with a leading batch axis.
embeddings_dataset = load_dataset(XVECTOR_DATASET, split="validation")
speaker_embeddings = torch.tensor(
    embeddings_dataset[XVECTOR_INDEX]["xvector"]
).unsqueeze(0)
logger.info("Speaker embedding shape: %s", tuple(speaker_embeddings.shape))


def _to_int16_pcm(waveform: torch.Tensor) -> np.ndarray:
    """Convert a float waveform tensor to a 16-bit PCM NumPy array.

    Samples are clipped to [-1.0, 1.0] before scaling: casting an out-of-range
    value to ``int16`` would otherwise wrap around and produce audible clicks.
    """
    samples = waveform.detach().cpu().numpy()
    return (np.clip(samples, -1.0, 1.0) * INT16_MAX).astype(np.int16)


def synthesise(text: str) -> torch.Tensor:
    """Generate speech for *text* with the module-level model and vocoder.

    Args:
        text: Text to tokenise and synthesise.

    Returns:
        The tensor returned by ``generate_speech`` when a vocoder is supplied
        (the generated waveform).
    """
    inputs = tts_processor(text=text, return_tensors="pt")
    input_ids = inputs["input_ids"]
    logger.info(
        "input_ids shape=%s dtype=%s", tuple(input_ids.shape), input_ids.dtype
    )
    return tts_model.generate_speech(
        input_ids,
        speaker_embeddings=speaker_embeddings,
        vocoder=vocoder,
    )


def speech_to_speech_translation(audio: Optional[str]) -> Tuple[int, np.ndarray]:
    """Gradio callback: return synthesised speech for the fixed sample prompt.

    Args:
        audio: Value supplied by the ``gr.Audio`` input (configured with
            ``type="filepath"``).  It is currently NOT used: the function
            always synthesises ``sample_text``.

    Returns:
        ``(sample_rate, samples)`` where ``samples`` is an ``int16`` array, as
        expected by a ``gr.Audio(type="numpy")`` output.
    """
    waveform = synthesise(sample_text)
    logger.info(
        "Synthesised speech shape=%s dtype=%s",
        tuple(waveform.shape),
        waveform.dtype,
    )
    return SAMPLE_RATE, _to_int16_pcm(waveform)


def _build_interface(source: str) -> gr.Interface:
    """Create one demo tab whose audio input comes from *source*."""
    return gr.Interface(
        fn=speech_to_speech_translation,
        inputs=gr.Audio(sources=[source], type="filepath"),
        outputs=gr.Audio(label="Generated Speech", type="numpy"),
    )


# Gradio interface: one tab per input source, both backed by the same callback.
file_translate = _build_interface("upload")
mic_translate = _build_interface("microphone")

with gr.Blocks() as demo:
    gr.TabbedInterface(
        [file_translate, mic_translate],
        ["Audio File", "Microphone"],
    )

if __name__ == "__main__":
    demo.launch(debug=True)