Spaces:

switin06
/

Cricket-Commentary

Sleeping

App Files Files Community

switin06 committed 28 days ago

Commit

b466443

verified ·

1 Parent(s): d134fe0

changed TTS

Browse files

Files changed (1) hide show

inference.py +14 -14

inference.py CHANGED Viewed

@@ -12,7 +12,6 @@ import numpy as np
 from tqdm import tqdm
 import os
 from dotenv import load_dotenv
-from elevenlabs.client import ElevenLabs
 from IPython.display import Audio
 import re
 from groq import Groq
@@ -21,6 +20,7 @@ from pydub import AudioSegment
 import shutil
 import gradio as gr
 from huggingface_hub import hf_hub_download
 groq_key = os.environ["GROQ_API_KEY"]
 tts_key = os.environ["ElevenLabs"]
 class TemporalTransformerEncoder(nn.Module):
@@ -214,7 +214,6 @@ If no boundary (four/six) is mentioned, do not add one.
 Maintain proper punctuation and clarity for TTS (Text-to-Speech) delivery.
 Output only the cleaned commentary. No extra text.
 Input:
 {commentary}
@@ -233,17 +232,18 @@ Output:
   print("="*50)
   return chat_completion.choices[0].message.content.strip()
-def text_to_speech(text, elevenlabs_api_key, output_path):
-    elevenlabs = ElevenLabs(api_key=tts_key)
-    audio_stream = elevenlabs.text_to_speech.convert(
-        text=text,
-        voice_id="URgDTjqBVr48zeu6FETI",
-        model_id="eleven_multilingual_v2",
-        output_format="mp3_44100_128",
-    )
-    with open(output_path, "wb") as f:
-        for chunk in audio_stream:
-            f.write(chunk)
 def mix_audio(video_path, voice_path, crowd_path, output_path):
     video = VideoFileClip(video_path)
     video_duration_ms = video.duration * 1000
@@ -285,7 +285,7 @@ def main(video_path):
     # Text to speech
     tts_path = "commentary_final.mp3"
-    text_to_speech(clean_commentary,tts_key, tts_path)
     short_audio_path = "pro_audio3.mp3"
     os.system(f"ffmpeg -y -i {tts_path} -ss 0 -t 3 {short_audio_path}")

 from tqdm import tqdm
 import os
 from dotenv import load_dotenv
 from IPython.display import Audio
 import re
 from groq import Groq
 import shutil
 import gradio as gr
 from huggingface_hub import hf_hub_download
+from TTS.api import TTS
 groq_key = os.environ["GROQ_API_KEY"]
 tts_key = os.environ["ElevenLabs"]
 class TemporalTransformerEncoder(nn.Module):
 Maintain proper punctuation and clarity for TTS (Text-to-Speech) delivery.
 Output only the cleaned commentary. No extra text.
 Input:
 {commentary}
   print("="*50)
   return chat_completion.choices[0].message.content.strip()
+def text_to_speech(text, output_path, speed=1.3):
+    raw_path = "raw_commentary.wav"
+    # Load multilingual TTS model
+    tts = TTS(model_name="tts_models/en/multi-dataset", progress_bar=False, gpu=torch.cuda.is_available())
+    # Save original audio
+    tts.tts_to_file(text=text, file_path=raw_path)
+    # Speed up using ffmpeg
+    os.system(f"ffmpeg -y -i {raw_path} -filter:a atempo={speed} {output_path}")
+    os.remove(raw_path)
 def mix_audio(video_path, voice_path, crowd_path, output_path):
     video = VideoFileClip(video_path)
     video_duration_ms = video.duration * 1000
     # Text to speech
     tts_path = "commentary_final.mp3"
+    text_to_speech(clean_commentary, tts_path, speed=1.35)
     short_audio_path = "pro_audio3.mp3"
     os.system(f"ffmpeg -y -i {tts_path} -ss 0 -t 3 {short_audio_path}")