Spaces:
Sleeping
Sleeping
changed TTS
Browse files- inference.py +14 -14
inference.py
CHANGED
@@ -12,7 +12,6 @@ import numpy as np
|
|
12 |
from tqdm import tqdm
|
13 |
import os
|
14 |
from dotenv import load_dotenv
|
15 |
-
from elevenlabs.client import ElevenLabs
|
16 |
from IPython.display import Audio
|
17 |
import re
|
18 |
from groq import Groq
|
@@ -21,6 +20,7 @@ from pydub import AudioSegment
|
|
21 |
import shutil
|
22 |
import gradio as gr
|
23 |
from huggingface_hub import hf_hub_download
|
|
|
24 |
groq_key = os.environ["GROQ_API_KEY"]
|
25 |
tts_key = os.environ["ElevenLabs"]
|
26 |
class TemporalTransformerEncoder(nn.Module):
|
@@ -214,7 +214,6 @@ If no boundary (four/six) is mentioned, do not add one.
|
|
214 |
Maintain proper punctuation and clarity for TTS (Text-to-Speech) delivery.
|
215 |
|
216 |
Output only the cleaned commentary. No extra text.
|
217 |
-
|
218 |
Input:
|
219 |
{commentary}
|
220 |
|
@@ -233,17 +232,18 @@ Output:
|
|
233 |
print("="*50)
|
234 |
return chat_completion.choices[0].message.content.strip()
|
235 |
|
236 |
-
def text_to_speech(text,
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
)
|
244 |
-
|
245 |
-
|
246 |
-
|
|
|
247 |
def mix_audio(video_path, voice_path, crowd_path, output_path):
|
248 |
video = VideoFileClip(video_path)
|
249 |
video_duration_ms = video.duration * 1000
|
@@ -285,7 +285,7 @@ def main(video_path):
|
|
285 |
|
286 |
# Text to speech
|
287 |
tts_path = "commentary_final.mp3"
|
288 |
-
text_to_speech(clean_commentary,
|
289 |
|
290 |
short_audio_path = "pro_audio3.mp3"
|
291 |
os.system(f"ffmpeg -y -i {tts_path} -ss 0 -t 3 {short_audio_path}")
|
|
|
12 |
from tqdm import tqdm
|
13 |
import os
|
14 |
from dotenv import load_dotenv
|
|
|
15 |
from IPython.display import Audio
|
16 |
import re
|
17 |
from groq import Groq
|
|
|
20 |
import shutil
|
21 |
import gradio as gr
|
22 |
from huggingface_hub import hf_hub_download
|
23 |
+
from TTS.api import TTS
|
24 |
groq_key = os.environ["GROQ_API_KEY"]
|
25 |
# The ElevenLabs client import is no longer present in this file, so a missing
# "ElevenLabs" secret must not abort the module at import time with KeyError.
# NOTE(review): tts_key looks unused now that synthesis goes through TTS.api —
# confirm no remaining reader and then delete this line.
tts_key = os.environ.get("ElevenLabs")
|
26 |
class TemporalTransformerEncoder(nn.Module):
|
|
|
214 |
Maintain proper punctuation and clarity for TTS (Text-to-Speech) delivery.
|
215 |
|
216 |
Output only the cleaned commentary. No extra text.
|
|
|
217 |
Input:
|
218 |
{commentary}
|
219 |
|
|
|
232 |
print("="*50)
|
233 |
return chat_completion.choices[0].message.content.strip()
|
234 |
|
235 |
+
def text_to_speech(text, output_path, speed=1.3, model_name="tts_models/en/multi-dataset"):
    """Synthesize ``text`` and write a tempo-adjusted recording to ``output_path``.

    The text is first rendered to an intermediate WAV file with the ``TTS``
    model, then re-encoded through ffmpeg's ``atempo`` filter to change the
    speaking rate. The intermediate file is always removed afterwards.

    Args:
        text: Commentary text to speak.
        output_path: Destination audio file; ffmpeg overwrites it (``-y``).
        speed: Tempo multiplier passed to ``atempo``. Must lie in
            ``[0.5, 100.0]``, the range the ffmpeg filter documents.
        model_name: Model identifier handed to ``TTS``. Defaults to the value
            that was previously hard-coded.

    Raises:
        ValueError: If ``speed`` is outside the supported ``atempo`` range.
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    import subprocess  # local import: the block must not rely on file-level imports it cannot see

    # Validate before the (expensive) model load so bad input fails fast.
    speed = float(speed)
    if not 0.5 <= speed <= 100.0:
        raise ValueError(f"speed must be within [0.5, 100.0] for ffmpeg atempo, got {speed!r}")

    raw_path = "raw_commentary.wav"

    try:
        # NOTE(review): Coqui model ids are normally four segments
        # ("<type>/<lang>/<dataset>/<model>"); the default here has three —
        # confirm the intended model and pass it via ``model_name``.
        tts = TTS(model_name=model_name, progress_bar=False, gpu=torch.cuda.is_available())

        # Render the untouched-speed audio to the intermediate file.
        tts.tts_to_file(text=text, file_path=raw_path)

        # Argument list instead of a shell string: paths containing spaces or
        # shell metacharacters are passed through verbatim, and check=True
        # surfaces ffmpeg failures instead of silently leaving no output.
        subprocess.run(
            ["ffmpeg", "-y", "-i", raw_path, "-filter:a", f"atempo={speed}", output_path],
            check=True,
        )
    finally:
        # Clean up even when synthesis or ffmpeg fails; the file may not exist
        # if the failure happened before it was written.
        if os.path.exists(raw_path):
            os.remove(raw_path)
|
247 |
def mix_audio(video_path, voice_path, crowd_path, output_path):
|
248 |
video = VideoFileClip(video_path)
|
249 |
video_duration_ms = video.duration * 1000
|
|
|
285 |
|
286 |
# Text to speech
|
287 |
tts_path = "commentary_final.mp3"
|
288 |
+
text_to_speech(clean_commentary, tts_path, speed=1.35)
|
289 |
|
290 |
short_audio_path = "pro_audio3.mp3"
|
291 |
os.system(f"ffmpeg -y -i {tts_path} -ss 0 -t 3 {short_audio_path}")
|