switin06 committed on
Commit
b466443
·
verified ·
1 Parent(s): d134fe0

Changed TTS: replaced the ElevenLabs client with the TTS library (TTS.api)

Browse files
Files changed (1) hide show
  1. inference.py +14 -14
inference.py CHANGED
@@ -12,7 +12,6 @@ import numpy as np
12
  from tqdm import tqdm
13
  import os
14
  from dotenv import load_dotenv
15
- from elevenlabs.client import ElevenLabs
16
  from IPython.display import Audio
17
  import re
18
  from groq import Groq
@@ -21,6 +20,7 @@ from pydub import AudioSegment
21
  import shutil
22
  import gradio as gr
23
  from huggingface_hub import hf_hub_download
 
24
  groq_key = os.environ["GROQ_API_KEY"]
25
  tts_key = os.environ["ElevenLabs"]
26
  class TemporalTransformerEncoder(nn.Module):
@@ -214,7 +214,6 @@ If no boundary (four/six) is mentioned, do not add one.
214
  Maintain proper punctuation and clarity for TTS (Text-to-Speech) delivery.
215
 
216
  Output only the cleaned commentary. No extra text.
217
-
218
  Input:
219
  {commentary}
220
 
@@ -233,17 +232,18 @@ Output:
233
  print("="*50)
234
  return chat_completion.choices[0].message.content.strip()
235
 
236
- def text_to_speech(text, elevenlabs_api_key, output_path):
237
- elevenlabs = ElevenLabs(api_key=tts_key)
238
- audio_stream = elevenlabs.text_to_speech.convert(
239
- text=text,
240
- voice_id="URgDTjqBVr48zeu6FETI",
241
- model_id="eleven_multilingual_v2",
242
- output_format="mp3_44100_128",
243
- )
244
- with open(output_path, "wb") as f:
245
- for chunk in audio_stream:
246
- f.write(chunk)
 
247
  def mix_audio(video_path, voice_path, crowd_path, output_path):
248
  video = VideoFileClip(video_path)
249
  video_duration_ms = video.duration * 1000
@@ -285,7 +285,7 @@ def main(video_path):
285
 
286
  # Text to speech
287
  tts_path = "commentary_final.mp3"
288
- text_to_speech(clean_commentary,tts_key, tts_path)
289
 
290
  short_audio_path = "pro_audio3.mp3"
291
  os.system(f"ffmpeg -y -i {tts_path} -ss 0 -t 3 {short_audio_path}")
 
12
  from tqdm import tqdm
13
  import os
14
  from dotenv import load_dotenv
 
15
  from IPython.display import Audio
16
  import re
17
  from groq import Groq
 
20
  import shutil
21
  import gradio as gr
22
  from huggingface_hub import hf_hub_download
23
+ from TTS.api import TTS
24
  groq_key = os.environ["GROQ_API_KEY"]
25
  tts_key = os.environ["ElevenLabs"]
26
  class TemporalTransformerEncoder(nn.Module):
 
214
  Maintain proper punctuation and clarity for TTS (Text-to-Speech) delivery.
215
 
216
  Output only the cleaned commentary. No extra text.
 
217
  Input:
218
  {commentary}
219
 
 
232
  print("="*50)
233
  return chat_completion.choices[0].message.content.strip()
234
 
235
def text_to_speech(text, output_path, speed=1.3):
    """Synthesize *text* to speech and write a tempo-adjusted copy to *output_path*.

    The text is first rendered to a temporary WAV file with the ``TTS``
    model, then re-encoded by ffmpeg with the ``atempo`` audio filter so the
    commentary plays back ``speed`` times faster (or slower).

    Args:
        text: The commentary to speak. Must be a non-empty string.
        output_path: Destination file for the final audio. An existing file
            is overwritten (ffmpeg is invoked with ``-y``).
        speed: Tempo multiplier passed to ffmpeg's ``atempo`` filter. Must be
            within ``[0.5, 100.0]``, the range documented for that filter.

    Raises:
        ValueError: If ``text`` is empty/blank or ``speed`` is out of range.
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    # Function-scope imports keep this block self-contained; only part of the
    # module's import section is visible from here.
    import subprocess
    import tempfile

    # Validate up front so we never load the model for a doomed request.
    if not isinstance(text, str) or not text.strip():
        raise ValueError("text must be a non-empty string")
    if not 0.5 <= speed <= 100.0:
        raise ValueError(
            f"speed must be between 0.5 and 100.0 (ffmpeg atempo range), got {speed!r}"
        )

    # Use a unique scratch file instead of a fixed name in the working
    # directory, so overlapping calls cannot clobber each other's audio.
    fd, raw_path = tempfile.mkstemp(prefix="raw_commentary_", suffix=".wav")
    os.close(fd)

    try:
        # NOTE(review): the original comment called this a "multilingual"
        # model, but the id says "en" and looks shorter than the usual
        # "<type>/<lang>/<dataset>/<model>" form -- confirm the intended model.
        tts = TTS(
            model_name="tts_models/en/multi-dataset",
            progress_bar=False,
            gpu=torch.cuda.is_available(),
        )

        # Render the unmodified speech to the scratch WAV.
        tts.tts_to_file(text=text, file_path=raw_path)

        # Re-time the audio with ffmpeg. An argument list (no shell) means
        # paths containing spaces or shell metacharacters are passed through
        # verbatim, and check=True surfaces ffmpeg failures instead of
        # silently leaving a missing or stale output file.
        subprocess.run(
            [
                "ffmpeg",
                "-y",
                "-i",
                raw_path,
                "-filter:a",
                f"atempo={speed}",
                output_path,
            ],
            check=True,
        )
    finally:
        # Always remove the scratch file, even when synthesis or ffmpeg fails.
        try:
            os.remove(raw_path)
        except FileNotFoundError:
            pass
247
  def mix_audio(video_path, voice_path, crowd_path, output_path):
248
  video = VideoFileClip(video_path)
249
  video_duration_ms = video.duration * 1000
 
285
 
286
  # Text to speech
287
  tts_path = "commentary_final.mp3"
288
+ text_to_speech(clean_commentary, tts_path, speed=1.35)
289
 
290
  short_audio_path = "pro_audio3.mp3"
291
  os.system(f"ffmpeg -y -i {tts_path} -ss 0 -t 3 {short_audio_path}")