naveenk-ai committed
Commit 3c51bbc · verified · 1 Parent(s): 24c1b32

Update app.py

Files changed (1)
  1. app.py +72 -318
app.py CHANGED
@@ -1,340 +1,94 @@
-import sys
-from pathlib import Path
 import os
 import torch
-import openvino as ov
 import gradio as gr
+from huggingface_hub import hf_hub_download
 import langid
-import ipywidgets as widgets
-from IPython.display import Audio
-# from openvoice.api import BaseSpeakerTTS, ToneColorConverter, OpenVoiceBaseClass
-# import openvoice.se_extractor as se_extractor
-import nncf
-import subprocess
-
-
-# Clone the repo and set up the environment
-repo_dir = Path("OpenVoice")
-if not repo_dir.exists():
-    subprocess.run(["git", "clone", "https://github.com/myshell-ai/OpenVoice"])
-orig_english_path = Path("OpenVoice/openvoice/text/_orig_english.py")
-english_path = Path("OpenVoice/openvoice/text/english.py")
-
-english_path.rename(orig_english_path)
-
-with orig_english_path.open("r") as f:
-    data = f.read()
-data = data.replace("unidecode", "anyascii")
-with english_path.open("w") as out_f:
-    out_f.write(data)
-sys.path.append(str(repo_dir))
-
-# Install the required packages
-# %pip install -q "librosa>=0.8.1" "wavmark>=0.0.3" "faster-whisper>=0.9.0" "pydub>=0.25.1" "whisper-timestamped>=1.14.2" "tqdm" "inflect>=7.0.0" "eng_to_ipa>=0.0.2" "pypinyin>=0.50.0" \
-#     "cn2an>=0.5.22" "jieba>=0.42.1" "langid>=1.1.6" "gradio>=4.15" "ipywebrtc" "anyascii" "openvino>=2023.3" "torch>=2.1" "nncf>=2.11.0"
-
-from openvoice.api import BaseSpeakerTTS, ToneColorConverter, OpenVoiceBaseClass
+from openvoice.api import BaseSpeakerTTS, ToneColorConverter
 import openvoice.se_extractor as se_extractor
 
-packages = [
-    "librosa>=0.8.1",
-    "wavmark>=0.0.3",
-    "faster-whisper>=0.9.0",
-    "pydub>=0.25.1",
-    "whisper-timestamped>=1.14.2",
-    "tqdm",
-    "inflect>=7.0.0",
-    "eng_to_ipa>=0.0.2",
-    "pypinyin>=0.50.0",
-    "ipywidgets"
-]
-
-subprocess.run(["pip", "install"] + packages, check=True)
-
-core = ov.Core()
-
+# Constants
 CKPT_BASE_PATH = "checkpoints"
+EN_SUFFIX = f"{CKPT_BASE_PATH}/base_speakers/EN"
+CONVERTER_SUFFIX = f"{CKPT_BASE_PATH}/converter"
+OUTPUT_DIR = "outputs/"
+os.makedirs(OUTPUT_DIR, exist_ok=True)
 
-en_suffix = f"{CKPT_BASE_PATH}/base_speakers/EN"
-zh_suffix = f"{CKPT_BASE_PATH}/base_speakers/ZH"
-converter_suffix = f"{CKPT_BASE_PATH}/converter"
-
-enable_chinese_lang = False
-
+# Download necessary files
 def download_from_hf_hub(filename, local_dir="./"):
-    from huggingface_hub import hf_hub_download
     os.makedirs(local_dir, exist_ok=True)
     hf_hub_download(repo_id="myshell-ai/OpenVoice", filename=filename, local_dir=local_dir)
 
-download_from_hf_hub(f"{converter_suffix}/checkpoint.pth")
-download_from_hf_hub(f"{converter_suffix}/config.json")
-download_from_hf_hub(f"{en_suffix}/checkpoint.pth")
-download_from_hf_hub(f"{en_suffix}/config.json")
-
-download_from_hf_hub(f"{en_suffix}/en_default_se.pth")
-download_from_hf_hub(f"{en_suffix}/en_style_se.pth")
-
-if enable_chinese_lang:
-    download_from_hf_hub(f"{zh_suffix}/checkpoint.pth")
-    download_from_hf_hub(f"{zh_suffix}/config.json")
-    download_from_hf_hub(f"{zh_suffix}/zh_default_se.pth")
+for file in [f"{CONVERTER_SUFFIX}/checkpoint.pth", f"{CONVERTER_SUFFIX}/config.json",
+             f"{EN_SUFFIX}/checkpoint.pth", f"{EN_SUFFIX}/config.json",
+             f"{EN_SUFFIX}/en_default_se.pth", f"{EN_SUFFIX}/en_style_se.pth"]:
+    download_from_hf_hub(file)
 
+# Initialize models
 pt_device = "cpu"
-
-en_base_speaker_tts = BaseSpeakerTTS(f"{en_suffix}/config.json", device=pt_device)
-en_base_speaker_tts.load_ckpt(f"{en_suffix}/checkpoint.pth")
-
-tone_color_converter = ToneColorConverter(f"{converter_suffix}/config.json", device=pt_device)
-tone_color_converter.load_ckpt(f"{converter_suffix}/checkpoint.pth")
-
-if enable_chinese_lang:
-    zh_base_speaker_tts = BaseSpeakerTTS(f"{zh_suffix}/config.json", device=pt_device)
-    zh_base_speaker_tts.load_ckpt(f"{zh_suffix}/checkpoint.pth")
-else:
-    zh_base_speaker_tts = None
-
-class OVOpenVoiceBase(torch.nn.Module):
-    def __init__(self, voice_model: OpenVoiceBaseClass):
-        super().__init__()
-        self.voice_model = voice_model
-        for par in voice_model.model.parameters():
-            par.requires_grad = False
-
-class OVOpenVoiceTTS(OVOpenVoiceBase):
-    def get_example_input(self):
-        stn_tst = self.voice_model.get_text("this is original text", self.voice_model.hps, False)
-        x_tst = stn_tst.unsqueeze(0)
-        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
-        speaker_id = torch.LongTensor([1])
-        noise_scale = torch.tensor(0.667)
-        length_scale = torch.tensor(1.0)
-        noise_scale_w = torch.tensor(0.6)
-        return (
-            x_tst,
-            x_tst_lengths,
-            speaker_id,
-            noise_scale,
-            length_scale,
-            noise_scale_w,
-        )
-
-    def forward(self, x, x_lengths, sid, noise_scale, length_scale, noise_scale_w):
-        return self.voice_model.model.infer(x, x_lengths, sid, noise_scale, length_scale, noise_scale_w)
-
-class OVOpenVoiceConverter(OVOpenVoiceBase):
-    def get_example_input(self):
-        y = torch.randn([1, 513, 238], dtype=torch.float32)
-        y_lengths = torch.LongTensor([y.size(-1)])
-        target_se = torch.randn(*(1, 256, 1))
-        source_se = torch.randn(*(1, 256, 1))
-        tau = torch.tensor(0.3)
-        return (y, y_lengths, source_se, target_se, tau)
-
-    def forward(self, y, y_lengths, sid_src, sid_tgt, tau):
-        return self.voice_model.model.voice_conversion(y, y_lengths, sid_src, sid_tgt, tau)
-
-IRS_PATH = "openvino_irs/"
-EN_TTS_IR = f"{IRS_PATH}/openvoice_en_tts.xml"
-ZH_TTS_IR = f"{IRS_PATH}/openvoice_zh_tts.xml"
-VOICE_CONVERTER_IR = f"{IRS_PATH}/openvoice_tone_conversion.xml"
-
-paths = [EN_TTS_IR, VOICE_CONVERTER_IR]
-models = [
-    OVOpenVoiceTTS(en_base_speaker_tts),
-    OVOpenVoiceConverter(tone_color_converter),
-]
-if enable_chinese_lang:
-    models.append(OVOpenVoiceTTS(zh_base_speaker_tts))
-    paths.append(ZH_TTS_IR)
-ov_models = []
-
-for model, path in zip(models, paths):
-    if not os.path.exists(path):
-        ov_model = ov.convert_model(model, example_input=model.get_example_input())
-        ov_model = nncf.compress_weights(ov_model)
-        ov.save_model(ov_model, path)
-    else:
-        ov_model = core.read_model(path)
-    ov_models.append(ov_model)
-
-ov_en_tts, ov_voice_conversion = ov_models[:2]
-if enable_chinese_lang:
-    ov_zh_tts = ov_models[-1]
-
-
-REFERENCE_VOICES_PATH = f"{repo_dir}/resources/"
-reference_speakers = [
-    *[path for path in os.listdir(REFERENCE_VOICES_PATH) if os.path.splitext(path)[-1] == ".mp3"],
-    "record_manually",
-    "load_manually",
-]
-
-ref_speaker = widgets.Dropdown(
-    options=reference_speakers,
-    value=reference_speakers[0],
-    description="reference voice from which tone color will be copied",
-    disabled=False,
-)
-
-ref_speaker
-
-OUTPUT_DIR = "outputs/"
-os.makedirs(OUTPUT_DIR, exist_ok=True)
-
-ref_speaker_path = f"{REFERENCE_VOICES_PATH}/{ref_speaker.value}"
-allowed_audio_types = ".mp4,.mp3,.wav,.wma,.aac,.m4a,.m4b,.webm"
-
-if ref_speaker.value == "record_manually":
-    ref_speaker_path = f"{OUTPUT_DIR}/custom_example_sample.webm"
-    from ipywebrtc import AudioRecorder, CameraStream
-
-    camera = CameraStream(constraints={"audio": True, "video": False})
-    recorder = AudioRecorder(stream=camera, filename=ref_speaker_path, autosave=True)
-    display(recorder)
-
-elif ref_speaker.value == "load_manually":
-    upload_ref = widgets.FileUpload(
-        accept=allowed_audio_types,
-        multiple=False,
-        description="Select audio with reference voice",
+en_base_speaker_tts = BaseSpeakerTTS(f"{EN_SUFFIX}/config.json", device=pt_device)
+en_base_speaker_tts.load_ckpt(f"{EN_SUFFIX}/checkpoint.pth")
+
+tone_color_converter = ToneColorConverter(f"{CONVERTER_SUFFIX}/config.json", device=pt_device)
+tone_color_converter.load_ckpt(f"{CONVERTER_SUFFIX}/checkpoint.pth")
+
+en_source_default_se = torch.load(f"{EN_SUFFIX}/en_default_se.pth")
+en_source_style_se = torch.load(f"{EN_SUFFIX}/en_style_se.pth")
+
+# Main prediction function
+def predict(prompt, style, audio_file_pth, tau):
+    if len(prompt) < 2 or len(prompt) > 200:
+        return "Text should be between 2 and 200 characters.", None
+
+    try:
+        target_se, _ = se_extractor.get_se(audio_file_pth, tone_color_converter, target_dir=OUTPUT_DIR, vad=True)
+    except Exception as e:
+        return f"Error getting target tone color: {str(e)}", None
+
+    src_path = f"{OUTPUT_DIR}/tmp.wav"
+    en_base_speaker_tts.tts(prompt, src_path, speaker=style, language="English")
+
+    save_path = f"{OUTPUT_DIR}/output.wav"
+    tone_color_converter.convert(
+        audio_src_path=src_path,
+        src_se=en_source_style_se if style != "default" else en_source_default_se,
+        tgt_se=target_se,
+        output_path=save_path,
+        tau=tau
     )
-    display(upload_ref)
-
-def save_audio(voice_source: widgets.FileUpload, out_path: str):
-    with open(out_path, "wb") as output_file:
-        assert len(voice_source.value) > 0, "Please select audio file"
-        output_file.write(voice_source.value[0]["content"])
-
-en_source_default_se = torch.load(f"{en_suffix}/en_default_se.pth")
-en_source_style_se = torch.load(f"{en_suffix}/en_style_se.pth")
-zh_source_se = torch.load(f"{zh_suffix}/zh_default_se.pth") if enable_chinese_lang else None
-
-target_se, audio_name = se_extractor.get_se(ref_speaker_path, tone_color_converter, target_dir=OUTPUT_DIR, vad=True)
-
-def get_pathched_infer(ov_model: ov.Model, device: str) -> callable:
-    compiled_model = core.compile_model(ov_model, device)
-
-    def infer_impl(x, x_lengths, sid, noise_scale, length_scale, noise_scale_w):
-        ov_output = compiled_model((x, x_lengths, sid, noise_scale, length_scale, noise_scale_w))
-        return (torch.tensor(ov_output[0]),)
-
-    return infer_impl
-
-def get_patched_voice_conversion(ov_model: ov.Model, device: str) -> callable:
-    compiled_model = core.compile_model(ov_model, device)
-
-    def voice_conversion_impl(y, y_lengths, sid_src, sid_tgt, tau):
-        ov_output = compiled_model((y, y_lengths, sid_src, sid_tgt, tau))
-        return (torch.tensor(ov_output[0]),)
 
-    return voice_conversion_impl
+    return "Voice cloning completed successfully.", save_path
 
-core = ov.Core()
-
-device = widgets.Dropdown(
-    options=core.available_devices + ["AUTO"],
-    value="AUTO",
-    description="Device:",
-    disabled=False,
-)
-device
-
-en_base_speaker_tts.model.infer = get_pathched_infer(ov_en_tts, device.value)
-tone_color_converter.model.voice_conversion = get_patched_voice_conversion(ov_voice_conversion, device.value)
-if enable_chinese_lang:
-    zh_base_speaker_tts.model.infer = get_pathched_infer(ov_zh_tts, device.value)
-
-supported_languages = ["zh", "en"]
-
-def build_predict(
-    output_dir,
-    tone_color_converter,
-    en_tts_model,
-    zh_tts_model,
-    en_source_default_se,
-    en_source_style_se,
-    zh_source_se,
-    supported_languages,
-):
-    def predict(
-        input_text,
-        reference_audio,
-        speaker,
-        noise_scale=0.667,
-        length_scale=1.0,
-        noise_scale_w=0.8,
-        tone_color=False,
-    ):
-        if reference_audio:
-            ref_audio_path = f"{output_dir}/input_audio.wav"
-            save_audio(reference_audio, ref_audio_path)
-            target_se, _ = se_extractor.get_se(ref_audio_path, tone_color_converter, target_dir=output_dir, vad=True)
-        else:
-            if speaker == "record_manually":
-                raise ValueError("Manual recording is not implemented in this example.")
-            elif speaker == "load_manually":
-                raise ValueError("Loading a manual audio file is not implemented in this example.")
-            else:
-                ref_audio_path = f"{REFERENCE_VOICES_PATH}/{speaker}"
-                target_se, _ = se_extractor.get_se(ref_audio_path, tone_color_converter, target_dir=output_dir, vad=True)
+# Gradio interface
+def create_demo():
+    with gr.Blocks() as demo:
+        gr.Markdown("# OpenVoice: Voice Cloning Demo")
 
-        lang = langid.classify(input_text)[0]
-        if lang not in supported_languages:
-            return f"Unsupported language: {lang}"
-
-        tts_model = en_tts_model if lang == "en" else zh_tts_model
-
-        stn_tst = tts_model.get_text(input_text, tts_model.hps, False)
-        x_tst = stn_tst.unsqueeze(0)
-        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
-        speaker_id = torch.LongTensor([1])
-        noise_scale = torch.tensor(noise_scale)
-        length_scale = torch.tensor(length_scale)
-        noise_scale_w = torch.tensor(noise_scale_w)
-
-        with torch.no_grad():
-            audio = tts_model.model.infer(x_tst, x_tst_lengths, speaker_id, noise_scale, length_scale, noise_scale_w)[0]
-            if tone_color:
-                source_se = en_source_style_se if lang == "en" else zh_source_se
-                audio = tone_color_converter.model.voice_conversion(audio, x_tst_lengths, source_se, target_se, torch.tensor(0.3))[0]
-
-        audio = audio.squeeze().cpu().numpy()
-        output_path = f"{output_dir}/output_audio.wav"
-        Audio(audio, rate=tts_model.hps.data.sampling_rate).save(output_path)
-
-        return output_path
-
-    return predict
-
-OUTPUT_DIR = "output_audio"
-os.makedirs(OUTPUT_DIR, exist_ok=True)
+        with gr.Row():
+            input_text = gr.Textbox(label="Text to speak", placeholder="Enter text here (2-200 characters)")
+            style = gr.Dropdown(
+                label="Style",
+                choices=["default", "whispering", "cheerful", "terrified", "angry", "sad", "friendly"],
+                value="default"
+            )
+
+        with gr.Row():
+            reference_audio = gr.Audio(label="Reference Audio", type="filepath")
+            tau_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, label="Tau (Voice similarity)", info="Higher values make the output more similar to the reference voice")
 
-predict_fn = build_predict(
-    OUTPUT_DIR,
-    tone_color_converter,
-    en_base_speaker_tts,
-    zh_base_speaker_tts,
-    en_source_default_se,
-    en_source_style_se,
-    zh_source_se,
-    supported_languages,
-)
+        submit_button = gr.Button("Generate Voice")
+
+        output_text = gr.Textbox(label="Status")
+        output_audio = gr.Audio(label="Generated Audio")
 
-def gradio_interface():
-    input_text = gr.Textbox(lines=2, placeholder="Enter text here...")
-    reference_audio = gr.Audio(type="filepath", label="Reference Audio")
-    speaker = gr.Dropdown(choices=reference_speakers, value="record_manually", label="Select Speaker")
-    noise_scale = gr.Slider(minimum=0.1, maximum=1.0, value=0.667, label="Noise Scale")
-    length_scale = gr.Slider(minimum=0.1, maximum=2.0, value=1.0, label="Length Scale")
-    noise_scale_w = gr.Slider(minimum=0.1, maximum=1.0, value=0.8, label="Noise Scale W")
-    tone_color = gr.Checkbox(value=False, label="Enable Tone Color Conversion")
+        submit_button.click(
+            predict,
+            inputs=[input_text, style, reference_audio, tau_slider],
+            outputs=[output_text, output_audio]
+        )
 
-    gr.Interface(
-        fn=predict_fn,
-        inputs=[input_text, reference_audio, speaker, noise_scale, length_scale, noise_scale_w, tone_color],
-        outputs=gr.Audio(type="filepath", label="Generated Audio"),
-        title="Speech Generation and Tone Conversion",
-        description="Generate speech and convert tone using the OpenVoice model.",
-    ).launch()
+        return demo
 
-# end
+# Launch the demo
+if __name__ == "__main__":
+    demo = create_demo()
+    demo.launch()
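
For reference, the new predict() pipeline can be exercised without the Gradio UI. The sketch below is illustrative only and is not part of the commit: it assumes app.py is importable from the working directory (importing it runs the module-level checkpoint download and model loading, since demo.launch() is guarded by __name__ == "__main__") and that a short reference recording exists at the placeholder path reference.wav.

# Minimal smoke test of the new predict() pipeline (sketch, not part of the commit).
# "reference.wav" is a placeholder path for any short, clean speech recording.
from app import predict

status, wav_path = predict(
    prompt="Hello, this is a quick voice cloning test.",
    style="default",            # one of the styles exposed in the Style dropdown
    audio_file_pth="reference.wav",
    tau=0.7,                    # same default as the Tau slider
)
print(status)    # "Voice cloning completed successfully." on success
print(wav_path)  # path to outputs/output.wav, or None if an error message was returned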
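
The cloning step hinges on the tone-color embedding that se_extractor.get_se derives from the reference clip inside predict(). A small sketch for inspecting that embedding, under the same assumptions as above (importable app.py, placeholder reference.wav):

# Inspect the target speaker embedding that tone_color_converter.convert() receives.
import openvoice.se_extractor as se_extractor
from app import tone_color_converter, OUTPUT_DIR

target_se, audio_name = se_extractor.get_se(
    "reference.wav", tone_color_converter, target_dir=OUTPUT_DIR, vad=True
)
print(audio_name)       # identifier derived from the reference clip
print(target_se.shape)  # a small torch tensor; the removed OpenVINO code used (1, 256, 1) as its example shape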