naveenk-ai committed
Commit 3c51bbc · verified · 1 Parent(s): 24c1b32

Update app.py

Files changed (1)
  1. app.py +72 -318
app.py CHANGED
@@ -1,340 +1,94 @@
-import sys
-from pathlib import Path
 import os
 import torch
-import openvino as ov
 import gradio as gr
+from huggingface_hub import hf_hub_download
 import langid
-import ipywidgets as widgets
-from IPython.display import Audio
-# from openvoice.api import BaseSpeakerTTS, ToneColorConverter, OpenVoiceBaseClass
-# import openvoice.se_extractor as se_extractor
-import nncf
-import subprocess
-
-
-# Clone the repo and set up the environment
-repo_dir = Path("OpenVoice")
-if not repo_dir.exists():
-    subprocess.run(["git", "clone", "https://github.com/myshell-ai/OpenVoice"])
-orig_english_path = Path("OpenVoice/openvoice/text/_orig_english.py")
-english_path = Path("OpenVoice/openvoice/text/english.py")
-
-english_path.rename(orig_english_path)
-
-with orig_english_path.open("r") as f:
-    data = f.read()
-data = data.replace("unidecode", "anyascii")
-with english_path.open("w") as out_f:
-    out_f.write(data)
-sys.path.append(str(repo_dir))
-
-# Install the required packages
-# %pip install -q "librosa>=0.8.1" "wavmark>=0.0.3" "faster-whisper>=0.9.0" "pydub>=0.25.1" "whisper-timestamped>=1.14.2" "tqdm" "inflect>=7.0.0" "eng_to_ipa>=0.0.2" "pypinyin>=0.50.0" \
-#     "cn2an>=0.5.22" "jieba>=0.42.1" "langid>=1.1.6" "gradio>=4.15" "ipywebrtc" "anyascii" "openvino>=2023.3" "torch>=2.1" "nncf>=2.11.0"
-
-from openvoice.api import BaseSpeakerTTS, ToneColorConverter, OpenVoiceBaseClass
+from openvoice.api import BaseSpeakerTTS, ToneColorConverter
 import openvoice.se_extractor as se_extractor
 
-packages = [
-    "librosa>=0.8.1",
-    "wavmark>=0.0.3",
-    "faster-whisper>=0.9.0",
-    "pydub>=0.25.1",
-    "whisper-timestamped>=1.14.2",
-    "tqdm",
-    "inflect>=7.0.0",
-    "eng_to_ipa>=0.0.2",
-    "pypinyin>=0.50.0",
-    "ipywidgets"
-]
-
-subprocess.run(["pip", "install"] + packages, check=True)
-
-core = ov.Core()
-
+# Constants
 CKPT_BASE_PATH = "checkpoints"
+EN_SUFFIX = f"{CKPT_BASE_PATH}/base_speakers/EN"
+CONVERTER_SUFFIX = f"{CKPT_BASE_PATH}/converter"
+OUTPUT_DIR = "outputs/"
+os.makedirs(OUTPUT_DIR, exist_ok=True)
 
-en_suffix = f"{CKPT_BASE_PATH}/base_speakers/EN"
-zh_suffix = f"{CKPT_BASE_PATH}/base_speakers/ZH"
-converter_suffix = f"{CKPT_BASE_PATH}/converter"
-
-enable_chinese_lang = False
-
+# Download necessary files
 def download_from_hf_hub(filename, local_dir="./"):
-    from huggingface_hub import hf_hub_download
     os.makedirs(local_dir, exist_ok=True)
     hf_hub_download(repo_id="myshell-ai/OpenVoice", filename=filename, local_dir=local_dir)
 
-download_from_hf_hub(f"{converter_suffix}/checkpoint.pth")
-download_from_hf_hub(f"{converter_suffix}/config.json")
-download_from_hf_hub(f"{en_suffix}/checkpoint.pth")
-download_from_hf_hub(f"{en_suffix}/config.json")
-
-download_from_hf_hub(f"{en_suffix}/en_default_se.pth")
-download_from_hf_hub(f"{en_suffix}/en_style_se.pth")
-
-if enable_chinese_lang:
-    download_from_hf_hub(f"{zh_suffix}/checkpoint.pth")
-    download_from_hf_hub(f"{zh_suffix}/config.json")
-    download_from_hf_hub(f"{zh_suffix}/zh_default_se.pth")
+for file in [f"{CONVERTER_SUFFIX}/checkpoint.pth", f"{CONVERTER_SUFFIX}/config.json",
+             f"{EN_SUFFIX}/checkpoint.pth", f"{EN_SUFFIX}/config.json",
+             f"{EN_SUFFIX}/en_default_se.pth", f"{EN_SUFFIX}/en_style_se.pth"]:
+    download_from_hf_hub(file)
 
+# Initialize models
 pt_device = "cpu"
-
-en_base_speaker_tts = BaseSpeakerTTS(f"{en_suffix}/config.json", device=pt_device)
-en_base_speaker_tts.load_ckpt(f"{en_suffix}/checkpoint.pth")
-
-tone_color_converter = ToneColorConverter(f"{converter_suffix}/config.json", device=pt_device)
-tone_color_converter.load_ckpt(f"{converter_suffix}/checkpoint.pth")
-
-if enable_chinese_lang:
-    zh_base_speaker_tts = BaseSpeakerTTS(f"{zh_suffix}/config.json", device=pt_device)
-    zh_base_speaker_tts.load_ckpt(f"{zh_suffix}/checkpoint.pth")
-else:
-    zh_base_speaker_tts = None
-
-class OVOpenVoiceBase(torch.nn.Module):
-    def __init__(self, voice_model: OpenVoiceBaseClass):
-        super().__init__()
-        self.voice_model = voice_model
-        for par in voice_model.model.parameters():
-            par.requires_grad = False
-
-class OVOpenVoiceTTS(OVOpenVoiceBase):
-    def get_example_input(self):
-        stn_tst = self.voice_model.get_text("this is original text", self.voice_model.hps, False)
-        x_tst = stn_tst.unsqueeze(0)
-        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
-        speaker_id = torch.LongTensor([1])
-        noise_scale = torch.tensor(0.667)
-        length_scale = torch.tensor(1.0)
-        noise_scale_w = torch.tensor(0.6)
-        return (
-            x_tst,
-            x_tst_lengths,
-            speaker_id,
-            noise_scale,
-            length_scale,
-            noise_scale_w,
-        )
-
-    def forward(self, x, x_lengths, sid, noise_scale, length_scale, noise_scale_w):
-        return self.voice_model.model.infer(x, x_lengths, sid, noise_scale, length_scale, noise_scale_w)
-
-class OVOpenVoiceConverter(OVOpenVoiceBase):
-    def get_example_input(self):
-        y = torch.randn([1, 513, 238], dtype=torch.float32)
-        y_lengths = torch.LongTensor([y.size(-1)])
-        target_se = torch.randn(*(1, 256, 1))
-        source_se = torch.randn(*(1, 256, 1))
-        tau = torch.tensor(0.3)
-        return (y, y_lengths, source_se, target_se, tau)
-
-    def forward(self, y, y_lengths, sid_src, sid_tgt, tau):
-        return self.voice_model.model.voice_conversion(y, y_lengths, sid_src, sid_tgt, tau)
-
-IRS_PATH = "openvino_irs/"
-EN_TTS_IR = f"{IRS_PATH}/openvoice_en_tts.xml"
-ZH_TTS_IR = f"{IRS_PATH}/openvoice_zh_tts.xml"
-VOICE_CONVERTER_IR = f"{IRS_PATH}/openvoice_tone_conversion.xml"
-
-paths = [EN_TTS_IR, VOICE_CONVERTER_IR]
-models = [
-    OVOpenVoiceTTS(en_base_speaker_tts),
-    OVOpenVoiceConverter(tone_color_converter),
-]
-if enable_chinese_lang:
-    models.append(OVOpenVoiceTTS(zh_base_speaker_tts))
-    paths.append(ZH_TTS_IR)
-ov_models = []
-
-for model, path in zip(models, paths):
-    if not os.path.exists(path):
-        ov_model = ov.convert_model(model, example_input=model.get_example_input())
-        ov_model = nncf.compress_weights(ov_model)
-        ov.save_model(ov_model, path)
-    else:
-        ov_model = core.read_model(path)
-    ov_models.append(ov_model)
-
-ov_en_tts, ov_voice_conversion = ov_models[:2]
-if enable_chinese_lang:
-    ov_zh_tts = ov_models[-1]
-
-
-REFERENCE_VOICES_PATH = f"{repo_dir}/resources/"
-reference_speakers = [
-    *[path for path in os.listdir(REFERENCE_VOICES_PATH) if os.path.splitext(path)[-1] == ".mp3"],
-    "record_manually",
-    "load_manually",
-]
-
-ref_speaker = widgets.Dropdown(
-    options=reference_speakers,
-    value=reference_speakers[0],
-    description="reference voice from which tone color will be copied",
-    disabled=False,
-)
-
-ref_speaker
-
-OUTPUT_DIR = "outputs/"
-os.makedirs(OUTPUT_DIR, exist_ok=True)
-
-ref_speaker_path = f"{REFERENCE_VOICES_PATH}/{ref_speaker.value}"
-allowed_audio_types = ".mp4,.mp3,.wav,.wma,.aac,.m4a,.m4b,.webm"
-
-if ref_speaker.value == "record_manually":
-    ref_speaker_path = f"{OUTPUT_DIR}/custom_example_sample.webm"
-    from ipywebrtc import AudioRecorder, CameraStream
-
-    camera = CameraStream(constraints={"audio": True, "video": False})
-    recorder = AudioRecorder(stream=camera, filename=ref_speaker_path, autosave=True)
-    display(recorder)
-
-elif ref_speaker.value == "load_manually":
-    upload_ref = widgets.FileUpload(
-        accept=allowed_audio_types,
-        multiple=False,
-        description="Select audio with reference voice",
+en_base_speaker_tts = BaseSpeakerTTS(f"{EN_SUFFIX}/config.json", device=pt_device)
+en_base_speaker_tts.load_ckpt(f"{EN_SUFFIX}/checkpoint.pth")
+
+tone_color_converter = ToneColorConverter(f"{CONVERTER_SUFFIX}/config.json", device=pt_device)
+tone_color_converter.load_ckpt(f"{CONVERTER_SUFFIX}/checkpoint.pth")
+
+en_source_default_se = torch.load(f"{EN_SUFFIX}/en_default_se.pth")
+en_source_style_se = torch.load(f"{EN_SUFFIX}/en_style_se.pth")
+
+# Main prediction function
+def predict(prompt, style, audio_file_pth, tau):
+    if len(prompt) < 2 or len(prompt) > 200:
+        return "Text should be between 2 and 200 characters.", None
+
+    try:
+        target_se, _ = se_extractor.get_se(audio_file_pth, tone_color_converter, target_dir=OUTPUT_DIR, vad=True)
+    except Exception as e:
+        return f"Error getting target tone color: {str(e)}", None
+
+    src_path = f"{OUTPUT_DIR}/tmp.wav"
+    en_base_speaker_tts.tts(prompt, src_path, speaker=style, language="English")
+
+    save_path = f"{OUTPUT_DIR}/output.wav"
+    tone_color_converter.convert(
+        audio_src_path=src_path,
+        src_se=en_source_style_se if style != "default" else en_source_default_se,
+        tgt_se=target_se,
+        output_path=save_path,
+        tau=tau
     )
-    display(upload_ref)
-
-def save_audio(voice_source: widgets.FileUpload, out_path: str):
-    with open(out_path, "wb") as output_file:
-        assert len(voice_source.value) > 0, "Please select audio file"
-        output_file.write(voice_source.value[0]["content"])
-
-en_source_default_se = torch.load(f"{en_suffix}/en_default_se.pth")
-en_source_style_se = torch.load(f"{en_suffix}/en_style_se.pth")
-zh_source_se = torch.load(f"{zh_suffix}/zh_default_se.pth") if enable_chinese_lang else None
-
-target_se, audio_name = se_extractor.get_se(ref_speaker_path, tone_color_converter, target_dir=OUTPUT_DIR, vad=True)
-
-def get_pathched_infer(ov_model: ov.Model, device: str) -> callable:
-    compiled_model = core.compile_model(ov_model, device)
-
-    def infer_impl(x, x_lengths, sid, noise_scale, length_scale, noise_scale_w):
-        ov_output = compiled_model((x, x_lengths, sid, noise_scale, length_scale, noise_scale_w))
-        return (torch.tensor(ov_output[0]),)
-
-    return infer_impl
-
-def get_patched_voice_conversion(ov_model: ov.Model, device: str) -> callable:
-    compiled_model = core.compile_model(ov_model, device)
-
-    def voice_conversion_impl(y, y_lengths, sid_src, sid_tgt, tau):
-        ov_output = compiled_model((y, y_lengths, sid_src, sid_tgt, tau))
-        return (torch.tensor(ov_output[0]),)
 
-    return voice_conversion_impl
+    return "Voice cloning completed successfully.", save_path
 
-core = ov.Core()
-
-device = widgets.Dropdown(
-    options=core.available_devices + ["AUTO"],
-    value="AUTO",
-    description="Device:",
-    disabled=False,
-)
-device
-
-en_base_speaker_tts.model.infer = get_pathched_infer(ov_en_tts, device.value)
-tone_color_converter.model.voice_conversion = get_patched_voice_conversion(ov_voice_conversion, device.value)
-if enable_chinese_lang:
-    zh_base_speaker_tts.model.infer = get_pathched_infer(ov_zh_tts, device.value)
-
-supported_languages = ["zh", "en"]
-
-def build_predict(
-    output_dir,
-    tone_color_converter,
-    en_tts_model,
-    zh_tts_model,
-    en_source_default_se,
-    en_source_style_se,
-    zh_source_se,
-    supported_languages,
-):
-    def predict(
-        input_text,
-        reference_audio,
-        speaker,
-        noise_scale=0.667,
-        length_scale=1.0,
-        noise_scale_w=0.8,
-        tone_color=False,
-    ):
-        if reference_audio:
-            ref_audio_path = f"{output_dir}/input_audio.wav"
-            save_audio(reference_audio, ref_audio_path)
-            target_se, _ = se_extractor.get_se(ref_audio_path, tone_color_converter, target_dir=output_dir, vad=True)
-        else:
-            if speaker == "record_manually":
-                raise ValueError("Manual recording is not implemented in this example.")
-            elif speaker == "load_manually":
-                raise ValueError("Loading a manual audio file is not implemented in this example.")
-            else:
-                ref_audio_path = f"{REFERENCE_VOICES_PATH}/{speaker}"
-                target_se, _ = se_extractor.get_se(ref_audio_path, tone_color_converter, target_dir=output_dir, vad=True)
+# Gradio interface
+def create_demo():
+    with gr.Blocks() as demo:
+        gr.Markdown("# OpenVoice: Voice Cloning Demo")
 
-        lang = langid.classify(input_text)[0]
-        if lang not in supported_languages:
-            return f"Unsupported language: {lang}"
-
-        tts_model = en_tts_model if lang == "en" else zh_tts_model
-
-        stn_tst = tts_model.get_text(input_text, tts_model.hps, False)
-        x_tst = stn_tst.unsqueeze(0)
-        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
-        speaker_id = torch.LongTensor([1])
-        noise_scale = torch.tensor(noise_scale)
-        length_scale = torch.tensor(length_scale)
-        noise_scale_w = torch.tensor(noise_scale_w)
-
-        with torch.no_grad():
-            audio = tts_model.model.infer(x_tst, x_tst_lengths, speaker_id, noise_scale, length_scale, noise_scale_w)[0]
-            if tone_color:
-                source_se = en_source_style_se if lang == "en" else zh_source_se
-                audio = tone_color_converter.model.voice_conversion(audio, x_tst_lengths, source_se, target_se, torch.tensor(0.3))[0]
-
-        audio = audio.squeeze().cpu().numpy()
-        output_path = f"{output_dir}/output_audio.wav"
-        Audio(audio, rate=tts_model.hps.data.sampling_rate).save(output_path)
-
-        return output_path
-
-    return predict
-
-OUTPUT_DIR = "output_audio"
-os.makedirs(OUTPUT_DIR, exist_ok=True)
+        with gr.Row():
+            input_text = gr.Textbox(label="Text to speak", placeholder="Enter text here (2-200 characters)")
+            style = gr.Dropdown(
+                label="Style",
+                choices=["default", "whispering", "cheerful", "terrified", "angry", "sad", "friendly"],
+                value="default"
+            )
+
+        with gr.Row():
+            reference_audio = gr.Audio(label="Reference Audio", type="filepath")
+            tau_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, label="Tau (Voice similarity)", info="Higher values make the output more similar to the reference voice")
 
-predict_fn = build_predict(
-    OUTPUT_DIR,
-    tone_color_converter,
-    en_base_speaker_tts,
-    zh_base_speaker_tts,
-    en_source_default_se,
-    en_source_style_se,
-    zh_source_se,
-    supported_languages,
-)
+        submit_button = gr.Button("Generate Voice")
+
+        output_text = gr.Textbox(label="Status")
+        output_audio = gr.Audio(label="Generated Audio")
 
-def gradio_interface():
-    input_text = gr.Textbox(lines=2, placeholder="Enter text here...")
-    reference_audio = gr.Audio(type="filepath", label="Reference Audio")
-    speaker = gr.Dropdown(choices=reference_speakers, value="record_manually", label="Select Speaker")
-    noise_scale = gr.Slider(minimum=0.1, maximum=1.0, value=0.667, label="Noise Scale")
-    length_scale = gr.Slider(minimum=0.1, maximum=2.0, value=1.0, label="Length Scale")
-    noise_scale_w = gr.Slider(minimum=0.1, maximum=1.0, value=0.8, label="Noise Scale W")
-    tone_color = gr.Checkbox(value=False, label="Enable Tone Color Conversion")
+        submit_button.click(
+            predict,
+            inputs=[input_text, style, reference_audio, tau_slider],
+            outputs=[output_text, output_audio]
+        )
 
-    gr.Interface(
-        fn=predict_fn,
-        inputs=[input_text, reference_audio, speaker, noise_scale, length_scale, noise_scale_w, tone_color],
-        outputs=gr.Audio(type="filepath", label="Generated Audio"),
-        title="Speech Generation and Tone Conversion",
-        description="Generate speech and convert tone using the OpenVoice model.",
-    ).launch()
+        return demo
 
-# end
+# Launch the demo
+if __name__ == "__main__":
+    demo = create_demo()
+    demo.launch()
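
For reference, the new predict() pipeline can be exercised without the Gradio UI. The sketch below is illustrative only and is not part of the commit: it assumes app.py is importable from the working directory (importing it runs the module-level checkpoint download and model loading, since demo.launch() is guarded by __name__ == "__main__") and that a short reference recording exists at the placeholder path reference.wav.

# Minimal smoke test of the new predict() pipeline (sketch, not part of the commit).
# "reference.wav" is a placeholder path for any short, clean speech recording.
from app import predict

status, wav_path = predict(
    prompt="Hello, this is a quick voice cloning test.",
    style="default",            # one of the styles exposed in the Style dropdown
    audio_file_pth="reference.wav",
    tau=0.7,                    # same default as the Tau slider
)
print(status)    # "Voice cloning completed successfully." on success
print(wav_path)  # path to outputs/output.wav, or None if an error message was returned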
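
The cloning step hinges on the tone-color embedding that se_extractor.get_se derives from the reference clip inside predict(). A small sketch for inspecting that embedding, under the same assumptions as above (importable app.py, placeholder reference.wav):

# Inspect the target speaker embedding that tone_color_converter.convert() receives.
import openvoice.se_extractor as se_extractor
from app import tone_color_converter, OUTPUT_DIR

target_se, audio_name = se_extractor.get_se(
    "reference.wav", tone_color_converter, target_dir=OUTPUT_DIR, vad=True
)
print(audio_name)       # identifier derived from the reference clip
print(target_se.shape)  # a small torch tensor; the removed OpenVINO code used (1, 256, 1) as its example shape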