Shashashasha's picture
Update app.py
afefb87
raw
history blame contribute delete
4.15 kB
import json
from pathlib import Path
import gradio as gr
import librosa
import numpy as np
import torch
from huggingface_hub import hf_hub_download, list_repo_files
from so_vits_svc_fork.hparams import HParams
from so_vits_svc_fork.inference.core import Svc
##########################################################
# REPLACE THESE VALUES TO CHANGE THE MODEL REPO/CKPT NAME
##########################################################
repo_id = "Shashashasha/katya-zaychick"
ckpt_name = None  # None will pick latest
##########################################################


def _latest_generator_name(filenames):
    """Return the ``G_<step>.pth`` name with the highest step in *filenames*.

    Ex. if the repo has: G_0.pth, G_100.pth, G_200.pth, we'd use G_200.pth.
    Files that look like generators but carry a non-numeric step
    (e.g. ``G_final.pth``) are skipped instead of aborting the selection.

    Raises:
        FileNotFoundError: if no numbered generator checkpoint is present.
    """
    steps = []
    for name in filenames:
        if not (name.startswith("G_") and name.endswith(".pth")):
            continue
        try:
            steps.append(int(Path(name).stem.split("_")[1]))
        except ValueError:
            # Not a numbered checkpoint; it cannot take part in "latest".
            continue
    if not steps:
        raise FileNotFoundError(
            f"No generator checkpoint named G_<step>.pth found in repo {repo_id!r}"
        )
    return f"G_{max(steps)}.pth"


# Figure out the latest generator by taking the highest-numbered one,
# unless a checkpoint name was pinned explicitly above.
if ckpt_name is None:
    ckpt_name = _latest_generator_name(list_repo_files(repo_id))
# Fetch the generator weights and the training config from the Hub repo.
generator_path = hf_hub_download(repo_id, ckpt_name)
config_path = hf_hub_download(repo_id, "config.json")

# JSON is UTF-8 by specification; decode explicitly so non-ASCII speaker
# names do not depend on the host's locale encoding.
hparams = HParams(**json.loads(Path(config_path).read_text(encoding="utf-8")))

# Speaker names offered in the UI dropdowns, taken from the config's "spk" entry.
speakers = list(hparams.spk.keys())

# Run on the GPU when torch reports one, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = Svc(
    net_g_path=generator_path,
    config_path=config_path,
    device=device,
    cluster_model_path=None,
)
def predict(
    speaker,
    audio,
    transpose: int = 0,
    auto_predict_f0: bool = False,
    cluster_infer_ratio: float = 0,
    noise_scale: float = 0.4,
    f0_method: str = "crepe",
    db_thresh: int = -40,
    pad_seconds: float = 0.5,
    chunk_seconds: float = 0.5,
    absolute_thresh: bool = False,
):
    """Convert the clip at *audio* to the voice of *speaker*.

    The file is loaded with librosa at ``model.target_sample`` Hz, cast to
    float32 and handed to ``model.infer_silence``; every other argument is
    forwarded to that call unchanged.

    Args:
        speaker: Target speaker name (one of the entries in ``speakers``).
        audio: Path to the source audio file. ``None`` or an empty path
            (nothing recorded/uploaded) is rejected.
        transpose, auto_predict_f0, cluster_infer_ratio, noise_scale,
        f0_method, db_thresh, pad_seconds, chunk_seconds, absolute_thresh:
            Inference options passed through to ``model.infer_silence``.

    Returns:
        A ``(sample_rate, samples)`` pair: ``model.target_sample`` and the
        output of ``model.infer_silence``.

    Raises:
        ValueError: if no source audio was provided.
    """
    # Fail early with a clear message instead of an opaque loader error
    # when the form is submitted without any audio.
    if not audio:
        raise ValueError("No source audio provided: record or upload a clip first.")

    samples, _ = librosa.load(audio, sr=model.target_sample)
    converted = model.infer_silence(
        samples.astype(np.float32),
        speaker=speaker,
        transpose=transpose,
        auto_predict_f0=auto_predict_f0,
        cluster_infer_ratio=cluster_infer_ratio,
        noise_scale=noise_scale,
        f0_method=f0_method,
        db_thresh=db_thresh,
        pad_seconds=pad_seconds,
        chunk_seconds=chunk_seconds,
        absolute_thresh=absolute_thresh,
    )
    return model.target_sample, converted
# Blurb passed as `description` to the Gradio interfaces (user-facing runtime
# text, kept exactly as written). Nothing is interpolated, so this is a plain
# string literal rather than an f-string.
description = """
Это катя нейросеть! Настройки не крутите. А если вам пофиг крутите ломайте!
А что тут написать?
""".strip()

# HTML snippet passed as `article` to the Gradio interfaces: a centered link
# to the so-vits-svc-fork repository.
article = """
<p style='text-align: center'>
<a href='https://github.com/voicepaw/so-vits-svc-fork' target='_blank'>Github Repo</a>
</p>
""".strip()
def _build_interface(source, title):
    """Build one conversion form whose audio input comes from *source*.

    Every call creates its own component instances, just as the two
    hand-written interfaces did, so nothing is shared between the forms.
    The inputs are listed in the order of `predict`'s first seven
    parameters; the remaining parameters keep their defaults.

    Args:
        source: Value for the `source` argument of `gr.Audio`
            ("microphone" or "upload").
        title: Title passed to `gr.Interface`.
    """
    return gr.Interface(
        predict,
        inputs=[
            gr.Dropdown(speakers, value=speakers[0], label="Target Speaker"),
            gr.Audio(type="filepath", source=source, label="Source Audio"),
            gr.Slider(-12, 12, value=0, step=1, label="Transpose (Semitones)"),
            gr.Checkbox(False, label="Auto Predict F0"),
            gr.Slider(0.0, 1.0, value=0.0, step=0.1, label='cluster infer ratio'),
            gr.Slider(0.0, 1.0, value=0.4, step=0.1, label="noise scale"),
            gr.Dropdown(
                choices=["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"],
                value='crepe',
                label="f0 method",
            ),
        ],
        outputs="audio",
        title=title,
        description=description,
        article=article,
    )


# NOTE(review): the two forms carry different titles, and the second one names
# a different character than `description` does -- confirm this is intended.
interface_mic = _build_interface("microphone", "Voice Cloning")
interface_file = _build_interface("upload", "Йоши Нейросеть")
# Combine both input modes into one app: the first tab records from the
# microphone, the second takes an uploaded audio file.
interface = gr.TabbedInterface([interface_mic, interface_file], ["Микрофон", "Аудио файл"])

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    interface.launch()