|
import re
|
|
import wave
|
|
import pyaudio
|
|
import time
|
|
import os
|
|
import soundfile as sf
|
|
import sys
|
|
import nltk
|
|
from tools.i18n.i18n import I18nAuto
|
|
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer
|
|
sys.path.append('./GPT-SoVITS-v2-240821/GPT_SoVITS')
|
|
from inference_webui import change_gpt_weights, change_sovits_weights, get_tts_wav
|
|
# Translator callable; synthesize() applies it to the language labels before
# handing them to get_tts_wav.
i18n = I18nAuto()

# Fetch the NLTK tagger resources at start-up, in the same order as before.
# NOTE(review): presumably needed by the GPT-SoVITS English text frontend - confirm.
for _nltk_resource in ('averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng'):
    nltk.download(_nltk_resource)
|
|
|
|
def synthesize(GPT_model_path, SoVITS_model_path, ref_audio_path, ref_text_path, ref_language, target_text_path, target_language, output_path):
    """Synthesize speech for a text file and save it as ``output.wav``.

    Reads the reference transcript and the target text from disk, switches the
    GPT and SoVITS weights, runs ``get_tts_wav`` and writes the last
    ``(sampling_rate, audio_data)`` pair it yields to
    ``<output_path>/output.wav``.

    Args:
        GPT_model_path: Checkpoint path passed to ``change_gpt_weights``.
        SoVITS_model_path: Checkpoint path passed to ``change_sovits_weights``.
        ref_audio_path: Reference audio, passed as ``ref_wav_path``.
        ref_text_path: UTF-8 text file containing the reference transcript.
        ref_language: Language label of the reference text (mapped via ``i18n``).
        target_text_path: UTF-8 text file containing the text to speak.
        target_language: Language label of the target text (mapped via ``i18n``).
        output_path: Directory that receives ``output.wav``; created if missing.

    Returns:
        The path of the written wav file, or ``None`` when ``get_tts_wav``
        yielded nothing (in which case no file is written).
    """
    with open(ref_text_path, 'r', encoding='utf-8') as file:
        ref_text = file.read()

    with open(target_text_path, 'r', encoding='utf-8') as file:
        target_text = file.read()

    change_gpt_weights(gpt_path=GPT_model_path)
    change_sovits_weights(sovits_path=SoVITS_model_path)

    synthesis_result = get_tts_wav(ref_wav_path=ref_audio_path,
                                   prompt_text=ref_text,
                                   prompt_language=i18n(ref_language),
                                   text=target_text,
                                   text_language=i18n(target_language), top_p=1, temperature=1)

    # Only the final yielded pair is saved, so drain the iterator while keeping
    # just the most recent item instead of holding every chunk in a list.
    last_result = None
    for last_result in synthesis_result:
        pass

    if last_result is None:
        return None

    last_sampling_rate, last_audio_data = last_result
    # sf.write does not create missing directories; make sure the target exists.
    os.makedirs(output_path, exist_ok=True)
    output_wav_path = os.path.join(output_path, "output.wav")
    sf.write(output_wav_path, last_audio_data, last_sampling_rate)
    print(f"Audio saved to {output_wav_path}")
    return output_wav_path
|
|
|
|
# Path of the GPT-SoVITS command-line launcher (not referenced by the code
# visible in this script).
bat_file_path = 'GPT-SoVITS-v2-240821\\go-cli.bat'

# Local directory of the chat model loaded further down.
model_name = "model/Qwen2.5-7B-Instruct"

print("初始化中...")

# The whole file becomes the system prompt of every conversation turn.
with open('background.txt', mode='r', encoding='utf-8') as file:
    background = file.read()
|
|
|
|
def extract_language(text):
    """Return *text* with bracketed asides removed, leaving only spoken words.

    Two kinds of spans are deleted, delimiters included:

    * parenthesized spans, written with ASCII ``(...)`` or full-width
      ``（...）`` parentheses;
    * spans enclosed in ``【...】``.

    Args:
        text: The raw reply text.

    Returns:
        The text with those spans stripped; text without any such span is
        returned unchanged.
    """
    # The parentheses must be matched literally. An unescaped "(...)" is a
    # regex group, which would match (and delete) every run of non-")" text.
    text = re.sub(r'[（(][^）)]*[）)]', '', text)
    text = re.sub(r'【[^】]*】', '', text)
    return text
|
|
|
|
def play_wav(file_path):
    """Play a WAV file through a PyAudio output stream, blocking until done.

    The stream is opened with the sample width, channel count and frame rate
    read from the file, and the audio is written in chunks of 1024 frames.

    Args:
        file_path: Path of the WAV file to play.

    Raises:
        Whatever ``wave.open`` or PyAudio raise; the stream and the PyAudio
        instance are released even when playback fails part-way.
    """
    with wave.open(file_path, 'rb') as wf:
        p = pyaudio.PyAudio()
        try:
            stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
                            channels=wf.getnchannels(),
                            rate=wf.getframerate(),
                            output=True)
            try:
                # readframes returns b'' once the file is exhausted.
                for data in iter(lambda: wf.readframes(1024), b''):
                    stream.write(data)
                stream.stop_stream()
            finally:
                stream.close()
        finally:
            # Always release PortAudio, even if opening or writing failed.
            p.terminate()
|
|
|
|
|
|
# 4-bit NF4 quantization with double quantization; compute runs in float16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

# Load the quantized chat model, letting transformers choose dtype and device
# placement, then the tokenizer from the same location.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    torch_dtype="auto",
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

print("初始化完成!输入exit退出")
|
|
|
|
# Interactive chat loop: read a user line, stream the model's reply to the
# console, strip bracketed asides, synthesize the remaining text and play it.
# Typing "exit" ends the loop.

# Imported here because only this loop needs it: generation runs on a worker
# thread so the streamer can be consumed while tokens are still being produced.
from threading import Thread

while True:
    prompt = input("用户:")
    if prompt == 'exit':
        break
    start_time = time.time()

    messages = [
        {"role": "system", "content": background},
        {"role": "user", "content": prompt},
    ]
    chat_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    model_inputs = tokenizer([chat_text], return_tensors="pt").to(model.device)

    print("LLM 流萤猫酱:", end="", flush=True)
    streamer = TextIteratorStreamer(tokenizer=tokenizer, skip_prompt=True, skip_special_tokens=True, buffer_size=1)

    # Calling generate() inline would block until the whole reply is finished
    # and only then let the loop below drain the queue, so nothing would be
    # streamed. Passing the full encoding also forwards the attention mask.
    generation_thread = Thread(
        target=model.generate,
        kwargs=dict(model_inputs, streamer=streamer, max_new_tokens=512),
    )
    generation_thread.start()

    pieces = []
    for piece in streamer:
        if piece:
            print(piece, end="", flush=True)
            pieces.append(piece)
    generation_thread.join()
    response = "".join(pieces)

    print("")
    print("流萤猫酱耗时:", time.time() - start_time)

    target_text = extract_language(response)
    if not target_text.strip():
        # Nothing left to speak (empty reply or only bracketed asides); skip
        # synthesis rather than feeding empty text to the TTS pipeline.
        continue

    with open('target_text.txt', 'w', encoding='utf-8') as file:
        file.write(target_text)

    synthesize("GPT_weights_v2/流萤-e10.ckpt", "SoVITS_weights_v2/流萤_e15_s810.pth", "firefly/ref_audio/example.wav", "ref_text.txt", "中文", "target_text.txt", "中文", "output")
    print("合成完成,耗时:", time.time() - start_time)

    play_wav("output/output.wav")
|
|
|