import re
import wave
import pyaudio
import time
import os
import soundfile as sf
import sys
import nltk
from threading import Thread
from tools.i18n.i18n import I18nAuto
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer

# Make the GPT-SoVITS inference code importable
sys.path.append('./GPT-SoVITS-v2-240821/GPT_SoVITS')
from inference_webui import change_gpt_weights, change_sovits_weights, get_tts_wav

i18n = I18nAuto()
# POS tagger data required by the TTS English text frontend
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')


def synthesize(GPT_model_path, SoVITS_model_path, ref_audio_path, ref_text_path, ref_language,
               target_text_path, target_language, output_path):
    # Read reference text
    with open(ref_text_path, 'r', encoding='utf-8') as file:
        ref_text = file.read()

    # Read target text
    with open(target_text_path, 'r', encoding='utf-8') as file:
        target_text = file.read()

    # Change model weights
    change_gpt_weights(gpt_path=GPT_model_path)
    change_sovits_weights(sovits_path=SoVITS_model_path)

    # Synthesize audio
    synthesis_result = get_tts_wav(ref_wav_path=ref_audio_path,
                                   prompt_text=ref_text,
                                   prompt_language=i18n(ref_language),
                                   text=target_text,
                                   text_language=i18n(target_language),
                                   top_p=1,
                                   temperature=1)

    result_list = list(synthesis_result)

    if result_list:
        # Keep the last (complete) chunk yielded by the generator and write it to disk
        last_sampling_rate, last_audio_data = result_list[-1]
        output_wav_path = os.path.join(output_path, "output.wav")
        sf.write(output_wav_path, last_audio_data, last_sampling_rate)
        print(f"Audio saved to {output_wav_path}")


bat_file_path = 'GPT-SoVITS-v2-240821\\go-cli.bat'
model_name = "model/Qwen2.5-7B-Instruct"

print("Initializing...")
# System prompt / persona description for the LLM
with open('background.txt', 'r', encoding='utf-8') as file:
    background = file.read()


def extract_language(text):
    # Strip parenthesized and bracketed asides (emotes, stage directions) before TTS
    text = re.sub(r'\([^)]*\)|（[^）]*）', '', text)
    text = re.sub(r'【[^】]*】', '', text)
    return text


def play_wav(file_path):
    # Play a WAV file through the default audio output device
    with wave.open(file_path, 'rb') as wf:
        p = pyaudio.PyAudio()
        stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
                        channels=wf.getnchannels(),
                        rate=wf.getframerate(),
                        output=True)
        data = wf.readframes(1024)
        while data:
            stream.write(data)
            data = wf.readframes(1024)
        stream.stop_stream()
        stream.close()
        p.terminate()


# 4-bit (NF4) quantization configuration
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
print("Initialization complete! Type exit to quit")

while True:
    prompt = input("User: ")
    if prompt == 'exit':
        break
    start_time = time.time()

    messages = [
        {"role": "system", "content": background},
        {"role": "user", "content": prompt}
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    print("LLM 流萤猫酱:", end="", flush=True)
    response = ""
    streamer = TextIteratorStreamer(tokenizer=tokenizer, skip_prompt=True, skip_special_tokens=True)
    # Run generation in a background thread so tokens can be printed as they stream in
    generation_kwargs = dict(model_inputs, streamer=streamer, max_new_tokens=512)
    generation_thread = Thread(target=model.generate, kwargs=generation_kwargs)
    generation_thread.start()
    for new_text in streamer:
        if new_text:
            print(new_text, end="", flush=True)
            response += new_text
    generation_thread.join()
    print("")
    print("流萤猫酱 elapsed:", time.time() - start_time)

    # Clean the reply and hand it to GPT-SoVITS for synthesis
    target_text = extract_language(response)
    with open('target_text.txt', 'w', encoding='utf-8') as file:
        file.write(target_text)
    synthesize("GPT_weights_v2/流萤-e10.ckpt",
               "SoVITS_weights_v2/流萤_e15_s810.pth",
               "firefly/ref_audio/example.wav",
               "ref_text.txt", "中文",
               "target_text.txt", "中文",
               "output")
    print("Synthesis complete, elapsed:", time.time() - start_time)
    # print("流萤猫酱:", response)
    play_wav("output/output.wav")