Firefly-Neko / firefly-neko-txt.py

Upload firefly-neko-txt.py with huggingface_hub

530fe46 verified 3 months ago

4.49 kB

	import re
	import wave
	import pyaudio
	import time
	import os
	import soundfile as sf
	import sys
	import nltk
	from tools.i18n.i18n import I18nAuto
	from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer
	# Extend the import path before importing `inference_webui` below; the
	# order matters (presumably that module lives in this directory — confirm).
	sys.path.append('./GPT-SoVITS-v2-240821/GPT_SoVITS')
	from inference_webui import change_gpt_weights, change_sovits_weights, get_tts_wav
	# Callable applied to the language names before they are handed to get_tts_wav.
	i18n = I18nAuto()
	# Fetch NLTK tagger data at startup (presumably required by the TTS text
	# front end for English input — confirm).
	nltk.download('averaged_perceptron_tagger')
	nltk.download('averaged_perceptron_tagger_eng')

	def synthesize(GPT_model_path, SoVITS_model_path, ref_audio_path, ref_text_path, ref_language, target_text_path, target_language, output_path):
	    """Synthesize speech for a text file and save it as ``output.wav``.

	    Reads the reference transcript and the target text from disk, switches
	    the GPT and SoVITS weights, runs ``get_tts_wav`` and writes the last
	    ``(sampling_rate, audio_data)`` item it yields to
	    ``<output_path>/output.wav``.

	    Args:
	        GPT_model_path: Checkpoint path passed to ``change_gpt_weights``.
	        SoVITS_model_path: Checkpoint path passed to ``change_sovits_weights``.
	        ref_audio_path: Reference audio passed as ``ref_wav_path``.
	        ref_text_path: UTF-8 text file holding the reference transcript.
	        ref_language: Language name of the reference text (passed through ``i18n``).
	        target_text_path: UTF-8 text file holding the text to speak.
	        target_language: Language name of the target text (passed through ``i18n``).
	        output_path: Directory that receives ``output.wav``; created if missing.

	    Returns:
	        None. If the synthesizer yields nothing, no file is written.
	    """
	    # Read reference text
	    with open(ref_text_path, 'r', encoding='utf-8') as ref_file:
	        ref_text = ref_file.read()

	    # Read target text
	    with open(target_text_path, 'r', encoding='utf-8') as target_file:
	        target_text = target_file.read()

	    # Change model weights
	    change_gpt_weights(gpt_path=GPT_model_path)
	    change_sovits_weights(sovits_path=SoVITS_model_path)

	    # Synthesize audio
	    synthesis_result = get_tts_wav(ref_wav_path=ref_audio_path,
	                                   prompt_text=ref_text,
	                                   prompt_language=i18n(ref_language),
	                                   text=target_text,
	                                   text_language=i18n(target_language), top_p=1, temperature=1)

	    # Only the final yielded item is saved, so drain the generator while
	    # keeping a single item instead of building a list of every chunk.
	    last_result = None
	    for last_result in synthesis_result:
	        pass

	    if last_result is None:
	        # Make the no-output case visible instead of returning silently.
	        print("No audio was generated; nothing was saved.")
	        return

	    last_sampling_rate, last_audio_data = last_result
	    if output_path:
	        # sf.write does not create missing directories.
	        os.makedirs(output_path, exist_ok=True)
	    output_wav_path = os.path.join(output_path, "output.wav")
	    sf.write(output_wav_path, last_audio_data, last_sampling_rate)
	    print(f"Audio saved to {output_wav_path}")

	# Path to the GPT-SoVITS CLI launcher; not referenced by the visible code.
	bat_file_path = 'GPT-SoVITS-v2-240821\\go-cli.bat'
	# Local directory of the chat model loaded further down.
	model_name = "model/Qwen2.5-7B-Instruct"
	print("初始化中...")
	# The whole file becomes the system prompt of every conversation turn.
	with open('background.txt', encoding='utf-8') as file:
	    background = file.read()

	def extract_language(text):
	    """Return ``text`` with bracketed asides removed.

	    Spans enclosed in full-width parentheses ``（…）`` are stripped first,
	    then spans enclosed in lenticular brackets ``【…】``. ASCII brackets and
	    unclosed brackets are left untouched.
	    """
	    # Applied one after the other, in this order: the second pattern sees
	    # the output of the first.
	    for bracketed in (r'（[^）]*）', r'【[^】]*】'):
	        text = re.sub(bracketed, '', text)
	    return text

	def play_wav(file_path):
	    """Play a WAV file through PyAudio and return once every frame is written.

	    Args:
	        file_path: Path of the WAV file to play.

	    The output stream and the PyAudio instance are released even when
	    opening the stream or writing to it fails.
	    """
	    with wave.open(file_path, 'rb') as wf:
	        p = pyaudio.PyAudio()
	        try:
	            stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
	                            channels=wf.getnchannels(),
	                            rate=wf.getframerate(),
	                            output=True)
	            try:
	                # readframes returns an empty bytes object at end of file.
	                data = wf.readframes(1024)
	                while data:
	                    stream.write(data)
	                    data = wf.readframes(1024)
	                stream.stop_stream()
	            finally:
	                stream.close()
	        finally:
	            p.terminate()

	# 4-bit quantization settings: NF4 quant type, double quantization,
	# float16 compute dtype.
	quantization_config = BitsAndBytesConfig(
	    bnb_4bit_quant_type="nf4",
	    bnb_4bit_use_double_quant=True,
	    bnb_4bit_compute_dtype="float16",
	    load_in_4bit=True,
	)

	# Load the chat model with the quantization settings above, then its tokenizer.
	model = AutoModelForCausalLM.from_pretrained(
	    model_name,
	    device_map="auto",
	    torch_dtype="auto",
	    quantization_config=quantization_config,
	)
	tokenizer = AutoTokenizer.from_pretrained(model_name)
	print("初始化完成！输入exit退出")

	from threading import Thread


	def _stream_reply(user_prompt):
	    """Generate the model's reply to ``user_prompt``, echoing it as it streams.

	    Generation runs in a worker thread so the streamer can be consumed while
	    tokens are still being produced. Returns the complete reply text; an
	    exception raised during generation is re-raised in the calling thread.
	    """
	    messages = [
	        {"role": "system", "content": background},
	        {"role": "user", "content": user_prompt}
	    ]
	    chat_text = tokenizer.apply_chat_template(
	        messages,
	        tokenize=False,
	        add_generation_prompt=True
	    )
	    model_inputs = tokenizer([chat_text], return_tensors="pt").to(model.device)

	    # NOTE(review): buffer_size is not a documented TextIteratorStreamer
	    # argument; kept as in the original call — confirm it is needed.
	    streamer = TextIteratorStreamer(tokenizer=tokenizer, skip_prompt=True, skip_special_tokens=True, buffer_size=1)
	    errors = []

	    def _generate():
	        try:
	            # Pass the whole encoding so the attention mask goes with input_ids.
	            model.generate(**model_inputs, streamer=streamer, max_new_tokens=512)
	        except Exception as exc:
	            errors.append(exc)
	            # Close the stream so the consumer loop below does not block forever.
	            streamer.end()

	    worker = Thread(target=_generate)
	    worker.start()

	    pieces = []
	    for piece in streamer:
	        if piece:
	            print(piece, end="", flush=True)
	            pieces.append(piece)
	    worker.join()

	    if errors:
	        raise errors[0]
	    return "".join(pieces)


	# Interactive loop: read a prompt, stream the reply, then speak it.
	while True:
	    prompt = input("用户：")
	    if prompt == 'exit':
	        break
	    start_time = time.time()

	    print("LLM 流萤猫酱：", end="", flush=True)
	    response = _stream_reply(prompt)
	    print("")
	    print("流萤猫酱耗时：", time.time() - start_time)

	    target_text = extract_language(response)
	    if not target_text.strip():
	        # Nothing left to speak once bracketed asides are removed; skip TTS
	        # so a stale output.wav from an earlier turn is not replayed.
	        continue

	    with open('target_text.txt', 'w', encoding='utf-8') as file:
	        file.write(target_text)
	    synthesize("GPT_weights_v2/流萤-e10.ckpt", "SoVITS_weights_v2/流萤_e15_s810.pth", "firefly/ref_audio/example.wav", "ref_text.txt", "中文", "target_text.txt", "中文", "output")
	    print("合成完成，耗时：", time.time() - start_time)
	    play_wav("output/output.wav")