# Firefly-Neko / firefly-neko-txt.py
# Text chat with a Qwen2.5 persona, spoken aloud via GPT-SoVITS.
import os
import re
import sys
import threading
import time
import wave

import nltk
import pyaudio
import soundfile as sf
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer

from tools.i18n.i18n import I18nAuto

# GPT-SoVITS modules live in the vendored repo, so put it on the import path
# before pulling in inference_webui.
sys.path.append('./GPT-SoVITS-v2-240821/GPT_SoVITS')
from inference_webui import change_gpt_weights, change_sovits_weights, get_tts_wav

i18n = I18nAuto()

# POS taggers required by GPT-SoVITS's English text frontend.
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
def synthesize(GPT_model_path, SoVITS_model_path, ref_audio_path, ref_text_path,
               ref_language, target_text_path, target_language, output_path):
    # Read reference text
    with open(ref_text_path, 'r', encoding='utf-8') as file:
        ref_text = file.read()

    # Read target text
    with open(target_text_path, 'r', encoding='utf-8') as file:
        target_text = file.read()

    # Change model weights
    change_gpt_weights(gpt_path=GPT_model_path)
    change_sovits_weights(sovits_path=SoVITS_model_path)

    # Synthesize audio; get_tts_wav is a generator of (sampling_rate, audio) pairs
    synthesis_result = get_tts_wav(ref_wav_path=ref_audio_path,
                                   prompt_text=ref_text,
                                   prompt_language=i18n(ref_language),
                                   text=target_text,
                                   text_language=i18n(target_language),
                                   top_p=1, temperature=1)
    result_list = list(synthesis_result)

    if result_list:
        last_sampling_rate, last_audio_data = result_list[-1]
        os.makedirs(output_path, exist_ok=True)
        output_wav_path = os.path.join(output_path, "output.wav")
        sf.write(output_wav_path, last_audio_data, last_sampling_rate)
        print(f"Audio saved to {output_wav_path}")
# Launcher script for the bundled GPT-SoVITS build (not used below).
bat_file_path = 'GPT-SoVITS-v2-240821\\go-cli.bat'

model_name = "model/Qwen2.5-7B-Instruct"

print("Initializing...")

# The character's persona, used as the system prompt.
with open('background.txt', 'r', encoding='utf-8') as file:
    background = file.read()
def extract_language(text):
    # Strip stage directions (action/emote asides) before synthesis:
    # full-width parentheses (...) and lenticular brackets 【...】.
    text = re.sub(r'([^)]*)', '', text)
    text = re.sub(r'【[^】]*】', '', text)
    return text
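# For example, extract_language("喵~(waves tail)【happy】Hello!") returns
# "喵~Hello!" (the sample input is illustrative only).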
def play_wav(file_path):
    # Stream the WAV file to the default output device in 1024-frame chunks.
    with wave.open(file_path, 'rb') as wf:
        p = pyaudio.PyAudio()
        stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
                        channels=wf.getnchannels(),
                        rate=wf.getframerate(),
                        output=True)
        data = wf.readframes(1024)
        while data:
            stream.write(data)
            data = wf.readframes(1024)
        stream.stop_stream()
        stream.close()
        p.terminate()
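# Note: play_wav blocks until playback finishes; if the next prompt should be
# accepted while audio is still playing, this could run on a background thread.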
# 4-bit quantization config: NF4 weights with double quantization and fp16
# compute, so the 7B model fits in consumer-GPU VRAM.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True
)
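# Rough memory math: 7B parameters at 4 bits is about 3.5 GB of weights,
# versus roughly 14 GB in fp16 (before activations and the KV cache).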
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
print("初始化完成!输入exit退出")
while 1:
prompt = input("用户:")
if prompt == 'exit':
break
start_time = time.time()
messages = [
{"role": "system", "content": background},
{"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True
)
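    # With Qwen2.5's ChatML template, `text` looks roughly like:
    #   <|im_start|>system\n{background}<|im_end|>\n
    #   <|im_start|>user\n{prompt}<|im_end|>\n
    #   <|im_start|>assistant\n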
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    print("LLM Firefly Neko: ", end="", flush=True)
    response = ""
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    # Run generation on a background thread so the streamer can be consumed
    # while tokens are still being produced (TextIteratorStreamer's intended use).
    generation_kwargs = dict(model_inputs, streamer=streamer, max_new_tokens=512)
    generation_thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    generation_thread.start()
    for chunk in streamer:
        if chunk:
            print(chunk, end="", flush=True)
            response += chunk
    generation_thread.join()
    print()
    print("Firefly Neko response time:", time.time() - start_time)
    # Strip stage directions, save the reply, and synthesize speech with the
    # Firefly GPT-SoVITS voice ("中文" selects Chinese in the i18n layer).
    target_text = extract_language(response)
    with open('target_text.txt', 'w', encoding='utf-8') as file:
        file.write(target_text)
    synthesize("GPT_weights_v2/流萤-e10.ckpt", "SoVITS_weights_v2/流萤_e15_s810.pth",
               "firefly/ref_audio/example.wav", "ref_text.txt", "中文",
               "target_text.txt", "中文", "output")
    print("Synthesis complete, elapsed:", time.time() - start_time)
    # print("Firefly Neko:", response)
    play_wav("output/output.wav")