File size: 5,865 Bytes

4d6be10

import os
import time
import re
import wave
import pyaudio
import subprocess
import numpy as np
from funasr import AutoModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Batch file executed through subprocess.run later in this script
# (presumably the GPT-SoVITS command-line synthesis entry point — confirm).
bat_file_path = 'GPT-SoVITS-v2-240821\\go-cli.bat'
# Path handed to AutoModelForCausalLM / AutoTokenizer.from_pretrained below.
model_name = "model/Qwen2.5-7B-Instruct"
print("初始化中...")

# System prompt used for the main chat reply generation.
with open('background.txt', 'r', encoding='utf-8') as file:
        background = file.read()
# System prompt used by correct() when post-editing the speech-to-text output.
with open('STT-background.txt', 'r', encoding='utf-8') as file:
        stt_background = file.read()

def extract_language(text):
    """Return *text* with bracketed asides removed.

    Two kinds of spans are deleted, delimiters included:
    full-width parentheses ``（…）`` and lenticular brackets ``【…】``.
    The parenthesis pass runs first, then the bracket pass runs on its result.
    """
    aside_patterns = (r'（[^）]*）', r'【[^】]*】')
    for pattern in aside_patterns:
        text = re.sub(pattern, '', text)
    return text
    
def play_wav(file_path, frames_per_chunk=1024):
    """Play a WAV file through a PyAudio output stream, blocking until done.

    Args:
        file_path: Path of the WAV file to play.
        frames_per_chunk: Number of frames read from the file and written to
            the output stream per iteration (defaults to 1024).

    The output stream and the PyAudio instance are released even when opening
    the stream or writing to it raises, so a failed playback does not leak
    audio resources.
    """
    with wave.open(file_path, 'rb') as wf:
        p = pyaudio.PyAudio()
        try:
            stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
                            channels=wf.getnchannels(),
                            rate=wf.getframerate(),
                            output=True)
            try:
                data = wf.readframes(frames_per_chunk)
                # readframes() returns empty bytes at end of file.
                while data:
                    stream.write(data)
                    data = wf.readframes(frames_per_chunk)
                stream.stop_stream()
            finally:
                # Closing is required on both the success and the error path.
                stream.close()
        finally:
            p.terminate()

# 4-bit quantization configuration (NF4 quant type, float16 compute dtype,
# double quantization enabled).
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True
)

# Load the chat model from model_name with the quantization settings above;
# device placement is delegated to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Runs the batch file once during start-up and waits for it to finish.
# NOTE(review): presumably a warm-up / sanity run of the TTS pipeline — confirm
# this is intended, since it executes before any reply text has been written.
subprocess.run([bat_file_path], shell=True)

def correct(sentence):
    """Post-edit a transcribed sentence with the chat model.

    The sentence is sent as the user turn of a two-message conversation whose
    system turn is ``stt_background``; the model's reply is returned as text.

    Args:
        sentence: Raw text to be corrected.

    Returns:
        The decoded reply for the single prompt, with special tokens skipped.
    """
    conversation = [
        {"role": "system", "content": stt_background},
        {"role": "user", "content": sentence},
    ]
    prompt_text = tokenizer.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )
    encoded = tokenizer([prompt_text], return_tensors="pt").to(model.device)

    sequences = model.generate(**encoded, max_new_tokens=512)

    # Drop the leading prompt-length tokens from every output sequence so
    # only the newly generated part is decoded.
    completions = []
    for prompt_ids, sequence in zip(encoded.input_ids, sequences):
        completions.append(sequence[len(prompt_ids):])

    decoded = tokenizer.batch_decode(completions, skip_special_tokens=True)
    return decoded[0]

# Directory that holds the local speech model folders referenced below.
model_dir = "model"

# FunASR pipeline used by stt(): a recognition model plus the VAD and
# punctuation models passed through the vad_model / punc_model arguments.
# NOTE(review): disable_update=True presumably turns off FunASR's update
# check — confirm against the FunASR documentation.
stt_model = AutoModel(
    model=f"{model_dir}/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch", 
    vad_model=f"{model_dir}/speech_fsmn_vad_zh-cn-16k-common-pytorch", 
    punc_model=f"{model_dir}/punc_ct-transformer_cn-en-common-vocab471067-large",  
    disable_update=True
)

def stt():
    """Record one utterance from the microphone and return its corrected text.

    Audio is read from a mono, 16 kHz, 16-bit input stream in 3-second chunks.
    A chunk whose absolute peak exceeds ``sound_threshold`` is written to a
    temporary WAV file and transcribed with ``stt_model``; recognised text is
    appended to the running transcript. Once some text has been collected and
    a quiet chunk arrives more than ``wait_time`` seconds after the last
    recognised text, the transcript is passed through ``correct()`` and the
    corrected string is returned.

    Returns:
        The corrected transcript produced by ``correct()``.

    The input stream and the PyAudio instance are always released, including
    when opening the stream, reading, or transcription raises.
    """
    sample_rate = 16000
    chunk_size = sample_rate * 3  # frames per read: 3 s of audio
    sound_threshold = 500         # peak amplitude treated as "sound present"
    wait_time = 1                 # seconds of quiet before finalising
    temp_wav_path = "temp_chunk.wav"

    cache = {}
    result_text = ""
    no_sound_start_time = time.time()

    p = pyaudio.PyAudio()
    stream = None
    try:
        # Opened inside the try so p.terminate() still runs if this fails.
        stream = p.open(format=pyaudio.paInt16,
                        channels=1,
                        rate=sample_rate,
                        input=True,
                        frames_per_buffer=chunk_size)
        while True:
            audio_data = stream.read(chunk_size)
            speech_chunk = np.frombuffer(audio_data, dtype=np.int16)
            # Use the absolute peak so a chunk whose loudest sample is negative
            # still counts; widen first because abs(-32768) overflows int16.
            peak = np.abs(speech_chunk.astype(np.int32)).max()
            if peak > sound_threshold:
                # Hand the chunk to the recogniser through a temporary file.
                with wave.open(temp_wav_path, 'wb') as wf:
                    wf.setnchannels(1)
                    wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
                    wf.setframerate(sample_rate)
                    wf.writeframes(speech_chunk.tobytes())
                try:
                    # NOTE(review): cache / is_final / chunk_size look like
                    # streaming-API arguments — confirm this model uses them.
                    res = stt_model.generate(input=temp_wav_path, cache=cache, is_final=False, chunk_size=chunk_size)
                finally:
                    # Remove the temp file even when transcription fails.
                    os.remove(temp_wav_path)
                if res and res[0]["text"]:
                    result_text += res[0]["text"]
                    print("未修改：", result_text)
                    no_sound_start_time = time.time()
            elif result_text and time.time() - no_sound_start_time > wait_time:
                print("已停顿，开始修正")
                # Run the (expensive) correction once and reuse the result so
                # the printed text and the returned text are the same.
                corrected_text = correct(result_text)
                print("Qwen2.5修正：", corrected_text)
                return corrected_text
    finally:
        if stream is not None:
            stream.stop_stream()
            stream.close()
        p.terminate()

print("初始化完成！")

# Main conversation loop. Each round: get a spoken prompt via stt(), generate
# a chat reply, write the reply (with bracketed asides removed) to the
# GPT-SoVITS target text file, run the batch file, then play the output WAV.
# An exit check on the prompt was sketched here previously but is disabled,
# so the loop only ends when the process is interrupted.
while True:
    prompt = stt()

    chat_text = tokenizer.apply_chat_template(
        [
            {"role": "system", "content": background},
            {"role": "user", "content": prompt},
        ],
        tokenize=False,
        add_generation_prompt=True,
    )
    encoded = tokenizer([chat_text], return_tensors="pt").to(model.device)

    sequences = model.generate(**encoded, max_new_tokens=512)

    # Keep only the tokens after the prompt-length prefix of each sequence.
    completions = []
    for prompt_ids, sequence in zip(encoded.input_ids, sequences):
        completions.append(sequence[len(prompt_ids):])

    response = tokenizer.batch_decode(completions, skip_special_tokens=True)[0]

    # The text file receives the reply without （…）/【…】 asides; the console
    # print below shows the unfiltered reply.
    target_text = extract_language(response)
    with open('GPT-SoVITS-v2-240821/target_text.txt', 'w', encoding='utf-8') as target_file:
        target_file.write(target_text)

    subprocess.run([bat_file_path], shell=True)
    print("流萤猫酱:", response)
    play_wav("GPT-SoVITS-v2-240821/output/output.wav")