refactor(core): Architecturally decouple Audio, ASR, and TTS modules
This major refactoring decouples core functionality from the services layer, establishing clearer architectural boundaries and improving modularity. The changes follow the principles of separation of concerns and single responsibility, making the system more maintainable and extensible.
Key changes include:
1. **Decoupled Audio Subsystem (Strategy Pattern)**:
- The monolithic `AudioCapture` class has been split into multiple, single-responsibility components within the `audio/` directory.
- `base_capture.py` introduces an abstract base class for all capture strategies.
- `pyaudio_capture.py` and `aec_capture.py` provide concrete implementations for standard and echo-cancelled audio capture.
- The `audio/capture` package's `__init__.py` now exposes the `AudioCapture` Facade, which selects the appropriate capture strategy at runtime and hides implementation details from the rest of the application.
2. **Introduced PlayerService to Separate Concerns**:
- Resolved a reverse dependency in which `audio/player.py` depended on `services/mixins.py`.
- `audio/player.py` is now a pure playback helper (`play_audio`), stripped of all business logic; its sole responsibility is playing raw audio data.
- The player has been renamed to `services/audio_player_service.py`, whose `AudioPlayerService` handles all business logic previously in the player, such as processing `VoiceTask`, managing state, and updating history, and delegates actual playback to the pure `play_audio` helper.
3. **Elevated ASR and TTS to Top-Level Modules**:
- Relocated speech recognition models (`recognizers`) from `services/speech` to a new, dedicated `asr/` module.
- Relocated speech synthesis models (`generators`) from `services/audio` to a new, dedicated `tts/` module.
- This gives ASR and TTS first-class status within the project architecture, clarifying their roles as core, independent capabilities rather than sub-components of a generic service (a wiring sketch follows this list).
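
The sketch below is not part of the commit; it illustrates how the reorganized pieces are intended to wire together, assuming the constructor signatures visible in the diffs further down. The `SpeechStateMonitor` and `TTSAudioGenerator` stages are omitted for brevity, and `language="zh"` is illustrative only.

```python
from voice_dialogue.audio.capture import AudioCapture
from voice_dialogue.core.constants import (
    audio_frames_queue, user_voice_queue, transcribed_text_queue,
    text_input_queue, audio_output_queue,
)
from voice_dialogue.services import ASRService, AudioPlayerService, LLMService

# Core capability: capture only fills audio_frames_queue, no business logic.
capture = AudioCapture(audio_frames_queue=audio_frames_queue,
                       enable_echo_cancellation=True)

# Services own the business logic and talk to each other only through queues.
asr = ASRService(user_voice_queue=user_voice_queue,
                 transcribed_text_queue=transcribed_text_queue,
                 language="zh")
llm = LLMService(user_question_queue=transcribed_text_queue,
                 generated_answer_queue=text_input_queue)
player = AudioPlayerService(audio_playing_queue=audio_output_queue)

for worker in (capture, asr, llm, player):
    worker.start()
```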
- src/voice_dialogue/__init__.py +0 -14
- src/voice_dialogue/api/core/config.py +1 -1
- src/voice_dialogue/api/core/lifespan.py +1 -1
- src/voice_dialogue/api/core/service_factories.py +9 -10
- src/voice_dialogue/api/routes/asr_routes.py +1 -1
- src/voice_dialogue/api/routes/system_routes.py +1 -1
- src/voice_dialogue/api/routes/tts_routes.py +1 -1
- src/voice_dialogue/{services/speech/recognizers → asr}/__init__.py +0 -0
- src/voice_dialogue/{services/speech/recognizers → asr}/manager.py +0 -0
- src/voice_dialogue/{services/speech/recognizers → asr}/models/__init__.py +0 -0
- src/voice_dialogue/{services/speech/recognizers → asr}/models/base.py +0 -0
- src/voice_dialogue/{services/speech/recognizers → asr}/models/funasr.py +3 -3
- src/voice_dialogue/{services/speech/recognizers → asr}/models/whisper.py +3 -3
- src/voice_dialogue/{services/speech/recognizers → asr}/utils.py +0 -0
- src/voice_dialogue/{services/audio → audio}/__init__.py +2 -4
- src/voice_dialogue/audio/capture/__init__.py +78 -0
- src/voice_dialogue/audio/capture/aec_capture.py +74 -0
- src/voice_dialogue/audio/capture/base_capture.py +50 -0
- src/voice_dialogue/audio/capture/pyaudio_capture.py +69 -0
- src/voice_dialogue/audio/player.py +10 -0
- src/voice_dialogue/{services/audio → audio}/vad.py +0 -0
- src/voice_dialogue/config/speaker_config.py +1 -1
- src/voice_dialogue/core/launcher.py +5 -9
- src/voice_dialogue/{services/text → llm}/__init__.py +0 -0
- src/voice_dialogue/{services/text → llm}/processor.py +0 -0
- src/voice_dialogue/services/__init__.py +13 -0
- src/voice_dialogue/services/{speech/recognizer.py → asr_service.py} +2 -2
- src/voice_dialogue/services/audio/capture.py +0 -148
- src/voice_dialogue/services/{audio/player.py → audio_player_service.py} +4 -14
- src/voice_dialogue/services/{text/generator.py → llm_service.py} +4 -4
- src/voice_dialogue/services/mixins.py +1 -1
- src/voice_dialogue/services/speech/__init__.py +0 -4
- src/voice_dialogue/services/{speech/monitor.py → speech_monitor.py} +1 -1
- src/voice_dialogue/services/{audio/generator.py → tts_service.py} +1 -1
- src/voice_dialogue/{services/audio/generators → tts}/__init__.py +0 -0
- src/voice_dialogue/{services/audio/generators → tts}/configs/__init__.py +0 -0
- src/voice_dialogue/{services/audio/generators → tts}/configs/kokoro.py +0 -0
- src/voice_dialogue/{services/audio/generators → tts}/configs/moyoyo.py +0 -0
- src/voice_dialogue/{services/audio/generators → tts}/manager.py +0 -0
- src/voice_dialogue/{services/audio/generators → tts}/models/__init__.py +0 -0
- src/voice_dialogue/{services/audio/generators → tts}/models/base.py +0 -0
- src/voice_dialogue/{services/audio/generators → tts}/models/kokoro.py +0 -0
- src/voice_dialogue/{services/audio/generators → tts}/models/moyoyo.py +0 -0
- src/voice_dialogue/{services/audio/generators → tts}/runtime/__init__.py +0 -0
- src/voice_dialogue/{services/audio/generators → tts}/runtime/interface.py +0 -0
- src/voice_dialogue/{services/audio/generators → tts}/runtime/kokoro.py +3 -3
- src/voice_dialogue/{services/audio/generators → tts}/runtime/moyoyo.py +3 -3
- src/voice_dialogue/utils/audio_utils.py +17 -0
- tests/test_llm_dialogue.py +1 -1
**src/voice_dialogue/__init__.py** (deleted)
@@ -1,14 +0,0 @@
-from .core.constants import (
-    audio_frames_queue,
-    user_voice_queue,
-    transcribed_text_queue,
-    text_input_queue,
-    audio_output_queue
-)
-from .services.audio.capture import AudioCapture
-from .services.audio.generator import TTSAudioGenerator
-from .services.audio.generators.models import tts_config_registry
-from .services.audio.player import AudioStreamPlayer
-from .services.speech.monitor import SpeechStateMonitor
-from .services.speech.recognizer import ASRWorker
-from .services.text.generator import LLMResponseGenerator
**src/voice_dialogue/api/core/config.py**
@@ -1,6 +1,6 @@
 from typing import Dict, Any

-from voice_dialogue.
+from voice_dialogue.tts import tts_config_registry
 from voice_dialogue.utils.logger import logger

**src/voice_dialogue/api/core/lifespan.py**
@@ -3,7 +3,7 @@ from contextlib import asynccontextmanager

 from fastapi import FastAPI

-from voice_dialogue.
+from voice_dialogue.tts import tts_config_registry
 from voice_dialogue.utils import get_system_language, logger
 from .config import TTSConfigInitializer
 from .service_factories import get_core_voice_service_definitions
**src/voice_dialogue/api/core/service_factories.py**
@@ -1,11 +1,10 @@
+from voice_dialogue.audio.capture import AudioCapture
 from voice_dialogue.core.constants import (
     transcribed_text_queue, text_input_queue, audio_output_queue,
     audio_frames_queue, user_voice_queue, websocket_message_queue
 )
-from voice_dialogue.services
-from voice_dialogue.
-from voice_dialogue.services.speech import SpeechStateMonitor, ASRWorker
-from voice_dialogue.services.text.generator import LLMResponseGenerator
+from voice_dialogue.services import SpeechStateMonitor, ASRService, AudioPlayerService, LLMService, TTSAudioGenerator
+from voice_dialogue.tts import BaseTTSConfig, tts_config_registry
 from .service_manager import ServiceDefinition

@@ -30,18 +29,18 @@ class ServiceFactories:
     )

     @staticmethod
-    def create_asr_worker(language: str) ->
+    def create_asr_worker(language: str) -> ASRService:
         """创建ASR服务"""
-        return
+        return ASRService(
             user_voice_queue=user_voice_queue,
             transcribed_text_queue=transcribed_text_queue,
             language=language
         )

     @staticmethod
-    def create_llm_generator() ->
+    def create_llm_generator() -> LLMService:
         """创建LLM文本生成服务"""
-        return
+        return LLMService(
             user_question_queue=transcribed_text_queue,
             generated_answer_queue=text_input_queue,
             websocket_message_queue=websocket_message_queue,

@@ -60,9 +59,9 @@ class ServiceFactories:
     )

     @staticmethod
-    def create_audio_player() ->
+    def create_audio_player() -> AudioPlayerService:
         """创建音频播放服务"""
-        return
+        return AudioPlayerService(
             audio_playing_queue=audio_output_queue,
             websocket_message_queue=websocket_message_queue
         )
**src/voice_dialogue/api/routes/asr_routes.py**
@@ -1,6 +1,6 @@
 from fastapi import APIRouter, HTTPException, Request, BackgroundTasks

-from voice_dialogue.
+from voice_dialogue.asr import asr_manager
 from voice_dialogue.utils.logger import logger
 from ..core.service_factories import get_asr_worker_service_definition
 from ..schemas.asr_schemas import (
**src/voice_dialogue/api/routes/system_routes.py**
@@ -121,7 +121,7 @@ async def stop_system(request: Request):
     audio_capture_service = service_manager.get_service("audio_capture")
     if audio_capture_service:
         try:
-            audio_capture_service.
+            audio_capture_service.stop()
             logger.info("音频捕获服务已停止")

             # 等待服务停止
**src/voice_dialogue/api/routes/tts_routes.py**
@@ -3,7 +3,7 @@ from typing import Optional
 from fastapi import APIRouter, HTTPException, BackgroundTasks, Request
 from fastapi.responses import FileResponse

-from voice_dialogue.
+from voice_dialogue.tts import tts_config_registry
 from voice_dialogue.utils.logger import logger
 from ..core.service_factories import get_tts_audio_generator_service_definition
 from ..schemas.tts_schemas import (
**src/voice_dialogue/{services/speech/recognizers → asr}/__init__.py** (file renamed without changes)
**src/voice_dialogue/{services/speech/recognizers → asr}/manager.py** (file renamed without changes)
**src/voice_dialogue/{services/speech/recognizers → asr}/models/__init__.py** (file renamed without changes)
**src/voice_dialogue/{services/speech/recognizers → asr}/models/base.py** (file renamed without changes)
**src/voice_dialogue/{services/speech/recognizers → asr}/models/funasr.py**
@@ -4,10 +4,10 @@ import typing
 import numpy as np
 from funasr_onnx import SeacoParaformer, CT_Transformer

+from voice_dialogue.asr.manager import asr_tables
+from voice_dialogue.asr.models.base import ASRInterface
+from voice_dialogue.asr.utils import ensure_minimum_audio_duration
 from voice_dialogue.config import paths
-from voice_dialogue.services.speech.recognizers.manager import asr_tables
-from voice_dialogue.services.speech.recognizers.models.base import ASRInterface
-from voice_dialogue.services.speech.recognizers.utils import ensure_minimum_audio_duration
 from voice_dialogue.utils.logger import logger

**src/voice_dialogue/{services/speech/recognizers → asr}/models/whisper.py**
@@ -3,10 +3,10 @@ import typing
 import numpy as np
 from pywhispercpp.model import Model

+from voice_dialogue.asr.manager import asr_tables
+from voice_dialogue.asr.models.base import ASRInterface
+from voice_dialogue.asr.utils import ensure_minimum_audio_duration
 from voice_dialogue.config import paths
-from voice_dialogue.services.speech.recognizers.manager import asr_tables
-from voice_dialogue.services.speech.recognizers.models.base import ASRInterface
-from voice_dialogue.services.speech.recognizers.utils import ensure_minimum_audio_duration
 from voice_dialogue.utils.logger import logger

**src/voice_dialogue/{services/speech/recognizers → asr}/utils.py** (file renamed without changes)
**src/voice_dialogue/{services/audio → audio}/__init__.py**
@@ -1,9 +1,7 @@
 from .capture import AudioCapture
-from .
-from .player import AudioStreamPlayer
+from .player import play_audio

 __all__ = (
     "AudioCapture",
-    "
-    "AudioStreamPlayer",
+    "play_audio",
 )
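
For orientation, a short lifecycle sketch (not part of the commit) of what callers now see from the `audio` package; it exercises only the facade methods defined in `capture/__init__.py` below.

```python
import time
from multiprocessing import Queue

from voice_dialogue.audio import AudioCapture

frames = Queue()
capture = AudioCapture(audio_frames_queue=frames)  # facade picks AEC or PyAudio internally
capture.start()

while not capture.is_ready:   # wait for the underlying strategy thread to start recording
    time.sleep(0.1)

capture.pause()               # frames are dropped instead of enqueued
capture.resume()
capture.stop()                # asks the strategy thread to exit
```

The package's other export, `play_audio`, is demonstrated after the new `audio/player.py` further down.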
**src/voice_dialogue/audio/capture/__init__.py** (new file)
@@ -0,0 +1,78 @@
+"""
+音频捕获模块门面。
+
+根据配置选择并管理具体的音频捕获策略。
+"""
+from multiprocessing import Queue
+
+from voice_dialogue.utils.logger import logger
+from .aec_capture import AecCapture
+from .pyaudio_capture import PyAudioCapture
+
+
+class AudioCapture:
+    """
+    音频捕获器门面 (Facade)。
+
+    根据配置选择并管理具体的音频捕获策略(PyAudio 或 AEC)。
+    为上层应用提供统一的、简化的音频捕获接口。
+    它不是一个线程,而是线程安全策略的管理者。
+    """
+
+    def __init__(
+            self,
+            audio_frames_queue: Queue,
+            enable_echo_cancellation: bool = True,
+    ):
+        """
+        初始化音频捕获器。
+
+        Args:
+            audio_frames_queue (Queue): 用于存放捕获的音频帧的队列。
+            enable_echo_cancellation (bool): 是否启用回声消除功能。
+                若为 True,则使用 AEC 原生库;
+                否则,使用 PyAudio。
+        """
+        self._strategy = None
+        try:
+            if enable_echo_cancellation:
+                self._strategy = AecCapture(audio_frames_queue=audio_frames_queue)
+            else:
+                self._strategy = PyAudioCapture(audio_frames_queue=audio_frames_queue)
+            logger.info(f"音频捕获策略已选择: {self._strategy.__class__.__name__}")
+        except Exception as e:
+            logger.error(
+                f"初始化 {AecCapture.__name__ if enable_echo_cancellation else PyAudioCapture.__name__} 失败: {e}, 将回退到 PyAudio。")
+            # 只有在尝试 AEC 失败时才回退
+            if not isinstance(self._strategy, PyAudioCapture):
+                self._strategy = PyAudioCapture(audio_frames_queue=audio_frames_queue)
+                logger.info(f"已回退到音频捕获策略: {self._strategy.__class__.__name__}")
+
+    def start(self):
+        """启动音频捕获线程。"""
+        self._strategy.start()
+
+    def stop(self):
+        """停止音频捕获线程。"""
+        self._strategy.exit()
+
+    def pause(self):
+        """暂停音频捕获。"""
+        self._strategy.pause()
+
+    def resume(self):
+        """恢复音频捕获。"""
+        self._strategy.resume()
+
+    @property
+    def is_paused(self) -> bool:
+        """检查捕获器是否已暂停。"""
+        return self._strategy.is_paused
+
+    @property
+    def is_ready(self) -> bool:
+        """检查捕获线程是否已准备就绪。"""
+        return self._strategy.is_ready
+
+    def is_alive(self):
+        return self._strategy.is_alive()
**src/voice_dialogue/audio/capture/aec_capture.py** (new file)
@@ -0,0 +1,74 @@
+import ctypes
+import time
+from multiprocessing import Queue
+
+from voice_dialogue.config.paths import LIBRARIES_PATH
+from voice_dialogue.utils.logger import logger
+from .base_capture import BaseCapture
+
+
+class AecCapture(BaseCapture):
+    """
+    使用 macOS 原生库进行支持 AEC 的音频捕获策略。
+    """
+
+    def __init__(self, audio_frames_queue: Queue, **kwargs):
+        super().__init__(audio_frames_queue=audio_frames_queue, **kwargs)
+
+    def _load_library(self):
+        """加载并配置 AEC 原生库。"""
+        try:
+            audio_recorder = ctypes.CDLL(LIBRARIES_PATH / 'libAudioCapture.dylib')
+            audio_recorder.getAudioData.argtypes = [ctypes.POINTER(ctypes.c_int), ctypes.POINTER(ctypes.c_bool)]
+            audio_recorder.getAudioData.restype = ctypes.POINTER(ctypes.c_ubyte)
+            audio_recorder.freeAudioData.argtypes = [ctypes.POINTER(ctypes.c_ubyte)]
+            return audio_recorder
+        except Exception as e:
+            logger.error(f"加载 AEC 动态库失败: {e}")
+            raise
+
+    def _capture_loop(self, audio_recorder):
+        """AEC 音频捕获的主循环。"""
+        logger.info("使用 AEC 音频捕获器开始采集...")
+        audio_recorder.startRecord()
+        self.is_ready = True
+
+        while not self.is_exited:
+            size = ctypes.c_int(0)
+            is_voice_active = ctypes.c_bool(False)
+            # 从原生库获取音频数据
+            data_ptr = audio_recorder.getAudioData(ctypes.byref(size), ctypes.byref(is_voice_active))
+
+            if data_ptr and size.value > 0:
+                audio_data = bytes(data_ptr[: size.value])
+
+                if not self.is_paused:
+                    # 将音频帧和语音活动状态一同放入队列
+                    self.audio_frames_queue.put((audio_data, is_voice_active.value))
+
+                # 释放原生库分配的内存
+                audio_recorder.freeAudioData(data_ptr)
+            else:
+                # 无数据时短暂休眠,避免CPU空转
+                time.sleep(0.01)
+
+    def _cleanup(self, audio_recorder):
+        """清理 AEC 资源。"""
+        logger.info("停止 AEC 音频采集...")
+        if not audio_recorder:
+            return
+        audio_recorder.stopRecord()
+
+    def run(self):
+        """
+        线程主循环,执行 AEC 音频捕获。
+        """
+        audio_recorder = None
+        try:
+            audio_recorder = self._load_library()
+            self._capture_loop(audio_recorder)
+        except Exception as e:
+            logger.error(f'回声消除音频捕获器运行时发生错误: {e}')
+            # 如果 AEC 失败,这里可以考虑触发一个事件或回退机制,但目前只记录错误
+        finally:
+            self._cleanup(audio_recorder)
**src/voice_dialogue/audio/capture/base_capture.py** (new file)
@@ -0,0 +1,50 @@
+import threading
+from abc import ABC, abstractmethod
+from multiprocessing import Queue
+
+from voice_dialogue.core.base import BaseThread
+
+
+class BaseCapture(BaseThread, ABC):
+    """
+    抽象音频捕获器基类。
+
+    定义了所有音频捕获策略应遵循的通用接口。
+    """
+
+    def __init__(
+            self,
+            audio_frames_queue: Queue,
+            group=None, target=None, name=None, args=(), kwargs=None, *, daemon=None
+    ):
+        """
+        初始化音频捕获器。
+
+        Args:
+            audio_frames_queue (Queue): 用于存放捕获的音频帧的队列。
+        """
+        super().__init__(group, target, name, args, kwargs, daemon=daemon)
+        self.audio_frames_queue = audio_frames_queue
+        self._pause_event = threading.Event()
+
+    @property
+    def is_paused(self) -> bool:
+        """检查捕获器是否已暂停。"""
+        return self._pause_event.is_set()
+
+    def pause(self):
+        """暂停音频捕获。"""
+        self._pause_event.set()
+
+    def resume(self):
+        """恢复音频捕获。"""
+        self._pause_event.clear()
+
+    @abstractmethod
+    def run(self):
+        """
+        线程主循环。
+
+        子类必须实现此方法以提供具体的音频捕获逻辑。
+        """
+        raise NotImplementedError
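
The abstract base is what makes the Strategy pattern easy to extend. Below is a hypothetical example (not part of this commit) of a file-backed strategy that could be used for offline testing; it assumes `BaseThread` provides the `is_exited` flag and writable `is_ready` attribute that the shipped strategies also rely on.

```python
import wave
from multiprocessing import Queue

from voice_dialogue.audio.capture.base_capture import BaseCapture


class WavFileCapture(BaseCapture):
    """Hypothetical strategy: replays a WAV file instead of reading a microphone."""

    def __init__(self, audio_frames_queue: Queue, wav_path: str, chunk: int = 1024, **kwargs):
        super().__init__(audio_frames_queue=audio_frames_queue, **kwargs)
        self._wav_path = wav_path
        self._chunk = chunk

    def run(self):
        with wave.open(self._wav_path, 'rb') as wav:
            self.is_ready = True
            while not self.is_exited:
                data = wav.readframes(self._chunk)
                if not data:
                    break  # end of file reached
                if not self.is_paused:
                    self.audio_frames_queue.put(data)
```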
**src/voice_dialogue/audio/capture/pyaudio_capture.py** (new file)
@@ -0,0 +1,69 @@
+from multiprocessing import Queue
+
+import pyaudio
+
+from voice_dialogue.utils.logger import logger
+from .base_capture import BaseCapture
+
+
+class PyAudioCapture(BaseCapture):
+    """
+    使用 PyAudio 进行标准的音频采集策略。
+    """
+
+    def __init__(self, audio_frames_queue: Queue, **kwargs):
+        super().__init__(audio_frames_queue=audio_frames_queue, **kwargs)
+
+    def _init_pyaudio(self):
+        """初始化 PyAudio 并返回实例和配置。"""
+        p = pyaudio.PyAudio()
+        chunk = 1024
+        sample_rate = 16000
+        return p, chunk, sample_rate
+
+    def _open_stream(self, p, chunk, sample_rate):
+        """打开 PyAudio 音频流。"""
+        return p.open(
+            format=pyaudio.paInt16,
+            channels=1,
+            rate=sample_rate,
+            input=True,
+            frames_per_buffer=chunk,
+        )
+
+    def _capture_loop(self, stream, chunk):
+        """PyAudio 音频捕获的主循环。"""
+        logger.info("使用 PyAudio 开始音频采集...")
+        self.is_ready = True
+
+        while not self.is_exited:
+            data = stream.read(chunk, exception_on_overflow=False)
+            if data is None:
+                continue
+
+            if self.is_paused:
+                continue
+
+            self.audio_frames_queue.put(data)
+
+    def _cleanup(self, stream, p):
+        """清理 PyAudio 资源。"""
+        logger.info("停止 PyAudio 音频采集...")
+        stream.stop_stream()
+        stream.close()
+        p.terminate()
+
+    def run(self):
+        """
+        线程主循环,执行 PyAudio 音频采集。
+        """
+        p, chunk, sample_rate = self._init_pyaudio()
+        stream = None
+        try:
+            stream = self._open_stream(p, chunk, sample_rate)
+            self._capture_loop(stream, chunk)
+        except Exception as e:
+            logger.error(f'PyAudio 音频捕获器运行时发生错误: {e}')
+        finally:
+            if stream:
+                self._cleanup(stream, p)
**src/voice_dialogue/audio/player.py** (new file)
@@ -0,0 +1,10 @@
+import tempfile
+
+import soundfile as sf
+from playsound import playsound
+
+
+def play_audio(audio_data, sample_rate=16000):
+    with tempfile.NamedTemporaryFile('w+b', suffix='.wav') as soundfile:
+        sf.write(soundfile, audio_data, samplerate=sample_rate, subtype='PCM_16', closefd=False)
+        playsound(soundfile.name, block=True)
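
A minimal usage sketch of the pure playback helper, outside any service (not part of the diff); the one-second 440 Hz tone is just illustrative.

```python
import numpy as np

from voice_dialogue.audio.player import play_audio

sample_rate = 16000
t = np.linspace(0, 1.0, sample_rate, endpoint=False)
tone = (0.2 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)  # 1 s, 440 Hz

play_audio(tone, sample_rate=sample_rate)  # writes a temp WAV and plays it synchronously
```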
**src/voice_dialogue/{services/audio → audio}/vad.py** (file renamed without changes)
**src/voice_dialogue/config/speaker_config.py**
@@ -4,7 +4,7 @@ TTS说话人配置管理
 提供说话人配置的查找、映射和管理功能
 """

-from voice_dialogue.
+from voice_dialogue.tts.models import tts_config_registry


 def get_tts_config_by_speaker_name(speaker_name: str):
**src/voice_dialogue/core/launcher.py**
@@ -6,6 +6,7 @@

 import time

+from voice_dialogue.audio.capture import AudioCapture
 from voice_dialogue.config.speaker_config import get_tts_config_by_speaker_name, get_available_speaker_names
 from voice_dialogue.core.constants import (
     audio_frames_queue,

@@ -14,12 +15,7 @@ from voice_dialogue.core.constants import (
     text_input_queue,
     audio_output_queue
 )
-from voice_dialogue.services
-from voice_dialogue.services.audio.generator import TTSAudioGenerator
-from voice_dialogue.services.audio.player import AudioStreamPlayer
-from voice_dialogue.services.speech.monitor import SpeechStateMonitor
-from voice_dialogue.services.speech.recognizer import ASRWorker
-from voice_dialogue.services.text.generator import LLMResponseGenerator
+from voice_dialogue.services import ASRService, LLMService, AudioPlayerService, SpeechStateMonitor, TTSAudioGenerator
 from voice_dialogue.utils.logger import logger


@@ -62,7 +58,7 @@ def launch_system(
     threads = []

     # 语音识别
-    asr_worker =
+    asr_worker = ASRService(
         user_voice_queue=user_voice_queue,
         transcribed_text_queue=transcribed_text_queue,
         language=user_language

@@ -72,7 +68,7 @@ def launch_system(
     threads.append(asr_worker)

     # 文本生成
-    text_generator =
+    text_generator = LLMService(
         user_question_queue=transcribed_text_queue,
         generated_answer_queue=text_input_queue
     )

@@ -98,7 +94,7 @@ def launch_system(
     threads.append(audio_generator)

     # 音频播放
-    audio_player =
+    audio_player = AudioPlayerService(audio_playing_queue=audio_output_queue)
     audio_player.daemon = True
     audio_player.start()
     threads.append(audio_player)
**src/voice_dialogue/{services/text → llm}/__init__.py** (file renamed without changes)
**src/voice_dialogue/{services/text → llm}/processor.py** (file renamed without changes)
**src/voice_dialogue/services/__init__.py** (new file)
@@ -0,0 +1,13 @@
+from .asr_service import ASRService
+from .audio_player_service import AudioPlayerService
+from .llm_service import LLMService
+from .speech_monitor import SpeechStateMonitor
+from .tts_service import TTSAudioGenerator
+
+__all__ = (
+    'ASRService',
+    'AudioPlayerService',
+    'LLMService',
+    'SpeechStateMonitor',
+    'TTSAudioGenerator',
+)
**src/voice_dialogue/services/{speech/recognizer.py → asr_service.py}**
@@ -9,10 +9,10 @@ from voice_dialogue.core.constants import user_still_speaking_event, voice_state
 from voice_dialogue.models.voice_task import VoiceTask
 from voice_dialogue.services.mixins import PerformanceLogMixin
 from voice_dialogue.utils.cache import LRUCacheDict
-from .
+from voice_dialogue.asr import asr_manager


-class
+class ASRService(BaseThread, PerformanceLogMixin):
     def __init__(self, group=None, target=None, name=None, args=(), kwargs=None, *, daemon=None,
                  user_voice_queue: Queue,
                  transcribed_text_queue: Queue,
**src/voice_dialogue/services/audio/capture.py** (deleted)
@@ -1,148 +0,0 @@
-"""
-音频捕获模块
-
-提供两种音频采集方式:
-1. 使用 PyAudio 进行标准音频采集。
-2. 使用集成了声学回声消除(AEC)和语音活动检测(VAD)的 macOS 原生库进行音频采集。
-"""
-
-import ctypes
-import threading
-import time
-from multiprocessing import Queue
-
-import pyaudio
-
-from voice_dialogue.config.paths import LIBRARIES_PATH
-from voice_dialogue.core.base import BaseThread
-from voice_dialogue.utils.logger import logger
-
-
-class AudioCapture(BaseThread):
-    """
-    音频捕获器。
-
-    根据配置选择使用 PyAudio 或带回声消除(AEC)的 macOS 原生库进行音频采集。
-    作为一个后台线程运行,将捕获的音频帧放入队列中。
-    """
-
-    def __init__(
-            self, group=None, target=None, name=None, args=(), kwargs=None, *, daemon=None,
-            audio_frames_queue: Queue = None,
-            enable_echo_cancellation: bool = True,
-    ):
-        """
-        初始化音频捕获器。
-
-        Args:
-            audio_frames_queue (Queue): 用于存放捕获的音频帧的队列。
-            enable_echo_cancellation (bool): 是否启用回声消除功能。
-                若为 True,则使用原生库进行捕获;
-                否则,使用 PyAudio。
-        """
-        super().__init__(group, target, name, args, kwargs, daemon=daemon)
-
-        self.audio_frames_queue = audio_frames_queue
-        self._pause_event = threading.Event()
-        self._enable_echo_cancellation = enable_echo_cancellation
-
-    @property
-    def is_paused(self) -> bool:
-        """检查捕获器是否已暂停。"""
-        return self._pause_event.is_set()
-
-    def pause(self):
-        """暂停音频捕获。"""
-        self._pause_event.set()
-
-    def resume(self):
-        """恢复音频捕获。"""
-        self._pause_event.clear()
-
-    def run(self):
-        """
-        线程主循环。
-
-        根据 `_enable_echo_cancellation` 标志,分派到相应的捕获方法。
-        """
-        if self._enable_echo_cancellation:
-            self._run_with_aec()
-        else:
-            self._run()
-
-    def _run(self):
-        """
-        使用 PyAudio 进行标准的音频采集。
-
-        此方法不提供回声消除或语音活动检测。
-        """
-        p = pyaudio.PyAudio()
-        chunk = 1024
-        sample_rate = 16000
-        stream = p.open(
-            format=pyaudio.paInt16,
-            channels=1,
-            rate=sample_rate,
-            input=True,
-            frames_per_buffer=chunk,
-        )
-
-        logger.info("使用 PyAudio 开始音频采集...")
-        self.is_ready = True
-
-        try:
-            while not self.is_exited:
-                data = stream.read(chunk, exception_on_overflow=False)
-                if data is None:
-                    continue
-
-                if self.is_paused:
-                    continue
-
-                self.audio_frames_queue.put(data)
-
-        except Exception as e:
-            logger.error(f'PyAudio 音频捕获器运行时发生错误: {e}')
-        finally:
-            logger.info("停止 PyAudio 音频采集...")
-            stream.stop_stream()
-            stream.close()
-            p.terminate()
-
-    def _run_with_aec(self):
-        """
-        使用 macOS 原生库进行音频捕获。
-
-        此方法通过 ctypes 调用外部动态库,支持声学回声消除(AEC)和语音活动检测(VAD)。
-        """
-        audio_recorder = ctypes.CDLL(LIBRARIES_PATH / 'libAudioCapture.dylib')
-        audio_recorder.getAudioData.argtypes = [ctypes.POINTER(ctypes.c_int), ctypes.POINTER(ctypes.c_bool)]
-        audio_recorder.getAudioData.restype = ctypes.POINTER(ctypes.c_ubyte)
-        audio_recorder.freeAudioData.argtypes = [ctypes.POINTER(ctypes.c_ubyte)]
-
-        audio_recorder.startRecord()
-        self.is_ready = True
-
-        try:
-            while not self.is_exited:
-                size = ctypes.c_int(0)
-                is_voice_active = ctypes.c_bool(False)
-                # 从原生库获取音频数据
-                data_ptr = audio_recorder.getAudioData(ctypes.byref(size), ctypes.byref(is_voice_active))
-
-                if data_ptr and size.value > 0:
-                    audio_data = bytes(data_ptr[: size.value])
-
-                    if not self.is_paused:
-                        # 将音频帧和语音活动状态一同放入队列
-                        self.audio_frames_queue.put((audio_data, is_voice_active.value))
-
-                    # 释放原生库分配的内存
-                    audio_recorder.freeAudioData(data_ptr)
-                else:
-                    # 无数据时短暂休眠,避免CPU空转
-                    time.sleep(0.01)
-        except Exception as e:
-            logger.error(f'回声消除音频捕获器运行时发生错误: {e}')
-        finally:
-            audio_recorder.stopRecord()
**src/voice_dialogue/services/{audio/player.py → audio_player_service.py}**
@@ -1,22 +1,17 @@
-import tempfile
 import time
 from multiprocessing import Queue
 from queue import Empty
 from typing import Optional

-
-from playsound import playsound
-
+from voice_dialogue.audio.player import play_audio
 from voice_dialogue.core.base import BaseThread
-from voice_dialogue.core.constants import
-    voice_state_manager, silence_over_threshold_event
-)
+from voice_dialogue.core.constants import voice_state_manager, silence_over_threshold_event
 from voice_dialogue.models.voice_task import VoiceTask, AnswerDisplayMessage
 from voice_dialogue.services.mixins import TaskStatusMixin, HistoryMixin, PerformanceLogMixin
 from voice_dialogue.utils.logger import logger


-class
+class AudioPlayerService(BaseThread, TaskStatusMixin, HistoryMixin, PerformanceLogMixin):
     """音频流播放器 - 负责播放生成的音频并管理播放状态"""

     def __init__(

@@ -69,16 +64,11 @@ class AudioStreamPlayer(BaseThread, TaskStatusMixin, HistoryMixin, PerformanceLo

             if not self.is_stopped:
                 audio_data, sample_rate = voice_task.tts_generated_sentence_audio
-
+                play_audio(audio_data, sample_rate)

                 # 任务处理完毕,跳出内部循环
                 break

-    def _play_audio(self, audio_data, sample_rate=16000):
-        with tempfile.NamedTemporaryFile('w+b', suffix='.wav') as soundfile:
-            sf.write(soundfile, audio_data, samplerate=sample_rate, subtype='PCM_16', closefd=False)
-            playsound(soundfile.name, block=True)
-
     def run(self):
         """
         主运行循环。
**src/voice_dialogue/services/{text/generator.py → llm_service.py}**
@@ -10,16 +10,16 @@ from voice_dialogue.config.llm_config import get_llm_model_params, get_apple_sil
 from voice_dialogue.config.user_config import get_prompt
 from voice_dialogue.core.base import BaseThread
 from voice_dialogue.core.constants import chat_history_cache
-from voice_dialogue.
-from voice_dialogue.services.mixins import TaskStatusMixin
-from voice_dialogue.services.text.processor import (
+from voice_dialogue.llm.processor import (
     preprocess_sentence_text, create_langchain_chat_llamacpp_instance,
     create_langchain_pipeline, warmup_langchain_pipeline
 )
+from voice_dialogue.models.voice_task import VoiceTask, QuestionDisplayMessage
+from voice_dialogue.services.mixins import TaskStatusMixin
 from voice_dialogue.utils.logger import logger


-class
+class LLMService(BaseThread, TaskStatusMixin):
     """LLM 回答生成器 - 负责使用语言模型生成回答文本"""

     def __init__(
**src/voice_dialogue/services/mixins.py**
@@ -74,7 +74,7 @@ class PerformanceLogMixin:
             return

         try:
-            from voice_dialogue.
+            from voice_dialogue.utils.audio_utils import calculate_audio_duration

             asr_duration = getattr(voice_task, 'whisper_end_time', 0) - getattr(voice_task, 'whisper_start_time', 0)
             llm_duration = getattr(voice_task, 'llm_end_time', 0) - getattr(voice_task, 'llm_start_time', 0)
**src/voice_dialogue/services/speech/__init__.py** (deleted)
@@ -1,4 +0,0 @@
-from .recognizer import ASRWorker
-from .monitor import SpeechStateMonitor
-
-__all__ = ['ASRWorker', 'SpeechStateMonitor']
**src/voice_dialogue/services/{speech/monitor.py → speech_monitor.py}**
@@ -13,13 +13,13 @@ from queue import Empty
 import librosa
 import numpy as np

+from voice_dialogue.audio.vad import SileroVAD
 from voice_dialogue.core.base import BaseThread
 from voice_dialogue.core.constants import (
     voice_state_manager, silence_over_threshold_event, user_still_speaking_event, session_manager
 )
 from voice_dialogue.core.enums import AudioState
 from voice_dialogue.models.voice_task import VoiceTask
-from voice_dialogue.services.audio.vad import SileroVAD
 from voice_dialogue.services.utils import normalize_audio_frame, calculate_audio_duration
 from voice_dialogue.utils.logger import logger

**src/voice_dialogue/services/{audio/generator.py → tts_service.py}**
@@ -7,8 +7,8 @@ from voice_dialogue.core.constants import voice_state_manager
 from voice_dialogue.models.voice_task import VoiceTask
 from voice_dialogue.services.mixins import TaskStatusMixin
 from voice_dialogue.services.utils import has_no_words
+from voice_dialogue.tts import tts_manager, BaseTTSConfig
 from voice_dialogue.utils.logger import logger
-from .generators import tts_manager, BaseTTSConfig


 class TTSAudioGenerator(BaseThread, TaskStatusMixin):
**src/voice_dialogue/{services/audio/generators → tts}/__init__.py** (file renamed without changes)
**src/voice_dialogue/{services/audio/generators → tts}/configs/__init__.py** (file renamed without changes)
**src/voice_dialogue/{services/audio/generators → tts}/configs/kokoro.py** (file renamed without changes)
**src/voice_dialogue/{services/audio/generators → tts}/configs/moyoyo.py** (file renamed without changes)
**src/voice_dialogue/{services/audio/generators → tts}/manager.py** (file renamed without changes)
**src/voice_dialogue/{services/audio/generators → tts}/models/__init__.py** (file renamed without changes)
**src/voice_dialogue/{services/audio/generators → tts}/models/base.py** (file renamed without changes)
**src/voice_dialogue/{services/audio/generators → tts}/models/kokoro.py** (file renamed without changes)
**src/voice_dialogue/{services/audio/generators → tts}/models/moyoyo.py** (file renamed without changes)
**src/voice_dialogue/{services/audio/generators → tts}/runtime/__init__.py** (file renamed without changes)
**src/voice_dialogue/{services/audio/generators → tts}/runtime/interface.py** (file renamed without changes)
**src/voice_dialogue/{services/audio/generators → tts}/runtime/kokoro.py**
@@ -3,9 +3,9 @@ from typing import Tuple, Optional
 import numpy as np
 from kokoro_onnx import Kokoro

-from voice_dialogue.
-from voice_dialogue.
-from voice_dialogue.
+from voice_dialogue.tts.configs.kokoro import KokoroTTSConfig
+from voice_dialogue.tts.manager import tts_tables
+from voice_dialogue.tts.runtime.interface import TTSInterface
 from voice_dialogue.utils.logger import logger

**src/voice_dialogue/{services/audio/generators → tts}/runtime/moyoyo.py**
@@ -6,9 +6,9 @@ from typing import Tuple
 import numpy as np

 from voice_dialogue.config.paths import load_third_party
-from voice_dialogue.
-from voice_dialogue.
-from voice_dialogue.
+from voice_dialogue.tts.manager import tts_tables
+from voice_dialogue.tts.models.moyoyo import MoYoYoTTSConfig
+from voice_dialogue.tts.runtime.interface import TTSInterface
 from voice_dialogue.utils.logger import logger

 load_third_party()
**src/voice_dialogue/utils/audio_utils.py** (new file)
@@ -0,0 +1,17 @@
+import numpy as np
+
+
+def calculate_audio_duration(audio_data: np.ndarray, sample_rate: int) -> float:
+    """
+    计算音频数据的时长(秒)。
+
+    Args:
+        audio_data (np.ndarray): 音频数据数组。
+        sample_rate (int): 采样率。
+
+    Returns:
+        float: 音频时长(秒)。
+    """
+    if audio_data is None or sample_rate == 0:
+        return 0.0
+    return len(audio_data) / sample_rate
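
A quick usage sketch of the new helper (not part of the diff); the buffer is a hypothetical two seconds of silence at 16 kHz.

```python
import numpy as np

from voice_dialogue.utils.audio_utils import calculate_audio_duration

audio = np.zeros(32000, dtype=np.float32)      # 2 s at 16 kHz
print(calculate_audio_duration(audio, 16000))  # -> 2.0
print(calculate_audio_duration(None, 16000))   # -> 0.0 (guard path)
```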
**tests/test_llm_dialogue.py**
@@ -13,7 +13,7 @@ if lib_path.exists() and lib_path.as_posix() not in sys.path:
     sys.path.insert(0, lib_path.as_posix())

 from voice_dialogue.config.llm_config import get_llm_model_params, BUILTIN_LLM_MODEL_PATH
-from voice_dialogue.
+from voice_dialogue.llm.processor import create_langchain_pipeline

 CHINESE_SYSTEM_PROMPT = (
     "你是AI助手。请以自然流畅的中文口语化表达直接回答问题,避免冗余的思考过程。"
|