fix: move pyaudio from Python core dependency to optional extension

wangyue.demon · wangyue.demon · commit d8c68b4ae29f · 2025-11-10T15:08:52.000+08:00
diff --git a/pyproject.toml b/pyproject.toml
@@ -54,9 +54,7 @@ database = [
     "tos>=2.8.4",                   # For TOS storage and Viking DB
     "mem0ai==0.1.118",              # For mem0
 ]
-tts = [
-    "pyaudio>=0.2.14",
-]
+speech = ["pyaudio>=0.2.14"]  # For audio device I/O (veadk.utils.audio_manager)
 eval = [
     "prometheus-client>=0.22.1",    # For exporting data to Prometheus pushgateway
     "deepeval>=3.2.6",              # For DeepEval-based evaluation
diff --git a/tests/tools/builtin_tools/test_tts.py b/tests/tools/builtin_tools/test_tts.py
@@ -86,8 +86,7 @@ def test_tts_failure(self, mock_session):
         mock_session.return_value.post.assert_called_once()
 
     @patch("builtins.open")
-    @patch("pyaudio.PyAudio")
-    def test_handle_server_response_success(self, mock_pyaudio, mock_open):
+    def test_handle_server_response_success(self, mock_open):
         """Test successful response handling"""
         # Setup mock response
         mock_response = MagicMock()
@@ -96,15 +95,10 @@ def test_handle_server_response_success(self, mock_pyaudio, mock_open):
             json.dumps({"code": 20000000}),
         ]
 
-        # Setup mock audio stream
-        mock_stream = MagicMock()
-        mock_pyaudio.return_value.open.return_value = mock_stream
-
         # Call function
         handle_server_response(mock_response, "test.pcm")
 
         # Assertions
-        mock_stream.write.assert_called_with(b"audio_chunk")
         mock_open.assert_called_once_with("test.pcm", "wb")
 
     @patch("builtins.open")
diff --git a/veadk/tools/builtin_tools/tts.py b/veadk/tools/builtin_tools/tts.py
@@ -24,26 +24,9 @@
 from google.adk.tools import ToolContext
 from veadk.config import getenv, settings
 from veadk.utils.logger import get_logger
-from veadk.utils.audio_manager import AudioDeviceManager, AudioConfig
 
 logger = get_logger(__name__)
 
-input_audio_config = {
-    "chunk": 3200,
-    "format": "pcm",
-    "channels": 1,
-    "sample_rate": 16000,
-    "bit_size": 8,
-}
-
-output_audio_config = {
-    "chunk": 3200,
-    "format": "pcm",
-    "channels": 1,
-    "sample_rate": 24000,
-    "bit_size": 8,
-}
-
 
 def text_to_speech(text: str, tool_context: ToolContext) -> Dict[str, Any]:
     """TTS provides users with the ability to convert text to speech, turning the text content of LLM into audio.
@@ -57,7 +40,7 @@ def text_to_speech(text: str, tool_context: ToolContext) -> Dict[str, Any]:
         A dict with the saved audio path.
     """
     url = "https://openspeech.bytedance.com/api/v3/tts/unidirectional"
-    audio_save_path = ""
+    temp_dir = getenv("TOOL_VESPEECH_AUDIO_OUTPUT_PATH", tempfile.gettempdir())
 
     app_id = getenv("TOOL_VESPEECH_APP_ID")
     speaker = getenv(
@@ -106,10 +89,13 @@ def text_to_speech(text: str, tool_context: ToolContext) -> Dict[str, Any]:
         logger.debug(f"Request TTS server with payload: {payload}.")
         response = session.post(url, headers=headers, json=payload, stream=True)
 
+        os.makedirs(temp_dir, exist_ok=True)
         with tempfile.NamedTemporaryFile(
-            suffix=".pcm", delete=False, dir=tempfile.gettempdir()
+            suffix=".pcm", delete=False, dir=temp_dir
         ) as tmp:
             audio_save_path = tmp.name  # e.g. /tmp/tts_12345.pcm
+            logger.debug(f"Created temporary file: {audio_save_path}")
+
         handle_server_response(response, audio_save_path)
 
     except Exception as e:
@@ -122,8 +108,6 @@ def text_to_speech(text: str, tool_context: ToolContext) -> Dict[str, Any]:
             f"Execution Error: {e}"
         }
     finally:
-        if audio_save_path and os.path.exists(audio_save_path):
-            os.remove(audio_save_path)
         if response:
             response.close()
         session.close()
@@ -150,18 +134,29 @@ def handle_server_response(
     audio_queue = queue.Queue()
     total_audio_size = 0
 
-    audio_device = AudioDeviceManager(
-        AudioConfig(**input_audio_config), AudioConfig(**output_audio_config)
-    )
-
-    # init output stream
-    output_stream = audio_device.open_output_stream()
+    output_stream, player_thread = None, None
     stop_event = threading.Event()
-    player_thread = threading.Thread(
-        target=_audio_player_thread, args=(audio_queue, output_stream, stop_event)
-    )
-    player_thread.daemon = True
-    player_thread.start()
+    try:
+        from veadk.utils.audio_manager import (
+            AudioDeviceManager,
+            AudioConfig,
+            input_audio_config,
+            output_audio_config,
+        )
+
+        audio_device = AudioDeviceManager(
+            AudioConfig(**input_audio_config), AudioConfig(**output_audio_config)
+        )
+
+        # init output stream
+        output_stream = audio_device.open_output_stream()
+        player_thread = threading.Thread(
+            target=_audio_player_thread, args=(audio_queue, output_stream, stop_event)
+        )
+        player_thread.daemon = True
+        player_thread.start()
+    except Exception as e:
+        logger.error(f"Failed to initialize audio device: {e}")
 
     try:
         for chunk in response.iter_lines(decode_unicode=True):
@@ -194,10 +189,12 @@ def handle_server_response(
         logger.error(f"handle tts failed: {e}, response: {response}")
         raise
     finally:
-        audio_queue.join()
-        stop_event.set()
-        player_thread.join()
-        output_stream.close()
+        if output_stream:
+            audio_queue.join()
+            stop_event.set()
+            if player_thread and player_thread.is_alive():
+                player_thread.join()
+            output_stream.close()
 
 
 def _audio_player_thread(audio_queue, output_stream, stop_event):
diff --git a/veadk/utils/audio_manager.py b/veadk/utils/audio_manager.py
@@ -15,7 +15,29 @@
 from dataclasses import dataclass
 from typing import Optional
 
-import pyaudio
+try:
+    import pyaudio
+
+    PYAUDIO_AVAILABLE = True
+except ImportError:
+    pyaudio = None  # optional; AudioDeviceManager.__init__ raises without it
+    PYAUDIO_AVAILABLE = False
+
+input_audio_config = {
+    "chunk": 3200,
+    "format": "pcm",
+    "channels": 1,
+    "sample_rate": 16000,
+    "bit_size": pyaudio.paInt16 if PYAUDIO_AVAILABLE else 8,  # 8 == paInt16
+}
+
+output_audio_config = {
+    "chunk": 3200,
+    "format": "pcm",
+    "channels": 1,
+    "sample_rate": 24000,
+    "bit_size": pyaudio.paInt16 if PYAUDIO_AVAILABLE else 8,  # 8 == paInt16
+}
 
 
 @dataclass
@@ -33,6 +55,11 @@ class AudioDeviceManager:
     """audio device manager, handle audio input/output"""
 
     def __init__(self, input_config: AudioConfig, output_config: AudioConfig):
+        if not PYAUDIO_AVAILABLE:
+            raise RuntimeError(
+                "pyaudio is not installed. Please install it via: "
+                "pip install veadk-python[speech]"
+            )
         self.input_config = input_config
         self.output_config = output_config
         self.pyaudio = pyaudio.PyAudio()