feat(tools): support volcano text-to-speech tool

wangyuelucky · wangyue.demon · web-flow · commit fe6474ab6c10 · 2025-11-10T15:20:14.000+08:00
* support volcano TTS tools

* update pyaudio dependency

* code lint

* code lint

* fix(tts): improve functionality and reliability of text-to-speech conversion

* fix(tts): return a structured dict with 'saved_audio_path'

* fix(tts): enhance error message and clarify docstring

* fix(tts): enhance error message and clarify docstring

* auth(veauth): support query tts app_key from openapi

* auth(veauth): support query tts app_key from openapi

* auth(veauth): implement speech token retrieval with credential fallback

* update config.yaml.full

* fix: move pyaudio from py core dependency to extension

* fix: move pyaudio from py core dependency to extension

---------

Co-authored-by: wangyue.demon <wangyue.demon@bytedance.com>
diff --git a/.gitignore b/.gitignore
@@ -196,4 +196,7 @@ cython_debug/
 
 **/.nuxt
 **/.data
-**./output
+**./output
+
+*.mp3
+*.pcm
diff --git a/config.yaml.full b/config.yaml.full
@@ -46,6 +46,11 @@ tool:
   web_scraper: 
     endpoint: 
     api_key:    # `token`
+  # [optional] https://console.volcengine.com/speech/new/experience/tts
+  text_to_speech:
+    app_id:     # `app_id`
+    api_key:    # `app_secret`
+    speaker:    # `speaker`
   # [optional] https://open.larkoffice.com/app
   lark: 
     endpoint:   # `app_id`
diff --git a/pyproject.toml b/pyproject.toml
@@ -54,6 +54,7 @@ database = [
     "tos>=2.8.4",                   # For TOS storage and Viking DB
     "mem0ai==0.1.118",              # For mem0
 ]
+speech = []
 eval = [
     "prometheus-client>=0.22.1",    # For exporting data to Prometheus pushgateway
     "deepeval>=3.2.6",              # For DeepEval-based evaluation
diff --git a/tests/auth/veauth/test_speech_veauth.py b/tests/auth/veauth/test_speech_veauth.py
@@ -0,0 +1,113 @@
+# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+from unittest.mock import patch, MagicMock
+from veadk.auth.veauth.speech_veauth import get_speech_token
+
+
+# Test cases
+
+
+def test_get_speech_token_with_env_vars(monkeypatch):
+    """AK/SK taken from the environment are forwarded to the ListAPIKeys request."""
+    # Arrange: both static credentials are present in the environment.
+    for env_name, env_value in (
+        ("VOLCENGINE_ACCESS_KEY", "test_access_key"),
+        ("VOLCENGINE_SECRET_KEY", "test_secret_key"),
+    ):
+        monkeypatch.setenv(env_name, env_value)
+
+    api_response = {"Result": {"APIKeys": [{"APIKey": "test_api_key"}]}}
+    expected_kwargs = {
+        "request_body": {"ProjectName": "default", "OnlyAvailable": True},
+        # Static credentials are expected to go out with an empty token header.
+        "header": {"X-Security-Token": ""},
+        "action": "ListAPIKeys",
+        "ak": "test_access_key",
+        "sk": "test_secret_key",
+        "service": "speech_saas_prod",
+        "version": "2025-05-20",
+        "region": "cn-beijing",
+        "host": "open.volcengineapi.com",
+    }
+
+    # Act: the request helper is replaced by a mock that returns one API key.
+    with patch(
+        "veadk.auth.veauth.speech_veauth.ve_request", return_value=api_response
+    ) as fake_request:
+        token = get_speech_token()
+
+    # Assert: the first API key comes back and the call used the env credentials.
+    assert token == "test_api_key"
+    fake_request.assert_called_once_with(**expected_kwargs)
+
+
+def test_get_speech_token_with_vefaas_iam(monkeypatch):
+    """With no AK/SK in the environment, vefaas IAM credentials sign the request."""
+    # Arrange: make sure neither static credential is set.
+    for env_name in ("VOLCENGINE_ACCESS_KEY", "VOLCENGINE_SECRET_KEY"):
+        monkeypatch.delenv(env_name, raising=False)
+
+    iam_credential = MagicMock(
+        access_key_id="vefaas_access_key",
+        secret_access_key="vefaas_secret_key",
+        session_token="vefaas_session_token",
+    )
+    api_response = {"Result": {"APIKeys": [{"APIKey": "vefaas_api_key"}]}}
+    expected_kwargs = {
+        "request_body": {"ProjectName": "default", "OnlyAvailable": True},
+        # The session token from the IAM credential is expected in the header.
+        "header": {"X-Security-Token": "vefaas_session_token"},
+        "action": "ListAPIKeys",
+        "ak": "vefaas_access_key",
+        "sk": "vefaas_secret_key",
+        "service": "speech_saas_prod",
+        "version": "2025-05-20",
+        "region": "cn-shanghai",
+        "host": "open.volcengineapi.com",
+    }
+
+    # Act: both the credential lookup and the request helper are mocked.
+    with (
+        patch(
+            "veadk.auth.veauth.speech_veauth.get_credential_from_vefaas_iam",
+            return_value=iam_credential,
+        ) as fake_get_credential,
+        patch(
+            "veadk.auth.veauth.speech_veauth.ve_request",
+            return_value=api_response,
+        ) as fake_request,
+    ):
+        token = get_speech_token(region="cn-shanghai")
+
+    # Assert: the fallback ran once and its credentials were used for the call.
+    assert token == "vefaas_api_key"
+    fake_get_credential.assert_called_once()
+    fake_request.assert_called_once_with(**expected_kwargs)
+
+
+def test_get_speech_token_invalid_response(monkeypatch):
+    """A response without a `Result` payload is reported as a ValueError."""
+    # Setup
+    # Use the `monkeypatch` fixture rather than a hand-built pytest.MonkeyPatch():
+    # the fixture undoes the environment changes when the test ends, whereas a
+    # manually created instance leaks them into later tests unless undo() is called.
+    monkeypatch.setenv("VOLCENGINE_ACCESS_KEY", "test_access_key")
+    monkeypatch.setenv("VOLCENGINE_SECRET_KEY", "test_secret_key")
+
+    mock_response = {"Error": {"Message": "Invalid request"}}
+
+    with patch("veadk.auth.veauth.speech_veauth.ve_request") as mock_ve_request:
+        mock_ve_request.return_value = mock_response
+
+        # Execute & Verify
+        with pytest.raises(ValueError, match="Failed to get speech api key list"):
+            get_speech_token()
diff --git a/tests/tools/builtin_tools/test_tts.py b/tests/tools/builtin_tools/test_tts.py
@@ -0,0 +1,133 @@
+# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import queue
+import json
+import base64
+import requests
+from unittest import TestCase
+from unittest.mock import patch, MagicMock
+from google.adk.tools import ToolContext
+from veadk.tools.builtin_tools.tts import (
+    text_to_speech,
+    handle_server_response,
+    save_output_to_file,
+    _audio_player_thread,
+)
+
+
+class TestTTS(TestCase):
+    """Tests for the TTS builtin tool helpers with HTTP, file and stream I/O mocked."""
+
+    @staticmethod
+    def _stream_lines():
+        """Build the two JSON lines the mocked HTTP response yields."""
+        audio_line = json.dumps(
+            {"code": 0, "data": base64.b64encode(b"audio_chunk").decode()}
+        )
+        final_line = json.dumps({"code": 20000000})
+        return [audio_line, final_line]
+
+    def setUp(self):
+        # Tool context stub whose invocation context carries a fixed user id.
+        invocation_context = MagicMock()
+        invocation_context.user_id = "test_user"
+        self.mock_tool_context = MagicMock(spec=ToolContext)
+        self.mock_tool_context._invocation_context = invocation_context
+
+        # Speech settings are injected through the environment for every test.
+        self.patcher_env = patch.dict(
+            "os.environ",
+            dict(
+                TOOL_VESPEECH_APP_ID="test_app_id",
+                TOOL_VESPEECH_API_KEY="test_api_key",
+                TOOL_VESPEECH_SPEAKER="test_speaker",
+            ),
+        )
+        self.patcher_env.start()
+
+    def tearDown(self):
+        # Restore the environment patched in setUp.
+        self.patcher_env.stop()
+
+    @patch("requests.Session")
+    def test_tts_success(self, mock_session):
+        """A two-line streamed response yields a dict with 'saved_audio_path'."""
+        http_response = MagicMock()
+        http_response.headers = {"X-Tt-Logid": "test_log_id"}
+        http_response.iter_lines.return_value = self._stream_lines()
+        post = mock_session.return_value.post
+        post.return_value = http_response
+
+        outcome = text_to_speech("test text", self.mock_tool_context)
+
+        self.assertIsInstance(outcome, dict)
+        self.assertIn("saved_audio_path", outcome)
+        post.assert_called_once()
+        http_response.close.assert_called_once()
+
+    @patch("requests.Session")
+    def test_tts_failure(self, mock_session):
+        """A RequestException from `post` is reported under the 'error' key."""
+        post = mock_session.return_value.post
+        post.side_effect = requests.exceptions.RequestException("Test error")
+
+        outcome = text_to_speech("test text", self.mock_tool_context)
+
+        self.assertIsInstance(outcome, dict)
+        self.assertIn("error", outcome)
+        self.assertIn("Test error", outcome["error"])
+        post.assert_called_once()
+
+    @patch("builtins.open")
+    def test_handle_server_response_success(self, mock_open):
+        """Handling a streamed response opens the target file once in 'wb' mode."""
+        http_response = MagicMock()
+        http_response.iter_lines.return_value = self._stream_lines()
+
+        handle_server_response(http_response, "test.pcm")
+
+        mock_open.assert_called_once_with("test.pcm", "wb")
+
+    @patch("builtins.open")
+    def test_save_output_to_file_success(self, mock_open):
+        """The audio bytes are written once to the file opened in 'wb' mode."""
+        file_handle = MagicMock()
+        mock_open.return_value.__enter__.return_value = file_handle
+
+        save_output_to_file(b"audio_data", "test.pcm")
+
+        mock_open.assert_called_once_with("test.pcm", "wb")
+        file_handle.write.assert_called_once_with(b"audio_data")
+
+    @patch("time.sleep")
+    def test_audio_player_thread(self, mock_sleep):
+        """One queued chunk is written to the stream and marked done."""
+        # The queue hands out a single chunk and then reports empty.
+        audio_queue = MagicMock()
+        audio_queue.get.side_effect = [b"audio_data", queue.Empty]
+        output_stream = MagicMock()
+        # The stop flag reads False on the first check and True on the second.
+        stop_event = MagicMock()
+        stop_event.is_set.side_effect = [False, True]
+
+        _audio_player_thread(audio_queue, output_stream, stop_event)
+
+        output_stream.write.assert_called_once_with(b"audio_data")
+        audio_queue.task_done.assert_called_once()
diff --git a/veadk/auth/veauth/speech_veauth.py b/veadk/auth/veauth/speech_veauth.py
@@ -0,0 +1,54 @@
+# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+from veadk.auth.veauth.utils import get_credential_from_vefaas_iam
+from veadk.utils.logger import get_logger
+from veadk.utils.volcengine_sign import ve_request
+
+logger = get_logger(__name__)
+
+
+def get_speech_token(region: str = "cn-beijing") -> str:
+    """Fetch the first available speech API key of the `default` project.
+
+    Credentials are read from the `VOLCENGINE_ACCESS_KEY` and
+    `VOLCENGINE_SECRET_KEY` environment variables. When either is missing,
+    credentials (including a session token) are taken from
+    `get_credential_from_vefaas_iam()` instead.
+
+    Args:
+        region: Region passed to the `ListAPIKeys` request.
+
+    Returns:
+        The `APIKey` value of the first entry in the response.
+
+    Raises:
+        ValueError: If the response does not contain at least one API key.
+    """
+    logger.info("Fetching speech token...")
+
+    access_key = os.getenv("VOLCENGINE_ACCESS_KEY")
+    secret_key = os.getenv("VOLCENGINE_SECRET_KEY")
+    session_token = ""
+
+    if not (access_key and secret_key):
+        # try to get from vefaas iam
+        cred = get_credential_from_vefaas_iam()
+        access_key = cred.access_key_id
+        secret_key = cred.secret_access_key
+        session_token = cred.session_token
+
+    res = ve_request(
+        request_body={"ProjectName": "default", "OnlyAvailable": True},
+        header={"X-Security-Token": session_token},
+        action="ListAPIKeys",
+        ak=access_key,
+        sk=secret_key,
+        service="speech_saas_prod",
+        version="2025-05-20",
+        region=region,
+        host="open.volcengineapi.com",
+    )
+    try:
+        first_api_key_id = res["Result"]["APIKeys"][0]["APIKey"]
+    except (KeyError, IndexError, TypeError) as e:
+        # KeyError: a field is missing; IndexError: the key list is empty;
+        # TypeError: `res` or a nested value is None or not subscriptable.
+        # All three mean the same thing to the caller: no usable API key.
+        raise ValueError(f"Failed to get speech api key list: {res}") from e
+
+    logger.info("Successfully fetching speech API Key.")
+    return first_api_key_id
diff --git a/veadk/configs/tool_configs.py b/veadk/configs/tool_configs.py
@@ -20,6 +20,7 @@
 
 from veadk.auth.veauth.prompt_pilot_veauth import PromptPilotVeAuth
 from veadk.auth.veauth.vesearch_veauth import VesearchVeAuth
+from veadk.auth.veauth.speech_veauth import get_speech_token
 
 
 class PromptPilotConfig(BaseModel):
@@ -38,5 +39,16 @@ def api_key(self) -> str:
         return os.getenv("TOOL_VESEARCH_API_KEY") or VesearchVeAuth().token
 
 
+class VeSpeechConfig(BaseSettings):
+    """Speech tool settings loaded from `TOOL_VESPEECH_`-prefixed variables."""
+
+    model_config = SettingsConfigDict(env_prefix="TOOL_VESPEECH_")
+
+    endpoint: int | str = ""
+
+    @cached_property
+    def api_key(self) -> str:
+        """Return `TOOL_VESPEECH_API_KEY` when set, else a fetched speech token."""
+        env_api_key = os.getenv("TOOL_VESPEECH_API_KEY")
+        if env_api_key:
+            return env_api_key
+        # Unset or empty: fall back to looking the key up remotely.
+        return get_speech_token()
+
+
 class BuiltinToolConfigs(BaseModel):
+    # Groups the per-tool config objects; each field is created by calling its
+    # config class with no arguments when no value is supplied.
     vesearch: VeSearchConfig = Field(default_factory=VeSearchConfig)
+    vespeech: VeSpeechConfig = Field(default_factory=VeSpeechConfig)
diff --git a/veadk/tools/builtin_tools/tts.py b/veadk/tools/builtin_tools/tts.py
diff --git a/veadk/utils/audio_manager.py b/veadk/utils/audio_manager.py

Original file line number	Diff line number	Diff line change
`@@ -54,6 +54,7 @@ database = [`
`54`	`54`	`"tos>=2.8.4", # For TOS storage and Viking DB`
`55`	`55`	`"mem0ai==0.1.118", # For mem0`
`56`	`56`	`]`
	`57`	`+speech = []`
`57`	`58`	`eval = [`
`58`	`59`	`"prometheus-client>=0.22.1", # For exporting data to Prometheus pushgateway`
`59`	`60`	`"deepeval>=3.2.6", # For DeepEval-based evaluation`