Skip to content

Commit fe6474a

Browse files
wangyuelucky and wangyue.demon authored
feat(tools): support volcano text-to-speech tool
* support volcano TTS tools * update pyaudio dependency * code lint * code lint * fix(tts): improve functionality and reliability of text-to-speech conversion * fix(tts): improve return structured dict with 'saved_audio_path' * fix(tts): enhance error message and clarify docstring * fix(tts): enhance error message and clarify docstring * auth(veauth): support query tts app_key from openapi * auth(veauth): support query tts app_key from openapi * auth(veauth): implement speech token retrieval with credential fallback * update config.yaml.full * fix: move pyaudio from py core dependency to extension * fix: move pyaudio from py core dependency to extension --------- Co-authored-by: wangyue.demon <[email protected]>
1 parent 9d42b34 commit fe6474a

File tree

9 files changed

+670
-1
lines changed

9 files changed

+670
-1
lines changed

.gitignore

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -196,4 +196,7 @@ cython_debug/
196196

197197
**/.nuxt
198198
**/.data
199-
**./output
199+
**./output
200+
201+
*.mp3
202+
*.pcm

config.yaml.full

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,11 @@ tool:
4646
web_scraper:
4747
endpoint:
4848
api_key: # `token`
49+
# [optional] https://console.volcengine.com/speech/new/experience/tts
50+
text_to_speech:
51+
app_id: # `app_id`
52+
api_key: # `app_secret`
53+
speaker: # `speaker`
4954
# [optional] https://open.larkoffice.com/app
5055
lark:
5156
endpoint: # `app_id`

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ database = [
5454
"tos>=2.8.4", # For TOS storage and Viking DB
5555
"mem0ai==0.1.118", # For mem0
5656
]
57+
speech = []
5758
eval = [
5859
"prometheus-client>=0.22.1", # For exporting data to Prometheus pushgateway
5960
"deepeval>=3.2.6", # For DeepEval-based evaluation
Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import pytest
16+
from unittest.mock import patch, MagicMock
17+
from veadk.auth.veauth.speech_veauth import get_speech_token
18+
19+
20+
# Test cases
21+
22+
23+
def test_get_speech_token_with_env_vars(monkeypatch):
    """The API key is resolved with AK/SK read from environment variables."""
    # Arrange: both static credentials are present in the environment.
    for name, value in (
        ("VOLCENGINE_ACCESS_KEY", "test_access_key"),
        ("VOLCENGINE_SECRET_KEY", "test_secret_key"),
    ):
        monkeypatch.setenv(name, value)

    # Exact keyword arguments the signed request is expected to receive.
    expected_kwargs = dict(
        request_body={
            "ProjectName": "default",
            "OnlyAvailable": True,
        },
        header={"X-Security-Token": ""},
        action="ListAPIKeys",
        ak="test_access_key",
        sk="test_secret_key",
        service="speech_saas_prod",
        version="2025-05-20",
        region="cn-beijing",
        host="open.volcengineapi.com",
    )

    with patch(
        "veadk.auth.veauth.speech_veauth.ve_request",
        return_value={"Result": {"APIKeys": [{"APIKey": "test_api_key"}]}},
    ) as fake_request:
        # Act
        token = get_speech_token()

    # Assert: the mock keeps its call record after the patch is undone.
    assert token == "test_api_key"
    fake_request.assert_called_once_with(**expected_kwargs)
53+
54+
55+
def test_get_speech_token_with_vefaas_iam(monkeypatch):
    """Without AK/SK in the environment, VeFaaS IAM credentials are used."""
    # Arrange: make sure neither static credential is set.
    for name in ("VOLCENGINE_ACCESS_KEY", "VOLCENGINE_SECRET_KEY"):
        monkeypatch.delenv(name, raising=False)

    iam_credential = MagicMock()
    iam_credential.access_key_id = "vefaas_access_key"
    iam_credential.secret_access_key = "vefaas_secret_key"
    iam_credential.session_token = "vefaas_session_token"

    # Exact keyword arguments the signed request is expected to receive.
    expected_kwargs = dict(
        request_body={
            "ProjectName": "default",
            "OnlyAvailable": True,
        },
        header={"X-Security-Token": "vefaas_session_token"},
        action="ListAPIKeys",
        ak="vefaas_access_key",
        sk="vefaas_secret_key",
        service="speech_saas_prod",
        version="2025-05-20",
        region="cn-shanghai",
        host="open.volcengineapi.com",
    )

    with (
        patch(
            "veadk.auth.veauth.speech_veauth.get_credential_from_vefaas_iam",
            return_value=iam_credential,
        ) as fake_get_credential,
        patch(
            "veadk.auth.veauth.speech_veauth.ve_request",
            return_value={"Result": {"APIKeys": [{"APIKey": "vefaas_api_key"}]}},
        ) as fake_request,
    ):
        # Act
        token = get_speech_token(region="cn-shanghai")

    # Assert: the mocks keep their call records after the patches are undone.
    assert token == "vefaas_api_key"
    fake_get_credential.assert_called_once()
    fake_request.assert_called_once_with(**expected_kwargs)
97+
98+
99+
def test_get_speech_token_invalid_response():
    """A response without ``Result`` must surface as a ``ValueError``."""
    # Setup
    mock_response = {"Error": {"Message": "Invalid request"}}

    # MonkeyPatch.context() restores the environment on exit. A bare
    # pytest.MonkeyPatch() is never undone, so the fake credentials would
    # leak into every test that runs afterwards in the same process.
    with (
        pytest.MonkeyPatch.context() as monkeypatch,
        patch("veadk.auth.veauth.speech_veauth.ve_request") as mock_ve_request,
    ):
        monkeypatch.setenv("VOLCENGINE_ACCESS_KEY", "test_access_key")
        monkeypatch.setenv("VOLCENGINE_SECRET_KEY", "test_secret_key")
        mock_ve_request.return_value = mock_response

        # Execute & Verify
        with pytest.raises(ValueError, match="Failed to get speech api key list"):
            get_speech_token()
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import queue
16+
import json
17+
import base64
18+
import requests
19+
from unittest import TestCase
20+
from unittest.mock import patch, MagicMock
21+
from google.adk.tools import ToolContext
22+
from veadk.tools.builtin_tools.tts import (
23+
text_to_speech,
24+
handle_server_response,
25+
save_output_to_file,
26+
_audio_player_thread,
27+
)
28+
29+
30+
class TestTTS(TestCase):
    """Unit tests for ``text_to_speech`` and its helper functions."""

    @staticmethod
    def _streamed_lines():
        """Return the fake streamed body: one audio chunk, then a final code line."""
        encoded_chunk = base64.b64encode(b"audio_chunk").decode()
        return [
            json.dumps({"code": 0, "data": encoded_chunk}),
            json.dumps({"code": 20000000}),
        ]

    def setUp(self):
        """Create a mocked tool context and inject the TTS environment variables."""
        invocation_context = MagicMock()
        invocation_context.user_id = "test_user"
        self.mock_tool_context = MagicMock(spec=ToolContext)
        self.mock_tool_context._invocation_context = invocation_context

        # Environment variables are patched for the duration of each test.
        tts_env = {
            "TOOL_VESPEECH_APP_ID": "test_app_id",
            "TOOL_VESPEECH_API_KEY": "test_api_key",
            "TOOL_VESPEECH_SPEAKER": "test_speaker",
        }
        self.patcher_env = patch.dict("os.environ", tts_env)
        self.patcher_env.start()

    def tearDown(self):
        """Restore the original environment."""
        self.patcher_env.stop()

    @patch("requests.Session")
    def test_tts_success(self, mock_session):
        """A successful request yields a dict containing ``saved_audio_path``."""
        fake_response = MagicMock()
        fake_response.headers = {"X-Tt-Logid": "test_log_id"}
        fake_response.iter_lines.return_value = self._streamed_lines()
        post = mock_session.return_value.post
        post.return_value = fake_response

        outcome = text_to_speech("test text", self.mock_tool_context)

        self.assertIsInstance(outcome, dict)
        self.assertIn("saved_audio_path", outcome)
        post.assert_called_once()
        fake_response.close.assert_called_once()

    @patch("requests.Session")
    def test_tts_failure(self, mock_session):
        """A request exception is reported through the ``error`` key."""
        post = mock_session.return_value.post
        post.side_effect = requests.exceptions.RequestException("Test error")

        outcome = text_to_speech("test text", self.mock_tool_context)

        self.assertIsInstance(outcome, dict)
        self.assertIn("error", outcome)
        self.assertIn("Test error", outcome["error"])
        post.assert_called_once()

    @patch("builtins.open")
    def test_handle_server_response_success(self, mock_open):
        """Handling a streamed response opens the target file for binary writing."""
        fake_response = MagicMock()
        fake_response.iter_lines.return_value = self._streamed_lines()

        handle_server_response(fake_response, "test.pcm")

        mock_open.assert_called_once_with("test.pcm", "wb")

    @patch("builtins.open")
    def test_save_output_to_file_success(self, mock_open):
        """Audio bytes are written once to the file opened in ``wb`` mode."""
        file_handle = MagicMock()
        mock_open.return_value.__enter__.return_value = file_handle

        save_output_to_file(b"audio_data", "test.pcm")

        mock_open.assert_called_once_with("test.pcm", "wb")
        file_handle.write.assert_called_once_with(b"audio_data")

    @patch("time.sleep")
    def test_audio_player_thread(self, mock_sleep):
        """One queued chunk is written to the stream and marked as done."""
        # The queue hands out a single chunk and then reports that it is empty.
        chunk_queue = MagicMock()
        chunk_queue.get.side_effect = [b"audio_data", queue.Empty]
        output_stream = MagicMock()
        # The stop flag reads False on the first check and True on the second.
        stop_flag = MagicMock()
        stop_flag.is_set.side_effect = [False, True]

        _audio_player_thread(chunk_queue, output_stream, stop_flag)

        output_stream.write.assert_called_once_with(b"audio_data")
        chunk_queue.task_done.assert_called_once()

veadk/auth/veauth/speech_veauth.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import os
16+
17+
from veadk.auth.veauth.utils import get_credential_from_vefaas_iam
18+
from veadk.utils.logger import get_logger
19+
from veadk.utils.volcengine_sign import ve_request
20+
21+
logger = get_logger(__name__)
22+
23+
24+
def get_speech_token(region: str = "cn-beijing") -> str:
    """Fetch a speech API key through the ``ListAPIKeys`` OpenAPI action.

    Credentials come from the ``VOLCENGINE_ACCESS_KEY`` and
    ``VOLCENGINE_SECRET_KEY`` environment variables. If either is missing,
    they are taken from ``get_credential_from_vefaas_iam()`` instead, and the
    session token it returns is sent in the ``X-Security-Token`` header.

    Args:
        region: Region passed to the signed request.

    Returns:
        The ``APIKey`` value of the first entry in ``Result.APIKeys``.

    Raises:
        ValueError: If the response does not contain at least one API key
            at ``Result.APIKeys[0].APIKey``.
    """
    logger.info("Fetching speech token...")

    access_key = os.getenv("VOLCENGINE_ACCESS_KEY")
    secret_key = os.getenv("VOLCENGINE_SECRET_KEY")
    session_token = ""

    if not (access_key and secret_key):
        # try to get from vefaas iam
        cred = get_credential_from_vefaas_iam()
        access_key = cred.access_key_id
        secret_key = cred.secret_access_key
        session_token = cred.session_token

    res = ve_request(
        request_body={"ProjectName": "default", "OnlyAvailable": True},
        header={"X-Security-Token": session_token},
        action="ListAPIKeys",
        ak=access_key,
        sk=secret_key,
        service="speech_saas_prod",
        version="2025-05-20",
        region=region,
        host="open.volcengineapi.com",
    )

    try:
        first_api_key_id = res["Result"]["APIKeys"][0]["APIKey"]
    except (KeyError, IndexError, TypeError) as err:
        # KeyError: a level is missing. IndexError: the key list is empty.
        # TypeError: a level is None or not subscriptable. All of them mean
        # the response carries no usable API key.
        raise ValueError(f"Failed to get speech api key list: {res}") from err

    logger.info("Successfully fetching speech API Key.")
    return first_api_key_id

veadk/configs/tool_configs.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121
from veadk.auth.veauth.prompt_pilot_veauth import PromptPilotVeAuth
2222
from veadk.auth.veauth.vesearch_veauth import VesearchVeAuth
23+
from veadk.auth.veauth.speech_veauth import get_speech_token
2324

2425

2526
class PromptPilotConfig(BaseModel):
@@ -38,5 +39,16 @@ def api_key(self) -> str:
3839
return os.getenv("TOOL_VESEARCH_API_KEY") or VesearchVeAuth().token
3940

4041

42+
class VeSpeechConfig(BaseSettings):
    """Settings for the speech tool, using the ``TOOL_VESPEECH_`` env prefix."""

    model_config = SettingsConfigDict(env_prefix="TOOL_VESPEECH_")

    # Defaults to an empty string when not configured.
    endpoint: int | str = ""

    @cached_property
    def api_key(self) -> str:
        """Return the speech API key, resolved once per instance.

        A non-empty ``TOOL_VESPEECH_API_KEY`` environment variable wins;
        otherwise the key is fetched with ``get_speech_token()``.
        """
        configured_key = os.getenv("TOOL_VESPEECH_API_KEY")
        if configured_key:
            return configured_key
        return get_speech_token()
50+
51+
4152
class BuiltinToolConfigs(BaseModel):
    """Container grouping the configuration objects of the builtin tools."""

    # Each field builds its own default config instance via default_factory.
    vesearch: VeSearchConfig = Field(default_factory=VeSearchConfig)
    vespeech: VeSpeechConfig = Field(default_factory=VeSpeechConfig)

0 commit comments

Comments
 (0)