feat: tts streaming (#90)

iceAndFireisFailed · yuhongxiao · 赵嘉琦 · web-flow · commit 0d314d2e01a1 · 2025-07-16T13:43:03.000+08:00
* 接口变更 * 新增音频合成流式返回 * 增加流式音频合成功能 * feat:豆神一期，添加音色，查询音色列表 * feat:APIKEY * feat:APIKEY * Revert "feat:APIKEY" This reverts commit 5775bc5. * Revert "feat:APIKEY" This reverts commit 616b652. * Revert "feat:豆神一期，添加音色，查询音色列表" This reverts commit 7d79be9. * 优化 * 优化测试代码 * 参数优化 --------- Co-authored-by: yuhongxiao <hongxiao.yu@aminer.cn> Co-authored-by: 赵嘉琦 <jiaqi.zhao@aminer.cn> Co-authored-by: tomsun28 <tomsun28@outlook.com>
diff --git a/tests/integration_tests/test_audio.py b/tests/integration_tests/test_audio.py
@@ -1,3 +1,5 @@
+import base64
+import json
 import logging
 import logging.config
 from pathlib import Path
@@ -14,7 +16,8 @@ def test_audio_speech(logging_conf):
 		response = client.audio.speech(
 			model='cogtts',
 			input='你好呀,欢迎来到智谱开放平台',
-			voice='female',
+			voice='tongtong',
+			stream=False,
 			response_format='wav',
 		)
 		response.stream_to_file(speech_file_path)
@@ -26,6 +29,37 @@ def test_audio_speech(logging_conf):
 	except zhipuai.core._errors.APIStatusError as err:
 		print(err)
 
+def test_audio_speech_streaming(logging_conf):
+	"""Stream cogtts speech and append each base64-decoded audio delta to output.pcm; errors are printed, not raised."""
+	logging.config.dictConfig(logging_conf)  # type: ignore
+	client = ZhipuAI()  # Fill in your own API key
+	try:
+		response = client.audio.speech(
+			model='cogtts',
+			input='你好呀,欢迎来到智谱开放平台',
+			voice='tongtong',
+			stream=True,
+			response_format='wav',
+		)
+		with open("output.pcm", "wb") as f:
+			for item in response:
+				choice = item.choices[0]
+				audio_delta = choice.delta.content
+				if choice.finish_reason is not None:
+					break  # the chunk that carries a finish_reason ends the loop and is not written
+				if audio_delta is None:
+					continue  # delta.content is Optional; b64decode(None) would raise TypeError and abort the stream
+				f.write(base64.b64decode(audio_delta))
+				print(f"{choice.index}.finish_reason = {choice.finish_reason}, audio_delta = {len(audio_delta)}")
+	except zhipuai.core._errors.APIRequestFailedError as err:
+		print(err)
+	except zhipuai.core._errors.APIInternalError as err:
+		print(err)
+	except zhipuai.core._errors.APIStatusError as err:
+		print(err)
+	except Exception as e:
+		print(e)
+
 
 def test_audio_customization(logging_conf):
 	logging.config.dictConfig(logging_conf)
diff --git a/zhipuai/api_resource/audio/audio.py b/zhipuai/api_resource/audio/audio.py
@@ -9,7 +9,7 @@
 from zhipuai.types.audio import AudioSpeechParams
 from ...types.audio import audio_customization_param
 
-from zhipuai.core import BaseAPI, maybe_transform
+from zhipuai.core import BaseAPI, maybe_transform, StreamResponse
 from zhipuai.core import NOT_GIVEN, Body, Headers, NotGiven, FileTypes
 from zhipuai.core import _legacy_response
 
@@ -20,6 +20,7 @@
     make_request_options,
 )
 from zhipuai.core import deepcopy_minimal
+from ...types.audio.audio_speech_chunk import AudioSpeechChunk
 
 if TYPE_CHECKING:
     from zhipuai._client import ZhipuAI
@@ -46,15 +47,17 @@ def speech(
             sensitive_word_check: Optional[SensitiveWordCheckRequest] | NotGiven = NOT_GIVEN,
             request_id: str = None,
             user_id: str = None,
+            stream: bool = False,
             extra_headers: Headers | None = None,
             extra_body: Body | None = None,
             timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
-    ) -> _legacy_response.HttpxBinaryResponseContent:
+    ) -> _legacy_response.HttpxBinaryResponseContent | StreamResponse[AudioSpeechChunk]:
         body = deepcopy_minimal(
             {
                 "model": model,
                 "input": input,
                 "voice": voice,
+                "stream": stream,
                 "response_format": response_format,
                 "sensitive_word_check": sensitive_word_check,
                 "request_id": request_id,
@@ -63,11 +66,13 @@ def speech(
         )
         return self._post(
             "/audio/speech",
-            body=maybe_transform(body, AudioSpeechParams),
+            body=body,
             options=make_request_options(
                 extra_headers=extra_headers, extra_body=extra_body, timeout=timeout
             ),
-            cast_type=_legacy_response.HttpxBinaryResponseContent
+            cast_type=_legacy_response.HttpxBinaryResponseContent,
+            stream= stream or False,
+            stream_cls=StreamResponse[AudioSpeechChunk]
         )
 
     def customization(
diff --git a/zhipuai/types/audio/audio_speech_chunk.py b/zhipuai/types/audio/audio_speech_chunk.py
@@ -0,0 +1,32 @@
+from typing import List, Optional, Dict, Any
+
+from ...core import BaseModel
+
+__all__ = [
+    "AudioSpeechChunk",
+    "AudioError",
+    "AudioSpeechChoice",
+    "AudioSpeechDelta"
+]
+
+
+class AudioSpeechDelta(BaseModel):  # incremental payload carried by one AudioSpeechChoice of a streamed speech response
+    content: Optional[str] = None  # audio fragment; test_audio_speech_streaming base64-decodes it, so presumably base64 text — may be None
+    role: Optional[str] = None  # presumably the producer role, as in chat deltas — TODO confirm against the API
+
+
+class AudioSpeechChoice(BaseModel):  # element type of AudioSpeechChunk.choices
+    delta: AudioSpeechDelta  # required: the incremental payload for this choice
+    finish_reason: Optional[str] = None  # test_audio_speech_streaming stops reading at the first chunk where this is not None
+    index: int  # required integer index of this choice
+
+class AudioError(BaseModel):  # error details attached to a streamed chunk; derives from BaseModel like its sibling models so it is populated and validated from the payload
+    code: Optional[str] = None  # error code (optional)
+    message: Optional[str] = None  # error message text (optional)
+
+
+class AudioSpeechChunk(BaseModel):  # item type of the StreamResponse that Audio.speech passes as stream_cls
+    choices: List[AudioSpeechChoice]  # required; test_audio_speech_streaming reads choices[0]
+    request_id: Optional[str] = None  # presumably echoes the request identifier — TODO confirm
+    created: Optional[int] = None  # presumably a creation timestamp — TODO confirm units
+    error: Optional[AudioError] = None  # optional error details (AudioError)