Merge pull request #75 from MetaGLM/feature/cogtts-0106

haiyang679 · web-flow · commit 4569cd146090 · 2025-06-26T11:02:00.000+08:00
Feature/cogtts 0106
diff --git a/tests/integration_tests/test_audio.py b/tests/integration_tests/test_audio.py
@@ -0,0 +1,49 @@
+from zhipuai import ZhipuAI
+import zhipuai
+
+import logging
+import logging.config
+from pathlib import Path
+
+
+def test_audio_speech(logging_conf):
+    """Request speech from audio.speech and write it to speech.wav beside this file."""
+    logging.config.dictConfig(logging_conf)  # type: ignore
+    client = ZhipuAI()  # fill in your own API key
+    speech_file_path = Path(__file__).parent / "speech.wav"
+    try:
+        response = client.audio.speech(
+            model="cogtts",
+            input="你好呀,欢迎来到智谱开放平台",
+            voice="female",
+            response_format="wav"
+        )
+        response.stream_to_file(speech_file_path)
+    except (
+        zhipuai.core._errors.APIRequestFailedError,
+        zhipuai.core._errors.APIInternalError,
+        zhipuai.core._errors.APIStatusError,
+    ) as err:
+        print(err)  # API errors are only printed; the test does not fail on them
+
+def test_audio_customization(logging_conf):
+    """Send test_case_8s.wav (beside this file) to audio.customization; save customization.wav."""
+    logging.config.dictConfig(logging_conf)  # type: ignore
+    client = ZhipuAI()  # fill in your own API key
+    speech_file_path = Path(__file__).parent / "customization.wav"
+    with open(Path(__file__).parent / "test_case_8s.wav", 'rb') as file:
+        try:
+            response = client.audio.customization(
+                model="cogtts",
+                input="你好呀,欢迎来到智谱开放平台",
+                voice_text="这是一条测试用例",
+                voice_data=file,
+                response_format="wav"
+            )
+            response.stream_to_file(speech_file_path)
+        except (
+            zhipuai.core._errors.APIRequestFailedError,
+            zhipuai.core._errors.APIInternalError,
+            zhipuai.core._errors.APIStatusError,
+        ) as err:
+            print(err)  # API errors are only printed; the test does not fail on them
diff --git a/tests/integration_tests/test_videos.py b/tests/integration_tests/test_videos.py
@@ -10,7 +10,7 @@ def test_videos(logging_conf):
     client = ZhipuAI()  # 填写您自己的APIKey
     try:
         response = client.videos.generations(
-            model="cogvideo",
+            model="cogvideox",
             prompt="一个开船的人",
 
             user_id="1212222"
diff --git a/zhipuai/api_resource/__init__.py b/zhipuai/api_resource/__init__.py
@@ -49,6 +49,10 @@
     Agents
 )
 
+from .audio import (
+    Audio
+)
+
 __all__ = [
     'Videos',
     'AsyncCompletions',
diff --git a/zhipuai/api_resource/audio/audio.py b/zhipuai/api_resource/audio/audio.py
@@ -1,12 +1,161 @@
 from __future__ import annotations
 
-from ...core import BaseAPI, cached_property
+from typing import TYPE_CHECKING, List, Mapping, cast, Optional, Dict
 from .transcriptions import Transcriptions
 
+from zhipuai.core._utils import extract_files
+
+from zhipuai.types.sensitive_word_check import SensitiveWordCheckRequest
+from zhipuai.types.audio import AudioSpeechParams
+from ...types.audio import audio_customization_param
 
+from zhipuai.core import BaseAPI, maybe_transform
+from zhipuai.core import NOT_GIVEN, Body, Headers, NotGiven, FileTypes
+from zhipuai.core import _legacy_response
+
+import httpx
+from ...core import BaseAPI, cached_property
+
+from zhipuai.core import (
+    make_request_options,
+)
+from zhipuai.core import deepcopy_minimal
+
+if TYPE_CHECKING:
+    from zhipuai._client import ZhipuAI
 
 __all__ = ["Audio"]
+
+
 class Audio(BaseAPI):
+    """Audio API resource: ``transcriptions`` plus the ``speech`` and ``customization`` calls."""
+
     @cached_property
     def transcriptions(self) -> Transcriptions:
         return Transcriptions(self._client)
+
+    def __init__(self, client: "ZhipuAI") -> None:
+        super().__init__(client)
+
+    def speech(
+            self,
+            *,
+            model: str,
+            input: Optional[str] = None,
+            voice: Optional[str] = None,
+            response_format: Optional[str] = None,
+            sensitive_word_check: Optional[SensitiveWordCheckRequest] | NotGiven = NOT_GIVEN,
+            request_id: Optional[str] = None,
+            user_id: Optional[str] = None,
+            extra_headers: Headers | None = None,
+            extra_body: Body | None = None,
+            timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
+    ) -> _legacy_response.HttpxBinaryResponseContent:
+        """POST a speech-synthesis request to ``/audio/speech``.
+
+        Every argument is keyword-only. ``model`` through ``user_id`` are
+        copied into the request body as given, including any left at ``None``.
+
+        Args:
+            model: Model code, e.g. ``"cogtts"``.
+            input: Text to generate speech for.
+            voice: Voice (timbre) to generate the speech with.
+            response_format: Format of the generated audio file, e.g. ``"wav"``.
+            sensitive_word_check: Optional ``SensitiveWordCheckRequest``.
+            request_id: Client-supplied unique identifier for this request.
+            user_id: The end user.
+            extra_headers: Forwarded to ``make_request_options``.
+            extra_body: Forwarded to ``make_request_options``.
+            timeout: Forwarded to ``make_request_options``.
+
+        Returns:
+            The ``self._post`` result, requested as
+            ``_legacy_response.HttpxBinaryResponseContent``.
+        """
+        body = deepcopy_minimal(
+            {
+                "model": model,
+                "input": input,
+                "voice": voice,
+                "response_format": response_format,
+                "sensitive_word_check": sensitive_word_check,
+                "request_id": request_id,
+                "user_id": user_id
+            }
+        )
+        return self._post(
+            "/audio/speech",
+            body=maybe_transform(body, AudioSpeechParams),
+            options=make_request_options(
+                extra_headers=extra_headers, extra_body=extra_body, timeout=timeout
+            ),
+            cast_type=_legacy_response.HttpxBinaryResponseContent
+        )
+
+    def customization(
+            self,
+            *,
+            model: str,
+            input: Optional[str] = None,
+            voice_text: Optional[str] = None,
+            voice_data: Optional[FileTypes] = None,
+            response_format: Optional[str] = None,
+            sensitive_word_check: Optional[SensitiveWordCheckRequest] | NotGiven = NOT_GIVEN,
+            request_id: Optional[str] = None,
+            user_id: Optional[str] = None,
+            extra_headers: Headers | None = None,
+            extra_body: Body | None = None,
+            timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
+    ) -> _legacy_response.HttpxBinaryResponseContent:
+        """POST a request carrying a voice sample to ``/audio/customization``.
+
+        Every argument is keyword-only. ``extract_files`` is asked for the
+        ``voice_data`` entry of the body and its result is passed to
+        ``self._post`` as ``files``. When that result is non-empty, a
+        ``multipart/form-data`` Content-Type header is added; entries in
+        ``extra_headers`` take precedence over it.
+
+        Args:
+            model: Model code, e.g. ``"cogtts"``.
+            input: Text to generate speech for.
+            voice_text: Text supplied with ``voice_data`` (presumably the
+                transcript of the sample; TODO confirm against the API docs).
+            voice_data: Voice sample, e.g. a file object opened in ``"rb"`` mode.
+            response_format: Format of the generated audio file, e.g. ``"wav"``.
+            sensitive_word_check: Optional ``SensitiveWordCheckRequest``.
+            request_id: Client-supplied unique identifier for this request.
+            user_id: The end user.
+            extra_headers: Forwarded to ``make_request_options``.
+            extra_body: Forwarded to ``make_request_options``.
+            timeout: Forwarded to ``make_request_options``.
+
+        Returns:
+            The ``self._post`` result, requested as
+            ``_legacy_response.HttpxBinaryResponseContent``.
+        """
+        body = deepcopy_minimal(
+            {
+                "model": model,
+                "input": input,
+                "voice_text": voice_text,
+                "voice_data": voice_data,
+                "response_format": response_format,
+                "sensitive_word_check": sensitive_word_check,
+                "request_id": request_id,
+                "user_id": user_id
+            }
+        )
+        # NOTE(review): voice_data defaults to None; confirm extract_files accepts a missing file.
+        files = extract_files(cast(Mapping[str, object], body), paths=[["voice_data"]])
+
+        if files:
+            extra_headers = {"Content-Type": "multipart/form-data", **(extra_headers or {})}
+        return self._post(
+            "/audio/customization",
+            body=maybe_transform(body, audio_customization_param.AudioCustomizationParam),
+            files=files,
+            options=make_request_options(
+                extra_headers=extra_headers, extra_body=extra_body, timeout=timeout
+            ),
+            cast_type=_legacy_response.HttpxBinaryResponseContent
+        )
diff --git a/zhipuai/core/_sse_client.py b/zhipuai/core/_sse_client.py
@@ -54,7 +54,7 @@ def __stream__(self) -> Iterator[ResponseT]:
                 data = sse.json_data()
                 if isinstance(data, Mapping) and data.get("agent_id"):
                     yield self._data_process_func(data=data, cast_type=self._cast_type, response=self.response)
-                    break
+                    continue
                 if isinstance(data, Mapping) and data.get("error"):
                     raise APIResponseError(
                         message="An error occurred during streaming",
diff --git a/zhipuai/types/audio/__init__.py b/zhipuai/types/audio/__init__.py
@@ -1,6 +1,12 @@
+from .audio_speech_params import(
+    AudioSpeechParams
+)
 
+from .audio_customization_param import(
+    AudioCustomizationParam
+)
 from .transcriptions_create_param import(
     TranscriptionsParam
 )
 
-__all__ = ["TranscriptionsParam"]
+__all__ = ["AudioSpeechParams","AudioCustomizationParam","TranscriptionsParam"]
diff --git a/zhipuai/types/audio/audio_customization_param.py b/zhipuai/types/audio/audio_customization_param.py
@@ -0,0 +1,25 @@
+from __future__ import annotations
+
+from typing import List, Optional
+
+from typing_extensions import Literal, Required, TypedDict
+__all__ = ["AudioCustomizationParam"]
+
+from ..sensitive_word_check import SensitiveWordCheckRequest
+
+class AudioCustomizationParam(TypedDict, total=False):  # body schema used by Audio.customization
+    model: str
+    """Model code."""
+    input: str
+    """Text to generate speech for."""
+    voice_text: str
+    """Text supplied with the voice sample (presumably its transcript; TODO confirm)."""
+    response_format: str
+    """Format of the generated audio file."""
+    sensitive_word_check: Optional[SensitiveWordCheckRequest]
+    request_id: str
+    """Client-supplied unique request identifier; the platform generates one by default if omitted."""
+    user_id: str
+    """The end user."""
+
+
diff --git a/zhipuai/types/audio/audio_speech_params.py b/zhipuai/types/audio/audio_speech_params.py
@@ -0,0 +1,25 @@
+from __future__ import annotations
+
+from typing import List, Optional
+
+from typing_extensions import Literal, Required, TypedDict
+
+__all__ = ["AudioSpeechParams"]
+
+from ..sensitive_word_check import SensitiveWordCheckRequest
+
+
+class AudioSpeechParams(TypedDict, total=False):  # body schema used by Audio.speech
+    model: str
+    """Model code."""
+    input: str
+    """Text to generate speech for."""
+    voice: str
+    """Voice (timbre) to generate the speech with."""
+    response_format: str
+    """Format of the generated audio file."""
+    sensitive_word_check: Optional[SensitiveWordCheckRequest]
+    request_id: str
+    """Client-supplied unique request identifier; the platform generates one by default if omitted."""
+    user_id: str
+    """The end user."""