diff --git a/examples/audio_speech_example.py b/examples/audio_speech_example.py
new file mode 100644
index 0000000..d53cf36
--- /dev/null
+++ b/examples/audio_speech_example.py
@@ -0,0 +1,82 @@
+from zai import ZaiClient
+import os
+import traceback
+import uuid
+
+
+# Change working directory to project root
+script_dir = os.path.dirname(os.path.abspath(__file__))
+project_root = os.path.dirname(script_dir)
+os.chdir(project_root)
+
+
+def text_to_speech_non_stream():
+    # Initialize client
+    client = ZaiClient()
+
+    # Audio format
+    # Supported response formats: wav, pcm (default)
+    response_format = 'pcm'
+
+    try:
+        # Generate speech audio from text
+        response = client.audio.speech(
+            model='glm-tts',
+            input='Hello, this is a test for text-to-speech functionality.',
+            voice='tongtong',
+            response_format=response_format,
+            stream=False
+        )
+
+        # Save audio to file with unique name
+        output_file = f"audio_speech_{uuid.uuid4()}.{response_format}"
+        with open(output_file, 'wb') as f:
+            f.write(response.content)
+
+        print(f"Audio saved to {os.path.abspath(output_file)}")
+    except Exception as e:
+        print(f"Exception: {e}\nTraceback: {traceback.format_exc()}")
+        raise
+
+
+def text_to_speech_stream():
+    # Initialize client
+    client = ZaiClient()
+
+    # Audio format
+    # Streaming only supports pcm format
+    response_format = 'pcm'
+
+    try:
+        # Generate speech audio with streaming
+        response = client.audio.speech(
+            model='glm-tts',
+            input='Hello, this is a test for text-to-speech functionality.',
+            voice='tongtong',
+            response_format=response_format,
+            stream=True
+        )
+
+        # Process streaming response
+        chunk_index = 0
+        for chunk in response:
+            try:
+                choice = chunk.choices[0]
+                if choice.delta is None:
+                    break
+                if choice.delta.content:
+                    print(f"[Chunk {chunk_index}] {choice.delta.content}")
+                    chunk_index += 1
+            except (AttributeError, IndexError) as e:
+                print(f"Exception: {e}\nTraceback: {traceback.format_exc()}")
+    except Exception as e:
+        print(f"Exception: {e}\nTraceback: {traceback.format_exc()}")
+        raise
+
+
+if __name__ == '__main__':
+    # Non-streaming text to speech
+    text_to_speech_non_stream()
+
+    # Streaming text to speech
+    # text_to_speech_stream()
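Note: the non-streaming example writes raw PCM, which most players will not open directly. A minimal post-processing sketch, assuming 16-bit mono samples at 24 kHz; the diff does not document the stream's PCM parameters, so verify them against the API reference:

```python
# Hypothetical helper: wrap the raw PCM bytes saved by the example above in a
# WAV container so ordinary audio players can open the file.
import wave


def pcm_to_wav(pcm_path: str, wav_path: str,
               sample_rate: int = 24000,  # assumed; check the API docs
               sample_width: int = 2,     # assumed 16-bit samples
               channels: int = 1):        # assumed mono
    with open(pcm_path, 'rb') as f:
        pcm_data = f.read()
    with wave.open(wav_path, 'wb') as wav_file:
        wav_file.setnchannels(channels)
        wav_file.setsampwidth(sample_width)
        wav_file.setframerate(sample_rate)
        wav_file.writeframes(pcm_data)


# pcm_to_wav('audio_speech_<uuid>.pcm', 'audio_speech.wav')
```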
diff --git a/examples/audio_transcriptions_example.py b/examples/audio_transcriptions_example.py
new file mode 100644
index 0000000..5b860c6
--- /dev/null
+++ b/examples/audio_transcriptions_example.py
@@ -0,0 +1,83 @@
+from zai import ZaiClient
+import os
+import traceback
+
+
+# Change working directory to project root
+script_dir = os.path.dirname(os.path.abspath(__file__))
+project_root = os.path.dirname(script_dir)
+os.chdir(project_root)
+
+
+def audio_transcription_non_stream():
+    # Initialize client
+    client = ZaiClient()
+
+    # Audio file path
+    # Supported formats: .wav, .mp3
+    # File size limit: <= 25 MB, Duration limit: <= 30 seconds
+    audio_file_path = "tests/integration_tests/asr.wav"
+
+    # Check if file exists
+    if not os.path.exists(audio_file_path):
+        print(f"Audio file not found: {audio_file_path}")
+        return
+
+    try:
+        # Open the audio file and create transcription
+        with open(audio_file_path, 'rb') as audio_file:
+            response = client.audio.transcriptions.create(
+                model='glm-asr-2512',
+                file=audio_file,
+                stream=False
+            )
+
+        # Print transcription result
+        print(response.text)
+    except Exception as e:
+        print(f"Exception: {e}\nTraceback: {traceback.format_exc()}")
+        raise
+
+
+def audio_transcription_stream():
+    # Initialize client
+    client = ZaiClient()
+
+    # Audio file path
+    # Supported formats: .wav, .mp3
+    # File size limit: <= 25 MB, Duration limit: <= 30 seconds
+    audio_file_path = "tests/integration_tests/asr.wav"
+
+    # Check if file exists
+    if not os.path.exists(audio_file_path):
+        print(f"Audio file not found: {audio_file_path}")
+        return
+
+    try:
+        # Open the audio file and create transcription with streaming
+        with open(audio_file_path, 'rb') as audio_file:
+            response = client.audio.transcriptions.create(
+                model='glm-asr-2512',
+                file=audio_file,
+                stream=True
+            )
+
+            # Process streaming response
+            print("Streaming transcription:")
+            for chunk in response:
+                try:
+                    if hasattr(chunk, 'delta') and chunk.delta:
+                        print(chunk.delta, flush=True)
+                except (AttributeError, IndexError) as e:
+                    print(f"Exception: {e}\nTraceback: {traceback.format_exc()}")
+    except Exception as e:
+        print(f"Exception: {e}\nTraceback: {traceback.format_exc()}")
+        raise
+
+
+if __name__ == '__main__':
+    # Non-streaming audio transcription
+    audio_transcription_non_stream()
+
+    # Streaming audio transcription
+    # audio_transcription_stream()
\ No newline at end of file
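For callers that want the final text rather than one line per chunk, a small variation on the streaming example above; this assumes the chunk shape the example already relies on (incremental text exposed as a `delta` attribute):

```python
# Sketch: accumulate streaming deltas into a full transcript.
from zai import ZaiClient

client = ZaiClient()  # credentials resolved the same way as in the examples

with open("tests/integration_tests/asr.wav", 'rb') as audio_file:
    response = client.audio.transcriptions.create(
        model='glm-asr-2512',
        file=audio_file,
        stream=True
    )
    transcript_parts = []
    for chunk in response:
        delta = getattr(chunk, 'delta', None)
        if delta:
            print(delta, end='', flush=True)  # render incrementally, no extra newlines
            transcript_parts.append(delta)
    print()
    full_transcript = ''.join(transcript_parts)
```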
diff --git a/src/zai/api_resource/audio/audio.py b/src/zai/api_resource/audio/audio.py
index a3cd152..e442fe7 100644
--- a/src/zai/api_resource/audio/audio.py
+++ b/src/zai/api_resource/audio/audio.py
@@ -51,6 +51,7 @@ def speech(
     input: str = None,
     voice: str = None,
     response_format: str = None,
+    watermark_enabled: Optional[bool] | NotGiven = NOT_GIVEN,
     sensitive_word_check: Optional[SensitiveWordCheckRequest] | NotGiven = NOT_GIVEN,
     request_id: str = None,
     user_id: str = None,
@@ -66,13 +67,18 @@ def speech(
     Generate speech audio from text input

     Arguments:
-        model (str): The model to use for speech generation
-        input (str): The text to convert to speech
-        voice (str): The voice to use for speech generation
-        response_format (str): The format of the response audio
+        model (str): The model to use for speech generation (e.g., 'glm-tts')
+        input (str): The text to convert to speech (max length: 1024 characters)
+        voice (str): The voice to use for speech generation (e.g., 'tongtong', 'chuichui', 'xiaochen', etc.)
+        response_format (str): The format of the response audio ('wav' or 'pcm', default 'pcm')
+        watermark_enabled (Optional[bool]): Whether to enable watermark on generated audio
         sensitive_word_check (Optional[SensitiveWordCheckRequest]): Sensitive word check configuration
         request_id (str): Unique identifier for the request
         user_id (str): User identifier
+        encode_format (str): Encoding format for streaming response ('base64' or 'hex', default 'base64')
+        speed (float): Speech speed, default 1.0, valid range [0.5, 2]
+        volume (float): Audio volume, default 1.0, valid range (0, 10]
+        stream (bool): Whether to use streaming output (default False)
         extra_headers (Headers): Additional headers to send
         extra_body (Body): Additional body parameters
         timeout (float | httpx.Timeout): Request timeout
@@ -83,6 +89,8 @@ def speech(
     'input': input,
     'voice': voice,
     'response_format': response_format,
+    'watermark_enabled': watermark_enabled,
+    'sensitive_word_check': sensitive_word_check,
     'encode_format': encode_format,
     'request_id': request_id,
     'user_id': user_id,
diff --git a/src/zai/api_resource/audio/transcriptions.py b/src/zai/api_resource/audio/transcriptions.py
index 5eff96d..9a421a2 100644
--- a/src/zai/api_resource/audio/transcriptions.py
+++ b/src/zai/api_resource/audio/transcriptions.py
@@ -1,7 +1,7 @@
 from __future__ import annotations

 import logging
-from typing import TYPE_CHECKING, Mapping, Optional, cast
+from typing import TYPE_CHECKING, List, Mapping, Optional, cast

 import httpx
 from typing_extensions import Literal
@@ -43,10 +43,12 @@ def create(
     *,
     file: FileTypes,
     model: str,
+    file_base64: Optional[str] | NotGiven = NOT_GIVEN,
+    prompt: Optional[str] | NotGiven = NOT_GIVEN,
+    hotwords: Optional[List[str]] | NotGiven = NOT_GIVEN,
     request_id: Optional[str] | NotGiven = NOT_GIVEN,
     user_id: Optional[str] | NotGiven = NOT_GIVEN,
     stream: Optional[Literal[False]] | Literal[True] | NotGiven = NOT_GIVEN,
-    temperature: Optional[float] | NotGiven = NOT_GIVEN,
     sensitive_word_check: Optional[SensitiveWordCheckRequest] | NotGiven = NOT_GIVEN,
     extra_headers: Headers | None = None,
     extra_body: Body | None = None,
@@ -58,28 +60,26 @@ def create(
     Arguments:
         file (FileTypes): Audio file to transcribe
         model (str): The model to use for transcription
+        file_base64 (Optional[str]): Base64 encoded audio file (alternative to file)
+        prompt (Optional[str]): Previous transcription result for context
+        hotwords (Optional[List[str]]): Hot words to improve recognition rate
         request_id (Optional[str]): Unique identifier for the request
        user_id (Optional[str]): User identifier
         stream (Optional[Literal[False]] | Literal[True]): Whether to stream the response
-        temperature (Optional[float]): Sampling temperature for transcription
         sensitive_word_check (Optional[SensitiveWordCheckRequest]): Sensitive word check configuration
         extra_headers (Headers): Additional headers to send
         extra_body (Body): Additional body parameters
         timeout (float | httpx.Timeout): Request timeout
     """
-    if temperature is not None and temperature != NOT_GIVEN:
-        if temperature <= 0:
-            temperature = 0.01
-        if temperature >= 1:
-            temperature = 0.99
-
     body = deepcopy_minimal(
         {
             'model': model,
             'file': file,
+            'file_base64': file_base64,
+            'prompt': prompt,
+            'hotwords': hotwords,
             'request_id': request_id,
             'user_id': user_id,
-            'temperature': temperature,
             'sensitive_word_check': sensitive_word_check,
             'stream': stream,
         }
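A usage sketch for the parameters this diff adds. The keyword names come from the updated signatures and docstrings above; the concrete values (speed, volume, hotwords, prompt text, output path) are illustrative only:

```python
from zai import ZaiClient

client = ZaiClient()

# speech(): slower, quieter output with the watermark enabled
response = client.audio.speech(
    model='glm-tts',
    input='Testing the new speed, volume, and watermark options.',
    voice='tongtong',
    response_format='wav',
    speed=0.8,               # valid range [0.5, 2]
    volume=0.5,              # valid range (0, 10]
    watermark_enabled=True,
)
with open('speech_options.wav', 'wb') as f:
    f.write(response.content)

# transcriptions.create(): bias recognition with hotwords and prior context
with open('tests/integration_tests/asr.wav', 'rb') as audio_file:
    transcription = client.audio.transcriptions.create(
        model='glm-asr-2512',
        file=audio_file,
        hotwords=['GLM', 'ZaiClient'],          # domain vocabulary to boost
        prompt='Previous transcription text.',  # context carried over from an earlier segment
    )
print(transcription.text)
```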
diff --git a/src/zai/types/audio/audio_speech_params.py b/src/zai/types/audio/audio_speech_params.py
index 8843ab3..cff36b5 100644
--- a/src/zai/types/audio/audio_speech_params.py
+++ b/src/zai/types/audio/audio_speech_params.py
@@ -16,6 +16,11 @@ class AudioSpeechParams(TypedDict, total=False):
     input (str): Text to be converted to speech
     voice (str): Voice tone for speech generation
     response_format (str): Format of the generated audio file
+    watermark_enabled (Optional[bool]): Whether to enable watermark on generated audio
+    encode_format (str): Encoding format for streaming response (base64 or hex)
+    speed (float): Speech speed, default 1.0, range [0.5, 2]
+    volume (float): Audio volume, default 1.0, range (0, 10]
+    stream (bool): Whether to use streaming output
     sensitive_word_check (Optional[SensitiveWordCheckRequest]): Sensitive word check configuration
     request_id (str): Request ID passed by client, must be unique; used to distinguish
         each request, platform will generate default if not provided by client
@@ -26,10 +31,11 @@ class AudioSpeechParams(TypedDict, total=False):
     input: str
     voice: str
     response_format: str
-    sensitive_word_check: Optional[SensitiveWordCheckRequest]
-    request_id: str
-    user_id: str
+    watermark_enabled: Optional[bool]
     encode_format: str
     speed: float
     volume: float
     stream: bool
+    sensitive_word_check: Optional[SensitiveWordCheckRequest]
+    request_id: str
+    user_id: str
diff --git a/src/zai/types/audio/transcriptions_create_param.py b/src/zai/types/audio/transcriptions_create_param.py
index ae4fef2..0a63d0a 100644
--- a/src/zai/types/audio/transcriptions_create_param.py
+++ b/src/zai/types/audio/transcriptions_create_param.py
@@ -1,6 +1,6 @@
 from __future__ import annotations

-from typing import Optional
+from typing import List, Optional

 from typing_extensions import TypedDict

@@ -13,7 +13,10 @@ class TranscriptionsParam(TypedDict, total=False):

     Attributes:
         model (str): Model encoding.
-        temperature (float): Sampling temperature.
+        file (str): Audio file to transcribe.
+        file_base64 (str): Base64 encoded audio file (alternative to file).
+        prompt (str): Previous transcription result for context in long text scenarios.
+        hotwords (List[str]): Hot words to improve recognition rate for specific domain vocabulary.
         stream (bool): Whether to use streaming output.
         sensitive_word_check (Optional[SensitiveWordCheckRequest]): Sensitive word check configuration.
         request_id (str): Passed by the client, must ensure uniqueness; used to distinguish
@@ -23,7 +26,10 @@
     model: str
-    temperature: float
+    file: str
+    file_base64: str
+    prompt: str
+    hotwords: List[str]
     stream: bool
     sensitive_word_check: Optional[SensitiveWordCheckRequest]
     request_id: str
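For reference, a sketch of the updated TypedDicts in use, assuming the modules are importable from the paths shown above. Both classes declare total=False, so any subset of keys is valid; only fields visible in this diff are used here:

```python
from zai.types.audio.audio_speech_params import AudioSpeechParams
from zai.types.audio.transcriptions_create_param import TranscriptionsParam

speech_params: AudioSpeechParams = {
    'input': 'Hello world',
    'voice': 'tongtong',
    'speed': 1.2,                  # range [0.5, 2]
    'volume': 2.0,                 # range (0, 10]
    'watermark_enabled': False,
}

transcription_params: TranscriptionsParam = {
    'model': 'glm-asr-2512',
    'file_base64': '<base64-encoded audio>',  # alternative to uploading a file
    'hotwords': ['GLM', 'ASR'],
    'stream': False,
}
```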