zai-org · 666ycy · Jan 8, 2026 · Jan 9, 2026 · Jan 9, 2026
diff --git a/examples/audio_speech_example.py b/examples/audio_speech_example.py
@@ -0,0 +1,82 @@
+from zai import ZaiClient
+import os
+import traceback
+import uuid
+
+
+# Switch to the parent of this script's directory (the project root) so relative paths resolve there
+script_dir = os.path.dirname(os.path.abspath(__file__))
+project_root = os.path.dirname(script_dir)
+os.chdir(project_root)
+
+
+def text_to_speech_non_stream():
+	"""Synthesize a fixed sentence with glm-tts and write the audio bytes to a new file.
+
+	Any exception is printed together with its traceback and then re-raised.
+	"""
+	# Supported response formats: wav, pcm (default)
+	audio_format = 'pcm'
+	tts_client = ZaiClient()
+
+	try:
+		# Non-streaming request; the audio bytes are read from `.content` below
+		speech = tts_client.audio.speech(
+			model='glm-tts',
+			input='Hello, this is a test for text-to-speech functionality.',
+			voice='tongtong',
+			response_format=audio_format,
+			stream=False
+		)
+
+		# A UUID in the file name keeps repeated runs from overwriting each other
+		target_path = f"audio_speech_{uuid.uuid4()}.{audio_format}"
+		with open(target_path, 'wb') as audio_out:
+			audio_out.write(speech.content)
+		print(f"Audio saved to {os.path.abspath(target_path)}")
+	except Exception as err:
+		print(f"Exception: {err}\nTraceback: {traceback.format_exc()}")
+		raise
+
+
+def text_to_speech_stream():
+	"""Request streamed speech from glm-tts and print each non-empty content chunk.
+
+	Chunks lacking the expected attributes are reported and skipped; any other
+	exception is printed with its traceback and re-raised.
+	"""
+	# Streaming only supports pcm format
+	audio_format = 'pcm'
+	tts_client = ZaiClient()
+
+	try:
+		stream = tts_client.audio.speech(
+			model='glm-tts',
+			input='Hello, this is a test for text-to-speech functionality.',
+			voice='tongtong',
+			response_format=audio_format,
+			stream=True
+		)
+		printed = 0  # numbers only the chunks that actually carried content
+		for event in stream:
+			try:
+				delta = event.choices[0].delta
+				if delta is None:
+					break  # a chunk without a delta ends the loop
+				payload = delta.content
+				if payload:
+					print(f"[Chunk {printed}] {payload}")
+					printed += 1
+			except (AttributeError, IndexError) as err:
+				print(f"Exception: {err}\nTraceback: {traceback.format_exc()}")
+	except Exception as err:
+		print(f"Exception: {err}\nTraceback: {traceback.format_exc()}")
+		raise
+
+
+if __name__ == '__main__':
+	# Default demo: non-streaming text to speech (writes the audio to a file)
+	text_to_speech_non_stream()
+
+	# Streaming text to speech is disabled by default; uncomment the call below to run it
+	# text_to_speech_stream()
diff --git a/examples/audio_transcriptions_example.py b/examples/audio_transcriptions_example.py
@@ -0,0 +1,83 @@
+from zai import ZaiClient
+import os
+import traceback
+
+
+# Switch to the parent of this script's directory (the project root) so the relative audio path resolves
+script_dir = os.path.dirname(os.path.abspath(__file__))
+project_root = os.path.dirname(script_dir)
+os.chdir(project_root)
+
+
+def audio_transcription_non_stream():
+	"""Transcribe the sample WAV file with glm-asr-2512 and print the resulting text.
+
+	Prints a notice and returns early when the sample file is missing. Any
+	exception is printed with its traceback and re-raised.
+	"""
+	asr_client = ZaiClient()
+
+	# Supported formats: .wav, .mp3
+	# File size limit: <= 25 MB, Duration limit: <= 30 seconds
+	sample_path = "tests/integration_tests/asr.wav"
+	sample_present = os.path.exists(sample_path)
+	if not sample_present:
+		print(f"Audio file not found: {sample_path}")
+		return
+
+	try:
+		# The file handle is open only while the request is being made
+		with open(sample_path, 'rb') as wav_handle:
+			result = asr_client.audio.transcriptions.create(
+				model='glm-asr-2512',
+				file=wav_handle,
+				stream=False
+			)
+		print(result.text)
+	except Exception as err:
+		print(f"Exception: {err}\nTraceback: {traceback.format_exc()}")
+		raise
+
+
+def audio_transcription_stream():
+	"""Transcribe the sample WAV file with streaming enabled, printing each delta.
+
+	Prints a notice and returns early when the sample file is missing. Any
+	exception escaping the request or the loop is printed and re-raised.
+	"""
+	asr_client = ZaiClient()
+
+	# Supported formats: .wav, .mp3
+	# File size limit: <= 25 MB, Duration limit: <= 30 seconds
+	sample_path = "tests/integration_tests/asr.wav"
+	if not os.path.exists(sample_path):
+		print(f"Audio file not found: {sample_path}")
+		return
+
+	try:
+		# The stream is requested while the file is open and iterated afterwards
+		with open(sample_path, 'rb') as wav_handle:
+			events = asr_client.audio.transcriptions.create(
+				model='glm-asr-2512',
+				file=wav_handle,
+				stream=True
+			)
+		print("Streaming transcription:")
+		for event in events:
+			try:
+				piece = getattr(event, 'delta', None)
+				if piece:
+					print(piece, flush=True)
+			except (AttributeError, IndexError) as err:
+				print(f"Exception: {err}\nTraceback: {traceback.format_exc()}")
+	except Exception as err:
+		print(f"Exception: {err}\nTraceback: {traceback.format_exc()}")
+		raise
+
+
+if __name__ == '__main__':
+	# Default demo: non-streaming audio transcription (prints the full text)
+	audio_transcription_non_stream()
+
+	# Streaming audio transcription is disabled by default; uncomment the call below to run it
+	# audio_transcription_stream()
diff --git a/src/zai/api_resource/audio/audio.py b/src/zai/api_resource/audio/audio.py
@@ -51,6 +51,7 @@ def speech(
 		input: str = None,
 		voice: str = None,
 		response_format: str = None,
+		watermark_enabled: Optional[bool] | NotGiven = NOT_GIVEN,
 		sensitive_word_check: Optional[SensitiveWordCheckRequest] | NotGiven = NOT_GIVEN,
 		request_id: str = None,
 		user_id: str = None,
@@ -66,13 +67,18 @@ def speech(
 		Generate speech audio from text input
 
 		Arguments:
-			model (str): The model to use for speech generation
-			input (str): The text to convert to speech
-			voice (str): The voice to use for speech generation
-			response_format (str): The format of the response audio
+			model (str): The model to use for speech generation (e.g., 'glm-tts')
+			input (str): The text to convert to speech (max length: 1024 characters)
+			voice (str): The voice to use for speech generation (e.g., 'tongtong', 'chuichui', 'xiaochen', etc.)
+			response_format (str): The format of the response audio ('wav' or 'pcm', default 'pcm')
+			watermark_enabled (Optional[bool]): Whether to enable watermark on generated audio
 			sensitive_word_check (Optional[SensitiveWordCheckRequest]): Sensitive word check configuration
 			request_id (str): Unique identifier for the request
 			user_id (str): User identifier
+			encode_format (str): Encoding format for streaming response ('base64' or 'hex', default 'base64')
+			speed (float): Speech speed, default 1.0, valid range [0.5, 2]
+			volume (float): Audio volume, default 1.0, valid range (0, 10]
+			stream (bool): Whether to use streaming output (default False)
 			extra_headers (Headers): Additional headers to send
 			extra_body (Body): Additional body parameters
 			timeout (float | httpx.Timeout): Request timeout
@@ -83,6 +89,8 @@ def speech(
 				'input': input,
 				'voice': voice,
 				'response_format': response_format,
+				'watermark_enabled': watermark_enabled,
+				'sensitive_word_check': sensitive_word_check,
 				'encode_format': encode_format,
 				'request_id': request_id,
 				'user_id': user_id,

diff --git a/src/zai/api_resource/audio/transcriptions.py b/src/zai/api_resource/audio/transcriptions.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 import logging
-from typing import TYPE_CHECKING, Mapping, Optional, cast
+from typing import TYPE_CHECKING, List, Mapping, Optional, cast
 
 import httpx
 from typing_extensions import Literal
@@ -43,10 +43,12 @@ def create(
 		*,
 		file: FileTypes,
 		model: str,
+		file_base64: Optional[str] | NotGiven = NOT_GIVEN,
+		prompt: Optional[str] | NotGiven = NOT_GIVEN,
+		hotwords: Optional[List[str]] | NotGiven = NOT_GIVEN,
 		request_id: Optional[str] | NotGiven = NOT_GIVEN,
 		user_id: Optional[str] | NotGiven = NOT_GIVEN,
 		stream: Optional[Literal[False]] | Literal[True] | NotGiven = NOT_GIVEN,
-		temperature: Optional[float] | NotGiven = NOT_GIVEN,
 		sensitive_word_check: Optional[SensitiveWordCheckRequest] | NotGiven = NOT_GIVEN,
 		extra_headers: Headers | None = None,
 		extra_body: Body | None = None,
@@ -58,28 +60,26 @@ def create(
 		Arguments:
 			file (FileTypes): Audio file to transcribe
 			model (str): The model to use for transcription
+			file_base64 (Optional[str]): Base64 encoded audio file (alternative to file)
+			prompt (Optional[str]): Previous transcription result for context
+			hotwords (Optional[List[str]]): Hot words to improve recognition rate
 			request_id (Optional[str]): Unique identifier for the request
 			user_id (Optional[str]): User identifier
 			stream (Optional[Literal[False]] | Literal[True]): Whether to stream the response
-			temperature (Optional[float]): Sampling temperature for transcription
 			sensitive_word_check (Optional[SensitiveWordCheckRequest]): Sensitive word check configuration
 			extra_headers (Headers): Additional headers to send
 			extra_body (Body): Additional body parameters
 			timeout (float | httpx.Timeout): Request timeout
 		"""
-		if temperature is not None and temperature != NOT_GIVEN:
-			if temperature <= 0:
-				temperature = 0.01
-			if temperature >= 1:
-				temperature = 0.99
-
 		body = deepcopy_minimal(
 			{
 				'model': model,
 				'file': file,
+				'file_base64': file_base64,
+				'prompt': prompt,
+				'hotwords': hotwords,
 				'request_id': request_id,
 				'user_id': user_id,
-				'temperature': temperature,
 				'sensitive_word_check': sensitive_word_check,
 				'stream': stream,
 			}

diff --git a/src/zai/types/audio/audio_speech_params.py b/src/zai/types/audio/audio_speech_params.py
@@ -16,6 +16,11 @@ class AudioSpeechParams(TypedDict, total=False):
 		input (str): Text to be converted to speech
 		voice (str): Voice tone for speech generation
 		response_format (str): Format of the generated audio file
+		watermark_enabled (Optional[bool]): Whether to enable watermark on generated audio
+		encode_format (str): Encoding format for streaming response (base64 or hex)
+		speed (float): Speech speed, default 1.0, range [0.5, 2]
+		volume (float): Audio volume, default 1.0, range (0, 10]
+		stream (bool): Whether to use streaming output
 		sensitive_word_check (Optional[SensitiveWordCheckRequest]): Sensitive word check configuration
 		request_id (str): Request ID passed by client, must be unique; used to distinguish each request,
 			platform will generate default if not provided by client
@@ -26,10 +31,11 @@ class AudioSpeechParams(TypedDict, total=False):
 	input: str
 	voice: str
 	response_format: str
-	sensitive_word_check: Optional[SensitiveWordCheckRequest]
-	request_id: str
-	user_id: str
+	watermark_enabled: Optional[bool]
 	encode_format: str
 	speed: float
 	volume: float
 	stream: bool
+	sensitive_word_check: Optional[SensitiveWordCheckRequest]
+	request_id: str
+	user_id: str
diff --git a/src/zai/types/audio/transcriptions_create_param.py b/src/zai/types/audio/transcriptions_create_param.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-from typing import Optional
+from typing import List, Optional
 
 from typing_extensions import TypedDict
 
@@ -13,7 +13,10 @@ class TranscriptionsParam(TypedDict, total=False):
 
 	Attributes:
 		model (str): Model encoding.
-		temperature (float): Sampling temperature.
+		file (str): Audio file to transcribe.
+		file_base64 (str): Base64 encoded audio file (alternative to file).
+		prompt (str): Previous transcription result for context in long text scenarios.
+		hotwords (List[str]): Hot words to improve recognition rate for specific domain vocabulary.
 		stream (bool): Whether to use streaming output.
 		sensitive_word_check (Optional[SensitiveWordCheckRequest]): Sensitive word check configuration.
 		request_id (str): Passed by the client, must ensure uniqueness; used to distinguish
@@ -23,7 +26,10 @@ class TranscriptionsParam(TypedDict, total=False):
 	"""
 
 	model: str
-	temperature: float
+	file: str
+	file_base64: str
+	prompt: str
+	hotwords: List[str]
 	stream: bool
 	sensitive_word_check: Optional[SensitiveWordCheckRequest]
 	request_id: str