Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 82 additions & 0 deletions examples/audio_speech_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
from zai import ZaiClient
import os
import traceback
import uuid


# Run the example from the project root (the parent of this script's folder)
# so the relative output path used below resolves the same way from any cwd.
script_dir = os.path.split(os.path.abspath(__file__))[0]
project_root = os.path.split(script_dir)[0]
os.chdir(project_root)


def text_to_speech_non_stream():
    """Synthesize a fixed sentence in a single request and save it to disk.

    Calls ``client.audio.speech`` with ``stream=False``, writes
    ``response.content`` to ``audio_speech_<uuid4>.<response_format>``
    relative to the current working directory, and prints the absolute
    path of the file that was written.

    Raises:
        Exception: anything raised while requesting or saving the audio is
            printed together with its traceback and then re-raised.
    """
    tts_client = ZaiClient()

    # Non-streaming requests accept 'wav' or 'pcm'; 'pcm' is the default.
    response_format = 'pcm'

    try:
        request = {
            'model': 'glm-tts',
            'input': 'Hello, this is a test for text-to-speech functionality.',
            'voice': 'tongtong',
            'response_format': response_format,
            'stream': False,
        }
        speech = tts_client.audio.speech(**request)

        # A random UUID in the name keeps repeated runs from clobbering
        # each other's output.
        output_file = f"audio_speech_{uuid.uuid4()}.{response_format}"
        with open(output_file, 'wb') as audio_out:
            audio_out.write(speech.content)

        print(f"Audio saved to {os.path.abspath(output_file)}")
    except Exception as e:
        print(f"Exception: {e}\nTraceback: {traceback.format_exc()}")
        raise


def text_to_speech_stream():
# Example: request speech with stream=True and print the content of each
# streamed chunk. Takes no arguments and returns nothing.
# NOTE(review): leading indentation is missing in this view of the file, so
# the nesting described in the comments below is inferred from syntax only.

# Create the SDK client
client = ZaiClient()

# Audio format
# Streaming only supports pcm format
response_format = 'pcm'

try:
# Ask for the same fixed sentence as the non-streaming example, but as a
# stream of chunks instead of a single response
response = client.audio.speech(
model='glm-tts',
input='Hello, this is a test for text-to-speech functionality.',
voice='tongtong',
response_format=response_format,
stream=True
)

# Walk the stream: each chunk is expected to carry choices[0].delta
# NOTE(review): delta.content is presumably the encoded audio payload
# (base64/hex per the SDK's encode_format option) - confirm against the API
chunk_index = 0
for chunk in response:
try:
choice = chunk.choices[0]
# A chunk without a delta ends the loop
if choice.delta is None:
break
# Only chunks with non-empty content are printed
if choice.delta.content:
print(f"[Chunk {chunk_index}] {choice.delta.content}")
# NOTE(review): cannot tell from here whether this increment belongs to
# the `if` above (counting printed chunks) or runs for every chunk - confirm
chunk_index += 1
# A chunk with an unexpected shape is reported; this handler does not re-raise
except (AttributeError, IndexError) as e:
print(f"Exception: {e}\nTraceback: {traceback.format_exc()}")
# Any other failure is printed with its traceback and then re-raised
except Exception as e:
print(f"Exception: {e}\nTraceback: {traceback.format_exc()}")
raise


if __name__ == "__main__":
    # Default demo: one-shot (non-streaming) synthesis saved to a file.
    text_to_speech_non_stream()

    # To try the streaming variant, enable the call below.
    # text_to_speech_stream()
83 changes: 83 additions & 0 deletions examples/audio_transcriptions_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
from zai import ZaiClient
import os
import traceback


# Run the example from the project root (the parent of this script's folder)
# so the relative sample-audio path used below resolves from any cwd.
script_dir = os.path.split(os.path.abspath(__file__))[0]
project_root = os.path.split(script_dir)[0]
os.chdir(project_root)


def audio_transcription_non_stream():
    """Transcribe the sample recording in one request and print the text.

    Prints a message and returns early when the sample file does not exist.
    Otherwise opens it in binary mode, calls
    ``client.audio.transcriptions.create`` with ``stream=False`` and prints
    ``response.text``.

    Raises:
        Exception: anything raised while opening the file or transcribing is
            printed together with its traceback and then re-raised.
    """
    asr_client = ZaiClient()

    # Accepted inputs: .wav / .mp3, size <= 25 MB, duration <= 30 seconds.
    audio_file_path = "tests/integration_tests/asr.wav"

    # Nothing to transcribe without the sample recording.
    sample_present = os.path.exists(audio_file_path)
    if not sample_present:
        print(f"Audio file not found: {audio_file_path}")
        return

    try:
        with open(audio_file_path, 'rb') as audio_file:
            request = {
                'model': 'glm-asr-2512',
                'file': audio_file,
                'stream': False,
            }
            transcription = asr_client.audio.transcriptions.create(**request)

        # Show the recognised text.
        print(transcription.text)
    except Exception as e:
        print(f"Exception: {e}\nTraceback: {traceback.format_exc()}")
        raise


def audio_transcription_stream():
    """Transcribe the sample recording with streaming and print each delta.

    Prints a message and returns early when the sample file does not exist.
    Otherwise opens it in binary mode, calls
    ``client.audio.transcriptions.create`` with ``stream=True`` and prints
    the ``delta`` of every streamed chunk that has a non-empty one.

    Raises:
        Exception: anything raised while opening the file or requesting the
            transcription is printed with its traceback and then re-raised.
            ``AttributeError`` / ``IndexError`` raised while handling a
            single chunk are only printed.
    """
    asr_client = ZaiClient()

    # Accepted inputs: .wav / .mp3, size <= 25 MB, duration <= 30 seconds.
    audio_file_path = "tests/integration_tests/asr.wav"

    # Nothing to transcribe without the sample recording.
    if not os.path.exists(audio_file_path):
        print(f"Audio file not found: {audio_file_path}")
        return

    try:
        # The file stays open while the stream is consumed.
        with open(audio_file_path, 'rb') as audio_file:
            events = asr_client.audio.transcriptions.create(
                model='glm-asr-2512',
                file=audio_file,
                stream=True,
            )

            print("Streaming transcription:")
            for event in events:
                try:
                    # Skip chunks that carry no (or an empty) delta.
                    if not hasattr(event, 'delta') or not event.delta:
                        continue
                    print(event.delta, flush=True)
                except (AttributeError, IndexError) as e:
                    print(f"Exception: {e}\nTraceback: {traceback.format_exc()}")
    except Exception as e:
        print(f"Exception: {e}\nTraceback: {traceback.format_exc()}")
        raise


if __name__ == "__main__":
    # Default demo: one-shot (non-streaming) transcription.
    audio_transcription_non_stream()

    # To try the streaming variant, enable the call below.
    # audio_transcription_stream()
16 changes: 12 additions & 4 deletions src/zai/api_resource/audio/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def speech(
input: str = None,
voice: str = None,
response_format: str = None,
watermark_enabled: Optional[bool] | NotGiven = NOT_GIVEN,
sensitive_word_check: Optional[SensitiveWordCheckRequest] | NotGiven = NOT_GIVEN,
request_id: str = None,
user_id: str = None,
Expand All @@ -66,13 +67,18 @@ def speech(
Generate speech audio from text input

Arguments:
model (str): The model to use for speech generation
input (str): The text to convert to speech
voice (str): The voice to use for speech generation
response_format (str): The format of the response audio
model (str): The model to use for speech generation (e.g., 'glm-tts')
input (str): The text to convert to speech (max length: 1024 characters)
voice (str): The voice to use for speech generation (e.g., 'tongtong', 'chuichui', 'xiaochen', etc.)
response_format (str): The format of the response audio ('wav' or 'pcm', default 'pcm')
watermark_enabled (Optional[bool]): Whether to enable watermark on generated audio
sensitive_word_check (Optional[SensitiveWordCheckRequest]): Sensitive word check configuration
request_id (str): Unique identifier for the request
user_id (str): User identifier
encode_format (str): Encoding format for streaming response ('base64' or 'hex', default 'base64')
speed (float): Speech speed, default 1.0, valid range [0.5, 2]
volume (float): Audio volume, default 1.0, valid range (0, 10]
stream (bool): Whether to use streaming output (default False)
extra_headers (Headers): Additional headers to send
extra_body (Body): Additional body parameters
timeout (float | httpx.Timeout): Request timeout
Expand All @@ -83,6 +89,8 @@ def speech(
'input': input,
'voice': voice,
'response_format': response_format,
'watermark_enabled': watermark_enabled,
'sensitive_word_check': sensitive_word_check,
'encode_format': encode_format,
'request_id': request_id,
'user_id': user_id,
Expand Down
20 changes: 10 additions & 10 deletions src/zai/api_resource/audio/transcriptions.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations

import logging
from typing import TYPE_CHECKING, Mapping, Optional, cast
from typing import TYPE_CHECKING, List, Mapping, Optional, cast

import httpx
from typing_extensions import Literal
Expand Down Expand Up @@ -43,10 +43,12 @@ def create(
*,
file: FileTypes,
model: str,
file_base64: Optional[str] | NotGiven = NOT_GIVEN,
prompt: Optional[str] | NotGiven = NOT_GIVEN,
hotwords: Optional[List[str]] | NotGiven = NOT_GIVEN,
request_id: Optional[str] | NotGiven = NOT_GIVEN,
user_id: Optional[str] | NotGiven = NOT_GIVEN,
stream: Optional[Literal[False]] | Literal[True] | NotGiven = NOT_GIVEN,
temperature: Optional[float] | NotGiven = NOT_GIVEN,
sensitive_word_check: Optional[SensitiveWordCheckRequest] | NotGiven = NOT_GIVEN,
extra_headers: Headers | None = None,
extra_body: Body | None = None,
Expand All @@ -58,28 +60,26 @@ def create(
Arguments:
file (FileTypes): Audio file to transcribe
model (str): The model to use for transcription
file_base64 (Optional[str]): Base64 encoded audio file (alternative to file)
prompt (Optional[str]): Previous transcription result for context
hotwords (Optional[List[str]]): Hot words to improve recognition rate
request_id (Optional[str]): Unique identifier for the request
user_id (Optional[str]): User identifier
stream (Optional[Literal[False]] | Literal[True]): Whether to stream the response
temperature (Optional[float]): Sampling temperature for transcription
sensitive_word_check (Optional[SensitiveWordCheckRequest]): Sensitive word check configuration
extra_headers (Headers): Additional headers to send
extra_body (Body): Additional body parameters
timeout (float | httpx.Timeout): Request timeout
"""
if temperature is not None and temperature != NOT_GIVEN:
if temperature <= 0:
temperature = 0.01
if temperature >= 1:
temperature = 0.99

body = deepcopy_minimal(
{
'model': model,
'file': file,
'file_base64': file_base64,
'prompt': prompt,
'hotwords': hotwords,
'request_id': request_id,
'user_id': user_id,
'temperature': temperature,
'sensitive_word_check': sensitive_word_check,
'stream': stream,
}
Expand Down
12 changes: 9 additions & 3 deletions src/zai/types/audio/audio_speech_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@ class AudioSpeechParams(TypedDict, total=False):
input (str): Text to be converted to speech
voice (str): Voice tone for speech generation
response_format (str): Format of the generated audio file
watermark_enabled (Optional[bool]): Whether to enable watermark on generated audio
encode_format (str): Encoding format for streaming response (base64 or hex)
speed (float): Speech speed, default 1.0, range [0.5, 2]
volume (float): Audio volume, default 1.0, range (0, 10]
stream (bool): Whether to use streaming output
sensitive_word_check (Optional[SensitiveWordCheckRequest]): Sensitive word check configuration
request_id (str): Request ID passed by client, must be unique; used to distinguish each request,
platform will generate default if not provided by client
Expand All @@ -26,10 +31,11 @@ class AudioSpeechParams(TypedDict, total=False):
input: str
voice: str
response_format: str
sensitive_word_check: Optional[SensitiveWordCheckRequest]
request_id: str
user_id: str
watermark_enabled: Optional[bool]
encode_format: str
speed: float
volume: float
stream: bool
sensitive_word_check: Optional[SensitiveWordCheckRequest]
request_id: str
user_id: str
12 changes: 9 additions & 3 deletions src/zai/types/audio/transcriptions_create_param.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from __future__ import annotations

from typing import Optional
from typing import List, Optional

from typing_extensions import TypedDict

Expand All @@ -13,7 +13,10 @@ class TranscriptionsParam(TypedDict, total=False):

Attributes:
model (str): Model encoding.
temperature (float): Sampling temperature.
file (str): Audio file to transcribe.
file_base64 (str): Base64 encoded audio file (alternative to file).
prompt (str): Previous transcription result for context in long text scenarios.
hotwords (List[str]): Hot words to improve recognition rate for specific domain vocabulary.
stream (bool): Whether to use streaming output.
sensitive_word_check (Optional[SensitiveWordCheckRequest]): Sensitive word check configuration.
request_id (str): Passed by the client, must ensure uniqueness; used to distinguish
Expand All @@ -23,7 +26,10 @@ class TranscriptionsParam(TypedDict, total=False):
"""

model: str
temperature: float
file: str
file_base64: str
prompt: str
hotwords: List[str]
stream: bool
sensitive_word_check: Optional[SensitiveWordCheckRequest]
request_id: str
Expand Down