Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
([#111](https://github.com/alibaba/loongsuite-python-agent/pull/111))
- Initial implementation of DashScope instrumentation
([#66](https://github.com/alibaba/loongsuite-python-agent/pull/66))

### Fixed

- Fix MIME type inference logic for speech synthesis instrumentation
([#115](https://github.com/alibaba/loongsuite-python-agent/pull/115))
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,20 @@ Supported APIs
* ``ImageSynthesis.async_call`` (async task submission)
* ``ImageSynthesis.wait`` (async task waiting)

* **Speech Synthesis V1**

* ``SpeechSynthesizer.call`` (V1)

* **Speech Synthesis V2**

* ``SpeechSynthesizer.call`` (V2)

* **Video Synthesis**

* ``VideoSynthesis.call`` (sync)
* ``VideoSynthesis.async_call`` (async task submission)
* ``VideoSynthesis.wait`` (async task waiting)


Captured Attributes
--------------------
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,6 @@
wrap_multimodal_conversation_call,
wrap_speech_synthesis_call,
wrap_speech_synthesis_v2_call,
wrap_speech_synthesis_v2_streaming_call,
wrap_text_embedding_call,
wrap_text_rerank_call,
wrap_video_synthesis_async_call,
Expand Down Expand Up @@ -219,13 +218,6 @@ def wrap_speech_synthesis_v2_call_with_provider(
wrapped, instance, args, kwargs, handler=handler
)

def wrap_speech_synthesis_v2_streaming_call_with_provider(
wrapped, instance, args, kwargs
):
return wrap_speech_synthesis_v2_streaming_call(
wrapped, instance, args, kwargs, handler=handler
)

# Instrument Generation.call (sync)
try:
wrap_function_wrapper(
Expand Down Expand Up @@ -379,19 +371,6 @@ def wrap_speech_synthesis_v2_streaming_call_with_provider(
f"Failed to instrument SpeechSynthesizer.call (V2): {e}"
)

# Instrument SpeechSynthesizer.streaming_call (V2)
try:
wrap_function_wrapper(
module=_MODULE_SPEECH_SYNTHESIS_V2,
name="SpeechSynthesizer.streaming_call",
wrapper=wrap_speech_synthesis_v2_streaming_call_with_provider,
)
logger.debug("Instrumented SpeechSynthesizer.streaming_call (V2)")
except Exception as e:
logger.warning(
f"Failed to instrument SpeechSynthesizer.streaming_call (V2): {e}"
)

def _uninstrument(self, **kwargs):
"""Uninstrument the DashScope SDK.

Expand Down Expand Up @@ -430,10 +409,6 @@ def _uninstrument(self, **kwargs):
dashscope.audio.tts_v2.speech_synthesizer.SpeechSynthesizer,
"call",
)
unwrap(
dashscope.audio.tts_v2.speech_synthesizer.SpeechSynthesizer,
"streaming_call",
)
unwrap(dashscope.embeddings.text_embedding.TextEmbedding, "call")
unwrap(dashscope.rerank.text_rerank.TextReRank, "call")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@
from .speech_synthesis import (
wrap_speech_synthesis_call,
wrap_speech_synthesis_v2_call,
wrap_speech_synthesis_v2_streaming_call,
)
from .video_synthesis import (
wrap_video_synthesis_async_call,
Expand Down Expand Up @@ -60,5 +59,4 @@
# SpeechSynthesis
"wrap_speech_synthesis_call",
"wrap_speech_synthesis_v2_call",
"wrap_speech_synthesis_v2_streaming_call",
]
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from opentelemetry.util.genai.types import Error

from ..utils import (
_convert_speech_format_to_mime_type,
_create_invocation_from_speech_synthesis,
_create_invocation_from_speech_synthesis_v2,
_update_invocation_from_speech_synthesis_response,
Expand Down Expand Up @@ -59,6 +60,9 @@ def wrap_speech_synthesis_call(wrapped, instance, args, kwargs, handler=None):
# Create invocation object
invocation = _create_invocation_from_speech_synthesis(kwargs, model)

speech_format = kwargs.get("format", "wav") # default format is wav
mime_type = _convert_speech_format_to_mime_type(speech_format)

# Start LLM invocation (creates span)
handler.start_llm(invocation)

Expand All @@ -68,7 +72,7 @@ def wrap_speech_synthesis_call(wrapped, instance, args, kwargs, handler=None):

# Update invocation with response data
_update_invocation_from_speech_synthesis_response(
invocation, result
invocation, result, mime_type
)
handler.stop_llm(invocation)

Expand Down Expand Up @@ -115,15 +119,13 @@ def wrap_speech_synthesis_v2_call(
model = getattr(instance, "_model", None) or getattr(
instance, "model", "unknown"
)
voice = getattr(instance, "_voice", None) or getattr(
instance, "voice", None
)
speech_format = getattr(instance, "aformat", "mp3")
mime_type = _convert_speech_format_to_mime_type(speech_format)

text = args[0] if args else kwargs.get("text", "")

# Create invocation object
invocation = _create_invocation_from_speech_synthesis_v2(
model, text, voice
)
invocation = _create_invocation_from_speech_synthesis_v2(model, text)

# Start LLM invocation (creates span)
handler.start_llm(invocation)
Expand All @@ -135,7 +137,7 @@ def wrap_speech_synthesis_v2_call(
# Update invocation with response data
if result is not None:
_update_invocation_from_speech_synthesis_v2_response(
invocation, result
invocation, result, mime_type
)
handler.stop_llm(invocation)

Expand All @@ -151,64 +153,3 @@ def wrap_speech_synthesis_v2_call(
"Error in speech synthesis V2 instrumentation wrapper: %s", e
)
return wrapped(*args, **kwargs)


def wrap_speech_synthesis_v2_streaming_call(
    wrapped, instance, args, kwargs, handler=None
):
    """Wrapper for SpeechSynthesizerV2.streaming_call.

    Note: This is a streaming input method. The user calls it multiple times
    to send text, then calls streaming_complete() to finish.

    For now, we just instrument individual streaming_call() invocations.

    Args:
        wrapped: The original function being wrapped
        instance: The SpeechSynthesizer instance
        args: Positional arguments (text)
        kwargs: Keyword arguments
        handler: ExtendedTelemetryHandler instance (created during instrumentation)
    """
    # Without a telemetry handler there is nothing to record — pass through.
    if handler is None:
        logger.warning("Handler not provided, skipping instrumentation")
        return wrapped(*args, **kwargs)

    try:
        # Prefer the private attributes; fall back to public ones.
        model_name = (
            getattr(instance, "_model", None)
            or getattr(instance, "model", "unknown")
        )
        voice_name = (
            getattr(instance, "_voice", None)
            or getattr(instance, "voice", None)
        )
        input_text = args[0] if args else kwargs.get("text", "")

        # Build the invocation and tag it with the streaming operation name.
        invocation = _create_invocation_from_speech_synthesis_v2(
            model_name, input_text, voice_name
        )
        invocation.operation_name = "streaming_call"

        # Start LLM invocation (creates span)
        handler.start_llm(invocation)

        try:
            result = wrapped(*args, **kwargs)
        except Exception as call_error:
            # Record the failure on the span, then let the exception propagate
            # to the enclosing handler below (matching the original flow).
            handler.fail_llm(
                invocation,
                Error(message=str(call_error), type=type(call_error)),
            )
            raise
        else:
            # streaming_call yields no immediate response; close the span now.
            handler.stop_llm(invocation)
            return result

    except Exception as e:
        # Instrumentation must never break the user's call: log and fall
        # back to invoking the original function untraced.
        logger.exception(
            "Error in speech synthesis V2 streaming_call wrapper: %s", e
        )
        return wrapped(*args, **kwargs)
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,12 @@

# Multimodal utilities
from .multimodal import (
# SpeechSynthesis
_convert_speech_format_to_mime_type,
# ImageSynthesis
_create_invocation_from_image_synthesis,
# MultiModalConversation
_create_invocation_from_multimodal_conversation,
# SpeechSynthesis
_create_invocation_from_speech_synthesis,
_create_invocation_from_speech_synthesis_v2,
# VideoSynthesis
Expand Down Expand Up @@ -86,6 +87,7 @@
"_update_invocation_from_video_synthesis_response",
"_update_invocation_from_video_synthesis_async_response",
# SpeechSynthesis
"_convert_speech_format_to_mime_type",
"_create_invocation_from_speech_synthesis",
"_update_invocation_from_speech_synthesis_response",
"_create_invocation_from_speech_synthesis_v2",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,11 @@

from __future__ import annotations

import base64
import logging
from typing import Any, List, Optional

from opentelemetry.util.genai.types import (
Base64Blob,
Blob,
InputMessage,
LLMInvocation,
OutputMessage,
Expand Down Expand Up @@ -713,13 +712,14 @@ def _create_invocation_from_speech_synthesis(


def _update_invocation_from_speech_synthesis_response(
invocation: LLMInvocation, response: Any
invocation: LLMInvocation, response: Any, mime_type: Optional[str] = None
) -> None:
"""Update LLMInvocation with SpeechSynthesizer response data.

Args:
invocation: LLMInvocation to update
response: SpeechSynthesisResult object
mime_type: MIME type of audio (optional)
"""
if not response:
return
Expand All @@ -735,16 +735,14 @@ def _update_invocation_from_speech_synthesis_response(
if callable(audio_data):
audio_bytes = audio_data()
if audio_bytes:
# Encode audio as base64 and store in output_messages
audio_base64 = base64.b64encode(audio_bytes).decode("utf-8")
invocation.output_messages = [
OutputMessage(
role="assistant",
parts=[
Base64Blob(
mime_type="audio/wav",
Blob(
mime_type=mime_type,
modality="audio",
content=audio_base64,
content=audio_bytes,
)
],
finish_reason="stop",
Expand All @@ -759,14 +757,13 @@ def _update_invocation_from_speech_synthesis_response(


def _create_invocation_from_speech_synthesis_v2(
model: str, text: str, voice: Optional[str] = None
model: str, text: str
) -> LLMInvocation:
"""Create LLMInvocation from SpeechSynthesizerV2.call args.

Args:
model: Model name
text: Text to synthesize
voice: Voice name (optional)

Returns:
LLMInvocation object
Expand All @@ -788,27 +785,49 @@ def _create_invocation_from_speech_synthesis_v2(


def _update_invocation_from_speech_synthesis_v2_response(
invocation: LLMInvocation, audio_data: bytes
invocation: LLMInvocation,
audio_data: bytes,
mime_type: Optional[str] = None,
) -> None:
"""Update LLMInvocation with SpeechSynthesizerV2 response data.

Args:
invocation: LLMInvocation to update
audio_data: Audio data bytes
mime_type: MIME type of audio (optional)
"""
if audio_data:
# Encode audio as base64 and store in output_messages
audio_base64 = base64.b64encode(audio_data).decode("utf-8")
invocation.output_messages = [
OutputMessage(
role="assistant",
parts=[
Base64Blob(
mime_type="audio/mp3", # V2 typically returns mp3
Blob(
mime_type=mime_type,
modality="audio",
content=audio_base64,
content=audio_data,
)
],
finish_reason="stop",
)
]


def _convert_speech_format_to_mime_type(speech_format: str) -> Optional[str]:
"""Convert from speech format to mime type.

Args:
speech_format: speech format of DashScope

Returns:
the mime type of speech
"""
if speech_format == "wav":
return "audio/wav"
elif speech_format == "mp3":
return "audio/mp3"
elif speech_format == "pcm":
return "audio/pcm"
elif speech_format == "opus":
return "audio/opus"
else:
return None
Original file line number Diff line number Diff line change
Expand Up @@ -306,7 +306,7 @@ def on_close(self):
self.events.append("close")


@skip_without_api_key
@pytest.mark.skip("Not support now")
def test_speech_synthesis_v2_streaming_call_basic(
instrument_with_content, span_exporter
):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,7 @@ def _get_llm_response_attributes( # pylint: disable=too-many-branches
if (
invocation.monotonic_first_token_s is not None
and invocation.monotonic_start_s is not None
and invocation.monotonic_first_token_s >= invocation.monotonic_start_s
):
ttft_ns = int(
(invocation.monotonic_first_token_s - invocation.monotonic_start_s)
Expand Down
Loading