
Commit 4d7ef48

Fix MIME type inference logic for speech synthesis instrumentation (#115)
1 parent 3cb78c7 commit 4d7ef48

File tree: 9 files changed, +135 additions, -114 deletions

instrumentation-loongsuite/loongsuite-instrumentation-dashscope/CHANGELOG.md

Lines changed: 5 additions & 0 deletions
@@ -12,3 +12,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
   ([#111](https://github.com/alibaba/loongsuite-python-agent/pull/111))
 - Initial implementation of DashScope instrumentation
   ([#66](https://github.com/alibaba/loongsuite-python-agent/pull/66))
+
+### Fixed
+
+- Fix MIME type inference logic for speech synthesis instrumentation
+  ([#115](https://github.com/alibaba/loongsuite-python-agent/pull/115))

instrumentation-loongsuite/loongsuite-instrumentation-dashscope/README.rst

Lines changed: 14 additions & 0 deletions
@@ -68,6 +68,20 @@ Supported APIs
   * ``ImageSynthesis.async_call`` (async task submission)
   * ``ImageSynthesis.wait`` (async task waiting)
 
+* **Speech Synthesis V1**
+
+  * ``SpeechSynthesizer.call`` (V1)
+
+* **Speech Synthesis V2**
+
+  * ``SpeechSynthesizer.call`` (V2)
+
+* **Video Synthesis**
+
+  * ``VideoSynthesis.call`` (sync)
+  * ``VideoSynthesis.async_call`` (async task submission)
+  * ``VideoSynthesis.wait`` (async task waiting)
+
 
 Captured Attributes
 --------------------
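For orientation (not part of this commit): a minimal sketch of a V1 speech-synthesis call that the wrappers in this change instrument. The import path, ``call`` signature, and ``sambert-zhichu-v1`` model name are assumptions about the DashScope SDK rather than taken from this diff, and running it requires a valid ``DASHSCOPE_API_KEY``.

```python
# Illustrative only -- not part of this commit. Assumes the DashScope V1 TTS API
# (module path, call signature, and model name are placeholders to verify against
# the SDK); needs DASHSCOPE_API_KEY set to actually run.
from dashscope.audio.tts import SpeechSynthesizer

result = SpeechSynthesizer.call(
    model="sambert-zhichu-v1",     # placeholder model name
    text="Hello from LoongSuite",
    format="mp3",                  # the wrapper maps this value to the "audio/mpeg" MIME type
)
audio_bytes = result.get_audio_data()  # raw audio; recorded on the span as an audio Blob
```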

instrumentation-loongsuite/loongsuite-instrumentation-dashscope/src/opentelemetry/instrumentation/dashscope/__init__.py

Lines changed: 0 additions & 25 deletions
@@ -60,7 +60,6 @@
     wrap_multimodal_conversation_call,
     wrap_speech_synthesis_call,
     wrap_speech_synthesis_v2_call,
-    wrap_speech_synthesis_v2_streaming_call,
     wrap_text_embedding_call,
     wrap_text_rerank_call,
     wrap_video_synthesis_async_call,
@@ -219,13 +218,6 @@ def wrap_speech_synthesis_v2_call_with_provider(
                 wrapped, instance, args, kwargs, handler=handler
             )
 
-        def wrap_speech_synthesis_v2_streaming_call_with_provider(
-            wrapped, instance, args, kwargs
-        ):
-            return wrap_speech_synthesis_v2_streaming_call(
-                wrapped, instance, args, kwargs, handler=handler
-            )
-
         # Instrument Generation.call (sync)
         try:
             wrap_function_wrapper(
@@ -379,19 +371,6 @@ def wrap_speech_synthesis_v2_streaming_call_with_provider(
                 f"Failed to instrument SpeechSynthesizer.call (V2): {e}"
             )
 
-        # Instrument SpeechSynthesizer.streaming_call (V2)
-        try:
-            wrap_function_wrapper(
-                module=_MODULE_SPEECH_SYNTHESIS_V2,
-                name="SpeechSynthesizer.streaming_call",
-                wrapper=wrap_speech_synthesis_v2_streaming_call_with_provider,
-            )
-            logger.debug("Instrumented SpeechSynthesizer.streaming_call (V2)")
-        except Exception as e:
-            logger.warning(
-                f"Failed to instrument SpeechSynthesizer.streaming_call (V2): {e}"
-            )
-
     def _uninstrument(self, **kwargs):
         """Uninstrument the DashScope SDK.
 
@@ -430,10 +409,6 @@ def _uninstrument(self, **kwargs):
             dashscope.audio.tts_v2.speech_synthesizer.SpeechSynthesizer,
             "call",
         )
-        unwrap(
-            dashscope.audio.tts_v2.speech_synthesizer.SpeechSynthesizer,
-            "streaming_call",
-        )
         unwrap(dashscope.embeddings.text_embedding.TextEmbedding, "call")
         unwrap(dashscope.rerank.text_rerank.TextReRank, "call")
 

instrumentation-loongsuite/loongsuite-instrumentation-dashscope/src/opentelemetry/instrumentation/dashscope/patch/__init__.py

Lines changed: 0 additions & 2 deletions
@@ -31,7 +31,6 @@
 from .speech_synthesis import (
     wrap_speech_synthesis_call,
     wrap_speech_synthesis_v2_call,
-    wrap_speech_synthesis_v2_streaming_call,
 )
 from .video_synthesis import (
     wrap_video_synthesis_async_call,
@@ -60,5 +59,4 @@
     # SpeechSynthesis
     "wrap_speech_synthesis_call",
     "wrap_speech_synthesis_v2_call",
-    "wrap_speech_synthesis_v2_streaming_call",
 ]

instrumentation-loongsuite/loongsuite-instrumentation-dashscope/src/opentelemetry/instrumentation/dashscope/patch/speech_synthesis.py

Lines changed: 10 additions & 69 deletions
@@ -21,6 +21,7 @@
 from opentelemetry.util.genai.types import Error
 
 from ..utils import (
+    _convert_speech_format_to_mime_type,
     _create_invocation_from_speech_synthesis,
     _create_invocation_from_speech_synthesis_v2,
     _update_invocation_from_speech_synthesis_response,
@@ -59,6 +60,9 @@ def wrap_speech_synthesis_call(wrapped, instance, args, kwargs, handler=None):
         # Create invocation object
         invocation = _create_invocation_from_speech_synthesis(kwargs, model)
 
+        speech_format = kwargs.get("format", "wav")  # default format is wav
+        mime_type = _convert_speech_format_to_mime_type(speech_format)
+
         # Start LLM invocation (creates span)
         handler.start_llm(invocation)
 
@@ -68,7 +72,7 @@ def wrap_speech_synthesis_call(wrapped, instance, args, kwargs, handler=None):
 
             # Update invocation with response data
             _update_invocation_from_speech_synthesis_response(
-                invocation, result
+                invocation, result, mime_type
             )
             handler.stop_llm(invocation)
 
@@ -115,15 +119,13 @@ def wrap_speech_synthesis_v2_call(
         model = getattr(instance, "_model", None) or getattr(
            instance, "model", "unknown"
        )
-        voice = getattr(instance, "_voice", None) or getattr(
-            instance, "voice", None
-        )
+        speech_format = getattr(instance, "aformat", "mp3")
+        mime_type = _convert_speech_format_to_mime_type(speech_format)
+
         text = args[0] if args else kwargs.get("text", "")
 
         # Create invocation object
-        invocation = _create_invocation_from_speech_synthesis_v2(
-            model, text, voice
-        )
+        invocation = _create_invocation_from_speech_synthesis_v2(model, text)
 
         # Start LLM invocation (creates span)
         handler.start_llm(invocation)
@@ -135,7 +137,7 @@ def wrap_speech_synthesis_v2_call(
             # Update invocation with response data
             if result is not None:
                 _update_invocation_from_speech_synthesis_v2_response(
-                    invocation, result
+                    invocation, result, mime_type
                 )
             handler.stop_llm(invocation)
 
@@ -151,64 +153,3 @@ def wrap_speech_synthesis_v2_call(
             "Error in speech synthesis V2 instrumentation wrapper: %s", e
         )
         return wrapped(*args, **kwargs)
-
-
-def wrap_speech_synthesis_v2_streaming_call(
-    wrapped, instance, args, kwargs, handler=None
-):
-    """Wrapper for SpeechSynthesizerV2.streaming_call.
-
-    Note: This is a streaming input method. The user calls it multiple times
-    to send text, then calls streaming_complete() to finish.
-
-    For now, we just instrument individual streaming_call() invocations.
-
-    Args:
-        wrapped: The original function being wrapped
-        instance: The SpeechSynthesizer instance
-        args: Positional arguments (text)
-        kwargs: Keyword arguments
-        handler: ExtendedTelemetryHandler instance (created during instrumentation)
-    """
-    if handler is None:
-        logger.warning("Handler not provided, skipping instrumentation")
-        return wrapped(*args, **kwargs)
-
-    try:
-        # Extract model and voice from instance
-        model = getattr(instance, "_model", None) or getattr(
-            instance, "model", "unknown"
-        )
-        voice = getattr(instance, "_voice", None) or getattr(
-            instance, "voice", None
-        )
-        text = args[0] if args else kwargs.get("text", "")
-
-        # Create invocation object
-        invocation = _create_invocation_from_speech_synthesis_v2(
-            model, text, voice
-        )
-        invocation.operation_name = "streaming_call"
-
-        # Start LLM invocation (creates span)
-        handler.start_llm(invocation)
-
-        try:
-            # Execute the wrapped call
-            result = wrapped(*args, **kwargs)
-
-            # For streaming_call, there's no immediate response
-            handler.stop_llm(invocation)
-
-            return result
-
-        except Exception as e:
-            error = Error(message=str(e), type=type(e))
-            handler.fail_llm(invocation, error)
-            raise
-
-    except Exception as e:
-        logger.exception(
-            "Error in speech synthesis V2 streaming_call wrapper: %s", e
-        )
-        return wrapped(*args, **kwargs)

instrumentation-loongsuite/loongsuite-instrumentation-dashscope/src/opentelemetry/instrumentation/dashscope/utils/__init__.py

Lines changed: 3 additions & 1 deletion
@@ -39,11 +39,12 @@
 
 # Multimodal utilities
 from .multimodal import (
+    # SpeechSynthesis
+    _convert_speech_format_to_mime_type,
     # ImageSynthesis
     _create_invocation_from_image_synthesis,
     # MultiModalConversation
     _create_invocation_from_multimodal_conversation,
-    # SpeechSynthesis
     _create_invocation_from_speech_synthesis,
     _create_invocation_from_speech_synthesis_v2,
     # VideoSynthesis
@@ -86,6 +87,7 @@
     "_update_invocation_from_video_synthesis_response",
     "_update_invocation_from_video_synthesis_async_response",
     # SpeechSynthesis
+    "_convert_speech_format_to_mime_type",
     "_create_invocation_from_speech_synthesis",
     "_update_invocation_from_speech_synthesis_response",
     "_create_invocation_from_speech_synthesis_v2",

instrumentation-loongsuite/loongsuite-instrumentation-dashscope/src/opentelemetry/instrumentation/dashscope/utils/multimodal.py

Lines changed: 35 additions & 16 deletions
@@ -23,12 +23,11 @@
 
 from __future__ import annotations
 
-import base64
 import logging
 from typing import Any, List, Optional
 
 from opentelemetry.util.genai.types import (
-    Base64Blob,
+    Blob,
     InputMessage,
     LLMInvocation,
     OutputMessage,
@@ -713,13 +712,14 @@ def _create_invocation_from_speech_synthesis(
 
 
 def _update_invocation_from_speech_synthesis_response(
-    invocation: LLMInvocation, response: Any
+    invocation: LLMInvocation, response: Any, mime_type: Optional[str] = None
 ) -> None:
     """Update LLMInvocation with SpeechSynthesizer response data.
 
     Args:
         invocation: LLMInvocation to update
         response: SpeechSynthesisResult object
+        mime_type: MIME type of audio (optional)
     """
     if not response:
         return
@@ -735,16 +735,14 @@ def _update_invocation_from_speech_synthesis_response(
     if callable(audio_data):
         audio_bytes = audio_data()
         if audio_bytes:
-            # Encode audio as base64 and store in output_messages
-            audio_base64 = base64.b64encode(audio_bytes).decode("utf-8")
             invocation.output_messages = [
                 OutputMessage(
                     role="assistant",
                     parts=[
-                        Base64Blob(
-                            mime_type="audio/wav",
+                        Blob(
+                            mime_type=mime_type,
                             modality="audio",
-                            content=audio_base64,
+                            content=audio_bytes,
                         )
                     ],
                     finish_reason="stop",
@@ -759,14 +757,13 @@
 
 
 def _create_invocation_from_speech_synthesis_v2(
-    model: str, text: str, voice: Optional[str] = None
+    model: str, text: str
 ) -> LLMInvocation:
     """Create LLMInvocation from SpeechSynthesizerV2.call args.
 
     Args:
         model: Model name
         text: Text to synthesize
-        voice: Voice name (optional)
 
     Returns:
         LLMInvocation object
@@ -788,27 +785,49 @@
 
 
 def _update_invocation_from_speech_synthesis_v2_response(
-    invocation: LLMInvocation, audio_data: bytes
+    invocation: LLMInvocation,
+    audio_data: bytes,
+    mime_type: Optional[str] = None,
 ) -> None:
     """Update LLMInvocation with SpeechSynthesizerV2 response data.
 
     Args:
         invocation: LLMInvocation to update
         audio_data: Audio data bytes
+        mime_type: MIME type of audio (optional)
     """
     if audio_data:
-        # Encode audio as base64 and store in output_messages
-        audio_base64 = base64.b64encode(audio_data).decode("utf-8")
        invocation.output_messages = [
             OutputMessage(
                 role="assistant",
                 parts=[
-                    Base64Blob(
-                        mime_type="audio/mp3",  # V2 typically returns mp3
+                    Blob(
+                        mime_type=mime_type,
                         modality="audio",
-                        content=audio_base64,
+                        content=audio_data,
                     )
                 ],
                 finish_reason="stop",
             )
         ]
+
+
+def _convert_speech_format_to_mime_type(speech_format: str) -> Optional[str]:
+    """Convert from speech format to mime type.
+
+    Args:
+        speech_format: speech format of DashScope
+
+    Returns:
+        the mime type of speech
+    """
+    if speech_format == "wav":
+        return "audio/wav"
+    elif speech_format == "mp3":
+        return "audio/mpeg"
+    elif speech_format == "pcm":
+        return "audio/pcm"
+    elif speech_format == "opus":
+        return "audio/opus"
+    else:
+        return None
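As a quick illustration of the helper added above (return values taken straight from the function; the import path assumes the re-export added in `utils/__init__.py`, and `flac` is just an example of an unmapped format):

```python
# Expected mappings of _convert_speech_format_to_mime_type as added in this commit.
from opentelemetry.instrumentation.dashscope.utils import (
    _convert_speech_format_to_mime_type,
)

assert _convert_speech_format_to_mime_type("wav") == "audio/wav"
assert _convert_speech_format_to_mime_type("mp3") == "audio/mpeg"  # mp3 -> audio/mpeg, not audio/mp3
assert _convert_speech_format_to_mime_type("pcm") == "audio/pcm"
assert _convert_speech_format_to_mime_type("opus") == "audio/opus"
assert _convert_speech_format_to_mime_type("flac") is None  # unmapped formats fall back to None
```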
