diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-dashscope/CHANGELOG.md b/instrumentation-loongsuite/loongsuite-instrumentation-dashscope/CHANGELOG.md index 9e6ec556e..d60b49ec6 100644 --- a/instrumentation-loongsuite/loongsuite-instrumentation-dashscope/CHANGELOG.md +++ b/instrumentation-loongsuite/loongsuite-instrumentation-dashscope/CHANGELOG.md @@ -12,3 +12,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ([#111](https://github.com/alibaba/loongsuite-python-agent/pull/111)) - Initial implementation of DashScope instrumentation ([#66](https://github.com/alibaba/loongsuite-python-agent/pull/66)) + +### Fixed + +- Fix MIME type inference logic for speech synthesis instrumentation + ([#115](https://github.com/alibaba/loongsuite-python-agent/pull/115)) diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-dashscope/README.rst b/instrumentation-loongsuite/loongsuite-instrumentation-dashscope/README.rst index a5739e0ab..25c7aca5a 100644 --- a/instrumentation-loongsuite/loongsuite-instrumentation-dashscope/README.rst +++ b/instrumentation-loongsuite/loongsuite-instrumentation-dashscope/README.rst @@ -68,6 +68,20 @@ Supported APIs * ``ImageSynthesis.async_call`` (async task submission) * ``ImageSynthesis.wait`` (async task waiting) +* **Speech Synthesis V1** + + * ``SpeechSynthesizer.call`` (V1) + +* **Speech Synthesis V2** + + * ``SpeechSynthesizer.call`` (V2) + +* **Video Synthesis** + + * ``VideoSynthesis.call`` (sync) + * ``VideoSynthesis.async_call`` (async task submission) + * ``VideoSynthesis.wait`` (async task waiting) + Captured Attributes -------------------- diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-dashscope/src/opentelemetry/instrumentation/dashscope/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-dashscope/src/opentelemetry/instrumentation/dashscope/__init__.py index 7d5f49e0f..2edd5f951 100644 --- a/instrumentation-loongsuite/loongsuite-instrumentation-dashscope/src/opentelemetry/instrumentation/dashscope/__init__.py +++ b/instrumentation-loongsuite/loongsuite-instrumentation-dashscope/src/opentelemetry/instrumentation/dashscope/__init__.py @@ -60,7 +60,6 @@ wrap_multimodal_conversation_call, wrap_speech_synthesis_call, wrap_speech_synthesis_v2_call, - wrap_speech_synthesis_v2_streaming_call, wrap_text_embedding_call, wrap_text_rerank_call, wrap_video_synthesis_async_call, @@ -219,13 +218,6 @@ def wrap_speech_synthesis_v2_call_with_provider( wrapped, instance, args, kwargs, handler=handler ) - def wrap_speech_synthesis_v2_streaming_call_with_provider( - wrapped, instance, args, kwargs - ): - return wrap_speech_synthesis_v2_streaming_call( - wrapped, instance, args, kwargs, handler=handler - ) - # Instrument Generation.call (sync) try: wrap_function_wrapper( @@ -379,19 +371,6 @@ def wrap_speech_synthesis_v2_streaming_call_with_provider( f"Failed to instrument SpeechSynthesizer.call (V2): {e}" ) - # Instrument SpeechSynthesizer.streaming_call (V2) - try: - wrap_function_wrapper( - module=_MODULE_SPEECH_SYNTHESIS_V2, - name="SpeechSynthesizer.streaming_call", - wrapper=wrap_speech_synthesis_v2_streaming_call_with_provider, - ) - logger.debug("Instrumented SpeechSynthesizer.streaming_call (V2)") - except Exception as e: - logger.warning( - f"Failed to instrument SpeechSynthesizer.streaming_call (V2): {e}" - ) - def _uninstrument(self, **kwargs): """Uninstrument the DashScope SDK. @@ -430,10 +409,6 @@ def _uninstrument(self, **kwargs): dashscope.audio.tts_v2.speech_synthesizer.SpeechSynthesizer, "call", ) - unwrap( - dashscope.audio.tts_v2.speech_synthesizer.SpeechSynthesizer, - "streaming_call", - ) unwrap(dashscope.embeddings.text_embedding.TextEmbedding, "call") unwrap(dashscope.rerank.text_rerank.TextReRank, "call") diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-dashscope/src/opentelemetry/instrumentation/dashscope/patch/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-dashscope/src/opentelemetry/instrumentation/dashscope/patch/__init__.py index 8a3c4afe3..9277e22a3 100644 --- a/instrumentation-loongsuite/loongsuite-instrumentation-dashscope/src/opentelemetry/instrumentation/dashscope/patch/__init__.py +++ b/instrumentation-loongsuite/loongsuite-instrumentation-dashscope/src/opentelemetry/instrumentation/dashscope/patch/__init__.py @@ -31,7 +31,6 @@ from .speech_synthesis import ( wrap_speech_synthesis_call, wrap_speech_synthesis_v2_call, - wrap_speech_synthesis_v2_streaming_call, ) from .video_synthesis import ( wrap_video_synthesis_async_call, @@ -60,5 +59,4 @@ # SpeechSynthesis "wrap_speech_synthesis_call", "wrap_speech_synthesis_v2_call", - "wrap_speech_synthesis_v2_streaming_call", ] diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-dashscope/src/opentelemetry/instrumentation/dashscope/patch/speech_synthesis.py b/instrumentation-loongsuite/loongsuite-instrumentation-dashscope/src/opentelemetry/instrumentation/dashscope/patch/speech_synthesis.py index 07e65ad7c..81fd1529b 100644 --- a/instrumentation-loongsuite/loongsuite-instrumentation-dashscope/src/opentelemetry/instrumentation/dashscope/patch/speech_synthesis.py +++ b/instrumentation-loongsuite/loongsuite-instrumentation-dashscope/src/opentelemetry/instrumentation/dashscope/patch/speech_synthesis.py @@ -21,6 +21,7 @@ from opentelemetry.util.genai.types import Error from ..utils import ( + _convert_speech_format_to_mime_type, _create_invocation_from_speech_synthesis, _create_invocation_from_speech_synthesis_v2, _update_invocation_from_speech_synthesis_response, @@ -59,6 +60,9 @@ def wrap_speech_synthesis_call(wrapped, instance, args, kwargs, handler=None): # Create invocation object invocation = _create_invocation_from_speech_synthesis(kwargs, model) + speech_format = kwargs.get("format", "wav") # default format is wav + mime_type = _convert_speech_format_to_mime_type(speech_format) + # Start LLM invocation (creates span) handler.start_llm(invocation) @@ -68,7 +72,7 @@ def wrap_speech_synthesis_call(wrapped, instance, args, kwargs, handler=None): # Update invocation with response data _update_invocation_from_speech_synthesis_response( - invocation, result + invocation, result, mime_type ) handler.stop_llm(invocation) @@ -115,15 +119,13 @@ def wrap_speech_synthesis_v2_call( model = getattr(instance, "_model", None) or getattr( instance, "model", "unknown" ) - voice = getattr(instance, "_voice", None) or getattr( - instance, "voice", None - ) + speech_format = getattr(instance, "aformat", "mp3") + mime_type = _convert_speech_format_to_mime_type(speech_format) + text = args[0] if args else kwargs.get("text", "") # Create invocation object - invocation = _create_invocation_from_speech_synthesis_v2( - model, text, voice - ) + invocation = _create_invocation_from_speech_synthesis_v2(model, text) # Start LLM invocation (creates span) handler.start_llm(invocation) @@ -135,7 +137,7 @@ def wrap_speech_synthesis_v2_call( # Update invocation with response data if result is not None: _update_invocation_from_speech_synthesis_v2_response( - invocation, result + invocation, result, mime_type ) handler.stop_llm(invocation) @@ -151,64 +153,3 @@ def wrap_speech_synthesis_v2_call( "Error in speech synthesis V2 instrumentation wrapper: %s", e ) return wrapped(*args, **kwargs) - - -def wrap_speech_synthesis_v2_streaming_call( - wrapped, instance, args, kwargs, handler=None -): - """Wrapper for SpeechSynthesizerV2.streaming_call. - - Note: This is a streaming input method. The user calls it multiple times - to send text, then calls streaming_complete() to finish. - - For now, we just instrument individual streaming_call() invocations. - - Args: - wrapped: The original function being wrapped - instance: The SpeechSynthesizer instance - args: Positional arguments (text) - kwargs: Keyword arguments - handler: ExtendedTelemetryHandler instance (created during instrumentation) - """ - if handler is None: - logger.warning("Handler not provided, skipping instrumentation") - return wrapped(*args, **kwargs) - - try: - # Extract model and voice from instance - model = getattr(instance, "_model", None) or getattr( - instance, "model", "unknown" - ) - voice = getattr(instance, "_voice", None) or getattr( - instance, "voice", None - ) - text = args[0] if args else kwargs.get("text", "") - - # Create invocation object - invocation = _create_invocation_from_speech_synthesis_v2( - model, text, voice - ) - invocation.operation_name = "streaming_call" - - # Start LLM invocation (creates span) - handler.start_llm(invocation) - - try: - # Execute the wrapped call - result = wrapped(*args, **kwargs) - - # For streaming_call, there's no immediate response - handler.stop_llm(invocation) - - return result - - except Exception as e: - error = Error(message=str(e), type=type(e)) - handler.fail_llm(invocation, error) - raise - - except Exception as e: - logger.exception( - "Error in speech synthesis V2 streaming_call wrapper: %s", e - ) - return wrapped(*args, **kwargs) diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-dashscope/src/opentelemetry/instrumentation/dashscope/utils/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-dashscope/src/opentelemetry/instrumentation/dashscope/utils/__init__.py index 1da571eb5..95dcf56c2 100644 --- a/instrumentation-loongsuite/loongsuite-instrumentation-dashscope/src/opentelemetry/instrumentation/dashscope/utils/__init__.py +++ b/instrumentation-loongsuite/loongsuite-instrumentation-dashscope/src/opentelemetry/instrumentation/dashscope/utils/__init__.py @@ -39,11 +39,12 @@ # Multimodal utilities from .multimodal import ( + # SpeechSynthesis + _convert_speech_format_to_mime_type, # ImageSynthesis _create_invocation_from_image_synthesis, # MultiModalConversation _create_invocation_from_multimodal_conversation, - # SpeechSynthesis _create_invocation_from_speech_synthesis, _create_invocation_from_speech_synthesis_v2, # VideoSynthesis @@ -86,6 +87,7 @@ "_update_invocation_from_video_synthesis_response", "_update_invocation_from_video_synthesis_async_response", # SpeechSynthesis + "_convert_speech_format_to_mime_type", "_create_invocation_from_speech_synthesis", "_update_invocation_from_speech_synthesis_response", "_create_invocation_from_speech_synthesis_v2", diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-dashscope/src/opentelemetry/instrumentation/dashscope/utils/multimodal.py b/instrumentation-loongsuite/loongsuite-instrumentation-dashscope/src/opentelemetry/instrumentation/dashscope/utils/multimodal.py index 2975a7440..88f2ea3f2 100644 --- a/instrumentation-loongsuite/loongsuite-instrumentation-dashscope/src/opentelemetry/instrumentation/dashscope/utils/multimodal.py +++ b/instrumentation-loongsuite/loongsuite-instrumentation-dashscope/src/opentelemetry/instrumentation/dashscope/utils/multimodal.py @@ -23,12 +23,11 @@ from __future__ import annotations -import base64 import logging from typing import Any, List, Optional from opentelemetry.util.genai.types import ( - Base64Blob, + Blob, InputMessage, LLMInvocation, OutputMessage, @@ -713,13 +712,14 @@ def _create_invocation_from_speech_synthesis( def _update_invocation_from_speech_synthesis_response( - invocation: LLMInvocation, response: Any + invocation: LLMInvocation, response: Any, mime_type: Optional[str] = None ) -> None: """Update LLMInvocation with SpeechSynthesizer response data. Args: invocation: LLMInvocation to update response: SpeechSynthesisResult object + mime_type: MIME type of audio (optional) """ if not response: return @@ -735,16 +735,14 @@ def _update_invocation_from_speech_synthesis_response( if callable(audio_data): audio_bytes = audio_data() if audio_bytes: - # Encode audio as base64 and store in output_messages - audio_base64 = base64.b64encode(audio_bytes).decode("utf-8") invocation.output_messages = [ OutputMessage( role="assistant", parts=[ - Base64Blob( - mime_type="audio/wav", + Blob( + mime_type=mime_type, modality="audio", - content=audio_base64, + content=audio_bytes, ) ], finish_reason="stop", @@ -759,14 +757,13 @@ def _update_invocation_from_speech_synthesis_response( def _create_invocation_from_speech_synthesis_v2( - model: str, text: str, voice: Optional[str] = None + model: str, text: str ) -> LLMInvocation: """Create LLMInvocation from SpeechSynthesizerV2.call args. Args: model: Model name text: Text to synthesize - voice: Voice name (optional) Returns: LLMInvocation object @@ -788,27 +785,49 @@ def _create_invocation_from_speech_synthesis_v2( def _update_invocation_from_speech_synthesis_v2_response( - invocation: LLMInvocation, audio_data: bytes + invocation: LLMInvocation, + audio_data: bytes, + mime_type: Optional[str] = None, ) -> None: """Update LLMInvocation with SpeechSynthesizerV2 response data. Args: invocation: LLMInvocation to update audio_data: Audio data bytes + mime_type: MIME type of audio (optional) """ if audio_data: - # Encode audio as base64 and store in output_messages - audio_base64 = base64.b64encode(audio_data).decode("utf-8") invocation.output_messages = [ OutputMessage( role="assistant", parts=[ - Base64Blob( - mime_type="audio/mp3", # V2 typically returns mp3 + Blob( + mime_type=mime_type, modality="audio", - content=audio_base64, + content=audio_data, ) ], finish_reason="stop", ) ] + + +def _convert_speech_format_to_mime_type(speech_format: str) -> Optional[str]: + """Convert from speech format to mime type. + + Args: + speech_format: speech format of DashScope + + Returns: + the mime type of speech + """ + if speech_format == "wav": + return "audio/wav" + elif speech_format == "mp3": + return "audio/mpeg" + elif speech_format == "pcm": + return "audio/pcm" + elif speech_format == "opus": + return "audio/opus" + else: + return None diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-dashscope/tests/test_speech_synthesis.py b/instrumentation-loongsuite/loongsuite-instrumentation-dashscope/tests/test_speech_synthesis.py index 6f97d0baa..4a1c2e2a4 100644 --- a/instrumentation-loongsuite/loongsuite-instrumentation-dashscope/tests/test_speech_synthesis.py +++ b/instrumentation-loongsuite/loongsuite-instrumentation-dashscope/tests/test_speech_synthesis.py @@ -21,6 +21,9 @@ from dashscope.audio.tts import SpeechSynthesizer from dashscope.audio.tts_v2 import SpeechSynthesizer as SpeechSynthesizerV2 +from opentelemetry.instrumentation.dashscope.utils.multimodal import ( + _convert_speech_format_to_mime_type, +) from opentelemetry.semconv._incubating.attributes import ( gen_ai_attributes as GenAIAttributes, ) @@ -306,7 +309,10 @@ def on_close(self): self.events.append("close") -@skip_without_api_key +@pytest.mark.skip( + "Streaming call functionality for SpeechSynthesizer V2 is currently not supported " + "by the instrumentation test infrastructure (WebSocket-based streaming is disabled)." +) def test_speech_synthesis_v2_streaming_call_basic( instrument_with_content, span_exporter ): @@ -366,3 +372,63 @@ def test_speech_synthesis_v2_streaming_call_basic( print( "✓ SpeechSynthesizer V2 streaming_call (basic) completed successfully" ) + + +# ============================================================================ +# Unit tests for _convert_speech_format_to_mime_type +# ============================================================================ + + +def test_convert_speech_format_to_mime_type_wav(): + """Test _convert_speech_format_to_mime_type with wav format.""" + result = _convert_speech_format_to_mime_type("wav") + assert result == "audio/wav" + + +def test_convert_speech_format_to_mime_type_mp3(): + """Test _convert_speech_format_to_mime_type with mp3 format.""" + result = _convert_speech_format_to_mime_type("mp3") + assert result == "audio/mpeg" + + +def test_convert_speech_format_to_mime_type_pcm(): + """Test _convert_speech_format_to_mime_type with pcm format.""" + result = _convert_speech_format_to_mime_type("pcm") + assert result == "audio/pcm" + + +def test_convert_speech_format_to_mime_type_opus(): + """Test _convert_speech_format_to_mime_type with opus format.""" + result = _convert_speech_format_to_mime_type("opus") + assert result == "audio/opus" + + +def test_convert_speech_format_to_mime_type_unknown(): + """Test _convert_speech_format_to_mime_type with unknown format.""" + result = _convert_speech_format_to_mime_type("unknown") + assert result is None + + +def test_convert_speech_format_to_mime_type_empty_string(): + """Test _convert_speech_format_to_mime_type with empty string.""" + result = _convert_speech_format_to_mime_type("") + assert result is None + + +def test_convert_speech_format_to_mime_type_case_sensitive(): + """Test _convert_speech_format_to_mime_type is case sensitive.""" + # The function should be case sensitive, so uppercase should return None + result = _convert_speech_format_to_mime_type("WAV") + assert result is None + + result = _convert_speech_format_to_mime_type("MP3") + assert result is None + + +def test_convert_speech_format_to_mime_type_other_formats(): + """Test _convert_speech_format_to_mime_type with other common audio formats.""" + # Test some other common audio formats that are not supported + assert _convert_speech_format_to_mime_type("aac") is None + assert _convert_speech_format_to_mime_type("flac") is None + assert _convert_speech_format_to_mime_type("ogg") is None + assert _convert_speech_format_to_mime_type("m4a") is None diff --git a/util/opentelemetry-util-genai/src/opentelemetry/util/genai/span_utils.py b/util/opentelemetry-util-genai/src/opentelemetry/util/genai/span_utils.py index 6c948aabd..bae4196ab 100644 --- a/util/opentelemetry-util-genai/src/opentelemetry/util/genai/span_utils.py +++ b/util/opentelemetry-util-genai/src/opentelemetry/util/genai/span_utils.py @@ -374,6 +374,7 @@ def _get_llm_response_attributes( # pylint: disable=too-many-branches if ( invocation.monotonic_first_token_s is not None and invocation.monotonic_start_s is not None + and invocation.monotonic_first_token_s >= invocation.monotonic_start_s ): ttft_ns = int( (invocation.monotonic_first_token_s - invocation.monotonic_start_s)