
Commit 4d7ef48

Fix MIME type inference logic for speech synthesis instrumentation (#115)
1 parent 3cb78c7 commit 4d7ef48

File tree: 9 files changed, +135 additions, -114 deletions

instrumentation-loongsuite/loongsuite-instrumentation-dashscope/CHANGELOG.md

Lines changed: 5 additions & 0 deletions
@@ -12,3 +12,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
   ([#111](https://github.com/alibaba/loongsuite-python-agent/pull/111))
 - Initial implementation of DashScope instrumentation
   ([#66](https://github.com/alibaba/loongsuite-python-agent/pull/66))
+
+### Fixed
+
+- Fix MIME type inference logic for speech synthesis instrumentation
+  ([#115](https://github.com/alibaba/loongsuite-python-agent/pull/115))

instrumentation-loongsuite/loongsuite-instrumentation-dashscope/README.rst

Lines changed: 14 additions & 0 deletions
@@ -68,6 +68,20 @@ Supported APIs
   * ``ImageSynthesis.async_call`` (async task submission)
   * ``ImageSynthesis.wait`` (async task waiting)
 
+* **Speech Synthesis V1**
+
+  * ``SpeechSynthesizer.call`` (V1)
+
+* **Speech Synthesis V2**
+
+  * ``SpeechSynthesizer.call`` (V2)
+
+* **Video Synthesis**
+
+  * ``VideoSynthesis.call`` (sync)
+  * ``VideoSynthesis.async_call`` (async task submission)
+  * ``VideoSynthesis.wait`` (async task waiting)
+
 
 Captured Attributes
 --------------------
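For orientation (not part of this commit): a minimal sketch of a V1 speech-synthesis call that the wrappers in this change instrument. The import path, ``call`` signature, and ``sambert-zhichu-v1`` model name are assumptions about the DashScope SDK rather than taken from this diff, and running it requires a valid ``DASHSCOPE_API_KEY``.

```python
# Illustrative only -- not part of this commit. Assumes the DashScope V1 TTS API
# (module path, call signature, and model name are placeholders to verify against
# the SDK); needs DASHSCOPE_API_KEY set to actually run.
from dashscope.audio.tts import SpeechSynthesizer

result = SpeechSynthesizer.call(
    model="sambert-zhichu-v1",     # placeholder model name
    text="Hello from LoongSuite",
    format="mp3",                  # the wrapper maps this value to the "audio/mpeg" MIME type
)
audio_bytes = result.get_audio_data()  # raw audio; recorded on the span as an audio Blob
```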

instrumentation-loongsuite/loongsuite-instrumentation-dashscope/src/opentelemetry/instrumentation/dashscope/__init__.py

Lines changed: 0 additions & 25 deletions
@@ -60,7 +60,6 @@
     wrap_multimodal_conversation_call,
     wrap_speech_synthesis_call,
     wrap_speech_synthesis_v2_call,
-    wrap_speech_synthesis_v2_streaming_call,
     wrap_text_embedding_call,
     wrap_text_rerank_call,
     wrap_video_synthesis_async_call,
@@ -219,13 +218,6 @@ def wrap_speech_synthesis_v2_call_with_provider(
                 wrapped, instance, args, kwargs, handler=handler
             )
 
-        def wrap_speech_synthesis_v2_streaming_call_with_provider(
-            wrapped, instance, args, kwargs
-        ):
-            return wrap_speech_synthesis_v2_streaming_call(
-                wrapped, instance, args, kwargs, handler=handler
-            )
-
         # Instrument Generation.call (sync)
         try:
             wrap_function_wrapper(
@@ -379,19 +371,6 @@ def wrap_speech_synthesis_v2_streaming_call_with_provider(
                 f"Failed to instrument SpeechSynthesizer.call (V2): {e}"
             )
 
-        # Instrument SpeechSynthesizer.streaming_call (V2)
-        try:
-            wrap_function_wrapper(
-                module=_MODULE_SPEECH_SYNTHESIS_V2,
-                name="SpeechSynthesizer.streaming_call",
-                wrapper=wrap_speech_synthesis_v2_streaming_call_with_provider,
-            )
-            logger.debug("Instrumented SpeechSynthesizer.streaming_call (V2)")
-        except Exception as e:
-            logger.warning(
-                f"Failed to instrument SpeechSynthesizer.streaming_call (V2): {e}"
-            )
-
     def _uninstrument(self, **kwargs):
         """Uninstrument the DashScope SDK.
 
@@ -430,10 +409,6 @@ def _uninstrument(self, **kwargs):
             dashscope.audio.tts_v2.speech_synthesizer.SpeechSynthesizer,
             "call",
         )
-        unwrap(
-            dashscope.audio.tts_v2.speech_synthesizer.SpeechSynthesizer,
-            "streaming_call",
-        )
         unwrap(dashscope.embeddings.text_embedding.TextEmbedding, "call")
         unwrap(dashscope.rerank.text_rerank.TextReRank, "call")
 

instrumentation-loongsuite/loongsuite-instrumentation-dashscope/src/opentelemetry/instrumentation/dashscope/patch/__init__.py

Lines changed: 0 additions & 2 deletions
@@ -31,7 +31,6 @@
 from .speech_synthesis import (
     wrap_speech_synthesis_call,
     wrap_speech_synthesis_v2_call,
-    wrap_speech_synthesis_v2_streaming_call,
 )
 from .video_synthesis import (
     wrap_video_synthesis_async_call,
@@ -60,5 +59,4 @@
     # SpeechSynthesis
     "wrap_speech_synthesis_call",
     "wrap_speech_synthesis_v2_call",
-    "wrap_speech_synthesis_v2_streaming_call",
 ]

instrumentation-loongsuite/loongsuite-instrumentation-dashscope/src/opentelemetry/instrumentation/dashscope/patch/speech_synthesis.py

Lines changed: 10 additions & 69 deletions
@@ -21,6 +21,7 @@
 from opentelemetry.util.genai.types import Error
 
 from ..utils import (
+    _convert_speech_format_to_mime_type,
     _create_invocation_from_speech_synthesis,
     _create_invocation_from_speech_synthesis_v2,
     _update_invocation_from_speech_synthesis_response,
@@ -59,6 +60,9 @@ def wrap_speech_synthesis_call(wrapped, instance, args, kwargs, handler=None):
         # Create invocation object
         invocation = _create_invocation_from_speech_synthesis(kwargs, model)
 
+        speech_format = kwargs.get("format", "wav")  # default format is wav
+        mime_type = _convert_speech_format_to_mime_type(speech_format)
+
         # Start LLM invocation (creates span)
         handler.start_llm(invocation)
 
@@ -68,7 +72,7 @@ def wrap_speech_synthesis_call(wrapped, instance, args, kwargs, handler=None):
 
             # Update invocation with response data
             _update_invocation_from_speech_synthesis_response(
-                invocation, result
+                invocation, result, mime_type
             )
             handler.stop_llm(invocation)
 
@@ -115,15 +119,13 @@ def wrap_speech_synthesis_v2_call(
         model = getattr(instance, "_model", None) or getattr(
            instance, "model", "unknown"
        )
-        voice = getattr(instance, "_voice", None) or getattr(
-            instance, "voice", None
-        )
+        speech_format = getattr(instance, "aformat", "mp3")
+        mime_type = _convert_speech_format_to_mime_type(speech_format)
+
         text = args[0] if args else kwargs.get("text", "")
 
         # Create invocation object
-        invocation = _create_invocation_from_speech_synthesis_v2(
-            model, text, voice
-        )
+        invocation = _create_invocation_from_speech_synthesis_v2(model, text)
 
         # Start LLM invocation (creates span)
         handler.start_llm(invocation)
@@ -135,7 +137,7 @@ def wrap_speech_synthesis_v2_call(
             # Update invocation with response data
             if result is not None:
                 _update_invocation_from_speech_synthesis_v2_response(
-                    invocation, result
+                    invocation, result, mime_type
                 )
             handler.stop_llm(invocation)
 
@@ -151,64 +153,3 @@ def wrap_speech_synthesis_v2_call(
             "Error in speech synthesis V2 instrumentation wrapper: %s", e
         )
         return wrapped(*args, **kwargs)
-
-
-def wrap_speech_synthesis_v2_streaming_call(
-    wrapped, instance, args, kwargs, handler=None
-):
-    """Wrapper for SpeechSynthesizerV2.streaming_call.
-
-    Note: This is a streaming input method. The user calls it multiple times
-    to send text, then calls streaming_complete() to finish.
-
-    For now, we just instrument individual streaming_call() invocations.
-
-    Args:
-        wrapped: The original function being wrapped
-        instance: The SpeechSynthesizer instance
-        args: Positional arguments (text)
-        kwargs: Keyword arguments
-        handler: ExtendedTelemetryHandler instance (created during instrumentation)
-    """
-    if handler is None:
-        logger.warning("Handler not provided, skipping instrumentation")
-        return wrapped(*args, **kwargs)
-
-    try:
-        # Extract model and voice from instance
-        model = getattr(instance, "_model", None) or getattr(
-            instance, "model", "unknown"
-        )
-        voice = getattr(instance, "_voice", None) or getattr(
-            instance, "voice", None
-        )
-        text = args[0] if args else kwargs.get("text", "")
-
-        # Create invocation object
-        invocation = _create_invocation_from_speech_synthesis_v2(
-            model, text, voice
-        )
-        invocation.operation_name = "streaming_call"
-
-        # Start LLM invocation (creates span)
-        handler.start_llm(invocation)
-
-        try:
-            # Execute the wrapped call
-            result = wrapped(*args, **kwargs)
-
-            # For streaming_call, there's no immediate response
-            handler.stop_llm(invocation)
-
-            return result
-
-        except Exception as e:
-            error = Error(message=str(e), type=type(e))
-            handler.fail_llm(invocation, error)
-            raise
-
-    except Exception as e:
-        logger.exception(
-            "Error in speech synthesis V2 streaming_call wrapper: %s", e
-        )
-        return wrapped(*args, **kwargs)

instrumentation-loongsuite/loongsuite-instrumentation-dashscope/src/opentelemetry/instrumentation/dashscope/utils/__init__.py

Lines changed: 3 additions & 1 deletion
@@ -39,11 +39,12 @@
 
 # Multimodal utilities
 from .multimodal import (
+    # SpeechSynthesis
+    _convert_speech_format_to_mime_type,
     # ImageSynthesis
     _create_invocation_from_image_synthesis,
     # MultiModalConversation
     _create_invocation_from_multimodal_conversation,
-    # SpeechSynthesis
     _create_invocation_from_speech_synthesis,
     _create_invocation_from_speech_synthesis_v2,
     # VideoSynthesis
@@ -86,6 +87,7 @@
     "_update_invocation_from_video_synthesis_response",
     "_update_invocation_from_video_synthesis_async_response",
     # SpeechSynthesis
+    "_convert_speech_format_to_mime_type",
     "_create_invocation_from_speech_synthesis",
     "_update_invocation_from_speech_synthesis_response",
     "_create_invocation_from_speech_synthesis_v2",

instrumentation-loongsuite/loongsuite-instrumentation-dashscope/src/opentelemetry/instrumentation/dashscope/utils/multimodal.py

Lines changed: 35 additions & 16 deletions
@@ -23,12 +23,11 @@
 
 from __future__ import annotations
 
-import base64
 import logging
 from typing import Any, List, Optional
 
 from opentelemetry.util.genai.types import (
-    Base64Blob,
+    Blob,
     InputMessage,
     LLMInvocation,
     OutputMessage,
@@ -713,13 +712,14 @@ def _create_invocation_from_speech_synthesis(
 
 
 def _update_invocation_from_speech_synthesis_response(
-    invocation: LLMInvocation, response: Any
+    invocation: LLMInvocation, response: Any, mime_type: Optional[str] = None
 ) -> None:
     """Update LLMInvocation with SpeechSynthesizer response data.
 
     Args:
         invocation: LLMInvocation to update
         response: SpeechSynthesisResult object
+        mime_type: MIME type of audio (optional)
     """
     if not response:
         return
@@ -735,16 +735,14 @@ def _update_invocation_from_speech_synthesis_response(
     if callable(audio_data):
         audio_bytes = audio_data()
         if audio_bytes:
-            # Encode audio as base64 and store in output_messages
-            audio_base64 = base64.b64encode(audio_bytes).decode("utf-8")
             invocation.output_messages = [
                 OutputMessage(
                     role="assistant",
                     parts=[
-                        Base64Blob(
-                            mime_type="audio/wav",
+                        Blob(
+                            mime_type=mime_type,
                             modality="audio",
-                            content=audio_base64,
+                            content=audio_bytes,
                         )
                     ],
                     finish_reason="stop",
@@ -759,14 +757,13 @@
 
 
 def _create_invocation_from_speech_synthesis_v2(
-    model: str, text: str, voice: Optional[str] = None
+    model: str, text: str
 ) -> LLMInvocation:
     """Create LLMInvocation from SpeechSynthesizerV2.call args.
 
     Args:
         model: Model name
         text: Text to synthesize
-        voice: Voice name (optional)
 
     Returns:
         LLMInvocation object
@@ -788,27 +785,49 @@
 
 
 def _update_invocation_from_speech_synthesis_v2_response(
-    invocation: LLMInvocation, audio_data: bytes
+    invocation: LLMInvocation,
+    audio_data: bytes,
+    mime_type: Optional[str] = None,
 ) -> None:
     """Update LLMInvocation with SpeechSynthesizerV2 response data.
 
     Args:
         invocation: LLMInvocation to update
         audio_data: Audio data bytes
+        mime_type: MIME type of audio (optional)
     """
     if audio_data:
-        # Encode audio as base64 and store in output_messages
-        audio_base64 = base64.b64encode(audio_data).decode("utf-8")
        invocation.output_messages = [
             OutputMessage(
                 role="assistant",
                 parts=[
-                    Base64Blob(
-                        mime_type="audio/mp3",  # V2 typically returns mp3
+                    Blob(
+                        mime_type=mime_type,
                         modality="audio",
-                        content=audio_base64,
+                        content=audio_data,
                     )
                 ],
                 finish_reason="stop",
             )
         ]
+
+
+def _convert_speech_format_to_mime_type(speech_format: str) -> Optional[str]:
+    """Convert from speech format to mime type.
+
+    Args:
+        speech_format: speech format of DashScope
+
+    Returns:
+        the mime type of speech
+    """
+    if speech_format == "wav":
+        return "audio/wav"
+    elif speech_format == "mp3":
+        return "audio/mpeg"
+    elif speech_format == "pcm":
+        return "audio/pcm"
+    elif speech_format == "opus":
+        return "audio/opus"
+    else:
+        return None
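As a quick illustration of the helper added above (return values taken straight from the function; the import path assumes the re-export added in `utils/__init__.py`, and `flac` is just an example of an unmapped format):

```python
# Expected mappings of _convert_speech_format_to_mime_type as added in this commit.
from opentelemetry.instrumentation.dashscope.utils import (
    _convert_speech_format_to_mime_type,
)

assert _convert_speech_format_to_mime_type("wav") == "audio/wav"
assert _convert_speech_format_to_mime_type("mp3") == "audio/mpeg"  # mp3 -> audio/mpeg, not audio/mp3
assert _convert_speech_format_to_mime_type("pcm") == "audio/pcm"
assert _convert_speech_format_to_mime_type("opus") == "audio/opus"
assert _convert_speech_format_to_mime_type("flac") is None  # unmapped formats fall back to None
```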
