Skip to content

Commit 15da461

Browse files
authored
updates to binary data tracing (#43854)
* updates to binary data tracing * code formatter changes * pylint fix * updated readme based on review feedback
1 parent dfa92c7 commit 15da461

File tree

4 files changed

+3123
-19
lines changed

4 files changed

+3123
-19
lines changed

sdk/ai/azure-ai-projects/README.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -418,6 +418,15 @@ To enable content recording, set the `OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE
418418

419419
The AI Projects client library automatically instruments OpenAI responses and conversations operations through `AiProjectInstrumentation`. You can disable this instrumentation by setting the environment variable `AZURE_TRACING_GEN_AI_INSTRUMENT_RESPONSES_API` to `false`. If the environment variable is not set, the responses and conversations APIs will be instrumented by default.
420420

421+
### Tracing Binary Data
422+
423+
Binary data refers to images and files sent to the service as part of input messages. When content recording is enabled (`OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT` set to `true`), only file IDs and filenames are traced by default. To also trace the binary data itself, set `AZURE_TRACING_GEN_AI_INCLUDE_BINARY_DATA` to `true`; this setting only takes effect when content recording is also enabled. In this case:
424+
425+
- **Images**: Image URLs (including data URIs with base64-encoded content) are included
426+
- **Files**: In addition to the filename and file ID, the file data itself is included when it is sent inline in the request
427+
428+
**Important:** Binary data can contain sensitive information and may significantly increase trace size. Some trace backends and tracing implementations limit the maximum size of the trace data they can send or store. Before enabling binary data tracing, ensure that your observability backend and tracing implementation support the expected trace payload sizes.
429+
421430
### Additional resources
422431

423432
For more information see:

sdk/ai/azure-ai-projects/azure/ai/projects/telemetry/_responses_instrumentor.py

Lines changed: 187 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@
5252

5353
_responses_traces_enabled: bool = False
5454
_trace_responses_content: bool = False
55+
_trace_binary_data: bool = False
5556

5657
# Azure OpenAI system identifier for traces
5758
AZURE_OPENAI_SYSTEM = "azure.openai"
@@ -132,6 +133,14 @@ def is_content_recording_enabled(self) -> bool:
132133
"""
133134
return self._impl.is_content_recording_enabled()
134135

136+
def is_binary_data_enabled(self) -> bool:
    """Report whether binary data tracing is currently enabled.

    The value is obtained from the underlying implementation object.

    :return: A bool value indicating whether binary data tracing is enabled.
    :rtype: bool
    """
    binary_data_enabled: bool = self._impl.is_binary_data_enabled()
    return binary_data_enabled
143+
135144

136145
class _ResponsesInstrumentorPreview: # pylint: disable=too-many-instance-attributes,too-many-statements,too-many-public-methods
137146
"""
@@ -389,10 +398,14 @@ def instrument(self, enable_content_recording: Optional[bool] = None):
389398
os.environ.get("OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT", "false")
390399
)
391400

401+
# Check if binary data tracing is enabled
402+
enable_binary_data = self._str_to_bool(os.environ.get("AZURE_TRACING_GEN_AI_INCLUDE_BINARY_DATA", "false"))
403+
392404
if not self.is_instrumented():
393-
self._instrument_responses(enable_content_recording)
405+
self._instrument_responses(enable_content_recording, enable_binary_data)
394406
else:
395407
self.set_enable_content_recording(enable_content_recording)
408+
self.set_enable_binary_data(enable_binary_data)
396409

397410
def uninstrument(self):
398411
"""
@@ -431,6 +444,23 @@ def is_content_recording_enabled(self) -> bool:
431444
"""
432445
return self._is_content_recording_enabled()
433446

447+
def set_enable_binary_data(self, enable_binary_data: bool = False) -> None:
    """Turn tracing of binary data on or off.

    :param enable_binary_data: Indicates whether tracing of binary data (such as images) should be enabled.
        This only takes effect when content recording is also enabled.
    :type enable_binary_data: bool
    """
    requested_state = enable_binary_data
    # The private setter owns the actual storage of the flag.
    self._set_enable_binary_data(enable_binary_data=requested_state)
455+
456+
def is_binary_data_enabled(self) -> bool:
    """Report whether binary data tracing is currently enabled.

    The value is read through the private accessor ``_is_binary_data_enabled``.

    :return: A bool value indicating whether binary data tracing is enabled.
    :rtype: bool
    """
    binary_data_enabled: bool = self._is_binary_data_enabled()
    return binary_data_enabled
463+
434464
def _set_attributes(self, span: "AbstractSpan", *attrs: Tuple[str, Any]) -> None:
435465
for attr in attrs:
436466
span.add_attribute(attr[0], attr[1])
@@ -474,7 +504,8 @@ def _add_message_event(
474504
event_body: Dict[str, Any] = {}
475505

476506
if _trace_responses_content and content:
477-
event_body["text"] = content
507+
# Use consistent structured format with content array
508+
event_body["content"] = [{"type": "text", "text": content}]
478509

479510
attributes = self._create_event_attributes(
480511
conversation_id=conversation_id,
@@ -560,6 +591,138 @@ def _add_tool_message_events(
560591
# Use "tool" for the event name: gen_ai.tool.message
561592
span.span_instance.add_event(name="gen_ai.tool.message", attributes=attributes)
562593

594+
# pylint: disable=too-many-branches
595+
def _add_structured_input_events(
    self,
    span: "AbstractSpan",
    input_list: List[Any],
    conversation_id: Optional[str] = None,
) -> None:
    """Add one ``gen_ai.<role>.message`` event per item of a structured (list) input.

    This handles cases like messages with images, multi-part content, etc.
    Items may be dicts or objects; items without content are skipped.

    The event body is only populated when content recording is enabled. In that
    case it carries a ``content`` list of parts: text parts, image parts and file
    parts. Image URLs and file data are only included when binary data tracing
    is enabled as well.

    :param span: The span the events are added to.
    :type span: AbstractSpan
    :param input_list: The structured input items to record.
    :type input_list: List[Any]
    :param conversation_id: Optional conversation ID passed to the event attributes.
    :type conversation_id: Optional[str]
    """
    for input_item in input_list:
        try:
            # Extract role and content - handle both dict and object
            if isinstance(input_item, dict):
                role = input_item.get("role")
                content = input_item.get("content")
            else:
                role = getattr(input_item, "role", None)
                content = getattr(input_item, "content", None)

            # A role that is missing or explicitly None falls back to "user" so the
            # event name never becomes "gen_ai.None.message".
            role = role or "user"

            if not content:
                continue

            event_body: Dict[str, Any] = {}

            # Only process content if content recording is enabled
            if _trace_responses_content:
                content_parts: List[Dict[str, Any]] = []
                if isinstance(content, str):
                    # A message inside a list input may carry plain string content;
                    # record it in the same structured format as multi-part content.
                    content_parts.append({"type": "text", "text": content})
                elif isinstance(content, list):
                    for content_item in content:
                        part = self._build_structured_content_part(content_item)
                        if part is not None:
                            content_parts.append(part)

                # Only add content if we have content parts
                if content_parts:
                    event_body["content"] = content_parts

            attributes = self._create_event_attributes(
                conversation_id=conversation_id,
                message_role=role,
            )
            attributes[GEN_AI_EVENT_CONTENT] = json.dumps(event_body, ensure_ascii=False)

            span.span_instance.add_event(name=f"gen_ai.{role}.message", attributes=attributes)

        except Exception:  # pylint: disable=broad-exception-caught
            # Skip items that can't be processed. Only the item's type is logged:
            # the item itself may hold message text or binary data that must not
            # reach the logs when content recording is disabled.
            logger.debug(
                "Failed to process structured input item of type %s",
                type(input_item).__name__,
                exc_info=True,
            )
            continue

@staticmethod
def _build_structured_content_part(content_item: Any) -> Optional[Dict[str, Any]]:
    """Convert one input content item (dict or object) into a trace content part.

    :param content_item: A content item given as a dict or as an object with a ``type`` attribute.
    :type content_item: Any
    :return: The content part to record, or None when the item has no usable type or text.
    :rtype: Optional[Dict[str, Any]]
    """
    is_dict = isinstance(content_item, dict)
    if not is_dict and not hasattr(content_item, "type"):
        return None

    def get_field(name: str) -> Any:
        # Uniform access for both supported item shapes.
        if is_dict:
            return content_item.get(name)
        return getattr(content_item, name, None)

    content_type = get_field("type")

    if content_type in ("input_text", "text"):
        text = get_field("text")
        return {"type": "text", "text": text} if text else None

    if content_type == "input_image":
        image_part: Dict[str, Any] = {"type": "image"}
        # Include image data only if binary data tracing is enabled
        if _trace_binary_data:
            image_url = get_field("image_url")
            if image_url:
                image_part["image_url"] = image_url
        return image_part

    if content_type == "input_file":
        file_part: Dict[str, Any] = {"type": "file"}
        # filename and file_id are identifiers, not binary data, so they are
        # recorded whenever content recording is enabled.
        for key in ("filename", "file_id"):
            value = get_field(key)
            if value:
                file_part[key] = value
        # Only include file_data if binary data tracing is enabled
        if _trace_binary_data:
            file_data = get_field("file_data")
            if file_data:
                file_part["file_data"] = file_data
        return file_part

    if content_type:
        # Other content types (audio, video, etc.): record the type only
        return {"type": content_type}

    return None
725+
563726
def _emit_tool_call_event(
564727
self,
565728
span: "AbstractSpan",
@@ -868,6 +1031,14 @@ def start_responses_span(
8681031
content=input_text,
8691032
conversation_id=conversation_id,
8701033
)
1034+
elif isinstance(input_to_check, list) and not has_tool_outputs:
1035+
# Handle structured input (list format) - extract text content from user messages
1036+
# This handles cases like image inputs with text prompts
1037+
self._add_structured_input_events(
1038+
span,
1039+
input_list=input_to_check,
1040+
conversation_id=conversation_id,
1041+
)
8711042

8721043
return span
8731044

@@ -2982,7 +3153,7 @@ def _available_responses_apis_and_injectors(self):
29823153
"""
29833154
yield from self._generate_api_and_injector(self._all_api_list())
29843155

2985-
def _instrument_responses(self, enable_content_tracing: bool = False):
3156+
def _instrument_responses(self, enable_content_tracing: bool = False, enable_binary_data: bool = False):
29863157
"""This function modifies the methods of the Responses API classes to
29873158
inject logic before calling the original methods.
29883159
The original methods are stored as _original attributes of the methods.
@@ -2991,15 +3162,20 @@ def _instrument_responses(self, enable_content_tracing: bool = False):
29913162
This also controls whether function call tool function names,
29923163
parameter names and parameter values are traced.
29933164
:type enable_content_tracing: bool
3165+
:param enable_binary_data: Indicates whether tracing of binary data (such as images) should be enabled.
3166+
This only takes effect when content recording is also enabled.
3167+
:type enable_binary_data: bool
29943168
"""
29953169
# pylint: disable=W0603
29963170
global _responses_traces_enabled
29973171
global _trace_responses_content
3172+
global _trace_binary_data
29983173
if _responses_traces_enabled:
29993174
return
30003175

30013176
_responses_traces_enabled = True
30023177
_trace_responses_content = enable_content_tracing
3178+
_trace_binary_data = enable_binary_data
30033179

30043180
# Initialize metrics instruments
30053181
self._initialize_metrics()
@@ -3050,6 +3226,14 @@ def _is_content_recording_enabled(self) -> bool:
30503226
global _trace_responses_content
30513227
return _trace_responses_content
30523228

3229+
def _set_enable_binary_data(self, enable_binary_data: bool = False) -> None:
    """Store the binary data tracing flag in the module-level ``_trace_binary_data``.

    :param enable_binary_data: The new value of the flag.
    :type enable_binary_data: bool
    """
    # The flag is a module global rather than instance state, so it must be
    # declared global before it is rebound.
    global _trace_binary_data
    _trace_binary_data = enable_binary_data
3232+
3233+
def _is_binary_data_enabled(self) -> bool:
    """Return the current value of the module-level ``_trace_binary_data`` flag.

    :return: The stored binary data tracing flag.
    :rtype: bool
    """
    # Reading a module global needs no ``global`` declaration.
    return _trace_binary_data
3236+
30533237
def record_error(self, span, exc):
30543238
# pyright: ignore [reportPossiblyUnboundVariable]
30553239
span.span_instance.set_status(StatusCode.ERROR, str(exc))

0 commit comments

Comments
 (0)