Skip to content

Commit 706e1d6

Browse files
committed
updates to binary data tracing
1 parent 17c996f commit 706e1d6

File tree

4 files changed

+3286
-19
lines changed

4 files changed

+3286
-19
lines changed

sdk/ai/azure-ai-projects/README.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -418,6 +418,20 @@ To enable content recording, set the `OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE
418418

419419
The AI Projects client library automatically instruments OpenAI responses and conversations operations through `AiProjectInstrumentation`. You can disable this instrumentation by setting the environment variable `AZURE_TRACING_GEN_AI_INSTRUMENT_RESPONSES_API` to `false`. If the environment variable is not set, the responses and conversations APIs will be instrumented by default.
420420

421+
### Tracing Binary Data
422+
423+
By default, binary data such as images and files included in the input is not captured in traces. To include binary data in traces, set the environment variable `AZURE_TRACING_GEN_AI_INCLUDE_BINARY_DATA` to `true`. If the environment variable is not set, binary data is not included.
424+
425+
Binary data tracing works in combination with content recording:
426+
- **When only content recording is enabled**: File IDs and filenames are included in traces, but image URLs and file data are omitted.
427+
- **When both content recording and binary data tracing are enabled**:
428+
- **Images**: Image URLs (including data URIs with base64-encoded content) are included
429+
- **Files**: File data is included if sent via the API
430+
431+
**Note:** Binary data tracing requires content recording to be enabled (see [Enabling content recording](#enabling-content-recording)). If content recording is disabled, binary data will not be included regardless of the `AZURE_TRACING_GEN_AI_INCLUDE_BINARY_DATA` setting.
432+
433+
**Important:** Binary data can contain sensitive information and may significantly increase trace size. Some trace backends and tracing implementations limit the maximum size of trace data that can be sent or stored. Ensure your observability backend and tracing implementation support the expected trace payload sizes when enabling binary data tracing.
434+
421435
### Additional resources
422436

423437
For more information see:

sdk/ai/azure-ai-projects/azure/ai/projects/telemetry/_responses_instrumentor.py

Lines changed: 188 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@
5252

5353
_responses_traces_enabled: bool = False
5454
_trace_responses_content: bool = False
55+
_trace_binary_data: bool = False
5556

5657
# Azure OpenAI system identifier for traces
5758
AZURE_OPENAI_SYSTEM = "azure.openai"
@@ -132,6 +133,14 @@ def is_content_recording_enabled(self) -> bool:
132133
"""
133134
return self._impl.is_content_recording_enabled()
134135

136+
def is_binary_data_enabled(self) -> bool:
    """Report whether binary data tracing is currently switched on.

    The query is forwarded to the wrapped implementation object.

    :return: The binary data tracing flag reported by the implementation.
    :rtype: bool
    """
    enabled = self._impl.is_binary_data_enabled()
    return enabled
143+
135144

136145
class _ResponsesInstrumentorPreview: # pylint: disable=too-many-instance-attributes,too-many-statements,too-many-public-methods
137146
"""
@@ -389,10 +398,16 @@ def instrument(self, enable_content_recording: Optional[bool] = None):
389398
os.environ.get("OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT", "false")
390399
)
391400

401+
# Check if binary data tracing is enabled
402+
enable_binary_data = self._str_to_bool(
403+
os.environ.get("AZURE_TRACING_GEN_AI_INCLUDE_BINARY_DATA", "false")
404+
)
405+
392406
if not self.is_instrumented():
393-
self._instrument_responses(enable_content_recording)
407+
self._instrument_responses(enable_content_recording, enable_binary_data)
394408
else:
395409
self.set_enable_content_recording(enable_content_recording)
410+
self.set_enable_binary_data(enable_binary_data)
396411

397412
def uninstrument(self):
398413
"""
@@ -431,6 +446,23 @@ def is_content_recording_enabled(self) -> bool:
431446
"""
432447
return self._is_content_recording_enabled()
433448

449+
def set_enable_binary_data(self, enable_binary_data: bool = False) -> None:
    """Switch tracing of binary data on or off.

    :param enable_binary_data: Whether binary data (such as images) should be traced.
        The setting only has an effect while content recording is enabled as well.
    :type enable_binary_data: bool
    """
    # Forward by keyword so the call stays aligned with the private setter's signature.
    self._set_enable_binary_data(enable_binary_data=enable_binary_data)
457+
458+
def is_binary_data_enabled(self) -> bool:
    """Report whether binary data tracing is currently switched on.

    :return: The current binary data tracing flag.
    :rtype: bool
    """
    enabled = self._is_binary_data_enabled()
    return enabled
465+
434466
def _set_attributes(self, span: "AbstractSpan", *attrs: Tuple[str, Any]) -> None:
435467
for attr in attrs:
436468
span.add_attribute(attr[0], attr[1])
@@ -474,7 +506,8 @@ def _add_message_event(
474506
event_body: Dict[str, Any] = {}
475507

476508
if _trace_responses_content and content:
477-
event_body["text"] = content
509+
# Use consistent structured format with content array
510+
event_body["content"] = [{"type": "text", "text": content}]
478511

479512
attributes = self._create_event_attributes(
480513
conversation_id=conversation_id,
@@ -560,6 +593,137 @@ def _add_tool_message_events(
560593
# Use "tool" for the event name: gen_ai.tool.message
561594
span.span_instance.add_event(name="gen_ai.tool.message", attributes=attributes)
562595

596+
def _add_structured_input_events(
    self,
    span: "AbstractSpan",
    input_list: List[Any],
    conversation_id: Optional[str] = None,
) -> None:
    """
    Add one ``gen_ai.<role>.message`` event per item of a structured (list-format) input.

    This handles cases like messages with images, multi-part content, etc.
    Items may be plain dicts or objects exposing ``role`` / ``content`` attributes;
    items without content are skipped.

    What ends up in the event body depends on two module-level flags:

    * content recording off: the event is emitted with an empty body;
    * content recording on: text parts, image/file placeholders, file names and file IDs
      are recorded;
    * content recording and binary data tracing on: ``image_url`` and ``file_data``
      are recorded as well.

    :param span: The span the events are added to.
    :type span: AbstractSpan
    :param input_list: The structured input items to convert into events.
    :type input_list: List[Any]
    :param conversation_id: Optional conversation ID forwarded to the event attributes.
    :type conversation_id: Optional[str]
    """

    def _field(item: Any, name: str, default: Any = None) -> Any:
        # Input items and content parts arrive either as dicts or as objects;
        # read both through one accessor so the two shapes share the logic below.
        if isinstance(item, dict):
            return item.get(name, default)
        return getattr(item, name, default)

    def _to_content_part(content_item: Any) -> Optional[Dict[str, Any]]:
        # Translate one input content item into its traced representation,
        # or return None when there is nothing to record for it.
        content_type = _field(content_item, "type")
        if not content_type:
            return None

        if content_type in ("input_text", "text"):
            text = _field(content_item, "text")
            return {"type": "text", "text": text} if text else None

        if content_type == "input_image":
            image_part: Dict[str, Any] = {"type": "image"}
            # The image payload itself is only included when binary data tracing is enabled.
            if _trace_binary_data:
                image_url = _field(content_item, "image_url")
                if image_url:
                    image_part["image_url"] = image_url
            return image_part

        if content_type == "input_file":
            file_part: Dict[str, Any] = {"type": "file"}
            # File name and ID are identifiers, not payload: content recording suffices.
            for key in ("filename", "file_id"):
                value = _field(content_item, key)
                if value:
                    file_part[key] = value
            # The file payload itself is only included when binary data tracing is enabled.
            if _trace_binary_data:
                file_data = _field(content_item, "file_data")
                if file_data:
                    file_part["file_data"] = file_data
            return file_part

        # Other content types (audio, video, etc.): record the type only.
        return {"type": content_type}

    for input_item in input_list:
        try:
            # Fall back to "user" for a missing or empty role so the event name stays valid.
            role = _field(input_item, "role", "user") or "user"
            content = _field(input_item, "content")

            if not content:
                continue

            event_body: Dict[str, Any] = {}

            # Only process content if content recording is enabled
            if _trace_responses_content:
                content_parts: List[Dict[str, Any]] = []
                if isinstance(content, str):
                    # A message may carry its content as a plain string instead of a
                    # list of parts; record it in the same structured text format.
                    content_parts.append({"type": "text", "text": content})
                elif isinstance(content, list):
                    for content_item in content:
                        part = _to_content_part(content_item)
                        if part is not None:
                            content_parts.append(part)

                # Only add content if we have content parts
                if content_parts:
                    event_body["content"] = content_parts

            attributes = self._create_event_attributes(
                conversation_id=conversation_id,
                message_role=role,
            )
            attributes[GEN_AI_EVENT_CONTENT] = json.dumps(event_body, ensure_ascii=False)

            span.span_instance.add_event(name=f"gen_ai.{role}.message", attributes=attributes)

        except Exception:  # pylint: disable=broad-exception-caught
            # Tracing is best-effort: skip items that can't be processed. Log only the
            # item's type, since the item itself may hold message text or binary payloads
            # that must not leak into logs when recording is disabled.
            logger.debug(
                "Failed to process structured input item of type %s",
                type(input_item).__name__,
                exc_info=True,
            )
            continue
726+
563727
def _emit_tool_call_event(
564728
self,
565729
span: "AbstractSpan",
@@ -868,6 +1032,14 @@ def start_responses_span(
8681032
content=input_text,
8691033
conversation_id=conversation_id,
8701034
)
1035+
elif isinstance(input_to_check, list) and not has_tool_outputs:
1036+
# Handle structured input (list format) - extract text content from user messages
1037+
# This handles cases like image inputs with text prompts
1038+
self._add_structured_input_events(
1039+
span,
1040+
input_list=input_to_check,
1041+
conversation_id=conversation_id,
1042+
)
8711043

8721044
return span
8731045

@@ -2982,7 +3154,7 @@ def _available_responses_apis_and_injectors(self):
29823154
"""
29833155
yield from self._generate_api_and_injector(self._all_api_list())
29843156

2985-
def _instrument_responses(self, enable_content_tracing: bool = False):
3157+
def _instrument_responses(self, enable_content_tracing: bool = False, enable_binary_data: bool = False):
29863158
"""This function modifies the methods of the Responses API classes to
29873159
inject logic before calling the original methods.
29883160
The original methods are stored as _original attributes of the methods.
@@ -2991,15 +3163,20 @@ def _instrument_responses(self, enable_content_tracing: bool = False):
29913163
This also controls whether function call tool function names,
29923164
parameter names and parameter values are traced.
29933165
:type enable_content_tracing: bool
3166+
:param enable_binary_data: Indicates whether tracing of binary data (such as images) should be enabled.
3167+
This only takes effect when content recording is also enabled.
3168+
:type enable_binary_data: bool
29943169
"""
29953170
# pylint: disable=W0603
29963171
global _responses_traces_enabled
29973172
global _trace_responses_content
3173+
global _trace_binary_data
29983174
if _responses_traces_enabled:
29993175
return
30003176

30013177
_responses_traces_enabled = True
30023178
_trace_responses_content = enable_content_tracing
3179+
_trace_binary_data = enable_binary_data
30033180

30043181
# Initialize metrics instruments
30053182
self._initialize_metrics()
@@ -3050,6 +3227,14 @@ def _is_content_recording_enabled(self) -> bool:
30503227
global _trace_responses_content
30513228
return _trace_responses_content
30523229

3230+
def _set_enable_binary_data(self, enable_binary_data: bool = False) -> None:
    """Store the binary data tracing flag in the module-level ``_trace_binary_data``.

    :param enable_binary_data: New value of the flag.
    :type enable_binary_data: bool
    """
    # The flag lives at module scope, so rebinding it requires the global declaration.
    global _trace_binary_data
    _trace_binary_data = enable_binary_data
3233+
3234+
def _is_binary_data_enabled(self) -> bool:
    """Return the module-level ``_trace_binary_data`` flag.

    :return: The current binary data tracing flag.
    :rtype: bool
    """
    # Reading a module-level name needs no ``global`` declaration.
    return _trace_binary_data
3237+
30533238
def record_error(self, span, exc):
30543239
# pyright: ignore [reportPossiblyUnboundVariable]
30553240
span.span_instance.set_status(StatusCode.ERROR, str(exc))

0 commit comments

Comments
 (0)