GenAI: emit system_instructions as typed text objects ([{"type":"text","content":"..."}]); when sensitive-data capture is disabled, emit a single redaction placeholder part of the same shape instead of the real instructions; add a TODO to restore the previous processors on uninstrument

nagkumar91 · nagkumar91 · commit 353fdb05e81c · 2025-09-30T08:16:48.000-07:00
diff --git a/instrumentation-genai/opentelemetry-instrumentation-openai-agents/src/opentelemetry/instrumentation/openai_agents/__init__.py b/instrumentation-genai/opentelemetry-instrumentation-openai-agents/src/opentelemetry/instrumentation/openai_agents/__init__.py
@@ -115,6 +115,7 @@ def _instrument(self, **kwargs):
     def _uninstrument(self, **kwargs):
         """Uninstruments the OpenAI library for agent frameworks."""
         # No-op: optional processor registry may not be present.
+        # TODO: keep the previous processor list and restore it on uninstrument.
         return
 
     def instrumentation_dependencies(self) -> Collection[str]:
diff --git a/instrumentation-genai/opentelemetry-instrumentation-openai-agents/src/opentelemetry/instrumentation/openai_agents/genai_semantic_processor.py b/instrumentation-genai/opentelemetry-instrumentation-openai-agents/src/opentelemetry/instrumentation/openai_agents/genai_semantic_processor.py
@@ -357,21 +357,67 @@ def _record_metrics(
 
     def _collect_system_instructions(
         self, messages: Sequence[Any] | None
-    ) -> list[str]:
-        """Return list of system/ai role message contents."""
+    ) -> list[dict[str, str]]:
+        """Return system/ai role message contents as typed text parts.
+
+        Output format: [{"type": "text", "content": "..."}]. Non-dict
+        messages and other roles are skipped; content may be a string,
+        a list of parts, or a dict with text/content fields.
+        """
         if not messages:
             return []
-        out: list[str] = []
+        out: list[dict[str, str]] = []
         for m in messages:
             if not isinstance(m, dict):
                 continue
             role = m.get("role")
             if role in {"system", "ai"}:
                 content = m.get("content")
-                if content is not None:
-                    out.append(str(content))
+                out.extend(self._normalize_to_text_parts(content))
         return out
 
+    def _normalize_to_text_parts(self, content: Any) -> list[dict[str, str]]:
+        """Normalize arbitrary content into typed text parts.
+
+        Every returned part has the shape {"type": "text", "content": <str>}.
+
+        - None -> []
+        - String -> one part holding the string unchanged
+        - List/Tuple -> one part per item, in order (items are not flattened)
+        - Dict -> first non-empty str in 'text'/'content'; else str(dict)
+        - Other -> one part holding str(value)
+        """
+
+        def _text_of(value: Any) -> str:
+            # Strings pass through unchanged, including the empty string.
+            if isinstance(value, str):
+                return value
+            if isinstance(value, dict):
+                # Check each key on its own so that a non-string or empty
+                # 'text' cannot hide a usable 'content' string.
+                for key in ("text", "content"):
+                    candidate = value.get(key)
+                    if isinstance(candidate, str) and candidate:
+                        return candidate
+            # Anything else (or a dict without usable text) is stringified.
+            return str(value)
+
+        # No content means there is nothing to report.
+        if content is None:
+            return []
+        # A list/tuple yields one part per item; any other value yields one.
+        if isinstance(content, (list, tuple)):
+            items = content
+        else:
+            items = (content,)
+        return [
+            {"type": "text", "content": _text_of(item)} for item in items
+        ]
+
+    def _redacted_text_parts(self) -> list[dict[str, str]]:
+        """Return one placeholder text part that hides system instructions."""
+        return [{"type": "text", "content": "redacted"}]
+
     def _infer_output_type(self, span_data: Any) -> str:
         """Infer gen_ai.output.type for multiple span kinds."""
         if isinstance(span_data, FunctionSpanData):
@@ -746,7 +792,12 @@ def _get_attributes_from_generation_span_data(
 
             # System instructions
             if self._capture_system_instructions and span_data.input:
-                sys_instr = self._collect_system_instructions(span_data.input)
+                if self.include_sensitive_data:
+                    sys_instr = self._collect_system_instructions(
+                        span_data.input
+                    )
+                else:
+                    sys_instr = self._redacted_text_parts()
                 if sys_instr:
                     yield (
                         GEN_AI_SYSTEM_INSTRUCTIONS,
@@ -808,7 +859,7 @@ def _get_attributes_from_agent_span_data(
             try:
                 defs = span_data.agent_definitions
                 if isinstance(defs, (list, tuple)):
-                    collected: list[str] = []
+                    collected: list[dict[str, str]] = []
                     for d in defs:
                         if isinstance(d, dict):
                             msgs = d.get("messages") or d.get(
@@ -955,7 +1006,12 @@ def _get_attributes_from_response_span_data(
 
             # System instructions
             if self._capture_system_instructions and span_data.input:
-                sys_instr = self._collect_system_instructions(span_data.input)
+                if self.include_sensitive_data:
+                    sys_instr = self._collect_system_instructions(
+                        span_data.input
+                    )
+                else:
+                    sys_instr = self._redacted_text_parts()
                 if sys_instr:
                     yield (
                         GEN_AI_SYSTEM_INSTRUCTIONS,