feat(llma): multimodal-capture (#378)

carlos-marchal-ph · web-flow · commit da8653305ff0 · 2025-12-11T15:00:53.000Z
diff --git a/posthog/ai/gemini/gemini_converter.py b/posthog/ai/gemini/gemini_converter.py
@@ -29,35 +29,76 @@ class GeminiMessage(TypedDict, total=False):
     text: str
 
 
-def _extract_text_from_parts(parts: List[Any]) -> str:
+def _format_parts_as_content_blocks(parts: List[Any]) -> List[FormattedContentItem]:
     """
-    Extract and concatenate text from a parts array.
+    Format Gemini parts array into structured content blocks.
+
+    Preserves structure for multimodal content (text + images) instead of
+    concatenating everything into a string.
 
     Args:
-        parts: List of parts that may contain text content
+        parts: List of parts that may contain text, inline_data, etc.
 
     Returns:
-        Concatenated text from all parts
+        List of formatted content blocks
     """
-
-    content_parts = []
+    content_blocks: List[FormattedContentItem] = []
 
     for part in parts:
+        # Handle dict with text field
         if isinstance(part, dict) and "text" in part:
-            content_parts.append(part["text"])
+            content_blocks.append({"type": "text", "text": part["text"]})
 
+        # Handle string parts
         elif isinstance(part, str):
-            content_parts.append(part)
+            content_blocks.append({"type": "text", "text": part})
+
+        # Handle dict with inline_data (images, documents, etc.)
+        elif isinstance(part, dict) and "inline_data" in part:
+            inline_data = part["inline_data"]
+            mime_type = inline_data.get("mime_type", "")
+            content_type = "image" if mime_type.startswith("image/") else "document"
+
+            content_blocks.append(
+                {
+                    "type": content_type,
+                    "inline_data": inline_data,
+                }
+            )
 
+        # Handle object with text attribute
         elif hasattr(part, "text"):
-            # Get the text attribute value
             text_value = getattr(part, "text", "")
-            content_parts.append(text_value if text_value else str(part))
-
-        else:
-            content_parts.append(str(part))
+            if text_value:
+                content_blocks.append({"type": "text", "text": text_value})
+
+        # Handle object with inline_data attribute
+        elif hasattr(part, "inline_data"):
+            inline_data = part.inline_data
+            # Convert to dict if needed
+            if hasattr(inline_data, "mime_type") and hasattr(inline_data, "data"):
+                # Determine type based on mime_type
+                mime_type = inline_data.mime_type
+                content_type = "image" if mime_type.startswith("image/") else "document"
+
+                content_blocks.append(
+                    {
+                        "type": content_type,
+                        "inline_data": {
+                            "mime_type": mime_type,
+                            "data": inline_data.data,
+                        },
+                    }
+                )
+            else:
+                content_blocks.append(
+                    {
+                        "type": "image",
+                        "inline_data": inline_data,
+                    }
+                )
 
-    return "".join(content_parts)
+    return content_blocks
 
 
 def _format_dict_message(item: Dict[str, Any]) -> FormattedMessage:
@@ -73,16 +114,17 @@ def _format_dict_message(item: Dict[str, Any]) -> FormattedMessage:
 
     # Handle dict format with parts array (Gemini-specific format)
     if "parts" in item and isinstance(item["parts"], list):
-        content = _extract_text_from_parts(item["parts"])
-        return {"role": item.get("role", "user"), "content": content}
+        content_blocks = _format_parts_as_content_blocks(item["parts"])
+        return {"role": item.get("role", "user"), "content": content_blocks}
 
     # Handle dict with content field
     if "content" in item:
         content = item["content"]
 
         if isinstance(content, list):
-            # If content is a list, extract text from it
-            content = _extract_text_from_parts(content)
+            # If content is a list, format it as content blocks
+            content_blocks = _format_parts_as_content_blocks(content)
+            return {"role": item.get("role", "user"), "content": content_blocks}
 
         elif not isinstance(content, str):
             content = str(content)
@@ -110,14 +152,14 @@ def _format_object_message(item: Any) -> FormattedMessage:
 
     # Handle object with parts attribute
     if hasattr(item, "parts") and hasattr(item.parts, "__iter__"):
-        content = _extract_text_from_parts(item.parts)
+        content_blocks = _format_parts_as_content_blocks(list(item.parts))
         role = getattr(item, "role", "user") if hasattr(item, "role") else "user"
 
         # Ensure role is a string
         if not isinstance(role, str):
             role = "user"
 
-        return {"role": role, "content": content}
+        return {"role": role, "content": content_blocks}
 
     # Handle object with text attribute
     if hasattr(item, "text"):
@@ -140,7 +182,8 @@ def _format_object_message(item: Any) -> FormattedMessage:
         content = item.content
 
         if isinstance(content, list):
-            content = _extract_text_from_parts(content)
+            content_blocks = _format_parts_as_content_blocks(content)
+            return {"role": role, "content": content_blocks}
 
         elif not isinstance(content, str):
             content = str(content)
@@ -193,6 +236,29 @@ def format_gemini_response(response: Any) -> List[FormattedMessage]:
                                 }
                             )
 
+                        elif hasattr(part, "inline_data") and part.inline_data:
+                            # Handle audio/media inline data
+                            import base64
+
+                            inline_data = part.inline_data
+                            mime_type = getattr(inline_data, "mime_type", "audio/pcm")
+                            raw_data = getattr(inline_data, "data", b"")
+
+                            # Encode binary data as base64 string for JSON serialization
+                            if isinstance(raw_data, bytes):
+                                data = base64.b64encode(raw_data).decode("utf-8")
+                            else:
+                                # Already a string (base64)
+                                data = raw_data
+
+                            content.append(
+                                {
+                                    "type": "audio",
+                                    "mime_type": mime_type,
+                                    "data": data,
+                                }
+                            )
+
                 if content:
                     output.append(
                         {
diff --git a/posthog/ai/openai/openai_converter.py b/posthog/ai/openai/openai_converter.py
@@ -67,6 +67,12 @@ def format_openai_response(response: Any) -> List[FormattedMessage]:
                             }
                         )
 
+                # Handle audio output (gpt-4o-audio-preview)
+                if hasattr(choice.message, "audio") and choice.message.audio:
+                    # Convert Pydantic model to dict to capture all fields from OpenAI
+                    audio_dict = choice.message.audio.model_dump()
+                    content.append({"type": "audio", **audio_dict})
+
         if content:
             output.append(
                 {
diff --git a/posthog/ai/sanitization.py b/posthog/ai/sanitization.py
@@ -1,10 +1,20 @@
+import os
 import re
 from typing import Any
 from urllib.parse import urlparse
 
 REDACTED_IMAGE_PLACEHOLDER = "[base64 image redacted]"
 
 
+def _is_multimodal_enabled() -> bool:
+    """Check if multimodal capture is enabled via environment variable."""
+    return os.environ.get("_INTERNAL_LLMA_MULTIMODAL", "").lower() in (
+        "true",
+        "1",
+        "yes",
+    )
+
+
 def is_base64_data_url(text: str) -> bool:
     return re.match(r"^data:([^;]+);base64,", text) is not None
 
@@ -27,6 +37,9 @@ def is_raw_base64(text: str) -> bool:
 
 
 def redact_base64_data_url(value: Any) -> Any:
+    if _is_multimodal_enabled():
+        return value
+
     if not isinstance(value, str):
         return value
 
@@ -83,6 +96,11 @@ def sanitize_openai_image(item: Any) -> Any:
             },
         }
 
+    if item.get("type") == "audio" and "data" in item:
+        if _is_multimodal_enabled():
+            return item
+        return {**item, "data": REDACTED_IMAGE_PLACEHOLDER}
+
     return item
 
 
@@ -100,6 +118,9 @@ def sanitize_openai_response_image(item: Any) -> Any:
 
 
 def sanitize_anthropic_image(item: Any) -> Any:
+    if _is_multimodal_enabled():
+        return item
+
     if not isinstance(item, dict):
         return item
 
@@ -109,8 +130,6 @@ def sanitize_anthropic_image(item: Any) -> Any:
         and item["source"].get("type") == "base64"
         and "data" in item["source"]
     ):
-        # For Anthropic, if the source type is "base64", we should always redact the data
-        # The provider is explicitly telling us this is base64 data
         return {
             **item,
             "source": {
@@ -123,6 +142,9 @@ def sanitize_anthropic_image(item: Any) -> Any:
 
 
 def sanitize_gemini_part(part: Any) -> Any:
+    if _is_multimodal_enabled():
+        return part
+
     if not isinstance(part, dict):
         return part
 
@@ -131,8 +153,6 @@ def sanitize_gemini_part(part: Any) -> Any:
         and isinstance(part["inline_data"], dict)
         and "data" in part["inline_data"]
     ):
-        # For Gemini, the inline_data structure indicates base64 data
-        # We should redact any string data in this context
         return {
             **part,
             "inline_data": {
@@ -185,7 +205,9 @@ def sanitize_langchain_image(item: Any) -> Any:
         and isinstance(item.get("source"), dict)
         and "data" in item["source"]
     ):
-        # Anthropic style - raw base64 in structured format, always redact
+        if _is_multimodal_enabled():
+            return item
+
         return {
             **item,
             "source": {
diff --git a/posthog/test/ai/gemini/test_gemini.py b/posthog/test/ai/gemini/test_gemini.py
@@ -407,7 +407,9 @@ def test_new_client_different_input_formats(
     )
     call_args = mock_client.capture.call_args[1]
     props = call_args["properties"]
-    assert props["$ai_input"] == [{"role": "user", "content": "hey"}]
+    assert props["$ai_input"] == [
+        {"role": "user", "content": [{"type": "text", "text": "hey"}]}
+    ]
 
     # Test multiple parts in the parts array
     mock_client.reset_mock()
@@ -418,7 +420,15 @@ def test_new_client_different_input_formats(
     )
     call_args = mock_client.capture.call_args[1]
     props = call_args["properties"]
-    assert props["$ai_input"] == [{"role": "user", "content": "Hello world"}]
+    assert props["$ai_input"] == [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Hello "},
+                {"type": "text", "text": "world"},
+            ],
+        }
+    ]
 
     # Test list input with string
     mock_client.capture.reset_mock()
diff --git a/posthog/test/ai/gemini/test_gemini_async.py b/posthog/test/ai/gemini/test_gemini_async.py
@@ -392,7 +392,9 @@ async def test_async_client_different_input_formats(
     )
     call_args = mock_client.capture.call_args[1]
     props = call_args["properties"]
-    assert props["$ai_input"] == [{"role": "user", "content": "hey"}]
+    assert props["$ai_input"] == [
+        {"role": "user", "content": [{"type": "text", "text": "hey"}]}
+    ]
 
     # Test multiple parts in the parts array
     mock_client.reset_mock()
@@ -403,7 +405,15 @@ async def test_async_client_different_input_formats(
     )
     call_args = mock_client.capture.call_args[1]
     props = call_args["properties"]
-    assert props["$ai_input"] == [{"role": "user", "content": "Hello world"}]
+    assert props["$ai_input"] == [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Hello "},
+                {"type": "text", "text": "world"},
+            ],
+        }
+    ]
 
     # Test list input with string
     mock_client.capture.reset_mock()
diff --git a/posthog/test/ai/test_sanitization.py b/posthog/test/ai/test_sanitization.py

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -67,6 +67,12 @@ def format_openai_response(response: Any) -> List[FormattedMessage]:` |
| `67` | `67` | `                             }` |
| `68` | `68` | `                         )` |
| `69` | `69` | |
| | `70` | `+                # Handle audio output (gpt-4o-audio-preview)` |
| | `71` | `+                if hasattr(choice.message, "audio") and choice.message.audio:` |
| | `72` | `+                    # Convert Pydantic model to dict to capture all fields from OpenAI` |
| | `73` | `+                    audio_dict = choice.message.audio.model_dump()` |
| | `74` | `+                    content.append({"type": "audio", **audio_dict})` |
| | `75` | `+` |
| `70` | `76` | `         if content:` |
| `71` | `77` | `             output.append(` |
| `72` | `78` | `                 {` |