Skip to content

Commit 0455e23

Browse files
feat(llma)/multimodal-capture
1 parent 7c7f529 commit 0455e23

File tree

4 files changed

+288
-25
lines changed

4 files changed

+288
-25
lines changed

posthog/ai/gemini/gemini_converter.py

Lines changed: 78 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -29,35 +29,69 @@ class GeminiMessage(TypedDict, total=False):
2929
text: str
3030

3131

32-
def _extract_text_from_parts(parts: List[Any]) -> str:
32+
def _format_parts_as_content_blocks(parts: List[Any]) -> List[FormattedContentItem]:
3333
"""
34-
Extract and concatenate text from a parts array.
34+
Format Gemini parts array into structured content blocks.
35+
36+
Preserves structure for multimodal content (text + images) instead of
37+
concatenating everything into a string.
3538
3639
Args:
37-
parts: List of parts that may contain text content
40+
parts: List of parts that may contain text, inline_data, etc.
3841
3942
Returns:
40-
Concatenated text from all parts
43+
List of formatted content blocks
4144
"""
42-
43-
content_parts = []
45+
content_blocks: List[FormattedContentItem] = []
4446

4547
for part in parts:
48+
# Handle dict with text field
4649
if isinstance(part, dict) and "text" in part:
47-
content_parts.append(part["text"])
50+
content_blocks.append({"type": "text", "text": part["text"]})
4851

52+
# Handle string parts
4953
elif isinstance(part, str):
50-
content_parts.append(part)
54+
content_blocks.append({"type": "text", "text": part})
5155

56+
# Handle dict with inline_data (images)
57+
elif isinstance(part, dict) and "inline_data" in part:
58+
inline_data = part["inline_data"]
59+
content_blocks.append(
60+
{
61+
"type": "image",
62+
"inline_data": inline_data,
63+
}
64+
)
65+
66+
# Handle object with text attribute
5267
elif hasattr(part, "text"):
53-
# Get the text attribute value
5468
text_value = getattr(part, "text", "")
55-
content_parts.append(text_value if text_value else str(part))
56-
57-
else:
58-
content_parts.append(str(part))
69+
if text_value:
70+
content_blocks.append({"type": "text", "text": text_value})
71+
72+
# Handle object with inline_data attribute
73+
elif hasattr(part, "inline_data"):
74+
inline_data = part.inline_data
75+
# Convert to dict if needed
76+
if hasattr(inline_data, "mime_type") and hasattr(inline_data, "data"):
77+
content_blocks.append(
78+
{
79+
"type": "image",
80+
"inline_data": {
81+
"mime_type": inline_data.mime_type,
82+
"data": inline_data.data,
83+
},
84+
}
85+
)
86+
else:
87+
content_blocks.append(
88+
{
89+
"type": "image",
90+
"inline_data": inline_data,
91+
}
92+
)
5993

60-
return "".join(content_parts)
94+
return content_blocks
6195

6296

6397
def _format_dict_message(item: Dict[str, Any]) -> FormattedMessage:
@@ -73,16 +107,17 @@ def _format_dict_message(item: Dict[str, Any]) -> FormattedMessage:
73107

74108
# Handle dict format with parts array (Gemini-specific format)
75109
if "parts" in item and isinstance(item["parts"], list):
76-
content = _extract_text_from_parts(item["parts"])
77-
return {"role": item.get("role", "user"), "content": content}
110+
content_blocks = _format_parts_as_content_blocks(item["parts"])
111+
return {"role": item.get("role", "user"), "content": content_blocks}
78112

79113
# Handle dict with content field
80114
if "content" in item:
81115
content = item["content"]
82116

83117
if isinstance(content, list):
84-
# If content is a list, extract text from it
85-
content = _extract_text_from_parts(content)
118+
# If content is a list, format it as content blocks
119+
content_blocks = _format_parts_as_content_blocks(content)
120+
return {"role": item.get("role", "user"), "content": content_blocks}
86121

87122
elif not isinstance(content, str):
88123
content = str(content)
@@ -110,14 +145,14 @@ def _format_object_message(item: Any) -> FormattedMessage:
110145

111146
# Handle object with parts attribute
112147
if hasattr(item, "parts") and hasattr(item.parts, "__iter__"):
113-
content = _extract_text_from_parts(item.parts)
148+
content_blocks = _format_parts_as_content_blocks(list(item.parts))
114149
role = getattr(item, "role", "user") if hasattr(item, "role") else "user"
115150

116151
# Ensure role is a string
117152
if not isinstance(role, str):
118153
role = "user"
119154

120-
return {"role": role, "content": content}
155+
return {"role": role, "content": content_blocks}
121156

122157
# Handle object with text attribute
123158
if hasattr(item, "text"):
@@ -193,6 +228,29 @@ def format_gemini_response(response: Any) -> List[FormattedMessage]:
193228
}
194229
)
195230

231+
elif hasattr(part, "inline_data") and part.inline_data:
232+
# Handle inline media data (always emitted as an "audio" block; mime_type defaults to "audio/pcm")
233+
import base64
234+
235+
inline_data = part.inline_data
236+
mime_type = getattr(inline_data, "mime_type", "audio/pcm")
237+
raw_data = getattr(inline_data, "data", b"")
238+
239+
# Encode binary data as base64 string for JSON serialization
240+
if isinstance(raw_data, bytes):
241+
data = base64.b64encode(raw_data).decode("utf-8")
242+
else:
243+
# Already a string (base64)
244+
data = raw_data
245+
246+
content.append(
247+
{
248+
"type": "audio",
249+
"mime_type": mime_type,
250+
"data": data,
251+
}
252+
)
253+
196254
if content:
197255
output.append(
198256
{

posthog/ai/openai/openai_converter.py

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -67,6 +67,12 @@ def format_openai_response(response: Any) -> List[FormattedMessage]:
6767
}
6868
)
6969

70+
# Handle audio output (gpt-4o-audio-preview)
71+
if hasattr(choice.message, "audio") and choice.message.audio:
72+
# Convert Pydantic model to dict to capture all fields from OpenAI
73+
audio_dict = choice.message.audio.model_dump()
74+
content.append({"type": "audio", **audio_dict})
75+
7076
if content:
7177
output.append(
7278
{

posthog/ai/sanitization.py

Lines changed: 23 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,16 @@
1+
import os
12
import re
23
from typing import Any
34
from urllib.parse import urlparse
45

56
REDACTED_IMAGE_PLACEHOLDER = "[base64 image redacted]"
67

78

9+
def _is_multimodal_enabled() -> bool:
10+
"""Check if multimodal capture is enabled via environment variable."""
11+
return os.environ.get("_INTERNAL_LLMA_MULTIMODAL", "").lower() in ("true", "1", "yes")
12+
13+
814
def is_base64_data_url(text: str) -> bool:
915
return re.match(r"^data:([^;]+);base64,", text) is not None
1016

@@ -27,6 +33,9 @@ def is_raw_base64(text: str) -> bool:
2733

2834

2935
def redact_base64_data_url(value: Any) -> Any:
36+
if _is_multimodal_enabled():
37+
return value
38+
3039
if not isinstance(value, str):
3140
return value
3241

@@ -83,6 +92,11 @@ def sanitize_openai_image(item: Any) -> Any:
8392
},
8493
}
8594

95+
if item.get("type") == "audio" and "data" in item:
96+
if _is_multimodal_enabled():
97+
return item
98+
return {**item, "data": REDACTED_IMAGE_PLACEHOLDER}
99+
86100
return item
87101

88102

@@ -100,6 +114,9 @@ def sanitize_openai_response_image(item: Any) -> Any:
100114

101115

102116
def sanitize_anthropic_image(item: Any) -> Any:
117+
if _is_multimodal_enabled():
118+
return item
119+
103120
if not isinstance(item, dict):
104121
return item
105122

@@ -109,8 +126,6 @@ def sanitize_anthropic_image(item: Any) -> Any:
109126
and item["source"].get("type") == "base64"
110127
and "data" in item["source"]
111128
):
112-
# For Anthropic, if the source type is "base64", we should always redact the data
113-
# The provider is explicitly telling us this is base64 data
114129
return {
115130
**item,
116131
"source": {
@@ -123,6 +138,9 @@ def sanitize_anthropic_image(item: Any) -> Any:
123138

124139

125140
def sanitize_gemini_part(part: Any) -> Any:
141+
if _is_multimodal_enabled():
142+
return part
143+
126144
if not isinstance(part, dict):
127145
return part
128146

@@ -131,8 +149,6 @@ def sanitize_gemini_part(part: Any) -> Any:
131149
and isinstance(part["inline_data"], dict)
132150
and "data" in part["inline_data"]
133151
):
134-
# For Gemini, the inline_data structure indicates base64 data
135-
# We should redact any string data in this context
136152
return {
137153
**part,
138154
"inline_data": {
@@ -185,7 +201,9 @@ def sanitize_langchain_image(item: Any) -> Any:
185201
and isinstance(item.get("source"), dict)
186202
and "data" in item["source"]
187203
):
188-
# Anthropic style - raw base64 in structured format, always redact
204+
if _is_multimodal_enabled():
205+
return item
206+
189207
return {
190208
**item,
191209
"source": {

0 commit comments

Comments
 (0)