Skip to content

Commit da86533

Browse files
feat(llma): multimodal-capture (#378)
1 parent 88a7c5e commit da86533

File tree

6 files changed

+331
-30
lines changed

6 files changed

+331
-30
lines changed

posthog/ai/gemini/gemini_converter.py

Lines changed: 87 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -29,35 +29,76 @@ class GeminiMessage(TypedDict, total=False):
2929
text: str
3030

3131

32-
def _extract_text_from_parts(parts: List[Any]) -> str:
32+
def _format_parts_as_content_blocks(parts: List[Any]) -> List[FormattedContentItem]:
    """
    Format Gemini parts array into structured content blocks.

    Preserves structure for multimodal content (text + images) instead of
    concatenating everything into a string.

    Supported part shapes:
        - ``str``: emitted as a text block.
        - ``dict`` with a ``"text"`` key: emitted as a text block.
        - ``dict`` with a non-None ``"inline_data"`` value: emitted as an
          image/document block.
        - object with a truthy ``text`` attribute: emitted as a text block.
        - object with a non-None ``inline_data`` attribute: emitted as an
          image/document block.

    Parts that match none of these shapes are skipped.

    Args:
        parts: List of parts that may contain text, inline_data, etc.

    Returns:
        List of formatted content blocks
    """
    content_blocks: List[FormattedContentItem] = []

    for part in parts or []:
        # Handle string parts
        if isinstance(part, str):
            content_blocks.append({"type": "text", "text": part})
            continue

        # Handle dict parts (text takes precedence over inline_data)
        if isinstance(part, dict):
            if "text" in part:
                content_blocks.append({"type": "text", "text": part["text"]})
            elif part.get("inline_data") is not None:
                content_blocks.append(
                    _inline_data_to_content_block(part["inline_data"])
                )
            continue

        # Handle object parts. An object may expose BOTH a ``text`` and an
        # ``inline_data`` attribute with the unused one left empty, so the
        # attribute *values* are inspected rather than relying on hasattr();
        # otherwise an inline_data-only part would match the text branch and
        # be dropped.
        text_value = getattr(part, "text", None)
        if text_value:
            content_blocks.append({"type": "text", "text": text_value})
            continue

        inline_data = getattr(part, "inline_data", None)
        if inline_data is not None:
            content_blocks.append(_inline_data_to_content_block(inline_data))

    return content_blocks


def _content_type_for_mime_type(mime_type: Any) -> str:
    """Return "image" for ``image/*`` MIME types and "document" otherwise."""
    # A missing or non-string MIME type is treated as a generic document
    # instead of raising on ``.startswith``.
    if isinstance(mime_type, str) and mime_type.startswith("image/"):
        return "image"
    return "document"


def _inline_data_to_content_block(inline_data: Any) -> FormattedContentItem:
    """Build an image/document content block from an inline_data payload."""
    # Dict payloads are passed through untouched so no caller-supplied key is lost.
    if isinstance(inline_data, dict):
        return {
            "type": _content_type_for_mime_type(inline_data.get("mime_type")),
            "inline_data": inline_data,
        }

    # Objects exposing mime_type/data are converted to a plain dict.
    # NOTE(review): ``data`` is forwarded as-is; if it can be raw bytes it may
    # need base64 encoding before JSON serialization - confirm with callers.
    if hasattr(inline_data, "mime_type") and hasattr(inline_data, "data"):
        mime_type = inline_data.mime_type
        return {
            "type": _content_type_for_mime_type(mime_type),
            "inline_data": {
                "mime_type": mime_type,
                "data": inline_data.data,
            },
        }

    # Unknown payload shape: keep the historical "image" fallback.
    return {"type": "image", "inline_data": inline_data}
61102

62103

63104
def _format_dict_message(item: Dict[str, Any]) -> FormattedMessage:
@@ -73,16 +114,17 @@ def _format_dict_message(item: Dict[str, Any]) -> FormattedMessage:
73114

74115
# Handle dict format with parts array (Gemini-specific format)
75116
if "parts" in item and isinstance(item["parts"], list):
76-
content = _extract_text_from_parts(item["parts"])
77-
return {"role": item.get("role", "user"), "content": content}
117+
content_blocks = _format_parts_as_content_blocks(item["parts"])
118+
return {"role": item.get("role", "user"), "content": content_blocks}
78119

79120
# Handle dict with content field
80121
if "content" in item:
81122
content = item["content"]
82123

83124
if isinstance(content, list):
84-
# If content is a list, extract text from it
85-
content = _extract_text_from_parts(content)
125+
# If content is a list, format it as content blocks
126+
content_blocks = _format_parts_as_content_blocks(content)
127+
return {"role": item.get("role", "user"), "content": content_blocks}
86128

87129
elif not isinstance(content, str):
88130
content = str(content)
@@ -110,14 +152,14 @@ def _format_object_message(item: Any) -> FormattedMessage:
110152

111153
# Handle object with parts attribute
112154
if hasattr(item, "parts") and hasattr(item.parts, "__iter__"):
113-
content = _extract_text_from_parts(item.parts)
155+
content_blocks = _format_parts_as_content_blocks(list(item.parts))
114156
role = getattr(item, "role", "user") if hasattr(item, "role") else "user"
115157

116158
# Ensure role is a string
117159
if not isinstance(role, str):
118160
role = "user"
119161

120-
return {"role": role, "content": content}
162+
return {"role": role, "content": content_blocks}
121163

122164
# Handle object with text attribute
123165
if hasattr(item, "text"):
@@ -140,7 +182,8 @@ def _format_object_message(item: Any) -> FormattedMessage:
140182
content = item.content
141183

142184
if isinstance(content, list):
143-
content = _extract_text_from_parts(content)
185+
content_blocks = _format_parts_as_content_blocks(content)
186+
return {"role": role, "content": content_blocks}
144187

145188
elif not isinstance(content, str):
146189
content = str(content)
@@ -193,6 +236,29 @@ def format_gemini_response(response: Any) -> List[FormattedMessage]:
193236
}
194237
)
195238

239+
elif hasattr(part, "inline_data") and part.inline_data:
240+
# Handle audio/media inline data
241+
import base64
242+
243+
inline_data = part.inline_data
244+
mime_type = getattr(inline_data, "mime_type", "audio/pcm")
245+
raw_data = getattr(inline_data, "data", b"")
246+
247+
# Encode binary data as base64 string for JSON serialization
248+
if isinstance(raw_data, bytes):
249+
data = base64.b64encode(raw_data).decode("utf-8")
250+
else:
251+
# Already a string (base64)
252+
data = raw_data
253+
254+
content.append(
255+
{
256+
"type": "audio",
257+
"mime_type": mime_type,
258+
"data": data,
259+
}
260+
)
261+
196262
if content:
197263
output.append(
198264
{

posthog/ai/openai/openai_converter.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,12 @@ def format_openai_response(response: Any) -> List[FormattedMessage]:
6767
}
6868
)
6969

70+
# Handle audio output (gpt-4o-audio-preview)
71+
if hasattr(choice.message, "audio") and choice.message.audio:
72+
# Convert Pydantic model to dict to capture all fields from OpenAI
73+
audio_dict = choice.message.audio.model_dump()
74+
content.append({"type": "audio", **audio_dict})
75+
7076
if content:
7177
output.append(
7278
{

posthog/ai/sanitization.py

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,20 @@
1+
import os
12
import re
23
from typing import Any
34
from urllib.parse import urlparse
45

56
REDACTED_IMAGE_PLACEHOLDER = "[base64 image redacted]"
67

78

9+
def _is_multimodal_enabled() -> bool:
    """
    Report whether multimodal capture is switched on.

    Reads the ``_INTERNAL_LLMA_MULTIMODAL`` environment variable on every
    call and compares it case-insensitively against the accepted truthy
    spellings ("true", "1", "yes"). An unset variable counts as disabled.
    """
    flag = os.environ.get("_INTERNAL_LLMA_MULTIMODAL", "")
    truthy_values = ("true", "1", "yes")
    return flag.lower() in truthy_values
16+
17+
818
def is_base64_data_url(text: str) -> bool:
919
return re.match(r"^data:([^;]+);base64,", text) is not None
1020

@@ -27,6 +37,9 @@ def is_raw_base64(text: str) -> bool:
2737

2838

2939
def redact_base64_data_url(value: Any) -> Any:
40+
if _is_multimodal_enabled():
41+
return value
42+
3043
if not isinstance(value, str):
3144
return value
3245

@@ -83,6 +96,11 @@ def sanitize_openai_image(item: Any) -> Any:
8396
},
8497
}
8598

99+
if item.get("type") == "audio" and "data" in item:
100+
if _is_multimodal_enabled():
101+
return item
102+
return {**item, "data": REDACTED_IMAGE_PLACEHOLDER}
103+
86104
return item
87105

88106

@@ -100,6 +118,9 @@ def sanitize_openai_response_image(item: Any) -> Any:
100118

101119

102120
def sanitize_anthropic_image(item: Any) -> Any:
121+
if _is_multimodal_enabled():
122+
return item
123+
103124
if not isinstance(item, dict):
104125
return item
105126

@@ -109,8 +130,6 @@ def sanitize_anthropic_image(item: Any) -> Any:
109130
and item["source"].get("type") == "base64"
110131
and "data" in item["source"]
111132
):
112-
# For Anthropic, if the source type is "base64", we should always redact the data
113-
# The provider is explicitly telling us this is base64 data
114133
return {
115134
**item,
116135
"source": {
@@ -123,6 +142,9 @@ def sanitize_anthropic_image(item: Any) -> Any:
123142

124143

125144
def sanitize_gemini_part(part: Any) -> Any:
145+
if _is_multimodal_enabled():
146+
return part
147+
126148
if not isinstance(part, dict):
127149
return part
128150

@@ -131,8 +153,6 @@ def sanitize_gemini_part(part: Any) -> Any:
131153
and isinstance(part["inline_data"], dict)
132154
and "data" in part["inline_data"]
133155
):
134-
# For Gemini, the inline_data structure indicates base64 data
135-
# We should redact any string data in this context
136156
return {
137157
**part,
138158
"inline_data": {
@@ -185,7 +205,9 @@ def sanitize_langchain_image(item: Any) -> Any:
185205
and isinstance(item.get("source"), dict)
186206
and "data" in item["source"]
187207
):
188-
# Anthropic style - raw base64 in structured format, always redact
208+
if _is_multimodal_enabled():
209+
return item
210+
189211
return {
190212
**item,
191213
"source": {

posthog/test/ai/gemini/test_gemini.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -407,7 +407,9 @@ def test_new_client_different_input_formats(
407407
)
408408
call_args = mock_client.capture.call_args[1]
409409
props = call_args["properties"]
410-
assert props["$ai_input"] == [{"role": "user", "content": "hey"}]
410+
assert props["$ai_input"] == [
411+
{"role": "user", "content": [{"type": "text", "text": "hey"}]}
412+
]
411413

412414
# Test multiple parts in the parts array
413415
mock_client.reset_mock()
@@ -418,7 +420,15 @@ def test_new_client_different_input_formats(
418420
)
419421
call_args = mock_client.capture.call_args[1]
420422
props = call_args["properties"]
421-
assert props["$ai_input"] == [{"role": "user", "content": "Hello world"}]
423+
assert props["$ai_input"] == [
424+
{
425+
"role": "user",
426+
"content": [
427+
{"type": "text", "text": "Hello "},
428+
{"type": "text", "text": "world"},
429+
],
430+
}
431+
]
422432

423433
# Test list input with string
424434
mock_client.capture.reset_mock()

posthog/test/ai/gemini/test_gemini_async.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -392,7 +392,9 @@ async def test_async_client_different_input_formats(
392392
)
393393
call_args = mock_client.capture.call_args[1]
394394
props = call_args["properties"]
395-
assert props["$ai_input"] == [{"role": "user", "content": "hey"}]
395+
assert props["$ai_input"] == [
396+
{"role": "user", "content": [{"type": "text", "text": "hey"}]}
397+
]
396398

397399
# Test multiple parts in the parts array
398400
mock_client.reset_mock()
@@ -403,7 +405,15 @@ async def test_async_client_different_input_formats(
403405
)
404406
call_args = mock_client.capture.call_args[1]
405407
props = call_args["properties"]
406-
assert props["$ai_input"] == [{"role": "user", "content": "Hello world"}]
408+
assert props["$ai_input"] == [
409+
{
410+
"role": "user",
411+
"content": [
412+
{"type": "text", "text": "Hello "},
413+
{"type": "text", "text": "world"},
414+
],
415+
}
416+
]
407417

408418
# Test list input with string
409419
mock_client.capture.reset_mock()

0 commit comments

Comments
 (0)