4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,7 @@
# 6.7.1 - 2025-09-01

- fix: Add base64 inline image sanitization

# 6.7.0 - 2025-08-26

- feat: Add support for feature flag dependencies
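For reference, the behaviour the changelog entry describes comes from the new `posthog/ai/sanitization.py` module (full file further down in this diff). A minimal sketch of the core helper, using illustrative input values:

```python
from posthog.ai.sanitization import redact_base64_data_url

# Inline base64 data URLs are swapped for a short placeholder string...
redact_base64_data_url("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAE=")
# -> "[base64 image redacted]"

# ...while ordinary URLs (and any non-string values) pass through unchanged.
redact_base64_data_url("https://example.com/cat.png")
# -> "https://example.com/cat.png"
```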
3 changes: 2 additions & 1 deletion posthog/ai/anthropic/anthropic.py
@@ -16,6 +16,7 @@
merge_system_prompt,
with_privacy_mode,
)
from posthog.ai.sanitization import sanitize_anthropic
from posthog.client import Client as PostHogClient
from posthog import setup

@@ -184,7 +185,7 @@ def _capture_streaming_event(
"$ai_input": with_privacy_mode(
self._client._ph_client,
posthog_privacy_mode,
merge_system_prompt(kwargs, "anthropic"),
sanitize_anthropic(merge_system_prompt(kwargs, "anthropic")),
),
"$ai_output_choices": with_privacy_mode(
self._client._ph_client,
3 changes: 2 additions & 1 deletion posthog/ai/anthropic/anthropic_async.py
@@ -17,6 +17,7 @@
merge_system_prompt,
with_privacy_mode,
)
from posthog.ai.sanitization import sanitize_anthropic
from posthog.client import Client as PostHogClient


@@ -184,7 +185,7 @@ async def _capture_streaming_event(
"$ai_input": with_privacy_mode(
self._client._ph_client,
posthog_privacy_mode,
merge_system_prompt(kwargs, "anthropic"),
sanitize_anthropic(merge_system_prompt(kwargs, "anthropic")),
),
"$ai_output_choices": with_privacy_mode(
self._client._ph_client,
3 changes: 2 additions & 1 deletion posthog/ai/gemini/gemini.py
@@ -16,6 +16,7 @@
get_model_params,
with_privacy_mode,
)
from posthog.ai.sanitization import sanitize_gemini
from posthog.client import Client as PostHogClient


@@ -347,7 +348,7 @@ def _capture_streaming_event(
"$ai_input": with_privacy_mode(
self._ph_client,
privacy_mode,
self._format_input(contents),
sanitize_gemini(self._format_input(contents)),
),
"$ai_output_choices": with_privacy_mode(
self._ph_client,
5 changes: 3 additions & 2 deletions posthog/ai/langchain/callbacks.py
@@ -37,6 +37,7 @@

from posthog import setup
from posthog.ai.utils import get_model_params, with_privacy_mode
from posthog.ai.sanitization import sanitize_langchain
from posthog.client import Client

log = logging.getLogger("posthog")
@@ -480,7 +481,7 @@ def _capture_trace_or_span(
event_properties = {
"$ai_trace_id": trace_id,
"$ai_input_state": with_privacy_mode(
self._ph_client, self._privacy_mode, run.input
self._ph_client, self._privacy_mode, sanitize_langchain(run.input)
),
"$ai_latency": run.latency,
"$ai_span_name": run.name,
@@ -550,7 +551,7 @@ def _capture_generation(
"$ai_model": run.model,
"$ai_model_parameters": run.model_params,
"$ai_input": with_privacy_mode(
self._ph_client, self._privacy_mode, run.input
self._ph_client, self._privacy_mode, sanitize_langchain(run.input)
),
"$ai_http_status": 200,
"$ai_latency": run.latency,
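In the LangChain callback, both the trace/span input state and the generation input are now passed through `sanitize_langchain`, which (per the branches in the new module below) handles OpenAI-style `image_url` parts as well as Anthropic-style `image`/`source` blocks. A minimal sketch, with an illustrative message shape:

```python
from posthog.ai.sanitization import REDACTED_IMAGE_PLACEHOLDER, sanitize_langchain

run_input = [
    {
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": "data:image/png;base64,iVBORw0KGgo="}},
            {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": "iVBORw0KGgo="}},
        ],
    }
]

out = sanitize_langchain(run_input)
# Both content-part shapes end up with the same placeholder.
assert out[0]["content"][0]["image_url"]["url"] == REDACTED_IMAGE_PLACEHOLDER
assert out[0]["content"][1]["source"]["data"] == REDACTED_IMAGE_PLACEHOLDER
```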
13 changes: 10 additions & 3 deletions posthog/ai/openai/openai.py
@@ -15,6 +15,7 @@
get_model_params,
with_privacy_mode,
)
from posthog.ai.sanitization import sanitize_openai, sanitize_openai_response
from posthog.client import Client as PostHogClient
from posthog import setup

@@ -194,7 +195,9 @@ def _capture_streaming_event(
"$ai_model": kwargs.get("model"),
"$ai_model_parameters": get_model_params(kwargs),
"$ai_input": with_privacy_mode(
self._client._ph_client, posthog_privacy_mode, kwargs.get("input")
self._client._ph_client,
posthog_privacy_mode,
sanitize_openai_response(kwargs.get("input")),
),
"$ai_output_choices": with_privacy_mode(
self._client._ph_client,
@@ -427,7 +430,9 @@ def _capture_streaming_event(
"$ai_model": kwargs.get("model"),
"$ai_model_parameters": get_model_params(kwargs),
"$ai_input": with_privacy_mode(
self._client._ph_client, posthog_privacy_mode, kwargs.get("messages")
self._client._ph_client,
posthog_privacy_mode,
sanitize_openai(kwargs.get("messages")),
),
"$ai_output_choices": with_privacy_mode(
self._client._ph_client,
@@ -518,7 +523,9 @@ def create(
"$ai_provider": "openai",
"$ai_model": kwargs.get("model"),
"$ai_input": with_privacy_mode(
self._client._ph_client, posthog_privacy_mode, kwargs.get("input")
self._client._ph_client,
posthog_privacy_mode,
sanitize_openai_response(kwargs.get("input")),
),
"$ai_http_status": 200,
"$ai_input_tokens": usage_stats.get("prompt_tokens", 0),
13 changes: 10 additions & 3 deletions posthog/ai/openai/openai_async.py
@@ -16,6 +16,7 @@
get_model_params,
with_privacy_mode,
)
from posthog.ai.sanitization import sanitize_openai, sanitize_openai_response
from posthog.client import Client as PostHogClient


@@ -195,7 +196,9 @@ async def _capture_streaming_event(
"$ai_model": kwargs.get("model"),
"$ai_model_parameters": get_model_params(kwargs),
"$ai_input": with_privacy_mode(
self._client._ph_client, posthog_privacy_mode, kwargs.get("input")
self._client._ph_client,
posthog_privacy_mode,
sanitize_openai_response(kwargs.get("input")),
),
"$ai_output_choices": with_privacy_mode(
self._client._ph_client,
@@ -431,7 +434,9 @@ async def _capture_streaming_event(
"$ai_model": kwargs.get("model"),
"$ai_model_parameters": get_model_params(kwargs),
"$ai_input": with_privacy_mode(
self._client._ph_client, posthog_privacy_mode, kwargs.get("messages")
self._client._ph_client,
posthog_privacy_mode,
sanitize_openai(kwargs.get("messages")),
),
"$ai_output_choices": with_privacy_mode(
self._client._ph_client,
@@ -522,7 +527,9 @@ async def create(
"$ai_provider": "openai",
"$ai_model": kwargs.get("model"),
"$ai_input": with_privacy_mode(
self._client._ph_client, posthog_privacy_mode, kwargs.get("input")
self._client._ph_client,
posthog_privacy_mode,
sanitize_openai_response(kwargs.get("input")),
),
"$ai_http_status": 200,
"$ai_input_tokens": usage_stats.get("prompt_tokens", 0),
226 changes: 226 additions & 0 deletions posthog/ai/sanitization.py
@@ -0,0 +1,226 @@
import re
from typing import Any
from urllib.parse import urlparse

REDACTED_IMAGE_PLACEHOLDER = "[base64 image redacted]"


def is_base64_data_url(text: str) -> bool:
return re.match(r"^data:([^;]+);base64,", text) is not None


def is_valid_url(text: str) -> bool:
try:
result = urlparse(text)
return bool(result.scheme and result.netloc)
except Exception:
pass

return text.startswith(("/", "./", "../"))


def is_raw_base64(text: str) -> bool:
if is_valid_url(text):
return False

return len(text) > 20 and re.match(r"^[A-Za-z0-9+/]+=*$", text) is not None


def redact_base64_data_url(value: Any) -> Any:
if not isinstance(value, str):
return value

if is_base64_data_url(value):
return REDACTED_IMAGE_PLACEHOLDER

if is_raw_base64(value):
return REDACTED_IMAGE_PLACEHOLDER

return value


def process_messages(messages: Any, transform_content_func) -> Any:
if not messages:
return messages

def process_content(content: Any) -> Any:
if isinstance(content, str):
return content

if not content:
return content

if isinstance(content, list):
return [transform_content_func(item) for item in content]

return transform_content_func(content)

def process_message(msg: Any) -> Any:
if not isinstance(msg, dict) or "content" not in msg:
return msg
return {**msg, "content": process_content(msg["content"])}

if isinstance(messages, list):
return [process_message(msg) for msg in messages]

return process_message(messages)


def sanitize_openai_image(item: Any) -> Any:
if not isinstance(item, dict):
return item

if (
item.get("type") == "image_url"
and isinstance(item.get("image_url"), dict)
and "url" in item["image_url"]
):
return {
**item,
"image_url": {
**item["image_url"],
"url": redact_base64_data_url(item["image_url"]["url"]),
},
}

return item


def sanitize_openai_response_image(item: Any) -> Any:
if not isinstance(item, dict):
return item

if item.get("type") == "input_image" and "image_url" in item:
return {
**item,
"image_url": redact_base64_data_url(item["image_url"]),
}

return item


def sanitize_anthropic_image(item: Any) -> Any:
if not isinstance(item, dict):
return item

if (
item.get("type") == "image"
and isinstance(item.get("source"), dict)
and item["source"].get("type") == "base64"
and "data" in item["source"]
):
# For Anthropic, if the source type is "base64", we should always redact the data
# The provider is explicitly telling us this is base64 data
return {
**item,
"source": {
**item["source"],
"data": REDACTED_IMAGE_PLACEHOLDER,
},
}

return item


def sanitize_gemini_part(part: Any) -> Any:
if not isinstance(part, dict):
return part

if (
"inline_data" in part
and isinstance(part["inline_data"], dict)
and "data" in part["inline_data"]
):
# For Gemini, the inline_data structure indicates base64 data
# We should redact any string data in this context
return {
**part,
"inline_data": {
**part["inline_data"],
"data": REDACTED_IMAGE_PLACEHOLDER,
},
}

return part


def process_gemini_item(item: Any) -> Any:
if not isinstance(item, dict):
return item

if "parts" in item and item["parts"]:
parts = item["parts"]
if isinstance(parts, list):
parts = [sanitize_gemini_part(part) for part in parts]
else:
parts = sanitize_gemini_part(parts)

return {**item, "parts": parts}

return item


def sanitize_langchain_image(item: Any) -> Any:
if not isinstance(item, dict):
return item

if (
item.get("type") == "image_url"
and isinstance(item.get("image_url"), dict)
and "url" in item["image_url"]
):
return {
**item,
"image_url": {
**item["image_url"],
"url": redact_base64_data_url(item["image_url"]["url"]),
},
}

if item.get("type") == "image" and "data" in item:
return {**item, "data": redact_base64_data_url(item["data"])}

if (
item.get("type") == "image"
and isinstance(item.get("source"), dict)
and "data" in item["source"]
):
# Anthropic style - raw base64 in structured format, always redact
return {
**item,
"source": {
**item["source"],
"data": REDACTED_IMAGE_PLACEHOLDER,
},
}

if item.get("type") == "media" and "data" in item:
return {**item, "data": redact_base64_data_url(item["data"])}

return item


def sanitize_openai(data: Any) -> Any:
return process_messages(data, sanitize_openai_image)


def sanitize_openai_response(data: Any) -> Any:
return process_messages(data, sanitize_openai_response_image)


def sanitize_anthropic(data: Any) -> Any:
return process_messages(data, sanitize_anthropic_image)


def sanitize_gemini(data: Any) -> Any:
if not data:
return data

if isinstance(data, list):
return [process_gemini_item(item) for item in data]

return process_gemini_item(data)


def sanitize_langchain(data: Any) -> Any:
return process_messages(data, sanitize_langchain_image)
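A minimal end-to-end sketch of the OpenAI chat path, assuming an illustrative message list (the other `sanitize_*` entry points behave analogously for their providers' content shapes):

```python
from posthog.ai.sanitization import REDACTED_IMAGE_PLACEHOLDER, sanitize_openai

messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "What is in this image?"},
            {
                "type": "image_url",
                "image_url": {"url": "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQ=="},
            },
        ],
    }
]

sanitized = sanitize_openai(messages)

# The base64 payload is replaced with the placeholder...
assert sanitized[0]["content"][1]["image_url"]["url"] == REDACTED_IMAGE_PLACEHOLDER
# ...text parts are untouched, and the original message list is not mutated.
assert sanitized[0]["content"][0]["text"] == "What is in this image?"
assert messages[0]["content"][1]["image_url"]["url"].startswith("data:image/jpeg;base64,")
```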