3 changes: 2 additions & 1 deletion posthog/ai/anthropic/anthropic.py
@@ -16,6 +16,7 @@
merge_system_prompt,
with_privacy_mode,
)
+from posthog.ai.sanitization import sanitize_anthropic
from posthog.client import Client as PostHogClient
from posthog import setup

@@ -184,7 +185,7 @@ def _capture_streaming_event(
"$ai_input": with_privacy_mode(
self._client._ph_client,
posthog_privacy_mode,
-merge_system_prompt(kwargs, "anthropic"),
+sanitize_anthropic(merge_system_prompt(kwargs, "anthropic")),
),
"$ai_output_choices": with_privacy_mode(
self._client._ph_client,
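For reference, a minimal sketch of what this wiring does for Anthropic-style messages (the payload below is hypothetical; the same change is mirrored in the async client further down):

from posthog.ai.sanitization import sanitize_anthropic

# Hypothetical Anthropic-style message carrying an inline base64 image block.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "What is in this image?"},
            {
                "type": "image",
                "source": {"type": "base64", "media_type": "image/png", "data": "iVBORw0KGgo..."},
            },
        ],
    }
]

sanitized = sanitize_anthropic(messages)
# source["data"] is replaced with "[base64 image redacted]"; the text block,
# media_type, and the rest of the message structure are preserved.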
3 changes: 2 additions & 1 deletion posthog/ai/anthropic/anthropic_async.py
@@ -17,6 +17,7 @@
merge_system_prompt,
with_privacy_mode,
)
+from posthog.ai.sanitization import sanitize_anthropic
from posthog.client import Client as PostHogClient


@@ -184,7 +185,7 @@ async def _capture_streaming_event(
"$ai_input": with_privacy_mode(
self._client._ph_client,
posthog_privacy_mode,
-merge_system_prompt(kwargs, "anthropic"),
+sanitize_anthropic(merge_system_prompt(kwargs, "anthropic")),
),
"$ai_output_choices": with_privacy_mode(
self._client._ph_client,
3 changes: 2 additions & 1 deletion posthog/ai/gemini/gemini.py
@@ -16,6 +16,7 @@
get_model_params,
with_privacy_mode,
)
+from posthog.ai.sanitization import sanitize_gemini
from posthog.client import Client as PostHogClient


@@ -347,7 +348,7 @@ def _capture_streaming_event(
"$ai_input": with_privacy_mode(
self._ph_client,
privacy_mode,
-self._format_input(contents),
+sanitize_gemini(self._format_input(contents)),
),
"$ai_output_choices": with_privacy_mode(
self._ph_client,
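A quick sketch of the Gemini shape sanitize_gemini handles (hypothetical payload; any inline_data part is redacted regardless of its contents). Note that in the diff the sanitizer runs on the output of self._format_input(contents), so the exact shape it sees depends on that helper:

from posthog.ai.sanitization import sanitize_gemini

# Hypothetical Gemini-style content item with text and inline_data parts.
contents = [
    {
        "role": "user",
        "parts": [
            {"text": "Describe this picture"},
            {"inline_data": {"mime_type": "image/jpeg", "data": "/9j/4AAQSkZJRg..."}},
        ],
    }
]

sanitized = sanitize_gemini(contents)
# inline_data["data"] becomes "[base64 image redacted]"; the text part is untouched.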
5 changes: 3 additions & 2 deletions posthog/ai/langchain/callbacks.py
@@ -37,6 +37,7 @@

from posthog import setup
from posthog.ai.utils import get_model_params, with_privacy_mode
+from posthog.ai.sanitization import sanitize_langchain
from posthog.client import Client

log = logging.getLogger("posthog")
@@ -480,7 +481,7 @@ def _capture_trace_or_span(
event_properties = {
"$ai_trace_id": trace_id,
"$ai_input_state": with_privacy_mode(
-self._ph_client, self._privacy_mode, run.input
+self._ph_client, self._privacy_mode, sanitize_langchain(run.input)
),
"$ai_latency": run.latency,
"$ai_span_name": run.name,
@@ -550,7 +551,7 @@ def _capture_generation(
"$ai_model": run.model,
"$ai_model_parameters": run.model_params,
"$ai_input": with_privacy_mode(
-self._ph_client, self._privacy_mode, run.input
+self._ph_client, self._privacy_mode, sanitize_langchain(run.input)
),
"$ai_http_status": 200,
"$ai_latency": run.latency,
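sanitize_langchain covers several content-block shapes a LangChain run input can carry: OpenAI-style image_url dicts, Anthropic-style source blocks, and image/media blocks with a top-level data field. A hypothetical sketch, assuming run.input holds message dicts in one of those formats:

from posthog.ai.sanitization import sanitize_langchain

# Hypothetical LangChain message mixing two image block formats.
run_input = [
    {
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": "data:image/png;base64,iVBORw0KGgo..."}},
            {"type": "image", "source": {"type": "base64", "data": "iVBORw0KGgo..."}},
        ],
    }
]

sanitized = sanitize_langchain(run_input)
# Both the data URL and the raw base64 payload become "[base64 image redacted]";
# ordinary https:// image URLs would pass through unchanged.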
13 changes: 10 additions & 3 deletions posthog/ai/openai/openai.py
@@ -15,6 +15,7 @@
get_model_params,
with_privacy_mode,
)
+from posthog.ai.sanitization import sanitize_openai, sanitize_openai_response
from posthog.client import Client as PostHogClient
from posthog import setup

@@ -194,7 +195,7 @@ def _capture_streaming_event(
"$ai_model": kwargs.get("model"),
"$ai_model_parameters": get_model_params(kwargs),
"$ai_input": with_privacy_mode(
self._client._ph_client, posthog_privacy_mode, kwargs.get("input")
self._client._ph_client,
posthog_privacy_mode,
sanitize_openai_response(kwargs.get("input")),
),
"$ai_output_choices": with_privacy_mode(
self._client._ph_client,
@@ -427,7 +430,7 @@ def _capture_streaming_event(
"$ai_model": kwargs.get("model"),
"$ai_model_parameters": get_model_params(kwargs),
"$ai_input": with_privacy_mode(
self._client._ph_client, posthog_privacy_mode, kwargs.get("messages")
self._client._ph_client,
posthog_privacy_mode,
sanitize_openai(kwargs.get("messages")),
),
"$ai_output_choices": with_privacy_mode(
self._client._ph_client,
@@ -518,7 +523,7 @@ def create(
"$ai_provider": "openai",
"$ai_model": kwargs.get("model"),
"$ai_input": with_privacy_mode(
self._client._ph_client, posthog_privacy_mode, kwargs.get("input")
self._client._ph_client,
posthog_privacy_mode,
sanitize_openai_response(kwargs.get("input")),
),
"$ai_http_status": 200,
"$ai_input_tokens": usage_stats.get("prompt_tokens", 0),
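Two different sanitizers are wired in here because the Chat Completions and Responses APIs embed images differently: sanitize_openai expects {"type": "image_url", "image_url": {"url": ...}} blocks inside messages, while sanitize_openai_response expects {"type": "input_image", "image_url": ...} blocks inside input. A hypothetical sketch of both (the async client below is wired identically):

from posthog.ai.sanitization import sanitize_openai, sanitize_openai_response

# Chat Completions style (kwargs["messages"]).
chat_messages = [
    {
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": "data:image/png;base64,iVBORw0KGgo..."}},
        ],
    }
]

# Responses API style (kwargs["input"]).
response_input = [
    {
        "role": "user",
        "content": [
            {"type": "input_image", "image_url": "data:image/png;base64,iVBORw0KGgo..."},
        ],
    }
]

sanitize_openai(chat_messages)            # the nested image_url["url"] is redacted
sanitize_openai_response(response_input)  # the flat image_url string is redacted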
13 changes: 10 additions & 3 deletions posthog/ai/openai/openai_async.py
@@ -16,6 +16,7 @@
get_model_params,
with_privacy_mode,
)
+from posthog.ai.sanitization import sanitize_openai, sanitize_openai_response
from posthog.client import Client as PostHogClient


@@ -195,7 +196,7 @@ async def _capture_streaming_event(
"$ai_model": kwargs.get("model"),
"$ai_model_parameters": get_model_params(kwargs),
"$ai_input": with_privacy_mode(
-self._client._ph_client, posthog_privacy_mode, kwargs.get("input")
+self._client._ph_client,
+posthog_privacy_mode,
+sanitize_openai_response(kwargs.get("input")),
),
"$ai_output_choices": with_privacy_mode(
self._client._ph_client,
@@ -431,7 +434,7 @@ async def _capture_streaming_event(
"$ai_model": kwargs.get("model"),
"$ai_model_parameters": get_model_params(kwargs),
"$ai_input": with_privacy_mode(
-self._client._ph_client, posthog_privacy_mode, kwargs.get("messages")
+self._client._ph_client,
+posthog_privacy_mode,
+sanitize_openai(kwargs.get("messages")),
),
"$ai_output_choices": with_privacy_mode(
self._client._ph_client,
@@ -522,7 +527,7 @@ async def create(
"$ai_provider": "openai",
"$ai_model": kwargs.get("model"),
"$ai_input": with_privacy_mode(
-self._client._ph_client, posthog_privacy_mode, kwargs.get("input")
+self._client._ph_client,
+posthog_privacy_mode,
+sanitize_openai_response(kwargs.get("input")),
),
"$ai_http_status": 200,
"$ai_input_tokens": usage_stats.get("prompt_tokens", 0),
226 changes: 226 additions & 0 deletions posthog/ai/sanitization.py
@@ -0,0 +1,226 @@
import re
from typing import Any
from urllib.parse import urlparse

REDACTED_IMAGE_PLACEHOLDER = "[base64 image redacted]"


def is_base64_data_url(text: str) -> bool:
return re.match(r"^data:([^;]+);base64,", text) is not None


def is_valid_url(text: str) -> bool:
try:
result = urlparse(text)
return bool(result.scheme and result.netloc)
except Exception:
pass

return text.startswith(("/", "./", "../"))


def is_raw_base64(text: str) -> bool:
if is_valid_url(text):
return False

return len(text) > 20 and re.match(r"^[A-Za-z0-9+/]+=*$", text) is not None


def redact_base64_data_url(value: Any) -> Any:
if not isinstance(value, str):
return value

if is_base64_data_url(value):
return REDACTED_IMAGE_PLACEHOLDER

if is_raw_base64(value):
return REDACTED_IMAGE_PLACEHOLDER

return value


def process_messages(messages: Any, transform_content_func) -> Any:
if not messages:
return messages

def process_content(content: Any) -> Any:
if isinstance(content, str):
return content

if not content:
return content

if isinstance(content, list):
return [transform_content_func(item) for item in content]

return transform_content_func(content)

def process_message(msg: Any) -> Any:
if not isinstance(msg, dict) or "content" not in msg:
return msg
return {**msg, "content": process_content(msg["content"])}

if isinstance(messages, list):
return [process_message(msg) for msg in messages]

return process_message(messages)


def sanitize_openai_image(item: Any) -> Any:
if not isinstance(item, dict):
return item

if (
item.get("type") == "image_url"
and isinstance(item.get("image_url"), dict)
and "url" in item["image_url"]
):
return {
**item,
"image_url": {
**item["image_url"],
"url": redact_base64_data_url(item["image_url"]["url"]),
},
}

return item


def sanitize_openai_response_image(item: Any) -> Any:
if not isinstance(item, dict):
return item

if item.get("type") == "input_image" and "image_url" in item:
return {
**item,
"image_url": redact_base64_data_url(item["image_url"]),
}

return item


def sanitize_anthropic_image(item: Any) -> Any:
if not isinstance(item, dict):
return item

if (
item.get("type") == "image"
and isinstance(item.get("source"), dict)
and item["source"].get("type") == "base64"
and "data" in item["source"]
):
# For Anthropic, if the source type is "base64", we should always redact the data
# The provider is explicitly telling us this is base64 data
return {
**item,
"source": {
**item["source"],
"data": REDACTED_IMAGE_PLACEHOLDER,
},
}

return item


def sanitize_gemini_part(part: Any) -> Any:
if not isinstance(part, dict):
return part

if (
"inline_data" in part
and isinstance(part["inline_data"], dict)
and "data" in part["inline_data"]
):
# For Gemini, the inline_data structure indicates base64 data
# We should redact any string data in this context
return {
**part,
"inline_data": {
**part["inline_data"],
"data": REDACTED_IMAGE_PLACEHOLDER,
},
}

return part


def process_gemini_item(item: Any) -> Any:
if not isinstance(item, dict):
return item

if "parts" in item and item["parts"]:
parts = item["parts"]
if isinstance(parts, list):
parts = [sanitize_gemini_part(part) for part in parts]
else:
parts = sanitize_gemini_part(parts)

return {**item, "parts": parts}

return item


def sanitize_langchain_image(item: Any) -> Any:
if not isinstance(item, dict):
return item

if (
item.get("type") == "image_url"
and isinstance(item.get("image_url"), dict)
and "url" in item["image_url"]
):
return {
**item,
"image_url": {
**item["image_url"],
"url": redact_base64_data_url(item["image_url"]["url"]),
},
}

if item.get("type") == "image" and "data" in item:
return {**item, "data": redact_base64_data_url(item["data"])}

if (
item.get("type") == "image"
and isinstance(item.get("source"), dict)
and "data" in item["source"]
):
# Anthropic style - raw base64 in structured format, always redact
return {
**item,
"source": {
**item["source"],
"data": REDACTED_IMAGE_PLACEHOLDER,
},
}

if item.get("type") == "media" and "data" in item:
return {**item, "data": redact_base64_data_url(item["data"])}

return item


def sanitize_openai(data: Any) -> Any:
return process_messages(data, sanitize_openai_image)


def sanitize_openai_response(data: Any) -> Any:
return process_messages(data, sanitize_openai_response_image)


def sanitize_anthropic(data: Any) -> Any:
return process_messages(data, sanitize_anthropic_image)


def sanitize_gemini(data: Any) -> Any:
if not data:
return data

if isinstance(data, list):
return [process_gemini_item(item) for item in data]

return process_gemini_item(data)


def sanitize_langchain(data: Any) -> Any:
return process_messages(data, sanitize_langchain_image)
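As a minimal usage sketch of the underlying redaction heuristic: base64 data URLs and long raw base64 strings are replaced with the placeholder, while ordinary URLs and other non-base64 strings are left alone (example values are made up):

from posthog.ai.sanitization import redact_base64_data_url

redact_base64_data_url("data:image/png;base64,iVBORw0KGgoAAAANSUhEUg")  # "[base64 image redacted]"
redact_base64_data_url("iVBORw0KGgoAAAANSUhEUgAAAAEAAAAB")              # "[base64 image redacted]"
redact_base64_data_url("https://example.com/cat.png")                   # unchanged
redact_base64_data_url("plain prompt text, not an image")               # unchanged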