From ed4574abe7edb6fccf0e64d741580dabfb0501f3 Mon Sep 17 00:00:00 2001 From: Alex Lebedev Date: Thu, 27 Nov 2025 21:13:18 +0100 Subject: [PATCH 1/6] feat: Filter out the events before the replay start. --- ee/hogai/session_summaries/constants.py | 3 ++ .../session_summaries/session/input_data.py | 18 +++++++++- .../session_summaries/session/output_data.py | 35 ++++++++++--------- .../session/summarize_session.py | 5 ++- ee/hogai/session_summaries/tests/conftest.py | 10 ++++-- ee/hogai/session_summaries/utils.py | 10 ++++++ .../summarize_session_group.py | 5 ++- 7 files changed, 64 insertions(+), 22 deletions(-) diff --git a/ee/hogai/session_summaries/constants.py b/ee/hogai/session_summaries/constants.py index 3ec309a881eb4..71bf35115d472 100644 --- a/ee/hogai/session_summaries/constants.py +++ b/ee/hogai/session_summaries/constants.py @@ -15,6 +15,9 @@ HALLUCINATED_EVENTS_MIN_RATIO = 0.15 # If more than 15% of events in the summary hallucinated, fail the summarization # Minimum number of sessions to use group summary logic (find patterns) instead of summarizing them separately GROUP_SUMMARIES_MIN_SESSIONS = 5 +EVENTS_BEFORE_REPLAY_START_THRESHOLD_MS = ( + 1000 # Don't include events before the cutoff as they are not visible in the replay +) # Temporal SESSION_SUMMARIES_DB_DATA_REDIS_TTL = 60 * 60 * 24 # How long to store the DB data in Redis within Temporal jobs diff --git a/ee/hogai/session_summaries/session/input_data.py b/ee/hogai/session_summaries/session/input_data.py index c831496771bdd..90bd660853253 100644 --- a/ee/hogai/session_summaries/session/input_data.py +++ b/ee/hogai/session_summaries/session/input_data.py @@ -10,11 +10,12 @@ from posthog.session_recordings.models.metadata import RecordingMetadata from posthog.session_recordings.queries.session_replay_events import SessionReplayEvents +from ee.hogai.session_summaries.constants import EVENTS_BEFORE_REPLAY_START_THRESHOLD_MS from ee.hogai.session_summaries.local.input_data import ( _get_production_session_events_locally, _get_production_session_metadata_locally, ) -from ee.hogai.session_summaries.utils import get_column_index +from ee.hogai.session_summaries.utils import calculate_time_since_start, get_column_index logger = structlog.get_logger(__name__) @@ -179,7 +180,9 @@ def add_context_and_filter_events( session_events_columns: list[str], session_events: list[tuple[str | datetime.datetime | list[str] | None, ...]], session_id: str, + session_start_time: datetime.datetime, ) -> tuple[list[str], list[tuple[str | datetime.datetime | list[str] | None, ...]]]: + timestamp_index = get_column_index(session_events_columns, "timestamp") indexes = { "event": get_column_index(session_events_columns, "event"), "$event_type": get_column_index(session_events_columns, "$event_type"), @@ -199,7 +202,20 @@ def add_context_and_filter_events( i for i, col in enumerate(session_events_columns) if col not in COLUMNS_TO_REMOVE_FROM_LLM_CONTEXT ] updated_events = [] + # Events are chronologically ordered, so once we find an event after replay start, all subsequent events are too + past_replay_start = False for event in session_events: + # Filter out events that occurred before or exactly at replay start, as we can't confirm them with video + if not past_replay_start: + event_timestamp = event[timestamp_index] + if not isinstance(event_timestamp, str) and not isinstance(event_timestamp, datetime.datetime): + msg = f"Event timestamp is not a string or datetime: {event_timestamp}" + logger.error(msg, signals_type="session-summaries", 
session_id=session_id) + raise ValueError(msg) + ms_since_start = calculate_time_since_start(event_timestamp, session_start_time) + if ms_since_start <= EVENTS_BEFORE_REPLAY_START_THRESHOLD_MS: + continue + past_replay_start = True updated_event: list[str | datetime.datetime | list[str] | None] = list(event) # Check for errors worth keeping in the context if event[indexes["event"]] == "$exception": diff --git a/ee/hogai/session_summaries/session/output_data.py b/ee/hogai/session_summaries/session/output_data.py index abec7fd5a4a1c..a5c52250e2976 100644 --- a/ee/hogai/session_summaries/session/output_data.py +++ b/ee/hogai/session_summaries/session/output_data.py @@ -1,4 +1,3 @@ -from datetime import datetime from enum import Enum from typing import Any @@ -7,7 +6,12 @@ from ee.hogai.session_summaries import SummaryValidationError from ee.hogai.session_summaries.constants import HALLUCINATED_EVENTS_MIN_RATIO -from ee.hogai.session_summaries.utils import get_column_index, prepare_datetime, unpack_full_event_id +from ee.hogai.session_summaries.utils import ( + calculate_time_since_start, + get_column_index, + prepare_datetime, + unpack_full_event_id, +) from ee.hogai.utils.yaml import load_yaml_from_raw_llm_content logger = structlog.get_logger(__name__) @@ -274,7 +278,8 @@ def _remove_hallucinated_events( f"Too many hallucinated events ({len(hallucinated_events)}/{total_summary_events}) for session id ({session_id})" f"in the raw session summary: {[x[-1] for x in hallucinated_events]} " ) - logger.error(msg, session_id=session_id, signals_type="session-summaries") + if final_validation: + logger.error(msg, session_id=session_id, signals_type="session-summaries") raise SummaryValidationError(msg) # Reverse to not break indexes for group_index, event_index, event in reversed(hallucinated_events): @@ -362,14 +367,6 @@ def load_raw_session_summary_from_llm_content( return raw_session_summary -# TODO Rework the logic, so events before the recording are marked as "LOAD", not 00:00 -def calculate_time_since_start(session_timestamp: str, session_start_time: datetime | None) -> int | None: - if not session_start_time or not session_timestamp: - return None - timestamp_datetime = datetime.fromisoformat(session_timestamp) - return max(0, int((timestamp_datetime - session_start_time).total_seconds() * 1000)) - - def _validate_enriched_summary( data: dict[str, Any], session_id: str, final_validation: bool ) -> SessionSummarySerializer: @@ -454,6 +451,7 @@ def _calculate_segment_meta( raw_key_actions: list[dict[str, Any]] | None, session_duration: int, session_id: str, + final_validation: bool, ) -> SegmentMetaSerializer: # Find first and the last event in the segment segment_index = raw_segment.get("index") @@ -599,11 +597,12 @@ def _calculate_segment_meta( # TODO: Factor of two is arbitrary, find a better solution if duration <= 0 or fallback_duration // duration > 2: # Checking only duration as events are sorted chronologically - logger.warning( - f"Duration change is drastic (fallback: {fallback_duration} -> segments: {duration}) - using fallback data for session_id {session_id}", - session_id=session_id, - signals_type="session-summaries", - ) + if final_validation: + logger.warning( + f"Duration change is drastic (fallback: {fallback_duration} -> segments: {duration}) - using fallback data for session_id {session_id}", + session_id=session_id, + signals_type="session-summaries", + ) segment_meta_data["duration"] = fallback_duration segment_meta_data["duration_percentage"] = 
fallback_duration_percentage segment_meta_data["events_count"] = fallback_events_count @@ -657,12 +656,14 @@ def enrich_raw_session_summary_with_meta( simplified_events_mapping=simplified_events_mapping, raw_key_actions=raw_key_actions, session_id=session_id, + final_validation=final_validation, ) # Validate the serializer to be able to use `.data` if not segment_meta.is_valid(): # Most of the fields are optional, so failed validation should be reported msg = f"Error validating segment meta against the schema when summarizing session_id {session_id}: {segment_meta.errors}" - logger.error(msg, session_id=session_id, signals_type="session-summaries") + if final_validation: + logger.error(msg, session_id=session_id, signals_type="session-summaries") raise SummaryValidationError(msg) enriched_segment["meta"] = segment_meta.data enriched_segments.append(enriched_segment) diff --git a/ee/hogai/session_summaries/session/summarize_session.py b/ee/hogai/session_summaries/session/summarize_session.py index 3073455e8cbd4..68213621fbbd0 100644 --- a/ee/hogai/session_summaries/session/summarize_session.py +++ b/ee/hogai/session_summaries/session/summarize_session.py @@ -101,7 +101,10 @@ async def get_session_data_from_db(session_id: str, team_id: int, local_reads_pr # Raise any unexpected errors raise session_events_columns, session_events = add_context_and_filter_events( - session_events_columns=session_events_columns, session_events=session_events, session_id=session_id + session_events_columns=session_events_columns, + session_events=session_events, + session_id=session_id, + session_start_time=session_metadata["start_time"], ) # TODO Get web analytics data on URLs to better understand what the user was doing diff --git a/ee/hogai/session_summaries/tests/conftest.py b/ee/hogai/session_summaries/tests/conftest.py index b3a337073718f..0c7bf2dab7435 100644 --- a/ee/hogai/session_summaries/tests/conftest.py +++ b/ee/hogai/session_summaries/tests/conftest.py @@ -476,7 +476,13 @@ def mock_chat_completion(mock_valid_llm_yaml_response: str) -> ChatCompletion: @pytest.fixture -def mock_raw_metadata(mock_session_id: str) -> dict[str, Any]: +def mock_session_start_time() -> datetime: + """Session replay start time - events before this should be filtered out""" + return datetime(2025, 3, 31, 18, 40, 32, 302000, tzinfo=UTC) + + +@pytest.fixture +def mock_raw_metadata(mock_session_id: str, mock_session_start_time: datetime) -> dict[str, Any]: return { "id": mock_session_id, # Anonymized distinct_id for testing @@ -486,7 +492,7 @@ def mock_raw_metadata(mock_session_id: str) -> dict[str, Any]: "recording_duration": 5323, "active_seconds": 1947, "inactive_seconds": 3375, - "start_time": "2025-03-31T18:40:32.302000Z", + "start_time": mock_session_start_time, "end_time": "2025-03-31T18:54:15.789000Z", "click_count": 679, "keypress_count": 668, diff --git a/ee/hogai/session_summaries/utils.py b/ee/hogai/session_summaries/utils.py index 244b976e62499..e5ddb51f76384 100644 --- a/ee/hogai/session_summaries/utils.py +++ b/ee/hogai/session_summaries/utils.py @@ -168,3 +168,13 @@ def logging_session_ids(session_ids: list[str]) -> str: """Log a list of session ids in a readable format.""" # Having 150 chars (4 uuids) is enough to identify the sessions and stay readable return f"Session IDs: {str(session_ids)[:MAX_SESSION_IDS_COMBINED_LOGGING_LENGTH]}" + + +def calculate_time_since_start(event_timestamp: str | datetime, session_start_time: datetime) -> int: + """ + Calculate milliseconds between event timestamp and session 
start time. + Returns 0 for events that occurred before or exactly at session start. + """ + if isinstance(event_timestamp, str): + event_timestamp = datetime.fromisoformat(event_timestamp) + return max(0, int((event_timestamp - session_start_time).total_seconds() * 1000)) diff --git a/posthog/temporal/ai/session_summary/summarize_session_group.py b/posthog/temporal/ai/session_summary/summarize_session_group.py index 8da845ad21350..fbda41e92786d 100644 --- a/posthog/temporal/ai/session_summary/summarize_session_group.py +++ b/posthog/temporal/ai/session_summary/summarize_session_group.py @@ -183,7 +183,10 @@ async def fetch_session_batch_events_activity( continue # Prepare the data to be used by the next activity filtered_columns, filtered_events = add_context_and_filter_events( - session_events_columns=columns, session_events=session_events, session_id=session_id + session_events_columns=columns, + session_events=session_events, + session_id=session_id, + session_start_time=session_metadata["start_time"], ) session_db_data = SessionSummaryDBData( session_metadata=session_metadata, session_events_columns=filtered_columns, session_events=filtered_events From 3d90014a8b46dccba75067ac1284a091ac6f3dc0 Mon Sep 17 00:00:00 2001 From: Alex Lebedev Date: Thu, 27 Nov 2025 21:13:35 +0100 Subject: [PATCH 2/6] feat: Update tests. --- .../tests/test_input_data.py | 107 ++++++++++++++++-- .../tests/test_output_data.py | 6 +- 2 files changed, 101 insertions(+), 12 deletions(-) diff --git a/ee/hogai/session_summaries/tests/test_input_data.py b/ee/hogai/session_summaries/tests/test_input_data.py index b92b400506540..6cc77da61d4bc 100644 --- a/ee/hogai/session_summaries/tests/test_input_data.py +++ b/ee/hogai/session_summaries/tests/test_input_data.py @@ -1,5 +1,5 @@ import json -from datetime import datetime +from datetime import UTC, datetime from typing import Any import pytest @@ -17,6 +17,9 @@ get_session_events, ) +# Timestamp after the mock session start time (2025-03-31T18:40:32.302000Z) +MOCK_EVENT_TIMESTAMP = datetime(2025, 3, 31, 18, 40, 39, 302000, tzinfo=UTC) + @pytest.fixture def mock_event_indexes(mock_raw_events_columns: list[str]) -> dict[str, int]: @@ -95,7 +98,7 @@ def test_get_improved_elements_chain_elements(): ( ( "$autocapture", - None, + MOCK_EVENT_TIMESTAMP, "", ["Click me"], [], @@ -113,7 +116,7 @@ def test_get_improved_elements_chain_elements(): ), ( "$autocapture", - None, + MOCK_EVENT_TIMESTAMP, "", ["Click me"], ["button"], @@ -131,7 +134,7 @@ def test_get_improved_elements_chain_elements(): ( ( "$autocapture", - None, + MOCK_EVENT_TIMESTAMP, "", [], [], @@ -155,7 +158,7 @@ def test_get_improved_elements_chain_elements(): ( ( "$autocapture", - None, + MOCK_EVENT_TIMESTAMP, "", [], [], @@ -173,7 +176,7 @@ def test_get_improved_elements_chain_elements(): ), ( "$autocapture", - None, + MOCK_EVENT_TIMESTAMP, "", ["Click me", "Create project"], ["button", "a"], @@ -191,7 +194,7 @@ def test_get_improved_elements_chain_elements(): ( ( "user_clicked_button", - None, + MOCK_EVENT_TIMESTAMP, "", [], [], @@ -209,7 +212,7 @@ def test_get_improved_elements_chain_elements(): ), ( "user_clicked_button", - None, + MOCK_EVENT_TIMESTAMP, "", [], [], @@ -227,13 +230,17 @@ def test_get_improved_elements_chain_elements(): ) def test_add_context_and_filter_events( mock_event_indexes: dict[str, int], + mock_session_start_time: datetime, input_event: tuple[Any, ...], expected_event: tuple[Any, ...] 
| None, should_keep: bool, ): test_columns = list(mock_event_indexes.keys()) updated_columns, updated_events = add_context_and_filter_events( - session_events_columns=test_columns, session_events=[input_event], session_id="test_session_id" + session_events_columns=test_columns, + session_events=[input_event], + session_id="test_session_id", + session_start_time=mock_session_start_time, ) # Check columns are updated (and columns excessive from LLM context are removed) @@ -249,6 +256,88 @@ def test_add_context_and_filter_events( assert len(updated_events) == 0 +@pytest.mark.parametrize( + "event_timestamps,expected_kept_count", + [ + # All events before replay start - none kept + ( + [ + datetime(2025, 3, 31, 18, 40, 30, 0, tzinfo=UTC), # Before start + datetime(2025, 3, 31, 18, 40, 31, 0, tzinfo=UTC), # Before start + datetime(2025, 3, 31, 18, 40, 32, 302000, tzinfo=UTC), # Exactly at start (filtered) + ], + 0, + ), + # First event before, second at start, third after - only third kept + ( + [ + datetime(2025, 3, 31, 18, 40, 30, 0, tzinfo=UTC), # Before start + datetime(2025, 3, 31, 18, 40, 32, 302000, tzinfo=UTC), # Exactly at start (filtered) + datetime(2025, 3, 31, 18, 40, 33, 0, tzinfo=UTC), # After start + ], + 1, + ), + # All events after replay start - all kept + ( + [ + datetime(2025, 3, 31, 18, 40, 33, 0, tzinfo=UTC), # After start + datetime(2025, 3, 31, 18, 40, 34, 0, tzinfo=UTC), # After start + datetime(2025, 3, 31, 18, 40, 35, 0, tzinfo=UTC), # After start + ], + 3, + ), + # Mix: two before, one after - one kept + ( + [ + datetime(2025, 3, 31, 18, 40, 30, 0, tzinfo=UTC), # Before start + datetime(2025, 3, 31, 18, 40, 31, 0, tzinfo=UTC), # Before start + datetime(2025, 3, 31, 18, 40, 39, 302000, tzinfo=UTC), # After start + ], + 1, + ), + ], +) +def test_filter_events_before_replay_start( + mock_raw_events_columns: list[str], + mock_session_start_time: datetime, + event_timestamps: list[datetime], + expected_kept_count: int, +): + """Test that events occurring before or exactly at replay start are filtered out.""" + # Create events with different timestamps but valid context (so they're not filtered for other reasons) + events: list[tuple[Any, ...]] = [] + for i, ts in enumerate(event_timestamps): + events.append( + ( + "$pageview", # System event - not filtered for lack of context + ts, + "", + [], + [], + None, + None, + None, + [], + "", + [], + [], + [], + [], + [], + f"00000000-0000-0000-0001-00000000000{i}", + ) + ) + + updated_columns, updated_events = add_context_and_filter_events( + session_events_columns=mock_raw_events_columns, + session_events=events, + session_id="test_session_id", + session_start_time=mock_session_start_time, + ) + + assert len(updated_events) == expected_kept_count + + @pytest.mark.parametrize( "pages_data,expected_count,expected_iterations,expected_error", [ diff --git a/ee/hogai/session_summaries/tests/test_output_data.py b/ee/hogai/session_summaries/tests/test_output_data.py index 185e423b7ceef..98d7485cefd76 100644 --- a/ee/hogai/session_summaries/tests/test_output_data.py +++ b/ee/hogai/session_summaries/tests/test_output_data.py @@ -194,12 +194,12 @@ def test_load_raw_session_summary_invalid_schema( ("2024-03-01T12:00:02+00:00", datetime(2024, 3, 1, 12, 0, 0, tzinfo=UTC), 2000), # 2 seconds after ("2024-03-01T12:00:00+00:00", datetime(2024, 3, 1, 12, 0, 0, tzinfo=UTC), 0), # same time ("2024-03-01T11:59:59+00:00", datetime(2024, 3, 1, 12, 0, 0, tzinfo=UTC), 0), # 1 second before (clamped to 0) - (None, datetime(2024, 3, 1, 12, 0, 0, 
tzinfo=UTC), None), # no event time - ("2024-03-01T12:00:02+00:00", None, None), # no start time ("2024-03-01T13:00:00+00:00", datetime(2024, 3, 1, 12, 0, 0, tzinfo=UTC), 3600000), # 1 hour after + # Also accepts datetime objects for event_time + (datetime(2024, 3, 1, 12, 0, 2, tzinfo=UTC), datetime(2024, 3, 1, 12, 0, 0, tzinfo=UTC), 2000), ], ) -def test_calculate_time_since_start(event_time: str, start_time: datetime, expected: int) -> None: +def test_calculate_time_since_start(event_time: str | datetime, start_time: datetime, expected: int) -> None: result = calculate_time_since_start(event_time, start_time) assert result == expected From 6b04622a5a6cc81abbff8292538fcfad7ceaddf3 Mon Sep 17 00:00:00 2001 From: Alex Lebedev Date: Fri, 28 Nov 2025 11:17:44 +0100 Subject: [PATCH 3/6] chore: Increase threshold to 5s. --- ee/hogai/session_summaries/constants.py | 6 ++-- .../tests/test_input_data.py | 29 ++++++++++--------- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/ee/hogai/session_summaries/constants.py b/ee/hogai/session_summaries/constants.py index e192799dff7da..004cc3452b3a6 100644 --- a/ee/hogai/session_summaries/constants.py +++ b/ee/hogai/session_summaries/constants.py @@ -15,9 +15,9 @@ HALLUCINATED_EVENTS_MIN_RATIO = 0.15 # If more than 15% of events in the summary hallucinated, fail the summarization # Minimum number of sessions to use group summary logic (find patterns) instead of summarizing them separately GROUP_SUMMARIES_MIN_SESSIONS = 5 -EVENTS_BEFORE_REPLAY_START_THRESHOLD_MS = ( - 1000 # Don't include events before the cutoff as they are not visible in the replay -) +# Don't include events that are happened before the replay started or at the very start, as we can't verify them with videos +# (first N seconds are hardest to render), iterate if we find a better way to generate Replay videos +EVENTS_BEFORE_REPLAY_START_THRESHOLD_MS = 5000 # Temporal SESSION_SUMMARIES_DB_DATA_REDIS_TTL = 60 * 60 * 24 # How long to store the DB data in Redis within Temporal jobs diff --git a/ee/hogai/session_summaries/tests/test_input_data.py b/ee/hogai/session_summaries/tests/test_input_data.py index 6cc77da61d4bc..a33a7f94deae7 100644 --- a/ee/hogai/session_summaries/tests/test_input_data.py +++ b/ee/hogai/session_summaries/tests/test_input_data.py @@ -259,39 +259,40 @@ def test_add_context_and_filter_events( @pytest.mark.parametrize( "event_timestamps,expected_kept_count", [ - # All events before replay start - none kept + # All events within 5s threshold - none kept + # Session starts at 18:40:32.302000, threshold is 5000ms, so events at or before 18:40:37.302000 are filtered ( [ datetime(2025, 3, 31, 18, 40, 30, 0, tzinfo=UTC), # Before start - datetime(2025, 3, 31, 18, 40, 31, 0, tzinfo=UTC), # Before start datetime(2025, 3, 31, 18, 40, 32, 302000, tzinfo=UTC), # Exactly at start (filtered) + datetime(2025, 3, 31, 18, 40, 37, 302000, tzinfo=UTC), # Exactly at threshold (filtered) ], 0, ), - # First event before, second at start, third after - only third kept + # First two within threshold, third after - only third kept ( [ - datetime(2025, 3, 31, 18, 40, 30, 0, tzinfo=UTC), # Before start - datetime(2025, 3, 31, 18, 40, 32, 302000, tzinfo=UTC), # Exactly at start (filtered) - datetime(2025, 3, 31, 18, 40, 33, 0, tzinfo=UTC), # After start + datetime(2025, 3, 31, 18, 40, 30, 0, tzinfo=UTC), # Before start (filtered) + datetime(2025, 3, 31, 18, 40, 35, 0, tzinfo=UTC), # Within 5s threshold (filtered) + datetime(2025, 3, 31, 18, 40, 38, 0, tzinfo=UTC), # After threshold 
(~5.7s after start) ], 1, ), - # All events after replay start - all kept + # All events after 5s threshold - all kept ( [ - datetime(2025, 3, 31, 18, 40, 33, 0, tzinfo=UTC), # After start - datetime(2025, 3, 31, 18, 40, 34, 0, tzinfo=UTC), # After start - datetime(2025, 3, 31, 18, 40, 35, 0, tzinfo=UTC), # After start + datetime(2025, 3, 31, 18, 40, 38, 0, tzinfo=UTC), # After threshold + datetime(2025, 3, 31, 18, 40, 39, 0, tzinfo=UTC), # After threshold + datetime(2025, 3, 31, 18, 40, 40, 0, tzinfo=UTC), # After threshold ], 3, ), - # Mix: two before, one after - one kept + # Mix: two within threshold, one after - one kept ( [ - datetime(2025, 3, 31, 18, 40, 30, 0, tzinfo=UTC), # Before start - datetime(2025, 3, 31, 18, 40, 31, 0, tzinfo=UTC), # Before start - datetime(2025, 3, 31, 18, 40, 39, 302000, tzinfo=UTC), # After start + datetime(2025, 3, 31, 18, 40, 30, 0, tzinfo=UTC), # Before start (filtered) + datetime(2025, 3, 31, 18, 40, 36, 0, tzinfo=UTC), # Within 5s threshold (filtered) + datetime(2025, 3, 31, 18, 40, 39, 302000, tzinfo=UTC), # After threshold (~7s after start) ], 1, ), From 5c18bfff84d74070f210525ea395006df47a76fe Mon Sep 17 00:00:00 2001 From: Alex Lebedev Date: Fri, 28 Nov 2025 14:06:19 +0100 Subject: [PATCH 4/6] feat: Rename constant. --- ee/hogai/session_summaries/constants.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ee/hogai/session_summaries/constants.py b/ee/hogai/session_summaries/constants.py index 004cc3452b3a6..2dff6168315e8 100644 --- a/ee/hogai/session_summaries/constants.py +++ b/ee/hogai/session_summaries/constants.py @@ -15,9 +15,9 @@ HALLUCINATED_EVENTS_MIN_RATIO = 0.15 # If more than 15% of events in the summary hallucinated, fail the summarization # Minimum number of sessions to use group summary logic (find patterns) instead of summarizing them separately GROUP_SUMMARIES_MIN_SESSIONS = 5 -# Don't include events that are happened before the replay started or at the very start, as we can't verify them with videos -# (first N seconds are hardest to render), iterate if we find a better way to generate Replay videos -EVENTS_BEFORE_REPLAY_START_THRESHOLD_MS = 5000 +# Don't include events that are happened before or after the replay started, or at the very start/end, +# as we can't verify them with videos confidently,iterate if we find a better way to generate Replay videos +EVENTS_BEFORE_AFTER_REPLAY_START_THRESHOLD_MS = 5000 # Temporal SESSION_SUMMARIES_DB_DATA_REDIS_TTL = 60 * 60 * 24 # How long to store the DB data in Redis within Temporal jobs From 30bdebef2420fe01df729439f64c1a7755477c8b Mon Sep 17 00:00:00 2001 From: Alex Lebedev Date: Tue, 2 Dec 2025 12:58:50 +0200 Subject: [PATCH 5/6] chore: Rename. 
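
Rename EVENTS_BEFORE_AFTER_REPLAY_START_THRESHOLD_MS to the shorter
SESSION_EVENTS_REPLAY_CUTOFF_MS and update its import and usage in
session/input_data.py. The 5000 ms value and the filtering behaviour are
unchanged.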
--- ee/hogai/session_summaries/constants.py | 2 +- ee/hogai/session_summaries/session/input_data.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ee/hogai/session_summaries/constants.py b/ee/hogai/session_summaries/constants.py index 2dff6168315e8..457608063dfd2 100644 --- a/ee/hogai/session_summaries/constants.py +++ b/ee/hogai/session_summaries/constants.py @@ -17,7 +17,7 @@ GROUP_SUMMARIES_MIN_SESSIONS = 5 # Don't include events that are happened before or after the replay started, or at the very start/end, # as we can't verify them with videos confidently,iterate if we find a better way to generate Replay videos -EVENTS_BEFORE_AFTER_REPLAY_START_THRESHOLD_MS = 5000 +SESSION_EVENTS_REPLAY_CUTOFF_MS = 5000 # Temporal SESSION_SUMMARIES_DB_DATA_REDIS_TTL = 60 * 60 * 24 # How long to store the DB data in Redis within Temporal jobs diff --git a/ee/hogai/session_summaries/session/input_data.py b/ee/hogai/session_summaries/session/input_data.py index 90bd660853253..f7a9da5bf3f2d 100644 --- a/ee/hogai/session_summaries/session/input_data.py +++ b/ee/hogai/session_summaries/session/input_data.py @@ -10,7 +10,7 @@ from posthog.session_recordings.models.metadata import RecordingMetadata from posthog.session_recordings.queries.session_replay_events import SessionReplayEvents -from ee.hogai.session_summaries.constants import EVENTS_BEFORE_REPLAY_START_THRESHOLD_MS +from ee.hogai.session_summaries.constants import SESSION_EVENTS_REPLAY_CUTOFF_MS from ee.hogai.session_summaries.local.input_data import ( _get_production_session_events_locally, _get_production_session_metadata_locally, @@ -213,7 +213,7 @@ def add_context_and_filter_events( logger.error(msg, signals_type="session-summaries", session_id=session_id) raise ValueError(msg) ms_since_start = calculate_time_since_start(event_timestamp, session_start_time) - if ms_since_start <= EVENTS_BEFORE_REPLAY_START_THRESHOLD_MS: + if ms_since_start <= SESSION_EVENTS_REPLAY_CUTOFF_MS: continue past_replay_start = True updated_event: list[str | datetime.datetime | list[str] | None] = list(event) From 866020ed2695c451f3ace898def468b4627deb62 Mon Sep 17 00:00:00 2001 From: Alex Lebedev Date: Tue, 2 Dec 2025 14:39:17 +0200 Subject: [PATCH 6/6] feat: Also filter out events before the very end. 
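
Mirror the start-of-replay cutoff at the other end of the recording:
add_context_and_filter_events now also takes session_end_time and drops
events that fall within SESSION_EVENTS_REPLAY_CUTOFF_MS of the replay end
(or after it), since those can't be verified against the video either. A
new calculate_time_till_end helper complements calculate_time_since_start,
both callers pass session_metadata["end_time"], and the tests cover the
combined before/after filtering. Roughly, the kept range behaves like the
sketch below (illustrative only; is_verifiable_in_replay does not exist in
the codebase, and the committed code also skips redundant start checks
once past the cutoff):

    from datetime import datetime, timedelta

    SESSION_EVENTS_REPLAY_CUTOFF_MS = 5000

    def is_verifiable_in_replay(event_ts: datetime, start: datetime, end: datetime) -> bool:
        # Keep an event only if it is more than the cutoff away from both replay boundaries
        ms_since_start = max(0, int((event_ts - start).total_seconds() * 1000))
        ms_till_end = max(0, int((end - event_ts).total_seconds() * 1000))
        return (ms_since_start > SESSION_EVENTS_REPLAY_CUTOFF_MS
                and ms_till_end > SESSION_EVENTS_REPLAY_CUTOFF_MS)

    # An event 2s after the replay start is dropped, one in the middle is kept
    start = datetime(2025, 3, 31, 18, 40, 32)
    end = datetime(2025, 3, 31, 18, 54, 15)
    assert not is_verifiable_in_replay(start + timedelta(seconds=2), start, end)
    assert is_verifiable_in_replay(start + timedelta(minutes=5), start, end)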
--- .../session_summaries/session/input_data.py | 20 ++++-- .../session/summarize_session.py | 1 + ee/hogai/session_summaries/tests/conftest.py | 12 +++- .../tests/test_input_data.py | 69 ++++++++++++++++--- ee/hogai/session_summaries/utils.py | 10 +++ .../summarize_session_group.py | 1 + 6 files changed, 93 insertions(+), 20 deletions(-) diff --git a/ee/hogai/session_summaries/session/input_data.py b/ee/hogai/session_summaries/session/input_data.py index f7a9da5bf3f2d..ddc76de460272 100644 --- a/ee/hogai/session_summaries/session/input_data.py +++ b/ee/hogai/session_summaries/session/input_data.py @@ -15,7 +15,7 @@ _get_production_session_events_locally, _get_production_session_metadata_locally, ) -from ee.hogai.session_summaries.utils import calculate_time_since_start, get_column_index +from ee.hogai.session_summaries.utils import calculate_time_since_start, calculate_time_till_end, get_column_index logger = structlog.get_logger(__name__) @@ -181,6 +181,7 @@ def add_context_and_filter_events( session_events: list[tuple[str | datetime.datetime | list[str] | None, ...]], session_id: str, session_start_time: datetime.datetime, + session_end_time: datetime.datetime, ) -> tuple[list[str], list[tuple[str | datetime.datetime | list[str] | None, ...]]]: timestamp_index = get_column_index(session_events_columns, "timestamp") indexes = { @@ -205,17 +206,22 @@ def add_context_and_filter_events( # Events are chronologically ordered, so once we find an event after replay start, all subsequent events are too past_replay_start = False for event in session_events: - # Filter out events that occurred before or exactly at replay start, as we can't confirm them with video + event_timestamp = event[timestamp_index] + if not isinstance(event_timestamp, str) and not isinstance(event_timestamp, datetime.datetime): + msg = f"Event timestamp is not a string or datetime: {event_timestamp}" + logger.error(msg, signals_type="session-summaries", session_id=session_id) + raise ValueError(msg) + # Filter out events that occurred before or near replay start, as we can't confirm them with video if not past_replay_start: - event_timestamp = event[timestamp_index] - if not isinstance(event_timestamp, str) and not isinstance(event_timestamp, datetime.datetime): - msg = f"Event timestamp is not a string or datetime: {event_timestamp}" - logger.error(msg, signals_type="session-summaries", session_id=session_id) - raise ValueError(msg) ms_since_start = calculate_time_since_start(event_timestamp, session_start_time) if ms_since_start <= SESSION_EVENTS_REPLAY_CUTOFF_MS: continue + # No need to check the time from the start anymore, as events are sorted chronologically past_replay_start = True + # Filter out events that occurred after or near replay end, as we can't confirm them with video + ms_till_end = calculate_time_till_end(event_timestamp, session_end_time) + if ms_till_end <= SESSION_EVENTS_REPLAY_CUTOFF_MS: + continue updated_event: list[str | datetime.datetime | list[str] | None] = list(event) # Check for errors worth keeping in the context if event[indexes["event"]] == "$exception": diff --git a/ee/hogai/session_summaries/session/summarize_session.py b/ee/hogai/session_summaries/session/summarize_session.py index cf30226abad45..b5c156615e381 100644 --- a/ee/hogai/session_summaries/session/summarize_session.py +++ b/ee/hogai/session_summaries/session/summarize_session.py @@ -106,6 +106,7 @@ async def get_session_data_from_db(session_id: str, team_id: int, local_reads_pr session_events=session_events, 
session_id=session_id, session_start_time=session_metadata["start_time"], + session_end_time=session_metadata["end_time"], ) # TODO Get web analytics data on URLs to better understand what the user was doing diff --git a/ee/hogai/session_summaries/tests/conftest.py b/ee/hogai/session_summaries/tests/conftest.py index 0c7bf2dab7435..5e1f83551dc39 100644 --- a/ee/hogai/session_summaries/tests/conftest.py +++ b/ee/hogai/session_summaries/tests/conftest.py @@ -482,7 +482,15 @@ def mock_session_start_time() -> datetime: @pytest.fixture -def mock_raw_metadata(mock_session_id: str, mock_session_start_time: datetime) -> dict[str, Any]: +def mock_session_end_time() -> datetime: + """Session replay end time - events after this should be filtered out""" + return datetime(2025, 3, 31, 18, 54, 15, 789000, tzinfo=UTC) + + +@pytest.fixture +def mock_raw_metadata( + mock_session_id: str, mock_session_start_time: datetime, mock_session_end_time: datetime +) -> dict[str, Any]: return { "id": mock_session_id, # Anonymized distinct_id for testing @@ -493,7 +501,7 @@ def mock_raw_metadata(mock_session_id: str, mock_session_start_time: datetime) - "active_seconds": 1947, "inactive_seconds": 3375, "start_time": mock_session_start_time, - "end_time": "2025-03-31T18:54:15.789000Z", + "end_time": mock_session_end_time, "click_count": 679, "keypress_count": 668, "mouse_activity_count": 6629, diff --git a/ee/hogai/session_summaries/tests/test_input_data.py b/ee/hogai/session_summaries/tests/test_input_data.py index a33a7f94deae7..3a00331a88eb0 100644 --- a/ee/hogai/session_summaries/tests/test_input_data.py +++ b/ee/hogai/session_summaries/tests/test_input_data.py @@ -231,6 +231,7 @@ def test_get_improved_elements_chain_elements(): def test_add_context_and_filter_events( mock_event_indexes: dict[str, int], mock_session_start_time: datetime, + mock_session_end_time: datetime, input_event: tuple[Any, ...], expected_event: tuple[Any, ...] 
| None, should_keep: bool, @@ -241,6 +242,7 @@ def test_add_context_and_filter_events( session_events=[input_event], session_id="test_session_id", session_start_time=mock_session_start_time, + session_end_time=mock_session_end_time, ) # Check columns are updated (and columns excessive from LLM context are removed) @@ -259,8 +261,12 @@ def test_add_context_and_filter_events( @pytest.mark.parametrize( "event_timestamps,expected_kept_count", [ - # All events within 5s threshold - none kept - # Session starts at 18:40:32.302000, threshold is 5000ms, so events at or before 18:40:37.302000 are filtered + # Session starts at 18:40:32.302000, ends at 18:54:15.789000 + # Events within 5s of start (<=18:40:37.302000) are filtered + # Events within 5s of end (>=18:54:10.789000) are filtered + # + # --- Before replay start filtering --- + # All events within 5s of start - none kept ( [ datetime(2025, 3, 31, 18, 40, 30, 0, tzinfo=UTC), # Before start @@ -269,7 +275,7 @@ def test_add_context_and_filter_events( ], 0, ), - # First two within threshold, third after - only third kept + # First two within threshold of start, third after - only third kept ( [ datetime(2025, 3, 31, 18, 40, 30, 0, tzinfo=UTC), # Before start (filtered) @@ -278,16 +284,16 @@ def test_add_context_and_filter_events( ], 1, ), - # All events after 5s threshold - all kept + # All events in valid middle range - all kept ( [ - datetime(2025, 3, 31, 18, 40, 38, 0, tzinfo=UTC), # After threshold - datetime(2025, 3, 31, 18, 40, 39, 0, tzinfo=UTC), # After threshold - datetime(2025, 3, 31, 18, 40, 40, 0, tzinfo=UTC), # After threshold + datetime(2025, 3, 31, 18, 40, 38, 0, tzinfo=UTC), # After start threshold + datetime(2025, 3, 31, 18, 45, 0, 0, tzinfo=UTC), # Middle of session + datetime(2025, 3, 31, 18, 50, 0, 0, tzinfo=UTC), # Still before end threshold ], 3, ), - # Mix: two within threshold, one after - one kept + # Mix: two within start threshold, one after - one kept ( [ datetime(2025, 3, 31, 18, 40, 30, 0, tzinfo=UTC), # Before start (filtered) @@ -296,16 +302,56 @@ def test_add_context_and_filter_events( ], 1, ), + # + # --- After replay end filtering --- + # All events within 5s of end - none kept (assuming they're after start threshold) + ( + [ + datetime(2025, 3, 31, 18, 54, 10, 789000, tzinfo=UTC), # Exactly at end threshold (filtered) + datetime(2025, 3, 31, 18, 54, 15, 789000, tzinfo=UTC), # Exactly at end (filtered) + datetime(2025, 3, 31, 18, 54, 20, 0, tzinfo=UTC), # After end (filtered) + ], + 0, + ), + # First event before end threshold, rest within - only first kept + ( + [ + datetime(2025, 3, 31, 18, 54, 5, 0, tzinfo=UTC), # Before end threshold (~10s before end) + datetime(2025, 3, 31, 18, 54, 12, 0, tzinfo=UTC), # Within 5s of end (filtered) + datetime(2025, 3, 31, 18, 54, 20, 0, tzinfo=UTC), # After end (filtered) + ], + 1, + ), + # + # --- Combined before/after filtering --- + # Events at both ends filtered, middle kept + ( + [ + datetime(2025, 3, 31, 18, 40, 35, 0, tzinfo=UTC), # Within start threshold (filtered) + datetime(2025, 3, 31, 18, 45, 0, 0, tzinfo=UTC), # Middle of session (kept) + datetime(2025, 3, 31, 18, 54, 12, 0, tzinfo=UTC), # Within end threshold (filtered) + ], + 1, + ), + # All events outside valid range - none kept + ( + [ + datetime(2025, 3, 31, 18, 40, 30, 0, tzinfo=UTC), # Before start (filtered) + datetime(2025, 3, 31, 18, 40, 37, 0, tzinfo=UTC), # Within start threshold (filtered) + datetime(2025, 3, 31, 18, 54, 15, 789000, tzinfo=UTC), # At end (filtered) + ], + 0, + ), ], ) -def 
test_filter_events_before_replay_start( +def test_filter_events_before_after_replay_session( mock_raw_events_columns: list[str], mock_session_start_time: datetime, + mock_session_end_time: datetime, event_timestamps: list[datetime], expected_kept_count: int, ): - """Test that events occurring before or exactly at replay start are filtered out.""" - # Create events with different timestamps but valid context (so they're not filtered for other reasons) + """Test that events occurring before start or after end of replay session are filtered out.""" events: list[tuple[Any, ...]] = [] for i, ts in enumerate(event_timestamps): events.append( @@ -334,6 +380,7 @@ def test_filter_events_before_replay_start( session_events=events, session_id="test_session_id", session_start_time=mock_session_start_time, + session_end_time=mock_session_end_time, ) assert len(updated_events) == expected_kept_count diff --git a/ee/hogai/session_summaries/utils.py b/ee/hogai/session_summaries/utils.py index e5ddb51f76384..be232de1b0684 100644 --- a/ee/hogai/session_summaries/utils.py +++ b/ee/hogai/session_summaries/utils.py @@ -178,3 +178,13 @@ def calculate_time_since_start(event_timestamp: str | datetime, session_start_ti if isinstance(event_timestamp, str): event_timestamp = datetime.fromisoformat(event_timestamp) return max(0, int((event_timestamp - session_start_time).total_seconds() * 1000)) + + +def calculate_time_till_end(event_timestamp: str | datetime, session_end_time: datetime) -> int: + """ + Calculate milliseconds remaining until session end time. + Returns 0 for events that occurred at or after session end. + """ + if isinstance(event_timestamp, str): + event_timestamp = datetime.fromisoformat(event_timestamp) + return max(0, int((session_end_time - event_timestamp).total_seconds() * 1000)) diff --git a/posthog/temporal/ai/session_summary/summarize_session_group.py b/posthog/temporal/ai/session_summary/summarize_session_group.py index dd0056c710022..cbff790271d60 100644 --- a/posthog/temporal/ai/session_summary/summarize_session_group.py +++ b/posthog/temporal/ai/session_summary/summarize_session_group.py @@ -188,6 +188,7 @@ async def fetch_session_batch_events_activity( session_events=session_events, session_id=session_id, session_start_time=session_metadata["start_time"], + session_end_time=session_metadata["end_time"], ) session_db_data = SessionSummaryDBData( session_metadata=session_metadata, session_events_columns=filtered_columns, session_events=filtered_events