Skip to content

Commit 1af5ccc

Browse files
authored
feat(signals): Ignore events before replay session starts / after ends + 5s (#42248)
1 parent 65f3ef9 commit 1af5ccc

File tree

9 files changed

+240
-35
lines changed

9 files changed

+240
-35
lines changed

ee/hogai/session_summaries/constants.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@
1515
HALLUCINATED_EVENTS_MIN_RATIO = 0.15 # If more than 15% of events in the summary hallucinated, fail the summarization
1616
# Minimum number of sessions to use group summary logic (find patterns) instead of summarizing them separately
1717
GROUP_SUMMARIES_MIN_SESSIONS = 5
18+
# Don't include events that happened before the replay started or after it ended, or at the very start/end,
19+
# as we can't verify them with videos confidently; iterate if we find a better way to generate Replay videos
20+
SESSION_EVENTS_REPLAY_CUTOFF_MS = 5000
1821

1922
# Temporal
2023
SESSION_SUMMARIES_DB_DATA_REDIS_TTL = 60 * 60 * 24 # How long to store the DB data in Redis within Temporal jobs

ee/hogai/session_summaries/session/input_data.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,12 @@
1010
from posthog.session_recordings.models.metadata import RecordingMetadata
1111
from posthog.session_recordings.queries.session_replay_events import SessionReplayEvents
1212

13+
from ee.hogai.session_summaries.constants import SESSION_EVENTS_REPLAY_CUTOFF_MS
1314
from ee.hogai.session_summaries.local.input_data import (
1415
_get_production_session_events_locally,
1516
_get_production_session_metadata_locally,
1617
)
17-
from ee.hogai.session_summaries.utils import get_column_index
18+
from ee.hogai.session_summaries.utils import calculate_time_since_start, calculate_time_till_end, get_column_index
1819

1920
logger = structlog.get_logger(__name__)
2021

@@ -179,7 +180,10 @@ def add_context_and_filter_events(
179180
session_events_columns: list[str],
180181
session_events: list[tuple[str | datetime.datetime | list[str] | None, ...]],
181182
session_id: str,
183+
session_start_time: datetime.datetime,
184+
session_end_time: datetime.datetime,
182185
) -> tuple[list[str], list[tuple[str | datetime.datetime | list[str] | None, ...]]]:
186+
timestamp_index = get_column_index(session_events_columns, "timestamp")
183187
indexes = {
184188
"event": get_column_index(session_events_columns, "event"),
185189
"$event_type": get_column_index(session_events_columns, "$event_type"),
@@ -199,7 +203,25 @@ def add_context_and_filter_events(
199203
i for i, col in enumerate(session_events_columns) if col not in COLUMNS_TO_REMOVE_FROM_LLM_CONTEXT
200204
]
201205
updated_events = []
206+
# Events are chronologically ordered, so once we find an event after replay start, all subsequent events are too
207+
past_replay_start = False
202208
for event in session_events:
209+
event_timestamp = event[timestamp_index]
210+
if not isinstance(event_timestamp, str) and not isinstance(event_timestamp, datetime.datetime):
211+
msg = f"Event timestamp is not a string or datetime: {event_timestamp}"
212+
logger.error(msg, signals_type="session-summaries", session_id=session_id)
213+
raise ValueError(msg)
214+
# Filter out events that occurred before or near replay start, as we can't confirm them with video
215+
if not past_replay_start:
216+
ms_since_start = calculate_time_since_start(event_timestamp, session_start_time)
217+
if ms_since_start <= SESSION_EVENTS_REPLAY_CUTOFF_MS:
218+
continue
219+
# No need to check the time from the start anymore, as events are sorted chronologically
220+
past_replay_start = True
221+
# Filter out events that occurred after or near replay end, as we can't confirm them with video
222+
ms_till_end = calculate_time_till_end(event_timestamp, session_end_time)
223+
if ms_till_end <= SESSION_EVENTS_REPLAY_CUTOFF_MS:
224+
continue
203225
updated_event: list[str | datetime.datetime | list[str] | None] = list(event)
204226
# Check for errors worth keeping in the context
205227
if event[indexes["event"]] == "$exception":

ee/hogai/session_summaries/session/output_data.py

Lines changed: 18 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
from datetime import datetime
21
from enum import Enum
32
from typing import Any
43

@@ -7,7 +6,12 @@
76

87
from ee.hogai.session_summaries import SummaryValidationError
98
from ee.hogai.session_summaries.constants import HALLUCINATED_EVENTS_MIN_RATIO
10-
from ee.hogai.session_summaries.utils import get_column_index, prepare_datetime, unpack_full_event_id
9+
from ee.hogai.session_summaries.utils import (
10+
calculate_time_since_start,
11+
get_column_index,
12+
prepare_datetime,
13+
unpack_full_event_id,
14+
)
1115
from ee.hogai.utils.yaml import load_yaml_from_raw_llm_content
1216

1317
logger = structlog.get_logger(__name__)
@@ -274,7 +278,8 @@ def _remove_hallucinated_events(
274278
f"Too many hallucinated events ({len(hallucinated_events)}/{total_summary_events}) for session id ({session_id})"
275279
f"in the raw session summary: {[x[-1] for x in hallucinated_events]} "
276280
)
277-
logger.error(msg, session_id=session_id, signals_type="session-summaries")
281+
if final_validation:
282+
logger.error(msg, session_id=session_id, signals_type="session-summaries")
278283
raise SummaryValidationError(msg)
279284
# Reverse to not break indexes
280285
for group_index, event_index, event in reversed(hallucinated_events):
@@ -362,14 +367,6 @@ def load_raw_session_summary_from_llm_content(
362367
return raw_session_summary
363368

364369

365-
# TODO Rework the logic, so events before the recording are marked as "LOAD", not 00:00
366-
def calculate_time_since_start(session_timestamp: str, session_start_time: datetime | None) -> int | None:
367-
if not session_start_time or not session_timestamp:
368-
return None
369-
timestamp_datetime = datetime.fromisoformat(session_timestamp)
370-
return max(0, int((timestamp_datetime - session_start_time).total_seconds() * 1000))
371-
372-
373370
def _validate_enriched_summary(
374371
data: dict[str, Any], session_id: str, final_validation: bool
375372
) -> SessionSummarySerializer:
@@ -454,6 +451,7 @@ def _calculate_segment_meta(
454451
raw_key_actions: list[dict[str, Any]] | None,
455452
session_duration: int,
456453
session_id: str,
454+
final_validation: bool,
457455
) -> SegmentMetaSerializer:
458456
# Find first and the last event in the segment
459457
segment_index = raw_segment.get("index")
@@ -599,11 +597,12 @@ def _calculate_segment_meta(
599597
# TODO: Factor of two is arbitrary, find a better solution
600598
if duration <= 0 or fallback_duration // duration > 2:
601599
# Checking only duration as events are sorted chronologically
602-
logger.warning(
603-
f"Duration change is drastic (fallback: {fallback_duration} -> segments: {duration}) - using fallback data for session_id {session_id}",
604-
session_id=session_id,
605-
signals_type="session-summaries",
606-
)
600+
if final_validation:
601+
logger.warning(
602+
f"Duration change is drastic (fallback: {fallback_duration} -> segments: {duration}) - using fallback data for session_id {session_id}",
603+
session_id=session_id,
604+
signals_type="session-summaries",
605+
)
607606
segment_meta_data["duration"] = fallback_duration
608607
segment_meta_data["duration_percentage"] = fallback_duration_percentage
609608
segment_meta_data["events_count"] = fallback_events_count
@@ -657,12 +656,14 @@ def enrich_raw_session_summary_with_meta(
657656
simplified_events_mapping=simplified_events_mapping,
658657
raw_key_actions=raw_key_actions,
659658
session_id=session_id,
659+
final_validation=final_validation,
660660
)
661661
# Validate the serializer to be able to use `.data`
662662
if not segment_meta.is_valid():
663663
# Most of the fields are optional, so failed validation should be reported
664664
msg = f"Error validating segment meta against the schema when summarizing session_id {session_id}: {segment_meta.errors}"
665-
logger.error(msg, session_id=session_id, signals_type="session-summaries")
665+
if final_validation:
666+
logger.error(msg, session_id=session_id, signals_type="session-summaries")
666667
raise SummaryValidationError(msg)
667668
enriched_segment["meta"] = segment_meta.data
668669
enriched_segments.append(enriched_segment)

ee/hogai/session_summaries/session/summarize_session.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,11 @@ async def get_session_data_from_db(session_id: str, team_id: int, local_reads_pr
102102
# Raise any unexpected errors
103103
raise
104104
session_events_columns, session_events = add_context_and_filter_events(
105-
session_events_columns=session_events_columns, session_events=session_events, session_id=session_id
105+
session_events_columns=session_events_columns,
106+
session_events=session_events,
107+
session_id=session_id,
108+
session_start_time=session_metadata["start_time"],
109+
session_end_time=session_metadata["end_time"],
106110
)
107111

108112
# TODO Get web analytics data on URLs to better understand what the user was doing

ee/hogai/session_summaries/tests/conftest.py

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -476,7 +476,21 @@ def mock_chat_completion(mock_valid_llm_yaml_response: str) -> ChatCompletion:
476476

477477

478478
@pytest.fixture
479-
def mock_raw_metadata(mock_session_id: str) -> dict[str, Any]:
479+
def mock_session_start_time() -> datetime:
480+
"""Session replay start time - events before this should be filtered out"""
481+
return datetime(2025, 3, 31, 18, 40, 32, 302000, tzinfo=UTC)
482+
483+
484+
@pytest.fixture
485+
def mock_session_end_time() -> datetime:
486+
"""Session replay end time - events after this should be filtered out"""
487+
return datetime(2025, 3, 31, 18, 54, 15, 789000, tzinfo=UTC)
488+
489+
490+
@pytest.fixture
491+
def mock_raw_metadata(
492+
mock_session_id: str, mock_session_start_time: datetime, mock_session_end_time: datetime
493+
) -> dict[str, Any]:
480494
return {
481495
"id": mock_session_id,
482496
# Anonymized distinct_id for testing
@@ -486,8 +500,8 @@ def mock_raw_metadata(mock_session_id: str) -> dict[str, Any]:
486500
"recording_duration": 5323,
487501
"active_seconds": 1947,
488502
"inactive_seconds": 3375,
489-
"start_time": "2025-03-31T18:40:32.302000Z",
490-
"end_time": "2025-03-31T18:54:15.789000Z",
503+
"start_time": mock_session_start_time,
504+
"end_time": mock_session_end_time,
491505
"click_count": 679,
492506
"keypress_count": 668,
493507
"mouse_activity_count": 6629,

0 commit comments

Comments
 (0)