Skip to content

Commit e3b3ad2

Browse files
Improve integration test: add training variance, force_metrics_backfill, and explicit parameters
Co-Authored-By: Yosef Arbiv <[email protected]>
1 parent 098d625 commit e3b3ad2

File tree

1 file changed

+51
-24
lines changed

1 file changed

+51
-24
lines changed

integration_tests/tests/test_event_freshness_anomalies.py

Lines changed: 51 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,3 @@
1-
import random
21
from datetime import datetime, timedelta
32

43
import pytest
@@ -98,61 +97,85 @@ def test_exclude_detection_from_training(test_id: str, dbt_project: DbtProject):
9897
Test the exclude_detection_period_from_training flag functionality for event freshness anomalies.
9998
10099
Scenario:
101-
- 14 days total: 7 days normal (small jitter) + 7 days anomalous (large lag)
102-
- Without exclusion: 7 anomalous days contaminate training, test passes
103-
- With exclusion: only 7 normal days in training, anomaly detected, test fails
100+
- 7 days of normal data (lag between event and update varies from 2 to 8 minutes, averaging 5) - training period
101+
- 7 days of anomalous data (5 hour lag) - detection period
102+
- Without exclusion: anomaly gets included in training baseline, test passes (misses anomaly)
103+
- With exclusion: anomaly excluded from training, test fails (detects anomaly)
104+
105+
Mirrors the volume anomalies test pattern with:
106+
- Daily buckets (not hourly) to avoid boundary alignment issues
107+
- Mid-day event times (12:00) to avoid spillover across day boundaries
108+
- Explicit training_period and detection_period parameters
109+
- Explicit backfill_days to ensure exclusion logic works correctly
104110
"""
105-
test_started_at = datetime.utcnow().replace(hour=0, minute=0, second=0)
111+
utc_now = datetime.utcnow()
112+
test_started_at = (utc_now + timedelta(days=1)).replace(
113+
hour=0, minute=0, second=0, microsecond=0
114+
)
106115

107-
random.seed(42)
108-
normal_start = test_started_at - timedelta(days=14)
116+
# Generate 7 days of normal data with varying lag (2-8 minutes) to ensure training_stddev > 0
117+
training_lags_minutes = [2, 3, 4, 5, 6, 7, 8]
109118
normal_data = []
110-
for date in generate_dates(normal_start, step=STEP, days_back=7):
111-
jitter_minutes = random.randint(0, 10)
119+
for i in range(7):
120+
event_date = test_started_at - timedelta(days=14 - i)
121+
event_time = event_date.replace(hour=12, minute=0, second=0, microsecond=0)
122+
update_time = event_time + timedelta(minutes=training_lags_minutes[i])
112123
normal_data.append(
113124
{
114-
EVENT_TIMESTAMP_COLUMN: date.strftime(DATE_FORMAT),
115-
UPDATE_TIMESTAMP_COLUMN: (
116-
date + timedelta(minutes=jitter_minutes)
117-
).strftime(DATE_FORMAT),
125+
EVENT_TIMESTAMP_COLUMN: event_time.strftime(DATE_FORMAT),
126+
UPDATE_TIMESTAMP_COLUMN: update_time.strftime(DATE_FORMAT),
118127
}
119128
)
120129

121-
anomalous_start = test_started_at - timedelta(days=7)
130+
# Generate 7 days of anomalous data with 5-hour lag (detection period)
122131
anomalous_data = []
123-
for date in generate_dates(anomalous_start, step=STEP, days_back=7):
132+
for i in range(7):
133+
event_date = test_started_at - timedelta(days=7 - i)
134+
event_time = event_date.replace(hour=12, minute=0, second=0, microsecond=0)
135+
update_time = event_time + timedelta(hours=5)
124136
anomalous_data.append(
125137
{
126-
EVENT_TIMESTAMP_COLUMN: date.strftime(DATE_FORMAT),
127-
UPDATE_TIMESTAMP_COLUMN: (date + timedelta(hours=5)).strftime(
128-
DATE_FORMAT
129-
),
138+
EVENT_TIMESTAMP_COLUMN: event_time.strftime(DATE_FORMAT),
139+
UPDATE_TIMESTAMP_COLUMN: update_time.strftime(DATE_FORMAT),
130140
}
131141
)
132142

133143
all_data = normal_data + anomalous_data
134144

145+
# Test 1: WITHOUT exclusion (should pass - misses the anomaly because it's included in training)
135146
test_args_without_exclusion = {
136147
"event_timestamp_column": EVENT_TIMESTAMP_COLUMN,
137148
"update_timestamp_column": UPDATE_TIMESTAMP_COLUMN,
138-
"days_back": 14,
139-
"backfill_days": 7,
140-
"time_bucket": {"period": "hour", "count": 1},
149+
"training_period": {"period": "day", "count": 7},
150+
"detection_period": {"period": "day", "count": 7},
151+
"backfill_days": 7, # Explicit backfill_days for exclusion logic
152+
"time_bucket": {
153+
"period": "day",
154+
"count": 1,
155+
}, # Daily buckets to avoid boundary issues
141156
"sensitivity": 3,
157+
"anomaly_direction": "spike", # Explicit direction since we're testing increased lag
158+
"min_training_set_size": 5, # Explicit minimum to avoid threshold issues
159+
# exclude_detection_period_from_training is not set (defaults to False/None)
142160
}
143161

144162
test_result_without_exclusion = dbt_project.test(
145163
test_id + "_without_exclusion",
146164
TEST_NAME,
147165
test_args_without_exclusion,
148166
data=all_data,
149-
test_vars={"custom_run_started_at": test_started_at.isoformat()},
167+
test_vars={
168+
"custom_run_started_at": test_started_at.isoformat(),
169+
"force_metrics_backfill": True,
170+
},
150171
)
151172

173+
# This should PASS because the anomaly is included in training, making it part of the baseline
152174
assert (
153175
test_result_without_exclusion["status"] == "pass"
154176
), "Test should pass when anomaly is included in training"
155177

178+
# Test 2: WITH exclusion (should fail - detects the anomaly because it's excluded from training)
156179
test_args_with_exclusion = {
157180
**test_args_without_exclusion,
158181
"exclude_detection_period_from_training": True,
@@ -163,9 +186,13 @@ def test_exclude_detection_from_training(test_id: str, dbt_project: DbtProject):
163186
TEST_NAME,
164187
test_args_with_exclusion,
165188
data=all_data,
166-
test_vars={"custom_run_started_at": test_started_at.isoformat()},
189+
test_vars={
190+
"custom_run_started_at": test_started_at.isoformat(),
191+
"force_metrics_backfill": True,
192+
},
167193
)
168194

195+
# This should FAIL because the anomaly is excluded from training, so it's detected as anomalous
169196
assert (
170197
test_result_with_exclusion["status"] == "fail"
171198
), "Test should fail when anomaly is excluded from training"

0 commit comments

Comments
 (0)