@@ -1,4 +1,3 @@
-import random
 from datetime import datetime, timedelta
 
 import pytest
@@ -98,61 +97,85 @@ def test_exclude_detection_from_training(test_id: str, dbt_project: DbtProject):
     Test the exclude_detection_period_from_training flag functionality for event freshness anomalies.
 
     Scenario:
-    - 14 days total: 7 days normal (small jitter) + 7 days anomalous (large lag)
-    - Without exclusion: 7 anomalous days contaminate training, test passes
-    - With exclusion: only 7 normal days in training, anomaly detected, test fails
+    - 7 days of normal data (2-8 minute lag between event and update) - training period
+    - 7 days of anomalous data (5-hour lag) - detection period
+    - Without exclusion: the anomaly is included in the training baseline, so the test passes (misses the anomaly)
+    - With exclusion: the anomaly is excluded from training, so the test fails (detects the anomaly)
+
+    Mirrors the volume anomalies test pattern with:
+    - Daily buckets (not hourly) to avoid boundary alignment issues
+    - Mid-day event times (12:00) to avoid spillover across day boundaries
+    - Explicit training_period and detection_period parameters
+    - Explicit backfill_days to ensure the exclusion logic works correctly
104110 """
105- test_started_at = datetime .utcnow ().replace (hour = 0 , minute = 0 , second = 0 )
111+ utc_now = datetime .utcnow ()
112+ test_started_at = (utc_now + timedelta (days = 1 )).replace (
113+ hour = 0 , minute = 0 , second = 0 , microsecond = 0
114+ )
106115
107- random . seed ( 42 )
108- normal_start = test_started_at - timedelta ( days = 14 )
116+ # Generate 7 days of normal data with varying lag (2-8 minutes) to ensure training_stddev > 0
117+ training_lags_minutes = [ 2 , 3 , 4 , 5 , 6 , 7 , 8 ]
109118 normal_data = []
110- for date in generate_dates (normal_start , step = STEP , days_back = 7 ):
111- jitter_minutes = random .randint (0 , 10 )
119+ for i in range (7 ):
120+ event_date = test_started_at - timedelta (days = 14 - i )
121+ event_time = event_date .replace (hour = 12 , minute = 0 , second = 0 , microsecond = 0 )
122+ update_time = event_time + timedelta (minutes = training_lags_minutes [i ])
112123 normal_data .append (
113124 {
114- EVENT_TIMESTAMP_COLUMN : date .strftime (DATE_FORMAT ),
115- UPDATE_TIMESTAMP_COLUMN : (
116- date + timedelta (minutes = jitter_minutes )
117- ).strftime (DATE_FORMAT ),
125+ EVENT_TIMESTAMP_COLUMN : event_time .strftime (DATE_FORMAT ),
126+ UPDATE_TIMESTAMP_COLUMN : update_time .strftime (DATE_FORMAT ),
118127 }
119128 )
120129
-    anomalous_start = test_started_at - timedelta(days=7)
+    # Generate 7 days of anomalous data with 5-hour lag (detection period)
     anomalous_data = []
-    for date in generate_dates(anomalous_start, step=STEP, days_back=7):
+    for i in range(7):
+        event_date = test_started_at - timedelta(days=7 - i)
+        event_time = event_date.replace(hour=12, minute=0, second=0, microsecond=0)
+        update_time = event_time + timedelta(hours=5)
         anomalous_data.append(
             {
-                EVENT_TIMESTAMP_COLUMN: date.strftime(DATE_FORMAT),
-                UPDATE_TIMESTAMP_COLUMN: (date + timedelta(hours=5)).strftime(
-                    DATE_FORMAT
-                ),
+                EVENT_TIMESTAMP_COLUMN: event_time.strftime(DATE_FORMAT),
+                UPDATE_TIMESTAMP_COLUMN: update_time.strftime(DATE_FORMAT),
             }
         )
 
     all_data = normal_data + anomalous_data
 
+    # Test 1: WITHOUT exclusion (should pass - misses the anomaly because it's included in training)
     test_args_without_exclusion = {
         "event_timestamp_column": EVENT_TIMESTAMP_COLUMN,
         "update_timestamp_column": UPDATE_TIMESTAMP_COLUMN,
-        "days_back": 14,
-        "backfill_days": 7,
-        "time_bucket": {"period": "hour", "count": 1},
+        "training_period": {"period": "day", "count": 7},
+        "detection_period": {"period": "day", "count": 7},
+        "backfill_days": 7,  # Explicit backfill_days for exclusion logic
+        "time_bucket": {
+            "period": "day",
+            "count": 1,
+        },  # Daily buckets to avoid boundary issues
         "sensitivity": 3,
+        "anomaly_direction": "spike",  # Explicit direction since we're testing increased lag
+        "min_training_set_size": 5,  # Explicit minimum to avoid threshold issues
+        # exclude_detection_period_from_training is not set (defaults to False/None)
     }
 
     test_result_without_exclusion = dbt_project.test(
         test_id + "_without_exclusion",
         TEST_NAME,
         test_args_without_exclusion,
         data=all_data,
-        test_vars={"custom_run_started_at": test_started_at.isoformat()},
+        test_vars={
+            "custom_run_started_at": test_started_at.isoformat(),
+            "force_metrics_backfill": True,
+        },
     )
 
+    # This should PASS because the anomaly is included in training, making it part of the baseline
     assert (
         test_result_without_exclusion["status"] == "pass"
     ), "Test should pass when anomaly is included in training"
 
+    # Test 2: WITH exclusion (should fail - detects the anomaly because it's excluded from training)
     test_args_with_exclusion = {
         **test_args_without_exclusion,
         "exclude_detection_period_from_training": True,
@@ -163,9 +186,13 @@ def test_exclude_detection_from_training(test_id: str, dbt_project: DbtProject):
         TEST_NAME,
         test_args_with_exclusion,
         data=all_data,
-        test_vars={"custom_run_started_at": test_started_at.isoformat()},
+        test_vars={
+            "custom_run_started_at": test_started_at.isoformat(),
+            "force_metrics_backfill": True,
+        },
     )
 
+    # This should FAIL because the anomaly is excluded from training, so it's detected as anomalous
     assert (
         test_result_with_exclusion["status"] == "fail"
     ), "Test should fail when anomaly is excluded from training"