@@ -241,44 +241,60 @@ def test_exclude_detection_from_training(test_id: str, dbt_project: DbtProject):
241241 Test the exclude_detection_period_from_training flag functionality for freshness anomalies.
242242
243243 Scenario:
244- - 7 days of normal data with frequent updates (every 2 hours) from day -14 to day -8
245- - 7 days of anomalous data (only 1 update per day at noon) from day -7 to day -1
246- - Detection period: last 7 days (days -7 to -1)
247- - Training period: 7 days
248- - Without exclusion: training = detection window (anomalous pattern) → test passes
249- - With exclusion: training = days -14 to -8 (normal pattern) → test fails (detects anomaly)
244+ - 30 days of normal data with frequent updates (every 2 hours)
245+ - 7 days of anomalous data (only 1 update per day at noon) immediately before the detection period
246+ - Without exclusion: anomaly gets included in training baseline, test passes (misses anomaly)
247+ - With exclusion: anomaly excluded from training, test fails (detects anomaly)
248+
249+ Data Generation Details:
250+ - Normal data: days -67 to -37 (generate_dates goes backward from base_date for days_back days)
251+ - Anomalous data: days -14 to -7 at noon (once per day)
252+ - detection_end: utc_now + 1 day (to include "today" in the detection period)
253+ - Detection period: 7 days back from detection_end = days -6 to 0
254+
255+ Why This Works:
256+ The freshness metric measures the maximum time gap between consecutive updates within each
257+ daily bucket. Even though the anomalous events are at days -14 to -7 (outside the detection
258+ period of days -6 to 0), the freshness metric for days -6 to 0 is still high because the
259+ last update was at day -7 noon. This "propagation effect" means the freshness values in the
260+ detection period reflect the missing updates, making the test work correctly.
261+
262+ - Without exclusion: The high freshness values in days -6 to 0 are included in training,
263+ normalizing them into the baseline → test PASSES
264+ - With exclusion: The high freshness values in days -6 to 0 are excluded from training,
265+ so they stand out against the normal baseline → test FAILS
250266 """
251267 utc_now = datetime .utcnow ().replace (hour = 0 , minute = 0 , second = 0 , microsecond = 0 )
252268
253- # Generate 7 days of normal data with frequent updates (every 2 hours) from day -14 to day -8
269+ # Generate 30 days of normal data with frequent updates (every 2 hours)
254270 normal_data = [
255271 {TIMESTAMP_COLUMN : date .strftime (DATE_FORMAT )}
256272 for date in generate_dates (
257- base_date = utc_now - timedelta (days = 8 ),
273+ base_date = utc_now - timedelta (days = 37 ),
258274 step = timedelta (hours = 2 ),
259- days_back = 7 ,
275+ days_back = 30 ,
260276 )
261277 ]
262278
263- # Generate 7 days of anomalous data (only 1 update per day at noon) from day -7 to day -1
279+ # Generate 7 days of anomalous data (only 1 update per day at noon)
264280 anomalous_data = [
265281 {TIMESTAMP_COLUMN : date .strftime (DATE_FORMAT )}
266282 for date in generate_dates (
267- base_date = (utc_now - timedelta (days = 1 )).replace (hour = 12 , minute = 0 ),
283+ base_date = (utc_now - timedelta (days = 7 )).replace (hour = 12 , minute = 0 ),
268284 step = timedelta (hours = 24 ),
269285 days_back = 7 ,
270286 )
271287 ]
272288
273289 all_data = normal_data + anomalous_data
274290
275- # Test 1: WITHOUT exclusion (should pass - training includes detection window with anomalous pattern )
291+ # Test 1: WITHOUT exclusion (should pass - misses the anomaly because it's included in training )
276292 test_args_without_exclusion = {
277293 "timestamp_column" : TIMESTAMP_COLUMN ,
278- "training_period" : {"period" : "day" , "count" : 7 },
294+ "training_period" : {"period" : "day" , "count" : 30 },
279295 "detection_period" : {"period" : "day" , "count" : 7 },
280296 "time_bucket" : {"period" : "day" , "count" : 1 },
281- "days_back" : 20 ,
297+ "days_back" : 40 ,
282298 "backfill_days" : 0 ,
283299 "sensitivity" : 3 ,
284300 "min_training_set_size" : 5 ,
@@ -289,7 +305,7 @@ def test_exclude_detection_from_training(test_id: str, dbt_project: DbtProject):
289305 },
290306 }
291307
292- detection_end = utc_now
308+ detection_end = utc_now + timedelta ( days = 1 )
293309
294310 test_result_without_exclusion = dbt_project .test (
295311 test_id + "_without_exclusion" ,
0 commit comments