@@ -241,71 +241,69 @@ def test_exclude_detection_from_training(test_id: str, dbt_project: DbtProject):
241241 Test the exclude_detection_period_from_training flag functionality for freshness anomalies.
242242
243243 Scenario:
244- - 30 days of normal data with frequent updates (every 2 hours)
245- - 7 days of anomalous data (only 1 update per day at noon) in detection period
246- - Without exclusion: anomaly gets included in training baseline, test passes (misses anomaly)
247- - With exclusion: anomaly excluded from training, test fails (detects anomaly)
244+ - 7 days of normal data with frequent updates (every 2 hours) from day -14 to day -8
245+ - 7 days of anomalous data (only 1 update per day at noon) from day -7 to day -1
246+ - Detection period: last 7 days (days -7 to -1)
247+ - Training period: 14 days
248+ - Without exclusion: training includes detection window → anomalies normalized → test PASSES
249+ - With exclusion: training excludes detection window → anomalies detected → test FAILS
248250
249251 Data Generation Details:
250- - Normal data: days -67 to -37 (generate_dates goes backward from base_date for days_back days)
251- - Anomalous data: days -14 to -7 at noon (once per day)
252- - detection_end: utc_now + 1 day (to include "today" in the detection period)
253- - Detection period: 7 days back from detection_end = days -6 to 0
254-
255- Why This Works:
256- The freshness metric measures the maximum time gap between consecutive updates within each
257- daily bucket. Even though the anomalous events are at days -14 to -7 (outside the detection
258- period of days -6 to 0), the freshness metric for days -6 to 0 is still high because the
259- last update was at day -7 noon. This "propagation effect" means the freshness values in the
260- detection period reflect the missing updates, making the test work correctly.
261-
262- - Without exclusion: The high freshness values in days -6 to 0 are included in training,
263- normalizing them into the baseline → test PASSES
264- - With exclusion: The high freshness values in days -6 to 0 are excluded from training,
265- so they stand out against the normal baseline → test FAILS
252+ - Normal data: days -14 to -8 (generate_dates goes backward from base_date for days_back days)
253+ - Anomalous data: days -7 to -1 at noon (once per day)
254+ - detection_end: utc_now (detection period covers the last 7 days ending at now)
255+ - Detection period: 7 days back from detection_end = days -7 to -1
256+
257+ Why This Configuration Works:
258+ - training_period = 14 days ensures there are training buckets available when exclusion is enabled
259+ - Without exclusion: Training window includes both normal (days -14 to -8) and anomalous
260+ (days -7 to -1) data. The anomalous pattern becomes part of the baseline → test PASSES
261+ - With exclusion: Training window includes only normal data (days -14 to -8). The anomalous
262+ pattern in detection (days -7 to -1) stands out against the normal baseline → test FAILS
263+ - min_training_set_size = 3 (reduced from 5) ensures enough buckets are evaluated
266264 """
267265 utc_now = datetime .utcnow ().replace (hour = 0 , minute = 0 , second = 0 , microsecond = 0 )
268266
269- # Generate 30 days of normal data with frequent updates (every 2 hours)
267+ # Generate 7 days of normal data with frequent updates (every 2 hours) from day -14 to day -8
270268 normal_data = [
271269 {TIMESTAMP_COLUMN : date .strftime (DATE_FORMAT )}
272270 for date in generate_dates (
273- base_date = utc_now - timedelta (days = 37 ),
271+ base_date = utc_now - timedelta (days = 8 ),
274272 step = timedelta (hours = 2 ),
275- days_back = 30 ,
273+ days_back = 7 ,
276274 )
277275 ]
278276
279- # Generate 7 days of anomalous data (only 1 update per day at noon)
277+ # Generate 7 days of anomalous data (only 1 update per day at noon) from day -7 to day -1
280278 anomalous_data = [
281279 {TIMESTAMP_COLUMN : date .strftime (DATE_FORMAT )}
282280 for date in generate_dates (
283- base_date = (utc_now - timedelta (days = 7 )).replace (hour = 12 , minute = 0 ),
281+ base_date = (utc_now - timedelta (days = 1 )).replace (hour = 12 , minute = 0 ),
284282 step = timedelta (hours = 24 ),
285283 days_back = 7 ,
286284 )
287285 ]
288286
289287 all_data = normal_data + anomalous_data
290288
291- # Test 1: WITHOUT exclusion (should pass - misses the anomaly because it's included in training )
289+ # Test 1: WITHOUT exclusion (should pass - training includes detection window with anomalous pattern )
292290 test_args_without_exclusion = {
293291 "timestamp_column" : TIMESTAMP_COLUMN ,
294- "training_period" : {"period" : "day" , "count" : 30 },
292+ "training_period" : {"period" : "day" , "count" : 14 },
295293 "detection_period" : {"period" : "day" , "count" : 7 },
296294 "time_bucket" : {"period" : "day" , "count" : 1 },
297- "days_back" : 40 ,
295+ "days_back" : 20 ,
298296 "backfill_days" : 0 ,
299297 "sensitivity" : 3 ,
300- "min_training_set_size" : 5 ,
298+ "min_training_set_size" : 3 ,
301299 "anomaly_direction" : "spike" ,
302300 "ignore_small_changes" : {
303301 "spike_failure_percent_threshold" : 0 ,
304302 "drop_failure_percent_threshold" : 0 ,
305303 },
306304 }
307305
308- detection_end = utc_now + timedelta ( days = 1 )
306+ detection_end = utc_now
309307
310308 test_result_without_exclusion = dbt_project .test (
311309 test_id + "_without_exclusion" ,