@@ -483,67 +483,99 @@ def test_anomalous_boolean_column_anomalies(test_id: str, dbt_project: DbtProjec
 def test_column_anomalies_exclude_detection_period_from_training(
     test_id: str, dbt_project: DbtProject
 ):
+    """
+    Test the exclude_detection_period_from_training flag functionality for column anomalies.
+
+    Scenario:
+    - 30 days of normal data with low null count (0-2 nulls per day)
+    - 7 days of anomalous data with high null count (20 nulls per day) in detection period
+    - Without exclusion: anomaly gets included in training baseline, test passes (misses anomaly)
+    - With exclusion: anomaly excluded from training, test fails (detects anomaly)
+    """
     utc_today = datetime.utcnow().date()
-    test_date, *training_dates = generate_dates(base_date=utc_today - timedelta(1))
-
-    data: List[Dict[str, Any]] = [
-        {
-            TIMESTAMP_COLUMN: cur_date.strftime(DATE_FORMAT),
-            "superhero": superhero,
-        }
-        for cur_date in training_dates
-        for superhero in ["Superman", "Batman"]
-    ]
-
-    data += [
-        {TIMESTAMP_COLUMN: test_date.strftime(DATE_FORMAT), "superhero": None}
-        for _ in range(10)
-    ]
 
-    test_args_false = {
+    # Generate 30 days of normal data with low null count (0-2 nulls per day)
+    normal_data = []
+    for i in range(30):
+        date = utc_today - timedelta(days=37 - i)
+        normal_data.extend(
+            [
+                {TIMESTAMP_COLUMN: date.strftime(DATE_FORMAT), "superhero": superhero}
+                for superhero in ["Superman", "Batman", "Wonder Woman", "Flash"] * 5
+            ]
+        )
+        null_count = i % 3
+        normal_data.extend(
+            [
+                {TIMESTAMP_COLUMN: date.strftime(DATE_FORMAT), "superhero": None}
+                for _ in range(null_count)
+            ]
+        )
+
+    # Generate 7 days of anomalous data with high null count (20 nulls per day)
+    anomalous_data = []
+    for i in range(7):
+        date = utc_today - timedelta(days=7 - i)
+        anomalous_data.extend(
+            [
+                {TIMESTAMP_COLUMN: date.strftime(DATE_FORMAT), "superhero": superhero}
+                for superhero in ["Superman", "Batman"]
+            ]
+        )
+        anomalous_data.extend(
+            [
+                {TIMESTAMP_COLUMN: date.strftime(DATE_FORMAT), "superhero": None}
+                for _ in range(20)
+            ]
+        )
+
+    all_data = normal_data + anomalous_data
+
+    # Test 1: WITHOUT exclusion (should pass - misses the anomaly because it's included in training)
+    test_args_without_exclusion = {
504536 "timestamp_column" : TIMESTAMP_COLUMN ,
505537 "column_anomalies" : ["null_count" ],
506538 "time_bucket" : {"period" : "day" , "count" : 1 },
507- "training_period" : {"period" : "day" , "count" : 1 },
508- "detection_period" : {"period" : "day" , "count" : 1 },
509- "min_training_set_size" : 1 ,
539+ "training_period" : {"period" : "day" , "count" : 30 },
540+ "detection_period" : {"period" : "day" , "count" : 7 },
541+ "min_training_set_size" : 5 ,
510542 "anomaly_sensitivity" : 3 ,
511543 "anomaly_direction" : "spike" ,
512544 "exclude_detection_period_from_training" : False ,
513545 }
-    test_result_false = dbt_project.test(
-        test_id,
+
+    test_result_without_exclusion = dbt_project.test(
+        test_id + "_without_exclusion",
         DBT_TEST_NAME,
-        test_args_false,
-        data=data,
+        test_args_without_exclusion,
+        data=all_data,
         test_column="superhero",
         test_vars={"force_metrics_backfill": True},
     )
-    assert test_result_false["status"] == "pass", (
+
+    # This should PASS because the anomaly is included in training, making it part of the baseline
+    assert test_result_without_exclusion["status"] == "pass", (
         "Expected PASS when exclude_detection_period_from_training=False "
         "(detection data included in training baseline)"
     )
 
-    test_args_true = {
-        "timestamp_column": TIMESTAMP_COLUMN,
-        "column_anomalies": ["null_count"],
-        "time_bucket": {"period": "day", "count": 1},
-        "training_period": {"period": "day", "count": 1},
-        "detection_period": {"period": "day", "count": 1},
-        "min_training_set_size": 1,
-        "anomaly_sensitivity": 3,
-        "anomaly_direction": "spike",
+    # Test 2: WITH exclusion (should fail - detects the anomaly because it's excluded from training)
+    test_args_with_exclusion = {
+        **test_args_without_exclusion,
         "exclude_detection_period_from_training": True,
     }
-    test_result_true = dbt_project.test(
-        test_id,
+
+    test_result_with_exclusion = dbt_project.test(
+        test_id + "_with_exclusion",
         DBT_TEST_NAME,
-        test_args_true,
-        data=data,
+        test_args_with_exclusion,
+        data=all_data,
         test_column="superhero",
         test_vars={"force_metrics_backfill": True},
     )
-    assert test_result_true["status"] == "fail", (
+
+    # This should FAIL because the anomaly is excluded from training, so it's detected as anomalous
+    assert test_result_with_exclusion["status"] == "fail", (
         "Expected FAIL when exclude_detection_period_from_training=True "
        "(detection data excluded from training baseline, anomaly detected)"
     )
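
Rough numbers behind the two assertions: the 30 training days carry 0-2 nulls per day, while each of the 7 detection days carries 20. The sketch below is only an illustration under an assumed plain z-score model with a threshold of 3 (matching anomaly_sensitivity); it is not elementary's actual metric computation, and max_z_score is a hypothetical helper, not part of this test suite.

from statistics import mean, pstdev

# Daily null counts shaped like the test data above.
normal_days = [i % 3 for i in range(30)]   # 0-2 nulls per training day
anomalous_days = [20] * 7                  # 20 nulls per detection day

def max_z_score(training, detection):
    # Score each detection-day value against the mean/std of the training window.
    mu, sigma = mean(training), pstdev(training)
    return max((value - mu) / sigma for value in detection)

# Detection days folded into the baseline: mean ~4.6, std ~7.5, max z ~2.1 < 3 -> no anomaly, test passes.
print(max_z_score(normal_days + anomalous_days, anomalous_days))
# Detection days excluded from the baseline: mean 1.0, std ~0.8, max z ~23 > 3 -> spike detected, test fails.
print(max_z_score(normal_days, anomalous_days))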