@@ -476,3 +476,105 @@ def test_anomalous_boolean_column_anomalies(test_id: str, dbt_project: DbtProjec
476476 "count_true" ,
477477 "count_false" ,
478478 }
479+
480+
# Anomalies currently not supported on ClickHouse
@pytest.mark.skip_targets(["clickhouse"])
def test_col_anom_excl_detect_train(test_id: str, dbt_project: DbtProject):
    """
    Test the exclude_detection_period_from_training flag for column anomalies.

    Scenario:
    - 30 days of normal data, dated 37 to 8 days before today (UTC), with a
      null count cycling through 8, 10 and 12 per day (mean of 10)
    - 7 days of anomalous data, dated 7 to 1 days before today (UTC), with
      20 nulls per day (a 100% increase over the normal mean)
    - Every day also carries 40 non-null "superhero" rows
    - Without exclusion: the anomalous days are part of the training baseline,
      so the test is expected to pass (the anomaly is missed)
    - With exclusion: the anomalous days are kept out of the training baseline,
      so the test is expected to fail (the anomaly is detected)
    """
    utc_today = datetime.utcnow().date()
    heroes = ["Superman", "Batman", "Wonder Woman", "Flash"] * 10

    def build_day_rows(days_ago: int, null_count: int) -> list:
        """Return one day's rows: all non-null heroes, then `null_count` null rows."""
        # Format the date once per day rather than once per row.
        day = (utc_today - timedelta(days=days_ago)).strftime(DATE_FORMAT)
        rows = [{TIMESTAMP_COLUMN: day, "superhero": hero} for hero in heroes]
        rows.extend(
            {TIMESTAMP_COLUMN: day, "superhero": None} for _ in range(null_count)
        )
        return rows

    # 30 normal days with some variance in the null count (8, 10, 12 pattern),
    # followed by 7 anomalous days with 20 nulls each.
    normal_pattern = [8, 10, 12]
    all_data = []
    for i in range(30):
        all_data.extend(build_day_rows(37 - i, normal_pattern[i % 3]))
    for i in range(7):
        all_data.extend(build_day_rows(7 - i, 20))

    # Test 1: WITHOUT exclusion (should pass - misses the anomaly because it's included in training)
    test_args_without_exclusion = {
        "timestamp_column": TIMESTAMP_COLUMN,
        "column_anomalies": ["null_count"],
        "time_bucket": {"period": "day", "count": 1},
        "training_period": {"period": "day", "count": 30},
        "detection_period": {"period": "day", "count": 7},
        "min_training_set_size": 5,
        "anomaly_sensitivity": 5,
        "anomaly_direction": "spike",
        "exclude_detection_period_from_training": False,
    }

    # Distinct test ids ("_f" / "_t") keep the two runs separate.
    test_result_without_exclusion = dbt_project.test(
        test_id + "_f",
        DBT_TEST_NAME,
        test_args_without_exclusion,
        data=all_data,
        test_column="superhero",
        test_vars={"force_metrics_backfill": True},
    )

    # This should PASS because the anomaly is included in training, making it part of the baseline
    assert test_result_without_exclusion["status"] == "pass", (
        "Expected PASS when exclude_detection_period_from_training=False "
        "(detection data included in training baseline)"
    )

    # Test 2: WITH exclusion (should fail - detects the anomaly because it's excluded from training)
    test_args_with_exclusion = {
        **test_args_without_exclusion,
        "exclude_detection_period_from_training": True,
    }

    test_result_with_exclusion = dbt_project.test(
        test_id + "_t",
        DBT_TEST_NAME,
        test_args_with_exclusion,
        data=all_data,
        test_column="superhero",
        test_vars={"force_metrics_backfill": True},
    )

    # This should FAIL because the anomaly is excluded from training, so it's detected as anomalous
    assert test_result_with_exclusion["status"] == "fail", (
        "Expected FAIL when exclude_detection_period_from_training=True "
        "(detection data excluded from training baseline, anomaly detected)"
    )
0 commit comments