Add exclude_detection_period_from_training flag to column anomalies tests (#889)

devin-ai-integration[bot] · arbiv · web-flow · commit cccc439febfd · 2025-11-20T12:46:51.000+02:00
---------

Co-authored-by: Devin AI &lt;158243242+devin-ai-integration[bot]@users.noreply.github.com&gt;
Co-authored-by: Yosef Arbiv &lt;yosef.arbiv@gmail.com&gt;
diff --git a/integration_tests/tests/test_column_anomalies.py b/integration_tests/tests/test_column_anomalies.py
@@ -476,3 +476,105 @@ def test_anomalous_boolean_column_anomalies(test_id: str, dbt_project: DbtProjec
         "count_true",
         "count_false",
     }
+
+
+# Anomalies currently not supported on ClickHouse
+@pytest.mark.skip_targets(["clickhouse"])
+def test_col_anom_excl_detect_train(test_id: str, dbt_project: DbtProject):
+    """
+    Test the exclude_detection_period_from_training flag functionality for column anomalies.
+
+    Scenario:
+    - 30 days of normal data with low null count (0-2 nulls per day)
+    - 7 days of anomalous data with high null count (20 nulls per day) in detection period
+    - Without exclusion: anomaly gets included in training baseline, test passes (misses anomaly)
+    - With exclusion: anomaly excluded from training, test fails (detects anomaly)
+    """
+    utc_today = datetime.utcnow().date()
+
+    # Generate 30 days of normal data with variance in null count (8, 10, 12 pattern)
+    normal_pattern = [8, 10, 12]
+    normal_data = []
+    for i in range(30):
+        date = utc_today - timedelta(days=37 - i)
+        null_count = normal_pattern[i % 3]
+        normal_data.extend(
+            [
+                {TIMESTAMP_COLUMN: date.strftime(DATE_FORMAT), "superhero": superhero}
+                for superhero in ["Superman", "Batman", "Wonder Woman", "Flash"] * 10
+            ]
+        )
+        normal_data.extend(
+            [
+                {TIMESTAMP_COLUMN: date.strftime(DATE_FORMAT), "superhero": None}
+                for _ in range(null_count)
+            ]
+        )
+
+    # Generate 7 days of anomalous data (20 nulls per day) - 100% increase from mean
+    anomalous_data = []
+    for i in range(7):
+        date = utc_today - timedelta(days=7 - i)
+        anomalous_data.extend(
+            [
+                {TIMESTAMP_COLUMN: date.strftime(DATE_FORMAT), "superhero": superhero}
+                for superhero in ["Superman", "Batman", "Wonder Woman", "Flash"] * 10
+            ]
+        )
+        anomalous_data.extend(
+            [
+                {TIMESTAMP_COLUMN: date.strftime(DATE_FORMAT), "superhero": None}
+                for _ in range(20)
+            ]
+        )
+
+    all_data = normal_data + anomalous_data
+
+    # Test 1: WITHOUT exclusion (should pass - misses the anomaly because it's included in training)
+    test_args_without_exclusion = {
+        "timestamp_column": TIMESTAMP_COLUMN,
+        "column_anomalies": ["null_count"],
+        "time_bucket": {"period": "day", "count": 1},
+        "training_period": {"period": "day", "count": 30},
+        "detection_period": {"period": "day", "count": 7},
+        "min_training_set_size": 5,
+        "anomaly_sensitivity": 5,
+        "anomaly_direction": "spike",
+        "exclude_detection_period_from_training": False,
+    }
+
+    test_result_without_exclusion = dbt_project.test(
+        test_id + "_f",
+        DBT_TEST_NAME,
+        test_args_without_exclusion,
+        data=all_data,
+        test_column="superhero",
+        test_vars={"force_metrics_backfill": True},
+    )
+
+    # This should PASS because the anomaly is included in training, making it part of the baseline
+    assert test_result_without_exclusion["status"] == "pass", (
+        "Expected PASS when exclude_detection_period_from_training=False "
+        "(detection data included in training baseline)"
+    )
+
+    # Test 2: WITH exclusion (should fail - detects the anomaly because it's excluded from training)
+    test_args_with_exclusion = {
+        **test_args_without_exclusion,
+        "exclude_detection_period_from_training": True,
+    }
+
+    test_result_with_exclusion = dbt_project.test(
+        test_id + "_t",
+        DBT_TEST_NAME,
+        test_args_with_exclusion,
+        data=all_data,
+        test_column="superhero",
+        test_vars={"force_metrics_backfill": True},
+    )
+
+    # This should FAIL because the anomaly is excluded from training, so it's detected as anomalous
+    assert test_result_with_exclusion["status"] == "fail", (
+        "Expected FAIL when exclude_detection_period_from_training=True "
+        "(detection data excluded from training baseline, anomaly detected)"
+    )
diff --git a/macros/edr/tests/test_all_columns_anomalies.sql b/macros/edr/tests/test_all_columns_anomalies.sql
@@ -1,4 +1,4 @@
-{% test all_columns_anomalies(model, column_anomalies, exclude_prefix, exclude_regexp, timestamp_column, where_expression, anomaly_sensitivity, anomaly_direction, min_training_set_size, time_bucket, days_back, backfill_days, seasonality, sensitivity,ignore_small_changes, fail_on_zero, detection_delay, anomaly_exclude_metrics, detection_period, training_period, dimensions) %}
+{% test all_columns_anomalies(model, column_anomalies, exclude_prefix, exclude_regexp, timestamp_column, where_expression, anomaly_sensitivity, anomaly_direction, min_training_set_size, time_bucket, days_back, backfill_days, seasonality, sensitivity,ignore_small_changes, fail_on_zero, detection_delay, anomaly_exclude_metrics, detection_period, training_period, dimensions, exclude_detection_period_from_training=false) %}
     {{ config(tags = ['elementary-tests']) }}
     {%- if execute and elementary.is_test_command() and elementary.is_elementary_enabled() %}
         {% set model_relation = elementary.get_model_relation_for_test(model, elementary.get_test_model()) %}
@@ -37,7 +37,8 @@
                                                                                                    anomaly_exclude_metrics=anomaly_exclude_metrics,
                                                                                                    detection_period=detection_period,
                                                                                                    training_period=training_period,
-                                                                                                   dimensions=dimensions) %}
+                                                                                                   dimensions=dimensions,
+                                                                                                   exclude_detection_period_from_training=exclude_detection_period_from_training) %}
 
         {%- if not test_configuration %}
             {{ exceptions.raise_compiler_error("Failed to create test configuration dict for test `{}`".format(test_table_name)) }}
diff --git a/macros/edr/tests/test_column_anomalies.sql b/macros/edr/tests/test_column_anomalies.sql
@@ -1,4 +1,4 @@
-{% test column_anomalies(model, column_name, column_anomalies, timestamp_column, where_expression, anomaly_sensitivity, anomaly_direction, min_training_set_size, time_bucket, days_back, backfill_days, seasonality, sensitivity,ignore_small_changes, fail_on_zero, detection_delay, anomaly_exclude_metrics, detection_period, training_period, dimensions) %}
+{% test column_anomalies(model, column_name, column_anomalies, timestamp_column, where_expression, anomaly_sensitivity, anomaly_direction, min_training_set_size, time_bucket, days_back, backfill_days, seasonality, sensitivity,ignore_small_changes, fail_on_zero, detection_delay, anomaly_exclude_metrics, detection_period, training_period, dimensions, exclude_detection_period_from_training=false) %}
     {{ config(tags = ['elementary-tests']) }}
     {%- if execute and elementary.is_test_command() and elementary.is_elementary_enabled() %}
         {% set model_relation = elementary.get_model_relation_for_test(model, elementary.get_test_model()) %}
@@ -36,7 +36,8 @@
                                                                                                    anomaly_exclude_metrics=anomaly_exclude_metrics,
                                                                                                    detection_period=detection_period,
                                                                                                    training_period=training_period,
-                                                                                                   dimensions=dimensions) %}
+                                                                                                   dimensions=dimensions,
+                                                                                                   exclude_detection_period_from_training=exclude_detection_period_from_training) %}
 
         {%- if not test_configuration %}
             {{ exceptions.raise_compiler_error("Failed to create test configuration dict for test `{}`".format(test_table_name)) }}