Skip to content

Commit cccc439

Browse files
Add exclude_detection_period_from_training flag to column anomalies tests (#889)
--------- Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Co-authored-by: Yosef Arbiv <[email protected]>
1 parent e160076 commit cccc439

File tree

3 files changed

+108
-4
lines changed

3 files changed

+108
-4
lines changed

integration_tests/tests/test_column_anomalies.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -476,3 +476,105 @@ def test_anomalous_boolean_column_anomalies(test_id: str, dbt_project: DbtProjec
476476
"count_true",
477477
"count_false",
478478
}
479+
480+
481+
# Anomalies currently not supported on ClickHouse
482+
@pytest.mark.skip_targets(["clickhouse"])
483+
def test_col_anom_excl_detect_train(test_id: str, dbt_project: DbtProject):
484+
"""
485+
Test the exclude_detection_period_from_training flag functionality for column anomalies.
486+
487+
Scenario:
488+
    - 30 days of normal data with a low, slightly varying null count (8, 10 or 12 nulls per day, mean 10)
489+
- 7 days of anomalous data with high null count (20 nulls per day) in detection period
490+
- Without exclusion: anomaly gets included in training baseline, test passes (misses anomaly)
491+
- With exclusion: anomaly excluded from training, test fails (detects anomaly)
492+
"""
493+
utc_today = datetime.utcnow().date()
494+
495+
# Generate 30 days of normal data with variance in null count (8, 10, 12 pattern)
496+
normal_pattern = [8, 10, 12]
497+
normal_data = []
498+
for i in range(30):
499+
date = utc_today - timedelta(days=37 - i)
500+
null_count = normal_pattern[i % 3]
501+
normal_data.extend(
502+
[
503+
{TIMESTAMP_COLUMN: date.strftime(DATE_FORMAT), "superhero": superhero}
504+
for superhero in ["Superman", "Batman", "Wonder Woman", "Flash"] * 10
505+
]
506+
)
507+
normal_data.extend(
508+
[
509+
{TIMESTAMP_COLUMN: date.strftime(DATE_FORMAT), "superhero": None}
510+
for _ in range(null_count)
511+
]
512+
)
513+
514+
# Generate 7 days of anomalous data (20 nulls per day) - 100% increase from mean
515+
anomalous_data = []
516+
for i in range(7):
517+
date = utc_today - timedelta(days=7 - i)
518+
anomalous_data.extend(
519+
[
520+
{TIMESTAMP_COLUMN: date.strftime(DATE_FORMAT), "superhero": superhero}
521+
for superhero in ["Superman", "Batman", "Wonder Woman", "Flash"] * 10
522+
]
523+
)
524+
anomalous_data.extend(
525+
[
526+
{TIMESTAMP_COLUMN: date.strftime(DATE_FORMAT), "superhero": None}
527+
for _ in range(20)
528+
]
529+
)
530+
531+
all_data = normal_data + anomalous_data
532+
533+
# Test 1: WITHOUT exclusion (should pass - misses the anomaly because it's included in training)
534+
test_args_without_exclusion = {
535+
"timestamp_column": TIMESTAMP_COLUMN,
536+
"column_anomalies": ["null_count"],
537+
"time_bucket": {"period": "day", "count": 1},
538+
"training_period": {"period": "day", "count": 30},
539+
"detection_period": {"period": "day", "count": 7},
540+
"min_training_set_size": 5,
541+
"anomaly_sensitivity": 5,
542+
"anomaly_direction": "spike",
543+
"exclude_detection_period_from_training": False,
544+
}
545+
546+
test_result_without_exclusion = dbt_project.test(
547+
test_id + "_f",
548+
DBT_TEST_NAME,
549+
test_args_without_exclusion,
550+
data=all_data,
551+
test_column="superhero",
552+
test_vars={"force_metrics_backfill": True},
553+
)
554+
555+
# This should PASS because the anomaly is included in training, making it part of the baseline
556+
assert test_result_without_exclusion["status"] == "pass", (
557+
"Expected PASS when exclude_detection_period_from_training=False "
558+
"(detection data included in training baseline)"
559+
)
560+
561+
# Test 2: WITH exclusion (should fail - detects the anomaly because it's excluded from training)
562+
test_args_with_exclusion = {
563+
**test_args_without_exclusion,
564+
"exclude_detection_period_from_training": True,
565+
}
566+
567+
test_result_with_exclusion = dbt_project.test(
568+
test_id + "_t",
569+
DBT_TEST_NAME,
570+
test_args_with_exclusion,
571+
data=all_data,
572+
test_column="superhero",
573+
test_vars={"force_metrics_backfill": True},
574+
)
575+
576+
# This should FAIL because the anomaly is excluded from training, so it's detected as anomalous
577+
assert test_result_with_exclusion["status"] == "fail", (
578+
"Expected FAIL when exclude_detection_period_from_training=True "
579+
"(detection data excluded from training baseline, anomaly detected)"
580+
)

macros/edr/tests/test_all_columns_anomalies.sql

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
{% test all_columns_anomalies(model, column_anomalies, exclude_prefix, exclude_regexp, timestamp_column, where_expression, anomaly_sensitivity, anomaly_direction, min_training_set_size, time_bucket, days_back, backfill_days, seasonality, sensitivity,ignore_small_changes, fail_on_zero, detection_delay, anomaly_exclude_metrics, detection_period, training_period, dimensions) %}
1+
{% test all_columns_anomalies(model, column_anomalies, exclude_prefix, exclude_regexp, timestamp_column, where_expression, anomaly_sensitivity, anomaly_direction, min_training_set_size, time_bucket, days_back, backfill_days, seasonality, sensitivity,ignore_small_changes, fail_on_zero, detection_delay, anomaly_exclude_metrics, detection_period, training_period, dimensions, exclude_detection_period_from_training=false) %}
22
{{ config(tags = ['elementary-tests']) }}
33
{%- if execute and elementary.is_test_command() and elementary.is_elementary_enabled() %}
44
{% set model_relation = elementary.get_model_relation_for_test(model, elementary.get_test_model()) %}
@@ -37,7 +37,8 @@
3737
anomaly_exclude_metrics=anomaly_exclude_metrics,
3838
detection_period=detection_period,
3939
training_period=training_period,
40-
dimensions=dimensions) %}
40+
dimensions=dimensions,
41+
exclude_detection_period_from_training=exclude_detection_period_from_training) %}
4142

4243
{%- if not test_configuration %}
4344
{{ exceptions.raise_compiler_error("Failed to create test configuration dict for test `{}`".format(test_table_name)) }}

macros/edr/tests/test_column_anomalies.sql

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
{% test column_anomalies(model, column_name, column_anomalies, timestamp_column, where_expression, anomaly_sensitivity, anomaly_direction, min_training_set_size, time_bucket, days_back, backfill_days, seasonality, sensitivity,ignore_small_changes, fail_on_zero, detection_delay, anomaly_exclude_metrics, detection_period, training_period, dimensions) %}
1+
{% test column_anomalies(model, column_name, column_anomalies, timestamp_column, where_expression, anomaly_sensitivity, anomaly_direction, min_training_set_size, time_bucket, days_back, backfill_days, seasonality, sensitivity,ignore_small_changes, fail_on_zero, detection_delay, anomaly_exclude_metrics, detection_period, training_period, dimensions, exclude_detection_period_from_training=false) %}
22
{{ config(tags = ['elementary-tests']) }}
33
{%- if execute and elementary.is_test_command() and elementary.is_elementary_enabled() %}
44
{% set model_relation = elementary.get_model_relation_for_test(model, elementary.get_test_model()) %}
@@ -36,7 +36,8 @@
3636
anomaly_exclude_metrics=anomaly_exclude_metrics,
3737
detection_period=detection_period,
3838
training_period=training_period,
39-
dimensions=dimensions) %}
39+
dimensions=dimensions,
40+
exclude_detection_period_from_training=exclude_detection_period_from_training) %}
4041

4142
{%- if not test_configuration %}
4243
{{ exceptions.raise_compiler_error("Failed to create test configuration dict for test `{}`".format(test_table_name)) }}

0 commit comments

Comments
 (0)