
Commit 8a70f2a

Fix test_column_anomalies_exclude_detection_period_from_training with more substantial dataset
- Use 30 days of normal data with a low null count (0-2 nulls/day) instead of 1 day
- Use 7 days of anomalous data with a high null count (20 nulls/day) instead of 1 day
- Update the training period to 30 days and the detection period to 7 days
- Add more data per day to create a clearer anomaly signal
- Use separate test IDs for the two test runs to avoid conflicts
- Match the pattern of the successful volume and freshness anomaly tests

Co-Authored-By: Yosef Arbiv <[email protected]>
1 parent 6b60a66 commit 8a70f2a
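For context, the flag works roughly as sketched below: with exclusion enabled, the training window ends where the detection window begins instead of overlapping it. This is only an illustration of the windowing the commit message describes; the real logic lives in the elementary dbt package, and the variable names and date arithmetic here are assumptions.

from datetime import date, timedelta

# Hypothetical windowing sketch; the package's actual implementation
# may differ in details.
today = date.today()
detection_period = timedelta(days=7)
training_period = timedelta(days=30)

detection_start = today - detection_period

# exclude_detection_period_from_training=False: training counts back from
# today, so the 7 anomalous days fall inside the baseline.
training_start_overlapping = today - training_period

# exclude_detection_period_from_training=True: training ends where the
# detection window begins, keeping the anomalous days out of the baseline.
training_start_excluded = detection_start - training_period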

File tree

1 file changed: +70 -38 lines changed

integration_tests/tests/test_column_anomalies.py

Lines changed: 70 additions & 38 deletions
@@ -483,67 +483,99 @@ def test_anomalous_boolean_column_anomalies(test_id: str, dbt_project: DbtProject
 def test_column_anomalies_exclude_detection_period_from_training(
     test_id: str, dbt_project: DbtProject
 ):
+    """
+    Test the exclude_detection_period_from_training flag functionality for column anomalies.
+
+    Scenario:
+    - 30 days of normal data with low null count (0-2 nulls per day)
+    - 7 days of anomalous data with high null count (20 nulls per day) in detection period
+    - Without exclusion: anomaly gets included in training baseline, test passes (misses anomaly)
+    - With exclusion: anomaly excluded from training, test fails (detects anomaly)
+    """
     utc_today = datetime.utcnow().date()
-    test_date, *training_dates = generate_dates(base_date=utc_today - timedelta(1))
-
-    data: List[Dict[str, Any]] = [
-        {
-            TIMESTAMP_COLUMN: cur_date.strftime(DATE_FORMAT),
-            "superhero": superhero,
-        }
-        for cur_date in training_dates
-        for superhero in ["Superman", "Batman"]
-    ]
-
-    data += [
-        {TIMESTAMP_COLUMN: test_date.strftime(DATE_FORMAT), "superhero": None}
-        for _ in range(10)
-    ]
 
-    test_args_false = {
+    # Generate 30 days of normal data with low null count (0-2 nulls per day)
+    normal_data = []
+    for i in range(30):
+        date = utc_today - timedelta(days=37 - i)
+        normal_data.extend(
+            [
+                {TIMESTAMP_COLUMN: date.strftime(DATE_FORMAT), "superhero": superhero}
+                for superhero in ["Superman", "Batman", "Wonder Woman", "Flash"] * 5
+            ]
+        )
+        null_count = i % 3
+        normal_data.extend(
+            [
+                {TIMESTAMP_COLUMN: date.strftime(DATE_FORMAT), "superhero": None}
+                for _ in range(null_count)
+            ]
+        )
+
+    # Generate 7 days of anomalous data with high null count (20 nulls per day)
+    anomalous_data = []
+    for i in range(7):
+        date = utc_today - timedelta(days=7 - i)
+        anomalous_data.extend(
+            [
+                {TIMESTAMP_COLUMN: date.strftime(DATE_FORMAT), "superhero": superhero}
+                for superhero in ["Superman", "Batman"]
+            ]
+        )
+        anomalous_data.extend(
+            [
+                {TIMESTAMP_COLUMN: date.strftime(DATE_FORMAT), "superhero": None}
+                for _ in range(20)
+            ]
+        )
+
+    all_data = normal_data + anomalous_data
+
+    # Test 1: WITHOUT exclusion (should pass - misses the anomaly because it's included in training)
+    test_args_without_exclusion = {
         "timestamp_column": TIMESTAMP_COLUMN,
         "column_anomalies": ["null_count"],
         "time_bucket": {"period": "day", "count": 1},
-        "training_period": {"period": "day", "count": 1},
-        "detection_period": {"period": "day", "count": 1},
-        "min_training_set_size": 1,
+        "training_period": {"period": "day", "count": 30},
+        "detection_period": {"period": "day", "count": 7},
+        "min_training_set_size": 5,
         "anomaly_sensitivity": 3,
         "anomaly_direction": "spike",
         "exclude_detection_period_from_training": False,
     }
-    test_result_false = dbt_project.test(
-        test_id,
+
+    test_result_without_exclusion = dbt_project.test(
+        test_id + "_without_exclusion",
         DBT_TEST_NAME,
-        test_args_false,
-        data=data,
+        test_args_without_exclusion,
+        data=all_data,
         test_column="superhero",
         test_vars={"force_metrics_backfill": True},
     )
-    assert test_result_false["status"] == "pass", (
+
+    # This should PASS because the anomaly is included in training, making it part of the baseline
+    assert test_result_without_exclusion["status"] == "pass", (
         "Expected PASS when exclude_detection_period_from_training=False "
         "(detection data included in training baseline)"
     )
 
-    test_args_true = {
-        "timestamp_column": TIMESTAMP_COLUMN,
-        "column_anomalies": ["null_count"],
-        "time_bucket": {"period": "day", "count": 1},
-        "training_period": {"period": "day", "count": 1},
-        "detection_period": {"period": "day", "count": 1},
-        "min_training_set_size": 1,
-        "anomaly_sensitivity": 3,
-        "anomaly_direction": "spike",
+    # Test 2: WITH exclusion (should fail - detects the anomaly because it's excluded from training)
+    test_args_with_exclusion = {
+        **test_args_without_exclusion,
         "exclude_detection_period_from_training": True,
     }
-    test_result_true = dbt_project.test(
-        test_id,
+
+    test_result_with_exclusion = dbt_project.test(
+        test_id + "_with_exclusion",
         DBT_TEST_NAME,
-        test_args_true,
-        data=data,
+        test_args_with_exclusion,
+        data=all_data,
         test_column="superhero",
         test_vars={"force_metrics_backfill": True},
     )
-    assert test_result_true["status"] == "fail", (
+
+    # This should FAIL because the anomaly is excluded from training, so it's detected as anomalous
+    assert test_result_with_exclusion["status"] == "fail", (
         "Expected FAIL when exclude_detection_period_from_training=True "
         "(detection data excluded from training baseline, anomaly detected)"
     )
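
As a rough sanity check on the two assertions, the back-of-the-envelope arithmetic below reproduces the pass/fail split under a plain z-score model using the test's sensitivity of 3. This is a sketch under that assumption; Elementary's actual detector is more involved, and the helper names here are illustrative only.

# Back-of-the-envelope check of the two assertions, assuming a simple
# z-score detector (a sketch, not the package's exact algorithm).
from statistics import mean, stdev

normal = [i % 3 for i in range(30)]   # daily null_count on the 30 normal days (0-2)
anomalous = [20] * 7                  # daily null_count on the 7 detection days
SENSITIVITY = 3                       # matches anomaly_sensitivity in the test


def zscore(value: float, training: list) -> float:
    return (value - mean(training)) / stdev(training)


# Without exclusion the detection days inflate the baseline's mean and
# spread, so a 20-null day scores only about 2 sigma -> no anomaly, PASS.
z_included = zscore(20, normal + anomalous)

# With exclusion the baseline is only the quiet normal days, so a 20-null
# day scores roughly 23 sigma -> clear spike, FAIL (anomaly detected).
z_excluded = zscore(20, normal)

assert z_included < SENSITIVITY < z_excluded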
