Skip to content

Commit 23555db

Browse files
Add integration test for exclude_detection_period_from_training in freshness anomalies
Co-Authored-By: Yosef Arbiv <yosef.arbiv@gmail.com>
1 parent f432db7 commit 23555db

File tree

1 file changed

+74
-0
lines changed

1 file changed

+74
-0
lines changed

integration_tests/tests/test_freshness_anomalies.py

Lines changed: 74 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -233,3 +233,77 @@ def test_first_metric_null(test_id, dbt_project: DbtProject):
233233
materialization="incremental",
234234
)
235235
assert result["status"] == "pass"
236+
237+
238+
# Test for exclude_detection_period_from_training functionality
239+
# This test demonstrates the use case where:
240+
# 1. Detection period contains anomalous freshness data that would normally be included in training
241+
# 2. With exclude_detection_period_from_training=False: anomaly is missed (test passes) because training includes the anomaly
242+
# 3. With exclude_detection_period_from_training=True: anomaly is detected (test fails) because training excludes the anomaly
243+
@pytest.mark.skip_targets(["clickhouse"])
def test_exclude_detection_from_training(test_id: str, dbt_project: DbtProject):
    """
    Test the exclude_detection_period_from_training flag for freshness anomalies.

    Scenario:
    - 30 days of normal data with consistent freshness (data arrives every 2 hours),
      ending exactly where the detection period begins
    - 3 days of anomalous data (data arrives every 8 hours - slower/stale) in the
      detection period
    - Without exclusion: anomaly gets included in training baseline, test passes
      (misses anomaly)
    - With exclusion: anomaly excluded from training, test fails (detects anomaly)
    """
    # Single source of truth for the window sizes, shared by the generated data
    # and by the test arguments below so the two cannot drift apart.
    training_days = 30
    detection_days = 3

    utc_now = datetime.utcnow()
    detection_start = utc_now - timedelta(days=detection_days)

    # Normal data (every 2 hours) must fill the training window, i.e. the
    # `training_days` days that END at the start of the detection period.
    # NOTE(review): this assumes generate_dates walks backwards from its base
    # date for `days_back` days, as the anomalous call below implies - confirm.
    # Anchoring the base at `utc_now - 33 days` (as before) would place the
    # normal data 33-63 days back and leave the training window empty.
    normal_data = [
        {TIMESTAMP_COLUMN: ts.strftime(DATE_FORMAT)}
        for ts in generate_dates(
            detection_start,
            step=timedelta(hours=2),
            days_back=training_days,
        )
    ]

    # Anomalous data (every 8 hours) covers the detection period up to now.
    anomalous_data = [
        {TIMESTAMP_COLUMN: ts.strftime(DATE_FORMAT)}
        for ts in generate_dates(
            utc_now,
            step=timedelta(hours=8),
            days_back=detection_days,
        )
    ]

    all_data = normal_data + anomalous_data

    # Test 1: WITHOUT exclusion (should pass - misses the anomaly because it's
    # included in training).
    test_args_without_exclusion = {
        "timestamp_column": TIMESTAMP_COLUMN,
        "training_period": {"period": "day", "count": training_days},
        "detection_period": {"period": "day", "count": detection_days},
        "time_bucket": {"period": "day", "count": 1},
        "sensitivity": 5,  # Higher sensitivity to allow anomaly to be absorbed
        # exclude_detection_period_from_training is not set (defaults to False/None)
    }

    test_result_without_exclusion = dbt_project.test(
        test_id + "_without_exclusion",
        TEST_NAME,
        test_args_without_exclusion,
        data=all_data,
    )

    # This should PASS because the anomaly is included in training, making it
    # part of the baseline.
    assert (
        test_result_without_exclusion["status"] == "pass"
    ), "Test should pass when anomaly is included in training"

    # Test 2: WITH exclusion (should fail - detects the anomaly because it's
    # excluded from training). Only the flag differs from the first run.
    test_args_with_exclusion = {
        **test_args_without_exclusion,
        "exclude_detection_period_from_training": True,
    }

    test_result_with_exclusion = dbt_project.test(
        test_id + "_with_exclusion",
        TEST_NAME,
        test_args_with_exclusion,
        data=all_data,
    )

    # This should FAIL because the anomaly is excluded from training, so it's
    # detected as anomalous.
    assert (
        test_result_with_exclusion["status"] == "fail"
    ), "Test should fail when anomaly is excluded from training"

0 commit comments

Comments
 (0)