
Commit 94002a8

Add integration test for exclude_detection_period_from_training with freshness anomalies
- Test validates that without the flag, anomalies in the detection period are included in training (test passes, anomaly missed)
- Test validates that with the flag, anomalies in the detection period are excluded from training (test fails, anomaly detected)
- Follows the same pattern as test_volume_anomalies.py::test_exclude_detection_from_training

Co-Authored-By: Yosef Arbiv <[email protected]>
1 parent f4b672b commit 94002a8
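
As background, the flag under test controls whether rows that fall inside the detection window also feed the training baseline. The sketch below is an illustration of that idea only; the function name and window logic are assumptions made for this note, not the elementary implementation the test exercises:

from datetime import datetime, timedelta
from typing import List, Tuple


def split_training_and_detection(
    timestamps: List[datetime],
    now: datetime,
    training_days: int = 30,
    detection_days: int = 7,
    exclude_detection_period_from_training: bool = False,
) -> Tuple[List[datetime], List[datetime]]:
    """Illustrative split of row timestamps into training and detection sets."""
    training_start = now - timedelta(days=training_days)
    detection_start = now - timedelta(days=detection_days)
    # Without the flag, training runs all the way to `now`, so anomalous rows
    # in the detection window also shape the baseline. With the flag, training
    # stops where detection begins.
    training_end = (
        detection_start if exclude_detection_period_from_training else now
    )
    training = [ts for ts in timestamps if training_start <= ts < training_end]
    detection = [ts for ts in timestamps if detection_start <= ts <= now]
    return training, detection

In this simplified model, the 8-hour-gap rows from the last 7 days loosen the baseline when they stay in training (test passes) and stand out against the 2-hour baseline when they are dropped from it (test fails), which is exactly the pair of assertions the new test makes.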


integration_tests/tests/test_freshness_anomalies.py

Lines changed: 76 additions & 0 deletions
@@ -233,3 +233,79 @@ def test_first_metric_null(test_id, dbt_project: DbtProject):
         materialization="incremental",
     )
     assert result["status"] == "pass"
+
+
+@pytest.mark.skip_targets(["clickhouse"])
+def test_exclude_detection_from_training(test_id: str, dbt_project: DbtProject):
+    """
+    Test the exclude_detection_period_from_training flag functionality for freshness anomalies.
+
+    Scenario:
+    - 30 days of normal data with consistent update frequency (every 2 hours)
+    - 7 days of anomalous data (slower updates every 8 hours) in detection period
+    - Without exclusion: anomaly gets included in training baseline, test passes (misses anomaly)
+    - With exclusion: anomaly excluded from training, test fails (detects anomaly)
+    """
+    utc_now = datetime.utcnow()
+
+    # Generate 30 days of normal data with consistent update frequency (every 2 hours)
+    normal_data = [
+        {TIMESTAMP_COLUMN: date.strftime(DATE_FORMAT)}
+        for date in generate_dates(
+            base_date=utc_now - timedelta(days=37),
+            step=timedelta(hours=2),
+            days_back=30,
+        )
+    ]
+
+    # Generate 7 days of anomalous data (slower updates every 8 hours) - this will be in the detection period
+    anomalous_data = [
+        {TIMESTAMP_COLUMN: date.strftime(DATE_FORMAT)}
+        for date in generate_dates(
+            base_date=utc_now - timedelta(days=7),
+            step=timedelta(hours=8),  # 4x slower than normal
+            days_back=7,
+        )
+    ]
+
+    all_data = normal_data + anomalous_data
+
+    # Test 1: WITHOUT exclusion (should pass - misses the anomaly because it is included in training)
+    test_args_without_exclusion = {
+        "timestamp_column": TIMESTAMP_COLUMN,
+        "training_period": {"period": "day", "count": 30},
+        "detection_period": {"period": "day", "count": 7},
+        "time_bucket": {"period": "day", "count": 1},
+        "sensitivity": 5,  # Higher sensitivity to allow the anomaly to be absorbed
+        # exclude_detection_period_from_training is not set (defaults to False/None)
+    }
+
+    test_result_without_exclusion = dbt_project.test(
+        test_id + "_without_exclusion",
+        TEST_NAME,
+        test_args_without_exclusion,
+        data=all_data,
+    )
+
+    # This should PASS because the anomaly is included in training, making it part of the baseline
+    assert (
+        test_result_without_exclusion["status"] == "pass"
+    ), "Test should pass when anomaly is included in training"
+
+    # Test 2: WITH exclusion (should fail - detects the anomaly because it is excluded from training)
+    test_args_with_exclusion = {
+        **test_args_without_exclusion,
+        "exclude_detection_period_from_training": True,
+    }
+
+    test_result_with_exclusion = dbt_project.test(
+        test_id + "_with_exclusion",
+        TEST_NAME,
+        test_args_with_exclusion,
+        data=all_data,
+    )
+
+    # This should FAIL because the anomaly is excluded from training, so it is detected as anomalous
+    assert (
+        test_result_with_exclusion["status"] == "fail"
+    ), "Test should fail when anomaly is excluded from training"
