Skip to content

Commit 098d625

Browse files
Fix integration test: remove flaky freshness test, add event freshness test with proper window alignment
Co-Authored-By: Yosef Arbiv <[email protected]>
1 parent 23555db commit 098d625

File tree

2 files changed

+81
-74
lines changed

2 files changed

+81
-74
lines changed

integration_tests/tests/test_event_freshness_anomalies.py

Lines changed: 81 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,4 @@
1+
import random
12
from datetime import datetime, timedelta
23

34
import pytest
@@ -88,3 +89,83 @@ def test_slower_rate_event_freshness(test_id: str, dbt_project: DbtProject):
8889
test_vars={"custom_run_started_at": test_started_at.isoformat()},
8990
)
9091
assert result["status"] == "fail"
92+
93+
94+
# Anomalies currently not supported on ClickHouse
@pytest.mark.skip_targets(["clickhouse"])
def test_exclude_detection_from_training(test_id: str, dbt_project: DbtProject):
    """
    Test the exclude_detection_period_from_training flag for event freshness anomalies.

    Scenario:
    - 14 days total: 7 days normal (small jitter) + 7 days anomalous (large lag)
    - Without exclusion: 7 anomalous days contaminate training, test passes
    - With exclusion: only 7 normal days in training, anomaly detected, test fails

    The same seed data is tested twice (under distinct test ids), once without
    and once with ``exclude_detection_period_from_training``.
    """
    # Pin the run to exactly midnight. The microsecond component has to be
    # reset as well, otherwise the "custom_run_started_at" value carries a
    # different fractional-second suffix on every run.
    test_started_at = datetime.utcnow().replace(
        hour=0, minute=0, second=0, microsecond=0
    )
    started_at_iso = test_started_at.isoformat()

    # Fixed seed so the jitter applied to the normal rows is reproducible.
    random.seed(42)

    # NOTE(review): generate_dates is defined outside this view. Confirm in
    # which direction it walks from the base date, so that the two ranges
    # below really land in the training / detection windows described above.
    normal_start = test_started_at - timedelta(days=14)
    normal_data = []
    for event_time in generate_dates(normal_start, step=STEP, days_back=7):
        # Normal rows: the update trails the event by 0-10 minutes.
        update_time = event_time + timedelta(minutes=random.randint(0, 10))
        normal_data.append(
            {
                EVENT_TIMESTAMP_COLUMN: event_time.strftime(DATE_FORMAT),
                UPDATE_TIMESTAMP_COLUMN: update_time.strftime(DATE_FORMAT),
            }
        )

    anomalous_start = test_started_at - timedelta(days=7)
    anomalous_data = []
    for event_time in generate_dates(anomalous_start, step=STEP, days_back=7):
        # Anomalous rows: the update trails the event by a constant 5 hours.
        update_time = event_time + timedelta(hours=5)
        anomalous_data.append(
            {
                EVENT_TIMESTAMP_COLUMN: event_time.strftime(DATE_FORMAT),
                UPDATE_TIMESTAMP_COLUMN: update_time.strftime(DATE_FORMAT),
            }
        )

    all_data = normal_data + anomalous_data

    test_args_without_exclusion = {
        "event_timestamp_column": EVENT_TIMESTAMP_COLUMN,
        "update_timestamp_column": UPDATE_TIMESTAMP_COLUMN,
        "days_back": 14,
        "backfill_days": 7,
        "time_bucket": {"period": "hour", "count": 1},
        "sensitivity": 3,
    }

    test_result_without_exclusion = dbt_project.test(
        test_id + "_without_exclusion",
        TEST_NAME,
        test_args_without_exclusion,
        data=all_data,
        test_vars={"custom_run_started_at": started_at_iso},
    )

    assert (
        test_result_without_exclusion["status"] == "pass"
    ), "Test should pass when anomaly is included in training"

    # Same configuration, with only the exclusion flag switched on.
    test_args_with_exclusion = {
        **test_args_without_exclusion,
        "exclude_detection_period_from_training": True,
    }

    test_result_with_exclusion = dbt_project.test(
        test_id + "_with_exclusion",
        TEST_NAME,
        test_args_with_exclusion,
        data=all_data,
        test_vars={"custom_run_started_at": started_at_iso},
    )

    assert (
        test_result_with_exclusion["status"] == "fail"
    ), "Test should fail when anomaly is excluded from training"

integration_tests/tests/test_freshness_anomalies.py

Lines changed: 0 additions & 74 deletions
Original file line number | Diff line number | Diff line change
@@ -233,77 +233,3 @@ def test_first_metric_null(test_id, dbt_project: DbtProject):
233233
materialization="incremental",
234234
)
235235
assert result["status"] == "pass"
236-
237-
238-
# Covers the exclude_detection_period_from_training option for freshness anomalies:
# 1. The detection period holds anomalous freshness data that would normally also be used for training
# 2. Flag unset/False: the anomaly is part of the training baseline, so it is missed (test passes)
# 3. Flag True: the anomaly is kept out of training, so it is detected (test fails)
@pytest.mark.skip_targets(["clickhouse"])
def test_exclude_detection_from_training(test_id: str, dbt_project: DbtProject):
    """
    Check how exclude_detection_period_from_training affects freshness anomaly results.

    Scenario:
    - 30 days of normal data with consistent freshness (one row every 2 hours)
    - 3 days of anomalous data (one row every 8 hours - slower/stale) in the detection period
    - Without exclusion: the anomaly is absorbed into the training baseline, test passes
    - With exclusion: the anomaly is left out of training, test fails
    """
    utc_now = datetime.utcnow()

    # Normal rows: 30 days of timestamps spaced 2 hours apart.
    normal_dates = generate_dates(
        utc_now - timedelta(days=33), step=timedelta(hours=2), days_back=30
    )
    seed_rows = []
    for normal_date in normal_dates:
        seed_rows.append({TIMESTAMP_COLUMN: normal_date.strftime(DATE_FORMAT)})

    # Anomalous rows: 3 days of timestamps spaced 8 hours apart.
    anomalous_dates = generate_dates(utc_now, step=timedelta(hours=8), days_back=3)
    for anomalous_date in anomalous_dates:
        seed_rows.append({TIMESTAMP_COLUMN: anomalous_date.strftime(DATE_FORMAT)})

    # Run 1: flag left unset (defaults to False/None) - expected to pass.
    base_args = {
        "timestamp_column": TIMESTAMP_COLUMN,
        "training_period": {"period": "day", "count": 30},
        "detection_period": {"period": "day", "count": 3},
        "time_bucket": {"period": "day", "count": 1},
        "sensitivity": 5,  # Higher sensitivity so the anomaly can be absorbed
    }
    result_without = dbt_project.test(
        test_id + "_without_exclusion",
        TEST_NAME,
        base_args,
        data=seed_rows,
    )

    # The anomaly is part of the training baseline here, so no failure is expected.
    assert (
        result_without["status"] == "pass"
    ), "Test should pass when anomaly is included in training"

    # Run 2: identical arguments plus the exclusion flag - expected to fail.
    excluding_args = dict(base_args)
    excluding_args["exclude_detection_period_from_training"] = True
    result_with = dbt_project.test(
        test_id + "_with_exclusion",
        TEST_NAME,
        excluding_args,
        data=seed_rows,
    )

    # With the detection period excluded from training, the anomaly should be flagged.
    assert (
        result_with["status"] == "fail"
    ), "Test should fail when anomaly is excluded from training"

0 commit comments

Comments
 (0)