Skip to content

Commit 5cd80c5

Browse files
Revert "Align test data with detection period for clarity"
This reverts commit 38b64db.
1 parent dc423c0 commit 5cd80c5

File tree

1 file changed

+31
-15
lines changed

1 file changed

+31
-15
lines changed

integration_tests/tests/test_freshness_anomalies.py

Lines changed: 31 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -241,44 +241,60 @@ def test_exclude_detection_from_training(test_id: str, dbt_project: DbtProject):
241241
Test the exclude_detection_period_from_training flag functionality for freshness anomalies.
242242
243243
Scenario:
244-
- 7 days of normal data with frequent updates (every 2 hours) from day -14 to day -8
245-
- 7 days of anomalous data (only 1 update per day at noon) from day -7 to day -1
246-
- Detection period: last 7 days (days -7 to -1)
247-
- Training period: 7 days
248-
- Without exclusion: training = detection window (anomalous pattern) → test passes
249-
- With exclusion: training = days -14 to -8 (normal pattern) → test fails (detects anomaly)
244+
- 30 days of normal data with frequent updates (every 2 hours)
245+
- 7 days of anomalous data (only 1 update per day at noon) on days -14 to -7, just before the detection period
246+
- Without exclusion: anomaly gets included in training baseline, test passes (misses anomaly)
247+
- With exclusion: anomaly excluded from training, test fails (detects anomaly)
248+
249+
Data Generation Details:
250+
- Normal data: days -67 to -37 (generate_dates goes backward from base_date for days_back days)
251+
- Anomalous data: days -14 to -7 at noon (once per day)
252+
- detection_end: utc_now + 1 day (to include "today" in the detection period)
253+
- Detection period: 7 days back from detection_end = days -6 to 0
254+
255+
Why This Works:
256+
The freshness metric measures the maximum time gap between consecutive updates within each
257+
daily bucket. Even though the anomalous events are at days -14 to -7 (outside the detection
258+
period of days -6 to 0), the freshness metric for days -6 to 0 is still high because the
259+
last update was at day -7 noon. This "propagation effect" means the freshness values in the
260+
detection period reflect the missing updates, making the test work correctly.
261+
262+
- Without exclusion: The high freshness values in days -6 to 0 are included in training,
263+
normalizing them into the baseline → test PASSES
264+
- With exclusion: The high freshness values in days -6 to 0 are excluded from training,
265+
so they stand out against the normal baseline → test FAILS
250266
"""
251267
utc_now = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)
252268

253-
# Generate 7 days of normal data with frequent updates (every 2 hours) from day -14 to day -8
269+
# Generate 30 days of normal data with frequent updates (every 2 hours)
254270
normal_data = [
255271
{TIMESTAMP_COLUMN: date.strftime(DATE_FORMAT)}
256272
for date in generate_dates(
257-
base_date=utc_now - timedelta(days=8),
273+
base_date=utc_now - timedelta(days=37),
258274
step=timedelta(hours=2),
259-
days_back=7,
275+
days_back=30,
260276
)
261277
]
262278

263-
# Generate 7 days of anomalous data (only 1 update per day at noon) from day -7 to day -1
279+
# Generate 7 days of anomalous data (only 1 update per day at noon)
264280
anomalous_data = [
265281
{TIMESTAMP_COLUMN: date.strftime(DATE_FORMAT)}
266282
for date in generate_dates(
267-
base_date=(utc_now - timedelta(days=1)).replace(hour=12, minute=0),
283+
base_date=(utc_now - timedelta(days=7)).replace(hour=12, minute=0),
268284
step=timedelta(hours=24),
269285
days_back=7,
270286
)
271287
]
272288

273289
all_data = normal_data + anomalous_data
274290

275-
# Test 1: WITHOUT exclusion (should pass - training includes detection window with anomalous pattern)
291+
# Test 1: WITHOUT exclusion (should pass - misses the anomaly because it's included in training)
276292
test_args_without_exclusion = {
277293
"timestamp_column": TIMESTAMP_COLUMN,
278-
"training_period": {"period": "day", "count": 7},
294+
"training_period": {"period": "day", "count": 30},
279295
"detection_period": {"period": "day", "count": 7},
280296
"time_bucket": {"period": "day", "count": 1},
281-
"days_back": 20,
297+
"days_back": 40,
282298
"backfill_days": 0,
283299
"sensitivity": 3,
284300
"min_training_set_size": 5,
@@ -289,7 +305,7 @@ def test_exclude_detection_from_training(test_id: str, dbt_project: DbtProject):
289305
},
290306
}
291307

292-
detection_end = utc_now
308+
detection_end = utc_now + timedelta(days=1)
293309

294310
test_result_without_exclusion = dbt_project.test(
295311
test_id + "_without_exclusion",

0 commit comments

Comments (0)