Skip to content

Commit 36878be

Browse files
Add test for exclude_detection_period_from_training flag in column anomaly tests
Co-Authored-By: Yosef Arbiv <[email protected]>
1 parent 02779f6 commit 36878be

File tree

1 file changed

+118
-0
lines changed

1 file changed

+118
-0
lines changed

integration_tests/tests/test_all_columns_anomalies.py

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,3 +153,121 @@ def test_anomalyless_all_columns_anomalies_all_monitors_sanity(
153153
test_id, DBT_TEST_NAME, test_args, data=data, multiple_results=True
154154
)
155155
assert all([res["status"] == "pass" for res in test_results])
156+
157+
158+
def _superhero_day_rows(day: datetime, null_rows: int, value_rows: int = 0, value=None) -> list:
    """Build one day's worth of rows for the ``superhero`` column.

    Every row carries the timestamp of ``day`` formatted with ``DATE_FORMAT``.
    The first ``null_rows`` rows hold ``None``; the following ``value_rows``
    rows hold ``value``.
    """
    timestamp = day.strftime(DATE_FORMAT)
    # A fresh dict is created per row so that no two rows alias the same object.
    null_part = [
        {TIMESTAMP_COLUMN: timestamp, "superhero": None} for _ in range(null_rows)
    ]
    value_part = [
        {TIMESTAMP_COLUMN: timestamp, "superhero": value} for _ in range(value_rows)
    ]
    return null_part + value_part


def _find_superhero_result(test_results):
    """Return the first result whose column name is ``superhero``, or ``None``.

    The column name is compared case-insensitively.
    """
    return next(
        (res for res in test_results if res["column_name"].lower() == "superhero"),
        None,
    )


# Anomalies currently not supported on ClickHouse
@pytest.mark.skip_targets(["clickhouse"])
def test_exclude_detection_from_training_all_columns(
    test_id: str, dbt_project: DbtProject
):
    """
    Test the exclude_detection_period_from_training flag functionality for column anomalies.

    Scenario:
    - 30 days of normal data with consistent null_count pattern (2 nulls per day)
    - 7 days of anomalous data (10 nulls per day) in detection period
    - Without exclusion: anomaly gets included in training baseline, test passes (misses anomaly)
    - With exclusion: anomaly excluded from training, test fails (detects anomaly)
    """
    utc_now = datetime.utcnow()

    # 30 days of normal data, covering 37 down to 8 days before now:
    # 2 nulls + 8 non-null values per day, alternating the hero by day parity.
    normal_data = []
    for i in range(30):
        day = utc_now - timedelta(days=37 - i)
        hero = "Superman" if i % 2 == 0 else "Batman"
        normal_data.extend(
            _superhero_day_rows(day, null_rows=2, value_rows=8, value=hero)
        )

    # 7 days of anomalous data, covering 7 down to 1 days before now:
    # 10 nulls and no non-null values, so each day still has 10 rows in total.
    anomalous_data = []
    for i in range(7):
        day = utc_now - timedelta(days=7 - i)
        anomalous_data.extend(_superhero_day_rows(day, null_rows=10))

    all_data = normal_data + anomalous_data

    # Test 1: WITHOUT exclusion (should pass - misses the anomaly because it's included in training)
    test_args_without_exclusion = {
        "timestamp_column": TIMESTAMP_COLUMN,
        "column_anomalies": ["null_count"],
        "training_period": {"period": "day", "count": 30},
        "detection_period": {"period": "day", "count": 7},
        "time_bucket": {"period": "day", "count": 1},
        "sensitivity": 5,  # Higher sensitivity to allow anomaly to be absorbed
        # exclude_detection_period_from_training is not set (defaults to False/None)
    }

    test_results_without_exclusion = dbt_project.test(
        test_id + "_without_exclusion",
        DBT_TEST_NAME,
        test_args_without_exclusion,
        data=all_data,
        multiple_results=True,
    )

    # This should PASS because the anomaly is included in training, making it part of the baseline.
    # Presence and status are asserted separately so a failure says which one went wrong.
    superhero_result = _find_superhero_result(test_results_without_exclusion)
    assert superhero_result, "No result found for the superhero column (without exclusion)"
    assert (
        superhero_result["status"] == "pass"
    ), "Test should pass when anomaly is included in training"

    # Test 2: WITH exclusion (should fail - detects the anomaly because it's excluded from training)
    test_args_with_exclusion = {
        **test_args_without_exclusion,
        "exclude_detection_period_from_training": True,
    }

    test_results_with_exclusion = dbt_project.test(
        test_id + "_with_exclusion",
        DBT_TEST_NAME,
        test_args_with_exclusion,
        data=all_data,
        multiple_results=True,
    )

    # This should FAIL because the anomaly is excluded from training, so it's detected as anomalous
    superhero_result = _find_superhero_result(test_results_with_exclusion)
    assert superhero_result, "No result found for the superhero column (with exclusion)"
    assert (
        superhero_result["status"] == "fail"
    ), "Test should fail when anomaly is excluded from training"

0 commit comments

Comments
 (0)