Skip to content

Commit d6c7bb3

Browse files
Add exclude_detection_period_from_training flag to column/dimension anomaly tests (#891)
* Add exclude_detection_period_from_training flag to column and dimension anomaly tests
  Co-Authored-By: Yosef Arbiv <[email protected]>
* Add test for exclude_detection_period_from_training flag in column anomaly tests
  Co-Authored-By: Yosef Arbiv <[email protected]>
* Fix test_exclude_detection_from_training_all_columns: shorten test ID suffixes and adjust test data for proper anomaly detection
  Co-Authored-By: Yosef Arbiv <[email protected]>
* Fix datetime.utcnow() deprecation: use datetime.now(timezone.utc) instead
  Co-Authored-By: Yosef Arbiv <[email protected]>
* Address PR feedback: revert dimension anomalies change and refactor test with parametrization
  Co-Authored-By: Yosef Arbiv <[email protected]>
---------
Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Co-authored-by: Yosef Arbiv <[email protected]>
1 parent cccc439 commit d6c7bb3

File tree

1 file changed

+102
-1
lines changed

1 file changed

+102
-1
lines changed

integration_tests/tests/test_all_columns_anomalies.py

Lines changed: 102 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
from datetime import datetime, timedelta
1+
from datetime import datetime, timedelta, timezone
22
from typing import Any, Dict, List
33

44
import pytest
@@ -153,3 +153,104 @@ def test_anomalyless_all_columns_anomalies_all_monitors_sanity(
153153
test_id, DBT_TEST_NAME, test_args, data=data, multiple_results=True
154154
)
155155
assert all([res["status"] == "pass" for res in test_results])
156+
157+
158+
# Anomalies currently not supported on ClickHouse
@pytest.mark.skip_targets(["clickhouse"])
@pytest.mark.parametrize(
    "exclude_detection,expected_status",
    [
        (False, "pass"),
        (True, "fail"),
    ],
    ids=["without_exclusion", "with_exclusion"],
)
def test_anomaly_in_detection_period(
    test_id: str,
    dbt_project: DbtProject,
    exclude_detection: bool,
    expected_status: str,
):
    """
    Check how exclude_detection_period_from_training affects column anomalies.

    Seeded data (40 rows per day, "superhero" is the monitored column):
    - Baseline: 30 days (37 to 8 days ago) whose daily null_count cycles
      through 8, 10, 12.
    - Detection window: 7 days (7 to 1 days ago) with 20 nulls per day.

    Expected outcome:
    - Flag omitted (exclude_detection=False): the spike is part of the
      training baseline, so the test is expected to pass.
    - Flag set (exclude_detection=True): the spike is left out of training,
      so the test is expected to fail (the anomaly is detected).
    """
    now = datetime.now(timezone.utc)

    def build_day_rows(days_ago: int, day_index: int, nulls: int) -> List[Dict[str, Any]]:
        # One day's worth of rows: `nulls` rows with a missing superhero first,
        # then named rows filling the day up to 40. The hero name alternates
        # by day index.
        stamp = (now - timedelta(days=days_ago)).strftime(DATE_FORMAT)
        hero = "Batman" if day_index % 2 else "Superman"
        day_rows: List[Dict[str, Any]] = [
            {TIMESTAMP_COLUMN: stamp, "superhero": None} for _ in range(nulls)
        ]
        day_rows += [
            {TIMESTAMP_COLUMN: stamp, "superhero": hero} for _ in range(40 - nulls)
        ]
        return day_rows

    rows: List[Dict[str, Any]] = []

    # Baseline: 30 days with a little variance in null_count (8, 10, 12 pattern).
    baseline_nulls = (8, 10, 12)
    for idx in range(30):
        rows.extend(build_day_rows(37 - idx, idx, baseline_nulls[idx % 3]))

    # Detection window: 7 days at 20 nulls per day - 100% increase from mean.
    for idx in range(7):
        rows.extend(build_day_rows(7 - idx, idx, 20))

    test_args: Dict[str, Any] = {
        "timestamp_column": TIMESTAMP_COLUMN,
        "column_anomalies": ["null_count"],
        "training_period": {"period": "day", "count": 30},
        "detection_period": {"period": "day", "count": 7},
        "time_bucket": {"period": "day", "count": 1},
        "sensitivity": 5,
    }
    # The flag is only sent when requested; otherwise the argument is omitted.
    if exclude_detection:
        test_args["exclude_detection_period_from_training"] = True

    test_results = dbt_project.test(
        test_id,
        DBT_TEST_NAME,
        test_args,
        data=rows,
        multiple_results=True,
    )

    # Pick the first result reported for the "superhero" column.
    hero_result = None
    for result in test_results:
        if result["column_name"].lower() == "superhero":
            hero_result = result
            break

    assert hero_result is not None, "superhero column result not found"
    actual_status = hero_result["status"]
    assert (
        actual_status == expected_status
    ), f"Expected status '{expected_status}' but got '{actual_status}' (exclude_detection={exclude_detection})"

0 commit comments

Comments
 (0)