Skip to content

Commit 15c3290

Browse files
Add exclude_detection_period_from_training flag to dimension anomaly test (#890)
* Add exclude_detection_period_from_training flag to dimension anomaly test - Added exclude_detection_period_from_training parameter to test_dimension_anomalies macro signature with default value false - Passed the parameter through to get_anomalies_test_configuration - This brings dimension anomalies in line with table/volume anomalies which already support this flag - The underlying logic in get_anomaly_scores_query.sql already handles this parameter for all anomaly types Co-Authored-By: Yosef Arbiv <[email protected]> * Add integration test for exclude_detection_period_from_training in dimension anomalies - Added test_dimension_exclude_detection_from_training to demonstrate the flag's behavior - Test shows that without exclusion, anomaly is missed (test passes) because training includes the detection period - Test shows that with exclusion, anomaly is detected (test fails) because training excludes the detection period - Uses 30 days of normal data with variance (45/50/55 pattern) and 7 days of anomalous data (72/28 distribution) - Follows the same pattern as test_exclude_detection_from_training in test_volume_anomalies.py Co-Authored-By: Yosef Arbiv <[email protected]> * Fix test_dimension_exclude_detection_from_training: shorten test ID suffixes to avoid Postgres 63-char limit Co-Authored-By: Yosef Arbiv <[email protected]> * Refactor test to use parametrization: rename to test_anomaly_in_detection_period - Rename test_dimension_exclude_detection_from_training to test_anomaly_in_detection_period - Add @pytest.mark.parametrize decorator with exclude_detection and expected_status parameters - Use descriptive IDs: include_detection_in_training and exclude_detection_from_training - Consolidate two test cases into one parametrized test for better maintainability - Addresses reviewer feedback on PR #890 Co-Authored-By: Yosef Arbiv <[email protected]> * Fix Postgres 63-char identifier limit: shorten parametrize IDs and remove redundant suffix - Change parametrize IDs 
from 'include_detection_in_training'/'exclude_detection_from_training' to 'exclude_false'/'exclude_true' - Remove redundant suffix (_incl/_excl) since pytest parametrize IDs already differentiate test cases - New table names: test_anomaly_in_detection_period_exclude_false (44 chars) and test_anomaly_in_detection_period_exclude_true (43 chars) - Both are well under Postgres 63-character limit - Fixes CI failures on Postgres (latest_official and latest_pre) Co-Authored-By: Yosef Arbiv <[email protected]> * Fix test_anomaly_in_detection_period to use date object instead of datetime Change utc_now from datetime.utcnow() to datetime.utcnow().date() to match the pattern used in other tests. Date arithmetic already works correctly with date objects. --------- Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Co-authored-by: Yosef Arbiv <[email protected]> Co-authored-by: arbiv <[email protected]>
1 parent d6c7bb3 commit 15c3290

File tree

2 files changed

+104
-2
lines changed

2 files changed

+104
-2
lines changed

integration_tests/tests/test_dimension_anomalies.py

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,3 +218,104 @@ def test_dimension_anomalies_with_timestamp_exclude_final_results(
218218
test_result = dbt_project.test(test_id, DBT_TEST_NAME, test_args, data=data)
219219
assert test_result["status"] == "fail"
220220
assert test_result["failures"] == 1
221+
222+
223+
# Test for exclude_detection_period_from_training functionality
224+
# This test demonstrates the use case where:
225+
# 1. Detection period contains anomalous distribution data that would normally be included in training
226+
# 2. With exclude_detection=False: anomaly is missed (test passes) because training includes the anomaly
227+
# 3. With exclude_detection=True: anomaly is detected (test fails) because training excludes the anomaly
228+
@pytest.mark.skip_targets(["clickhouse"])
@pytest.mark.parametrize(
    "exclude_detection,expected_status",
    [
        # Detection window feeds the baseline, so the shift is absorbed.
        (False, "pass"),
        # Detection window is held out of the baseline, so the shift is flagged.
        (True, "fail"),
    ],
    # Kept short on purpose: Postgres caps identifiers at 63 characters.
    ids=["exclude_false", "exclude_true"],
)
def test_anomaly_in_detection_period(
    test_id: str,
    dbt_project: DbtProject,
    exclude_detection: bool,
    expected_status: str,
):
    """
    Check how exclude_detection_period_from_training affects dimension anomalies.

    Seed data (100 rows per day, split between two "superhero" values):
    - Days 37..8 before today: baseline days, Superman count cycling
      45 / 50 / 55 (Spiderman gets the remainder).
    - Days 7..1 before today: shifted days with 72 Superman / 28 Spiderman,
      i.e. the 7-day detection period.

    Expected outcome:
    - Flag not set: the shifted days are part of the training baseline and
      the test is expected to pass (the anomaly is missed).
    - Flag set to True: the shifted days are left out of training and the
      test is expected to fail (the anomaly is detected).
    """
    today = datetime.utcnow().date()

    def rows_for_day(days_ago, superman_count):
        # One day's worth of rows: all Superman rows first, then Spiderman rows.
        stamp = (today - timedelta(days=days_ago)).strftime(DATE_FORMAT)
        heroes = ["Superman"] * superman_count
        heroes += ["Spiderman"] * (100 - superman_count)
        return [{TIMESTAMP_COLUMN: stamp, "superhero": hero} for hero in heroes]

    # (days_ago, superman_count) pairs, oldest day first.
    baseline_counts = (45, 50, 55)
    schedule = [
        (37 - offset, baseline_counts[offset % 3]) for offset in range(30)
    ]
    schedule.extend((7 - offset, 72) for offset in range(7))

    seed_rows = [
        row
        for days_ago, superman_count in schedule
        for row in rows_for_day(days_ago, superman_count)
    ]

    overrides = {
        "training_period": {"period": "day", "count": 30},
        "detection_period": {"period": "day", "count": 7},
        "time_bucket": {"period": "day", "count": 1},
        "sensitivity": 5,
    }
    # The flag is only sent when enabled; otherwise the macro default applies.
    if exclude_detection:
        overrides["exclude_detection_period_from_training"] = True
    test_args = {**DBT_TEST_ARGS, **overrides}

    test_result = dbt_project.test(
        test_id, DBT_TEST_NAME, test_args, data=seed_rows
    )

    assert test_result["status"] == expected_status

macros/edr/tests/test_dimension_anomalies.sql

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
{% test dimension_anomalies(model, dimensions, timestamp_column, where_expression, anomaly_sensitivity, anomaly_direction, min_training_set_size, time_bucket, days_back, backfill_days, seasonality, sensitivity,ignore_small_changes, fail_on_zero, detection_delay, anomaly_exclude_metrics, detection_period, training_period, exclude_final_results) %}
1+
{% test dimension_anomalies(model, dimensions, timestamp_column, where_expression, anomaly_sensitivity, anomaly_direction, min_training_set_size, time_bucket, days_back, backfill_days, seasonality, sensitivity,ignore_small_changes, fail_on_zero, detection_delay, anomaly_exclude_metrics, detection_period, training_period, exclude_final_results, exclude_detection_period_from_training=false) %}
22
{{ config(tags = ['elementary-tests']) }}
33
{%- if execute and elementary.is_test_command() and elementary.is_elementary_enabled() %}
44
{% set model_relation = elementary.get_model_relation_for_test(model, elementary.get_test_model()) %}
@@ -39,7 +39,8 @@
3939
anomaly_exclude_metrics=anomaly_exclude_metrics,
4040
detection_period=detection_period,
4141
training_period=training_period,
42-
exclude_final_results=exclude_final_results) %}
42+
exclude_final_results=exclude_final_results,
43+
exclude_detection_period_from_training=exclude_detection_period_from_training) %}
4344

4445
{%- if not test_configuration %}
4546
{{ exceptions.raise_compiler_error("Failed to create test configuration dict for test `{}`".format(test_table_name)) }}

0 commit comments

Comments
 (0)