Skip to content

Commit 515faa4

Browse files
Benjamin Gutzmann (gutzbenj)
authored and committed
Refactor IqrOutlierCheck to consistently exclude date filters from WHERE clauses and data requirements
1 parent 247ee6e commit 515faa4

File tree

4 files changed

+22
-11
lines changed

4 files changed

+22
-11
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,7 @@ Types of changes:
2020

2121
- Refactor executor to bulk load data into DuckDB memory and run checks from there
2222
- Update ValuesInSetCheck config to require value_set as a set of str, bool, or int
23+
- Refactor IqrOutlierCheck to consistently exclude date filters from WHERE clauses and data requirements
2324

2425
### Fixed
2526

src/koality/checks.py

Lines changed: 13 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -474,7 +474,11 @@ def assemble_query(self) -> str:
474474
"""Assemble the complete SQL query for this check."""
475475
main_query = self.query_boilerplate(self.transformation_statement())
476476

477-
if where_statement := self.assemble_where_statement(self.filters):
477+
filters = self.filters.copy()
478+
if isinstance(self, IqrOutlierCheck):
479+
filters = {name: cfg for name, cfg in filters.items() if cfg.get("type") != "date"}
480+
481+
if where_statement := self.assemble_where_statement(filters):
478482
return main_query + "\n" + where_statement
479483

480484
return main_query
@@ -1579,9 +1583,6 @@ def __init__(
15791583
monitor_only=monitor_only,
15801584
)
15811585

1582-
# Remove date filter from WHERE clause (it's used in the interval SQL, not WHERE)
1583-
self.filters = {name: cfg for name, cfg in self.filters.items() if cfg.get("type") != "date"}
1584-
15851586
def transformation_statement(self) -> str:
15861587
"""Return the SQL statement for IQR-based outlier detection."""
15871588
# TODO: currently we only raise an error if there is no data for the date
@@ -1592,10 +1593,12 @@ def transformation_statement(self) -> str:
15921593
date_col = self.date_filter["column"]
15931594
date_val = self.date_filter["value"]
15941595

1595-
if self.filters:
1596-
filter_columns = ",\n".join([v["column"] for v in self.filters.values()])
1596+
filters = {k: v for k, v in self.filters.items() if v["type"] != "date"}
1597+
1598+
if filters:
1599+
filter_columns = ",\n".join([v["column"] for v in filters.values()])
15971600
filter_columns = ",\n" + filter_columns
1598-
where_statement = self.assemble_where_statement(self.filters)
1601+
where_statement = self.assemble_where_statement(filters)
15991602
where_statement = "\nAND\n" + where_statement.removeprefix("WHERE\n")
16001603
return f"""
16011604
WITH
@@ -1670,7 +1673,9 @@ def assemble_data_exists_query(self) -> str:
16701673
date_col = self.date_filter["column"]
16711674
date_val = self.date_filter["value"]
16721675

1673-
where_statement = self.assemble_where_statement(self.filters)
1676+
filters = {k: v for k, v in self.filters.items() if v["type"] != "date"}
1677+
1678+
where_statement = self.assemble_where_statement(filters)
16741679
if where_statement:
16751680
where_statement = f"{where_statement} AND CAST({date_col} AS DATE) = DATE '{date_val}'"
16761681
else:

src/koality/executor.py

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -202,7 +202,7 @@ def _get_dataset_cache_key(check_instance: DataQualityCheck) -> tuple:
202202

203203
return table, database_accessor, date_value, filters_key
204204

205-
def get_data_requirements(self) -> defaultdict[str, defaultdict[str, set]]:
205+
def get_data_requirements(self) -> defaultdict[str, defaultdict[str, set]]: # noqa: C901
206206
"""Aggregate data requirements from all checks.
207207
208208
This method collects all required tables, columns, and filter configurations
@@ -216,10 +216,11 @@ def get_data_requirements(self) -> defaultdict[str, defaultdict[str, set]]:
216216
data_requirements = defaultdict(lambda: defaultdict(set))
217217
for check in self.checks:
218218
table_name = check.table
219+
check_filters = check.filters
219220
# Add check-specific columns and filter columns to the requirements
220221
if check.check_column and check.check_column != "*":
221222
data_requirements[table_name]["columns"].add(check.check_column)
222-
for _filter in check.filters.values():
223+
for _filter in check_filters.values():
223224
if "column" in _filter:
224225
data_requirements[table_name]["columns"].add(_filter["column"])
225226

@@ -234,8 +235,11 @@ def get_data_requirements(self) -> defaultdict[str, defaultdict[str, set]]:
234235
if "column" in _filter:
235236
data_requirements[check.right_table]["columns"].add(_filter["column"])
236237

238+
if isinstance(check, IqrOutlierCheck):
239+
check_filters = {k: v for k, v in check.filters.items() if v.get("type") != "date"}
240+
237241
# Store unique filter configurations for each table
238-
filter_key = frozenset((name, frozenset(config.items())) for name, config in check.filters.items())
242+
filter_key = frozenset((name, frozenset(config.items())) for name, config in check_filters.items())
239243
data_requirements[table_name]["filters"].add(filter_key)
240244
return data_requirements
241245

tests/integration/test_column_checks.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -646,6 +646,7 @@ def test_iqr_outlier_check_two_shops_success(duckdb_client_iqr_two_shops: duckdb
646646
},
647647
)
648648
result = check(duckdb_client_iqr_two_shops)
649+
assert result["METRIC_NAME"] == "VALUE_outlier_iqr_both_1_5"
649650
assert result["VALUE"] == 101.0
650651
assert result["RESULT"] == "SUCCESS"
651652
assert result["IDENTIFIER"] == "SHOP_ID=abcd"

0 commit comments

Comments
 (0)