Skip to content

Commit 0523e8e

Browse files
Benjamin Gutzmann (gutzbenj)
authored and committed
Ensure MatchRateCheck only requires the check column from the left table
1 parent 08b4541 commit 0523e8e

File tree

4 files changed

+76
-12
lines changed

4 files changed

+76
-12
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ Types of changes:
2727
### Fixed
2828

2929
- Quote table identifiers in bulk SELECTs when loading data into DuckDB memory to avoid BigQuery binder errors for identifiers that look like project IDs (e.g., `EC0601`). Added an integration test covering the quoting behavior.
30+
- Ensure MatchRateCheck only requires the check column from the left table; the right table now only contributes join and filter columns to avoid unnecessary column selection and errors.
3031

3132
## [0.9.0] - 2026-01-16
3233

docs/checks/matchrate.md

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# MatchRateCheck — check_column placement
2+
3+
Guidance:
4+
5+
- The `check_column` for `MatchRateCheck` must be a column present in the left-hand table only.
6+
- The right-hand table is only expected to provide join columns and filter columns; it must not be required to contain `check_column`.
7+
8+
Configuration example:
9+
10+
```yaml
11+
- defaults:
12+
check_type: MatchRateCheck
13+
check_column: product_number # must exist on the left table
14+
join_columns_left:
15+
- BQ_PARTITIONTIME
16+
- shopId
17+
- product_number
18+
join_columns_right:
19+
- BQ_PARTITIONTIME
20+
- value.shopId
21+
- product_number
22+
checks:
23+
- left_table: project.dataset.left_table
24+
right_table: project.dataset.right_table
25+
filters:
26+
shop_id:
27+
value: SHOP01
28+
```
29+
30+
Notes:
31+
32+
- The executor only requests the `check_column` from the left table during bulk loading; the right table will only be queried for its join and filter columns.
33+
- This avoids errors when the right table does not contain the `check_column` or when its identifier resembles a BigQuery project ID.

src/koality/executor.py

Lines changed: 40 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,33 @@ def get_data_requirements(self) -> defaultdict[str, defaultdict[str, set]]: # n
215215
"""
216216
data_requirements = defaultdict(lambda: defaultdict(set))
217217
for check in self.checks:
218+
# Skip synthetic JOIN table entries created for MatchRateCheck; handle left/right tables explicitly
219+
if isinstance(check, MatchRateCheck):
220+
# Add columns and filter columns for left table
221+
if check.check_column and check.check_column != "*":
222+
data_requirements[check.left_table]["columns"].add(check.check_column)
223+
for _filter in check.filters_left.values():
224+
if "column" in _filter:
225+
data_requirements[check.left_table]["columns"].add(_filter["column"])
226+
data_requirements[check.left_table]["columns"].update(check.join_columns_left)
227+
228+
# Add only filter and join columns for right table (check_column is only from left table)
229+
for _filter in check.filters_right.values():
230+
if "column" in _filter:
231+
data_requirements[check.right_table]["columns"].add(_filter["column"])
232+
data_requirements[check.right_table]["columns"].update(check.join_columns_right)
233+
234+
# Store unique filter configurations for both tables
235+
filter_key_left = frozenset(
236+
(name, frozenset(config.items())) for name, config in check.filters_left.items()
237+
)
238+
filter_key_right = frozenset(
239+
(name, frozenset(config.items())) for name, config in check.filters_right.items()
240+
)
241+
data_requirements[check.left_table]["filters"].add(filter_key_left)
242+
data_requirements[check.right_table]["filters"].add(filter_key_right)
243+
continue
244+
218245
table_name = check.table
219246
check_filters = check.filters
220247
# Add check-specific columns and filter columns to the requirements
@@ -224,16 +251,12 @@ def get_data_requirements(self) -> defaultdict[str, defaultdict[str, set]]: # n
224251
if "column" in _filter:
225252
data_requirements[table_name]["columns"].add(_filter["column"])
226253

227-
# For MatchRateCheck, add columns from both left and right tables
228-
if isinstance(check, MatchRateCheck):
229-
data_requirements[check.left_table]["columns"].update(check.join_columns_left)
230-
data_requirements[check.right_table]["columns"].update(check.join_columns_right)
231-
for _filter in check.filters_left.values():
232-
if "column" in _filter:
233-
data_requirements[check.left_table]["columns"].add(_filter["column"])
234-
for _filter in check.filters_right.values():
235-
if "column" in _filter:
236-
data_requirements[check.right_table]["columns"].add(_filter["column"])
254+
if isinstance(check, IqrOutlierCheck):
255+
check_filters = {k: v for k, v in check.filters.items() if v.get("type") != "date"}
256+
257+
# Store unique filter configurations for each table
258+
filter_key = frozenset((name, frozenset(config.items())) for name, config in check_filters.items())
259+
data_requirements[table_name]["filters"].add(filter_key)
237260

238261
if isinstance(check, IqrOutlierCheck):
239262
check_filters = {k: v for k, v in check.filters.items() if v.get("type") != "date"}
@@ -271,10 +294,16 @@ def fetch_data_into_memory(self, data_requirements: defaultdict[str, defaultdict
271294
if all_filters_sql:
272295
final_where_clause = "WHERE " + " OR ".join(all_filters_sql)
273296

297+
# Determine appropriate table quoting depending on database provider
298+
if self.database_provider and getattr(self.database_provider, "type", "").lower() == "bigquery":
299+
table_ref = f"`{table}`"
300+
else:
301+
table_ref = f'"{table}"'
302+
274303
# Construct the bulk SELECT query
275304
select_query = f"""
276305
SELECT {columns}
277-
FROM "{table}"
306+
FROM {table_ref}
278307
{final_where_clause}
279308
""" # noqa: S608
280309

tests/integration/test_executor_table_quoting.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,4 +58,5 @@ def fake_execute_query(
5858
executor.fetch_data_into_memory(data_requirements)
5959

6060
assert captured["query"] is not None
61-
assert f'FROM "{table_name}"' in captured["query"]
61+
# Expect backticks for BigQuery provider
62+
assert f"FROM `{table_name}`" in captured["query"]

0 commit comments

Comments (0)