Skip to content

Commit 8e826ec

Browse files
viirya and claude authored
feat(datafusion): Add Boolean predicate pushdown support (#2082)
## Which issue does this PR close?

- Closes #.

## What changes are included in this PR?

This commit adds comprehensive support for pushing down Boolean predicates to the Iceberg table scan layer, improving query performance by filtering data at the storage level.

Changes:
- Enhanced expr_to_predicate.rs to handle boolean column expressions:
  * Bare boolean columns in filters (e.g., WHERE is_active) are converted to column = true predicates
  * NOT of boolean columns (e.g., WHERE NOT is_active) are converted to column = false predicates
  * Added Boolean scalar value to Datum conversion
- Added comprehensive sqllogictest (boolean_predicate_pushdown.slt) with:
  * Tests for is_active = true/false with EXPLAIN verification
  * Tests for is_active != true with EXPLAIN verification
  * Tests for combined predicates (AND/OR)
  * Tests for IS NULL/IS NOT NULL on boolean columns
- Created test_boolean_table in engine setup for testing
- Updated test schedule and show_tables baseline

All tests verify that predicates are successfully pushed down to IcebergTableScan, not just executed in FilterExec.

## Are these changes tested?

---------

Co-authored-by: Claude Sonnet 4.5 <noreply@anthropic.com>
1 parent 1aee142 commit 8e826ec

File tree

3 files changed

+191
-2
lines changed

3 files changed

+191
-2
lines changed

crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,18 @@ pub fn convert_filters_to_predicate(filters: &[Expr]) -> Option<Predicate> {
5151
fn convert_filter_to_predicate(expr: &Expr) -> Option<Predicate> {
5252
match to_iceberg_predicate(expr) {
5353
TransformedResult::Predicate(predicate) => Some(predicate),
54-
TransformedResult::Column(_) | TransformedResult::Literal(_) => {
55-
unreachable!("Not a valid expression: {:?}", expr)
54+
TransformedResult::Column(column) => {
55+
// A bare column in a filter context represents a boolean column check
56+
// Convert it to: column = true
57+
Some(Predicate::Binary(BinaryExpression::new(
58+
PredicateOperator::Eq,
59+
column,
60+
Datum::bool(true),
61+
)))
62+
}
63+
TransformedResult::Literal(_) => {
64+
// Literal values in filter context cannot be pushed down
65+
None
5666
}
5767
_ => None,
5868
}
@@ -75,6 +85,14 @@ fn to_iceberg_predicate(expr: &Expr) -> TransformedResult {
7585
let expr = to_iceberg_predicate(exp);
7686
match expr {
7787
TransformedResult::Predicate(p) => TransformedResult::Predicate(!p),
88+
TransformedResult::Column(column) => {
89+
// NOT of a bare boolean column: NOT col => col = false
90+
TransformedResult::Predicate(Predicate::Binary(BinaryExpression::new(
91+
PredicateOperator::Eq,
92+
column,
93+
Datum::bool(false),
94+
)))
95+
}
7896
_ => TransformedResult::NotTransformed,
7997
}
8098
}
@@ -254,6 +272,7 @@ const MILLIS_PER_DAY: i64 = 24 * 60 * 60 * 1000;
254272
/// Convert a scalar value to an iceberg datum.
255273
fn scalar_value_to_datum(value: &ScalarValue) -> Option<Datum> {
256274
match value {
275+
ScalarValue::Boolean(Some(v)) => Some(Datum::bool(*v)),
257276
ScalarValue::Int8(Some(v)) => Some(Datum::int(*v as i32)),
258277
ScalarValue::Int16(Some(v)) => Some(Datum::int(*v as i32)),
259278
ScalarValue::Int32(Some(v)) => Some(Datum::int(*v)),
@@ -509,6 +528,23 @@ mod tests {
509528
assert_eq!(predicate, expected_predicate);
510529
}
511530

531+
#[test]
532+
fn test_scalar_value_to_datum_boolean() {
533+
use datafusion::common::ScalarValue;
534+
535+
// Test boolean true
536+
let datum = super::scalar_value_to_datum(&ScalarValue::Boolean(Some(true)));
537+
assert_eq!(datum, Some(Datum::bool(true)));
538+
539+
// Test boolean false
540+
let datum = super::scalar_value_to_datum(&ScalarValue::Boolean(Some(false)));
541+
assert_eq!(datum, Some(Datum::bool(false)));
542+
543+
// Test None boolean
544+
let datum = super::scalar_value_to_datum(&ScalarValue::Boolean(None));
545+
assert_eq!(datum, None);
546+
}
547+
512548
#[test]
513549
fn test_predicate_conversion_with_like_starts_with() {
514550
let sql = "bar LIKE 'test%'";
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
[engines]
19+
df = { type = "datafusion" }
20+
21+
[[steps]]
22+
engine = "df"
23+
slt = "df_test/boolean_predicate_pushdown.slt"
Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
# Create test table with boolean column
19+
statement ok
20+
CREATE TABLE default.default.test_boolean_table (id INT NOT NULL, is_active BOOLEAN, description STRING)
21+
22+
# Insert test data into test_boolean_table
23+
statement ok
24+
INSERT INTO default.default.test_boolean_table VALUES
25+
(1, true, 'Active user'),
26+
(2, false, 'Inactive user'),
27+
(3, true, 'Premium member'),
28+
(4, false, 'Trial expired'),
29+
(5, true, 'Verified account'),
30+
(6, NULL, 'Pending verification')
31+
32+
# Verify boolean equality predicate is pushed down to IcebergTableScan
33+
query TT
34+
EXPLAIN SELECT * FROM default.default.test_boolean_table WHERE is_active = true
35+
----
36+
logical_plan
37+
01)Filter: default.default.test_boolean_table.is_active
38+
02)--TableScan: default.default.test_boolean_table projection=[id, is_active, description], partial_filters=[default.default.test_boolean_table.is_active]
39+
physical_plan
40+
01)CoalesceBatchesExec: target_batch_size=8192
41+
02)--FilterExec: is_active@1
42+
03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
43+
04)------CooperativeExec
44+
05)--------IcebergTableScan projection:[id,is_active,description] predicate:[is_active = true]
45+
46+
# Query with is_active = true
47+
query ITT rowsort
48+
SELECT * FROM default.default.test_boolean_table WHERE is_active = true
49+
----
50+
1 true Active user
51+
3 true Premium member
52+
5 true Verified account
53+
54+
# Verify boolean false predicate is pushed down to IcebergTableScan
55+
query TT
56+
EXPLAIN SELECT * FROM default.default.test_boolean_table WHERE is_active = false
57+
----
58+
logical_plan
59+
01)Filter: NOT default.default.test_boolean_table.is_active
60+
02)--TableScan: default.default.test_boolean_table projection=[id, is_active, description], partial_filters=[NOT default.default.test_boolean_table.is_active]
61+
physical_plan
62+
01)CoalesceBatchesExec: target_batch_size=8192
63+
02)--FilterExec: NOT is_active@1
64+
03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
65+
04)------CooperativeExec
66+
05)--------IcebergTableScan projection:[id,is_active,description] predicate:[is_active = false]
67+
68+
# Query with is_active = false
69+
query ITT rowsort
70+
SELECT * FROM default.default.test_boolean_table WHERE is_active = false
71+
----
72+
2 false Inactive user
73+
4 false Trial expired
74+
75+
# Verify that is_active != true is simplified to NOT is_active and pushed down to IcebergTableScan as is_active = false
76+
query TT
77+
EXPLAIN SELECT * FROM default.default.test_boolean_table WHERE is_active != true
78+
----
79+
logical_plan
80+
01)Filter: NOT default.default.test_boolean_table.is_active
81+
02)--TableScan: default.default.test_boolean_table projection=[id, is_active, description], partial_filters=[NOT default.default.test_boolean_table.is_active]
82+
physical_plan
83+
01)CoalesceBatchesExec: target_batch_size=8192
84+
02)--FilterExec: NOT is_active@1
85+
03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
86+
04)------CooperativeExec
87+
05)--------IcebergTableScan projection:[id,is_active,description] predicate:[is_active = false]
88+
89+
# Query with is_active != true (returns only false rows; the NULL row is excluded because NULL != true evaluates to NULL)
90+
query ITT rowsort
91+
SELECT * FROM default.default.test_boolean_table WHERE is_active != true
92+
----
93+
2 false Inactive user
94+
4 false Trial expired
95+
96+
# Test combined boolean predicates with AND
97+
query ITT rowsort
98+
SELECT * FROM default.default.test_boolean_table WHERE is_active = true AND id > 2
99+
----
100+
3 true Premium member
101+
5 true Verified account
102+
103+
# Test combined boolean predicates with OR
104+
query ITT rowsort
105+
SELECT * FROM default.default.test_boolean_table WHERE is_active = true OR id = 2
106+
----
107+
1 true Active user
108+
2 false Inactive user
109+
3 true Premium member
110+
5 true Verified account
111+
112+
# Test IS NULL on boolean column
113+
query ITT
114+
SELECT * FROM default.default.test_boolean_table WHERE is_active IS NULL
115+
----
116+
6 NULL Pending verification
117+
118+
# Test IS NOT NULL on boolean column
119+
query ITT rowsort
120+
SELECT * FROM default.default.test_boolean_table WHERE is_active IS NOT NULL
121+
----
122+
1 true Active user
123+
2 false Inactive user
124+
3 true Premium member
125+
4 false Trial expired
126+
5 true Verified account
127+
128+
# Clean up: Drop the test table
129+
statement ok
130+
DROP TABLE default.default.test_boolean_table

0 commit comments

Comments
 (0)