Simplify predicates in PushDownFilter optimizer rule (#16362)

xudong963 · web-flow · commit 969ed5e10315 · 2025-06-25T09:51:04.000+08:00
* Simplify predicates in filter

* add slt test

* Use BTreeMap to make tests stable

* process edge corner cases

* add doc for simplify_predicates.rs

* add as_literal to make code neat

* reorganize file

* reduce clone call
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs
@@ -2069,6 +2069,15 @@ impl Expr {
             _ => None,
         }
     }
+
+    /// Returns a reference to the wrapped [`ScalarValue`] when this expression
+    /// is an `Expr::Literal`, and `None` for every other variant.
+    pub fn as_literal(&self) -> Option<&ScalarValue> {
+        match self {
+            Expr::Literal(value, _) => Some(value),
+            _ => None,
+        }
+    }
 }
 
 impl Normalizeable for Expr {
diff --git a/datafusion/optimizer/src/push_down_filter.rs b/datafusion/optimizer/src/push_down_filter.rs
@@ -40,6 +40,7 @@ use datafusion_expr::{
 };
 
 use crate::optimizer::ApplyOrder;
+use crate::simplify_expressions::simplify_predicates;
 use crate::utils::{has_all_column_refs, is_restrict_null_predicate};
 use crate::{OptimizerConfig, OptimizerRule};
 
@@ -779,6 +780,18 @@ impl OptimizerRule for PushDownFilter {
             return Ok(Transformed::no(plan));
         };
 
+        let predicate = split_conjunction_owned(filter.predicate.clone());
+        let old_predicate_len = predicate.len();
+        let new_predicates = simplify_predicates(predicate)?;
+        if old_predicate_len != new_predicates.len() {
+            let Some(new_predicate) = conjunction(new_predicates) else {
+                // new_predicates is empty - remove the filter entirely
+                // Return the child plan without the filter
+                return Ok(Transformed::yes(Arc::unwrap_or_clone(filter.input)));
+            };
+            filter.predicate = new_predicate;
+        }
+
         match Arc::unwrap_or_clone(filter.input) {
             LogicalPlan::Filter(child_filter) => {
                 let parents_predicates = split_conjunction_owned(filter.predicate);
diff --git a/datafusion/optimizer/src/simplify_expressions/mod.rs b/datafusion/optimizer/src/simplify_expressions/mod.rs
@@ -23,6 +23,7 @@ mod guarantees;
 mod inlist_simplifier;
 mod regex;
 pub mod simplify_exprs;
+mod simplify_predicates;
 mod unwrap_cast;
 mod utils;
 
@@ -31,6 +32,7 @@ pub use datafusion_expr::simplify::{SimplifyContext, SimplifyInfo};
 
 pub use expr_simplifier::*;
 pub use simplify_exprs::*;
+pub use simplify_predicates::simplify_predicates;
 
 // Export for test in datafusion/core/tests/optimizer_integration.rs
 pub use guarantees::GuaranteeRewriter;
diff --git a/datafusion/optimizer/src/simplify_expressions/simplify_predicates.rs b/datafusion/optimizer/src/simplify_expressions/simplify_predicates.rs
@@ -0,0 +1,247 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Simplifies predicates by reducing redundant or overlapping conditions.
+//!
+//! This module provides functionality to optimize logical predicates used in query planning
+//! by eliminating redundant conditions, thus reducing the number of predicates to evaluate.
+//! Unlike the simplifier in `simplify_expressions/simplify_exprs.rs`, which focuses on
+//! general expression simplification (e.g., constant folding and algebraic simplifications),
+//! this module specifically targets predicate optimization by handling containment relationships.
+//! For example, it can simplify `x > 5 AND x > 6` to just `x > 6`, as the latter condition
+//! encompasses the former, resulting in fewer checks during query execution.
+
+use datafusion_common::{Column, Result, ScalarValue};
+use datafusion_expr::{BinaryExpr, Cast, Expr, Operator};
+use std::collections::BTreeMap;
+
+/// Removes redundant comparison predicates from a conjunction.
+///
+/// Every predicate of the form `column <cmp> literal` or `literal <cmp> column`
+/// (where `<cmp>` is one of `>`, `>=`, `<`, `<=`, `=` and the column may be wrapped
+/// in casts) is bucketed by the column it refers to, and each bucket is reduced by
+/// `simplify_column_predicates`. For instance, `x > 5 AND x > 6` is reduced to
+/// `x > 6`. All other predicates are passed through untouched.
+///
+/// The returned vector lists the untouched predicates first, followed by the
+/// reduced predicates of each column in the ordering of the `BTreeMap` keys.
+///
+/// # Arguments
+/// * `predicates` - The conjuncts to simplify.
+///
+/// # Returns
+/// A `Result` holding the simplified conjuncts. Inputs with at most one
+/// predicate are returned as-is.
+pub fn simplify_predicates(predicates: Vec<Expr>) -> Result<Vec<Expr>> {
+    // Nothing can be redundant with fewer than two conjuncts
+    if predicates.len() < 2 {
+        return Ok(predicates);
+    }
+
+    // A BTreeMap keeps the per-column output order deterministic
+    let mut by_column: BTreeMap<Column, Vec<Expr>> = BTreeMap::new();
+    let mut passthrough = Vec::new();
+
+    for predicate in predicates {
+        // Determine which column (if any) this predicate constrains
+        let key = match &predicate {
+            Expr::BinaryExpr(BinaryExpr {
+                left,
+                op:
+                    Operator::Gt
+                    | Operator::GtEq
+                    | Operator::Lt
+                    | Operator::LtEq
+                    | Operator::Eq,
+                right,
+            }) => {
+                if right.as_literal().is_some() {
+                    // `column <cmp> literal`
+                    extract_column_from_expr(left)
+                } else if left.as_literal().is_some() {
+                    // `literal <cmp> column`
+                    extract_column_from_expr(right)
+                } else {
+                    None
+                }
+            }
+            _ => None,
+        };
+
+        match key {
+            Some(column) => by_column.entry(column).or_default().push(predicate),
+            None => passthrough.push(predicate),
+        }
+    }
+
+    // Untouched predicates come first, then the reduced per-column groups
+    let mut simplified = passthrough;
+    for group in by_column.into_values() {
+        simplified.extend(simplify_column_predicates(group)?);
+    }
+
+    Ok(simplified)
+}
+
+/// Simplifies predicates related to a single column.
+///
+/// The predicates are split into lower bounds (`col > lit`, `col >= lit`,
+/// `lit < col`, `lit <= col`), upper bounds (`col < lit`, `col <= lit`,
+/// `lit > col`, `lit >= col`) and equalities. Within each bound category the most
+/// restrictive predicate is kept when `find_most_restrictive_predicate` can
+/// identify one; otherwise the whole category is kept unchanged. For example,
+/// among `x > 5` and `x > 6`, only `x > 6` is retained.
+///
+/// Equalities are de-duplicated. They are replaced by a literal `false` only when
+/// two of them compare the *same* operand expression against non-null literals
+/// that are provably different (e.g. `x = 5 AND x = 6`). Equalities that merely
+/// look different, such as `x = 5 AND 5 = x` or `CAST(x AS T) = 5 AND x = 5`,
+/// are kept as they are because they can be satisfied simultaneously.
+///
+/// # Arguments
+/// * `predicates` - Comparison predicates (`>`, `>=`, `<`, `<=`, `=`) between one
+///   column and a literal.
+///
+/// # Returns
+/// A `Result` containing the simplified predicates for the column, ordered as
+/// equalities, then lower bounds, then upper bounds.
+///
+/// # Panics
+/// Panics if a predicate is not a binary comparison using one of the operators
+/// listed above.
+fn simplify_column_predicates(predicates: Vec<Expr>) -> Result<Vec<Expr>> {
+    /// Splits a comparison into its non-literal operand and its literal value.
+    fn operand_and_literal(pred: &Expr) -> Option<(&Expr, &ScalarValue)> {
+        let Expr::BinaryExpr(BinaryExpr { left, right, .. }) = pred else {
+            return None;
+        };
+        match (right.as_literal(), left.as_literal()) {
+            (Some(literal), _) => Some((&**left, literal)),
+            (_, Some(literal)) => Some((&**right, literal)),
+            _ => None,
+        }
+    }
+
+    /// Returns true only when both equalities constrain the same operand to
+    /// non-null literals that are known to differ.
+    fn equalities_conflict(a: &Expr, b: &Expr) -> bool {
+        let (Some((operand_a, literal_a)), Some((operand_b, literal_b))) =
+            (operand_and_literal(a), operand_and_literal(b))
+        else {
+            return false;
+        };
+        operand_a == operand_b
+            && !literal_a.is_null()
+            && !literal_b.is_null()
+            && matches!(
+                literal_a.partial_cmp(literal_b),
+                Some(std::cmp::Ordering::Less | std::cmp::Ordering::Greater)
+            )
+    }
+
+    if predicates.len() <= 1 {
+        return Ok(predicates);
+    }
+
+    // Group by operator type, but combining similar operators
+    let mut greater_predicates = Vec::new(); // Combines > and >=
+    let mut less_predicates = Vec::new(); // Combines < and <=
+    let mut eq_predicates = Vec::new();
+
+    for pred in predicates {
+        match &pred {
+            Expr::BinaryExpr(BinaryExpr { left: _, op, right }) => {
+                // The direction of the bound flips when the literal is on the left
+                match (op, right.as_literal().is_some()) {
+                    (Operator::Gt, true)
+                    | (Operator::Lt, false)
+                    | (Operator::GtEq, true)
+                    | (Operator::LtEq, false) => greater_predicates.push(pred),
+                    (Operator::Lt, true)
+                    | (Operator::Gt, false)
+                    | (Operator::LtEq, true)
+                    | (Operator::GtEq, false) => less_predicates.push(pred),
+                    (Operator::Eq, _) => eq_predicates.push(pred),
+                    _ => unreachable!("Unexpected operator: {}", op),
+                }
+            }
+            _ => unreachable!("Unexpected predicate {pred}"),
+        }
+    }
+
+    let mut result = Vec::new();
+
+    if !eq_predicates.is_empty() {
+        // Drop exact duplicates while preserving the original order
+        let mut distinct: Vec<Expr> = Vec::with_capacity(eq_predicates.len());
+        for pred in eq_predicates {
+            if !distinct.contains(&pred) {
+                distinct.push(pred);
+            }
+        }
+
+        // Only a proven contradiction may collapse the equalities into `false`;
+        // structurally different equalities can still be satisfiable together
+        let contradictory = distinct.iter().enumerate().any(|(idx, first)| {
+            distinct[idx + 1..]
+                .iter()
+                .any(|second| equalities_conflict(first, second))
+        });
+
+        if contradictory {
+            result.push(Expr::Literal(ScalarValue::Boolean(Some(false)), None));
+        } else {
+            result.extend(distinct);
+        }
+    }
+
+    // Handle all greater-than-style predicates (keep the most restrictive - highest value)
+    if !greater_predicates.is_empty() {
+        if let Some(most_restrictive) =
+            find_most_restrictive_predicate(&greater_predicates, true)?
+        {
+            result.push(most_restrictive);
+        } else {
+            result.extend(greater_predicates);
+        }
+    }
+
+    // Handle all less-than-style predicates (keep the most restrictive - lowest value)
+    if !less_predicates.is_empty() {
+        if let Some(most_restrictive) =
+            find_most_restrictive_predicate(&less_predicates, false)?
+        {
+            result.push(most_restrictive);
+        } else {
+            result.extend(less_predicates);
+        }
+    }
+
+    Ok(result)
+}
+
+/// Finds the single predicate that subsumes all others in a list of bounds.
+///
+/// All predicates are expected to be bounds in the same direction (all lower
+/// bounds or all upper bounds) written as `operand <cmp> literal` or
+/// `literal <cmp> operand`. For lower bounds the highest literal is the most
+/// restrictive, for upper bounds the lowest literal is. When two bounds share the
+/// same literal, the strict one (`>` / `<`) wins over the inclusive one
+/// (`>=` / `<=`), so `x >= 5 AND x > 5` yields `x > 5`.
+///
+/// The function is conservative: it returns `None` whenever it cannot prove that
+/// one predicate implies all the others, namely when
+/// * a predicate is not a binary expression with a literal on one side,
+/// * the non-literal operands are not the same expression (for example
+///   `CAST(x AS T) > 5` versus `x > 6`),
+/// * a literal is NULL, or
+/// * two literals cannot be ordered (`partial_cmp` returns `None`).
+///
+/// # Arguments
+/// * `predicates` - A slice of `Expr` representing predicates to compare.
+/// * `find_greater` - `true` to select the highest literal (lower bounds `>`, `>=`),
+///   `false` to select the lowest literal (upper bounds `<`, `<=`).
+///
+/// # Returns
+/// A `Result` containing `Some(predicate)` with a clone of the most restrictive
+/// predicate, or `None` if `predicates` is empty or no single predicate can be
+/// shown to subsume the rest (callers should then keep every predicate).
+fn find_most_restrictive_predicate(
+    predicates: &[Expr],
+    find_greater: bool,
+) -> Result<Option<Expr>> {
+    use std::cmp::Ordering;
+
+    /// Splits a bound into its non-literal operand, its literal and whether the
+    /// comparison operator is strict (`>` or `<`).
+    fn bound_parts(pred: &Expr) -> Option<(&Expr, &ScalarValue, bool)> {
+        let Expr::BinaryExpr(BinaryExpr { left, op, right }) = pred else {
+            return None;
+        };
+        let is_strict = matches!(op, Operator::Gt | Operator::Lt);
+        // Extract the literal value based on which side has it
+        match (right.as_literal(), left.as_literal()) {
+            (Some(literal), _) => Some((&**left, literal, is_strict)),
+            (_, Some(literal)) => Some((&**right, literal, is_strict)),
+            _ => None,
+        }
+    }
+
+    // (index, operand, literal, is_strict) of the tightest bound seen so far
+    let mut best: Option<(usize, &Expr, &ScalarValue, bool)> = None;
+
+    for (idx, pred) in predicates.iter().enumerate() {
+        let Some((operand, literal, is_strict)) = bound_parts(pred) else {
+            return Ok(None);
+        };
+
+        // A comparison against NULL is never true; dropping it in favour of
+        // another bound would change the result, so give up on simplifying
+        if literal.is_null() {
+            return Ok(None);
+        }
+
+        let Some((_, best_operand, best_literal, best_strict)) = best else {
+            best = Some((idx, operand, literal, is_strict));
+            continue;
+        };
+
+        // Bounds on different expressions (e.g. with and without a cast) do not
+        // constrain the same value and therefore cannot subsume each other
+        if operand != best_operand {
+            return Ok(None);
+        }
+
+        // Literals that cannot be ordered give no basis to drop a predicate
+        let Some(ordering) = literal.partial_cmp(best_literal) else {
+            return Ok(None);
+        };
+
+        let is_tighter = match ordering {
+            // Same literal: the strict comparison excludes the boundary value
+            Ordering::Equal => is_strict && !best_strict,
+            Ordering::Greater => find_greater,
+            Ordering::Less => !find_greater,
+        };
+
+        if is_tighter {
+            best = Some((idx, operand, literal, is_strict));
+        }
+    }
+
+    Ok(best.map(|(idx, ..)| predicates[idx].clone()))
+}
+
+/// Returns the column an expression refers to, looking through casts.
+///
+/// Any number of nested `Expr::Cast` wrappers is peeled off; if what remains is an
+/// `Expr::Column`, a clone of that column is returned.
+///
+/// # Arguments
+/// * `expr` - The expression to inspect.
+///
+/// # Returns
+/// `Some(column)` for a bare or cast-wrapped column reference, `None` for any
+/// other kind of expression.
+fn extract_column_from_expr(expr: &Expr) -> Option<Column> {
+    // Strip every enclosing cast to reach the innermost expression
+    let mut innermost = expr;
+    while let Expr::Cast(Cast { expr: wrapped, .. }) = innermost {
+        innermost = &**wrapped;
+    }
+
+    if let Expr::Column(column) = innermost {
+        Some(column.clone())
+    } else {
+        None
+    }
+}
diff --git a/datafusion/sqllogictest/test_files/simplify_predicates.slt b/datafusion/sqllogictest/test_files/simplify_predicates.slt

Original file line number	Diff line number	Diff line change
`@@ -2069,6 +2069,15 @@ impl Expr {`
`2069`	`2069`	`_ => None,`
`2070`	`2070`	`}`
`2071`	`2071`	`}`
	`2072`	`+`
	`2073`	`+ /// Check if the Expr is literal and get the literal value if it is.`
	`2074`	`+ pub fn as_literal(&self) -> Option<&ScalarValue> {`
	`2075`	`+ if let Expr::Literal(lit, _) = self {`
	`2076`	`+ Some(lit)`
	`2077`	`+ } else {`
	`2078`	`+ None`
	`2079`	`+ }`
	`2080`	`+ }`
`2072`	`2081`	`}`
`2073`	`2082`
`2074`	`2083`	`impl Normalizeable for Expr {`