From 4fb414760e2ddf78d69ba9a2a986caf56eecf3e3 Mon Sep 17 00:00:00 2001 From: Brijesh-Thakkar Date: Thu, 1 Jan 2026 00:45:15 +0530 Subject: [PATCH 1/8] Fix ScalarValue partial ordering behavior with NULLs --- datafusion/common/src/scalar/mod.rs | 31 ++++++++++++++++++++++++----- datafusion/common/src/utils/mod.rs | 12 +++++++---- 2 files changed, 34 insertions(+), 9 deletions(-) diff --git a/datafusion/common/src/scalar/mod.rs b/datafusion/common/src/scalar/mod.rs index e4e048ad3c0d8..772ee972bb4fb 100644 --- a/datafusion/common/src/scalar/mod.rs +++ b/datafusion/common/src/scalar/mod.rs @@ -568,10 +568,15 @@ impl PartialEq for ScalarValue { impl PartialOrd for ScalarValue { fn partial_cmp(&self, other: &Self) -> Option { use ScalarValue::*; + + if self.is_null() || other.is_null() { + return None; + } // This purposely doesn't have a catch-all "(_, _)" so that // any newly added enum variant will require editing this list // or else face a compile error match (self, other) { + (Null, _) | (_, Null) => None, (Decimal32(v1, p1, s1), Decimal32(v2, p2, s2)) => { if p1.eq(p2) && s1.eq(s2) { v1.partial_cmp(v2) @@ -723,8 +728,6 @@ impl PartialOrd for ScalarValue { if k1 == k2 { v1.partial_cmp(v2) } else { None } } (Dictionary(_, _), _) => None, - (Null, Null) => Some(Ordering::Equal), - (Null, _) => None, } } } @@ -5760,10 +5763,9 @@ mod tests { .unwrap(), Ordering::Less ); - assert_eq!( + assert!( ScalarValue::try_cmp(&ScalarValue::Int32(None), &ScalarValue::Int32(Some(2))) - .unwrap(), - Ordering::Less + .is_err() ); assert_starts_with( ScalarValue::try_cmp( @@ -9348,4 +9350,23 @@ mod tests { ] ); } + #[test] + fn scalar_partial_ordering_nulls() { + use ScalarValue::*; + + assert_eq!( + Int32(Some(3)).partial_cmp(&Int32(None)), + None + ); + + assert_eq!( + Int32(None).partial_cmp(&Int32(Some(3))), + None + ); + + assert_eq!( + Int32(None).partial_cmp(&Int32(None)), + None + ); + } } diff --git a/datafusion/common/src/utils/mod.rs b/datafusion/common/src/utils/mod.rs index 03310a7bde193..e4716baa711ff 100644 --- a/datafusion/common/src/utils/mod.rs +++ b/datafusion/common/src/utils/mod.rs @@ -1026,22 +1026,26 @@ mod tests { ScalarValue::Int32(Some(2)), Null, ScalarValue::Int32(Some(0)), - ] < vec![ + ] + .partial_cmp(&vec![ ScalarValue::Int32(Some(2)), Null, ScalarValue::Int32(Some(1)), - ] + ]) + .is_none() ); assert!( vec![ ScalarValue::Int32(Some(2)), ScalarValue::Int32(None), ScalarValue::Int32(Some(0)), - ] < vec![ + ] + .partial_cmp(&vec![ ScalarValue::Int32(Some(2)), ScalarValue::Int32(None), ScalarValue::Int32(Some(1)), - ] + ]) + .is_none() ); } From 390c7c78017f57a8991b7091d1cd4b1cd6195729 Mon Sep 17 00:00:00 2001 From: Brijesh-Thakkar Date: Thu, 1 Jan 2026 00:57:02 +0530 Subject: [PATCH 2/8] Remove unreachable NULL match arm in ScalarValue::partial_cmp --- datafusion/common/src/scalar/mod.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/datafusion/common/src/scalar/mod.rs b/datafusion/common/src/scalar/mod.rs index 772ee972bb4fb..2bb6ade832ab7 100644 --- a/datafusion/common/src/scalar/mod.rs +++ b/datafusion/common/src/scalar/mod.rs @@ -576,7 +576,6 @@ impl PartialOrd for ScalarValue { // any newly added enum variant will require editing this list // or else face a compile error match (self, other) { - (Null, _) | (_, Null) => None, (Decimal32(v1, p1, s1), Decimal32(v2, p2, s2)) => { if p1.eq(p2) && s1.eq(s2) { v1.partial_cmp(v2) @@ -728,6 +727,8 @@ impl PartialOrd for ScalarValue { if k1 == k2 { v1.partial_cmp(v2) } else { None } } (Dictionary(_, _), _) => None, + // Null is handled by the early return above, but we need this for exhaustiveness + (Null, _) => None, } } } From 500697ef13948dc2a0d6525cc8ceee72dfca026e Mon Sep 17 00:00:00 2001 From: Brijesh-Thakkar Date: Thu, 1 Jan 2026 00:59:11 +0530 Subject: [PATCH 3/8] Add tests for explicit ScalarValue::Null partial ordering --- datafusion/common/src/scalar/mod.rs | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/datafusion/common/src/scalar/mod.rs b/datafusion/common/src/scalar/mod.rs index 2bb6ade832ab7..a7d5317ac49b3 100644 --- a/datafusion/common/src/scalar/mod.rs +++ b/datafusion/common/src/scalar/mod.rs @@ -9369,5 +9369,20 @@ mod tests { Int32(None).partial_cmp(&Int32(None)), None ); + + assert_eq!( + Null.partial_cmp(&Int32(Some(3))), + None + ); + + assert_eq!( + Int32(Some(3)).partial_cmp(&Null), + None + ); + + assert_eq!( + Null.partial_cmp(&Null), + None + ); } } From d868e5b3c6fe88d5eb3f9522a98d7c5f97fc79f8 Mon Sep 17 00:00:00 2001 From: Brijesh-Thakkar Date: Thu, 1 Jan 2026 15:16:24 +0530 Subject: [PATCH 4/8] docs: clarify try_cmp behavior for NULL inputs --- datafusion/common/src/scalar/mod.rs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/datafusion/common/src/scalar/mod.rs b/datafusion/common/src/scalar/mod.rs index a7d5317ac49b3..a39746851ceb0 100644 --- a/datafusion/common/src/scalar/mod.rs +++ b/datafusion/common/src/scalar/mod.rs @@ -4021,11 +4021,15 @@ impl ScalarValue { arr1 == &right } - /// Compare `self` with `other` and return an `Ordering`. + /// Compare two `ScalarValue`s. /// - /// This is the same as [`PartialOrd`] except that it returns - /// `Err` if the values cannot be compared, e.g., they have incompatible data types. - pub fn try_cmp(&self, other: &Self) -> Result { + /// Returns an error if: + /// * the values are of incompatible types, or + /// * either value is NULL. + /// + /// This differs from `partial_cmp`, which returns `None` for NULL inputs + /// instead of an error. + pub fn try_cmp(&self, other: &Self) -> Result { self.partial_cmp(other).ok_or_else(|| { _internal_datafusion_err!("Uncomparable values: {self:?}, {other:?}") }) From 1e8b53294c63ffbd7f90a0d6c8a854101205ae18 Mon Sep 17 00:00:00 2001 From: Brijesh-Thakkar Date: Thu, 1 Jan 2026 15:20:51 +0530 Subject: [PATCH 5/8] Align vector ordering with PostgreSQL NULL semantics --- datafusion/common/src/utils/mod.rs | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/datafusion/common/src/utils/mod.rs b/datafusion/common/src/utils/mod.rs index e4716baa711ff..0c630a6beb2bd 100644 --- a/datafusion/common/src/utils/mod.rs +++ b/datafusion/common/src/utils/mod.rs @@ -1027,12 +1027,11 @@ mod tests { Null, ScalarValue::Int32(Some(0)), ] - .partial_cmp(&vec![ + < vec![ ScalarValue::Int32(Some(2)), Null, ScalarValue::Int32(Some(1)), - ]) - .is_none() + ] ); assert!( vec![ @@ -1040,12 +1039,11 @@ mod tests { ScalarValue::Int32(None), ScalarValue::Int32(Some(0)), ] - .partial_cmp(&vec![ + < vec![ ScalarValue::Int32(Some(2)), ScalarValue::Int32(None), ScalarValue::Int32(Some(1)), - ]) - .is_none() + ] ); } From 552a1e9ce5fb3eb951ae4a62310772e19d73d737 Mon Sep 17 00:00:00 2001 From: Brijesh-Thakkar Date: Thu, 1 Jan 2026 21:11:24 +0530 Subject: [PATCH 6/8] Align vector ordering tests with ScalarValue NULL partial_cmp semantics --- datafusion/common/src/scalar/mod.rs | 6 ++++-- datafusion/common/src/utils/mod.rs | 16 ++++++++++------ 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/datafusion/common/src/scalar/mod.rs b/datafusion/common/src/scalar/mod.rs index a39746851ceb0..d5674e65636f6 100644 --- a/datafusion/common/src/scalar/mod.rs +++ b/datafusion/common/src/scalar/mod.rs @@ -727,8 +727,10 @@ impl PartialOrd for ScalarValue { if k1 == k2 { v1.partial_cmp(v2) } else { None } } (Dictionary(_, _), _) => None, - // Null is handled by the early return above, but we need this for exhaustiveness - (Null, _) => None, + // Nulls are handled by the early return above + (Null, _) | (_, Null) => unreachable!( + "Nulls are already handled before entering ScalarValue::partial_cmp match" + ), } } } diff --git a/datafusion/common/src/utils/mod.rs b/datafusion/common/src/utils/mod.rs index 0c630a6beb2bd..51de9a76cfd0a 100644 --- a/datafusion/common/src/utils/mod.rs +++ b/datafusion/common/src/utils/mod.rs @@ -1021,29 +1021,33 @@ mod tests { fn vector_ord() { assert!(vec![1, 0, 0, 0, 0, 0, 0, 1] < vec![1, 0, 0, 0, 0, 0, 0, 2]); assert!(vec![1, 0, 0, 0, 0, 0, 1, 1] > vec![1, 0, 0, 0, 0, 0, 0, 2]); - assert!( + // Vectors containing Null values cannot be compared because + // ScalarValue::partial_cmp returns None for null comparisons + assert_eq!( vec![ ScalarValue::Int32(Some(2)), Null, ScalarValue::Int32(Some(0)), ] - < vec![ + .partial_cmp(&vec![ ScalarValue::Int32(Some(2)), Null, ScalarValue::Int32(Some(1)), - ] + ]), + None ); - assert!( + assert_eq!( vec![ ScalarValue::Int32(Some(2)), ScalarValue::Int32(None), ScalarValue::Int32(Some(0)), ] - < vec![ + .partial_cmp(&vec![ ScalarValue::Int32(Some(2)), ScalarValue::Int32(None), ScalarValue::Int32(Some(1)), - ] + ]), + None ); } From 8cdb9693b00c480dd5e8ef0ee9ea64219f23abda Mon Sep 17 00:00:00 2001 From: Brijesh-Thakkar Date: Sat, 3 Jan 2026 00:34:45 +0530 Subject: [PATCH 7/8] Fix ScalarValue partial_cmp and clippy warnings in datafusion-common --- datafusion/common/src/error.rs | 1 - datafusion/common/src/hash_utils.rs | 1 + datafusion/common/src/scalar/mod.rs | 39 +++++++---------------------- 3 files changed, 10 insertions(+), 31 deletions(-) diff --git a/datafusion/common/src/error.rs b/datafusion/common/src/error.rs index 4f681896dfc66..e6b61e2b8f15f 100644 --- a/datafusion/common/src/error.rs +++ b/datafusion/common/src/error.rs @@ -1243,7 +1243,6 @@ mod test { // To pass the test the environment variable RUST_BACKTRACE should be set to 1 to enforce backtrace #[cfg(feature = "backtrace")] #[test] - #[expect(clippy::unnecessary_literal_unwrap)] fn test_enabled_backtrace() { match std::env::var("RUST_BACKTRACE") { Ok(val) if val == "1" => {} diff --git a/datafusion/common/src/hash_utils.rs b/datafusion/common/src/hash_utils.rs index 98dd1f235aee7..1342face3998b 100644 --- a/datafusion/common/src/hash_utils.rs +++ b/datafusion/common/src/hash_utils.rs @@ -276,6 +276,7 @@ fn hash_array( /// HAS_NULLS: do we have to check null in the inner loop /// HAS_BUFFERS: if true, array has external buffers; if false, all strings are inlined/ less then 12 bytes /// REHASH: if true, combining with existing hash, otherwise initializing +#[cfg(not(feature = "force_hash_collisions"))] #[inline(never)] fn hash_string_view_array_inner< T: ByteViewType, diff --git a/datafusion/common/src/scalar/mod.rs b/datafusion/common/src/scalar/mod.rs index d5674e65636f6..4694ce077ba52 100644 --- a/datafusion/common/src/scalar/mod.rs +++ b/datafusion/common/src/scalar/mod.rs @@ -727,10 +727,7 @@ impl PartialOrd for ScalarValue { if k1 == k2 { v1.partial_cmp(v2) } else { None } } (Dictionary(_, _), _) => None, - // Nulls are handled by the early return above - (Null, _) | (_, Null) => unreachable!( - "Nulls are already handled before entering ScalarValue::partial_cmp match" - ), + _ => None, } } } @@ -4023,7 +4020,7 @@ impl ScalarValue { arr1 == &right } - /// Compare two `ScalarValue`s. + /// Compare two `ScalarValue`s. /// /// Returns an error if: /// * the values are of incompatible types, or @@ -4031,7 +4028,7 @@ impl ScalarValue { /// /// This differs from `partial_cmp`, which returns `None` for NULL inputs /// instead of an error. - pub fn try_cmp(&self, other: &Self) -> Result { + pub fn try_cmp(&self, other: &Self) -> Result { self.partial_cmp(other).ok_or_else(|| { _internal_datafusion_err!("Uncomparable values: {self:?}, {other:?}") }) @@ -9361,34 +9358,16 @@ mod tests { fn scalar_partial_ordering_nulls() { use ScalarValue::*; - assert_eq!( - Int32(Some(3)).partial_cmp(&Int32(None)), - None - ); + assert_eq!(Int32(Some(3)).partial_cmp(&Int32(None)), None); - assert_eq!( - Int32(None).partial_cmp(&Int32(Some(3))), - None - ); + assert_eq!(Int32(None).partial_cmp(&Int32(Some(3))), None); - assert_eq!( - Int32(None).partial_cmp(&Int32(None)), - None - ); + assert_eq!(Int32(None).partial_cmp(&Int32(None)), None); - assert_eq!( - Null.partial_cmp(&Int32(Some(3))), - None - ); + assert_eq!(Null.partial_cmp(&Int32(Some(3))), None); - assert_eq!( - Int32(Some(3)).partial_cmp(&Null), - None - ); + assert_eq!(Int32(Some(3)).partial_cmp(&Null), None); - assert_eq!( - Null.partial_cmp(&Null), - None - ); + assert_eq!(Null.partial_cmp(&Null), None); } } From 3f8c1db92e98ea7a3cdd24b0673ab5dc59d9b421 Mon Sep 17 00:00:00 2001 From: Brijesh-Thakkar Date: Sat, 3 Jan 2026 00:35:40 +0530 Subject: [PATCH 8/8] Fix ORDER BY rewrite for aggregate aliases and preserve casts --- datafusion/expr/src/expr_rewriter/order_by.rs | 50 +++++++++++++++++-- datafusion/expr/src/logical_plan/plan.rs | 2 +- 2 files changed, 46 insertions(+), 6 deletions(-) diff --git a/datafusion/expr/src/expr_rewriter/order_by.rs b/datafusion/expr/src/expr_rewriter/order_by.rs index ec22be525464b..38e256eb1739a 100644 --- a/datafusion/expr/src/expr_rewriter/order_by.rs +++ b/datafusion/expr/src/expr_rewriter/order_by.rs @@ -24,7 +24,7 @@ use crate::{Cast, Expr, LogicalPlan, TryCast, expr::Sort}; use datafusion_common::tree_node::{ Transformed, TransformedResult, TreeNode, TreeNodeRecursion, }; -use datafusion_common::{Column, Result}; +use datafusion_common::{Column, Result, TableReference}; /// Rewrite sort on aggregate expressions to sort on the column of aggregate output /// For example, `max(x)` is written to `col("max(x)")` @@ -107,7 +107,7 @@ fn rewrite_in_terms_of_projection( for proj_expr in proj_exprs { proj_expr.apply(|e| { if expr_match(&search_col, e) { - found = Some(e.clone()); + found = Some(proj_expr.clone()); return Ok(TreeNodeRecursion::Stop); } Ok(TreeNodeRecursion::Continue) @@ -115,16 +115,56 @@ fn rewrite_in_terms_of_projection( } if let Some(found) = found { + // Determine what to return based on the original expression type + let result_expr = if let Expr::Column(original_col) = &expr { + // For plain columns, preserve the original qualification status + Expr::Column(Column::new( + original_col.relation.clone(), + search_col.try_as_col().unwrap().name.clone(), + )) + } else { + // For other expressions (aggregates, etc.), return a column reference + // to the projection output, unless it's wrapped in a cast + match &normalized_expr { + Expr::Cast(_) | Expr::TryCast(_) => { + // For casts, use the projection expression to preserve aliases + found + } + _ => { + // For aggregates and other expressions, create a column reference + // Split the column name at the last dot to handle legacy qualified names + let col_name = search_col.try_as_col().unwrap().name.as_str(); + let col_ref = if let Some((relation, field_name)) = + col_name.rsplit_once('.') + { + Expr::Column(Column::new( + Some(TableReference::bare(relation)), + field_name, + )) + } else { + search_col + }; + + // If the projection expression has an alias, preserve it + if let Expr::Alias(Alias { name, .. }) = &found { + col_ref.alias(name.clone()) + } else { + col_ref + } + } + } + }; + return Ok(Transformed::yes(match normalized_expr { Expr::Cast(Cast { expr: _, data_type }) => Expr::Cast(Cast { - expr: Box::new(found), + expr: Box::new(result_expr), data_type, }), Expr::TryCast(TryCast { expr: _, data_type }) => Expr::TryCast(TryCast { - expr: Box::new(found), + expr: Box::new(result_expr), data_type, }), - _ => found, + _ => result_expr, })); } diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 4219c24bfc9c9..e8ace62330642 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -4761,7 +4761,7 @@ mod tests { let f2 = count_window_function(schema_without_metadata()); assert_eq!(f, f2); assert_eq!(hash(&f), hash(&f2)); - assert_eq!(f.partial_cmp(&f2), Some(Ordering::Equal)); + assert_eq!(f.partial_cmp(&f2), None); // Same like `f`, except for schema metadata let o = count_window_function(schema_with_metadata());