Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
de30cb0
create first test
sdf-jkl Nov 6, 2025
052ab95
year_literal_to_type_with_op function
sdf-jkl Nov 10, 2025
d9a4253
kinda works
sdf-jkl Nov 10, 2025
83bae02
getting_there
sdf-jkl Nov 17, 2025
eec27c1
Merge branch 'apache:main' into pre-image-support
sdf-jkl Nov 17, 2025
1e794c3
taplo format
sdf-jkl Nov 19, 2025
2ec0b9a
add op_swap test
sdf-jkl Nov 19, 2025
630f5c5
Fix comment
sdf-jkl Nov 19, 2025
fbcfb97
commented out attempt for inlist support
sdf-jkl Nov 19, 2025
2b54409
restructure the changes
sdf-jkl Nov 20, 2025
3260ee9
Update datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
sdf-jkl Nov 20, 2025
cdfc2f5
Merge branch 'main' into pre-image-support
sdf-jkl Nov 20, 2025
d011b51
Fix merge error
sdf-jkl Nov 20, 2025
cd390df
cargo fmt plus some fixes
sdf-jkl Nov 21, 2025
36bb529
Add sqllogictest
sdf-jkl Nov 21, 2025
7c4dd9c
Add other than year date_part tests
sdf-jkl Nov 21, 2025
cdb9cb6
Fix docs
sdf-jkl Nov 21, 2025
57ee667
cargo fmt
sdf-jkl Nov 21, 2025
bb50792
Cargo fmt + doc changes
sdf-jkl Nov 21, 2025
fa3b31c
Merge branch 'main' into pre-image-support
sdf-jkl Nov 21, 2025
729951b
Rewrite logic
sdf-jkl Dec 1, 2025
8b62a2d
Merge branch 'pre-image-support' of https://github.com/sdf-jkl/datafu…
sdf-jkl Dec 1, 2025
ab9b444
support IsDistinctFrom and IsNotDistinctFrom
sdf-jkl Dec 2, 2025
5286626
Update sqllogictests
sdf-jkl Dec 2, 2025
5106049
Remove commented out changes
sdf-jkl Dec 3, 2025
19fafd1
Merge branch 'main' into pre-image-support
sdf-jkl Dec 3, 2025
516e637
Merge branch 'main' into pre-image-support
sdf-jkl Dec 8, 2025
5899a3a
Merge branch 'main' into pre-image-support
sdf-jkl Dec 8, 2025
e1c358f
Implementing suggestions
sdf-jkl Dec 12, 2025
63761a1
Merge branch 'pre-image-support' of https://github.com/sdf-jkl/datafu…
sdf-jkl Dec 12, 2025
c79e937
fix ci
sdf-jkl Dec 12, 2025
0318c62
Merge branch 'main' of https://github.com/apache/datafusion into pre-…
sdf-jkl Dec 12, 2025
352b3ff
cargo fmt
sdf-jkl Dec 12, 2025
7dbdc00
Move tests to core/optimizer/tests
sdf-jkl Dec 15, 2025
19ba392
Merge branch 'main' of https://github.com/apache/datafusion into pre-…
sdf-jkl Dec 16, 2025
5456976
Merge branch 'main' of https://github.com/apache/datafusion into pre-…
sdf-jkl Dec 22, 2025
114eb50
Merge branch 'main' of https://github.com/apache/datafusion into pre-…
sdf-jkl Dec 23, 2025
130413d
Improve IsDistinctFrom, IsNotDistinctFrom logic and add unit tests fo…
sdf-jkl Dec 23, 2025
ae05293
Return sqllogictests + add unit tests
sdf-jkl Dec 23, 2025
bc91039
Merge branch 'main' into pre-image-support
sdf-jkl Jan 9, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
203 changes: 201 additions & 2 deletions datafusion/core/tests/optimizer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@
//! Tests for the DataFusion SQL query planner that require functions from the
//! datafusion-functions crate.

use datafusion_expr::execution_props::ExecutionProps;
use datafusion_expr::simplify::SimplifyContext;
use datafusion_optimizer::simplify_expressions::ExprSimplifier;
use insta::assert_snapshot;
use std::any::Any;
use std::collections::HashMap;
Expand All @@ -26,13 +29,16 @@ use std::sync::Arc;
use arrow::datatypes::{
DataType, Field, Fields, Schema, SchemaBuilder, SchemaRef, TimeUnit,
};
use datafusion::functions::datetime::expr_fn;
use datafusion_common::config::ConfigOptions;
use datafusion_common::tree_node::TransformedResult;
use datafusion_common::{DFSchema, Result, ScalarValue, TableReference, plan_err};
use datafusion_common::{
DFSchema, DFSchemaRef, Result, ScalarValue, TableReference, plan_err,
};
use datafusion_expr::interval_arithmetic::{Interval, NullableInterval};
use datafusion_expr::{
AggregateUDF, BinaryExpr, Expr, ExprSchemable, LogicalPlan, Operator, ScalarUDF,
TableSource, WindowUDF, col, lit,
TableSource, WindowUDF, and, col, lit, or,
};
use datafusion_functions::core::expr_ext::FieldAccessor;
use datafusion_optimizer::analyzer::Analyzer;
Expand Down Expand Up @@ -378,3 +384,196 @@ fn validate_unchanged_cases(guarantees: &[(Expr, NullableInterval)], cases: &[Ex
);
}
}

// DatePart preimage tests
#[test]
fn test_preimage_date_part_date32_eq() {
    // date_part('year', date32) = 2024
    //   -> date32 >= 2024-01-01 AND date32 < 2025-01-01
    // The Date32 literals are day counts from the Unix epoch:
    // 19723 is 2024-01-01 and 20089 is 2025-01-01.
    let schema = expr_test_schema();
    let input = expr_fn::date_part(lit("year"), col("date32")).eq(lit(2024i32));

    let lower_bound = col("date32").gt_eq(lit(ScalarValue::Date32(Some(19723))));
    let upper_bound = col("date32").lt(lit(ScalarValue::Date32(Some(20089))));
    let expected = lower_bound.and(upper_bound);

    assert_eq!(optimize_test(input, &schema), expected);
}

#[test]
fn test_preimage_date_part_date64_not_eq() {
    // date_part('year', date64) <> 2024
    //   -> date64 < 2024-01-01 OR date64 >= 2025-01-01
    // The Date64 literals are the epoch day counts (19723 / 20089) scaled by
    // 86_400_000 milliseconds per day.
    let schema = expr_test_schema();
    let input = expr_fn::date_part(lit("year"), col("date64")).not_eq(lit(2024i32));

    let before_2024 =
        col("date64").lt(lit(ScalarValue::Date64(Some(19723 * 86_400_000))));
    let from_2025 =
        col("date64").gt_eq(lit(ScalarValue::Date64(Some(20089 * 86_400_000))));
    let expected = before_2024.or(from_2025);

    assert_eq!(optimize_test(input, &schema), expected);
}

#[test]
fn test_preimage_date_part_timestamp_nano_lt() {
    // date_part('year', ts_nano_none) < 2024 -> ts_nano_none < 2024-01-01
    // 19723 epoch days scaled by 86_400_000_000_000 nanoseconds per day.
    let schema = expr_test_schema();
    let input = expr_fn::date_part(lit("year"), col("ts_nano_none")).lt(lit(2024i32));

    let start_of_2024 =
        ScalarValue::TimestampNanosecond(Some(19723 * 86_400_000_000_000), None);
    let expected = col("ts_nano_none").lt(lit(start_of_2024));

    assert_eq!(optimize_test(input, &schema), expected);
}

#[test]
fn test_preimage_date_part_timestamp_nano_utc_gt() {
    // date_part('year', ts_nano_utc) > 2024 -> ts_nano_utc >= 2025-01-01
    // 20089 epoch days scaled by 86_400_000_000_000 nanoseconds per day.
    // NOTE(review): the expected literal carries no timezone (`None`) although
    // the column type has one; confirm this is the intended output.
    let schema = expr_test_schema();
    let input = expr_fn::date_part(lit("year"), col("ts_nano_utc")).gt(lit(2024i32));

    let start_of_2025 =
        ScalarValue::TimestampNanosecond(Some(20089 * 86_400_000_000_000), None);
    let expected = col("ts_nano_utc").gt_eq(lit(start_of_2025));

    assert_eq!(optimize_test(input, &schema), expected);
}

#[test]
fn test_preimage_date_part_timestamp_sec_est_gt_eq() {
    // date_part('year', ts_sec_est) >= 2024 -> ts_sec_est >= 2024-01-01
    // 19723 epoch days scaled by 86_400 seconds per day.
    // NOTE(review): the expected literal carries no timezone (`None`) although
    // the column type has one; confirm this is the intended output.
    let schema = expr_test_schema();
    let input = expr_fn::date_part(lit("year"), col("ts_sec_est")).gt_eq(lit(2024i32));

    let start_of_2024 = ScalarValue::TimestampSecond(Some(19723 * 86_400), None);
    let expected = col("ts_sec_est").gt_eq(lit(start_of_2024));

    assert_eq!(optimize_test(input, &schema), expected);
}

#[test]
fn test_preimage_date_part_timestamp_sec_est_lt_eq() {
    // date_part('year', ts_mic_pt) <= 2024 -> ts_mic_pt < 2025-01-01
    // 20089 epoch days scaled by 86_400_000_000 microseconds per day.
    // NOTE: despite `sec_est` in the test name, the column under test is the
    // microsecond `ts_mic_pt` column.
    // NOTE(review): the expected literal carries no timezone (`None`) although
    // the column type has one; confirm this is the intended output.
    let schema = expr_test_schema();
    let input = expr_fn::date_part(lit("year"), col("ts_mic_pt")).lt_eq(lit(2024i32));

    let start_of_2025 =
        ScalarValue::TimestampMicrosecond(Some(20089 * 86_400_000_000), None);
    let expected = col("ts_mic_pt").lt(lit(start_of_2025));

    assert_eq!(optimize_test(input, &schema), expected);
}

#[test]
fn test_preimage_date_part_timestamp_nano_lt_swap() {
    // Literal on the left-hand side:
    // 2024 > date_part('year', ts_nano_none) -> ts_nano_none < 2024-01-01
    // 19723 epoch days scaled by 86_400_000_000_000 nanoseconds per day.
    let schema = expr_test_schema();
    let year_of_column = expr_fn::date_part(lit("year"), col("ts_nano_none"));
    let input = lit(2024i32).gt(year_of_column);

    let start_of_2024 =
        ScalarValue::TimestampNanosecond(Some(19723 * 86_400_000_000_000), None);
    let expected = col("ts_nano_none").lt(lit(start_of_2024));

    assert_eq!(optimize_test(input, &schema), expected);
}

#[test]
fn test_preimage_date_part_date32_is_not_distinct_from() {
    // date_part('year', date32) IS NOT DISTINCT FROM 2024
    //   -> date32 >= 2024-01-01 AND date32 < 2025-01-01
    // No NULL-handling term is expected because the right-hand side is not NULL.
    // The Date32 literals are epoch day counts (19723 = 2024-01-01,
    // 20089 = 2025-01-01).
    let schema = expr_test_schema();
    let left = Box::new(expr_fn::date_part(lit("year"), col("date32")));
    let right = Box::new(lit(2024i32));
    let input = Expr::BinaryExpr(BinaryExpr {
        left,
        op: Operator::IsNotDistinctFrom,
        right,
    });

    let lower_bound = col("date32").gt_eq(lit(ScalarValue::Date32(Some(19723))));
    let upper_bound = col("date32").lt(lit(ScalarValue::Date32(Some(20089))));
    let expected = lower_bound.and(upper_bound);

    assert_eq!(optimize_test(input, &schema), expected);
}

#[test]
// Should not simplify - no interval can be calculated for a NULL literal
fn test_preimage_date_part_date32_is_not_distinct_from_null() {
    // date_part('year', date32) IS NOT DISTINCT FROM NULL -> unchanged
    let schema = expr_test_schema();
    let left = Box::new(expr_fn::date_part(lit("year"), col("date32")));
    let right = Box::new(lit(ScalarValue::Null));
    let input = Expr::BinaryExpr(BinaryExpr {
        left,
        op: Operator::IsNotDistinctFrom,
        right,
    });

    let simplified = optimize_test(input.clone(), &schema);
    assert_eq!(simplified, input);
}

#[test]
fn test_preimage_date_part_date64_is_distinct_from() {
    // date_part('year', date64) IS DISTINCT FROM 2024
    //   -> date64 < 2024-01-01 OR date64 >= 2025-01-01 OR date64 IS NULL
    // The Date64 literals are the epoch day counts (19723 / 20089) scaled by
    // 86_400_000 milliseconds per day.
    let schema = expr_test_schema();
    let left = Box::new(expr_fn::date_part(lit("year"), col("date64")));
    let right = Box::new(lit(2024i32));
    let input = Expr::BinaryExpr(BinaryExpr {
        left,
        op: Operator::IsDistinctFrom,
        right,
    });

    let before_2024 =
        col("date64").lt(lit(ScalarValue::Date64(Some(19723 * 86_400_000))));
    let from_2025 =
        col("date64").gt_eq(lit(ScalarValue::Date64(Some(20089 * 86_400_000))));
    let column_is_null = col("date64").is_null();
    // Left-nested, i.e. (before_2024 OR from_2025) OR column_is_null.
    let expected = or(or(before_2024, from_2025), column_is_null);

    assert_eq!(optimize_test(input, &schema), expected);
}

#[test]
// Should not simplify - no interval can be calculated for a NULL literal
fn test_preimage_date_part_date64_is_distinct_from_null() {
    // date_part('year', date64) IS DISTINCT FROM NULL -> unchanged
    let schema = expr_test_schema();
    let left = Box::new(expr_fn::date_part(lit("year"), col("date64")));
    let right = Box::new(lit(ScalarValue::Null));
    let input = Expr::BinaryExpr(BinaryExpr {
        left,
        op: Operator::IsDistinctFrom,
        right,
    });

    let simplified = optimize_test(input.clone(), &schema);
    assert_eq!(simplified, input);
}

#[test]
// Should not simplify
fn test_preimage_date_part_not_year_date32_eq() {
    // date_part('month', date32) = 1 -> unchanged
    // (the part is `month`, not `year`, and is expected to be left untouched)
    let schema = expr_test_schema();
    let input = expr_fn::date_part(lit("month"), col("date32")).eq(lit(1i32));

    let simplified = optimize_test(input.clone(), &schema);
    assert_eq!(simplified, input);
}

/// Runs `expr` through an [`ExprSimplifier`] configured with `schema` and
/// default [`ExecutionProps`], returning the simplified expression.
///
/// Panics if simplification returns an error.
fn optimize_test(expr: Expr, schema: &DFSchemaRef) -> Expr {
    let execution_props = ExecutionProps::new();
    let context = SimplifyContext::new(&execution_props).with_schema(Arc::clone(schema));
    ExprSimplifier::new(context).simplify(expr).unwrap()
}

/// Builds the schema shared by the preimage tests: one nullable, unqualified
/// column per date/timestamp type under test.
fn expr_test_schema() -> DFSchemaRef {
    let fields = vec![
        Field::new("date32", DataType::Date32, true),
        Field::new("date64", DataType::Date64, true),
        Field::new("ts_nano_none", timestamp_nano_none_type(), true),
        Field::new("ts_nano_utc", timestamp_nano_utc_type(), true),
        Field::new("ts_sec_est", timestamp_sec_est_type(), true),
        Field::new("ts_mic_pt", timestamp_mic_pt_type(), true),
    ];
    // No schema-level metadata is attached.
    let schema = DFSchema::from_unqualified_fields(fields.into(), HashMap::new()).unwrap();
    Arc::new(schema)
}

/// Nanosecond-precision timestamp type without a timezone.
fn timestamp_nano_none_type() -> DataType {
    let no_timezone = None;
    DataType::Timestamp(TimeUnit::Nanosecond, no_timezone)
}

// Nanosecond-precision timestamp type with a `+0:00` offset.
// This is the type that now() returns.
fn timestamp_nano_utc_type() -> DataType {
    DataType::Timestamp(TimeUnit::Nanosecond, Some("+0:00".into()))
}

/// Second-precision timestamp type with a `-5:00` offset.
fn timestamp_sec_est_type() -> DataType {
    DataType::Timestamp(TimeUnit::Second, Some("-5:00".into()))
}

/// Microsecond-precision timestamp type with a `-8:00` offset, used for the
/// `ts_mic_pt` column of the test schema.
fn timestamp_mic_pt_type() -> DataType {
    // The offset previously read `-8::00` (doubled colon); it is now spelled
    // like the `+0:00` / `-5:00` offsets of the sibling helpers.
    // NOTE(review): the zero-padded form would be `-08:00`; confirm whether the
    // single-digit hour used by these helpers is accepted wherever the timezone
    // string is actually parsed.
    let pt = Some("-8:00".into());
    DataType::Timestamp(TimeUnit::Microsecond, pt)
}
62 changes: 62 additions & 0 deletions datafusion/expr/src/udf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,25 @@ impl ScalarUDF {
self.inner.simplify(args, info)
}

/// Returns the preimage of this function for `lit_expr`, if the wrapped
/// implementation provides one.
///
/// This forwards `args`, `lit_expr` and `info` unchanged to the inner
/// [`ScalarUDFImpl`].
///
/// See [`ScalarUDFImpl::preimage`] for more details.
pub fn preimage(
&self,
args: &[Expr],
lit_expr: &Expr,
info: &dyn SimplifyInfo,
) -> Result<Option<Interval>> {
self.inner.preimage(args, lit_expr, info)
}

/// Return inner column from function args
///
/// See [`ScalarUDFImpl::column_expr`]
pub fn column_expr(&self, args: &[Expr]) -> Option<Expr> {
self.inner.column_expr(args)
Comment on lines +244 to +245
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a little helper method to ScalarUDFImpl to extract the inner columnar Expr

}

#[deprecated(since = "50.0.0", note = "Use `return_field_from_args` instead.")]
pub fn is_nullable(&self, args: &[Expr], schema: &dyn ExprSchema) -> bool {
#[expect(deprecated)]
Expand Down Expand Up @@ -696,6 +715,36 @@ pub trait ScalarUDFImpl: Debug + DynEq + DynHash + Send + Sync {
Ok(ExprSimplifyResult::Original(args))
}

/// Returns the [preimage] for this function and the specified literal
/// expression, if any.
///
/// A preimage is a single contiguous [`Interval`] of input values for which
/// the function always returns the value of `lit_expr`.
///
/// This rewrite is described in the [ClickHouse Paper] and is particularly
/// useful for simplifying expressions such as `date_part` or equivalent
/// functions. The idea is that if you have an expression like
/// `date_part(YEAR, k) = 2024` and you can find a [preimage] for
/// `date_part(YEAR, k)`, which is the range of dates covering the entire year
/// of 2024, then you can rewrite the expression to
/// `k >= '2024-01-01' AND k < '2025-01-01'`, which is often more optimizable.
///
/// This should only return a preimage if the function takes a single argument.
/// NOTE(review): the `date_part(YEAR, k)` example above passes two arguments;
/// presumably this means a single column-valued argument - confirm.
///
/// The default implementation returns `Ok(None)`, i.e. no preimage.
///
/// [ClickHouse Paper]: https://www.vldb.org/pvldb/vol17/p3731-schulze.pdf
/// [preimage]: https://en.wikipedia.org/wiki/Image_(mathematics)#Inverse_image
fn preimage(
&self,
_args: &[Expr],
_lit_expr: &Expr,
_info: &dyn SimplifyInfo,
) -> Result<Option<Interval>> {
Ok(None)
}

/// Returns the inner column expression from this function's arguments, if any.
///
/// The default implementation returns `None`.
fn column_expr(&self, _args: &[Expr]) -> Option<Expr> {
None
}

/// Returns true if some of this `exprs` subexpressions may not be evaluated
/// and thus any side effects (like divide by zero) may not be encountered.
///
Expand Down Expand Up @@ -926,6 +975,19 @@ impl ScalarUDFImpl for AliasedScalarUDFImpl {
self.inner.simplify(args, info)
}

// Forwards to the wrapped implementation (`self.inner`) with the arguments
// passed through unchanged.
fn preimage(
&self,
args: &[Expr],
lit_expr: &Expr,
info: &dyn SimplifyInfo,
) -> Result<Option<Interval>> {
self.inner.preimage(args, lit_expr, info)
}

// Forwards to the wrapped implementation (`self.inner`).
fn column_expr(&self, args: &[Expr]) -> Option<Expr> {
self.inner.column_expr(args)
}

fn conditional_arguments<'a>(
&self,
args: &'a [Expr],
Expand Down
Loading
Loading