From d3a2bdd0ee7cbd3183f66dbf1608668c870b05a8 Mon Sep 17 00:00:00 2001 From: Omer Hadari Date: Wed, 5 Feb 2025 17:19:37 +0200 Subject: [PATCH 1/4] Support TO_TIMESTAMP like a cast --- .../src/physical_plan/expr_to_predicate.rs | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs b/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs index 03fb132f2d..36820daf3f 100644 --- a/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs +++ b/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs @@ -17,6 +17,7 @@ use std::vec; +use datafusion::functions::datetime::to_timestamp::ToTimestampFunc; use datafusion::logical_expr::{Expr, Operator}; use datafusion::scalar::ScalarValue; use iceberg::expr::{BinaryExpression, Predicate, PredicateOperator, Reference, UnaryExpression}; @@ -120,6 +121,21 @@ fn to_iceberg_predicate(expr: &Expr) -> TransformedResult { } } Expr::Cast(c) => to_iceberg_predicate(&c.expr), + Expr::ScalarFunction(func) => { + if func + .func + .inner() + .as_any() + .downcast_ref::() + .is_some() + && func.args.len() == 1 + // More than 1 argument means it's a custom format - not + // supported for now + { + return to_iceberg_predicate(&func.args[0]); + } + TransformedResult::NotTransformed + } _ => TransformedResult::NotTransformed, } } @@ -403,4 +419,39 @@ mod tests { Reference::new("ts").greater_than_or_equal_to(Datum::string("2023-01-05T00:00:00")); assert_eq!(predicate, expected_predicate); } + + #[test] + fn test_to_timestamp_comparison_creates_predicate() { + let sql = "TO_TIMESTAMP(ts) >= timestamp '2023-01-05T00:00:00'"; + let predicate = convert_to_iceberg_predicate(sql).unwrap(); + let expected_predicate = + Reference::new("ts").greater_than_or_equal_to(Datum::string("2023-01-05T00:00:00")); + assert_eq!(predicate, expected_predicate); + } + + #[test] + fn test_to_timestamp_comparison_to_cast_creates_predicate() { + let sql = "TO_TIMESTAMP(ts) >= CAST('2023-01-05T00:00:00' AS TIMESTAMP)"; + let predicate = convert_to_iceberg_predicate(sql).unwrap(); + let expected_predicate = + Reference::new("ts").greater_than_or_equal_to(Datum::string("2023-01-05T00:00:00")); + assert_eq!(predicate, expected_predicate); + } + + #[test] + fn test_to_timestamp_with_custom_format_does_not_create_predicate() { + let sql = + "TO_TIMESTAMP(ts, 'YYYY-DD-MMTmm:HH:SS') >= CAST('2023-01-05T00:00:00' AS TIMESTAMP)"; + let predicate = convert_to_iceberg_predicate(sql); + assert_eq!(predicate, None); + } + + //#[test] + //fn test_to_date_comparison_creates_predicate() { + // let sql = "TO_DATE(ts) >= CAST('2023-01-05T00:00:00' AS DATE)"; + // let predicate = convert_to_iceberg_predicate(sql).unwrap(); + // let expected_predicate = + // Reference::new("ts").greater_than_or_equal_to(Datum::string("2023-01-05")); + // assert_eq!(predicate, expected_predicate); + //} } From 0e07afba772bc4503b0d069338d15e94fbaeb0f7 Mon Sep 17 00:00:00 2001 From: Omer Hadari Date: Sat, 8 Feb 2025 21:09:34 +0200 Subject: [PATCH 2/4] Remove explicit cast to timestamp from tests --- .../datafusion/src/physical_plan/expr_to_predicate.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs b/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs index 36820daf3f..90de0b19b7 100644 --- a/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs +++ b/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs @@ -422,7 +422,7 @@ mod tests { #[test] fn test_to_timestamp_comparison_creates_predicate() { - let sql = "TO_TIMESTAMP(ts) >= timestamp '2023-01-05T00:00:00'"; + let sql = "ts >= timestamp '2023-01-05T00:00:00'"; let predicate = convert_to_iceberg_predicate(sql).unwrap(); let expected_predicate = Reference::new("ts").greater_than_or_equal_to(Datum::string("2023-01-05T00:00:00")); @@ -431,7 +431,7 @@ mod tests { #[test] fn test_to_timestamp_comparison_to_cast_creates_predicate() { - let sql = "TO_TIMESTAMP(ts) >= CAST('2023-01-05T00:00:00' AS TIMESTAMP)"; + let sql = "ts >= CAST('2023-01-05T00:00:00' AS TIMESTAMP)"; let predicate = convert_to_iceberg_predicate(sql).unwrap(); let expected_predicate = Reference::new("ts").greater_than_or_equal_to(Datum::string("2023-01-05T00:00:00")); From 735e7dffd29d9b1f2efb5a7514a46b9a4cf820fb Mon Sep 17 00:00:00 2001 From: Omer Hadari Date: Sat, 8 Feb 2025 21:55:15 +0200 Subject: [PATCH 3/4] Convert function calls to cast, deal with dates --- .../src/physical_plan/expr_to_predicate.rs | 71 ++++++++++++++++--- 1 file changed, 60 insertions(+), 11 deletions(-) diff --git a/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs b/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs index 90de0b19b7..8c6192a8b2 100644 --- a/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs +++ b/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs @@ -17,8 +17,10 @@ use std::vec; +use datafusion::arrow::datatypes::{DataType, TimeUnit}; +use datafusion::functions::datetime::to_date::ToDateFunc; use datafusion::functions::datetime::to_timestamp::ToTimestampFunc; -use datafusion::logical_expr::{Expr, Operator}; +use datafusion::logical_expr::{Cast, Expr, Operator}; use datafusion::scalar::ScalarValue; use iceberg::expr::{BinaryExpression, Predicate, PredicateOperator, Reference, UnaryExpression}; use iceberg::spec::Datum; @@ -120,7 +122,20 @@ fn to_iceberg_predicate(expr: &Expr) -> TransformedResult { _ => TransformedResult::NotTransformed, } } - Expr::Cast(c) => to_iceberg_predicate(&c.expr), + Expr::Cast(c) => { + if DataType::Date32 == c.data_type || DataType::Date64 == c.data_type { + match c.expr.as_ref() { + Expr::Literal(ScalarValue::Utf8(Some(literal))) => { + let date = literal.split('T').next(); + if let Some(date) = date { + return TransformedResult::Literal(Datum::string(date)); + } + } + _ => return TransformedResult::NotTransformed, + } + } + to_iceberg_predicate(&c.expr) + } Expr::ScalarFunction(func) => { if func .func @@ -128,11 +143,29 @@ fn to_iceberg_predicate(expr: &Expr) -> TransformedResult { .as_any() .downcast_ref::() .is_some() + // More than 1 argument means it's a custom format - not + // supported for now && func.args.len() == 1 + { + return to_iceberg_predicate(&Expr::Cast(Cast::new( + Box::new(func.args[0].clone()), + DataType::Timestamp(TimeUnit::Nanosecond, None), + ))); + } + if func + .func + .inner() + .as_any() + .downcast_ref::() + .is_some() // More than 1 argument means it's a custom format - not // supported for now + && func.args.len() == 1 { - return to_iceberg_predicate(&func.args[0]); + return to_iceberg_predicate(&Expr::Cast(Cast::new( + Box::new(func.args[0].clone()), + DataType::Date32, + ))); } TransformedResult::NotTransformed } @@ -446,12 +479,28 @@ mod tests { assert_eq!(predicate, None); } - //#[test] - //fn test_to_date_comparison_creates_predicate() { - // let sql = "TO_DATE(ts) >= CAST('2023-01-05T00:00:00' AS DATE)"; - // let predicate = convert_to_iceberg_predicate(sql).unwrap(); - // let expected_predicate = - // Reference::new("ts").greater_than_or_equal_to(Datum::string("2023-01-05")); - // assert_eq!(predicate, expected_predicate); - //} + #[test] + fn test_to_date_comparison_creates_predicate() { + let sql = "ts >= CAST('2023-01-05T11:11:11' AS DATE)"; + let predicate = convert_to_iceberg_predicate(sql).unwrap(); + let expected_predicate = + Reference::new("ts").greater_than_or_equal_to(Datum::string("2023-01-05")); + assert_eq!(predicate, expected_predicate); + } + + #[test] + /// When casting to DATE, usually the value is converted to datetime or timestamp, + /// and then it is truncated. DayTransform is not yet supported fully here. + /// It is specifically implemented for Strings because it is the most common use case. + /// When actual support is implemented, this test will fail and should be removed. + /// For now it is here in order to make sure that the value from within the cast + /// is not used as-is when casting to date, because it can create false predicates. + /// + /// (Consider for example `ts > CAST('2023-01-05T11:11:11' AS DATE)` which would be + /// creates a different predicate than `ts > CAST('2023-01-05T11:11:11' AS TIMESTAMP)`) + fn test_to_date_from_non_string_is_ignored() { + let sql = "ts >= CAST(123456789 AS DATE)"; + let predicate = convert_to_iceberg_predicate(sql); + assert_eq!(predicate, None); + } } From 20044dc912acdcf532766b9d766a9cf337ad17c3 Mon Sep 17 00:00:00 2001 From: Omer Hadari Date: Sat, 8 Feb 2025 22:17:46 +0200 Subject: [PATCH 4/4] Fix comment --- .../datafusion/src/physical_plan/expr_to_predicate.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs b/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs index 8c6192a8b2..5f745824fd 100644 --- a/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs +++ b/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs @@ -496,8 +496,8 @@ mod tests { /// For now it is here in order to make sure that the value from within the cast /// is not used as-is when casting to date, because it can create false predicates. /// - /// (Consider for example `ts > CAST('2023-01-05T11:11:11' AS DATE)` which would be - /// creates a different predicate than `ts > CAST('2023-01-05T11:11:11' AS TIMESTAMP)`) + /// (Consider for example `ts > CAST('2023-01-05T11:11:11' AS DATE)` which should + /// create a different predicate than `ts > CAST('2023-01-05T11:11:11' AS TIMESTAMP)`) fn test_to_date_from_non_string_is_ignored() { let sql = "ts >= CAST(123456789 AS DATE)"; let predicate = convert_to_iceberg_predicate(sql);