Skip to content

Commit 05451da

Browse files
authored
feat(spark): implement Spark date_diff function (apache#19845)
## Which issue does this PR close? <!-- We generally require a GitHub issue to be filed for all bug fixes and enhancements and this helps us generate change logs for our releases. You can link an issue to this PR using the GitHub syntax. For example `Closes apache#123` indicates that this PR will close issue apache#123. --> - Closes apache#19844 - Part of apache#15914 ## Rationale for this change Add support for spark https://spark.apache.org/docs/latest/api/sql/index.html#date_diff function ## What changes are included in this PR? <!-- There is no need to duplicate the description in the issue here but it is sometimes worth providing a summary of the individual changes in this PR. --> ## Are these changes tested? yes in SLT ## Are there any user-facing changes? yes
1 parent 4ed808a commit 05451da

File tree

4 files changed

+262
-48
lines changed

4 files changed

+262
-48
lines changed
Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
use std::any::Any;
19+
use std::sync::Arc;
20+
21+
use arrow::datatypes::{DataType, Field, FieldRef};
22+
use datafusion_common::types::{NativeType, logical_date, logical_string};
23+
use datafusion_common::utils::take_function_args;
24+
use datafusion_common::{Result, internal_err};
25+
use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyContext};
26+
use datafusion_expr::{
27+
Coercion, ColumnarValue, Expr, ExprSchemable, Operator, ReturnFieldArgs,
28+
ScalarFunctionArgs, ScalarUDFImpl, Signature, TypeSignatureClass, Volatility,
29+
binary_expr,
30+
};
31+
32+
/// <https://spark.apache.org/docs/latest/api/sql/index.html#date_diff>
33+
#[derive(Debug, PartialEq, Eq, Hash)]
34+
pub struct SparkDateDiff {
35+
signature: Signature,
36+
aliases: Vec<String>,
37+
}
38+
39+
impl Default for SparkDateDiff {
40+
fn default() -> Self {
41+
Self::new()
42+
}
43+
}
44+
45+
impl SparkDateDiff {
46+
pub fn new() -> Self {
47+
Self {
48+
signature: Signature::coercible(
49+
vec![
50+
Coercion::new_implicit(
51+
TypeSignatureClass::Native(logical_date()),
52+
vec![
53+
TypeSignatureClass::Native(logical_string()),
54+
TypeSignatureClass::Timestamp,
55+
],
56+
NativeType::Date,
57+
),
58+
Coercion::new_implicit(
59+
TypeSignatureClass::Native(logical_date()),
60+
vec![
61+
TypeSignatureClass::Native(logical_string()),
62+
TypeSignatureClass::Timestamp,
63+
],
64+
NativeType::Date,
65+
),
66+
],
67+
Volatility::Immutable,
68+
),
69+
aliases: vec!["datediff".to_string()],
70+
}
71+
}
72+
}
73+
74+
impl ScalarUDFImpl for SparkDateDiff {
75+
fn as_any(&self) -> &dyn Any {
76+
self
77+
}
78+
79+
fn name(&self) -> &str {
80+
"date_diff"
81+
}
82+
83+
fn aliases(&self) -> &[String] {
84+
&self.aliases
85+
}
86+
87+
fn signature(&self) -> &Signature {
88+
&self.signature
89+
}
90+
91+
fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
92+
internal_err!("return_field_from_args should be used instead")
93+
}
94+
95+
fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
96+
let nullable = args.arg_fields.iter().any(|f| f.is_nullable());
97+
Ok(Arc::new(Field::new(self.name(), DataType::Int32, nullable)))
98+
}
99+
100+
fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result<ColumnarValue> {
101+
internal_err!(
102+
"Apache Spark `date_diff` should have been simplified to standard subtraction"
103+
)
104+
}
105+
106+
fn simplify(
107+
&self,
108+
args: Vec<Expr>,
109+
info: &SimplifyContext,
110+
) -> Result<ExprSimplifyResult> {
111+
let [end, start] = take_function_args(self.name(), args)?;
112+
let end = end.cast_to(&DataType::Date32, info.schema())?;
113+
let start = start.cast_to(&DataType::Date32, info.schema())?;
114+
Ok(ExprSimplifyResult::Simplified(
115+
binary_expr(end, Operator::Minus, start)
116+
.cast_to(&DataType::Int32, info.schema())?,
117+
))
118+
}
119+
}

datafusion/spark/src/function/datetime/mod.rs

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
// under the License.
1717

1818
pub mod date_add;
19+
pub mod date_diff;
1920
pub mod date_part;
2021
pub mod date_sub;
2122
pub mod date_trunc;
@@ -32,6 +33,7 @@ use datafusion_functions::make_udf_function;
3233
use std::sync::Arc;
3334

3435
make_udf_function!(date_add::SparkDateAdd, date_add);
36+
make_udf_function!(date_diff::SparkDateDiff, date_diff);
3537
make_udf_function!(date_part::SparkDatePart, date_part);
3638
make_udf_function!(date_sub::SparkDateSub, date_sub);
3739
make_udf_function!(date_trunc::SparkDateTrunc, date_trunc);
@@ -91,6 +93,11 @@ pub mod expr_fn {
9193
"Returns the first date which is later than start_date and named as indicated. The function returns NULL if at least one of the input parameters is NULL.",
9294
arg1 arg2
9395
));
96+
export_functions!((
97+
date_diff,
98+
"Returns the number of days from start `start` to end `end`.",
99+
end start
100+
));
94101
export_functions!((
95102
date_trunc,
96103
"Truncates a timestamp `ts` to the unit specified by the format `fmt`.",
@@ -110,13 +117,13 @@ pub mod expr_fn {
110117
date_part,
111118
"Extracts a part of the date or time from a date, time, or timestamp expression.",
112119
arg1 arg2
113-
114120
));
115121
}
116122

117123
pub fn functions() -> Vec<Arc<ScalarUDF>> {
118124
vec![
119125
date_add(),
126+
date_diff(),
120127
date_part(),
121128
date_sub(),
122129
date_trunc(),

datafusion/sqllogictest/test_files/spark/datetime/date_diff.slt

Lines changed: 135 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -15,18 +15,138 @@
1515
# specific language governing permissions and limitations
1616
# under the License.
1717

18-
# This file was originally created by a porting script from:
19-
# https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
20-
# This file is part of the implementation of the datafusion-spark function library.
21-
# For more information, please see:
22-
# https://github.com/apache/datafusion/issues/15914
23-
24-
## Original Query: SELECT date_diff('2009-07-30', '2009-07-31');
25-
## PySpark 3.5.5 Result: {'date_diff(2009-07-30, 2009-07-31)': -1, 'typeof(date_diff(2009-07-30, 2009-07-31))': 'int', 'typeof(2009-07-30)': 'string', 'typeof(2009-07-31)': 'string'}
26-
#query
27-
#SELECT date_diff('2009-07-30'::string, '2009-07-31'::string);
28-
29-
## Original Query: SELECT date_diff('2009-07-31', '2009-07-30');
30-
## PySpark 3.5.5 Result: {'date_diff(2009-07-31, 2009-07-30)': 1, 'typeof(date_diff(2009-07-31, 2009-07-30))': 'int', 'typeof(2009-07-31)': 'string', 'typeof(2009-07-30)': 'string'}
31-
#query
32-
#SELECT date_diff('2009-07-31'::string, '2009-07-30'::string);
18+
# date input
19+
query I
20+
SELECT date_diff('2009-07-30'::date, '2009-07-31'::date);
21+
----
22+
-1
23+
24+
query I
25+
SELECT date_diff('2009-07-31'::date, '2009-07-30'::date);
26+
----
27+
1
28+
29+
query I
30+
SELECT date_diff('2009-07-31'::string, '2009-07-30'::date);
31+
----
32+
1
33+
34+
query I
35+
SELECT date_diff('2009-07-31'::timestamp, '2009-07-30'::date);
36+
----
37+
1
38+
39+
# Date64 input
40+
query I
41+
SELECT date_diff(arrow_cast('2009-07-31', 'Date64'), arrow_cast('2009-07-30', 'Date64'));
42+
----
43+
1
44+
45+
query I
46+
SELECT date_diff(arrow_cast('2009-07-30', 'Date64'), arrow_cast('2009-07-31', 'Date64'));
47+
----
48+
-1
49+
50+
# Mixed Date32 and Date64 input
51+
query I
52+
SELECT date_diff('2009-07-31'::date, arrow_cast('2009-07-30', 'Date64'));
53+
----
54+
1
55+
56+
query I
57+
SELECT date_diff(arrow_cast('2009-07-31', 'Date64'), '2009-07-30'::date);
58+
----
59+
1
60+
61+
62+
# Same date returns 0
63+
query I
64+
SELECT date_diff('2009-07-30'::date, '2009-07-30'::date);
65+
----
66+
0
67+
68+
# Large difference
69+
query I
70+
SELECT date_diff('2020-01-01'::date, '1970-01-01'::date);
71+
----
72+
18262
73+
74+
# timestamp input
75+
query I
76+
SELECT date_diff('2009-07-30 12:34:56'::timestamp, '2009-07-31 23:45:01'::timestamp);
77+
----
78+
-1
79+
80+
query I
81+
SELECT date_diff('2009-07-31 23:45:01'::timestamp, '2009-07-30 12:34:56'::timestamp);
82+
----
83+
1
84+
85+
query I
86+
SELECT date_diff('2009-07-31 23:45:01'::string, '2009-07-30 12:34:56'::timestamp);
87+
----
88+
1
89+
90+
# string input
91+
query I
92+
SELECT date_diff('2009-07-30', '2009-07-31');
93+
----
94+
-1
95+
96+
query I
97+
SELECT date_diff('2009-07-31', '2009-07-30');
98+
----
99+
1
100+
101+
# NULL handling
102+
query I
103+
SELECT date_diff(NULL::date, '2009-07-30'::date);
104+
----
105+
NULL
106+
107+
query I
108+
SELECT date_diff('2009-07-31'::date, NULL::date);
109+
----
110+
NULL
111+
112+
query I
113+
SELECT date_diff(NULL::date, NULL::date);
114+
----
115+
NULL
116+
117+
query I
118+
SELECT date_diff(column1, column2)
119+
FROM VALUES
120+
('2009-07-30'::date, '2009-07-31'::date),
121+
('2009-07-31'::date, '2009-07-30'::date),
122+
(NULL::date, '2009-07-30'::date),
123+
('2009-07-31'::date, NULL::date),
124+
(NULL::date, NULL::date);
125+
----
126+
-1
127+
1
128+
NULL
129+
NULL
130+
NULL
131+
132+
133+
# Alias datediff
134+
query I
135+
SELECT datediff('2009-07-30'::date, '2009-07-31'::date);
136+
----
137+
-1
138+
139+
query I
140+
SELECT datediff(column1, column2)
141+
FROM VALUES
142+
('2009-07-30'::date, '2009-07-31'::date),
143+
('2009-07-31'::date, '2009-07-30'::date),
144+
(NULL::date, '2009-07-30'::date),
145+
('2009-07-31'::date, NULL::date),
146+
(NULL::date, NULL::date);
147+
----
148+
-1
149+
1
150+
NULL
151+
NULL
152+
NULL

datafusion/sqllogictest/test_files/spark/datetime/datediff.slt

Lines changed: 0 additions & 32 deletions
This file was deleted.

0 commit comments

Comments
 (0)