
Commit 902d3b3

fix: Add custom nullability for Spark LIKE function (#19218)
## Which issue does this PR close?

- Closes #19176

## Rationale for this change

This PR adds custom nullability handling for the Spark `LIKE` function. Previously, the function used the default `is_nullable`, which always returns true; that is not correct for `LIKE`.

## What changes are included in this PR?

- Implemented `return_field_from_args()` to handle the custom nullability logic
- The result is nullable if any of the input arguments is nullable
- This matches Spark's behavior, where `LIKE(NULL, pattern)` or `LIKE(str, NULL)` returns `NULL`
- Updated `return_type()` to use the `internal_err!` pattern to enforce use of `return_field_from_args`
- Added comprehensive nullability tests covering all combinations:
  - Non-nullable when both inputs are non-nullable
  - Nullable when the first input is nullable
  - Nullable when the second input is nullable
  - Nullable when both inputs are nullable

## Testing

All existing tests pass, including the newly added ones. The implementation follows the same pattern used by other Spark functions in the codebase (such as `shuffle` and `array`).
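To make the nullability rule concrete, here is a minimal standalone sketch of it, assuming only the `arrow` crate; the helper name `like_return_field` is invented for illustration and is not part of this PR (the actual implementation is in the `like.rs` diff below).

```rust
use std::sync::Arc;

use arrow::datatypes::{DataType, Field, FieldRef};

/// The LIKE result is a Boolean field that is nullable iff any argument field is nullable.
fn like_return_field(arg_fields: &[FieldRef]) -> FieldRef {
    let nullable = arg_fields.iter().any(|f| f.is_nullable());
    Arc::new(Field::new("like", DataType::Boolean, nullable))
}

fn main() {
    let nullable_str = Arc::new(Field::new("str", DataType::Utf8, true));
    let pattern = Arc::new(Field::new("pattern", DataType::Utf8, false));

    // A nullable input makes the result nullable, mirroring Spark, where
    // LIKE(NULL, pattern) or LIKE(str, NULL) evaluates to NULL.
    assert!(like_return_field(&[Arc::clone(&nullable_str), Arc::clone(&pattern)]).is_nullable());

    // Two non-nullable inputs produce a non-nullable Boolean result.
    let non_nullable_str = Arc::new(Field::new("str", DataType::Utf8, false));
    assert!(!like_return_field(&[non_nullable_str, pattern]).is_nullable());
}
```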
1 parent 0bd8809 commit 902d3b3

File tree

  • datafusion/spark/src/function/string/like.rs

1 file changed: +86, -6 lines

datafusion/spark/src/function/string/like.rs

Lines changed: 86 additions & 6 deletions
```diff
@@ -17,10 +17,12 @@
 
 use arrow::array::ArrayRef;
 use arrow::compute::like;
-use arrow::datatypes::DataType;
-use datafusion_common::{Result, exec_err};
+use arrow::datatypes::{DataType, Field, FieldRef};
+use datafusion_common::{Result, exec_err, internal_err};
 use datafusion_expr::ColumnarValue;
-use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility};
+use datafusion_expr::{
+    ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility,
+};
 use datafusion_functions::utils::make_scalar_function;
 use std::any::Any;
 use std::sync::Arc;
@@ -60,7 +62,16 @@ impl ScalarUDFImpl for SparkLike {
     }
 
     fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
-        Ok(DataType::Boolean)
+        internal_err!("return_field_from_args should be used instead")
+    }
+
+    fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
+        let nullable = args.arg_fields.iter().any(|f| f.is_nullable());
+        Ok(Arc::new(Field::new(
+            self.name(),
+            DataType::Boolean,
+            nullable,
+        )))
     }
 
     fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
@@ -83,9 +94,9 @@ mod tests {
     use super::*;
     use crate::function::utils::test::test_scalar_function;
     use arrow::array::{Array, BooleanArray};
-    use arrow::datatypes::DataType::Boolean;
+    use arrow::datatypes::{DataType::Boolean, Field};
     use datafusion_common::{Result, ScalarValue};
-    use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
+    use datafusion_expr::{ColumnarValue, ReturnFieldArgs, ScalarUDFImpl};
 
     macro_rules! test_like_string_invoke {
         ($INPUT1:expr, $INPUT2:expr, $EXPECTED:expr) => {
@@ -175,4 +186,73 @@ mod tests {
 
         Ok(())
     }
+
+    #[test]
+    fn test_like_nullability() {
+        let like = SparkLike::new();
+
+        // Test with non-nullable arguments
+        let non_nullable_field1 = Arc::new(Field::new("str", DataType::Utf8, false));
+        let non_nullable_field2 = Arc::new(Field::new("pattern", DataType::Utf8, false));
+
+        let both_non_nullable = like
+            .return_field_from_args(ReturnFieldArgs {
+                arg_fields: &[
+                    Arc::clone(&non_nullable_field1),
+                    Arc::clone(&non_nullable_field2),
+                ],
+                scalar_arguments: &[None, None],
+            })
+            .unwrap();
+
+        // The result should not be nullable when both inputs are non-nullable
+        assert!(!both_non_nullable.is_nullable());
+        assert_eq!(both_non_nullable.data_type(), &Boolean);
+
+        // Test with first argument nullable
+        let nullable_field1 = Arc::new(Field::new("str", DataType::Utf8, true));
+
+        let first_nullable = like
+            .return_field_from_args(ReturnFieldArgs {
+                arg_fields: &[
+                    Arc::clone(&nullable_field1),
+                    Arc::clone(&non_nullable_field2),
+                ],
+                scalar_arguments: &[None, None],
+            })
+            .unwrap();
+
+        // The result should be nullable when first input is nullable
+        assert!(first_nullable.is_nullable());
+        assert_eq!(first_nullable.data_type(), &Boolean);
+
+        // Test with second argument nullable
+        let nullable_field2 = Arc::new(Field::new("pattern", DataType::Utf8, true));
+
+        let second_nullable = like
+            .return_field_from_args(ReturnFieldArgs {
+                arg_fields: &[
+                    Arc::clone(&non_nullable_field1),
+                    Arc::clone(&nullable_field2),
+                ],
+                scalar_arguments: &[None, None],
+            })
+            .unwrap();
+
+        // The result should be nullable when second input is nullable
+        assert!(second_nullable.is_nullable());
+        assert_eq!(second_nullable.data_type(), &Boolean);
+
+        // Test with both arguments nullable
+        let first_second_nullable = like
+            .return_field_from_args(ReturnFieldArgs {
+                arg_fields: &[Arc::clone(&nullable_field1), Arc::clone(&nullable_field2)],
+                scalar_arguments: &[None, None],
+            })
+            .unwrap();
+
+        // The result should be nullable when both inputs are nullable
+        assert!(first_second_nullable.is_nullable());
+        assert_eq!(first_second_nullable.data_type(), &Boolean);
+    }
 }
```
