diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs index 59fed88b14bf..ce3d4ced4e3a 100644 --- a/datafusion/physical-expr/src/expressions/binary.rs +++ b/datafusion/physical-expr/src/expressions/binary.rs @@ -17,16 +17,14 @@ mod kernels; -use crate::expressions::binary::kernels::concat_elements_utf8view; use crate::intervals::cp_solver::{propagate_arithmetic, propagate_comparison}; use crate::PhysicalExpr; use std::hash::Hash; use std::{any::Any, sync::Arc}; use arrow::array::*; -use arrow::compute::kernels::boolean::{and_kleene, not, or_kleene}; +use arrow::compute::kernels::boolean::{and_kleene, or_kleene}; use arrow::compute::kernels::cmp::*; -use arrow::compute::kernels::comparison::{regexp_is_match, regexp_is_match_scalar}; use arrow::compute::kernels::concat_elements::concat_elements_utf8; use arrow::compute::{ cast, filter_record_batch, ilike, like, nilike, nlike, SlicesIterator, @@ -50,6 +48,7 @@ use kernels::{ bitwise_and_dyn, bitwise_and_dyn_scalar, bitwise_or_dyn, bitwise_or_dyn_scalar, bitwise_shift_left_dyn, bitwise_shift_left_dyn_scalar, bitwise_shift_right_dyn, bitwise_shift_right_dyn_scalar, bitwise_xor_dyn, bitwise_xor_dyn_scalar, + concat_elements_utf8view, regex_match_dyn, regex_match_dyn_scalar, }; /// Binary expression @@ -166,177 +165,6 @@ fn boolean_op( op(ll, rr).map(|t| Arc::new(t) as _) } -macro_rules! binary_string_array_flag_op { - ($LEFT:expr, $RIGHT:expr, $OP:ident, $NOT:expr, $FLAG:expr) => {{ - match $LEFT.data_type() { - DataType::Utf8 => { - compute_utf8_flag_op!($LEFT, $RIGHT, $OP, StringArray, $NOT, $FLAG) - }, - DataType::Utf8View => { - compute_utf8view_flag_op!($LEFT, $RIGHT, $OP, StringViewArray, $NOT, $FLAG) - } - DataType::LargeUtf8 => { - compute_utf8_flag_op!($LEFT, $RIGHT, $OP, LargeStringArray, $NOT, $FLAG) - }, - other => internal_err!( - "Data type {} not supported for binary_string_array_flag_op operation '{}' on string array", - other, stringify!($OP) - ), - } - }}; -} - -/// Invoke a compute kernel on a pair of binary data arrays with flags -macro_rules! compute_utf8_flag_op { - ($LEFT:expr, $RIGHT:expr, $OP:ident, $ARRAYTYPE:ident, $NOT:expr, $FLAG:expr) => {{ - let ll = $LEFT - .as_any() - .downcast_ref::<$ARRAYTYPE>() - .expect("compute_utf8_flag_op failed to downcast array"); - let rr = $RIGHT - .as_any() - .downcast_ref::<$ARRAYTYPE>() - .expect("compute_utf8_flag_op failed to downcast array"); - - let flag = if $FLAG { - Some($ARRAYTYPE::from(vec!["i"; ll.len()])) - } else { - None - }; - let mut array = $OP(ll, rr, flag.as_ref())?; - if $NOT { - array = not(&array).unwrap(); - } - Ok(Arc::new(array)) - }}; -} - -/// Invoke a compute kernel on a pair of binary data arrays with flags -macro_rules! compute_utf8view_flag_op { - ($LEFT:expr, $RIGHT:expr, $OP:ident, $ARRAYTYPE:ident, $NOT:expr, $FLAG:expr) => {{ - let ll = $LEFT - .as_any() - .downcast_ref::<$ARRAYTYPE>() - .expect("compute_utf8view_flag_op failed to downcast array"); - let rr = $RIGHT - .as_any() - .downcast_ref::<$ARRAYTYPE>() - .expect("compute_utf8view_flag_op failed to downcast array"); - - let flag = if $FLAG { - Some($ARRAYTYPE::from(vec!["i"; ll.len()])) - } else { - None - }; - let mut array = $OP(ll, rr, flag.as_ref())?; - if $NOT { - array = not(&array).unwrap(); - } - Ok(Arc::new(array)) - }}; -} - -macro_rules! binary_string_array_flag_op_scalar { - ($LEFT:ident, $RIGHT:expr, $OP:ident, $NOT:expr, $FLAG:expr) => {{ - // This macro is slightly different from binary_string_array_flag_op because, when comparing with a scalar value, - // the query can be optimized in such a way that operands will be dicts, so we need to support it here - let result: Result> = match $LEFT.data_type() { - DataType::Utf8 => { - compute_utf8_flag_op_scalar!($LEFT, $RIGHT, $OP, StringArray, $NOT, $FLAG) - }, - DataType::Utf8View => { - compute_utf8view_flag_op_scalar!($LEFT, $RIGHT, $OP, StringViewArray, $NOT, $FLAG) - } - DataType::LargeUtf8 => { - compute_utf8_flag_op_scalar!($LEFT, $RIGHT, $OP, LargeStringArray, $NOT, $FLAG) - }, - DataType::Dictionary(_, _) => { - let values = $LEFT.as_any_dictionary().values(); - - match values.data_type() { - DataType::Utf8 => compute_utf8_flag_op_scalar!(values, $RIGHT, $OP, StringArray, $NOT, $FLAG), - DataType::Utf8View => compute_utf8view_flag_op_scalar!(values, $RIGHT, $OP, StringViewArray, $NOT, $FLAG), - DataType::LargeUtf8 => compute_utf8_flag_op_scalar!(values, $RIGHT, $OP, LargeStringArray, $NOT, $FLAG), - other => internal_err!( - "Data type {} not supported as a dictionary value type for binary_string_array_flag_op_scalar operation '{}' on string array", - other, stringify!($OP) - ), - }.map( - // downcast_dictionary_array duplicates code per possible key type, so we aim to do all prep work before - |evaluated_values| downcast_dictionary_array! { - $LEFT => { - let unpacked_dict = evaluated_values.take_iter($LEFT.keys().iter().map(|opt| opt.map(|v| v as _))).collect::(); - Arc::new(unpacked_dict) as _ - }, - _ => unreachable!(), - } - ) - }, - other => internal_err!( - "Data type {} not supported for binary_string_array_flag_op_scalar operation '{}' on string array", - other, stringify!($OP) - ), - }; - Some(result) - }}; -} - -/// Invoke a compute kernel on a data array and a scalar value with flag -macro_rules! compute_utf8_flag_op_scalar { - ($LEFT:expr, $RIGHT:expr, $OP:ident, $ARRAYTYPE:ident, $NOT:expr, $FLAG:expr) => {{ - let ll = $LEFT - .as_any() - .downcast_ref::<$ARRAYTYPE>() - .expect("compute_utf8_flag_op_scalar failed to downcast array"); - - let string_value = match $RIGHT.try_as_str() { - Some(Some(string_value)) => string_value, - // null literal or non string - _ => return internal_err!( - "compute_utf8_flag_op_scalar failed to cast literal value {} for operation '{}'", - $RIGHT, stringify!($OP) - ) - }; - - let flag = $FLAG.then_some("i"); - let mut array = - paste::expr! {[<$OP _scalar>]}(ll, &string_value, flag)?; - if $NOT { - array = not(&array).unwrap(); - } - - Ok(Arc::new(array)) - }}; -} - -/// Invoke a compute kernel on a data array and a scalar value with flag -macro_rules! compute_utf8view_flag_op_scalar { - ($LEFT:expr, $RIGHT:expr, $OP:ident, $ARRAYTYPE:ident, $NOT:expr, $FLAG:expr) => {{ - let ll = $LEFT - .as_any() - .downcast_ref::<$ARRAYTYPE>() - .expect("compute_utf8view_flag_op_scalar failed to downcast array"); - - let string_value = match $RIGHT.try_as_str() { - Some(Some(string_value)) => string_value, - // null literal or non string - _ => return internal_err!( - "compute_utf8view_flag_op_scalar failed to cast literal value {} for operation '{}'", - $RIGHT, stringify!($OP) - ) - }; - - let flag = $FLAG.then_some("i"); - let mut array = - paste::expr! {[<$OP _scalar>]}(ll, &string_value, flag)?; - if $NOT { - array = not(&array).unwrap(); - } - - Ok(Arc::new(array)) - }}; -} - impl PhysicalExpr for BinaryExpr { /// Return a reference to Any that can be used for downcasting fn as_any(&self) -> &dyn Any { @@ -752,34 +580,10 @@ impl BinaryExpr { ) -> Result>> { use Operator::*; let scalar_result = match &self.op { - RegexMatch => binary_string_array_flag_op_scalar!( - array, - scalar, - regexp_is_match, - false, - false - ), - RegexIMatch => binary_string_array_flag_op_scalar!( - array, - scalar, - regexp_is_match, - false, - true - ), - RegexNotMatch => binary_string_array_flag_op_scalar!( - array, - scalar, - regexp_is_match, - true, - false - ), - RegexNotIMatch => binary_string_array_flag_op_scalar!( - array, - scalar, - regexp_is_match, - true, - true - ), + RegexMatch => regex_match_dyn_scalar(array, scalar, false, false), + RegexIMatch => regex_match_dyn_scalar(array, scalar, false, true), + RegexNotMatch => regex_match_dyn_scalar(array, scalar, true, false), + RegexNotIMatch => regex_match_dyn_scalar(array, scalar, true, true), BitwiseAnd => bitwise_and_dyn_scalar(array, scalar), BitwiseOr => bitwise_or_dyn_scalar(array, scalar), BitwiseXor => bitwise_xor_dyn_scalar(array, scalar), @@ -828,18 +632,10 @@ impl BinaryExpr { ) } } - RegexMatch => { - binary_string_array_flag_op!(left, right, regexp_is_match, false, false) - } - RegexIMatch => { - binary_string_array_flag_op!(left, right, regexp_is_match, false, true) - } - RegexNotMatch => { - binary_string_array_flag_op!(left, right, regexp_is_match, true, false) - } - RegexNotIMatch => { - binary_string_array_flag_op!(left, right, regexp_is_match, true, true) - } + RegexMatch => regex_match_dyn(left, right, false, false), + RegexIMatch => regex_match_dyn(left, right, false, true), + RegexNotMatch => regex_match_dyn(left, right, true, false), + RegexNotIMatch => regex_match_dyn(left, right, true, true), BitwiseAnd => bitwise_and_dyn(left, right), BitwiseOr => bitwise_or_dyn(left, right), BitwiseXor => bitwise_xor_dyn(left, right), diff --git a/datafusion/physical-expr/src/expressions/binary/kernels.rs b/datafusion/physical-expr/src/expressions/binary/kernels.rs index d2553146cbf1..71d1242eea85 100644 --- a/datafusion/physical-expr/src/expressions/binary/kernels.rs +++ b/datafusion/physical-expr/src/expressions/binary/kernels.rs @@ -23,15 +23,17 @@ use arrow::compute::kernels::bitwise::{ bitwise_shift_left_scalar, bitwise_shift_right, bitwise_shift_right_scalar, bitwise_xor, bitwise_xor_scalar, }; +use arrow::compute::kernels::boolean::not; +use arrow::compute::kernels::comparison::{regexp_is_match, regexp_is_match_scalar}; use arrow::datatypes::DataType; -use datafusion_common::plan_err; +use arrow::error::ArrowError; +use datafusion_common::{internal_err, plan_err}; use datafusion_common::{Result, ScalarValue}; -use arrow::error::ArrowError; use std::sync::Arc; /// Downcasts $LEFT and $RIGHT to $ARRAY_TYPE and then calls $KERNEL($LEFT, $RIGHT) -macro_rules! call_bitwise_kernel { +macro_rules! call_kernel { ($LEFT:expr, $RIGHT:expr, $KERNEL:expr, $ARRAY_TYPE:ident) => {{ let left = $LEFT.as_any().downcast_ref::<$ARRAY_TYPE>().unwrap(); let right = $RIGHT.as_any().downcast_ref::<$ARRAY_TYPE>().unwrap(); @@ -42,33 +44,33 @@ macro_rules! call_bitwise_kernel { /// Creates a $FUNC(left: ArrayRef, right: ArrayRef) that /// downcasts left / right to the appropriate integral type and calls the kernel -macro_rules! create_dyn_kernel { +macro_rules! create_left_integral_dyn_kernel { ($FUNC:ident, $KERNEL:ident) => { pub(crate) fn $FUNC(left: ArrayRef, right: ArrayRef) -> Result { match &left.data_type() { DataType::Int8 => { - call_bitwise_kernel!(left, right, $KERNEL, Int8Array) + call_kernel!(left, right, $KERNEL, Int8Array) } DataType::Int16 => { - call_bitwise_kernel!(left, right, $KERNEL, Int16Array) + call_kernel!(left, right, $KERNEL, Int16Array) } DataType::Int32 => { - call_bitwise_kernel!(left, right, $KERNEL, Int32Array) + call_kernel!(left, right, $KERNEL, Int32Array) } DataType::Int64 => { - call_bitwise_kernel!(left, right, $KERNEL, Int64Array) + call_kernel!(left, right, $KERNEL, Int64Array) } DataType::UInt8 => { - call_bitwise_kernel!(left, right, $KERNEL, UInt8Array) + call_kernel!(left, right, $KERNEL, UInt8Array) } DataType::UInt16 => { - call_bitwise_kernel!(left, right, $KERNEL, UInt16Array) + call_kernel!(left, right, $KERNEL, UInt16Array) } DataType::UInt32 => { - call_bitwise_kernel!(left, right, $KERNEL, UInt32Array) + call_kernel!(left, right, $KERNEL, UInt32Array) } DataType::UInt64 => { - call_bitwise_kernel!(left, right, $KERNEL, UInt64Array) + call_kernel!(left, right, $KERNEL, UInt64Array) } other => plan_err!( "Data type {} not supported for binary operation '{}' on dyn arrays", @@ -80,14 +82,14 @@ macro_rules! create_dyn_kernel { }; } -create_dyn_kernel!(bitwise_or_dyn, bitwise_or); -create_dyn_kernel!(bitwise_xor_dyn, bitwise_xor); -create_dyn_kernel!(bitwise_and_dyn, bitwise_and); -create_dyn_kernel!(bitwise_shift_right_dyn, bitwise_shift_right); -create_dyn_kernel!(bitwise_shift_left_dyn, bitwise_shift_left); +create_left_integral_dyn_kernel!(bitwise_or_dyn, bitwise_or); +create_left_integral_dyn_kernel!(bitwise_xor_dyn, bitwise_xor); +create_left_integral_dyn_kernel!(bitwise_and_dyn, bitwise_and); +create_left_integral_dyn_kernel!(bitwise_shift_right_dyn, bitwise_shift_right); +create_left_integral_dyn_kernel!(bitwise_shift_left_dyn, bitwise_shift_left); /// Downcasts $LEFT as $ARRAY_TYPE and $RIGHT as TYPE and calls $KERNEL($LEFT, $RIGHT) -macro_rules! call_bitwise_scalar_kernel { +macro_rules! call_scalar_kernel { ($LEFT:expr, $RIGHT:expr, $KERNEL:ident, $ARRAY_TYPE:ident, $TYPE:ty) => {{ let len = $LEFT.len(); let array = $LEFT.as_any().downcast_ref::<$ARRAY_TYPE>().unwrap(); @@ -104,18 +106,18 @@ macro_rules! call_bitwise_scalar_kernel { /// Creates a $FUNC(left: ArrayRef, right: ScalarValue) that /// downcasts left / right to the appropriate integral type and calls the kernel -macro_rules! create_dyn_scalar_kernel { +macro_rules! create_left_integral_dyn_scalar_kernel { ($FUNC:ident, $KERNEL:ident) => { pub(crate) fn $FUNC(array: &dyn Array, scalar: ScalarValue) -> Option> { let result = match array.data_type() { - DataType::Int8 => call_bitwise_scalar_kernel!(array, scalar, $KERNEL, Int8Array, i8), - DataType::Int16 => call_bitwise_scalar_kernel!(array, scalar, $KERNEL, Int16Array, i16), - DataType::Int32 => call_bitwise_scalar_kernel!(array, scalar, $KERNEL, Int32Array, i32), - DataType::Int64 => call_bitwise_scalar_kernel!(array, scalar, $KERNEL, Int64Array, i64), - DataType::UInt8 => call_bitwise_scalar_kernel!(array, scalar, $KERNEL, UInt8Array, u8), - DataType::UInt16 => call_bitwise_scalar_kernel!(array, scalar, $KERNEL, UInt16Array, u16), - DataType::UInt32 => call_bitwise_scalar_kernel!(array, scalar, $KERNEL, UInt32Array, u32), - DataType::UInt64 => call_bitwise_scalar_kernel!(array, scalar, $KERNEL, UInt64Array, u64), + DataType::Int8 => call_scalar_kernel!(array, scalar, $KERNEL, Int8Array, i8), + DataType::Int16 => call_scalar_kernel!(array, scalar, $KERNEL, Int16Array, i16), + DataType::Int32 => call_scalar_kernel!(array, scalar, $KERNEL, Int32Array, i32), + DataType::Int64 => call_scalar_kernel!(array, scalar, $KERNEL, Int64Array, i64), + DataType::UInt8 => call_scalar_kernel!(array, scalar, $KERNEL, UInt8Array, u8), + DataType::UInt16 => call_scalar_kernel!(array, scalar, $KERNEL, UInt16Array, u16), + DataType::UInt32 => call_scalar_kernel!(array, scalar, $KERNEL, UInt32Array, u32), + DataType::UInt64 => call_scalar_kernel!(array, scalar, $KERNEL, UInt64Array, u64), other => plan_err!( "Data type {} not supported for binary operation '{}' on dyn arrays", other, @@ -127,11 +129,17 @@ macro_rules! create_dyn_scalar_kernel { }; } -create_dyn_scalar_kernel!(bitwise_and_dyn_scalar, bitwise_and_scalar); -create_dyn_scalar_kernel!(bitwise_or_dyn_scalar, bitwise_or_scalar); -create_dyn_scalar_kernel!(bitwise_xor_dyn_scalar, bitwise_xor_scalar); -create_dyn_scalar_kernel!(bitwise_shift_right_dyn_scalar, bitwise_shift_right_scalar); -create_dyn_scalar_kernel!(bitwise_shift_left_dyn_scalar, bitwise_shift_left_scalar); +create_left_integral_dyn_scalar_kernel!(bitwise_and_dyn_scalar, bitwise_and_scalar); +create_left_integral_dyn_scalar_kernel!(bitwise_or_dyn_scalar, bitwise_or_scalar); +create_left_integral_dyn_scalar_kernel!(bitwise_xor_dyn_scalar, bitwise_xor_scalar); +create_left_integral_dyn_scalar_kernel!( + bitwise_shift_right_dyn_scalar, + bitwise_shift_right_scalar +); +create_left_integral_dyn_scalar_kernel!( + bitwise_shift_left_dyn_scalar, + bitwise_shift_left_scalar +); pub fn concat_elements_utf8view( left: &StringViewArray, @@ -164,3 +172,125 @@ pub fn concat_elements_utf8view( } Ok(result.finish()) } + +/// Invoke a compute kernel on a pair of binary data arrays with flags +macro_rules! regexp_is_match_flag { + ($LEFT:expr, $RIGHT:expr, $ARRAYTYPE:ident, $NOT:expr, $FLAG:expr) => {{ + let ll = $LEFT + .as_any() + .downcast_ref::<$ARRAYTYPE>() + .expect("failed to downcast array"); + let rr = $RIGHT + .as_any() + .downcast_ref::<$ARRAYTYPE>() + .expect("failed to downcast array"); + + let flag = if $FLAG { + Some($ARRAYTYPE::from(vec!["i"; ll.len()])) + } else { + None + }; + let mut array = regexp_is_match(ll, rr, flag.as_ref())?; + if $NOT { + array = not(&array).unwrap(); + } + Ok(Arc::new(array)) + }}; +} + +pub(crate) fn regex_match_dyn( + left: ArrayRef, + right: ArrayRef, + not_match: bool, + flag: bool, +) -> Result { + match left.data_type() { + DataType::Utf8 => { + regexp_is_match_flag!(left, right, StringArray, not_match, flag) + } + DataType::Utf8View => { + regexp_is_match_flag!(left, right, StringViewArray, not_match, flag) + } + DataType::LargeUtf8 => { + regexp_is_match_flag!(left, right, LargeStringArray, not_match, flag) + } + other => internal_err!( + "Data type {} not supported for regex_match_dyn on string array", + other + ), + } +} + +/// Invoke a compute kernel on a data array and a scalar value with flag +macro_rules! regexp_is_match_flag_scalar { + ($LEFT:expr, $RIGHT:expr, $ARRAYTYPE:ident, $NOT:expr, $FLAG:expr) => {{ + let ll = $LEFT + .as_any() + .downcast_ref::<$ARRAYTYPE>() + .expect("failed to downcast array"); + + if let Some(Some(string_value)) = $RIGHT.try_as_str() { + let flag = $FLAG.then_some("i"); + match regexp_is_match_scalar(ll, &string_value, flag) { + Ok(mut array) => { + if $NOT { + array = not(&array).unwrap(); + } + Ok(Arc::new(array)) + } + Err(e) => internal_err!("failed to call 'regex_match_dyn_scalar' {}", e), + } + } else { + internal_err!( + "failed to cast literal value {} for operation 'regex_match_dyn_scalar'", + $RIGHT + ) + } + }}; +} + +pub(crate) fn regex_match_dyn_scalar( + left: &dyn Array, + right: ScalarValue, + not_match: bool, + flag: bool, +) -> Option> { + let result: Result = match left.data_type() { + DataType::Utf8 => { + regexp_is_match_flag_scalar!(left, right, StringArray, not_match, flag) + }, + DataType::Utf8View => { + regexp_is_match_flag_scalar!(left, right, StringViewArray, not_match, flag) + } + DataType::LargeUtf8 => { + regexp_is_match_flag_scalar!(left, right, LargeStringArray, not_match, flag) + }, + DataType::Dictionary(_, _) => { + let values = left.as_any_dictionary().values(); + + match values.data_type() { + DataType::Utf8 => regexp_is_match_flag_scalar!(values, right, StringArray, not_match, flag), + DataType::Utf8View => regexp_is_match_flag_scalar!(values, right, StringViewArray, not_match, flag), + DataType::LargeUtf8 => regexp_is_match_flag_scalar!(values, right, LargeStringArray, not_match, flag), + other => internal_err!( + "Data type {} not supported as a dictionary value type for operation 'regex_match_dyn_scalar' on string array", + other + ), + }.map( + // downcast_dictionary_array duplicates code per possible key type, so we aim to do all prep work before + |evaluated_values| downcast_dictionary_array! { + left => { + let unpacked_dict = evaluated_values.take_iter(left.keys().iter().map(|opt| opt.map(|v| v as _))).collect::(); + Arc::new(unpacked_dict) as ArrayRef + }, + _ => unreachable!(), + } + ) + }, + other => internal_err!( + "Data type {} not supported for operation 'regex_match_dyn_scalar' on string array", + other + ), + }; + Some(result) +}