Skip to content

Commit 4f7e98d

Browse files
authored
fix: Handle fallback to arrow_compare between canonical and non canonical encodings of Utf8 and Binary (#2696)
1 parent 3651278 commit 4f7e98d

File tree

8 files changed

+80
-24
lines changed

8 files changed

+80
-24
lines changed

fuzz/fuzz_targets/file_io.rs

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ use vortex_array::stream::ArrayStreamArrayExt;
1414
use vortex_array::{Array, ArrayRef, ToCanonical};
1515
use vortex_buffer::ByteBufferMut;
1616
use vortex_dtype::{DType, StructDType};
17-
use vortex_error::VortexUnwrap;
17+
use vortex_error::{VortexUnwrap, vortex_panic};
1818
use vortex_file::{VortexOpenOptions, VortexWriteOptions};
1919

2020
fuzz_target!(|array_data: ArbitraryArray| -> Corpus {
@@ -66,13 +66,21 @@ fuzz_target!(|array_data: ArbitraryArray| -> Corpus {
6666
if matches!(array_data.dtype(), DType::Struct(_, _) | DType::List(_, _)) {
6767
compare_struct(array_data, output);
6868
} else {
69-
let r = compare(&array_data, &output, Operator::Eq).vortex_unwrap();
70-
let true_count = r
71-
.to_bool()
69+
let bool_result = compare(&array_data, &output, Operator::Eq)
7270
.vortex_unwrap()
73-
.boolean_buffer()
74-
.count_set_bits();
75-
assert_eq!(true_count, array_data.len());
71+
.to_bool()
72+
.vortex_unwrap();
73+
let true_count = bool_result.boolean_buffer().count_set_bits();
74+
if true_count != array_data.len()
75+
&& (bool_result.all_valid().vortex_unwrap()
76+
|| array_data.all_valid().vortex_unwrap())
77+
{
78+
vortex_panic!(
79+
"Failed to match original array {}with{}",
80+
array_data.tree_display(),
81+
output.tree_display()
82+
);
83+
}
7684
}
7785
});
7886

vortex-array/src/arrays/varbin/compute/compare.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ impl CompareFn<&VarBinArray> for VarBinEncoding {
5454
));
5555
}
5656

57-
let lhs = Datum::try_new(lhs.clone().into_array())?;
57+
let lhs = Datum::try_new(lhs)?;
5858

5959
// TODO(robert): Handle LargeString/Binary arrays
6060
let arrow_rhs: &dyn arrow_array::Datum = match rhs_const.dtype() {

vortex-array/src/arrays/varbin/compute/to_arrow.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,14 @@ impl ToArrowFn<&VarBinArray> for VarBinEncoding {
3131
data_type: &DataType,
3232
) -> VortexResult<Option<ArrayRef>> {
3333
let array_ref = match data_type {
34-
DataType::BinaryView | DataType::FixedSizeBinary(_) | DataType::Utf8View => {
34+
DataType::FixedSizeBinary(_) => {
3535
// TODO(ngates): we should support converting VarBin into these Arrow arrays.
3636
return Ok(None);
3737
}
38+
DataType::BinaryView | DataType::Utf8View => Ok(arrow_cast::cast(
39+
&*varbin_to_arrow::<i32>(array)?,
40+
data_type,
41+
)?),
3842
DataType::Binary | DataType::Utf8 => {
3943
// These are both supported with a zero-copy cast, see below
4044
varbin_to_arrow::<i32>(array)

vortex-array/src/arrays/varbinview/compute/mod.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,10 @@ impl ComputeVTable for VarBinViewEncoding {
4242
Some(self)
4343
}
4444

45+
fn min_max_fn(&self) -> Option<&dyn MinMaxFn<&dyn Array>> {
46+
Some(self)
47+
}
48+
4549
fn scalar_at_fn(&self) -> Option<&dyn ScalarAtFn<&dyn Array>> {
4650
Some(self)
4751
}
@@ -58,10 +62,6 @@ impl ComputeVTable for VarBinViewEncoding {
5862
Some(self)
5963
}
6064

61-
fn min_max_fn(&self) -> Option<&dyn MinMaxFn<&dyn Array>> {
62-
Some(self)
63-
}
64-
6565
fn uncompressed_size_fn(&self) -> Option<&dyn UncompressedSizeFn<&dyn Array>> {
6666
Some(self)
6767
}

vortex-array/src/arrow/datum.rs

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
use arrow_array::{Array as ArrowArray, ArrayRef as ArrowArrayRef, Datum as ArrowDatum};
2+
use arrow_schema::DataType;
23
use vortex_error::{VortexResult, vortex_panic};
34

45
use crate::arrays::ConstantArray;
@@ -15,15 +16,32 @@ pub struct Datum {
1516

1617
impl Datum {
1718
/// Create a new [`Datum`] from an [`ArrayRef`], which can then be passed to Arrow compute.
18-
pub fn try_new(array: ArrayRef) -> VortexResult<Self> {
19+
pub fn try_new(array: &dyn Array) -> VortexResult<Self> {
1920
if array.is_constant() {
2021
Ok(Self {
21-
array: slice(&array, 0, 1)?.into_arrow_preferred()?,
22+
array: slice(array, 0, 1)?.into_arrow_preferred()?,
2223
is_scalar: true,
2324
})
2425
} else {
2526
Ok(Self {
26-
array: array.into_arrow_preferred()?,
27+
array: array.to_array().into_arrow_preferred()?,
28+
is_scalar: false,
29+
})
30+
}
31+
}
32+
33+
pub fn with_target_datatype(
34+
array: &dyn Array,
35+
target_datatype: &DataType,
36+
) -> VortexResult<Self> {
37+
if array.is_constant() {
38+
Ok(Self {
39+
array: slice(array, 0, 1)?.into_arrow(target_datatype)?,
40+
is_scalar: true,
41+
})
42+
} else {
43+
Ok(Self {
44+
array: array.to_array().into_arrow(target_datatype)?,
2745
is_scalar: false,
2846
})
2947
}

vortex-array/src/compute/binary_numeric.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -150,8 +150,8 @@ fn arrow_numeric(
150150
let nullable = lhs.dtype().is_nullable() || rhs.dtype().is_nullable();
151151
let len = lhs.len();
152152

153-
let left = Datum::try_new(lhs.to_array())?;
154-
let right = Datum::try_new(rhs.to_array())?;
153+
let left = Datum::try_new(lhs)?;
154+
let right = Datum::try_new(rhs)?;
155155

156156
let array = match operator {
157157
BinaryNumericOperator::Add => arrow_arith::numeric::add(&left, &right)?,

vortex-array/src/compute/compare.rs

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ use std::fmt::{Display, Formatter};
33

44
use arrow_buffer::BooleanBuffer;
55
use arrow_ord::cmp;
6+
use arrow_schema::DataType;
67
use vortex_dtype::{DType, NativePType, Nullability};
78
use vortex_error::{VortexExpect, VortexResult, vortex_bail};
89
use vortex_scalar::Scalar;
@@ -194,8 +195,8 @@ fn arrow_compare(
194195
operator: Operator,
195196
) -> VortexResult<ArrayRef> {
196197
let nullable = left.dtype().is_nullable() || right.dtype().is_nullable();
197-
let lhs = Datum::try_new(left.to_array())?;
198-
let rhs = Datum::try_new(right.to_array())?;
198+
let lhs = datum_for_cmp(left)?;
199+
let rhs = datum_for_cmp(right)?;
199200

200201
let array = match operator {
201202
Operator::Eq => cmp::eq(&lhs, &rhs)?,
@@ -250,14 +251,26 @@ pub fn scalar_cmp(lhs: &Scalar, rhs: &Scalar, operator: Operator) -> Scalar {
250251
}
251252
}
252253

254+
// Make sure both of the arrays end up with the same arrow data type
255+
fn datum_for_cmp(array: &dyn Array) -> VortexResult<Datum> {
256+
if matches!(array.dtype(), DType::Utf8(_)) {
257+
Datum::with_target_datatype(array, &DataType::Utf8View)
258+
} else if matches!(array.dtype(), DType::Binary(_)) {
259+
Datum::with_target_datatype(array, &DataType::BinaryView)
260+
} else {
261+
Datum::try_new(array)
262+
}
263+
}
264+
253265
#[cfg(test)]
254266
mod tests {
255267
use arrow_buffer::BooleanBuffer;
256268
use itertools::Itertools;
269+
use rstest::rstest;
257270

258271
use super::*;
259272
use crate::ToCanonical;
260-
use crate::arrays::{BoolArray, ConstantArray};
273+
use crate::arrays::{BoolArray, ConstantArray, VarBinArray, VarBinViewArray};
261274
use crate::validity::Validity;
262275

263276
fn to_int_indices(indices_bits: BoolArray) -> Vec<u64> {
@@ -337,7 +350,7 @@ mod tests {
337350
assert_eq!(compare.len(), 10);
338351
}
339352

340-
#[rstest::rstest]
353+
#[rstest]
341354
#[case(Operator::Eq, vec![false, false, false, true])]
342355
#[case(Operator::NotEq, vec![true, true, true, false])]
343356
#[case(Operator::Gt, vec![true, true, true, false])]
@@ -350,4 +363,17 @@ mod tests {
350363
let output = compare_lengths_to_empty(lengths.iter().copied(), op);
351364
assert_eq!(Vec::from_iter(output.iter()), expected);
352365
}
366+
367+
#[rstest]
368+
#[case(VarBinArray::from(vec!["a", "b"]).into_array(), VarBinViewArray::from_iter_str(["a", "b"]).into_array())]
369+
#[case(VarBinViewArray::from_iter_str(["a", "b"]).into_array(), VarBinArray::from(vec!["a", "b"]).into_array())]
370+
#[case(VarBinArray::from(vec!["a".as_bytes(), "b".as_bytes()]).into_array(), VarBinViewArray::from_iter_bin(["a".as_bytes(), "b".as_bytes()]).into_array())]
371+
#[case(VarBinViewArray::from_iter_bin(["a".as_bytes(), "b".as_bytes()]).into_array(), VarBinArray::from(vec!["a".as_bytes(), "b".as_bytes()]).into_array())]
372+
fn arrow_compare_different_encodings(#[case] left: ArrayRef, #[case] right: ArrayRef) {
373+
let res = arrow_compare(&left, &right, Operator::Eq).unwrap();
374+
assert_eq!(
375+
res.to_bool().unwrap().boolean_buffer().count_set_bits(),
376+
left.len()
377+
);
378+
}
353379
}

vortex-array/src/compute/like.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -116,8 +116,8 @@ pub(crate) fn arrow_like(
116116
"Arrow Like: length mismatch for {}",
117117
array.encoding()
118118
);
119-
let lhs = Datum::try_new(array.to_array())?;
120-
let rhs = Datum::try_new(pattern.to_array())?;
119+
let lhs = Datum::try_new(array)?;
120+
let rhs = Datum::try_new(pattern)?;
121121

122122
let result = match (options.negated, options.case_insensitive) {
123123
(false, false) => arrow_string::like::like(&lhs, &rhs)?,

0 commit comments

Comments
 (0)