Skip to content

Commit a643065

Browse files
feat: Support patching bool arrays, patch primitive array validity and use patching when canonicalizing sparse arrays (#1218)
Co-authored-by: Will Manning <[email protected]>
1 parent 635ea62 commit a643065

File tree

6 files changed

+274
-68
lines changed

6 files changed

+274
-68
lines changed

encodings/alp/src/alp/compress.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,9 +83,11 @@ fn patch_decoded(array: PrimitiveArray, patches: &Array) -> VortexResult<Primiti
8383
Sparse::ID => {
8484
match_each_alp_float_ptype!(array.ptype(), |$T| {
8585
let typed_patches = SparseArray::try_from(patches).unwrap();
86+
let primitive_values = typed_patches.values().into_primitive()?;
8687
array.patch(
8788
&typed_patches.resolved_indices(),
88-
typed_patches.values().into_primitive()?.maybe_null_slice::<$T>())
89+
primitive_values.maybe_null_slice::<$T>(),
90+
primitive_values.validity())
8991
})
9092
}
9193
_ => vortex_bail!(

encodings/fastlanes/src/bitpacking/compress.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -194,9 +194,11 @@ fn patch_unpacked(array: PrimitiveArray, patches: &Array) -> VortexResult<Primit
194194
Sparse::ID => {
195195
match_each_integer_ptype!(array.ptype(), |$T| {
196196
let typed_patches = SparseArray::try_from(patches).unwrap();
197+
let primitive_values = typed_patches.values().into_primitive()?;
197198
array.patch(
198199
&typed_patches.resolved_indices(),
199-
typed_patches.values().into_primitive()?.maybe_null_slice::<$T>())
200+
primitive_values.maybe_null_slice::<$T>(),
201+
primitive_values.validity())
200202
})
201203
}
202204
_ => vortex_bail!(

vortex-array/src/array/bool/mod.rs

Lines changed: 97 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
use std::fmt::{Debug, Display};
22

33
use arrow_buffer::bit_iterator::{BitIndexIterator, BitSliceIterator};
4-
use arrow_buffer::BooleanBuffer;
4+
use arrow_buffer::{BooleanBuffer, BooleanBufferBuilder, MutableBuffer};
55
use itertools::Itertools;
6+
use num_traits::AsPrimitive;
67
use serde::{Deserialize, Serialize};
78
use vortex_buffer::Buffer;
89
use vortex_dtype::DType;
9-
use vortex_error::{VortexExpect as _, VortexResult};
10+
use vortex_error::{vortex_bail, VortexExpect as _, VortexResult};
1011

1112
use crate::array::visitor::{AcceptArrayVisitor, ArrayVisitor};
1213
use crate::encoding::ids;
@@ -34,12 +35,21 @@ impl Display for BoolMetadata {
3435
}
3536

3637
impl BoolArray {
38+
/// Access internal array buffer
3739
pub fn buffer(&self) -> &Buffer {
3840
self.as_ref()
3941
.buffer()
4042
.vortex_expect("Missing buffer in BoolArray")
4143
}
4244

45+
/// Convert array into its internal buffer
46+
pub fn into_buffer(self) -> Buffer {
47+
self.into_array()
48+
.into_buffer()
49+
.vortex_expect("BoolArray must have a buffer")
50+
}
51+
52+
/// Get array values as an arrow [BooleanBuffer]
4353
pub fn boolean_buffer(&self) -> BooleanBuffer {
4454
BooleanBuffer::new(
4555
self.buffer().clone().into_arrow(),
@@ -48,6 +58,33 @@ impl BoolArray {
4858
)
4959
}
5060

61+
/// Get a mutable version of this array.
62+
///
63+
/// If the caller holds the only reference to the underlying buffer the underlying buffer is returned
64+
/// otherwise a copy is created.
65+
///
66+
/// The second value of the tuple is a bit_offset of first value in first byte of the returned builder
67+
pub fn into_boolean_builder(self) -> (BooleanBufferBuilder, usize) {
68+
let first_byte_bit_offset = self.metadata().first_byte_bit_offset as usize;
69+
let len = self.len();
70+
let arrow_buffer = self.into_buffer().into_arrow();
71+
let mutable_buf = if arrow_buffer.ptr_offset() == 0 {
72+
arrow_buffer.into_mutable().unwrap_or_else(|b| {
73+
let mut buf = MutableBuffer::with_capacity(b.len());
74+
buf.extend_from_slice(b.as_slice());
75+
buf
76+
})
77+
} else {
78+
let mut buf = MutableBuffer::with_capacity(arrow_buffer.len());
79+
buf.extend_from_slice(arrow_buffer.as_slice());
80+
buf
81+
};
82+
(
83+
BooleanBufferBuilder::new_from_buffer(mutable_buf, len + first_byte_bit_offset),
84+
first_byte_bit_offset,
85+
)
86+
}
87+
5188
pub fn validity(&self) -> Validity {
5289
self.metadata().validity.to_validity(|| {
5390
self.as_ref()
@@ -85,6 +122,34 @@ impl BoolArray {
85122
let buffer = BooleanBuffer::from(bools);
86123
Self::try_new(buffer, validity).vortex_expect("Failed to create BoolArray from vec")
87124
}
125+
126+
pub fn patch<P: AsPrimitive<usize>>(
127+
self,
128+
positions: &[P],
129+
values: BoolArray,
130+
) -> VortexResult<Self> {
131+
if positions.len() != values.len() {
132+
vortex_bail!(
133+
"Positions and values passed to patch had different lengths {} and {}",
134+
positions.len(),
135+
values.len()
136+
);
137+
}
138+
if let Some(last_pos) = positions.last() {
139+
if last_pos.as_() >= self.len() {
140+
vortex_bail!(OutOfBounds: last_pos.as_(), 0, self.len())
141+
}
142+
}
143+
144+
let len = self.len();
145+
let result_validity = self.validity().patch(len, positions, values.validity())?;
146+
let (mut own_values, bit_offset) = self.into_boolean_builder();
147+
for (idx, value) in positions.iter().zip_eq(values.boolean_buffer().iter()) {
148+
own_values.set_bit(idx.as_() + bit_offset, value);
149+
}
150+
151+
Self::try_new(own_values.finish().slice(bit_offset, len), result_validity)
152+
}
88153
}
89154

90155
impl ArrayTrait for BoolArray {}
@@ -165,13 +230,15 @@ impl AcceptArrayVisitor for BoolArray {
165230

166231
#[cfg(test)]
167232
mod tests {
233+
use arrow_buffer::BooleanBuffer;
168234
use itertools::Itertools;
169235

170236
use crate::array::BoolArray;
237+
use crate::compute::slice;
171238
use crate::compute::unary::scalar_at;
172239
use crate::validity::Validity;
173240
use crate::variants::BoolArrayTrait;
174-
use crate::IntoArray;
241+
use crate::{IntoArray, IntoArrayVariant};
175242

176243
#[test]
177244
fn bool_array() {
@@ -238,4 +305,31 @@ mod tests {
238305
assert_eq!(0, arr.maybe_null_indices_iter().collect_vec().len());
239306
assert_eq!(0, arr.maybe_null_slices_iter().collect_vec().len());
240307
}
308+
309+
#[test]
310+
fn patch_sliced_bools() {
311+
let arr = BoolArray::from(BooleanBuffer::new_set(12));
312+
let sliced = slice(arr, 4, 12).unwrap();
313+
let (values, offset) = sliced.into_bool().unwrap().into_boolean_builder();
314+
assert_eq!(offset, 4);
315+
assert_eq!(values.as_slice(), &[255, 15]);
316+
}
317+
318+
#[test]
319+
fn patch_sliced_bools_offset() {
320+
let arr = BoolArray::from(BooleanBuffer::new_set(15));
321+
let sliced = slice(arr, 4, 15).unwrap();
322+
let (values, offset) = sliced.into_bool().unwrap().into_boolean_builder();
323+
assert_eq!(offset, 4);
324+
assert_eq!(values.as_slice(), &[255, 127]);
325+
}
326+
327+
#[test]
328+
fn patch_sliced_bools_even() {
329+
let arr = BoolArray::from(BooleanBuffer::new_set(31));
330+
let sliced = slice(arr, 8, 24).unwrap();
331+
let (values, offset) = sliced.into_bool().unwrap().into_boolean_builder();
332+
assert_eq!(offset, 0);
333+
assert_eq!(values.as_slice(), &[255, 255]);
334+
}
241335
}

vortex-array/src/array/primitive/mod.rs

Lines changed: 37 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -162,19 +162,34 @@ impl PrimitiveArray {
162162
self,
163163
positions: &[P],
164164
values: &[T],
165+
values_validity: Validity,
165166
) -> VortexResult<Self> {
167+
if positions.len() != values.len() {
168+
vortex_bail!(
169+
"Positions and values passed to patch had different lengths {} and {}",
170+
positions.len(),
171+
values.len()
172+
);
173+
}
174+
if let Some(last_pos) = positions.last() {
175+
if last_pos.as_() >= self.len() {
176+
vortex_bail!(OutOfBounds: last_pos.as_(), 0, self.len())
177+
}
178+
}
179+
166180
if self.ptype() != T::PTYPE {
167181
vortex_bail!(MismatchedTypes: self.dtype(), T::PTYPE)
168182
}
169183

170-
let validity = self.validity();
171-
172-
let mut own_values = self.into_maybe_null_slice();
173-
// TODO(robert): Also patch validity
174-
for (idx, value) in positions.iter().zip_eq(values.iter()) {
175-
own_values[(*idx).as_()] = *value;
184+
let result_validity = self
185+
.validity()
186+
.patch(self.len(), positions, values_validity)?;
187+
let mut own_values = self.into_maybe_null_slice::<T>();
188+
for (idx, value) in positions.iter().zip_eq(values) {
189+
own_values[idx.as_()] = *value;
176190
}
177-
Ok(Self::from_vec(own_values, validity))
191+
192+
Ok(Self::from_vec(own_values, result_validity))
178193
}
179194

180195
pub fn into_buffer(self) -> Buffer {
@@ -461,6 +476,8 @@ mod tests {
461476
use vortex_scalar::Scalar;
462477

463478
use super::*;
479+
use crate::compute::slice;
480+
use crate::IntoArrayVariant;
464481

465482
#[test]
466483
fn batched_iter() {
@@ -546,4 +563,17 @@ mod tests {
546563
assert_eq!(o.unwrap(), 3);
547564
}
548565
}
566+
567+
#[test]
568+
fn patch_sliced() {
569+
let input = PrimitiveArray::from_vec(vec![2u32; 10], Validity::AllValid);
570+
let sliced = slice(input, 2, 8).unwrap();
571+
assert_eq!(
572+
sliced
573+
.into_primitive()
574+
.unwrap()
575+
.into_maybe_null_slice::<u32>(),
576+
vec![2u32; 6]
577+
);
578+
}
549579
}

vortex-array/src/array/sparse/canonical.rs

Lines changed: 45 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
use arrow_buffer::BooleanBufferBuilder;
2-
use vortex_dtype::{match_each_native_ptype, DType, NativePType};
1+
use arrow_buffer::{ArrowNativeType, BooleanBuffer};
2+
use vortex_dtype::{match_each_native_ptype, DType, NativePType, Nullability};
33
use vortex_error::{VortexError, VortexResult};
44
use vortex_scalar::ScalarValue;
55

@@ -12,93 +12,82 @@ use crate::{ArrayDType, Canonical, IntoArrayVariant, IntoCanonical};
1212

1313
impl IntoCanonical for SparseArray {
1414
fn into_canonical(self) -> VortexResult<Canonical> {
15-
// Resolve our indices into a vector of usize, applying the offset
15+
// Resolve our indices into a vector of usize applying the offset
1616
let indices = self.resolved_indices();
1717

1818
if matches!(self.dtype(), DType::Bool(_)) {
19-
canonicalize_sparse_bools(
20-
self.values().into_bool()?,
21-
&indices,
22-
self.len(),
23-
self.fill_value(),
24-
)
19+
let values = self.values().into_bool()?;
20+
canonicalize_sparse_bools(values, &indices, self.len(), self.fill_value())
2521
} else {
2622
let values = self.values().into_primitive()?;
2723
match_each_native_ptype!(values.ptype(), |$P| {
28-
canonicalize_sparse_primitives(
29-
values.maybe_null_slice::<$P>(),
30-
values.validity(),
24+
canonicalize_sparse_primitives::<$P>(
25+
values,
3126
&indices,
3227
self.len(),
33-
self.fill_value()
28+
self.fill_value(),
3429
)
3530
})
3631
}
3732
}
3833
}
3934

4035
fn canonicalize_sparse_bools(
41-
values_array: BoolArray,
36+
values: BoolArray,
4237
indices: &[usize],
4338
len: usize,
4439
fill_value: &ScalarValue,
4540
) -> VortexResult<Canonical> {
46-
let fill_bool = if fill_value.is_null() {
47-
false
41+
let (fill_bool, validity) = if fill_value.is_null() {
42+
(false, Validity::AllInvalid)
4843
} else {
49-
fill_value.try_into()?
44+
(
45+
fill_value.try_into()?,
46+
if values.dtype().nullability() == Nullability::NonNullable {
47+
Validity::NonNullable
48+
} else {
49+
Validity::AllValid
50+
},
51+
)
5052
};
5153

52-
let values_validity = values_array.validity();
53-
let values = values_array.boolean_buffer();
54-
55-
// pre-fill both values and validity based on fill_value
56-
// this optimizes performance for the common case where indices.len() < len / 2
57-
let mut flat_bools = BooleanBufferBuilder::new(len);
58-
flat_bools.append_n(len, fill_bool);
59-
let mut validity_buffer = BooleanBufferBuilder::new(len);
60-
validity_buffer.append_n(len, !fill_value.is_null());
61-
62-
// patch in the actual values and validity
63-
for (i, idx) in indices.iter().enumerate() {
64-
flat_bools.set_bit(*idx, values.value(i));
65-
validity_buffer.set_bit(*idx, values_validity.is_valid(i));
66-
}
67-
68-
BoolArray::try_new(
69-
flat_bools.finish(),
70-
Validity::from(validity_buffer.finish()),
71-
)
72-
.map(Canonical::Bool)
54+
let bools = BoolArray::try_new(
55+
if fill_bool {
56+
BooleanBuffer::new_set(len)
57+
} else {
58+
BooleanBuffer::new_unset(len)
59+
},
60+
validity,
61+
)?;
62+
let patched = bools.patch(indices, values)?;
63+
Ok(Canonical::Bool(patched))
7364
}
7465

7566
fn canonicalize_sparse_primitives<
76-
T: NativePType + for<'a> TryFrom<&'a ScalarValue, Error = VortexError>,
67+
T: NativePType + for<'a> TryFrom<&'a ScalarValue, Error = VortexError> + ArrowNativeType,
7768
>(
78-
values: &[T],
79-
values_validity: Validity,
69+
values: PrimitiveArray,
8070
indices: &[usize],
8171
len: usize,
8272
fill_value: &ScalarValue,
8373
) -> VortexResult<Canonical> {
84-
let primitive_fill = if fill_value.is_null() {
85-
T::default()
74+
let values_validity = values.validity();
75+
let (primitive_fill, validity) = if fill_value.is_null() {
76+
(T::default(), Validity::AllInvalid)
8677
} else {
87-
fill_value.try_into()?
78+
(
79+
fill_value.try_into()?,
80+
if values_validity == Validity::NonNullable {
81+
Validity::NonNullable
82+
} else {
83+
Validity::AllValid
84+
},
85+
)
8886
};
89-
let mut result = vec![primitive_fill; len];
90-
let mut validity = BooleanBufferBuilder::new(len);
91-
validity.append_n(len, !fill_value.is_null());
92-
93-
for (i, idx) in indices.iter().enumerate() {
94-
result[*idx] = values[i];
95-
validity.set_bit(*idx, values_validity.is_valid(i));
96-
}
9787

98-
Ok(Canonical::Primitive(PrimitiveArray::from_vec(
99-
result,
100-
Validity::from(validity.finish()),
101-
)))
88+
let parray = PrimitiveArray::from_vec(vec![primitive_fill; len], validity);
89+
let patched = parray.patch(indices, values.maybe_null_slice::<T>(), values_validity)?;
90+
Ok(Canonical::Primitive(patched))
10291
}
10392

10493
#[cfg(test)]

0 commit comments

Comments
 (0)