Skip to content

Commit 6b3b101

Browse files
committed
feat: mask
Mask sets entries of an array to null. I like the analogy to light: the array is a sequence of lights (each value might be a different wavelength). Null is represented by the absence of light. Placing a mask (i.e. a piece of plastic with slits) over the array causes those values where the mask is present (i.e. "on", "true") to be dark.

An example in pseudo-code:

```rust
a = [1, 2, 3, 4, 5]
a_mask = [t, f, f, t, f]
mask(a, a_mask) == [null, 2, 3, null, 5]
```

Specializations
---------------

I only fall back to Arrow for two of the core arrays:

- Sparse. I was skeptical that I could do better than decompressing and applying it.
- Constant. If the mask is sparse, SparseArray might be a good choice. I didn't investigate.

For the non-core arrays, I'm missing the following. It's not clear to me that I can beat decompression for run end. The others are easy enough but require some amount of typing and testing.

- fastlanes
- fsst
- roaring
- runend
- runend-bool
- zigzag

Naming
------

Pandas also calls this operation [`mask`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.mask.html) but accepts an optional second argument which is an array of values to use instead of null (which makes Pandas' mask more like an `if_else`).

Arrow-rs calls this [`nullif`](https://arrow.apache.org/rust/arrow/compute/fn.nullif.html).

Arrow-cpp has [`if_else(condition, consequent, alternate)`](https://arrow.apache.org/docs/cpp/compute.html#cpp-compute-scalar-selections) and [`replace_with_mask(array, mask, replacements)`](https://arrow.apache.org/docs/cpp/compute.html#replace-functions) both of which can implement our `mask` by passing a `NullArray` as the third argument.
1 parent f97c0cd commit 6b3b101

File tree

39 files changed

+1583
-93
lines changed

39 files changed

+1583
-93
lines changed
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
use vortex_array::compute::{mask, try_cast, FilterMask, MaskFn};
2+
use vortex_array::{ArrayDType as _, ArrayData, IntoArrayData};
3+
use vortex_error::VortexResult;
4+
5+
use crate::{ALPArray, ALPEncoding};
6+
7+
impl MaskFn<ALPArray> for ALPEncoding {
8+
fn mask(&self, array: &ALPArray, filter_mask: FilterMask) -> VortexResult<ArrayData> {
9+
ALPArray::try_new(
10+
mask(&array.encoded(), filter_mask)?,
11+
array.exponents(),
12+
array
13+
.patches()
14+
.map(|patches| {
15+
patches.map_values(|values| try_cast(&values, &values.dtype().as_nullable()))
16+
})
17+
.transpose()?,
18+
)
19+
.map(IntoArrayData::into_array)
20+
}
21+
}
22+
23+
#[cfg(test)]
mod tests {
    use vortex_array::array::PrimitiveArray;
    use vortex_array::compute::test_harness::test_mask;
    use vortex_array::validity::Validity;
    use vortex_array::IntoArrayData as _;
    use vortex_buffer::buffer;

    use crate::alp_encode;

    /// Runs the shared mask harness over ALP-encoded floats, once with
    /// all-valid validity and once with non-nullable validity.
    #[test]
    fn test_mask_no_patches_alp_array() {
        for validity in [Validity::AllValid, Validity::NonNullable] {
            let floats = PrimitiveArray::new(buffer![1.0f32, 2.0, 3.0, 4.0, 5.0], validity);
            let encoded = alp_encode(&floats).unwrap();
            test_mask(encoded.into_array());
        }
    }

    /// Same as above, but the input contains `1e10`, and the test asserts that
    /// the encoded array actually carries patches before masking it.
    #[test]
    fn test_mask_patched_alp_array() {
        for validity in [Validity::AllValid, Validity::NonNullable] {
            let floats = PrimitiveArray::new(buffer![1.0f32, 2.0, 3.0, 4.0, 1e10], validity);
            let encoded = alp_encode(&floats).unwrap();
            assert!(encoded.patches().is_some());
            test_mask(encoded.into_array());
        }
    }
}

encodings/alp/src/alp/compute/mod.rs

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
1+
mod mask;
2+
13
use vortex_array::compute::{
2-
filter, scalar_at, slice, take, ComputeVTable, FilterFn, FilterMask, ScalarAtFn, SliceFn,
3-
TakeFn,
4+
filter, scalar_at, slice, take, ComputeVTable, FilterFn, FilterMask, MaskFn, ScalarAtFn,
5+
SliceFn, TakeFn,
46
};
57
use vortex_array::variants::PrimitiveArrayTrait;
68
use vortex_array::{ArrayDType, ArrayData, IntoArrayData};
@@ -14,6 +16,10 @@ impl ComputeVTable for ALPEncoding {
1416
Some(self)
1517
}
1618

19+
/// Returns this encoding as its own `MaskFn` implementation.
fn mask_fn(&self) -> Option<&dyn MaskFn<ArrayData>> {
20+
Some(self)
21+
}
22+
1723
fn scalar_at_fn(&self) -> Option<&dyn ScalarAtFn<ArrayData>> {
1824
Some(self)
1925
}
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
use vortex_array::compute::{mask, FilterMask, MaskFn};
2+
use vortex_array::{ArrayDType, ArrayData, IntoArrayData};
3+
use vortex_error::VortexResult;
4+
5+
use crate::{ALPRDArray, ALPRDEncoding};
6+
7+
impl MaskFn<ALPRDArray> for ALPRDEncoding {
8+
fn mask(&self, array: &ALPRDArray, filter_mask: FilterMask) -> VortexResult<ArrayData> {
9+
Ok(ALPRDArray::try_new(
10+
array.dtype().as_nullable(),
11+
mask(&array.left_parts(), filter_mask)?,
12+
array.left_parts_dict(),
13+
array.right_parts(),
14+
array.right_bit_width(),
15+
array.left_parts_patches(),
16+
)?
17+
.into_array())
18+
}
19+
}
20+
21+
#[cfg(test)]
mod tests {
    use rstest::rstest;
    use vortex_array::array::PrimitiveArray;
    use vortex_array::compute::test_harness::test_mask;
    use vortex_array::IntoArrayData as _;

    use crate::{ALPRDFloat, RDEncoder};

    /// Masks an ALP-RD array built from two sampled values plus an outlier
    /// that is not part of the encoder's sample.
    #[rstest]
    #[case(0.1f32, 0.2f32, 3e25f32)]
    #[case(0.1f64, 0.2f64, 3e100f64)]
    fn test_mask_simple<T: ALPRDFloat>(#[case] a: T, #[case] b: T, #[case] outlier: T) {
        let sample = [a, b];
        let values = PrimitiveArray::from_iter([a, b, outlier, b, outlier]);
        let encoded = RDEncoder::new(&sample).encode(&values);
        test_mask(encoded.into_array());
    }

    /// Masks an ALP-RD array whose input already contains nulls.
    #[rstest]
    #[case(0.1f32, 3e25f32)]
    #[case(0.5f64, 1e100f64)]
    fn test_mask_with_nulls<T: ALPRDFloat>(#[case] a: T, #[case] outlier: T) {
        let sample = [a];
        let values =
            PrimitiveArray::from_option_iter([Some(a), None, Some(outlier), Some(a), None]);
        let encoded = RDEncoder::new(&sample).encode(&values);
        test_mask(encoded.into_array());
    }
}

encodings/alp/src/alp_rd/compute/mod.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
1-
use vortex_array::compute::{ComputeVTable, FilterFn, ScalarAtFn, SliceFn, TakeFn};
1+
use vortex_array::compute::{ComputeVTable, FilterFn, MaskFn, ScalarAtFn, SliceFn, TakeFn};
22
use vortex_array::ArrayData;
33

44
use crate::ALPRDEncoding;
55

66
mod filter;
7+
mod mask;
78
mod scalar_at;
89
mod slice;
910
mod take;
@@ -13,6 +14,10 @@ impl ComputeVTable for ALPRDEncoding {
1314
Some(self)
1415
}
1516

17+
/// Returns this encoding as its own `MaskFn` implementation.
fn mask_fn(&self) -> Option<&dyn MaskFn<ArrayData>> {
18+
Some(self)
19+
}
20+
1621
fn scalar_at_fn(&self) -> Option<&dyn ScalarAtFn<ArrayData>> {
1722
Some(self)
1823
}

encodings/bytebool/src/compute.rs

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
use num_traits::AsPrimitive;
2-
use vortex_array::compute::{ComputeVTable, FillForwardFn, ScalarAtFn, SliceFn, TakeFn};
2+
use vortex_array::compute::{
3+
ComputeVTable, FillForwardFn, FilterMask, MaskFn, ScalarAtFn, SliceFn, TakeFn,
4+
};
35
use vortex_array::validity::{ArrayValidity, Validity};
46
use vortex_array::variants::PrimitiveArrayTrait;
57
use vortex_array::{ArrayDType, ArrayData, ArrayLen, IntoArrayData, IntoArrayVariant, ToArrayData};
@@ -14,6 +16,10 @@ impl ComputeVTable for ByteBoolEncoding {
1416
None
1517
}
1618

19+
/// Returns this encoding as its own `MaskFn` implementation.
fn mask_fn(&self) -> Option<&dyn MaskFn<ArrayData>> {
20+
Some(self)
21+
}
22+
1723
fn scalar_at_fn(&self) -> Option<&dyn ScalarAtFn<ArrayData>> {
1824
Some(self)
1925
}
@@ -27,6 +33,13 @@ impl ComputeVTable for ByteBoolEncoding {
2733
}
2834
}
2935

36+
impl MaskFn<ByteBoolArray> for ByteBoolEncoding {
37+
fn mask(&self, array: &ByteBoolArray, mask: FilterMask) -> VortexResult<ArrayData> {
38+
ByteBoolArray::try_new(array.buffer().clone(), array.validity().mask(&mask)?)
39+
.map(IntoArrayData::into_array)
40+
}
41+
}
42+
3043
impl ScalarAtFn<ByteBoolArray> for ByteBoolEncoding {
3144
fn scalar_at(&self, array: &ByteBoolArray, index: usize) -> VortexResult<Scalar> {
3245
Ok(Scalar::bool(
@@ -136,6 +149,7 @@ impl FillForwardFn<ByteBoolArray> for ByteBoolEncoding {
136149

137150
#[cfg(test)]
138151
mod tests {
152+
use vortex_array::compute::test_harness::test_mask;
139153
use vortex_array::compute::{compare, scalar_at, slice, Operator};
140154

141155
use super::*;
@@ -208,4 +222,12 @@ mod tests {
208222
let s = scalar_at(&arr, 4).unwrap();
209223
assert!(s.is_null());
210224
}
225+
226+
/// Runs the shared mask harness over a byte-bool array built from plain
/// booleans and over one built from optional booleans.
#[test]
fn test_mask_byte_bool() {
    let without_nulls = ByteBoolArray::from(vec![true, false, true, true, false]);
    test_mask(without_nulls.into_array());

    let with_nulls = ByteBoolArray::from(vec![Some(true), Some(true), None, Some(false), None]);
    test_mask(with_nulls.into_array());
}
211233
}
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
use vortex_array::compute::{try_cast, CastFn};
2+
use vortex_array::{ArrayDType, ArrayData, IntoArrayData};
3+
use vortex_dtype::DType;
4+
use vortex_error::{vortex_bail, VortexResult};
5+
6+
use crate::{DateTimePartsArray, DateTimePartsEncoding};
7+
8+
impl CastFn<DateTimePartsArray> for DateTimePartsEncoding {
9+
fn cast(&self, array: &DateTimePartsArray, dtype: &DType) -> VortexResult<ArrayData> {
10+
if !array.dtype().eq_ignore_nullability(dtype) {
11+
vortex_bail!("cannot cast from {} to {}", array.dtype(), dtype);
12+
};
13+
14+
Ok(DateTimePartsArray::try_new(
15+
array.dtype().clone().as_nullable(),
16+
try_cast(
17+
array.days().as_ref(),
18+
&array.days().dtype().with_nullability(dtype.nullability()),
19+
)?,
20+
array.seconds(),
21+
array.subsecond(),
22+
)?
23+
.into_array())
24+
}
25+
}
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
use vortex_array::compute::{mask, FilterMask, MaskFn};
2+
use vortex_array::{ArrayDType, ArrayData, IntoArrayData};
3+
use vortex_error::VortexResult;
4+
5+
use crate::{DateTimePartsArray, DateTimePartsEncoding};
6+
7+
impl MaskFn<DateTimePartsArray> for DateTimePartsEncoding {
8+
fn mask(&self, array: &DateTimePartsArray, filter_mask: FilterMask) -> VortexResult<ArrayData> {
9+
Ok(DateTimePartsArray::try_new(
10+
array.dtype().clone().as_nullable(),
11+
mask(array.days().as_ref(), filter_mask)?,
12+
array.seconds(),
13+
array.subsecond(),
14+
)?
15+
.into_array())
16+
}
17+
}
18+
19+
#[cfg(test)]
mod tests {
    use vortex_array::array::TemporalArray;
    use vortex_array::compute::test_harness::test_mask;
    use vortex_array::IntoArrayData as _;
    use vortex_buffer::buffer;
    use vortex_datetime_dtype::TimeUnit;
    use vortex_dtype::DType;

    use crate::{split_temporal, DateTimePartsArray, TemporalParts};

    /// Runs the shared mask harness over a date-time-parts array built from
    /// millisecond timestamps that cover day-only, day + second, and
    /// day + second + sub-second values.
    #[test]
    fn test_mask_datetime_parts_array() {
        // The timestamps use `TimeUnit::Ms`, so a day is 86_400_000 ms and a
        // second is 1_000 ms. Using 86_400 here would be 86.4 seconds and would
        // never produce a day component.
        const MILLIS_PER_DAY: i64 = 86_400_000;
        const MILLIS_PER_SECOND: i64 = 1_000;

        let raw_millis = buffer![
            MILLIS_PER_DAY,                          // element with only day component
            MILLIS_PER_DAY + MILLIS_PER_SECOND,      // element with day + second components
            MILLIS_PER_DAY + MILLIS_PER_SECOND + 1,  // element with day + second + sub-second components
            MILLIS_PER_DAY + MILLIS_PER_SECOND + 5,  // element with day + second + sub-second components
            MILLIS_PER_DAY + MILLIS_PER_SECOND + 55, // element with day + second + sub-second components
        ]
        .into_array();
        let temporal_array =
            TemporalArray::new_timestamp(raw_millis, TimeUnit::Ms, Some("UTC".to_string()));
        let TemporalParts {
            days,
            seconds,
            subseconds,
        } = split_temporal(temporal_array.clone()).unwrap();
        let date_times = DateTimePartsArray::try_new(
            DType::Extension(temporal_array.ext_dtype()),
            days,
            seconds,
            subseconds,
        )
        .unwrap()
        .into_array();

        test_mask(date_times);
    }
}

encodings/datetime-parts/src/compute/mod.rs

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
1+
mod cast;
12
mod filter;
3+
mod mask;
24
mod take;
35

46
use vortex_array::array::{PrimitiveArray, TemporalArray};
57
use vortex_array::compute::{
6-
scalar_at, slice, try_cast, ComputeVTable, FilterFn, ScalarAtFn, SliceFn, TakeFn,
8+
scalar_at, slice, try_cast, CastFn, ComputeVTable, FilterFn, MaskFn, ScalarAtFn, SliceFn,
9+
TakeFn,
710
};
811
use vortex_array::validity::ArrayValidity;
912
use vortex_array::{ArrayDType, ArrayData, IntoArrayData, IntoArrayVariant};
@@ -17,10 +20,18 @@ use vortex_scalar::{PrimitiveScalar, Scalar};
1720
use crate::{DateTimePartsArray, DateTimePartsEncoding};
1821

1922
impl ComputeVTable for DateTimePartsEncoding {
23+
/// Returns this encoding as its own `CastFn` implementation.
fn cast_fn(&self) -> Option<&dyn CastFn<ArrayData>> {
24+
Some(self)
25+
}
26+
2027
fn filter_fn(&self) -> Option<&dyn FilterFn<ArrayData>> {
2128
Some(self)
2229
}
2330

31+
/// Returns this encoding as its own `MaskFn` implementation.
fn mask_fn(&self) -> Option<&dyn MaskFn<ArrayData>> {
32+
Some(self)
33+
}
34+
2435
fn scalar_at_fn(&self) -> Option<&dyn ScalarAtFn<ArrayData>> {
2536
Some(self)
2637
}

encodings/dict/Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,3 +35,7 @@ vortex-array = { workspace = true, features = ["test-harness"] }
3535
[[bench]]
3636
name = "dict_compress"
3737
harness = false
38+
39+
[[bench]]
40+
name = "dict_mask"
41+
harness = false

0 commit comments

Comments
 (0)