diff --git a/Cargo.lock b/Cargo.lock index 1243e4eed5f..5d62d4e972c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4965,6 +4965,7 @@ dependencies = [ "vortex-fsst", "vortex-io", "vortex-ipc", + "vortex-mask", "vortex-proto", "vortex-runend", "vortex-sampling-compressor", @@ -4987,6 +4988,7 @@ dependencies = [ "vortex-dtype", "vortex-error", "vortex-fastlanes", + "vortex-mask", "vortex-scalar", ] @@ -5030,6 +5032,7 @@ dependencies = [ "vortex-dtype", "vortex-error", "vortex-flatbuffers", + "vortex-mask", "vortex-scalar", ] @@ -5116,6 +5119,7 @@ dependencies = [ "vortex-datetime-dtype", "vortex-dtype", "vortex-error", + "vortex-mask", "vortex-scalar", ] @@ -5132,6 +5136,7 @@ dependencies = [ "vortex-buffer", "vortex-dtype", "vortex-error", + "vortex-mask", "vortex-scalar", ] @@ -5209,6 +5214,7 @@ dependencies = [ "vortex-buffer", "vortex-dtype", "vortex-error", + "vortex-mask", "vortex-scalar", ] @@ -5241,6 +5247,7 @@ dependencies = [ "vortex-io", "vortex-ipc", "vortex-layout", + "vortex-mask", "vortex-scalar", "vortex-scan", ] @@ -5264,6 +5271,7 @@ dependencies = [ "vortex-buffer", "vortex-dtype", "vortex-error", + "vortex-mask", "vortex-scalar", ] @@ -5282,6 +5290,7 @@ dependencies = [ "vortex-dtype", "vortex-error", "vortex-file", + "vortex-mask", "vortex-sampling-compressor", "vortex-scalar", ] @@ -5340,6 +5349,7 @@ dependencies = [ "vortex-error", "vortex-expr", "vortex-flatbuffers", + "vortex-mask", "vortex-scalar", "vortex-scan", ] @@ -5375,6 +5385,7 @@ dependencies = [ "vortex-buffer", "vortex-dtype", "vortex-error", + "vortex-mask", "vortex-scalar", ] @@ -5439,6 +5450,7 @@ dependencies = [ "vortex-dtype", "vortex-error", "vortex-expr", + "vortex-mask", ] [[package]] @@ -5450,6 +5462,7 @@ dependencies = [ "vortex-buffer", "vortex-dtype", "vortex-error", + "vortex-mask", "vortex-scalar", "zigzag", ] diff --git a/Cargo.toml b/Cargo.toml index 9445b993744..db41d84e6ae 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -156,6 +156,7 @@ vortex-fsst = { version = "0.22.1", path = "./encodings/fsst" } vortex-io = { version = "0.22.1", path = "./vortex-io" } vortex-ipc = { version = "0.22.1", path = "./vortex-ipc" } vortex-layout = { version = "0.22.1", path = "./vortex-layout" } +vortex-mask = { version = "0.22.1", path = "./vortex-mask" } vortex-proto = { version = "0.22.1", path = "./vortex-proto" } vortex-runend = { version = "0.22.1", path = "./encodings/runend" } vortex-scalar = { version = "0.22.1", path = "./vortex-scalar", default-features = false } diff --git a/encodings/alp/Cargo.toml b/encodings/alp/Cargo.toml index fd7993c50aa..f7462689693 100644 --- a/encodings/alp/Cargo.toml +++ b/encodings/alp/Cargo.toml @@ -25,6 +25,7 @@ vortex-buffer = { workspace = true } vortex-dtype = { workspace = true } vortex-error = { workspace = true } vortex-fastlanes = { workspace = true } +vortex-mask = { workspace = true } vortex-scalar = { workspace = true } [dev-dependencies] diff --git a/encodings/alp/src/alp/compute/mod.rs b/encodings/alp/src/alp/compute/mod.rs index 1a8b147c095..61290c72b5e 100644 --- a/encodings/alp/src/alp/compute/mod.rs +++ b/encodings/alp/src/alp/compute/mod.rs @@ -1,10 +1,10 @@ use vortex_array::compute::{ - filter, scalar_at, slice, take, ComputeVTable, FilterFn, FilterMask, ScalarAtFn, SliceFn, - TakeFn, + filter, scalar_at, slice, take, ComputeVTable, FilterFn, ScalarAtFn, SliceFn, TakeFn, }; use vortex_array::variants::PrimitiveArrayTrait; use vortex_array::{ArrayDType, ArrayData, IntoArrayData}; use vortex_error::VortexResult; +use vortex_mask::Mask; use vortex_scalar::Scalar; use crate::{match_each_alp_float_ptype, ALPArray, ALPEncoding, ALPFloat}; @@ -79,7 +79,7 @@ impl SliceFn for ALPEncoding { } impl FilterFn for ALPEncoding { - fn filter(&self, array: &ALPArray, mask: &FilterMask) -> VortexResult { + fn filter(&self, array: &ALPArray, mask: &Mask) -> VortexResult { let patches = array .patches() .map(|p| p.filter(mask)) diff --git a/encodings/alp/src/alp_rd/compute/filter.rs b/encodings/alp/src/alp_rd/compute/filter.rs index f0811e18017..434b548ef23 100644 --- a/encodings/alp/src/alp_rd/compute/filter.rs +++ b/encodings/alp/src/alp_rd/compute/filter.rs @@ -1,11 +1,12 @@ -use vortex_array::compute::{filter, FilterFn, FilterMask}; +use vortex_array::compute::{filter, FilterFn}; use vortex_array::{ArrayDType, ArrayData, IntoArrayData}; use vortex_error::VortexResult; +use vortex_mask::Mask; use crate::{ALPRDArray, ALPRDEncoding}; impl FilterFn for ALPRDEncoding { - fn filter(&self, array: &ALPRDArray, mask: &FilterMask) -> VortexResult { + fn filter(&self, array: &ALPRDArray, mask: &Mask) -> VortexResult { let left_parts_exceptions = array .left_parts_patches() .map(|patches| patches.filter(mask)) @@ -28,10 +29,11 @@ impl FilterFn for ALPRDEncoding { mod test { use rstest::rstest; use vortex_array::array::PrimitiveArray; - use vortex_array::compute::{filter, FilterMask}; + use vortex_array::compute::filter; use vortex_array::validity::Validity; use vortex_array::IntoArrayVariant; use vortex_buffer::buffer; + use vortex_mask::Mask; use crate::{ALPRDFloat, RDEncoder}; @@ -46,13 +48,10 @@ mod test { assert!(encoded.left_parts_patches().is_some()); // The first two values need no patching - let filtered = filter( - encoded.as_ref(), - &FilterMask::from_iter([true, false, true]), - ) - .unwrap() - .into_primitive() - .unwrap(); + let filtered = filter(encoded.as_ref(), &Mask::from_iter([true, false, true])) + .unwrap() + .into_primitive() + .unwrap(); assert_eq!(filtered.as_slice::(), &[a, outlier]); } } diff --git a/encodings/datetime-parts/Cargo.toml b/encodings/datetime-parts/Cargo.toml index f45d57e07ca..b31a1c7c785 100644 --- a/encodings/datetime-parts/Cargo.toml +++ b/encodings/datetime-parts/Cargo.toml @@ -23,6 +23,7 @@ vortex-buffer = { workspace = true } vortex-datetime-dtype = { workspace = true } vortex-dtype = { workspace = true } vortex-error = { workspace = true } +vortex-mask = { workspace = true } vortex-scalar = { workspace = true } [dev-dependencies] diff --git a/encodings/datetime-parts/src/compute/filter.rs b/encodings/datetime-parts/src/compute/filter.rs index da768fc36ab..33edbc5ca9b 100644 --- a/encodings/datetime-parts/src/compute/filter.rs +++ b/encodings/datetime-parts/src/compute/filter.rs @@ -1,11 +1,12 @@ -use vortex_array::compute::{filter, FilterFn, FilterMask}; +use vortex_array::compute::{filter, FilterFn}; use vortex_array::{ArrayDType, ArrayData, IntoArrayData}; use vortex_error::VortexResult; +use vortex_mask::Mask; use crate::{DateTimePartsArray, DateTimePartsEncoding}; impl FilterFn for DateTimePartsEncoding { - fn filter(&self, array: &DateTimePartsArray, mask: &FilterMask) -> VortexResult { + fn filter(&self, array: &DateTimePartsArray, mask: &Mask) -> VortexResult { Ok(DateTimePartsArray::try_new( array.dtype().clone(), filter(array.days().as_ref(), mask)?, diff --git a/encodings/dict/Cargo.toml b/encodings/dict/Cargo.toml index 2f7a755beed..07d8ae6b545 100644 --- a/encodings/dict/Cargo.toml +++ b/encodings/dict/Cargo.toml @@ -21,6 +21,7 @@ vortex-array = { workspace = true } vortex-buffer = { workspace = true } vortex-dtype = { workspace = true } vortex-error = { workspace = true } +vortex-mask = { workspace = true } vortex-scalar = { workspace = true } [lints] diff --git a/encodings/dict/src/compute/mod.rs b/encodings/dict/src/compute/mod.rs index 43e505feb33..cd2eac3ed63 100644 --- a/encodings/dict/src/compute/mod.rs +++ b/encodings/dict/src/compute/mod.rs @@ -3,11 +3,12 @@ mod compare; mod like; use vortex_array::compute::{ - filter, scalar_at, slice, take, BinaryNumericFn, CompareFn, ComputeVTable, FilterFn, - FilterMask, LikeFn, ScalarAtFn, SliceFn, TakeFn, + filter, scalar_at, slice, take, BinaryNumericFn, CompareFn, ComputeVTable, FilterFn, LikeFn, + ScalarAtFn, SliceFn, TakeFn, }; use vortex_array::{ArrayData, IntoArrayData}; use vortex_error::VortexResult; +use vortex_mask::Mask; use vortex_scalar::Scalar; use crate::{DictArray, DictEncoding}; @@ -60,7 +61,7 @@ impl TakeFn for DictEncoding { } impl FilterFn for DictEncoding { - fn filter(&self, array: &DictArray, mask: &FilterMask) -> VortexResult { + fn filter(&self, array: &DictArray, mask: &Mask) -> VortexResult { let codes = filter(&array.codes(), mask)?; DictArray::try_new(codes, array.values()).map(|a| a.into_array()) } diff --git a/encodings/fastlanes/Cargo.toml b/encodings/fastlanes/Cargo.toml index 2f560c0ec73..157da552169 100644 --- a/encodings/fastlanes/Cargo.toml +++ b/encodings/fastlanes/Cargo.toml @@ -28,6 +28,7 @@ vortex-array = { workspace = true } vortex-buffer = { workspace = true } vortex-dtype = { workspace = true } vortex-error = { workspace = true } +vortex-mask = { workspace = true } vortex-scalar = { workspace = true } [dev-dependencies] diff --git a/encodings/fastlanes/src/bitpacking/compute/filter.rs b/encodings/fastlanes/src/bitpacking/compute/filter.rs index 5b9821d2a05..587f8fa307b 100644 --- a/encodings/fastlanes/src/bitpacking/compute/filter.rs +++ b/encodings/fastlanes/src/bitpacking/compute/filter.rs @@ -1,19 +1,20 @@ use arrow_buffer::ArrowNativeType; use fastlanes::BitPacking; use vortex_array::array::PrimitiveArray; -use vortex_array::compute::{filter, FilterFn, FilterIter, FilterMask}; +use vortex_array::compute::{filter, FilterFn}; use vortex_array::variants::PrimitiveArrayTrait; use vortex_array::{ArrayData, IntoArrayData, IntoArrayVariant}; use vortex_buffer::{Buffer, BufferMut}; use vortex_dtype::{match_each_unsigned_integer_ptype, NativePType}; use vortex_error::VortexResult; +use vortex_mask::{Mask, MaskIter}; use super::chunked_indices; use crate::bitpacking::compute::take::UNPACK_CHUNK_THRESHOLD; use crate::{BitPackedArray, BitPackedEncoding}; impl FilterFn for BitPackedEncoding { - fn filter(&self, array: &BitPackedArray, mask: &FilterMask) -> VortexResult { + fn filter(&self, array: &BitPackedArray, mask: &Mask) -> VortexResult { let primitive = match_each_unsigned_integer_ptype!(array.ptype().to_unsigned(), |$I| { filter_primitive::<$I>(array, mask) }); @@ -31,7 +32,7 @@ impl FilterFn for BitPackedEncoding { /// dictates the final `PType` of the result. fn filter_primitive( array: &BitPackedArray, - mask: &FilterMask, + mask: &Mask, ) -> VortexResult { let validity = array.validity().filter(mask)?; @@ -48,12 +49,10 @@ fn filter_primitive( } let values: Buffer = match mask.iter() { - FilterIter::Indices(indices) => { + MaskIter::Indices(indices) => { filter_indices(array, mask.true_count(), indices.iter().copied()) } - FilterIter::Slices(slices) => { - filter_slices(array, mask.true_count(), slices.iter().copied()) - } + MaskIter::Slices(slices) => filter_slices(array, mask.true_count(), slices.iter().copied()), }; let mut values = PrimitiveArray::new(values, validity).reinterpret_cast(array.ptype()); @@ -128,10 +127,11 @@ fn filter_slices( #[cfg(test)] mod test { use vortex_array::array::PrimitiveArray; - use vortex_array::compute::{filter, slice, FilterMask}; + use vortex_array::compute::{filter, slice}; use vortex_array::validity::Validity; use vortex_array::{ArrayLen, IntoArrayVariant}; use vortex_buffer::Buffer; + use vortex_mask::Mask; use crate::BitPackedArray; @@ -141,7 +141,7 @@ mod test { let unpacked = PrimitiveArray::from_iter((0..4096).map(|i| (i % 63) as u8)); let bitpacked = BitPackedArray::encode(unpacked.as_ref(), 6).unwrap(); - let mask = FilterMask::from_indices(bitpacked.len(), vec![0, 125, 2047, 2049, 2151, 2790]); + let mask = Mask::from_indices(bitpacked.len(), vec![0, 125, 2047, 2049, 2151, 2790]); let primitive_result = filter(bitpacked.as_ref(), &mask) .unwrap() @@ -158,7 +158,7 @@ mod test { let bitpacked = BitPackedArray::encode(unpacked.as_ref(), 6).unwrap(); let sliced = slice(bitpacked.as_ref(), 128, 2050).unwrap(); - let mask = FilterMask::from_indices(sliced.len(), vec![1919, 1921]); + let mask = Mask::from_indices(sliced.len(), vec![1919, 1921]); let primitive_result = filter(&sliced, &mask).unwrap().into_primitive().unwrap(); let res_bytes = primitive_result.as_slice::(); @@ -171,7 +171,7 @@ mod test { let bitpacked = BitPackedArray::encode(unpacked.as_ref(), 6).unwrap(); let filtered = filter( bitpacked.as_ref(), - &FilterMask::from_indices(4096, (0..1024).collect()), + &Mask::from_indices(4096, (0..1024).collect()), ) .unwrap(); assert_eq!( @@ -187,7 +187,7 @@ mod test { let bitpacked = BitPackedArray::encode(unpacked.as_ref(), 9).unwrap(); let filtered = filter( bitpacked.as_ref(), - &FilterMask::from_indices(values.len(), (0..250).collect()), + &Mask::from_indices(values.len(), (0..250).collect()), ) .unwrap() .into_primitive() diff --git a/encodings/fastlanes/src/for/compute/mod.rs b/encodings/fastlanes/src/for/compute/mod.rs index 1294069d474..b5f28fb1b47 100644 --- a/encodings/fastlanes/src/for/compute/mod.rs +++ b/encodings/fastlanes/src/for/compute/mod.rs @@ -4,13 +4,14 @@ use std::ops::AddAssign; use num_traits::{CheckedShl, CheckedShr, WrappingAdd, WrappingSub}; use vortex_array::compute::{ - filter, scalar_at, search_sorted, slice, take, CompareFn, ComputeVTable, FilterFn, FilterMask, - ScalarAtFn, SearchResult, SearchSortedFn, SearchSortedSide, SliceFn, TakeFn, + filter, scalar_at, search_sorted, slice, take, CompareFn, ComputeVTable, FilterFn, ScalarAtFn, + SearchResult, SearchSortedFn, SearchSortedSide, SliceFn, TakeFn, }; use vortex_array::variants::PrimitiveArrayTrait; use vortex_array::{ArrayDType, ArrayData, IntoArrayData}; use vortex_dtype::{match_each_integer_ptype, NativePType}; use vortex_error::{VortexError, VortexExpect as _, VortexResult}; +use vortex_mask::Mask; use vortex_scalar::{PValue, Scalar}; use crate::{FoRArray, FoREncoding}; @@ -53,7 +54,7 @@ impl TakeFn for FoREncoding { } impl FilterFn for FoREncoding { - fn filter(&self, array: &FoRArray, mask: &FilterMask) -> VortexResult { + fn filter(&self, array: &FoRArray, mask: &Mask) -> VortexResult { FoRArray::try_new( filter(&array.encoded(), mask)?, array.reference_scalar(), diff --git a/encodings/fsst/Cargo.toml b/encodings/fsst/Cargo.toml index 7e4d9e113de..70f3e8b072b 100644 --- a/encodings/fsst/Cargo.toml +++ b/encodings/fsst/Cargo.toml @@ -25,6 +25,7 @@ vortex-array = { workspace = true } vortex-buffer = { workspace = true } vortex-dtype = { workspace = true } vortex-error = { workspace = true } +vortex-mask = { workspace = true } vortex-scalar = { workspace = true } [dev-dependencies] diff --git a/encodings/fsst/src/compute/mod.rs b/encodings/fsst/src/compute/mod.rs index 0ce03f932ea..ad6cb3cd955 100644 --- a/encodings/fsst/src/compute/mod.rs +++ b/encodings/fsst/src/compute/mod.rs @@ -2,12 +2,12 @@ mod compare; use vortex_array::array::varbin_scalar; use vortex_array::compute::{ - filter, scalar_at, slice, take, CompareFn, ComputeVTable, FilterFn, FilterMask, ScalarAtFn, - SliceFn, TakeFn, + filter, scalar_at, slice, take, CompareFn, ComputeVTable, FilterFn, ScalarAtFn, SliceFn, TakeFn, }; use vortex_array::{ArrayDType, ArrayData, IntoArrayData}; use vortex_buffer::ByteBuffer; use vortex_error::{vortex_err, VortexResult}; +use vortex_mask::Mask; use vortex_scalar::Scalar; use crate::{FSSTArray, FSSTEncoding}; @@ -80,7 +80,7 @@ impl ScalarAtFn for FSSTEncoding { impl FilterFn for FSSTEncoding { // Filtering an FSSTArray filters the codes array, leaving the symbols array untouched - fn filter(&self, array: &FSSTArray, mask: &FilterMask) -> VortexResult { + fn filter(&self, array: &FSSTArray, mask: &Mask) -> VortexResult { Ok(FSSTArray::try_new( array.dtype().clone(), array.symbols(), diff --git a/encodings/fsst/tests/fsst_tests.rs b/encodings/fsst/tests/fsst_tests.rs index 3f00e6b1cc7..9a568c89c3a 100644 --- a/encodings/fsst/tests/fsst_tests.rs +++ b/encodings/fsst/tests/fsst_tests.rs @@ -1,12 +1,13 @@ #![cfg(test)] use vortex_array::array::builder::VarBinBuilder; -use vortex_array::compute::{filter, scalar_at, slice, take, FilterMask}; +use vortex_array::compute::{filter, scalar_at, slice, take}; use vortex_array::encoding::Encoding; use vortex_array::{ArrayData, IntoArrayData, IntoCanonical}; use vortex_buffer::buffer; use vortex_dtype::{DType, Nullability}; use vortex_fsst::{fsst_compress, fsst_train_compressor, FSSTEncoding}; +use vortex_mask::Mask; macro_rules! assert_nth_scalar { ($arr:expr, $n:expr, $expected:expr) => { @@ -84,7 +85,7 @@ fn test_fsst_array_ops() { ); // test filter - let mask = FilterMask::from_iter([false, true, false]); + let mask = Mask::from_iter([false, true, false]); let fsst_filtered = filter(&fsst_array, &mask).unwrap(); assert_eq!(fsst_filtered.encoding().id(), FSSTEncoding::ID); diff --git a/encodings/runend/Cargo.toml b/encodings/runend/Cargo.toml index e08d80cfba4..209b0659bcc 100644 --- a/encodings/runend/Cargo.toml +++ b/encodings/runend/Cargo.toml @@ -22,6 +22,7 @@ vortex-array = { workspace = true } vortex-buffer = { workspace = true } vortex-dtype = { workspace = true } vortex-error = { workspace = true } +vortex-mask = { workspace = true } vortex-scalar = { workspace = true } diff --git a/encodings/runend/benches/run_end_filter.rs b/encodings/runend/benches/run_end_filter.rs index 90aef8bf184..5914861a3fc 100644 --- a/encodings/runend/benches/run_end_filter.rs +++ b/encodings/runend/benches/run_end_filter.rs @@ -4,9 +4,9 @@ use std::iter::Iterator; use criterion::{black_box, criterion_group, criterion_main, Criterion}; use num_traits::ToPrimitive; -use vortex_array::compute::FilterMask; use vortex_array::{ArrayLen, IntoArrayData}; use vortex_buffer::Buffer; +use vortex_mask::Mask; use vortex_runend::RunEndArray; use vortex_runend::_benchmarking::{filter_run_end, take_indices_unchecked}; @@ -35,7 +35,7 @@ fn evenly_spaced(c: &mut Criterion) { let array = RunEndArray::try_new(ends, values).unwrap(); for filter_density in [0.001, 0.01, 0.015, 0.020, 0.025, 0.030] { - let mask = FilterMask::from_indices( + let mask = Mask::from_indices( array.len(), // In this case, the benchmarks don't seem to change whether we evenly spread // the mask values or like here we pack them into the beginning of the mask. diff --git a/encodings/runend/src/compute/filter.rs b/encodings/runend/src/compute/filter.rs index 7f8039829fd..2c3da188fef 100644 --- a/encodings/runend/src/compute/filter.rs +++ b/encodings/runend/src/compute/filter.rs @@ -4,13 +4,14 @@ use std::ops::AddAssign; use arrow_buffer::BooleanBuffer; use num_traits::AsPrimitive; use vortex_array::array::PrimitiveArray; -use vortex_array::compute::{filter, FilterFn, FilterMask}; +use vortex_array::compute::{filter, FilterFn}; use vortex_array::validity::Validity; use vortex_array::variants::PrimitiveArrayTrait; use vortex_array::{ArrayData, ArrayLen, IntoArrayData, IntoArrayVariant}; use vortex_buffer::buffer_mut; use vortex_dtype::{match_each_unsigned_integer_ptype, NativePType}; use vortex_error::{VortexResult, VortexUnwrap}; +use vortex_mask::Mask; use crate::compute::take::take_indices_unchecked; use crate::{RunEndArray, RunEndEncoding}; @@ -18,7 +19,7 @@ use crate::{RunEndArray, RunEndEncoding}; const FILTER_TAKE_THRESHOLD: f64 = 0.1; impl FilterFn for RunEndEncoding { - fn filter(&self, array: &RunEndArray, mask: &FilterMask) -> VortexResult { + fn filter(&self, array: &RunEndArray, mask: &Mask) -> VortexResult { let runs_ratio = mask.true_count() as f64 / array.ends().len() as f64; if runs_ratio < FILTER_TAKE_THRESHOLD || mask.true_count() < 25 { @@ -39,7 +40,7 @@ impl FilterFn for RunEndEncoding { } // We expose this function to our benchmarks. -pub fn filter_run_end(array: &RunEndArray, mask: &FilterMask) -> VortexResult { +pub fn filter_run_end(array: &RunEndArray, mask: &Mask) -> VortexResult { let primitive_run_ends = array.ends().into_primitive()?; let (run_ends, values_mask) = match_each_unsigned_integer_ptype!(primitive_run_ends.ptype(), |$P| { filter_run_end_primitive(primitive_run_ends.as_slice::<$P>(), array.offset() as u64, array.len() as u64, mask)? @@ -54,8 +55,8 @@ fn filter_run_end_primitive + AsPrimitiv run_ends: &[R], offset: u64, length: u64, - mask: &FilterMask, -) -> VortexResult<(PrimitiveArray, FilterMask)> { + mask: &Mask, +) -> VortexResult<(PrimitiveArray, Mask)> { let mut new_run_ends = buffer_mut![R::zero(); run_ends.len()]; let mut start = 0u64; @@ -63,7 +64,7 @@ fn filter_run_end_primitive + AsPrimitiv let mut count = R::zero(); let filter_values = mask.boolean_buffer(); - let new_mask: FilterMask = BooleanBuffer::collect_bool(run_ends.len(), |i| { + let new_mask: Mask = BooleanBuffer::collect_bool(run_ends.len(), |i| { let mut keep = false; let end = min(run_ends[i].as_() - offset, length); @@ -93,8 +94,9 @@ fn filter_run_end_primitive + AsPrimitiv #[cfg(test)] mod tests { use vortex_array::array::PrimitiveArray; - use vortex_array::compute::{slice, FilterMask}; + use vortex_array::compute::slice; use vortex_array::{IntoArrayVariant, ToArrayData}; + use vortex_mask::Mask; use super::filter_run_end; use crate::RunEndArray; @@ -111,7 +113,7 @@ mod tests { let arr = ree_array(); let filtered = filter_run_end( &arr, - &FilterMask::from_iter([ + &Mask::from_iter([ true, true, false, false, false, false, false, false, false, false, true, true, ]), ) @@ -141,7 +143,7 @@ mod tests { let arr = slice(ree_array(), 2, 7).unwrap(); let filtered = filter_run_end( &RunEndArray::maybe_from(arr).unwrap(), - &FilterMask::from_iter([true, false, false, true, true]), + &Mask::from_iter([true, false, false, true, true]), ) .unwrap(); let filtered_run_end = RunEndArray::try_from(filtered).unwrap(); diff --git a/encodings/zigzag/Cargo.toml b/encodings/zigzag/Cargo.toml index a35f671a85f..8f250c87ba7 100644 --- a/encodings/zigzag/Cargo.toml +++ b/encodings/zigzag/Cargo.toml @@ -19,6 +19,7 @@ vortex-array = { workspace = true } vortex-buffer = { workspace = true } vortex-dtype = { workspace = true } vortex-error = { workspace = true } +vortex-mask = { workspace = true } vortex-scalar = { workspace = true } zigzag = { workspace = true } diff --git a/encodings/zigzag/src/compute.rs b/encodings/zigzag/src/compute.rs index 9b2e914c7db..4953a2f1b94 100644 --- a/encodings/zigzag/src/compute.rs +++ b/encodings/zigzag/src/compute.rs @@ -1,11 +1,11 @@ use vortex_array::compute::{ - filter, scalar_at, slice, take, ComputeVTable, FilterFn, FilterMask, ScalarAtFn, SliceFn, - TakeFn, + filter, scalar_at, slice, take, ComputeVTable, FilterFn, ScalarAtFn, SliceFn, TakeFn, }; use vortex_array::variants::PrimitiveArrayTrait; use vortex_array::{ArrayDType, ArrayData, IntoArrayData}; use vortex_dtype::match_each_unsigned_integer_ptype; use vortex_error::{vortex_err, VortexResult}; +use vortex_mask::Mask; use vortex_scalar::{PrimitiveScalar, Scalar}; use zigzag::{ZigZag as ExternalZigZag, ZigZag}; @@ -30,7 +30,7 @@ impl ComputeVTable for ZigZagEncoding { } impl FilterFn for ZigZagEncoding { - fn filter(&self, array: &ZigZagArray, mask: &FilterMask) -> VortexResult { + fn filter(&self, array: &ZigZagArray, mask: &Mask) -> VortexResult { let encoded = filter(&array.encoded(), mask)?; Ok(ZigZagArray::try_new(encoded)?.into_array()) } diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml index e0f8996c589..b28f5ba9ea4 100644 --- a/fuzz/Cargo.toml +++ b/fuzz/Cargo.toml @@ -29,6 +29,7 @@ vortex-buffer = { workspace = true } vortex-dtype = { workspace = true, features = ["arbitrary"] } vortex-error = { workspace = true } vortex-file = { workspace = true } +vortex-mask = { workspace = true } vortex-sampling-compressor = { workspace = true, features = ["arbitrary"] } vortex-scalar = { workspace = true, features = ["arbitrary"] } diff --git a/fuzz/src/lib.rs b/fuzz/src/lib.rs index 9c93502a375..a05ae04cc42 100644 --- a/fuzz/src/lib.rs +++ b/fuzz/src/lib.rs @@ -13,10 +13,11 @@ use libfuzzer_sys::arbitrary::{Arbitrary, Result, Unstructured}; pub use sort::sort_canonical_array; use vortex_array::aliases::hash_set::HashSet; use vortex_array::array::ListEncoding; -use vortex_array::compute::{scalar_at, FilterMask, SearchResult, SearchSortedSide}; +use vortex_array::compute::{scalar_at, SearchResult, SearchSortedSide}; use vortex_array::encoding::{Encoding, EncodingRef}; use vortex_array::{ArrayDType, ArrayData, IntoArrayData}; use vortex_buffer::Buffer; +use vortex_mask::Mask; use vortex_sampling_compressor::SamplingCompressor; use vortex_scalar::arbitrary::random_scalar; use vortex_scalar::Scalar; @@ -60,7 +61,7 @@ pub enum Action { Slice(Range), Take(ArrayData), SearchSorted(Scalar, SearchSortedSide), - Filter(FilterMask), + Filter(Mask), } impl<'a> Arbitrary<'a> for FuzzArrayAction { @@ -148,7 +149,7 @@ impl<'a> Arbitrary<'a> for FuzzArrayAction { .collect::>>()?; current_array = filter_canonical_array(¤t_array, &mask); ( - Action::Filter(FilterMask::from_iter(mask)), + Action::Filter(Mask::from_iter(mask)), ExpectedValue::Array(current_array.clone()), ) } diff --git a/pyvortex/src/array.rs b/pyvortex/src/array.rs index 43f88bb610c..de8063d4e97 100644 --- a/pyvortex/src/array.rs +++ b/pyvortex/src/array.rs @@ -4,7 +4,8 @@ use pyo3::exceptions::PyValueError; use pyo3::prelude::*; use pyo3::types::{IntoPyDict, PyInt, PyList}; use vortex::array::ChunkedArray; -use vortex::compute::{compare, fill_forward, scalar_at, slice, take, FilterMask, Operator}; +use vortex::compute::{compare, fill_forward, scalar_at, slice, take, Operator}; +use vortex::mask::Mask; use vortex::{ArrayDType, ArrayData, IntoCanonical}; use crate::dtype::PyDType; @@ -262,8 +263,7 @@ impl PyArray { /// ] fn filter(&self, filter: &Bound) -> PyResult { let filter = filter.borrow(); - let inner = - vortex::compute::filter(&self.inner, &FilterMask::try_from(filter.inner.clone())?)?; + let inner = vortex::compute::filter(&self.inner, &Mask::try_from(filter.inner.clone())?)?; Ok(PyArray { inner }) } diff --git a/vortex-array/Cargo.toml b/vortex-array/Cargo.toml index c0766394cd9..9e0f06a9fda 100644 --- a/vortex-array/Cargo.toml +++ b/vortex-array/Cargo.toml @@ -54,6 +54,7 @@ vortex-datetime-dtype = { workspace = true } vortex-dtype = { workspace = true, features = ["rkyv", "serde"] } vortex-error = { workspace = true, features = ["flatbuffers", "flexbuffers", "rancor"] } vortex-flatbuffers = { workspace = true, features = ["array"] } +vortex-mask = { workspace = true } vortex-scalar = { workspace = true, features = [ "flatbuffers", "serde", diff --git a/vortex-array/src/array/bool/compute/filter.rs b/vortex-array/src/array/bool/compute/filter.rs index 128fd122f89..f3955cf2ea3 100644 --- a/vortex-array/src/array/bool/compute/filter.rs +++ b/vortex-array/src/array/bool/compute/filter.rs @@ -1,17 +1,18 @@ use arrow_buffer::{bit_util, BooleanBuffer, BooleanBufferBuilder}; use vortex_error::{VortexExpect, VortexResult}; +use vortex_mask::{Mask, MaskIter}; use crate::array::{BoolArray, BoolEncoding}; -use crate::compute::{FilterFn, FilterIter, FilterMask}; +use crate::compute::FilterFn; use crate::{ArrayData, IntoArrayData}; impl FilterFn for BoolEncoding { - fn filter(&self, array: &BoolArray, mask: &FilterMask) -> VortexResult { + fn filter(&self, array: &BoolArray, mask: &Mask) -> VortexResult { let validity = array.validity().filter(mask)?; let buffer = match mask.iter() { - FilterIter::Indices(indices) => filter_indices_slice(&array.boolean_buffer(), indices), - FilterIter::Slices(slices) => filter_slices( + MaskIter::Indices(indices) => filter_indices_slice(&array.boolean_buffer(), indices), + MaskIter::Slices(slices) => filter_slices( &array.boolean_buffer(), mask.true_count(), slices.iter().copied(), @@ -67,16 +68,17 @@ pub fn filter_slices( #[cfg(test)] mod test { use itertools::Itertools; + use vortex_mask::Mask; use crate::array::bool::compute::filter::{filter_indices, filter_slices}; use crate::array::BoolArray; - use crate::compute::{filter, FilterMask}; + use crate::compute::filter; use crate::{ArrayLen, IntoArrayData, IntoArrayVariant}; #[test] fn filter_bool_test() { let arr = BoolArray::from_iter([true, true, false]); - let mask = FilterMask::from_iter([true, false, true]); + let mask = Mask::from_iter([true, false, true]); let filtered = filter(&arr.into_array(), &mask) .unwrap() diff --git a/vortex-array/src/array/chunked/compute/filter.rs b/vortex-array/src/array/chunked/compute/filter.rs index fc29caf8252..ef488d017fc 100644 --- a/vortex-array/src/array/chunked/compute/filter.rs +++ b/vortex-array/src/array/chunked/compute/filter.rs @@ -1,8 +1,9 @@ use vortex_buffer::BufferMut; use vortex_error::{VortexExpect, VortexResult, VortexUnwrap}; +use vortex_mask::Mask; use crate::array::{ChunkedArray, ChunkedEncoding, PrimitiveArray}; -use crate::compute::{filter, take, FilterFn, FilterMask, SearchSorted, SearchSortedSide}; +use crate::compute::{filter, take, FilterFn, SearchSorted, SearchSortedSide}; use crate::validity::Validity; use crate::{ArrayDType, ArrayData, ArrayLen, IntoArrayData, IntoCanonical}; @@ -10,7 +11,7 @@ use crate::{ArrayDType, ArrayData, ArrayLen, IntoArrayData, IntoCanonical}; const FILTER_SLICES_SELECTIVITY_THRESHOLD: f64 = 0.8; impl FilterFn for ChunkedEncoding { - fn filter(&self, array: &ChunkedArray, mask: &FilterMask) -> VortexResult { + fn filter(&self, array: &ChunkedArray, mask: &Mask) -> VortexResult { let selected = mask.true_count(); // Based on filter selectivity, we take the values between a range of slices, or @@ -38,7 +39,7 @@ enum ChunkFilter { } /// Filter the chunks using slice ranges. -fn filter_slices(array: &ChunkedArray, mask: &FilterMask) -> VortexResult> { +fn filter_slices(array: &ChunkedArray, mask: &Mask) -> VortexResult> { let mut result = Vec::with_capacity(array.nchunks()); // Pre-materialize the chunk ends for performance. @@ -104,10 +105,7 @@ fn filter_slices(array: &ChunkedArray, mask: &FilterMask) -> VortexResult {} // Slices => turn the slices into a boolean buffer. ChunkFilter::Slices(slices) => { - result.push(filter( - &chunk, - &FilterMask::from_slices(chunk.len(), slices), - )?); + result.push(filter(&chunk, &Mask::from_slices(chunk.len(), slices))?); } } } @@ -117,7 +115,7 @@ fn filter_slices(array: &ChunkedArray, mask: &FilterMask) -> VortexResult VortexResult> { +fn filter_indices(array: &ChunkedArray, mask: &Mask) -> VortexResult> { let mut result = Vec::with_capacity(array.nchunks()); let mut current_chunk_id = 0; let mut chunk_indices = BufferMut::with_capacity(array.nchunks()); @@ -182,9 +180,10 @@ fn find_chunk_idx(idx: usize, chunk_ends: &[u64]) -> (usize, usize) { mod test { use vortex_dtype::half::f16; use vortex_dtype::{DType, Nullability, PType}; + use vortex_mask::Mask; use crate::array::{ChunkedArray, PrimitiveArray}; - use crate::compute::{filter, FilterMask}; + use crate::compute::filter; use crate::IntoArrayData; #[test] @@ -213,7 +212,7 @@ mod test { ) .unwrap() .into_array(); - let mask = FilterMask::from_iter([ + let mask = Mask::from_iter([ true, false, false, true, true, true, true, true, true, true, true, ]); let filtered = filter(&chunked, &mask).unwrap(); diff --git a/vortex-array/src/array/constant/compute/mod.rs b/vortex-array/src/array/constant/compute/mod.rs index a5ba0b84853..2df732397b6 100644 --- a/vortex-array/src/array/constant/compute/mod.rs +++ b/vortex-array/src/array/constant/compute/mod.rs @@ -5,13 +5,14 @@ mod invert; mod search_sorted; use vortex_error::VortexResult; +use vortex_mask::Mask; use vortex_scalar::Scalar; use crate::array::constant::ConstantArray; use crate::array::ConstantEncoding; use crate::compute::{ - BinaryBooleanFn, BinaryNumericFn, CompareFn, ComputeVTable, FilterFn, FilterMask, InvertFn, - ScalarAtFn, SearchSortedFn, SliceFn, TakeFn, + BinaryBooleanFn, BinaryNumericFn, CompareFn, ComputeVTable, FilterFn, InvertFn, ScalarAtFn, + SearchSortedFn, SliceFn, TakeFn, }; use crate::{ArrayData, IntoArrayData}; @@ -72,7 +73,7 @@ impl SliceFn for ConstantEncoding { } impl FilterFn for ConstantEncoding { - fn filter(&self, array: &ConstantArray, mask: &FilterMask) -> VortexResult { + fn filter(&self, array: &ConstantArray, mask: &Mask) -> VortexResult { Ok(ConstantArray::new(array.scalar(), mask.true_count()).into_array()) } } diff --git a/vortex-array/src/array/list/mod.rs b/vortex-array/src/array/list/mod.rs index eeefda6b108..b6b3c4da2d3 100644 --- a/vortex-array/src/array/list/mod.rs +++ b/vortex-array/src/array/list/mod.rs @@ -233,11 +233,12 @@ mod test { use vortex_dtype::Nullability; use vortex_dtype::Nullability::NonNullable; use vortex_dtype::PType::I32; + use vortex_mask::Mask; use vortex_scalar::Scalar; use crate::array::list::ListArray; use crate::array::PrimitiveArray; - use crate::compute::{filter, scalar_at, FilterMask}; + use crate::compute::{filter, scalar_at}; use crate::validity::Validity; use crate::{ArrayLen, IntoArrayData}; @@ -319,7 +320,7 @@ mod test { let filtered = filter( &list, - &FilterMask::from(BooleanBuffer::from(vec![false, true, true])), + &Mask::from(BooleanBuffer::from(vec![false, true, true])), ); assert!(filtered.is_ok()) diff --git a/vortex-array/src/array/primitive/compute/filter.rs b/vortex-array/src/array/primitive/compute/filter.rs index 0ed45f85c8f..7392e02201b 100644 --- a/vortex-array/src/array/primitive/compute/filter.rs +++ b/vortex-array/src/array/primitive/compute/filter.rs @@ -1,20 +1,21 @@ use vortex_buffer::{Buffer, BufferMut}; use vortex_dtype::match_each_native_ptype; use vortex_error::VortexResult; +use vortex_mask::{Mask, MaskIter}; use crate::array::primitive::PrimitiveArray; use crate::array::PrimitiveEncoding; -use crate::compute::{FilterFn, FilterIter, FilterMask}; +use crate::compute::FilterFn; use crate::variants::PrimitiveArrayTrait; use crate::{ArrayData, IntoArrayData}; impl FilterFn for PrimitiveEncoding { - fn filter(&self, array: &PrimitiveArray, mask: &FilterMask) -> VortexResult { + fn filter(&self, array: &PrimitiveArray, mask: &Mask) -> VortexResult { let validity = array.validity().filter(mask)?; match_each_native_ptype!(array.ptype(), |$T| { let values = match mask.iter() { - FilterIter::Indices(indices) => filter_primitive_indices(array.as_slice::<$T>(), indices.iter().copied()), - FilterIter::Slices(slices) => filter_primitive_slices(array.as_slice::<$T>(), mask.true_count(), slices.iter().copied()), + MaskIter::Indices(indices) => filter_primitive_indices(array.as_slice::<$T>(), indices.iter().copied()), + MaskIter::Slices(slices) => filter_primitive_slices(array.as_slice::<$T>(), mask.true_count(), slices.iter().copied()), }; Ok(PrimitiveArray::new(values, validity).into_array()) }) @@ -45,9 +46,10 @@ fn filter_primitive_slices( #[cfg(test)] mod test { use itertools::Itertools; + use vortex_mask::Mask; use crate::array::primitive::PrimitiveArray; - use crate::compute::{filter, FilterMask}; + use crate::compute::filter; use crate::{ArrayLen, IntoArrayVariant, ToArrayData}; #[test] @@ -55,7 +57,7 @@ mod test { let mask = [true, true, false, true, true, true, false, true]; let arr = PrimitiveArray::from_iter([1u32, 24, 54, 2, 3, 2, 3, 2]); - let filtered = filter(&arr.to_array(), &FilterMask::from_iter(mask)) + let filtered = filter(&arr.to_array(), &Mask::from_iter(mask)) .unwrap() .into_primitive() .unwrap(); diff --git a/vortex-array/src/array/sparse/compute/mod.rs b/vortex-array/src/array/sparse/compute/mod.rs index 433cdd60621..8695e1bd7b8 100644 --- a/vortex-array/src/array/sparse/compute/mod.rs +++ b/vortex-array/src/array/sparse/compute/mod.rs @@ -1,11 +1,12 @@ use vortex_error::VortexResult; +use vortex_mask::Mask; use vortex_scalar::Scalar; use crate::array::sparse::SparseArray; use crate::array::{ConstantArray, SparseEncoding}; use crate::compute::{ - BinaryNumericFn, ComputeVTable, FilterFn, FilterMask, InvertFn, ScalarAtFn, SearchResult, - SearchSortedFn, SearchSortedSide, SearchSortedUsizeFn, SliceFn, TakeFn, + BinaryNumericFn, ComputeVTable, FilterFn, InvertFn, ScalarAtFn, SearchResult, SearchSortedFn, + SearchSortedSide, SearchSortedUsizeFn, SliceFn, TakeFn, }; use crate::{ArrayDType, ArrayData, ArrayLen, IntoArrayData}; @@ -89,7 +90,7 @@ impl SearchSortedUsizeFn for SparseEncoding { } impl FilterFn for SparseEncoding { - fn filter(&self, array: &SparseArray, mask: &FilterMask) -> VortexResult { + fn filter(&self, array: &SparseArray, mask: &Mask) -> VortexResult { let new_length = mask.true_count(); let Some(new_patches) = array.resolved_patches()?.filter(mask)? else { @@ -105,14 +106,13 @@ impl FilterFn for SparseEncoding { mod test { use rstest::{fixture, rstest}; use vortex_buffer::buffer; + use vortex_mask::Mask; use vortex_scalar::Scalar; use crate::array::primitive::PrimitiveArray; use crate::array::sparse::SparseArray; use crate::compute::test_harness::test_binary_numeric; - use crate::compute::{ - filter, search_sorted, slice, FilterMask, SearchResult, SearchSortedSide, - }; + use crate::compute::{filter, search_sorted, slice, SearchResult, SearchSortedSide}; use crate::validity::Validity; use crate::{ArrayData, ArrayLen, IntoArrayData, IntoArrayVariant}; @@ -186,7 +186,7 @@ mod test { fn test_filter(array: ArrayData) { let mut predicate = vec![false, false, true]; predicate.extend_from_slice(&[false; 17]); - let mask = FilterMask::from_iter(predicate); + let mask = Mask::from_iter(predicate); let filtered_array = filter(&array, &mask).unwrap(); let filtered_array = SparseArray::try_from(filtered_array).unwrap(); @@ -198,7 +198,7 @@ mod test { #[test] fn true_fill_value() { - let mask = FilterMask::from_iter([false, true, false, true, false, true, true]); + let mask = Mask::from_iter([false, true, false, true, false, true, true]); let array = SparseArray::try_new( buffer![0_u64, 3, 6].into_array(), PrimitiveArray::new(buffer![33_i32, 44, 55], Validity::AllValid).into_array(), diff --git a/vortex-array/src/array/struct_/compute.rs b/vortex-array/src/array/struct_/compute.rs index d95788d60e2..f7407071303 100644 --- a/vortex-array/src/array/struct_/compute.rs +++ b/vortex-array/src/array/struct_/compute.rs @@ -1,12 +1,12 @@ use itertools::Itertools; use vortex_error::VortexResult; +use vortex_mask::Mask; use vortex_scalar::Scalar; use crate::array::struct_::StructArray; use crate::array::StructEncoding; use crate::compute::{ - filter, scalar_at, slice, take, ComputeVTable, FilterFn, FilterMask, ScalarAtFn, SliceFn, - TakeFn, + filter, scalar_at, slice, take, ComputeVTable, FilterFn, ScalarAtFn, SliceFn, TakeFn, }; use crate::variants::StructArrayTrait; use crate::{ArrayDType, ArrayData, IntoArrayData}; @@ -73,7 +73,7 @@ impl SliceFn for StructEncoding { } impl FilterFn for StructEncoding { - fn filter(&self, array: &StructArray, mask: &FilterMask) -> VortexResult { + fn filter(&self, array: &StructArray, mask: &Mask) -> VortexResult { let validity = array.validity().filter(mask)?; let fields: Vec = array @@ -92,8 +92,10 @@ impl FilterFn for StructEncoding { #[cfg(test)] mod tests { + use vortex_mask::Mask; + use crate::array::StructArray; - use crate::compute::{filter, FilterMask}; + use crate::compute::filter; use crate::validity::Validity; #[test] @@ -103,7 +105,7 @@ mod tests { let mask = vec![ false, true, false, true, false, true, false, true, false, true, ]; - let filtered = filter(struct_arr.as_ref(), &FilterMask::from_iter(mask)).unwrap(); + let filtered = filter(struct_arr.as_ref(), &Mask::from_iter(mask)).unwrap(); assert_eq!(filtered.len(), 5); } @@ -111,8 +113,7 @@ mod tests { fn filter_empty_struct_with_empty_filter() { let struct_arr = StructArray::try_new(vec![].into(), vec![], 0, Validity::NonNullable).unwrap(); - let filtered = - filter(struct_arr.as_ref(), &FilterMask::from_iter::<[bool; 0]>([])).unwrap(); + let filtered = filter(struct_arr.as_ref(), &Mask::from_iter::<[bool; 0]>([])).unwrap(); assert_eq!(filtered.len(), 0); } } diff --git a/vortex-array/src/array/varbin/compute/filter.rs b/vortex-array/src/array/varbin/compute/filter.rs index 671aa075c89..467caab04c3 100644 --- a/vortex-array/src/array/varbin/compute/filter.rs +++ b/vortex-array/src/array/varbin/compute/filter.rs @@ -2,22 +2,23 @@ use itertools::Itertools; use num_traits::{AsPrimitive, PrimInt, Zero}; use vortex_dtype::{match_each_integer_ptype, DType, NativePType}; use vortex_error::{vortex_err, vortex_panic, VortexResult}; +use vortex_mask::Mask; use crate::array::varbin::builder::VarBinBuilder; use crate::array::varbin::VarBinArray; use crate::array::VarBinEncoding; -use crate::compute::{FilterFn, FilterMask}; +use crate::compute::FilterFn; use crate::validity::Validity; use crate::variants::PrimitiveArrayTrait; use crate::{ArrayDType, ArrayData, IntoArrayData, IntoArrayVariant}; impl FilterFn for VarBinEncoding { - fn filter(&self, array: &VarBinArray, mask: &FilterMask) -> VortexResult { + fn filter(&self, array: &VarBinArray, mask: &Mask) -> VortexResult { filter_select_var_bin(array, mask).map(|a| a.into_array()) } } -fn filter_select_var_bin(arr: &VarBinArray, mask: &FilterMask) -> VortexResult { +fn filter_select_var_bin(arr: &VarBinArray, mask: &Mask) -> VortexResult { let selection_count = mask.true_count(); if selection_count * 2 > mask.len() { filter_select_var_bin_by_slice(arr, mask, selection_count) @@ -28,7 +29,7 @@ fn filter_select_var_bin(arr: &VarBinArray, mask: &FilterMask) -> VortexResult VortexResult { let offsets = values.offsets().into_primitive()?; @@ -49,7 +50,7 @@ fn filter_select_var_bin_by_slice_primitive_offset( dtype: DType, offsets: &[O], data: &[u8], - mask: &FilterMask, + mask: &Mask, validity: Validity, selection_count: usize, ) -> VortexResult @@ -128,7 +129,7 @@ fn update_non_nullable_slice( fn filter_select_var_bin_by_index( values: &VarBinArray, - mask: &FilterMask, + mask: &Mask, selection_count: usize, ) -> VortexResult { let offsets = values.offsets().into_primitive()?; @@ -149,7 +150,7 @@ fn filter_select_var_bin_by_index_primitive_offset( dtype: DType, offsets: &[O], data: &[u8], - mask: &FilterMask, + mask: &Mask, validity: Validity, selection_count: usize, ) -> VortexResult { @@ -177,6 +178,7 @@ mod test { use vortex_buffer::ByteBuffer; use vortex_dtype::DType; use vortex_dtype::Nullability::{NonNullable, Nullable}; + use vortex_mask::Mask; use vortex_scalar::Scalar; use crate::array::primitive::PrimitiveArray; @@ -185,7 +187,7 @@ mod test { }; use crate::array::varbin::VarBinArray; use crate::array::BoolArray; - use crate::compute::{scalar_at, FilterMask}; + use crate::compute::scalar_at; use crate::validity::Validity; use crate::ToArrayData; @@ -203,7 +205,7 @@ mod test { ], DType::Utf8(NonNullable), ); - let filter = FilterMask::from_iter([true, false, true]); + let filter = Mask::from_iter([true, false, true]); let buf = filter_select_var_bin_by_index(&arr, &filter, 2) .unwrap() @@ -226,7 +228,7 @@ mod test { ], DType::Utf8(NonNullable), ); - let filter = FilterMask::from_iter([true, false, true, false, true]); + let filter = Mask::from_iter([true, false, true, false, true]); let buf = filter_select_var_bin_by_slice(&arr, &filter, 3) .unwrap() @@ -256,7 +258,7 @@ mod test { let validity = Validity::Array(BoolArray::from_iter([true, false, true, true, true, true]).to_array()); let arr = VarBinArray::try_new(offsets, bytes, DType::Utf8(Nullable), validity).unwrap(); - let filter = FilterMask::from_iter([true, true, true, false, true, true]); + let filter = Mask::from_iter([true, true, true, false, true, true]); let buf = filter_select_var_bin_by_slice(&arr, &filter, 5) .unwrap() diff --git a/vortex-array/src/compute/filter.rs b/vortex-array/src/compute/filter.rs index 35cfa0b26bb..0233e303261 100644 --- a/vortex-array/src/compute/filter.rs +++ b/vortex-array/src/compute/filter.rs @@ -6,6 +6,7 @@ use arrow_array::BooleanArray; use arrow_buffer::{BooleanBuffer, BooleanBufferBuilder}; use vortex_dtype::{DType, Nullability}; use vortex_error::{vortex_bail, vortex_panic, VortexError, VortexExpect, VortexResult}; +use vortex_mask::Mask; use crate::array::ConstantArray; use crate::arrow::FromArrowArray; @@ -14,15 +15,9 @@ use crate::encoding::Encoding; use crate::stats::{ArrayStatistics, Stat}; use crate::{ArrayDType, ArrayData, Canonical, IntoArrayData, IntoArrayVariant, IntoCanonical}; -/// If the filter selects more than this fraction of rows, iterate over slices instead of indices. -/// -/// Threshold of 0.8 chosen based on Arrow Rust, which is in turn based on: -/// -const FILTER_SLICES_SELECTIVITY_THRESHOLD: f64 = 0.8; - pub trait FilterFn { /// Filter an array by the provided predicate. - fn filter(&self, array: &Array, mask: &FilterMask) -> VortexResult; + fn filter(&self, array: &Array, mask: &Mask) -> VortexResult; } impl FilterFn for E @@ -30,7 +25,7 @@ where E: FilterFn, for<'a> &'a E::Array: TryFrom<&'a ArrayData, Error = VortexError>, { - fn filter(&self, array: &ArrayData, mask: &FilterMask) -> VortexResult { + fn filter(&self, array: &ArrayData, mask: &Mask) -> VortexResult { let (array_ref, encoding) = array.try_downcast_ref::()?; FilterFn::filter(encoding, array_ref, mask) } @@ -46,7 +41,7 @@ where /// /// The `predicate` must receive an Array with type non-nullable bool, and will panic if this is /// not the case. -pub fn filter(array: &ArrayData, mask: &FilterMask) -> VortexResult { +pub fn filter(array: &ArrayData, mask: &Mask) -> VortexResult { if mask.len() != array.len() { vortex_bail!( "mask.len() is {}, does not equal array.len() of {}", @@ -85,7 +80,7 @@ pub fn filter(array: &ArrayData, mask: &FilterMask) -> VortexResult { Ok(filtered) } -fn filter_impl(array: &ArrayData, mask: &FilterMask) -> VortexResult { +fn filter_impl(array: &ArrayData, mask: &Mask) -> VortexResult { if let Some(filter_fn) = array.encoding().filter_fn() { return filter_fn.filter(array, mask); } @@ -109,452 +104,7 @@ fn filter_impl(array: &ArrayData, mask: &FilterMask) -> VortexResult Ok(ArrayData::from_arrow(filtered, array.dtype().is_nullable())) } -/// Represents the mask argument to a filter function. -/// -/// A [`FilterMask`] can be constructed from various representations, and converted to various -/// others. Internally, these are cached. -#[derive(Clone, Debug)] -pub struct FilterMask(Arc); - -#[derive(Debug)] -struct Inner { - // The three possible representations of the mask. - buffer: OnceLock, - indices: OnceLock>, - slices: OnceLock>, - - // Pre-computed values. - len: usize, - true_count: usize, - selectivity: f64, -} - -impl Inner { - /// Constructs a [`BooleanBuffer`] from one of the other representations. - fn buffer(&self) -> &BooleanBuffer { - self.buffer.get_or_init(|| { - if self.true_count == 0 { - return BooleanBuffer::new_unset(self.len); - } - - if self.true_count == self.len { - return BooleanBuffer::new_set(self.len); - } - - if let Some(indices) = self.indices.get() { - let mut buf = BooleanBufferBuilder::new(self.len); - // TODO(ngates): for dense indices, we can do better by collecting into u64s. - buf.append_n(self.len, false); - indices.iter().for_each(|idx| buf.set_bit(*idx, true)); - return BooleanBuffer::from(buf); - } - - if let Some(slices) = self.slices.get() { - let mut buf = BooleanBufferBuilder::new(self.len); - for (start, end) in slices.iter().copied() { - buf.append_n(start - buf.len(), false); - buf.append_n(end - start, true); - } - if let Some((_, end)) = slices.last() { - buf.append_n(self.len - end, false); - } - debug_assert_eq!(buf.len(), self.len); - return BooleanBuffer::from(buf); - } - - vortex_panic!("No mask representation found") - }) - } - - /// Constructs an indices vector from one of the other representations. - fn indices(&self) -> &[usize] { - self.indices.get_or_init(|| { - if self.true_count == 0 { - return vec![]; - } - - if self.true_count == self.len { - return (0..self.len).collect(); - } - - if let Some(buffer) = self.buffer.get() { - let mut indices = Vec::with_capacity(self.true_count); - indices.extend(buffer.set_indices()); - return indices; - } - - if let Some(slices) = self.slices.get() { - let mut indices = Vec::with_capacity(self.true_count); - indices.extend(slices.iter().flat_map(|(start, end)| *start..*end)); - return indices; - } - - vortex_panic!("No mask representation found") - }) - } - - /// Constructs a slices vector from one of the other representations. - fn slices(&self) -> &[(usize, usize)] { - self.slices.get_or_init(|| { - if self.true_count == self.len { - return vec![(0, self.len)]; - } - - if let Some(buffer) = self.buffer.get() { - return buffer.set_slices().collect(); - } - - if let Some(indices) = self.indices.get() { - let mut slices = Vec::with_capacity(self.true_count); // Upper bound - let mut iter = indices.iter().copied(); - - // Handle empty input - let Some(first) = iter.next() else { - return slices; - }; - - let mut start = first; - let mut prev = first; - for curr in iter { - if curr != prev + 1 { - slices.push((start, prev + 1)); - start = curr; - } - prev = curr; - } - - // Don't forget the last range - slices.push((start, prev + 1)); - - return slices; - } - - vortex_panic!("No mask representation found") - }) - } - - fn first(&self) -> Option { - if self.true_count == 0 { - return None; - } - if self.true_count == self.len { - return Some(0); - } - if let Some(buffer) = self.buffer.get() { - return buffer.set_indices().next(); - } - if let Some(indices) = self.indices.get() { - return indices.first().copied(); - } - if let Some(slices) = self.slices.get() { - return slices.first().map(|(start, _)| *start); - } - None - } -} - -impl FilterMask { - /// Create a new FilterMask where all values are set. - pub fn new_true(length: usize) -> Self { - Self(Arc::new(Inner { - buffer: Default::default(), - indices: Default::default(), - slices: Default::default(), - len: length, - true_count: length, - selectivity: 1.0, - })) - } - - /// Create a new FilterMask where no values are set. - pub fn new_false(length: usize) -> Self { - Self(Arc::new(Inner { - buffer: Default::default(), - indices: Default::default(), - slices: Default::default(), - len: length, - true_count: 0, - selectivity: 0.0, - })) - } - - /// Create a new [`FilterMask`] from a [`BooleanBuffer`]. - pub fn from_buffer(buffer: BooleanBuffer) -> Self { - let true_count = buffer.count_set_bits(); - let len = buffer.len(); - Self(Arc::new(Inner { - buffer: OnceLock::from(buffer), - indices: Default::default(), - slices: Default::default(), - len, - true_count, - selectivity: true_count as f64 / len as f64, - })) - } - - /// Create a new [`FilterMask`] from a [`Vec`]. - pub fn from_indices(len: usize, vec: Vec) -> Self { - let true_count = vec.len(); - assert!(vec.iter().all(|&idx| idx < len)); - Self(Arc::new(Inner { - buffer: Default::default(), - indices: OnceLock::from(vec), - slices: Default::default(), - len, - true_count, - selectivity: true_count as f64 / len as f64, - })) - } - - /// Create a new [`FilterMask`] from a [`Vec<(usize, usize)>`] where each range - /// represents a contiguous range of true values. - pub fn from_slices(len: usize, vec: Vec<(usize, usize)>) -> Self { - assert!(vec.iter().all(|&(b, e)| b < e && e <= len)); - let true_count = vec.iter().map(|(b, e)| e - b).sum(); - Self(Arc::new(Inner { - buffer: Default::default(), - indices: Default::default(), - slices: OnceLock::from(vec), - len, - true_count, - selectivity: true_count as f64 / len as f64, - })) - } - - /// Create a new [`FilterMask`] from the intersection of two indices slices. - pub fn from_intersection_indices( - len: usize, - lhs: impl Iterator, - rhs: impl Iterator, - ) -> Self { - let mut intersection = Vec::with_capacity(len); - let mut lhs = lhs.peekable(); - let mut rhs = rhs.peekable(); - while let (Some(&l), Some(&r)) = (lhs.peek(), rhs.peek()) { - match l.cmp(&r) { - Ordering::Less => { - lhs.next(); - } - Ordering::Greater => { - rhs.next(); - } - Ordering::Equal => { - intersection.push(l); - lhs.next(); - rhs.next(); - } - } - } - Self::from_indices(len, intersection) - } - - #[inline] - // There is good definition of is_empty, does it mean len == 0 or true_count == 0? - #[allow(clippy::len_without_is_empty)] - pub fn len(&self) -> usize { - self.0.len - } - - /// Get the true count of the mask. - #[inline] - pub fn true_count(&self) -> usize { - self.0.true_count - } - - /// Get the false count of the mask. - #[inline] - pub fn false_count(&self) -> usize { - self.len() - self.true_count() - } - - /// Return the selectivity of the full mask. - #[inline] - pub fn selectivity(&self) -> f64 { - self.0.selectivity - } - - /// Get the canonical representation of the mask. - pub fn boolean_buffer(&self) -> &BooleanBuffer { - self.0.buffer() - } - - /// Get the indices of the true values in the mask. - pub fn indices(&self) -> &[usize] { - self.0.indices() - } - - /// Get the slices of the true values in the mask. - pub fn slices(&self) -> &[(usize, usize)] { - self.0.slices() - } - - /// Returns the first true index in the mask. - pub fn first(&self) -> Option { - self.0.first() - } - - /// Returns the best iterator based on a selectivity threshold. - /// - /// Currently, this threshold is fixed at 0.8 based on Arrow Rust. - pub fn iter(&self) -> FilterIter { - if self.selectivity() > FILTER_SLICES_SELECTIVITY_THRESHOLD { - FilterIter::Slices(self.slices()) - } else { - FilterIter::Indices(self.indices()) - } - } - - /// Slice the mask. - pub fn slice(&self, offset: usize, length: usize) -> Self { - if self.true_count() == 0 { - return Self::new_false(length); - } - if self.true_count() == self.len() { - return Self::new_true(length); - } - - if let Some(buffer) = self.0.buffer.get() { - return Self::from_buffer(buffer.slice(offset, length)); - } - - let end = offset + length; - - if let Some(indices) = self.0.indices.get() { - let indices = indices - .iter() - .copied() - .filter(|&idx| offset <= idx && idx < end) - .map(|idx| idx - offset) - .collect(); - return Self::from_indices(length, indices); - } - - if let Some(slices) = self.0.slices.get() { - let slices = slices - .iter() - .copied() - .filter(|(s, e)| *s < end && *e > offset) - .map(|(s, e)| (s.max(offset), e.min(end))) - .collect(); - return Self::from_slices(length, slices); - } - - vortex_panic!("No mask representation found") - } - - /// take the intersection of the `mask` with the set of true values in `self`. - /// - /// We are more interested in low selectivity `self` (as indices) with a boolean buffer mask, - /// so we don't optimize for other cases, yet. - pub fn intersect_by_rank(&self, mask: &FilterMask) -> FilterMask { - assert_eq!(self.true_count(), mask.len()); - - if mask.true_count() == mask.len() { - return self.clone(); - } - - if mask.true_count() == 0 { - return Self::new_false(self.len()); - } - - // TODO(joe): support other fast paths, not converting self & mask into indices, - // however indices are better for sparse masks, so this is the common case for now. - let indices = self.0.indices(); - Self::from_indices( - self.len(), - mask.indices() - .iter() - .map(|idx| - // This is verified as safe because we know that the indices are less than the - // mask.len() and we known mask.len() <= self.len(), - // implied by `self.true_count() == mask.len()`. - unsafe{*indices.get_unchecked(*idx)}) - .collect(), - ) - } -} - -pub enum FilterIter<'a> { - /// Slice of pre-cached indices of a filter mask. - Indices(&'a [usize]), - /// Slice of pre-cached slices of a filter mask. - Slices(&'a [(usize, usize)]), -} - -impl PartialEq for FilterMask { - fn eq(&self, other: &Self) -> bool { - if self.len() != other.len() { - return false; - } - if self.true_count() != other.true_count() { - return false; - } - - // Since the true counts are the same, a full or empty mask is equal to the other mask. - if self.true_count() == 0 || self.true_count() == self.len() { - return true; - } - - // Compare the buffer if both masks are non-empty. - if let (Some(buffer), Some(other)) = (self.0.buffer.get(), other.0.buffer.get()) { - return buffer == other; - } - - // Compare the indices if both masks are non-empty. - if let (Some(indices), Some(other)) = (self.0.indices.get(), other.0.indices.get()) { - return indices == other; - } - - // Compare the slices if both masks are non-empty. - if let (Some(slices), Some(other)) = (self.0.slices.get(), other.0.slices.get()) { - return slices == other; - } - - // Otherwise, we fall back to comparison based on sparsity. - // We could go further an exhaustively check whose OnceLocks are initialized, but that's - // probably not worth the effort. - self.boolean_buffer() == other.boolean_buffer() - } -} - -impl Eq for FilterMask {} - -impl BitAnd for &FilterMask { - type Output = FilterMask; - - fn bitand(self, rhs: Self) -> Self::Output { - if self.len() != rhs.len() { - vortex_panic!("FilterMasks must have the same length"); - } - if self.true_count() == 0 || rhs.true_count() == 0 { - return FilterMask::new_false(self.len()); - } - if self.true_count() == self.len() { - return rhs.clone(); - } - if rhs.true_count() == self.len() { - return self.clone(); - } - - if let (Some(lhs), Some(rhs)) = (self.0.buffer.get(), rhs.0.buffer.get()) { - return FilterMask::from_buffer(lhs & rhs); - } - - if let (Some(lhs), Some(rhs)) = (self.0.indices.get(), rhs.0.indices.get()) { - // TODO(ngates): this may only make sense for sparse indices. - return FilterMask::from_intersection_indices( - self.len(), - lhs.iter().copied(), - rhs.iter().copied(), - ); - } - - // TODO(ngates): we could perform a more efficient intersection for slices. - FilterMask::from_buffer(self.boolean_buffer() & rhs.boolean_buffer()) - } -} - -impl TryFrom for FilterMask { +impl TryFrom for Mask { type Error = VortexError; fn try_from(array: ArrayData) -> Result { @@ -581,18 +131,6 @@ impl TryFrom for FilterMask { } } -impl From for FilterMask { - fn from(value: BooleanBuffer) -> Self { - Self::from_buffer(value) - } -} - -impl FromIterator for FilterMask { - fn from_iter>(iter: T) -> Self { - Self::from_buffer(BooleanBuffer::from_iter(iter)) - } -} - #[cfg(test)] mod test { use itertools::Itertools; @@ -607,10 +145,9 @@ mod test { let items = PrimitiveArray::from_option_iter([Some(0i32), None, Some(1i32), None, Some(2i32)]) .into_array(); - let mask = FilterMask::try_from( - BoolArray::from_iter([true, false, true, false, true]).into_array(), - ) - .unwrap(); + let mask = + Mask::try_from(BoolArray::from_iter([true, false, true, false, true]).into_array()) + .unwrap(); let filtered = filter(&items, &mask).unwrap(); assert_eq!( @@ -623,116 +160,4 @@ mod test { &[0i32, 1i32, 2i32] ); } - - #[test] - fn filter_mask_all_true() { - let mask = FilterMask::new_true(5); - assert_eq!(mask.len(), 5); - assert_eq!(mask.true_count(), 5); - assert_eq!(mask.selectivity(), 1.0); - assert_eq!(mask.indices(), &[0, 1, 2, 3, 4]); - assert_eq!(mask.slices(), &[(0, 5)]); - assert_eq!(mask.boolean_buffer(), &BooleanBuffer::new_set(5)); - } - - #[test] - fn filter_mask_all_false() { - let mask = FilterMask::new_false(5); - assert_eq!(mask.len(), 5); - assert_eq!(mask.true_count(), 0); - assert_eq!(mask.selectivity(), 0.0); - assert_eq!(mask.indices(), &[] as &[usize]); - assert_eq!(mask.slices(), &[]); - assert_eq!(mask.boolean_buffer(), &BooleanBuffer::new_unset(5)); - } - - #[test] - fn filter_mask_from() { - let masks = [ - FilterMask::from_indices(5, vec![0, 2, 3]), - FilterMask::from_slices(5, vec![(0, 1), (2, 4)]), - FilterMask::from_buffer(BooleanBuffer::from_iter([true, false, true, true, false])), - ]; - - for mask in &masks { - assert_eq!(mask.len(), 5); - assert_eq!(mask.true_count(), 3); - assert_eq!(mask.selectivity(), 0.6); - assert_eq!(mask.indices(), &[0, 2, 3]); - assert_eq!(mask.slices(), &[(0, 1), (2, 4)]); - assert_eq!( - &mask.boolean_buffer().iter().collect_vec(), - &[true, false, true, true, false] - ); - } - } - - #[test] - fn filter_mask_eq() { - assert_eq!( - FilterMask::new_true(5), - FilterMask::from_buffer(BooleanBuffer::new_set(5)) - ); - assert_eq!( - FilterMask::new_false(5), - FilterMask::from_buffer(BooleanBuffer::new_unset(5)) - ); - assert_eq!( - FilterMask::from_indices(5, vec![0, 2, 3]), - FilterMask::from_slices(5, vec![(0, 1), (2, 4)]) - ); - assert_eq!( - FilterMask::from_indices(5, vec![0, 2, 3]), - FilterMask::from_buffer(BooleanBuffer::from_iter([true, false, true, true, false])) - ); - } - - #[test] - fn filter_mask_intersect_all_as_bit_and() { - let this = - FilterMask::from_buffer(BooleanBuffer::from_iter(vec![true, true, true, true, true])); - let mask = FilterMask::from_buffer(BooleanBuffer::from_iter(vec![ - false, true, false, true, true, - ])); - assert_eq!( - this.intersect_by_rank(&mask), - FilterMask::from_indices(5, vec![1, 3, 4]) - ); - } - - #[test] - fn filter_mask_intersect_all_true() { - let this = FilterMask::from_buffer(BooleanBuffer::from_iter(vec![ - false, false, true, true, true, - ])); - let mask = FilterMask::from_buffer(BooleanBuffer::from_iter(vec![true, true, true])); - assert_eq!( - this.intersect_by_rank(&mask), - FilterMask::from_indices(5, vec![2, 3, 4]) - ); - } - - #[test] - fn filter_mask_intersect_true() { - let this = FilterMask::from_buffer(BooleanBuffer::from_iter(vec![ - true, false, false, true, true, - ])); - let mask = FilterMask::from_buffer(BooleanBuffer::from_iter(vec![true, false, true])); - assert_eq!( - this.intersect_by_rank(&mask), - FilterMask::from_indices(5, vec![0, 4]) - ); - } - - #[test] - fn filter_mask_intersect_false() { - let this = FilterMask::from_buffer(BooleanBuffer::from_iter(vec![ - true, false, false, true, true, - ])); - let mask = FilterMask::from_buffer(BooleanBuffer::from_iter(vec![false, false, false])); - assert_eq!( - this.intersect_by_rank(&mask), - FilterMask::from_indices(5, vec![]) - ); - } } diff --git a/vortex-array/src/compute/mod.rs b/vortex-array/src/compute/mod.rs index 9e7e9a135cd..d83a138a4ff 100644 --- a/vortex-array/src/compute/mod.rs +++ b/vortex-array/src/compute/mod.rs @@ -18,7 +18,7 @@ pub use cast::{try_cast, CastFn}; pub use compare::{compare, scalar_cmp, CompareFn, Operator}; pub use fill_forward::{fill_forward, FillForwardFn}; pub use fill_null::{fill_null, FillNullFn}; -pub use filter::{filter, FilterFn, FilterIter, FilterMask}; +pub use filter::{filter, FilterFn}; pub use invert::{invert, InvertFn}; pub use like::{like, LikeFn, LikeOptions}; pub use scalar_at::{scalar_at, ScalarAtFn}; diff --git a/vortex-array/src/patches.rs b/vortex-array/src/patches.rs index 7a162c129bb..9ce149ba074 100644 --- a/vortex-array/src/patches.rs +++ b/vortex-array/src/patches.rs @@ -6,13 +6,14 @@ use vortex_buffer::BufferMut; use vortex_dtype::Nullability::NonNullable; use vortex_dtype::{match_each_integer_ptype, DType, PType}; use vortex_error::{vortex_bail, VortexExpect, VortexResult}; +use vortex_mask::Mask; use vortex_scalar::Scalar; use crate::aliases::hash_map::HashMap; use crate::array::PrimitiveArray; use crate::compute::{ scalar_at, search_sorted, search_sorted_usize, search_sorted_usize_many, slice, sub_scalar, - take, FilterMask, SearchResult, SearchSortedSide, + take, SearchResult, SearchSortedSide, }; use crate::stats::{ArrayStatistics, Stat}; use crate::variants::PrimitiveArrayTrait; @@ -206,12 +207,12 @@ impl Patches { } /// Filter the patches by a mask, resulting in new patches for the filtered array. - pub fn filter(&self, mask: &FilterMask) -> VortexResult> { + pub fn filter(&self, mask: &Mask) -> VortexResult> { if mask.true_count() == 0 { return Ok(None); } - // TODO(ngates): add functions to operate with FilterMask directly + // TODO(ngates): add functions to operate with Mask directly let buffer = mask.boolean_buffer(); let mut coordinate_indices = BufferMut::::empty(); let mut value_indices = BufferMut::::empty(); @@ -399,9 +400,10 @@ impl Patches { mod test { use rstest::{fixture, rstest}; use vortex_buffer::buffer; + use vortex_mask::Mask; use crate::array::PrimitiveArray; - use crate::compute::{FilterMask, SearchResult, SearchSortedSide}; + use crate::compute::{SearchResult, SearchSortedSide}; use crate::patches::Patches; use crate::validity::Validity; use crate::{IntoArrayData, IntoArrayVariant}; @@ -415,7 +417,7 @@ mod test { ); let filtered = patches - .filter(&FilterMask::from_indices(100, vec![10, 20, 30])) + .filter(&Mask::from_indices(100, vec![10, 20, 30])) .unwrap() .unwrap(); diff --git a/vortex-array/src/validity.rs b/vortex-array/src/validity.rs index 0746a4e2bb9..1b970998928 100644 --- a/vortex-array/src/validity.rs +++ b/vortex-array/src/validity.rs @@ -11,9 +11,10 @@ use vortex_dtype::{DType, Nullability}; use vortex_error::{ vortex_bail, vortex_err, vortex_panic, VortexError, VortexExpect as _, VortexResult, }; +use vortex_mask::Mask; use crate::array::{BoolArray, ConstantArray}; -use crate::compute::{filter, scalar_at, slice, take, FilterMask}; +use crate::compute::{filter, scalar_at, slice, take}; use crate::encoding::Encoding; use crate::patches::Patches; use crate::stats::ArrayStatistics; @@ -238,7 +239,7 @@ impl Validity { } } - pub fn filter(&self, mask: &FilterMask) -> VortexResult { + pub fn filter(&self, mask: &Mask) -> VortexResult { // NOTE(ngates): we take the mask as a reference to avoid the caller cloning unnecessarily // if we happen to be NonNullable, AllValid, or AllInvalid. match self { diff --git a/vortex-file/Cargo.toml b/vortex-file/Cargo.toml index a02d2d982d2..7a19a662435 100644 --- a/vortex-file/Cargo.toml +++ b/vortex-file/Cargo.toml @@ -39,6 +39,7 @@ vortex-flatbuffers = { workspace = true, features = ["file"] } vortex-io = { workspace = true } vortex-ipc = { workspace = true } vortex-layout = { workspace = true } +vortex-mask = { workspace = true } vortex-scalar = { workspace = true, features = ["flatbuffers"] } vortex-scan = { workspace = true } diff --git a/vortex-file/src/file.rs b/vortex-file/src/file.rs index 9e8569d851d..478743709c7 100644 --- a/vortex-file/src/file.rs +++ b/vortex-file/src/file.rs @@ -8,7 +8,6 @@ use futures::Stream; use futures_util::{stream, FutureExt, StreamExt, TryFutureExt, TryStreamExt}; use itertools::Itertools; use pin_project_lite::pin_project; -use vortex_array::compute::FilterMask; use vortex_array::stats::{Stat, StatsSet}; use vortex_array::stream::{ArrayStream, ArrayStreamAdapter}; use vortex_array::ContextRef; @@ -19,6 +18,7 @@ use vortex_expr::transform::immediate_access::immediate_scope_access; use vortex_expr::transform::simplify_typed::simplify_typed; use vortex_expr::{ident, ExprRef}; use vortex_layout::{ExprEvaluator, LayoutReader}; +use vortex_mask::Mask; use vortex_scan::{RowMask, Scanner}; use crate::exec::ExecDriver; @@ -176,7 +176,7 @@ impl VortexFile { } // Construct a row mask for the range. - let filter_mask = FilterMask::from_indices( + let filter_mask = Mask::from_indices( usize::try_from(row_range.end - row_range.start) .vortex_expect("Split ranges are within usize"), row_indices[start_idx..end_idx] diff --git a/vortex-layout/Cargo.toml b/vortex-layout/Cargo.toml index c7e07afcea7..89273b409b9 100644 --- a/vortex-layout/Cargo.toml +++ b/vortex-layout/Cargo.toml @@ -27,6 +27,7 @@ vortex-dtype = { workspace = true } vortex-error = { workspace = true } vortex-expr = { workspace = true } vortex-flatbuffers = { workspace = true, features = ["layout"] } +vortex-mask = { workspace = true } vortex-scalar = { workspace = true } vortex-scan = { workspace = true } diff --git a/vortex-layout/src/layouts/struct_/eval_expr.rs b/vortex-layout/src/layouts/struct_/eval_expr.rs index 8e296e48822..7dcc61c688c 100644 --- a/vortex-layout/src/layouts/struct_/eval_expr.rs +++ b/vortex-layout/src/layouts/struct_/eval_expr.rs @@ -58,12 +58,12 @@ mod tests { use futures::executor::block_on; use vortex_array::array::StructArray; - use vortex_array::compute::FilterMask; use vortex_array::{IntoArrayData, IntoArrayVariant}; use vortex_buffer::buffer; use vortex_dtype::PType::I32; use vortex_dtype::{DType, Field, Nullability, StructDType}; use vortex_expr::{get_item, gt, ident, pack}; + use vortex_mask::Mask; use vortex_scan::RowMask; use crate::layouts::flat::writer::FlatLayoutWriter; @@ -133,7 +133,7 @@ mod tests { let expr = gt(get_item("a", ident()), get_item("b", ident())); let result = block_on(reader.evaluate_expr( // Take rows 0 and 1, skip row 2, and anything after that - RowMask::new(FilterMask::from_iter([true, true, false]), 0), + RowMask::new(Mask::from_iter([true, true, false]), 0), expr, )) .unwrap(); @@ -159,7 +159,7 @@ mod tests { let expr = pack([("a", get_item("a", ident())), ("b", get_item("b", ident()))]); let result = block_on(reader.evaluate_expr( // Take rows 0 and 1, skip row 2, and anything after that - RowMask::new(FilterMask::from_iter([true, true, false]), 0), + RowMask::new(Mask::from_iter([true, true, false]), 0), expr, )) .unwrap(); diff --git a/vortex-mask/src/bitand.rs b/vortex-mask/src/bitand.rs index 1cb48320a56..04e90aa08e5 100644 --- a/vortex-mask/src/bitand.rs +++ b/vortex-mask/src/bitand.rs @@ -9,7 +9,7 @@ impl BitAnd for &Mask { fn bitand(self, rhs: Self) -> Self::Output { if self.len() != rhs.len() { - vortex_panic!("FilterMasks must have the same length"); + vortex_panic!("Masks must have the same length"); } if self.true_count() == 0 || rhs.true_count() == 0 { return Mask::new_false(self.len()); diff --git a/vortex-mask/src/lib.rs b/vortex-mask/src/lib.rs index bf1dc3c91eb..8c31649829d 100644 --- a/vortex-mask/src/lib.rs +++ b/vortex-mask/src/lib.rs @@ -170,7 +170,7 @@ impl Inner { } impl Mask { - /// Create a new FilterMask where all values are set. + /// Create a new Mask where all values are set. pub fn new_true(length: usize) -> Self { Self(Arc::new(Inner { buffer: Default::default(), @@ -182,7 +182,7 @@ impl Mask { })) } - /// Create a new FilterMask where no values are set. + /// Create a new Mask where no values are set. pub fn new_false(length: usize) -> Self { Self(Arc::new(Inner { buffer: Default::default(), @@ -342,11 +342,11 @@ impl Mask { /// Returns the best iterator based on a selectivity threshold. /// /// Currently, this threshold is fixed at 0.8 based on Arrow Rust. - pub fn iter(&self) -> Iter { + pub fn iter(&self) -> MaskIter { if self.selectivity() > FILTER_SLICES_SELECTIVITY_THRESHOLD { - Iter::Slices(self.slices()) + MaskIter::Slices(self.slices()) } else { - Iter::Indices(self.indices()) + MaskIter::Indices(self.indices()) } } @@ -391,7 +391,7 @@ impl Mask { } } -pub enum Iter<'a> { +pub enum MaskIter<'a> { /// Slice of pre-cached indices of a mask. Indices(&'a [usize]), /// Slice of pre-cached slices of a mask. diff --git a/vortex-scan/Cargo.toml b/vortex-scan/Cargo.toml index 36349095044..f3983741f3b 100644 --- a/vortex-scan/Cargo.toml +++ b/vortex-scan/Cargo.toml @@ -19,6 +19,7 @@ vortex-buffer = { workspace = true } vortex-dtype = { workspace = true } vortex-error = { workspace = true } vortex-expr = { workspace = true } +vortex-mask = { workspace = true } [dev-dependencies] rstest = { workspace = true } diff --git a/vortex-scan/src/range_scan.rs b/vortex-scan/src/range_scan.rs index ca083704c0e..fcb2666b61a 100644 --- a/vortex-scan/src/range_scan.rs +++ b/vortex-scan/src/range_scan.rs @@ -2,25 +2,26 @@ use std::future::Future; use std::ops::{BitAnd, Range}; use std::sync::Arc; -use vortex_array::compute::{fill_null, FilterMask}; +use vortex_array::compute::fill_null; use vortex_array::ArrayData; use vortex_error::{VortexExpect, VortexResult}; use vortex_expr::ExprRef; +use vortex_mask::Mask; use crate::{RowMask, Scanner}; pub struct RangeScanner { scan: Arc, row_range: Range, - mask: FilterMask, + mask: Mask, state: State, } enum State { // First we run the filter expression over the full row range. - FilterEval((FilterMask, Vec)), + FilterEval((Mask, Vec)), // Then we project the selected rows. - Project((FilterMask, ExprRef)), + Project((Mask, ExprRef)), // And then we're done. Ready(Option), } @@ -30,7 +31,7 @@ pub enum NextOp { Ready(Option), /// The next expression to evaluate. /// The caller **must** first apply the mask before evaluating the expression. - Eval((Range, FilterMask, ExprRef)), + Eval((Range, Mask, ExprRef)), } /// We implement the range scan via polling for the next operation to perform, and then posting @@ -57,14 +58,14 @@ pub enum NextOp { // identical to the filter API and there's no point having both. Hence, a single // `evaluate(row_mask, expr)` API. impl RangeScanner { - pub(crate) fn new(scan: Arc, row_offset: u64, mask: FilterMask) -> Self { + pub(crate) fn new(scan: Arc, row_offset: u64, mask: Mask) -> Self { let state = if !scan.rev_filter.is_empty() { // If we have a filter expression, then for now we evaluate it against all rows // of the range. // TODO(ngates): we should decide based on mask.true_count() whether to include the // current mask or not. But note that the resulting expression would need to be // aligned and intersected with the given mask. - State::FilterEval((FilterMask::new_true(mask.len()), scan.rev_filter.to_vec())) + State::FilterEval((Mask::new_true(mask.len()), scan.rev_filter.to_vec())) } else { // If there is no filter expression, then we immediately perform a mask + project. State::Project((mask.clone(), scan.projection().clone())) @@ -81,7 +82,7 @@ impl RangeScanner { /// Check for the next operation to perform. /// Returns `Poll::Ready` when the scan is complete. /// - // FIXME(ngates): currently we have to clone the FilterMask to return it. Doing this + // FIXME(ngates): currently we have to clone the Mask to return it. Doing this // forces the eager evaluation of the iterators. fn next(&self) -> NextOp { match &self.state { @@ -111,7 +112,7 @@ impl RangeScanner { let result = fill_null(result, false.into())?; // Intersect the result of the filter expression with our initial row mask. - let mask = FilterMask::try_from(result)?; + let mask = Mask::try_from(result)?; // We passed a full mask to the eval function so we must bit intersect instead // of set-bit intersection if we massed a non-full mask to the evaluator. @@ -130,7 +131,7 @@ impl RangeScanner { let mask = if self.mask.selectivity() < APPLY_FILTER_SELECTIVITY_THRESHOLD { self.mask.clone() } else { - FilterMask::new_true(self.mask.len()) + Mask::new_true(self.mask.len()) }; // conjuncts_rev is again non-empty, so we can put it into FilterEval self.state = State::FilterEval((mask, conjuncts_rev)) @@ -187,13 +188,14 @@ mod tests { use std::sync::Arc; use vortex_array::array::{BoolArray, PrimitiveArray, StructArray}; - use vortex_array::compute::{filter, FilterMask}; + use vortex_array::compute::filter; use vortex_array::variants::StructArrayTrait; use vortex_array::{IntoArrayData, IntoArrayVariant}; use vortex_dtype::Nullability::NonNullable; use vortex_dtype::PType::U64; use vortex_dtype::{DType, StructDType}; use vortex_expr::{and, get_item, gt, ident, lit}; + use vortex_mask::Mask; use crate::{RangeScanner, Scanner}; @@ -221,7 +223,7 @@ mod tests { .unwrap(), ); let len = 1000; - let range = RangeScanner::new(scan, 0, FilterMask::new_true(len)); + let range = RangeScanner::new(scan, 0, Mask::new_true(len)); let res = range .evaluate(|mask, expr| { @@ -277,7 +279,7 @@ mod tests { .unwrap(), ); let len = 1000; - let range = RangeScanner::new(scan, 0, FilterMask::new_true(len)); + let range = RangeScanner::new(scan, 0, Mask::new_true(len)); let res = range .evaluate(|mask, expr| { diff --git a/vortex-scan/src/row_mask.rs b/vortex-scan/src/row_mask.rs index a44f1559b44..7070660e294 100644 --- a/vortex-scan/src/row_mask.rs +++ b/vortex-scan/src/row_mask.rs @@ -3,20 +3,21 @@ use std::fmt::{Display, Formatter}; use std::ops::{BitAnd, RangeBounds}; use vortex_array::array::{BooleanBuffer, PrimitiveArray, SparseArray}; -use vortex_array::compute::{and, filter, slice, try_cast, FilterMask}; +use vortex_array::compute::{and, filter, slice, try_cast}; use vortex_array::validity::{ArrayValidity, LogicalValidity, Validity}; use vortex_array::{ArrayDType, ArrayData, IntoArrayData, IntoArrayVariant}; use vortex_buffer::Buffer; use vortex_dtype::Nullability::NonNullable; use vortex_dtype::{DType, PType}; use vortex_error::{vortex_bail, vortex_err, VortexExpect, VortexResult}; +use vortex_mask::Mask; /// A RowMask captures a set of selected rows within a range. /// /// The range itself can be [`u64`], but the length of the range must fit into a [`usize`]. #[derive(Debug, Clone)] pub struct RowMask { - mask: FilterMask, + mask: Mask, begin: u64, end: u64, } @@ -36,7 +37,7 @@ impl Display for RowMask { } impl RowMask { - pub fn new(mask: FilterMask, begin: u64) -> Self { + pub fn new(mask: Mask, begin: u64) -> Self { let end = begin + (mask.len() as u64); Self { mask, begin, end } } @@ -49,14 +50,14 @@ impl RowMask { pub fn new_valid_between(begin: u64, end: u64) -> Self { let length = usize::try_from(end - begin).vortex_expect("Range length does not fit into a usize"); - RowMask::new(FilterMask::from(BooleanBuffer::new_set(length)), begin) + RowMask::new(Mask::from(BooleanBuffer::new_set(length)), begin) } /// Construct a RowMask which is invalid everywhere in the given range. pub fn new_invalid_between(begin: u64, end: u64) -> Self { let length = usize::try_from(end - begin).vortex_expect("Range length does not fit into a usize"); - RowMask::new(FilterMask::from(BooleanBuffer::new_unset(length)), begin) + RowMask::new(Mask::from(BooleanBuffer::new_unset(length)), begin) } /// Creates a RowMask from an array, only supported boolean and integer types. @@ -78,15 +79,13 @@ impl RowMask { /// True-valued positions are kept by the returned mask. fn from_mask_array(array: &ArrayData, begin: u64) -> VortexResult { match array.logical_validity() { - LogicalValidity::AllValid(_) => { - Ok(Self::new(FilterMask::try_from(array.clone())?, begin)) - } + LogicalValidity::AllValid(_) => Ok(Self::new(Mask::try_from(array.clone())?, begin)), LogicalValidity::AllInvalid(_) => { Ok(Self::new_invalid_between(begin, begin + array.len() as u64)) } LogicalValidity::Array(validity) => { let bitmask = and(array.clone(), validity)?; - Ok(Self::new(FilterMask::try_from(bitmask)?, begin)) + Ok(Self::new(Mask::try_from(bitmask)?, begin)) } } } @@ -102,7 +101,7 @@ impl RowMask { let indices = try_cast(array, &DType::Primitive(PType::U64, NonNullable))?.into_primitive()?; - let mask = FilterMask::from_indices( + let mask = Mask::from_indices( length, indices .as_slice::() @@ -118,7 +117,7 @@ impl RowMask { /// /// This function may return false negatives, but never false positives. /// - /// TODO(ngates): improve this function to take into account the [`FilterMask`]. + /// TODO(ngates): improve this function to take into account the [`Mask`]. pub fn is_disjoint(&self, range: impl RangeBounds) -> bool { use std::ops::Bound; @@ -185,7 +184,7 @@ impl RowMask { let output_len = usize::try_from(output_end - output_begin) .map_err(|_| vortex_err!("Range length does not fit into a usize"))?; - let output_mask = FilterMask::from_intersection_indices( + let output_mask = Mask::from_intersection_indices( output_len, self.mask .indices() @@ -227,8 +226,8 @@ impl RowMask { self.mask.len() } - /// Returns the [`FilterMask`] whose true values are relative to the range of this `RowMask`. - pub fn filter_mask(&self) -> &FilterMask { + /// Returns the [`Mask`] whose true values are relative to the range of this `RowMask`. + pub fn filter_mask(&self) -> &Mask { &self.mask } @@ -317,35 +316,35 @@ impl RowMask { mod tests { use rstest::rstest; use vortex_array::array::PrimitiveArray; - use vortex_array::compute::FilterMask; use vortex_array::validity::Validity; use vortex_array::{IntoArrayData, IntoArrayVariant}; use vortex_buffer::{buffer, Buffer}; use vortex_error::VortexUnwrap; + use vortex_mask::Mask; use super::*; #[rstest] #[case( - RowMask::new(FilterMask::from_iter([true, true, true, false, false, false, false, false, true, true]), 0), (0, 1), - RowMask::new(FilterMask::from_iter([true]), 0))] + RowMask::new(Mask::from_iter([true, true, true, false, false, false, false, false, true, true]), 0), (0, 1), + RowMask::new(Mask::from_iter([true]), 0))] #[case( - RowMask::new(FilterMask::from_iter([false, false, false, false, false, true, true, true, true, true]), 0), (2, 5), - RowMask::new(FilterMask::from_iter([false, false, false]), 2) + RowMask::new(Mask::from_iter([false, false, false, false, false, true, true, true, true, true]), 0), (2, 5), + RowMask::new(Mask::from_iter([false, false, false]), 2) )] #[case( - RowMask::new(FilterMask::from_iter([true, true, true, true, false, false, false, false, false, false]), 0), (2, 5), - RowMask::new(FilterMask::from_iter([true, true, false]), 2) + RowMask::new(Mask::from_iter([true, true, true, true, false, false, false, false, false, false]), 0), (2, 5), + RowMask::new(Mask::from_iter([true, true, false]), 2) )] #[case( - RowMask::new(FilterMask::from_iter([true, true, true, false, false, true, true, false, false, false]), 0), (2, 6), - RowMask::new(FilterMask::from_iter([true, false, false, true]), 2))] + RowMask::new(Mask::from_iter([true, true, true, false, false, true, true, false, false, false]), 0), (2, 6), + RowMask::new(Mask::from_iter([true, false, false, true]), 2))] #[case( - RowMask::new(FilterMask::from_iter([false, false, false, false, false, true, true, true, true, true]), 0), (7, 11), - RowMask::new(FilterMask::from_iter([true, true, true]), 7))] + RowMask::new(Mask::from_iter([false, false, false, false, false, true, true, true, true, true]), 0), (7, 11), + RowMask::new(Mask::from_iter([true, true, true]), 7))] #[case( - RowMask::new(FilterMask::from_iter([false, true, true, true, true, true]), 3), (0, 5), - RowMask::new(FilterMask::from_iter([false, true]), 3))] + RowMask::new(Mask::from_iter([false, true, true, true, true, true]), 3), (0, 5), + RowMask::new(Mask::from_iter([false, true]), 3))] #[cfg_attr(miri, ignore)] fn slice(#[case] first: RowMask, #[case] range: (u64, u64), #[case] expected: RowMask) { assert_eq!(first.slice(range.0, range.1).vortex_unwrap(), expected); @@ -355,7 +354,7 @@ mod tests { #[should_panic] #[cfg_attr(miri, ignore)] fn shift_invalid() { - RowMask::new(FilterMask::from_iter([true, true, true, true, true]), 5) + RowMask::new(Mask::from_iter([true, true, true, true, true]), 5) .shift(7) .unwrap(); } @@ -364,10 +363,10 @@ mod tests { #[cfg_attr(miri, ignore)] fn shift() { assert_eq!( - RowMask::new(FilterMask::from_iter([true, true, true, true, true]), 5) + RowMask::new(Mask::from_iter([true, true, true, true, true]), 5) .shift(5) .unwrap(), - RowMask::new(FilterMask::from_iter([true, true, true, true, true]), 0) + RowMask::new(Mask::from_iter([true, true, true, true, true]), 0) ); } @@ -375,7 +374,7 @@ mod tests { #[cfg_attr(miri, ignore)] fn filter_array() { let mask = RowMask::new( - FilterMask::from_iter([ + Mask::from_iter([ false, false, false, false, false, true, true, true, true, true, ]), 0, diff --git a/vortex/Cargo.toml b/vortex/Cargo.toml index 5c87b45e21b..95c019a4484 100644 --- a/vortex/Cargo.toml +++ b/vortex/Cargo.toml @@ -38,6 +38,7 @@ vortex-flatbuffers = { workspace = true } vortex-fsst = { workspace = true } vortex-io = { workspace = true } vortex-ipc = { workspace = true } +vortex-mask = { workspace = true } vortex-proto = { workspace = true } vortex-runend = { workspace = true } vortex-sampling-compressor = { workspace = true } diff --git a/vortex/src/lib.rs b/vortex/src/lib.rs index 3fa123193d7..f5c3d6bcba6 100644 --- a/vortex/src/lib.rs +++ b/vortex/src/lib.rs @@ -2,9 +2,9 @@ pub use vortex_array::*; pub use { vortex_buffer as buffer, vortex_datetime_dtype as datetime_dtype, vortex_dtype as dtype, vortex_error as error, vortex_expr as expr, vortex_file as file, - vortex_flatbuffers as flatbuffers, vortex_io as io, vortex_ipc as ipc, vortex_proto as proto, - vortex_sampling_compressor as sampling_compressor, vortex_scalar as scalar, - vortex_scan as scan, + vortex_flatbuffers as flatbuffers, vortex_io as io, vortex_ipc as ipc, vortex_mask as mask, + vortex_proto as proto, vortex_sampling_compressor as sampling_compressor, + vortex_scalar as scalar, vortex_scan as scan, }; pub mod encodings {