perf: use filter instead of take in Patches::filter (#2093)

a10y · web-flow · commit 8d024078504d · 2025-01-28T17:33:27.000Z
Replace usage of `take` with `filter` in `Patches::filter`.

`take` performs a per-element binary search over `patch_indices`, which is
unnecessary: the same selection can be performed with a `filter` after
building the sorted filter mask.

Benchmark results before/after change (with `-C target-cpu=native`
applied):

**BEFORE**

```
filter_then_canonical/0.001
                        time:   [18.591 µs 18.646 µs 18.703 µs]
                        change: [-2.2719% -1.8183% -1.3886%] (p = 0.00 &lt; 0.05)
                        Performance has improved.
Found 7 outliers among 100 measurements (7.00%)
  3 (3.00%) high mild
  4 (4.00%) high severe
filter_then_canonical/0.01
                        time:   [45.116 µs 45.287 µs 45.469 µs]
                        change: [+0.9772% +1.3910% +1.8349%] (p = 0.00 &lt; 0.05)
                        Change within noise threshold.
Found 1 outliers among 100 measurements (1.00%)
  1 (1.00%) high mild
filter_then_canonical/0.1
                        time:   [55.525 µs 55.895 µs 56.334 µs]
                        change: [-9.1744% -8.2913% -7.4359%] (p = 0.00 &lt; 0.05)
                        Performance has improved.
Found 4 outliers among 100 measurements (4.00%)
  4 (4.00%) high mild
filter_then_canonical/0.5
                        time:   [167.36 µs 168.97 µs 170.64 µs]
                        change: [-6.8090% -5.6899% -4.6273%] (p = 0.00 &lt; 0.05)
                        Performance has improved.
filter_then_canonical/0.9
                        time:   [267.43 µs 274.14 µs 280.44 µs]
                        change: [-5.1936% -0.6966% +3.0989%] (p = 0.77 &gt; 0.05)
                        No change in performance detected.
Found 2 outliers among 100 measurements (2.00%)
  2 (2.00%) high mild
filter_then_canonical/0.99
                        time:   [218.80 µs 222.99 µs 227.39 µs]
                        change: [-12.117% -9.8998% -7.5168%] (p = 0.00 &lt; 0.05)
                        Performance has improved.
Found 2 outliers among 100 measurements (2.00%)
  2 (2.00%) high mild
filter_then_canonical/0.999
                        time:   [235.28 µs 242.05 µs 249.06 µs]
                        change: [+7.4971% +11.002% +14.803%] (p = 0.00 &lt; 0.05)
                        Performance has regressed.
filter_then_canonical/1 time:   [44.103 µs 44.969 µs 45.811 µs]
                        change: [+1.1924% +2.5506% +4.0096%] (p = 0.00 &lt; 0.05)
                        Performance has regressed.
Found 16 outliers among 100 measurements (16.00%)
  6 (6.00%) high mild
  10 (10.00%) high severe

canonical_then_filter/0.001
                        time:   [42.622 µs 42.801 µs 43.047 µs]
                        change: [+2.8863% +3.8982% +5.2951%] (p = 0.00 &lt; 0.05)
                        Performance has regressed.
Found 12 outliers among 100 measurements (12.00%)
  12 (12.00%) high mild
canonical_then_filter/0.01
                        time:   [45.406 µs 46.116 µs 46.893 µs]
                        change: [+7.4139% +8.7784% +10.114%] (p = 0.00 &lt; 0.05)
                        Performance has regressed.
canonical_then_filter/0.1
                        time:   [55.605 µs 57.323 µs 59.165 µs]
                        change: [-3.8630% -1.9663% -0.0013%] (p = 0.06 &gt; 0.05)
                        No change in performance detected.
Found 13 outliers among 100 measurements (13.00%)
  12 (12.00%) low mild
  1 (1.00%) high mild
canonical_then_filter/0.5
                        time:   [58.457 µs 59.673 µs 61.200 µs]
                        change: [-3.9607% -1.5104% +0.6748%] (p = 0.23 &gt; 0.05)
                        No change in performance detected.
canonical_then_filter/0.9
                        time:   [101.31 µs 102.97 µs 104.51 µs]
                        change: [-5.6743% -4.0261% -2.4754%] (p = 0.00 &lt; 0.05)
                        Performance has improved.
canonical_then_filter/0.99
                        time:   [71.770 µs 72.499 µs 73.223 µs]
                        change: [+12.322% +14.444% +16.977%] (p = 0.00 &lt; 0.05)
                        Performance has regressed.
Found 4 outliers among 100 measurements (4.00%)
  1 (1.00%) low mild
  1 (1.00%) high mild
  2 (2.00%) high severe
canonical_then_filter/0.999
                        time:   [70.196 µs 71.040 µs 71.869 µs]
                        change: [+0.7287% +1.7791% +2.9515%] (p = 0.00 &lt; 0.05)
                        Change within noise threshold.
Found 7 outliers among 100 measurements (7.00%)
  2 (2.00%) low mild
  4 (4.00%) high mild
  1 (1.00%) high severe
canonical_then_filter/1 time:   [46.668 µs 47.762 µs 48.889 µs]
                        change: [+0.7750% +3.1566% +5.6672%] (p = 0.01 &lt; 0.05)
                        Change within noise threshold.
```


**AFTER**

```
filter_then_canonical/0.001
                        time:   [14.971 µs 15.118 µs 15.310 µs]
                        change: [-20.044% -19.194% -18.225%] (p = 0.00 &lt; 0.05)
                        Performance has improved.
Found 13 outliers among 100 measurements (13.00%)
  8 (8.00%) high mild
  5 (5.00%) high severe
filter_then_canonical/0.01
                        time:   [25.974 µs 26.012 µs 26.053 µs]
                        change: [-42.546% -42.237% -41.891%] (p = 0.00 &lt; 0.05)
                        Performance has improved.
Found 12 outliers among 100 measurements (12.00%)
  8 (8.00%) high mild
  4 (4.00%) high severe
filter_then_canonical/0.1
                        time:   [59.520 µs 59.888 µs 60.260 µs]
                        change: [+6.8618% +7.8890% +8.9357%] (p = 0.00 &lt; 0.05)
                        Performance has regressed.
Found 5 outliers among 100 measurements (5.00%)
  5 (5.00%) high mild
filter_then_canonical/0.5
                        time:   [167.38 µs 169.59 µs 171.66 µs]
                        change: [-2.4051% -1.4161% -0.3761%] (p = 0.01 &lt; 0.05)
                        Change within noise threshold.
Found 29 outliers among 100 measurements (29.00%)
  8 (8.00%) low severe
  2 (2.00%) low mild
  3 (3.00%) high mild
  16 (16.00%) high severe
filter_then_canonical/0.9
                        time:   [258.88 µs 259.30 µs 259.76 µs]
                        change: [-5.1144% -3.5762% -2.0444%] (p = 0.00 &lt; 0.05)
                        Performance has improved.
Found 5 outliers among 100 measurements (5.00%)
  2 (2.00%) high mild
  3 (3.00%) high severe
filter_then_canonical/0.99
                        time:   [187.55 µs 189.52 µs 191.52 µs]
                        change: [-13.451% -11.806% -10.251%] (p = 0.00 &lt; 0.05)
                        Performance has improved.
filter_then_canonical/0.999
                        time:   [179.95 µs 181.57 µs 182.99 µs]
                        change: [-32.654% -30.839% -28.982%] (p = 0.00 &lt; 0.05)
                        Performance has improved.
filter_then_canonical/1 time:   [42.540 µs 42.615 µs 42.681 µs]
                        change: [-3.6925% -2.5866% -1.5384%] (p = 0.00 &lt; 0.05)
                        Performance has improved.

canonical_then_filter/0.001
                        time:   [42.678 µs 42.759 µs 42.828 µs]
                        change: [-4.6690% -3.5527% -2.5030%] (p = 0.00 &lt; 0.05)
                        Performance has improved.
canonical_then_filter/0.01
                        time:   [47.107 µs 47.695 µs 48.234 µs]
                        change: [+2.0595% +3.5847% +5.1306%] (p = 0.00 &lt; 0.05)
                        Performance has regressed.
Found 26 outliers among 100 measurements (26.00%)
  13 (13.00%) low severe
  6 (6.00%) high mild
  7 (7.00%) high severe
canonical_then_filter/0.1
                        time:   [61.105 µs 61.594 µs 62.052 µs]
                        change: [-1.7992% +0.2192% +2.3952%] (p = 0.84 &gt; 0.05)
                        No change in performance detected.
canonical_then_filter/0.5
                        time:   [69.184 µs 69.549 µs 69.928 µs]
                        change: [+2.6673% +5.1603% +7.8241%] (p = 0.00 &lt; 0.05)
                        Performance has regressed.
Found 2 outliers among 100 measurements (2.00%)
  2 (2.00%) low mild
canonical_then_filter/0.9
                        time:   [103.06 µs 103.93 µs 104.90 µs]
                        change: [+4.4514% +6.2341% +7.9956%] (p = 0.00 &lt; 0.05)
                        Performance has regressed.
canonical_then_filter/0.99
                        time:   [68.937 µs 69.321 µs 69.680 µs]
                        change: [-8.9882% -7.3397% -5.9331%] (p = 0.00 &lt; 0.05)
                        Performance has improved.
Found 11 outliers among 100 measurements (11.00%)
  9 (9.00%) low mild
  2 (2.00%) high mild
canonical_then_filter/0.999
                        time:   [67.411 µs 67.657 µs 67.917 µs]
                        change: [-5.0588% -4.0257% -2.9739%] (p = 0.00 &lt; 0.05)
                        Performance has improved.
Found 1 outliers among 100 measurements (1.00%)
  1 (1.00%) high mild
canonical_then_filter/1 time:   [44.883 µs 45.997 µs 46.996 µs]
                        change: [-4.4256% -2.0499% +0.2734%] (p = 0.09 &gt; 0.05)
                        No change in performance detected.
```
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -229,3 +229,7 @@ debug = true
 [profile.benchtest]
 inherits = "bench"
 debug-assertions = true
+
+[profile.samply]
+inherits = "release"
+debug = true
diff --git a/bench-vortex/Cargo.toml b/bench-vortex/Cargo.toml
@@ -61,6 +61,7 @@ tokio = { workspace = true, features = ["full"] }
 uuid = { workspace = true, features = ["v4"] }
 vortex = { workspace = true, features = ["object_store", "parquet"] }
 vortex-datafusion = { workspace = true }
+vortex-mask = { workspace = true }
 xshell = { workspace = true }
 
 [dev-dependencies]
@@ -100,3 +101,8 @@ harness = false
 name = "clickbench"
 test = false
 harness = false
+
+[[bench]]
+name = "sel_vec"
+test = false
+harness = false
diff --git a/bench-vortex/benches/sel_vec.rs b/bench-vortex/benches/sel_vec.rs
@@ -0,0 +1,109 @@
+#![allow(unused_imports, unused, dead_code)]
+//! Benchmarks for applying a selection mask to a compressed array, either before or after
+//! converting the array to its canonical form.
+
+use criterion::{BenchmarkId, Criterion};
+use rand::Rng;
+use vortex::array::PrimitiveArray;
+use vortex::compute::filter;
+use vortex::dtype::{DType, Nullability, PType};
+use vortex::encoding::{ArrayEncodingRef, Encoding};
+use vortex::encodings::alp::{ALPArray, ALPEncoding};
+use vortex::sampling_compressor::compressors::alp::ALPCompressor;
+use vortex::sampling_compressor::compressors::bitpacked::{
+    BitPackedCompressor, BITPACK_NO_PATCHES, BITPACK_WITH_PATCHES,
+};
+use vortex::sampling_compressor::compressors::r#for::FoRCompressor;
+use vortex::sampling_compressor::compressors::EncodingCompressor;
+use vortex::sampling_compressor::SamplingCompressor;
+use vortex::variants::PrimitiveArrayTrait;
+use vortex::{ArrayData, IntoArrayData, IntoCanonical};
+use vortex_mask::Mask;
+
+/// Criterion benchmark comparing two ways of applying a selection mask to one compressed
+/// array: filtering before canonicalizing, and canonicalizing before filtering.
+///
+/// Each ordering gets its own benchmark group, with one case per mask selectivity.
+fn bench_sel_vec(c: &mut Criterion) {
+    // Number of elements in the array, and therefore the length of every mask.
+    const LEN: usize = 65536;
+    // Fraction of mask positions set to true in each benchmark case.
+    const SELECTIVITIES: [f64; 8] = [0.001, 0.01, 0.1, 0.5, 0.9, 0.99, 0.999, 1.0];
+
+    let mut group = c.benchmark_group("filter_then_canonical");
+
+    // Only allow ALP and bit-packing (without patches) when compressing.
+    let compressor = SamplingCompressor::default().including_only(&[
+        &ALPCompressor as &dyn EncodingCompressor,
+        &BITPACK_NO_PATCHES,
+    ]);
+
+    // Source data: f64 values 0.0, 0.2, 0.4, ... for the integers 0..=65535.
+    let plain = PrimitiveArray::from_iter((0..=65535).map(|i| 0.2f64 * (i as f64)));
+    assert_eq!(plain.ptype(), PType::F64);
+
+    let compressed = compressor
+        .compress(&plain.into_array(), None)
+        .unwrap()
+        .into_array();
+    assert_eq!(compressed.encoding().id(), ALPEncoding::ID);
+
+    println!("tree: {}", compressed.tree_display());
+
+    // Group 1: filter the compressed array, then canonicalize what is left.
+    for selectivity in SELECTIVITIES {
+        let true_count = (selectivity * LEN as f64) as usize;
+        // Random mask of length LEN with exactly `true_count` set positions.
+        let mask = create_mask(LEN, true_count);
+        assert_eq!(mask.len(), LEN);
+        assert_eq!(mask.true_count(), true_count);
+
+        let id = BenchmarkId::from_parameter(selectivity);
+        group.bench_with_input(id, &mask, |b, mask| {
+            b.iter(|| filter_then_canonical(&compressed, mask))
+        });
+    }
+    group.finish();
+
+    // Group 2: canonicalize the compressed array, then filter the canonical form.
+    let mut group = c.benchmark_group("canonical_then_filter");
+    for selectivity in SELECTIVITIES {
+        let true_count = (selectivity * LEN as f64) as usize;
+        // Random mask of length LEN with exactly `true_count` set positions.
+        let mask = create_mask(LEN, true_count);
+
+        let id = BenchmarkId::from_parameter(selectivity);
+        group.bench_with_input(id, &mask, |b, mask| {
+            b.iter(|| canonical_then_filter(&compressed, mask))
+        });
+    }
+    group.finish();
+}
+
+/// Filter `array` by `mask`, then convert the filtered result to its canonical form.
+///
+/// Unwraps both steps, so any error from `filter` or `into_canonical` panics.
+fn filter_then_canonical(array: &ArrayData, mask: &Mask) -> ArrayData {
+    filter(array, mask)
+        .unwrap()
+        .into_canonical()
+        .unwrap()
+        .into_array()
+}
+
+/// Convert a clone of `array` to its canonical form, then filter that form by `mask`.
+///
+/// Unwraps both steps, so any error from `into_canonical` or `filter` panics.
+fn canonical_then_filter(array: &ArrayData, mask: &Mask) -> ArrayData {
+    let canonical_form = array.clone().into_canonical().unwrap();
+    let canonical_array = canonical_form.into_array();
+    filter(&canonical_array, mask).unwrap()
+}
+
+/// Build a `Mask` of length `len` with exactly `true_count` positions set to true.
+///
+/// Positions are drawn at random from `rand::thread_rng` and redrawn whenever an already-set
+/// position comes up, so the mask differs from run to run.
+///
+/// # Panics
+///
+/// Panics if `true_count > len`: no such mask exists, and without this check the redraw loop
+/// below would never terminate.
+fn create_mask(len: usize, true_count: usize) -> Mask {
+    assert!(
+        true_count <= len,
+        "cannot set {true_count} values in a mask of length {len}"
+    );
+
+    let mut bits = vec![false; len];
+    let mut rng = rand::thread_rng();
+
+    // Count down the positions still to be set; a draw that hits a set position is retried.
+    let mut remaining = true_count;
+    while remaining > 0 {
+        let index = rng.gen_range(0..len);
+        if !bits[index] {
+            bits[index] = true;
+            remaining -= 1;
+        }
+    }
+    Mask::from_iter(bits)
+}
+
+// Register `bench_sel_vec` in the `sel_vec` group and pass that group to Criterion's main macro.
+criterion::criterion_group!(sel_vec, bench_sel_vec);
+criterion::criterion_main!(sel_vec);
diff --git a/vortex-array/src/patches.rs b/vortex-array/src/patches.rs
@@ -1,19 +1,21 @@
+use std::cmp::Ordering;
 use std::fmt::Debug;
 
 use itertools::Itertools as _;
+use num_traits::{AsPrimitive, NumCast, ToPrimitive};
 use serde::{Deserialize, Serialize};
 use vortex_buffer::BufferMut;
 use vortex_dtype::Nullability::NonNullable;
-use vortex_dtype::{match_each_integer_ptype, DType, PType};
+use vortex_dtype::{match_each_integer_ptype, DType, NativePType, PType};
 use vortex_error::{vortex_bail, VortexExpect, VortexResult};
 use vortex_mask::Mask;
 use vortex_scalar::Scalar;
 
 use crate::aliases::hash_map::HashMap;
 use crate::array::PrimitiveArray;
 use crate::compute::{
-    scalar_at, search_sorted, search_sorted_usize, search_sorted_usize_many, slice, sub_scalar,
-    take, SearchResult, SearchSortedSide,
+    filter, scalar_at, search_sorted, search_sorted_usize, search_sorted_usize_many, slice,
+    sub_scalar, take, SearchResult, SearchSortedSide,
 };
 use crate::stats::{ArrayStatistics, Stat};
 use crate::variants::PrimitiveArrayTrait;
@@ -212,33 +214,14 @@ impl Patches {
             return Ok(None);
         }
 
-        // TODO(ngates): add functions to operate with Mask directly
-        let buffer = mask.boolean_buffer();
-        let mut coordinate_indices = BufferMut::<u64>::empty();
-        let mut value_indices = BufferMut::<u64>::empty();
-        let mut last_inserted_index: usize = 0;
-
         let flat_indices = self.indices().clone().into_primitive()?;
         match_each_integer_ptype!(flat_indices.ptype(), |$I| {
-            for (value_idx, coordinate) in flat_indices.as_slice::<$I>().iter().enumerate() {
-                if buffer.value(*coordinate as usize) {
-                    // We count the number of truthy values between this coordinate and the previous truthy one
-                    let adjusted_coordinate = buffer.slice(last_inserted_index, (*coordinate as usize) - last_inserted_index).count_set_bits() as u64;
-                    coordinate_indices.push(adjusted_coordinate + coordinate_indices.last().copied().unwrap_or_default());
-                    last_inserted_index = *coordinate as usize;
-                    value_indices.push(value_idx as u64);
-                }
-            }
-        });
-
-        if coordinate_indices.is_empty() {
-            return Ok(None);
-        }
-
-        let indices = coordinate_indices.into_array();
-        let values = take(self.values(), value_indices.into_array())?;
-
-        Ok(Some(Self::new(mask.len(), indices, values)))
+            filter_patches_with_mask(
+                flat_indices.as_slice::<$I>(),
+                self.values(),
+                mask
+            )
+        })
     }
 
     /// Slice the patches by a range of the patched array.
@@ -396,6 +379,97 @@ impl Patches {
     }
 }
 
+/// Filter patches with the provided mask (in flattened space).
+///
+/// `patch_indices` and the true positions of `mask` are walked together in ascending order; a
+/// patch is kept when its index is also selected by the mask. The mask may select positions
+/// that carry no patch.
+///
+/// Each kept patch is re-indexed to its rank among the mask's true positions, and the matching
+/// entries of `patch_values` are selected with `filter`. The returned `Patches` is built with an
+/// array length of `mask.true_count()`.
+///
+/// Returns `Ok(None)` when no patch is selected by the mask.
+///
+/// Relies on `vortex_expect` for the conversion of each inspected patch index to `usize`.
+///
+/// NOTE(review): the merge assumes `patch_indices` is sorted ascending - confirm that callers
+/// guarantee this.
+fn filter_patches_with_mask<T: ToPrimitive + Copy + Ord>(
+    patch_indices: &[T],
+    patch_values: &ArrayData,
+    mask: &Mask,
+) -> VortexResult<Option<Patches>> {
+    // Number of elements to jump over at once when a window of one side is known to lie
+    // entirely below the current element of the other side. This assumes patches are sparse
+    // relative to the mask, so that most mask positions can be skipped.
+    const STRIDE: usize = 4;
+
+    let true_count = mask.true_count();
+    let mask_indices = mask.indices();
+
+    // Every kept patch pairs one patch with one true mask position, so the number of matches
+    // can never exceed the smaller of the two counts.
+    let max_matches = patch_indices.len().min(true_count);
+    // Rank of each kept patch among the mask's true positions (its index after filtering).
+    let mut new_patch_indices = BufferMut::<u64>::with_capacity(max_matches);
+    // Position of each kept patch within `patch_indices` / `patch_values`.
+    let mut kept_value_indices = Vec::with_capacity(max_matches);
+
+    // Cursor into `patch_indices`.
+    let mut patch_idx = 0usize;
+    // Cursor into `mask_indices`.
+    let mut true_idx = 0usize;
+
+    while patch_idx < patch_indices.len() && true_idx < true_count {
+        // Fast path: compare the value ranges of a STRIDE-wide window on each side. If one
+        // window lies entirely below the current element of the other side, nothing in it can
+        // match, so that cursor jumps forward by STRIDE.
+        if (patch_idx + STRIDE) < patch_indices.len() && (true_idx + STRIDE) < mask_indices.len() {
+            let left_min = patch_indices[patch_idx]
+                .to_usize()
+                .vortex_expect("left_min");
+            let left_max = patch_indices[patch_idx + STRIDE]
+                .to_usize()
+                .vortex_expect("left_max");
+            let right_min = mask_indices[true_idx];
+            let right_max = mask_indices[true_idx + STRIDE];
+
+            if left_min > right_max {
+                // All mask positions in the window are below the current patch index.
+                true_idx += STRIDE;
+                continue;
+            }
+            if right_min > left_max {
+                // All patch indices in the window are below the current mask position.
+                patch_idx += STRIDE;
+                continue;
+            }
+            // The windows may overlap: fall through to the element-wise comparison.
+        }
+
+        // Slow path: one step of a two-way merge over the two sorted index sequences.
+        let left = patch_indices[patch_idx].to_usize().vortex_expect("left");
+        let right = mask_indices[true_idx];
+
+        match left.cmp(&right) {
+            Ordering::Less => {
+                // This patch is not selected by the mask.
+                patch_idx += 1;
+            }
+            Ordering::Greater => {
+                // This mask position carries no patch.
+                true_idx += 1;
+            }
+            Ordering::Equal => {
+                // The patch is selected: remember which value to keep and its new index.
+                kept_value_indices.push(patch_idx);
+                new_patch_indices.push(true_idx as u64);
+
+                patch_idx += 1;
+                true_idx += 1;
+            }
+        }
+    }
+
+    if kept_value_indices.is_empty() {
+        return Ok(None);
+    }
+
+    let new_patch_indices = new_patch_indices.into_array();
+    // Select the kept values by filtering with a mask over the patch-value positions.
+    let new_patch_values = filter(
+        patch_values,
+        &Mask::from_indices(patch_values.len(), kept_value_indices),
+    )?;
+
+    Ok(Some(Patches::new(
+        true_count,
+        new_patch_indices,
+        new_patch_values,
+    )))
+}
+
 #[cfg(test)]
 mod test {
     use rstest::{fixture, rstest};