feat: teach RunEndArray NullCount and TrueCount (#2007)

danking · web-flow · commit ee7abec8b980 · 2025-01-22T15:45:25.000Z
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/encodings/runend/Cargo.toml b/encodings/runend/Cargo.toml
@@ -31,7 +31,12 @@ workspace = true
 [dev-dependencies]
 vortex-array = { workspace = true, features = ["test-harness"] }
 criterion = { workspace = true }
+rand = { workspace = true }
 
 [[bench]]
 name = "run_end_filter"
 harness = false
+
+[[bench]]
+name = "run_end_null_count"
+harness = false
diff --git a/encodings/runend/benches/run_end_null_count.rs b/encodings/runend/benches/run_end_null_count.rs
@@ -0,0 +1,60 @@
+#![allow(clippy::unwrap_used)]
+
+use std::iter::Iterator;
+
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use rand::rngs::StdRng;
+use rand::{Rng, SeedableRng as _};
+use vortex_array::array::PrimitiveArray;
+use vortex_array::stats::Stat;
+use vortex_array::IntoArrayData;
+use vortex_buffer::Buffer;
+use vortex_runend::RunEndArray;
+
+const LENS: [usize; 2] = [1000, 100_000];
+
+/// Benchmark `Stat::NullCount` on RunEnd arrays whose run ends are evenly spaced and whose
+/// values are each valid with a fixed probability, drawn from a deterministically seeded RNG.
+fn run_end_null_count(c: &mut Criterion) {
+    const RUN_STEPS: [usize; 4] = [1 << 2, 1 << 4, 1 << 8, 1 << 16];
+    const VALID_DENSITIES: [f64; 3] = [0.01, 0.1, 0.5];
+
+    let mut rng = StdRng::seed_from_u64(0);
+    let mut group = c.benchmark_group("run_end_null_count");
+
+    // Visit the lengths from the last entry of `LENS` to the first.
+    for &n in LENS.iter().rev() {
+        for run_step in RUN_STEPS {
+            // The ends are the multiples of `run_step` in `0..=n`, including 0 itself.
+            let ends: Buffer<u64> = (0..=n).step_by(run_step).map(|end| end as u64).collect();
+            let ends = ends.into_array();
+            let run_count = ends.len() - 1;
+
+            for valid_density in VALID_DENSITIES {
+                // One value per end; each is non-null with probability `valid_density`.
+                let values = PrimitiveArray::from_option_iter(
+                    (0..ends.len()).map(|i| rng.gen_bool(valid_density).then_some(i as u64)),
+                )
+                .into_array();
+                let array = RunEndArray::try_new(ends.clone(), values)
+                    .unwrap()
+                    .into_array();
+                let name = format!(
+                    "null_count_run_end n: {}, run_count: {}, valid_density: {}",
+                    n, run_count, valid_density
+                );
+
+                group.bench_function(name, |b| {
+                    b.iter(|| {
+                        // Invoke the encoding's `compute_statistics` directly.
+                        let stats = array.encoding().compute_statistics(&array, Stat::NullCount);
+                        black_box(stats.unwrap())
+                    });
+                });
+            }
+        }
+    }
+}
+
+criterion_group!(benches, run_end_null_count);
+criterion_main!(benches);
diff --git a/encodings/runend/src/array.rs b/encodings/runend/src/array.rs
@@ -6,7 +6,7 @@ use vortex_array::compute::{
     scalar_at, search_sorted_usize, search_sorted_usize_many, SearchSortedSide,
 };
 use vortex_array::encoding::ids;
-use vortex_array::stats::{ArrayStatistics, Stat, StatisticsVTable, StatsSet};
+use vortex_array::stats::{ArrayStatistics, StatsSet};
 use vortex_array::validate::ValidateVTable;
 use vortex_array::validity::{ArrayValidity, LogicalValidity, ValidityVTable};
 use vortex_array::variants::{BoolArrayTrait, PrimitiveArrayTrait, VariantsVTable};
@@ -18,7 +18,6 @@ use vortex_array::{
 use vortex_buffer::Buffer;
 use vortex_dtype::{DType, PType};
 use vortex_error::{vortex_bail, VortexExpect as _, VortexResult};
-use vortex_scalar::Scalar;
 
 use crate::compress::{runend_decode_bools, runend_decode_primitive, runend_encode};
 
@@ -224,29 +223,6 @@ impl VisitorVTable<RunEndArray> for RunEndEncoding {
     }
 }
 
-impl StatisticsVTable<RunEndArray> for RunEndEncoding {
-    fn compute_statistics(&self, array: &RunEndArray, stat: Stat) -> VortexResult<StatsSet> {
-        let maybe_stat = match stat {
-            Stat::Min | Stat::Max => array.values().statistics().compute(stat),
-            Stat::IsSorted => Some(Scalar::from(
-                array
-                    .values()
-                    .statistics()
-                    .compute_is_sorted()
-                    .unwrap_or(false)
-                    && array.logical_validity().all_valid(),
-            )),
-            _ => None,
-        };
-
-        let mut stats = StatsSet::default();
-        if let Some(stat_value) = maybe_stat {
-            stats.set(stat, stat_value);
-        }
-        Ok(stats)
-    }
-}
-
 #[cfg(test)]
 mod tests {
     use vortex_array::compute::scalar_at;
diff --git a/encodings/runend/src/lib.rs b/encodings/runend/src/lib.rs
@@ -4,6 +4,7 @@ mod array;
 pub mod compress;
 mod compute;
 mod iter;
+mod statistics;
 
 #[doc(hidden)]
 pub mod _benchmarking {
diff --git a/encodings/runend/src/statistics.rs b/encodings/runend/src/statistics.rs
@@ -0,0 +1,244 @@
+use std::cmp;
+
+use arrow_buffer::BooleanBuffer;
+use itertools::Itertools;
+use vortex_array::stats::{ArrayStatistics as _, Stat, StatisticsVTable, StatsSet};
+use vortex_array::validity::{ArrayValidity as _, LogicalValidity};
+use vortex_array::variants::PrimitiveArrayTrait;
+use vortex_array::{ArrayDType as _, ArrayLen as _, IntoArrayVariant as _};
+use vortex_dtype::{match_each_unsigned_integer_ptype, DType, NativePType};
+use vortex_error::VortexResult;
+use vortex_scalar::Scalar;
+
+use crate::{RunEndArray, RunEndEncoding};
+
+/// Statistics for run-end encoded arrays, computed from the run ends and the run values.
+impl StatisticsVTable<RunEndArray> for RunEndEncoding {
+    /// Computes `stat` for `array`, returning a set with at most that one entry; the set is empty
+    /// when the statistic is unsupported here or no value could be obtained for it.
+    fn compute_statistics(&self, array: &RunEndArray, stat: Stat) -> VortexResult<StatsSet> {
+        let computed = match stat {
+            Stat::Min | Stat::Max => array.values().statistics().compute(stat),
+            Stat::IsSorted => {
+                let values_sorted = array.values().statistics().compute_is_sorted();
+                Some(Scalar::from(
+                    values_sorted.unwrap_or(false) && array.logical_validity().all_valid(),
+                ))
+            }
+            Stat::TrueCount if matches!(array.dtype(), DType::Bool(_)) => {
+                Some(Scalar::from(array.true_count()?))
+            }
+            Stat::NullCount => Some(Scalar::from(array.null_count()?)),
+            _ => None,
+        };
+
+        let mut stats = StatsSet::default();
+        if let Some(value) = computed {
+            stats.set(stat, value);
+        }
+        Ok(stats)
+    }
+}
+
+/// Logical `TrueCount` and `NullCount` derived from the run ends and the run values.
+///
+/// Run ends are compared against `offset()` and `offset() + len()`, so every run is clamped to
+/// that window before it is counted. This keeps the counts right for sliced arrays, whose first
+/// and last runs may be only partially visible.
+impl RunEndArray {
+    /// Counts the logical elements that are valid and `true`.
+    ///
+    /// # Errors
+    ///
+    /// Propagates failures to decode the ends as a primitive array or the values as a bool array.
+    fn true_count(&self) -> VortexResult<u64> {
+        let ends = self.ends().into_primitive()?;
+        let values = self.values().into_bool()?.boolean_buffer();
+
+        match_each_unsigned_integer_ptype!(ends.ptype(), |$P| self.typed_true_count(ends.as_slice::<$P>(), values))
+    }
+
+    /// Number of logical elements contributed by run `run`: the overlap between the run's span
+    /// `[previous end, own end)` and the window `[offset, offset + len)`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `run` is out of bounds for `ends`.
+    fn visible_run_len<P: NativePType + Into<u64>>(&self, ends: &[P], run: usize) -> u64 {
+        let window_begin = self.offset() as u64;
+        let window_end = window_begin + self.len() as u64;
+        // The first run has no predecessor, so its span starts at position 0.
+        let run_begin: u64 = match run.checked_sub(1) {
+            Some(previous) => ends[previous].into(),
+            None => 0,
+        };
+        let run_end: u64 = ends[run].into();
+        // Saturate so that a run lying entirely outside the window contributes nothing.
+        cmp::min(run_end, window_end).saturating_sub(cmp::max(run_begin, window_begin))
+    }
+
+    /// Counts valid `true` elements given the already decoded ends and values.
+    ///
+    /// Sums the visible length of every run whose value is `true` and, when the values carry a
+    /// validity array, whose validity bit is set.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the values are all valid but their count differs from the number of ends.
+    fn typed_true_count<P: NativePType + Into<u64>>(
+        &self,
+        decompressed_ends: &[P],
+        decompressed_values: BooleanBuffer,
+    ) -> VortexResult<u64> {
+        let run_len = |run: usize| self.visible_run_len(decompressed_ends, run);
+        let true_count: u64 = match self.values().logical_validity() {
+            // Every run is valid, so the value bit alone decides whether the run is counted.
+            LogicalValidity::AllValid(_) => (0..decompressed_ends.len())
+                .zip_eq(&decompressed_values)
+                .filter(|&(_, is_true)| is_true)
+                .map(|(run, _)| run_len(run))
+                .sum(),
+            LogicalValidity::AllInvalid(_) => 0,
+            LogicalValidity::Array(is_valid) => {
+                let is_valid = is_valid.into_bool()?.boolean_buffer();
+                is_valid
+                    .set_indices()
+                    // A null run never counts as `true`, whatever its stored value bit is.
+                    .filter(|&run| decompressed_values.value(run))
+                    .map(run_len)
+                    .sum()
+            }
+        };
+        Ok(true_count)
+    }
+
+    /// Counts the logical elements that are null.
+    ///
+    /// # Errors
+    ///
+    /// Propagates failures to decode the ends or the validity of the values.
+    fn null_count(&self) -> VortexResult<u64> {
+        let null_count = match self.values().logical_validity() {
+            LogicalValidity::AllValid(_) => 0u64,
+            LogicalValidity::AllInvalid(_) => self.len() as u64,
+            LogicalValidity::Array(is_valid) => {
+                // The ends only matter, and are only decoded, when validity varies per run.
+                let ends = self.ends().into_primitive()?;
+                let is_valid = is_valid.into_bool()?.boolean_buffer();
+                match_each_unsigned_integer_ptype!(ends.ptype(), |$P| self.null_count_with_array_validity(ends.as_slice::<$P>(), is_valid))
+            }
+        };
+        Ok(null_count)
+    }
+
+    /// Counts nulls given a validity array: `len()` minus the visible length of every valid run.
+    fn null_count_with_array_validity<P: NativePType + Into<u64>>(
+        &self,
+        decompressed_ends: &[P],
+        is_valid: BooleanBuffer,
+    ) -> u64 {
+        let valid_count: u64 = is_valid
+            .set_indices()
+            .map(|run| self.visible_run_len(decompressed_ends, run))
+            .sum();
+        self.len() as u64 - valid_count
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use arrow_buffer::BooleanBuffer;
+    use vortex_array::array::BoolArray;
+    use vortex_array::compute::slice;
+    use vortex_array::stats::{ArrayStatistics as _, Stat};
+    use vortex_array::validity::Validity;
+    use vortex_array::IntoArrayData;
+    use vortex_buffer::buffer;
+
+    use crate::RunEndArray;
+
+    /// Non-nullable integers: three runs ending at 2, 5 and 10 with the values 1, 2 and 3.
+    /// Checks min, max, null count and sortedness.
+    #[test]
+    fn test_runend_int_stats() {
+        let ends = buffer![2u32, 5, 10].into_array();
+        let values = buffer![1i32, 2, 3].into_array();
+        let arr = RunEndArray::try_new(ends, values).unwrap();
+        let stats = arr.statistics();
+
+        let min = stats.compute_as::<i32>(Stat::Min).unwrap();
+        assert_eq!(min, 1);
+
+        let max = stats.compute_as::<i32>(Stat::Max).unwrap();
+        assert_eq!(max, 3);
+
+        let null_count = stats.compute_as::<u64>(Stat::NullCount).unwrap();
+        assert_eq!(null_count, 0);
+
+        let is_sorted = stats.compute_as::<bool>(Stat::IsSorted).unwrap();
+        assert!(is_sorted);
+    }
+
+    /// Nullable bools: runs ending at 2, 5 and 10 hold `true`, null and `false`. Checked on the
+    /// whole array and again on the slice `4..7`.
+    #[test]
+    fn test_runend_bool_stats() {
+        let ends = buffer![2u32, 5, 10].into_array();
+        let bits = BooleanBuffer::from_iter([true, true, false]);
+        let validity = Validity::Array(BoolArray::from_iter([true, false, true]).into_array());
+        let values = BoolArray::try_new(bits, validity).unwrap().into_array();
+        let arr = RunEndArray::try_new(ends, values).unwrap();
+
+        // Whole array: the null run covers positions 2..5.
+        let stats = arr.statistics();
+        let min = stats.compute_as::<bool>(Stat::Min).unwrap();
+        assert!(!min);
+
+        let max = stats.compute_as::<bool>(Stat::Max).unwrap();
+        assert!(max);
+
+        let null_count = stats.compute_as::<u64>(Stat::NullCount).unwrap();
+        assert_eq!(null_count, 3);
+
+        let is_sorted = stats.compute_as::<bool>(Stat::IsSorted).unwrap();
+        assert!(!is_sorted);
+
+        let true_count = stats.compute_as::<u64>(Stat::TrueCount).unwrap();
+        assert_eq!(true_count, 2);
+
+        // Slice 4..7: a single null at position 4, then two `false` values.
+        let sliced = slice(&arr, 4, 7).unwrap();
+        let stats = sliced.statistics();
+        let min = stats.compute_as::<bool>(Stat::Min).unwrap();
+        assert!(!min);
+
+        let max = stats.compute_as::<bool>(Stat::Max).unwrap();
+        assert!(!max);
+
+        let null_count = stats.compute_as::<u64>(Stat::NullCount).unwrap();
+        assert_eq!(null_count, 1);
+
+        // Not sorted because null must come last
+        let is_sorted = stats.compute_as::<bool>(Stat::IsSorted).unwrap();
+        assert!(!is_sorted);
+
+        let true_count = stats.compute_as::<u64>(Stat::TrueCount).unwrap();
+        assert_eq!(true_count, 0);
+    }
+
+    /// All-null bools over runs ending at 2, 5 and 10: nothing is `true` and all 10 elements
+    /// are null.
+    #[test]
+    fn test_all_invalid_true_count() {
+        let ends = buffer![2u32, 5, 10].into_array();
+        let values = BoolArray::from_iter([None, None, None]).into_array();
+        let arr = RunEndArray::try_new(ends, values).unwrap().into_array();
+        let stats = arr.statistics();
+
+        let true_count = stats.compute_as::<u64>(Stat::TrueCount).unwrap();
+        assert_eq!(true_count, 0);
+
+        let null_count = stats.compute_as::<u64>(Stat::NullCount).unwrap();
+        assert_eq!(null_count, 10);
+    }
+}