diff --git a/Cargo.lock b/Cargo.lock index 5e30f79e4e9..6caf82aba72 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5263,6 +5263,7 @@ dependencies = [ "criterion", "itertools 0.14.0", "num-traits", + "rand", "serde", "vortex-array", "vortex-buffer", diff --git a/encodings/runend/Cargo.toml b/encodings/runend/Cargo.toml index 3fcbae85b3e..e08d80cfba4 100644 --- a/encodings/runend/Cargo.toml +++ b/encodings/runend/Cargo.toml @@ -31,7 +31,12 @@ workspace = true [dev-dependencies] vortex-array = { workspace = true, features = ["test-harness"] } criterion = { workspace = true } +rand = { workspace = true } [[bench]] name = "run_end_filter" harness = false + +[[bench]] +name = "run_end_null_count" +harness = false diff --git a/encodings/runend/benches/run_end_null_count.rs b/encodings/runend/benches/run_end_null_count.rs new file mode 100644 index 00000000000..dd7bba5f9c7 --- /dev/null +++ b/encodings/runend/benches/run_end_null_count.rs @@ -0,0 +1,60 @@ +#![allow(clippy::unwrap_used)] + +use std::iter::Iterator; + +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use rand::rngs::StdRng; +use rand::{Rng, SeedableRng as _}; +use vortex_array::array::PrimitiveArray; +use vortex_array::stats::Stat; +use vortex_array::IntoArrayData; +use vortex_buffer::Buffer; +use vortex_runend::RunEndArray; + +const LENS: [usize; 2] = [1000, 100_000]; + +/// Create RunEnd arrays with equal-size runs whose values are randomly valid per `valid_density`.
+fn run_end_null_count(c: &mut Criterion) { + let mut rng = StdRng::seed_from_u64(0); + let mut group = c.benchmark_group("run_end_null_count"); + + for &n in LENS.iter().rev() { + for run_step in [1usize << 2, 1 << 4, 1 << 8, 1 << 16] { + let ends = (0..=n) + .step_by(run_step) + .map(|x| x as u64) + .collect::>() + .into_array(); + let run_count = ends.len() - 1; + for valid_density in [0.01, 0.1, 0.5] { + let values = PrimitiveArray::from_option_iter( + (0..ends.len()).map(|x| rng.gen_bool(valid_density).then_some(x as u64)), + ) + .into_array(); + let array = RunEndArray::try_new(ends.clone(), values) + .unwrap() + .into_array(); + + group.bench_function( + format!( + "null_count_run_end n: {}, run_count: {}, valid_density: {}", + n, run_count, valid_density + ), + |b| { + b.iter(|| { + black_box( + array + .encoding() + .compute_statistics(&array, Stat::NullCount) + .unwrap(), + ) + }); + }, + ); + } + } + } +} + +criterion_group!(benches, run_end_null_count); +criterion_main!(benches); diff --git a/encodings/runend/src/array.rs b/encodings/runend/src/array.rs index 8a6c8a25080..cd3ee055be9 100644 --- a/encodings/runend/src/array.rs +++ b/encodings/runend/src/array.rs @@ -6,7 +6,7 @@ use vortex_array::compute::{ scalar_at, search_sorted_usize, search_sorted_usize_many, SearchSortedSide, }; use vortex_array::encoding::ids; -use vortex_array::stats::{ArrayStatistics, Stat, StatisticsVTable, StatsSet}; +use vortex_array::stats::{ArrayStatistics, StatsSet}; use vortex_array::validate::ValidateVTable; use vortex_array::validity::{ArrayValidity, LogicalValidity, ValidityVTable}; use vortex_array::variants::{BoolArrayTrait, PrimitiveArrayTrait, VariantsVTable}; @@ -18,7 +18,6 @@ use vortex_array::{ use vortex_buffer::Buffer; use vortex_dtype::{DType, PType}; use vortex_error::{vortex_bail, VortexExpect as _, VortexResult}; -use vortex_scalar::Scalar; use crate::compress::{runend_decode_bools, runend_decode_primitive, runend_encode}; @@ -225,29 +224,6 @@ impl 
VisitorVTable for RunEndEncoding { } } -impl StatisticsVTable for RunEndEncoding { - fn compute_statistics(&self, array: &RunEndArray, stat: Stat) -> VortexResult { - let maybe_stat = match stat { - Stat::Min | Stat::Max => array.values().statistics().compute(stat), - Stat::IsSorted => Some(Scalar::from( - array - .values() - .statistics() - .compute_is_sorted() - .unwrap_or(false) - && array.logical_validity().all_valid(), - )), - _ => None, - }; - - let mut stats = StatsSet::default(); - if let Some(stat_value) = maybe_stat { - stats.set(stat, stat_value); - } - Ok(stats) - } -} - #[cfg(test)] mod tests { use vortex_array::compute::scalar_at; diff --git a/encodings/runend/src/lib.rs b/encodings/runend/src/lib.rs index e601f67973e..d4a77c993ce 100644 --- a/encodings/runend/src/lib.rs +++ b/encodings/runend/src/lib.rs @@ -4,6 +4,7 @@ mod array; pub mod compress; mod compute; mod iter; +mod statistics; #[doc(hidden)] pub mod _benchmarking { diff --git a/encodings/runend/src/statistics.rs b/encodings/runend/src/statistics.rs new file mode 100644 index 00000000000..ba5bc1599e5 --- /dev/null +++ b/encodings/runend/src/statistics.rs @@ -0,0 +1,258 @@ +use std::cmp; + +use arrow_buffer::BooleanBuffer; +use itertools::Itertools; +use vortex_array::stats::{ArrayStatistics as _, Stat, StatisticsVTable, StatsSet}; +use vortex_array::validity::{ArrayValidity as _, LogicalValidity}; +use vortex_array::variants::PrimitiveArrayTrait; +use vortex_array::{ArrayDType as _, ArrayLen as _, IntoArrayVariant as _}; +use vortex_dtype::{match_each_unsigned_integer_ptype, DType, NativePType}; +use vortex_error::VortexResult; +use vortex_scalar::Scalar; + +use crate::{RunEndArray, RunEndEncoding}; + +impl StatisticsVTable<RunEndArray> for RunEndEncoding { + /// Computes `Min`, `Max`, `IsSorted`, `TrueCount` (boolean arrays only) and `NullCount`. + /// Any other statistic yields an empty `StatsSet`. + fn compute_statistics(&self, array: &RunEndArray, stat: Stat) -> VortexResult<StatsSet> { + let maybe_stat = match stat { + Stat::Min | Stat::Max => array.values().statistics().compute(stat), + Stat::IsSorted => Some(Scalar::from( + array + .values() +
.statistics() + .compute_is_sorted() + .unwrap_or(false) + && array.logical_validity().all_valid(), + )), + Stat::TrueCount => match array.dtype() { + DType::Bool(_) => Some(Scalar::from(array.true_count()?)), + _ => None, + }, + Stat::NullCount => Some(Scalar::from(array.null_count()?)), + _ => None, + }; + + let mut stats = StatsSet::default(); + if let Some(stat_value) = maybe_stat { + stats.set(stat, stat_value); + } + Ok(stats) + } +} + +impl RunEndArray { + /// Counts the logical positions whose boolean value is valid and `true`. + fn true_count(&self) -> VortexResult<u64> { + let ends = self.ends().into_primitive()?; + let values = self.values().into_bool()?.boolean_buffer(); + + match_each_unsigned_integer_ptype!(ends.ptype(), |$P| self.typed_true_count(ends.as_slice::<$P>(), values)) + } + + /// Sums the clamped lengths of the runs whose value is valid and `true`. + /// + /// The first run is measured from `offset()` and every run end is capped at the sum of + /// `offset()` and `len()`, so the parts of a run outside a slice are not counted. + fn typed_true_count<P: NativePType + Into<u64>>( + &self, + decompressed_ends: &[P], + decompressed_values: BooleanBuffer, + ) -> VortexResult<u64> { + Ok(match self.values().logical_validity() { + LogicalValidity::AllValid(_) => { + let mut begin = self.offset() as u64; + let offsetted_len = (self.len() + self.offset()) as u64; + decompressed_ends + .iter() + .copied() + .zip_eq(&decompressed_values) + .map(|(end, bool_value)| { + // The last run may extend past the end of a sliced array. + let end = cmp::min(end.into(), offsetted_len); + let len = end - begin; + begin = end; + len * u64::from(bool_value) + }) + .sum() + } + LogicalValidity::AllInvalid(_) => 0, + LogicalValidity::Array(is_valid) => { + let is_valid = is_valid.into_bool()?.boolean_buffer(); + let mut is_valid = is_valid.set_indices(); + match is_valid.next() { + None => 0, + Some(valid_index) => { + let mut true_count: u64 = 0; + let offsetted_begin = self.offset() as u64; + let offsetted_len = (self.len() + self.offset()) as u64; + let begin = if valid_index == 0 { + offsetted_begin + } else { + decompressed_ends[valid_index - 1].into() + }; + + let end = cmp::min(decompressed_ends[valid_index].into(), offsetted_len); + true_count += decompressed_values.value(valid_index) as u64 * (end - begin); + + for valid_index in is_valid { + // Every later valid run starts where the preceding run ends. + let prev_end: u64 = decompressed_ends[valid_index - 1].into(); + let valid_end: u64 = decompressed_ends[valid_index].into(); + let end = cmp::min(valid_end,
offsetted_len); + true_count += + decompressed_values.value(valid_index) as u64 * (end - prev_end); + } + + true_count + } + } + } + }) + } + + /// Counts the logical positions that belong to runs whose value is null. + fn null_count(&self) -> VortexResult<u64> { + let ends = self.ends().into_primitive()?; + let null_count = match self.values().logical_validity() { + LogicalValidity::AllValid(_) => 0u64, + LogicalValidity::AllInvalid(_) => self.len() as u64, + LogicalValidity::Array(is_valid) => { + let is_valid = is_valid.into_bool()?.boolean_buffer(); + match_each_unsigned_integer_ptype!(ends.ptype(), |$P| self.null_count_with_array_validity(ends.as_slice::<$P>(), is_valid)) + } + }; + Ok(null_count) + } + + /// Starts from `len()` and subtracts the clamped length of every run whose value is valid. + fn null_count_with_array_validity<P: NativePType + Into<u64>>( + &self, + decompressed_ends: &[P], + is_valid: BooleanBuffer, + ) -> u64 { + let mut is_valid = is_valid.set_indices(); + match is_valid.next() { + None => self.len() as u64, + Some(valid_index) => { + let offsetted_len = (self.len() + self.offset()) as u64; + let mut null_count: u64 = self.len() as u64; + let begin = if valid_index == 0 { + // The first run starts at the slice offset, not at position zero. + self.offset() as u64 + } else { + decompressed_ends[valid_index - 1].into() + }; + + let end = cmp::min(decompressed_ends[valid_index].into(), offsetted_len); + null_count -= end - begin; + + for valid_index in is_valid { + let end = cmp::min(decompressed_ends[valid_index].into(), offsetted_len); + null_count -= end - decompressed_ends[valid_index - 1].into(); + } + + null_count + } + } + } +} + +#[cfg(test)] +mod tests { + use arrow_buffer::BooleanBuffer; + use vortex_array::array::BoolArray; + use vortex_array::compute::slice; + use vortex_array::stats::{ArrayStatistics as _, Stat}; + use vortex_array::validity::Validity; + use vortex_array::IntoArrayData; + use vortex_buffer::buffer; + + use crate::RunEndArray; + + #[test] + fn test_runend_int_stats() { + let arr = RunEndArray::try_new( + buffer![2u32, 5, 10].into_array(), + buffer![1i32, 2, 3].into_array(), + ) + .unwrap(); + + assert_eq!(arr.statistics().compute_as::(Stat::Min).unwrap(), 1); +
assert_eq!(arr.statistics().compute_as::(Stat::Max).unwrap(), 3); + assert_eq!( + arr.statistics().compute_as::(Stat::NullCount).unwrap(), + 0 + ); + assert!(arr.statistics().compute_as::(Stat::IsSorted).unwrap()); + } + + #[test] + fn test_runend_bool_stats() { + let arr = RunEndArray::try_new( + buffer![2u32, 5, 10].into_array(), + BoolArray::try_new( + BooleanBuffer::from_iter([true, true, false]), + Validity::Array(BoolArray::from_iter([true, false, true]).into_array()), + ) + .unwrap() + .into_array(), + ) + .unwrap(); + + assert!(!arr.statistics().compute_as::(Stat::Min).unwrap()); + assert!(arr.statistics().compute_as::(Stat::Max).unwrap()); + assert_eq!( + arr.statistics().compute_as::(Stat::NullCount).unwrap(), + 3 + ); + assert!(!arr.statistics().compute_as::(Stat::IsSorted).unwrap()); + assert_eq!( + arr.statistics().compute_as::(Stat::TrueCount).unwrap(), + 2 + ); + + let sliced = slice(arr, 4, 7).unwrap(); + + assert!(!sliced.statistics().compute_as::(Stat::Min).unwrap()); + assert!(!sliced.statistics().compute_as::(Stat::Max).unwrap()); + assert_eq!( + sliced + .statistics() + .compute_as::(Stat::NullCount) + .unwrap(), + 1 + ); + // Not sorted because null must come last + assert!(!sliced + .statistics() + .compute_as::(Stat::IsSorted) + .unwrap()); + assert_eq!( + sliced + .statistics() + .compute_as::(Stat::TrueCount) + .unwrap(), + 0 + ); + } + + #[test] + fn test_all_invalid_true_count() { + let arr = RunEndArray::try_new( + buffer![2u32, 5, 10].into_array(), + BoolArray::from_iter([None, None, None]).into_array(), + ) + .unwrap() + .into_array(); + assert_eq!( + arr.statistics().compute_as::(Stat::TrueCount).unwrap(), + 0 + ); + assert_eq!( + arr.statistics().compute_as::(Stat::NullCount).unwrap(), + 10 + ); + } +}