Skip to content
Merged
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions encodings/runend/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,12 @@ workspace = true
[dev-dependencies]
vortex-array = { workspace = true, features = ["test-harness"] }
criterion = { workspace = true }
rand = { workspace = true }

[[bench]]
name = "run_end_filter"
harness = false

[[bench]]
name = "run_end_null_count"
harness = false
60 changes: 60 additions & 0 deletions encodings/runend/benches/run_end_null_count.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#![allow(clippy::unwrap_used)]

use std::iter::Iterator;

use criterion::{black_box, criterion_group, criterion_main, Criterion};
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng as _};
use vortex_array::array::PrimitiveArray;
use vortex_array::stats::Stat;
use vortex_array::IntoArrayData;
use vortex_buffer::Buffer;
use vortex_runend::RunEndArray;

const LENS: [usize; 2] = [1000, 100_000];

/// Benchmarks `Stat::NullCount` on RunEnd arrays with evenly spaced run ends and randomly
/// placed nulls.
///
/// For every length in `LENS` (largest first), every run step and every valid density, the run
/// values are drawn from a fixed-seed RNG: each value is non-null with probability
/// `valid_density`.
fn run_end_null_count(c: &mut Criterion) {
    let mut rng = StdRng::seed_from_u64(0);
    let mut group = c.benchmark_group("run_end_null_count");

    for n in LENS.iter().rev().copied() {
        for run_step in [1usize << 2, 1 << 4, 1 << 8, 1 << 16] {
            // Run ends are the multiples of `run_step` inside `0..=n`, beginning with 0.
            let ends = (0..=n)
                .step_by(run_step)
                .map(|end| end as u64)
                .collect::<Buffer<_>>()
                .into_array();
            // Reported in the benchmark id only; presumably the `- 1` discounts the leading 0.
            let run_count = ends.len() - 1;

            for valid_density in [0.01, 0.1, 0.5] {
                // One value per run end; `None` marks a null run.
                let drawn_values =
                    (0..ends.len()).map(|i| rng.gen_bool(valid_density).then_some(i as u64));
                let values = PrimitiveArray::from_option_iter(drawn_values).into_array();
                let array = RunEndArray::try_new(ends.clone(), values)
                    .unwrap()
                    .into_array();
                let bench_id = format!(
                    "null_count_run_end n: {}, run_count: {}, valid_density: {}",
                    n, run_count, valid_density
                );

                group.bench_function(bench_id, |bencher| {
                    bencher.iter(|| {
                        let stats = array
                            .encoding()
                            .compute_statistics(&array, Stat::NullCount)
                            .unwrap();
                        black_box(stats)
                    });
                });
            }
        }
    }
}

criterion_group!(benches, run_end_null_count);
criterion_main!(benches);
26 changes: 1 addition & 25 deletions encodings/runend/src/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use vortex_array::compute::{
scalar_at, search_sorted_usize, search_sorted_usize_many, SearchSortedSide,
};
use vortex_array::encoding::ids;
use vortex_array::stats::{ArrayStatistics, Stat, StatisticsVTable, StatsSet};
use vortex_array::stats::{ArrayStatistics, StatsSet};
use vortex_array::validate::ValidateVTable;
use vortex_array::validity::{ArrayValidity, LogicalValidity, ValidityVTable};
use vortex_array::variants::{BoolArrayTrait, PrimitiveArrayTrait, VariantsVTable};
Expand All @@ -18,7 +18,6 @@ use vortex_array::{
use vortex_buffer::Buffer;
use vortex_dtype::{DType, PType};
use vortex_error::{vortex_bail, VortexExpect as _, VortexResult};
use vortex_scalar::Scalar;

use crate::compress::{runend_decode_bools, runend_decode_primitive, runend_encode};

Expand Down Expand Up @@ -225,29 +224,6 @@ impl VisitorVTable<RunEndArray> for RunEndEncoding {
}
}

impl StatisticsVTable<RunEndArray> for RunEndEncoding {
    /// Computes a single statistic for a run-end encoded array.
    ///
    /// `Min` and `Max` are delegated to the run values. `IsSorted` is true only when the run
    /// values report themselves sorted and the array is entirely valid. Any other statistic
    /// yields an empty [`StatsSet`].
    fn compute_statistics(&self, array: &RunEndArray, stat: Stat) -> VortexResult<StatsSet> {
        let computed = match stat {
            Stat::Min | Stat::Max => array.values().statistics().compute(stat),
            Stat::IsSorted => {
                // An unknown sortedness of the values is treated as "not sorted".
                let values_sorted = array
                    .values()
                    .statistics()
                    .compute_is_sorted()
                    .unwrap_or(false);
                // Validity is only consulted when the values are sorted (short-circuit).
                Some(Scalar::from(
                    values_sorted && array.logical_validity().all_valid(),
                ))
            }
            _ => None,
        };

        let mut result = StatsSet::default();
        if let Some(value) = computed {
            result.set(stat, value);
        }
        Ok(result)
    }
}

#[cfg(test)]
mod tests {
use vortex_array::compute::scalar_at;
Expand Down
1 change: 1 addition & 0 deletions encodings/runend/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ mod array;
pub mod compress;
mod compute;
mod iter;
mod statistics;

#[doc(hidden)]
pub mod _benchmarking {
Expand Down
244 changes: 244 additions & 0 deletions encodings/runend/src/statistics.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,244 @@
use std::cmp;

use arrow_buffer::BooleanBuffer;
use itertools::Itertools;
use vortex_array::stats::{ArrayStatistics as _, Stat, StatisticsVTable, StatsSet};
use vortex_array::validity::{ArrayValidity as _, LogicalValidity};
use vortex_array::variants::PrimitiveArrayTrait;
use vortex_array::{ArrayDType as _, ArrayLen as _, IntoArrayVariant as _};
use vortex_dtype::{match_each_unsigned_integer_ptype, DType, NativePType};
use vortex_error::VortexResult;
use vortex_scalar::Scalar;

use crate::{RunEndArray, RunEndEncoding};

impl StatisticsVTable<RunEndArray> for RunEndEncoding {
    /// Computes a single statistic for a run-end encoded array.
    ///
    /// * `Min` / `Max` are delegated to the run values.
    /// * `IsSorted` is true only when the run values report themselves sorted and the array is
    ///   entirely valid.
    /// * `TrueCount` is produced only for boolean arrays.
    /// * `NullCount` is computed from the run lengths and the validity of the run values.
    ///
    /// Any other statistic yields an empty [`StatsSet`].
    ///
    /// # Errors
    ///
    /// Propagates errors from `RunEndArray::true_count` and `RunEndArray::null_count`.
    fn compute_statistics(&self, array: &RunEndArray, stat: Stat) -> VortexResult<StatsSet> {
        let computed = match stat {
            Stat::Min | Stat::Max => array.values().statistics().compute(stat),
            Stat::IsSorted => {
                // An unknown sortedness of the values is treated as "not sorted".
                let values_sorted = array
                    .values()
                    .statistics()
                    .compute_is_sorted()
                    .unwrap_or(false);
                // Validity is only consulted when the values are sorted (short-circuit).
                Some(Scalar::from(
                    values_sorted && array.logical_validity().all_valid(),
                ))
            }
            Stat::TrueCount => {
                if matches!(array.dtype(), DType::Bool(_)) {
                    Some(Scalar::from(array.true_count()?))
                } else {
                    None
                }
            }
            Stat::NullCount => Some(Scalar::from(array.null_count()?)),
            _ => None,
        };

        let mut result = StatsSet::default();
        if let Some(value) = computed {
            result.set(stat, value);
        }
        Ok(result)
    }
}

impl RunEndArray {
    /// Counts the logical positions of this array whose value is a non-null `true`.
    ///
    /// Converts the run ends to a primitive array and the run values to a boolean buffer, then
    /// dispatches on the unsigned integer type of the ends.
    ///
    /// # Errors
    ///
    /// Fails if the ends cannot be converted to a primitive array, if the values cannot be
    /// converted to a bool array, or if [`Self::typed_true_count`] fails.
    fn true_count(&self) -> VortexResult<u64> {
        let ends = self.ends().into_primitive()?;
        let values = self.values().into_bool()?.boolean_buffer();

        match_each_unsigned_integer_ptype!(ends.ptype(), |$P| self.typed_true_count(ends.as_slice::<$P>(), values))
    }

    /// Counts non-null `true` positions given already-decoded run ends and run values.
    ///
    /// Only the part of each run that lies inside this array's logical window
    /// `[offset, offset + len)` is counted, so sliced arrays are handled for every validity
    /// variant.
    ///
    /// * `decompressed_ends` - the run ends, one per run.
    /// * `decompressed_values` - the boolean value of each run.
    ///
    /// # Errors
    ///
    /// Fails if an array-backed validity cannot be converted to a bool array.
    ///
    /// # Panics
    ///
    /// Panics if, for all-valid values, the number of ends and values differ (`zip_eq`), or if
    /// a validity index is out of bounds for `decompressed_ends`.
    fn typed_true_count<P: NativePType + Into<u64>>(
        &self,
        decompressed_ends: &[P],
        decompressed_values: BooleanBuffer,
    ) -> VortexResult<u64> {
        let window_begin = self.offset() as u64;
        let window_end = (self.len() + self.offset()) as u64;

        Ok(match self.values().logical_validity() {
            LogicalValidity::AllValid(_) => {
                let count: u64 = (0..decompressed_ends.len())
                    .zip_eq(&decompressed_values)
                    .map(|(run_index, is_true)| {
                        u64::from(is_true)
                            * Self::run_len_in_window(
                                decompressed_ends,
                                run_index,
                                window_begin,
                                window_end,
                            )
                    })
                    .sum();
                count
            }
            LogicalValidity::AllInvalid(_) => 0,
            LogicalValidity::Array(is_valid) => {
                let is_valid = is_valid.into_bool()?.boolean_buffer();
                // Only runs that are both valid and `true` contribute. With no valid run the
                // sum is empty and the count is 0.
                let count: u64 = is_valid
                    .set_indices()
                    .filter(|&run_index| decompressed_values.value(run_index))
                    .map(|run_index| {
                        Self::run_len_in_window(
                            decompressed_ends,
                            run_index,
                            window_begin,
                            window_end,
                        )
                    })
                    .sum();
                count
            }
        })
    }

    /// Counts the null logical positions of this array.
    ///
    /// The run ends are decoded only when the validity of the values is array-backed; the
    /// all-valid and all-invalid cases are answered from the length alone.
    ///
    /// # Errors
    ///
    /// Fails if the validity cannot be converted to a bool array or the ends cannot be
    /// converted to a primitive array.
    fn null_count(&self) -> VortexResult<u64> {
        let null_count = match self.values().logical_validity() {
            LogicalValidity::AllValid(_) => 0u64,
            LogicalValidity::AllInvalid(_) => self.len() as u64,
            LogicalValidity::Array(is_valid) => {
                let is_valid = is_valid.into_bool()?.boolean_buffer();
                let ends = self.ends().into_primitive()?;
                match_each_unsigned_integer_ptype!(ends.ptype(), |$P| self.null_count_with_array_validity(ends.as_slice::<$P>(), is_valid))
            }
        };
        Ok(null_count)
    }

    /// Counts null positions given decoded run ends and a per-run validity buffer.
    ///
    /// Computed as the array length minus the number of positions covered by valid runs, where
    /// each run is restricted to the logical window `[offset, offset + len)`.
    ///
    /// * `decompressed_ends` - the run ends, one per run.
    /// * `is_valid` - one bit per run; a set bit marks a valid (non-null) run.
    ///
    /// # Panics
    ///
    /// Panics if a set bit of `is_valid` is out of bounds for `decompressed_ends`.
    fn null_count_with_array_validity<P: NativePType + Into<u64>>(
        &self,
        decompressed_ends: &[P],
        is_valid: BooleanBuffer,
    ) -> u64 {
        let window_begin = self.offset() as u64;
        let window_end = (self.len() + self.offset()) as u64;

        let valid_count: u64 = is_valid
            .set_indices()
            .map(|run_index| {
                Self::run_len_in_window(decompressed_ends, run_index, window_begin, window_end)
            })
            .sum();

        self.len() as u64 - valid_count
    }

    /// Returns how many positions of run `run_index` lie inside `[window_begin, window_end)`.
    ///
    /// A run spans from the previous run's end (or `window_begin` for the first run) up to its
    /// own end. Both bounds are clamped to the window, and a run that lies completely outside
    /// the window contributes 0 instead of underflowing.
    fn run_len_in_window<P: NativePType + Into<u64>>(
        decompressed_ends: &[P],
        run_index: usize,
        window_begin: u64,
        window_end: u64,
    ) -> u64 {
        let run_begin: u64 = match run_index.checked_sub(1) {
            Some(previous) => {
                let previous_end: u64 = decompressed_ends[previous].into();
                cmp::max(previous_end, window_begin)
            }
            None => window_begin,
        };
        let own_end: u64 = decompressed_ends[run_index].into();
        let run_end = cmp::min(own_end, window_end);
        run_end.saturating_sub(run_begin)
    }
}

#[cfg(test)]
mod tests {
    use arrow_buffer::BooleanBuffer;
    use vortex_array::array::BoolArray;
    use vortex_array::compute::slice;
    use vortex_array::stats::{ArrayStatistics as _, Stat};
    use vortex_array::validity::Validity;
    use vortex_array::IntoArrayData;
    use vortex_buffer::buffer;

    use crate::RunEndArray;

    /// Integer fixture: runs `[0, 2) -> 1`, `[2, 5) -> 2`, `[5, 10) -> 3`, no nulls.
    fn int_fixture() -> RunEndArray {
        let ends = buffer![2u32, 5, 10].into_array();
        let values = buffer![1i32, 2, 3].into_array();
        RunEndArray::try_new(ends, values).unwrap()
    }

    /// Boolean fixture: runs `[0, 2) -> true`, `[2, 5) -> null`, `[5, 10) -> false`.
    fn bool_fixture() -> RunEndArray {
        let ends = buffer![2u32, 5, 10].into_array();
        let run_validity = Validity::Array(BoolArray::from_iter([true, false, true]).into_array());
        let values = BoolArray::try_new(BooleanBuffer::from_iter([true, true, false]), run_validity)
            .unwrap()
            .into_array();
        RunEndArray::try_new(ends, values).unwrap()
    }

    #[test]
    fn test_runend_int_stats() {
        let run_end = int_fixture();

        let min = run_end.statistics().compute_as::<i32>(Stat::Min).unwrap();
        assert_eq!(min, 1);
        let max = run_end.statistics().compute_as::<i32>(Stat::Max).unwrap();
        assert_eq!(max, 3);
        let nulls = run_end
            .statistics()
            .compute_as::<u64>(Stat::NullCount)
            .unwrap();
        assert_eq!(nulls, 0);
        let sorted = run_end
            .statistics()
            .compute_as::<bool>(Stat::IsSorted)
            .unwrap();
        assert!(sorted);
    }

    #[test]
    fn test_runend_bool_stats() {
        let run_end = bool_fixture();

        let min = run_end.statistics().compute_as::<bool>(Stat::Min).unwrap();
        assert!(!min);
        let max = run_end.statistics().compute_as::<bool>(Stat::Max).unwrap();
        assert!(max);
        // The middle run `[2, 5)` is the only null run: three null positions.
        let nulls = run_end
            .statistics()
            .compute_as::<u64>(Stat::NullCount)
            .unwrap();
        assert_eq!(nulls, 3);
        let sorted = run_end
            .statistics()
            .compute_as::<bool>(Stat::IsSorted)
            .unwrap();
        assert!(!sorted);
        // Only the first run `[0, 2)` is a valid `true`.
        let trues = run_end
            .statistics()
            .compute_as::<u64>(Stat::TrueCount)
            .unwrap();
        assert_eq!(trues, 2);

        // Positions 4..7: one null (position 4) followed by two `false` values.
        let window = slice(run_end, 4, 7).unwrap();

        let window_min = window.statistics().compute_as::<bool>(Stat::Min).unwrap();
        assert!(!window_min);
        let window_max = window.statistics().compute_as::<bool>(Stat::Max).unwrap();
        assert!(!window_max);
        let window_nulls = window
            .statistics()
            .compute_as::<u64>(Stat::NullCount)
            .unwrap();
        assert_eq!(window_nulls, 1);
        // Not sorted because null must come last
        let window_sorted = window
            .statistics()
            .compute_as::<bool>(Stat::IsSorted)
            .unwrap();
        assert!(!window_sorted);
        let window_trues = window
            .statistics()
            .compute_as::<u64>(Stat::TrueCount)
            .unwrap();
        assert_eq!(window_trues, 0);
    }

    #[test]
    fn test_all_invalid_true_count() {
        // Every run is null, so all ten positions are null and none is `true`.
        let ends = buffer![2u32, 5, 10].into_array();
        let values = BoolArray::from_iter([None, None, None]).into_array();
        let all_null = RunEndArray::try_new(ends, values).unwrap().into_array();

        let trues = all_null
            .statistics()
            .compute_as::<u64>(Stat::TrueCount)
            .unwrap();
        assert_eq!(trues, 0);
        let nulls = all_null
            .statistics()
            .compute_as::<u64>(Stat::NullCount)
            .unwrap();
        assert_eq!(nulls, 10);
    }
}
Loading