Skip to content
Merged
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions encodings/runend/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,12 @@ workspace = true
[dev-dependencies]
vortex-array = { workspace = true, features = ["test-harness"] }
criterion = { workspace = true }
rand = { workspace = true }

[[bench]]
name = "run_end_filter"
harness = false

[[bench]]
name = "run_end_null_count"
harness = false
60 changes: 60 additions & 0 deletions encodings/runend/benches/run_end_null_count.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#![allow(clippy::unwrap_used)]

use std::iter::Iterator;

use criterion::{black_box, criterion_group, criterion_main, Criterion};
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng as _};
use vortex_array::array::PrimitiveArray;
use vortex_array::stats::Stat;
use vortex_array::IntoArrayData;
use vortex_buffer::Buffer;
use vortex_runend::RunEndArray;

const LENS: [usize; 2] = [1000, 100_000];

/// Benchmarks computing `Stat::NullCount` on run-end encoded arrays.
///
/// For every combination of length in `LENS`, run step and valid density, the run ends are
/// generated with a fixed step and every run value is independently marked valid with
/// probability `valid_density`. The RNG is seeded so the generated inputs are reproducible.
fn run_end_null_count(c: &mut Criterion) {
    let mut rng = StdRng::seed_from_u64(0);
    let mut group = c.benchmark_group("run_end_null_count");

    for n in LENS.iter().rev().copied() {
        for run_step in [1usize << 2, 1 << 4, 1 << 8, 1 << 16] {
            // Run ends are the multiples of `run_step` in `0..=n`.
            let ends = (0..=n)
                .step_by(run_step)
                .map(|end| end as u64)
                .collect::<Buffer<_>>()
                .into_array();
            let value_count = ends.len();
            let run_count = value_count - 1;

            for valid_density in [0.01, 0.1, 0.5] {
                // One (possibly null) value per run end.
                let optional_values =
                    (0..value_count).map(|i| rng.gen_bool(valid_density).then_some(i as u64));
                let values = PrimitiveArray::from_option_iter(optional_values).into_array();
                let array = RunEndArray::try_new(ends.clone(), values)
                    .unwrap()
                    .into_array();

                let bench_name = format!(
                    "null_count_run_end n: {n}, run_count: {run_count}, valid_density: {valid_density}"
                );
                group.bench_function(bench_name, |b| {
                    b.iter(|| {
                        let null_count = array
                            .encoding()
                            .compute_statistics(&array, Stat::NullCount)
                            .unwrap();
                        black_box(null_count)
                    })
                });
            }
        }
    }
}

criterion_group!(benches, run_end_null_count);
criterion_main!(benches);
191 changes: 189 additions & 2 deletions encodings/runend/src/array.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
use std::cmp;
use std::fmt::{Debug, Display};

use itertools::Itertools;
use serde::{Deserialize, Serialize};
use vortex_array::array::PrimitiveArray;
use vortex_array::compute::{
Expand All @@ -16,7 +18,7 @@ use vortex_array::{
IntoCanonical,
};
use vortex_buffer::Buffer;
use vortex_dtype::{DType, PType};
use vortex_dtype::{match_each_unsigned_integer_ptype, DType, PType};
use vortex_error::{vortex_bail, VortexExpect as _, VortexResult};
use vortex_scalar::Scalar;

Expand Down Expand Up @@ -237,6 +239,11 @@ impl StatisticsVTable<RunEndArray> for RunEndEncoding {
.unwrap_or(false)
&& array.logical_validity().all_valid(),
)),
Stat::TrueCount => match array.dtype() {
DType::Bool(_) => Some(Scalar::from(array.true_count()?)),
_ => None,
},
Stat::NullCount => Some(Scalar::from(array.null_count()?)),
_ => None,
};

Expand All @@ -248,10 +255,105 @@ impl StatisticsVTable<RunEndArray> for RunEndEncoding {
}
}

impl RunEndArray {
fn true_count(&self) -> VortexResult<u64> {
let ends = self.ends().into_primitive()?;
let bools = self.values().into_bool()?.boolean_buffer();

Ok(match self.values().logical_validity() {
LogicalValidity::AllValid(_) => {
match_each_unsigned_integer_ptype!(ends.ptype(), |$P| {
let mut begin = self.offset() as $P;
ends
.as_slice::<$P>()
.iter()
.zip_eq(bools.into_iter())
.map(|(end, bool_value)| {
let len = *end - begin;
begin = *end;
(len as u64) * (bool_value as u64)
})
.sum()
})
}
LogicalValidity::AllInvalid(_) => 0,
LogicalValidity::Array(is_valid) => {
let is_valid = is_valid.into_bool()?.boolean_buffer();
let mut is_valid = is_valid.set_indices();
match is_valid.next() {
None => self.len() as u64,
Some(valid_index) => {
let mut true_count: u64 = 0;
match_each_unsigned_integer_ptype!(ends.ptype(), |$P| {
let offsetted_begin = <$P>::try_from(self.offset())?;
let offsetted_len = <$P>::try_from(self.len() + self.offset())?;
let ends = ends.as_slice::<$P>();
let begin = if valid_index == 0 {
offsetted_begin
} else {
ends[valid_index - 1]
};

let end = cmp::min(ends[valid_index], offsetted_len);
true_count += bools.value(valid_index as usize) as u64 * (end - begin) as u64;

for valid_index in is_valid {
let end = cmp::min(ends[valid_index], offsetted_len);
true_count += bools.value(valid_index as usize) as u64 * (end - ends[valid_index - 1]) as u64;
}

true_count
})
}
}
}
})
}

fn null_count(&self) -> VortexResult<u64> {
let ends = self.ends().into_primitive()?;
let null_count = match self.values().logical_validity() {
LogicalValidity::AllValid(_) => 0_u64,
LogicalValidity::AllInvalid(_) => self.len() as u64,
LogicalValidity::Array(is_valid) => {
let is_valid = is_valid.into_bool()?.boolean_buffer();
let mut is_valid = is_valid.set_indices();
match is_valid.next() {
None => self.len() as u64,
Some(valid_index) => {
let offsetted_len = (self.len() + self.offset()) as u64;
let mut null_count: u64 = self.len() as u64;
match_each_unsigned_integer_ptype!(ends.ptype(), |$P| {
let ends = ends.as_slice::<$P>();
let begin = if valid_index == 0 {
0
} else {
ends[valid_index - 1]
};
null_count -= cmp::min(ends[valid_index] as u64, offsetted_len) - begin as u64;

for valid_index in is_valid {
null_count -= cmp::min(ends[valid_index] as u64, offsetted_len) - ends[valid_index - 1] as u64;
}

null_count
})
}
}
}
};
Ok(null_count)
}
}

#[cfg(test)]
mod tests {
use vortex_array::compute::scalar_at;
use arrow_buffer::BooleanBuffer;
use vortex_array::array::BoolArray;
use vortex_array::compute::{scalar_at, slice};
use vortex_array::stats::{ArrayStatistics as _, Stat};
use vortex_array::test_harness::check_metadata;
use vortex_array::validity::Validity;
use vortex_array::{ArrayDType, ArrayLen, IntoArrayData};
use vortex_buffer::buffer;
use vortex_dtype::{DType, Nullability, PType};
Expand Down Expand Up @@ -292,4 +394,89 @@ mod tests {
assert_eq!(scalar_at(arr.as_ref(), 5).unwrap(), 3.into());
assert_eq!(scalar_at(arr.as_ref(), 9).unwrap(), 3.into());
}

#[test]
fn test_runend_int_stats() {
    // Three non-null integer runs ending at logical positions 2, 5 and 10.
    let ends = buffer![2u32, 5, 10].into_array();
    let values = buffer![1i32, 2, 3].into_array();
    let arr = RunEndArray::try_new(ends, values).unwrap();

    let min = arr.statistics().compute_as::<i32>(Stat::Min).unwrap();
    assert_eq!(min, 1);

    let max = arr.statistics().compute_as::<i32>(Stat::Max).unwrap();
    assert_eq!(max, 3);

    let null_count = arr.statistics().compute_as::<u64>(Stat::NullCount).unwrap();
    assert_eq!(null_count, 0);

    let is_sorted = arr.statistics().compute_as::<bool>(Stat::IsSorted).unwrap();
    assert!(is_sorted);
}

#[test]
fn test_runend_bool_stats() {
    // Runs ending at 2, 5 and 10 holding `true`, null and `false` respectively.
    let validity = Validity::Array(BoolArray::from_iter([true, false, true]).into_array());
    let values = BoolArray::try_new(BooleanBuffer::from_iter([true, true, false]), validity)
        .unwrap()
        .into_array();
    let arr = RunEndArray::try_new(buffer![2u32, 5, 10].into_array(), values).unwrap();

    let min = arr.statistics().compute_as::<bool>(Stat::Min).unwrap();
    assert!(!min);
    let max = arr.statistics().compute_as::<bool>(Stat::Max).unwrap();
    assert!(max);
    let null_count = arr.statistics().compute_as::<u64>(Stat::NullCount).unwrap();
    assert_eq!(null_count, 3);
    let is_sorted = arr.statistics().compute_as::<bool>(Stat::IsSorted).unwrap();
    assert!(!is_sorted);
    let true_count = arr.statistics().compute_as::<u64>(Stat::TrueCount).unwrap();
    assert_eq!(true_count, 2);

    // Slice to logical positions 4..7, then recompute every statistic on the slice.
    let sliced = slice(arr, 4, 7).unwrap();

    let sliced_min = sliced.statistics().compute_as::<bool>(Stat::Min).unwrap();
    assert!(!sliced_min);
    let sliced_max = sliced.statistics().compute_as::<bool>(Stat::Max).unwrap();
    assert!(!sliced_max);
    let sliced_null_count = sliced
        .statistics()
        .compute_as::<u64>(Stat::NullCount)
        .unwrap();
    assert_eq!(sliced_null_count, 1);
    // Not sorted because null must come last
    let sliced_is_sorted = sliced
        .statistics()
        .compute_as::<bool>(Stat::IsSorted)
        .unwrap();
    assert!(!sliced_is_sorted);
    let sliced_true_count = sliced
        .statistics()
        .compute_as::<u64>(Stat::TrueCount)
        .unwrap();
    assert_eq!(sliced_true_count, 0);
}

#[test]
fn test_all_invalid_true_count() {
    // Every run value is null, so nothing is `true` and all ten positions are null.
    let ends = buffer![2u32, 5, 10].into_array();
    let all_null_values = BoolArray::from_iter([None, None, None]).into_array();
    let arr = RunEndArray::try_new(ends, all_null_values)
        .unwrap()
        .into_array();

    let true_count = arr.statistics().compute_as::<u64>(Stat::TrueCount).unwrap();
    assert_eq!(true_count, 0);

    let null_count = arr.statistics().compute_as::<u64>(Stat::NullCount).unwrap();
    assert_eq!(null_count, 10);
}
}
Loading