Skip to content

Commit 38e9860

Browse files
authored
feat: stats implementations for more array types (#1305)
Also splits the dict stats so that only the requested stat is computed (instead of always computing all of them), and includes some drive-by bug fixes to the roaring and delta constructors.
1 parent 5ad2cd3 commit 38e9860

File tree

24 files changed

+467
-205
lines changed

24 files changed

+467
-205
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

bench-vortex/src/bin/notimplemented.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,9 @@ fn enc_impls() -> Vec<Array> {
164164
.into_array(),
165165
varbin_array(),
166166
varbinview_array(),
167-
ZigZagArray::encode(&PrimitiveArray::from(vec![-1, 1, -9, 9]).into_array()).unwrap(),
167+
ZigZagArray::encode(&PrimitiveArray::from(vec![-1, 1, -9, 9]).into_array())
168+
.unwrap()
169+
.into_array(),
168170
]
169171
}
170172

encodings/datetime-parts/src/array.rs

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,15 @@ use vortex_array::array::visitor::{AcceptArrayVisitor, ArrayVisitor};
55
use vortex_array::array::StructArray;
66
use vortex_array::compute::unary::try_cast;
77
use vortex_array::encoding::ids;
8-
use vortex_array::stats::{ArrayStatisticsCompute, StatsSet};
8+
use vortex_array::stats::{ArrayStatisticsCompute, Stat, StatsSet};
99
use vortex_array::validity::{ArrayValidity, LogicalValidity, Validity};
1010
use vortex_array::variants::{ArrayVariants, ExtensionArrayTrait};
1111
use vortex_array::{
1212
impl_encoding, Array, ArrayDType, ArrayTrait, Canonical, IntoArray, IntoCanonical,
1313
};
1414
use vortex_dtype::{DType, PType};
1515
use vortex_error::{vortex_bail, VortexExpect as _, VortexResult, VortexUnwrap};
16+
use vortex_scalar::Scalar;
1617

1718
use crate::compute::decode_to_temporal;
1819

@@ -160,4 +161,17 @@ impl AcceptArrayVisitor for DateTimePartsArray {
160161
}
161162
}
162163

163-
impl ArrayStatisticsCompute for DateTimePartsArray {}
164+
impl ArrayStatisticsCompute for DateTimePartsArray {
165+
fn compute_statistics(&self, stat: Stat) -> VortexResult<StatsSet> {
166+
let maybe_stat = match stat {
167+
Stat::NullCount => Some(Scalar::from(self.validity().null_count(self.len())?)),
168+
_ => None,
169+
};
170+
171+
let mut stats = StatsSet::new();
172+
if let Some(value) = maybe_stat {
173+
stats.set(stat, value);
174+
}
175+
Ok(stats)
176+
}
177+
}

encodings/dict/src/stats.rs

Lines changed: 46 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -6,41 +6,57 @@ use vortex_scalar::Scalar;
66
use crate::DictArray;
77

88
impl ArrayStatisticsCompute for DictArray {
9-
fn compute_statistics(&self, _stat: Stat) -> VortexResult<StatsSet> {
9+
fn compute_statistics(&self, stat: Stat) -> VortexResult<StatsSet> {
1010
let mut stats: HashMap<Stat, Scalar> = HashMap::new();
1111

12-
if let Some(rc) = self.codes().statistics().compute(Stat::RunCount) {
13-
stats.insert(Stat::RunCount, rc);
14-
}
15-
if let Some(min) = self.values().statistics().compute(Stat::Min) {
16-
stats.insert(Stat::Min, min);
17-
}
18-
if let Some(max) = self.values().statistics().compute(Stat::Max) {
19-
stats.insert(Stat::Max, max);
20-
}
21-
if let Some(is_constant) = self.codes().statistics().compute(Stat::IsConstant) {
22-
stats.insert(Stat::IsConstant, is_constant);
23-
}
24-
if let Some(null_count) = self.codes().statistics().compute(Stat::NullCount) {
25-
stats.insert(Stat::NullCount, null_count);
26-
}
27-
28-
// if dictionary is sorted
29-
if self
30-
.values()
31-
.statistics()
32-
.compute_is_sorted()
33-
.unwrap_or(false)
34-
{
35-
if let Some(codes_are_sorted) = self.codes().statistics().compute(Stat::IsSorted) {
36-
stats.insert(Stat::IsSorted, codes_are_sorted);
12+
match stat {
13+
Stat::RunCount => {
14+
if let Some(rc) = self.codes().statistics().compute(Stat::RunCount) {
15+
stats.insert(Stat::RunCount, rc);
16+
}
17+
}
18+
Stat::Min => {
19+
if let Some(min) = self.values().statistics().compute(Stat::Min) {
20+
stats.insert(Stat::Min, min);
21+
}
22+
}
23+
Stat::Max => {
24+
if let Some(max) = self.values().statistics().compute(Stat::Max) {
25+
stats.insert(Stat::Max, max);
26+
}
27+
}
28+
Stat::IsConstant => {
29+
if let Some(is_constant) = self.codes().statistics().compute(Stat::IsConstant) {
30+
stats.insert(Stat::IsConstant, is_constant);
31+
}
32+
}
33+
Stat::NullCount => {
34+
if let Some(null_count) = self.codes().statistics().compute(Stat::NullCount) {
35+
stats.insert(Stat::NullCount, null_count);
36+
}
3737
}
38+
Stat::IsSorted | Stat::IsStrictSorted => {
39+
// if dictionary is sorted
40+
if self
41+
.values()
42+
.statistics()
43+
.compute_is_sorted()
44+
.unwrap_or(false)
45+
{
46+
if let Some(codes_are_sorted) =
47+
self.codes().statistics().compute(Stat::IsSorted)
48+
{
49+
stats.insert(Stat::IsSorted, codes_are_sorted);
50+
}
3851

39-
if let Some(codes_are_strict_sorted) =
40-
self.codes().statistics().compute(Stat::IsStrictSorted)
41-
{
42-
stats.insert(Stat::IsStrictSorted, codes_are_strict_sorted);
52+
if let Some(codes_are_strict_sorted) =
53+
self.codes().statistics().compute(Stat::IsStrictSorted)
54+
{
55+
stats.insert(Stat::IsStrictSorted, codes_are_strict_sorted);
56+
}
57+
}
4358
}
59+
_ => {}
4460
}
4561

4662
Ok(StatsSet::from(stats))

encodings/fastlanes/src/delta/mod.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,10 @@ impl DeltaArray {
117117
}
118118

119119
let dtype = bases.dtype().clone();
120+
if !dtype.is_int() {
121+
vortex_bail!("DeltaArray: dtype must be an integer, got {}", dtype);
122+
}
123+
120124
let metadata = DeltaMetadata {
121125
validity: validity.to_metadata(logical_len)?,
122126
deltas_len: deltas.len() as u64,

encodings/roaring/src/boolean/mod.rs

Lines changed: 21 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,10 @@ pub use compress::*;
55
use croaring::Native;
66
pub use croaring::{Bitmap, Portable};
77
use serde::{Deserialize, Serialize};
8-
use vortex_array::aliases::hash_map::HashMap;
98
use vortex_array::array::visitor::{AcceptArrayVisitor, ArrayVisitor};
109
use vortex_array::array::BoolArray;
1110
use vortex_array::encoding::ids;
12-
use vortex_array::stats::{Stat, StatsSet};
11+
use vortex_array::stats::StatsSet;
1312
use vortex_array::validity::{ArrayValidity, LogicalValidity, Validity};
1413
use vortex_array::variants::{ArrayVariants, BoolArrayTrait};
1514
use vortex_array::{
@@ -37,35 +36,27 @@ impl Display for RoaringBoolMetadata {
3736

3837
impl RoaringBoolArray {
3938
pub fn try_new(bitmap: Bitmap, length: usize) -> VortexResult<Self> {
40-
if length < bitmap.cardinality() as usize {
41-
vortex_bail!("RoaringBoolArray length is less than bitmap cardinality")
42-
} else {
43-
let roaring_stats = bitmap.statistics();
44-
let stats = StatsSet::from(HashMap::from([
45-
(
46-
Stat::Min,
47-
(roaring_stats.cardinality == length as u64).into(),
48-
),
49-
(Stat::Max, (roaring_stats.cardinality > 0).into()),
50-
(
51-
Stat::IsConstant,
52-
(roaring_stats.cardinality == length as u64 || roaring_stats.cardinality == 0)
53-
.into(),
54-
),
55-
(Stat::TrueCount, roaring_stats.cardinality.into()),
56-
]));
57-
58-
Ok(Self {
59-
typed: TypedArray::try_from_parts(
60-
DType::Bool(NonNullable),
61-
length,
62-
RoaringBoolMetadata,
63-
Some(Buffer::from(bitmap.serialize::<Native>())),
64-
vec![].into(),
65-
stats,
66-
)?,
67-
})
39+
let max_set = bitmap.maximum().unwrap_or(0) as usize;
40+
if length < max_set {
41+
vortex_bail!(
42+
"RoaringBoolArray length is less than bitmap maximum {}",
43+
max_set
44+
)
6845
}
46+
47+
let stats =
48+
StatsSet::bools_with_true_count(bitmap.statistics().cardinality as usize, length);
49+
50+
Ok(Self {
51+
typed: TypedArray::try_from_parts(
52+
DType::Bool(NonNullable),
53+
length,
54+
RoaringBoolMetadata,
55+
Some(Buffer::from(bitmap.serialize::<Native>())),
56+
vec![].into(),
57+
stats,
58+
)?,
59+
})
6960
}
7061

7162
pub fn bitmap(&self) -> Bitmap {

encodings/roaring/src/boolean/stats.rs

Lines changed: 29 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
use croaring::Bitset;
21
use vortex_array::aliases::hash_map::HashMap;
32
use vortex_array::stats::{ArrayStatisticsCompute, Stat, StatsSet};
43
use vortex_error::{vortex_err, VortexResult};
@@ -7,87 +6,41 @@ use crate::RoaringBoolArray;
76

87
impl ArrayStatisticsCompute for RoaringBoolArray {
98
fn compute_statistics(&self, stat: Stat) -> VortexResult<StatsSet> {
10-
if self.is_empty() {
11-
return Ok(StatsSet::new());
12-
}
13-
149
// Only needs to compute IsSorted, IsStrictSorted and RunCount; all other stats have been populated on construction.
1510
let bitmap = self.bitmap();
16-
BitmapStats(
17-
bitmap
18-
.to_bitset()
19-
.ok_or_else(|| vortex_err!("Bitmap to Bitset conversion run out of memory"))?,
20-
self.len(),
21-
bitmap.statistics().cardinality,
22-
)
23-
.compute_statistics(stat)
24-
}
25-
}
26-
27-
// Underlying bitset, length in bits, cardinality (true count) of the bitset
28-
struct BitmapStats(Bitset, usize, u64);
29-
30-
impl ArrayStatisticsCompute for BitmapStats {
31-
fn compute_statistics(&self, _stat: Stat) -> VortexResult<StatsSet> {
32-
let bitset_slice = self.0.as_slice();
33-
let whole_chunks = self.1 / 64;
34-
let last_chunk_len = self.1 % 64;
35-
let fist_bool = bitset_slice[0] & 1 == 1;
36-
let mut stats = RoaringBoolStatsAccumulator::new(fist_bool);
37-
for bits64 in bitset_slice[0..whole_chunks].iter() {
38-
stats.next(*bits64);
11+
let true_count = bitmap.statistics().cardinality;
12+
if matches!(
13+
stat,
14+
Stat::TrueCount | Stat::Min | Stat::Max | Stat::IsConstant
15+
) {
16+
return Ok(StatsSet::bools_with_true_count(
17+
true_count as usize,
18+
self.len(),
19+
));
3920
}
40-
stats.next_up_to_length(bitset_slice[whole_chunks], last_chunk_len);
41-
Ok(stats.finish(self.2))
42-
}
43-
}
4421

45-
struct RoaringBoolStatsAccumulator {
46-
prev: bool,
47-
is_sorted: bool,
48-
run_count: usize,
49-
len: usize,
50-
}
51-
52-
impl RoaringBoolStatsAccumulator {
53-
fn new(first_value: bool) -> Self {
54-
Self {
55-
prev: first_value,
56-
is_sorted: true,
57-
run_count: 1,
58-
len: 0,
59-
}
60-
}
61-
62-
pub fn next_up_to_length(&mut self, next: u64, len: usize) {
63-
assert!(len <= 64);
64-
self.len += len;
65-
for i in 0..len {
66-
let current = ((next >> i) & 1) == 1;
67-
// Booleans are sorted true > false so we aren't sorted if we switched from true to false value
68-
if !current && self.prev {
69-
self.is_sorted = false;
70-
}
71-
if current != self.prev {
72-
self.run_count += 1;
73-
self.prev = current;
74-
}
22+
if matches!(stat, Stat::IsSorted | Stat::IsStrictSorted) {
23+
let is_sorted = if true_count == 0 || true_count == self.len() as u64 {
24+
true
25+
} else {
26+
let min_idx = bitmap.minimum().ok_or_else(|| {
27+
vortex_err!("Bitmap has no minimum despite having cardinality > 0")
28+
})?;
29+
let max_idx = bitmap.maximum().ok_or_else(|| {
30+
vortex_err!("Bitmap has no maximum despite having cardinality > 0")
31+
})?;
32+
(max_idx as usize + 1 == self.len()) && (max_idx + 1 - min_idx) as u64 == true_count
33+
};
34+
35+
let is_strict_sorted =
36+
is_sorted && (self.len() <= 1 || (self.len() == 2 && true_count == 1));
37+
return Ok(StatsSet::from(HashMap::from([
38+
(Stat::IsSorted, is_sorted.into()),
39+
(Stat::IsStrictSorted, is_strict_sorted.into()),
40+
])));
7541
}
76-
}
77-
78-
pub fn next(&mut self, next: u64) {
79-
self.next_up_to_length(next, 64)
80-
}
8142

82-
pub fn finish(self, cardinality: u64) -> StatsSet {
83-
StatsSet::from(HashMap::from([
84-
(Stat::IsSorted, self.is_sorted.into()),
85-
(
86-
Stat::IsStrictSorted,
87-
(self.is_sorted && (self.len < 2 || (self.len == 2 && cardinality == 1))).into(),
88-
),
89-
(Stat::RunCount, self.run_count.into()),
90-
]))
43+
Ok(StatsSet::new())
9144
}
9245
}
9346

@@ -111,7 +64,6 @@ mod test {
11164
assert!(!bool_arr.statistics().compute_is_constant().unwrap());
11265
assert!(!bool_arr.statistics().compute_min::<bool>().unwrap());
11366
assert!(bool_arr.statistics().compute_max::<bool>().unwrap());
114-
assert_eq!(bool_arr.statistics().compute_run_count().unwrap(), 5);
11567
assert_eq!(bool_arr.statistics().compute_true_count().unwrap(), 4);
11668
}
11769

encodings/roaring/src/integer/mod.rs

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -142,8 +142,11 @@ impl ArrayStatisticsCompute for RoaringIntArray {
142142
if stat == Stat::TrailingZeroFreq || stat == Stat::BitWidthFreq || stat == Stat::RunCount {
143143
let primitive =
144144
PrimitiveArray::from_vec(self.owned_bitmap().to_vec(), Validity::NonNullable);
145-
primitive.statistics().compute(stat);
146-
Ok(primitive.statistics().to_set())
145+
primitive.statistics().compute_all(&[
146+
Stat::TrailingZeroFreq,
147+
Stat::BitWidthFreq,
148+
Stat::RunCount,
149+
])
147150
} else {
148151
Ok(StatsSet::new())
149152
}

encodings/runend-bool/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ vortex-scalar = { workspace = true }
2626
[dev-dependencies]
2727
criterion = { workspace = true }
2828
rand = { workspace = true }
29+
rstest = { workspace = true }
2930

3031
[lints]
3132
workspace = true

0 commit comments

Comments
 (0)