Skip to content

Commit c1cee33

Browse files
authored
feat: propagate statistics through compression (#1236)
fixes #1174
1 parent 994c01f commit c1cee33

File tree

32 files changed

+477
-131
lines changed

32 files changed

+477
-131
lines changed

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ resolver = "2"
2626
version = "0.14.0"
2727
homepage = "https://github.com/spiraldb/vortex"
2828
repository = "https://github.com/spiraldb/vortex"
29-
authors = ["Vortex Authors <hello@spiraldb.com>"]
29+
authors = ["Vortex Authors <hello@vortex.dev>"]
3030
license = "Apache-2.0"
3131
keywords = ["vortex"]
3232
include = [

encodings/fastlanes/src/for/compress.rs

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,7 @@ fn decompress_primitive<T: NativePType + WrappingAdd + PrimInt>(
121121
mod test {
122122
use vortex_array::compute::unary::ScalarAtFn;
123123
use vortex_array::IntoArrayVariant;
124+
use vortex_dtype::Nullability;
124125

125126
use super::*;
126127

@@ -133,6 +134,41 @@ mod test {
133134
assert_eq!(u32::try_from(compressed.reference()).unwrap(), 1_000_000u32);
134135
}
135136

137+
#[test]
138+
fn test_zeros() {
139+
let array = PrimitiveArray::from(vec![0i32; 10_000]);
140+
assert!(array.statistics().to_set().into_iter().next().is_none());
141+
142+
let compressed = for_compress(&array).unwrap();
143+
let constant = ConstantArray::try_from(compressed).unwrap();
144+
assert_eq!(constant.scalar_value(), &ScalarValue::from(0i32));
145+
}
146+
147+
#[test]
148+
fn test_nullable_zeros() {
149+
let array = PrimitiveArray::from_nullable_vec(
150+
vec![Some(0i32), None]
151+
.into_iter()
152+
.cycle()
153+
.take(10_000)
154+
.collect_vec(),
155+
);
156+
assert!(array.statistics().to_set().into_iter().next().is_none());
157+
158+
let compressed = for_compress(&array).unwrap();
159+
let sparse = SparseArray::try_from(compressed).unwrap();
160+
assert!(sparse.statistics().to_set().into_iter().next().is_none());
161+
assert_eq!(sparse.fill_value(), &ScalarValue::Null);
162+
assert_eq!(
163+
sparse.scalar_at(0).unwrap(),
164+
Scalar::primitive(0i32, Nullability::Nullable)
165+
);
166+
assert_eq!(
167+
sparse.scalar_at(1).unwrap(),
168+
Scalar::null(sparse.dtype().clone())
169+
);
170+
}
171+
136172
#[test]
137173
fn test_decompress() {
138174
// Create a range offset by a million

vortex-array/src/array/bool/stats.rs

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
use arrow_buffer::BooleanBuffer;
2+
use itertools::Itertools;
23
use vortex_dtype::{DType, Nullability};
34
use vortex_error::VortexResult;
45

@@ -43,7 +44,7 @@ impl ArrayStatisticsCompute for NullableBools<'_> {
4344
acc.n_nulls(first_non_null);
4445
self.0
4546
.iter()
46-
.zip(self.1.iter())
47+
.zip_eq(self.1.iter())
4748
.skip(first_non_null + 1)
4849
.map(|(next, valid)| valid.then_some(next))
4950
.for_each(|next| acc.nullable_next(next));
@@ -59,6 +60,10 @@ impl ArrayStatisticsCompute for NullableBools<'_> {
5960

6061
impl ArrayStatisticsCompute for BooleanBuffer {
6162
fn compute_statistics(&self, _stat: Stat) -> VortexResult<StatsSet> {
63+
if self.is_empty() {
64+
return Ok(StatsSet::new());
65+
}
66+
6267
let mut stats = BoolStatsAccumulator::new(self.value(0));
6368
self.iter().skip(1).for_each(|next| stats.next(next));
6469
Ok(stats.finish())
@@ -75,7 +80,7 @@ struct BoolStatsAccumulator {
7580
}
7681

7782
impl BoolStatsAccumulator {
78-
fn new(first_value: bool) -> Self {
83+
pub fn new(first_value: bool) -> Self {
7984
Self {
8085
prev: first_value,
8186
is_sorted: true,
@@ -86,7 +91,7 @@ impl BoolStatsAccumulator {
8691
}
8792
}
8893

89-
fn n_nulls(&mut self, n_nulls: usize) {
94+
pub fn n_nulls(&mut self, n_nulls: usize) {
9095
self.null_count += n_nulls;
9196
self.len += n_nulls;
9297
}

vortex-array/src/array/chunked/stats.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,11 @@ impl ArrayStatisticsCompute for ChunkedArray {
99
.chunks()
1010
.map(|c| {
1111
let s = c.statistics();
12-
// HACK(robert): This will compute all stats, but we could just compute one
1312
s.compute(stat);
1413
s.to_set()
1514
})
1615
.reduce(|mut acc, x| {
17-
acc.merge(&x);
16+
acc.merge_ordered(&x);
1817
acc
1918
})
2019
.unwrap_or_default())

vortex-array/src/array/constant/mod.rs

Lines changed: 8 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,14 @@ use serde::{Deserialize, Serialize};
44
use vortex_error::{vortex_panic, VortexResult};
55
use vortex_scalar::{Scalar, ScalarValue};
66

7-
use crate::aliases::hash_map::HashMap;
87
use crate::array::visitor::{AcceptArrayVisitor, ArrayVisitor};
98
use crate::encoding::ids;
10-
use crate::stats::{Stat, StatsSet};
9+
use crate::stats::{ArrayStatisticsCompute, Stat, StatsSet};
1110
use crate::validity::{ArrayValidity, LogicalValidity};
1211
use crate::{impl_encoding, ArrayDType, ArrayTrait};
1312

1413
mod canonical;
1514
mod compute;
16-
mod stats;
1715
mod variants;
1816

1917
impl_encoding!("vortex.constant", ids::CONSTANT, Constant);
@@ -39,24 +37,14 @@ impl ConstantArray {
3937
S: Into<Scalar>,
4038
{
4139
let scalar = scalar.into();
42-
// TODO(aduffy): add stats for bools, ideally there should be a
43-
// StatsSet::constant(Scalar) constructor that does this for us, like StatsSet::nulls.
44-
let stats = StatsSet::from(HashMap::from([
45-
(Stat::Max, scalar.clone()),
46-
(Stat::Min, scalar.clone()),
47-
(Stat::IsConstant, true.into()),
48-
(Stat::IsSorted, true.into()),
49-
(Stat::RunCount, 1.into()),
50-
]));
51-
5240
Self::try_from_parts(
5341
scalar.dtype().clone(),
5442
length,
5543
ConstantMetadata {
5644
scalar_value: scalar.value().clone(),
5745
},
5846
[].into(),
59-
stats,
47+
StatsSet::constant(scalar.clone(), length),
6048
)
6149
.unwrap_or_else(|err| {
6250
vortex_panic!(
@@ -93,6 +81,12 @@ impl ArrayValidity for ConstantArray {
9381
}
9482
}
9583

84+
impl ArrayStatisticsCompute for ConstantArray {
85+
fn compute_statistics(&self, _stat: Stat) -> VortexResult<StatsSet> {
86+
Ok(StatsSet::constant(self.owned_scalar(), self.len()))
87+
}
88+
}
89+
9690
impl AcceptArrayVisitor for ConstantArray {
9791
fn accept(&self, _visitor: &mut dyn ArrayVisitor) -> VortexResult<()> {
9892
Ok(())

vortex-array/src/array/constant/stats.rs

Lines changed: 0 additions & 33 deletions
This file was deleted.

vortex-array/src/array/extension/mod.rs

Lines changed: 67 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
11
use std::fmt::{Debug, Display};
22
use std::sync::Arc;
33

4+
use enum_iterator::all;
45
use serde::{Deserialize, Serialize};
56
use vortex_dtype::{DType, ExtDType, ExtID};
67
use vortex_error::{VortexExpect as _, VortexResult};
78

89
use crate::array::visitor::{AcceptArrayVisitor, ArrayVisitor};
910
use crate::encoding::ids;
10-
use crate::stats::ArrayStatisticsCompute;
11+
use crate::stats::{ArrayStatistics as _, ArrayStatisticsCompute, Stat, StatsSet};
1112
use crate::validity::{ArrayValidity, LogicalValidity};
1213
use crate::variants::{ArrayVariants, ExtensionArrayTrait};
1314
use crate::{impl_encoding, Array, ArrayDType, ArrayTrait, Canonical, IntoCanonical};
@@ -93,5 +94,69 @@ impl AcceptArrayVisitor for ExtensionArray {
9394
}
9495

9596
impl ArrayStatisticsCompute for ExtensionArray {
96-
// TODO(ngates): pass through stats to the underlying and cast the scalars.
97+
fn compute_statistics(&self, stat: Stat) -> VortexResult<StatsSet> {
98+
let mut stats = self.storage().statistics().compute_all(&[stat])?;
99+
100+
// for e.g., min/max, we want to cast to the extension array's dtype
101+
// for other stats, we don't need to change anything
102+
for stat in all::<Stat>().filter(|s| s.has_same_dtype_as_array()) {
103+
if let Some(value) = stats.get(stat) {
104+
stats.set(stat, value.cast(self.dtype())?);
105+
}
106+
}
107+
108+
Ok(stats)
109+
}
110+
}
111+
112+
#[cfg(test)]
113+
mod tests {
114+
use itertools::Itertools;
115+
use vortex_dtype::PType;
116+
use vortex_scalar::{PValue, Scalar, ScalarValue};
117+
118+
use super::*;
119+
use crate::array::PrimitiveArray;
120+
use crate::validity::Validity;
121+
use crate::IntoArray as _;
122+
123+
#[test]
124+
fn compute_statistics() {
125+
let ext_dtype = Arc::new(ExtDType::new(
126+
ExtID::new("timestamp".into()),
127+
DType::from(PType::I64).into(),
128+
None,
129+
));
130+
let array = ExtensionArray::new(
131+
ext_dtype.clone(),
132+
PrimitiveArray::from_vec(vec![1i64, 2, 3, 4, 5], Validity::NonNullable).into_array(),
133+
);
134+
135+
let stats = array
136+
.statistics()
137+
.compute_all(&[Stat::Min, Stat::Max, Stat::NullCount])
138+
.unwrap();
139+
let num_stats = stats.clone().into_iter().try_len().unwrap();
140+
assert!(
141+
num_stats >= 3,
142+
"Expected at least 3 stats, got {}",
143+
num_stats
144+
);
145+
146+
assert_eq!(
147+
stats.get(Stat::Min),
148+
Some(&Scalar::extension(
149+
ext_dtype.clone(),
150+
ScalarValue::Primitive(PValue::I64(1))
151+
))
152+
);
153+
assert_eq!(
154+
stats.get(Stat::Max),
155+
Some(&Scalar::extension(
156+
ext_dtype.clone(),
157+
ScalarValue::Primitive(PValue::I64(5))
158+
))
159+
);
160+
assert_eq!(stats.get(Stat::NullCount), Some(&0u64.into()));
161+
}
97162
}

vortex-array/src/array/sparse/mod.rs

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ use crate::array::visitor::{AcceptArrayVisitor, ArrayVisitor};
1010
use crate::compute::unary::scalar_at;
1111
use crate::compute::{search_sorted, SearchResult, SearchSortedSide};
1212
use crate::encoding::ids;
13-
use crate::stats::{ArrayStatisticsCompute, StatsSet};
13+
use crate::stats::{ArrayStatistics, ArrayStatisticsCompute, Stat, StatsSet};
1414
use crate::validity::{ArrayValidity, LogicalValidity};
1515
use crate::variants::PrimitiveArrayTrait;
1616
use crate::{impl_encoding, Array, ArrayDType, ArrayTrait, IntoArray, IntoArrayVariant};
@@ -180,7 +180,28 @@ impl AcceptArrayVisitor for SparseArray {
180180
}
181181
}
182182

183-
impl ArrayStatisticsCompute for SparseArray {}
183+
impl ArrayStatisticsCompute for SparseArray {
184+
fn compute_statistics(&self, stat: Stat) -> VortexResult<StatsSet> {
185+
let mut stats = self.values().statistics().compute_all(&[stat])?;
186+
if self.len() == self.values().len() {
187+
return Ok(stats);
188+
}
189+
190+
let fill_len = self.len() - self.values().len();
191+
let fill_stats = if self.fill_value().is_null() {
192+
StatsSet::nulls(fill_len, self.dtype())
193+
} else {
194+
StatsSet::constant(self.fill_scalar(), fill_len)
195+
};
196+
197+
if self.values().is_empty() {
198+
return Ok(fill_stats);
199+
}
200+
201+
stats.merge_unordered(&fill_stats);
202+
Ok(stats)
203+
}
204+
}
184205

185206
impl ArrayValidity for SparseArray {
186207
fn is_valid(&self, index: usize) -> bool {

vortex-array/src/compress.rs

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ use vortex_error::VortexResult;
22

33
use crate::aliases::hash_set::HashSet;
44
use crate::encoding::EncodingRef;
5+
use crate::stats::{ArrayStatistics as _, PRUNING_STATS};
56
use crate::Array;
67

78
pub trait CompressionStrategy {
@@ -45,3 +46,29 @@ pub fn check_dtype_unchanged(arr: &Array, compressed: &Array) {
4546
);
4647
}
4748
}
49+
50+
// Check that compression preserved the statistics.
51+
pub fn check_statistics_unchanged(arr: &Array, compressed: &Array) {
52+
let _ = arr;
53+
let _ = compressed;
54+
#[cfg(debug_assertions)]
55+
{
56+
for (stat, value) in arr.statistics().to_set().into_iter() {
57+
debug_assert_eq!(
58+
compressed.statistics().get(stat),
59+
Some(value.clone()),
60+
"Compression changed {stat} from {value} to {}",
61+
compressed
62+
.statistics()
63+
.get(stat)
64+
.map(|s| s.to_string())
65+
.unwrap_or_else(|| "null".to_string())
66+
);
67+
}
68+
}
69+
}
70+
71+
/// Compute pruning stats for an array.
72+
pub fn compute_pruning_stats(arr: &Array) -> VortexResult<()> {
73+
arr.statistics().compute_all(PRUNING_STATS).map(|_| ())
74+
}

0 commit comments

Comments
 (0)