Skip to content

Commit 18986c2

Browse files
authored
feat: add stat for uncompressed size in bytes (#1315)
fixes #1237
1 parent 8f0ba91 commit 18986c2

File tree

25 files changed

+169
-36
lines changed

25 files changed

+169
-36
lines changed

.github/workflows/ci.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,7 @@ jobs:
162162
with:
163163
version: v1.0.0
164164
- name: Rust Bench as test
165-
run: cargo bench --bench '*[!noci]' -- --test
165+
run: cargo bench --bench '*[!noci]' --profile benchtest -- --test
166166

167167
generated-files:
168168
name: "Check generated proto/fbs files are up to date"

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,3 +222,7 @@ lto = "thin" # attempts to perform optimizations across all crates within t
222222
codegen-units = 16 # default for "release", which "bench" inherits
223223
lto = false # default
224224
debug = true
225+
226+
[profile.benchtest]
227+
inherits = "bench"
228+
debug-assertions = true

vortex-array/src/array/bool/stats.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,14 @@ use vortex_error::VortexResult;
88
use crate::array::BoolArray;
99
use crate::stats::{ArrayStatisticsCompute, Stat, StatsSet};
1010
use crate::validity::{ArrayValidity, LogicalValidity};
11-
use crate::{ArrayDType, IntoArrayVariant};
11+
use crate::{ArrayDType, ArrayTrait as _, IntoArrayVariant};
1212

1313
impl ArrayStatisticsCompute for BoolArray {
1414
fn compute_statistics(&self, stat: Stat) -> VortexResult<StatsSet> {
15+
if stat == Stat::UncompressedSizeInBytes {
16+
return Ok(StatsSet::of(stat, self.nbytes()));
17+
}
18+
1519
if self.is_empty() {
1620
return Ok(StatsSet::from_iter([
1721
(Stat::TrueCount, 0.into()),

vortex-array/src/array/chunked/mod.rs

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ use crate::compute::unary::{scalar_at, scalar_at_unchecked, subtract_scalar, Sub
1717
use crate::compute::{search_sorted, SearchSortedSide};
1818
use crate::encoding::ids;
1919
use crate::iter::{ArrayIterator, ArrayIteratorAdapter};
20-
use crate::stats::StatsSet;
20+
use crate::stats::ArrayStatistics;
2121
use crate::stream::{ArrayStream, ArrayStreamAdapter};
2222
use crate::validity::Validity::NonNullable;
2323
use crate::validity::{ArrayValidity, LogicalValidity, Validity};
@@ -61,9 +61,19 @@ impl ChunkedArray {
6161
.collect_vec();
6262

6363
let nchunks = chunk_offsets.len() - 1;
64-
let length = *chunk_offsets.last().unwrap_or_else(|| {
65-
unreachable!("Chunk ends is guaranteed to have at least one element")
66-
}) as usize;
64+
let length = *chunk_offsets
65+
.last()
66+
.vortex_expect("Chunk ends is guaranteed to have at least one element")
67+
as usize;
68+
69+
let stats = chunks
70+
.iter()
71+
.map(|chunk| chunk.statistics().to_set())
72+
.reduce(|mut acc, stats| {
73+
acc.merge_ordered(&stats);
74+
acc
75+
})
76+
.unwrap_or_default();
6777

6878
let mut children = Vec::with_capacity(chunks.len() + 1);
6979
children.push(PrimitiveArray::from_vec(chunk_offsets, NonNullable).into_array());
@@ -74,7 +84,7 @@ impl ChunkedArray {
7484
length,
7585
ChunkedMetadata { nchunks },
7686
children.into(),
77-
StatsSet::default(),
87+
stats,
7888
)
7989
}
8090

vortex-array/src/array/chunked/stats.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ use crate::stats::{ArrayStatistics, ArrayStatisticsCompute, Stat, StatsSet};
55

66
impl ArrayStatisticsCompute for ChunkedArray {
77
fn compute_statistics(&self, stat: Stat) -> VortexResult<StatsSet> {
8+
// For UncompressedSizeInBytes, this yields the sum of the chunks' uncompressed sizes.
9+
// That sum ignores the `chunk_offsets` child array, so it won't exactly match self.nbytes().
810
Ok(self
911
.chunks()
1012
.map(|c| {

vortex-array/src/array/null/mod.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,11 @@ impl ArrayValidity for NullArray {
5454
}
5555

5656
impl ArrayStatisticsCompute for NullArray {
57-
fn compute_statistics(&self, _stat: Stat) -> VortexResult<StatsSet> {
57+
fn compute_statistics(&self, stat: Stat) -> VortexResult<StatsSet> {
58+
if stat == Stat::UncompressedSizeInBytes {
59+
return Ok(StatsSet::of(stat, self.nbytes()));
60+
}
61+
5862
Ok(StatsSet::nulls(self.len(), &DType::Null))
5963
}
6064
}

vortex-array/src/array/primitive/stats.rs

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,18 @@ use crate::array::primitive::PrimitiveArray;
1414
use crate::stats::{ArrayStatisticsCompute, Stat, StatsSet};
1515
use crate::validity::{ArrayValidity, LogicalValidity};
1616
use crate::variants::PrimitiveArrayTrait;
17-
use crate::{ArrayDType, IntoArrayVariant};
17+
use crate::{ArrayDType, ArrayTrait as _, IntoArrayVariant};
1818

1919
trait PStatsType: NativePType + Into<Scalar> + BitWidth {}
2020

2121
impl<T: NativePType + Into<Scalar> + BitWidth> PStatsType for T {}
2222

2323
impl ArrayStatisticsCompute for PrimitiveArray {
2424
fn compute_statistics(&self, stat: Stat) -> VortexResult<StatsSet> {
25+
if stat == Stat::UncompressedSizeInBytes {
26+
return Ok(StatsSet::of(stat, self.nbytes()));
27+
}
28+
2529
let mut stats = match_each_native_ptype!(self.ptype(), |$P| {
2630
match self.logical_validity() {
2731
LogicalValidity::AllValid(_) => self.maybe_null_slice::<$P>().compute_statistics(stat),
@@ -77,7 +81,7 @@ impl<T: PStatsType> ArrayStatisticsCompute for &[T] {
7781
self.iter().skip(1).for_each(|next| stats.next(*next));
7882
stats.finish()
7983
}
80-
Stat::TrueCount => StatsSet::default(),
84+
Stat::TrueCount | Stat::UncompressedSizeInBytes => StatsSet::default(),
8185
})
8286
}
8387
}
@@ -87,7 +91,7 @@ struct NullableValues<'a, T: PStatsType>(&'a [T], &'a BooleanBuffer);
8791
impl<T: PStatsType> ArrayStatisticsCompute for NullableValues<'_, T> {
8892
fn compute_statistics(&self, stat: Stat) -> VortexResult<StatsSet> {
8993
let values = self.0;
90-
if values.is_empty() || stat == Stat::TrueCount {
94+
if values.is_empty() || stat == Stat::TrueCount || stat == Stat::UncompressedSizeInBytes {
9195
return Ok(StatsSet::default());
9296
}
9397

vortex-array/src/array/struct_/mod.rs

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ use vortex_error::{vortex_bail, vortex_err, vortex_panic, VortexExpect as _, Vor
77

88
use crate::array::visitor::{AcceptArrayVisitor, ArrayVisitor};
99
use crate::encoding::ids;
10-
use crate::stats::{ArrayStatisticsCompute, StatsSet};
10+
use crate::stats::{ArrayStatistics, ArrayStatisticsCompute, Stat, StatsSet};
1111
use crate::validity::{ArrayValidity, LogicalValidity, Validity, ValidityMetadata};
1212
use crate::variants::{ArrayVariants, StructArrayTrait};
1313
use crate::{
@@ -191,7 +191,21 @@ impl AcceptArrayVisitor for StructArray {
191191
}
192192
}
193193

194-
impl ArrayStatisticsCompute for StructArray {}
194+
impl ArrayStatisticsCompute for StructArray {
195+
fn compute_statistics(&self, stat: Stat) -> VortexResult<StatsSet> {
196+
Ok(match stat {
197+
Stat::UncompressedSizeInBytes => self
198+
.children()
199+
.map(|f| f.statistics().compute_uncompressed_size_in_bytes())
200+
.reduce(|acc, field_size| acc.zip(field_size).map(|(a, b)| a + b))
201+
.flatten()
202+
.map(|size| StatsSet::of(stat, size))
203+
.unwrap_or_default(),
204+
Stat::NullCount => StatsSet::of(stat, self.validity().null_count(self.len())?),
205+
_ => StatsSet::default(),
206+
})
207+
}
208+
}
195209

196210
#[cfg(test)]
197211
mod test {

vortex-array/src/array/varbin/stats.rs

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,14 @@ use vortex_error::VortexResult;
77
use crate::accessor::ArrayAccessor;
88
use crate::array::varbin::{varbin_scalar, VarBinArray};
99
use crate::stats::{ArrayStatisticsCompute, Stat, StatsSet};
10-
use crate::ArrayDType;
10+
use crate::{ArrayDType, ArrayTrait as _};
1111

1212
impl ArrayStatisticsCompute for VarBinArray {
13-
fn compute_statistics(&self, _stat: Stat) -> VortexResult<StatsSet> {
13+
fn compute_statistics(&self, stat: Stat) -> VortexResult<StatsSet> {
14+
if stat == Stat::UncompressedSizeInBytes {
15+
return Ok(StatsSet::of(stat, self.nbytes()));
16+
}
17+
1418
if self.is_empty() {
1519
return Ok(StatsSet::default());
1620
}

0 commit comments

Comments
 (0)