Skip to content

Commit 22ba94c

Browse files
gatesnlwwmanning
andauthored
fix: support stats for ExtensionArray in vortex-file (#1547)
By using ArrayBuilder to accumulate stats --------- Co-authored-by: Will Manning <will@willmanning.io>
1 parent 89fa4e6 commit 22ba94c

File tree

19 files changed

+193
-427
lines changed

19 files changed

+193
-427
lines changed

encodings/datetime-parts/src/array.rs

Lines changed: 1 addition & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ use serde::{Deserialize, Serialize};
44
use vortex_array::array::StructArray;
55
use vortex_array::compute::try_cast;
66
use vortex_array::encoding::ids;
7-
use vortex_array::stats::{Stat, StatisticsVTable, StatsSet};
7+
use vortex_array::stats::StatsSet;
88
use vortex_array::validity::{ArrayValidity, LogicalValidity, Validity, ValidityVTable};
99
use vortex_array::variants::{ExtensionArrayTrait, VariantsVTable};
1010
use vortex_array::visitor::{ArrayVisitor, VisitorVTable};
@@ -14,7 +14,6 @@ use vortex_array::{
1414
};
1515
use vortex_dtype::{DType, PType};
1616
use vortex_error::{vortex_bail, VortexExpect as _, VortexResult, VortexUnwrap};
17-
use vortex_scalar::Scalar;
1817

1918
use crate::compute::decode_to_temporal;
2019

@@ -166,18 +165,3 @@ impl VisitorVTable<DateTimePartsArray> for DateTimePartsEncoding {
166165
visitor.visit_child("subsecond", &array.subsecond())
167166
}
168167
}
169-
170-
impl StatisticsVTable<DateTimePartsArray> for DateTimePartsEncoding {
171-
fn compute_statistics(&self, array: &DateTimePartsArray, stat: Stat) -> VortexResult<StatsSet> {
172-
let maybe_stat = match stat {
173-
Stat::NullCount => Some(Scalar::from(array.validity().null_count(array.len())?)),
174-
_ => None,
175-
};
176-
177-
let mut stats = StatsSet::default();
178-
if let Some(value) = maybe_stat {
179-
stats.set(stat, value);
180-
}
181-
Ok(stats)
182-
}
183-
}

encodings/datetime-parts/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,4 @@ pub use compress::*;
44
mod array;
55
mod compress;
66
mod compute;
7+
mod stats;
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
use vortex_array::stats::{Stat, StatisticsVTable, StatsSet};
2+
use vortex_array::ArrayLen;
3+
use vortex_error::VortexResult;
4+
use vortex_scalar::Scalar;
5+
6+
use crate::{DateTimePartsArray, DateTimePartsEncoding};
7+
8+
impl StatisticsVTable<DateTimePartsArray> for DateTimePartsEncoding {
9+
fn compute_statistics(&self, array: &DateTimePartsArray, stat: Stat) -> VortexResult<StatsSet> {
10+
let maybe_stat = match stat {
11+
Stat::NullCount => Some(Scalar::from(array.validity().null_count(array.len())?)),
12+
_ => None,
13+
};
14+
15+
let mut stats = StatsSet::default();
16+
if let Some(value) = maybe_stat {
17+
stats.set(stat, value);
18+
}
19+
Ok(stats)
20+
}
21+
}

vortex-array/src/array/varbin/stats.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ pub fn compute_varbin_statistics<T: ArrayTrait + ArrayAccessor<[u8]>>(
3737

3838
Ok(match stat {
3939
Stat::NullCount => {
40-
let null_count = array.logical_validity().null_count(array.len())?;
40+
let null_count = array.logical_validity().null_count()?;
4141
if null_count == array.len() {
4242
return Ok(StatsSet::nulls(array.len(), array.dtype()));
4343
}

vortex-array/src/builders/binary.rs

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ use std::any::Any;
22

33
use arrow_array::builder::{ArrayBuilder as _, BinaryViewBuilder};
44
use arrow_array::Array;
5-
use vortex_dtype::Nullability;
5+
use vortex_dtype::{DType, Nullability};
66
use vortex_error::{vortex_bail, VortexResult};
77

88
use crate::arrow::FromArrowArray;
@@ -12,13 +12,15 @@ use crate::ArrayData;
1212
pub struct BinaryBuilder {
1313
inner: BinaryViewBuilder,
1414
nullability: Nullability,
15+
dtype: DType,
1516
}
1617

1718
impl BinaryBuilder {
1819
pub fn with_capacity(nullability: Nullability, capacity: usize) -> Self {
1920
Self {
2021
inner: BinaryViewBuilder::with_capacity(capacity),
2122
nullability,
23+
dtype: DType::Binary(nullability),
2224
}
2325
}
2426

@@ -40,6 +42,10 @@ impl ArrayBuilder for BinaryBuilder {
4042
self
4143
}
4244

45+
fn dtype(&self) -> &DType {
46+
&self.dtype
47+
}
48+
4349
fn len(&self) -> usize {
4450
self.inner.len()
4551
}
@@ -59,7 +65,7 @@ impl ArrayBuilder for BinaryBuilder {
5965
fn finish(&mut self) -> VortexResult<ArrayData> {
6066
let arrow = self.inner.finish();
6167

62-
if self.nullability == Nullability::NonNullable && arrow.null_count() > 0 {
68+
if !self.dtype().is_nullable() && arrow.null_count() > 0 {
6369
vortex_bail!("Non-nullable builder has null values");
6470
}
6571

vortex-array/src/builders/bool.rs

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ use std::any::Any;
22

33
use arrow_array::builder::{ArrayBuilder as _, BooleanBuilder as ArrowBooleanBuilder};
44
use arrow_array::Array;
5-
use vortex_dtype::Nullability;
5+
use vortex_dtype::{DType, Nullability};
66
use vortex_error::{vortex_bail, VortexResult};
77

88
use crate::arrow::FromArrowArray;
@@ -12,7 +12,7 @@ use crate::ArrayData;
1212
pub struct BoolBuilder {
1313
inner: ArrowBooleanBuilder,
1414
nullability: Nullability,
15-
// TODO(ngates): track stats?
15+
dtype: DType,
1616
}
1717

1818
impl BoolBuilder {
@@ -24,6 +24,7 @@ impl BoolBuilder {
2424
Self {
2525
inner: ArrowBooleanBuilder::with_capacity(capacity),
2626
nullability,
27+
dtype: DType::Bool(nullability),
2728
}
2829
}
2930

@@ -52,6 +53,10 @@ impl ArrayBuilder for BoolBuilder {
5253
self
5354
}
5455

56+
fn dtype(&self) -> &DType {
57+
&self.dtype
58+
}
59+
5560
fn len(&self) -> usize {
5661
self.inner.len()
5762
}
@@ -67,7 +72,7 @@ impl ArrayBuilder for BoolBuilder {
6772
fn finish(&mut self) -> VortexResult<ArrayData> {
6873
let arrow = self.inner.finish();
6974

70-
if self.nullability == Nullability::NonNullable && arrow.null_count() > 0 {
75+
if !self.dtype().is_nullable() && arrow.null_count() > 0 {
7176
vortex_bail!("Non-nullable builder has null values");
7277
}
7378

vortex-array/src/builders/extension.rs

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,10 @@ use std::sync::Arc;
33

44
use vortex_dtype::{DType, ExtDType};
55
use vortex_error::VortexResult;
6-
use vortex_scalar::{ExtScalar, Scalar};
6+
use vortex_scalar::ExtScalar;
77

88
use crate::array::ExtensionArray;
9-
use crate::builders::{builder_with_capacity, ArrayBuilder};
9+
use crate::builders::{builder_with_capacity, ArrayBuilder, ArrayBuilderExt};
1010
use crate::{ArrayData, IntoArrayData};
1111

1212
pub struct ExtensionBuilder {
@@ -27,8 +27,7 @@ impl ExtensionBuilder {
2727
}
2828

2929
pub fn append_value(&mut self, value: ExtScalar) -> VortexResult<()> {
30-
self.storage
31-
.append_scalar(&Scalar::extension(self.ext_dtype(), value.storage()))
30+
self.storage.append_scalar(&value.storage())
3231
}
3332

3433
pub fn append_option(&mut self, value: Option<ExtScalar>) -> VortexResult<()> {
@@ -59,6 +58,10 @@ impl ArrayBuilder for ExtensionBuilder {
5958
self
6059
}
6160

61+
fn dtype(&self) -> &DType {
62+
&self.dtype
63+
}
64+
6265
fn len(&self) -> usize {
6366
self.storage.len()
6467
}

vortex-array/src/builders/mod.rs

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,19 +15,21 @@ pub use null::*;
1515
pub use primitive::*;
1616
pub use utf8::*;
1717
use vortex_dtype::{match_each_native_ptype, DType};
18-
use vortex_error::{vortex_err, VortexResult};
18+
use vortex_error::{vortex_bail, vortex_err, VortexResult};
1919
use vortex_scalar::{
2020
BinaryScalar, BoolScalar, ExtScalar, PrimitiveScalar, Scalar, StructScalar, Utf8Scalar,
2121
};
2222

2323
use crate::builders::struct_::StructBuilder;
2424
use crate::ArrayData;
2525

26-
pub trait ArrayBuilder {
26+
pub trait ArrayBuilder: Send {
2727
fn as_any(&self) -> &dyn Any;
2828

2929
fn as_any_mut(&mut self) -> &mut dyn Any;
3030

31+
fn dtype(&self) -> &DType;
32+
3133
fn len(&self) -> usize;
3234

3335
fn is_empty(&self) -> bool {
@@ -78,9 +80,16 @@ pub fn builder_with_capacity(dtype: &DType, capacity: usize) -> Box<dyn ArrayBui
7880
}
7981
}
8082

81-
impl dyn ArrayBuilder + '_ {
83+
pub trait ArrayBuilderExt: ArrayBuilder {
8284
/// A generic function to append a scalar to the builder.
83-
pub fn append_scalar(&mut self, scalar: &Scalar) -> VortexResult<()> {
85+
fn append_scalar(&mut self, scalar: &Scalar) -> VortexResult<()> {
86+
if !scalar.dtype().eq_ignore_nullability(self.dtype()) {
87+
vortex_bail!(
88+
"Builder has dtype {:?}, scalar has {:?}",
89+
self.dtype(),
90+
scalar.dtype()
91+
)
92+
}
8493
match scalar.dtype() {
8594
DType::Null => self
8695
.as_any_mut()
@@ -130,3 +139,5 @@ impl dyn ArrayBuilder + '_ {
130139
Ok(())
131140
}
132141
}
142+
143+
impl<T: ?Sized + ArrayBuilder> ArrayBuilderExt for T {}

vortex-array/src/builders/null.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
use std::any::Any;
22

3+
use vortex_dtype::DType;
34
use vortex_error::VortexResult;
45

56
use crate::array::NullArray;
@@ -31,6 +32,10 @@ impl ArrayBuilder for NullBuilder {
3132
self
3233
}
3334

35+
fn dtype(&self) -> &DType {
36+
&DType::Null
37+
}
38+
3439
fn len(&self) -> usize {
3540
self.length
3641
}

vortex-array/src/builders/primitive.rs

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@ use crate::ArrayData;
1414
pub struct PrimitiveBuilder<T: NativePType> {
1515
inner: ArrowPrimitiveBuilder<T::ArrowPrimitiveType>,
1616
dtype: DType,
17-
// TODO(ngates): track stats?
1817
}
1918

2019
impl<T: NativePType> PrimitiveBuilder<T>
@@ -64,6 +63,10 @@ where
6463
self
6564
}
6665

66+
fn dtype(&self) -> &DType {
67+
&self.dtype
68+
}
69+
6770
fn len(&self) -> usize {
6871
self.inner.len()
6972
}
@@ -82,8 +85,8 @@ where
8285
fn finish(&mut self) -> VortexResult<ArrayData> {
8386
let arrow = self.inner.finish();
8487

85-
if self.dtype.is_nullable() && arrow.null_count() > 0 {
86-
vortex_bail!("Non-nullable builder has null values");
88+
if !self.dtype().is_nullable() && arrow.null_count() > 0 {
89+
vortex_bail!("Non-nullable builder {} has null values", self.dtype());
8790
}
8891

8992
Ok(ArrayData::from_arrow(&arrow, self.dtype.is_nullable()))

0 commit comments

Comments
 (0)