Skip to content

Commit 41244e3

Browse files
committed
Stat expression
Signed-off-by: Nicholas Gates <[email protected]>
1 parent 3ba3509 commit 41244e3

File tree

12 files changed

+127
-135
lines changed

12 files changed

+127
-135
lines changed

vortex-array/src/expr/expression.rs

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,10 @@ use std::sync::Arc;
1010
use vortex_dtype::DType;
1111
use vortex_error::{VortexExpect, VortexResult};
1212

13+
use crate::ArrayRef;
1314
use crate::expr::display::DisplayTreeExpr;
1415
use crate::expr::{ChildName, ExprId, ExprVTable, ExpressionView, StatsCatalog, VTable};
1516
use crate::stats::Stat;
16-
use crate::ArrayRef;
1717

1818
/// A node in a Vortex expression tree.
1919
///
@@ -168,10 +168,27 @@ impl Expression {
168168
/// The [`StatsCatalog`] returns expressions that can be evaluated using the zone map as a
169169
/// scope. Expressions can implement this function to propagate such statistics through the
170170
/// expression tree. For example, the `a + 10` expression could propagate `min: min(a) + 10`.
171+
///
172+
/// NOTE(gatesn): we currently cannot represent statistics over nested fields. Please file an
173+
/// issue to discuss a solution to this.
171174
pub fn stat_expression(&self, stat: Stat, catalog: &dyn StatsCatalog) -> Option<Expression> {
172175
self.vtable.as_dyn().stat_expression(self, stat, catalog)
173176
}
174177

178+
/// Returns an expression representing the zoned maximum statistic, if available.
179+
///
180+
/// See [`Self::stat_expression`] for details.
181+
pub fn stat_min(&self, catalog: &dyn StatsCatalog) -> Option<Expression> {
182+
self.stat_expression(Stat::Min, catalog)
183+
}
184+
185+
/// Returns an expression representing the zoned maximum statistic, if available.
186+
///
187+
/// See [`Self::stat_expression`] for details.
188+
pub fn stat_max(&self, catalog: &dyn StatsCatalog) -> Option<Expression> {
189+
self.stat_expression(Stat::Max, catalog)
190+
}
191+
175192
/// Format the expression as a compact string.
176193
///
177194
/// Since this is a recursive formatter, it is exposed on the public Expression type.

vortex-array/src/expr/exprs/between.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,15 @@ use std::fmt::Formatter;
66
use prost::Message;
77
use vortex_dtype::DType;
88
use vortex_dtype::DType::Bool;
9-
use vortex_error::{vortex_bail, VortexExpect, VortexResult};
9+
use vortex_error::{VortexExpect, VortexResult, vortex_bail};
1010
use vortex_proto::expr as pb;
1111

12-
use crate::compute::{between as between_compute, BetweenOptions};
12+
use crate::ArrayRef;
13+
use crate::compute::{BetweenOptions, between as between_compute};
1314
use crate::expr::expression::Expression;
1415
use crate::expr::exprs::binary::Binary;
1516
use crate::expr::exprs::operators::Operator;
1617
use crate::expr::{ChildName, ExprId, ExpressionView, StatsCatalog, VTable, VTableExt};
17-
use crate::ArrayRef;
1818

1919
/// An optimized scalar expression to compute whether values fall between two bounds.
2020
///

vortex-array/src/expr/exprs/binary.rs

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,16 @@ use std::fmt::Formatter;
55

66
use prost::Message;
77
use vortex_dtype::DType;
8-
use vortex_error::{vortex_bail, VortexExpect, VortexResult};
8+
use vortex_error::{VortexExpect, VortexResult, vortex_bail};
99
use vortex_proto::expr as pb;
1010

1111
use crate::compute::{add, and_kleene, compare, div, mul, or_kleene, sub};
1212
use crate::expr::expression::Expression;
1313
use crate::expr::exprs::literal::lit;
1414
use crate::expr::exprs::operators::Operator;
1515
use crate::expr::{ChildName, ExprId, ExpressionView, StatsCatalog, VTable, VTableExt};
16-
use crate::{compute, ArrayRef};
16+
use crate::stats::Stat;
17+
use crate::{ArrayRef, compute};
1718

1819
pub struct Binary;
1920

@@ -127,9 +128,9 @@ impl VTable for Binary {
127128
catalog: &dyn StatsCatalog,
128129
) -> Expression {
129130
let nan_predicate = lhs
130-
.stat_nan_count(catalog)
131+
.stat_expression(Stat::NaNCount, catalog)
131132
.into_iter()
132-
.chain(rhs.stat_nan_count(catalog))
133+
.chain(rhs.stat_expression(Stat::NaNCount, catalog))
133134
.map(|nans| eq(nans, lit(0u64)))
134135
.reduce(and);
135136

@@ -510,7 +511,7 @@ mod tests {
510511
use super::{and, and_collect, and_collect_right, eq, gt, gt_eq, lt, lt_eq, not_eq, or};
511512
use crate::expr::exprs::get_item::col;
512513
use crate::expr::exprs::literal::lit;
513-
use crate::expr::{test_harness, Expression};
514+
use crate::expr::{Expression, test_harness};
514515

515516
#[test]
516517
fn and_collect_left_assoc() {

vortex-array/src/expr/exprs/cast.rs

Lines changed: 28 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,15 @@ use std::fmt::Formatter;
55
use std::ops::Deref;
66

77
use prost::Message;
8-
use vortex_dtype::{DType, FieldPath};
9-
use vortex_error::{vortex_bail, vortex_err, VortexExpect, VortexResult};
8+
use vortex_dtype::DType;
9+
use vortex_error::{VortexExpect, VortexResult, vortex_bail, vortex_err};
1010
use vortex_proto::expr as pb;
1111

12+
use crate::ArrayRef;
1213
use crate::compute::cast as compute_cast;
1314
use crate::expr::expression::Expression;
1415
use crate::expr::{ChildName, ExprId, ExpressionView, StatsCatalog, VTable, VTableExt};
15-
use crate::ArrayRef;
16+
use crate::stats::Stat;
1617

1718
/// A cast expression that converts values to a target data type.
1819
pub struct Cast;
@@ -86,32 +87,32 @@ impl VTable for Cast {
8687
})
8788
}
8889

89-
fn stat_max(
90-
&self,
91-
expr: &ExpressionView<Self>,
92-
catalog: &dyn StatsCatalog,
93-
) -> Option<Expression> {
94-
expr.children()[0].stat_max(catalog)
95-
}
96-
97-
fn stat_min(
98-
&self,
99-
expr: &ExpressionView<Self>,
100-
catalog: &dyn StatsCatalog,
101-
) -> Option<Expression> {
102-
expr.children()[0].stat_min(catalog)
103-
}
104-
105-
fn stat_nan_count(
90+
fn stat_expression(
10691
&self,
10792
expr: &ExpressionView<Self>,
93+
stat: Stat,
10894
catalog: &dyn StatsCatalog,
10995
) -> Option<Expression> {
110-
expr.children()[0].stat_nan_count(catalog)
111-
}
112-
113-
fn stat_field_path(&self, expr: &ExpressionView<Self>) -> Option<FieldPath> {
114-
expr.children()[0].stat_field_path()
96+
match stat {
97+
Stat::IsConstant
98+
| Stat::IsSorted
99+
| Stat::IsStrictSorted
100+
| Stat::Max
101+
| Stat::Min
102+
| Stat::NaNCount
103+
| Stat::Sum
104+
| Stat::UncompressedSizeInBytes => expr.child(0).stat_expression(stat, catalog),
105+
Stat::NullCount => {
106+
// if !expr.data().is_nullable() {
107+
// NOTE(ngates): we should decide on the semantics here. In theory, the null
108+
// count of something cast to non-nullable will be zero. But if we return
109+
// that we know this to be zero, then a pruning predicate may eliminate data
110+
// that would otherwise have caused the cast to error.
111+
// return Some(lit(0u64));
112+
// }
113+
None
114+
}
115+
}
115116
}
116117
}
117118

@@ -136,11 +137,11 @@ mod tests {
136137
use vortex_error::VortexUnwrap as _;
137138

138139
use super::cast;
140+
use crate::IntoArray;
139141
use crate::arrays::StructArray;
140142
use crate::expr::exprs::get_item::get_item;
141143
use crate::expr::exprs::root::root;
142-
use crate::expr::{test_harness, Expression};
143-
use crate::IntoArray;
144+
use crate::expr::{Expression, test_harness};
144145

145146
#[test]
146147
fn dtype() {

vortex-array/src/expr/exprs/dynamic.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,11 @@ use std::sync::Arc;
77

88
use parking_lot::Mutex;
99
use vortex_dtype::DType;
10-
use vortex_error::{vortex_bail, VortexExpect, VortexResult};
10+
use vortex_error::{VortexExpect, VortexResult, vortex_bail};
1111
use vortex_scalar::{Scalar, ScalarValue};
1212

1313
use crate::arrays::ConstantArray;
14-
use crate::compute::{compare, Operator};
14+
use crate::compute::{Operator, compare};
1515
use crate::expr::traversal::{NodeExt, NodeVisitor, TraversalOrder};
1616
use crate::expr::{ChildName, ExprId, Expression, ExpressionView, StatsCatalog, VTable, VTableExt};
1717
use crate::{Array, ArrayRef, IntoArray};

vortex-array/src/expr/exprs/get_item.rs

Lines changed: 11 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ use std::ops::Not;
66

77
use prost::Message;
88
use vortex_dtype::{DType, FieldName, FieldPath, Nullability};
9-
use vortex_error::{vortex_bail, vortex_err, VortexResult};
9+
use vortex_error::{VortexResult, vortex_bail, vortex_err};
1010
use vortex_proto::expr as pb;
1111

1212
use crate::compute::mask;
@@ -94,34 +94,20 @@ impl VTable for GetItem {
9494
}
9595
}
9696

97-
fn stat_max(
97+
fn stat_expression(
9898
&self,
9999
expr: &ExpressionView<Self>,
100+
stat: Stat,
100101
catalog: &dyn StatsCatalog,
101102
) -> Option<Expression> {
102-
catalog.stats_ref(&FieldPath::from_name(expr.data().clone()), Stat::Max)
103-
}
104-
105-
fn stat_min(
106-
&self,
107-
expr: &ExpressionView<Self>,
108-
catalog: &dyn StatsCatalog,
109-
) -> Option<Expression> {
110-
catalog.stats_ref(&FieldPath::from_name(expr.data().clone()), Stat::Min)
111-
}
112-
113-
fn stat_nan_count(
114-
&self,
115-
expr: &ExpressionView<Self>,
116-
catalog: &dyn StatsCatalog,
117-
) -> Option<Expression> {
118-
catalog.stats_ref(&FieldPath::from_name(expr.data().clone()), Stat::NaNCount)
119-
}
120-
121-
fn stat_field_path(&self, expr: &ExpressionView<Self>) -> Option<FieldPath> {
122-
expr.children()[0]
123-
.stat_field_path()
124-
.map(|fp| fp.push(expr.data().clone()))
103+
// TODO(ngates): I think we can do better here and support stats over nested fields.
104+
// It would be nice if delegating to our child would return a struct of statistics
105+
// matching the nested DType such that we can write:
106+
// `get_item(expr.child(0).stat_expression(...), expr.data().field_name())`
107+
108+
// TODO(ngates): For now, we maintain the existing *broken* behavior of assuming that
109+
// getitem refers to a field on the root scope.
110+
catalog.stats_ref(&FieldPath::from_name(expr.data().clone()), stat)
125111
}
126112
}
127113

vortex-array/src/expr/exprs/is_null.rs

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ use std::fmt::Formatter;
55
use std::ops::Not;
66

77
use vortex_dtype::{DType, Nullability};
8-
use vortex_error::{vortex_bail, VortexResult};
8+
use vortex_error::{VortexResult, vortex_bail};
99
use vortex_mask::Mask;
1010

1111
use crate::arrays::{BoolArray, ConstantArray};
@@ -74,9 +74,7 @@ impl VTable for IsNull {
7474
expr: &ExpressionView<Self>,
7575
catalog: &dyn StatsCatalog,
7676
) -> Option<Expression> {
77-
expr.child(0).stat_nan_count(catalog);
78-
let field_path = expr.children()[0].stat_field_path()?;
79-
let null_count_expr = catalog.stats_ref(&field_path, Stat::NullCount)?;
77+
let null_count_expr = expr.child(0).stat_expression(Stat::NullCount, catalog)?;
8078
Some(eq(null_count_expr, lit(0u64)))
8179
}
8280
}
@@ -103,6 +101,7 @@ mod tests {
103101
use vortex_utils::aliases::hash_set::HashSet;
104102

105103
use super::is_null;
104+
use crate::IntoArray;
106105
use crate::arrays::{PrimitiveArray, StructArray};
107106
use crate::expr::exprs::binary::eq;
108107
use crate::expr::exprs::get_item::{col, get_item};
@@ -111,7 +110,6 @@ mod tests {
111110
use crate::expr::pruning::checked_pruning_expr;
112111
use crate::expr::test_harness;
113112
use crate::stats::Stat;
114-
use crate::IntoArray;
115113

116114
#[test]
117115
fn dtype() {

vortex-array/src/expr/exprs/list_contains.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,13 @@
44
use std::fmt::Formatter;
55

66
use vortex_dtype::DType;
7-
use vortex_error::{vortex_bail, VortexResult};
7+
use vortex_error::{VortexResult, vortex_bail};
88

9+
use crate::ArrayRef;
910
use crate::compute::list_contains as compute_list_contains;
1011
use crate::expr::exprs::binary::{and, gt, lt, or};
11-
use crate::expr::exprs::literal::{lit, Literal};
12+
use crate::expr::exprs::literal::{Literal, lit};
1213
use crate::expr::{ChildName, ExprId, Expression, ExpressionView, StatsCatalog, VTable, VTableExt};
13-
use crate::ArrayRef;
1414

1515
pub struct ListContains;
1616

0 commit comments

Comments
 (0)