Skip to content

Commit 557afef

Browse files
authored
Consolidate into stat_expression function (#5366)
This removes the unused stat_field_path function, as well as improves coverage for propagating other statistics. Interestingly it has highlighted the difference between pruning stats (min, max, is_null), and aggregate stats (sum, null_count, etc.). I have some ideas for generalizing these concepts, but that's for another day. --------- Signed-off-by: Nicholas Gates <[email protected]>
1 parent a891151 commit 557afef

File tree

13 files changed

+140
-275
lines changed

13 files changed

+140
-275
lines changed

vortex-array/src/expr/analysis.rs

Lines changed: 1 addition & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -14,64 +14,7 @@ pub trait StatsCatalog {
1414
/// This is likely to be a column expression, or a literal.
1515
///
1616
/// Returns `None` if the stat is not available for the field path.
17-
fn stats_ref(&mut self, _field_path: &FieldPath, _stat: Stat) -> Option<Expression> {
17+
fn stats_ref(&self, _field_path: &FieldPath, _stat: Stat) -> Option<Expression> {
1818
None
1919
}
2020
}
21-
22-
/// This can be used by expression to plug into vortex expression analysis, such as
23-
/// pruning or expression simplification
24-
pub trait AnalysisExpr {
25-
/// An expression over zone-statistics which implies all records in the zone evaluate to false.
26-
///
27-
/// Given an expression, `e`, if `e.stat_falsification(..)` evaluates to true, it is guaranteed
28-
/// that `e` evaluates to false on all records in the zone. However, the inverse is not
29-
/// necessarily true: even if the falsification evaluates to false, `e` need not evaluate to
30-
/// true on all records.
31-
///
32-
/// The [`StatsCatalog`] can be used to constrain or rename stats used in the final expr.
33-
///
34-
/// # Examples
35-
///
36-
/// - An expression over one variable: `x > 0` is false for all records in a zone if the maximum
37-
/// value of the column `x` in that zone is less than or equal to zero: `max(x) <= 0`.
38-
/// - An expression over two variables: `x > y` becomes `max(x) <= min(y)`.
39-
/// - A conjunctive expression: `x > y AND z < x` becomes `max(x) <= min(y) OR min(z) >= max(x).
40-
///
41-
/// Some expressions, in theory, have falsifications but this function does not support them
42-
/// such as `x < (y < z)` or `x LIKE "needle%"`.
43-
fn stat_falsification(&self, _catalog: &mut dyn StatsCatalog) -> Option<Expression> {
44-
None
45-
}
46-
47-
/// An expression for the upper non-null bound of this expression, if available.
48-
///
49-
/// This function returns None if there is no upper bound or it is difficult to compute.
50-
///
51-
/// The returned expression evaluates to null if the maximum value is unknown. In that case, you
52-
/// _must not_ assume the array is empty _nor_ may you assume the array only contains non-null
53-
/// values.
54-
fn max(&self, _catalog: &mut dyn StatsCatalog) -> Option<Expression> {
55-
None
56-
}
57-
58-
/// An expression for the lower non-null bound of this expression, if available.
59-
///
60-
/// See [AnalysisExpr::max] for important details.
61-
fn min(&self, _catalog: &mut dyn StatsCatalog) -> Option<Expression> {
62-
None
63-
}
64-
65-
/// An expression for the NaN count for a column, if available.
66-
///
67-
/// This method returns `None` if the NaNCount stat is unknown.
68-
fn nan_count(&self, _catalog: &mut dyn StatsCatalog) -> Option<Expression> {
69-
None
70-
}
71-
72-
fn field_path(&self) -> Option<FieldPath> {
73-
None
74-
}
75-
76-
// TODO: add containment
77-
}

vortex-array/src/expr/expression.rs

Lines changed: 19 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,14 @@ use std::fmt::{Debug, Display, Formatter};
77
use std::hash::{Hash, Hasher};
88
use std::sync::Arc;
99

10-
use vortex_dtype::{DType, FieldPath};
10+
use vortex_dtype::DType;
1111
use vortex_error::{VortexExpect, VortexResult};
1212
use vortex_vector::Vector;
1313

1414
use crate::ArrayRef;
1515
use crate::expr::display::DisplayTreeExpr;
1616
use crate::expr::{ChildName, ExprId, ExprVTable, ExpressionView, StatsCatalog, VTable};
17+
use crate::stats::Stat;
1718

1819
/// A node in a Vortex expression tree.
1920
///
@@ -164,39 +165,34 @@ impl Expression {
164165
///
165166
/// Some expressions, in theory, have falsifications but this function does not support them
166167
/// such as `x < (y < z)` or `x LIKE "needle%"`.
167-
pub fn stat_falsification(&self, catalog: &mut dyn StatsCatalog) -> Option<Expression> {
168+
pub fn stat_falsification(&self, catalog: &dyn StatsCatalog) -> Option<Expression> {
168169
self.vtable.as_dyn().stat_falsification(self, catalog)
169170
}
170171

171-
/// An expression for the upper non-null bound of this expression, if available.
172+
/// Returns an expression representing the zoned statistic for the given stat, if available.
172173
///
173-
/// This function returns None if there is no upper bound, or it is difficult to compute.
174+
/// The [`StatsCatalog`] returns expressions that can be evaluated using the zone map as a
175+
/// scope. Expressions can implement this function to propagate such statistics through the
176+
/// expression tree. For example, the `a + 10` expression could propagate `min: min(a) + 10`.
174177
///
175-
/// The returned expression evaluates to null if the maximum value is unknown. In that case, you
176-
/// _must not_ assume the array is empty _nor_ may you assume the array only contains non-null
177-
/// values.
178-
pub fn stat_max(&self, catalog: &mut dyn StatsCatalog) -> Option<Expression> {
179-
self.vtable.as_dyn().stat_max(self, catalog)
178+
/// NOTE(gatesn): we currently cannot represent statistics over nested fields. Please file an
179+
/// issue to discuss a solution to this.
180+
pub fn stat_expression(&self, stat: Stat, catalog: &dyn StatsCatalog) -> Option<Expression> {
181+
self.vtable.as_dyn().stat_expression(self, stat, catalog)
180182
}
181183

182-
/// An expression for the lower non-null bound of this expression, if available.
184+
/// Returns an expression representing the zoned maximum statistic, if available.
183185
///
184-
/// See [`Expression::stat_max`] for important details.
185-
pub fn stat_min(&self, catalog: &mut dyn StatsCatalog) -> Option<Expression> {
186-
self.vtable.as_dyn().stat_min(self, catalog)
186+
/// See [`Self::stat_expression`] for details.
187+
pub fn stat_min(&self, catalog: &dyn StatsCatalog) -> Option<Expression> {
188+
self.stat_expression(Stat::Min, catalog)
187189
}
188190

189-
/// An expression for the NaN count for a column, if available.
191+
/// Returns an expression representing the zoned maximum statistic, if available.
190192
///
191-
/// This method returns `None` if the NaNCount stat is unknown.
192-
pub fn stat_nan_count(&self, catalog: &mut dyn StatsCatalog) -> Option<Expression> {
193-
self.vtable.as_dyn().stat_nan_count(self, catalog)
194-
}
195-
196-
// TODO(ngates): I'm not sure what this is really for? We need to clean up stats compute for
197-
// expressions.
198-
pub fn stat_field_path(&self) -> Option<FieldPath> {
199-
self.vtable.as_dyn().stat_field_path(self)
193+
/// See [`Self::stat_expression`] for details.
194+
pub fn stat_max(&self, catalog: &dyn StatsCatalog) -> Option<Expression> {
195+
self.stat_expression(Stat::Max, catalog)
200196
}
201197

202198
/// Format the expression as a compact string.

vortex-array/src/expr/exprs/between.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,7 @@ impl VTable for Between {
139139
fn stat_falsification(
140140
&self,
141141
expr: &ExpressionView<Self>,
142-
catalog: &mut dyn StatsCatalog,
142+
catalog: &dyn StatsCatalog,
143143
) -> Option<Expression> {
144144
expr.to_binary_expr().stat_falsification(catalog)
145145
}

vortex-array/src/expr/exprs/binary.rs

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ use crate::expr::expression::Expression;
1313
use crate::expr::exprs::literal::lit;
1414
use crate::expr::exprs::operators::Operator;
1515
use crate::expr::{ChildName, ExprId, ExpressionView, StatsCatalog, VTable, VTableExt};
16+
use crate::stats::Stat;
1617
use crate::{ArrayRef, compute};
1718

1819
pub struct Binary;
@@ -104,7 +105,7 @@ impl VTable for Binary {
104105
fn stat_falsification(
105106
&self,
106107
expr: &ExpressionView<Self>,
107-
catalog: &mut dyn StatsCatalog,
108+
catalog: &dyn StatsCatalog,
108109
) -> Option<Expression> {
109110
// Wrap another predicate with an optional NaNCount check, if the stat is available.
110111
//
@@ -124,12 +125,12 @@ impl VTable for Binary {
124125
lhs: &Expression,
125126
rhs: &Expression,
126127
value_predicate: Expression,
127-
catalog: &mut dyn StatsCatalog,
128+
catalog: &dyn StatsCatalog,
128129
) -> Expression {
129130
let nan_predicate = lhs
130-
.stat_nan_count(catalog)
131+
.stat_expression(Stat::NaNCount, catalog)
131132
.into_iter()
132-
.chain(rhs.stat_nan_count(catalog))
133+
.chain(rhs.stat_expression(Stat::NaNCount, catalog))
133134
.map(|nans| eq(nans, lit(0u64)))
134135
.reduce(and);
135136

vortex-array/src/expr/exprs/cast.rs

Lines changed: 29 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,15 @@ use std::fmt::Formatter;
55
use std::ops::Deref;
66

77
use prost::Message;
8-
use vortex_dtype::{DType, FieldPath};
8+
use vortex_dtype::DType;
99
use vortex_error::{VortexExpect, VortexResult, vortex_bail, vortex_err};
1010
use vortex_proto::expr as pb;
1111

1212
use crate::ArrayRef;
1313
use crate::compute::cast as compute_cast;
1414
use crate::expr::expression::Expression;
1515
use crate::expr::{ChildName, ExprId, ExpressionView, StatsCatalog, VTable, VTableExt};
16+
use crate::stats::Stat;
1617

1718
/// A cast expression that converts values to a target data type.
1819
pub struct Cast;
@@ -86,36 +87,36 @@ impl VTable for Cast {
8687
})
8788
}
8889

89-
fn stat_max(
90+
fn stat_expression(
9091
&self,
9192
expr: &ExpressionView<Self>,
92-
catalog: &mut dyn StatsCatalog,
93+
stat: Stat,
94+
catalog: &dyn StatsCatalog,
9395
) -> Option<Expression> {
94-
expr.child(0)
95-
.stat_max(catalog)
96-
.map(|x| cast(x, expr.data().clone()))
97-
}
98-
99-
fn stat_min(
100-
&self,
101-
expr: &ExpressionView<Self>,
102-
catalog: &mut dyn StatsCatalog,
103-
) -> Option<Expression> {
104-
expr.child(0)
105-
.stat_min(catalog)
106-
.map(|x| cast(x, expr.data().clone()))
107-
}
108-
109-
fn stat_nan_count(
110-
&self,
111-
expr: &ExpressionView<Self>,
112-
catalog: &mut dyn StatsCatalog,
113-
) -> Option<Expression> {
114-
expr.child(0).stat_nan_count(catalog)
115-
}
116-
117-
fn stat_field_path(&self, expr: &ExpressionView<Self>) -> Option<FieldPath> {
118-
expr.children()[0].stat_field_path()
96+
match stat {
97+
Stat::IsConstant
98+
| Stat::IsSorted
99+
| Stat::IsStrictSorted
100+
| Stat::NaNCount
101+
| Stat::Sum
102+
| Stat::UncompressedSizeInBytes => expr.child(0).stat_expression(stat, catalog),
103+
Stat::Max | Stat::Min => {
104+
// We cast min/max to the new type
105+
expr.child(0)
106+
.stat_expression(stat, catalog)
107+
.map(|x| cast(x, expr.data().clone()))
108+
}
109+
Stat::NullCount => {
110+
// if !expr.data().is_nullable() {
111+
// NOTE(ngates): we should decide on the semantics here. In theory, the null
112+
// count of something cast to non-nullable will be zero. But if we return
113+
// that we know this to be zero, then a pruning predicate may eliminate data
114+
// that would otherwise have caused the cast to error.
115+
// return Some(lit(0u64));
116+
// }
117+
None
118+
}
119+
}
119120
}
120121
}
121122

vortex-array/src/expr/exprs/dynamic.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ impl VTable for DynamicComparison {
9191
fn stat_falsification(
9292
&self,
9393
expr: &ExpressionView<DynamicComparison>,
94-
catalog: &mut dyn StatsCatalog,
94+
catalog: &dyn StatsCatalog,
9595
) -> Option<Expression> {
9696
match expr.data().operator {
9797
Operator::Gt => Some(DynamicComparison.new_expr(

vortex-array/src/expr/exprs/get_item/mod.rs

Lines changed: 12 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -121,34 +121,21 @@ impl VTable for GetItem {
121121
Ok(field)
122122
}
123123

124-
fn stat_max(
124+
fn stat_expression(
125125
&self,
126126
expr: &ExpressionView<Self>,
127-
catalog: &mut dyn StatsCatalog,
127+
stat: Stat,
128+
catalog: &dyn StatsCatalog,
128129
) -> Option<Expression> {
129-
catalog.stats_ref(&FieldPath::from_name(expr.data().clone()), Stat::Max)
130-
}
131-
132-
fn stat_min(
133-
&self,
134-
expr: &ExpressionView<Self>,
135-
catalog: &mut dyn StatsCatalog,
136-
) -> Option<Expression> {
137-
catalog.stats_ref(&FieldPath::from_name(expr.data().clone()), Stat::Min)
138-
}
139-
140-
fn stat_nan_count(
141-
&self,
142-
expr: &ExpressionView<Self>,
143-
catalog: &mut dyn StatsCatalog,
144-
) -> Option<Expression> {
145-
catalog.stats_ref(&FieldPath::from_name(expr.data().clone()), Stat::NaNCount)
146-
}
147-
148-
fn stat_field_path(&self, expr: &ExpressionView<Self>) -> Option<FieldPath> {
149-
expr.children()[0]
150-
.stat_field_path()
151-
.map(|fp| fp.push(expr.data().clone()))
130+
// TODO(ngates): I think we can do better here and support stats over nested fields.
131+
// It would be nice if delegating to our child would return a struct of statistics
132+
// matching the nested DType such that we can write:
133+
// `get_item(expr.child(0).stat_expression(...), expr.data().field_name())`
134+
135+
// TODO(ngates): this is a bug whereby we may return stats for a nested field of the same
136+
// name as a field in the root struct. This should be resolved with upcoming change to
137+
// falsify expressions, but for now I'm preserving the existing buggy behavior.
138+
catalog.stats_ref(&FieldPath::from_name(expr.data().clone()), stat)
152139
}
153140
}
154141

vortex-array/src/expr/exprs/is_null.rs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -88,10 +88,9 @@ impl VTable for IsNull {
8888
fn stat_falsification(
8989
&self,
9090
expr: &ExpressionView<Self>,
91-
catalog: &mut dyn StatsCatalog,
91+
catalog: &dyn StatsCatalog,
9292
) -> Option<Expression> {
93-
let field_path = expr.children()[0].stat_field_path()?;
94-
let null_count_expr = catalog.stats_ref(&field_path, Stat::NullCount)?;
93+
let null_count_expr = expr.child(0).stat_expression(Stat::NullCount, catalog)?;
9594
Some(eq(null_count_expr, lit(0u64)))
9695
}
9796
}

vortex-array/src/expr/exprs/list_contains.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ impl VTable for ListContains {
8484
fn stat_falsification(
8585
&self,
8686
expr: &ExpressionView<Self>,
87-
catalog: &mut dyn StatsCatalog,
87+
catalog: &dyn StatsCatalog,
8888
) -> Option<Expression> {
8989
// falsification(contains([1,2,5], x)) =>
9090
// falsification(x != 1) and falsification(x != 2) and falsification(x != 5)

0 commit comments

Comments
 (0)