Skip to content

Commit 3ba3509

Browse files
committed
Stat expression
Signed-off-by: Nicholas Gates <[email protected]>
1 parent 76aeed6 commit 3ba3509

File tree

13 files changed

+75
-204
lines changed

13 files changed

+75
-204
lines changed

vortex-array/src/expr/analysis.rs

Lines changed: 1 addition & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -14,64 +14,7 @@ pub trait StatsCatalog {
1414
/// This is likely to be a column expression, or a literal.
1515
///
1616
/// Returns `None` if the stat is not available for the field path.
17-
fn stats_ref(&mut self, _field_path: &FieldPath, _stat: Stat) -> Option<Expression> {
17+
fn stats_ref(&self, _field_path: &FieldPath, _stat: Stat) -> Option<Expression> {
1818
None
1919
}
2020
}
21-
22-
/// This can be used by expressions to plug into Vortex expression analysis, such as
23-
/// pruning or expression simplification.
24-
pub trait AnalysisExpr {
25-
/// An expression over zone-statistics which implies all records in the zone evaluate to false.
26-
///
27-
/// Given an expression, `e`, if `e.stat_falsification(..)` evaluates to true, it is guaranteed
28-
/// that `e` evaluates to false on all records in the zone. However, the inverse is not
29-
/// necessarily true: even if the falsification evaluates to false, `e` need not evaluate to
30-
/// true on all records.
31-
///
32-
/// The [`StatsCatalog`] can be used to constrain or rename stats used in the final expr.
33-
///
34-
/// # Examples
35-
///
36-
/// - An expression over one variable: `x > 0` is false for all records in a zone if the maximum
37-
/// value of the column `x` in that zone is less than or equal to zero: `max(x) <= 0`.
38-
/// - An expression over two variables: `x > y` becomes `max(x) <= min(y)`.
39-
/// - A conjunctive expression: `x > y AND z < x` becomes `max(x) <= min(y) OR min(z) >= max(x)`.
40-
///
41-
/// Some expressions, in theory, have falsifications but this function does not support them
42-
/// such as `x < (y < z)` or `x LIKE "needle%"`.
43-
fn stat_falsification(&self, _catalog: &mut dyn StatsCatalog) -> Option<Expression> {
44-
None
45-
}
46-
47-
/// An expression for the upper non-null bound of this expression, if available.
48-
///
49-
/// This function returns None if there is no upper bound or it is difficult to compute.
50-
///
51-
/// The returned expression evaluates to null if the maximum value is unknown. In that case, you
52-
/// _must not_ assume the array is empty _nor_ may you assume the array only contains non-null
53-
/// values.
54-
fn max(&self, _catalog: &mut dyn StatsCatalog) -> Option<Expression> {
55-
None
56-
}
57-
58-
/// An expression for the lower non-null bound of this expression, if available.
59-
///
60-
/// See [`AnalysisExpr::max`] for important details.
61-
fn min(&self, _catalog: &mut dyn StatsCatalog) -> Option<Expression> {
62-
None
63-
}
64-
65-
/// An expression for the NaN count for a column, if available.
66-
///
67-
/// This method returns `None` if the NaNCount stat is unknown.
68-
fn nan_count(&self, _catalog: &mut dyn StatsCatalog) -> Option<Expression> {
69-
None
70-
}
71-
72-
fn field_path(&self) -> Option<FieldPath> {
73-
None
74-
}
75-
76-
// TODO: add containment
77-
}

vortex-array/src/expr/expression.rs

Lines changed: 10 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,13 @@ use std::fmt::{Display, Formatter};
77
use std::hash::{Hash, Hasher};
88
use std::sync::Arc;
99

10-
use vortex_dtype::{DType, FieldPath};
10+
use vortex_dtype::DType;
1111
use vortex_error::{VortexExpect, VortexResult};
1212

13-
use crate::ArrayRef;
1413
use crate::expr::display::DisplayTreeExpr;
1514
use crate::expr::{ChildName, ExprId, ExprVTable, ExpressionView, StatsCatalog, VTable};
15+
use crate::stats::Stat;
16+
use crate::ArrayRef;
1617

1718
/// A node in a Vortex expression tree.
1819
///
@@ -158,39 +159,17 @@ impl Expression {
158159
///
159160
/// Some expressions, in theory, have falsifications but this function does not support them
160161
/// such as `x < (y < z)` or `x LIKE "needle%"`.
161-
pub fn stat_falsification(&self, catalog: &mut dyn StatsCatalog) -> Option<Expression> {
162+
pub fn stat_falsification(&self, catalog: &dyn StatsCatalog) -> Option<Expression> {
162163
self.vtable.as_dyn().stat_falsification(self, catalog)
163164
}
164165

165-
/// An expression for the upper non-null bound of this expression, if available.
166-
///
167-
/// This function returns None if there is no upper bound, or it is difficult to compute.
168-
///
169-
/// The returned expression evaluates to null if the maximum value is unknown. In that case, you
170-
/// _must not_ assume the array is empty _nor_ may you assume the array only contains non-null
171-
/// values.
172-
pub fn stat_max(&self, catalog: &mut dyn StatsCatalog) -> Option<Expression> {
173-
self.vtable.as_dyn().stat_max(self, catalog)
174-
}
175-
176-
/// An expression for the lower non-null bound of this expression, if available.
166+
/// Returns an expression representing the zoned statistic for the given stat, if available.
177167
///
178-
/// See [`Expression::stat_max`] for important details.
179-
pub fn stat_min(&self, catalog: &mut dyn StatsCatalog) -> Option<Expression> {
180-
self.vtable.as_dyn().stat_min(self, catalog)
181-
}
182-
183-
/// An expression for the NaN count for a column, if available.
184-
///
185-
/// This method returns `None` if the NaNCount stat is unknown.
186-
pub fn stat_nan_count(&self, catalog: &mut dyn StatsCatalog) -> Option<Expression> {
187-
self.vtable.as_dyn().stat_nan_count(self, catalog)
188-
}
189-
190-
// TODO(ngates): I'm not sure what this is really for? We need to clean up stats compute for
191-
// expressions.
192-
pub fn stat_field_path(&self) -> Option<FieldPath> {
193-
self.vtable.as_dyn().stat_field_path(self)
168+
/// The [`StatsCatalog`] returns expressions that can be evaluated using the zone map as a
169+
/// scope. Expressions can implement this function to propagate such statistics through the
170+
/// expression tree. For example, the `a + 10` expression could propagate `min: min(a) + 10`.
171+
pub fn stat_expression(&self, stat: Stat, catalog: &dyn StatsCatalog) -> Option<Expression> {
172+
self.vtable.as_dyn().stat_expression(self, stat, catalog)
194173
}
195174

196175
/// Format the expression as a compact string.

vortex-array/src/expr/exprs/between.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,15 @@ use std::fmt::Formatter;
66
use prost::Message;
77
use vortex_dtype::DType;
88
use vortex_dtype::DType::Bool;
9-
use vortex_error::{VortexExpect, VortexResult, vortex_bail};
9+
use vortex_error::{vortex_bail, VortexExpect, VortexResult};
1010
use vortex_proto::expr as pb;
1111

12-
use crate::ArrayRef;
13-
use crate::compute::{BetweenOptions, between as between_compute};
12+
use crate::compute::{between as between_compute, BetweenOptions};
1413
use crate::expr::expression::Expression;
1514
use crate::expr::exprs::binary::Binary;
1615
use crate::expr::exprs::operators::Operator;
1716
use crate::expr::{ChildName, ExprId, ExpressionView, StatsCatalog, VTable, VTableExt};
17+
use crate::ArrayRef;
1818

1919
/// An optimized scalar expression to compute whether values fall between two bounds.
2020
///
@@ -139,7 +139,7 @@ impl VTable for Between {
139139
fn stat_falsification(
140140
&self,
141141
expr: &ExpressionView<Self>,
142-
catalog: &mut dyn StatsCatalog,
142+
catalog: &dyn StatsCatalog,
143143
) -> Option<Expression> {
144144
expr.to_binary_expr().stat_falsification(catalog)
145145
}

vortex-array/src/expr/exprs/binary.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,15 @@ use std::fmt::Formatter;
55

66
use prost::Message;
77
use vortex_dtype::DType;
8-
use vortex_error::{VortexExpect, VortexResult, vortex_bail};
8+
use vortex_error::{vortex_bail, VortexExpect, VortexResult};
99
use vortex_proto::expr as pb;
1010

1111
use crate::compute::{add, and_kleene, compare, div, mul, or_kleene, sub};
1212
use crate::expr::expression::Expression;
1313
use crate::expr::exprs::literal::lit;
1414
use crate::expr::exprs::operators::Operator;
1515
use crate::expr::{ChildName, ExprId, ExpressionView, StatsCatalog, VTable, VTableExt};
16-
use crate::{ArrayRef, compute};
16+
use crate::{compute, ArrayRef};
1717

1818
pub struct Binary;
1919

@@ -104,7 +104,7 @@ impl VTable for Binary {
104104
fn stat_falsification(
105105
&self,
106106
expr: &ExpressionView<Self>,
107-
catalog: &mut dyn StatsCatalog,
107+
catalog: &dyn StatsCatalog,
108108
) -> Option<Expression> {
109109
// Wrap another predicate with an optional NaNCount check, if the stat is available.
110110
//
@@ -124,7 +124,7 @@ impl VTable for Binary {
124124
lhs: &Expression,
125125
rhs: &Expression,
126126
value_predicate: Expression,
127-
catalog: &mut dyn StatsCatalog,
127+
catalog: &dyn StatsCatalog,
128128
) -> Expression {
129129
let nan_predicate = lhs
130130
.stat_nan_count(catalog)
@@ -510,7 +510,7 @@ mod tests {
510510
use super::{and, and_collect, and_collect_right, eq, gt, gt_eq, lt, lt_eq, not_eq, or};
511511
use crate::expr::exprs::get_item::col;
512512
use crate::expr::exprs::literal::lit;
513-
use crate::expr::{Expression, test_harness};
513+
use crate::expr::{test_harness, Expression};
514514

515515
#[test]
516516
fn and_collect_left_assoc() {

vortex-array/src/expr/exprs/cast.rs

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,13 @@ use std::ops::Deref;
66

77
use prost::Message;
88
use vortex_dtype::{DType, FieldPath};
9-
use vortex_error::{VortexExpect, VortexResult, vortex_bail, vortex_err};
9+
use vortex_error::{vortex_bail, vortex_err, VortexExpect, VortexResult};
1010
use vortex_proto::expr as pb;
1111

12-
use crate::ArrayRef;
1312
use crate::compute::cast as compute_cast;
1413
use crate::expr::expression::Expression;
1514
use crate::expr::{ChildName, ExprId, ExpressionView, StatsCatalog, VTable, VTableExt};
15+
use crate::ArrayRef;
1616

1717
/// A cast expression that converts values to a target data type.
1818
pub struct Cast;
@@ -89,23 +89,23 @@ impl VTable for Cast {
8989
fn stat_max(
9090
&self,
9191
expr: &ExpressionView<Self>,
92-
catalog: &mut dyn StatsCatalog,
92+
catalog: &dyn StatsCatalog,
9393
) -> Option<Expression> {
9494
expr.children()[0].stat_max(catalog)
9595
}
9696

9797
fn stat_min(
9898
&self,
9999
expr: &ExpressionView<Self>,
100-
catalog: &mut dyn StatsCatalog,
100+
catalog: &dyn StatsCatalog,
101101
) -> Option<Expression> {
102102
expr.children()[0].stat_min(catalog)
103103
}
104104

105105
fn stat_nan_count(
106106
&self,
107107
expr: &ExpressionView<Self>,
108-
catalog: &mut dyn StatsCatalog,
108+
catalog: &dyn StatsCatalog,
109109
) -> Option<Expression> {
110110
expr.children()[0].stat_nan_count(catalog)
111111
}
@@ -136,11 +136,11 @@ mod tests {
136136
use vortex_error::VortexUnwrap as _;
137137

138138
use super::cast;
139-
use crate::IntoArray;
140139
use crate::arrays::StructArray;
141140
use crate::expr::exprs::get_item::get_item;
142141
use crate::expr::exprs::root::root;
143-
use crate::expr::{Expression, test_harness};
142+
use crate::expr::{test_harness, Expression};
143+
use crate::IntoArray;
144144

145145
#[test]
146146
fn dtype() {

vortex-array/src/expr/exprs/dynamic.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,11 @@ use std::sync::Arc;
77

88
use parking_lot::Mutex;
99
use vortex_dtype::DType;
10-
use vortex_error::{VortexExpect, VortexResult, vortex_bail};
10+
use vortex_error::{vortex_bail, VortexExpect, VortexResult};
1111
use vortex_scalar::{Scalar, ScalarValue};
1212

1313
use crate::arrays::ConstantArray;
14-
use crate::compute::{Operator, compare};
14+
use crate::compute::{compare, Operator};
1515
use crate::expr::traversal::{NodeExt, NodeVisitor, TraversalOrder};
1616
use crate::expr::{ChildName, ExprId, Expression, ExpressionView, StatsCatalog, VTable, VTableExt};
1717
use crate::{Array, ArrayRef, IntoArray};
@@ -91,7 +91,7 @@ impl VTable for DynamicComparison {
9191
fn stat_falsification(
9292
&self,
9393
expr: &ExpressionView<DynamicComparison>,
94-
catalog: &mut dyn StatsCatalog,
94+
catalog: &dyn StatsCatalog,
9595
) -> Option<Expression> {
9696
match expr.data().operator {
9797
Operator::Gt => Some(DynamicComparison.new_expr(

vortex-array/src/expr/exprs/get_item.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ use std::ops::Not;
66

77
use prost::Message;
88
use vortex_dtype::{DType, FieldName, FieldPath, Nullability};
9-
use vortex_error::{VortexResult, vortex_bail, vortex_err};
9+
use vortex_error::{vortex_bail, vortex_err, VortexResult};
1010
use vortex_proto::expr as pb;
1111

1212
use crate::compute::mask;
@@ -97,23 +97,23 @@ impl VTable for GetItem {
9797
fn stat_max(
9898
&self,
9999
expr: &ExpressionView<Self>,
100-
catalog: &mut dyn StatsCatalog,
100+
catalog: &dyn StatsCatalog,
101101
) -> Option<Expression> {
102102
catalog.stats_ref(&FieldPath::from_name(expr.data().clone()), Stat::Max)
103103
}
104104

105105
fn stat_min(
106106
&self,
107107
expr: &ExpressionView<Self>,
108-
catalog: &mut dyn StatsCatalog,
108+
catalog: &dyn StatsCatalog,
109109
) -> Option<Expression> {
110110
catalog.stats_ref(&FieldPath::from_name(expr.data().clone()), Stat::Min)
111111
}
112112

113113
fn stat_nan_count(
114114
&self,
115115
expr: &ExpressionView<Self>,
116-
catalog: &mut dyn StatsCatalog,
116+
catalog: &dyn StatsCatalog,
117117
) -> Option<Expression> {
118118
catalog.stats_ref(&FieldPath::from_name(expr.data().clone()), Stat::NaNCount)
119119
}

vortex-array/src/expr/exprs/is_null.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ use std::fmt::Formatter;
55
use std::ops::Not;
66

77
use vortex_dtype::{DType, Nullability};
8-
use vortex_error::{VortexResult, vortex_bail};
8+
use vortex_error::{vortex_bail, VortexResult};
99
use vortex_mask::Mask;
1010

1111
use crate::arrays::{BoolArray, ConstantArray};
@@ -72,8 +72,9 @@ impl VTable for IsNull {
7272
fn stat_falsification(
7373
&self,
7474
expr: &ExpressionView<Self>,
75-
catalog: &mut dyn StatsCatalog,
75+
catalog: &dyn StatsCatalog,
7676
) -> Option<Expression> {
77+
expr.child(0).stat_nan_count(catalog);
7778
let field_path = expr.children()[0].stat_field_path()?;
7879
let null_count_expr = catalog.stats_ref(&field_path, Stat::NullCount)?;
7980
Some(eq(null_count_expr, lit(0u64)))
@@ -102,7 +103,6 @@ mod tests {
102103
use vortex_utils::aliases::hash_set::HashSet;
103104

104105
use super::is_null;
105-
use crate::IntoArray;
106106
use crate::arrays::{PrimitiveArray, StructArray};
107107
use crate::expr::exprs::binary::eq;
108108
use crate::expr::exprs::get_item::{col, get_item};
@@ -111,6 +111,7 @@ mod tests {
111111
use crate::expr::pruning::checked_pruning_expr;
112112
use crate::expr::test_harness;
113113
use crate::stats::Stat;
114+
use crate::IntoArray;
114115

115116
#[test]
116117
fn dtype() {

vortex-array/src/expr/exprs/list_contains.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,13 @@
44
use std::fmt::Formatter;
55

66
use vortex_dtype::DType;
7-
use vortex_error::{VortexResult, vortex_bail};
7+
use vortex_error::{vortex_bail, VortexResult};
88

9-
use crate::ArrayRef;
109
use crate::compute::list_contains as compute_list_contains;
1110
use crate::expr::exprs::binary::{and, gt, lt, or};
12-
use crate::expr::exprs::literal::{Literal, lit};
11+
use crate::expr::exprs::literal::{lit, Literal};
1312
use crate::expr::{ChildName, ExprId, Expression, ExpressionView, StatsCatalog, VTable, VTableExt};
13+
use crate::ArrayRef;
1414

1515
pub struct ListContains;
1616

@@ -84,7 +84,7 @@ impl VTable for ListContains {
8484
fn stat_falsification(
8585
&self,
8686
expr: &ExpressionView<Self>,
87-
catalog: &mut dyn StatsCatalog,
87+
catalog: &dyn StatsCatalog,
8888
) -> Option<Expression> {
8989
// falsification(contains([1,2,5], x)) =>
9090
// falsification(x != 1) and falsification(x != 2) and falsification(x != 5)

0 commit comments

Comments
 (0)