-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Allow struct field access projections to be pushed down into scans #19538
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
e62a110
ccbdc5e
aab1d3a
11e9775
6c31dfe
22c826c
6bb96c0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -122,6 +122,11 @@ impl ScalarUDF { | |
| Self { inner: fun } | ||
| } | ||
|
|
||
| /// Returns true if this function is trivial (cheap to evaluate). | ||
| pub fn is_trivial(&self) -> bool { | ||
| self.inner.is_trivial() | ||
| } | ||
|
|
||
| /// Return the underlying [`ScalarUDFImpl`] trait object for this function | ||
| pub fn inner(&self) -> &Arc<dyn ScalarUDFImpl> { | ||
| &self.inner | ||
|
|
@@ -846,6 +851,18 @@ pub trait ScalarUDFImpl: Debug + DynEq + DynHash + Send + Sync { | |
| fn documentation(&self) -> Option<&Documentation> { | ||
| None | ||
| } | ||
|
|
||
| /// Returns true if this function is trivial (cheap to evaluate). | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I suggest that a good rule of thumb here is that the function takes constant time per RecordBatch (aka it doesn't depend on the number of rows in the batch). Struct field access and column have this property but other functions don't |
||
| /// | ||
| /// Trivial functions are lightweight accessor functions like `get_field` | ||
| /// (struct field access) that simply access nested data within a column | ||
| /// without significant computation. | ||
| /// | ||
| /// This is used to identify expressions that are cheap to duplicate or | ||
| /// don't benefit from caching/partitioning optimizations. | ||
| fn is_trivial(&self) -> bool { | ||
| false | ||
| } | ||
| } | ||
|
|
||
| /// ScalarUDF that adds an alias to the underlying function. It is better to | ||
|
|
@@ -964,6 +981,10 @@ impl ScalarUDFImpl for AliasedScalarUDFImpl { | |
| fn documentation(&self) -> Option<&Documentation> { | ||
| self.inner.documentation() | ||
| } | ||
|
|
||
| fn is_trivial(&self) -> bool { | ||
| self.inner.is_trivial() | ||
| } | ||
| } | ||
|
|
||
| #[cfg(test)] | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -499,6 +499,10 @@ impl ScalarUDFImpl for GetFieldFunc { | |
| fn documentation(&self) -> Option<&Documentation> { | ||
| self.doc() | ||
| } | ||
|
|
||
| fn is_trivial(&self) -> bool { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I recommend some comments explaining the rationale -- namely to allow these accesses to be pushed down into scans |
||
| true | ||
| } | ||
| } | ||
|
|
||
| #[cfg(test)] | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -535,10 +535,8 @@ fn merge_consecutive_projections(proj: Projection) -> Result<Transformed<Project | |
| // For details, see: https://github.com/apache/datafusion/issues/8296 | ||
| if column_referral_map.into_iter().any(|(col, usage)| { | ||
| usage > 1 | ||
| && !is_expr_trivial( | ||
| &prev_projection.expr | ||
| [prev_projection.schema.index_of_column(col).unwrap()], | ||
| ) | ||
| && !prev_projection.expr[prev_projection.schema.index_of_column(col).unwrap()] | ||
| .is_trivial() | ||
| }) { | ||
| // no change | ||
| return Projection::try_new_with_schema(expr, input, schema).map(Transformed::no); | ||
|
|
@@ -591,11 +589,6 @@ fn merge_consecutive_projections(proj: Projection) -> Result<Transformed<Project | |
| } | ||
| } | ||
|
|
||
| // Check whether `expr` is trivial; i.e. it doesn't imply any computation. | ||
| fn is_expr_trivial(expr: &Expr) -> bool { | ||
| matches!(expr, Expr::Column(_) | Expr::Literal(_, _)) | ||
| } | ||
|
Comment on lines
-594
to
-597
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As evidenced by the existing functions for both logical and physical expressions this was already a concept and implementation within the codebase, so all this PR is really doing is allowing arbitrary functions / expressions to declare themselves as trivial. |
||
|
|
||
| /// Rewrites a projection expression using the projection before it (i.e. its input) | ||
| /// This is a subroutine to the `merge_consecutive_projections` function. | ||
| /// | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this is somewhat interesting that it materializes the constant in the scan. This is probably ok, but it does mean that constant may now get carried as a constant record batch up through the plan many 🤔