-
Notifications
You must be signed in to change notification settings - Fork 272
feat: Add support for explode and explode_outer for array inputs
#2836
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
Changes from 18 commits
b50829b
21e9bf2
0ea6dce
36cb3bc
de1f012
0aa8048
f5fc1dc
6105dc1
a3cfffe
28a1752
c1a564c
234c231
a595a18
d0b90d7
3b00224
876e7a1
f9dc3e5
96336ab
311e044
28b7e37
0929551
c7d2385
0eb2016
a65a432
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -96,9 +96,11 @@ use arrow::array::{ | |
| use arrow::buffer::{BooleanBuffer, NullBuffer, OffsetBuffer}; | ||
| use arrow::row::{OwnedRow, RowConverter, SortField}; | ||
| use datafusion::common::utils::SingleRowListArrayBuilder; | ||
| use datafusion::common::UnnestOptions; | ||
| use datafusion::physical_plan::coalesce_batches::CoalesceBatchesExec; | ||
| use datafusion::physical_plan::filter::FilterExec; | ||
| use datafusion::physical_plan::limit::GlobalLimitExec; | ||
| use datafusion::physical_plan::unnest::{ListUnnest, UnnestExec}; | ||
| use datafusion_comet_proto::spark_expression::ListLiteral; | ||
| use datafusion_comet_proto::spark_operator::SparkFilePartition; | ||
| use datafusion_comet_proto::{ | ||
|
|
@@ -1528,6 +1530,107 @@ impl PhysicalPlanner { | |
| Arc::new(SparkPlan::new(spark_plan.plan_id, expand, vec![child])), | ||
| )) | ||
| } | ||
| OpStruct::Explode(explode) => { | ||
| assert_eq!(children.len(), 1); | ||
| let (scans, child) = self.create_plan(&children[0], inputs, partition_count)?; | ||
|
|
||
| // Create the expression for the array to explode | ||
| let child_expr = if let Some(child_expr) = &explode.child { | ||
| self.create_expr(child_expr, child.schema())? | ||
| } else { | ||
| return Err(ExecutionError::GeneralError( | ||
| "Explode operator requires a child expression".to_string(), | ||
| )); | ||
| }; | ||
|
|
||
| // Create projection expressions for other columns | ||
| let projections: Vec<Arc<dyn PhysicalExpr>> = explode | ||
| .project_list | ||
| .iter() | ||
| .map(|expr| self.create_expr(expr, child.schema())) | ||
| .collect::<Result<Vec<_>, _>>()?; | ||
|
|
||
| // For UnnestExec, we need to add a projection to put the columns in the right order: | ||
| // 1. First add all projection columns | ||
| // 2. Then add the array column to be exploded | ||
| // Then UnnestExec will unnest the last column | ||
|
|
||
| let mut project_exprs: Vec<(Arc<dyn PhysicalExpr>, String)> = projections | ||
| .iter() | ||
| .enumerate() | ||
| .map(|(idx, expr)| (Arc::clone(expr), format!("col_{idx}"))) | ||
| .collect(); | ||
|
|
||
| // Add the array column as the last column | ||
| let array_col_name = format!("col_{}", projections.len()); | ||
|
||
| project_exprs.push((Arc::clone(&child_expr), array_col_name.clone())); | ||
|
|
||
| // Create a projection to arrange columns as needed | ||
| let project_exec = Arc::new(ProjectionExec::try_new( | ||
| project_exprs, | ||
| Arc::clone(&child.native_plan), | ||
| )?); | ||
|
|
||
| // Get the input schema from the projection | ||
| let project_schema = project_exec.schema(); | ||
|
|
||
| // Build the output schema for UnnestExec | ||
| // The output schema replaces the list column with its element type | ||
| let mut output_fields: Vec<Field> = Vec::new(); | ||
|
|
||
| // Add all projection columns (non-array columns) | ||
| for i in 0..projections.len() { | ||
| output_fields.push(project_schema.field(i).clone()); | ||
| } | ||
|
|
||
| // Add the unnested array element field | ||
| // Extract the element type from the list/array type | ||
| let array_field = project_schema.field(projections.len()); | ||
| let element_type = match array_field.data_type() { | ||
| DataType::List(field) => field.data_type().clone(), | ||
| dt => { | ||
| return Err(ExecutionError::GeneralError(format!( | ||
| "Expected List type for explode, got {:?}", | ||
| dt | ||
| ))) | ||
| } | ||
| }; | ||
|
|
||
| // The output column has the same name as the input array column | ||
| // but with the element type instead of the list type | ||
| output_fields.push(Field::new( | ||
| array_field.name(), | ||
| element_type, | ||
| true, // Element is nullable after unnesting | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 👍 |
||
| )); | ||
|
|
||
| let output_schema = Arc::new(Schema::new(output_fields)); | ||
|
|
||
| // Use UnnestExec to explode the last column (the array column) | ||
| // ListUnnest specifies which column to unnest and the depth (1 for single level) | ||
| let list_unnest = ListUnnest { | ||
| index_in_input_schema: projections.len(), // Index of the array column to unnest | ||
| depth: 1, // Unnest one level (explode single array) | ||
| }; | ||
|
|
||
| let unnest_options = UnnestOptions { | ||
| preserve_nulls: explode.outer, | ||
| recursions: vec![], | ||
| }; | ||
|
|
||
| let unnest_exec = Arc::new(UnnestExec::new( | ||
| project_exec, | ||
| vec![list_unnest], | ||
| vec![], // No struct columns to unnest | ||
| output_schema, | ||
| unnest_options, | ||
| )); | ||
|
|
||
| Ok(( | ||
| scans, | ||
| Arc::new(SparkPlan::new(spark_plan.plan_id, unnest_exec, vec![child])), | ||
| )) | ||
| } | ||
| OpStruct::SortMergeJoin(join) => { | ||
| let (join_params, scans) = self.parse_join_parameters( | ||
| inputs, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,4 @@ | ||
| SELECT i_item_sk, explode(array(i_brand_id, i_class_id, i_category_id, i_manufact_id, i_manager_id)) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. should we also have |
||
| FROM item | ||
| ORDER BY i_item_sk | ||
| LIMIT 1000 | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
other columns? 🤔
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yes, as in
SELECT a, b, c, explode(d) FROM ...