@@ -50,7 +50,7 @@ use datafusion::{
5050 plan:: { Aggregate , Extension , Filter , Join , Projection , Sort , TableUDFs , Window } ,
5151 replace_col_to_expr, Column , CrossJoin , DFField , DFSchema , DFSchemaRef , Distinct ,
5252 EmptyRelation , Expr , ExprRewritable , ExprRewriter , GroupingSet , Like , Limit , LogicalPlan ,
53- LogicalPlanBuilder , TableScan , Union ,
53+ LogicalPlanBuilder , Repartition , Subquery , TableScan , Union ,
5454 } ,
5555 physical_plan:: planner:: DefaultPhysicalPlanner ,
5656 scalar:: ScalarValue ,
@@ -1350,10 +1350,18 @@ impl LanguageToLogicalPlanConverter {
13501350 LogicalPlanLanguage :: Join ( params) => {
13511351 let left_on = match_data_node ! ( node_by_id, params[ 2 ] , JoinLeftOn ) ;
13521352 let right_on = match_data_node ! ( node_by_id, params[ 3 ] , JoinRightOn ) ;
1353- let left = self . to_logical_plan ( params[ 0 ] ) ;
1354- let right = self . to_logical_plan ( params[ 1 ] ) ;
1355-
1356- if self . is_cube_scan_node ( params[ 0 ] ) && self . is_cube_scan_node ( params[ 1 ] ) {
1353+ let left = self . to_logical_plan ( params[ 0 ] ) ?;
1354+ let right = self . to_logical_plan ( params[ 1 ] ) ?;
1355+
1356+ // It's OK to join two grouped queries: expected row count is not that high, so
1357+ // SQL API can, potentially, evaluate it completely
1358+ // We don't really want it, so cost function should make WrappedSelect preferable
1359+ // but still, we don't want to hard error on that
1360+ // But if any one of join sides is ungrouped, SQL API does not have much of a choice
1361+ // but to process every row from ungrouped query, and that's Not Good
1362+ if Self :: have_ungrouped_cube_scan_inside ( & left)
1363+ || Self :: have_ungrouped_cube_scan_inside ( & right)
1364+ {
13571365 if left_on. iter ( ) . any ( |c| c. name == "__cubeJoinField" )
13581366 || right_on. iter ( ) . any ( |c| c. name == "__cubeJoinField" )
13591367 {
@@ -1370,8 +1378,8 @@ impl LanguageToLogicalPlanConverter {
13701378 }
13711379 }
13721380
1373- let left = Arc :: new ( left? ) ;
1374- let right = Arc :: new ( right? ) ;
1381+ let left = Arc :: new ( left) ;
1382+ let right = Arc :: new ( right) ;
13751383
13761384 let join_type = match_data_node ! ( node_by_id, params[ 4 ] , JoinJoinType ) ;
13771385 let join_constraint = match_data_node ! ( node_by_id, params[ 5 ] , JoinJoinConstraint ) ;
@@ -1394,7 +1402,18 @@ impl LanguageToLogicalPlanConverter {
13941402 } )
13951403 }
13961404 LogicalPlanLanguage :: CrossJoin ( params) => {
1397- if self . is_cube_scan_node ( params[ 0 ] ) && self . is_cube_scan_node ( params[ 1 ] ) {
1405+ let left = self . to_logical_plan ( params[ 0 ] ) ?;
1406+ let right = self . to_logical_plan ( params[ 1 ] ) ?;
1407+
1408+ // See comment in Join conversion
1409+ // Note that DF can generate Filter(CrossJoin(...)) for complex join conditions
1410+ // But, from memory or dataset perspective it's the same: DF would buffer left side completely
1411+ // And then iterate over right side, evaluating predicate
1412+ // Regular join would use hash partitioning here, so it would be quicker, and utilize less CPU,
1413+ // but transfer and buffering will be the same
1414+ if Self :: have_ungrouped_cube_scan_inside ( & left)
1415+ || Self :: have_ungrouped_cube_scan_inside ( & right)
1416+ {
13981417 return Err ( CubeError :: internal (
13991418 "Can not join Cubes. This is most likely due to one of the following reasons:\n \
14001419 • one of the cubes contains a group by\n \
@@ -1403,8 +1422,8 @@ impl LanguageToLogicalPlanConverter {
14031422 ) ) ;
14041423 }
14051424
1406- let left = Arc :: new ( self . to_logical_plan ( params [ 0 ] ) ? ) ;
1407- let right = Arc :: new ( self . to_logical_plan ( params [ 1 ] ) ? ) ;
1425+ let left = Arc :: new ( left ) ;
1426+ let right = Arc :: new ( right ) ;
14081427 let schema = Arc :: new ( left. schema ( ) . join ( right. schema ( ) ) ?) ;
14091428
14101429 LogicalPlan :: CrossJoin ( CrossJoin {
@@ -2304,16 +2323,44 @@ impl LanguageToLogicalPlanConverter {
23042323 } )
23052324 }
23062325
2307- fn is_cube_scan_node ( & self , node_id : Id ) -> bool {
2308- let node_by_id = & self . best_expr ;
2309- match node_by_id. index ( node_id) {
2310- LogicalPlanLanguage :: CubeScan ( _) | LogicalPlanLanguage :: CubeScanWrapper ( _) => {
2311- return true
2326+ fn have_ungrouped_cube_scan_inside ( node : & LogicalPlan ) -> bool {
2327+ match node {
2328+ LogicalPlan :: Projection ( Projection { input, .. } )
2329+ | LogicalPlan :: Filter ( Filter { input, .. } )
2330+ | LogicalPlan :: Window ( Window { input, .. } )
2331+ | LogicalPlan :: Aggregate ( Aggregate { input, .. } )
2332+ | LogicalPlan :: Sort ( Sort { input, .. } )
2333+ | LogicalPlan :: Repartition ( Repartition { input, .. } )
2334+ | LogicalPlan :: Limit ( Limit { input, .. } ) => {
2335+ Self :: have_ungrouped_cube_scan_inside ( input)
2336+ }
2337+ LogicalPlan :: Join ( Join { left, right, .. } )
2338+ | LogicalPlan :: CrossJoin ( CrossJoin { left, right, .. } ) => {
2339+ Self :: have_ungrouped_cube_scan_inside ( left)
2340+ || Self :: have_ungrouped_cube_scan_inside ( right)
2341+ }
2342+ LogicalPlan :: Union ( Union { inputs, .. } ) => {
2343+ inputs. iter ( ) . any ( Self :: have_ungrouped_cube_scan_inside)
2344+ }
2345+ LogicalPlan :: Subquery ( Subquery {
2346+ input, subqueries, ..
2347+ } ) => {
2348+ Self :: have_ungrouped_cube_scan_inside ( input)
2349+ || subqueries. iter ( ) . any ( Self :: have_ungrouped_cube_scan_inside)
2350+ }
2351+ LogicalPlan :: Extension ( Extension { node } ) => {
2352+ if let Some ( cube_scan) = node. as_any ( ) . downcast_ref :: < CubeScanNode > ( ) {
2353+ cube_scan. request . ungrouped == Some ( true )
2354+ } else if let Some ( cube_scan_wrapper) =
2355+ node. as_any ( ) . downcast_ref :: < CubeScanWrapperNode > ( )
2356+ {
2357+ cube_scan_wrapper. has_ungrouped_scan ( )
2358+ } else {
2359+ false
2360+ }
23122361 }
2313- _ => ( ) ,
2362+ _ => false ,
23142363 }
2315-
2316- return false ;
23172364 }
23182365}
23192366
0 commit comments