@@ -45,6 +45,7 @@ use datafusion::{
4545 catalog:: TableReference ,
4646 error:: DataFusionError ,
4747 logical_plan:: {
48+ Repartition , Subquery ,
4849 build_join_schema, build_table_udf_schema, exprlist_to_fields,
4950 exprlist_to_fields_from_schema, normalize_cols,
5051 plan:: { Aggregate , Extension , Filter , Join , Projection , Sort , TableUDFs , Window } ,
@@ -1351,10 +1352,18 @@ impl LanguageToLogicalPlanConverter {
13511352 LogicalPlanLanguage :: Join ( params) => {
13521353 let left_on = match_data_node ! ( node_by_id, params[ 2 ] , JoinLeftOn ) ;
13531354 let right_on = match_data_node ! ( node_by_id, params[ 3 ] , JoinRightOn ) ;
1354- let left = self . to_logical_plan ( params[ 0 ] ) ;
1355- let right = self . to_logical_plan ( params[ 1 ] ) ;
1356-
1357- if self . is_cube_scan_node ( params[ 0 ] ) && self . is_cube_scan_node ( params[ 1 ] ) {
1355+ let left = self . to_logical_plan ( params[ 0 ] ) ?;
1356+ let right = self . to_logical_plan ( params[ 1 ] ) ?;
1357+
1358+ // It's OK to join two grouped queries: expected row count is not that high, so
1359+ // SQL API can, potentially, evaluate it completely
1360+ // We don't really want it, so cost function should make WrappedSelect preferable
1361+ // but still, we don't want to hard error on that
1362+ // But if any one of join sides is ungrouped, SQL API does not have much of a choice
1363+ // but to process every row from ungrouped query, and that's Not Good
1364+ if Self :: have_ungrouped_cube_scan_inside ( & left)
1365+ || Self :: have_ungrouped_cube_scan_inside ( & right)
1366+ {
13581367 if left_on. iter ( ) . any ( |c| c. name == "__cubeJoinField" )
13591368 || right_on. iter ( ) . any ( |c| c. name == "__cubeJoinField" )
13601369 {
@@ -1371,8 +1380,8 @@ impl LanguageToLogicalPlanConverter {
13711380 }
13721381 }
13731382
1374- let left = Arc :: new ( left? ) ;
1375- let right = Arc :: new ( right? ) ;
1383+ let left = Arc :: new ( left) ;
1384+ let right = Arc :: new ( right) ;
13761385
13771386 let join_type = match_data_node ! ( node_by_id, params[ 4 ] , JoinJoinType ) ;
13781387 let join_constraint = match_data_node ! ( node_by_id, params[ 5 ] , JoinJoinConstraint ) ;
@@ -1395,7 +1404,18 @@ impl LanguageToLogicalPlanConverter {
13951404 } )
13961405 }
13971406 LogicalPlanLanguage :: CrossJoin ( params) => {
1398- if self . is_cube_scan_node ( params[ 0 ] ) && self . is_cube_scan_node ( params[ 1 ] ) {
1407+ let left = self . to_logical_plan ( params[ 0 ] ) ?;
1408+ let right = self . to_logical_plan ( params[ 1 ] ) ?;
1409+
1410+ // See comment in Join conversion
1411+ // Note that DF can generate Filter(CrossJoin(...)) for complex join conditions
1412+ // But, from memory or dataset perspective it's the same: DF would buffer left side completely
1413+ // And then iterate over right side, evaluating predicate
1414+ // Regular join would use hash partitioning here, so it would be quicker, and utilize less CPU,
1415+ // but transfer and buffering will be the same
1416+ if Self :: have_ungrouped_cube_scan_inside ( & left)
1417+ || Self :: have_ungrouped_cube_scan_inside ( & right)
1418+ {
13991419 return Err ( CubeError :: internal (
14001420 "Can not join Cubes. This is most likely due to one of the following reasons:\n \
14011421 • one of the cubes contains a group by\n \
@@ -1404,8 +1424,8 @@ impl LanguageToLogicalPlanConverter {
14041424 ) ) ;
14051425 }
14061426
1407- let left = Arc :: new ( self . to_logical_plan ( params [ 0 ] ) ? ) ;
1408- let right = Arc :: new ( self . to_logical_plan ( params [ 1 ] ) ? ) ;
1427+ let left = Arc :: new ( left ) ;
1428+ let right = Arc :: new ( right ) ;
14091429 let schema = Arc :: new ( left. schema ( ) . join ( right. schema ( ) ) ?) ;
14101430
14111431 LogicalPlan :: CrossJoin ( CrossJoin {
@@ -2305,16 +2325,44 @@ impl LanguageToLogicalPlanConverter {
23052325 } )
23062326 }
23072327
2308- fn is_cube_scan_node ( & self , node_id : Id ) -> bool {
2309- let node_by_id = & self . best_expr ;
2310- match node_by_id. index ( node_id) {
2311- LogicalPlanLanguage :: CubeScan ( _) | LogicalPlanLanguage :: CubeScanWrapper ( _) => {
2312- return true
2328+ fn have_ungrouped_cube_scan_inside ( node : & LogicalPlan ) -> bool {
2329+ match node {
2330+ LogicalPlan :: Projection ( Projection { input, .. } )
2331+ | LogicalPlan :: Filter ( Filter { input, .. } )
2332+ | LogicalPlan :: Window ( Window { input, .. } )
2333+ | LogicalPlan :: Aggregate ( Aggregate { input, .. } )
2334+ | LogicalPlan :: Sort ( Sort { input, .. } )
2335+ | LogicalPlan :: Repartition ( Repartition { input, .. } )
2336+ | LogicalPlan :: Limit ( Limit { input, .. } ) => {
2337+ Self :: have_ungrouped_cube_scan_inside ( input)
2338+ }
2339+ LogicalPlan :: Join ( Join { left, right, .. } )
2340+ | LogicalPlan :: CrossJoin ( CrossJoin { left, right, .. } ) => {
2341+ Self :: have_ungrouped_cube_scan_inside ( left)
2342+ || Self :: have_ungrouped_cube_scan_inside ( right)
2343+ }
2344+ LogicalPlan :: Union ( Union { inputs, .. } ) => {
2345+ inputs. iter ( ) . any ( Self :: have_ungrouped_cube_scan_inside)
2346+ }
2347+ LogicalPlan :: Subquery ( Subquery {
2348+ input, subqueries, ..
2349+ } ) => {
2350+ Self :: have_ungrouped_cube_scan_inside ( input)
2351+ || subqueries. iter ( ) . any ( Self :: have_ungrouped_cube_scan_inside)
2352+ }
2353+ LogicalPlan :: Extension ( Extension { node } ) => {
2354+ if let Some ( cube_scan) = node. as_any ( ) . downcast_ref :: < CubeScanNode > ( ) {
2355+ cube_scan. request . ungrouped == Some ( true )
2356+ } else if let Some ( cube_scan_wrapper) =
2357+ node. as_any ( ) . downcast_ref :: < CubeScanWrapperNode > ( )
2358+ {
2359+ cube_scan_wrapper. has_ungrouped_scan ( )
2360+ } else {
2361+ false
2362+ }
23132363 }
2314- _ => ( ) ,
2364+ _ => false ,
23152365 }
2316-
2317- return false ;
23182366 }
23192367}
23202368
0 commit comments