2626import org .apache .doris .nereids .trees .expressions .literal .Literal ;
2727import org .apache .doris .nereids .trees .plans .JoinType ;
2828import org .apache .doris .nereids .trees .plans .algebra .Join ;
29- import org .apache .doris .nereids .trees .plans .logical .LogicalJoin ;
3029import org .apache .doris .nereids .util .ExpressionUtils ;
3130import org .apache .doris .statistics .ColumnStatistic ;
3231import org .apache .doris .statistics .ColumnStatisticBuilder ;
@@ -52,6 +51,7 @@ public class JoinEstimation {
5251 private static double TRUSTABLE_CONDITION_SELECTIVITY_POW_FACTOR = 2.0 ;
5352 private static double UNTRUSTABLE_CONDITION_SELECTIVITY_LINEAR_FACTOR = 0.9 ;
5453 private static double TRUSTABLE_UNIQ_THRESHOLD = 0.9 ;
54+ private static double OUTER_JOIN_NULL_SUPPLELMENT_RATIO = 0.1 ;
5555
5656 private static EqualPredicate normalizeEqualPredJoinCondition (EqualPredicate equal , Statistics rightStats ) {
5757 boolean changeOrder = equal .left ().getInputSlots ().stream ()
@@ -363,16 +363,17 @@ private static Statistics estimateSemiOrAnti(Statistics leftStats, Statistics ri
363363 * outer join generates nulls.
364364 * for example, T1 left outer join T2,
365365 * in join results, columns from T2 contain nulls.
366- * we estimate the numNulls as inner_join_rows - semi_join_rows
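366+ * we estimate the numNulls of each T2 column as max(T1.row - inner_join_rows, T1.row * 0.1), plus, when the column already has nulls, T2.numNulls / T2.row * estimated_join_rows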
367367 */
368- private static void updateNumNullsForOuterJoin (Statistics crossJoinStats , Statistics targetSide ,
369- double supplementNulls ) {
370- for (Map .Entry <Expression , ColumnStatistic > entry : targetSide .columnStatistics ().entrySet ()) {
371- double numNulls = supplementNulls ;
368+ private static void updateNumNullsForOuterJoin (Statistics crossJoinStats , Statistics innerJoinStats ,
369+ Statistics probeStats , Statistics buildStats , double estJoinRowCount ) {
370+ for (Map .Entry <Expression , ColumnStatistic > entry : buildStats .columnStatistics ().entrySet ()) {
371+ double numNulls = Math .max (probeStats .getRowCount () - innerJoinStats .getRowCount (),
372+ probeStats .getRowCount () * OUTER_JOIN_NULL_SUPPLELMENT_RATIO );
372373 if (!entry .getValue ().isUnKnown ()) {
373374 if (entry .getValue ().numNulls > 0 ) {
374- numNulls = Math . max ( 1 , supplementNulls ) ;
375- numNulls = Math .min ( entry . getValue (). numNulls , numNulls );
375+ numNulls += entry . getValue (). numNulls / buildStats . getRowCount () * estJoinRowCount ;
376+ numNulls = Math .max ( 1 , numNulls );
376377 }
377378 ColumnStatistic colStats = new ColumnStatisticBuilder (entry .getValue ())
378379 .setNumNulls (numNulls )
@@ -402,32 +403,19 @@ public static Statistics estimate(Statistics leftStats, Statistics rightStats, J
402403 return innerJoinStats ;
403404 } else if (joinType == JoinType .LEFT_OUTER_JOIN ) {
404405 double rowCount = Math .max (leftStats .getRowCount (), innerJoinStats .getRowCount ());
405- LogicalJoin leftSemi = ((LogicalJoin ) join ).withJoinType (JoinType .LEFT_SEMI_JOIN );
406- Statistics semiStats = estimateSemiOrAnti (leftStats , rightStats , innerJoinStats , leftSemi );
407- double supplementNull = Math .max (1 , leftStats .getRowCount () - semiStats .getRowCount ());
408- updateNumNullsForOuterJoin (crossJoinStats , rightStats , supplementNull );
406+ updateNumNullsForOuterJoin (crossJoinStats , innerJoinStats , leftStats , rightStats , rowCount );
409407 updateJoinConditionColumnStatistics (crossJoinStats , join );
410408 return crossJoinStats .withRowCountAndEnforceValid (rowCount );
411409 } else if (joinType == JoinType .RIGHT_OUTER_JOIN ) {
412410 double rowCount = Math .max (rightStats .getRowCount (), innerJoinStats .getRowCount ());
413- LogicalJoin rightSemi = ((LogicalJoin ) join ).withJoinType (JoinType .RIGHT_SEMI_JOIN );
414- Statistics semiStats = estimateSemiOrAnti (leftStats , rightStats , innerJoinStats , rightSemi );
415- double supplementNull = Math .max (1 , rightStats .getRowCount () - semiStats .getRowCount ());
416- updateNumNullsForOuterJoin (crossJoinStats , leftStats , supplementNull );
411+ updateNumNullsForOuterJoin (crossJoinStats , innerJoinStats , rightStats , leftStats , rowCount );
417412 updateJoinConditionColumnStatistics (crossJoinStats , join );
418413 return crossJoinStats .withRowCountAndEnforceValid (rowCount );
419414 } else if (joinType == JoinType .FULL_OUTER_JOIN ) {
420415 double rowCount = Math .max (leftStats .getRowCount (), innerJoinStats .getRowCount ());
421416 rowCount = Math .max (rightStats .getRowCount (), rowCount );
422- LogicalJoin leftSemiJoin = ((LogicalJoin ) join ).withJoinType (JoinType .LEFT_SEMI_JOIN );
423- Statistics leftSemiStats = estimateSemiOrAnti (leftStats , rightStats , innerJoinStats , leftSemiJoin );
424- double supplementNullRight = Math .max (1 , leftStats .getRowCount () - leftSemiStats .getRowCount ());
425- updateNumNullsForOuterJoin (crossJoinStats , rightStats , supplementNullRight );
426-
427- LogicalJoin rightSemiJoin = ((LogicalJoin ) join ).withJoinType (JoinType .RIGHT_SEMI_JOIN );
428- Statistics rightSemiStats = estimateSemiOrAnti (leftStats , leftStats , innerJoinStats , rightSemiJoin );
429- double supplementNullLeft = Math .max (1 , rightStats .getRowCount () - rightSemiStats .getRowCount ());
430- updateNumNullsForOuterJoin (crossJoinStats , leftStats , supplementNullLeft );
417+ updateNumNullsForOuterJoin (crossJoinStats , innerJoinStats , leftStats , rightStats , rowCount );
418+ updateNumNullsForOuterJoin (crossJoinStats , innerJoinStats , rightStats , leftStats , rowCount );
431419 updateJoinConditionColumnStatistics (crossJoinStats , join );
432420 return crossJoinStats .withRowCountAndEnforceValid (rowCount );
433421 } else if (joinType == JoinType .CROSS_JOIN ) {
0 commit comments