diff --git a/docs/changelog/103080.yaml b/docs/changelog/103080.yaml new file mode 100644 index 0000000000000..47adcc5b5acc0 --- /dev/null +++ b/docs/changelog/103080.yaml @@ -0,0 +1,8 @@ +pr: 103080 +summary: "ESQL: `mv_expand` default limit fix" +area: ES|QL +type: bug +issues: + - 101266 + - 102084 + - 102061 diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/analysis/Analyzer.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/analysis/Analyzer.java index bf97faea7ae74..b991fd7024448 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/analysis/Analyzer.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/analysis/Analyzer.java @@ -49,7 +49,9 @@ import org.elasticsearch.xpack.ql.plan.logical.EsRelation; import org.elasticsearch.xpack.ql.plan.logical.Limit; import org.elasticsearch.xpack.ql.plan.logical.LogicalPlan; +import org.elasticsearch.xpack.ql.plan.logical.OrderBy; import org.elasticsearch.xpack.ql.plan.logical.Project; +import org.elasticsearch.xpack.ql.plan.logical.UnaryPlan; import org.elasticsearch.xpack.ql.rule.ParameterizedRule; import org.elasticsearch.xpack.ql.rule.ParameterizedRuleExecutor; import org.elasticsearch.xpack.ql.rule.Rule; @@ -651,7 +653,75 @@ public LogicalPlan apply(LogicalPlan logicalPlan, AnalyzerContext context) { } else { limit = context.configuration().resultTruncationMaxSize(); // user provided a limit: cap result entries to the max } - return new Limit(Source.EMPTY, new Literal(Source.EMPTY, limit, DataTypes.INTEGER), logicalPlan); + + Limit l = new Limit(Source.EMPTY, new Literal(Source.EMPTY, limit, DataTypes.INTEGER), logicalPlan); + return maybeAddDefaultLimitForMvExpand(l); + } + + /** + * This adds the implicit limit to a plan that has a sort and no limit between EsRelation and the first MvExpand. + * To date, the only known use case that "needs" this is a query of the form + * from test + * | sort emp_no + * | mv_expand first_name + * | rename first_name AS x + * | where x LIKE "*a*" + * | limit 15 + * + * or + * + * from test + * | sort emp_no + * | mv_expand first_name + * | sort first_name + * | limit 15 + * + * LogicalPlanAnalyzer.PushDownAndCombineLimits rule will copy the "limit 15" after "sort emp_no" if there is no filter + * on the expanded values OR if there is no sort between "limit" and "mv_expand". In these two situations, to be able for + * "sort emp_no" to form a TopN, we need a limit. Since the "limit 15" in the query cannot be pushed down (otherwise, it will change + * the results that reach mv_expand command) we need some kind of value there. For now, this is the implicit limit. + * The second query above becomes: + * + * from test + * | sort emp_no + * | limit 10000 + * | mv_expand first_name + * | sort first_name + * | limit 15 + */ + private Limit maybeAddDefaultLimitForMvExpand(Limit limit) { + LogicalPlan plan = limit.child(); + MvExpand mvExpand = null; + UnaryPlan esRelationParent = null; + UnaryPlan orderByParent = null; + + // basically, locate the closest to Lucene mv_expand and any potential sort + while (plan instanceof UnaryPlan unaryPlan) { + if (plan instanceof MvExpand mve) { + mvExpand = mve; + orderByParent = null; + } else if (plan instanceof OrderBy && mvExpand != null) { + orderByParent = esRelationParent; + } + plan = unaryPlan.child(); + esRelationParent = unaryPlan; + } + + // when these two are found, place the default limit before sort + if (mvExpand != null && orderByParent != null && plan instanceof EsRelation) { + var duplicateLimit = new Limit(limit.source(), limit.limit(), orderByParent.child()); + return limit.replaceChild(propagateLimitUntilEsRelation(duplicateLimit, orderByParent, (UnaryPlan) limit.child())); + } + + return limit; + } + + private LogicalPlan propagateLimitUntilEsRelation(Limit duplicateLimit, UnaryPlan esRelationParent, UnaryPlan child) { + if (child == esRelationParent) { + return esRelationParent.replaceChild(duplicateLimit); + } else { + return child.replaceChild(propagateLimitUntilEsRelation(duplicateLimit, esRelationParent, (UnaryPlan) child.child())); + } } } diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/LogicalPlanOptimizer.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/LogicalPlanOptimizer.java index 66654b78c3af4..ed5b7f250fdcf 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/LogicalPlanOptimizer.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/LogicalPlanOptimizer.java @@ -52,7 +52,6 @@ import org.elasticsearch.xpack.ql.optimizer.OptimizerRules.SetAsOptimized; import org.elasticsearch.xpack.ql.optimizer.OptimizerRules.SimplifyComparisonsArithmetics; import org.elasticsearch.xpack.ql.plan.logical.Aggregate; -import org.elasticsearch.xpack.ql.plan.logical.EsRelation; import org.elasticsearch.xpack.ql.plan.logical.Filter; import org.elasticsearch.xpack.ql.plan.logical.Limit; import org.elasticsearch.xpack.ql.plan.logical.LogicalPlan; @@ -62,7 +61,6 @@ import org.elasticsearch.xpack.ql.rule.ParameterizedRule; import org.elasticsearch.xpack.ql.rule.ParameterizedRuleExecutor; import org.elasticsearch.xpack.ql.rule.Rule; -import org.elasticsearch.xpack.ql.tree.Source; import org.elasticsearch.xpack.ql.type.DataType; import org.elasticsearch.xpack.ql.type.DataTypes; import org.elasticsearch.xpack.ql.util.CollectionUtils; @@ -155,10 +153,9 @@ protected static List> rules() { new PushDownAndCombineLimits(), new ReplaceLimitAndSortAsTopN() ); - var defaultTopN = new Batch<>("Add default TopN", new AddDefaultTopN()); var label = new Batch<>("Set as Optimized", Limiter.ONCE, new SetAsOptimized()); - return asList(substitutions, operators, skip, cleanup, defaultTopN, label); + return asList(substitutions, operators, skip, cleanup, label); } // TODO: currently this rule only works for aggregate functions (AVG) @@ -493,13 +490,16 @@ protected LogicalPlan rule(Limit limit) { || child instanceof Limit; if (shouldSkip == false && child instanceof UnaryPlan unary) { - MvExpand mvExpand = descendantMvExpand(unary); + // in case unary is THE MvExpand, return it right away + MvExpand mvExpand = unary instanceof MvExpand mve ? mve : descendantMvExpand(unary); if (mvExpand != null) { Limit limitBeforeMvExpand = limitBeforeMvExpand(mvExpand); // if there is no "appropriate" limit before mv_expand, then push down a copy of the one after it so that: // - a possible TopN is properly built as low as possible in the tree (closed to Lucene) // - the input of mv_expand is as small as possible before it is expanded (less rows to inflate and occupy memory) - if (limitBeforeMvExpand == null) { + // limitBeforeMvExpand > limit is a cheap way of not indefinetely copying the same limit past mv_expand + // ">" will enforce the limit of the mv_expand, limitting the results that come to mv_expand to bare minimum + if (limitBeforeMvExpand == null || (int) limitBeforeMvExpand.limit().fold() > (int) limit.limit().fold()) { var duplicateLimit = new Limit(limit.source(), limit.limit(), mvExpand.child()); return limit.replaceChild(propagateDuplicateLimitUntilMvExpand(duplicateLimit, mvExpand, unary)); } @@ -511,7 +511,7 @@ protected LogicalPlan rule(Limit limit) { private static MvExpand descendantMvExpand(UnaryPlan unary) { UnaryPlan plan = unary; AttributeSet filterReferences = new AttributeSet(); - while (plan instanceof Aggregate == false) { + while (plan instanceof UnaryPlan) { if (plan instanceof MvExpand mve) { // don't return the mv_expand that has a filter after it which uses the expanded values // since this will trigger the use of a potentially incorrect (too restrictive) limit further down in the tree @@ -532,6 +532,8 @@ private static MvExpand descendantMvExpand(UnaryPlan unary) { // something like from test | sort emp_no | mv_expand job_positions | sort first_name | limit 5 // (the sort first_name likely changes the order of the docs after sort emp_no, so "limit 5" shouldn't be copied down return null; + } else if (plan instanceof Limit) { + return null; } if (plan.child() instanceof UnaryPlan unaryPlan) { @@ -545,7 +547,7 @@ private static MvExpand descendantMvExpand(UnaryPlan unary) { private static Limit limitBeforeMvExpand(MvExpand mvExpand) { UnaryPlan plan = mvExpand; - while (plan instanceof Aggregate == false) { + while (plan instanceof UnaryPlan) { if (plan instanceof Limit limit) { return limit; } @@ -1016,40 +1018,6 @@ protected LogicalPlan rule(Limit plan) { } } - /** - * This adds an explicit TopN node to a plan that only has an OrderBy right before Lucene. - * To date, the only known use case that "needs" this is a query of the form - * from test - * | sort emp_no - * | mv_expand first_name - * | rename first_name AS x - * | where x LIKE "*a*" - * | limit 15 - * - * or - * - * from test - * | sort emp_no - * | mv_expand first_name - * | sort first_name - * | limit 15 - * - * PushDownAndCombineLimits rule will copy the "limit 15" after "sort emp_no" if there is no filter on the expanded values - * OR if there is no sort between "limit" and "mv_expand". - * But, since this type of query has such a filter, the "sort emp_no" will have no limit when it reaches the current rule. - */ - static class AddDefaultTopN extends ParameterizedOptimizerRule { - - @Override - protected LogicalPlan rule(LogicalPlan plan, LogicalOptimizerContext context) { - if (plan instanceof UnaryPlan unary && unary.child() instanceof OrderBy order && order.child() instanceof EsRelation relation) { - var limit = new Literal(Source.EMPTY, context.configuration().resultTruncationMaxSize(), DataTypes.INTEGER); - return unary.replaceChild(new TopN(plan.source(), relation, order.order(), limit)); - } - return plan; - } - } - public static class ReplaceRegexMatch extends OptimizerRules.ReplaceRegexMatch { protected Expression regexToEquals(RegexMatch regexMatch, Literal literal) { diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/LogicalPlanOptimizerTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/LogicalPlanOptimizerTests.java index 352dccc046588..30553d1f99c51 100644 --- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/LogicalPlanOptimizerTests.java +++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/LogicalPlanOptimizerTests.java @@ -968,17 +968,18 @@ public void testDontPushDownLimitPastMvExpand() { /** * Expected - * EsqlProject[[emp_no{f}#141, first_name{f}#142, languages{f}#143, lll{r}#132, salary{f}#147]] - * \_TopN[[Order[salary{f}#147,DESC,FIRST], Order[first_name{f}#142,ASC,LAST]],5[INTEGER]] - * \_Limit[5[INTEGER]] - * \_MvExpand[salary{f}#147] - * \_Eval[[languages{f}#143 + 5[INTEGER] AS lll]] - * \_Filter[languages{f}#143 > 1[INTEGER]] - * \_Limit[10[INTEGER]] - * \_MvExpand[first_name{f}#142] - * \_TopN[[Order[emp_no{f}#141,DESC,FIRST]],10[INTEGER]] - * \_Filter[emp_no{f}#141 < 10006[INTEGER]] - * \_EsRelation[test][emp_no{f}#141, first_name{f}#142, languages{f}#1..] + * EsqlProject[[emp_no{f}#19, first_name{r}#29, languages{f}#22, lll{r}#8, salary{r}#30]] + * \_TopN[[Order[salary{r}#30,DESC,FIRST]],5[INTEGER]] + * \_Limit[5[INTEGER]] + * \_MvExpand[salary{f}#24,salary{r}#30,2147483647] + * \_Eval[[languages{f}#22 + 5[INTEGER] AS lll]] + * \_Limit[5[INTEGER]] + * \_Filter[languages{f}#22 > 1[INTEGER]] + * \_Limit[10[INTEGER]] + * \_MvExpand[first_name{f}#20,first_name{r}#29,2147483647] + * \_TopN[[Order[emp_no{f}#19,DESC,FIRST]],10[INTEGER]] + * \_Filter[emp_no{f}#19 ≤ 10006[INTEGER]] + * \_EsRelation[test][_meta_field{f}#25, emp_no{f}#19, first_name{f}#20, ..] */ public void testMultipleMvExpandWithSortAndLimit() { LogicalPlan plan = optimizedPlan(""" @@ -1003,7 +1004,8 @@ public void testMultipleMvExpandWithSortAndLimit() { assertThat(limit.limit().fold(), equalTo(5)); var mvExp = as(limit.child(), MvExpand.class); var eval = as(mvExp.child(), Eval.class); - var filter = as(eval.child(), Filter.class); + var limit5 = as(eval.child(), Limit.class); + var filter = as(limit5.child(), Filter.class); limit = as(filter.child(), Limit.class); assertThat(limit.limit().fold(), equalTo(10)); mvExp = as(limit.child(), MvExpand.class); @@ -1203,6 +1205,38 @@ public void testAddDefaultLimit_BeforeMvExpand_WithFilterOnExpandedField() { as(topN.child(), EsRelation.class); } + /** + * Expected + * EsqlProject[[emp_no{f}#104, first_name{f}#105, salary{f}#106]] + * \_TopN[[Order[salary{f}#106,ASC,LAST], Order[first_name{f}#105,ASC,LAST]],15[INTEGER]] + * \_Filter[gender{f}#215 == [46][KEYWORD] AND WILDCARDLIKE(first_name{f}#105)] + * \_MvExpand[first_name{f}#105] + * \_TopN[[Order[emp_no{f}#104,ASC,LAST]],500[INTEGER]] + * \_EsRelation[employees][emp_no{f}#104, first_name{f}#105, salary{f}#106] + */ + public void testAddDefaultLimit_BeforeMvExpand_WithFilterOnExpandedField_ResultTruncationDefaultSize() { + LogicalPlan plan = optimizedPlan(""" + from test + | sort emp_no + | mv_expand first_name + | where gender == "F" + | where first_name LIKE "R*" + | keep emp_no, first_name, salary + | sort salary, first_name"""); + + var keep = as(plan, EsqlProject.class); + var topN = as(keep.child(), TopN.class); + assertThat(topN.limit().fold(), equalTo(500)); + assertThat(orderNames(topN), contains("salary", "first_name")); + var filter = as(topN.child(), Filter.class); + assertThat(filter.condition(), instanceOf(And.class)); + var mvExp = as(filter.child(), MvExpand.class); + topN = as(mvExp.child(), TopN.class); + assertThat(topN.limit().fold(), equalTo(500)); + assertThat(orderNames(topN), contains("emp_no")); + as(topN.child(), EsRelation.class); + } + /** * Expected * EsqlProject[[emp_no{f}#104, first_name{f}#105, salary{f}#106]] @@ -1273,6 +1307,94 @@ public void testAddDefaultLimit_BeforeMvExpand_WithFilterOnExpandedFieldAlias() as(topN.child(), EsRelation.class); } + /** + * Expected + * Limit[2[INTEGER]] + * \_Filter[a{r}#6 > 2[INTEGER]] + * \_MvExpand[a{r}#2,a{r}#6] + * \_Row[[[1, 2, 3][INTEGER] AS a]] + */ + public void testMvExpandFoldable() { + LogicalPlan plan = optimizedPlan(""" + row a = [1, 2, 3] + | mv_expand a + | where a > 2 + | limit 2"""); + + var limit = as(plan, Limit.class); + var filter = as(limit.child(), Filter.class); + var expand = as(filter.child(), MvExpand.class); + assertThat(filter.condition(), instanceOf(GreaterThan.class)); + var filterProp = ((GreaterThan) filter.condition()).left(); + assertTrue(expand.expanded().semanticEquals(filterProp)); + assertFalse(expand.target().semanticEquals(filterProp)); + var row = as(expand.child(), Row.class); + } + + /** + * Expected: + * Limit[500[INTEGER]] + * \_MvExpand[a{r}#2,a{r}#6,2147483647] + * \_TopN[[Order[a{r}#2,ASC,LAST]],500[INTEGER]] + * \_Row[[1[INTEGER] AS a]] + */ + public void testSortMvExpand() { + LogicalPlan plan = optimizedPlan(""" + row a = 1 + | sort a + | mv_expand a"""); + + var limit = as(plan, Limit.class); + var expand = as(limit.child(), MvExpand.class); + var topN = as(expand.child(), TopN.class); + var row = as(topN.child(), Row.class); + } + + /** + * Expected: + * Limit[20[INTEGER]] + * \_MvExpand[emp_no{f}#4,emp_no{r}#14,-1] + * \_TopN[[Order[emp_no{f}#4,ASC,LAST]],20[INTEGER]] + * \_EsRelation[test][_meta_field{f}#10, emp_no{f}#4, first_name{f}#5, ge..] + */ + public void testSortMvExpandLimit() { + LogicalPlan plan = optimizedPlan(""" + from test + | sort emp_no + | mv_expand emp_no + | limit 20"""); + + var limit = as(plan, Limit.class); + assertThat(limit.limit().fold(), is(20)); + var expand = as(limit.child(), MvExpand.class); + var topN = as(expand.child(), TopN.class); + assertThat(topN.limit().fold(), is(20)); + var row = as(topN.child(), EsRelation.class); + } + + /** + * Expected: + * Limit[500[INTEGER]] + * \_MvExpand[b{r}#4,b{r}#8,-1] + * \_Limit[500[INTEGER]] + * \_Row[[1[INTEGER] AS a, -15[INTEGER] AS b]] + * + * see https://github.com/elastic/elasticsearch/issues/102084 + */ + public void testWhereMvExpand() { + LogicalPlan plan = optimizedPlan(""" + row a = 1, b = -15 + | where b < 3 + | mv_expand b"""); + + var limit = as(plan, Limit.class); + assertThat(limit.limit().fold(), is(500)); + var expand = as(limit.child(), MvExpand.class); + var limit2 = as(expand.child(), Limit.class); + assertThat(limit2.limit().fold(), is(500)); + var row = as(limit2.child(), Row.class); + } + private static List orderNames(TopN topN) { return topN.order().stream().map(o -> as(o.child(), NamedExpression.class).name()).toList(); } @@ -2430,30 +2552,6 @@ public void testEliminateDuplicateAggsNonCount() { var source = as(agg.child(), EsRelation.class); } - /** - * Expected - * Limit[2[INTEGER]] - * \_Filter[a{r}#6 > 2[INTEGER]] - * \_MvExpand[a{r}#2,a{r}#6] - * \_Row[[[1, 2, 3][INTEGER] AS a]] - */ - public void testMvExpandFoldable() { - LogicalPlan plan = optimizedPlan(""" - row a = [1, 2, 3] - | mv_expand a - | where a > 2 - | limit 2"""); - - var limit = as(plan, Limit.class); - var filter = as(limit.child(), Filter.class); - var expand = as(filter.child(), MvExpand.class); - assertThat(filter.condition(), instanceOf(GreaterThan.class)); - var filterProp = ((GreaterThan) filter.condition()).left(); - assertTrue(expand.expanded().semanticEquals(filterProp)); - assertFalse(expand.target().semanticEquals(filterProp)); - var row = as(expand.child(), Row.class); - } - /** * Expected * Limit[500[INTEGER]]