From fcb87b53ff2e680f5fd55478e38765b3f13ae692 Mon Sep 17 00:00:00 2001 From: Luigi Dell'Aquila Date: Tue, 15 Apr 2025 18:07:05 +0200 Subject: [PATCH] ES|QL: fix join masking eval (#126614) --- docs/changelog/126614.yaml | 5 + .../rest/generative/GenerativeRestTest.java | 9 +- .../src/main/resources/lookup-join.csv-spec | 42 +++++++ .../xpack/esql/action/EsqlCapabilities.java | 8 +- .../xpack/esql/session/EsqlSession.java | 79 ++++++++++-- .../session/IndexResolverFieldNamesTests.java | 117 +++++++++++++----- 6 files changed, 214 insertions(+), 46 deletions(-) create mode 100644 docs/changelog/126614.yaml diff --git a/docs/changelog/126614.yaml b/docs/changelog/126614.yaml new file mode 100644 index 0000000000000..e8424c8c78245 --- /dev/null +++ b/docs/changelog/126614.yaml @@ -0,0 +1,5 @@ +pr: 126614 +summary: Fix join masking eval +area: ES|QL +type: bug +issues: [] diff --git a/x-pack/plugin/esql/qa/server/src/main/java/org/elasticsearch/xpack/esql/qa/rest/generative/GenerativeRestTest.java b/x-pack/plugin/esql/qa/server/src/main/java/org/elasticsearch/xpack/esql/qa/rest/generative/GenerativeRestTest.java index 0ceeb132f5b5c..5bd38da8d013b 100644 --- a/x-pack/plugin/esql/qa/server/src/main/java/org/elasticsearch/xpack/esql/qa/rest/generative/GenerativeRestTest.java +++ b/x-pack/plugin/esql/qa/server/src/main/java/org/elasticsearch/xpack/esql/qa/rest/generative/GenerativeRestTest.java @@ -44,8 +44,13 @@ public abstract class GenerativeRestTest extends ESRestTestCase { // Awaiting fixes "estimated row size \\[0\\] wasn't set", // https://github.com/elastic/elasticsearch/issues/121739 "unknown physical plan node \\[OrderExec\\]", // https://github.com/elastic/elasticsearch/issues/120817 - "Unknown column \\[\\]", // https://github.com/elastic/elasticsearch/issues/121741 - // + "Unknown column \\[\\]", // https://github.com/elastic/elasticsearch/issues/121741, + "Plan \\[ProjectExec\\[\\[.* optimized incorrectly due to missing references", // https://github.com/elastic/elasticsearch/issues/125866 + "only supports KEYWORD or TEXT values, found expression", // https://github.com/elastic/elasticsearch/issues/126017 + "token recognition error at: '``", // https://github.com/elastic/elasticsearch/issues/125870 + "Unknown column \\[.*\\]", // https://github.com/elastic/elasticsearch/issues/126026 + "optimized incorrectly due to missing references", // https://github.com/elastic/elasticsearch/issues/116781 + "No matches found for pattern", // https://github.com/elastic/elasticsearch/issues/126418 "The incoming YAML document exceeds the limit:" // still to investigate, but it seems to be specific to the test framework ); diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/lookup-join.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/lookup-join.csv-spec index 56b35b9754c89..fa9104fcaf7b3 100644 --- a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/lookup-join.csv-spec +++ b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/lookup-join.csv-spec @@ -1527,3 +1527,45 @@ from * salary_change.long:double|foo:long 5.0 |1698069301543123456 ; + + +joinMaskingEval +required_capability: join_lookup_v12 +required_capability: fix_join_masking_eval +from languag* +| eval type = null +| rename language_name as message +| lookup join message_types_lookup on message +| rename type as message +| lookup join message_types_lookup on message +| keep `language.name` +; + +ignoreOrder:true +language.name:text +null +null +null +null +null +null +null +null +null +null +null +null +null +null +null +null +null +null +null +null +null +English +French +Spanish +German +; diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java index 543b3a6289ada..f01e2043d5c13 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java @@ -877,7 +877,13 @@ public enum Cap { * the ownership of that block - but didn't account for the fact that the caller might close it, leading to double releases * in some union type queries. C.f. https://github.com/elastic/elasticsearch/issues/125850 */ - FIX_DOUBLY_RELEASED_NULL_BLOCKS_IN_VALUESOURCEREADER; + FIX_DOUBLY_RELEASED_NULL_BLOCKS_IN_VALUESOURCEREADER, + + /** + * During resolution (pre-analysis) we have to consider that joins or enriches can override EVALuated values + * https://github.com/elastic/elasticsearch/issues/126419 + */ + FIX_JOIN_MASKING_EVAL; private final boolean enabled; diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/session/EsqlSession.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/session/EsqlSession.java index 8bb960414a4d7..7ff1c947c2bce 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/session/EsqlSession.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/session/EsqlSession.java @@ -58,11 +58,21 @@ import org.elasticsearch.xpack.esql.parser.QueryParams; import org.elasticsearch.xpack.esql.plan.IndexPattern; import org.elasticsearch.xpack.esql.plan.logical.Aggregate; +import org.elasticsearch.xpack.esql.plan.logical.Drop; import org.elasticsearch.xpack.esql.plan.logical.Enrich; +import org.elasticsearch.xpack.esql.plan.logical.Eval; +import org.elasticsearch.xpack.esql.plan.logical.Filter; +import org.elasticsearch.xpack.esql.plan.logical.InlineStats; +import org.elasticsearch.xpack.esql.plan.logical.Insist; import org.elasticsearch.xpack.esql.plan.logical.Keep; +import org.elasticsearch.xpack.esql.plan.logical.Limit; import org.elasticsearch.xpack.esql.plan.logical.LogicalPlan; +import org.elasticsearch.xpack.esql.plan.logical.MvExpand; +import org.elasticsearch.xpack.esql.plan.logical.OrderBy; import org.elasticsearch.xpack.esql.plan.logical.Project; import org.elasticsearch.xpack.esql.plan.logical.RegexExtract; +import org.elasticsearch.xpack.esql.plan.logical.Rename; +import org.elasticsearch.xpack.esql.plan.logical.TopN; import org.elasticsearch.xpack.esql.plan.logical.UnresolvedRelation; import org.elasticsearch.xpack.esql.plan.logical.join.InlineJoin; import org.elasticsearch.xpack.esql.plan.logical.join.JoinTypes; @@ -574,6 +584,8 @@ static PreAnalysisResult fieldNames(LogicalPlan parsed, Set enrichPolicy var keepJoinRefsBuilder = AttributeSet.builder(); Set wildcardJoinIndices = new java.util.HashSet<>(); + boolean[] canRemoveAliases = new boolean[] { true }; + parsed.forEachDown(p -> {// go over each plan top-down if (p instanceof RegexExtract re) { // for Grok and Dissect // remove other down-the-tree references to the extracted fields @@ -619,20 +631,37 @@ static PreAnalysisResult fieldNames(LogicalPlan parsed, Set enrichPolicy } } - // remove any already discovered UnresolvedAttributes that are in fact aliases defined later down in the tree - // for example "from test | eval x = salary | stats max = max(x) by gender" - // remove the UnresolvedAttribute "x", since that is an Alias defined in "eval" - AttributeSet planRefs = p.references(); - Set fieldNames = planRefs.names(); - p.forEachExpressionDown(Alias.class, alias -> { - // do not remove the UnresolvedAttribute that has the same name as its alias, ie "rename id = id" - // or the UnresolvedAttributes that are used in Functions that have aliases "STATS id = MAX(id)" - if (fieldNames.contains(alias.name())) { - return; - } - referencesBuilder.removeIf(attr -> matchByName(attr, alias.name(), keepCommandRefsBuilder.contains(attr))); - }); + // If the current node in the tree is of type JOIN (lookup join, inlinestats) or ENRICH or other type of + // command that we may add in the future which can override already defined Aliases with EVAL + // (for example + // + // from test + // | eval ip = 123 + // | enrich ips_policy ON hostname + // | rename ip AS my_ip + // + // and ips_policy enriches the results with the same name ip field), + // these aliases should be kept in the list of fields. + if (canRemoveAliases[0] && couldOverrideAliases(p)) { + canRemoveAliases[0] = false; + } + if (canRemoveAliases[0]) { + // remove any already discovered UnresolvedAttributes that are in fact aliases defined later down in the tree + // for example "from test | eval x = salary | stats max = max(x) by gender" + // remove the UnresolvedAttribute "x", since that is an Alias defined in "eval" + AttributeSet planRefs = p.references(); + Set fieldNames = planRefs.names(); + p.forEachExpressionDown(Alias.class, alias -> { + // do not remove the UnresolvedAttribute that has the same name as its alias, ie "rename id AS id" + // or the UnresolvedAttributes that are used in Functions that have aliases "STATS id = MAX(id)" + if (fieldNames.contains(alias.name())) { + return; + } + referencesBuilder.removeIf(attr -> matchByName(attr, alias.name(), keepCommandRefsBuilder.contains(attr))); + }); + } }); + // Add JOIN ON column references afterward to avoid Alias removal referencesBuilder.addAll(keepJoinRefsBuilder); // If any JOIN commands need wildcard field-caps calls, persist the index names @@ -656,6 +685,30 @@ static PreAnalysisResult fieldNames(LogicalPlan parsed, Set enrichPolicy } } + /** + * Could a plan "accidentally" override aliases? + * Examples are JOIN and ENRICH, that _could_ produce fields with the same + * name of an existing alias, based on their index mapping. + * Here we just have to consider commands where this information is not available before index resolution, + * eg. EVAL, GROK, DISSECT can override an alias, but we know it in advance, ie. we don't need to resolve indices to know. + */ + private static boolean couldOverrideAliases(LogicalPlan p) { + return (p instanceof Aggregate + || p instanceof Drop + || p instanceof Eval + || p instanceof Filter + || p instanceof InlineStats + || p instanceof Insist + || p instanceof Keep + || p instanceof Limit + || p instanceof MvExpand + || p instanceof OrderBy + || p instanceof Project + || p instanceof RegexExtract + || p instanceof Rename + || p instanceof TopN) == false; + } + private static boolean matchByName(Attribute attr, String other, boolean skipIfPattern) { boolean isPattern = Regex.isSimpleMatchPattern(attr.name()); if (skipIfPattern && isPattern) { diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/session/IndexResolverFieldNamesTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/session/IndexResolverFieldNamesTests.java index 6b797710bec31..8c8c7751ff737 100644 --- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/session/IndexResolverFieldNamesTests.java +++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/session/IndexResolverFieldNamesTests.java @@ -478,13 +478,16 @@ public void testDropAllColumns_WithStats() { } public void testEnrichOn() { - assertFieldNames(""" - from employees - | sort emp_no - | limit 1 - | eval x = to_string(languages) - | enrich languages_policy on x - | keep emp_no, language_name""", Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*")); + assertFieldNames( + """ + from employees + | sort emp_no + | limit 1 + | eval x = to_string(languages) + | enrich languages_policy on x + | keep emp_no, language_name""", + Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*", "x", "x.*") + ); } public void testEnrichOn2() { @@ -494,7 +497,7 @@ public void testEnrichOn2() { | enrich languages_policy on x | keep emp_no, language_name | sort emp_no - | limit 1""", Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*")); + | limit 1""", Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*", "x", "x.*")); } public void testUselessEnrich() { @@ -512,7 +515,7 @@ public void testSimpleSortLimit() { | enrich languages_policy on x | keep emp_no, language_name | sort emp_no - | limit 1""", Set.of("languages", "languages.*", "emp_no", "emp_no.*", "language_name", "language_name.*")); + | limit 1""", Set.of("languages", "languages.*", "emp_no", "emp_no.*", "language_name", "language_name.*", "x", "x.*")); } public void testWith() { @@ -520,7 +523,7 @@ public void testWith() { """ from employees | eval x = to_string(languages) | keep emp_no, x | sort emp_no | limit 1 | enrich languages_policy on x with language_name""", - Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*") + Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*", "x", "x.*") ); } @@ -529,7 +532,7 @@ public void testWithAlias() { """ from employees | sort emp_no | limit 3 | eval x = to_string(languages) | keep emp_no, x | enrich languages_policy on x with lang = language_name""", - Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*") + Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*", "x", "x.*") ); } @@ -538,7 +541,7 @@ public void testWithAliasSort() { """ from employees | eval x = to_string(languages) | keep emp_no, x | sort emp_no | limit 3 | enrich languages_policy on x with lang = language_name""", - Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*") + Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*", "x", "x.*") ); } @@ -547,7 +550,7 @@ public void testWithAliasAndPlain() { """ from employees | sort emp_no desc | limit 3 | eval x = to_string(languages) | keep emp_no, x | enrich languages_policy on x with lang = language_name, language_name""", - Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*") + Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*", "x", "x.*") ); } @@ -556,7 +559,7 @@ public void testWithTwoAliasesSameProp() { """ from employees | sort emp_no | limit 1 | eval x = to_string(languages) | keep emp_no, x | enrich languages_policy on x with lang = language_name, lang2 = language_name""", - Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*") + Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*", "x", "x.*") ); } @@ -565,7 +568,7 @@ public void testRedundantWith() { """ from employees | sort emp_no | limit 1 | eval x = to_string(languages) | keep emp_no, x | enrich languages_policy on x with language_name, language_name""", - Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*") + Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*", "x", "x.*") ); } @@ -588,28 +591,34 @@ public void testConstantNullInput() { | eval x = to_string(languages) | keep emp_no, x | enrich languages_policy on x with language_name, language_name""", - Set.of("languages", "languages.*", "emp_no", "emp_no.*", "language_name", "language_name.*") + Set.of("languages", "languages.*", "emp_no", "emp_no.*", "language_name", "language_name.*", "x", "x.*") ); } public void testEnrichEval() { - assertFieldNames(""" - from employees - | eval x = to_string(languages) - | enrich languages_policy on x with lang = language_name - | eval language = concat(x, "-", lang) - | keep emp_no, x, lang, language - | sort emp_no desc | limit 3""", Set.of("languages", "languages.*", "emp_no", "emp_no.*", "language_name", "language_name.*")); + assertFieldNames( + """ + from employees + | eval x = to_string(languages) + | enrich languages_policy on x with lang = language_name + | eval language = concat(x, "-", lang) + | keep emp_no, x, lang, language + | sort emp_no desc | limit 3""", + Set.of("languages", "languages.*", "emp_no", "emp_no.*", "language_name", "language_name.*", "x", "x.*", "lang", "lang.*") + ); } public void testSimple() { - assertFieldNames(""" - from employees - | eval x = 1, y = to_string(languages) - | enrich languages_policy on y - | where x > 1 - | keep emp_no, language_name - | limit 1""", Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*")); + assertFieldNames( + """ + from employees + | eval x = 1, y = to_string(languages) + | enrich languages_policy on y + | where x > 1 + | keep emp_no, language_name + | limit 1""", + Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*", "x", "y", "x.*", "y.*") + ); } public void testEvalNullSort() { @@ -1653,6 +1662,54 @@ public void testInsist_multiFieldMappedMultiIndex() { ); } + public void testJoinMaskingKeep() { + assertFieldNames( + """ + from languag* + | eval type = null + | rename language_name as message + | lookup join message_types_lookup on message + | rename type as message + | lookup join message_types_lookup on message + | keep `language.name`""", + Set.of("language.name", "type", "language_name", "message", "language_name.*", "message.*", "type.*", "language.name.*") + ); + } + + public void testJoinMaskingKeep2() { + assertFieldNames(""" + from languag* + | eval type = "foo" + | rename type as message + | lookup join message_types_lookup on message + | rename type as message + | lookup join message_types_lookup on message + | keep `language.name`""", Set.of("language.name", "type", "message", "message.*", "type.*", "language.name.*")); + } + + public void testEnrichMaskingEvalOn() { + assertFieldNames(""" + from employees + | eval language_name = null + | enrich languages_policy on languages + | rename language_name as languages + | eval languages = length(languages) + | enrich languages_policy on languages + | keep emp_no, language_name""", Set.of("emp_no", "language_name", "languages", "language_name.*", "languages.*", "emp_no.*")); + } + + public void testEnrichAndJoinMaskingEvalWh() { + assertFieldNames(""" + from employees + | eval language_name = null + | enrich languages_policy on languages + | rename language_name as languages + | eval languages = length(languages) + | enrich languages_policy on languages + | lookup join message_types_lookup on language_name + | keep emp_no, language_name""", Set.of("emp_no", "language_name", "languages", "language_name.*", "languages.*", "emp_no.*")); + } + private Set fieldNames(String query, Set enrichPolicyMatchFields) { var preAnalysisResult = new EsqlSession.PreAnalysisResult(null); return EsqlSession.fieldNames(parser.createStatement(query), enrichPolicyMatchFields, preAnalysisResult).fieldNames();