From ca75e89e39965c61f9fc3012874a2d3c69037419 Mon Sep 17 00:00:00 2001 From: kanoshiou <73424326+kanoshiou@users.noreply.github.com> Date: Tue, 20 May 2025 20:06:24 +0800 Subject: [PATCH] ESQL: Fix alias removal in regex extraction with `JOIN` (#127687) * Disallow removal of regex extracted fields --------- Co-authored-by: Andrei Stefan Co-authored-by: elasticsearchmachine (cherry picked from commit 557f1f12b353950d315f5dda4d391e8f8a06b933) --- docs/changelog/127687.yaml | 6 ++ .../src/main/resources/lookup-join.csv-spec | 69 +++++++++++++++++++ .../xpack/esql/action/EsqlCapabilities.java | 8 ++- .../xpack/esql/session/EsqlSession.java | 18 ++--- .../session/IndexResolverFieldNamesTests.java | 47 +++++++++++++ 5 files changed, 139 insertions(+), 9 deletions(-) create mode 100644 docs/changelog/127687.yaml diff --git a/docs/changelog/127687.yaml b/docs/changelog/127687.yaml new file mode 100644 index 0000000000000..e053c4a31ad2e --- /dev/null +++ b/docs/changelog/127687.yaml @@ -0,0 +1,6 @@ +pr: 127687 +summary: "ESQL: Fix alias removal in regex extraction with JOIN" +area: ES|QL +type: bug +issues: + - 127467 diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/lookup-join.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/lookup-join.csv-spec index b7dc04a46f6e2..63a47424ac314 100644 --- a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/lookup-join.csv-spec +++ b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/lookup-join.csv-spec @@ -1592,3 +1592,72 @@ null | Milky Way | Marunouchi null | null | null null | null | null ; + + +joinMaskingRegex +// https://github.com/elastic/elasticsearch/issues/127467 +required_capability: union_types +required_capability: join_lookup_v12 +required_capability: fix_join_masking_regex_extract +from books,message_*,ul* +| enrich languages_policy on status +| drop `language_name`, `bytes_out`, `id`, id +| dissect book_no "%{type}" +| dissect author.keyword "%{HZicfARaID}" +| mv_expand `status` +| sort HZicfARaID, year DESC NULLS LAST, publisher DESC NULLS FIRST, description DESC, type NULLS LAST, message ASC NULLS LAST, title NULLS FIRST, status NULLS LAST +| enrich languages_policy on book_no +| grok message "%{WORD:DiLNyZKNDu}" +| limit 7972 +| rename year as language_code +| lookup join languages_lookup on language_code +| limit 13966 +| stats rcyIZnSOb = min(language_code), `ratings` = min(@timestamp), dgDxwMeFYrD = count(`@timestamp`), ifyZfXigqVN = count(*), qTXdrzSpY = min(language_code) by author.keyword +| rename author.keyword as message +| lookup join message_types_lookup on message +| stats `ratings` = count(*) by type +| stats `type` = count(type), `ratings` = count(*) +| keep `ratings`, ratings +; + +ratings:long +1 +; + +joinMaskingDissect +// https://github.com/elastic/elasticsearch/issues/127467 +required_capability: join_lookup_v12 +required_capability: fix_join_masking_regex_extract +from sample_data +| dissect message "%{type}" +| drop type +| lookup join message_types_lookup on message +| stats count = count(*) by type +| keep count +| sort count +; +count:long +1 +3 +3 +; + + +joinMaskingGrok +// https://github.com/elastic/elasticsearch/issues/127467 +required_capability: join_lookup_v12 +required_capability: fix_join_masking_regex_extract +from sample_data +| grok message "%{WORD:type}" +| drop type +| lookup join message_types_lookup on message +| stats max = max(event_duration) by type +| keep max +| sort max +; + +max:long +1232382 +3450233 +8268153 +; diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java index b4210cad2d9d4..f1245c019c9ce 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java @@ -895,7 +895,13 @@ public enum Cap { * Support for keeping `DROP` attributes when resolving field names. * see ES|QL: no matches for pattern #126418 */ - DROP_AGAIN_WITH_WILDCARD_AFTER_EVAL; + DROP_AGAIN_WITH_WILDCARD_AFTER_EVAL, + + /** + * During resolution (pre-analysis) we have to consider that joins can override regex extracted values + * see ES|QL: pruning of JOINs leads to missing fields #127467 + */ + FIX_JOIN_MASKING_REGEX_EXTRACT; private final boolean enabled; diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/session/EsqlSession.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/session/EsqlSession.java index 5e139f3a1963c..a77d0852ca311 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/session/EsqlSession.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/session/EsqlSession.java @@ -41,6 +41,8 @@ import org.elasticsearch.xpack.esql.core.expression.Expressions; import org.elasticsearch.xpack.esql.core.expression.FoldContext; import org.elasticsearch.xpack.esql.core.expression.MetadataAttribute; +import org.elasticsearch.xpack.esql.core.expression.NamedExpression; +import org.elasticsearch.xpack.esql.core.expression.ReferenceAttribute; import org.elasticsearch.xpack.esql.core.expression.UnresolvedAttribute; import org.elasticsearch.xpack.esql.core.expression.UnresolvedStar; import org.elasticsearch.xpack.esql.core.util.Holder; @@ -594,11 +596,7 @@ static PreAnalysisResult fieldNames(LogicalPlan parsed, Set enrichPolicy parsed.forEachDown(p -> {// go over each plan top-down if (p instanceof RegexExtract re) { // for Grok and Dissect - // remove other down-the-tree references to the extracted fields - for (Attribute extracted : re.extractedFields()) { - referencesBuilder.removeIf(attr -> matchByName(attr, extracted.name(), false)); - } - // but keep the inputs needed by Grok/Dissect + // keep the inputs needed by Grok/Dissect referencesBuilder.addAll(re.input().references()); } else if (p instanceof Enrich enrich) { AttributeSet enrichFieldRefs = Expressions.references(enrich.enrichFields()); @@ -653,15 +651,19 @@ static PreAnalysisResult fieldNames(LogicalPlan parsed, Set enrichPolicy // remove any already discovered UnresolvedAttributes that are in fact aliases defined later down in the tree // for example "from test | eval x = salary | stats max = max(x) by gender" // remove the UnresolvedAttribute "x", since that is an Alias defined in "eval" + // also remove other down-the-tree references to the extracted fields from "grok" and "dissect" AttributeSet planRefs = p.references(); Set fieldNames = planRefs.names(); - p.forEachExpressionDown(Alias.class, alias -> { + p.forEachExpressionDown(NamedExpression.class, ne -> { + if ((ne instanceof Alias || ne instanceof ReferenceAttribute) == false) { + return; + } // do not remove the UnresolvedAttribute that has the same name as its alias, ie "rename id AS id" // or the UnresolvedAttributes that are used in Functions that have aliases "STATS id = MAX(id)" - if (fieldNames.contains(alias.name())) { + if (fieldNames.contains(ne.name())) { return; } - referencesBuilder.removeIf(attr -> matchByName(attr, alias.name(), shadowingRefsBuilder.contains(attr))); + referencesBuilder.removeIf(attr -> matchByName(attr, ne.name(), shadowingRefsBuilder.contains(attr))); }); } }); diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/session/IndexResolverFieldNamesTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/session/IndexResolverFieldNamesTests.java index ad132aba74a28..1de69a7a70bde 100644 --- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/session/IndexResolverFieldNamesTests.java +++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/session/IndexResolverFieldNamesTests.java @@ -1341,6 +1341,53 @@ public void testDissectOverwriteName() { assertThat(fieldNames, equalTo(Set.of("emp_no", "emp_no.*", "first_name", "first_name.*"))); } + /** + * Fix alias removal in regex extraction with JOIN + * @see ES|QL: pruning of JOINs leads to missing fields + */ + public void testAvoidGrokAttributesRemoval() { + assumeTrue("LOOKUP JOIN available as snapshot only", EsqlCapabilities.Cap.JOIN_LOOKUP_V12.isEnabled()); + Set fieldNames = fieldNames(""" + from message_types + | eval type = 1 + | lookup join message_types_lookup on message + | drop message + | grok type "%{WORD:b}" + | stats x = max(b) + | keep x""", Set.of()); + assertThat(fieldNames, equalTo(Set.of("message", "x", "x.*", "message.*"))); + } + + public void testAvoidGrokAttributesRemoval2() { + assumeTrue("LOOKUP JOIN available as snapshot only", EsqlCapabilities.Cap.JOIN_LOOKUP_V12.isEnabled()); + Set fieldNames = fieldNames(""" + from sample_data + | dissect message "%{type}" + | drop type + | lookup join message_types_lookup on message + | stats count = count(*) by type + | keep count + | sort count""", Set.of()); + assertThat(fieldNames, equalTo(Set.of("type", "message", "count", "message.*", "type.*", "count.*"))); + } + + public void testAvoidGrokAttributesRemoval3() { + assumeTrue("LOOKUP JOIN available as snapshot only", EsqlCapabilities.Cap.JOIN_LOOKUP_V12.isEnabled()); + Set fieldNames = fieldNames(""" + from sample_data + | grok message "%{WORD:type}" + | drop type + | lookup join message_types_lookup on message + | stats max = max(event_duration) by type + | keep max + | sort max""", Set.of()); + assertThat( + fieldNames, + equalTo(Set.of("type", "event_duration", "message", "max", "event_duration.*", "message.*", "type.*", "max.*")) + ); + + } + public void testEnrichOnDefaultField() { Set fieldNames = fieldNames(""" from employees