ESQL: Fix alias removal in regex extraction with JOIN (#127687) (#128204)

astefan · kanoshiou · elasticsearchmachine · web-flow · commit 2dc646fa4ae1 · 2025-05-21T15:43:51.000+10:00
* ESQL: Fix alias removal in regex extraction with `JOIN` (#127687) * Disallow removal of regex extracted fields --------- Co-authored-by: Andrei Stefan <astefan@users.noreply.github.com> Co-authored-by: elasticsearchmachine <infra-root+elasticsearchmachine@elastic.co> (cherry picked from commit 557f1f1) * Checkstyle --------- Co-authored-by: kanoshiou <73424326+kanoshiou@users.noreply.github.com> Co-authored-by: elasticsearchmachine <infra-root+elasticsearchmachine@elastic.co>
diff --git a/docs/changelog/127687.yaml b/docs/changelog/127687.yaml
@@ -0,0 +1,6 @@
+pr: 127687
+summary: "ESQL: Fix alias removal in regex extraction with JOIN"
+area: ES|QL
+type: bug
+issues:
+  - 127467
diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/lookup-join.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/lookup-join.csv-spec
@@ -1567,3 +1567,72 @@ null            | Milky Way                                     | Marunouchi
 null            | null                                          | null
 null            | null                                          | null
 ;
+
+
+joinMaskingRegex
+// https://github.com/elastic/elasticsearch/issues/127467
+required_capability: union_types
+required_capability: join_lookup_v12
+required_capability: fix_join_masking_regex_extract
+from books,message_*,ul* 
+| enrich languages_policy on status 
+| drop `language_name`, `bytes_out`, `id`, id 
+| dissect book_no "%{type}" 
+| dissect author.keyword "%{HZicfARaID}" 
+| mv_expand `status` 
+| sort HZicfARaID, year DESC NULLS LAST, publisher DESC NULLS FIRST, description DESC, type NULLS LAST, message ASC NULLS LAST, title NULLS FIRST, status NULLS LAST 
+| enrich languages_policy on book_no 
+| grok message "%{WORD:DiLNyZKNDu}" 
+| limit 7972
+| rename year as language_code 
+| lookup join languages_lookup on language_code 
+| limit 13966 
+| stats  rcyIZnSOb = min(language_code), `ratings` = min(@timestamp), dgDxwMeFYrD = count(`@timestamp`), ifyZfXigqVN = count(*), qTXdrzSpY = min(language_code) by author.keyword
+| rename author.keyword as message 
+| lookup join message_types_lookup on message 
+| stats  `ratings` = count(*) by type 
+| stats  `type` = count(type), `ratings` = count(*) 
+| keep `ratings`, ratings
+;
+
+ratings:long
+1
+;
+
+joinMaskingDissect
+// https://github.com/elastic/elasticsearch/issues/127467
+required_capability: join_lookup_v12
+required_capability: fix_join_masking_regex_extract
+from sample_data
+| dissect message "%{type}" 
+| drop type
+| lookup join message_types_lookup on message
+| stats count = count(*) by type
+| keep count
+| sort count
+;
+count:long
+1
+3
+3
+;
+
+
+joinMaskingGrok
+// https://github.com/elastic/elasticsearch/issues/127467
+required_capability: join_lookup_v12
+required_capability: fix_join_masking_regex_extract
+from sample_data
+| grok message "%{WORD:type}"
+| drop type
+| lookup join message_types_lookup on message
+| stats max = max(event_duration) by type
+| keep max
+| sort max
+;
+
+max:long
+1232382
+3450233
+8268153
+;
diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java
@@ -854,7 +854,13 @@ public enum Cap {
          * Support for keeping `DROP` attributes when resolving field names.
          * see <a href="https://github.com/elastic/elasticsearch/issues/126418"> ES|QL: no matches for pattern #126418 </a>
          */
-        DROP_AGAIN_WITH_WILDCARD_AFTER_EVAL;
+        DROP_AGAIN_WITH_WILDCARD_AFTER_EVAL,
+
+        /**
+         * During resolution (pre-analysis) we have to consider that joins can override regex extracted values
+         * see <a href="https://github.com/elastic/elasticsearch/issues/127467"> ES|QL: pruning of JOINs leads to missing fields #127467</a>
+         */
+        FIX_JOIN_MASKING_REGEX_EXTRACT;
 
         private final boolean enabled;
 
diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/session/EsqlSession.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/session/EsqlSession.java
@@ -39,6 +39,8 @@
 import org.elasticsearch.xpack.esql.core.expression.Expressions;
 import org.elasticsearch.xpack.esql.core.expression.FoldContext;
 import org.elasticsearch.xpack.esql.core.expression.MetadataAttribute;
+import org.elasticsearch.xpack.esql.core.expression.NamedExpression;
+import org.elasticsearch.xpack.esql.core.expression.ReferenceAttribute;
 import org.elasticsearch.xpack.esql.core.expression.UnresolvedAttribute;
 import org.elasticsearch.xpack.esql.core.expression.UnresolvedStar;
 import org.elasticsearch.xpack.esql.core.util.Holder;
@@ -570,11 +572,7 @@ static PreAnalysisResult fieldNames(LogicalPlan parsed, Set<String> enrichPolicy
 
         parsed.forEachDown(p -> {// go over each plan top-down
             if (p instanceof RegexExtract re) { // for Grok and Dissect
-                // remove other down-the-tree references to the extracted fields
-                for (Attribute extracted : re.extractedFields()) {
-                    referencesBuilder.removeIf(attr -> matchByName(attr, extracted.name(), false));
-                }
-                // but keep the inputs needed by Grok/Dissect
+                // keep the inputs needed by Grok/Dissect
                 referencesBuilder.addAll(re.input().references());
             } else if (p instanceof Enrich enrich) {
                 AttributeSet enrichFieldRefs = Expressions.references(enrich.enrichFields());
@@ -629,15 +627,19 @@ static PreAnalysisResult fieldNames(LogicalPlan parsed, Set<String> enrichPolicy
                 // remove any already discovered UnresolvedAttributes that are in fact aliases defined later down in the tree
                 // for example "from test | eval x = salary | stats max = max(x) by gender"
                 // remove the UnresolvedAttribute "x", since that is an Alias defined in "eval"
+                // also remove other down-the-tree references to the extracted fields from "grok" and "dissect"
                 AttributeSet planRefs = p.references();
                 Set<String> fieldNames = planRefs.names();
-                p.forEachExpressionDown(Alias.class, alias -> {
+                p.forEachExpressionDown(NamedExpression.class, ne -> {
+                    if ((ne instanceof Alias || ne instanceof ReferenceAttribute) == false) {
+                        return;
+                    }
                     // do not remove the UnresolvedAttribute that has the same name as its alias, ie "rename id AS id"
                     // or the UnresolvedAttributes that are used in Functions that have aliases "STATS id = MAX(id)"
-                    if (fieldNames.contains(alias.name())) {
+                    if (fieldNames.contains(ne.name())) {
                         return;
                     }
-                    referencesBuilder.removeIf(attr -> matchByName(attr, alias.name(), shadowingRefsBuilder.contains(attr)));
+                    referencesBuilder.removeIf(attr -> matchByName(attr, ne.name(), shadowingRefsBuilder.contains(attr)));
                 });
             }
         });
diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/session/IndexResolverFieldNamesTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/session/IndexResolverFieldNamesTests.java
@@ -1341,6 +1341,53 @@ public void testDissectOverwriteName() {
         assertThat(fieldNames, equalTo(Set.of("emp_no", "emp_no.*", "first_name", "first_name.*")));
     }
 
+    /**
+     * Fix alias removal in regex extraction with JOIN
+     * @see <a href="https://github.com/elastic/elasticsearch/issues/127467">ES|QL: pruning of JOINs leads to missing fields</a>
+      */
+    public void testAvoidGrokAttributesRemoval() {
+        assumeTrue("LOOKUP JOIN available as snapshot only", EsqlCapabilities.Cap.JOIN_LOOKUP_V12.isEnabled());
+        Set<String> fieldNames = fieldNames("""
+            from message_types
+            | eval type = 1
+            | lookup join message_types_lookup on message
+            | drop  message
+            | grok type "%{WORD:b}"
+            | stats x = max(b)
+            | keep x""", Set.of());
+        assertThat(fieldNames, equalTo(Set.of("message", "x", "x.*", "message.*")));
+    }
+
+    public void testAvoidGrokAttributesRemoval2() {
+        assumeTrue("LOOKUP JOIN available as snapshot only", EsqlCapabilities.Cap.JOIN_LOOKUP_V12.isEnabled());
+        Set<String> fieldNames = fieldNames("""
+            from sample_data
+            | dissect message "%{type}"
+            | drop type
+            | lookup join message_types_lookup on message
+            | stats count = count(*) by type
+            | keep count
+            | sort count""", Set.of());
+        assertThat(fieldNames, equalTo(Set.of("type", "message", "count", "message.*", "type.*", "count.*")));
+    }
+
+    public void testAvoidGrokAttributesRemoval3() {
+        assumeTrue("LOOKUP JOIN available as snapshot only", EsqlCapabilities.Cap.JOIN_LOOKUP_V12.isEnabled());
+        Set<String> fieldNames = fieldNames("""
+            from sample_data
+            | grok message "%{WORD:type}"
+            | drop type
+            | lookup join message_types_lookup on message
+            | stats max = max(event_duration) by type
+            | keep max
+            | sort max""", Set.of());
+        assertThat(
+            fieldNames,
+            equalTo(Set.of("type", "event_duration", "message", "max", "event_duration.*", "message.*", "type.*", "max.*"))
+        );
+
+    }
+
     public void testEnrichOnDefaultField() {
         Set<String> fieldNames = fieldNames("""
             from employees