elastic · astefan · May 20, 2025 · May 4, 2025 · May 4, 2025 · May 5, 2025
diff --git a/docs/changelog/127687.yaml b/docs/changelog/127687.yaml
@@ -0,0 +1,6 @@
+pr: 127687
+summary: "ESQL: Fix alias removal in regex extraction with JOIN"
+area: ES|QL
+type: bug
+issues:
+  - 127467
diff --git a/...ver/src/main/java/org/elasticsearch/xpack/esql/qa/rest/generative/GenerativeRestTest.java b/...ver/src/main/java/org/elasticsearch/xpack/esql/qa/rest/generative/GenerativeRestTest.java
@@ -54,7 +54,6 @@ public abstract class GenerativeRestTest extends ESRestTestCase {
                                         // https://github.com/elastic/elasticsearch/issues/127167
         "optimized incorrectly due to missing references", // https://github.com/elastic/elasticsearch/issues/116781
         "No matches found for pattern", // https://github.com/elastic/elasticsearch/issues/126418
-        "Unknown column", // https://github.com/elastic/elasticsearch/issues/127467
         "only supports KEYWORD or TEXT values", // https://github.com/elastic/elasticsearch/issues/127468
         "The incoming YAML document exceeds the limit:" // still to investigate, but it seems to be specific to the test framework
     );

diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/lookup-join.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/lookup-join.csv-spec
@@ -1650,3 +1650,69 @@ event_duration:long
 2764889
 3450233
 ;
+
+
+joinMaskingRegex
+required_capability: union_types
+required_capability: join_lookup_v12
+required_capability: fix_join_masking_regex_extract
+from books,message_*,ul* 
+| enrich languages_policy on status 
+| drop `language_name`, `bytes_out`, `id`, id 
+| dissect book_no "%{type}" 
+| dissect author.keyword "%{HZicfARaID}" 
+| mv_expand `status` 
+| sort HZicfARaID, year DESC NULLS LAST, publisher DESC NULLS FIRST, description DESC, type NULLS LAST, message ASC NULLS LAST, title NULLS FIRST, status NULLS LAST 
+| enrich languages_policy on book_no 
+| grok message "%{WORD:DiLNyZKNDu}" 
+| limit 7972
+| rename year as language_code 
+| lookup join languages_lookup on language_code 
+| limit 13966 
+| stats  rcyIZnSOb = min(language_code), `ratings` = min(@timestamp), dgDxwMeFYrD = count(`@timestamp`), ifyZfXigqVN = count(*), qTXdrzSpY = min(language_code) by author.keyword
+| rename author.keyword as message 
+| lookup join message_types_lookup on message 
+| stats  `ratings` = count(*) by type 
+| stats  `type` = count(type), `ratings` = count(*) 
+| keep `ratings`, ratings
+;
+
+ratings:long
+1
+;
+
+joinMaskingDissect
+required_capability: join_lookup_v12
+required_capability: fix_join_masking_regex_extract
+from sample_data
+| dissect message "%{type}" 
+| drop type
+| lookup join message_types_lookup on message
+| stats count = count(*) by type
+| keep count
+| sort count
+;
+count:long
+1
+3
+3
+;
+
+
+joinMaskingGrok
+required_capability: join_lookup_v12
+required_capability: fix_join_masking_regex_extract
+from sample_data
+| grok message "%{WORD:type}"
+| drop type
+| lookup join message_types_lookup on message
+| stats max = max(event_duration) by type
+| keep max
+| sort max
+;
+
+max:long
+1232382
+3450233
+8268153
+;
diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java
@@ -1036,6 +1036,12 @@ public enum Cap {
          */
         FIX_JOIN_MASKING_EVAL,
 
+        /**
+         * During resolution (pre-analysis) we have to consider that joins can override regex extracted values
+         * see <a href="https://github.com/elastic/elasticsearch/issues/127467"> ES|QL: pruning of JOINs leads to missing fields #127467 </a>
+         */
+        FIX_JOIN_MASKING_REGEX_EXTRACT,
+
         /**
          * Support last_over_time aggregation that gets evaluated per time-series
          */

diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/session/EsqlSession.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/session/EsqlSession.java
@@ -600,9 +600,11 @@ static PreAnalysisResult fieldNames(LogicalPlan parsed, Set<String> enrichPolicy
 
         parsed.forEachDown(p -> {// go over each plan top-down
             if (p instanceof RegexExtract re) { // for Grok and Dissect
-                // remove other down-the-tree references to the extracted fields
-                for (Attribute extracted : re.extractedFields()) {
-                    referencesBuilder.removeIf(attr -> matchByName(attr, extracted.name(), false));
+                if (canRemoveAliases[0]) {
+                    // remove other down-the-tree references to the extracted fields
+                    for (Attribute extracted : re.extractedFields()) {
+                        referencesBuilder.removeIf(attr -> matchByName(attr, extracted.name(), false));
+                    }
                 }
                 // but keep the inputs needed by Grok/Dissect
                 referencesBuilder.addAll(re.input().references());

diff --git a/...esql/src/test/java/org/elasticsearch/xpack/esql/session/IndexResolverFieldNamesTests.java b/...esql/src/test/java/org/elasticsearch/xpack/esql/session/IndexResolverFieldNamesTests.java
@@ -1710,6 +1710,28 @@ public void testEnrichAndJoinMaskingEvalWh() {
             | keep emp_no, language_name""", Set.of("emp_no", "language_name", "languages", "language_name.*", "languages.*", "emp_no.*"));
     }
 
+    public void testJoinMaskingGrok() {
+        assertFieldNames("""
+              from message_types
+              | grok message "%{WORD:type}"
+              | drop type
+              | lookup join message_types_lookup on message
+              | stats `ratings` = count(*) by type
+              | keep ratings
+            """, Set.of("type", "message", "ratings", "message.*", "type.*", "ratings.*"));
+    }
+
+    public void testJoinMaskingDissect() {
+        assertFieldNames("""
+              from message_types
+              | dissect message "%{type}"
+              | stats message = max(message)
+              | lookup join message_types_lookup on message
+              | stats `ratings` = count(*) by type
+              | keep ratings
+            """, Set.of("type", "message", "ratings", "message.*", "type.*", "ratings.*"));
+    }
+
     private Set<String> fieldNames(String query, Set<String> enrichPolicyMatchFields) {
         var preAnalysisResult = new EsqlSession.PreAnalysisResult(null);
         return EsqlSession.fieldNames(parser.createStatement(query), enrichPolicyMatchFields, preAnalysisResult).fieldNames();