Skip to content

Commit 2dc646f

Browse files
astefankanoshiouelasticsearchmachine
authored
ESQL: Fix alias removal in regex extraction with JOIN (#127687) (#128204)
* ESQL: Fix alias removal in regex extraction with `JOIN` (#127687) * Disallow removal of regex extracted fields --------- Co-authored-by: Andrei Stefan <[email protected]> Co-authored-by: elasticsearchmachine <[email protected]> (cherry picked from commit 557f1f1) * Checkstyle --------- Co-authored-by: kanoshiou <[email protected]> Co-authored-by: elasticsearchmachine <[email protected]>
1 parent fbcdb8d commit 2dc646f

File tree

5 files changed

+139
-9
lines changed

5 files changed

+139
-9
lines changed

docs/changelog/127687.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
pr: 127687
2+
summary: "ESQL: Fix alias removal in regex extraction with JOIN"
3+
area: ES|QL
4+
type: bug
5+
issues:
6+
- 127467

x-pack/plugin/esql/qa/testFixtures/src/main/resources/lookup-join.csv-spec

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1567,3 +1567,72 @@ null | Milky Way | Marunouchi
15671567
null | null | null
15681568
null | null | null
15691569
;
1570+
1571+
1572+
joinMaskingRegex
1573+
// https://github.com/elastic/elasticsearch/issues/127467
1574+
required_capability: union_types
1575+
required_capability: join_lookup_v12
1576+
required_capability: fix_join_masking_regex_extract
1577+
from books,message_*,ul*
1578+
| enrich languages_policy on status
1579+
| drop `language_name`, `bytes_out`, `id`, id
1580+
| dissect book_no "%{type}"
1581+
| dissect author.keyword "%{HZicfARaID}"
1582+
| mv_expand `status`
1583+
| sort HZicfARaID, year DESC NULLS LAST, publisher DESC NULLS FIRST, description DESC, type NULLS LAST, message ASC NULLS LAST, title NULLS FIRST, status NULLS LAST
1584+
| enrich languages_policy on book_no
1585+
| grok message "%{WORD:DiLNyZKNDu}"
1586+
| limit 7972
1587+
| rename year as language_code
1588+
| lookup join languages_lookup on language_code
1589+
| limit 13966
1590+
| stats rcyIZnSOb = min(language_code), `ratings` = min(@timestamp), dgDxwMeFYrD = count(`@timestamp`), ifyZfXigqVN = count(*), qTXdrzSpY = min(language_code) by author.keyword
1591+
| rename author.keyword as message
1592+
| lookup join message_types_lookup on message
1593+
| stats `ratings` = count(*) by type
1594+
| stats `type` = count(type), `ratings` = count(*)
1595+
| keep `ratings`, ratings
1596+
;
1597+
1598+
ratings:long
1599+
1
1600+
;
1601+
1602+
joinMaskingDissect
1603+
// https://github.com/elastic/elasticsearch/issues/127467
1604+
required_capability: join_lookup_v12
1605+
required_capability: fix_join_masking_regex_extract
1606+
from sample_data
1607+
| dissect message "%{type}"
1608+
| drop type
1609+
| lookup join message_types_lookup on message
1610+
| stats count = count(*) by type
1611+
| keep count
1612+
| sort count
1613+
;
1614+
count:long
1615+
1
1616+
3
1617+
3
1618+
;
1619+
1620+
1621+
joinMaskingGrok
1622+
// https://github.com/elastic/elasticsearch/issues/127467
1623+
required_capability: join_lookup_v12
1624+
required_capability: fix_join_masking_regex_extract
1625+
from sample_data
1626+
| grok message "%{WORD:type}"
1627+
| drop type
1628+
| lookup join message_types_lookup on message
1629+
| stats max = max(event_duration) by type
1630+
| keep max
1631+
| sort max
1632+
;
1633+
1634+
max:long
1635+
1232382
1636+
3450233
1637+
8268153
1638+
;

x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -854,7 +854,13 @@ public enum Cap {
854854
* Support for keeping `DROP` attributes when resolving field names.
855855
* see <a href="https://github.com/elastic/elasticsearch/issues/126418"> ES|QL: no matches for pattern #126418 </a>
856856
*/
857-
DROP_AGAIN_WITH_WILDCARD_AFTER_EVAL;
857+
DROP_AGAIN_WITH_WILDCARD_AFTER_EVAL,
858+
859+
/**
860+
* During resolution (pre-analysis) we have to consider that joins can override regex extracted values
861+
* see <a href="https://github.com/elastic/elasticsearch/issues/127467"> ES|QL: pruning of JOINs leads to missing fields #127467</a>
862+
*/
863+
FIX_JOIN_MASKING_REGEX_EXTRACT;
858864

859865
private final boolean enabled;
860866

x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/session/EsqlSession.java

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@
3939
import org.elasticsearch.xpack.esql.core.expression.Expressions;
4040
import org.elasticsearch.xpack.esql.core.expression.FoldContext;
4141
import org.elasticsearch.xpack.esql.core.expression.MetadataAttribute;
42+
import org.elasticsearch.xpack.esql.core.expression.NamedExpression;
43+
import org.elasticsearch.xpack.esql.core.expression.ReferenceAttribute;
4244
import org.elasticsearch.xpack.esql.core.expression.UnresolvedAttribute;
4345
import org.elasticsearch.xpack.esql.core.expression.UnresolvedStar;
4446
import org.elasticsearch.xpack.esql.core.util.Holder;
@@ -570,11 +572,7 @@ static PreAnalysisResult fieldNames(LogicalPlan parsed, Set<String> enrichPolicy
570572

571573
parsed.forEachDown(p -> {// go over each plan top-down
572574
if (p instanceof RegexExtract re) { // for Grok and Dissect
573-
// remove other down-the-tree references to the extracted fields
574-
for (Attribute extracted : re.extractedFields()) {
575-
referencesBuilder.removeIf(attr -> matchByName(attr, extracted.name(), false));
576-
}
577-
// but keep the inputs needed by Grok/Dissect
575+
// keep the inputs needed by Grok/Dissect
578576
referencesBuilder.addAll(re.input().references());
579577
} else if (p instanceof Enrich enrich) {
580578
AttributeSet enrichFieldRefs = Expressions.references(enrich.enrichFields());
@@ -629,15 +627,19 @@ static PreAnalysisResult fieldNames(LogicalPlan parsed, Set<String> enrichPolicy
629627
// remove any already discovered UnresolvedAttributes that are in fact aliases defined later down in the tree
630628
// for example "from test | eval x = salary | stats max = max(x) by gender"
631629
// remove the UnresolvedAttribute "x", since that is an Alias defined in "eval"
630+
// also remove other down-the-tree references to the extracted fields from "grok" and "dissect"
632631
AttributeSet planRefs = p.references();
633632
Set<String> fieldNames = planRefs.names();
634-
p.forEachExpressionDown(Alias.class, alias -> {
633+
p.forEachExpressionDown(NamedExpression.class, ne -> {
634+
if ((ne instanceof Alias || ne instanceof ReferenceAttribute) == false) {
635+
return;
636+
}
635637
// do not remove the UnresolvedAttribute that has the same name as its alias, ie "rename id AS id"
636638
// or the UnresolvedAttributes that are used in Functions that have aliases "STATS id = MAX(id)"
637-
if (fieldNames.contains(alias.name())) {
639+
if (fieldNames.contains(ne.name())) {
638640
return;
639641
}
640-
referencesBuilder.removeIf(attr -> matchByName(attr, alias.name(), shadowingRefsBuilder.contains(attr)));
642+
referencesBuilder.removeIf(attr -> matchByName(attr, ne.name(), shadowingRefsBuilder.contains(attr)));
641643
});
642644
}
643645
});

x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/session/IndexResolverFieldNamesTests.java

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1341,6 +1341,53 @@ public void testDissectOverwriteName() {
13411341
assertThat(fieldNames, equalTo(Set.of("emp_no", "emp_no.*", "first_name", "first_name.*")));
13421342
}
13431343

1344+
/**
1345+
* Fix alias removal in regex extraction with JOIN
1346+
* @see <a href="https://github.com/elastic/elasticsearch/issues/127467">ES|QL: pruning of JOINs leads to missing fields</a>
1347+
*/
1348+
public void testAvoidGrokAttributesRemoval() {
1349+
assumeTrue("LOOKUP JOIN available as snapshot only", EsqlCapabilities.Cap.JOIN_LOOKUP_V12.isEnabled());
1350+
Set<String> fieldNames = fieldNames("""
1351+
from message_types
1352+
| eval type = 1
1353+
| lookup join message_types_lookup on message
1354+
| drop message
1355+
| grok type "%{WORD:b}"
1356+
| stats x = max(b)
1357+
| keep x""", Set.of());
1358+
assertThat(fieldNames, equalTo(Set.of("message", "x", "x.*", "message.*")));
1359+
}
1360+
1361+
public void testAvoidGrokAttributesRemoval2() {
1362+
assumeTrue("LOOKUP JOIN available as snapshot only", EsqlCapabilities.Cap.JOIN_LOOKUP_V12.isEnabled());
1363+
Set<String> fieldNames = fieldNames("""
1364+
from sample_data
1365+
| dissect message "%{type}"
1366+
| drop type
1367+
| lookup join message_types_lookup on message
1368+
| stats count = count(*) by type
1369+
| keep count
1370+
| sort count""", Set.of());
1371+
assertThat(fieldNames, equalTo(Set.of("type", "message", "count", "message.*", "type.*", "count.*")));
1372+
}
1373+
1374+
public void testAvoidGrokAttributesRemoval3() {
1375+
assumeTrue("LOOKUP JOIN available as snapshot only", EsqlCapabilities.Cap.JOIN_LOOKUP_V12.isEnabled());
1376+
Set<String> fieldNames = fieldNames("""
1377+
from sample_data
1378+
| grok message "%{WORD:type}"
1379+
| drop type
1380+
| lookup join message_types_lookup on message
1381+
| stats max = max(event_duration) by type
1382+
| keep max
1383+
| sort max""", Set.of());
1384+
assertThat(
1385+
fieldNames,
1386+
equalTo(Set.of("type", "event_duration", "message", "max", "event_duration.*", "message.*", "type.*", "max.*"))
1387+
);
1388+
1389+
}
1390+
13441391
public void testEnrichOnDefaultField() {
13451392
Set<String> fieldNames = fieldNames("""
13461393
from employees

0 commit comments

Comments
 (0)