Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/changelog/127687.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
pr: 127687
summary: "ESQL: Fix alias removal in regex extraction with JOIN"
area: ES|QL
type: bug
issues:
- 127467
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,6 @@ public abstract class GenerativeRestTest extends ESRestTestCase {
// https://github.com/elastic/elasticsearch/issues/127167
"optimized incorrectly due to missing references", // https://github.com/elastic/elasticsearch/issues/116781
"No matches found for pattern", // https://github.com/elastic/elasticsearch/issues/126418
"Unknown column", // https://github.com/elastic/elasticsearch/issues/127467
"only supports KEYWORD or TEXT values", // https://github.com/elastic/elasticsearch/issues/127468
"The incoming YAML document exceeds the limit:" // still to investigate, but it seems to be specific to the test framework
);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1650,3 +1650,69 @@ event_duration:long
2764889
3450233
;


joinMaskingRegex
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not a big fan of having such test queries here. At least, add a comment with the link to the original bug report.

required_capability: union_types
required_capability: join_lookup_v12
required_capability: fix_join_masking_regex_extract
from books,message_*,ul*
| enrich languages_policy on status
| drop `language_name`, `bytes_out`, `id`, id
| dissect book_no "%{type}"
| dissect author.keyword "%{HZicfARaID}"
| mv_expand `status`
| sort HZicfARaID, year DESC NULLS LAST, publisher DESC NULLS FIRST, description DESC, type NULLS LAST, message ASC NULLS LAST, title NULLS FIRST, status NULLS LAST
| enrich languages_policy on book_no
| grok message "%{WORD:DiLNyZKNDu}"
| limit 7972
| rename year as language_code
| lookup join languages_lookup on language_code
| limit 13966
| stats rcyIZnSOb = min(language_code), `ratings` = min(@timestamp), dgDxwMeFYrD = count(`@timestamp`), ifyZfXigqVN = count(*), qTXdrzSpY = min(language_code) by author.keyword
| rename author.keyword as message
| lookup join message_types_lookup on message
| stats `ratings` = count(*) by type
| stats `type` = count(type), `ratings` = count(*)
| keep `ratings`, ratings
;

ratings:long
1
;

joinMaskingDissect
required_capability: join_lookup_v12
required_capability: fix_join_masking_regex_extract
from sample_data
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add this test and the one below in IndexResolverFieldNamesTests as well.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you missed this request here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, sorry! I misunderstood what you meant. I was actually thinking about adding comments of links to the original issue.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added in d56383b

| dissect message "%{type}"
| drop type
| lookup join message_types_lookup on message
| stats count = count(*) by type
| keep count
| sort count
;
count:long
1
3
3
;


joinMaskingGrok
required_capability: join_lookup_v12
required_capability: fix_join_masking_regex_extract
from sample_data
| grok message "%{WORD:type}"
| drop type
| lookup join message_types_lookup on message
| stats max = max(event_duration) by type
| keep max
| sort max
;

max:long
1232382
3450233
8268153
;
Original file line number Diff line number Diff line change
Expand Up @@ -1036,6 +1036,12 @@ public enum Cap {
*/
FIX_JOIN_MASKING_EVAL,

/**
* During resolution (pre-analysis) we have to consider that joins can override regex extracted values
* see <a href="https://github.com/elastic/elasticsearch/issues/127467"> ES|QL: pruning of JOINs leads to missing fields #127467 </a>
*/
FIX_JOIN_MASKING_REGEX_EXTRACT,

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please, add this capability at the end of the list of capabilities. It will be slightly easier for me to manually backport the PR, if the automatic backport fails.

/**
* Support last_over_time aggregation that gets evaluated per time-series
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -600,9 +600,11 @@ static PreAnalysisResult fieldNames(LogicalPlan parsed, Set<String> enrichPolicy

parsed.forEachDown(p -> {// go over each plan top-down
if (p instanceof RegexExtract re) { // for Grok and Dissect
// remove other down-the-tree references to the extracted fields
for (Attribute extracted : re.extractedFields()) {
referencesBuilder.removeIf(attr -> matchByName(attr, extracted.name(), false));
if (canRemoveAliases[0]) {
// remove other down-the-tree references to the extracted fields
for (Attribute extracted : re.extractedFields()) {
referencesBuilder.removeIf(attr -> matchByName(attr, extracted.name(), false));
}
}
// but keep the inputs needed by Grok/Dissect
referencesBuilder.addAll(re.input().references());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1710,6 +1710,28 @@ public void testEnrichAndJoinMaskingEvalWh() {
| keep emp_no, language_name""", Set.of("emp_no", "language_name", "languages", "language_name.*", "languages.*", "emp_no.*"));
}

public void testJoinMaskingGrok() {
assertFieldNames("""
from message_types
| grok message "%{WORD:type}"
| drop type
| lookup join message_types_lookup on message
| stats `ratings` = count(*) by type
| keep ratings
""", Set.of("type", "message", "ratings", "message.*", "type.*", "ratings.*"));
}

public void testJoinMaskingDissect() {
assertFieldNames("""
from message_types
| dissect message "%{type}"
| stats message = max(message)
| lookup join message_types_lookup on message
| stats `ratings` = count(*) by type
| keep ratings
""", Set.of("type", "message", "ratings", "message.*", "type.*", "ratings.*"));
}

private Set<String> fieldNames(String query, Set<String> enrichPolicyMatchFields) {
var preAnalysisResult = new EsqlSession.PreAnalysisResult(null);
return EsqlSession.fieldNames(parser.createStatement(query), enrichPolicyMatchFields, preAnalysisResult).fieldNames();
Expand Down
Loading