-
Notifications
You must be signed in to change notification settings - Fork 25.6k
ES|QL: fix join masking eval #126614
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ES|QL: fix join masking eval #126614
Changes from 17 commits
27865d7
0d35e21
08803dc
b820057
fc44c26
132c587
dcc9e18
fb205bd
3c1ccea
7d5d67b
812cbe8
5f3b001
f2f0a3d
ae267ac
6720f9b
1a23c32
ab77736
cf4719d
ecb72e9
d6fdb22
e6f51bb
8744884
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,5 @@ | ||
| pr: 126614 | ||
| summary: Fix join masking eval | ||
| area: ES|QL | ||
| type: bug | ||
| issues: [] |
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -1000,7 +1000,13 @@ public enum Cap { | |||||
| /** | ||||||
| * Support loading of ip fields if they are not indexed. | ||||||
| */ | ||||||
| LOADING_NON_INDEXED_IP_FIELDS; | ||||||
| LOADING_NON_INDEXED_IP_FIELDS, | ||||||
|
|
||||||
| /** | ||||||
| * During resolution (pre-analysis) we have to consider that joins can override EVALuated values | ||||||
|
||||||
| * During resolution (pre-analysis) we have to consider that joins can override EVALuated values | |
| * During resolution (pre-analysis) we have to consider that joins or enriches can override EVALuated values |
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -60,13 +60,24 @@ | |||||
| import org.elasticsearch.xpack.esql.parser.QueryParams; | ||||||
| import org.elasticsearch.xpack.esql.plan.IndexPattern; | ||||||
| import org.elasticsearch.xpack.esql.plan.logical.Aggregate; | ||||||
| import org.elasticsearch.xpack.esql.plan.logical.Drop; | ||||||
| import org.elasticsearch.xpack.esql.plan.logical.Enrich; | ||||||
| import org.elasticsearch.xpack.esql.plan.logical.Eval; | ||||||
| import org.elasticsearch.xpack.esql.plan.logical.Filter; | ||||||
| import org.elasticsearch.xpack.esql.plan.logical.Fork; | ||||||
| import org.elasticsearch.xpack.esql.plan.logical.InlineStats; | ||||||
| import org.elasticsearch.xpack.esql.plan.logical.Insist; | ||||||
| import org.elasticsearch.xpack.esql.plan.logical.Keep; | ||||||
| import org.elasticsearch.xpack.esql.plan.logical.Limit; | ||||||
| import org.elasticsearch.xpack.esql.plan.logical.LogicalPlan; | ||||||
| import org.elasticsearch.xpack.esql.plan.logical.MvExpand; | ||||||
| import org.elasticsearch.xpack.esql.plan.logical.OrderBy; | ||||||
| import org.elasticsearch.xpack.esql.plan.logical.Project; | ||||||
| import org.elasticsearch.xpack.esql.plan.logical.RegexExtract; | ||||||
| import org.elasticsearch.xpack.esql.plan.logical.Rename; | ||||||
| import org.elasticsearch.xpack.esql.plan.logical.TopN; | ||||||
| import org.elasticsearch.xpack.esql.plan.logical.UnresolvedRelation; | ||||||
| import org.elasticsearch.xpack.esql.plan.logical.inference.Completion; | ||||||
| import org.elasticsearch.xpack.esql.plan.logical.inference.InferencePlan; | ||||||
| import org.elasticsearch.xpack.esql.plan.logical.join.InlineJoin; | ||||||
| import org.elasticsearch.xpack.esql.plan.logical.join.JoinTypes; | ||||||
|
|
@@ -500,6 +511,7 @@ private void preAnalyzeMainIndices( | |||||
|
|
||||||
| /** | ||||||
| * Check if there are any clusters to search. | ||||||
| * | ||||||
| * @return true if there are no clusters to search, false otherwise | ||||||
| */ | ||||||
| private boolean allCCSClustersSkipped( | ||||||
|
|
@@ -612,6 +624,8 @@ static PreAnalysisResult fieldNames(LogicalPlan parsed, Set<String> enrichPolicy | |||||
| var keepJoinRefsBuilder = AttributeSet.builder(); | ||||||
| Set<String> wildcardJoinIndices = new java.util.HashSet<>(); | ||||||
|
|
||||||
| boolean[] canRemoveAliases = new boolean[] { true }; | ||||||
|
|
||||||
| parsed.forEachDown(p -> {// go over each plan top-down | ||||||
| if (p instanceof RegexExtract re) { // for Grok and Dissect | ||||||
| // remove other down-the-tree references to the extracted fields | ||||||
|
|
@@ -657,20 +671,28 @@ static PreAnalysisResult fieldNames(LogicalPlan parsed, Set<String> enrichPolicy | |||||
| } | ||||||
| } | ||||||
|
|
||||||
| // remove any already discovered UnresolvedAttributes that are in fact aliases defined later down in the tree | ||||||
| // for example "from test | eval x = salary | stats max = max(x) by gender" | ||||||
| // remove the UnresolvedAttribute "x", since that is an Alias defined in "eval" | ||||||
| AttributeSet planRefs = p.references(); | ||||||
| Set<String> fieldNames = planRefs.names(); | ||||||
| p.forEachExpressionDown(Alias.class, alias -> { | ||||||
| // do not remove the UnresolvedAttribute that has the same name as its alias, ie "rename id = id" | ||||||
| // or the UnresolvedAttributes that are used in Functions that have aliases "STATS id = MAX(id)" | ||||||
| if (fieldNames.contains(alias.name())) { | ||||||
| return; | ||||||
| } | ||||||
| referencesBuilder.removeIf(attr -> matchByName(attr, alias.name(), keepCommandRefsBuilder.contains(attr))); | ||||||
| }); | ||||||
| // If there are joins, enriches etc. in the middle, these could override some of these fields. | ||||||
|
||||||
| // We don't know at this stage, so we have to keep all of them. | ||||||
| if (canRemoveAliases[0] && couldOverrideAliases(p)) { | ||||||
| canRemoveAliases[0] = false; | ||||||
| } | ||||||
| if (canRemoveAliases[0]) { | ||||||
| // remove any already discovered UnresolvedAttributes that are in fact aliases defined later down in the tree | ||||||
| // for example "from test | eval x = salary | stats max = max(x) by gender" | ||||||
| // remove the UnresolvedAttribute "x", since that is an Alias defined in "eval" | ||||||
| AttributeSet planRefs = p.references(); | ||||||
| Set<String> fieldNames = planRefs.names(); | ||||||
| p.forEachExpressionDown(Alias.class, alias -> { | ||||||
| // do not remove the UnresolvedAttribute that has the same name as its alias, ie "rename id = id" | ||||||
|
||||||
| // do not remove the UnresolvedAttribute that has the same name as its alias, ie "rename id = id" | |
| // do not remove the UnresolvedAttribute that has the same name as its alias, ie "rename id AS id" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why are you listing all of these that cannot define an additional "hidden" Attribute instead of checking those that can?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
To avoid that it breaks when we add new commands that behave like JOIN.
As a follow-up, IMHO we should define an interface for commands that allow this optimization (similar to SortAgnostic) and replace this long list with a single instanceof
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If we add a command that doesn't behave like JOIN, then it could break the other way around. Imo, it makes much more sense to list those commands that are "special".
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If we add a command that doesn't behave like JOIN, then it could break the other way around
This is interesting, I'm not sure I have completely clear the implications of this aspect.
If I got it right, we use these field names to limit the scope of the field_caps; if we add a field that does not exist in any of the involved indices, will field_caps fail?
My understanding is that it will work fine, just ignoring the additional fields.
If it wasn't the case, then we would be in a catch-22: we couldn't know which fields to send before validating the query, and we couldn't validate the query before sending the fields to field_caps.
The fact that everything works fine makes me think that it's safe, but maybe I'm missing something.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How about completion command (I see it implements GeneratingPlan), is this a command that can add a "hidden" attribute that can be overiden?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think Completion is safe (the target attribute is defined at parsing time), but I have to double-check.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If I got it right, we use these field names to limit the scope of the field_caps; if we add a field that does not exist in any of the involved indices, will field_caps fail?
My understanding is that it will work fine, just ignoring the additional fields.
Yep, I see your point. Adding more fields is safer. Added less fields is problematic. Better more fields than less.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -478,13 +478,16 @@ public void testDropAllColumns_WithStats() { | |
| } | ||
|
|
||
| public void testEnrichOn() { | ||
| assertFieldNames(""" | ||
| from employees | ||
| | sort emp_no | ||
| | limit 1 | ||
| | eval x = to_string(languages) | ||
| | enrich languages_policy on x | ||
| | keep emp_no, language_name""", Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*")); | ||
| assertFieldNames( | ||
| """ | ||
| from employees | ||
| | sort emp_no | ||
| | limit 1 | ||
| | eval x = to_string(languages) | ||
| | enrich languages_policy on x | ||
| | keep emp_no, language_name""", | ||
| Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*", "x", "x.*") | ||
| ); | ||
| } | ||
|
|
||
| public void testEnrichOn2() { | ||
|
|
@@ -494,7 +497,7 @@ public void testEnrichOn2() { | |
| | enrich languages_policy on x | ||
| | keep emp_no, language_name | ||
| | sort emp_no | ||
| | limit 1""", Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*")); | ||
| | limit 1""", Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*", "x", "x.*")); | ||
| } | ||
|
|
||
| public void testUselessEnrich() { | ||
|
|
@@ -512,15 +515,15 @@ public void testSimpleSortLimit() { | |
| | enrich languages_policy on x | ||
| | keep emp_no, language_name | ||
| | sort emp_no | ||
| | limit 1""", Set.of("languages", "languages.*", "emp_no", "emp_no.*", "language_name", "language_name.*")); | ||
| | limit 1""", Set.of("languages", "languages.*", "emp_no", "emp_no.*", "language_name", "language_name.*", "x", "x.*")); | ||
| } | ||
|
|
||
| public void testWith() { | ||
| assertFieldNames( | ||
| """ | ||
| from employees | eval x = to_string(languages) | keep emp_no, x | sort emp_no | limit 1 | ||
| | enrich languages_policy on x with language_name""", | ||
| Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*") | ||
| Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*", "x", "x.*") | ||
| ); | ||
| } | ||
|
|
||
|
|
@@ -529,7 +532,7 @@ public void testWithAlias() { | |
| """ | ||
| from employees | sort emp_no | limit 3 | eval x = to_string(languages) | keep emp_no, x | ||
| | enrich languages_policy on x with lang = language_name""", | ||
| Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*") | ||
| Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*", "x", "x.*") | ||
| ); | ||
| } | ||
|
|
||
|
|
@@ -538,7 +541,7 @@ public void testWithAliasSort() { | |
| """ | ||
| from employees | eval x = to_string(languages) | keep emp_no, x | sort emp_no | limit 3 | ||
| | enrich languages_policy on x with lang = language_name""", | ||
| Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*") | ||
| Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*", "x", "x.*") | ||
| ); | ||
| } | ||
|
|
||
|
|
@@ -547,7 +550,7 @@ public void testWithAliasAndPlain() { | |
| """ | ||
| from employees | sort emp_no desc | limit 3 | eval x = to_string(languages) | keep emp_no, x | ||
| | enrich languages_policy on x with lang = language_name, language_name""", | ||
| Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*") | ||
| Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*", "x", "x.*") | ||
| ); | ||
| } | ||
|
|
||
|
|
@@ -556,7 +559,7 @@ public void testWithTwoAliasesSameProp() { | |
| """ | ||
| from employees | sort emp_no | limit 1 | eval x = to_string(languages) | keep emp_no, x | ||
| | enrich languages_policy on x with lang = language_name, lang2 = language_name""", | ||
| Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*") | ||
| Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*", "x", "x.*") | ||
| ); | ||
| } | ||
|
|
||
|
|
@@ -565,7 +568,7 @@ public void testRedundantWith() { | |
| """ | ||
| from employees | sort emp_no | limit 1 | eval x = to_string(languages) | keep emp_no, x | ||
| | enrich languages_policy on x with language_name, language_name""", | ||
| Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*") | ||
| Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*", "x", "x.*") | ||
| ); | ||
| } | ||
|
|
||
|
|
@@ -588,28 +591,34 @@ public void testConstantNullInput() { | |
| | eval x = to_string(languages) | ||
| | keep emp_no, x | ||
| | enrich languages_policy on x with language_name, language_name""", | ||
| Set.of("languages", "languages.*", "emp_no", "emp_no.*", "language_name", "language_name.*") | ||
| Set.of("languages", "languages.*", "emp_no", "emp_no.*", "language_name", "language_name.*", "x", "x.*") | ||
| ); | ||
| } | ||
|
|
||
| public void testEnrichEval() { | ||
| assertFieldNames(""" | ||
| from employees | ||
| | eval x = to_string(languages) | ||
| | enrich languages_policy on x with lang = language_name | ||
| | eval language = concat(x, "-", lang) | ||
| | keep emp_no, x, lang, language | ||
| | sort emp_no desc | limit 3""", Set.of("languages", "languages.*", "emp_no", "emp_no.*", "language_name", "language_name.*")); | ||
| assertFieldNames( | ||
| """ | ||
| from employees | ||
| | eval x = to_string(languages) | ||
| | enrich languages_policy on x with lang = language_name | ||
| | eval language = concat(x, "-", lang) | ||
| | keep emp_no, x, lang, language | ||
| | sort emp_no desc | limit 3""", | ||
| Set.of("languages", "languages.*", "emp_no", "emp_no.*", "language_name", "language_name.*", "x", "x.*", "lang", "lang.*") | ||
| ); | ||
| } | ||
|
|
||
| public void testSimple() { | ||
| assertFieldNames(""" | ||
| from employees | ||
| | eval x = 1, y = to_string(languages) | ||
| | enrich languages_policy on y | ||
| | where x > 1 | ||
| | keep emp_no, language_name | ||
| | limit 1""", Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*")); | ||
| assertFieldNames( | ||
| """ | ||
| from employees | ||
| | eval x = 1, y = to_string(languages) | ||
| | enrich languages_policy on y | ||
| | where x > 1 | ||
| | keep emp_no, language_name | ||
| | limit 1""", | ||
| Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*", "x", "y", "x.*", "y.*") | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. And
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Exactly, we don't know the structure of the enrich index yet, after the ENRICH x could become something else (eg. a keyword) and the WHERE could be invalid |
||
| ); | ||
| } | ||
|
|
||
| public void testEvalNullSort() { | ||
|
|
@@ -1653,6 +1662,54 @@ public void testInsist_multiFieldMappedMultiIndex() { | |
| ); | ||
| } | ||
|
|
||
| public void testJoinMaskingKeep() { | ||
| assertFieldNames( | ||
| """ | ||
| from languag* | ||
| | eval type = null | ||
| | rename language_name as message | ||
| | lookup join message_types_lookup on message | ||
| | rename type as message | ||
| | lookup join message_types_lookup on message | ||
| | keep `language.name`""", | ||
| Set.of("language.name", "type", "language_name", "message", "language_name.*", "message.*", "type.*", "language.name.*") | ||
| ); | ||
| } | ||
|
|
||
| public void testJoinMaskingKeep2() { | ||
| assertFieldNames(""" | ||
| from languag* | ||
| | eval type = "foo" | ||
| | rename type as message | ||
| | lookup join message_types_lookup on message | ||
| | rename type as message | ||
| | lookup join message_types_lookup on message | ||
| | keep `language.name`""", Set.of("language.name", "type", "message", "message.*", "type.*", "language.name.*")); | ||
| } | ||
|
|
||
| public void testEnrichMaskingEvalOn() { | ||
| assertFieldNames(""" | ||
| from employees | ||
| | eval langague_name = null | ||
|
||
| | enrich languages_policy on languages | ||
| | rename language_name as languages | ||
| | eval languages = length(language_name) | ||
|
||
| | enrich languages_policy on languages | ||
| | keep emp_no, language_name""", Set.of("emp_no", "language_name", "languages", "language_name.*", "languages.*", "emp_no.*")); | ||
| } | ||
|
|
||
| public void testEnrichAndJoinMaskingEvalWh() { | ||
| assertFieldNames(""" | ||
| from employees | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same strange "typos" here as well. |
||
| | eval langague_name = null | ||
| | enrich languages_policy on languages | ||
| | rename language_name as languages | ||
| | eval languages = length(language_name) | ||
| | enrich languages_policy on languages | ||
| | lookup join message_types_lookup on language_name | ||
| | keep emp_no, language_name""", Set.of("emp_no", "language_name", "languages", "language_name.*", "languages.*", "emp_no.*")); | ||
| } | ||
|
|
||
| private Set<String> fieldNames(String query, Set<String> enrichPolicyMatchFields) { | ||
| var preAnalysisResult = new EsqlSession.PreAnalysisResult(null); | ||
| return EsqlSession.fieldNames(parser.createStatement(query), enrichPolicyMatchFields, preAnalysisResult).fieldNames(); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please, add this test and others (be creative) to
IndexResolverFieldNamesTests.