Skip to content

Commit 8500d42

Browse files
committed
ESQL: text == and text != pushdown (elastic#127355)
Reenables `text ==` pushdown and adds support for `text !=` pushdown. It does so by making `TranslationAware#translatable` return something we can turn into a tri-valued function. It has these values: * `YES` * `NO` * `RECHECK` `YES` means the `Expression` is entirely pushable into Lucene. It will be pushed into Lucene and removed from the plan. `NO` means the `Expression` can't be pushed to Lucene at all and will stay in the plan. `RECHECK` means the `Expression` can push a query that makes *candidate* matches but must be rechecked. Documents that don't match the query won't match the expression, but documents that match the query might not match the expression. These are pushed to Lucene *and* left in the plan. This is required because `txt != "b"` can build a *candidate* query against the `txt.keyword` subfield but it can't be sure of the match without loading the `_source` - which we do in the compute engine. I haven't plugged rally into this, but here are some basic performance tests: ``` Before: not text eq {"took":460,"documents_found":1000000} text eq {"took":432,"documents_found":1000000} After: text eq {"took":5,"documents_found":1} not text eq {"took":351,"documents_found":800000} ``` This comes from: ``` rm -f /tmp/bulk* for a in {1..1000}; do echo '{"index":{}}' >> /tmp/bulk echo '{"text":"text '$(printf $(($a % 5)))'"}' >> /tmp/bulk done ls -l /tmp/bulk* passwd="redacted" curl -sk -uelastic:$passwd -HContent-Type:application/json -XDELETE https://localhost:9200/test curl -sk -uelastic:$passwd -HContent-Type:application/json -XPUT https://localhost:9200/test -d'{ "settings": { "index.codec": "best_compression", "index.refresh_interval": -1 }, "mappings": { "properties": { "many": { "enabled": false } } } }' for a in {1..1000}; do printf %04d: $a curl -sk -uelastic:$passwd -HContent-Type:application/json -XPOST https://localhost:9200/test/_bulk?pretty --data-binary @/tmp/bulk | grep errors done curl -sk -uelastic:$passwd -HContent-Type:application/json 
-XPOST https://localhost:9200/test/_forcemerge?max_num_segments=1 curl -sk -uelastic:$passwd -HContent-Type:application/json -XPOST https://localhost:9200/test/_refresh echo curl -sk -uelastic:$passwd https://localhost:9200/_cat/indices?v text_eq() { echo -n " text eq " curl -sk -uelastic:$passwd -HContent-Type:application/json -XPOST 'https://localhost:9200/_query?pretty' -d'{ "query": "FROM test | WHERE text == \"text 1\" | STATS COUNT(*)", "pragma": { "data_partitioning": "shard" } }' | jq -c '{took, documents_found}' } not_text_eq() { echo -n "not text eq " curl -sk -uelastic:$passwd -HContent-Type:application/json -XPOST 'https://localhost:9200/_query?pretty' -d'{ "query": "FROM test | WHERE NOT text == \"text 1\" | STATS COUNT(*)", "pragma": { "data_partitioning": "shard" } }' | jq -c '{took, documents_found}' } for a in {1..100}; do text_eq not_text_eq done ```
1 parent e0b4471 commit 8500d42

File tree

30 files changed

+558
-122
lines changed

30 files changed

+558
-122
lines changed

docs/changelog/127355.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 127355
2+
summary: '`text ==` and `text !=` pushdown'
3+
area: ES|QL
4+
type: enhancement
5+
issues: []

x-pack/plugin/esql/qa/server/single-node/src/javaRestTest/java/org/elasticsearch/xpack/esql/qa/single_node/PushQueriesIT.java

Lines changed: 168 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,12 @@
77

88
package org.elasticsearch.xpack.esql.qa.single_node;
99

10+
import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
1011
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakFilters;
1112

1213
import org.elasticsearch.client.Request;
1314
import org.elasticsearch.client.Response;
15+
import org.elasticsearch.client.ResponseException;
1416
import org.elasticsearch.test.ListMatcher;
1517
import org.elasticsearch.test.MapMatcher;
1618
import org.elasticsearch.test.TestClustersThreadFilter;
@@ -26,6 +28,7 @@
2628
import java.util.List;
2729
import java.util.Map;
2830
import java.util.regex.Pattern;
31+
import java.util.stream.Stream;
2932

3033
import static org.elasticsearch.test.ListMatcher.matchesList;
3134
import static org.elasticsearch.test.MapMatcher.assertMap;
@@ -47,50 +50,161 @@ public class PushQueriesIT extends ESRestTestCase {
4750
@ClassRule
4851
public static ElasticsearchCluster cluster = Clusters.testCluster();
4952

50-
public void testPushEqualityOnDefaults() throws IOException {
53+
@ParametersFactory(argumentFormatting = "%1s")
54+
public static List<Object[]> args() {
55+
return Stream.of("auto", "text", "match_only_text", "semantic_text").map(s -> new Object[] { s }).toList();
56+
}
57+
58+
private final String type;
59+
60+
public PushQueriesIT(String type) {
61+
this.type = type;
62+
}
63+
64+
public void testEquality() throws IOException {
5165
String value = "v".repeat(between(0, 256));
52-
testPushQuery(value, """
66+
String esqlQuery = """
5367
FROM test
5468
| WHERE test == "%value"
55-
""", "*:*", true, true);
69+
""";
70+
String luceneQuery = switch (type) {
71+
case "text", "auto" -> "#test.keyword:%value -_ignored:test.keyword";
72+
case "match_only_text" -> "*:*";
73+
case "semantic_text" -> "FieldExistsQuery [field=_primary_term]";
74+
default -> throw new UnsupportedOperationException("unknown type [" + type + "]");
75+
};
76+
boolean filterInCompute = switch (type) {
77+
case "text", "auto" -> false;
78+
case "match_only_text", "semantic_text" -> true;
79+
default -> throw new UnsupportedOperationException("unknown type [" + type + "]");
80+
};
81+
testPushQuery(value, esqlQuery, luceneQuery, filterInCompute, true);
5682
}
5783

58-
public void testPushEqualityOnDefaultsTooBigToPush() throws IOException {
84+
public void testEqualityTooBigToPush() throws IOException {
5985
String value = "a".repeat(between(257, 1000));
60-
testPushQuery(value, """
86+
String esqlQuery = """
6187
FROM test
6288
| WHERE test == "%value"
63-
""", "*:*", true, true);
89+
""";
90+
String luceneQuery = switch (type) {
91+
case "text", "auto", "match_only_text" -> "*:*";
92+
case "semantic_text" -> "FieldExistsQuery [field=_primary_term]";
93+
default -> throw new UnsupportedOperationException("unknown type [" + type + "]");
94+
};
95+
testPushQuery(value, esqlQuery, luceneQuery, true, true);
96+
}
97+
98+
/**
99+
* Turns into an {@code IN} which isn't currently pushed.
100+
*/
101+
public void testEqualityOrTooBig() throws IOException {
102+
String value = "v".repeat(between(0, 256));
103+
String tooBig = "a".repeat(between(257, 1000));
104+
String esqlQuery = """
105+
FROM test
106+
| WHERE test == "%value" OR test == "%tooBig"
107+
""".replace("%tooBig", tooBig);
108+
String luceneQuery = switch (type) {
109+
case "text", "auto", "match_only_text" -> "*:*";
110+
case "semantic_text" -> "FieldExistsQuery [field=_primary_term]";
111+
default -> throw new UnsupportedOperationException("unknown type [" + type + "]");
112+
};
113+
testPushQuery(value, esqlQuery, luceneQuery, true, true);
114+
}
115+
116+
public void testEqualityOrOther() throws IOException {
117+
String value = "v".repeat(between(0, 256));
118+
String esqlQuery = """
119+
FROM test
120+
| WHERE test == "%value" OR foo == 2
121+
""";
122+
String luceneQuery = switch (type) {
123+
case "text", "auto" -> "(#test.keyword:%value -_ignored:test.keyword) foo:[2 TO 2]";
124+
case "match_only_text" -> "*:*";
125+
case "semantic_text" -> "FieldExistsQuery [field=_primary_term]";
126+
default -> throw new UnsupportedOperationException("unknown type [" + type + "]");
127+
};
128+
boolean filterInCompute = switch (type) {
129+
case "text", "auto" -> false;
130+
case "match_only_text", "semantic_text" -> true;
131+
default -> throw new UnsupportedOperationException("unknown type [" + type + "]");
132+
};
133+
testPushQuery(value, esqlQuery, luceneQuery, filterInCompute, true);
64134
}
65135

66-
public void testPushInequalityOnDefaults() throws IOException {
136+
public void testEqualityAndOther() throws IOException {
67137
String value = "v".repeat(between(0, 256));
68-
testPushQuery(value, """
138+
String esqlQuery = """
139+
FROM test
140+
| WHERE test == "%value" AND foo == 1
141+
""";
142+
String luceneQuery = switch (type) {
143+
case "text", "auto" -> "#test.keyword:%value -_ignored:test.keyword #foo:[1 TO 1]";
144+
case "match_only_text" -> "foo:[1 TO 1]";
145+
case "semantic_text" ->
146+
/*
147+
* single_value_match is here because there are extra documents hiding in the index
148+
* that don't have the `foo` field.
149+
*/
150+
"#foo:[1 TO 1] #single_value_match(foo)";
151+
default -> throw new UnsupportedOperationException("unknown type [" + type + "]");
152+
};
153+
boolean filterInCompute = switch (type) {
154+
case "text", "auto" -> false;
155+
case "match_only_text", "semantic_text" -> true;
156+
default -> throw new UnsupportedOperationException("unknown type [" + type + "]");
157+
};
158+
testPushQuery(value, esqlQuery, luceneQuery, filterInCompute, true);
159+
}
160+
161+
public void testInequality() throws IOException {
162+
String value = "v".repeat(between(0, 256));
163+
String esqlQuery = """
69164
FROM test
70165
| WHERE test != "%different_value"
71-
""", "*:*", true, true);
166+
""";
167+
String luceneQuery = switch (type) {
168+
case "text", "auto" -> "(-test.keyword:%different_value #*:*) _ignored:test.keyword";
169+
case "match_only_text" -> "*:*";
170+
case "semantic_text" -> "FieldExistsQuery [field=_primary_term]";
171+
default -> throw new UnsupportedOperationException("unknown type [" + type + "]");
172+
};
173+
testPushQuery(value, esqlQuery, luceneQuery, true, true);
72174
}
73175

74-
public void testPushInequalityOnDefaultsTooBigToPush() throws IOException {
176+
public void testInequalityTooBigToPush() throws IOException {
75177
String value = "a".repeat(between(257, 1000));
76-
testPushQuery(value, """
178+
String esqlQuery = """
77179
FROM test
78180
| WHERE test != "%value"
79-
""", "*:*", true, false);
181+
""";
182+
String luceneQuery = switch (type) {
183+
case "text", "auto", "match_only_text" -> "*:*";
184+
case "semantic_text" -> "FieldExistsQuery [field=_primary_term]";
185+
default -> throw new UnsupportedOperationException("unknown type [" + type + "]");
186+
};
187+
testPushQuery(value, esqlQuery, luceneQuery, true, false);
80188
}
81189

82-
public void testPushCaseInsensitiveEqualityOnDefaults() throws IOException {
190+
public void testCaseInsensitiveEquality() throws IOException {
83191
String value = "a".repeat(between(0, 256));
84-
testPushQuery(value, """
192+
String esqlQuery = """
85193
FROM test
86194
| WHERE TO_LOWER(test) == "%value"
87-
""", "*:*", true, true);
195+
""";
196+
String luceneQuery = switch (type) {
197+
case "text", "auto", "match_only_text" -> "*:*";
198+
case "semantic_text" -> "FieldExistsQuery [field=_primary_term]";
199+
default -> throw new UnsupportedOperationException("unknown type [" + type + "]");
200+
};
201+
testPushQuery(value, esqlQuery, luceneQuery, true, true);
88202
}
89203

90204
private void testPushQuery(String value, String esqlQuery, String luceneQuery, boolean filterInCompute, boolean found)
91205
throws IOException {
92206
indexValue(value);
93-
String differentValue = randomValueOtherThan(value, () -> randomAlphaOfLength(value.length()));
207+
String differentValue = randomValueOtherThan(value, () -> randomAlphaOfLength(value.isEmpty() ? 1 : value.length()));
94208

95209
String replacedQuery = esqlQuery.replaceAll("%value", value).replaceAll("%different_value", differentValue);
96210
RestEsqlTestCase.RequestObjectBuilder builder = requestObjectBuilder().query(replacedQuery + "\n| KEEP test");
@@ -142,15 +256,43 @@ private void testPushQuery(String value, String esqlQuery, String luceneQuery, b
142256
}
143257

144258
private void indexValue(String value) throws IOException {
259+
try {
260+
// Delete the index if it has already been created.
261+
client().performRequest(new Request("DELETE", "test"));
262+
} catch (ResponseException e) {
263+
if (e.getResponse().getStatusLine().getStatusCode() != 404) {
264+
throw e;
265+
}
266+
}
267+
145268
Request createIndex = new Request("PUT", "test");
146-
createIndex.setJsonEntity("""
269+
String json = """
147270
{
148271
"settings": {
149272
"index": {
150273
"number_of_shards": 1
151274
}
152-
}
153-
}""");
275+
}""";
276+
if (false == "auto".equals(type)) {
277+
json += """
278+
,
279+
"mappings": {
280+
"properties": {
281+
"test": {
282+
"type": "%type",
283+
"fields": {
284+
"keyword": {
285+
"type": "keyword",
286+
"ignore_above": 256
287+
}
288+
}
289+
}
290+
}
291+
}
292+
}""".replace("%type", type);
293+
}
294+
json += "}";
295+
createIndex.setJsonEntity(json);
154296
Response createResponse = client().performRequest(createIndex);
155297
assertThat(
156298
entityToMap(createResponse.getEntity(), XContentType.JSON),
@@ -161,7 +303,7 @@ private void indexValue(String value) throws IOException {
161303
bulk.addParameter("refresh", "");
162304
bulk.setJsonEntity(String.format("""
163305
{"create":{"_index":"test"}}
164-
{"test":"%s"}
306+
{"test":"%s","foo":1}
165307
""", value));
166308
Response bulkResponse = client().performRequest(bulk);
167309
assertThat(entityToMap(bulkResponse.getEntity(), XContentType.JSON), matchesMap().entry("errors", false).extraOk());
@@ -184,4 +326,10 @@ private static String checkOperatorProfile(Map<String, Object> o, String query)
184326
protected String getTestRestCluster() {
185327
return cluster.getHttpAddresses();
186328
}
329+
330+
@Override
331+
protected boolean preserveClusterUponCompletion() {
332+
// Preserve the cluser to speed up the semantic_text tests
333+
return true;
334+
}
187335
}

x-pack/plugin/esql/qa/testFixtures/src/main/resources/data/mv_text.csv

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,4 @@
33
2023-10-23T13:55:01.544Z,Connected to 10.1.0.1
44
2023-10-23T13:55:01.545Z,[Connected to 10.1.0.1, More than one hundred characters long so it isn't indexed by the sub keyword field with ignore_above:100]
55
2023-10-23T13:55:01.546Z,More than one hundred characters long so it isn't indexed by the sub keyword field with ignore_above:100
6+
2023-10-23T13:55:01.547Z,[More than one hundred characters long so it isn't indexed by the sub keyword field with ignore_above:100,Second than one hundred characters long so it isn't indexed by the sub keyword field with ignore_above:100]

0 commit comments

Comments
 (0)