Skip to content

Commit c59b7d9

Browse files
committed
ESQL: Push more ==s on text fields to lucene (elastic#126641)
If you do: ``` | WHERE text_field == "cat" ``` we can't push to the text field because its search index is for individual words. But most text fields have a `.keyword` sub field and we *can* query its index. EXCEPT! It's normal for these fields to have `ignore_above` in their mapping. In that case we don't push to the field. Very sad. With this change we can push down `==`, but only when the right hand side is no longer than the `ignore_above`. This has pretty much infinite speed gain. An example using a million documents: ``` Before: "took" : 391, After: "took" : 4, ``` But this is going from totally un-indexed linear scans to totally indexed. You can make the "Before" number as high as you want by loading more data.
1 parent 31e4aed commit c59b7d9

File tree

48 files changed

+738
-94
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

48 files changed

+738
-94
lines changed

docs/changelog/126641.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 126641
2+
summary: Push more `==`s on text fields to lucene
3+
area: ES|QL
4+
type: enhancement
5+
issues: []

server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -986,6 +986,21 @@ public boolean canUseSyntheticSourceDelegateForQuerying() {
986986
&& syntheticSourceDelegate.isIndexed();
987987
}
988988

989+
/**
990+
* Returns true if an equality comparison against {@code str} can be pushed to the delegate sub-field's index: the delegate must exist, be indexed, have doc values, and {@code str} must be no longer than the delegate's {@code ignoreAbove()}.
991+
*/
992+
public boolean canUseSyntheticSourceDelegateForQueryingEquality(String str) {
993+
if (syntheticSourceDelegate == null
994+
// Can't push equality to an index if there isn't an index
995+
|| syntheticSourceDelegate.isIndexed() == false
996+
// ESQL needs doc values to push equality
997+
|| syntheticSourceDelegate.hasDocValues() == false) {
998+
return false;
999+
}
1000+
// Can't push equality if the value we're checking for is so long that the delegate would have ignored it.
1001+
return str.length() <= syntheticSourceDelegate.ignoreAbove();
1002+
}
1003+
9891004
@Override
9901005
public BlockLoader blockLoader(BlockLoaderContext blContext) {
9911006
if (canUseSyntheticSourceDelegateForLoading()) {
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,163 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the Elastic License
4+
* 2.0; you may not use this file except in compliance with the Elastic License
5+
* 2.0.
6+
*/
7+
8+
package org.elasticsearch.xpack.esql.qa.single_node;
9+
10+
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakFilters;
11+
12+
import org.elasticsearch.client.Request;
13+
import org.elasticsearch.client.Response;
14+
import org.elasticsearch.test.ListMatcher;
15+
import org.elasticsearch.test.MapMatcher;
16+
import org.elasticsearch.test.TestClustersThreadFilter;
17+
import org.elasticsearch.test.cluster.ElasticsearchCluster;
18+
import org.elasticsearch.test.rest.ESRestTestCase;
19+
import org.elasticsearch.xcontent.XContentType;
20+
import org.elasticsearch.xpack.esql.AssertWarnings;
21+
import org.elasticsearch.xpack.esql.qa.rest.RestEsqlTestCase;
22+
import org.junit.ClassRule;
23+
24+
import java.io.IOException;
25+
import java.util.ArrayList;
26+
import java.util.List;
27+
import java.util.Map;
28+
import java.util.regex.Pattern;
29+
30+
import static org.elasticsearch.test.ListMatcher.matchesList;
31+
import static org.elasticsearch.test.MapMatcher.assertMap;
32+
import static org.elasticsearch.test.MapMatcher.matchesMap;
33+
import static org.elasticsearch.xpack.esql.EsqlTestUtils.entityToMap;
34+
import static org.elasticsearch.xpack.esql.qa.rest.RestEsqlTestCase.requestObjectBuilder;
35+
import static org.elasticsearch.xpack.esql.qa.rest.RestEsqlTestCase.runEsql;
36+
import static org.elasticsearch.xpack.esql.qa.single_node.RestEsqlIT.commonProfile;
37+
import static org.elasticsearch.xpack.esql.qa.single_node.RestEsqlIT.fixTypesOnProfile;
38+
import static org.hamcrest.Matchers.equalTo;
39+
import static org.hamcrest.Matchers.instanceOf;
40+
import static org.hamcrest.Matchers.startsWith;
41+
42+
/**
43+
* Tests for pushing queries to lucene. Each test indexes a single document into the {@code test} index, runs an ESQL query with profiling enabled, and checks the lucene query and the operator chain reported in the profile.
44+
*/
45+
@ThreadLeakFilters(filters = TestClustersThreadFilter.class)
46+
public class PushQueriesIT extends ESRestTestCase {
47+
@ClassRule
48+
public static ElasticsearchCluster cluster = Clusters.testCluster();
49+
50+
// Values of 0 to 256 characters: the equality is expected to show up as a lucene query on test.keyword, with no filter left in compute.
// NOTE(review): presumably 256 is the ignore_above of the dynamically mapped test.keyword sub-field — confirm.
public void testPushEqualityOnDefaults() throws IOException {
51+
String value = "v".repeat(between(0, 256));
52+
testPushQuery(value, """
53+
FROM test
54+
| WHERE test == "%value"
55+
""", "#test.keyword:%value -_ignored:test.keyword", false);
56+
}
57+
58+
// Values of 257 to 1000 characters: lucene is expected to run a match-all query and the filter is expected to run in compute.
public void testPushEqualityOnDefaultsTooBigToPush() throws IOException {
59+
String value = "a".repeat(between(257, 1000));
60+
testPushQuery(value, """
61+
FROM test
62+
| WHERE test == "%value"
63+
""", "*:*", true);
64+
}
65+
66+
// Equality on TO_LOWER(test): lucene is expected to run a match-all query and the filter is expected to run in compute, even for short values.
public void testPushCaseInsensitiveEqualityOnDefaults() throws IOException {
67+
String value = "a".repeat(between(0, 256));
68+
testPushQuery(value, """
69+
FROM test
70+
| WHERE TO_LOWER(test) == "%value"
71+
""", "*:*", true);
72+
}
73+
74+
/**
 * Indexes {@code value}, runs {@code esqlQuery} with profiling enabled and checks both the result and the profile.
 * Every occurrence of {@code %value} in {@code esqlQuery} and {@code luceneQuery} is replaced with {@code value}.
 *
 * @param value the single value indexed into the {@code test} field and expected back as the only row
 * @param esqlQuery the ESQL query to run; {@code | KEEP test} is appended to it
 * @param luceneQuery the query the {@code LuceneSourceOperator} is expected to report in {@code processed_queries}
 * @param filterInCompute whether the {@code data} driver is expected to contain a {@code FilterOperator} and a {@code LimitOperator}
 */
private void testPushQuery(String value, String esqlQuery, String luceneQuery, boolean filterInCompute) throws IOException {
75+
indexValue(value);
76+
77+
// NOTE(review): replaceAll treats "%value" as a regex and value as a replacement template ('$' and '\' are special).
// That is harmless for the callers in this class, which only pass repeated letters.
RestEsqlTestCase.RequestObjectBuilder builder = requestObjectBuilder().query(
78+
esqlQuery.replaceAll("%value", value) + "\n| KEEP test"
79+
);
80+
builder.profile(true);
81+
Map<String, Object> result = runEsql(builder, new AssertWarnings.NoWarnings(), RestEsqlTestCase.Mode.SYNC);
82+
// Expect a single text column named "test" holding exactly the one indexed value.
assertResultMap(
83+
result,
84+
getResultMatcher(result).entry("profile", matchesMap().entry("drivers", instanceOf(List.class))),
85+
matchesList().item(matchesMap().entry("name", "test").entry("type", "text")),
86+
equalTo(List.of(List.of(value)))
87+
);
88+
89+
@SuppressWarnings("unchecked")
90+
List<Map<String, Object>> profiles = (List<Map<String, Object>>) ((Map<String, Object>) result.get("profile")).get("drivers");
91+
for (Map<String, Object> p : profiles) {
92+
fixTypesOnProfile(p);
93+
assertThat(p, commonProfile());
94+
// Collect the operator names of this driver, in order, checking the lucene query on the way.
List<String> sig = new ArrayList<>();
95+
@SuppressWarnings("unchecked")
96+
List<Map<String, Object>> operators = (List<Map<String, Object>>) p.get("operators");
97+
for (Map<String, Object> o : operators) {
98+
sig.add(checkOperatorProfile(o, luceneQuery.replaceAll("%value", value)));
99+
}
100+
String description = p.get("task_description").toString();
101+
// The expected operator chain depends on which kind of driver this profile describes.
switch (description) {
102+
case "data" -> {
103+
ListMatcher matcher = matchesList().item("LuceneSourceOperator").item("ValuesSourceReaderOperator");
104+
if (filterInCompute) {
105+
matcher = matcher.item("FilterOperator").item("LimitOperator");
106+
}
107+
matcher = matcher.item("ProjectOperator").item("ExchangeSinkOperator");
108+
assertMap(sig, matcher);
109+
}
110+
case "node_reduce" -> assertMap(sig, matchesList().item("ExchangeSourceOperator").item("ExchangeSinkOperator"));
111+
case "final" -> assertMap(
112+
sig,
113+
matchesList().item("ExchangeSourceOperator").item("LimitOperator").item("ProjectOperator").item("OutputOperator")
114+
);
115+
default -> throw new IllegalArgumentException("can't match " + description);
116+
}
117+
}
118+
}
119+
120+
/**
 * Creates the {@code test} index with one shard and no explicit mappings, then sends a bulk request with the
 * {@code refresh} parameter set that creates one document whose {@code test} field is {@code value}.
 * Asserts that both requests succeed.
 */
private void indexValue(String value) throws IOException {
121+
Request createIndex = new Request("PUT", "test");
122+
createIndex.setJsonEntity("""
123+
{
124+
"settings": {
125+
"index": {
126+
"number_of_shards": 1
127+
}
128+
}
129+
}""");
130+
Response createResponse = client().performRequest(createIndex);
131+
assertThat(
132+
entityToMap(createResponse.getEntity(), XContentType.JSON),
133+
matchesMap().entry("shards_acknowledged", true).entry("index", "test").entry("acknowledged", true)
134+
);
135+
136+
Request bulk = new Request("POST", "/_bulk");
137+
bulk.addParameter("refresh", "");
138+
// NOTE(review): value is spliced into the JSON body without escaping, so it must not contain quotes or backslashes.
bulk.setJsonEntity(String.format("""
139+
{"create":{"_index":"test"}}
140+
{"test":"%s"}
141+
""", value));
142+
Response bulkResponse = client().performRequest(bulk);
143+
assertThat(entityToMap(bulkResponse.getEntity(), XContentType.JSON), matchesMap().entry("errors", false).extraOk());
144+
}
145+
146+
// Matches a '[' and everything after it (including line breaks); used to cut an operator description down to its name.
private static final Pattern TO_NAME = Pattern.compile("\\[.+", Pattern.DOTALL);
147+
148+
/**
 * Extracts the operator name from one operator profile and, when it is a {@code LuceneSourceOperator},
 * asserts that its status reports exactly {@code query} as the only processed query.
 *
 * @param o one entry of a driver profile's {@code operators} list
 * @param query the lucene query expected in {@code status.processed_queries}
 * @return the operator name with any {@code [...]} suffix removed
 */
private static String checkOperatorProfile(Map<String, Object> o, String query) {
149+
String name = (String) o.get("operator");
150+
name = TO_NAME.matcher(name).replaceAll("");
151+
if (name.equals("LuceneSourceOperator")) {
152+
MapMatcher expectedOp = matchesMap().entry("operator", startsWith(name))
153+
.entry("status", matchesMap().entry("processed_queries", List.of(query)).extraOk());
154+
assertMap(o, expectedOp);
155+
}
156+
return name;
157+
}
158+
159+
// Returns the HTTP addresses of the cluster declared by the class rule above.
@Override
160+
protected String getTestRestCluster() {
161+
return cluster.getHttpAddresses();
162+
}
163+
}

x-pack/plugin/esql/qa/server/single-node/src/javaRestTest/java/org/elasticsearch/xpack/esql/qa/single_node/RestEsqlIT.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -660,7 +660,8 @@ public void testForceSleepsProfile() throws IOException {
660660
}
661661

662662
public static MapMatcher commonProfile() {
663-
return matchesMap().entry("task_description", any(String.class))
663+
return matchesMap() //
664+
.entry("task_description", any(String.class))
664665
.entry("start_millis", greaterThan(0L))
665666
.entry("stop_millis", greaterThan(0L))
666667
.entry("iterations", greaterThan(0L))

x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/CsvTestsDataLoader.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@ public class CsvTestsDataLoader {
115115
private static final TestDataset ADDRESSES = new TestDataset("addresses");
116116
private static final TestDataset BOOKS = new TestDataset("books").withSetting("books-settings.json");
117117
private static final TestDataset SEMANTIC_TEXT = new TestDataset("semantic_text").withInferenceEndpoint(true);
118+
private static final TestDataset MV_TEXT = new TestDataset("mv_text");
118119

119120
public static final Map<String, TestDataset> CSV_DATASET_MAP = Map.ofEntries(
120121
Map.entry(EMPLOYEES.indexName, EMPLOYEES),
@@ -160,7 +161,8 @@ public class CsvTestsDataLoader {
160161
Map.entry(DISTANCES.indexName, DISTANCES),
161162
Map.entry(ADDRESSES.indexName, ADDRESSES),
162163
Map.entry(BOOKS.indexName, BOOKS),
163-
Map.entry(SEMANTIC_TEXT.indexName, SEMANTIC_TEXT)
164+
Map.entry(SEMANTIC_TEXT.indexName, SEMANTIC_TEXT),
165+
Map.entry(MV_TEXT.indexName, MV_TEXT)
164166
);
165167

166168
private static final EnrichConfig LANGUAGES_ENRICH = new EnrichConfig("languages_policy", "enrich-policy-languages.json");

x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/EsqlTestUtils.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -285,6 +285,11 @@ public byte[] max(String field, DataType dataType) {
285285
public boolean isSingleValue(String field) {
286286
return false;
287287
}
288+
289+
// This implementation never allows pushing equality to a synthetic source delegate, whatever the field name or value.
@Override
290+
public boolean canUseEqualityOnSyntheticSourceDelegate(String name, String value) {
291+
return false;
292+
}
288293
}
289294

290295
/**
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
@timestamp:date ,message:text
2+
2023-10-23T13:55:01.543Z,[Connected to 10.1.0.1, Banana]
3+
2023-10-23T13:55:01.544Z,Connected to 10.1.0.1
4+
2023-10-23T13:55:01.545Z,[Connected to 10.1.0.1, More than one hundred characters long so it isn't indexed by the sub keyword field with ignore_above:100]
5+
2023-10-23T13:55:01.546Z,More than one hundred characters long so it isn't indexed by the sub keyword field with ignore_above:100
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
{
2+
"properties" : {
3+
"emp_no" : {
4+
"type" : "integer"
5+
},
6+
"first_name" : {
7+
"type" : "keyword"
8+
},
9+
"gender" : {
10+
"type" : "text"
11+
},
12+
"languages" : {
13+
"type" : "byte"
14+
},
15+
"last_name" : {
16+
"type" : "keyword"
17+
},
18+
"salary" : {
19+
"type" : "integer"
20+
},
21+
"_meta_field": {
22+
"type" : "keyword"
23+
},
24+
"hire_date": {
25+
"type": "date"
26+
},
27+
"job": {
28+
"type": "text",
29+
"fields": {
30+
"raw": {
31+
"type": "keyword",
32+
"ignore_above": 4
33+
}
34+
}
35+
},
36+
"long_noidx": {
37+
"type": "long",
38+
"index": false,
39+
"doc_values": false
40+
}
41+
}
42+
}
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
{
2+
"properties": {
3+
"@timestamp": {
4+
"type": "date"
5+
},
6+
"message": {
7+
"type": "text",
8+
"fields": {
9+
"raw": {
10+
"type": "keyword",
11+
"ignore_above": 100
12+
}
13+
}
14+
}
15+
}
16+
}

x-pack/plugin/esql/qa/testFixtures/src/main/resources/string.csv-spec

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2302,3 +2302,27 @@ message:keyword
23022302
foo ( bar
23032303
// end::rlikeEscapingTripleQuotes-result[]
23042304
;
2305+
2306+
mvStringEquals
2307+
FROM mv_text
2308+
| WHERE message == "Connected to 10.1.0.1"
2309+
| KEEP @timestamp, message
2310+
;
2311+
warning:Line 2:9: evaluation of [message == \"Connected to 10.1.0.1\"] failed, treating result as null. Only first 20 failures recorded.
2312+
warning:Line 2:9: java.lang.IllegalArgumentException: single-value function encountered multi-value
2313+
2314+
@timestamp:date | message:text
2315+
2023-10-23T13:55:01.544Z|Connected to 10.1.0.1
2316+
;
2317+
2318+
mvStringEqualsLongString
2319+
FROM mv_text
2320+
| WHERE message == "More than one hundred characters long so it isn't indexed by the sub keyword field with ignore_above:100"
2321+
| KEEP @timestamp, message
2322+
;
2323+
warning:Line 2:9: evaluation of [message == \"More than one hundred characters long so it isn't indexed by the sub keyword field with ignore_above:100\"] failed, treating result as null. Only first 20 failures recorded.
2324+
warning:Line 2:9: java.lang.IllegalArgumentException: single-value function encountered multi-value
2325+
2326+
@timestamp:date | message:text
2327+
2023-10-23T13:55:01.546Z|More than one hundred characters long so it isn't indexed by the sub keyword field with ignore_above:100
2328+
;

0 commit comments

Comments
 (0)