Skip to content

Commit edc7a61

Browse files
authored
Enable _terms_enum API for version fields (#93839)
The _terms_enum API currently supports only the keyword, constant_keyword and flattened field types. This change adds support for the `version` field type, which sorts according to the semantic versioning definition. Closes #83403
1 parent cf38fad commit edc7a61

File tree

12 files changed

+349
-77
lines changed

12 files changed

+349
-77
lines changed

docs/changelog/93839.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
pr: 93839
2+
summary: Enable _terms_enum on version fields
3+
area: Search
4+
type: enhancement
5+
issues:
6+
- 83403

docs/reference/search/terms-enum.asciidoc

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@
66

77
The terms enum API can be used to discover terms in the index that match
88
a partial string. Supported field types are <<keyword-field-type,`keyword`>>,
9-
<<constant-keyword-field-type,`constant_keyword`>> and
10-
<<flattened,`flattened`>>. This is used for auto-complete:
9+
<<constant-keyword-field-type,`constant_keyword`>>, <<flattened,`flattened`>>
10+
and <<version, `version`>>. This is used for auto-complete:
1111

1212
[source,console]
1313
--------------------------------------------------
@@ -109,4 +109,4 @@ query rewrites to `match_none`.
109109
(Optional, string)
110110
The string after which terms in the index should be returned. Allows for a form of
111111
pagination if the last result from one request is passed as the `search_after`
112-
parameter for a subsequent request.
112+
parameter for a subsequent request.

server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java

Lines changed: 1 addition & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
import org.apache.lucene.document.SortedSetDocValuesField;
1818
import org.apache.lucene.document.StoredField;
1919
import org.apache.lucene.index.DocValuesType;
20-
import org.apache.lucene.index.FilteredTermsEnum;
2120
import org.apache.lucene.index.IndexOptions;
2221
import org.apache.lucene.index.IndexReader;
2322
import org.apache.lucene.index.LeafReaderContext;
@@ -561,26 +560,6 @@ public TermsEnum getTerms(boolean caseInsensitive, String string, SearchExecutio
561560
return terms.intersect(automaton, searchBytes);
562561
}
563562

564-
// Initialises with a seek to a given term but excludes that term
565-
// from any results. The problem it addresses is that termsEnum.seekCeil()
566-
// would work but either leaves us positioned on the seek term (if it exists) or the
567-
// term after (if the seek term doesn't exist). That complicates any subsequent
568-
// iteration logic so this class simplifies the pagination use case.
569-
static final class SearchAfterTermsEnum extends FilteredTermsEnum {
570-
private final BytesRef afterRef;
571-
572-
SearchAfterTermsEnum(TermsEnum tenum, BytesRef termText) {
573-
super(tenum);
574-
afterRef = termText;
575-
setInitialSeekTerm(termText);
576-
}
577-
578-
@Override
579-
protected AcceptStatus accept(BytesRef term) {
580-
return term.equals(afterRef) ? AcceptStatus.NO : AcceptStatus.YES;
581-
}
582-
}
583-
584563
/**
585564
* A simple terms implementation for SortedSetDocValues that only provides access to {@link TermsEnum} via
586565
* {@link #iterator} and {@link #intersect(CompiledAutomaton, BytesRef)} methods.
@@ -898,6 +877,7 @@ public int ignoreAbove() {
898877
/**
899878
* @return true if field has been marked as a dimension field
900879
*/
880+
@Override
901881
public boolean isDimension() {
902882
return isDimension;
903883
}
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the Elastic License
4+
* 2.0 and the Server Side Public License, v 1; you may not use this file except
5+
* in compliance with, at your election, the Elastic License 2.0 or the Server
6+
* Side Public License, v 1.
7+
*/
8+
9+
package org.elasticsearch.index.mapper;
10+
11+
import org.apache.lucene.index.FilteredTermsEnum;
12+
import org.apache.lucene.index.TermsEnum;
13+
import org.apache.lucene.util.BytesRef;
14+
15+
/**
16+
* This terms enumeration initializes with a seek to a given term but excludes that term
17+
* from any results. The problem it addresses is that termsEnum.seekCeil()
18+
* would work but either leaves us positioned on the seek term (if it exists) or the
19+
* term after (if the seek term doesn't exist). That complicates any subsequent
20+
* iteration logic so this class simplifies the pagination use case.
21+
*/
22+
public final class SearchAfterTermsEnum extends FilteredTermsEnum {
23+
private final BytesRef afterRef;
24+
25+
public SearchAfterTermsEnum(TermsEnum tenum, BytesRef termText) {
26+
super(tenum);
27+
afterRef = termText;
28+
setInitialSeekTerm(termText);
29+
}
30+
31+
@Override
32+
protected AcceptStatus accept(BytesRef term) {
33+
return term.equals(afterRef) ? AcceptStatus.NO : AcceptStatus.YES;
34+
}
35+
}

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/termsenum/action/MultiShardTermsEnum.java

Lines changed: 45 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -22,14 +22,15 @@
2222
import org.apache.lucene.util.PriorityQueue;
2323

2424
import java.io.IOException;
25+
import java.util.ArrayList;
26+
import java.util.List;
27+
import java.util.function.Function;
2528

2629
/**
2730
* Merges terms from multiple TermsEnum instances
2831
* This does a merge sort, by term text.
29-
* Adapted from Lucene's MultiTermsEnum and differs in that:
30-
* 1) Only next(), term() and docFreq() methods are supported
31-
* 2) Doc counts are longs not ints.
32-
*
32+
* Adapted from Lucene's MultiTermsEnum and differs in that
33+
* only next() and decodedTerm() are supported.
3334
*/
3435
public final class MultiShardTermsEnum {
3536

@@ -38,45 +39,67 @@ public final class MultiShardTermsEnum {
3839

3940
private int numTop;
4041
private BytesRef current;
42+
private Function<Object, Object> termsDecoder;
43+
44+
private record ShardTermsEnum(TermsEnum termsEnum, Function<Object, Object> termsDecoder) {};
45+
46+
public static class Builder {
4147

42-
/** Sole constructor.
48+
private final List<ShardTermsEnum> shardTermsEnums = new ArrayList<>();
49+
50+
void add(TermsEnum termsEnum, Function<Object, Object> termsDecoder) {
51+
this.shardTermsEnums.add(new ShardTermsEnum(termsEnum, termsDecoder));
52+
}
53+
54+
MultiShardTermsEnum build() throws IOException {
55+
return new MultiShardTermsEnum(shardTermsEnums);
56+
}
57+
58+
int size() {
59+
return shardTermsEnums.size();
60+
}
61+
}
62+
63+
/**
4364
* @param enums TermsEnums from shards which we should merge
4465
* @throws IOException Errors accessing data
4566
**/
46-
public MultiShardTermsEnum(TermsEnum[] enums) throws IOException {
47-
queue = new TermMergeQueue(enums.length);
48-
top = new TermsEnumWithCurrent[enums.length];
67+
private MultiShardTermsEnum(List<ShardTermsEnum> enums) throws IOException {
68+
queue = new TermMergeQueue(enums.size());
69+
top = new TermsEnumWithCurrent[enums.size()];
4970
numTop = 0;
5071
queue.clear();
51-
for (int i = 0; i < enums.length; i++) {
52-
final TermsEnum termsEnum = enums[i];
72+
for (ShardTermsEnum shardEnum : enums) {
73+
final TermsEnum termsEnum = shardEnum.termsEnum();
5374
final BytesRef term = termsEnum.next();
5475
if (term != null) {
5576
final TermsEnumWithCurrent entry = new TermsEnumWithCurrent();
5677
entry.current = term;
57-
entry.terms = termsEnum;
78+
entry.termsEnum = termsEnum;
79+
entry.termsDecoder = shardEnum.termsDecoder();
5880
queue.add(entry);
5981
} else {
6082
// field has no terms
6183
}
6284
}
6385
}
6486

65-
public BytesRef term() {
66-
return current;
87+
public String decodedTerm() {
88+
return this.termsDecoder.apply(current).toString();
6789
}
6890

6991
private void pullTop() {
7092
assert numTop == 0;
7193
numTop = queue.fillTop(top);
7294
current = top[0].current;
95+
termsDecoder = top[0].termsDecoder;
7396
}
7497

7598
private void pushTop() throws IOException {
7699
// call next() on each top, and reorder queue
77100
for (int i = 0; i < numTop; i++) {
78101
TermsEnumWithCurrent termsEnum = queue.top();
79-
termsEnum.current = termsEnum.terms.next();
102+
termsEnum.current = termsEnum.termsEnum.next();
80103
if (termsEnum.current == null) {
81104
queue.pop();
82105
} else {
@@ -96,21 +119,13 @@ public BytesRef next() throws IOException {
96119
} else {
97120
current = null;
98121
}
99-
100122
return current;
101123
}
102124

103-
public long docFreq() throws IOException {
104-
long sum = 0;
105-
for (int i = 0; i < numTop; i++) {
106-
sum += top[i].terms.docFreq();
107-
}
108-
return sum;
109-
}
110-
111-
static final class TermsEnumWithCurrent {
112-
TermsEnum terms;
113-
public BytesRef current;
125+
private static final class TermsEnumWithCurrent {
126+
private Function<Object, Object> termsDecoder;
127+
private TermsEnum termsEnum;
128+
private BytesRef current;
114129
}
115130

116131
private static final class TermMergeQueue extends PriorityQueue<TermsEnumWithCurrent> {
@@ -126,8 +141,10 @@ protected boolean lessThan(TermsEnumWithCurrent termsA, TermsEnumWithCurrent ter
126141
return termsA.current.compareTo(termsB.current) < 0;
127142
}
128143

129-
/** Add the {@link #top()} slice as well as all slices that are positioned
130-
* on the same term to {@code tops} and return how many of them there are. */
144+
/**
145+
* Add the {@link #top()} slice as well as all slices that are positioned
146+
* on the same term to {@code tops} and return how many of them there are.
147+
*/
131148
int fillTop(TermsEnumWithCurrent[] tops) {
132149
final int size = size();
133150
if (size == 0) {

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/termsenum/action/TransportTermsEnumAction.java

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
package org.elasticsearch.xpack.core.termsenum.action;
88

99
import org.apache.lucene.index.TermsEnum;
10-
import org.apache.lucene.util.BytesRef;
1110
import org.apache.lucene.util.PriorityQueue;
1211
import org.elasticsearch.ExceptionsHelper;
1312
import org.elasticsearch.action.ActionListener;
@@ -329,9 +328,9 @@ protected NodeTermsEnumResponse dataNodeOperation(NodeTermsEnumRequest request,
329328
long timeout_millis = request.timeout();
330329
long scheduledEnd = request.nodeStartedTimeMillis() + timeout_millis;
331330

332-
ArrayList<TermsEnum> shardTermsEnums = new ArrayList<>();
333331
ArrayList<Closeable> openedResources = new ArrayList<>();
334332
try {
333+
MultiShardTermsEnum.Builder teBuilder = new MultiShardTermsEnum.Builder();
335334
for (ShardId shardId : request.shardIds()) {
336335
// Check we haven't just arrived on a node and time is up already.
337336
if (System.currentTimeMillis() > scheduledEnd) {
@@ -359,15 +358,15 @@ protected NodeTermsEnumResponse dataNodeOperation(NodeTermsEnumRequest request,
359358
request.searchAfter()
360359
);
361360
if (terms != null) {
362-
shardTermsEnums.add(terms);
361+
teBuilder.add(terms, mappedFieldType::valueForDisplay);
363362
}
364363
}
365364
}
366-
if (shardTermsEnums.size() == 0) {
365+
if (teBuilder.size() == 0) {
367366
// No term enums available
368367
return new NodeTermsEnumResponse(request.nodeId(), termsList, error, true);
369368
}
370-
MultiShardTermsEnum te = new MultiShardTermsEnum(shardTermsEnums.toArray(new TermsEnum[0]));
369+
MultiShardTermsEnum te = teBuilder.build();
371370

372371
int shard_size = request.size();
373372
// All the above prep might take a while - do a timer check now before we continue further.
@@ -387,8 +386,7 @@ protected NodeTermsEnumResponse dataNodeOperation(NodeTermsEnumRequest request,
387386
}
388387
termCount = 0;
389388
}
390-
BytesRef bytes = te.term();
391-
termsList.add(bytes.utf8ToString());
389+
termsList.add(te.decodedTerm());
392390
if (termsList.size() >= shard_size) {
393391
break;
394392
}
Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
* 2.0; you may not use this file except in compliance with the Elastic License
55
* 2.0.
66
*/
7-
package org.elasticsearch.xpack.core.termsenum;
7+
package org.elasticsearch.xpack.core.termsenum.action;
88

99
import org.apache.lucene.document.Document;
1010
import org.apache.lucene.document.Field;
@@ -18,6 +18,7 @@
1818
import org.apache.lucene.store.ByteBuffersDirectory;
1919
import org.apache.lucene.store.Directory;
2020
import org.apache.lucene.tests.analysis.MockAnalyzer;
21+
import org.apache.lucene.util.BytesRef;
2122
import org.apache.lucene.util.automaton.Automata;
2223
import org.apache.lucene.util.automaton.Automaton;
2324
import org.apache.lucene.util.automaton.CompiledAutomaton;
@@ -26,8 +27,6 @@
2627
import org.elasticsearch.common.lucene.search.AutomatonQueries;
2728
import org.elasticsearch.core.IOUtils;
2829
import org.elasticsearch.test.ESTestCase;
29-
import org.elasticsearch.xpack.core.termsenum.action.MultiShardTermsEnum;
30-
import org.elasticsearch.xpack.core.termsenum.action.SimpleTermCountEnum;
3130

3231
import java.io.Closeable;
3332
import java.util.ArrayList;
@@ -75,7 +74,7 @@ public void testRandomIndexFusion() throws Exception {
7574
a = MinimizationOperations.minimize(a, Integer.MAX_VALUE);
7675
CompiledAutomaton automaton = new CompiledAutomaton(a);
7776

78-
ArrayList<TermsEnum> termsEnums = new ArrayList<>();
77+
MultiShardTermsEnum.Builder builder = new MultiShardTermsEnum.Builder();
7978
for (DirectoryReader reader : readers) {
8079
Terms terms = MultiTerms.getTerms(reader, fieldName);
8180
TermsEnum te = automaton.getTermsEnum(terms);
@@ -86,13 +85,12 @@ public void testRandomIndexFusion() throws Exception {
8685
while (te.next() != null) {
8786
termCounts.add(te.term().utf8ToString());
8887
}
89-
SimpleTermCountEnum simpleEnum = new SimpleTermCountEnum(termCounts.toArray(new String[0]));
90-
termsEnums.add(simpleEnum);
88+
builder.add(new SimpleTermCountEnum(termCounts.toArray(new String[0])), o -> ((BytesRef) o).utf8ToString());
9189
} else {
92-
termsEnums.add(te);
90+
builder.add(te, o -> ((BytesRef) o).utf8ToString());
9391
}
9492
}
95-
MultiShardTermsEnum mte = new MultiShardTermsEnum(termsEnums.toArray(new TermsEnum[0]));
93+
MultiShardTermsEnum mte = builder.build();
9694
Set<String> expecteds = new HashSet<>();
9795

9896
for (String term : globalTermCounts) {
@@ -102,7 +100,7 @@ public void testRandomIndexFusion() throws Exception {
102100
}
103101

104102
while (mte.next() != null) {
105-
String teString = mte.term().utf8ToString();
103+
String teString = mte.decodedTerm();
106104
assertTrue(expecteds.contains(teString));
107105
expecteds.remove(teString);
108106
}

0 commit comments

Comments
 (0)