Skip to content

Commit c6cdc01

Browse files
authored
Revert "Store keyword fields that trip ignore_above in binary doc values (#137483)" (#138858)
This reverts commit 6783135. Closes #138857
1 parent ca09313 commit c6cdc01

File tree

7 files changed

+97
-250
lines changed

7 files changed

+97
-250
lines changed

modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/MatchOnlyTextFieldMapper.java

Lines changed: 28 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,6 @@
1414
import org.apache.lucene.document.Field;
1515
import org.apache.lucene.document.FieldType;
1616
import org.apache.lucene.document.StoredField;
17-
import org.apache.lucene.index.BinaryDocValues;
18-
import org.apache.lucene.index.DocValues;
1917
import org.apache.lucene.index.IndexOptions;
2018
import org.apache.lucene.index.LeafReaderContext;
2119
import org.apache.lucene.index.Term;
@@ -32,7 +30,6 @@
3230
import org.apache.lucene.util.BytesRef;
3331
import org.apache.lucene.util.IOFunction;
3432
import org.elasticsearch.common.CheckedIntFunction;
35-
import org.elasticsearch.common.io.stream.ByteArrayStreamInput;
3633
import org.elasticsearch.common.lucene.Lucene;
3734
import org.elasticsearch.common.text.UTF8DecodingReader;
3835
import org.elasticsearch.common.unit.Fuzziness;
@@ -42,7 +39,6 @@
4239
import org.elasticsearch.index.analysis.NamedAnalyzer;
4340
import org.elasticsearch.index.fielddata.FieldDataContext;
4441
import org.elasticsearch.index.fielddata.IndexFieldData;
45-
import org.elasticsearch.index.fielddata.SortedBinaryDocValues;
4642
import org.elasticsearch.index.fielddata.SourceValueFetcherSortedBinaryIndexFieldData;
4743
import org.elasticsearch.index.fielddata.StoredFieldSortedBinaryIndexFieldData;
4844
import org.elasticsearch.index.fieldvisitor.StoredFieldLoader;
@@ -301,17 +297,12 @@ private IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IOExcepti
301297

302298
if (parent instanceof KeywordFieldMapper.KeywordFieldType keywordParent
303299
&& keywordParent.ignoreAbove().valuesPotentiallyIgnored()) {
300+
final String parentFallbackFieldName = keywordParent.syntheticSourceFallbackFieldName();
304301
if (parent.isStored()) {
305-
return combineFieldFetchers(
306-
storedFieldFetcher(parentFieldName),
307-
ignoredValuesDocValuesFieldFetcher(keywordParent.syntheticSourceFallbackFieldName())
308-
);
302+
return storedFieldFetcher(parentFieldName, parentFallbackFieldName);
309303
} else if (parent.hasDocValues()) {
310304
var ifd = searchExecutionContext.getForField(parent, MappedFieldType.FielddataOperation.SEARCH);
311-
return combineFieldFetchers(
312-
docValuesFieldFetcher(ifd),
313-
ignoredValuesDocValuesFieldFetcher(keywordParent.syntheticSourceFallbackFieldName())
314-
);
305+
return combineFieldFetchers(docValuesFieldFetcher(ifd), storedFieldFetcher(parentFallbackFieldName));
315306
}
316307
}
317308

@@ -334,16 +325,22 @@ private IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IOExcepti
334325
final KeywordFieldMapper.KeywordFieldType keywordDelegate
335326
) {
336327
if (keywordDelegate.ignoreAbove().valuesPotentiallyIgnored()) {
337-
String delegateFieldName = keywordDelegate.name();
338-
// bc we don't know whether the delegate will ignore a value, we must also check the fallback field created by this
339-
// match_only_text field
328+
// because we don't know whether the delegate field will be ignored during parsing, we must also check the current field
329+
String fieldName = name();
340330
String fallbackName = syntheticSourceFallbackFieldName();
341331

332+
// delegate field names
333+
String delegateFieldName = keywordDelegate.name();
334+
String delegateFieldFallbackName = keywordDelegate.syntheticSourceFallbackFieldName();
335+
342336
if (keywordDelegate.isStored()) {
343-
return storedFieldFetcher(delegateFieldName, fallbackName);
337+
return storedFieldFetcher(delegateFieldName, delegateFieldFallbackName, fieldName, fallbackName);
344338
} else if (keywordDelegate.hasDocValues()) {
345339
var ifd = searchExecutionContext.getForField(keywordDelegate, MappedFieldType.FielddataOperation.SEARCH);
346-
return combineFieldFetchers(docValuesFieldFetcher(ifd), storedFieldFetcher(fallbackName));
340+
return combineFieldFetchers(
341+
docValuesFieldFetcher(ifd),
342+
storedFieldFetcher(delegateFieldFallbackName, fieldName, fallbackName)
343+
);
347344
}
348345
}
349346

@@ -358,34 +355,25 @@ private IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IOExcepti
358355
}
359356
}
360357

361-
private IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IOException>> docValuesFieldFetcher(IndexFieldData<?> ifd) {
362-
return context -> {
363-
SortedBinaryDocValues indexedValuesDocValues = ifd.load(context).getBytesValues();
364-
return docId -> getValuesFromDocValues(indexedValuesDocValues, docId);
365-
};
366-
}
367-
368-
private IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IOException>> ignoredValuesDocValuesFieldFetcher(
369-
String fieldName
358+
private static IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IOException>> docValuesFieldFetcher(
359+
IndexFieldData<?> ifd
370360
) {
371361
return context -> {
372-
CustomBinaryDocValues ignoredValuesDocValues = new CustomBinaryDocValues(DocValues.getBinary(context.reader(), fieldName));
373-
return docId -> getValuesFromDocValues(ignoredValuesDocValues, docId);
362+
var sortedBinaryDocValues = ifd.load(context).getBytesValues();
363+
return docId -> {
364+
if (sortedBinaryDocValues.advanceExact(docId)) {
365+
var values = new ArrayList<>(sortedBinaryDocValues.docValueCount());
366+
for (int i = 0; i < sortedBinaryDocValues.docValueCount(); i++) {
367+
values.add(sortedBinaryDocValues.nextValue().utf8ToString());
368+
}
369+
return values;
370+
} else {
371+
return List.of();
372+
}
373+
};
374374
};
375375
}
376376

377-
private List<Object> getValuesFromDocValues(SortedBinaryDocValues docValues, int docId) throws IOException {
378-
if (docValues.advanceExact(docId)) {
379-
var values = new ArrayList<>(docValues.docValueCount());
380-
for (int i = 0; i < docValues.docValueCount(); i++) {
381-
values.add(docValues.nextValue().utf8ToString());
382-
}
383-
return values;
384-
} else {
385-
return List.of();
386-
}
387-
}
388-
389377
private static IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IOException>> storedFieldFetcher(String... names) {
390378
var loader = StoredFieldLoader.create(false, Set.of(names));
391379
return context -> {
@@ -791,46 +779,4 @@ protected void writeValue(Object value, XContentBuilder b) throws IOException {
791779

792780
return fieldLoader;
793781
}
794-
795-
/**
796-
* A wrapper around {@link BinaryDocValues} that exposes some quality of life functions. Note, these values are not sorted.
797-
*/
798-
private static class CustomBinaryDocValues extends SortedBinaryDocValues {
799-
800-
private final BinaryDocValues binaryDocValues;
801-
private final ByteArrayStreamInput stream;
802-
803-
private int docValueCount = 0;
804-
805-
CustomBinaryDocValues(BinaryDocValues binaryDocValues) {
806-
this.binaryDocValues = binaryDocValues;
807-
this.stream = new ByteArrayStreamInput();
808-
}
809-
810-
@Override
811-
public BytesRef nextValue() throws IOException {
812-
// this function already knows how to decode the underlying bytes array, so no need to explicitly call VInt()
813-
return stream.readBytesRef();
814-
}
815-
816-
@Override
817-
public boolean advanceExact(int docId) throws IOException {
818-
// if document has a value, read underlying bytes
819-
if (binaryDocValues.advanceExact(docId)) {
820-
BytesRef docValuesBytes = binaryDocValues.binaryValue();
821-
stream.reset(docValuesBytes.bytes, docValuesBytes.offset, docValuesBytes.length);
822-
docValueCount = stream.readVInt();
823-
return true;
824-
}
825-
826-
// otherwise there is nothing to do
827-
docValueCount = 0;
828-
return false;
829-
}
830-
831-
@Override
832-
public int docValueCount() {
833-
return docValueCount;
834-
}
835-
}
836782
}

modules/mapper-extras/src/yamlRestTest/resources/rest-api-spec/test/match_only_text/10_basic.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -465,7 +465,7 @@ synthetic_source match_only_text as multi-field with ignored keyword as parent:
465465
id: "1"
466466
refresh: true
467467
body:
468-
foo: [ "Apache Lucene powers Elasticsearch", "Apache", "Apache Lucene" ]
468+
foo: [ "Apache Lucene powers Elasticsearch", "Apache" ]
469469

470470
- do:
471471
search:
@@ -477,7 +477,7 @@ synthetic_source match_only_text as multi-field with ignored keyword as parent:
477477

478478
- match: { "hits.total.value": 1 }
479479
- match:
480-
hits.hits.0._source.foo: [ "Apache", "Apache Lucene powers Elasticsearch", "Apache Lucene" ]
480+
hits.hits.0._source.foo: [ "Apache", "Apache Lucene powers Elasticsearch" ]
481481

482482
---
483483
synthetic_source match_only_text as multi-field with stored keyword as parent:

server/src/main/java/org/elasticsearch/index/mapper/BinaryDocValuesSyntheticFieldLoaderLayer.java

Lines changed: 0 additions & 84 deletions
This file was deleted.

server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java

Lines changed: 8 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,6 @@
4040
import org.apache.lucene.util.automaton.CompiledAutomaton;
4141
import org.apache.lucene.util.automaton.CompiledAutomaton.AUTOMATON_TYPE;
4242
import org.apache.lucene.util.automaton.Operations;
43-
import org.elasticsearch.ElasticsearchException;
44-
import org.elasticsearch.common.io.stream.BytesStreamOutput;
4543
import org.elasticsearch.common.lucene.BytesRefs;
4644
import org.elasticsearch.common.lucene.Lucene;
4745
import org.elasticsearch.common.lucene.search.AutomatonQueries;
@@ -91,7 +89,6 @@
9189
import java.util.Arrays;
9290
import java.util.Collection;
9391
import java.util.Collections;
94-
import java.util.LinkedHashSet;
9592
import java.util.List;
9693
import java.util.Locale;
9794
import java.util.Map;
@@ -1174,14 +1171,7 @@ private boolean indexValue(DocumentParserContext context, XContentString value)
11741171
var utfBytes = value.bytes();
11751172
var bytesRef = new BytesRef(utfBytes.bytes(), utfBytes.offset(), utfBytes.length());
11761173
final String fieldName = fieldType().syntheticSourceFallbackFieldName();
1177-
1178-
// store the value in a binary doc values field, create one if it doesn't exist
1179-
MultiValuedBinaryDocValuesField field = (MultiValuedBinaryDocValuesField) context.doc().getByKey(fieldName);
1180-
if (field == null) {
1181-
field = new MultiValuedBinaryDocValuesField(fieldName);
1182-
context.doc().addWithKey(fieldName, field);
1183-
}
1184-
field.add(bytesRef);
1174+
context.doc().add(new StoredField(fieldName, bytesRef));
11851175
}
11861176

11871177
return false;
@@ -1344,56 +1334,15 @@ protected BytesRef preserve(BytesRef value) {
13441334
// extra copy of the field for supporting synthetic source. This layer will check that copy.
13451335
if (fieldType().ignoreAbove.valuesPotentiallyIgnored()) {
13461336
final String fieldName = fieldType().syntheticSourceFallbackFieldName();
1347-
layers.add(new BinaryDocValuesSyntheticFieldLoaderLayer(fieldName));
1348-
}
1349-
1350-
return new CompositeSyntheticFieldLoader(leafFieldName, fullFieldName, layers);
1351-
}
1352-
1353-
/**
1354-
* A custom implementation of {@link org.apache.lucene.index.BinaryDocValues} that uses a {@link Set} to maintain a collection of unique
1355-
* binary doc values for fields with multiple values per document.
1356-
*/
1357-
private static final class MultiValuedBinaryDocValuesField extends CustomDocValuesField {
1358-
1359-
private final Set<BytesRef> uniqueValues;
1360-
private int docValuesByteCount = 0;
1361-
1362-
MultiValuedBinaryDocValuesField(String name) {
1363-
super(name);
1364-
// linked hash set to maintain insertion order of elements
1365-
uniqueValues = new LinkedHashSet<>();
1366-
}
1367-
1368-
public void add(final BytesRef value) {
1369-
if (uniqueValues.add(value)) {
1370-
// might as well track these on the go as opposed to having to loop through all entries later
1371-
docValuesByteCount += value.length;
1372-
}
1373-
}
1374-
1375-
/**
1376-
* Encodes the collection of binary doc values as a single contiguous binary array, wrapped in {@link BytesRef}. This array takes
1377-
* the form of [doc value count][length of value 1][value 1][length of value 2][value 2]...
1378-
*/
1379-
@Override
1380-
public BytesRef binaryValue() {
1381-
int docValuesCount = uniqueValues.size();
1382-
// the + 1 is for the total doc values count, which is prefixed at the start of the array
1383-
int streamSize = docValuesByteCount + (docValuesCount + 1) * Integer.BYTES;
1384-
1385-
try (BytesStreamOutput out = new BytesStreamOutput(streamSize)) {
1386-
out.writeVInt(docValuesCount);
1387-
for (BytesRef value : uniqueValues) {
1388-
int valueLength = value.length;
1389-
out.writeVInt(valueLength);
1390-
out.writeBytes(value.bytes, value.offset, valueLength);
1337+
layers.add(new CompositeSyntheticFieldLoader.StoredFieldLayer(fieldName) {
1338+
@Override
1339+
protected void writeValue(Object value, XContentBuilder b) throws IOException {
1340+
BytesRef ref = (BytesRef) value;
1341+
b.utf8Value(ref.bytes, ref.offset, ref.length);
13911342
}
1392-
return out.bytes().toBytesRef();
1393-
} catch (IOException e) {
1394-
throw new ElasticsearchException("Failed to get binary value", e);
1395-
}
1343+
});
13961344
}
13971345

1346+
return new CompositeSyntheticFieldLoader(leafFieldName, fullFieldName, layers);
13981347
}
13991348
}

server/src/test/java/org/elasticsearch/index/mapper/KeywordSyntheticSourceNativeArrayIntegrationTests.java

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,6 @@ public void testSynthesizeArrayIgnoreAbove() throws Exception {
5252
.endObject()
5353
.endObject()
5454
.endObject();
55-
5655
// Note values that would be ignored are added at the end of arrays,
5756
// this makes testing easier as ignored values are always synthesized after regular values:
5857
var arrayValues = new Object[][] {
@@ -61,16 +60,7 @@ public void testSynthesizeArrayIgnoreAbove() throws Exception {
6160
new Object[] { "123", "1234", "12345" },
6261
new Object[] { null, null, null, "blabla" },
6362
new Object[] { "1", "2", "3", "blabla" } };
64-
65-
// values in the original array should be deduplicated
66-
var expectedArrayValues = new Object[][] {
67-
new Object[] { null, "a", "ab", "abc", "abcd", null, "abcde" },
68-
new Object[] { "12345" },
69-
new Object[] { "123", "1234", "12345" },
70-
new Object[] { null, null, null, "blabla" },
71-
new Object[] { "1", "2", "3", "blabla" } };
72-
73-
verifySyntheticArray(arrayValues, expectedArrayValues, mapping, "_id");
63+
verifySyntheticArray(arrayValues, mapping, "_id", "field._original");
7464
}
7565

7666
public void testSynthesizeObjectArray() throws Exception {

0 commit comments

Comments
 (0)