Skip to content

Commit 8bf42f4

Browse files
committed
Store keyword fields that trip ignore_above in binary doc values
1 parent 7aa001c commit 8bf42f4

File tree

4 files changed

+196
-24
lines changed

4 files changed

+196
-24
lines changed

modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/MatchOnlyTextFieldMapper.java

Lines changed: 52 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
import org.apache.lucene.document.Field;
1515
import org.apache.lucene.document.FieldType;
1616
import org.apache.lucene.document.StoredField;
17+
import org.apache.lucene.index.DocValues;
1718
import org.apache.lucene.index.IndexOptions;
1819
import org.apache.lucene.index.LeafReaderContext;
1920
import org.apache.lucene.index.Term;
@@ -30,6 +31,7 @@
3031
import org.apache.lucene.util.BytesRef;
3132
import org.apache.lucene.util.IOFunction;
3233
import org.elasticsearch.common.CheckedIntFunction;
34+
import org.elasticsearch.common.io.stream.ByteArrayStreamInput;
3335
import org.elasticsearch.common.lucene.Lucene;
3436
import org.elasticsearch.common.text.UTF8DecodingReader;
3537
import org.elasticsearch.common.unit.Fuzziness;
@@ -297,12 +299,18 @@ private IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IOExcepti
297299

298300
if (parent instanceof KeywordFieldMapper.KeywordFieldType keywordParent
299301
&& keywordParent.ignoreAbove().valuesPotentiallyIgnored()) {
300-
final String parentFallbackFieldName = keywordParent.syntheticSourceFallbackFieldName();
301302
if (parent.isStored()) {
302-
return storedFieldFetcher(parentFieldName, parentFallbackFieldName);
303+
// if the parent keyword field has ignore_above set, then any ignored values will be stored under a fallback field
304+
return combineFieldFetchers(
305+
storedFieldFetcher(parentFieldName),
306+
binaryDocValuesFieldFetcher(keywordParent.syntheticSourceFallbackFieldName())
307+
);
303308
} else if (parent.hasDocValues()) {
304309
var ifd = searchExecutionContext.getForField(parent, MappedFieldType.FielddataOperation.SEARCH);
305-
return combineFieldFetchers(docValuesFieldFetcher(ifd), storedFieldFetcher(parentFallbackFieldName));
310+
return combineFieldFetchers(
311+
docValuesFieldFetcher(ifd),
312+
binaryDocValuesFieldFetcher(keywordParent.syntheticSourceFallbackFieldName())
313+
);
306314
}
307315
}
308316

@@ -325,22 +333,16 @@ private IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IOExcepti
325333
final KeywordFieldMapper.KeywordFieldType keywordDelegate
326334
) {
327335
if (keywordDelegate.ignoreAbove().valuesPotentiallyIgnored()) {
328-
// because we don't know whether the delegate field will be ignored during parsing, we must also check the current field
329-
String fieldName = name();
330-
String fallbackName = syntheticSourceFallbackFieldName();
331-
332-
// delegate field names
333336
String delegateFieldName = keywordDelegate.name();
334-
String delegateFieldFallbackName = keywordDelegate.syntheticSourceFallbackFieldName();
337+
// because we don't know whether the delegate will ignore a value, we must also check the fallback field created by this
338+
// match_only_text field
339+
String fallbackName = syntheticSourceFallbackFieldName();
335340

336341
if (keywordDelegate.isStored()) {
337-
return storedFieldFetcher(delegateFieldName, delegateFieldFallbackName, fieldName, fallbackName);
342+
return storedFieldFetcher(delegateFieldName, fallbackName);
338343
} else if (keywordDelegate.hasDocValues()) {
339344
var ifd = searchExecutionContext.getForField(keywordDelegate, MappedFieldType.FielddataOperation.SEARCH);
340-
return combineFieldFetchers(
341-
docValuesFieldFetcher(ifd),
342-
storedFieldFetcher(delegateFieldFallbackName, fieldName, fallbackName)
343-
);
345+
return combineFieldFetchers(docValuesFieldFetcher(ifd), storedFieldFetcher(fallbackName));
344346
}
345347
}
346348

@@ -374,6 +376,42 @@ private static IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IO
374376
};
375377
}
376378

379+
/**
380+
* Used exclusively to load ignored values from binary doc values. These values are stored in a separate fallback field in order to
381+
* retain the original value and hence be able to support synthetic source.
382+
*/
383+
private static IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IOException>> binaryDocValuesFieldFetcher(
384+
String fieldName
385+
) {
386+
return context -> {
387+
var binaryDocValues = DocValues.getBinary(context.reader(), fieldName);
388+
return docId -> {
389+
if (binaryDocValues == null || binaryDocValues.advanceExact(docId) == false) {
390+
return List.of();
391+
}
392+
393+
// see KeywordFieldMapper.MultiValuedBinaryDocValuesField for context on how to decode these binary doc values back into
394+
// strings
395+
BytesRef docValuesBytes = binaryDocValues.binaryValue();
396+
397+
try (ByteArrayStreamInput stream = new ByteArrayStreamInput()) {
398+
stream.reset(docValuesBytes.bytes, docValuesBytes.offset, docValuesBytes.length);
399+
400+
int docValueCount = stream.readVInt();
401+
var values = new ArrayList<>(docValueCount);
402+
403+
for (int i = 0; i < docValueCount; i++) {
404+
// this function already knows how to decode the underlying bytes array, so no need to explicitly call VInt()
405+
BytesRef valueBytes = stream.readBytesRef();
406+
values.add(valueBytes.utf8ToString());
407+
}
408+
409+
return values;
410+
}
411+
};
412+
};
413+
}
414+
377415
private static IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IOException>> storedFieldFetcher(String... names) {
378416
var loader = StoredFieldLoader.create(false, Set.of(names));
379417
return context -> {

modules/mapper-extras/src/yamlRestTest/resources/rest-api-spec/test/match_only_text/10_basic.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -465,7 +465,7 @@ synthetic_source match_only_text as multi-field with ignored keyword as parent:
465465
id: "1"
466466
refresh: true
467467
body:
468-
foo: [ "Apache Lucene powers Elasticsearch", "Apache" ]
468+
foo: [ "Apache Lucene powers Elasticsearch", "Apache", "Apache Lucene" ]
469469

470470
- do:
471471
search:
@@ -477,7 +477,7 @@ synthetic_source match_only_text as multi-field with ignored keyword as parent:
477477

478478
- match: { "hits.total.value": 1 }
479479
- match:
480-
hits.hits.0._source.foo: [ "Apache", "Apache Lucene powers Elasticsearch" ]
480+
hits.hits.0._source.foo: [ "Apache", "Apache Lucene powers Elasticsearch", "Apache Lucene" ]
481481

482482
---
483483
synthetic_source match_only_text as multi-field with stored keyword as parent:
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the "Elastic License
4+
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
5+
* Public License v 1"; you may not use this file except in compliance with, at
6+
* your election, the "Elastic License 2.0", the "GNU Affero General Public
7+
* License v3.0 only", or the "Server Side Public License, v 1".
8+
*/
9+
10+
package org.elasticsearch.index.mapper;
11+
12+
import org.apache.lucene.index.BinaryDocValues;
13+
import org.apache.lucene.index.LeafReader;
14+
import org.apache.lucene.util.BytesRef;
15+
import org.elasticsearch.common.io.stream.ByteArrayStreamInput;
16+
import org.elasticsearch.xcontent.XContentBuilder;
17+
18+
import java.io.IOException;
19+
20+
public final class BinaryDocValuesSyntheticFieldLoaderLayer implements CompositeSyntheticFieldLoader.DocValuesLayer {
21+
22+
private final String fieldName;
23+
24+
// the binary doc values for a document are all encoded in a single binary array, which this stream knows how to read
25+
// the doc values in the array take the form of [doc value count][length of value 1][value 1][length of value 2][value 2]...
26+
private final ByteArrayStreamInput stream = new ByteArrayStreamInput();
27+
private BytesRef docValuesBytes;
28+
private int valueCount;
29+
30+
public BinaryDocValuesSyntheticFieldLoaderLayer(String fieldName) {
31+
this.fieldName = fieldName;
32+
}
33+
34+
@Override
35+
public long valueCount() {
36+
return valueCount;
37+
}
38+
39+
@Override
40+
public DocValuesLoader docValuesLoader(LeafReader leafReader, int[] docIdsInLeaf) throws IOException {
41+
BinaryDocValues docValues = leafReader.getBinaryDocValues(fieldName);
42+
43+
// there are no values associated with this field
44+
if (docValues == null) return null;
45+
46+
return docId -> {
47+
// there are no more documents to process
48+
if (docValues.advanceExact(docId) == false) {
49+
valueCount = 0;
50+
return false;
51+
}
52+
53+
// otherwise, extract the doc values into a stream to later read from
54+
docValuesBytes = docValues.binaryValue();
55+
stream.reset(docValuesBytes.bytes, docValuesBytes.offset, docValuesBytes.length);
56+
valueCount = stream.readVInt();
57+
58+
return hasValue();
59+
};
60+
}
61+
62+
@Override
63+
public boolean hasValue() {
64+
return valueCount > 0;
65+
}
66+
67+
@Override
68+
public void write(XContentBuilder b) throws IOException {
69+
for (int i = 0; i < valueCount; i++) {
70+
// extract the length of the ith value and serialize that many bytes into XContentBuilder
71+
int valueLength = stream.readVInt();
72+
b.utf8Value(docValuesBytes.bytes, stream.getPosition(), valueLength);
73+
74+
// finally, skip over the bytes we've just serialized to prepare for the next value
75+
stream.skipBytes(valueLength);
76+
}
77+
}
78+
79+
@Override
80+
public String fieldName() {
81+
return fieldName;
82+
}
83+
84+
}

server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java

Lines changed: 58 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@
3636
import org.apache.lucene.util.automaton.CompiledAutomaton;
3737
import org.apache.lucene.util.automaton.CompiledAutomaton.AUTOMATON_TYPE;
3838
import org.apache.lucene.util.automaton.Operations;
39+
import org.elasticsearch.ElasticsearchException;
40+
import org.elasticsearch.common.io.stream.BytesStreamOutput;
3941
import org.elasticsearch.common.lucene.BytesRefs;
4042
import org.elasticsearch.common.lucene.Lucene;
4143
import org.elasticsearch.common.lucene.search.AutomatonQueries;
@@ -83,6 +85,7 @@
8385
import java.util.Arrays;
8486
import java.util.Collection;
8587
import java.util.Collections;
88+
import java.util.LinkedHashSet;
8689
import java.util.List;
8790
import java.util.Locale;
8891
import java.util.Map;
@@ -1245,7 +1248,14 @@ private boolean indexValue(DocumentParserContext context, XContentString value)
12451248
var utfBytes = value.bytes();
12461249
var bytesRef = new BytesRef(utfBytes.bytes(), utfBytes.offset(), utfBytes.length());
12471250
final String fieldName = fieldType().syntheticSourceFallbackFieldName();
1248-
context.doc().add(new StoredField(fieldName, bytesRef));
1251+
1252+
// store the value in a binary doc values field, create one if it doesn't exist
1253+
MultiValuedBinaryDocValuesField field = (MultiValuedBinaryDocValuesField) context.doc().getByKey(fieldName);
1254+
if (field == null) {
1255+
field = new MultiValuedBinaryDocValuesField(fieldName);
1256+
context.doc().addWithKey(fieldName, field);
1257+
}
1258+
field.add(bytesRef);
12491259
}
12501260

12511261
return false;
@@ -1413,15 +1423,55 @@ protected BytesRef preserve(BytesRef value) {
14131423
// extra copy of the field for supporting synthetic source. This layer will check that copy.
14141424
if (fieldType().ignoreAbove.valuesPotentiallyIgnored()) {
14151425
final String fieldName = fieldType().syntheticSourceFallbackFieldName();
1416-
layers.add(new CompositeSyntheticFieldLoader.StoredFieldLayer(fieldName) {
1417-
@Override
1418-
protected void writeValue(Object value, XContentBuilder b) throws IOException {
1419-
BytesRef ref = (BytesRef) value;
1420-
b.utf8Value(ref.bytes, ref.offset, ref.length);
1421-
}
1422-
});
1426+
layers.add(new BinaryDocValuesSyntheticFieldLoaderLayer(fieldName));
14231427
}
14241428

14251429
return new CompositeSyntheticFieldLoader(leafFieldName, fullFieldName, layers);
14261430
}
1431+
1432+
/**
1433+
* A custom implementation of {@link org.apache.lucene.index.BinaryDocValues} that uses a {@link Set} to maintain a collection of unique
1434+
* binary doc values for fields with multiple values per document.
1435+
*/
1436+
private static final class MultiValuedBinaryDocValuesField extends CustomDocValuesField {
1437+
1438+
private final Set<BytesRef> uniqueValues;
1439+
private int docValuesByteCount = 0;
1440+
1441+
MultiValuedBinaryDocValuesField(String name) {
1442+
super(name);
1443+
// linked hash set to maintain insertion order of elements
1444+
uniqueValues = new LinkedHashSet<>();
1445+
}
1446+
1447+
public void add(final BytesRef value) {
1448+
uniqueValues.add(value);
1449+
// might as well track these on the go as opposed to having to loop through all entries later
1450+
docValuesByteCount += value.length;
1451+
}
1452+
1453+
/**
1454+
* Encodes the collection of binary doc values as a single contiguous binary array, wrapped in {@link BytesRef}. This array takes
1455+
* the form of [doc value count][length of value 1][value 1][length of value 2][value 2]...
1456+
*/
1457+
@Override
1458+
public BytesRef binaryValue() {
1459+
int docValuesCount = uniqueValues.size();
1460+
// the + 1 is for the total doc values count, which is prefixed at the start of the array
1461+
int streamSize = docValuesByteCount + (docValuesCount + 1) * Integer.BYTES;
1462+
1463+
try (BytesStreamOutput out = new BytesStreamOutput(streamSize)) {
1464+
out.writeVInt(docValuesCount);
1465+
for (BytesRef value : uniqueValues) {
1466+
int valueLength = value.length;
1467+
out.writeVInt(valueLength);
1468+
out.writeBytes(value.bytes, value.offset, valueLength);
1469+
}
1470+
return out.bytes().toBytesRef();
1471+
} catch (IOException e) {
1472+
throw new ElasticsearchException("Failed to get binary value", e);
1473+
}
1474+
}
1475+
1476+
}
14271477
}

0 commit comments

Comments
 (0)