Skip to content

Commit ffc2498

Browse files
committed
Store keyword fields that trip ignore_above in binary doc values
1 parent bb71252 commit ffc2498

File tree

4 files changed

+193
-24
lines changed

4 files changed

+193
-24
lines changed

modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/MatchOnlyTextFieldMapper.java

Lines changed: 52 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
import org.apache.lucene.document.Field;
1515
import org.apache.lucene.document.FieldType;
1616
import org.apache.lucene.document.StoredField;
17+
import org.apache.lucene.index.DocValues;
1718
import org.apache.lucene.index.IndexOptions;
1819
import org.apache.lucene.index.LeafReaderContext;
1920
import org.apache.lucene.index.Term;
@@ -30,6 +31,7 @@
3031
import org.apache.lucene.util.BytesRef;
3132
import org.apache.lucene.util.IOFunction;
3233
import org.elasticsearch.common.CheckedIntFunction;
34+
import org.elasticsearch.common.io.stream.ByteArrayStreamInput;
3335
import org.elasticsearch.common.lucene.Lucene;
3436
import org.elasticsearch.common.text.UTF8DecodingReader;
3537
import org.elasticsearch.common.unit.Fuzziness;
@@ -297,12 +299,18 @@ private IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IOExcepti
297299

298300
if (parent instanceof KeywordFieldMapper.KeywordFieldType keywordParent
299301
&& keywordParent.ignoreAbove().valuesPotentiallyIgnored()) {
300-
final String parentFallbackFieldName = keywordParent.syntheticSourceFallbackFieldName();
301302
if (parent.isStored()) {
302-
return storedFieldFetcher(parentFieldName, parentFallbackFieldName);
303+
// if the parent keyword field has ignore_above set, then any ignored values will be stored under a fallback field
304+
return combineFieldFetchers(
305+
storedFieldFetcher(parentFieldName),
306+
binaryDocValuesFieldFetcher(keywordParent.syntheticSourceFallbackFieldName())
307+
);
303308
} else if (parent.hasDocValues()) {
304309
var ifd = searchExecutionContext.getForField(parent, MappedFieldType.FielddataOperation.SEARCH);
305-
return combineFieldFetchers(docValuesFieldFetcher(ifd), storedFieldFetcher(parentFallbackFieldName));
310+
return combineFieldFetchers(
311+
docValuesFieldFetcher(ifd),
312+
binaryDocValuesFieldFetcher(keywordParent.syntheticSourceFallbackFieldName())
313+
);
306314
}
307315
}
308316

@@ -325,22 +333,16 @@ private IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IOExcepti
325333
final KeywordFieldMapper.KeywordFieldType keywordDelegate
326334
) {
327335
if (keywordDelegate.ignoreAbove().valuesPotentiallyIgnored()) {
328-
// because we don't know whether the delegate field will be ignored during parsing, we must also check the current field
329-
String fieldName = name();
330-
String fallbackName = syntheticSourceFallbackFieldName();
331-
332-
// delegate field names
333336
String delegateFieldName = keywordDelegate.name();
334-
String delegateFieldFallbackName = keywordDelegate.syntheticSourceFallbackFieldName();
337+
// because we don't know whether the delegate will ignore a value, we must also check the fallback field created by this
338+
// match_only_text field
339+
String fallbackName = syntheticSourceFallbackFieldName();
335340

336341
if (keywordDelegate.isStored()) {
337-
return storedFieldFetcher(delegateFieldName, delegateFieldFallbackName, fieldName, fallbackName);
342+
return storedFieldFetcher(delegateFieldName, fallbackName);
338343
} else if (keywordDelegate.hasDocValues()) {
339344
var ifd = searchExecutionContext.getForField(keywordDelegate, MappedFieldType.FielddataOperation.SEARCH);
340-
return combineFieldFetchers(
341-
docValuesFieldFetcher(ifd),
342-
storedFieldFetcher(delegateFieldFallbackName, fieldName, fallbackName)
343-
);
345+
return combineFieldFetchers(docValuesFieldFetcher(ifd), storedFieldFetcher(fallbackName));
344346
}
345347
}
346348

@@ -374,6 +376,42 @@ private static IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IO
374376
};
375377
}
376378

379+
/**
380+
* Used exclusively to load ignored values from binary doc values. These values are stored in a separate fallback field in order to
381+
* retain the original value and hence be able to support synthetic source.
382+
*/
383+
private static IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IOException>> binaryDocValuesFieldFetcher(
384+
String fieldName
385+
) {
386+
return context -> {
387+
var binaryDocValues = DocValues.getBinary(context.reader(), fieldName);
388+
return docId -> {
389+
if (binaryDocValues == null || binaryDocValues.advanceExact(docId) == false) {
390+
return List.of();
391+
}
392+
393+
// see KeywordFieldMapper.MultiValuedBinaryDocValuesField for context on how to decode these binary doc values back into
394+
// strings
395+
BytesRef docValuesBytes = binaryDocValues.binaryValue();
396+
397+
try (ByteArrayStreamInput stream = new ByteArrayStreamInput()) {
398+
stream.reset(docValuesBytes.bytes, docValuesBytes.offset, docValuesBytes.length);
399+
400+
int docValueCount = stream.readVInt();
401+
var values = new ArrayList<>(docValueCount);
402+
403+
for (int i = 0; i < docValueCount; i++) {
404+
// this function already knows how to decode the underlying bytes array, so no need to explicitly call VInt()
405+
BytesRef valueBytes = stream.readBytesRef();
406+
values.add(valueBytes.utf8ToString());
407+
}
408+
409+
return values;
410+
}
411+
};
412+
};
413+
}
414+
377415
private static IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IOException>> storedFieldFetcher(String... names) {
378416
var loader = StoredFieldLoader.create(false, Set.of(names));
379417
return context -> {

modules/mapper-extras/src/yamlRestTest/resources/rest-api-spec/test/match_only_text/10_basic.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -465,7 +465,7 @@ synthetic_source match_only_text as multi-field with ignored keyword as parent:
465465
id: "1"
466466
refresh: true
467467
body:
468-
foo: [ "Apache Lucene powers Elasticsearch", "Apache" ]
468+
foo: [ "Apache Lucene powers Elasticsearch", "Apache", "Apache Lucene" ]
469469

470470
- do:
471471
search:
@@ -477,7 +477,7 @@ synthetic_source match_only_text as multi-field with ignored keyword as parent:
477477

478478
- match: { "hits.total.value": 1 }
479479
- match:
480-
hits.hits.0._source.foo: [ "Apache", "Apache Lucene powers Elasticsearch" ]
480+
hits.hits.0._source.foo: [ "Apache", "Apache Lucene powers Elasticsearch", "Apache Lucene" ]
481481

482482
---
483483
synthetic_source match_only_text as multi-field with stored keyword as parent:
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the "Elastic License
4+
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
5+
* Public License v 1"; you may not use this file except in compliance with, at
6+
* your election, the "Elastic License 2.0", the "GNU Affero General Public
7+
* License v3.0 only", or the "Server Side Public License, v 1".
8+
*/
9+
10+
package org.elasticsearch.index.mapper;
11+
12+
import org.apache.lucene.index.BinaryDocValues;
13+
import org.apache.lucene.index.LeafReader;
14+
import org.apache.lucene.util.BytesRef;
15+
import org.elasticsearch.common.io.stream.ByteArrayStreamInput;
16+
import org.elasticsearch.xcontent.XContentBuilder;
17+
18+
import java.io.IOException;
19+
20+
public final class BinaryDocValuesSyntheticFieldLoaderLayer implements CompositeSyntheticFieldLoader.DocValuesLayer {
21+
22+
private final String fieldName;
23+
24+
// the binary doc values for a document are all encoded in a single binary array, which this stream knows how to read
25+
// the doc values in the array take the form of [doc value count][length of value 1][value 1][length of value 2][value 2]...
26+
private final ByteArrayStreamInput stream = new ByteArrayStreamInput();
27+
private BytesRef docValuesBytes;
28+
private int valueCount;
29+
30+
public BinaryDocValuesSyntheticFieldLoaderLayer(String fieldName) {
31+
this.fieldName = fieldName;
32+
}
33+
34+
@Override
35+
public long valueCount() {
36+
return valueCount;
37+
}
38+
39+
@Override
40+
public DocValuesLoader docValuesLoader(LeafReader leafReader, int[] docIdsInLeaf) throws IOException {
41+
BinaryDocValues docValues = leafReader.getBinaryDocValues(fieldName);
42+
43+
// there are no values associated with this field
44+
if (docValues == null) return null;
45+
46+
return docId -> {
47+
// there are no more documents to process
48+
if (docValues.advanceExact(docId) == false) {
49+
valueCount = 0;
50+
return false;
51+
}
52+
53+
// otherwise, extract the doc values into a stream to later read from
54+
docValuesBytes = docValues.binaryValue();
55+
stream.reset(docValuesBytes.bytes, docValuesBytes.offset, docValuesBytes.length);
56+
valueCount = stream.readVInt();
57+
58+
return hasValue();
59+
};
60+
}
61+
62+
@Override
63+
public boolean hasValue() {
64+
return valueCount > 0;
65+
}
66+
67+
@Override
68+
public void write(XContentBuilder b) throws IOException {
69+
for (int i = 0; i < valueCount; i++) {
70+
// this function already knows how to decode the underlying bytes array, so no need to explicitly call VInt()
71+
BytesRef valueBytes = stream.readBytesRef();
72+
b.value(valueBytes.utf8ToString());
73+
}
74+
}
75+
76+
@Override
77+
public String fieldName() {
78+
return fieldName;
79+
}
80+
81+
}

server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java

Lines changed: 58 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@
4040
import org.apache.lucene.util.automaton.CompiledAutomaton;
4141
import org.apache.lucene.util.automaton.CompiledAutomaton.AUTOMATON_TYPE;
4242
import org.apache.lucene.util.automaton.Operations;
43+
import org.elasticsearch.ElasticsearchException;
44+
import org.elasticsearch.common.io.stream.BytesStreamOutput;
4345
import org.elasticsearch.common.lucene.BytesRefs;
4446
import org.elasticsearch.common.lucene.Lucene;
4547
import org.elasticsearch.common.lucene.search.AutomatonQueries;
@@ -85,6 +87,7 @@
8587
import java.util.Arrays;
8688
import java.util.Collection;
8789
import java.util.Collections;
90+
import java.util.LinkedHashSet;
8891
import java.util.List;
8992
import java.util.Locale;
9093
import java.util.Map;
@@ -1248,7 +1251,14 @@ private boolean indexValue(DocumentParserContext context, XContentString value)
12481251
var utfBytes = value.bytes();
12491252
var bytesRef = new BytesRef(utfBytes.bytes(), utfBytes.offset(), utfBytes.length());
12501253
final String fieldName = fieldType().syntheticSourceFallbackFieldName();
1251-
context.doc().add(new StoredField(fieldName, bytesRef));
1254+
1255+
// store the value in a binary doc values field, create one if it doesn't exist
1256+
MultiValuedBinaryDocValuesField field = (MultiValuedBinaryDocValuesField) context.doc().getByKey(fieldName);
1257+
if (field == null) {
1258+
field = new MultiValuedBinaryDocValuesField(fieldName);
1259+
context.doc().addWithKey(fieldName, field);
1260+
}
1261+
field.add(bytesRef);
12521262
}
12531263

12541264
return false;
@@ -1416,15 +1426,55 @@ protected BytesRef preserve(BytesRef value) {
14161426
// extra copy of the field for supporting synthetic source. This layer will check that copy.
14171427
if (fieldType().ignoreAbove.valuesPotentiallyIgnored()) {
14181428
final String fieldName = fieldType().syntheticSourceFallbackFieldName();
1419-
layers.add(new CompositeSyntheticFieldLoader.StoredFieldLayer(fieldName) {
1420-
@Override
1421-
protected void writeValue(Object value, XContentBuilder b) throws IOException {
1422-
BytesRef ref = (BytesRef) value;
1423-
b.utf8Value(ref.bytes, ref.offset, ref.length);
1424-
}
1425-
});
1429+
layers.add(new BinaryDocValuesSyntheticFieldLoaderLayer(fieldName));
14261430
}
14271431

14281432
return new CompositeSyntheticFieldLoader(leafFieldName, fullFieldName, layers);
14291433
}
1434+
1435+
/**
1436+
* A custom implementation of {@link org.apache.lucene.index.BinaryDocValues} that uses a {@link Set} to maintain a collection of unique
1437+
* binary doc values for fields with multiple values per document.
1438+
*/
1439+
private static final class MultiValuedBinaryDocValuesField extends CustomDocValuesField {
1440+
1441+
private final Set<BytesRef> uniqueValues;
1442+
private int docValuesByteCount = 0;
1443+
1444+
MultiValuedBinaryDocValuesField(String name) {
1445+
super(name);
1446+
// linked hash set to maintain insertion order of elements
1447+
uniqueValues = new LinkedHashSet<>();
1448+
}
1449+
1450+
public void add(final BytesRef value) {
1451+
uniqueValues.add(value);
1452+
// might as well track these on the go as opposed to having to loop through all entries later
1453+
docValuesByteCount += value.length;
1454+
}
1455+
1456+
/**
1457+
* Encodes the collection of binary doc values as a single contiguous binary array, wrapped in {@link BytesRef}. This array takes
1458+
* the form of [doc value count][length of value 1][value 1][length of value 2][value 2]...
1459+
*/
1460+
@Override
1461+
public BytesRef binaryValue() {
1462+
int docValuesCount = uniqueValues.size();
1463+
// the + 1 is for the total doc values count, which is prefixed at the start of the array
1464+
int streamSize = docValuesByteCount + (docValuesCount + 1) * Integer.BYTES;
1465+
1466+
try (BytesStreamOutput out = new BytesStreamOutput(streamSize)) {
1467+
out.writeVInt(docValuesCount);
1468+
for (BytesRef value : uniqueValues) {
1469+
int valueLength = value.length;
1470+
out.writeVInt(valueLength);
1471+
out.writeBytes(value.bytes, value.offset, valueLength);
1472+
}
1473+
return out.bytes().toBytesRef();
1474+
} catch (IOException e) {
1475+
throw new ElasticsearchException("Failed to get binary value", e);
1476+
}
1477+
}
1478+
1479+
}
14301480
}

0 commit comments

Comments
 (0)