Skip to content

Commit 2a64367

Browse files
committed
Store keyword fields that trip ignore_above in binary doc values
1 parent 7aa001c commit 2a64367

File tree

2 files changed

+141
-8
lines changed

2 files changed

+141
-8
lines changed
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the "Elastic License
4+
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
5+
* Public License v 1"; you may not use this file except in compliance with, at
6+
* your election, the "Elastic License 2.0", the "GNU Affero General Public
7+
* License v3.0 only", or the "Server Side Public License, v 1".
8+
*/
9+
10+
package org.elasticsearch.index.mapper;
11+
12+
import org.apache.lucene.index.BinaryDocValues;
13+
import org.apache.lucene.index.LeafReader;
14+
import org.apache.lucene.util.BytesRef;
15+
import org.elasticsearch.common.io.stream.ByteArrayStreamInput;
16+
import org.elasticsearch.xcontent.XContentBuilder;
17+
18+
import java.io.IOException;
19+
20+
public final class BinaryDocValuesSyntheticFieldLoaderLayer implements CompositeSyntheticFieldLoader.DocValuesLayer {
21+
22+
private final String fieldName;
23+
24+
// the binary doc values for a document are all encoded in a single binary array, which this stream knows how to read
25+
// the doc values in the array take the form of [doc value count][length of value 1][value 1][length of value 2][value 2]...
26+
private final ByteArrayStreamInput stream = new ByteArrayStreamInput();
27+
private BytesRef docValuesBytes;
28+
private int valueCount;
29+
30+
public BinaryDocValuesSyntheticFieldLoaderLayer(String fieldName) {
31+
this.fieldName = fieldName;
32+
}
33+
34+
@Override
35+
public long valueCount() {
36+
return valueCount;
37+
}
38+
39+
@Override
40+
public DocValuesLoader docValuesLoader(LeafReader leafReader, int[] docIdsInLeaf) throws IOException {
41+
BinaryDocValues docValues = leafReader.getBinaryDocValues(fieldName);
42+
43+
// there are no values associated with this field
44+
if (docValues == null) return null;
45+
46+
return docId -> {
47+
// there are no more documents to process
48+
if (docValues.advanceExact(docId) == false) {
49+
valueCount = 0;
50+
return false;
51+
}
52+
53+
// otherwise, extract the doc values into a stream to later read from
54+
docValuesBytes = docValues.binaryValue();
55+
stream.reset(docValuesBytes.bytes);
56+
stream.setPosition(docValuesBytes.offset);
57+
valueCount = stream.readVInt();
58+
59+
return hasValue();
60+
};
61+
}
62+
63+
@Override
64+
public boolean hasValue() {
65+
return valueCount > 0;
66+
}
67+
68+
@Override
69+
public void write(XContentBuilder b) throws IOException {
70+
for (int i = 0; i < valueCount; i++) {
71+
// extract the length of the ith value and serialize that many bytes into XContentBuilder
72+
int valueLength = stream.readVInt();
73+
b.utf8Value(docValuesBytes.bytes, stream.getPosition(), valueLength);
74+
75+
// finally, skip over the bytes we've just serialized to prepare for the next value
76+
stream.skipBytes(valueLength);
77+
}
78+
}
79+
80+
@Override
81+
public String fieldName() {
82+
return fieldName;
83+
}
84+
85+
}

server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java

Lines changed: 56 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@
3636
import org.apache.lucene.util.automaton.CompiledAutomaton;
3737
import org.apache.lucene.util.automaton.CompiledAutomaton.AUTOMATON_TYPE;
3838
import org.apache.lucene.util.automaton.Operations;
39+
import org.elasticsearch.ElasticsearchException;
40+
import org.elasticsearch.common.io.stream.BytesStreamOutput;
3941
import org.elasticsearch.common.lucene.BytesRefs;
4042
import org.elasticsearch.common.lucene.Lucene;
4143
import org.elasticsearch.common.lucene.search.AutomatonQueries;
@@ -83,6 +85,7 @@
8385
import java.util.Arrays;
8486
import java.util.Collection;
8587
import java.util.Collections;
88+
import java.util.LinkedHashSet;
8689
import java.util.List;
8790
import java.util.Locale;
8891
import java.util.Map;
@@ -1245,7 +1248,14 @@ private boolean indexValue(DocumentParserContext context, XContentString value)
12451248
var utfBytes = value.bytes();
12461249
var bytesRef = new BytesRef(utfBytes.bytes(), utfBytes.offset(), utfBytes.length());
12471250
final String fieldName = fieldType().syntheticSourceFallbackFieldName();
1248-
context.doc().add(new StoredField(fieldName, bytesRef));
1251+
1252+
// store the value in a binary doc values field, create one if it doesn't exist
1253+
MultiValuedBinaryDocValuesField field = (MultiValuedBinaryDocValuesField) context.doc().getByKey(fieldName);
1254+
if (field == null) {
1255+
field = new MultiValuedBinaryDocValuesField(fieldName);
1256+
context.doc().addWithKey(fieldName, field);
1257+
}
1258+
field.add(bytesRef);
12491259
}
12501260

12511261
return false;
@@ -1413,15 +1423,53 @@ protected BytesRef preserve(BytesRef value) {
14131423
// extra copy of the field for supporting synthetic source. This layer will check that copy.
14141424
if (fieldType().ignoreAbove.valuesPotentiallyIgnored()) {
14151425
final String fieldName = fieldType().syntheticSourceFallbackFieldName();
1416-
layers.add(new CompositeSyntheticFieldLoader.StoredFieldLayer(fieldName) {
1417-
@Override
1418-
protected void writeValue(Object value, XContentBuilder b) throws IOException {
1419-
BytesRef ref = (BytesRef) value;
1420-
b.utf8Value(ref.bytes, ref.offset, ref.length);
1421-
}
1422-
});
1426+
layers.add(new BinaryDocValuesSyntheticFieldLoaderLayer(fieldName));
14231427
}
14241428

14251429
return new CompositeSyntheticFieldLoader(leafFieldName, fullFieldName, layers);
14261430
}
1431+
1432+
/**
1433+
* A custom implementation of {@link org.apache.lucene.index.BinaryDocValues} that uses a {@link Set} to maintain a collection of unique
1434+
* binary doc values for fields with multiple values per document.
1435+
*/
1436+
private static final class MultiValuedBinaryDocValuesField extends CustomDocValuesField {
1437+
1438+
private final Set<BytesRef> uniqueValues;
1439+
1440+
MultiValuedBinaryDocValuesField(String name) {
1441+
super(name);
1442+
// linked hash set to maintain insertion order of elements
1443+
uniqueValues = new LinkedHashSet<>();
1444+
}
1445+
1446+
public void add(final BytesRef value) {
1447+
uniqueValues.add(value);
1448+
}
1449+
1450+
/**
1451+
* Encodes the collection of binary doc values as a single contiguous binary array, wrapped in {@link BytesRef}. This array takes
1452+
* the form of [doc value count][length of value 1][value 1][length of value 2][value 2]...
1453+
*/
1454+
@Override
1455+
public BytesRef binaryValue() {
1456+
int docValuesByteCount = uniqueValues.stream().map(a -> a.length).reduce(0, Integer::sum);
1457+
int docValuesCount = uniqueValues.size();
1458+
// the + 1 is for the total doc values count, which is prefixed at the start of the array
1459+
int streamSize = docValuesByteCount + (docValuesCount + 1) * Integer.BYTES;
1460+
1461+
try (BytesStreamOutput out = new BytesStreamOutput(streamSize)) {
1462+
out.writeVInt(docValuesCount);
1463+
for (BytesRef value : uniqueValues) {
1464+
int valueLength = value.length;
1465+
out.writeVInt(valueLength);
1466+
out.writeBytes(value.bytes, value.offset, valueLength);
1467+
}
1468+
return out.bytes().toBytesRef();
1469+
} catch (IOException e) {
1470+
throw new ElasticsearchException("Failed to get binary value", e);
1471+
}
1472+
}
1473+
1474+
}
14271475
}

0 commit comments

Comments
 (0)