|
36 | 36 | import org.apache.lucene.util.automaton.CompiledAutomaton; |
37 | 37 | import org.apache.lucene.util.automaton.CompiledAutomaton.AUTOMATON_TYPE; |
38 | 38 | import org.apache.lucene.util.automaton.Operations; |
| 39 | +import org.elasticsearch.ElasticsearchException; |
| 40 | +import org.elasticsearch.common.io.stream.BytesStreamOutput; |
39 | 41 | import org.elasticsearch.common.lucene.BytesRefs; |
40 | 42 | import org.elasticsearch.common.lucene.Lucene; |
41 | 43 | import org.elasticsearch.common.lucene.search.AutomatonQueries; |
|
83 | 85 | import java.util.Arrays; |
84 | 86 | import java.util.Collection; |
85 | 87 | import java.util.Collections; |
| 88 | +import java.util.LinkedHashSet; |
86 | 89 | import java.util.List; |
87 | 90 | import java.util.Locale; |
88 | 91 | import java.util.Map; |
@@ -1245,7 +1248,14 @@ private boolean indexValue(DocumentParserContext context, XContentString value) |
1245 | 1248 | var utfBytes = value.bytes(); |
1246 | 1249 | var bytesRef = new BytesRef(utfBytes.bytes(), utfBytes.offset(), utfBytes.length()); |
1247 | 1250 | final String fieldName = fieldType().syntheticSourceFallbackFieldName(); |
1248 | | - context.doc().add(new StoredField(fieldName, bytesRef)); |
| 1251 | + |
| 1252 | + // store the value in a binary doc values field, create one if it doesn't exist |
| 1253 | + MultiValuedBinaryDocValuesField field = (MultiValuedBinaryDocValuesField) context.doc().getByKey(fieldName); |
| 1254 | + if (field == null) { |
| 1255 | + field = new MultiValuedBinaryDocValuesField(fieldName); |
| 1256 | + context.doc().addWithKey(fieldName, field); |
| 1257 | + } |
| 1258 | + field.add(bytesRef); |
1249 | 1259 | } |
1250 | 1260 |
|
1251 | 1261 | return false; |
@@ -1413,15 +1423,53 @@ protected BytesRef preserve(BytesRef value) { |
1413 | 1423 | // extra copy of the field for supporting synthetic source. This layer will check that copy. |
1414 | 1424 | if (fieldType().ignoreAbove.valuesPotentiallyIgnored()) { |
1415 | 1425 | final String fieldName = fieldType().syntheticSourceFallbackFieldName(); |
1416 | | - layers.add(new CompositeSyntheticFieldLoader.StoredFieldLayer(fieldName) { |
1417 | | - @Override |
1418 | | - protected void writeValue(Object value, XContentBuilder b) throws IOException { |
1419 | | - BytesRef ref = (BytesRef) value; |
1420 | | - b.utf8Value(ref.bytes, ref.offset, ref.length); |
1421 | | - } |
1422 | | - }); |
| 1426 | + layers.add(new BinaryDocValuesSyntheticFieldLoaderLayer(fieldName)); |
1423 | 1427 | } |
1424 | 1428 |
|
1425 | 1429 | return new CompositeSyntheticFieldLoader(leafFieldName, fullFieldName, layers); |
1426 | 1430 | } |
| 1431 | + |
| 1432 | + /**
| 1433 | + * A {@link CustomDocValuesField} that uses a {@link Set} to maintain a collection of unique binary doc values for fields with
| 1434 | + * multiple values per document, and exposes them through {@link #binaryValue()} as a single encoded {@link BytesRef}.
| 1435 | + */
| 1436 | + private static final class MultiValuedBinaryDocValuesField extends CustomDocValuesField {
| 1437 | +
| 1438 | + private final Set<BytesRef> uniqueValues;
| 1439 | +
| 1440 | + MultiValuedBinaryDocValuesField(String name) {
| 1441 | + super(name);
| 1442 | + // a linked hash set de-duplicates the added values while maintaining their insertion order
| 1443 | + uniqueValues = new LinkedHashSet<>();
| 1444 | + }
| 1445 | +
| 1446 | + public void add(final BytesRef value) {
| 1447 | + uniqueValues.add(value);
| 1448 | + }
| 1449 | +
| 1450 | + /**
| 1451 | + * Encodes the collection of binary doc values as a single contiguous binary array, wrapped in {@link BytesRef}. This array takes
| 1452 | + * the form of [doc value count][length of value 1][value 1][length of value 2][value 2]..., with the count and lengths written as vInts.
| 1453 | + */
| 1454 | + @Override
| 1455 | + public BytesRef binaryValue() {
| 1456 | + int docValuesByteCount = uniqueValues.stream().map(a -> a.length).reduce(0, Integer::sum);
| 1457 | + int docValuesCount = uniqueValues.size();
| 1458 | + // the + 1 is for the total doc values count prefixed at the start; each int is budgeted Integer.BYTES although it is written as a vInt
| 1459 | + int streamSize = docValuesByteCount + (docValuesCount + 1) * Integer.BYTES;
| 1460 | +
| 1461 | + try (BytesStreamOutput out = new BytesStreamOutput(streamSize)) {
| 1462 | + out.writeVInt(docValuesCount);
| 1463 | + for (BytesRef value : uniqueValues) {
| 1464 | + int valueLength = value.length;
| 1465 | + out.writeVInt(valueLength);
| 1466 | + out.writeBytes(value.bytes, value.offset, valueLength);
| 1467 | + }
| 1468 | + return out.bytes().toBytesRef();
| 1469 | + } catch (IOException e) {
| 1470 | + throw new ElasticsearchException("Failed to get binary value", e);
| 1471 | + }
| 1472 | + }
| 1473 | +
| 1474 | + }
1427 | 1475 | } |
0 commit comments