diff --git a/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/MatchOnlyTextFieldMapper.java b/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/MatchOnlyTextFieldMapper.java index a9edfc456db24..54842bc84693b 100644 --- a/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/MatchOnlyTextFieldMapper.java +++ b/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/MatchOnlyTextFieldMapper.java @@ -14,6 +14,8 @@ import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; import org.apache.lucene.document.StoredField; +import org.apache.lucene.index.BinaryDocValues; +import org.apache.lucene.index.DocValues; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.Term; @@ -30,6 +32,7 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IOFunction; import org.elasticsearch.common.CheckedIntFunction; +import org.elasticsearch.common.io.stream.ByteArrayStreamInput; import org.elasticsearch.common.lucene.Lucene; import org.elasticsearch.common.text.UTF8DecodingReader; import org.elasticsearch.common.unit.Fuzziness; @@ -39,6 +42,7 @@ import org.elasticsearch.index.analysis.NamedAnalyzer; import org.elasticsearch.index.fielddata.FieldDataContext; import org.elasticsearch.index.fielddata.IndexFieldData; +import org.elasticsearch.index.fielddata.SortedBinaryDocValues; import org.elasticsearch.index.fielddata.SourceValueFetcherSortedBinaryIndexFieldData; import org.elasticsearch.index.fielddata.StoredFieldSortedBinaryIndexFieldData; import org.elasticsearch.index.fieldvisitor.StoredFieldLoader; @@ -297,12 +301,17 @@ private IOFunction, IOExcepti if (parent instanceof KeywordFieldMapper.KeywordFieldType keywordParent && keywordParent.ignoreAbove().valuesPotentiallyIgnored()) { - final String parentFallbackFieldName = keywordParent.syntheticSourceFallbackFieldName(); if (parent.isStored()) { - return storedFieldFetcher(parentFieldName, parentFallbackFieldName); + return combineFieldFetchers( + storedFieldFetcher(parentFieldName), + ignoredValuesDocValuesFieldFetcher(keywordParent.syntheticSourceFallbackFieldName()) + ); } else if (parent.hasDocValues()) { var ifd = searchExecutionContext.getForField(parent, MappedFieldType.FielddataOperation.SEARCH); - return combineFieldFetchers(docValuesFieldFetcher(ifd), storedFieldFetcher(parentFallbackFieldName)); + return combineFieldFetchers( + docValuesFieldFetcher(ifd), + ignoredValuesDocValuesFieldFetcher(keywordParent.syntheticSourceFallbackFieldName()) + ); } } @@ -325,22 +334,16 @@ private IOFunction, IOExcepti final KeywordFieldMapper.KeywordFieldType keywordDelegate ) { if (keywordDelegate.ignoreAbove().valuesPotentiallyIgnored()) { - // because we don't know whether the delegate field will be ignored during parsing, we must also check the current field - String fieldName = name(); - String fallbackName = syntheticSourceFallbackFieldName(); - - // delegate field names String delegateFieldName = keywordDelegate.name(); - String delegateFieldFallbackName = keywordDelegate.syntheticSourceFallbackFieldName(); + // bc we don't know whether the delegate will ignore a value, we must also check the fallback field created by this + // match_only_text field + String fallbackName = syntheticSourceFallbackFieldName(); if (keywordDelegate.isStored()) { - return storedFieldFetcher(delegateFieldName, delegateFieldFallbackName, fieldName, fallbackName); + return storedFieldFetcher(delegateFieldName, fallbackName); } else if (keywordDelegate.hasDocValues()) { var ifd = searchExecutionContext.getForField(keywordDelegate, MappedFieldType.FielddataOperation.SEARCH); - return combineFieldFetchers( - docValuesFieldFetcher(ifd), - storedFieldFetcher(delegateFieldFallbackName, fieldName, fallbackName) - ); + return combineFieldFetchers(docValuesFieldFetcher(ifd), storedFieldFetcher(fallbackName)); } } @@ -355,25 +358,34 @@ private IOFunction, IOExcepti } } - private static IOFunction, IOException>> docValuesFieldFetcher( - IndexFieldData ifd + private IOFunction, IOException>> docValuesFieldFetcher(IndexFieldData ifd) { + return context -> { + SortedBinaryDocValues indexedValuesDocValues = ifd.load(context).getBytesValues(); + return docId -> getValuesFromDocValues(indexedValuesDocValues, docId); + }; + } + + private IOFunction, IOException>> ignoredValuesDocValuesFieldFetcher( + String fieldName ) { return context -> { - var sortedBinaryDocValues = ifd.load(context).getBytesValues(); - return docId -> { - if (sortedBinaryDocValues.advanceExact(docId)) { - var values = new ArrayList<>(sortedBinaryDocValues.docValueCount()); - for (int i = 0; i < sortedBinaryDocValues.docValueCount(); i++) { - values.add(sortedBinaryDocValues.nextValue().utf8ToString()); - } - return values; - } else { - return List.of(); - } - }; + CustomBinaryDocValues ignoredValuesDocValues = new CustomBinaryDocValues(DocValues.getBinary(context.reader(), fieldName)); + return docId -> getValuesFromDocValues(ignoredValuesDocValues, docId); }; } + private List getValuesFromDocValues(SortedBinaryDocValues docValues, int docId) throws IOException { + if (docValues.advanceExact(docId)) { + var values = new ArrayList<>(docValues.docValueCount()); + for (int i = 0; i < docValues.docValueCount(); i++) { + values.add(docValues.nextValue().utf8ToString()); + } + return values; + } else { + return List.of(); + } + } + private static IOFunction, IOException>> storedFieldFetcher(String... names) { var loader = StoredFieldLoader.create(false, Set.of(names)); return context -> { @@ -779,4 +791,46 @@ protected void writeValue(Object value, XContentBuilder b) throws IOException { return fieldLoader; } + + /** + * A wrapper around {@link BinaryDocValues} that exposes some quality of life functions. Note, these values are not sorted. + */ + private static class CustomBinaryDocValues extends SortedBinaryDocValues { + + private final BinaryDocValues binaryDocValues; + private final ByteArrayStreamInput stream; + + private int docValueCount = 0; + + CustomBinaryDocValues(BinaryDocValues binaryDocValues) { + this.binaryDocValues = binaryDocValues; + this.stream = new ByteArrayStreamInput(); + } + + @Override + public BytesRef nextValue() throws IOException { + // this function already knows how to decode the underlying bytes array, so no need to explicitly call VInt() + return stream.readBytesRef(); + } + + @Override + public boolean advanceExact(int docId) throws IOException { + // if document has a value, read underlying bytes + if (binaryDocValues.advanceExact(docId)) { + BytesRef docValuesBytes = binaryDocValues.binaryValue(); + stream.reset(docValuesBytes.bytes, docValuesBytes.offset, docValuesBytes.length); + docValueCount = stream.readVInt(); + return true; + } + + // otherwise there is nothing to do + docValueCount = 0; + return false; + } + + @Override + public int docValueCount() { + return docValueCount; + } + } } diff --git a/modules/mapper-extras/src/yamlRestTest/resources/rest-api-spec/test/match_only_text/10_basic.yml b/modules/mapper-extras/src/yamlRestTest/resources/rest-api-spec/test/match_only_text/10_basic.yml index 0050618beeb67..581841df3fe52 100644 --- a/modules/mapper-extras/src/yamlRestTest/resources/rest-api-spec/test/match_only_text/10_basic.yml +++ b/modules/mapper-extras/src/yamlRestTest/resources/rest-api-spec/test/match_only_text/10_basic.yml @@ -465,7 +465,7 @@ synthetic_source match_only_text as multi-field with ignored keyword as parent: id: "1" refresh: true body: - foo: [ "Apache Lucene powers Elasticsearch", "Apache" ] + foo: [ "Apache Lucene powers Elasticsearch", "Apache", "Apache Lucene" ] - do: search: @@ -477,7 +477,7 @@ synthetic_source match_only_text as multi-field with ignored keyword as parent: - match: { "hits.total.value": 1 } - match: - hits.hits.0._source.foo: [ "Apache", "Apache Lucene powers Elasticsearch" ] + hits.hits.0._source.foo: [ "Apache", "Apache Lucene powers Elasticsearch", "Apache Lucene" ] --- synthetic_source match_only_text as multi-field with stored keyword as parent: diff --git a/server/src/main/java/org/elasticsearch/index/mapper/BinaryDocValuesSyntheticFieldLoaderLayer.java b/server/src/main/java/org/elasticsearch/index/mapper/BinaryDocValuesSyntheticFieldLoaderLayer.java new file mode 100644 index 0000000000000..1f0c0be1f9555 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/index/mapper/BinaryDocValuesSyntheticFieldLoaderLayer.java @@ -0,0 +1,84 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.index.mapper; + +import org.apache.lucene.index.BinaryDocValues; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.common.io.stream.ByteArrayStreamInput; +import org.elasticsearch.xcontent.XContentBuilder; + +import java.io.IOException; + +public final class BinaryDocValuesSyntheticFieldLoaderLayer implements CompositeSyntheticFieldLoader.DocValuesLayer { + + private final String fieldName; + + // the binary doc values for a document are all encoded in a single binary array, which this stream knows how to read + // the doc values in the array take the form of [doc value count][length of value 1][value 1][length of value 2][value 2]... + private final ByteArrayStreamInput stream; + private int valueCount; + + public BinaryDocValuesSyntheticFieldLoaderLayer(String fieldName) { + this.fieldName = fieldName; + this.stream = new ByteArrayStreamInput(); + } + + @Override + public DocValuesLoader docValuesLoader(LeafReader leafReader, int[] docIdsInLeaf) throws IOException { + BinaryDocValues docValues = leafReader.getBinaryDocValues(fieldName); + + // there are no values associated with this field + if (docValues == null) { + valueCount = 0; + return null; + } + + return docId -> { + // there are no more documents to process + if (docValues.advanceExact(docId) == false) { + valueCount = 0; + return false; + } + + // otherwise, extract the doc values into a stream to later read from + BytesRef docValuesBytes = docValues.binaryValue(); + stream.reset(docValuesBytes.bytes, docValuesBytes.offset, docValuesBytes.length); + valueCount = stream.readVInt(); + + return hasValue(); + }; + } + + @Override + public void write(XContentBuilder b) throws IOException { + for (int i = 0; i < valueCount; i++) { + // this function already knows how to decode the underlying bytes array, so no need to explicitly call VInt() + BytesRef valueBytes = stream.readBytesRef(); + b.value(valueBytes.utf8ToString()); + } + } + + @Override + public boolean hasValue() { + return valueCount > 0; + } + + @Override + public long valueCount() { + return valueCount; + } + + @Override + public String fieldName() { + return fieldName; + } + +} diff --git a/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java index 31fd1e404d108..4d5f1c9a67ef9 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java @@ -40,6 +40,8 @@ import org.apache.lucene.util.automaton.CompiledAutomaton; import org.apache.lucene.util.automaton.CompiledAutomaton.AUTOMATON_TYPE; import org.apache.lucene.util.automaton.Operations; +import org.elasticsearch.ElasticsearchException; +import org.elasticsearch.common.io.stream.BytesStreamOutput; import org.elasticsearch.common.lucene.BytesRefs; import org.elasticsearch.common.lucene.Lucene; import org.elasticsearch.common.lucene.search.AutomatonQueries; @@ -87,6 +89,7 @@ import java.util.Arrays; import java.util.Collection; import java.util.Collections; +import java.util.LinkedHashSet; import java.util.List; import java.util.Locale; import java.util.Map; @@ -1153,7 +1156,14 @@ private boolean indexValue(DocumentParserContext context, XContentString value) var utfBytes = value.bytes(); var bytesRef = new BytesRef(utfBytes.bytes(), utfBytes.offset(), utfBytes.length()); final String fieldName = fieldType().syntheticSourceFallbackFieldName(); - context.doc().add(new StoredField(fieldName, bytesRef)); + + // store the value in a binary doc values field, create one if it doesn't exist + MultiValuedBinaryDocValuesField field = (MultiValuedBinaryDocValuesField) context.doc().getByKey(fieldName); + if (field == null) { + field = new MultiValuedBinaryDocValuesField(fieldName); + context.doc().addWithKey(fieldName, field); + } + field.add(bytesRef); } return false; @@ -1316,15 +1326,56 @@ protected BytesRef preserve(BytesRef value) { // extra copy of the field for supporting synthetic source. This layer will check that copy. if (fieldType().ignoreAbove.valuesPotentiallyIgnored()) { final String fieldName = fieldType().syntheticSourceFallbackFieldName(); - layers.add(new CompositeSyntheticFieldLoader.StoredFieldLayer(fieldName) { - @Override - protected void writeValue(Object value, XContentBuilder b) throws IOException { - BytesRef ref = (BytesRef) value; - b.utf8Value(ref.bytes, ref.offset, ref.length); - } - }); + layers.add(new BinaryDocValuesSyntheticFieldLoaderLayer(fieldName)); } return new CompositeSyntheticFieldLoader(leafFieldName, fullFieldName, layers); } + + /** + * A custom implementation of {@link org.apache.lucene.index.BinaryDocValues} that uses a {@link Set} to maintain a collection of unique + * binary doc values for fields with multiple values per document. + */ + private static final class MultiValuedBinaryDocValuesField extends CustomDocValuesField { + + private final Set uniqueValues; + private int docValuesByteCount = 0; + + MultiValuedBinaryDocValuesField(String name) { + super(name); + // linked hash set to maintain insertion order of elements + uniqueValues = new LinkedHashSet<>(); + } + + public void add(final BytesRef value) { + if (uniqueValues.add(value)) { + // might as well track these on the go as opposed to having to loop through all entries later + docValuesByteCount += value.length; + } + } + + /** + * Encodes the collection of binary doc values as a single contiguous binary array, wrapped in {@link BytesRef}. This array takes + * the form of [doc value count][length of value 1][value 1][length of value 2][value 2]... + */ + @Override + public BytesRef binaryValue() { + int docValuesCount = uniqueValues.size(); + // the + 1 is for the total doc values count, which is prefixed at the start of the array + int streamSize = docValuesByteCount + (docValuesCount + 1) * Integer.BYTES; + + try (BytesStreamOutput out = new BytesStreamOutput(streamSize)) { + out.writeVInt(docValuesCount); + for (BytesRef value : uniqueValues) { + int valueLength = value.length; + out.writeVInt(valueLength); + out.writeBytes(value.bytes, value.offset, valueLength); + } + return out.bytes().toBytesRef(); + } catch (IOException e) { + throw new ElasticsearchException("Failed to get binary value", e); + } + } + + } } diff --git a/server/src/test/java/org/elasticsearch/index/mapper/KeywordSyntheticSourceNativeArrayIntegrationTests.java b/server/src/test/java/org/elasticsearch/index/mapper/KeywordSyntheticSourceNativeArrayIntegrationTests.java index 41e0c644ee20e..ec71e07fc9231 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/KeywordSyntheticSourceNativeArrayIntegrationTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/KeywordSyntheticSourceNativeArrayIntegrationTests.java @@ -52,6 +52,7 @@ public void testSynthesizeArrayIgnoreAbove() throws Exception { .endObject() .endObject() .endObject(); + // Note values that would be ignored are added at the end of arrays, // this makes testing easier as ignored values are always synthesized after regular values: var arrayValues = new Object[][] { @@ -60,7 +61,16 @@ public void testSynthesizeArrayIgnoreAbove() throws Exception { new Object[] { "123", "1234", "12345" }, new Object[] { null, null, null, "blabla" }, new Object[] { "1", "2", "3", "blabla" } }; - verifySyntheticArray(arrayValues, mapping, "_id", "field._original"); + + // values in the original array should be deduplicated + var expectedArrayValues = new Object[][] { + new Object[] { null, "a", "ab", "abc", "abcd", null, "abcde" }, + new Object[] { "12345" }, + new Object[] { "123", "1234", "12345" }, + new Object[] { null, null, null, "blabla" }, + new Object[] { "1", "2", "3", "blabla" } }; + + verifySyntheticArray(arrayValues, expectedArrayValues, mapping, "_id"); } public void testSynthesizeObjectArray() throws Exception { diff --git a/test/framework/src/main/java/org/elasticsearch/index/mapper/NativeArrayIntegrationTestCase.java b/test/framework/src/main/java/org/elasticsearch/index/mapper/NativeArrayIntegrationTestCase.java index d1ab3c0907562..950626292d120 100644 --- a/test/framework/src/main/java/org/elasticsearch/index/mapper/NativeArrayIntegrationTestCase.java +++ b/test/framework/src/main/java/org/elasticsearch/index/mapper/NativeArrayIntegrationTestCase.java @@ -259,11 +259,17 @@ protected void verifySyntheticArray(Object[][] arrays, XContentBuilder mapping, private XContentBuilder arrayToSource(Object[] array) throws IOException { var source = jsonBuilder().startObject(); if (array != null) { - source.startArray("field"); - for (Object arrayValue : array) { - source.value(arrayValue); + // collapse array if it only consists of one element + // if the only element is null, then we'll skip synthesizing source for that field + if (array.length == 1 && array[0] != null) { + source.field("field", array[0]); + } else { + source.startArray("field"); + for (Object arrayValue : array) { + source.value(arrayValue); + } + source.endArray(); } - source.endArray(); } else { source.field("field").nullValue(); } diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java index ad28b0336d855..6dbf619e0c140 100644 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java @@ -17,10 +17,8 @@ import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; import org.apache.lucene.document.StoredField; -import org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexableField; -import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanClause.Occur; @@ -43,7 +41,6 @@ import org.apache.lucene.util.automaton.RegExp; import org.elasticsearch.ElasticsearchParseException; import org.elasticsearch.common.geo.ShapeRelation; -import org.elasticsearch.common.io.stream.ByteArrayStreamInput; import org.elasticsearch.common.lucene.BytesRefs; import org.elasticsearch.common.lucene.Lucene; import org.elasticsearch.common.time.DateMathParser; @@ -58,6 +55,7 @@ import org.elasticsearch.index.fielddata.FieldDataContext; import org.elasticsearch.index.fielddata.IndexFieldData; import org.elasticsearch.index.fielddata.plain.StringBinaryIndexFieldData; +import org.elasticsearch.index.mapper.BinaryDocValuesSyntheticFieldLoaderLayer; import org.elasticsearch.index.mapper.BinaryFieldMapper.CustomBinaryDocValuesField; import org.elasticsearch.index.mapper.BlockLoader; import org.elasticsearch.index.mapper.CompositeSyntheticFieldLoader; @@ -1106,7 +1104,7 @@ public FieldMapper.Builder getMergeBuilder() { protected SyntheticSourceSupport syntheticSourceSupport() { return new SyntheticSourceSupport.Native(() -> { var layers = new ArrayList(); - layers.add(new WildcardSyntheticFieldLoader()); + layers.add(new BinaryDocValuesSyntheticFieldLoaderLayer(fullPath())); if (ignoreAbove.valuesPotentiallyIgnored()) { layers.add(new CompositeSyntheticFieldLoader.StoredFieldLayer(originalName()) { @Override @@ -1120,54 +1118,4 @@ protected void writeValue(Object value, XContentBuilder b) throws IOException { }); } - private class WildcardSyntheticFieldLoader implements CompositeSyntheticFieldLoader.DocValuesLayer { - private final ByteArrayStreamInput docValuesStream = new ByteArrayStreamInput(); - private int docValueCount; - private BytesRef docValueBytes; - - @Override - public DocValuesLoader docValuesLoader(LeafReader leafReader, int[] docIdsInLeaf) throws IOException { - BinaryDocValues values = leafReader.getBinaryDocValues(fullPath()); - if (values == null) { - docValueCount = 0; - return null; - } - - return docId -> { - if (values.advanceExact(docId) == false) { - docValueCount = 0; - return hasValue(); - } - docValueBytes = values.binaryValue(); - docValuesStream.reset(docValueBytes.bytes); - docValuesStream.setPosition(docValueBytes.offset); - docValueCount = docValuesStream.readVInt(); - return hasValue(); - }; - } - - @Override - public boolean hasValue() { - return docValueCount > 0; - } - - @Override - public long valueCount() { - return docValueCount; - } - - @Override - public void write(XContentBuilder b) throws IOException { - for (int i = 0; i < docValueCount; i++) { - int length = docValuesStream.readVInt(); - b.utf8Value(docValueBytes.bytes, docValuesStream.getPosition(), length); - docValuesStream.skipBytes(length); - } - } - - @Override - public String fieldName() { - return fullPath(); - } - } }