Moved the ignored-values doc values field fetcher inside of the existing doc values fetcher function

Kubik42 · Kubik42 · commit 23c0027c8582 · 2025-11-07T16:02:47.000-08:00
diff --git a/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/MatchOnlyTextFieldMapper.java b/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/MatchOnlyTextFieldMapper.java
@@ -14,6 +14,7 @@
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.StoredField;
+import org.apache.lucene.index.BinaryDocValues;
 import org.apache.lucene.index.DocValues;
 import org.apache.lucene.index.IndexOptions;
 import org.apache.lucene.index.LeafReaderContext;
@@ -28,19 +29,21 @@
 import org.apache.lucene.search.PrefixQuery;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.store.ByteArrayDataInput;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IOFunction;
 import org.elasticsearch.common.CheckedIntFunction;
-import org.elasticsearch.common.io.stream.ByteArrayStreamInput;
 import org.elasticsearch.common.lucene.Lucene;
 import org.elasticsearch.common.text.UTF8DecodingReader;
 import org.elasticsearch.common.unit.Fuzziness;
 import org.elasticsearch.index.IndexVersion;
 import org.elasticsearch.index.IndexVersions;
 import org.elasticsearch.index.analysis.IndexAnalyzers;
 import org.elasticsearch.index.analysis.NamedAnalyzer;
+import org.elasticsearch.index.fielddata.AbstractBinaryDocValues;
 import org.elasticsearch.index.fielddata.FieldDataContext;
 import org.elasticsearch.index.fielddata.IndexFieldData;
+import org.elasticsearch.index.fielddata.SortedBinaryDocValues;
 import org.elasticsearch.index.fielddata.SourceValueFetcherSortedBinaryIndexFieldData;
 import org.elasticsearch.index.fielddata.StoredFieldSortedBinaryIndexFieldData;
 import org.elasticsearch.index.fieldvisitor.StoredFieldLoader;
@@ -299,18 +302,11 @@ private IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IOExcepti
 
             if (parent instanceof KeywordFieldMapper.KeywordFieldType keywordParent
                 && keywordParent.ignoreAbove().valuesPotentiallyIgnored()) {
+                var ifd = searchExecutionContext.getForField(parent, MappedFieldType.FielddataOperation.SEARCH);
                 if (parent.isStored()) {
-                    // if the parent keyword field has ignore_above set, then any ignored values will be stored under a fallback field
-                    return combineFieldFetchers(
-                        storedFieldFetcher(parentFieldName),
-                        binaryDocValuesFieldFetcher(keywordParent.syntheticSourceFallbackFieldName())
-                    );
+                    return combineFieldFetchers(storedFieldFetcher(parentFieldName), docValuesFieldFetcher(ifd));
                 } else if (parent.hasDocValues()) {
-                    var ifd = searchExecutionContext.getForField(parent, MappedFieldType.FielddataOperation.SEARCH);
-                    return combineFieldFetchers(
-                        docValuesFieldFetcher(ifd),
-                        binaryDocValuesFieldFetcher(keywordParent.syntheticSourceFallbackFieldName())
-                    );
+                    return docValuesFieldFetcher(ifd);
                 }
             }
 
@@ -357,57 +353,29 @@ private IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IOExcepti
             }
         }
 
-        private static IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IOException>> docValuesFieldFetcher(
-            IndexFieldData<?> ifd
-        ) {
+        private IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IOException>> docValuesFieldFetcher(IndexFieldData<?> ifd) {
             return context -> {
-                var sortedBinaryDocValues = ifd.load(context).getBytesValues();
-                return docId -> {
-                    if (sortedBinaryDocValues.advanceExact(docId)) {
-                        var values = new ArrayList<>(sortedBinaryDocValues.docValueCount());
-                        for (int i = 0; i < sortedBinaryDocValues.docValueCount(); i++) {
-                            values.add(sortedBinaryDocValues.nextValue().utf8ToString());
-                        }
-                        return values;
-                    } else {
-                        return List.of();
-                    }
-                };
-            };
-        }
+                SortedBinaryDocValues indexedValuesDocValues = ifd.load(context).getBytesValues();
+                CustomBinaryDocValues ignoredValuesDocValues = new CustomBinaryDocValues(
+                    DocValues.getBinary(context.reader(), ifd.getFieldName() + TextFamilyFieldType.FALLBACK_FIELD_NAME_SUFFIX)
+                );
 
-        /**
-         * Used exclusively to load ignored values from binary doc values. These values are stored in a separate fallback field in order to
-         * retain the original value and hence be able to support synthetic source.
-         */
-        private static IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IOException>> binaryDocValuesFieldFetcher(
-            String fieldName
-        ) {
-            return context -> {
-                var binaryDocValues = DocValues.getBinary(context.reader(), fieldName);
                 return docId -> {
-                    if (binaryDocValues == null || binaryDocValues.advanceExact(docId) == false) {
-                        return List.of();
-                    }
-
-                    // see KeywordFieldMapper.MultiValuedBinaryDocValuesField for context on how to decode these binary doc values back into
-                    // strings
-                    BytesRef docValuesBytes = binaryDocValues.binaryValue();
-
-                    try (ByteArrayStreamInput stream = new ByteArrayStreamInput()) {
-                        stream.reset(docValuesBytes.bytes, docValuesBytes.offset, docValuesBytes.length);
+                    int indexedValueCount = indexedValuesDocValues.advanceExact(docId) ? indexedValuesDocValues.docValueCount() : 0;
+                    int ignoredValueCount = ignoredValuesDocValues.advanceExact(docId) ? ignoredValuesDocValues.docValueCount() : 0;
+                    var values = new ArrayList<>(indexedValueCount + ignoredValueCount);
 
-                        int docValueCount = stream.readVInt();
-                        var values = new ArrayList<>(docValueCount);
-
-                        for (int i = 0; i < docValueCount; i++) {
-                            // this function already knows how to decode the underlying bytes array, so no need to explicitly call VInt()
-                            BytesRef valueBytes = stream.readBytesRef();
-                            values.add(valueBytes.utf8ToString());
-                        }
+                    // extract indexed values from doc values
+                    for (int i = 0; i < indexedValueCount; i++) {
+                        values.add(indexedValuesDocValues.nextValue().utf8ToString());
+                    }
 
-                        return values;
+                    // extract ignored values from doc values
+                    for (int i = 0; i < ignoredValueCount; i++) {
+                        values.add(ignoredValuesDocValues.nextValue().utf8ToString());
                     }
+
+                    return values;
                 };
             };
         }
@@ -817,4 +785,52 @@ protected void writeValue(Object value, XContentBuilder b) throws IOException {
 
         return fieldLoader;
     }
+
+    private static class CustomBinaryDocValues extends AbstractBinaryDocValues {
+
+        private final BinaryDocValues binaryDocValues;
+
+        private ByteArrayDataInput data;
+        private int docValueCount = 0;
+
+        CustomBinaryDocValues(BinaryDocValues binaryDocValues) {
+            this.binaryDocValues = binaryDocValues;
+        }
+
+        public BytesRef nextValue() {
+            // get the length of the value
+            int length = data.readVInt();
+
+            // read that many bytes from the underlying bytes array
+            // the read will automatically move the offset to the next value
+            byte[] valueBytes = new byte[length];
+            data.readBytes(valueBytes, 0, length);
+
+            return new BytesRef(valueBytes);
+        }
+
+        @Override
+        public BytesRef binaryValue() throws IOException {
+            return binaryDocValues.binaryValue();
+        }
+
+        @Override
+        public boolean advanceExact(int docId) throws IOException {
+            // if document has a value, read underlying bytes
+            if (binaryDocValues.advanceExact(docId)) {
+                BytesRef docValuesBytes = binaryDocValues.binaryValue();
+                data = new ByteArrayDataInput(docValuesBytes.bytes, docValuesBytes.offset, docValuesBytes.length);
+                docValueCount = data.readVInt();
+                return true;
+            }
+
+            // otherwise there is nothing to do
+            docValueCount = 0;
+            return false;
+        }
+
+        public int docValueCount() {
+            return docValueCount;
+        }
+    }
 }
diff --git a/server/src/main/java/org/elasticsearch/index/mapper/BinaryDocValuesSyntheticFieldLoaderLayer.java b/server/src/main/java/org/elasticsearch/index/mapper/BinaryDocValuesSyntheticFieldLoaderLayer.java
@@ -11,8 +11,8 @@
 
 import org.apache.lucene.index.BinaryDocValues;
 import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.store.ByteArrayDataInput;
 import org.apache.lucene.util.BytesRef;
-import org.elasticsearch.common.io.stream.ByteArrayStreamInput;
 import org.elasticsearch.xcontent.XContentBuilder;
 
 import java.io.IOException;
@@ -23,7 +23,7 @@ public final class BinaryDocValuesSyntheticFieldLoaderLayer implements Composite
 
     // the binary doc values for a document are all encoded in a single binary array, which this stream knows how to read
     // the doc values in the array take the form of [doc value count][length of value 1][value 1][length of value 2][value 2]...
-    private final ByteArrayStreamInput stream = new ByteArrayStreamInput();
+    private final ByteArrayDataInput data = new ByteArrayDataInput();
     private int valueCount;
 
     public BinaryDocValuesSyntheticFieldLoaderLayer(String fieldName) {
@@ -49,8 +49,8 @@ public DocValuesLoader docValuesLoader(LeafReader leafReader, int[] docIdsInLeaf
 
             // otherwise, extract the doc values into a stream to later read from
             BytesRef docValuesBytes = docValues.binaryValue();
-            stream.reset(docValuesBytes.bytes, docValuesBytes.offset, docValuesBytes.length);
-            valueCount = stream.readVInt();
+            data.reset(docValuesBytes.bytes, docValuesBytes.offset, docValuesBytes.length);
+            valueCount = data.readVInt();
 
             return hasValue();
         };
@@ -59,9 +59,16 @@ public DocValuesLoader docValuesLoader(LeafReader leafReader, int[] docIdsInLeaf
     @Override
     public void write(XContentBuilder b) throws IOException {
         for (int i = 0; i < valueCount; i++) {
-            // this function already knows how to decode the underlying bytes array, so no need to explicitly call VInt()
-            BytesRef valueBytes = stream.readBytesRef();
-            b.value(valueBytes.utf8ToString());
+            // read the length of the value
+            int length = data.readVInt();
+
+            // read that many bytes from the input
+            // the read will automatically move the offset to the next value
+            byte[] valueBytes = new byte[length];
+            data.readBytes(valueBytes, 0, length);
+
+            // finally, write those bytes into XContentBuilder
+            b.value(new BytesRef(valueBytes).utf8ToString());
         }
     }
 
diff --git a/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java
@@ -1452,7 +1452,6 @@ public void add(final BytesRef value) {
                 // might as well track these on the go as opposed to having to loop through all entries later
                 docValuesByteCount += value.length;
             }
-            ;
         }
 
         /**

Original file line number	Diff line number	Diff line change
`@@ -1452,7 +1452,6 @@ public void add(final BytesRef value) {`
`1452`	`1452`	`// might as well track these on the go as opposed to having to loop through all entries later`
`1453`	`1453`	`docValuesByteCount += value.length;`
`1454`	`1454`	`}`
`1455`		`- ;`
`1456`	`1455`	`}`
`1457`	`1456`
`1458`	`1457`	`/**`