Store keyword fields that trip ignore_above in binary doc values

Kubik42 · Kubik42 · commit ffc2498bd64a · 2025-11-04T21:01:15.000-08:00
diff --git a/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/MatchOnlyTextFieldMapper.java b/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/MatchOnlyTextFieldMapper.java
@@ -14,6 +14,7 @@
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.StoredField;
+import org.apache.lucene.index.DocValues;
 import org.apache.lucene.index.IndexOptions;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.Term;
@@ -30,6 +31,7 @@
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IOFunction;
 import org.elasticsearch.common.CheckedIntFunction;
+import org.elasticsearch.common.io.stream.ByteArrayStreamInput;
 import org.elasticsearch.common.lucene.Lucene;
 import org.elasticsearch.common.text.UTF8DecodingReader;
 import org.elasticsearch.common.unit.Fuzziness;
@@ -297,12 +299,18 @@ private IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IOExcepti
 
             if (parent instanceof KeywordFieldMapper.KeywordFieldType keywordParent
                 && keywordParent.ignoreAbove().valuesPotentiallyIgnored()) {
-                final String parentFallbackFieldName = keywordParent.syntheticSourceFallbackFieldName();
                 if (parent.isStored()) {
-                    return storedFieldFetcher(parentFieldName, parentFallbackFieldName);
+                    // if the parent keyword field has ignore_above set, then any ignored values will be stored under a fallback field
+                    return combineFieldFetchers(
+                        storedFieldFetcher(parentFieldName),
+                        binaryDocValuesFieldFetcher(keywordParent.syntheticSourceFallbackFieldName())
+                    );
                 } else if (parent.hasDocValues()) {
                     var ifd = searchExecutionContext.getForField(parent, MappedFieldType.FielddataOperation.SEARCH);
-                    return combineFieldFetchers(docValuesFieldFetcher(ifd), storedFieldFetcher(parentFallbackFieldName));
+                    return combineFieldFetchers(
+                        docValuesFieldFetcher(ifd),
+                        binaryDocValuesFieldFetcher(keywordParent.syntheticSourceFallbackFieldName())
+                    );
                 }
             }
 
@@ -325,22 +333,16 @@ private IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IOExcepti
             final KeywordFieldMapper.KeywordFieldType keywordDelegate
         ) {
             if (keywordDelegate.ignoreAbove().valuesPotentiallyIgnored()) {
-                // because we don't know whether the delegate field will be ignored during parsing, we must also check the current field
-                String fieldName = name();
-                String fallbackName = syntheticSourceFallbackFieldName();
-
-                // delegate field names
                 String delegateFieldName = keywordDelegate.name();
-                String delegateFieldFallbackName = keywordDelegate.syntheticSourceFallbackFieldName();
+                // because we don't know whether the delegate will ignore a value, we must also check the fallback field created by
+                // this match_only_text field
+                String fallbackName = syntheticSourceFallbackFieldName();
 
                 if (keywordDelegate.isStored()) {
-                    return storedFieldFetcher(delegateFieldName, delegateFieldFallbackName, fieldName, fallbackName);
+                    return storedFieldFetcher(delegateFieldName, fallbackName);
                 } else if (keywordDelegate.hasDocValues()) {
                     var ifd = searchExecutionContext.getForField(keywordDelegate, MappedFieldType.FielddataOperation.SEARCH);
-                    return combineFieldFetchers(
-                        docValuesFieldFetcher(ifd),
-                        storedFieldFetcher(delegateFieldFallbackName, fieldName, fallbackName)
-                    );
+                    return combineFieldFetchers(docValuesFieldFetcher(ifd), storedFieldFetcher(fallbackName));
                 }
             }
 
@@ -374,6 +376,42 @@ private static IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IO
             };
         }
 
+        /**
+         * Builds a fetcher that loads ignored values from the binary doc values of {@code fieldName}. These values are stored in a
+         * separate fallback field in order to retain the original value and hence be able to support synthetic source.
+         *
+         * @param fieldName name of the fallback field holding the encoded binary doc values
+         * @return a per-leaf function mapping a doc id to its decoded string values, or an empty list when the doc has none
+         */
+        private static IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IOException>> binaryDocValuesFieldFetcher(
+            String fieldName
+        ) {
+            return context -> {
+                // DocValues.getBinary never returns null; it hands back an empty iterator when the field has no doc values
+                var binaryDocValues = DocValues.getBinary(context.reader(), fieldName);
+                return docId -> {
+                    if (binaryDocValues.advanceExact(docId) == false) {
+                        return List.of();
+                    }
+
+                    // the per-document layout is [value count][length of value 1][value 1][length of value 2][value 2]...
+                    // see KeywordFieldMapper.MultiValuedBinaryDocValuesField for the encoding side
+                    BytesRef docValuesBytes = binaryDocValues.binaryValue();
+                    try (ByteArrayStreamInput stream = new ByteArrayStreamInput()) {
+                        stream.reset(docValuesBytes.bytes, docValuesBytes.offset, docValuesBytes.length);
+
+                        int docValueCount = stream.readVInt();
+                        List<Object> values = new ArrayList<>(docValueCount);
+                        for (int i = 0; i < docValueCount; i++) {
+                            // readBytesRef() consumes the VInt length prefix itself, so no explicit readVInt() is needed per value
+                            values.add(stream.readBytesRef().utf8ToString());
+                        }
+                        return values;
+                    }
+                };
+            };
+        }
+
         private static IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IOException>> storedFieldFetcher(String... names) {
             var loader = StoredFieldLoader.create(false, Set.of(names));
             return context -> {
diff --git a/modules/mapper-extras/src/yamlRestTest/resources/rest-api-spec/test/match_only_text/10_basic.yml b/modules/mapper-extras/src/yamlRestTest/resources/rest-api-spec/test/match_only_text/10_basic.yml
@@ -465,7 +465,7 @@ synthetic_source match_only_text as multi-field with ignored keyword as parent:
         id: "1"
         refresh: true
         body:
-          foo: [ "Apache Lucene powers Elasticsearch", "Apache" ]
+          foo: [ "Apache Lucene powers Elasticsearch", "Apache", "Apache Lucene" ]
 
   - do:
       search:
@@ -477,7 +477,7 @@ synthetic_source match_only_text as multi-field with ignored keyword as parent:
 
   - match: { "hits.total.value": 1 }
   - match:
-      hits.hits.0._source.foo: [ "Apache", "Apache Lucene powers Elasticsearch" ]
+      hits.hits.0._source.foo: [ "Apache", "Apache Lucene powers Elasticsearch", "Apache Lucene" ]
 
 ---
 synthetic_source match_only_text as multi-field with stored keyword as parent:
diff --git a/server/src/main/java/org/elasticsearch/index/mapper/BinaryDocValuesSyntheticFieldLoaderLayer.java b/server/src/main/java/org/elasticsearch/index/mapper/BinaryDocValuesSyntheticFieldLoaderLayer.java
@@ -0,0 +1,81 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the "Elastic License
+ * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
+ * Public License v 1"; you may not use this file except in compliance with, at
+ * your election, the "Elastic License 2.0", the "GNU Affero General Public
+ * License v3.0 only", or the "Server Side Public License, v 1".
+ */
+
+package org.elasticsearch.index.mapper;
+
+import org.apache.lucene.index.BinaryDocValues;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.util.BytesRef;
+import org.elasticsearch.common.io.stream.ByteArrayStreamInput;
+import org.elasticsearch.xcontent.XContentBuilder;
+
+import java.io.IOException;
+
+/**
+ * Synthetic source layer that decodes the values held in a field's binary doc values and writes each of them out as a string.
+ */
+public final class BinaryDocValuesSyntheticFieldLoaderLayer implements CompositeSyntheticFieldLoader.DocValuesLayer {
+    private final String fieldName;
+
+    // the binary doc values for a document are all encoded in a single binary array, which this stream knows how to read
+    // the doc values in the array take the form of [doc value count][length of value 1][value 1][length of value 2][value 2]...
+    private final ByteArrayStreamInput stream = new ByteArrayStreamInput();
+    // number of values encoded for the document last advanced to; 0 when that document (or the whole leaf) has none
+    private int valueCount;
+
+    public BinaryDocValuesSyntheticFieldLoaderLayer(String fieldName) {
+        this.fieldName = fieldName;
+    }
+
+    @Override
+    public long valueCount() {
+        return valueCount;
+    }
+
+    @Override
+    public DocValuesLoader docValuesLoader(LeafReader leafReader, int[] docIdsInLeaf) throws IOException {
+        BinaryDocValues docValues = leafReader.getBinaryDocValues(fieldName);
+        if (docValues == null) {
+            // nothing stored for this field in this leaf; clear the count so an earlier document's state cannot leak into hasValue()
+            valueCount = 0;
+            return null;
+        }
+
+        return docId -> {
+            if (docValues.advanceExact(docId) == false) {
+                // this document has no values for the field
+                valueCount = 0;
+                return false;
+            }
+            // point the stream at this document's encoded values and consume the leading value count
+            BytesRef docValuesBytes = docValues.binaryValue();
+            stream.reset(docValuesBytes.bytes, docValuesBytes.offset, docValuesBytes.length);
+            valueCount = stream.readVInt();
+            return hasValue();
+        };
+    }
+
+    @Override
+    public boolean hasValue() {
+        return valueCount > 0;
+    }
+
+    @Override
+    public void write(XContentBuilder b) throws IOException {
+        for (int i = 0; i < valueCount; i++) {
+            // readBytesRef() consumes the VInt length prefix itself, so no explicit readVInt() is needed per value
+            b.value(stream.readBytesRef().utf8ToString());
+        }
+    }
+
+    @Override
+    public String fieldName() {
+        return fieldName;
+    }
+}
diff --git a/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java
@@ -40,6 +40,8 @@
 import org.apache.lucene.util.automaton.CompiledAutomaton;
 import org.apache.lucene.util.automaton.CompiledAutomaton.AUTOMATON_TYPE;
 import org.apache.lucene.util.automaton.Operations;
+import org.elasticsearch.ElasticsearchException;
+import org.elasticsearch.common.io.stream.BytesStreamOutput;
 import org.elasticsearch.common.lucene.BytesRefs;
 import org.elasticsearch.common.lucene.Lucene;
 import org.elasticsearch.common.lucene.search.AutomatonQueries;
@@ -85,6 +87,7 @@
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
+import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
@@ -1248,7 +1251,14 @@ private boolean indexValue(DocumentParserContext context, XContentString value)
                 var utfBytes = value.bytes();
                 var bytesRef = new BytesRef(utfBytes.bytes(), utfBytes.offset(), utfBytes.length());
                 final String fieldName = fieldType().syntheticSourceFallbackFieldName();
-                context.doc().add(new StoredField(fieldName, bytesRef));
+
+                // store the value in a binary doc values field, create one if it doesn't exist
+                MultiValuedBinaryDocValuesField field = (MultiValuedBinaryDocValuesField) context.doc().getByKey(fieldName);
+                if (field == null) {
+                    field = new MultiValuedBinaryDocValuesField(fieldName);
+                    context.doc().addWithKey(fieldName, field);
+                }
+                field.add(bytesRef);
             }
 
             return false;
@@ -1416,15 +1426,55 @@ protected BytesRef preserve(BytesRef value) {
         // extra copy of the field for supporting synthetic source. This layer will check that copy.
         if (fieldType().ignoreAbove.valuesPotentiallyIgnored()) {
             final String fieldName = fieldType().syntheticSourceFallbackFieldName();
-            layers.add(new CompositeSyntheticFieldLoader.StoredFieldLayer(fieldName) {
-                @Override
-                protected void writeValue(Object value, XContentBuilder b) throws IOException {
-                    BytesRef ref = (BytesRef) value;
-                    b.utf8Value(ref.bytes, ref.offset, ref.length);
-                }
-            });
+            layers.add(new BinaryDocValuesSyntheticFieldLoaderLayer(fieldName));
         }
 
         return new CompositeSyntheticFieldLoader(leafFieldName, fullFieldName, layers);
     }
+
+    /**
+     * A {@link CustomDocValuesField} that uses a {@link Set} to collect the unique values of a field with multiple values per
+     * document, and encodes all of them into a single binary value.
+     */
+    private static final class MultiValuedBinaryDocValuesField extends CustomDocValuesField {
+
+        private final Set<BytesRef> uniqueValues;
+        private int docValuesByteCount = 0;
+
+        MultiValuedBinaryDocValuesField(String name) {
+            super(name);
+            // linked hash set to maintain insertion order of elements
+            uniqueValues = new LinkedHashSet<>();
+        }
+
+        public void add(final BytesRef value) {
+            // the set drops duplicates, so only count the bytes of values that were actually inserted
+            if (uniqueValues.add(value)) {
+                docValuesByteCount += value.length;
+            }
+        }
+
+        /**
+         * Encodes the collection of binary doc values as a single contiguous binary array, wrapped in {@link BytesRef}. This array takes
+         * the form of [doc value count][length of value 1][value 1][length of value 2][value 2]...
+         */
+        @Override
+        public BytesRef binaryValue() {
+            int docValuesCount = uniqueValues.size();
+            // initial size estimate: the value bytes plus one int-sized slot for the leading count and for each length prefix
+            int streamSize = docValuesByteCount + (docValuesCount + 1) * Integer.BYTES;
+
+            try (BytesStreamOutput out = new BytesStreamOutput(streamSize)) {
+                out.writeVInt(docValuesCount);
+                for (BytesRef value : uniqueValues) {
+                    int valueLength = value.length;
+                    out.writeVInt(valueLength);
+                    out.writeBytes(value.bytes, value.offset, valueLength);
+                }
+                return out.bytes().toBytesRef();
+            } catch (IOException e) {
+                throw new ElasticsearchException("Failed to get binary value", e);
+            }
+        }
+    }
 }