Store keyword fields that trip ignore_above in binary doc values

Kubik42 · Kubik42 · commit 2a64367b531d · 2025-11-03T13:14:00.000-08:00
diff --git a/server/src/main/java/org/elasticsearch/index/mapper/BinaryDocValuesSyntheticFieldLoaderLayer.java b/server/src/main/java/org/elasticsearch/index/mapper/BinaryDocValuesSyntheticFieldLoaderLayer.java
@@ -0,0 +1,85 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the "Elastic License
+ * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
+ * Public License v 1"; you may not use this file except in compliance with, at
+ * your election, the "Elastic License 2.0", the "GNU Affero General Public
+ * License v3.0 only", or the "Server Side Public License, v 1".
+ */
+
+package org.elasticsearch.index.mapper;
+
+import org.apache.lucene.index.BinaryDocValues;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.util.BytesRef;
+import org.elasticsearch.common.io.stream.ByteArrayStreamInput;
+import org.elasticsearch.xcontent.XContentBuilder;
+
+import java.io.IOException;
+
+/** Synthetic-source layer that reads a field's values back from a single {@link BinaryDocValues} entry per document. */
+public final class BinaryDocValuesSyntheticFieldLoaderLayer implements CompositeSyntheticFieldLoader.DocValuesLayer {
+
+    private final String fieldName;
+
+    // a document's values are packed into one array: [doc value count][length of value 1][value 1][length of value 2][value 2]...
+    private final ByteArrayStreamInput stream = new ByteArrayStreamInput();
+    private BytesRef docValuesBytes;
+    private int valueCount;
+
+    public BinaryDocValuesSyntheticFieldLoaderLayer(String fieldName) {
+        this.fieldName = fieldName;
+    }
+
+    /** Returns the number of values decoded for the most recently advanced document, or 0 if it had none. */
+    @Override
+    public long valueCount() {
+        return valueCount;
+    }
+
+    /** Returns a loader that positions this layer on a document, or {@code null} if the leaf has no values for the field. */
+    @Override
+    public DocValuesLoader docValuesLoader(LeafReader leafReader, int[] docIdsInLeaf) throws IOException {
+        BinaryDocValues docValues = leafReader.getBinaryDocValues(fieldName);
+        if (docValues == null) {
+            // nothing in this leaf: clear any count left over from an earlier use so hasValue() cannot report stale data
+            valueCount = 0;
+            return null;
+        }
+
+        return docId -> {
+            if (docValues.advanceExact(docId) == false) {
+                // this document has no entry for the field
+                valueCount = 0;
+                return false;
+            }
+
+            // bound the stream to this document's slice and consume the count; write() continues from the first length prefix
+            docValuesBytes = docValues.binaryValue();
+            stream.reset(docValuesBytes.bytes, docValuesBytes.offset, docValuesBytes.length);
+            valueCount = stream.readVInt();
+            return hasValue();
+        };
+    }
+
+    @Override
+    public boolean hasValue() {
+        return valueCount > 0;
+    }
+
+    /** Emits each remaining value through {@code utf8Value}; this consumes the stream, so call it once per loaded document. */
+    @Override
+    public void write(XContentBuilder b) throws IOException {
+        for (int i = 0; i < valueCount; i++) {
+            int valueLength = stream.readVInt();
+            // serialize straight out of the backing array at the stream's cursor, then step past those bytes
+            b.utf8Value(docValuesBytes.bytes, stream.getPosition(), valueLength);
+            stream.skipBytes(valueLength);
+        }
+    }
+
+    @Override
+    public String fieldName() {
+        return fieldName;
+    }
+}
diff --git a/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java
@@ -36,6 +36,8 @@
 import org.apache.lucene.util.automaton.CompiledAutomaton;
 import org.apache.lucene.util.automaton.CompiledAutomaton.AUTOMATON_TYPE;
 import org.apache.lucene.util.automaton.Operations;
+import org.elasticsearch.ElasticsearchException;
+import org.elasticsearch.common.io.stream.BytesStreamOutput;
 import org.elasticsearch.common.lucene.BytesRefs;
 import org.elasticsearch.common.lucene.Lucene;
 import org.elasticsearch.common.lucene.search.AutomatonQueries;
@@ -83,6 +85,7 @@
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
+import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
@@ -1245,7 +1248,14 @@ private boolean indexValue(DocumentParserContext context, XContentString value)
                 var utfBytes = value.bytes();
                 var bytesRef = new BytesRef(utfBytes.bytes(), utfBytes.offset(), utfBytes.length());
                 final String fieldName = fieldType().syntheticSourceFallbackFieldName();
-                context.doc().add(new StoredField(fieldName, bytesRef));
+
+                // store the value in a binary doc values field, create one if it doesn't exist
+                MultiValuedBinaryDocValuesField field = (MultiValuedBinaryDocValuesField) context.doc().getByKey(fieldName);
+                if (field == null) {
+                    field = new MultiValuedBinaryDocValuesField(fieldName);
+                    context.doc().addWithKey(fieldName, field);
+                }
+                field.add(bytesRef);
             }
 
             return false;
@@ -1413,15 +1423,53 @@ protected BytesRef preserve(BytesRef value) {
         // extra copy of the field for supporting synthetic source. This layer will check that copy.
         if (fieldType().ignoreAbove.valuesPotentiallyIgnored()) {
             final String fieldName = fieldType().syntheticSourceFallbackFieldName();
-            layers.add(new CompositeSyntheticFieldLoader.StoredFieldLayer(fieldName) {
-                @Override
-                protected void writeValue(Object value, XContentBuilder b) throws IOException {
-                    BytesRef ref = (BytesRef) value;
-                    b.utf8Value(ref.bytes, ref.offset, ref.length);
-                }
-            });
+            layers.add(new BinaryDocValuesSyntheticFieldLoaderLayer(fieldName));
         }
 
         return new CompositeSyntheticFieldLoader(leafFieldName, fullFieldName, layers);
     }
+
+    /**
+     * A {@link CustomDocValuesField} that collects the distinct values of a multi-valued field for one document, in insertion
+     * order, and exposes them as a single binary doc value.
+     */
+    private static final class MultiValuedBinaryDocValuesField extends CustomDocValuesField {
+
+        // LinkedHashSet: drops duplicates while preserving the order in which values were added
+        private final Set<BytesRef> uniqueValues = new LinkedHashSet<>();
+
+        MultiValuedBinaryDocValuesField(String name) {
+            super(name);
+        }
+
+        /** Records {@code value} unless an equal one is already present; the reference is kept as-is, not copied. */
+        public void add(final BytesRef value) {
+            uniqueValues.add(value);
+        }
+
+        /**
+         * Encodes the collected values as one contiguous binary array, wrapped in {@link BytesRef}, laid out as
+         * [doc value count][length of value 1][value 1][length of value 2][value 2]... with the count and lengths written as vInts.
+         *
+         * @throws ElasticsearchException if writing to the in-memory stream fails
+         */
+        @Override
+        public BytesRef binaryValue() {
+            int docValuesByteCount = uniqueValues.stream().mapToInt(ref -> ref.length).sum();
+            int docValuesCount = uniqueValues.size();
+            // sizing hint, not an exact length: budgets Integer.BYTES for each vInt prefix (the count plus one length per value)
+            int streamSize = docValuesByteCount + (docValuesCount + 1) * Integer.BYTES;
+
+            try (BytesStreamOutput out = new BytesStreamOutput(streamSize)) {
+                out.writeVInt(docValuesCount);
+                for (BytesRef value : uniqueValues) {
+                    out.writeVInt(value.length);
+                    out.writeBytes(value.bytes, value.offset, value.length);
+                }
+                return out.bytes().toBytesRef();
+            } catch (IOException e) {
+                throw new ElasticsearchException("Failed to get binary value", e);
+            }
+        }
+    }
 }