Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
Show all changes
72 commits
Select commit Hold shift + click to select a range
c5580da
Synthetic source doc values arrays encoding experiment
martijnvg Sep 29, 2024
acf4d09
spotless
martijnvg Sep 30, 2024
dca77d7
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Sep 30, 2024
49efe26
iter
martijnvg Sep 30, 2024
f5e3d5a
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Sep 30, 2024
59010c3
iter
martijnvg Sep 30, 2024
a5198ae
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Oct 2, 2024
9b4aa5f
spotless
martijnvg Oct 2, 2024
12d30c5
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Oct 3, 2024
2ae8d83
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Oct 8, 2024
fc0e627
parsesArrayValue approach
martijnvg Oct 8, 2024
a111f94
fix multi fields
martijnvg Oct 9, 2024
14c2ddd
iter
martijnvg Oct 9, 2024
ba9e513
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Oct 9, 2024
007afd3
do not handle copy_to for now
martijnvg Oct 9, 2024
194b4ca
cleanup
martijnvg Oct 9, 2024
52c0db4
move ValueXContentParser
martijnvg Oct 9, 2024
8cc5b46
adjust expected json element type
martijnvg Oct 9, 2024
dc9db8a
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Oct 13, 2024
6e03aca
iter
martijnvg Oct 13, 2024
674f03e
fixed mistake
martijnvg Oct 14, 2024
acfaa55
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Jan 16, 2025
0d90234
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Jan 24, 2025
259d212
iter
martijnvg Jan 24, 2025
bf9ed2f
iter
martijnvg Jan 24, 2025
7bd3a15
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Jan 24, 2025
d8e48c5
Moved processing of offsets to DocumentParser instead of in fieldmapp…
martijnvg Jan 24, 2025
ae1ce9f
[CI] Auto commit changes from spotless
Jan 24, 2025
5e610ef
* Added support for json null by encoding it as \0
martijnvg Jan 26, 2025
8f163eb
Encode offsets using zigzag encoding, so that -1 can be used to encod…
martijnvg Jan 26, 2025
43a1375
fixed codec issue if only NULLs gets indexed
martijnvg Jan 27, 2025
cf2b9a3
Update docs/changelog/113757.yaml
martijnvg Jan 27, 2025
893f555
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Jan 27, 2025
61fd132
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Jan 27, 2025
a428f11
handle string arrays for keyword fields nested under object arrays co…
martijnvg Jan 27, 2025
962ac8a
offsetsFieldMapper -> offsetsFieldName
martijnvg Jan 27, 2025
1c77cfe
added ignore above test
martijnvg Jan 27, 2025
a785110
Move OffsetDocValuesLoader to its own layer implementation
martijnvg Jan 27, 2025
fa03f46
iter
martijnvg Jan 27, 2025
ccbf0cd
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Jan 27, 2025
9664fa7
rename
martijnvg Feb 2, 2025
01cd313
move some offset logic from DocumentParserContext to new ArrayOffsetC…
martijnvg Feb 2, 2025
7ed0857
use correct full name for offset field
martijnvg Feb 2, 2025
012ac7f
Move offset parsing logic to DocumentParser
martijnvg Feb 3, 2025
4b4eaf4
[CI] Auto commit changes from spotless
Feb 3, 2025
64c6fe8
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Feb 5, 2025
38f784a
synthetic recovery source is enabled by default now (in snapshot builds)
martijnvg Feb 5, 2025
b9535e1
Small iter
martijnvg Feb 5, 2025
470afad
handle empty array logic in DocumentParser
martijnvg Feb 5, 2025
80521c2
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Feb 6, 2025
ab612ba
randomFrom
martijnvg Feb 6, 2025
f21cce6
reduce number of `setImmediateXContentParent(...)` invocations
martijnvg Feb 6, 2025
ca21c22
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Feb 15, 2025
969139e
remove `parsesArrayValue(mapper) == false` checks
martijnvg Feb 15, 2025
acf0aed
cleanup DocumentParser and
martijnvg Feb 15, 2025
3d75e27
ensure doc values enabled and added more tests
martijnvg Feb 15, 2025
5487cf8
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Feb 17, 2025
b89660a
iter
martijnvg Feb 17, 2025
ba0434b
s/:/./
martijnvg Feb 17, 2025
4bcde0d
iter
martijnvg Feb 17, 2025
5b6b05c
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Feb 17, 2025
37634b9
Store offsets in sorted doc values field instead of binary doc values…
martijnvg Feb 18, 2025
09c6a0e
applied feedback
martijnvg Feb 18, 2025
60f45f2
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Feb 19, 2025
bbee160
iter testSyntheticSourceKeepArrays() test
martijnvg Feb 19, 2025
4e6265f
add index version check
martijnvg Feb 19, 2025
405edf4
iter test
martijnvg Feb 19, 2025
7c7b3a3
[CI] Auto commit changes from spotless
Feb 19, 2025
cfe5b56
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Feb 19, 2025
8049206
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Feb 20, 2025
3fcb461
cleanup supportStoringArrayOffsets(...) method
martijnvg Feb 20, 2025
5b1f80b
renamed test suites
martijnvg Feb 20, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -953,7 +953,7 @@ subobjects auto:
- match: { hits.hits.0._source.foo: 10 }
- match: { hits.hits.0._source.foo\.bar: 100 }
- match: { hits.hits.0._source.regular.span.id: "1" }
- match: { hits.hits.0._source.regular.trace.id: [ "a", "b" ] }
- match: { hits.hits.0._source.regular.trace.id: ["b", "a", "b" ] }
- match: { hits.hits.1._source.id: 2 }
- match: { hits.hits.1._source.foo: 20 }
- match: { hits.hits.1._source.foo\.bar: 200 }
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -447,7 +447,7 @@ field param - keep root array:
index: test
sort: id

- match: { hits.hits.0._source.kw_default: [ "A", "B" ] }
- match: { hits.hits.0._source.kw_default: [ "B", "A" ] }
- match: { hits.hits.0._source.kw_arrays: [ "B", "A" ] }
- match: { hits.hits.0._source.kw_all: [ "B", "A" ] }

Expand Down Expand Up @@ -513,7 +513,7 @@ field param - keep nested array:
sort: id

- match: { hits.hits.0._source.path.to.ratio: 10.0 }
- match: { hits.hits.0._source.path.to.kw_default: [ "A", "B" ] }
- match: { hits.hits.0._source.path.to.kw_default: [ "B", "A" ] }
- match: { hits.hits.0._source.path.to.kw_arrays: [ "B", "A" ] }
- match: { hits.hits.0._source.path.to.kw_all: [ "B", "A" ] }

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -799,7 +799,7 @@ private static void parseNonDynamicArray(
String fullPath = context.path().pathAsText(arrayFieldName);

// Check if we need to record the array source. This only applies to synthetic source.
if (context.canAddIgnoredField()) {
if (context.canAddIgnoredField() && (mapper instanceof KeywordFieldMapper == false /* hard coded exclusion keyword mapper */)) {
boolean objectRequiresStoringSource = mapper instanceof ObjectMapper objectMapper
&& (objectMapper.storeArraySource()
|| (context.sourceKeepModeFromIndexSettings() == Mapper.SourceKeepMode.ARRAYS
Expand All @@ -826,10 +826,13 @@ private static void parseNonDynamicArray(
}
}

// In synthetic source, if any array element requires storing its source as-is, it takes precedence over
// elements from regular source loading that are then skipped from the synthesized array source.
// To prevent this, we track each array name, to check if it contains any sub-arrays in its elements.
context = context.cloneForArray(fullPath);
// hard coded exclusion keyword mapper:
if (mapper instanceof KeywordFieldMapper == false) {
// In synthetic source, if any array element requires storing its source as-is, it takes precedence over
// elements from regular source loading that are then skipped from the synthesized array source.
// To prevent this, we track each array name, to check if it contains any sub-arrays in its elements.
context = context.cloneForArray(fullPath);
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Move to the branch above, to avoid repeating the condition?


XContentParser parser = context.parser();
XContentParser.Token token;
Expand All @@ -847,6 +850,10 @@ private static void parseNonDynamicArray(
parseValue(context, lastFieldName);
}
}
// hard coded post processing of arrays:
if (mapper instanceof KeywordFieldMapper k) {
k.processOffsets(context);
}
postProcessDynamicArrayMapping(context, lastFieldName);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,14 @@
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;

/**
* Context used when parsing incoming documents. Holds everything that is needed to parse a document as well as
Expand Down Expand Up @@ -856,4 +859,27 @@ public String currentName() throws IOException {
return field;
}
}

private final Map<String, SortedMap<String, List<Integer>>> arrayOffsetsByField = new HashMap<>();
private final Map<String, Integer> numValuesByField = new HashMap<>();

/**
 * Returns the offsets recorded for {@code field} via {@code recordOffset(...)}: a sorted map
 * from each distinct array value to the list of array offsets at which that value appeared.
 * Returns {@code null} if no offsets were recorded for the field.
 */
public SortedMap<String, List<Integer>> getValuesWithOffsets(String field) {
    return arrayOffsetsByField.get(field);
}

public int getArrayValueCount(String field) {
if (numValuesByField.containsKey(field)) {
return numValuesByField.get(field) + 1;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why +1?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

numValuesByField returns the last offset; in order to get the count, the returned value needs to be incremented by one.

numValuesByField should be named something else.

} else {
return 0;
}
}

public void recordOffset(String fieldName, String value) {
int count = numValuesByField.compute(fieldName, (s, integer) -> integer == null ? 0 : ++integer);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Related to comment above, maybe 1 here instead of 0?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

count is the wrong name here. It is like the next offset to be used.

var values = arrayOffsetsByField.computeIfAbsent(fieldName , s -> new TreeMap<>(Comparator.naturalOrder()));
var offsets = values.computeIfAbsent(value, s -> new ArrayList<>());
offsets.add(count);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.InvertableType;
Expand All @@ -34,6 +35,7 @@
import org.apache.lucene.util.automaton.CompiledAutomaton.AUTOMATON_TYPE;
import org.apache.lucene.util.automaton.MinimizationOperations;
import org.apache.lucene.util.automaton.Operations;
import org.elasticsearch.common.io.stream.BytesStreamOutput;
import org.elasticsearch.common.lucene.BytesRefs;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.lucene.search.AutomatonQueries;
Expand Down Expand Up @@ -89,6 +91,7 @@ public final class KeywordFieldMapper extends FieldMapper {
private static final Logger logger = LogManager.getLogger(KeywordFieldMapper.class);

public static final String CONTENT_TYPE = "keyword";
public static final String OFFSETS_FIELD_NAME_SUFFIX = "_offsets";

static final NodeFeature KEYWORD_DIMENSION_IGNORE_ABOVE = new NodeFeature("mapper.keyword_dimension_ignore_above");
static final NodeFeature KEYWORD_NORMALIZER_SYNTHETIC_SOURCE = new NodeFeature("mapper.keyword_normalizer_synthetic_source");
Expand Down Expand Up @@ -375,13 +378,26 @@ public KeywordFieldMapper build(MapperBuilderContext context) {
}
super.hasScript = script.get() != null;
super.onScriptError = onScriptError.getValue();

BinaryFieldMapper offsetsFieldMapper;
if (context.isSourceSynthetic() && fieldtype.stored() == false) {
// keep track of value offsets so that we can reconstruct arrays from doc values in order as was specified during indexing
// (if field is stored then there is no point of doing this)
offsetsFieldMapper = new BinaryFieldMapper.Builder(leafName() + OFFSETS_FIELD_NAME_SUFFIX, context.isSourceSynthetic())
.docValues(true)
.build(context);
} else {
offsetsFieldMapper = null;
}

return new KeywordFieldMapper(
leafName(),
fieldtype,
buildFieldType(context, fieldtype),
builderParams(this, context),
context.isSourceSynthetic(),
this
this,
offsetsFieldMapper
);
}
}
Expand Down Expand Up @@ -882,14 +898,16 @@ public boolean hasNormalizer() {
private final IndexAnalyzers indexAnalyzers;
private final int ignoreAboveDefault;
private final int ignoreAbove;
private final BinaryFieldMapper offsetsFieldMapper;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we actually need this? We only ever use fullPath() from it.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, I think we can remove it. We can just keep track of offsetFieldName.

I think I wanted to share this field via #iterator(), so that this field is visible in other places like field caps, but I think this field should remain internal/hidden. Additionally BinaryFieldMapper uses a custom binary doc values field that has no advantage for offsets.


private KeywordFieldMapper(
String simpleName,
FieldType fieldType,
KeywordFieldType mappedFieldType,
BuilderParams builderParams,
boolean isSyntheticSource,
Builder builder
Builder builder,
BinaryFieldMapper offsetsFieldMapper
) {
super(simpleName, mappedFieldType, builderParams);
assert fieldType.indexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) <= 0;
Expand All @@ -906,17 +924,50 @@ private KeywordFieldMapper(
this.isSyntheticSource = isSyntheticSource;
this.ignoreAboveDefault = builder.ignoreAboveDefault;
this.ignoreAbove = builder.ignoreAbove.getValue();
this.offsetsFieldMapper = offsetsFieldMapper;
}

@Override
public KeywordFieldType fieldType() {
return (KeywordFieldType) super.fieldType();
}

@Override
protected void parseCreateField(DocumentParserContext context) throws IOException {
final String value = context.parser().textOrNull();
indexValue(context, value == null ? fieldType().nullValue : value);
String value = context.parser().textOrNull();
if (value == null) {
value = fieldType().nullValue;
}
boolean indexed = indexValue(context, value);
if (offsetsFieldMapper != null && indexed) {
context.recordOffset(fullPath(), value);
}
}

public void processOffsets(DocumentParserContext context) throws IOException {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this be called from postParse? It is possible that a field is indexed multiple times in one document with object arrays.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am surprised randomized tests don't complain about this.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍 I think postParse is a better place to invoke this process offsets logic.

var values = context.getValuesWithOffsets(fullPath());
if (values == null || values.isEmpty()) {
return;
}

int arrayLength = context.getArrayValueCount(fullPath());
int ord = 0;
int[] offsetToOrd = new int[arrayLength];
for (var entry : values.entrySet()) {
for (var offsetAndLevel : entry.getValue()) {
offsetToOrd[offsetAndLevel] = ord;
}
ord++;
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we keep this logic in DocumentParser? It feels like leaking parsing implementations in mappers..

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I made the following change: 012ac7f
In order to make sure empty arrays and nested leaf arrays are covered some logic had to be added. Let me know what you think of this. The nice thing is that now adding offset to other field types should be easier.

The previous approach was more isolated, field types would be responsible for parsing arrays, which means not many changes to DocumentParser were needed. I think I like this PR with 012ac7f more now.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: return (offsetsFieldName != null);


logger.info("values=" + values);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe debug?

logger.info("offsetToOrd=" + Arrays.toString(offsetToOrd));

try (var streamOutput = new BytesStreamOutput()) {
// TODO: optimize
// This array allows to retain the original ordering of the leaf array and duplicate values.
streamOutput.writeVIntArray(offsetToOrd);
context.doc().add(new BinaryDocValuesField(offsetsFieldMapper.fullPath(), streamOutput.bytes().toBytesRef()));
}
}

@Override
Expand All @@ -929,13 +980,13 @@ protected void indexScriptValues(
this.fieldType().scriptValues.valuesForDoc(searchLookup, readerContext, doc, value -> indexValue(documentParserContext, value));
}

private void indexValue(DocumentParserContext context, String value) {
private boolean indexValue(DocumentParserContext context, String value) {
if (value == null) {
return;
return false;
}
// if field is disabled, skip indexing
if ((fieldType.indexOptions() == IndexOptions.NONE) && (fieldType.stored() == false) && (fieldType().hasDocValues() == false)) {
return;
return false;
}

if (value.length() > fieldType().ignoreAbove()) {
Expand All @@ -944,7 +995,7 @@ private void indexValue(DocumentParserContext context, String value) {
// Save a copy of the field so synthetic source can load it
context.doc().add(new StoredField(originalName(), new BytesRef(value)));
}
return;
return false;
}

value = normalizeValue(fieldType().normalizer(), fullPath(), value);
Expand Down Expand Up @@ -982,6 +1033,8 @@ private void indexValue(DocumentParserContext context, String value) {
if (fieldType().hasDocValues() == false && fieldType.omitNorms()) {
context.addToFieldNames(fieldType().name());
}

return true;
}

private static String normalizeValue(NamedAnalyzer normalizer, String field, String value) {
Expand Down Expand Up @@ -1078,7 +1131,8 @@ protected void writeValue(Object value, XContentBuilder b) throws IOException {
}
});
} else if (hasDocValues) {
layers.add(new SortedSetDocValuesSyntheticFieldLoaderLayer(fullPath()) {
String offsetsFullPath = offsetsFieldMapper != null ? offsetsFieldMapper.fullPath() : null;
layers.add(new SortedSetDocValuesSyntheticFieldLoaderLayer(fullPath(), offsetsFullPath) {

@Override
protected BytesRef convert(BytesRef value) {
Expand Down
Loading