Merged

72 commits
c5580da
Synthetic source doc values arrays encoding experiment
martijnvg Sep 29, 2024
acf4d09
spotless
martijnvg Sep 30, 2024
dca77d7
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Sep 30, 2024
49efe26
iter
martijnvg Sep 30, 2024
f5e3d5a
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Sep 30, 2024
59010c3
iter
martijnvg Sep 30, 2024
a5198ae
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Oct 2, 2024
9b4aa5f
spotless
martijnvg Oct 2, 2024
12d30c5
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Oct 3, 2024
2ae8d83
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Oct 8, 2024
fc0e627
parsesArrayValue approach
martijnvg Oct 8, 2024
a111f94
fix multi fields
martijnvg Oct 9, 2024
14c2ddd
iter
martijnvg Oct 9, 2024
ba9e513
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Oct 9, 2024
007afd3
do not handle copy_to for now
martijnvg Oct 9, 2024
194b4ca
cleanup
martijnvg Oct 9, 2024
52c0db4
move ValueXContentParser
martijnvg Oct 9, 2024
8cc5b46
adjust expected json element type
martijnvg Oct 9, 2024
dc9db8a
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Oct 13, 2024
6e03aca
iter
martijnvg Oct 13, 2024
674f03e
fixed mistake
martijnvg Oct 14, 2024
acfaa55
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Jan 16, 2025
0d90234
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Jan 24, 2025
259d212
iter
martijnvg Jan 24, 2025
bf9ed2f
iter
martijnvg Jan 24, 2025
7bd3a15
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Jan 24, 2025
d8e48c5
Moved processing of offsets to DocumentParser instead of in fieldmapp…
martijnvg Jan 24, 2025
ae1ce9f
[CI] Auto commit changes from spotless
Jan 24, 2025
5e610ef
* Added support for json null by encoding it as \0
martijnvg Jan 26, 2025
8f163eb
Encode offsets using zigzag encoding, so that -1 can be used to encod…
martijnvg Jan 26, 2025
43a1375
fixed codec issue if only NULLs gets indexed
martijnvg Jan 27, 2025
cf2b9a3
Update docs/changelog/113757.yaml
martijnvg Jan 27, 2025
893f555
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Jan 27, 2025
61fd132
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Jan 27, 2025
a428f11
handle string arrays for keyword fields nested under object arrays co…
martijnvg Jan 27, 2025
962ac8a
offsetsFieldMapper -> offsetsFieldName
martijnvg Jan 27, 2025
1c77cfe
added ignore above test
martijnvg Jan 27, 2025
a785110
Move OffsetDocValuesLoader to its own layer implementation
martijnvg Jan 27, 2025
fa03f46
iter
martijnvg Jan 27, 2025
ccbf0cd
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Jan 27, 2025
9664fa7
rename
martijnvg Feb 2, 2025
01cd313
move some offset logic from DocumentParserContext to new ArrayOffsetC…
martijnvg Feb 2, 2025
7ed0857
use correct full name for offset field
martijnvg Feb 2, 2025
012ac7f
Move offset parsing logic to DocumentParser
martijnvg Feb 3, 2025
4b4eaf4
[CI] Auto commit changes from spotless
Feb 3, 2025
64c6fe8
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Feb 5, 2025
38f784a
synthetic recovery source is enabled by default now (in snapshot builds)
martijnvg Feb 5, 2025
b9535e1
Small iter
martijnvg Feb 5, 2025
470afad
handle empty array logic in DocumentParser
martijnvg Feb 5, 2025
80521c2
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Feb 6, 2025
ab612ba
randomFrom
martijnvg Feb 6, 2025
f21cce6
reduce number of `setImmediateXContentParent(...)` invocations
martijnvg Feb 6, 2025
ca21c22
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Feb 15, 2025
969139e
remove `parsesArrayValue(mapper) == false` checks
martijnvg Feb 15, 2025
acf0aed
cleanup DocumentParser and
martijnvg Feb 15, 2025
3d75e27
ensure doc values enabled and added more tests
martijnvg Feb 15, 2025
5487cf8
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Feb 17, 2025
b89660a
iter
martijnvg Feb 17, 2025
ba0434b
s/:/./
martijnvg Feb 17, 2025
4bcde0d
iter
martijnvg Feb 17, 2025
5b6b05c
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Feb 17, 2025
37634b9
Store offsets in sorted doc values field instead of binary doc values…
martijnvg Feb 18, 2025
09c6a0e
applied feedback
martijnvg Feb 18, 2025
60f45f2
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Feb 19, 2025
bbee160
iter testSyntheticSourceKeepArrays() test
martijnvg Feb 19, 2025
4e6265f
add index version check
martijnvg Feb 19, 2025
405edf4
iter test
martijnvg Feb 19, 2025
7c7b3a3
[CI] Auto commit changes from spotless
Feb 19, 2025
cfe5b56
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Feb 19, 2025
8049206
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Feb 20, 2025
3fcb461
cleanup supportStoringArrayOffsets(...) method
martijnvg Feb 20, 2025
5b1f80b
renamed test suites
martijnvg Feb 20, 2025
5 changes: 5 additions & 0 deletions docs/changelog/113757.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 113757
summary: Store array offsets for keyword fields natively with synthetic source instead of falling back to ignored source.
area: Mapping
type: enhancement
issues: []
4 changes: 4 additions & 0 deletions rest-api-spec/build.gradle
@@ -83,4 +83,8 @@ tasks.named("yamlRestCompatTestTransform").configure ({ task ->
"node_version warning is removed in 9.0"
)
task.skipTest("tsdb/20_mapping/nested fields", "nested field support in tsdb indices is now supported")
task.skipTest("logsdb/10_settings/routing path allowed in logs mode with routing on sort fields", "Unknown feature routing.logsb_route_on_sort_fields")
task.skipTest("indices.create/21_synthetic_source_stored/index param - field ordering", "Synthetic source keep arrays now stores leaf arrays natively")
Contributor
Why do we skip instead of changing the test?

Member Author
Because the yaml tests come from 8.x, they can't be changed here. The yamlRestCompatTestTransform task mutes or transforms tests before the yamlRestCompatTest task checks REST API backwards compatibility.

task.skipTest("indices.create/21_synthetic_source_stored/field param - keep nested array", "Synthetic source keep arrays now stores leaf arrays natively")
task.skipTest("indices.create/21_synthetic_source_stored/field param - keep root array", "Synthetic source keep arrays now stores leaf arrays natively")
})
@@ -922,7 +922,7 @@ subobjects auto:
- match: { hits.hits.0._source.foo: 10 }
- match: { hits.hits.0._source.foo\.bar: 100 }
- match: { hits.hits.0._source.regular.span.id: "1" }
- match: { hits.hits.0._source.regular.trace.id: [ "a", "b" ] }
- match: { hits.hits.0._source.regular.trace.id: ["a", "b" ] }
- match: { hits.hits.1._source.id: 2 }
- match: { hits.hits.1._source.foo: 20 }
- match: { hits.hits.1._source.foo\.bar: 200 }
@@ -1024,7 +1024,7 @@ index param - field ordering:
index: test

- length: { hits.hits.0._source: 4 }
- match: { hits.hits.0._source: { "a": "2", "b": [ { "bb": 100, "aa": 200 }, { "aa": 300, "bb": 400 } ], "c": [30, 20, 10], "d": [ { "bb": 10, "aa": 20 }, { "aa": 30, "bb": 40 } ] } }
- match: { hits.hits.0._source: { "a": "2", "b": [ { "bb": 100, "aa": 200 }, { "aa": 300, "bb": 400 } ], "c": ["30", "20", "10"], "d": [ { "bb": 10, "aa": 20 }, { "aa": 30, "bb": 40 } ] } }


---
@@ -154,6 +154,7 @@ private void internalParseDocument(MetadataFieldMapper[] metadataFieldsMappers,

executeIndexTimeScripts(context);

context.processArrayOffsets(context);
for (MetadataFieldMapper metadataMapper : metadataFieldsMappers) {
metadataMapper.postParse(context);
}
@@ -519,6 +520,7 @@ private static void throwOnCopyToOnObject(Mapper mapper, List<String> copyToFiel

private static void parseObject(final DocumentParserContext context, String currentFieldName) throws IOException {
assert currentFieldName != null;
context.setImmediateXContentParent(context.parser().currentToken());
Mapper objectMapper = context.getMapper(currentFieldName);
if (objectMapper != null) {
doParseObject(context, currentFieldName, objectMapper);
@@ -611,6 +613,12 @@ private static void throwOnCreateDynamicNestedViaCopyTo(Mapper dynamicObjectMapp
}

private static void parseArray(DocumentParserContext context, String lastFieldName) throws IOException {
// Record the previous immediate parent, so that it can be restored after the array has been parsed.
// This is used for recording array offsets with synthetic source. Offsets can only be accounted
// for accurately if the immediate parent is an array.
var prev = context.getImmediateXContentParent();
context.setImmediateXContentParent(context.parser().currentToken());

Mapper mapper = getLeafMapper(context, lastFieldName);
if (mapper != null) {
// There is a concrete mapper for this field already. Need to check if the mapper
@@ -624,6 +632,8 @@ private static void parseArray(DocumentParserContext context, String lastFieldNa
} else {
parseArrayDynamic(context, lastFieldName);
}
// Reset previous immediate parent
context.setImmediateXContentParent(prev);
}

private static void parseArrayDynamic(DocumentParserContext context, String currentFieldName) throws IOException {
@@ -688,11 +698,12 @@ private static void parseNonDynamicArray(
final String lastFieldName,
String arrayFieldName
) throws IOException {
boolean supportStoringArrayOffsets = mapper != null && mapper.supportStoringArrayOffsets(context);
String fullPath = context.path().pathAsText(arrayFieldName);

// Check if we need to record the array source. This only applies to synthetic source.
boolean canRemoveSingleLeafElement = false;
if (context.canAddIgnoredField()) {
if (context.canAddIgnoredField() && supportStoringArrayOffsets == false) {
Mapper.SourceKeepMode mode = Mapper.SourceKeepMode.NONE;
boolean objectWithFallbackSyntheticSource = false;
if (mapper instanceof ObjectMapper objectMapper) {
Expand Down Expand Up @@ -736,6 +747,7 @@ private static void parseNonDynamicArray(

XContentParser parser = context.parser();
XContentParser.Token token;
XContentParser.Token previousToken = parser.currentToken();
int elements = 0;
while ((token = parser.nextToken()) != XContentParser.Token.END_ARRAY) {
if (token == XContentParser.Token.START_OBJECT) {
@@ -754,6 +766,14 @@ private static void parseNonDynamicArray(
elements++;
parseValue(context, lastFieldName);
}
previousToken = token;
}
if (mapper != null
&& context.canAddIgnoredField()
&& mapper.supportStoringArrayOffsets(context)
&& previousToken == XContentParser.Token.START_ARRAY
&& context.isImmediateParentAnArray()) {
context.getOffSetContext().maybeRecordEmptyArray(mapper.getOffsetFieldName());
}
if (elements <= 1 && canRemoveSingleLeafElement) {
context.removeLastIgnoredField(fullPath);
@@ -91,6 +91,31 @@ public LuceneDocument doc() {
protected void addDoc(LuceneDocument doc) {
in.addDoc(doc);
}

@Override
public void processArrayOffsets(DocumentParserContext context) throws IOException {
in.processArrayOffsets(context);
}

@Override
public FieldArrayContext getOffSetContext() {
return in.getOffSetContext();
}

@Override
public void setImmediateXContentParent(XContentParser.Token token) {
in.setImmediateXContentParent(token);
}

@Override
public XContentParser.Token getImmediateXContentParent() {
return in.getImmediateXContentParent();
}

@Override
public boolean isImmediateParentAnArray() {
return in.isImmediateParentAnArray();
}
}

/**
@@ -141,6 +166,8 @@ private enum Scope {
private final SeqNoFieldMapper.SequenceIDFields seqID;
private final Set<String> fieldsAppliedFromTemplates;

private FieldArrayContext fieldArrayContext;

/**
* Fields that are copied from values of other fields via copy_to.
* This per-document state is needed since it is possible
@@ -460,6 +487,33 @@ public boolean isCopyToDestinationField(String name) {
return copyToFields.contains(name);
}

public void processArrayOffsets(DocumentParserContext context) throws IOException {
if (fieldArrayContext != null) {
fieldArrayContext.addToLuceneDocument(context);
}
}

public FieldArrayContext getOffSetContext() {
if (fieldArrayContext == null) {
fieldArrayContext = new FieldArrayContext();
}
return fieldArrayContext;
}

private XContentParser.Token lastSetToken;

public void setImmediateXContentParent(XContentParser.Token token) {
this.lastSetToken = token;
}

public XContentParser.Token getImmediateXContentParent() {
return lastSetToken;
}

public boolean isImmediateParentAnArray() {
return lastSetToken == XContentParser.Token.START_ARRAY;
}

/**
* Add a new mapper dynamically created while parsing.
*
@@ -0,0 +1,93 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the "Elastic License
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
* Public License v 1"; you may not use this file except in compliance with, at
* your election, the "Elastic License 2.0", the "GNU Affero General Public
* License v3.0 only", or the "Server Side Public License, v 1".
*/

package org.elasticsearch.index.mapper;

import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.util.BitUtil;
import org.elasticsearch.common.io.stream.BytesStreamOutput;
import org.elasticsearch.common.io.stream.StreamInput;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

public class FieldArrayContext {

private final Map<String, Offsets> offsetsPerField = new HashMap<>();

void recordOffset(String field, String value) {
Offsets arrayOffsets = offsetsPerField.computeIfAbsent(field, k -> new Offsets());
int nextOffset = arrayOffsets.currentOffset++;
var offsets = arrayOffsets.valueToOffsets.computeIfAbsent(value, s -> new ArrayList<>(2));
offsets.add(nextOffset);
}

void recordNull(String field) {
Offsets arrayOffsets = offsetsPerField.computeIfAbsent(field, k -> new Offsets());
int nextOffset = arrayOffsets.currentOffset++;
arrayOffsets.nullValueOffsets.add(nextOffset);
}

void maybeRecordEmptyArray(String field) {
@lkts (Contributor) Feb 6, 2025
Do we need this if recordOffset and recordNull do the same thing in the beginning?

Member Author

This is invoked when END_ARRAY is encountered. At that point there may already be an entry for the field; if not, an empty Offsets instance is added, which is later used to record an empty array in the related offsets field.

offsetsPerField.computeIfAbsent(field, k -> new Offsets());
}

void addToLuceneDocument(DocumentParserContext context) throws IOException {
for (var entry : offsetsPerField.entrySet()) {
var fieldName = entry.getKey();
var offset = entry.getValue();

int currentOrd = 0;
// This array allows retaining the original ordering of elements in leaf arrays, including duplicates.
int[] offsetToOrd = new int[offset.currentOffset];
for (var offsetEntry : offset.valueToOffsets.entrySet()) {
for (var offsetAndLevel : offsetEntry.getValue()) {
offsetToOrd[offsetAndLevel] = currentOrd;
}
currentOrd++;
}
for (var nullOffset : offset.nullValueOffsets) {
offsetToOrd[nullOffset] = -1;
}

try (var streamOutput = new BytesStreamOutput()) {
// A plain vint would do for the array length, but zigzag is used throughout so that my_field: null can be decoded as -1
streamOutput.writeVInt(BitUtil.zigZagEncode(offsetToOrd.length));
for (int ord : offsetToOrd) {
streamOutput.writeVInt(BitUtil.zigZagEncode(ord));
}
context.doc().add(new SortedDocValuesField(fieldName, streamOutput.bytes().toBytesRef()));
}
}
}

static int[] parseOffsetArray(StreamInput in) throws IOException {
int[] offsetToOrd = new int[BitUtil.zigZagDecode(in.readVInt())];
for (int i = 0; i < offsetToOrd.length; i++) {
offsetToOrd[i] = BitUtil.zigZagDecode(in.readVInt());
}
return offsetToOrd;
}

private static class Offsets {

int currentOffset;
// A TreeMap is needed here so that values are kept in their natural (sorted) order, which is the
// same order in which the values get stored in SortedSetDocValues. This ensures the ordinals
// assigned here line up with the ordinals in doc values.
final Map<String, List<Integer>> valueToOffsets = new TreeMap<>();
final List<Integer> nullValueOffsets = new ArrayList<>(2);

}

}
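The offset encoding in FieldArrayContext can be sketched in isolation. The following is a minimal, self-contained illustration (the class and helper names are hypothetical, not part of the PR): values in a leaf array are assigned ordinals in sorted order, mirroring SortedSetDocValues, nulls become -1, and each number is zigzag-encoded, equivalent to Lucene's BitUtil, so -1 fits in an unsigned vint. For the leaf array `["b", "a", "b", null]`, the sorted values are `["a", "b"]`, so the offset-to-ordinal array is `[1, 0, 1, -1]`.

```java
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

// Illustrative sketch of the offset-to-ordinal encoding; not the PR's actual classes.
public class OffsetEncodingSketch {

    // Same transform as Lucene's BitUtil.zigZagEncode(int): -1 -> 1, 0 -> 0, 1 -> 2, ...
    static int zigZagEncode(int i) {
        return (i >> 31) ^ (i << 1);
    }

    static int zigZagDecode(int i) {
        return (i >>> 1) ^ -(i & 1);
    }

    // Build the offset-to-ordinal array for one leaf array: values get ordinals in
    // sorted order (as in SortedSetDocValues), nulls get -1.
    static int[] offsetToOrd(List<String> leafArray) {
        Map<String, List<Integer>> valueToOffsets = new TreeMap<>();
        List<Integer> nullOffsets = new ArrayList<>();
        for (int offset = 0; offset < leafArray.size(); offset++) {
            String value = leafArray.get(offset);
            if (value == null) {
                nullOffsets.add(offset);
            } else {
                valueToOffsets.computeIfAbsent(value, k -> new ArrayList<>()).add(offset);
            }
        }
        int[] offsetToOrd = new int[leafArray.size()];
        int ord = 0;
        for (List<Integer> offsets : valueToOffsets.values()) {
            for (int offset : offsets) {
                offsetToOrd[offset] = ord;
            }
            ord++;
        }
        for (int offset : nullOffsets) {
            offsetToOrd[offset] = -1;
        }
        return offsetToOrd;
    }

    public static void main(String[] args) {
        int[] ords = offsetToOrd(Arrays.asList("b", "a", "b", null));
        System.out.println(Arrays.toString(ords)); // [1, 0, 1, -1]
        for (int o : ords) {
            // zigzag round-trips every ordinal, including -1
            if (zigZagDecode(zigZagEncode(o)) != o) throw new AssertionError();
        }
    }
}
```

Reading a document back then only requires decoding the vints and using each ordinal to index into the sorted doc values, which restores the original element order and duplicates.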
@@ -200,15 +200,15 @@ public void parse(DocumentParserContext context) throws IOException {
}
}

private void doParseMultiFields(DocumentParserContext context) throws IOException {
protected void doParseMultiFields(DocumentParserContext context) throws IOException {
context.path().add(leafName());
for (FieldMapper mapper : builderParams.multiFields.mappers) {
mapper.parse(context);
}
context.path().remove();
}

private static void throwIndexingWithScriptParam() {
protected static void throwIndexingWithScriptParam() {
throw new IllegalArgumentException("Cannot index data directly into a field with a [script] parameter");
}
