Skip to content

Commit 43ddfc9

Browse files
committed
[8.x] Store arrays offsets for keyword fields natively with synthetic source
Backport of elastic#113757 to the 8.x branch. The keyword doc values field gets an extra sorted doc values field that encodes the order in which array values were specified at index time. This also captures duplicate values. The order is stored as an offset-to-ordinal array that gets zigzag vint encoded into a sorted doc values field. For example, take the following string array for a keyword field: ["c", "b", "a", "c"]. Sorted set doc values: ["a", "b", "c"] with ordinals 0, 1 and 2. The offset array will be: [2, 1, 0, 2]. Null values are also supported. For example, ["c", "b", null, "c"] results in sorted set doc values ["b", "c"] with ordinals 0 and 1. The offset array will be: [1, 0, -1, 1]. Empty arrays are also supported by encoding a zigzag vint array of zero elements. Limitations: (1) currently there is only doc values based array support, and only for the keyword field mapper; (2) multi-level leaf arrays are flattened, for example [[b], [c]] -> [b, c]; (3) arrays are always synthesized as one type, so in the case of a keyword field [1, 2] gets synthesized as ["1", "2"]. These limitations can be addressed, but some require more complexity and/or additional storage. With this PR, keyword field arrays will no longer be stored in ignored source; instead, array offsets are tracked in an adjacent sorted doc values field. This only applies if index.mapping.synthetic_source_keep is set to arrays (the default for logsdb).
1 parent 1420de7 commit 43ddfc9

File tree

19 files changed

+1095
-33
lines changed

19 files changed

+1095
-33
lines changed

docs/changelog/113757.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 113757
2+
summary: Store arrays offsets for keyword fields natively with synthetic source instead of falling back to ignored source.
3+
area: Mapping
4+
type: enhancement
5+
issues: []

rest-api-spec/build.gradle

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -276,4 +276,8 @@ tasks.named("yamlRestTestV7CompatTransform").configure({ task ->
276276
task.skipTest("search.vectors/130_knn_query_nested_search/nested kNN search inner_hits size > 1", "waiting for #118774 backport")
277277
task.skipTest("search.vectors/110_knn_query_with_filter/PRE_FILTER: pre-filter across multiple aliases", "waiting for #118774 backport")
278278
task.skipTest("search.vectors/160_knn_query_missing_params/kNN search in a dis_max query - missing num_candidates", "waiting for #118774 backport")
279+
task.skipTest("logsdb/10_settings/routing path allowed in logs mode with routing on sort fields", "Unknown feature routing.logsb_route_on_sort_fields")
280+
task.skipTest("indices.create/21_synthetic_source_stored/index param - field ordering", "Synthetic source keep arrays now stores leaf arrays natively")
281+
task.skipTest("indices.create/21_synthetic_source_stored/field param - keep nested array", "Synthetic source keep arrays now stores leaf arrays natively")
282+
task.skipTest("indices.create/21_synthetic_source_stored/field param - keep root array", "Synthetic source keep arrays now stores leaf arrays natively")
279283
})

rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/indices.create/20_synthetic_source.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -982,7 +982,7 @@ subobjects auto:
982982
- match: { hits.hits.0._source.foo: 10 }
983983
- match: { hits.hits.0._source.foo\.bar: 100 }
984984
- match: { hits.hits.0._source.regular.span.id: "1" }
985-
- match: { hits.hits.0._source.regular.trace.id: [ "a", "b" ] }
985+
- match: { hits.hits.0._source.regular.trace.id: ["a", "b" ] }
986986
- match: { hits.hits.1._source.id: 2 }
987987
- match: { hits.hits.1._source.foo: 20 }
988988
- match: { hits.hits.1._source.foo\.bar: 200 }

rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/indices.create/21_synthetic_source_stored.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1091,7 +1091,7 @@ index param - field ordering:
10911091
index: test
10921092

10931093
- length: { hits.hits.0._source: 4 }
1094-
- match: { hits.hits.0._source: { "a": "2", "b": [ { "bb": 100, "aa": 200 }, { "aa": 300, "bb": 400 } ], "c": [30, 20, 10], "d": [ { "bb": 10, "aa": 20 }, { "aa": 30, "bb": 40 } ] } }
1094+
- match: { hits.hits.0._source: { "a": "2", "b": [ { "bb": 100, "aa": 200 }, { "aa": 300, "bb": 400 } ], "c": ["30", "20", "10"], "d": [ { "bb": 10, "aa": 20 }, { "aa": 30, "bb": 40 } ] } }
10951095

10961096

10971097
---

server/src/main/java/org/elasticsearch/index/IndexVersions.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,7 @@ private static IndexVersion def(int id, Version luceneVersion) {
127127
public static final IndexVersion INFERENCE_METADATA_FIELDS_BACKPORT = def(8_524_0_00, Version.LUCENE_9_12_1);
128128
public static final IndexVersion LOGSB_OPTIONAL_SORTING_ON_HOST_NAME_BACKPORT = def(8_525_0_00, Version.LUCENE_9_12_1);
129129
public static final IndexVersion USE_SYNTHETIC_SOURCE_FOR_RECOVERY_BY_DEFAULT_BACKPORT = def(8_526_0_00, Version.LUCENE_9_12_1);
130+
public static final IndexVersion SYNTHETIC_SOURCE_STORE_ARRAYS_NATIVELY_KEYWORD = def(8_527_0_00, Version.LUCENE_9_12_1);
130131
/*
131132
* STOP! READ THIS FIRST! No, really,
132133
* ____ _____ ___ ____ _ ____ _____ _ ____ _____ _ _ ___ ____ _____ ___ ____ ____ _____ _

server/src/main/java/org/elasticsearch/index/mapper/DocumentParser.java

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,7 @@ private void internalParseDocument(MetadataFieldMapper[] metadataFieldsMappers,
154154

155155
executeIndexTimeScripts(context);
156156

157+
context.processArrayOffsets(context);
157158
for (MetadataFieldMapper metadataMapper : metadataFieldsMappers) {
158159
metadataMapper.postParse(context);
159160
}
@@ -519,6 +520,7 @@ private static void throwOnCopyToOnObject(Mapper mapper, List<String> copyToFiel
519520

520521
private static void parseObject(final DocumentParserContext context, String currentFieldName) throws IOException {
521522
assert currentFieldName != null;
523+
context.setImmediateXContentParent(context.parser().currentToken());
522524
Mapper objectMapper = context.getMapper(currentFieldName);
523525
if (objectMapper != null) {
524526
doParseObject(context, currentFieldName, objectMapper);
@@ -611,6 +613,12 @@ private static void throwOnCreateDynamicNestedViaCopyTo(Mapper dynamicObjectMapp
611613
}
612614

613615
private static void parseArray(DocumentParserContext context, String lastFieldName) throws IOException {
616+
// Record previous immediate parent, so that it can be reset after array has been parsed.
617+
// This is for recording array offset with synthetic source. Only if the immediate parent is an array,
618+
// then the offsets can be accounted accurately.
619+
var prev = context.getImmediateXContentParent();
620+
context.setImmediateXContentParent(context.parser().currentToken());
621+
614622
Mapper mapper = getLeafMapper(context, lastFieldName);
615623
if (mapper != null) {
616624
// There is a concrete mapper for this field already. Need to check if the mapper
@@ -624,6 +632,8 @@ private static void parseArray(DocumentParserContext context, String lastFieldNa
624632
} else {
625633
parseArrayDynamic(context, lastFieldName);
626634
}
635+
// Reset previous immediate parent
636+
context.setImmediateXContentParent(prev);
627637
}
628638

629639
private static void parseArrayDynamic(DocumentParserContext context, String currentFieldName) throws IOException {
@@ -688,11 +698,12 @@ private static void parseNonDynamicArray(
688698
final String lastFieldName,
689699
String arrayFieldName
690700
) throws IOException {
701+
boolean supportStoringArrayOffsets = mapper != null && mapper.supportStoringArrayOffsets();
691702
String fullPath = context.path().pathAsText(arrayFieldName);
692703

693704
// Check if we need to record the array source. This only applies to synthetic source.
694705
boolean canRemoveSingleLeafElement = false;
695-
if (context.canAddIgnoredField()) {
706+
if (context.canAddIgnoredField() && supportStoringArrayOffsets == false) {
696707
Mapper.SourceKeepMode mode = Mapper.SourceKeepMode.NONE;
697708
boolean objectWithFallbackSyntheticSource = false;
698709
if (mapper instanceof ObjectMapper objectMapper) {
@@ -736,6 +747,7 @@ private static void parseNonDynamicArray(
736747

737748
XContentParser parser = context.parser();
738749
XContentParser.Token token;
750+
XContentParser.Token previousToken = parser.currentToken();
739751
int elements = 0;
740752
while ((token = parser.nextToken()) != XContentParser.Token.END_ARRAY) {
741753
if (token == XContentParser.Token.START_OBJECT) {
@@ -754,6 +766,14 @@ private static void parseNonDynamicArray(
754766
elements++;
755767
parseValue(context, lastFieldName);
756768
}
769+
previousToken = token;
770+
}
771+
if (mapper != null
772+
&& context.canAddIgnoredField()
773+
&& mapper.supportStoringArrayOffsets()
774+
&& previousToken == XContentParser.Token.START_ARRAY
775+
&& context.isImmediateParentAnArray()) {
776+
context.getOffSetContext().maybeRecordEmptyArray(mapper.getOffsetFieldName());
757777
}
758778
if (elements <= 1 && canRemoveSingleLeafElement) {
759779
context.removeLastIgnoredField(fullPath);

server/src/main/java/org/elasticsearch/index/mapper/DocumentParserContext.java

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,31 @@ public LuceneDocument doc() {
9191
protected void addDoc(LuceneDocument doc) {
9292
in.addDoc(doc);
9393
}
94+
95+
/**
 * Forwards to the wrapped context {@code in}, so the array offsets are processed by the delegate.
 */
@Override
96+
public void processArrayOffsets(DocumentParserContext context) throws IOException {
97+
in.processArrayOffsets(context);
98+
}
99+
100+
/**
 * Returns the {@link FieldArrayContext} of the wrapped context {@code in}, so that all wrappers
 * record array offsets into the same instance as their delegate.
 */
@Override
101+
public FieldArrayContext getOffSetContext() {
102+
return in.getOffSetContext();
103+
}
104+
105+
/**
 * Forwards the immediate-parent token to the wrapped context {@code in}.
 */
@Override
106+
public void setImmediateXContentParent(XContentParser.Token token) {
107+
in.setImmediateXContentParent(token);
108+
}
109+
110+
/**
 * Returns the immediate-parent token as tracked by the wrapped context {@code in}.
 */
@Override
111+
public XContentParser.Token getImmediateXContentParent() {
112+
return in.getImmediateXContentParent();
113+
}
114+
115+
/**
 * Asks the wrapped context {@code in} whether the immediate parent is an array.
 */
@Override
116+
public boolean isImmediateParentAnArray() {
117+
return in.isImmediateParentAnArray();
118+
}
94119
}
95120

96121
/**
@@ -141,6 +166,8 @@ private enum Scope {
141166
private final SeqNoFieldMapper.SequenceIDFields seqID;
142167
private final Set<String> fieldsAppliedFromTemplates;
143168

169+
private FieldArrayContext fieldArrayContext;
170+
144171
/**
145172
* Fields that are copied from values of other fields via copy_to.
146173
* This per-document state is needed since it is possible
@@ -460,6 +487,33 @@ public boolean isCopyToDestinationField(String name) {
460487
return copyToFields.contains(name);
461488
}
462489

490+
/**
 * Hands the array offsets collected for this document to
 * {@link FieldArrayContext#addToLuceneDocument(DocumentParserContext)}.
 * Does nothing when no {@link FieldArrayContext} has been created, i.e. {@code fieldArrayContext} is still null.
 *
 * @param context the context that is passed on to {@code addToLuceneDocument}
 * @throws IOException if adding the offsets fails
 */
public void processArrayOffsets(DocumentParserContext context) throws IOException {
491+
if (fieldArrayContext != null) {
492+
fieldArrayContext.addToLuceneDocument(context);
493+
}
494+
}
495+
496+
/**
 * Returns the {@link FieldArrayContext} used to record array offsets, creating it on first use.
 *
 * @return the (never null) offsets context held by this parser context
 */
public FieldArrayContext getOffSetContext() {
497+
// Created lazily, so contexts that never record an offset never allocate one.
if (fieldArrayContext == null) {
498+
fieldArrayContext = new FieldArrayContext();
499+
}
500+
return fieldArrayContext;
501+
}
502+
503+
private XContentParser.Token lastSetToken;
504+
505+
/**
 * Remembers {@code token} as the immediate XContent parent, replacing whatever was set before.
 * There is no stack here: a caller that needs the previous value back has to read it with
 * {@link #getImmediateXContentParent()} first and set it again afterwards.
 *
 * @param token the token to remember
 */
public void setImmediateXContentParent(XContentParser.Token token) {
506+
this.lastSetToken = token;
507+
}
508+
509+
/**
 * Returns the token most recently passed to {@link #setImmediateXContentParent(XContentParser.Token)}.
 *
 * @return the last token that was set; presumably {@code null} if none was set yet (the field has no initializer)
 */
public XContentParser.Token getImmediateXContentParent() {
510+
return lastSetToken;
511+
}
512+
513+
/**
 * Tells whether the token most recently set via {@link #setImmediateXContentParent(XContentParser.Token)}
 * is {@code START_ARRAY}.
 *
 * @return {@code true} only if the last set token is {@link XContentParser.Token#START_ARRAY}
 */
public boolean isImmediateParentAnArray() {
514+
return lastSetToken == XContentParser.Token.START_ARRAY;
515+
}
516+
463517
/**
464518
* Add a new mapper dynamically created while parsing.
465519
*
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the "Elastic License
4+
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
5+
* Public License v 1"; you may not use this file except in compliance with, at
6+
* your election, the "Elastic License 2.0", the "GNU Affero General Public
7+
* License v3.0 only", or the "Server Side Public License, v 1".
8+
*/
9+
10+
package org.elasticsearch.index.mapper;
11+
12+
import org.apache.lucene.document.SortedDocValuesField;
13+
import org.apache.lucene.util.BitUtil;
14+
import org.elasticsearch.common.io.stream.BytesStreamOutput;
15+
import org.elasticsearch.common.io.stream.StreamInput;
16+
17+
import java.io.IOException;
18+
import java.util.ArrayList;
19+
import java.util.HashMap;
20+
import java.util.List;
21+
import java.util.Map;
22+
import java.util.TreeMap;
23+
24+
public class FieldArrayContext {
25+
26+
private final Map<String, Offsets> offsetsPerField = new HashMap<>();
27+
28+
void recordOffset(String field, String value) {
29+
Offsets arrayOffsets = offsetsPerField.computeIfAbsent(field, k -> new Offsets());
30+
int nextOffset = arrayOffsets.currentOffset++;
31+
var offsets = arrayOffsets.valueToOffsets.computeIfAbsent(value, s -> new ArrayList<>(2));
32+
offsets.add(nextOffset);
33+
}
34+
35+
void recordNull(String field) {
36+
Offsets arrayOffsets = offsetsPerField.computeIfAbsent(field, k -> new Offsets());
37+
int nextOffset = arrayOffsets.currentOffset++;
38+
arrayOffsets.nullValueOffsets.add(nextOffset);
39+
}
40+
41+
void maybeRecordEmptyArray(String field) {
42+
offsetsPerField.computeIfAbsent(field, k -> new Offsets());
43+
}
44+
45+
void addToLuceneDocument(DocumentParserContext context) throws IOException {
46+
for (var entry : offsetsPerField.entrySet()) {
47+
var fieldName = entry.getKey();
48+
var offset = entry.getValue();
49+
50+
int currentOrd = 0;
51+
// This array allows to retain the original ordering of elements in leaf arrays and retain duplicates.
52+
int[] offsetToOrd = new int[offset.currentOffset];
53+
for (var offsetEntry : offset.valueToOffsets.entrySet()) {
54+
for (var offsetAndLevel : offsetEntry.getValue()) {
55+
offsetToOrd[offsetAndLevel] = currentOrd;
56+
}
57+
currentOrd++;
58+
}
59+
for (var nullOffset : offset.nullValueOffsets) {
60+
offsetToOrd[nullOffset] = -1;
61+
}
62+
63+
try (var streamOutput = new BytesStreamOutput()) {
64+
// Could just use vint for array length, but this allows for decoding my_field: null as -1
65+
streamOutput.writeVInt(BitUtil.zigZagEncode(offsetToOrd.length));
66+
for (int ord : offsetToOrd) {
67+
streamOutput.writeVInt(BitUtil.zigZagEncode(ord));
68+
}
69+
context.doc().add(new SortedDocValuesField(fieldName, streamOutput.bytes().toBytesRef()));
70+
}
71+
}
72+
}
73+
74+
static int[] parseOffsetArray(StreamInput in) throws IOException {
75+
int[] offsetToOrd = new int[BitUtil.zigZagDecode(in.readVInt())];
76+
for (int i = 0; i < offsetToOrd.length; i++) {
77+
offsetToOrd[i] = BitUtil.zigZagDecode(in.readVInt());
78+
}
79+
return offsetToOrd;
80+
}
81+
82+
private static class Offsets {
83+
84+
int currentOffset;
85+
// Need to use TreeMap here, so that we maintain the order in which each value (with offset) stored inserted,
86+
// (which is in the same order the document gets parsed) so we store offsets in right order. This is the same
87+
// order in what the values get stored in SortedSetDocValues.
88+
final Map<String, List<Integer>> valueToOffsets = new TreeMap<>();
89+
final List<Integer> nullValueOffsets = new ArrayList<>(2);
90+
91+
}
92+
93+
}

server/src/main/java/org/elasticsearch/index/mapper/FieldMapper.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -201,15 +201,15 @@ public void parse(DocumentParserContext context) throws IOException {
201201
}
202202
}
203203

204-
private void doParseMultiFields(DocumentParserContext context) throws IOException {
204+
/**
 * Runs {@code parse} of every multi-field sub-mapper of this field against {@code context}.
 * The leaf name of this mapper is pushed onto the context path before the sub-mappers run and
 * removed again afterwards.
 *
 * @param context the document parser context handed to each sub-mapper
 * @throws IOException if one of the sub-mappers fails while parsing
 */
protected void doParseMultiFields(DocumentParserContext context) throws IOException {
205205
context.path().add(leafName());
206206
for (FieldMapper mapper : builderParams.multiFields.mappers) {
207207
mapper.parse(context);
208208
}
209209
context.path().remove();
210210
}
211211

212-
private static void throwIndexingWithScriptParam() {
212+
/**
 * Always throws; used to reject a document that supplies a value for a field configured with a
 * {@code script} parameter.
 *
 * @throws IllegalArgumentException unconditionally
 */
protected static void throwIndexingWithScriptParam() {
213213
throw new IllegalArgumentException("Cannot index data directly into a field with a [script] parameter");
214214
}
215215

0 commit comments

Comments
 (0)