From 43ddfc9b37e46e18d313ae75454b9c6807bf7ccb Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Thu, 20 Feb 2025 09:20:49 +0100 Subject: [PATCH] [8.x] Store array offsets for keyword fields natively with synthetic source Backporting #113757 to the 8.x branch. The keyword doc values field gets an extra sorted doc values field that encodes the order in which array values were specified at index time. This also captures duplicate values. The order is stored as an offset-to-ordinal array that gets zigzag vint encoded into a sorted doc values field. For example, given the following string array for a keyword field: ["c", "b", "a", "c"]. Sorted set doc values: ["a", "b", "c"] with ordinals: 0, 1 and 2. The offset array will be: [2, 1, 0, 2] Null values are also supported. For example, ["c", "b", null, "c"] results in sorted set doc values: ["b", "c"] with ordinals: 0 and 1. The offset array will be: [1, 0, -1, 1] Empty arrays are also supported by encoding a zigzag vint array of zero elements. (Worked sketches of this encoding and of the read-time reconstruction appear after the patch.) Limitations: currently there is only doc-values-based array support, and only for the keyword field mapper. Multi-level leaf arrays are flattened. For example: [[b], [c]] -> [b, c] Arrays are always synthesized as one type. In the case of a keyword field, [1, 2] gets synthesized as ["1", "2"]. These limitations can be addressed, but some require more complexity and/or additional storage. With this PR, keyword field arrays will no longer be stored in ignored source; instead, array offsets are kept track of in an adjacent sorted doc values field. This only applies if index.mapping.synthetic_source_keep is set to arrays (the default for logsdb). --- docs/changelog/113757.yaml | 5 + rest-api-spec/build.gradle | 4 + .../indices.create/20_synthetic_source.yml | 2 +- .../21_synthetic_source_stored.yml | 2 +- .../elasticsearch/index/IndexVersions.java | 1 + .../index/mapper/DocumentParser.java | 22 +- .../index/mapper/DocumentParserContext.java | 54 +++ .../index/mapper/FieldArrayContext.java | 93 +++++ .../index/mapper/FieldMapper.java | 4 +- .../index/mapper/KeywordFieldMapper.java | 106 ++++-- .../elasticsearch/index/mapper/Mapper.java | 15 + ...etsDocValuesSyntheticFieldLoaderLayer.java | 167 +++++++++ .../index/mapper/FieldArrayContextTests.java | 67 ++++ .../index/mapper/KeywordFieldMapperTests.java | 6 + .../index/mapper/KeywordFieldTypeTests.java | 3 +- .../KeywordOffsetDocValuesLoaderTests.java | 237 +++++++++++++ ...eticSourceNativeArrayIntegrationTests.java | 331 ++++++++++++++++++ .../index/mapper/MultiFieldsTests.java | 3 +- .../index/mapper/MapperTestCase.java | 6 +- 19 files changed, 1095 insertions(+), 33 deletions(-) create mode 100644 docs/changelog/113757.yaml create mode 100644 server/src/main/java/org/elasticsearch/index/mapper/FieldArrayContext.java create mode 100644 server/src/main/java/org/elasticsearch/index/mapper/SortedSetWithOffsetsDocValuesSyntheticFieldLoaderLayer.java create mode 100644 server/src/test/java/org/elasticsearch/index/mapper/FieldArrayContextTests.java create mode 100644 server/src/test/java/org/elasticsearch/index/mapper/KeywordOffsetDocValuesLoaderTests.java create mode 100644 server/src/test/java/org/elasticsearch/index/mapper/KeywordSyntheticSourceNativeArrayIntegrationTests.java diff --git a/docs/changelog/113757.yaml b/docs/changelog/113757.yaml new file mode 100644 index 0000000000000..30e173d80b2a7 --- /dev/null +++ b/docs/changelog/113757.yaml @@ -0,0 +1,5 @@ +pr: 113757 +summary: Store array offsets for keyword fields natively with synthetic source instead of falling back to ignored source.
+area: Mapping +type: enhancement +issues: [] diff --git a/rest-api-spec/build.gradle b/rest-api-spec/build.gradle index 03ee3c7b52764..c8bff378862c2 100644 --- a/rest-api-spec/build.gradle +++ b/rest-api-spec/build.gradle @@ -276,4 +276,8 @@ tasks.named("yamlRestTestV7CompatTransform").configure({ task -> task.skipTest("search.vectors/130_knn_query_nested_search/nested kNN search inner_hits size > 1", "waiting for #118774 backport") task.skipTest("search.vectors/110_knn_query_with_filter/PRE_FILTER: pre-filter across multiple aliases", "waiting for #118774 backport") task.skipTest("search.vectors/160_knn_query_missing_params/kNN search in a dis_max query - missing num_candidates", "waiting for #118774 backport") + task.skipTest("logsdb/10_settings/routing path allowed in logs mode with routing on sort fields", "Unknown feature routing.logsb_route_on_sort_fields") + task.skipTest("indices.create/21_synthetic_source_stored/index param - field ordering", "Synthetic source keep arrays now stores leaf arrays natively") + task.skipTest("indices.create/21_synthetic_source_stored/field param - keep nested array", "Synthetic source keep arrays now stores leaf arrays natively") + task.skipTest("indices.create/21_synthetic_source_stored/field param - keep root array", "Synthetic source keep arrays now stores leaf arrays natively") }) diff --git a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/indices.create/20_synthetic_source.yml b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/indices.create/20_synthetic_source.yml index 97e6e96dc16f2..6857743e27489 100644 --- a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/indices.create/20_synthetic_source.yml +++ b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/indices.create/20_synthetic_source.yml @@ -982,7 +982,7 @@ subobjects auto: - match: { hits.hits.0._source.foo: 10 } - match: { hits.hits.0._source.foo\.bar: 100 } - match: { hits.hits.0._source.regular.span.id: "1" } - - match: { hits.hits.0._source.regular.trace.id: [ "a", "b" ] } + - match: { hits.hits.0._source.regular.trace.id: ["a", "b" ] } - match: { hits.hits.1._source.id: 2 } - match: { hits.hits.1._source.foo: 20 } - match: { hits.hits.1._source.foo\.bar: 200 } diff --git a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/indices.create/21_synthetic_source_stored.yml b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/indices.create/21_synthetic_source_stored.yml index 262e646cc882b..dd2e106b59204 100644 --- a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/indices.create/21_synthetic_source_stored.yml +++ b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/indices.create/21_synthetic_source_stored.yml @@ -1091,7 +1091,7 @@ index param - field ordering: index: test - length: { hits.hits.0._source: 4 } - - match: { hits.hits.0._source: { "a": "2", "b": [ { "bb": 100, "aa": 200 }, { "aa": 300, "bb": 400 } ], "c": [30, 20, 10], "d": [ { "bb": 10, "aa": 20 }, { "aa": 30, "bb": 40 } ] } } + - match: { hits.hits.0._source: { "a": "2", "b": [ { "bb": 100, "aa": 200 }, { "aa": 300, "bb": 400 } ], "c": ["30", "20", "10"], "d": [ { "bb": 10, "aa": 20 }, { "aa": 30, "bb": 40 } ] } } --- diff --git a/server/src/main/java/org/elasticsearch/index/IndexVersions.java b/server/src/main/java/org/elasticsearch/index/IndexVersions.java index 710464e1f8276..03392f7e2e165 100644 --- a/server/src/main/java/org/elasticsearch/index/IndexVersions.java +++ b/server/src/main/java/org/elasticsearch/index/IndexVersions.java @@ 
-127,6 +127,7 @@ private static IndexVersion def(int id, Version luceneVersion) { public static final IndexVersion INFERENCE_METADATA_FIELDS_BACKPORT = def(8_524_0_00, Version.LUCENE_9_12_1); public static final IndexVersion LOGSB_OPTIONAL_SORTING_ON_HOST_NAME_BACKPORT = def(8_525_0_00, Version.LUCENE_9_12_1); public static final IndexVersion USE_SYNTHETIC_SOURCE_FOR_RECOVERY_BY_DEFAULT_BACKPORT = def(8_526_0_00, Version.LUCENE_9_12_1); + public static final IndexVersion SYNTHETIC_SOURCE_STORE_ARRAYS_NATIVELY_KEYWORD = def(8_527_0_00, Version.LUCENE_9_12_1); /* * STOP! READ THIS FIRST! No, really, * ____ _____ ___ ____ _ ____ _____ _ ____ _____ _ _ ___ ____ _____ ___ ____ ____ _____ _ diff --git a/server/src/main/java/org/elasticsearch/index/mapper/DocumentParser.java b/server/src/main/java/org/elasticsearch/index/mapper/DocumentParser.java index 99f4be4dfb6d4..1fc964ea7d0e6 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/DocumentParser.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/DocumentParser.java @@ -154,6 +154,7 @@ private void internalParseDocument(MetadataFieldMapper[] metadataFieldsMappers, executeIndexTimeScripts(context); + context.processArrayOffsets(context); for (MetadataFieldMapper metadataMapper : metadataFieldsMappers) { metadataMapper.postParse(context); } @@ -519,6 +520,7 @@ private static void throwOnCopyToOnObject(Mapper mapper, List copyToFiel private static void parseObject(final DocumentParserContext context, String currentFieldName) throws IOException { assert currentFieldName != null; + context.setImmediateXContentParent(context.parser().currentToken()); Mapper objectMapper = context.getMapper(currentFieldName); if (objectMapper != null) { doParseObject(context, currentFieldName, objectMapper); @@ -611,6 +613,12 @@ private static void throwOnCreateDynamicNestedViaCopyTo(Mapper dynamicObjectMapp } private static void parseArray(DocumentParserContext context, String lastFieldName) throws IOException { + // Record the previous immediate parent, so that it can be restored after the array has been parsed. + // This is for recording array offsets with synthetic source. Offsets can only be accounted for + // accurately if the immediate parent is an array. + var prev = context.getImmediateXContentParent(); + context.setImmediateXContentParent(context.parser().currentToken()); + Mapper mapper = getLeafMapper(context, lastFieldName); if (mapper != null) { // There is a concrete mapper for this field already. Need to check if the mapper @@ -624,6 +632,8 @@ private static void parseArray(DocumentParserContext context, String lastFieldNa } else { parseArrayDynamic(context, lastFieldName); } + // Restore the previous immediate parent + context.setImmediateXContentParent(prev); } private static void parseArrayDynamic(DocumentParserContext context, String currentFieldName) throws IOException { @@ -688,11 +698,12 @@ private static void parseNonDynamicArray( final String lastFieldName, String arrayFieldName ) throws IOException { + boolean supportStoringArrayOffsets = mapper != null && mapper.supportStoringArrayOffsets(); String fullPath = context.path().pathAsText(arrayFieldName); // Check if we need to record the array source. This only applies to synthetic source.
boolean canRemoveSingleLeafElement = false; - if (context.canAddIgnoredField()) { + if (context.canAddIgnoredField() && supportStoringArrayOffsets == false) { Mapper.SourceKeepMode mode = Mapper.SourceKeepMode.NONE; boolean objectWithFallbackSyntheticSource = false; if (mapper instanceof ObjectMapper objectMapper) { @@ -736,6 +747,7 @@ private static void parseNonDynamicArray( XContentParser parser = context.parser(); XContentParser.Token token; + XContentParser.Token previousToken = parser.currentToken(); int elements = 0; while ((token = parser.nextToken()) != XContentParser.Token.END_ARRAY) { if (token == XContentParser.Token.START_OBJECT) { @@ -754,6 +766,14 @@ private static void parseNonDynamicArray( elements++; parseValue(context, lastFieldName); } + previousToken = token; + } + if (mapper != null + && context.canAddIgnoredField() + && mapper.supportStoringArrayOffsets() + && previousToken == XContentParser.Token.START_ARRAY + && context.isImmediateParentAnArray()) { + context.getOffSetContext().maybeRecordEmptyArray(mapper.getOffsetFieldName()); } if (elements <= 1 && canRemoveSingleLeafElement) { context.removeLastIgnoredField(fullPath); diff --git a/server/src/main/java/org/elasticsearch/index/mapper/DocumentParserContext.java b/server/src/main/java/org/elasticsearch/index/mapper/DocumentParserContext.java index 9ab6265257aa2..48a2d1662e4da 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/DocumentParserContext.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/DocumentParserContext.java @@ -91,6 +91,31 @@ public LuceneDocument doc() { protected void addDoc(LuceneDocument doc) { in.addDoc(doc); } + + @Override + public void processArrayOffsets(DocumentParserContext context) throws IOException { + in.processArrayOffsets(context); + } + + @Override + public FieldArrayContext getOffSetContext() { + return in.getOffSetContext(); + } + + @Override + public void setImmediateXContentParent(XContentParser.Token token) { + in.setImmediateXContentParent(token); + } + + @Override + public XContentParser.Token getImmediateXContentParent() { + return in.getImmediateXContentParent(); + } + + @Override + public boolean isImmediateParentAnArray() { + return in.isImmediateParentAnArray(); + } } /** @@ -141,6 +166,8 @@ private enum Scope { private final SeqNoFieldMapper.SequenceIDFields seqID; private final Set<String> fieldsAppliedFromTemplates; + private FieldArrayContext fieldArrayContext; + /** * Fields that are copied from values of other fields via copy_to. * This per-document state is needed since it is possible @@ -460,6 +487,33 @@ public boolean isCopyToDestinationField(String name) { return copyToFields.contains(name); } + public void processArrayOffsets(DocumentParserContext context) throws IOException { + if (fieldArrayContext != null) { + fieldArrayContext.addToLuceneDocument(context); + } + } + + public FieldArrayContext getOffSetContext() { + if (fieldArrayContext == null) { + fieldArrayContext = new FieldArrayContext(); + } + return fieldArrayContext; + } + + private XContentParser.Token lastSetToken; + + public void setImmediateXContentParent(XContentParser.Token token) { + this.lastSetToken = token; + } + + public XContentParser.Token getImmediateXContentParent() { + return lastSetToken; + } + + public boolean isImmediateParentAnArray() { + return lastSetToken == XContentParser.Token.START_ARRAY; + } + /** * Add a new mapper dynamically created while parsing.
* diff --git a/server/src/main/java/org/elasticsearch/index/mapper/FieldArrayContext.java b/server/src/main/java/org/elasticsearch/index/mapper/FieldArrayContext.java new file mode 100644 index 0000000000000..523ac19524ee2 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/index/mapper/FieldArrayContext.java @@ -0,0 +1,93 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.index.mapper; + +import org.apache.lucene.document.SortedDocValuesField; +import org.apache.lucene.util.BitUtil; +import org.elasticsearch.common.io.stream.BytesStreamOutput; +import org.elasticsearch.common.io.stream.StreamInput; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; + +public class FieldArrayContext { + + private final Map<String, Offsets> offsetsPerField = new HashMap<>(); + + void recordOffset(String field, String value) { + Offsets arrayOffsets = offsetsPerField.computeIfAbsent(field, k -> new Offsets()); + int nextOffset = arrayOffsets.currentOffset++; + var offsets = arrayOffsets.valueToOffsets.computeIfAbsent(value, s -> new ArrayList<>(2)); + offsets.add(nextOffset); + } + + void recordNull(String field) { + Offsets arrayOffsets = offsetsPerField.computeIfAbsent(field, k -> new Offsets()); + int nextOffset = arrayOffsets.currentOffset++; + arrayOffsets.nullValueOffsets.add(nextOffset); + } + + void maybeRecordEmptyArray(String field) { + offsetsPerField.computeIfAbsent(field, k -> new Offsets()); + } + + void addToLuceneDocument(DocumentParserContext context) throws IOException { + for (var entry : offsetsPerField.entrySet()) { + var fieldName = entry.getKey(); + var offset = entry.getValue(); + + int currentOrd = 0; + // This array retains the original ordering of elements in leaf arrays, including duplicates.
+ int[] offsetToOrd = new int[offset.currentOffset]; + for (var offsetEntry : offset.valueToOffsets.entrySet()) { + for (var offsetAndLevel : offsetEntry.getValue()) { + offsetToOrd[offsetAndLevel] = currentOrd; + } + currentOrd++; + } + for (var nullOffset : offset.nullValueOffsets) { + offsetToOrd[nullOffset] = -1; + } + + try (var streamOutput = new BytesStreamOutput()) { + // Could just use a vint for the array length, but zigzag encoding leaves room for decoding my_field: null as -1 + streamOutput.writeVInt(BitUtil.zigZagEncode(offsetToOrd.length)); + for (int ord : offsetToOrd) { + streamOutput.writeVInt(BitUtil.zigZagEncode(ord)); + } + context.doc().add(new SortedDocValuesField(fieldName, streamOutput.bytes().toBytesRef())); + } + } + } + + static int[] parseOffsetArray(StreamInput in) throws IOException { + int[] offsetToOrd = new int[BitUtil.zigZagDecode(in.readVInt())]; + for (int i = 0; i < offsetToOrd.length; i++) { + offsetToOrd[i] = BitUtil.zigZagDecode(in.readVInt()); + } + return offsetToOrd; + } + + private static class Offsets { + + int currentOffset; + // A TreeMap is needed here so that unique values are kept in their natural (sorted) order, which is the + // same order in which SortedSetDocValues assigns its ordinals. This lets addToLuceneDocument() map each + // recorded offset to the correct ordinal. + final Map<String, List<Integer>> valueToOffsets = new TreeMap<>(); + final List<Integer> nullValueOffsets = new ArrayList<>(2); + + } + +} diff --git a/server/src/main/java/org/elasticsearch/index/mapper/FieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/FieldMapper.java index 7cf012fd298ff..9df9313133014 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/FieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/FieldMapper.java @@ -201,7 +201,7 @@ public void parse(DocumentParserContext context) throws IOException { } } - private void doParseMultiFields(DocumentParserContext context) throws IOException { + protected void doParseMultiFields(DocumentParserContext context) throws IOException { context.path().add(leafName()); for (FieldMapper mapper : builderParams.multiFields.mappers) { mapper.parse(context); @@ -209,7 +209,7 @@ private void doParseMultiFields(DocumentParserContext context) throws IOExceptio context.path().remove(); } - private static void throwIndexingWithScriptParam() { + protected static void throwIndexingWithScriptParam() { throw new IllegalArgumentException("Cannot index data directly into a field with a [script] parameter"); } diff --git a/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java index 5642ffe91a10c..b9c9f09238050 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java @@ -41,6 +41,7 @@ import org.elasticsearch.core.Nullable; import org.elasticsearch.features.NodeFeature; import org.elasticsearch.index.IndexVersion; +import org.elasticsearch.index.IndexVersions; import org.elasticsearch.index.analysis.IndexAnalyzers; import org.elasticsearch.index.analysis.NamedAnalyzer; import org.elasticsearch.index.fielddata.FieldData; @@ -94,6 +95,7 @@ public final class KeywordFieldMapper extends FieldMapper { static final NodeFeature KEYWORD_DIMENSION_IGNORE_ABOVE = new NodeFeature("mapper.keyword_dimension_ignore_above", true); static final NodeFeature KEYWORD_NORMALIZER_SYNTHETIC_SOURCE = new
NodeFeature("mapper.keyword_normalizer_synthetic_source", true); + public static final String OFFSETS_FIELD_NAME_SUFFIX = ".offsets"; public static class Defaults { public static final FieldType FIELD_TYPE; @@ -189,6 +191,7 @@ public static final class Builder extends FieldMapper.DimensionBuilder { private final IndexAnalyzers indexAnalyzers; private final ScriptCompiler scriptCompiler; private final IndexVersion indexCreatedVersion; + private final SourceKeepMode indexSourceKeepMode; public Builder(final String name, final MappingParserContext mappingParserContext) { this( @@ -196,7 +199,8 @@ public Builder(final String name, final MappingParserContext mappingParserContex mappingParserContext.getIndexAnalyzers(), mappingParserContext.scriptCompiler(), IGNORE_ABOVE_SETTING.get(mappingParserContext.getSettings()), - mappingParserContext.getIndexSettings().getIndexVersionCreated() + mappingParserContext.getIndexSettings().getIndexVersionCreated(), + mappingParserContext.getIndexSettings().sourceKeepMode() ); } @@ -205,7 +209,8 @@ public Builder(final String name, final MappingParserContext mappingParserContex IndexAnalyzers indexAnalyzers, ScriptCompiler scriptCompiler, int ignoreAboveDefault, - IndexVersion indexCreatedVersion + IndexVersion indexCreatedVersion, + SourceKeepMode indexSourceKeepMode ) { super(name); this.indexAnalyzers = indexAnalyzers; @@ -240,10 +245,11 @@ public Builder(final String name, final MappingParserContext mappingParserContex throw new IllegalArgumentException("[ignore_above] must be positive, got [" + v + "]"); } }); + this.indexSourceKeepMode = indexSourceKeepMode; } public Builder(String name, IndexVersion indexCreatedVersion) { - this(name, null, ScriptCompiler.NONE, Integer.MAX_VALUE, indexCreatedVersion); + this(name, null, ScriptCompiler.NONE, Integer.MAX_VALUE, indexCreatedVersion, SourceKeepMode.NONE); } public Builder ignoreAbove(int ignoreAbove) { @@ -377,13 +383,36 @@ public KeywordFieldMapper build(MapperBuilderContext context) { } super.hasScript = script.get() != null; super.onScriptError = onScriptError.getValue(); + + var sourceKeepMode = this.sourceKeepMode.orElse(indexSourceKeepMode); + String offsetsFieldName; + if (context.isSourceSynthetic() + && sourceKeepMode == SourceKeepMode.ARRAYS + && hasDocValues() + && fieldtype.stored() == false + && copyTo.copyToFields().isEmpty() + && multiFieldsBuilder.hasMultiFields() == false + && indexCreatedVersion.onOrAfter(IndexVersions.SYNTHETIC_SOURCE_STORE_ARRAYS_NATIVELY_KEYWORD)) { + // Skip stored, we will be synthesizing from stored fields, no point to keep track of the offsets + // Skip copy_to and multi fields, supporting that requires more work. 
However, copy_to usage is rare in metrics and + // logging use cases + + // keep track of value offsets so that we can reconstruct arrays from doc values in the order in which they were specified during indexing + // (if the field is stored then there is no point in doing this) + offsetsFieldName = context.buildFullName(leafName() + OFFSETS_FIELD_NAME_SUFFIX); + } else { + offsetsFieldName = null; + } + return new KeywordFieldMapper( leafName(), fieldtype, buildFieldType(context, fieldtype), builderParams(this, context), context.isSourceSynthetic(), - this, + this, + offsetsFieldName, + indexSourceKeepMode ); } } @@ -925,6 +954,8 @@ public boolean hasNormalizer() { private final IndexAnalyzers indexAnalyzers; private final int ignoreAboveDefault; private final int ignoreAbove; + private final String offsetsFieldName; + private final SourceKeepMode indexSourceKeepMode; private KeywordFieldMapper( String simpleName, @@ -932,7 +963,9 @@ private KeywordFieldMapper( KeywordFieldType mappedFieldType, BuilderParams builderParams, boolean isSyntheticSource, - Builder builder + Builder builder, + String offsetsFieldName, + SourceKeepMode indexSourceKeepMode ) { super(simpleName, mappedFieldType, builderParams); assert fieldType.indexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) <= 0; @@ -949,6 +982,8 @@ private KeywordFieldMapper( this.isSyntheticSource = isSyntheticSource; this.ignoreAboveDefault = builder.ignoreAboveDefault; this.ignoreAbove = builder.ignoreAbove.getValue(); + this.offsetsFieldName = offsetsFieldName; + this.indexSourceKeepMode = indexSourceKeepMode; } @Override @@ -957,9 +992,24 @@ public KeywordFieldType fieldType() { } @Override + public String getOffsetFieldName() { + return offsetsFieldName; + } + + @Override protected void parseCreateField(DocumentParserContext context) throws IOException { - final String value = context.parser().textOrNull(); - indexValue(context, value == null ?
fieldType().nullValue : value); + String value = context.parser().textOrNull(); + if (value == null) { + value = fieldType().nullValue; + } + + boolean indexed = indexValue(context, value); + if (offsetsFieldName != null && context.isImmediateParentAnArray() && context.getRecordedSource() == false) { + if (indexed) { + context.getOffSetContext().recordOffset(offsetsFieldName, value); + } else if (value == null) { + context.getOffSetContext().recordNull(offsetsFieldName); + } + } } @Override @@ -972,13 +1022,13 @@ protected void indexScriptValues( this.fieldType().scriptValues.valuesForDoc(searchLookup, readerContext, doc, value -> indexValue(documentParserContext, value)); } - private void indexValue(DocumentParserContext context, String value) { + private boolean indexValue(DocumentParserContext context, String value) { if (value == null) { - return; + return false; } // if field is disabled, skip indexing if ((fieldType.indexOptions() == IndexOptions.NONE) && (fieldType.stored() == false) && (fieldType().hasDocValues() == false)) { - return; + return false; } if (value.length() > fieldType().ignoreAbove()) { @@ -987,7 +1037,7 @@ private void indexValue(DocumentParserContext context, String value) { // Save a copy of the field so synthetic source can load it context.doc().add(new StoredField(originalName(), new BytesRef(value))); } - return; + return false; } value = normalizeValue(fieldType().normalizer(), fullPath(), value); @@ -1025,6 +1075,8 @@ private void indexValue(DocumentParserContext context, String value) { if (fieldType().hasDocValues() == false && fieldType.omitNorms()) { context.addToFieldNames(fieldType().name()); } + + return true; } private static String normalizeValue(NamedAnalyzer normalizer, String field, String value) { @@ -1066,9 +1118,9 @@ public Map<String, NamedAnalyzer> indexAnalyzers() { @Override public FieldMapper.Builder getMergeBuilder() { - return new Builder(leafName(), indexAnalyzers, scriptCompiler, ignoreAboveDefault, indexCreatedVersion).dimension( - fieldType().isDimension() - ).init(this); + return new Builder(leafName(), indexAnalyzers, scriptCompiler, ignoreAboveDefault, indexCreatedVersion, indexSourceKeepMode) + .dimension(fieldType().isDimension()) + .init(this); } @Override @@ -1121,19 +1173,23 @@ protected void writeValue(Object value, XContentBuilder b) throws IOException { } }); } else if (hasDocValues) { - layers.add(new SortedSetDocValuesSyntheticFieldLoaderLayer(fullPath()) { + if (offsetsFieldName != null) { + layers.add(new SortedSetWithOffsetsDocValuesSyntheticFieldLoaderLayer(fullPath(), offsetsFieldName)); + } else { + layers.add(new SortedSetDocValuesSyntheticFieldLoaderLayer(fullPath()) { - @Override - protected BytesRef convert(BytesRef value) { - return value; - } + @Override + protected BytesRef convert(BytesRef value) { + return value; + } - @Override - protected BytesRef preserve(BytesRef value) { - // Preserve must make a deep copy because convert gets a shallow copy from the iterator - return BytesRef.deepCopyOf(value); - } - }); + @Override + protected BytesRef preserve(BytesRef value) { + // Preserve must make a deep copy because convert gets a shallow copy from the iterator + return BytesRef.deepCopyOf(value); + } + }); + } } if (fieldType().ignoreAbove != Integer.MAX_VALUE) { diff --git a/server/src/main/java/org/elasticsearch/index/mapper/Mapper.java b/server/src/main/java/org/elasticsearch/index/mapper/Mapper.java index d8e2f64dc1642..91c80da4654b6 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/Mapper.java +++
b/server/src/main/java/org/elasticsearch/index/mapper/Mapper.java @@ -215,4 +215,19 @@ public static FieldType freezeAndDeduplicateFieldType(FieldType fieldType) { * Defines how this mapper counts towards {@link MapperService#INDEX_MAPPING_TOTAL_FIELDS_LIMIT_SETTING}. */ public abstract int getTotalFieldsCount(); + + /** + * @return whether this mapper supports storing offsets for leaf array elements natively when synthetic source is enabled. + */ + public final boolean supportStoringArrayOffsets() { + return getOffsetFieldName() != null; + } + + /** + * @return the offset field name used to store offsets iff {@link #supportStoringArrayOffsets()} returns + * true. + */ + public String getOffsetFieldName() { + return null; + } } diff --git a/server/src/main/java/org/elasticsearch/index/mapper/SortedSetWithOffsetsDocValuesSyntheticFieldLoaderLayer.java b/server/src/main/java/org/elasticsearch/index/mapper/SortedSetWithOffsetsDocValuesSyntheticFieldLoaderLayer.java new file mode 100644 index 0000000000000..09a63eb6ab4a7 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/index/mapper/SortedSetWithOffsetsDocValuesSyntheticFieldLoaderLayer.java @@ -0,0 +1,167 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.index.mapper; + +import org.apache.lucene.index.BinaryDocValues; +import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.SortedDocValues; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.common.io.stream.ByteArrayStreamInput; +import org.elasticsearch.xcontent.XContentBuilder; + +import java.io.IOException; +import java.util.Objects; + +/** + * Load {@code _source} fields from {@link SortedSetDocValues} and an associated {@link SortedDocValues} offsets field. The former contains + * the unique values in sorted order and the latter the offsets for each instance of the values. This allows synthesizing array elements + * in the order specified at index time. Note that this works only for leaf arrays.
+ */ +final class SortedSetWithOffsetsDocValuesSyntheticFieldLoaderLayer implements CompositeSyntheticFieldLoader.DocValuesLayer { + + private final String name; + private final String offsetsFieldName; + private DocValuesWithOffsetsLoader docValues; + + SortedSetWithOffsetsDocValuesSyntheticFieldLoaderLayer(String name, String offsetsFieldName) { + this.name = Objects.requireNonNull(name); + this.offsetsFieldName = Objects.requireNonNull(offsetsFieldName); + } + + @Override + public String fieldName() { + return name; + } + + @Override + public DocValuesLoader docValuesLoader(LeafReader leafReader, int[] docIdsInLeaf) throws IOException { + SortedSetDocValues valueDocValues = DocValues.getSortedSet(leafReader, name); + SortedDocValues offsetDocValues = DocValues.getSorted(leafReader, offsetsFieldName); + + return docValues = new DocValuesWithOffsetsLoader(valueDocValues, offsetDocValues); + } + + @Override + public boolean hasValue() { + if (docValues != null) { + return docValues.count() > 0; + } else { + return false; + } + } + + @Override + public long valueCount() { + if (docValues != null) { + return docValues.count(); + } else { + return 0; + } + } + + @Override + public void write(XContentBuilder b) throws IOException { + if (docValues != null) { + docValues.write(b); + } + } + + static final class DocValuesWithOffsetsLoader implements DocValuesLoader { + private final SortedDocValues offsetDocValues; + private final SortedSetDocValues valueDocValues; + private final ByteArrayStreamInput scratch = new ByteArrayStreamInput(); + + private boolean hasValue; + private boolean hasOffset; + private int[] offsetToOrd; + + DocValuesWithOffsetsLoader(SortedSetDocValues valueDocValues, SortedDocValues offsetDocValues) { + this.valueDocValues = valueDocValues; + this.offsetDocValues = offsetDocValues; + } + + @Override + public boolean advanceToDoc(int docId) throws IOException { + hasValue = valueDocValues.advanceExact(docId); + hasOffset = offsetDocValues.advanceExact(docId); + if (hasValue || hasOffset) { + if (hasOffset) { + int offsetOrd = offsetDocValues.ordValue(); + var encodedValue = offsetDocValues.lookupOrd(offsetOrd); + scratch.reset(encodedValue.bytes, encodedValue.offset, encodedValue.length); + offsetToOrd = FieldArrayContext.parseOffsetArray(scratch); + } else { + offsetToOrd = null; + } + return true; + } else { + offsetToOrd = null; + return false; + } + } + + public int count() { + if (hasValue) { + if (offsetToOrd != null) { + // HACK: trick CompositeSyntheticFieldLoader to serialize this layer as an array. + // (if offsetToOrd is not null, then at index time an array was always specified even if there is just one value) + return offsetToOrd.length + 1; + } else { + return valueDocValues.docValueCount(); + } + } else { + if (hasOffset) { + // trick CompositeSyntheticFieldLoader to serialize this layer as an empty array.
+ return 2; + } else { + return 0; + } + } + } + + public void write(XContentBuilder b) throws IOException { + if (hasValue == false && hasOffset == false) { + return; + } + if (offsetToOrd != null && hasValue) { + long[] ords = new long[valueDocValues.docValueCount()]; + for (int i = 0; i < valueDocValues.docValueCount(); i++) { + ords[i] = valueDocValues.nextOrd(); + } + + for (int offset : offsetToOrd) { + if (offset == -1) { + b.nullValue(); + continue; + } + + long ord = ords[offset]; + BytesRef c = valueDocValues.lookupOrd(ord); + // This is keyword specific and needs to be updated once support is added for other field types: + b.utf8Value(c.bytes, c.offset, c.length); + } + } else if (offsetToOrd != null) { + // in case all values are NULLs + for (int offset : offsetToOrd) { + assert offset == -1; + b.nullValue(); + } + } else { + for (int i = 0; i < valueDocValues.docValueCount(); i++) { + BytesRef c = valueDocValues.lookupOrd(valueDocValues.nextOrd()); + b.utf8Value(c.bytes, c.offset, c.length); + } + } + } + } + +} diff --git a/server/src/test/java/org/elasticsearch/index/mapper/FieldArrayContextTests.java b/server/src/test/java/org/elasticsearch/index/mapper/FieldArrayContextTests.java new file mode 100644 index 0000000000000..a1fa3024d7973 --- /dev/null +++ b/server/src/test/java/org/elasticsearch/index/mapper/FieldArrayContextTests.java @@ -0,0 +1,67 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". 
+ */ + +package org.elasticsearch.index.mapper; + +import org.elasticsearch.common.io.stream.ByteArrayStreamInput; +import org.elasticsearch.test.ESTestCase; + +import java.io.IOException; + +import static org.elasticsearch.index.mapper.FieldArrayContext.parseOffsetArray; + +public class FieldArrayContextTests extends ESTestCase { + + public void testOffsets() throws IOException { + var context = new FieldArrayContext(); + context.recordOffset("field", "a"); + context.recordOffset("field", "a"); + context.recordOffset("field", "b"); + context.recordOffset("field", "z"); + context.recordOffset("field", "a"); + context.recordOffset("field", "b"); + + var parserContext = new TestDocumentParserContext(); + context.addToLuceneDocument(parserContext); + + var binaryDocValues = parserContext.doc().getField("field"); + int[] offsetToOrd = parseOffsetArray(new ByteArrayStreamInput(binaryDocValues.binaryValue().bytes)); + assertArrayEquals(new int[] { 0, 0, 1, 2, 0, 1 }, offsetToOrd); + } + + public void testOffsetsWithNull() throws IOException { + var context = new FieldArrayContext(); + context.recordNull("field"); + context.recordOffset("field", "a"); + context.recordOffset("field", "b"); + context.recordOffset("field", "z"); + context.recordNull("field"); + context.recordOffset("field", "b"); + + var parserContext = new TestDocumentParserContext(); + context.addToLuceneDocument(parserContext); + + var binaryDocValues = parserContext.doc().getField("field"); + int[] offsetToOrd = parseOffsetArray(new ByteArrayStreamInput(binaryDocValues.binaryValue().bytes)); + assertArrayEquals(new int[] { -1, 0, 1, 2, -1, 1 }, offsetToOrd); + } + + public void testEmptyOffset() throws IOException { + var context = new FieldArrayContext(); + context.maybeRecordEmptyArray("field"); + + var parserContext = new TestDocumentParserContext(); + context.addToLuceneDocument(parserContext); + + var binaryDocValues = parserContext.doc().getField("field"); + int[] offsetToOrd = parseOffsetArray(new ByteArrayStreamInput(binaryDocValues.binaryValue().bytes)); + assertArrayEquals(new int[] {}, offsetToOrd); + } + +} diff --git a/server/src/test/java/org/elasticsearch/index/mapper/KeywordFieldMapperTests.java b/server/src/test/java/org/elasticsearch/index/mapper/KeywordFieldMapperTests.java index 052bf995bdd48..b599a9941ad14 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/KeywordFieldMapperTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/KeywordFieldMapperTests.java @@ -770,4 +770,10 @@ public void testDocValuesLoadedFromStoredSynthetic() throws IOException { ); assertScriptDocValues(mapper, "foo", equalTo(List.of("foo"))); } + + @Override + protected String randomSyntheticSourceKeep() { + // Only the "all" option keeps array source in ignored source.
+ return randomFrom("all"); + } } diff --git a/server/src/test/java/org/elasticsearch/index/mapper/KeywordFieldTypeTests.java b/server/src/test/java/org/elasticsearch/index/mapper/KeywordFieldTypeTests.java index b4c7ea0ed9508..e59c5f875ec0e 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/KeywordFieldTypeTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/KeywordFieldTypeTests.java @@ -244,7 +244,8 @@ public void testFetchSourceValue() throws IOException { createIndexAnalyzers(), ScriptCompiler.NONE, Integer.MAX_VALUE, - IndexVersion.current() + IndexVersion.current(), + randomFrom(Mapper.SourceKeepMode.values()) ).normalizer("lowercase").build(MapperBuilderContext.root(false, false)).fieldType(); assertEquals(List.of("value"), fetchSourceValue(normalizerMapper, "VALUE")); assertEquals(List.of("42"), fetchSourceValue(normalizerMapper, 42L)); diff --git a/server/src/test/java/org/elasticsearch/index/mapper/KeywordOffsetDocValuesLoaderTests.java b/server/src/test/java/org/elasticsearch/index/mapper/KeywordOffsetDocValuesLoaderTests.java new file mode 100644 index 0000000000000..8dcb78222e096 --- /dev/null +++ b/server/src/test/java/org/elasticsearch/index/mapper/KeywordOffsetDocValuesLoaderTests.java @@ -0,0 +1,237 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.index.mapper; + +import org.apache.lucene.index.DirectoryReader; +import org.elasticsearch.common.Strings; +import org.elasticsearch.common.bytes.BytesArray; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.mapper.SortedSetWithOffsetsDocValuesSyntheticFieldLoaderLayer.DocValuesWithOffsetsLoader; +import org.elasticsearch.xcontent.XContentBuilder; +import org.elasticsearch.xcontent.XContentType; + +import java.io.IOException; + +import static org.elasticsearch.xcontent.XContentFactory.jsonBuilder; +import static org.hamcrest.Matchers.nullValue; + +public class KeywordOffsetDocValuesLoaderTests extends MapperServiceTestCase { + + @Override + protected Settings getIndexSettings() { + return Settings.builder() + .put("index.mapping.source.mode", "synthetic") + .put("index.mapping.synthetic_source_keep", "arrays") + .build(); + } + + public void testOffsetArrayNoDocValues() throws Exception { + String mapping = """ + { + "_doc": { + "properties": { + "field": { + "type": "keyword", + "doc_values": false + } + } + } + } + """; + try (var mapperService = createMapperService(mapping)) { + var fieldMapper = mapperService.mappingLookup().getMapper("field"); + assertThat(fieldMapper.getOffsetFieldName(), nullValue()); + } + } + + public void testOffsetArrayStored() throws Exception { + String mapping = """ + { + "_doc": { + "properties": { + "field": { + "type": "keyword", + "store": true + } + } + } + } + """; + try (var mapperService = createMapperService(mapping)) { + var fieldMapper = mapperService.mappingLookup().getMapper("field"); + assertThat(fieldMapper.getOffsetFieldName(), nullValue()); + } + } + + public void testOffsetMultiFields() throws Exception { + String mapping = """ + { + "_doc": { + "properties": { + 
"field": { + "type": "keyword", + "fields": { + "sub": { + "type": "text" + } + } + } + } + } + } + """; + try (var mapperService = createMapperService(mapping)) { + var fieldMapper = mapperService.mappingLookup().getMapper("field"); + assertThat(fieldMapper.getOffsetFieldName(), nullValue()); + } + } + + public void testOffsetArrayNoSyntheticSource() throws Exception { + String mapping = """ + { + "_doc": { + "properties": { + "field": { + "type": "keyword" + } + } + } + } + """; + try (var mapperService = createMapperService(Settings.EMPTY, mapping)) { + var fieldMapper = mapperService.mappingLookup().getMapper("field"); + assertThat(fieldMapper.getOffsetFieldName(), nullValue()); + } + } + + public void testOffsetArrayNoSourceArrayKeep() throws Exception { + var settingsBuilder = Settings.builder().put("index.mapping.source.mode", "synthetic"); + String mapping; + if (randomBoolean()) { + mapping = """ + { + "_doc": { + "properties": { + "field": { + "type": "keyword", + "synthetic_source_keep": "{{synthetic_source_keep}}" + } + } + } + } + """.replace("{{synthetic_source_keep}}", randomBoolean() ? "none" : "all"); + } else { + mapping = """ + { + "_doc": { + "properties": { + "field": { + "type": "keyword" + } + } + } + } + """; + if (randomBoolean()) { + settingsBuilder.put("index.mapping.synthetic_source_keep", "none"); + } + } + try (var mapperService = createMapperService(settingsBuilder.build(), mapping)) { + var fieldMapper = mapperService.mappingLookup().getMapper("field"); + assertThat(fieldMapper.getOffsetFieldName(), nullValue()); + } + } + + public void testOffsetArray() throws Exception { + verifyOffsets("{\"field\":[\"z\",\"x\",\"y\",\"c\",\"b\",\"a\"]}"); + verifyOffsets("{\"field\":[\"z\",null,\"y\",\"c\",null,\"a\"]}"); + } + + public void testOffsetNestedArray() throws Exception { + verifyOffsets("{\"field\":[\"z\",[\"y\"],[\"c\"],null,\"a\"]}", "{\"field\":[\"z\",\"y\",\"c\",null,\"a\"]}"); + verifyOffsets( + "{\"field\":[\"z\",[\"y\", [\"k\"]],[\"c\", [\"l\"]],null,\"a\"]}", + "{\"field\":[\"z\",\"y\",\"k\",\"c\",\"l\",null,\"a\"]}" + ); + } + + public void testOffsetEmptyArray() throws Exception { + verifyOffsets("{\"field\":[]}"); + } + + public void testOffsetArrayWithNulls() throws Exception { + verifyOffsets("{\"field\":[null,null,null]}"); + } + + public void testOffsetArrayRandom() throws Exception { + StringBuilder values = new StringBuilder(); + int numValues = randomIntBetween(0, 256); + for (int i = 0; i < numValues; i++) { + if (randomInt(10) == 1) { + values.append("null"); + } else { + values.append('"').append(randomAlphanumericOfLength(2)).append('"'); + } + if (i != (numValues - 1)) { + values.append(','); + } + } + verifyOffsets("{\"field\":[" + values + "]}"); + } + + private void verifyOffsets(String source) throws IOException { + verifyOffsets(source, source); + } + + private void verifyOffsets(String source, String expectedSource) throws IOException { + String mapping = """ + { + "_doc": { + "properties": { + "field": { + "type": "keyword" + } + } + } + } + """; + verifyOffsets(mapping, source, expectedSource); + } + + private void verifyOffsets(String mapping, String source, String expectedSource) throws IOException { + try (var mapperService = createMapperService(mapping)) { + var mapper = mapperService.documentMapper(); + + try (var directory = newDirectory()) { + var iw = indexWriterForSyntheticSource(directory); + var doc = mapper.parse(new SourceToParse("_id", new BytesArray(source), XContentType.JSON)); + doc.updateSeqID(0, 0); + 
doc.version().setLongValue(0); + iw.addDocuments(doc.docs()); + iw.close(); + try (var indexReader = wrapInMockESDirectoryReader(DirectoryReader.open(directory))) { + var layer = new SortedSetWithOffsetsDocValuesSyntheticFieldLoaderLayer("field", "field.offsets"); + var leafReader = indexReader.leaves().get(0).reader(); + var loader = (DocValuesWithOffsetsLoader) layer.docValuesLoader(leafReader, new int[] { 0 }); + assertTrue(loader.advanceToDoc(0)); + assertTrue(loader.count() > 0); + XContentBuilder builder = jsonBuilder().startObject(); + builder.startArray("field"); + loader.write(builder); + builder.endArray().endObject(); + + var actual = Strings.toString(builder); + assertEquals(expectedSource, actual); + } + } + } + } + +} diff --git a/server/src/test/java/org/elasticsearch/index/mapper/KeywordSyntheticSourceNativeArrayIntegrationTests.java b/server/src/test/java/org/elasticsearch/index/mapper/KeywordSyntheticSourceNativeArrayIntegrationTests.java new file mode 100644 index 0000000000000..8ebcfb4845c8c --- /dev/null +++ b/server/src/test/java/org/elasticsearch/index/mapper/KeywordSyntheticSourceNativeArrayIntegrationTests.java @@ -0,0 +1,331 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.index.mapper; + +import org.apache.lucene.index.DocValuesType; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.IndexableField; +import org.apache.lucene.index.LeafReader; +import org.elasticsearch.action.admin.indices.forcemerge.ForceMergeRequest; +import org.elasticsearch.action.index.IndexRequest; +import org.elasticsearch.action.search.SearchRequest; +import org.elasticsearch.action.support.WriteRequest; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.query.IdsQueryBuilder; +import org.elasticsearch.test.ESSingleNodeTestCase; +import org.elasticsearch.xcontent.XContentBuilder; +import org.hamcrest.Matchers; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import static org.elasticsearch.xcontent.XContentFactory.jsonBuilder; +import static org.hamcrest.Matchers.contains; +import static org.hamcrest.Matchers.empty; +import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.hasKey; +import static org.hamcrest.Matchers.nullValue; + +public class KeywordSyntheticSourceNativeArrayIntegrationTests extends ESSingleNodeTestCase { + + public void testSynthesizeArray() throws Exception { + var arrayValues = new Object[][] { + new Object[] { "z", "y", null, "x", null, "v" }, + new Object[] { null, "b", null, "a" }, + new Object[] { null }, + new Object[] { null, null, null }, + new Object[] { "c", "b", "a" } }; + verifySyntheticArray(arrayValues); + } + + public void testSynthesizeEmptyArray() throws Exception { + var arrayValues = new Object[][] { new Object[] {} }; + verifySyntheticArray(arrayValues); + } + + public void testSynthesizeArrayRandom() throws Exception { + var arrayValues = new Object[][] { 
generateRandomStringArray(64, 8, false, true) }; + verifySyntheticArray(arrayValues); + } + + public void testSynthesizeArrayIgnoreAbove() throws Exception { + var mapping = jsonBuilder().startObject() + .startObject("properties") + .startObject("field") + .field("type", "keyword") + .field("ignore_above", 4) + .endObject() + .endObject() + .endObject(); + // Note values that would be ignored are added at the end of arrays, + // this makes testing easier as ignored values are always synthesized after regular values: + var arrayValues = new Object[][] { + new Object[] { null, "a", "ab", "abc", "abcd", null, "abcde" }, + new Object[] { "12345", "12345", "12345" }, + new Object[] { "123", "1234", "12345" }, + new Object[] { null, null, null, "blabla" }, + new Object[] { "1", "2", "3", "blabla" } }; + verifySyntheticArray(arrayValues, mapping, 4, "_id", "field._original"); + } + + public void testSynthesizeObjectArray() throws Exception { + List<List<Object[]>> documents = new ArrayList<>(); + { + List<Object[]> document = new ArrayList<>(); + document.add(new Object[] { "z", "y", "x" }); + document.add(new Object[] { "m", "l", "m" }); + document.add(new Object[] { "c", "b", "a" }); + documents.add(document); + } + { + List<Object[]> document = new ArrayList<>(); + document.add(new Object[] { "9", "7", "5" }); + document.add(new Object[] { "2", "4", "6" }); + document.add(new Object[] { "7", "6", "5" }); + documents.add(document); + } + verifySyntheticObjectArray(documents); + } + + public void testSynthesizeArrayInObjectField() throws Exception { + List<Object[]> documents = new ArrayList<>(); + documents.add(new Object[] { "z", "y", "x" }); + documents.add(new Object[] { "m", "l", "m" }); + documents.add(new Object[] { "c", "b", "a" }); + documents.add(new Object[] { "9", "7", "5" }); + documents.add(new Object[] { "2", "4", "6" }); + documents.add(new Object[] { "7", "6", "5" }); + verifySyntheticArrayInObject(documents); + } + + public void testSynthesizeArrayInObjectFieldRandom() throws Exception { + List<Object[]> documents = new ArrayList<>(); + int numDocs = randomIntBetween(8, 256); + for (int i = 0; i < numDocs; i++) { + documents.add(generateRandomStringArray(64, 8, false, true)); + } + verifySyntheticArrayInObject(documents); + } + + private void verifySyntheticArray(Object[][] arrays) throws IOException { + var mapping = jsonBuilder().startObject() + .startObject("properties") + .startObject("field") + .field("type", "keyword") + .endObject() + .endObject() + .endObject(); + verifySyntheticArray(arrays, mapping, null, "_id"); + } + + private void verifySyntheticArray(Object[][] arrays, XContentBuilder mapping, Integer ignoreAbove, String...
expectedStoredFields) + throws IOException { + var indexService = createIndex( + "test-index", + Settings.builder().put("index.mapping.source.mode", "synthetic").put("index.mapping.synthetic_source_keep", "arrays").build(), + mapping + ); + for (int i = 0; i < arrays.length; i++) { + var array = arrays[i]; + + var indexRequest = new IndexRequest("test-index"); + indexRequest.id("my-id-" + i); + var source = jsonBuilder().startObject(); + if (array != null) { + source.startArray("field"); + for (Object arrayValue : array) { + source.value(arrayValue); + } + source.endArray(); + } else { + source.field("field").nullValue(); + } + indexRequest.source(source.endObject()); + indexRequest.setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE); + client().index(indexRequest).actionGet(); + + var searchRequest = new SearchRequest("test-index"); + searchRequest.source().query(new IdsQueryBuilder().addIds("my-id-" + i)); + var searchResponse = client().search(searchRequest).actionGet(); + try { + var hit = searchResponse.getHits().getHits()[0]; + assertThat(hit.getId(), equalTo("my-id-" + i)); + var sourceAsMap = hit.getSourceAsMap(); + assertThat(sourceAsMap, hasKey("field")); + var actualArray = (List<?>) sourceAsMap.get("field"); + if (array == null) { + assertThat(actualArray, nullValue()); + } else if (array.length == 0) { + assertThat(actualArray, empty()); + } else { + assertThat(actualArray, Matchers.contains(array)); + } + } finally { + searchResponse.decRef(); + } + } + + try (var searcher = indexService.getShard(0).acquireSearcher(getTestName())) { + var reader = searcher.getDirectoryReader(); + for (int i = 0; i < arrays.length; i++) { + var document = reader.storedFields().document(i); + // Verify that there is no ignored source: + Set<String> storedFieldNames = new LinkedHashSet<>(document.getFields().stream().map(IndexableField::name).toList()); + assertThat(storedFieldNames, contains(expectedStoredFields)); + } + var fieldInfo = FieldInfos.getMergedFieldInfos(reader).fieldInfo("field.offsets"); + assertThat(fieldInfo.getDocValuesType(), equalTo(DocValuesType.SORTED)); + } + } + + private void verifySyntheticObjectArray(List<List<Object[]>> documents) throws IOException { + var indexService = createIndex( + "test-index", + Settings.builder().put("index.mapping.source.mode", "synthetic").put("index.mapping.synthetic_source_keep", "arrays").build(), + jsonBuilder().startObject() + .startObject("properties") + .startObject("object") + .startObject("properties") + .startObject("field") + .field("type", "keyword") + .endObject() + .endObject() + .endObject() + .endObject() + .endObject() + ); + for (int i = 0; i < documents.size(); i++) { + var document = documents.get(i); + + var indexRequest = new IndexRequest("test-index"); + indexRequest.id("my-id-" + i); + var source = jsonBuilder().startObject(); + source.startArray("object"); + for (Object[] arrayValue : document) { + source.startObject(); + source.array("field", arrayValue); + source.endObject(); + } + source.endArray(); + indexRequest.source(source.endObject()); + indexRequest.setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE); + client().index(indexRequest).actionGet(); + + var searchRequest = new SearchRequest("test-index"); + searchRequest.source().query(new IdsQueryBuilder().addIds("my-id-" + i)); + var searchResponse = client().search(searchRequest).actionGet(); + try { + var hit = searchResponse.getHits().getHits()[0]; + assertThat(hit.getId(), equalTo("my-id-" + i)); + var sourceAsMap = hit.getSourceAsMap(); + var objectArray = (List<?>)
sourceAsMap.get("object"); + for (int j = 0; j < document.size(); j++) { + var expected = document.get(j); + List actual = (List) ((Map) objectArray.get(j)).get("field"); + assertThat(actual, Matchers.contains(expected)); + } + } finally { + searchResponse.decRef(); + } + } + + indexService.getShard(0).forceMerge(new ForceMergeRequest("test-index").maxNumSegments(1)); + try (var searcher = indexService.getShard(0).acquireSearcher(getTestName())) { + var reader = searcher.getDirectoryReader(); + for (int i = 0; i < documents.size(); i++) { + var document = reader.storedFields().document(i); + // Verify that there is ignored source because of leaf array being wrapped by object array: + List storedFieldNames = document.getFields().stream().map(IndexableField::name).toList(); + assertThat(storedFieldNames, contains("_id", "_ignored_source")); + + // Verify that there is no offset field: + LeafReader leafReader = reader.leaves().get(0).reader(); + for (FieldInfo fieldInfo : leafReader.getFieldInfos()) { + String name = fieldInfo.getName(); + assertFalse("expected no field that contains [offsets] in name, but found [" + name + "]", name.contains("offsets")); + } + + var binaryDocValues = leafReader.getBinaryDocValues("object.field.offsets"); + assertThat(binaryDocValues, nullValue()); + } + } + } + + private void verifySyntheticArrayInObject(List documents) throws IOException { + var indexService = createIndex( + "test-index", + Settings.builder().put("index.mapping.source.mode", "synthetic").put("index.mapping.synthetic_source_keep", "arrays").build(), + jsonBuilder().startObject() + .startObject("properties") + .startObject("object") + .startObject("properties") + .startObject("field") + .field("type", "keyword") + .endObject() + .endObject() + .endObject() + .endObject() + .endObject() + ); + for (int i = 0; i < documents.size(); i++) { + var arrayValue = documents.get(i); + + var indexRequest = new IndexRequest("test-index"); + indexRequest.id("my-id-" + i); + var source = jsonBuilder().startObject(); + source.startObject("object"); + source.array("field", arrayValue); + source.endObject(); + indexRequest.source(source.endObject()); + indexRequest.setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE); + client().index(indexRequest).actionGet(); + + var searchRequest = new SearchRequest("test-index"); + searchRequest.source().query(new IdsQueryBuilder().addIds("my-id-" + i)); + var searchResponse = client().search(searchRequest).actionGet(); + try { + var hit = searchResponse.getHits().getHits()[0]; + assertThat(hit.getId(), equalTo("my-id-" + i)); + var sourceAsMap = hit.getSourceAsMap(); + var objectArray = (Map) sourceAsMap.get("object"); + + List actual = (List) objectArray.get("field"); + if (arrayValue == null) { + assertThat(actual, nullValue()); + } else if (arrayValue.length == 0) { + assertThat(actual, empty()); + } else { + assertThat(actual, Matchers.contains(arrayValue)); + } + } finally { + searchResponse.decRef(); + } + } + + indexService.getShard(0).forceMerge(new ForceMergeRequest("test-index").maxNumSegments(1)); + try (var searcher = indexService.getShard(0).acquireSearcher(getTestName())) { + var reader = searcher.getDirectoryReader(); + for (int i = 0; i < documents.size(); i++) { + var document = reader.storedFields().document(i); + // Verify that there is no ignored source: + Set storedFieldNames = new LinkedHashSet<>(document.getFields().stream().map(IndexableField::name).toList()); + assertThat(storedFieldNames, contains("_id")); + } + var fieldInfo = 
FieldInfos.getMergedFieldInfos(reader).fieldInfo("object.field.offsets"); + assertThat(fieldInfo.getDocValuesType(), equalTo(DocValuesType.SORTED)); + } + } + +} diff --git a/server/src/test/java/org/elasticsearch/index/mapper/MultiFieldsTests.java b/server/src/test/java/org/elasticsearch/index/mapper/MultiFieldsTests.java index fd024c5d23e28..4c5bfeb66b075 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/MultiFieldsTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/MultiFieldsTests.java @@ -64,7 +64,8 @@ private KeywordFieldMapper.Builder getKeywordFieldMapperBuilder(boolean isStored IndexAnalyzers.of(Map.of(), Map.of("normalizer", Lucene.STANDARD_ANALYZER), Map.of()), ScriptCompiler.NONE, Integer.MAX_VALUE, - IndexVersion.current() + IndexVersion.current(), + Mapper.SourceKeepMode.NONE ); if (isStored) { keywordFieldMapperBuilder.stored(true); diff --git a/test/framework/src/main/java/org/elasticsearch/index/mapper/MapperTestCase.java b/test/framework/src/main/java/org/elasticsearch/index/mapper/MapperTestCase.java index 757183d100166..72abb8a179dfe 100644 --- a/test/framework/src/main/java/org/elasticsearch/index/mapper/MapperTestCase.java +++ b/test/framework/src/main/java/org/elasticsearch/index/mapper/MapperTestCase.java @@ -1725,7 +1725,7 @@ public void testSyntheticSourceKeepArrays() throws IOException { SyntheticSourceExample example = syntheticSourceSupportForKeepTests(shouldUseIgnoreMalformed()).example(1); DocumentMapper mapperAll = createSytheticSourceMapperService(mapping(b -> { b.startObject("field"); - b.field("synthetic_source_keep", randomFrom("arrays", "all")); // Both options keep array source. + b.field("synthetic_source_keep", randomSyntheticSourceKeep()); example.mapping().accept(b); b.endObject(); })).documentMapper(); @@ -1744,6 +1744,10 @@ public void testSyntheticSourceKeepArrays() throws IOException { assertThat(actual, equalTo(expected)); } + + protected String randomSyntheticSourceKeep() { + return randomFrom("all", "arrays"); + } + @Override protected final <T> T compileScript(Script script, ScriptContext<T> context) { return ingestScriptSupport().compileScript(script, context);
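
---

As promised in the commit message, here is a worked sketch of the index-time offset bookkeeping. This is not the actual `FieldArrayContext` class; `OffsetSketch` and `offsetToOrd` are illustrative names. It shows why a `TreeMap` is used: unique values are kept in natural (sorted) order, matching the order in which `SortedSetDocValues` assigns ordinals, and nulls map to `-1`.

```java
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.TreeMap;

public class OffsetSketch {
    static int[] offsetToOrd(String[] array) {
        // TreeMap keeps unique values in natural (sorted) order, which matches the
        // order in which SortedSetDocValues assigns ordinals at flush time.
        TreeMap<String, List<Integer>> valueToOffsets = new TreeMap<>();
        List<Integer> nullOffsets = new ArrayList<>();
        for (int offset = 0; offset < array.length; offset++) {
            if (array[offset] == null) {
                nullOffsets.add(offset);
            } else {
                valueToOffsets.computeIfAbsent(array[offset], v -> new ArrayList<>()).add(offset);
            }
        }
        int[] offsetToOrd = new int[array.length];
        int ord = 0;
        for (List<Integer> offsets : valueToOffsets.values()) {
            for (int offset : offsets) {
                offsetToOrd[offset] = ord; // duplicate elements share the same ordinal
            }
            ord++;
        }
        for (int offset : nullOffsets) {
            offsetToOrd[offset] = -1; // a null element is encoded as ordinal -1
        }
        return offsetToOrd;
    }

    public static void main(String[] args) {
        // ["c", "b", "a", "c"] -> sorted uniques ["a", "b", "c"] -> [2, 1, 0, 2]
        System.out.println(Arrays.toString(offsetToOrd(new String[] { "c", "b", "a", "c" })));
        // ["c", "b", null, "c"] -> sorted uniques ["b", "c"] -> [1, 0, -1, 1]
        System.out.println(Arrays.toString(offsetToOrd(new String[] { "c", "b", null, "c" })));
    }
}
```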
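The offset array is then written as zigzag vints: length first, then each entry, as `FieldArrayContext.addToLuceneDocument` does with Lucene's `BitUtil`. The sketch below hand-rolls the standard zigzag transform for `int` (the same definition `BitUtil` implements) purely to make the byte layout concrete; `ZigZagSketch` is a hypothetical name.

```java
import java.util.Arrays;

public class ZigZagSketch {
    static int zigZagEncode(int i) {
        // Folds negatives into small unsigned values: ... -2 -> 3, -1 -> 1, 0 -> 0, 1 -> 2, 2 -> 4 ...
        return (i >> 31) ^ (i << 1);
    }

    static int zigZagDecode(int i) {
        return (i >>> 1) ^ -(i & 1);
    }

    public static void main(String[] args) {
        // The offset array [1, 0, -1, 1] is framed as length-then-entries, all zigzag vints:
        int[] offsetToOrd = { 1, 0, -1, 1 };
        int[] framed = new int[offsetToOrd.length + 1];
        framed[0] = zigZagEncode(offsetToOrd.length); // 4 -> 8
        for (int i = 0; i < offsetToOrd.length; i++) {
            framed[i + 1] = zigZagEncode(offsetToOrd[i]); // 1, 0, -1, 1 -> 2, 0, 1, 2
        }
        System.out.println(Arrays.toString(framed)); // [8, 2, 0, 1, 2]
        // An empty array is just zigZagEncode(0) == 0, a single vint byte.
    }
}
```

Zigzag keeps the `-1` null marker to a single vint byte, and zigzag-encoding the length additionally leaves room to later represent `my_field: null` as length `-1`, as the comment in `FieldArrayContext` notes.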
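Finally, a simplified sketch of the read-time replay performed by `DocValuesWithOffsetsLoader.write`, assuming the document's unique values and the decoded offset array are already in hand (`SynthesizeSketch` and `synthesize` are illustrative names, not the actual API):

```java
import java.util.ArrayList;
import java.util.List;

public class SynthesizeSketch {
    // Given the document's unique values in sorted order and the decoded
    // offset-to-ordinal array, rebuild the array as it was indexed.
    static List<String> synthesize(String[] sortedUniqueValues, int[] offsetToOrd) {
        List<String> out = new ArrayList<>(offsetToOrd.length);
        for (int ord : offsetToOrd) {
            out.add(ord == -1 ? null : sortedUniqueValues[ord]); // -1 marks an indexed null
        }
        return out;
    }

    public static void main(String[] args) {
        // Recovers both the original order and the duplicate: [c, b, a, c]
        System.out.println(synthesize(new String[] { "a", "b", "c" }, new int[] { 2, 1, 0, 2 }));
        // Nulls round-trip too: [c, b, null, c]
        System.out.println(synthesize(new String[] { "b", "c" }, new int[] { 1, 0, -1, 1 }));
    }
}
```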