Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
Show all changes
72 commits
Select commit Hold shift + click to select a range
c5580da
Synthetic source doc values arrays encoding experiment
martijnvg Sep 29, 2024
acf4d09
spotless
martijnvg Sep 30, 2024
dca77d7
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Sep 30, 2024
49efe26
iter
martijnvg Sep 30, 2024
f5e3d5a
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Sep 30, 2024
59010c3
iter
martijnvg Sep 30, 2024
a5198ae
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Oct 2, 2024
9b4aa5f
spotless
martijnvg Oct 2, 2024
12d30c5
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Oct 3, 2024
2ae8d83
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Oct 8, 2024
fc0e627
parsesArrayValue approach
martijnvg Oct 8, 2024
a111f94
fix multi fields
martijnvg Oct 9, 2024
14c2ddd
iter
martijnvg Oct 9, 2024
ba9e513
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Oct 9, 2024
007afd3
do not handle copy_to for now
martijnvg Oct 9, 2024
194b4ca
cleanup
martijnvg Oct 9, 2024
52c0db4
move ValueXContentParser
martijnvg Oct 9, 2024
8cc5b46
adjust expected json element type
martijnvg Oct 9, 2024
dc9db8a
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Oct 13, 2024
6e03aca
iter
martijnvg Oct 13, 2024
674f03e
fixed mistake
martijnvg Oct 14, 2024
acfaa55
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Jan 16, 2025
0d90234
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Jan 24, 2025
259d212
iter
martijnvg Jan 24, 2025
bf9ed2f
iter
martijnvg Jan 24, 2025
7bd3a15
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Jan 24, 2025
d8e48c5
Moved processing of offsets to DocumentParser instead of in fieldmapp…
martijnvg Jan 24, 2025
ae1ce9f
[CI] Auto commit changes from spotless
Jan 24, 2025
5e610ef
* Added support for json null by encoding it as \0
martijnvg Jan 26, 2025
8f163eb
Encode offsets using zigzag encoding, so that -1 can be used to encod…
martijnvg Jan 26, 2025
43a1375
fixed codec issue if only NULLs gets indexed
martijnvg Jan 27, 2025
cf2b9a3
Update docs/changelog/113757.yaml
martijnvg Jan 27, 2025
893f555
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Jan 27, 2025
61fd132
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Jan 27, 2025
a428f11
handle string arrays for keyword fields nested under object arrays co…
martijnvg Jan 27, 2025
962ac8a
offsetsFieldMapper -> offsetsFieldName
martijnvg Jan 27, 2025
1c77cfe
added ignore above test
martijnvg Jan 27, 2025
a785110
Move OffsetDocValuesLoader to its own layer implementation
martijnvg Jan 27, 2025
fa03f46
iter
martijnvg Jan 27, 2025
ccbf0cd
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Jan 27, 2025
9664fa7
rename
martijnvg Feb 2, 2025
01cd313
move some offset logic from DocumentParserContext to new ArrayOffsetC…
martijnvg Feb 2, 2025
7ed0857
use correct full name for offset field
martijnvg Feb 2, 2025
012ac7f
Move offset parsing logic to DocumentParser
martijnvg Feb 3, 2025
4b4eaf4
[CI] Auto commit changes from spotless
Feb 3, 2025
64c6fe8
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Feb 5, 2025
38f784a
synthetic recovery source is enabled by default now (in snapshot builds)
martijnvg Feb 5, 2025
b9535e1
Small iter
martijnvg Feb 5, 2025
470afad
handle empty array logic in DocumentParser
martijnvg Feb 5, 2025
80521c2
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Feb 6, 2025
ab612ba
randomFrom
martijnvg Feb 6, 2025
f21cce6
reduce number of `setImmediateXContentParent(...)` invocations
martijnvg Feb 6, 2025
ca21c22
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Feb 15, 2025
969139e
remove `parsesArrayValue(mapper) == false` checks
martijnvg Feb 15, 2025
acf0aed
cleanup DocumentParser and
martijnvg Feb 15, 2025
3d75e27
ensure doc values enabled and added more tests
martijnvg Feb 15, 2025
5487cf8
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Feb 17, 2025
b89660a
iter
martijnvg Feb 17, 2025
ba0434b
s/:/./
martijnvg Feb 17, 2025
4bcde0d
iter
martijnvg Feb 17, 2025
5b6b05c
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Feb 17, 2025
37634b9
Store offsets in sorted doc values field instead of binary doc values…
martijnvg Feb 18, 2025
09c6a0e
applied feedback
martijnvg Feb 18, 2025
60f45f2
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Feb 19, 2025
bbee160
iter testSyntheticSourceKeepArrays() test
martijnvg Feb 19, 2025
4e6265f
add index version check
martijnvg Feb 19, 2025
405edf4
iter test
martijnvg Feb 19, 2025
7c7b3a3
[CI] Auto commit changes from spotless
Feb 19, 2025
cfe5b56
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Feb 19, 2025
8049206
Merge remote-tracking branch 'es/main' into synthetic_source_encode_a…
martijnvg Feb 20, 2025
3fcb461
cleanup supportStoringArrayOffsets(...) method
martijnvg Feb 20, 2025
5b1f80b
renamed test suites
martijnvg Feb 20, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -953,7 +953,7 @@ subobjects auto:
- match: { hits.hits.0._source.foo: 10 }
- match: { hits.hits.0._source.foo\.bar: 100 }
- match: { hits.hits.0._source.regular.span.id: "1" }
- match: { hits.hits.0._source.regular.trace.id: [ "a", "b" ] }
- match: { hits.hits.0._source.regular.trace.id: ["b", "a", "b" ] }
- match: { hits.hits.1._source.id: 2 }
- match: { hits.hits.1._source.foo: 20 }
- match: { hits.hits.1._source.foo\.bar: 200 }
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -447,7 +447,7 @@ field param - keep root array:
index: test
sort: id

- match: { hits.hits.0._source.kw_default: [ "A", "B" ] }
- match: { hits.hits.0._source.kw_default: [ "B", "A" ] }
- match: { hits.hits.0._source.kw_arrays: [ "B", "A" ] }
- match: { hits.hits.0._source.kw_all: [ "B", "A" ] }

Expand Down Expand Up @@ -513,7 +513,7 @@ field param - keep nested array:
sort: id

- match: { hits.hits.0._source.path.to.ratio: 10.0 }
- match: { hits.hits.0._source.path.to.kw_default: [ "A", "B" ] }
- match: { hits.hits.0._source.path.to.kw_default: [ "B", "A" ] }
- match: { hits.hits.0._source.path.to.kw_arrays: [ "B", "A" ] }
- match: { hits.hits.0._source.path.to.kw_all: [ "B", "A" ] }

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -799,7 +799,7 @@ private static void parseNonDynamicArray(
String fullPath = context.path().pathAsText(arrayFieldName);

// Check if we need to record the array source. This only applies to synthetic source.
if (context.canAddIgnoredField()) {
if (context.canAddIgnoredField() && (mapper instanceof KeywordFieldMapper == false /* hard coded exclusion keyword mapper */)) {
boolean objectRequiresStoringSource = mapper instanceof ObjectMapper objectMapper
&& (objectMapper.storeArraySource()
|| (context.sourceKeepModeFromIndexSettings() == Mapper.SourceKeepMode.ARRAYS
Expand All @@ -826,10 +826,13 @@ private static void parseNonDynamicArray(
}
}

// In synthetic source, if any array element requires storing its source as-is, it takes precedence over
// elements from regular source loading that are then skipped from the synthesized array source.
// To prevent this, we track each array name, to check if it contains any sub-arrays in its elements.
context = context.cloneForArray(fullPath);
// hard coded exclusion keyword mapper:
if (mapper instanceof KeywordFieldMapper == false) {
// In synthetic source, if any array element requires storing its source as-is, it takes precedence over
// elements from regular source loading that are then skipped from the synthesized array source.
// To prevent this, we track each array name, to check if it contains any sub-arrays in its elements.
context = context.cloneForArray(fullPath);
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Move to the branch above, to avoid repeating the condition?


XContentParser parser = context.parser();
XContentParser.Token token;
Expand All @@ -847,6 +850,10 @@ private static void parseNonDynamicArray(
parseValue(context, lastFieldName);
}
}
// hard coded post processing of arrays:
if (mapper instanceof KeywordFieldMapper k) {
k.processOffsets(context);
}
postProcessDynamicArrayMapping(context, lastFieldName);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,14 @@
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;

/**
* Context used when parsing incoming documents. Holds everything that is needed to parse a document as well as
Expand Down Expand Up @@ -856,4 +859,27 @@ public String currentName() throws IOException {
return field;
}
}

private final Map<String, SortedMap<String, List<Integer>>> arrayOffsetsByField = new HashMap<>();
private final Map<String, Integer> numValuesByField = new HashMap<>();

/**
 * Returns the offsets recorded for {@code field} via {@code recordOffset(...)}: a sorted map
 * from each distinct array value to the list of array offsets at which that value appeared.
 * Returns {@code null} if no offsets were recorded for the field.
 */
public SortedMap<String, List<Integer>> getValuesWithOffsets(String field) {
    return arrayOffsetsByField.get(field);
}

public int getArrayValueCount(String field) {
if (numValuesByField.containsKey(field)) {
return numValuesByField.get(field) + 1;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why +1?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

numValuesByField returns the last offset; in order to get the count, the returned value needs to be incremented by one.

numValuesByField should be named something else.

} else {
return 0;
}
}

public void recordOffset(String fieldName, String value) {
int count = numValuesByField.compute(fieldName, (s, integer) -> integer == null ? 0 : ++integer);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Related to comment above, maybe 1 here instead of 0?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

count is the wrong name here. It is like the next offset to be used.

var values = arrayOffsetsByField.computeIfAbsent(fieldName , s -> new TreeMap<>(Comparator.naturalOrder()));
var offsets = values.computeIfAbsent(value, s -> new ArrayList<>());
offsets.add(count);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.InvertableType;
Expand All @@ -34,6 +35,7 @@
import org.apache.lucene.util.automaton.CompiledAutomaton.AUTOMATON_TYPE;
import org.apache.lucene.util.automaton.MinimizationOperations;
import org.apache.lucene.util.automaton.Operations;
import org.elasticsearch.common.io.stream.BytesStreamOutput;
import org.elasticsearch.common.lucene.BytesRefs;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.lucene.search.AutomatonQueries;
Expand Down Expand Up @@ -89,6 +91,7 @@ public final class KeywordFieldMapper extends FieldMapper {
private static final Logger logger = LogManager.getLogger(KeywordFieldMapper.class);

public static final String CONTENT_TYPE = "keyword";
public static final String OFFSETS_FIELD_NAME_SUFFIX = "_offsets";

static final NodeFeature KEYWORD_DIMENSION_IGNORE_ABOVE = new NodeFeature("mapper.keyword_dimension_ignore_above");
static final NodeFeature KEYWORD_NORMALIZER_SYNTHETIC_SOURCE = new NodeFeature("mapper.keyword_normalizer_synthetic_source");
Expand Down Expand Up @@ -375,13 +378,26 @@ public KeywordFieldMapper build(MapperBuilderContext context) {
}
super.hasScript = script.get() != null;
super.onScriptError = onScriptError.getValue();

BinaryFieldMapper offsetsFieldMapper;
if (context.isSourceSynthetic() && fieldtype.stored() == false) {
// keep track of value offsets so that we can reconstruct arrays from doc values in order as was specified during indexing
// (if field is stored then there is no point of doing this)
offsetsFieldMapper = new BinaryFieldMapper.Builder(leafName() + OFFSETS_FIELD_NAME_SUFFIX, context.isSourceSynthetic())
.docValues(true)
.build(context);
} else {
offsetsFieldMapper = null;
}

return new KeywordFieldMapper(
leafName(),
fieldtype,
buildFieldType(context, fieldtype),
builderParams(this, context),
context.isSourceSynthetic(),
this
this,
offsetsFieldMapper
);
}
}
Expand Down Expand Up @@ -882,14 +898,16 @@ public boolean hasNormalizer() {
private final IndexAnalyzers indexAnalyzers;
private final int ignoreAboveDefault;
private final int ignoreAbove;
private final BinaryFieldMapper offsetsFieldMapper;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we actually need this? We only ever use fullPath() from it.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, I think we can remove it. We can just keep track of offsetFieldName.

I think I wanted to share this field via #iterator(), so that this field is visible in other places like field caps, but I think this field should remain internal/hidden. Additionally BinaryFieldMapper uses a custom binary doc values field that has no advantage for offsets.


private KeywordFieldMapper(
String simpleName,
FieldType fieldType,
KeywordFieldType mappedFieldType,
BuilderParams builderParams,
boolean isSyntheticSource,
Builder builder
Builder builder,
BinaryFieldMapper offsetsFieldMapper
) {
super(simpleName, mappedFieldType, builderParams);
assert fieldType.indexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) <= 0;
Expand All @@ -906,17 +924,50 @@ private KeywordFieldMapper(
this.isSyntheticSource = isSyntheticSource;
this.ignoreAboveDefault = builder.ignoreAboveDefault;
this.ignoreAbove = builder.ignoreAbove.getValue();
this.offsetsFieldMapper = offsetsFieldMapper;
}

@Override
public KeywordFieldType fieldType() {
return (KeywordFieldType) super.fieldType();
}

@Override
protected void parseCreateField(DocumentParserContext context) throws IOException {
final String value = context.parser().textOrNull();
indexValue(context, value == null ? fieldType().nullValue : value);
String value = context.parser().textOrNull();
if (value == null) {
value = fieldType().nullValue;
}
boolean indexed = indexValue(context, value);
if (offsetsFieldMapper != null && indexed) {
context.recordOffset(fullPath(), value);
}
}

public void processOffsets(DocumentParserContext context) throws IOException {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this be called from postParse? It is possible that a field is indexed multiple times in one document with object arrays.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am surprised randomized tests don't complain about this.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍 I think postParse is a better place to invoke this process offsets logic.

var values = context.getValuesWithOffsets(fullPath());
if (values == null || values.isEmpty()) {
return;
}

int arrayLength = context.getArrayValueCount(fullPath());
int ord = 0;
int[] offsetToOrd = new int[arrayLength];
for (var entry : values.entrySet()) {
for (var offsetAndLevel : entry.getValue()) {
offsetToOrd[offsetAndLevel] = ord;
}
ord++;
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we keep this logic in DocumentParser? It feels like leaking parsing implementations in mappers..

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I made the following change: 012ac7f
In order to make sure empty arrays and nested leaf arrays are covered some logic had to be added. Let me know what you think of this. The nice thing is that now adding offset to other field types should be easier.

The previous approach was more isolated, field types would be responsible for parsing arrays, which means not many changes to DocumentParser were needed. I think I like this PR with 012ac7f more now.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: return (offsetsFieldName != null);


logger.info("values=" + values);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe debug?

logger.info("offsetToOrd=" + Arrays.toString(offsetToOrd));

try (var streamOutput = new BytesStreamOutput()) {
// TODO: optimize
// This array allows to retain the original ordering of the leaf array and duplicate values.
streamOutput.writeVIntArray(offsetToOrd);
context.doc().add(new BinaryDocValuesField(offsetsFieldMapper.fullPath(), streamOutput.bytes().toBytesRef()));
}
}

@Override
Expand All @@ -929,13 +980,13 @@ protected void indexScriptValues(
this.fieldType().scriptValues.valuesForDoc(searchLookup, readerContext, doc, value -> indexValue(documentParserContext, value));
}

private void indexValue(DocumentParserContext context, String value) {
private boolean indexValue(DocumentParserContext context, String value) {
if (value == null) {
return;
return false;
}
// if field is disabled, skip indexing
if ((fieldType.indexOptions() == IndexOptions.NONE) && (fieldType.stored() == false) && (fieldType().hasDocValues() == false)) {
return;
return false;
}

if (value.length() > fieldType().ignoreAbove()) {
Expand All @@ -944,7 +995,7 @@ private void indexValue(DocumentParserContext context, String value) {
// Save a copy of the field so synthetic source can load it
context.doc().add(new StoredField(originalName(), new BytesRef(value)));
}
return;
return false;
}

value = normalizeValue(fieldType().normalizer(), fullPath(), value);
Expand Down Expand Up @@ -982,6 +1033,8 @@ private void indexValue(DocumentParserContext context, String value) {
if (fieldType().hasDocValues() == false && fieldType.omitNorms()) {
context.addToFieldNames(fieldType().name());
}

return true;
}

private static String normalizeValue(NamedAnalyzer normalizer, String field, String value) {
Expand Down Expand Up @@ -1078,7 +1131,8 @@ protected void writeValue(Object value, XContentBuilder b) throws IOException {
}
});
} else if (hasDocValues) {
layers.add(new SortedSetDocValuesSyntheticFieldLoaderLayer(fullPath()) {
String offsetsFullPath = offsetsFieldMapper != null ? offsetsFieldMapper.fullPath() : null;
layers.add(new SortedSetDocValuesSyntheticFieldLoaderLayer(fullPath(), offsetsFullPath) {

@Override
protected BytesRef convert(BytesRef value) {
Expand Down
Loading