-
Notifications
You must be signed in to change notification settings - Fork 25.6k
Store arrays offsets for keyword fields natively with synthetic source #113757
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
c5580da
acf4d09
dca77d7
49efe26
f5e3d5a
59010c3
a5198ae
9b4aa5f
12d30c5
2ae8d83
fc0e627
a111f94
14c2ddd
ba9e513
007afd3
194b4ca
52c0db4
8cc5b46
dc9db8a
6e03aca
674f03e
acfaa55
0d90234
259d212
bf9ed2f
7bd3a15
d8e48c5
ae1ce9f
5e610ef
8f163eb
43a1375
cf2b9a3
893f555
61fd132
a428f11
962ac8a
1c77cfe
a785110
fa03f46
ccbf0cd
9664fa7
01cd313
7ed0857
012ac7f
4b4eaf4
64c6fe8
38f784a
b9535e1
470afad
80521c2
ab612ba
f21cce6
ca21c22
969139e
acf0aed
3d75e27
5487cf8
b89660a
ba0434b
4bcde0d
5b6b05c
37634b9
09c6a0e
60f45f2
bbee160
4e6265f
405edf4
7c7b3a3
cfe5b56
8049206
3fcb461
5b1f80b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -27,11 +27,14 @@ | |
| import java.util.ArrayList; | ||
| import java.util.Collection; | ||
| import java.util.Collections; | ||
| import java.util.Comparator; | ||
| import java.util.HashMap; | ||
| import java.util.HashSet; | ||
| import java.util.List; | ||
| import java.util.Map; | ||
| import java.util.Set; | ||
| import java.util.SortedMap; | ||
| import java.util.TreeMap; | ||
|
|
||
| /** | ||
| * Context used when parsing incoming documents. Holds everything that is needed to parse a document as well as | ||
|
|
@@ -856,4 +859,27 @@ public String currentName() throws IOException { | |
| return field; | ||
| } | ||
| } | ||
|
|
||
| private final Map<String, SortedMap<String, List<Integer>>> arrayOffsetsByField = new HashMap<>(); | ||
| private final Map<String, Integer> numValuesByField = new HashMap<>(); | ||
|
|
||
| /** | ||
| * Returns the offsets recorded for the given field, keyed by value in sorted order. | ||
| * | ||
| * @param field the field name the offsets were recorded under | ||
| * @return the value-to-offsets map for {@code field}, or {@code null} when no entry exists for that field | ||
| */ | ||
| public SortedMap<String, List<Integer>> getValuesWithOffsets(String field) { | ||
| return arrayOffsetsByField.get(field); | ||
| } | ||
|
|
||
| /** | ||
| * Returns how many values have been recorded for the given field. | ||
| * | ||
| * @param field the field name the values were recorded under | ||
| * @return the number of recorded values, or {@code 0} when nothing was recorded for {@code field} | ||
| */ | ||
| public int getArrayValueCount(String field) { | ||
| if (numValuesByField.containsKey(field)) { | ||
| // the map holds the zero-based offset of the last recorded value, so the count is that offset plus one | ||
| return numValuesByField.get(field) + 1; | ||
|
||
| } else { | ||
| return 0; | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Records that {@code value} occurred at the next offset of {@code fieldName}. Offsets are zero-based and | ||
| * increase by one with every call made for the same field. | ||
| * | ||
| * @param fieldName the field the value belongs to | ||
| * @param value the value found at the current offset | ||
| */ | ||
| public void recordOffset(String fieldName, String value) { | ||
| // the first call for a field yields offset 0, every later call yields the previous offset plus one | ||
| int count = numValuesByField.compute(fieldName, (s, integer) -> integer == null ? 0 : ++integer); | ||
|
||
| // values are kept in natural String order; each value maps to every offset at which it occurred | ||
| var values = arrayOffsetsByField.computeIfAbsent(fieldName , s -> new TreeMap<>(Comparator.naturalOrder())); | ||
| var offsets = values.computeIfAbsent(value, s -> new ArrayList<>()); | ||
| offsets.add(count); | ||
| } | ||
|
|
||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -13,6 +13,7 @@ | |
| import org.apache.logging.log4j.Logger; | ||
| import org.apache.lucene.analysis.TokenStream; | ||
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; | ||
| import org.apache.lucene.document.BinaryDocValuesField; | ||
| import org.apache.lucene.document.Field; | ||
| import org.apache.lucene.document.FieldType; | ||
| import org.apache.lucene.document.InvertableType; | ||
|
|
@@ -34,6 +35,7 @@ | |
| import org.apache.lucene.util.automaton.CompiledAutomaton.AUTOMATON_TYPE; | ||
| import org.apache.lucene.util.automaton.MinimizationOperations; | ||
| import org.apache.lucene.util.automaton.Operations; | ||
| import org.elasticsearch.common.io.stream.BytesStreamOutput; | ||
| import org.elasticsearch.common.lucene.BytesRefs; | ||
| import org.elasticsearch.common.lucene.Lucene; | ||
| import org.elasticsearch.common.lucene.search.AutomatonQueries; | ||
|
|
@@ -89,6 +91,7 @@ public final class KeywordFieldMapper extends FieldMapper { | |
| private static final Logger logger = LogManager.getLogger(KeywordFieldMapper.class); | ||
|
|
||
| public static final String CONTENT_TYPE = "keyword"; | ||
| public static final String OFFSETS_FIELD_NAME_SUFFIX = "_offsets"; | ||
|
|
||
| static final NodeFeature KEYWORD_DIMENSION_IGNORE_ABOVE = new NodeFeature("mapper.keyword_dimension_ignore_above"); | ||
| static final NodeFeature KEYWORD_NORMALIZER_SYNTHETIC_SOURCE = new NodeFeature("mapper.keyword_normalizer_synthetic_source"); | ||
|
|
@@ -375,13 +378,26 @@ public KeywordFieldMapper build(MapperBuilderContext context) { | |
| } | ||
| super.hasScript = script.get() != null; | ||
| super.onScriptError = onScriptError.getValue(); | ||
|
|
||
| BinaryFieldMapper offsetsFieldMapper; | ||
| if (context.isSourceSynthetic() && fieldtype.stored() == false) { | ||
| // keep track of value offsets so that we can reconstruct arrays from doc values in the order specified during indexing | ||
| // (if the field is stored then there is no point in doing this) | ||
| offsetsFieldMapper = new BinaryFieldMapper.Builder(leafName() + OFFSETS_FIELD_NAME_SUFFIX, context.isSourceSynthetic()) | ||
| .docValues(true) | ||
| .build(context); | ||
| } else { | ||
| offsetsFieldMapper = null; | ||
| } | ||
|
|
||
| return new KeywordFieldMapper( | ||
| leafName(), | ||
| fieldtype, | ||
| buildFieldType(context, fieldtype), | ||
| builderParams(this, context), | ||
| context.isSourceSynthetic(), | ||
| this | ||
| this, | ||
| offsetsFieldMapper | ||
| ); | ||
| } | ||
| } | ||
|
|
@@ -882,14 +898,16 @@ public boolean hasNormalizer() { | |
| private final IndexAnalyzers indexAnalyzers; | ||
| private final int ignoreAboveDefault; | ||
| private final int ignoreAbove; | ||
| private final BinaryFieldMapper offsetsFieldMapper; | ||
|
||
|
|
||
| private KeywordFieldMapper( | ||
| String simpleName, | ||
| FieldType fieldType, | ||
| KeywordFieldType mappedFieldType, | ||
| BuilderParams builderParams, | ||
| boolean isSyntheticSource, | ||
| Builder builder | ||
| Builder builder, | ||
| BinaryFieldMapper offsetsFieldMapper | ||
| ) { | ||
| super(simpleName, mappedFieldType, builderParams); | ||
| assert fieldType.indexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) <= 0; | ||
|
|
@@ -906,17 +924,50 @@ private KeywordFieldMapper( | |
| this.isSyntheticSource = isSyntheticSource; | ||
| this.ignoreAboveDefault = builder.ignoreAboveDefault; | ||
| this.ignoreAbove = builder.ignoreAbove.getValue(); | ||
| this.offsetsFieldMapper = offsetsFieldMapper; | ||
| } | ||
|
|
||
| // Narrows the inherited accessor's return type to KeywordFieldType via a cast. | ||
| @Override | ||
| public KeywordFieldType fieldType() { | ||
| return (KeywordFieldType) super.fieldType(); | ||
| } | ||
|
|
||
| /** | ||
| * Reads the current text value from the parser, falls back to the field type's null value when the parsed | ||
| * text is {@code null}, indexes it, and records the value's offset when offsets are tracked for this field. | ||
| */ | ||
| @Override | ||
| protected void parseCreateField(DocumentParserContext context) throws IOException { | ||
| final String value = context.parser().textOrNull(); | ||
| indexValue(context, value == null ? fieldType().nullValue : value); | ||
| String value = context.parser().textOrNull(); | ||
| if (value == null) { | ||
| value = fieldType().nullValue; | ||
| } | ||
| boolean indexed = indexValue(context, value); | ||
| // only record an offset when an offsets mapper exists and indexValue reported that the value was indexed | ||
| if (offsetsFieldMapper != null && indexed) { | ||
| context.recordOffset(fullPath(), value); | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Turns the offsets recorded for this field in {@code context} into an offset-to-ordinal array and adds it to | ||
| * the document as a binary doc values field named after the offsets field mapper. Does nothing when no | ||
| * offsets were recorded for this field. | ||
| */ | ||
| public void processOffsets(DocumentParserContext context) throws IOException { | ||
|
||
| var values = context.getValuesWithOffsets(fullPath()); | ||
| if (values == null || values.isEmpty()) { | ||
| return; | ||
| } | ||
|
|
||
| int arrayLength = context.getArrayValueCount(fullPath()); | ||
| int ord = 0; | ||
| int[] offsetToOrd = new int[arrayLength]; | ||
| // entries are visited in the sorted map's key order, so ord is the rank of each distinct value in that order | ||
| for (var entry : values.entrySet()) { | ||
| for (var offsetAndLevel : entry.getValue()) { | ||
| offsetToOrd[offsetAndLevel] = ord; | ||
| } | ||
| ord++; | ||
| } | ||
|
||
|
|
||
| // NOTE(review): these info-level log statements run on every call and look like debugging leftovers - confirm | ||
| logger.info("values=" + values); | ||
|
||
| logger.info("offsetToOrd=" + Arrays.toString(offsetToOrd)); | ||
|
|
||
| try (var streamOutput = new BytesStreamOutput()) { | ||
| // TODO: optimize | ||
| // This array makes it possible to retain the original ordering of the leaf array, including duplicate values. | ||
| streamOutput.writeVIntArray(offsetToOrd); | ||
| context.doc().add(new BinaryDocValuesField(offsetsFieldMapper.fullPath(), streamOutput.bytes().toBytesRef())); | ||
| } | ||
kkrik-es marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| @Override | ||
|
|
@@ -929,13 +980,13 @@ protected void indexScriptValues( | |
| this.fieldType().scriptValues.valuesForDoc(searchLookup, readerContext, doc, value -> indexValue(documentParserContext, value)); | ||
| } | ||
|
|
||
| private void indexValue(DocumentParserContext context, String value) { | ||
| private boolean indexValue(DocumentParserContext context, String value) { | ||
| if (value == null) { | ||
| return; | ||
| return false; | ||
| } | ||
| // if field is disabled, skip indexing | ||
| if ((fieldType.indexOptions() == IndexOptions.NONE) && (fieldType.stored() == false) && (fieldType().hasDocValues() == false)) { | ||
| return; | ||
| return false; | ||
| } | ||
|
|
||
| if (value.length() > fieldType().ignoreAbove()) { | ||
|
|
@@ -944,7 +995,7 @@ private void indexValue(DocumentParserContext context, String value) { | |
| // Save a copy of the field so synthetic source can load it | ||
| context.doc().add(new StoredField(originalName(), new BytesRef(value))); | ||
| } | ||
| return; | ||
| return false; | ||
| } | ||
|
|
||
| value = normalizeValue(fieldType().normalizer(), fullPath(), value); | ||
|
|
@@ -982,6 +1033,8 @@ private void indexValue(DocumentParserContext context, String value) { | |
| if (fieldType().hasDocValues() == false && fieldType.omitNorms()) { | ||
| context.addToFieldNames(fieldType().name()); | ||
| } | ||
|
|
||
| return true; | ||
| } | ||
|
|
||
| private static String normalizeValue(NamedAnalyzer normalizer, String field, String value) { | ||
|
|
@@ -1078,7 +1131,8 @@ protected void writeValue(Object value, XContentBuilder b) throws IOException { | |
| } | ||
| }); | ||
| } else if (hasDocValues) { | ||
| layers.add(new SortedSetDocValuesSyntheticFieldLoaderLayer(fullPath()) { | ||
| String offsetsFullPath = offsetsFieldMapper != null ? offsetsFieldMapper.fullPath() : null; | ||
| layers.add(new SortedSetDocValuesSyntheticFieldLoaderLayer(fullPath(), offsetsFullPath) { | ||
|
|
||
| @Override | ||
| protected BytesRef convert(BytesRef value) { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Move to the branch above, to avoid repeating the condition?