-
Notifications
You must be signed in to change notification settings - Fork 25.7k
Speed up loading keyword fields with index sorts #132950
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
8639922
e24a8e0
ecb306d
a0ce374
91d32d1
0c4edb6
4207b63
4c05703
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
This file was deleted.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -53,6 +53,8 @@ | |
|
|
||
| final class ES819TSDBDocValuesProducer extends DocValuesProducer { | ||
| final IntObjectHashMap<NumericEntry> numerics; | ||
| private int primarySortFieldNumber = -1; | ||
| private boolean primarySortFieldReversed = false; | ||
| final IntObjectHashMap<BinaryEntry> binaries; | ||
| final IntObjectHashMap<SortedEntry> sorted; | ||
| final IntObjectHashMap<SortedSetEntry> sortedSets; | ||
|
|
@@ -91,7 +93,15 @@ final class ES819TSDBDocValuesProducer extends DocValuesProducer { | |
| ); | ||
|
|
||
| readFields(in, state.fieldInfos); | ||
|
|
||
| final var indexSort = state.segmentInfo.getIndexSort(); | ||
| if (indexSort != null && indexSort.getSort().length > 0) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 👍 - good idea to extract this information from segment info, no need to check for index setting. |
||
| var primarySortField = indexSort.getSort()[0]; | ||
| var sortField = state.fieldInfos.fieldInfo(primarySortField.getField()); | ||
| if (sortField != null) { | ||
| primarySortFieldNumber = sortField.number; | ||
| primarySortFieldReversed = primarySortField.getReverse(); | ||
| } | ||
| } | ||
| } catch (Throwable exception) { | ||
| priorE = exception; | ||
| } finally { | ||
|
|
@@ -333,10 +343,10 @@ public boolean advanceExact(int target) throws IOException { | |
| @Override | ||
| public SortedDocValues getSorted(FieldInfo field) throws IOException { | ||
| SortedEntry entry = sorted.get(field.number); | ||
| return getSorted(entry); | ||
| return getSorted(entry, field.number == primarySortFieldNumber); | ||
| } | ||
|
|
||
| private SortedDocValues getSorted(SortedEntry entry) throws IOException { | ||
| private SortedDocValues getSorted(SortedEntry entry, boolean valuesSorted) throws IOException { | ||
| final NumericDocValues ords = getNumeric(entry.ordsEntry, entry.termsDictEntry.termsDictSize); | ||
| return new BaseSortedDocValues(entry) { | ||
|
|
||
|
|
@@ -369,10 +379,29 @@ public int advance(int target) throws IOException { | |
| public long cost() { | ||
| return ords.cost(); | ||
| } | ||
|
|
||
| @Override | ||
| public BlockLoader.Block tryRead(BlockLoader.BlockFactory factory, BlockLoader.Docs docs, int offset) throws IOException { | ||
| if (valuesSorted && ords instanceof BaseDenseNumericValues denseOrds) { | ||
| int firstDoc = docs.get(offset); | ||
| denseOrds.advanceExact(firstDoc); | ||
| long startValue = denseOrds.longValue(); | ||
| final int docCount = docs.count(); | ||
| int lastDoc = docs.get(docCount - 1); | ||
| long lastValue = denseOrds.lookAheadValueAt(lastDoc); | ||
| if (lastValue == startValue) { | ||
| BytesRef b = lookupOrd(Math.toIntExact(startValue)); | ||
| return factory.constantBytes(BytesRef.deepCopyOf(b), docCount - offset); | ||
| } | ||
| // TODO: Since ordinals are sorted, start at 0 (offset by startValue), scan until lastValue, | ||
| // then fill remaining positions with lastValue. | ||
| } | ||
| return null; | ||
| } | ||
| }; | ||
| } | ||
|
|
||
| abstract class BaseSortedDocValues extends SortedDocValues { | ||
| abstract class BaseSortedDocValues extends SortedDocValues implements BlockLoader.OptionalColumnAtATimeReader { | ||
|
|
||
| final SortedEntry entry; | ||
| final TermsEnum termsEnum; | ||
|
|
@@ -406,6 +435,15 @@ public int lookupTerm(BytesRef key) throws IOException { | |
| public TermsEnum termsEnum() throws IOException { | ||
| return new TermsDict(entry.termsDictEntry, data, merging); | ||
| } | ||
|
|
||
| @Override | ||
| public BlockLoader.Block tryRead(BlockLoader.BlockFactory factory, BlockLoader.Docs docs, int offset) throws IOException { | ||
| return null; | ||
| } | ||
| } | ||
|
|
||
| abstract static class BaseDenseNumericValues extends NumericDocValues implements BlockLoader.OptionalColumnAtATimeReader { | ||
| abstract long lookAheadValueAt(int targetDoc) throws IOException; | ||
| } | ||
|
|
||
| abstract static class BaseSortedSetDocValues extends SortedSetDocValues { | ||
|
|
@@ -695,7 +733,7 @@ public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOExcepti | |
| public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException { | ||
| SortedSetEntry entry = sortedSets.get(field.number); | ||
| if (entry.singleValueEntry != null) { | ||
| return DocValues.singleton(getSorted(entry.singleValueEntry)); | ||
| return DocValues.singleton(getSorted(entry.singleValueEntry, field.number == primarySortFieldNumber)); | ||
| } | ||
|
|
||
| SortedNumericEntry ordsEntry = entry.ordsEntry; | ||
|
|
@@ -1047,7 +1085,7 @@ private NumericDocValues getNumeric(NumericEntry entry, long maxOrd) throws IOEx | |
| // Special case for maxOrd 1, no need to read blocks and use ordinal 0 as only value | ||
| if (entry.docsWithFieldOffset == -1) { | ||
| // Special case when all docs have a value | ||
| return new NumericDocValues() { | ||
| return new BaseDenseNumericValues() { | ||
|
|
||
| private final int maxDoc = ES819TSDBDocValuesProducer.this.maxDoc; | ||
| private int doc = -1; | ||
|
|
@@ -1086,6 +1124,17 @@ public boolean advanceExact(int target) { | |
| public long cost() { | ||
| return maxDoc; | ||
| } | ||
|
|
||
| @Override | ||
| long lookAheadValueAt(int targetDoc) throws IOException { | ||
| return 0L; // Only one ordinal! | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. easy one :)
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the test found it :) |
||
| } | ||
|
|
||
| @Override | ||
| public BlockLoader.Block tryRead(BlockLoader.BlockFactory factory, BlockLoader.Docs docs, int offset) | ||
| throws IOException { | ||
| return null; | ||
| } | ||
| }; | ||
| } else { | ||
| final IndexedDISI disi = new IndexedDISI( | ||
|
|
@@ -1141,13 +1190,17 @@ public long longValue() { | |
| final int bitsPerOrd = maxOrd >= 0 ? PackedInts.bitsRequired(maxOrd - 1) : -1; | ||
| if (entry.docsWithFieldOffset == -1) { | ||
| // dense | ||
| return new BulkNumericDocValues() { | ||
| return new BaseDenseNumericValues() { | ||
|
|
||
| private final int maxDoc = ES819TSDBDocValuesProducer.this.maxDoc; | ||
| private int doc = -1; | ||
| private final TSDBDocValuesEncoder decoder = new TSDBDocValuesEncoder(ES819TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE); | ||
| private long currentBlockIndex = -1; | ||
| private final long[] currentBlock = new long[ES819TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE]; | ||
| // lookahead block | ||
| private long lookaheadBlockIndex = -1; | ||
| private long[] lookaheadBlock; | ||
| private IndexInput lookaheadData = null; | ||
|
|
||
| @Override | ||
| public int docID() { | ||
|
|
@@ -1183,24 +1236,28 @@ public long longValue() throws IOException { | |
| final int index = doc; | ||
| final int blockIndex = index >>> ES819TSDBDocValuesFormat.NUMERIC_BLOCK_SHIFT; | ||
| final int blockInIndex = index & ES819TSDBDocValuesFormat.NUMERIC_BLOCK_MASK; | ||
| if (blockIndex != currentBlockIndex) { | ||
| assert blockIndex > currentBlockIndex : blockIndex + " < " + currentBlockIndex; | ||
| // no need to seek if the loading block is the next block | ||
| if (currentBlockIndex + 1 != blockIndex) { | ||
| valuesData.seek(indexReader.get(blockIndex)); | ||
| } | ||
| currentBlockIndex = blockIndex; | ||
| if (maxOrd >= 0) { | ||
| decoder.decodeOrdinals(valuesData, currentBlock, bitsPerOrd); | ||
| } else { | ||
| decoder.decode(valuesData, currentBlock); | ||
| } | ||
| if (blockIndex == currentBlockIndex) { | ||
| return currentBlock[blockInIndex]; | ||
| } | ||
| if (blockIndex == lookaheadBlockIndex) { | ||
| return lookaheadBlock[blockInIndex]; | ||
| } | ||
| assert blockIndex > currentBlockIndex : blockIndex + " < " + currentBlockIndex; | ||
| // no need to seek if the loading block is the next block | ||
| if (currentBlockIndex + 1 != blockIndex) { | ||
| valuesData.seek(indexReader.get(blockIndex)); | ||
| } | ||
| currentBlockIndex = blockIndex; | ||
| if (maxOrd >= 0) { | ||
| decoder.decodeOrdinals(valuesData, currentBlock, bitsPerOrd); | ||
| } else { | ||
| decoder.decode(valuesData, currentBlock); | ||
| } | ||
| return currentBlock[blockInIndex]; | ||
| } | ||
|
|
||
| @Override | ||
| public BlockLoader.Block read(BlockLoader.BlockFactory factory, BlockLoader.Docs docs, int offset) throws IOException { | ||
| public BlockLoader.Block tryRead(BlockLoader.BlockFactory factory, BlockLoader.Docs docs, int offset) throws IOException { | ||
| assert maxOrd == -1 : "unexpected maxOrd[" + maxOrd + "]"; | ||
| final int docsCount = docs.count(); | ||
| doc = docs.get(docsCount - 1); | ||
|
|
@@ -1238,6 +1295,32 @@ public BlockLoader.Block read(BlockLoader.BlockFactory factory, BlockLoader.Docs | |
| } | ||
| } | ||
|
|
||
| @Override | ||
| long lookAheadValueAt(int targetDoc) throws IOException { | ||
| final int blockIndex = targetDoc >>> ES819TSDBDocValuesFormat.NUMERIC_BLOCK_SHIFT; | ||
| final int valueIndex = targetDoc & ES819TSDBDocValuesFormat.NUMERIC_BLOCK_MASK; | ||
| if (blockIndex == currentBlockIndex) { | ||
| return currentBlock[valueIndex]; | ||
| } | ||
| // load data to the lookahead block | ||
| if (lookaheadBlockIndex != blockIndex) { | ||
| if (lookaheadBlock == null) { | ||
| lookaheadBlock = new long[ES819TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE]; | ||
| lookaheadData = data.slice("look_ahead_values", entry.valuesOffset, entry.valuesLength); | ||
| } | ||
| if (lookaheadBlockIndex + 1 != blockIndex) { | ||
| lookaheadData.seek(indexReader.get(blockIndex)); | ||
| } | ||
| if (maxOrd >= 0) { | ||
|
||
| decoder.decodeOrdinals(lookaheadData, lookaheadBlock, bitsPerOrd); | ||
| } else { | ||
| decoder.decode(lookaheadData, lookaheadBlock); | ||
| } | ||
| lookaheadBlockIndex = blockIndex; | ||
| } | ||
| return lookaheadBlock[valueIndex]; | ||
| } | ||
|
|
||
| static boolean isDense(int firstDocId, int lastDocId, int length) { | ||
| // This does not detect duplicate docids (e.g [1, 1, 2, 4] would be detected as dense), | ||
| // this can happen with enrich or lookup. However this codec isn't used for enrich / lookup. | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -13,6 +13,7 @@ | |
| import org.apache.lucene.index.SortedDocValues; | ||
| import org.apache.lucene.index.SortedSetDocValues; | ||
| import org.apache.lucene.util.BytesRef; | ||
| import org.elasticsearch.core.Nullable; | ||
| import org.elasticsearch.core.Releasable; | ||
| import org.elasticsearch.search.fetch.StoredFieldsSpec; | ||
| import org.elasticsearch.search.lookup.Source; | ||
|
|
@@ -46,6 +47,22 @@ interface ColumnAtATimeReader extends Reader { | |
| BlockLoader.Block read(BlockFactory factory, Docs docs, int offset) throws IOException; | ||
| } | ||
|
|
||
| /** | ||
| * An interface for readers that attempt to load all document values in a column-at-a-time fashion. | ||
| * <p> | ||
| * Unlike {@link ColumnAtATimeReader}, implementations may return {@code null} if they are unable | ||
| * to load the requested values, for example due to unsupported underlying data. | ||
| * This allows callers to optimistically try optimized loading strategies first, and fall back if necessary. | ||
| */ | ||
| interface OptionalColumnAtATimeReader { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 👍 This is better than previous abstraction ( |
||
| /** | ||
| * Attempts to read the values of all documents in {@code docs} | ||
| * Returns {@code null} if unable to load the values. | ||
| */ | ||
| @Nullable | ||
| BlockLoader.Block tryRead(BlockFactory factory, Docs docs, int offset) throws IOException; | ||
| } | ||
|
|
||
| interface RowStrideReader extends Reader { | ||
| /** | ||
| * Reads the values of the given document into the builder. | ||
|
|
@@ -549,6 +566,5 @@ interface AggregateMetricDoubleBuilder extends Builder { | |
| DoubleBuilder sum(); | ||
|
|
||
| IntBuilder count(); | ||
|
|
||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Note that this field can be converted to a variable.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
thanks, I removed a0ce374