Push compute engine value loading for longs down to tsdb codec. #132622
Changelog entry (new file):

@@ -0,0 +1,5 @@
+pr: 132622
+summary: Add bulk loading of dense singleton number doc values to tsdb codec and push compute engine value loading for longs down to tsdb codec
+area: "Codec"
+type: enhancement
+issues: []
BulkNumericDocValues.java (new file):

@@ -0,0 +1,27 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the "Elastic License
+ * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
+ * Public License v 1"; you may not use this file except in compliance with, at
+ * your election, the "Elastic License 2.0", the "GNU Affero General Public
+ * License v3.0 only", or the "Server Side Public License, v 1".
+ */
+
+package org.elasticsearch.index.codec.tsdb.es819;
+
+import org.apache.lucene.index.NumericDocValues;
+import org.elasticsearch.index.mapper.BlockLoader;
+
+import java.io.IOException;
+
+/**
+ * An es819 doc values specialization that allows bulk loading of values, optimized for use by the compute engine.
+ */
+public abstract class BulkNumericDocValues extends NumericDocValues {
+
+    /**
+     * Reads the values of all documents in {@code docs}.
+     */
+    public abstract BlockLoader.Block read(BlockLoader.BlockFactory factory, BlockLoader.Docs docs, int offset) throws IOException;
+
+}
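
For orientation, here is a minimal self-contained sketch (hypothetical names and simplified types, not Elasticsearch's API) of the dispatch pattern this abstraction enables: a reader that recognizes a bulk-capable doc-values implementation hands the whole docs list to it in one call, while any other implementation takes the per-document fallback. The real dispatch is the change to SingletonLongs further down in this diff.

```java
import java.io.IOException;

class BulkDispatchSketch {

    /** Stand-in for NumericDocValues: one long per document. */
    interface PerDocValues {
        long valueAt(int docId) throws IOException;
    }

    /** Stand-in for BulkNumericDocValues: decode many documents in one call. */
    interface BulkValues extends PerDocValues {
        long[] readAll(int[] docs) throws IOException;
    }

    static long[] read(PerDocValues values, int[] docs) throws IOException {
        if (values instanceof BulkValues bulk) {
            return bulk.readAll(docs); // specialized path: whole blocks at once
        }
        long[] out = new long[docs.length];
        for (int i = 0; i < docs.length; i++) {
            out[i] = values.valueAt(docs[i]); // fallback: one document at a time
        }
        return out;
    }
}
```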
ES819TSDBDocValuesProducer.java:

@@ -43,6 +43,7 @@
 import org.apache.lucene.util.packed.PackedInts;
 import org.elasticsearch.core.IOUtils;
 import org.elasticsearch.index.codec.tsdb.TSDBDocValuesEncoder;
+import org.elasticsearch.index.mapper.BlockLoader;

 import java.io.IOException;
@@ -1140,8 +1141,9 @@ public long longValue() {
         final int bitsPerOrd = maxOrd >= 0 ? PackedInts.bitsRequired(maxOrd - 1) : -1;
         if (entry.docsWithFieldOffset == -1) {
             // dense
-            return new NumericDocValues() {
+            return new BulkNumericDocValues() {
+
                 private final Thread creationThread = Thread.currentThread();
                 private final int maxDoc = ES819TSDBDocValuesProducer.this.maxDoc;
                 private int doc = -1;
                 private final TSDBDocValuesEncoder decoder = new TSDBDocValuesEncoder(ES819TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE);

@@ -1197,6 +1199,53 @@ public long longValue() throws IOException {
                     }
                     return currentBlock[blockInIndex];
                 }

+                @Override
+                public BlockLoader.Block read(BlockLoader.BlockFactory factory, BlockLoader.Docs docs, int offset) throws IOException {
+                    assert maxOrd == -1 : "unexpected maxOrd[" + maxOrd + "]";
+                    final int docsCount = docs.count();
+                    doc = docs.get(docsCount - 1);
+                    try (BlockLoader.SingletonLongBuilder builder = factory.singletonLongs(docs.count() - offset)) {
+                        for (int i = offset; i < docsCount;) {
+                            int index = docs.get(i);
+                            final int blockIndex = index >>> ES819TSDBDocValuesFormat.NUMERIC_BLOCK_SHIFT;
+                            final int blockInIndex = index & ES819TSDBDocValuesFormat.NUMERIC_BLOCK_MASK;
+                            if (blockIndex != currentBlockIndex) {
+                                assert blockIndex > currentBlockIndex : blockIndex + " < " + currentBlockIndex;
+                                // no need to seek if the loading block is the next block
+                                if (currentBlockIndex + 1 != blockIndex) {
+                                    valuesData.seek(indexReader.get(blockIndex));
+                                }
+                                currentBlockIndex = blockIndex;
+                                decoder.decode(valuesData, currentBlock);
+                            }
+
+                            // Try to append more than just one value:
+                            // Instead of iterating over docs and finding the max length, take an optimistic approach to avoid as
+                            // many comparisons as there are remaining docs and instead do at most 7 comparisons:
+                            int length = 1;
+                            int remainingBlockLength = Math.min(ES819TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE - blockInIndex, docsCount - i);
+                            for (int newLength = remainingBlockLength; newLength > 1; newLength = newLength >> 1) {
+                                int lastIndex = i + newLength - 1;
+                                if (lastIndex < docsCount && isDense(index, docs.get(lastIndex), newLength)) {
+                                    length = newLength;
+                                    break;
+                                }
+                            }
+                            builder.appendLongs(currentBlock, blockInIndex, length);
+                            i += length;
+                        }
+                        return builder.build();
+                    }
+                }
+
+                static boolean isDense(int firstDocId, int lastDocId, int length) {
+                    // This does not detect duplicate docids (e.g. [1, 1, 2, 4] would be detected as dense),
+                    // which can happen with enrich or lookup. However, this codec isn't used for enrich / lookup.
+                    // This codec is only used in the context of logsdb and tsdb, so this is fine here.
+                    return lastDocId - firstDocId == length - 1;
+                }
+
             };
         } else {
             final IndexedDISI disi = new IndexedDISI(

Review thread on the run-length search in read():
- "I like this logic! Can we limit remainingBlockLength to the min of …"
- "Let me try this. I will add a comment about this here."
- "pushed: 6ca5c66"
- "I think we can remove …"
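
To make the run-length search concrete, here is a standalone sketch (invented constants and inputs, not the PR's code) of the optimistic halving search above. Instead of scanning forward one doc at a time, it halves the candidate window until the first and last doc ids form a gapless run, so each append costs at most log2(block size) comparisons (7 for a block size of 128). The search may undershoot the longest dense run; the outer loop in read() simply picks up the remainder on the next iteration.

```java
public class DenseRunSketch {

    static final int BLOCK_SIZE = 128; // stands in for NUMERIC_BLOCK_SIZE

    // Same check as isDense above: the ids form a gapless ascending run.
    static boolean isDense(int firstDocId, int lastDocId, int length) {
        return lastDocId - firstDocId == length - 1;
    }

    /**
     * Length of a dense run starting at position i, capped so it neither
     * crosses the current decoded block nor runs past the docs array.
     */
    static int denseRunLength(int[] docs, int i, int blockInIndex) {
        int remaining = Math.min(BLOCK_SIZE - blockInIndex, docs.length - i);
        for (int len = remaining; len > 1; len >>= 1) {
            if (isDense(docs[i], docs[i + len - 1], len)) {
                return len; // first dense candidate wins; may undershoot
            }
        }
        return 1; // no dense window found: append a single value
    }

    public static void main(String[] args) {
        int[] docs = {0, 1, 2, 3, 10, 11};
        // Candidates at i=0: len 6 (docs 0..11, has a gap), then len 3 (docs 0..2, dense).
        System.out.println(denseRunLength(docs, 0, 0)); // prints 3
        // At i=4 the remaining window is {10, 11}, which is dense.
        System.out.println(denseRunLength(docs, 4, 10)); // prints 2
    }
}
```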
BlockDocValuesReader.java:

@@ -21,6 +21,7 @@
 import org.apache.lucene.util.BytesRef;
 import org.elasticsearch.common.io.stream.ByteArrayStreamInput;
 import org.elasticsearch.index.IndexVersion;
+import org.elasticsearch.index.codec.tsdb.es819.BulkNumericDocValues;
 import org.elasticsearch.index.mapper.BlockLoader.BlockFactory;
 import org.elasticsearch.index.mapper.BlockLoader.BooleanBuilder;
 import org.elasticsearch.index.mapper.BlockLoader.Builder;

@@ -84,6 +85,7 @@ public boolean supportsOrdinals() {
     public SortedSetDocValues ordinals(LeafReaderContext context) throws IOException {
         throw new UnsupportedOperationException();
     }
+
 }

 public static class LongsBlockLoader extends DocValuesBlockLoader {

@@ -116,15 +118,18 @@ public AllReader reader(LeafReaderContext context) throws IOException {
         }
     }

-    private static class SingletonLongs extends BlockDocValuesReader {
-        private final NumericDocValues numericDocValues;
+    static class SingletonLongs extends BlockDocValuesReader {
+        final NumericDocValues numericDocValues;

         SingletonLongs(NumericDocValues numericDocValues) {
             this.numericDocValues = numericDocValues;
         }

         @Override
         public BlockLoader.Block read(BlockFactory factory, Docs docs, int offset) throws IOException {
+            if (numericDocValues instanceof BulkNumericDocValues bulkDv) {
+                return bulkDv.read(factory, docs, offset);
+            }
             try (BlockLoader.LongBuilder builder = factory.longsFromDocValues(docs.count() - offset)) {
                 int lastDoc = -1;
                 for (int i = offset; i < docs.count(); i++) {

@@ -164,7 +169,7 @@ public String toString() {
         }
     }

-    private static class Longs extends BlockDocValuesReader {
+    static class Longs extends BlockDocValuesReader {
         private final SortedNumericDocValues numericDocValues;
         private int docID = -1;

Review thread on SingletonLongs:
- "Can we enable the optimization in …"
- "I think that should work as well."
- "It took me a while, but this works out and it is now even simpler! 054b12e"
BlockLoader.java:

@@ -400,6 +400,17 @@ interface BlockFactory {
          */
         LongBuilder longs(int expectedCount);

+        /**
+         * Build a specialized builder for singleton dense long based fields with the following constraints:
+         * <ul>
+         *     <li>Only one value per document can be collected</li>
+         *     <li>No more than expectedCount values can be collected</li>
+         * </ul>
+         *
+         * @param expectedCount The maximum number of values to be collected.
+         */
+        SingletonLongBuilder singletonLongs(int expectedCount);
+
         /**
          * Build a builder to load only {@code null}s.
          */

@@ -498,6 +509,16 @@ interface IntBuilder extends Builder {
         IntBuilder appendInt(int value);
     }

+    /**
+     * Specialized builder for collecting dense arrays of long values.
+     */
+    interface SingletonLongBuilder extends Builder {
+
+        SingletonLongBuilder appendLong(long value);
+
+        SingletonLongBuilder appendLongs(long[] values, int from, int length);
+    }
+
     interface LongBuilder extends Builder {
         /**
          * Appends a long to the current entry.

Review thread on SingletonLongBuilder:
- "Can we rename this to SingletonLongBuilder and add support for appending a single long? I think we can use this builder when doc_values is dense, even if it's not from our codec. Also, we should consider extending LongVectorFixedBuilder to support bulking, but that's not an issue for this PR."
- "Renamed the class: 2924c402368fd58bc13adea5a943d8afa2fda963"
- "I think so too, we would need to check by: …"
- "👍"
- "I pushed f097f4a, to use the singleton long builder when doc values are dense even when not using the es819 doc values codec."
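
As a rough illustration of the contract documented above, here is what a fixed-capacity SingletonLongBuilder implementation could look like (a sketch with a hypothetical class name, not the builder the PR actually wires into the compute engine): one value per position, at most expectedCount values, and a bulk append that copies a dense run straight out of a decoded block.

```java
import java.util.Arrays;

class SingletonLongBuilderSketch {

    private final long[] values;
    private int count;

    SingletonLongBuilderSketch(int expectedCount) {
        // At most expectedCount values can be collected.
        this.values = new long[expectedCount];
    }

    /** Append a single document's value. */
    SingletonLongBuilderSketch appendLong(long value) {
        values[count++] = value;
        return this;
    }

    /** Bulk append: copy a dense run of values out of a decoded block. */
    SingletonLongBuilderSketch appendLongs(long[] src, int from, int length) {
        System.arraycopy(src, from, values, count, length);
        count += length;
        return this;
    }

    /** The collected values, trimmed to the number actually appended. */
    long[] build() {
        return Arrays.copyOf(values, count);
    }
}
```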
Review comment on the changelog entry:
- "I think the summary doesn't match the PR title?"