Skip to content

Commit e0fcab7

Browse files
Improve bulk loading of binary doc values (elastic#137995)
Speed up bulk loading of bytes-ref doc values. If the doc values have dense docIds and the queried docs are dense, copy the bytes for the adjacent values in a single read and use them directly in the block loader.
1 parent e8e21ce commit e0fcab7

File tree

9 files changed

+411
-50
lines changed

9 files changed

+411
-50
lines changed

docs/changelog/137995.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 137995
2+
summary: Improve bulk loading of binary doc values
3+
area: Mapping
4+
type: enhancement
5+
issues: []

server/src/main/java/org/elasticsearch/common/util/BigArrays.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -614,6 +614,10 @@ public ByteArray newByteArray(long size, boolean clearOnResize) {
614614
}
615615
}
616616

617+
public ByteArray newByteArrayWrapper(byte[] bytes) {
618+
return validate(new ByteArrayWrapper(this, bytes, bytes.length, null, false));
619+
}
620+
617621
/**
618622
* Allocate a new {@link ByteArray} initialized with zeros.
619623
* @param size the initial length of the array

server/src/main/java/org/elasticsearch/common/util/BytesRefArray.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ public BytesRefArray(StreamInput in, BigArrays bigArrays) throws IOException {
7777
}
7878
}
7979

80-
private BytesRefArray(LongArray startOffsets, ByteArray bytes, long size, BigArrays bigArrays) {
80+
public BytesRefArray(LongArray startOffsets, ByteArray bytes, long size, BigArrays bigArrays) {
8181
this.bytes = bytes;
8282
this.startOffsets = startOffsets;
8383
this.size = size;

server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesProducer.java

Lines changed: 59 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -220,13 +220,27 @@ public BlockLoader.Block tryRead(
220220
boolean toInt
221221
) throws IOException {
222222
int count = docs.count() - offset;
223-
try (var builder = factory.bytesRefs(count)) {
224-
for (int i = offset; i < docs.count(); i++) {
225-
doc = docs.get(i);
226-
bytesSlice.readBytes((long) doc * length, bytes.bytes, 0, length);
227-
builder.appendBytesRef(bytes);
223+
int firstDocId = docs.get(offset);
224+
int lastDocId = docs.get(count - 1);
225+
doc = lastDocId;
226+
227+
if (isDense(firstDocId, lastDocId, count)) {
228+
try (var builder = factory.singletonBytesRefs(count)) {
229+
int bulkLength = length * count;
230+
byte[] bytes = new byte[bulkLength];
231+
bytesSlice.readBytes((long) firstDocId * length, bytes, 0, bulkLength);
232+
builder.appendBytesRefs(bytes, length);
233+
return builder.build();
234+
}
235+
} else {
236+
try (var builder = factory.bytesRefs(count)) {
237+
for (int i = offset; i < docs.count(); i++) {
238+
int docId = docs.get(i);
239+
bytesSlice.readBytes((long) docId * length, bytes.bytes, 0, length);
240+
builder.appendBytesRef(bytes);
241+
}
242+
return builder.build();
228243
}
229-
return builder.build();
230244
}
231245
}
232246
};
@@ -255,15 +269,39 @@ public BlockLoader.Block tryRead(
255269
boolean toInt
256270
) throws IOException {
257271
int count = docs.count() - offset;
258-
try (var builder = factory.bytesRefs(count)) {
259-
for (int i = offset; i < docs.count(); i++) {
260-
doc = docs.get(i);
261-
long startOffset = addresses.get(doc);
262-
bytes.length = (int) (addresses.get(doc + 1L) - startOffset);
263-
bytesSlice.readBytes(startOffset, bytes.bytes, 0, bytes.length);
264-
builder.appendBytesRef(bytes);
272+
int firstDocId = docs.get(offset);
273+
int lastDocId = docs.get(count - 1);
274+
doc = lastDocId;
275+
276+
if (isDense(firstDocId, lastDocId, count)) {
277+
try (var builder = factory.singletonBytesRefs(count)) {
278+
long[] offsets = new long[count + 1];
279+
280+
// Normalize offsets so that first offset is 0
281+
long startOffset = addresses.get(firstDocId);
282+
for (int i = offset, j = 1; i < docs.count(); i++, j++) {
283+
int docId = docs.get(i);
284+
long nextOffset = addresses.get(docId + 1) - startOffset;
285+
offsets[j] = nextOffset;
286+
}
287+
288+
int length = Math.toIntExact(addresses.get(lastDocId + 1L) - startOffset);
289+
byte[] bytes = new byte[length];
290+
bytesSlice.readBytes(startOffset, bytes, 0, length);
291+
builder.appendBytesRefs(bytes, offsets);
292+
return builder.build();
293+
}
294+
} else {
295+
try (var builder = factory.bytesRefs(count)) {
296+
for (int i = offset; i < docs.count(); i++) {
297+
int docId = docs.get(i);
298+
long startOffset = addresses.get(docId);
299+
bytes.length = (int) (addresses.get(docId + 1L) - startOffset);
300+
bytesSlice.readBytes(startOffset, bytes.bytes, 0, bytes.length);
301+
builder.appendBytesRef(bytes);
302+
}
303+
return builder.build();
265304
}
266-
return builder.build();
267305
}
268306
}
269307
};
@@ -1556,13 +1594,6 @@ long lookAheadValueAt(int targetDoc) throws IOException {
15561594
return lookaheadBlock[valueIndex];
15571595
}
15581596

1559-
static boolean isDense(int firstDocId, int lastDocId, int length) {
1560-
// This does not detect duplicate docids (e.g [1, 1, 2, 4] would be detected as dense),
1561-
// this can happen with enrich or lookup. However this codec isn't used for enrich / lookup.
1562-
// This codec is only used in the context of logsdb and tsdb, so this is fine here.
1563-
return lastDocId - firstDocId == length - 1;
1564-
}
1565-
15661597
@Override
15671598
SortedOrdinalReader sortedOrdinalReader() {
15681599
return null;
@@ -1681,6 +1712,13 @@ public BlockLoader.Block tryRead(
16811712
}
16821713
}
16831714

1715+
private static boolean isDense(int firstDocId, int lastDocId, int length) {
1716+
// This does not detect duplicate docids (e.g [1, 1, 2, 4] would be detected as dense),
1717+
// this can happen with enrich or lookup. However this codec isn't used for enrich / lookup.
1718+
// This codec is only used in the context of logsdb and tsdb, so this is fine here.
1719+
return lastDocId - firstDocId == length - 1;
1720+
}
1721+
16841722
private NumericDocValues getRangeEncodedNumericDocValues(NumericEntry entry, long maxOrd) throws IOException {
16851723
final var ordinalsReader = new SortedOrdinalReader(
16861724
maxOrd,

server/src/main/java/org/elasticsearch/index/mapper/BlockLoader.java

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -407,6 +407,17 @@ interface BlockFactory {
407407
*/
408408
BytesRefBuilder bytesRefs(int expectedCount);
409409

410+
/**
411+
* Build a specialized builder for singleton dense {@link BytesRef} fields with the following constraints:
412+
* <ul>
413+
* <li>Only one value per document can be collected</li>
414+
* <li>No more than expectedCount values can be collected</li>
415+
* </ul>
416+
*
417+
* @param expectedCount The maximum number of values to be collected.
418+
*/
419+
SingletonBytesRefBuilder singletonBytesRefs(int expectedCount);
420+
410421
/**
411422
* Build a builder to load doubles as loaded from doc values.
412423
* Doc values load doubles in sorted order.
@@ -574,6 +585,22 @@ interface BytesRefBuilder extends Builder {
574585
BytesRefBuilder appendBytesRef(BytesRef value);
575586
}
576587

588+
/**
589+
* Specialized builder for collecting dense arrays of BytesRef values.
590+
*/
591+
interface SingletonBytesRefBuilder extends Builder {
592+
/**
593+
* Append multiple BytesRefs. The offsets array contains the offset of each BytesRef within the byte array.
594+
* The length of the offsets array is one more than the number of BytesRefs.
595+
*/
596+
SingletonBytesRefBuilder appendBytesRefs(byte[] bytes, long[] offsets) throws IOException;
597+
598+
/**
599+
* Append multiple BytesRefs, all with the same length.
600+
*/
601+
SingletonBytesRefBuilder appendBytesRefs(byte[] bytes, long bytesRefLengths) throws IOException;
602+
}
603+
577604
interface FloatBuilder extends Builder {
578605
/**
579606
* Appends a float to the current entry.

0 commit comments

Comments
 (0)