Skip to content

Commit 5076681

Browse files
authored
Implement docIDRunEnd() on ES819TSDBDocValuesProducer (#132939)
This method allows consumers to quickly check if a DocIdSetIterator matches a large run of documents; it was missing from our custom Codec DocValues implementations.
1 parent c453ff2 commit 5076681

File tree

2 files changed

+181
-0
lines changed

2 files changed

+181
-0
lines changed

server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesProducer.java

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,11 @@ public boolean advanceExact(int target) throws IOException {
294294
doc = target;
295295
return true;
296296
}
297+
298+
// Reports maxDoc as the end of the current run of matching doc IDs.
// The advanceExact shown just above accepts every target unconditionally, so this
// iterator presumably matches all documents (dense case) - confirm against the enclosing class.
@Override
299+
public int docIDRunEnd() throws IOException {
300+
return maxDoc;
301+
}
297302
}
298303

299304
private abstract static class SparseBinaryDocValues extends BinaryDocValues {
@@ -328,6 +333,11 @@ public int advance(int target) throws IOException {
328333
public boolean advanceExact(int target) throws IOException {
329334
return disi.advanceExact(target);
330335
}
336+
337+
// Delegates run-end detection to disi, the same iterator that advanceExact
// in this class forwards to, so the reported run is whatever disi reports.
@Override
338+
public int docIDRunEnd() throws IOException {
339+
return disi.docIDRunEnd();
340+
}
331341
}
332342

333343
@Override
@@ -369,6 +379,11 @@ public int advance(int target) throws IOException {
369379
public long cost() {
370380
return ords.cost();
371381
}
382+
383+
// Forwards to the wrapped ords iterator, mirroring how cost() above
// is answered by ords as well.
@Override
384+
public int docIDRunEnd() throws IOException {
385+
return ords.docIDRunEnd();
386+
}
372387
};
373388
}
374389

@@ -750,6 +765,11 @@ public int advance(int target) throws IOException {
750765
public long cost() {
751766
return ords.cost();
752767
}
768+
769+
// Forwards to the wrapped ords iterator; cost() in this anonymous class
// is likewise answered by ords.
@Override
770+
public int docIDRunEnd() throws IOException {
771+
return ords.docIDRunEnd();
772+
}
753773
};
754774
}
755775

@@ -1086,6 +1106,11 @@ public boolean advanceExact(int target) {
10861106
public long cost() {
10871107
return maxDoc;
10881108
}
1109+
1110+
// Reports maxDoc as the end of the current run. cost() above also returns maxDoc,
// and the else branch that follows builds an IndexedDISI instead, so this is
// presumably the branch where every document has a value - confirm against the full method.
@Override
1111+
public int docIDRunEnd() {
1112+
return maxDoc;
1113+
}
10891114
};
10901115
} else {
10911116
final IndexedDISI disi = new IndexedDISI(
@@ -1127,6 +1152,11 @@ public long cost() {
11271152
public long longValue() {
11281153
return 0L;
11291154
}
1155+
1156+
// Delegates to disi, the IndexedDISI constructed at the top of this else branch,
// so the run end is whatever that iterator reports.
@Override
1157+
public int docIDRunEnd() throws IOException {
1158+
return disi.docIDRunEnd();
1159+
}
11301160
};
11311161
}
11321162
}
@@ -1178,6 +1208,11 @@ public long cost() {
11781208
return maxDoc;
11791209
}
11801210

1211+
// Reports maxDoc as the end of the current run; cost() directly above returns
// maxDoc too, which suggests an iterator over all documents - confirm against the enclosing class.
@Override
1212+
public int docIDRunEnd() {
1213+
return maxDoc;
1214+
}
1215+
11811216
@Override
11821217
public long longValue() throws IOException {
11831218
final int index = doc;
@@ -1286,6 +1321,11 @@ public long cost() {
12861321
return disi.cost();
12871322
}
12881323

1324+
// Delegates to disi, the same iterator that answers cost() above and
// supplies index() to longValue() below.
@Override
1325+
public int docIDRunEnd() throws IOException {
1326+
return disi.docIDRunEnd();
1327+
}
1328+
12891329
@Override
12901330
public long longValue() throws IOException {
12911331
final int index = disi.index();
@@ -1406,6 +1446,11 @@ public long nextValue() throws IOException {
14061446
public int docValueCount() {
14071447
return count;
14081448
}
1449+
1450+
// Reports maxDoc as the end of the current run. The else branch that follows is
// labelled "sparse", so this branch is presumably the dense one - confirm against the full method.
@Override
1451+
public int docIDRunEnd() {
1452+
return maxDoc;
1453+
}
14091454
};
14101455
} else {
14111456
// sparse
@@ -1463,6 +1508,11 @@ public int docValueCount() {
14631508
return count;
14641509
}
14651510

1511+
// Delegates to disi, the iterator whose index() is read by set() below.
@Override
1512+
public int docIDRunEnd() throws IOException {
1513+
return disi.docIDRunEnd();
1514+
}
1515+
14661516
private void set() {
14671517
if (set == false) {
14681518
final int index = disi.index();

server/src/test/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesFormatTests.java

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
import org.apache.lucene.index.LogByteSizeMergePolicy;
2828
import org.apache.lucene.index.NumericDocValues;
2929
import org.apache.lucene.index.SortedDocValues;
30+
import org.apache.lucene.search.DocIdSetIterator;
3031
import org.apache.lucene.search.IndexSearcher;
3132
import org.apache.lucene.search.Sort;
3233
import org.apache.lucene.search.SortField;
@@ -49,6 +50,8 @@
4950
import java.util.function.Supplier;
5051
import java.util.stream.IntStream;
5152

53+
import static org.elasticsearch.test.ESTestCase.randomFrom;
54+
5255
public class ES819TSDBDocValuesFormatTests extends ES87TSDBDocValuesFormatTests {
5356

5457
final Codec codec = TestUtil.alwaysDocValuesFormat(new ES819TSDBDocValuesFormat());
@@ -959,6 +962,134 @@ private static BulkNumericDocValues getBulkNumericDocValues(LeafReader leafReade
959962
return (BulkNumericDocValues) DocValues.unwrapSingleton(leafReader.getSortedNumericDocValues(counterField));
960963
}
961964

965+
/**
 * Indexes between 10000 and 19999 documents, each carrying numeric, sorted-numeric, sorted,
 * sorted-set and binary doc values in two variants: one present on every document and a
 * {@code sparse_*} one that is omitted on every {@code gap_frequency}-th document. The index is
 * force-merged to a single segment and {@code validateRunEnd} is run on each field's iterator.
 *
 * @throws IOException propagated from index writing or doc values reading
 */
public void testDocIDEndRun() throws IOException {
966+
String timestampField = "@timestamp";
967+
String hostnameField = "host.name";
968+
long baseTimestamp = 1704067200000L;
969+
970+
var config = getTimeSeriesIndexWriterConfig(hostnameField, timestampField);
971+
try (var dir = newDirectory(); var iw = new IndexWriter(dir, config)) {
972+
long counter1 = 0;
973+
974+
long[] gauge2Values = new long[] { -2, -4, -6, -8, -10, -12, -14, -16 };
975+
String[] tags = new String[] { "tag_1", "tag_2", "tag_3", "tag_4", "tag_5", "tag_6", "tag_7", "tag_8" };
976+
977+
// IndexedDISI stores ids in blocks of 4096. To test sparse end runs, we want a mixture of
978+
// dense and sparse blocks, so we need the gap frequency to be larger than
979+
// this value, but smaller than two blocks, and to index at least three blocks
// NOTE(review): Lucene's IndexedDISI javadoc describes blocks of 65536 doc IDs, with 4096 as the
// cardinality threshold between the sparse and dense block encodings - confirm the block size stated above.
980+
int gap_frequency = 4500 + random().nextInt(2048);
981+
int numDocs = 10000 + random().nextInt(10000);
982+
int numHosts = numDocs / 20;
983+
984+
for (int i = 0; i < numDocs; i++) {
985+
var d = new Document();
986+
987+
int batchIndex = i / numHosts;
988+
String hostName = String.format(Locale.ROOT, "host-%03d", batchIndex);
989+
long timestamp = baseTimestamp + (1000L * i);
990+
991+
d.add(new SortedDocValuesField(hostnameField, new BytesRef(hostName)));
992+
// Index sorting doesn't work with NumericDocValuesField:
993+
d.add(new SortedNumericDocValuesField(timestampField, timestamp));
994+
d.add(new NumericDocValuesField("counter", counter1++));
995+
// Every gap_frequency-th document (including document 0) gets no sparse_* values,
// which leaves holes in those fields.
if (i % gap_frequency != 0) {
996+
d.add(new NumericDocValuesField("sparse_counter", counter1));
997+
}
998+
999+
int numGauge2 = 1 + random().nextInt(8);
1000+
for (int j = 0; j < numGauge2; j++) {
1001+
d.add(new SortedNumericDocValuesField("gauge", gauge2Values[(i + j) % gauge2Values.length]));
1002+
if (i % gap_frequency != 0) {
1003+
d.add(new SortedNumericDocValuesField("sparse_gauge", gauge2Values[(i + j) % gauge2Values.length]));
1004+
}
1005+
}
1006+
1007+
d.add(new SortedDocValuesField("tag", new BytesRef(randomFrom(tags))));
1008+
if (i % gap_frequency != 0) {
1009+
d.add(new SortedDocValuesField("sparse_tag", new BytesRef(randomFrom(tags))));
1010+
}
1011+
1012+
int numTags = 1 + random().nextInt(8);
1013+
for (int j = 0; j < numTags; j++) {
1014+
d.add(new SortedSetDocValuesField("tags", new BytesRef(tags[(i + j) % tags.length])));
1015+
if (i % gap_frequency != 0) {
1016+
d.add(new SortedSetDocValuesField("sparse_tags", new BytesRef(tags[(i + j) % tags.length])));
1017+
}
1018+
}
1019+
1020+
d.add(new BinaryDocValuesField("tags_as_bytes", new BytesRef(tags[i % tags.length])));
1021+
if (i % gap_frequency != 0) {
1022+
d.add(new BinaryDocValuesField("sparse_tags_as_bytes", new BytesRef(tags[i % tags.length])));
1023+
}
1024+
1025+
iw.addDocument(d);
1026+
if (i % 100 == 0) {
1027+
iw.commit();
1028+
}
1029+
}
1030+
iw.commit();
1031+
1032+
// Merge down to one segment; the single-leaf and maxDoc assertions below depend on it.
iw.forceMerge(1);
1033+
1034+
try (var reader = DirectoryReader.open(iw)) {
1035+
assertEquals(1, reader.leaves().size());
1036+
assertEquals(numDocs, reader.maxDoc());
1037+
var leaf = reader.leaves().get(0).reader();
1038+
var hostNameDV = leaf.getSortedDocValues(hostnameField);
1039+
assertNotNull(hostNameDV);
1040+
validateRunEnd(hostNameDV);
1041+
var timestampDV = DocValues.unwrapSingleton(leaf.getSortedNumericDocValues(timestampField));
1042+
assertNotNull(timestampDV);
1043+
validateRunEnd(timestampDV);
1044+
var counterOneDV = leaf.getNumericDocValues("counter");
1045+
assertNotNull(counterOneDV);
1046+
validateRunEnd(counterOneDV);
1047+
var sparseCounter = leaf.getNumericDocValues("sparse_counter");
1048+
assertNotNull(sparseCounter);
1049+
validateRunEnd(sparseCounter);
1050+
var gaugeOneDV = leaf.getSortedNumericDocValues("gauge");
1051+
assertNotNull(gaugeOneDV);
1052+
validateRunEnd(gaugeOneDV);
1053+
var sparseGaugeDV = leaf.getSortedNumericDocValues("sparse_gauge");
1054+
assertNotNull(sparseGaugeDV);
1055+
validateRunEnd(sparseGaugeDV);
1056+
var tagDV = leaf.getSortedDocValues("tag");
1057+
assertNotNull(tagDV);
1058+
validateRunEnd(tagDV);
1059+
var sparseTagDV = leaf.getSortedDocValues("sparse_tag");
1060+
assertNotNull(sparseTagDV);
1061+
validateRunEnd(sparseTagDV);
1062+
var tagsDV = leaf.getSortedSetDocValues("tags");
1063+
assertNotNull(tagsDV);
1064+
validateRunEnd(tagsDV);
1065+
var sparseTagsDV = leaf.getSortedSetDocValues("sparse_tags");
1066+
assertNotNull(sparseTagsDV);
1067+
validateRunEnd(sparseTagsDV);
1068+
var tagBytesDV = leaf.getBinaryDocValues("tags_as_bytes");
1069+
assertNotNull(tagBytesDV);
1070+
validateRunEnd(tagBytesDV);
1071+
var sparseTagBytesDV = leaf.getBinaryDocValues("sparse_tags_as_bytes");
1072+
assertNotNull(sparseTagBytesDV);
1073+
validateRunEnd(sparseTagBytesDV);
1074+
}
1075+
}
1076+
}
1077+
1078+
/**
 * Exhausts {@code iterator} and cross-checks {@code docIDRunEnd()} against {@code advance}.
 * Whenever the reported run holds more than one document after the current one, each following
 * doc ID in that run must be returned by {@code advance}. Fails if no such run was ever seen.
 *
 * @param iterator iterator under test; it is consumed by this call
 * @throws IOException propagated from the iterator
 */
private void validateRunEnd(DocIdSetIterator iterator) throws IOException {
1079+
int runCount = 0;
1080+
while (iterator.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
1081+
// Number of doc IDs after the current one that the iterator claims belong to the same run.
int runLength = iterator.docIDRunEnd() - iterator.docID() - 1;
1082+
if (runLength > 1) {
1083+
runCount++;
1084+
for (int i = 0; i < runLength; i++) {
1085+
// Re-read docID() on every pass: the advance below moves the iterator one doc forward.
int expected = iterator.docID() + 1;
1086+
assertEquals(expected, iterator.advance(expected));
1087+
}
1088+
}
1089+
}
1090+
assertTrue("Expected docid runs of greater than 1", runCount > 0);
1091+
}
1092+
9621093
private IndexWriterConfig getTimeSeriesIndexWriterConfig(String hostnameField, String timestampField) {
9631094
var config = new IndexWriterConfig();
9641095
if (hostnameField != null) {

0 commit comments

Comments
 (0)