Skip to content

Commit d9783ad

Browse files
authored
Fix bug in tsdb doc value codec (#134979)
The ordinal range encoding doesn't support multi-valued fields, but the logic that prohibits its usage for multi-valued field was missing, which resulted in multi-valued fields being incorrectly encoded. The ordinal range encoding was introduced for primary sort fields only to allow a more efficient look-ahead / skip logic. This is to efficiently determine whether a constant block could be used for a range of matching doc ids in compute engine. Closes #134950
1 parent d39cc7d commit d9783ad

File tree

3 files changed

+127
-42
lines changed

3 files changed

+127
-42
lines changed

server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesConsumer.java

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -129,9 +129,10 @@ public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOExcepti
129129
writeField(field, producer, -1, null);
130130
}
131131

132-
private boolean shouldEncodeOrdinalRange(FieldInfo field, long maxOrd, int numDocsWithValue) {
132+
private boolean shouldEncodeOrdinalRange(FieldInfo field, long maxOrd, int numDocsWithValue, long numValues) {
133133
return maxDoc > 1
134134
&& field.number == primarySortFieldNumber
135+
&& numDocsWithValue == numValues // Only single valued fields can be supported with range encoded ordinals format
135136
&& (numDocsWithValue / maxOrd) >= minDocsPerOrdinalForOrdinalRangeEncoding;
136137
}
137138

@@ -167,7 +168,8 @@ private long[] writeField(FieldInfo field, TsdbDocValuesProducer valuesProducer,
167168
if (maxOrd == 1) {
168169
// Special case for maxOrd of 1, signal -1 that no blocks will be written
169170
meta.writeInt(-1);
170-
} else if (shouldEncodeOrdinalRange(field, maxOrd, numDocsWithValue)) {
171+
} else if (shouldEncodeOrdinalRange(field, maxOrd, numDocsWithValue, numValues)) {
172+
assert offsetsAccumulator == null;
171173
// When a field is sorted, use ordinal range encode for long runs of the same ordinal.
172174
meta.writeInt(-2);
173175
meta.writeVInt(Math.toIntExact(maxOrd));
@@ -188,9 +190,6 @@ private long[] writeField(FieldInfo field, TsdbDocValuesProducer valuesProducer,
188190
if (disiAccumulator != null) {
189191
disiAccumulator.addDocId(doc);
190192
}
191-
if (offsetsAccumulator != null) {
192-
offsetsAccumulator.addDoc(1);
193-
}
194193
final long nextOrd = values.nextValue();
195194
if (nextOrd != lastOrd) {
196195
lastOrd = nextOrd;

server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesProducer.java

Lines changed: 44 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1318,42 +1318,7 @@ public long longValue() throws IOException {
13181318
};
13191319
}
13201320
} else if (entry.sortedOrdinals != null) {
1321-
final var ordinalsReader = new SortedOrdinalReader(
1322-
maxOrd,
1323-
DirectMonotonicReader.getInstance(
1324-
entry.sortedOrdinals,
1325-
data.randomAccessSlice(entry.valuesOffset, entry.valuesLength),
1326-
true
1327-
)
1328-
);
1329-
if (entry.docsWithFieldOffset == -1) {
1330-
return new BaseDenseNumericValues(maxDoc) {
1331-
@Override
1332-
long lookAheadValueAt(int targetDoc) {
1333-
return ordinalsReader.lookAheadValue(targetDoc);
1334-
}
1335-
1336-
@Override
1337-
public long longValue() {
1338-
return ordinalsReader.readValueAndAdvance(doc);
1339-
}
1340-
};
1341-
} else {
1342-
final var disi = new IndexedDISI(
1343-
data,
1344-
entry.docsWithFieldOffset,
1345-
entry.docsWithFieldLength,
1346-
entry.jumpTableEntryCount,
1347-
entry.denseRankPower,
1348-
entry.numValues
1349-
);
1350-
return new BaseSparseNumericValues(disi) {
1351-
@Override
1352-
public long longValue() {
1353-
return ordinalsReader.readValueAndAdvance(disi.docID());
1354-
}
1355-
};
1356-
}
1321+
return getRangeEncodedNumericDocValues(entry, maxOrd);
13571322
}
13581323

13591324
// NOTE: we could make this a bit simpler by reusing #getValues but this
@@ -1596,6 +1561,41 @@ public BlockLoader.Block tryRead(
15961561
}
15971562
}
15981563

1564+
private NumericDocValues getRangeEncodedNumericDocValues(NumericEntry entry, long maxOrd) throws IOException {
1565+
final var ordinalsReader = new SortedOrdinalReader(
1566+
maxOrd,
1567+
DirectMonotonicReader.getInstance(entry.sortedOrdinals, data.randomAccessSlice(entry.valuesOffset, entry.valuesLength), true)
1568+
);
1569+
if (entry.docsWithFieldOffset == -1) {
1570+
return new BaseDenseNumericValues(maxDoc) {
1571+
@Override
1572+
long lookAheadValueAt(int targetDoc) {
1573+
return ordinalsReader.lookAheadValue(targetDoc);
1574+
}
1575+
1576+
@Override
1577+
public long longValue() {
1578+
return ordinalsReader.readValueAndAdvance(doc);
1579+
}
1580+
};
1581+
} else {
1582+
final var disi = new IndexedDISI(
1583+
data,
1584+
entry.docsWithFieldOffset,
1585+
entry.docsWithFieldLength,
1586+
entry.jumpTableEntryCount,
1587+
entry.denseRankPower,
1588+
entry.numValues
1589+
);
1590+
return new BaseSparseNumericValues(disi) {
1591+
@Override
1592+
public long longValue() {
1593+
return ordinalsReader.readValueAndAdvance(disi.docID());
1594+
}
1595+
};
1596+
}
1597+
}
1598+
15991599
private NumericValues getValues(NumericEntry entry, final long maxOrd) throws IOException {
16001600
assert entry.numValues > 0;
16011601
final RandomAccessInput indexSlice = data.randomAccessSlice(entry.indexOffset, entry.indexLength);
@@ -1638,6 +1638,14 @@ private SortedNumericDocValues getSortedNumeric(SortedNumericEntry entry, long m
16381638
final RandomAccessInput addressesInput = data.randomAccessSlice(entry.addressesOffset, entry.addressesLength);
16391639
final LongValues addresses = DirectMonotonicReader.getInstance(entry.addressesMeta, addressesInput, merging);
16401640

1641+
assert entry.sortedOrdinals == null : "encoded ordinal range supports only one value per document";
1642+
if (entry.sortedOrdinals != null) {
1643+
// TODO: determine when this can be removed.
1644+
// This is for the clusters that ended up using ordinal range encoding for multi-values fields. Only first value can be
1645+
// returned.
1646+
NumericDocValues values = getRangeEncodedNumericDocValues(entry, maxOrd);
1647+
return DocValues.singleton(values);
1648+
}
16411649
final NumericValues values = getValues(entry, maxOrd);
16421650

16431651
if (entry.docsWithFieldOffset == -1) {

server/src/test/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesFormatTests.java

Lines changed: 79 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
import org.apache.lucene.search.Sort;
3333
import org.apache.lucene.search.SortField;
3434
import org.apache.lucene.search.SortedNumericSortField;
35+
import org.apache.lucene.search.SortedSetSortField;
3536
import org.apache.lucene.store.Directory;
3637
import org.apache.lucene.util.BytesRef;
3738
import org.elasticsearch.cluster.metadata.DataStream;
@@ -1302,6 +1303,79 @@ public int get(int docId) {
13021303
}
13031304
}
13041305

1306+
public void testEncodeRangeWithSortedSetPrimarySortField() throws Exception {
1307+
String timestampField = "@timestamp";
1308+
String hostnameField = "host.name";
1309+
long baseTimestamp = 1704067200000L;
1310+
1311+
var config = getTimeSeriesIndexWriterConfig(hostnameField, true, timestampField);
1312+
try (var dir = newDirectory(); var iw = new IndexWriter(dir, config)) {
1313+
1314+
int numDocs = 512 + random().nextInt(512);
1315+
int numHosts = numDocs / 20;
1316+
1317+
for (int i = 0; i < numDocs; i++) {
1318+
var d = new Document();
1319+
int batchIndex = i / numHosts;
1320+
{
1321+
String hostName = String.format(Locale.ROOT, "host-%03d", batchIndex);
1322+
d.add(new SortedSetDocValuesField(hostnameField, new BytesRef(hostName)));
1323+
}
1324+
{
1325+
String hostName = String.format(Locale.ROOT, "host-%03d", batchIndex + 1);
1326+
d.add(new SortedSetDocValuesField(hostnameField, new BytesRef(hostName)));
1327+
}
1328+
// Index sorting doesn't work with NumericDocValuesField:
1329+
long timestamp = baseTimestamp + (1000L * i);
1330+
d.add(new SortedNumericDocValuesField(timestampField, timestamp));
1331+
iw.addDocument(d);
1332+
if (i % 100 == 0) {
1333+
iw.commit();
1334+
}
1335+
}
1336+
iw.commit();
1337+
iw.forceMerge(1);
1338+
1339+
try (var reader = DirectoryReader.open(iw)) {
1340+
assertEquals(1, reader.leaves().size());
1341+
assertEquals(numDocs, reader.maxDoc());
1342+
var leaf = reader.leaves().get(0).reader();
1343+
var hostNameDV = leaf.getSortedSetDocValues(hostnameField);
1344+
assertNotNull(hostNameDV);
1345+
var timestampDV = DocValues.unwrapSingleton(leaf.getSortedNumericDocValues(timestampField));
1346+
assertNotNull(timestampDV);
1347+
for (int i = 0; i < numDocs; i++) {
1348+
assertEquals(i, hostNameDV.nextDoc());
1349+
1350+
int batchIndex = i / numHosts;
1351+
assertEquals(2, hostNameDV.docValueCount());
1352+
1353+
long firstOrd = hostNameDV.nextOrd();
1354+
assertEquals(batchIndex, firstOrd);
1355+
String expectedFirstHostName = String.format(Locale.ROOT, "host-%03d", batchIndex);
1356+
String actualFirstHostName = hostNameDV.lookupOrd(firstOrd).utf8ToString();
1357+
assertEquals(expectedFirstHostName, actualFirstHostName);
1358+
1359+
batchIndex++;
1360+
long secondOrd = hostNameDV.nextOrd();
1361+
assertEquals(batchIndex, secondOrd);
1362+
String expectedSecondHostName = String.format(Locale.ROOT, "host-%03d", batchIndex);
1363+
String actualSecondHostName = hostNameDV.lookupOrd(secondOrd).utf8ToString();
1364+
assertEquals(expectedSecondHostName, actualSecondHostName);
1365+
1366+
assertEquals(i, timestampDV.nextDoc());
1367+
long timestamp = timestampDV.longValue();
1368+
long lowerBound = baseTimestamp;
1369+
long upperBound = baseTimestamp + (1000L * numDocs);
1370+
assertTrue(
1371+
"unexpected timestamp [" + timestamp + "], expected between [" + lowerBound + "] and [" + upperBound + "]",
1372+
timestamp >= lowerBound && timestamp < upperBound
1373+
);
1374+
}
1375+
}
1376+
}
1377+
}
1378+
13051379
private static BaseDenseNumericValues getBaseDenseNumericValues(LeafReader leafReader, String field) throws IOException {
13061380
return (BaseDenseNumericValues) DocValues.unwrapSingleton(leafReader.getSortedNumericDocValues(field));
13071381
}
@@ -1315,11 +1389,15 @@ private static BaseSortedDocValues getBaseSortedDocValues(LeafReader leafReader,
13151389
}
13161390

13171391
private IndexWriterConfig getTimeSeriesIndexWriterConfig(String hostnameField, String timestampField) {
1392+
return getTimeSeriesIndexWriterConfig(hostnameField, false, timestampField);
1393+
}
1394+
1395+
private IndexWriterConfig getTimeSeriesIndexWriterConfig(String hostnameField, boolean multiValued, String timestampField) {
13181396
var config = new IndexWriterConfig();
13191397
if (hostnameField != null) {
13201398
config.setIndexSort(
13211399
new Sort(
1322-
new SortField(hostnameField, SortField.Type.STRING, false),
1400+
multiValued ? new SortedSetSortField(hostnameField, false) : new SortField(hostnameField, SortField.Type.STRING, false),
13231401
new SortedNumericSortField(timestampField, SortField.Type.LONG, true)
13241402
)
13251403
);

0 commit comments

Comments
 (0)