diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesConsumer.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesConsumer.java index 8aaef4329d6c4..8cabd75652d4e 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesConsumer.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesConsumer.java @@ -129,9 +129,10 @@ public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOExcepti writeField(field, producer, -1, null); } - private boolean shouldEncodeOrdinalRange(FieldInfo field, long maxOrd, int numDocsWithValue) { + private boolean shouldEncodeOrdinalRange(FieldInfo field, long maxOrd, int numDocsWithValue, long numValues) { return maxDoc > 1 && field.number == primarySortFieldNumber + && numDocsWithValue == numValues // Only single valued fields can be supported with range encoded ordinals format && (numDocsWithValue / maxOrd) >= minDocsPerOrdinalForOrdinalRangeEncoding; } @@ -167,7 +168,8 @@ private long[] writeField(FieldInfo field, TsdbDocValuesProducer valuesProducer, if (maxOrd == 1) { // Special case for maxOrd of 1, signal -1 that no blocks will be written meta.writeInt(-1); - } else if (shouldEncodeOrdinalRange(field, maxOrd, numDocsWithValue)) { + } else if (shouldEncodeOrdinalRange(field, maxOrd, numDocsWithValue, numValues)) { + assert offsetsAccumulator == null; // When a field is sorted, use ordinal range encode for long runs of the same ordinal. meta.writeInt(-2); meta.writeVInt(Math.toIntExact(maxOrd)); @@ -188,9 +190,6 @@ private long[] writeField(FieldInfo field, TsdbDocValuesProducer valuesProducer, if (disiAccumulator != null) { disiAccumulator.addDocId(doc); } - if (offsetsAccumulator != null) { - offsetsAccumulator.addDoc(1); - } final long nextOrd = values.nextValue(); if (nextOrd != lastOrd) { lastOrd = nextOrd; diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesProducer.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesProducer.java index de0da6b9d2ac7..6e713693e3ea6 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesProducer.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesProducer.java @@ -1318,42 +1318,7 @@ public long longValue() throws IOException { }; } } else if (entry.sortedOrdinals != null) { - final var ordinalsReader = new SortedOrdinalReader( - maxOrd, - DirectMonotonicReader.getInstance( - entry.sortedOrdinals, - data.randomAccessSlice(entry.valuesOffset, entry.valuesLength), - true - ) - ); - if (entry.docsWithFieldOffset == -1) { - return new BaseDenseNumericValues(maxDoc) { - @Override - long lookAheadValueAt(int targetDoc) { - return ordinalsReader.lookAheadValue(targetDoc); - } - - @Override - public long longValue() { - return ordinalsReader.readValueAndAdvance(doc); - } - }; - } else { - final var disi = new IndexedDISI( - data, - entry.docsWithFieldOffset, - entry.docsWithFieldLength, - entry.jumpTableEntryCount, - entry.denseRankPower, - entry.numValues - ); - return new BaseSparseNumericValues(disi) { - @Override - public long longValue() { - return ordinalsReader.readValueAndAdvance(disi.docID()); - } - }; - } + return getRangeEncodedNumericDocValues(entry, maxOrd); } // NOTE: we could make this a bit simpler by reusing #getValues but this @@ -1596,6 +1561,41 @@ public BlockLoader.Block tryRead( } } + private NumericDocValues getRangeEncodedNumericDocValues(NumericEntry entry, long maxOrd) throws IOException { + final var ordinalsReader = new SortedOrdinalReader( + maxOrd, + DirectMonotonicReader.getInstance(entry.sortedOrdinals, data.randomAccessSlice(entry.valuesOffset, entry.valuesLength), true) + ); + if (entry.docsWithFieldOffset == -1) { + return new BaseDenseNumericValues(maxDoc) { + @Override + long lookAheadValueAt(int targetDoc) { + return ordinalsReader.lookAheadValue(targetDoc); + } + + @Override + public long longValue() { + return ordinalsReader.readValueAndAdvance(doc); + } + }; + } else { + final var disi = new IndexedDISI( + data, + entry.docsWithFieldOffset, + entry.docsWithFieldLength, + entry.jumpTableEntryCount, + entry.denseRankPower, + entry.numValues + ); + return new BaseSparseNumericValues(disi) { + @Override + public long longValue() { + return ordinalsReader.readValueAndAdvance(disi.docID()); + } + }; + } + } + private NumericValues getValues(NumericEntry entry, final long maxOrd) throws IOException { assert entry.numValues > 0; final RandomAccessInput indexSlice = data.randomAccessSlice(entry.indexOffset, entry.indexLength); @@ -1638,6 +1638,14 @@ private SortedNumericDocValues getSortedNumeric(SortedNumericEntry entry, long m final RandomAccessInput addressesInput = data.randomAccessSlice(entry.addressesOffset, entry.addressesLength); final LongValues addresses = DirectMonotonicReader.getInstance(entry.addressesMeta, addressesInput, merging); + assert entry.sortedOrdinals == null : "encoded ordinal range supports only one value per document"; + if (entry.sortedOrdinals != null) { + // TODO: determine when this can be removed. + // This is for the clusters that ended up using ordinal range encoding for multi-values fields. Only first value can be + // returned. + NumericDocValues values = getRangeEncodedNumericDocValues(entry, maxOrd); + return DocValues.singleton(values); + } final NumericValues values = getValues(entry, maxOrd); if (entry.docsWithFieldOffset == -1) { diff --git a/server/src/test/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesFormatTests.java index 73bb8cdac5fc9..ea29b1cbf1356 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesFormatTests.java @@ -32,6 +32,7 @@ import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; import org.apache.lucene.search.SortedNumericSortField; +import org.apache.lucene.search.SortedSetSortField; import org.apache.lucene.store.Directory; import org.apache.lucene.util.BytesRef; import org.elasticsearch.cluster.metadata.DataStream; @@ -1302,6 +1303,79 @@ public int get(int docId) { } } + public void testEncodeRangeWithSortedSetPrimarySortField() throws Exception { + String timestampField = "@timestamp"; + String hostnameField = "host.name"; + long baseTimestamp = 1704067200000L; + + var config = getTimeSeriesIndexWriterConfig(hostnameField, true, timestampField); + try (var dir = newDirectory(); var iw = new IndexWriter(dir, config)) { + + int numDocs = 512 + random().nextInt(512); + int numHosts = numDocs / 20; + + for (int i = 0; i < numDocs; i++) { + var d = new Document(); + int batchIndex = i / numHosts; + { + String hostName = String.format(Locale.ROOT, "host-%03d", batchIndex); + d.add(new SortedSetDocValuesField(hostnameField, new BytesRef(hostName))); + } + { + String hostName = String.format(Locale.ROOT, "host-%03d", batchIndex + 1); + d.add(new SortedSetDocValuesField(hostnameField, new BytesRef(hostName))); + } + // Index sorting doesn't work with NumericDocValuesField: + long timestamp = baseTimestamp + (1000L * i); + d.add(new SortedNumericDocValuesField(timestampField, timestamp)); + iw.addDocument(d); + if (i % 100 == 0) { + iw.commit(); + } + } + iw.commit(); + iw.forceMerge(1); + + try (var reader = DirectoryReader.open(iw)) { + assertEquals(1, reader.leaves().size()); + assertEquals(numDocs, reader.maxDoc()); + var leaf = reader.leaves().get(0).reader(); + var hostNameDV = leaf.getSortedSetDocValues(hostnameField); + assertNotNull(hostNameDV); + var timestampDV = DocValues.unwrapSingleton(leaf.getSortedNumericDocValues(timestampField)); + assertNotNull(timestampDV); + for (int i = 0; i < numDocs; i++) { + assertEquals(i, hostNameDV.nextDoc()); + + int batchIndex = i / numHosts; + assertEquals(2, hostNameDV.docValueCount()); + + long firstOrd = hostNameDV.nextOrd(); + assertEquals(batchIndex, firstOrd); + String expectedFirstHostName = String.format(Locale.ROOT, "host-%03d", batchIndex); + String actualFirstHostName = hostNameDV.lookupOrd(firstOrd).utf8ToString(); + assertEquals(expectedFirstHostName, actualFirstHostName); + + batchIndex++; + long secondOrd = hostNameDV.nextOrd(); + assertEquals(batchIndex, secondOrd); + String expectedSecondHostName = String.format(Locale.ROOT, "host-%03d", batchIndex); + String actualSecondHostName = hostNameDV.lookupOrd(secondOrd).utf8ToString(); + assertEquals(expectedSecondHostName, actualSecondHostName); + + assertEquals(i, timestampDV.nextDoc()); + long timestamp = timestampDV.longValue(); + long lowerBound = baseTimestamp; + long upperBound = baseTimestamp + (1000L * numDocs); + assertTrue( + "unexpected timestamp [" + timestamp + "], expected between [" + lowerBound + "] and [" + upperBound + "]", + timestamp >= lowerBound && timestamp < upperBound + ); + } + } + } + } + private static BaseDenseNumericValues getBaseDenseNumericValues(LeafReader leafReader, String field) throws IOException { return (BaseDenseNumericValues) DocValues.unwrapSingleton(leafReader.getSortedNumericDocValues(field)); } @@ -1315,11 +1389,15 @@ private static BaseSortedDocValues getBaseSortedDocValues(LeafReader leafReader, } private IndexWriterConfig getTimeSeriesIndexWriterConfig(String hostnameField, String timestampField) { + return getTimeSeriesIndexWriterConfig(hostnameField, false, timestampField); + } + + private IndexWriterConfig getTimeSeriesIndexWriterConfig(String hostnameField, boolean multiValued, String timestampField) { var config = new IndexWriterConfig(); if (hostnameField != null) { config.setIndexSort( new Sort( - new SortField(hostnameField, SortField.Type.STRING, false), + multiValued ? new SortedSetSortField(hostnameField, false) : new SortField(hostnameField, SortField.Type.STRING, false), new SortedNumericSortField(timestampField, SortField.Type.LONG, true) ) );