Skip to content

Commit e370f43

Browse files
committed
IndexedDISIBuilder
1 parent 44e8bb4 commit e370f43

File tree

2 files changed

+192
-14
lines changed

2 files changed

+192
-14
lines changed

server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesConsumer.java

Lines changed: 20 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,6 @@
3939
import org.apache.lucene.util.BytesRef;
4040
import org.apache.lucene.util.BytesRefBuilder;
4141
import org.apache.lucene.util.LongsRef;
42-
import org.apache.lucene.util.RoaringDocIdSet;
4342
import org.apache.lucene.util.StringHelper;
4443
import org.apache.lucene.util.compress.LZ4;
4544
import org.apache.lucene.util.packed.DirectMonotonicWriter;
@@ -159,8 +158,9 @@ private long[] writeField(
159158
meta.writeLong(numValues);
160159
meta.writeInt(numDocsWithValue);
161160

162-
// TODO: write DISI to temp file and append it later to data part:
163-
var docIdSetBuilder = new RoaringDocIdSet.Builder(maxDoc);
161+
// TODO: which IOContext should be used here?
162+
IndexOutput disiTempOutput = null;
163+
IndexedDISIBuilder docIdSetBuilder = null;
164164
if (numValues > 0) {
165165
// Special case for maxOrd of 1, signal -1 that no blocks will be written
166166
meta.writeInt(maxOrd != 1 ? ES87TSDBDocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT : -1);
@@ -181,13 +181,15 @@ private long[] writeField(
181181
values = valuesProducer.getSortedNumeric(field);
182182
final int bitsPerOrd = maxOrd >= 0 ? PackedInts.bitsRequired(maxOrd - 1) : -1;
183183

184-
// Reset and recompute. The value gathered from TsdbDocValuesProducer may not be accurate if one of the leaves was singleton
185-
// This could cause failures when writing addresses in writeSortedNumericField(...)
186-
numDocsWithValue = 0;
184+
if (numDocsWithValue != 0 && numDocsWithValue != maxDoc) {
185+
disiTempOutput = dir.createTempOutput(data.getName(), "disi", IOContext.DEFAULT);
186+
docIdSetBuilder = new IndexedDISIBuilder(disiTempOutput, IndexedDISI.DEFAULT_DENSE_RANK_POWER);
187+
}
187188

188189
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
189-
numDocsWithValue++;
190-
docIdSetBuilder.add(doc);
190+
if (docIdSetBuilder != null) {
191+
docIdSetBuilder.addDocId(doc);
192+
}
191193
final int count = values.docValueCount();
192194
if (docCountConsumer != null) {
193195
docCountConsumer.accept(count);
@@ -244,13 +246,17 @@ private long[] writeField(
244246
long offset = data.getFilePointer();
245247
meta.writeLong(offset); // docsWithFieldOffset
246248
final short jumpTableEntryCount;
247-
if (maxOrd != 1) {
248-
var bitSet = docIdSetBuilder.build();
249-
var iterator = bitSet.iterator();
250-
if (iterator == null) {
251-
iterator = DocIdSetIterator.empty();
249+
if (maxOrd != 1 && docIdSetBuilder != null) {
250+
jumpTableEntryCount = docIdSetBuilder.build();
251+
String skipListTempFileName = disiTempOutput.getName();
252+
disiTempOutput.close();
253+
try (
254+
// TODO: which IOContext should be used here?
255+
var addressDataInput = dir.openInput(skipListTempFileName, IOContext.DEFAULT)
256+
) {
257+
data.copyBytes(addressDataInput, addressDataInput.length());
252258
}
253-
jumpTableEntryCount = IndexedDISI.writeBitSet(iterator, data, IndexedDISI.DEFAULT_DENSE_RANK_POWER);
259+
org.apache.lucene.util.IOUtils.deleteFilesIgnoringExceptions(dir, skipListTempFileName);
254260
} else {
255261
values = valuesProducer.getSortedNumeric(field);
256262
jumpTableEntryCount = IndexedDISI.writeBitSet(values, data, IndexedDISI.DEFAULT_DENSE_RANK_POWER);
Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the "Elastic License
4+
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
5+
* Public License v 1"; you may not use this file except in compliance with, at
6+
* your election, the "Elastic License 2.0", the "GNU Affero General Public
7+
* License v3.0 only", or the "Server Side Public License, v 1".
8+
*/
9+
10+
package org.elasticsearch.index.codec.tsdb;
11+
12+
import org.apache.lucene.search.DocIdSetIterator;
13+
import org.apache.lucene.store.IndexOutput;
14+
import org.apache.lucene.util.ArrayUtil;
15+
import org.apache.lucene.util.BitSetIterator;
16+
import org.apache.lucene.util.FixedBitSet;
17+
18+
import java.io.IOException;
19+
20+
/**
 * Fork of {@link org.apache.lucene.codecs.lucene90.IndexedDISI#writeBitSet(DocIdSetIterator, IndexOutput)} but that allows
 * building jump list iteratively with one docid at a time instead of relying on docidset iterator.
 *
 * <p>Usage: feed docIDs via {@link #addDocId(int)}, then call {@link #build()} once to append the
 * terminating NO_MORE_DOCS block and the jump table to {@code out} and obtain the jump-table entry count.
 * NOTE(review): docIDs are presumably required to be strictly increasing (as a DocIdSetIterator would
 * produce) and {@code build()} must not be called twice — neither is checked here; confirm with callers.
 */
final class IndexedDISIBuilder {

    // The number of docIDs that a single block represents (one block covers doc >>> 16)
    private static final int BLOCK_SIZE = 65536;

    private static final int DENSE_BLOCK_LONGS = BLOCK_SIZE / Long.SIZE; // 1024
    public static final byte DEFAULT_DENSE_RANK_POWER = 9; // Every 512 docIDs / 8 longs

    // Blocks with cardinality above this are flushed as DENSE (bitmap); at or below, as SPARSE (list of shorts)
    static final int MAX_ARRAY_LENGTH = (1 << 12) - 1;

    final IndexOutput out;
    final byte denseRankPower;
    final long origo;

    // Number of docs flushed in all completed blocks so far (excludes the block currently buffered)
    int totalCardinality = 0;
    // Number of docs set in the currently buffered block
    int blockCardinality = 0;
    // Buffer for the current 65536-doc block; cleared after each flush
    final FixedBitSet buffer = new FixedBitSet(1 << 16);
    // Interleaved (index, offset) jump-table entries, two ints per block
    int[] jumps = new int[ArrayUtil.oversize(1, Integer.BYTES * 2)];
    int prevBlock = -1;
    // First block index not yet covered by a jump-table entry
    int jumpBlockIndex = 0;

    /**
     * @param out destination for the serialized DISI; bytes are appended starting at its current file pointer.
     * @param denseRankPower 7..15 to write a rank structure every 2^denseRankPower docIDs inside DENSE blocks,
     *     or -1 to disable ranks.
     * @throws IllegalArgumentException if denseRankPower is outside 7..15 and not -1.
     */
    IndexedDISIBuilder(IndexOutput out, byte denseRankPower) {
        this.out = out;
        this.denseRankPower = denseRankPower;

        this.origo = out.getFilePointer(); // All jumps are relative to the origo
        if ((denseRankPower < 7 || denseRankPower > 15) && denseRankPower != -1) {
            throw new IllegalArgumentException(
                "Acceptable values for denseRankPower are 7-15 (every 128-32768 docIDs). "
                    + "The provided power was "
                    + denseRankPower
                    + " (every "
                    + (int) Math.pow(2, denseRankPower)
                    + " docIDs)"
            );
        }
    }

    /**
     * Records one docID with a value. When the doc crosses into a new 65536-doc block, the previous
     * block is flushed to {@code out} and its jump-table entries are recorded first.
     */
    void addDocId(int doc) throws IOException {
        final int block = doc >>> 16;
        if (prevBlock != -1 && block != prevBlock) {
            // Track offset+index from previous block up to current
            jumps = addJumps(jumps, out.getFilePointer() - origo, totalCardinality, jumpBlockIndex, prevBlock + 1);
            jumpBlockIndex = prevBlock + 1;
            // Flush block
            flush(prevBlock, buffer, blockCardinality, denseRankPower, out);
            // Reset for next block
            buffer.clear();
            totalCardinality += blockCardinality;
            blockCardinality = 0;
        }
        buffer.set(doc & 0xFFFF);
        blockCardinality++;
        prevBlock = block;
    }

    /**
     * Flushes the last partial block, appends the terminating NO_MORE_DOCS block, and writes the
     * offset+index jump table at the end of {@code out}.
     *
     * @return the number of jump-table entries written (the value IndexedDISI readers need), or 0
     *     when the jump table was elided — see {@link #flushBlockJumps}.
     */
    short build() throws IOException {
        if (blockCardinality > 0) {
            jumps = addJumps(jumps, out.getFilePointer() - origo, totalCardinality, jumpBlockIndex, prevBlock + 1);
            totalCardinality += blockCardinality;
            flush(prevBlock, buffer, blockCardinality, denseRankPower, out);
            buffer.clear();
            prevBlock++;
        }
        final int lastBlock = prevBlock == -1 ? 0 : prevBlock; // There will always be at least 1 block (NO_MORE_DOCS)
        // Last entry is a SPARSE with blockIndex == 32767 and the single entry 65535, which becomes
        // the docID NO_MORE_DOCS. To avoid creating 65K jump-table entries, only a single entry is
        // created pointing to the offset of the NO_MORE_DOCS block, with the jumpBlockIndex set to
        // the logical EMPTY block after all real blocks.
        jumps = addJumps(jumps, out.getFilePointer() - origo, totalCardinality, lastBlock, lastBlock + 1);
        buffer.set(DocIdSetIterator.NO_MORE_DOCS & 0xFFFF);
        flush(DocIdSetIterator.NO_MORE_DOCS >>> 16, buffer, 1, denseRankPower, out);
        // offset+index jump-table stored at the end
        return flushBlockJumps(jumps, lastBlock + 1, out);
    }

    // Adds entries to the offset & index jump-table for blocks. Blocks [startBlock, endBlock) all
    // point at the same (index, offset) pair: empty blocks share the entry of the next real block.
    private static int[] addJumps(int[] jumps, long offset, int index, int startBlock, int endBlock) {
        assert offset < Integer.MAX_VALUE : "Logically the offset should not exceed 2^30 but was >= Integer.MAX_VALUE";
        jumps = ArrayUtil.grow(jumps, (endBlock + 1) * 2);
        for (int b = startBlock; b < endBlock; b++) {
            jumps[b * 2] = index;
            jumps[b * 2 + 1] = (int) offset;
        }
        return jumps;
    }

    // Serializes one block: a 2-byte block id, a 2-byte (cardinality - 1), then either a SPARSE list
    // of 2-byte doc offsets, a DENSE bitmap (optionally preceded by a rank structure), or — for a
    // fully-set block (cardinality == BLOCK_SIZE) — nothing, which readers treat as ALL.
    private static void flush(int block, FixedBitSet buffer, int cardinality, byte denseRankPower, IndexOutput out) throws IOException {
        assert block >= 0 && block < BLOCK_SIZE;
        out.writeShort((short) block);
        assert cardinality > 0 && cardinality <= BLOCK_SIZE;
        out.writeShort((short) (cardinality - 1));
        if (cardinality > MAX_ARRAY_LENGTH) {
            if (cardinality != BLOCK_SIZE) { // all docs are set
                if (denseRankPower != -1) {
                    final byte[] rank = createRank(buffer, denseRankPower);
                    out.writeBytes(rank, rank.length);
                }
                for (long word : buffer.getBits()) {
                    out.writeLong(word);
                }
            }
        } else {
            BitSetIterator it = new BitSetIterator(buffer, cardinality);
            for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
                out.writeShort((short) doc);
            }
        }
    }

    // Flushes the offset & index jump-table for blocks. This should be the last data written to out.
    // This method returns the blockCount for the blocks reachable for the jump_table or -1 for no
    // jump-table. NOTE(review): despite that inherited wording, this implementation returns 0 (not -1)
    // when the table is elided.
    private static short flushBlockJumps(int[] jumps, int blockCount, IndexOutput out) throws IOException {
        if (blockCount == 2) { // Jumps with a single real entry + NO_MORE_DOCS is just wasted space so we ignore
            // that
            blockCount = 0;
        }
        for (int i = 0; i < blockCount; i++) {
            out.writeInt(jumps[i * 2]); // index
            out.writeInt(jumps[i * 2 + 1]); // offset
        }
        // As there are at most 32k blocks, the count is a short
        // The jumpTableOffset will be at lastPos - (blockCount * Long.BYTES)
        return (short) blockCount;
    }

    // Creates a DENSE rank-entry (the number of set bits up to a given point) for the buffer.
    // One rank-entry for every {@code 2^denseRankPower} bits, with each rank-entry using 2 bytes.
    // Represented as a byte[] for fast flushing and mirroring of the retrieval representation.
    private static byte[] createRank(FixedBitSet buffer, byte denseRankPower) {
        final int longsPerRank = 1 << (denseRankPower - 6);
        final int rankMark = longsPerRank - 1;
        final int rankIndexShift = denseRankPower - 7; // 6 for the long (2^6) + 1 for 2 bytes/entry
        final byte[] rank = new byte[DENSE_BLOCK_LONGS >> rankIndexShift];
        final long[] bits = buffer.getBits();
        int bitCount = 0;
        for (int word = 0; word < DENSE_BLOCK_LONGS; word++) {
            if ((word & rankMark) == 0) { // Every longsPerRank longs
                rank[word >> rankIndexShift] = (byte) (bitCount >> 8);
                rank[(word >> rankIndexShift) + 1] = (byte) (bitCount & 0xFF);
            }
            bitCount += Long.bitCount(bits[word]);
        }
        return rank;
    }

}

0 commit comments

Comments
 (0)