Copy consumer code for non-merging

parkertimmins · parkertimmins · commit eb0c3426cb06 · 2025-08-04T16:40:18.000-05:00
diff --git a/server/src/main/java/org/elasticsearch/common/compress/fsst/BulkCompressBufferer.java b/server/src/main/java/org/elasticsearch/common/compress/fsst/BulkCompressBufferer.java
@@ -0,0 +1,125 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the "Elastic License
+ * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
+ * Public License v 1"; you may not use this file except in compliance with, at
+ * your election, the "Elastic License 2.0", the "GNU Affero General Public
+ * License v3.0 only", or the "Server Side Public License, v 1".
+ */
+
+package org.elasticsearch.common.compress.fsst;
+
+import org.apache.lucene.store.DataOutput;
+
+import java.io.Closeable;
+import java.io.IOException;
+
+/** Batches lines into a fixed-size buffer, compresses each batch with an FSST symbol table, and writes the result out. */
+public class BulkCompressBufferer implements Closeable {
+    private static final int MAX_LINES = 512;
+    private static final int MAX_INPUT_DATA = 128 << 10;
+    private static final int MAX_OUTPUT_DATA = MAX_INPUT_DATA * 2;
+
+    final byte[] inData = new byte[MAX_INPUT_DATA + 8];
+    final int[] inOffsets = new int[MAX_LINES + 1]; // 1 additional space for offset where next item would have been
+    byte[] outBuf = new byte[MAX_OUTPUT_DATA + 8];
+    int[] outOffsets = new int[MAX_LINES + 1]; // 1 additional space for offset where next item would have been
+    private final DataOutput finalOutput;
+    private final FSST.SymbolTable st;
+    private final FSST.OffsetWriter offsetWriter;
+    private int numLines = 0;
+    private int inOff = 0;
+
+    /** Compressed bytes go to {@code finalOutput}; each line's compressed length goes to {@code offsetWriter}. */
+    public BulkCompressBufferer(DataOutput finalOutput, FSST.SymbolTable st, FSST.OffsetWriter offsetWriter) {
+        this.finalOutput = finalOutput;
+        this.st = st;
+        this.offsetWriter = offsetWriter;
+    }
+
+    /** Appends one line to the pending batch; the caller must already have checked that it fits. */
+    private void addToBuffer(byte[] bytes, int offset, int length) {
+        System.arraycopy(bytes, offset, inData, inOff, length);
+        inOffsets[numLines++] = inOff;
+        inOff += length;
+    }
+
+    /**
+     * Buffers one line, first flushing the pending batch if the line does not fit into it.
+     * A line longer than the whole input buffer is compressed immediately on its own.
+     * @param bytes  array holding the line
+     * @param offset index of the line's first byte in {@code bytes}
+     * @param length number of bytes in the line
+     * @throws IOException if writing compressed data fails
+     */
+    public void addLine(byte[] bytes, int offset, int length) throws IOException {
+        // Compare with the space left rather than computing inOff + length, which overflows int for huge lengths.
+        if (length > MAX_INPUT_DATA - inOff || numLines == MAX_LINES) {
+            compressAndWriteBuffer();
+        }
+        if (length > MAX_INPUT_DATA) {
+            // does not fit even into an empty buffer, so deal with it by itself
+            compressAndWriteSingle(bytes, offset, length);
+        } else {
+            addToBuffer(bytes, offset, length);
+        }
+    }
+
+    /**
+     * Compresses an oversized line in slices of at most {@code MAX_INPUT_DATA} bytes, writing each slice's output
+     * as it is produced, and records the summed compressed size as the length of a single line.
+     */
+    private void compressAndWriteSingle(byte[] bytes, int offset, int length) throws IOException {
+        assert numLines == 0 && inOff == 0;
+        int off = offset;
+        int lenToWrite = length;
+        int totalOutLen = 0;
+        while (lenToWrite > 0) {
+            int len = Math.min(lenToWrite, MAX_INPUT_DATA);
+            // no copy is made: the slice is compressed straight out of the caller's array as one line
+            numLines = 1;
+            inOffsets[0] = off;
+            inOffsets[1] = off + len;
+            long outLine = st.compressBulk(numLines, bytes, inOffsets, outBuf, outOffsets);
+            assert outLine == numLines;
+            int outLen = outOffsets[(int) outLine];
+            totalOutLen = Math.addExact(totalOutLen, outLen); // fail loudly rather than record a wrapped-around length
+            finalOutput.writeBytes(outBuf, 0, outLen);
+            off += len;
+            lenToWrite -= len;
+        }
+        offsetWriter.addLen(totalOutLen);
+        clear();
+    }
+
+    /** Compresses the pending batch in one bulk call, writes its bytes and per-line lengths, then empties the buffer. */
+    private void compressAndWriteBuffer() throws IOException {
+        assert numLines < MAX_LINES + 1;
+        assert inOff <= MAX_INPUT_DATA;
+        if (numLines == 0) {
+            return; // nothing pending, e.g. the very first line is oversized
+        }
+        // add a pseudo-offset to provide last line's length
+        inOffsets[numLines] = inOff;
+
+        long outLines = st.compressBulk(numLines, inData, inOffsets, outBuf, outOffsets);
+        assert outLines == numLines;
+        finalOutput.writeBytes(outBuf, 0, outOffsets[(int) outLines]);
+        for (int i = 0; i < numLines; ++i) {
+            offsetWriter.addLen(outOffsets[i + 1] - outOffsets[i]);
+        }
+        clear();
+    }
+
+    /** Empties the buffer; lines that were added but not yet flushed are dropped. */
+    void clear() {
+        numLines = inOff = 0;
+    }
+
+    /** Flushes any lines still buffered. */
+    @Override
+    public void close() throws IOException {
+        compressAndWriteBuffer();
+        clear();
+    }
+}
diff --git a/server/src/main/java/org/elasticsearch/common/compress/fsst/ReservoirSampler.java b/server/src/main/java/org/elasticsearch/common/compress/fsst/ReservoirSampler.java
@@ -0,0 +1,76 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the "Elastic License
+ * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
+ * Public License v 1"; you may not use this file except in compliance with, at
+ * your election, the "Elastic License 2.0", the "GNU Affero General Public
+ * License v3.0 only", or the "Server Side Public License, v 1".
+ */
+
+package org.elasticsearch.common.compress.fsst;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Random;
+
+import static org.elasticsearch.common.compress.fsst.FSST.FSST_SAMPLELINE;
+import static org.elasticsearch.common.compress.fsst.FSST.FSST_SAMPLEMAXSZ;
+import static org.elasticsearch.common.compress.fsst.FSST.FSST_SAMPLETARGET;
+
+/** Maintains a pseudo-random sample of chunks cut from the lines given to {@link #processLine}. */
+public class ReservoirSampler {
+    private static final int SAMPLE_TARGET = FSST_SAMPLETARGET;
+    private static final int SAMPLE_MAX = FSST_SAMPLEMAXSZ;
+    private static final int SAMPLE_LINE = FSST_SAMPLELINE;
+    private int numBytesInSample = 0;
+    private int numChunksSeen = 0;
+    private final Random random = new Random(1234);
+    private List<byte[]> sample = new ArrayList<>();
+
+    /** Returns the list that holds the sampled chunks; it is the internal list, not a copy. */
+    public List<byte[]> getSample() {
+        return sample;
+    }
+
+    /**
+     * Cuts the line into chunks of at most {@code SAMPLE_LINE} bytes and offers each chunk to the sample.
+     * The byte array is only valid during this call, thus every chunk that is kept is copied.
+     */
+    public void processLine(byte[] bytes, int offset, int length) {
+        if (length == 0) {
+            return;
+        }
+        int chunkCount = length / SAMPLE_LINE;
+        if (length % SAMPLE_LINE != 0) {
+            chunkCount++; // trailing partial chunk
+        }
+        for (int i = 0; i < chunkCount; ++i) {
+            numChunksSeen++;
+            int from = offset + i * SAMPLE_LINE;
+            int to = i == chunkCount - 1 ? offset + length : from + SAMPLE_LINE;
+            if (numBytesInSample < SAMPLE_TARGET + SAMPLE_LINE) {
+                // Not full yet (at startup, or after a swap shrank the sample): take the chunk unconditionally.
+                // The extra SAMPLE_LINE of slack keeps a single swap from dropping the sample below target.
+                sample.add(Arrays.copyOfRange(bytes, from, to));
+                numBytesInSample += to - from;
+                continue;
+            }
+
+            int slot = random.nextInt(numChunksSeen);
+            if (slot >= sample.size()) {
+                continue; // this chunk was not selected
+            }
+
+            // Replace the chunk in the chosen slot. If the newcomer is smaller the sample may dip under the
+            // fill threshold, which the branch above repairs on the next chunk.
+            byte[] replacement = Arrays.copyOfRange(bytes, from, to);
+            numBytesInSample += replacement.length - sample.set(slot, replacement).length;
+
+            // If the newcomer is larger the sample may exceed its maximum size, so discard from the end.
+            while (numBytesInSample > SAMPLE_MAX) {
+                numBytesInSample -= sample.removeLast().length;
+            }
+        }
+    }
+}
diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesConsumer.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesConsumer.java