org/elasticsearch/index/codec/postings/ES812PostingsFormat.java (modified)
@@ -27,7 +27,6 @@
 import org.apache.lucene.codecs.PostingsFormat;
 import org.apache.lucene.codecs.PostingsReaderBase;
 import org.apache.lucene.codecs.PostingsWriterBase;
-import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader;
 import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter;
 import org.apache.lucene.index.IndexOptions;
 import org.apache.lucene.index.SegmentReadState;
@@ -37,6 +36,8 @@
 import org.apache.lucene.util.packed.PackedInts;
 import org.elasticsearch.core.IOUtils;
 import org.elasticsearch.index.codec.ForUtil;
+import org.elasticsearch.index.codec.postings.terms.Lucene90BlockTreeTermsReader;
+import org.elasticsearch.index.codec.postings.terms.NonForkedHelper;
 
 import java.io.IOException;
@@ -414,7 +415,12 @@ public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException
         PostingsReaderBase postingsReader = new ES812PostingsReader(state);
         boolean success = false;
         try {
-            FieldsProducer ret = new Lucene90BlockTreeTermsReader(postingsReader, state);
+            FieldsProducer ret;
+            if (NonForkedHelper.USE_FORKED_TERMS_READER.isEnabled()) {
+                ret = new Lucene90BlockTreeTermsReader(postingsReader, state);
+            } else {
+                ret = new org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader(postingsReader, state);
+            }
             success = true;
             return ret;
         } finally {
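The hunk above gates the forked terms dictionary behind a feature flag: when NonForkedHelper.USE_FORKED_TERMS_READER is enabled, the copy of Lucene90BlockTreeTermsReader forked into org.elasticsearch.index.codec.postings.terms is used; otherwise the stock Lucene reader is constructed through its fully qualified name. NonForkedHelper itself is not shown in this diff; below is a minimal sketch of such a holder, assuming Elasticsearch's FeatureFlag utility, with an invented flag name:

package org.elasticsearch.index.codec.postings.terms;

import org.elasticsearch.common.util.FeatureFlag;

// Hypothetical sketch only; the real NonForkedHelper is not part of this diff.
public final class NonForkedHelper {

    // Feature flags are off by default outside snapshot builds and are toggled
    // via a system property; the flag name here is an assumption.
    public static final FeatureFlag USE_FORKED_TERMS_READER = new FeatureFlag("use_forked_terms_reader");

    private NonForkedHelper() {}
}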
org/elasticsearch/index/codec/postings/terms/CompressionAlgorithm.java (new file)
@@ -0,0 +1,65 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the "Elastic License
 * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
 * Public License v 1"; you may not use this file except in compliance with, at
 * your election, the "Elastic License 2.0", the "GNU Affero General Public
 * License v3.0 only", or the "Server Side Public License, v 1".
 */
package org.elasticsearch.index.codec.postings.terms;

import org.apache.lucene.store.DataInput;
import org.apache.lucene.util.compress.LowercaseAsciiCompression;

import java.io.IOException;

/** Compression algorithm used for suffixes of a block of terms. */
enum CompressionAlgorithm {
    NO_COMPRESSION(0x00) {

        @Override
        void read(DataInput in, byte[] out, int len) throws IOException {
            in.readBytes(out, 0, len);
        }
    },

    LOWERCASE_ASCII(0x01) {

        @Override
        void read(DataInput in, byte[] out, int len) throws IOException {
            LowercaseAsciiCompression.decompress(in, out, len);
        }
    },

    LZ4(0x02) {

        @Override
        void read(DataInput in, byte[] out, int len) throws IOException {
            org.apache.lucene.util.compress.LZ4.decompress(in, len, out, 0);
        }
    };

    private static final CompressionAlgorithm[] BY_CODE = new CompressionAlgorithm[3];

    static {
        for (CompressionAlgorithm alg : CompressionAlgorithm.values()) {
            BY_CODE[alg.code] = alg;
        }
    }

    /** Look up a {@link CompressionAlgorithm} by its {@link CompressionAlgorithm#code}. */
    static CompressionAlgorithm byCode(int code) {
        if (code < 0 || code >= BY_CODE.length) {
            throw new IllegalArgumentException("Illegal code for a compression algorithm: " + code);
        }
        return BY_CODE[code];
    }

    public final int code;

    CompressionAlgorithm(int code) {
        this.code = code;
    }

    abstract void read(DataInput in, byte[] out, int len) throws IOException;
}
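A hedged usage sketch for the enum above: a block reader is expected to store the algorithm's code in the low bits of a block header token and dispatch through byCode. The token layout here (code in the low two bits, decompressed length above) is an assumption for illustration, not necessarily the forked terms dictionary's actual on-disk encoding:

// Sketch: decompress one suffix block, given a DataInput positioned at its header.
// The token layout is assumed for illustration.
static byte[] readSuffixBlock(org.apache.lucene.store.DataInput in) throws java.io.IOException {
    long token = in.readVLong();
    int numSuffixBytes = (int) (token >>> 2); // assumed: upper bits hold the decompressed length
    CompressionAlgorithm alg = CompressionAlgorithm.byCode((int) (token & 0x03)); // low two bits: code
    byte[] suffixBytes = new byte[numSuffixBytes];
    alg.read(in, suffixBytes, numSuffixBytes); // dispatches to the matching decompressor
    return suffixBytes;
}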
org/elasticsearch/index/codec/postings/terms/FieldReader.java (new file)
@@ -0,0 +1,245 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the "Elastic License
 * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
 * Public License v 1"; you may not use this file except in compliance with, at
 * your election, the "Elastic License 2.0", the "GNU Affero General Public
 * License v3.0 only", or the "Server Side Public License, v 1".
 */
package org.elasticsearch.index.codec.postings.terms;

import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.OffHeapFSTStore;

import java.io.IOException;

import static org.elasticsearch.index.codec.postings.terms.Lucene90BlockTreeTermsReader.VERSION_MSB_VLONG_OUTPUT;

/**
 * BlockTree's implementation of {@link Terms}.
 */
public final class FieldReader extends Terms {

    // private final boolean DEBUG = BlockTreeTermsWriter.DEBUG;

    final long numTerms;
    final FieldInfo fieldInfo;
    final long sumTotalTermFreq;
    final long sumDocFreq;
    final int docCount;
    final long rootBlockFP;
    final BytesRef rootCode;
    final BytesRef minTerm;
    final BytesRef maxTerm;
    final Lucene90BlockTreeTermsReader parent;

    final FST<BytesRef> index;

    // private boolean DEBUG;

    FieldReader(
        Lucene90BlockTreeTermsReader parent,
        FieldInfo fieldInfo,
        long numTerms,
        BytesRef rootCode,
        long sumTotalTermFreq,
        long sumDocFreq,
        int docCount,
        long indexStartFP,
        IndexInput metaIn,
        IndexInput indexIn,
        BytesRef minTerm,
        BytesRef maxTerm
    ) throws IOException {
        assert numTerms > 0;
        this.fieldInfo = fieldInfo;
        // DEBUG = BlockTreeTermsReader.DEBUG && fieldInfo.name.equals("id");
        this.parent = parent;
        this.numTerms = numTerms;
        this.sumTotalTermFreq = sumTotalTermFreq;
        this.sumDocFreq = sumDocFreq;
        this.docCount = docCount;
        this.minTerm = minTerm;
        this.maxTerm = maxTerm;
        // if (DEBUG) {
        // System.out.println("BTTR: seg=" + segment + " field=" + fieldInfo.name + " rootBlockCode="
        // + rootCode + " divisor=" + indexDivisor);
        // }
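        // The low OUTPUT_FLAGS_NUM_BITS bits of the root output encode the block's
        // hasTerms/isFloor flags; shifting them off below leaves the root block's file pointer.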
        rootBlockFP = readVLongOutput(new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length))
            >>> Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS;
        // Initialize FST always off-heap.
        var metadata = FST.readMetadata(metaIn, ByteSequenceOutputs.getSingleton());
        index = FST.fromFSTReader(metadata, new OffHeapFSTStore(indexIn, indexStartFP, metadata));
        /*
        if (false) {
            final String dotFileName = segment + "_" + fieldInfo.name + ".dot";
            Writer w = new OutputStreamWriter(new FileOutputStream(dotFileName));
            Util.toDot(index, w, false, false);
            System.out.println("FST INDEX: SAVED to " + dotFileName);
            w.close();
        }
        */
        BytesRef emptyOutput = metadata.getEmptyOutput();
        if (rootCode.equals(emptyOutput) == false) {
            // TODO: this branch is never taken
            assert false;
            this.rootCode = rootCode;
        } else {
            this.rootCode = emptyOutput;
        }
    }

    long readVLongOutput(DataInput in) throws IOException {
        if (parent.version >= VERSION_MSB_VLONG_OUTPUT) {
            return readMSBVLong(in);
        } else {
            return in.readVLong();
        }
    }

    /**
     * Decodes a variable length byte[] in MSB order back to long, as written by {@link
     * Lucene90BlockTreeTermsWriter#writeMSBVLong}.
     *
     * <p>Package private for testing.
     */
    static long readMSBVLong(DataInput in) throws IOException {
        long l = 0L;
        while (true) {
            byte b = in.readByte();
            l = (l << 7) | (b & 0x7FL);
            if ((b & 0x80) == 0) {
                break;
            }
        }
        return l;
    }
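    // Worked example (illustrative): 180 (0b1011_0100) is written MSB-first as the two
    // bytes 0x81 0x34. Each byte carries 7 payload bits, and the high bit of every byte
    // except the last is a continuation flag, so decoding yields (0x01 << 7) | 0x34 = 180.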

    @Override
    public BytesRef getMin() throws IOException {
        if (minTerm == null) {
            // Older index that didn't store min/maxTerm
            return super.getMin();
        } else {
            return minTerm;
        }
    }

    @Override
    public BytesRef getMax() throws IOException {
        if (maxTerm == null) {
            // Older index that didn't store min/maxTerm
            return super.getMax();
        } else {
            return maxTerm;
        }
    }

    /** For debugging -- used by CheckIndex too */
    @Override
    public Stats getStats() throws IOException {
        return new SegmentTermsEnum(this).computeBlockStats();
    }

    @Override
    public boolean hasFreqs() {
        return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
    }

    @Override
    public boolean hasOffsets() {
        return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
    }

    @Override
    public boolean hasPositions() {
        return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
    }

    @Override
    public boolean hasPayloads() {
        return fieldInfo.hasPayloads();
    }

    @Override
    public TermsEnum iterator() throws IOException {
        return new SegmentTermsEnum(this);
    }

    @Override
    public long size() {
        return numTerms;
    }

    @Override
    public long getSumTotalTermFreq() {
        return sumTotalTermFreq;
    }

    @Override
    public long getSumDocFreq() {
        return sumDocFreq;
    }

    @Override
    public int getDocCount() {
        return docCount;
    }

    @Override
    public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
        // if (DEBUG) System.out.println(" FieldReader.intersect startTerm=" +
        // ToStringUtils.bytesRefToString(startTerm));
        // System.out.println("intersect: " + compiled.type + " a=" + compiled.automaton);
        // TODO: we could push "it's a range" or "it's a prefix" down into IntersectTermsEnum?
        // can we optimize knowing that...?
        if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
            throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
        }
        return new IntersectTermsEnum(
            this,
            compiled.getTransitionAccessor(),
            compiled.getByteRunnable(),
            compiled.commonSuffixRef,
            startTerm
        );
    }

    @Override
    public String toString() {
        return "BlockTreeTerms(seg="
            + parent.segment
            + " terms="
            + numTerms
            + ",postings="
            + sumDocFreq
            + ",positions="
            + sumTotalTermFreq
            + ",docs="
            + docCount
            + ")";
    }

    // CHANGES:

    public BytesRef getMinTerm() {
        return minTerm;
    }

    public BytesRef getMaxTerm() {
        return maxTerm;
    }

    // END CHANGES:
}
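Since FieldReader is reached through the standard Terms API, consumers never reference the forked package directly. A minimal sketch, assuming an already-open LeafReader over a segment written with this codec and an indexed field named "body" (both illustrative):

import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

import java.io.IOException;

// "body" is an example field name; leafReader is an assumed, already-open LeafReader.
static void walkTerms(LeafReader leafReader) throws IOException {
    Terms terms = leafReader.terms("body"); // a FieldReader when this codec wrote the segment
    if (terms != null) {
        TermsEnum termsEnum = terms.iterator(); // SegmentTermsEnum under the hood
        BytesRef term;
        while ((term = termsEnum.next()) != null) {
            int docFreq = termsEnum.docFreq(); // per-term statistics served by the block tree
        }
    }
}

For automaton-driven access, the intersect override above only accepts a NORMAL CompiledAutomaton; other automaton types should go through CompiledAutomaton.getTermsEnum, as its exception message indicates.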