From 2d8f270ee01fbe74890d7a6c4ab98baaa6ec88f2 Mon Sep 17 00:00:00 2001 From: dungba88 Date: Tue, 26 Dec 2023 18:11:44 +0900 Subject: [PATCH 01/10] lazily write the FST padding byte --- .../java/org/apache/lucene/util/fst/FSTCompiler.java | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java index 961511616abb..2a04a575ef7c 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java @@ -164,14 +164,12 @@ private FSTCompiler( boolean allowFixedLengthArcs, DataOutput dataOutput, float directAddressingMaxOversizingFactor, - int version) - throws IOException { + int version) { this.allowFixedLengthArcs = allowFixedLengthArcs; this.directAddressingMaxOversizingFactor = directAddressingMaxOversizingFactor; this.version = version; // pad: ensure no node gets address 0 which is reserved to mean - // the stop state w/ no arcs - dataOutput.writeByte((byte) 0); + // the stop state w/ no arcs. the actual byte will be written lazily numBytesWritten++; this.dataOutput = dataOutput; fst = @@ -344,7 +342,7 @@ public Builder setVersion(int version) { } /** Creates a new {@link FSTCompiler}. */ - public FSTCompiler build() throws IOException { + public FSTCompiler build() { // create a default DataOutput if not specified if (dataOutput == null) { dataOutput = getOnHeapReaderWriter(15); @@ -552,6 +550,10 @@ long addNode(FSTCompiler.UnCompiledNode nodeIn) throws IOException { } reverseScratchBytes(); + if (numBytesWritten == 1) { + // first time, write the padding byte + dataOutput.writeByte((byte) 0); + } scratchBytes.writeTo(dataOutput); numBytesWritten += scratchBytes.getPosition(); From b4a349404810ff5ac15aa6131f0d2cb220f73598 Mon Sep 17 00:00:00 2001 From: dungba88 Date: Tue, 26 Dec 2023 18:25:21 +0900 Subject: [PATCH 02/10] Also write the pad byte when there is emptyOutput --- .../java/org/apache/lucene/util/fst/FSTCompiler.java | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java index 2a04a575ef7c..16581d4096b6 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java @@ -551,8 +551,7 @@ long addNode(FSTCompiler.UnCompiledNode nodeIn) throws IOException { reverseScratchBytes(); if (numBytesWritten == 1) { - // first time, write the padding byte - dataOutput.writeByte((byte) 0); + writePaddingByte(); } scratchBytes.writeTo(dataOutput); numBytesWritten += scratchBytes.getPosition(); @@ -561,6 +560,10 @@ long addNode(FSTCompiler.UnCompiledNode nodeIn) throws IOException { return numBytesWritten - 1; } + private void writePaddingByte() throws IOException { + dataOutput.writeByte((byte) 0); + } + private void writeLabel(DataOutput out, int v) throws IOException { assert v >= 0 : "v=" + v; if (fst.metadata.inputType == INPUT_TYPE.BYTE1) { @@ -970,6 +973,10 @@ public FST compile() throws IOException { if (root.numArcs == 0) { if (fst.metadata.emptyOutput == null) { return null; + } else { + // we haven't written the pad byte so far, but the FST is still valid + assert numBytesWritten == 1; + writePaddingByte(); } } From 601a6e17ad2328f120d0bb5ec7d45ddf7ca4f1c0 Mon Sep 17 00:00:00 2001 From: dungba88 Date: Tue, 26 Dec 2023 
18:27:32 +0900 Subject: [PATCH 03/10] add comment --- .../core/src/java/org/apache/lucene/util/fst/FSTCompiler.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java index 16581d4096b6..7f67fb2a38bc 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java @@ -550,6 +550,7 @@ long addNode(FSTCompiler.UnCompiledNode nodeIn) throws IOException { } reverseScratchBytes(); + // write the padding byte if needed if (numBytesWritten == 1) { writePaddingByte(); } @@ -561,6 +562,7 @@ long addNode(FSTCompiler.UnCompiledNode nodeIn) throws IOException { } private void writePaddingByte() throws IOException { + assert numBytesWritten == 1; dataOutput.writeByte((byte) 0); } @@ -975,7 +977,6 @@ public FST compile() throws IOException { return null; } else { // we haven't written the pad byte so far, but the FST is still valid - assert numBytesWritten == 1; writePaddingByte(); } } From f4995ab98b9a4f037740cb6aafa4e3b23d924549 Mon Sep 17 00:00:00 2001 From: dungba88 Date: Fri, 29 Dec 2023 12:37:30 +0900 Subject: [PATCH 04/10] Make Lucene90BlockTreeTermsWriter to write FST off-heap --- .../Lucene90BlockTreeTermsWriter.java | 50 ++++++++++++------- 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java index 26aea1dda595..53a10d9faf11 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java @@ -512,7 +512,8 @@ public String toString() { public void compileIndex( List blocks, ByteBuffersDataOutput scratchBytes, - IntsRefBuilder scratchIntsRef) + IntsRefBuilder scratchIntsRef, + DataOutput fstDataOutput) throws IOException { assert (isFloor && blocks.size() > 1) || (isFloor == false && blocks.size() == 1) @@ -542,17 +543,6 @@ public void compileIndex( } } - long estimateSize = prefix.length; - for (PendingBlock block : blocks) { - if (block.subIndices != null) { - for (FST subIndex : block.subIndices) { - estimateSize += subIndex.numBytes(); - } - } - } - int estimateBitsRequired = PackedInts.bitsRequired(estimateSize); - int pageBits = Math.min(15, Math.max(6, estimateBitsRequired)); - final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); final int fstVersion; if (version >= Lucene90BlockTreeTermsReader.VERSION_CURRENT) { @@ -565,7 +555,7 @@ public void compileIndex( // Disable suffixes sharing for block tree index because suffixes are mostly dropped // from the FST index and left in the term blocks. 
.suffixRAMLimitMB(0d) - .dataOutput(getOnHeapReaderWriter(pageBits)) + .dataOutput(fstDataOutput) .setVersion(fstVersion) .build(); // if (DEBUG) { @@ -620,8 +610,6 @@ private void append( private final ByteBuffersDataOutput scratchBytes = ByteBuffersDataOutput.newResettableInstance(); private final IntsRefBuilder scratchIntsRef = new IntsRefBuilder(); - static final BytesRef EMPTY_BYTES_REF = new BytesRef(); - private static class StatsWriter { private final DataOutput out; @@ -795,7 +783,12 @@ void writeBlocks(int prefixLength, int count) throws IOException { assert firstBlock.isFloor || newBlocks.size() == 1; - firstBlock.compileIndex(newBlocks, scratchBytes, scratchIntsRef); + boolean isRootBlock = prefixLength == 0 && count == pending.size(); + // Create a proper DataOutput for the FST. For root block, we will write to the IndexOut + // directly. For sub blocks, we will use the on-heap ReadWriteDataOutput + DataOutput fstDataOutput = getFSTDataOutput(newBlocks, firstBlock.prefix.length, isRootBlock); + + firstBlock.compileIndex(newBlocks, scratchBytes, scratchIntsRef, fstDataOutput); // Remove slice from the top of the pending stack, that we just wrote: pending.subList(pending.size() - count, pending.size()).clear(); @@ -806,6 +799,25 @@ void writeBlocks(int prefixLength, int count) throws IOException { newBlocks.clear(); } + private DataOutput getFSTDataOutput( + List blocks, int prefixLength, boolean isRootBlock) { + if (isRootBlock) { + return indexOut; + } + long estimateSize = prefixLength; + for (PendingBlock block : blocks) { + if (block.subIndices != null) { + for (FST subIndex : block.subIndices) { + estimateSize += subIndex.numBytes(); + } + } + } + int estimateBitsRequired = PackedInts.bitsRequired(estimateSize); + int pageBits = Math.min(15, Math.max(6, estimateBitsRequired)); + + return getOnHeapReaderWriter(pageBits); + } + private boolean allEqual(byte[] b, int startOffset, int endOffset, byte value) { Objects.checkFromToIndex(startOffset, endOffset, b.length); for (int i = startOffset; i < endOffset; ++i) { @@ -1200,9 +1212,11 @@ public void finish() throws IOException { metaOut.writeVInt(docsSeen.cardinality()); writeBytesRef(metaOut, new BytesRef(firstPendingTerm.termBytes)); writeBytesRef(metaOut, new BytesRef(lastPendingTerm.termBytes)); - metaOut.writeVLong(indexOut.getFilePointer()); + // Write the address to the beginning of the FST. Note that the FST is already written to + // indexOut by this point + metaOut.writeVLong(indexOut.getFilePointer() - root.index.numBytes()); // Write FST to index - root.index.save(metaOut, indexOut); + root.index.saveMetadata(metaOut); // System.out.println(" write FST " + indexStartFP + " field=" + fieldInfo.name); /* From 41fa692cc7c27421a5afa130e21ce508c3ba6906 Mon Sep 17 00:00:00 2001 From: Dzung Bui Date: Thu, 11 Jan 2024 23:00:43 +0900 Subject: [PATCH 05/10] Add change log --- lucene/CHANGES.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 786f04a02dc0..447b0406af8b 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -105,6 +105,8 @@ Improvements * GITHUB#12873: Expressions module now uses JEP 371 "Hidden Classes" with JEP 309 "Dynamic Class-File Constants" to implement Javascript expressions. (Uwe Schindler) +* GITHUB#12985: Make Lucene90BlockTreePostingsFormat to build FST off-heap for the root block. 
(Anh Dung Bui) + Optimizations --------------------- From 3a1095e2d60bca58c165630bfd1bc316051deccf Mon Sep 17 00:00:00 2001 From: dungba88 Date: Tue, 30 Jan 2024 11:43:03 +0900 Subject: [PATCH 06/10] Tidy code & Add comments --- .../blocktree/Lucene90BlockTreeTermsWriter.java | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java index 53a10d9faf11..8c1db35d4e4c 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java @@ -669,8 +669,13 @@ class TermsWriter { private PendingTerm firstPendingTerm; private PendingTerm lastPendingTerm; - /** Writes the top count entries in pending, using prevTerm to compute the prefix. */ - void writeBlocks(int prefixLength, int count) throws IOException { + /** + * Writes the top count entries in pending, using prevTerm to compute the prefix. + * + *

For root block, we will write the FST directly to the IndexOutput, for others they will + * use on-heap FST + */ + void writeBlocks(int prefixLength, int count, boolean isRootBlock) throws IOException { assert count > 0; @@ -783,7 +788,6 @@ void writeBlocks(int prefixLength, int count) throws IOException { assert firstBlock.isFloor || newBlocks.size() == 1; - boolean isRootBlock = prefixLength == 0 && count == pending.size(); // Create a proper DataOutput for the FST. For root block, we will write to the IndexOut // directly. For sub blocks, we will use the on-heap ReadWriteDataOutput DataOutput fstDataOutput = getFSTDataOutput(newBlocks, firstBlock.prefix.length, isRootBlock); @@ -1157,7 +1161,7 @@ private void pushTerm(BytesRef text) throws IOException { if (prefixTopSize >= minItemsInBlock) { // if (DEBUG) System.out.println("pushTerm i=" + i + " prefixTopSize=" + prefixTopSize + " // minItemsInBlock=" + minItemsInBlock); - writeBlocks(i + 1, prefixTopSize); + writeBlocks(i + 1, prefixTopSize, false); prefixStarts[i] -= prefixTopSize - 1; } } @@ -1187,7 +1191,7 @@ public void finish() throws IOException { // we can save writing a "degenerate" root block, but we have to // fix all the places that assume the root block's prefix is the empty string: pushTerm(new BytesRef()); - writeBlocks(0, pending.size()); + writeBlocks(0, pending.size(), true); // We better have one final "root" block: assert pending.size() == 1 && !pending.get(0).isTerm From 1de132553bd984ed361a7e829d0f6ee3f756bd10 Mon Sep 17 00:00:00 2001 From: dungba88 Date: Wed, 28 Feb 2024 09:18:47 +0900 Subject: [PATCH 07/10] use temp IndexOutput for FST writing --- .../lucene90/Lucene90RWPostingsFormat.java | 11 +- .../lucene90/TestLucene90PostingsFormat.java | 23 +++- .../Lucene90BlockTreeTermsWriter.java | 107 ++++++++++++++++-- .../java/org/apache/lucene/util/fst/FST.java | 102 +++++++++-------- 4 files changed, 184 insertions(+), 59 deletions(-) diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/Lucene90RWPostingsFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/Lucene90RWPostingsFormat.java index 4360b90f2370..16016ae746f0 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/Lucene90RWPostingsFormat.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/Lucene90RWPostingsFormat.java @@ -48,19 +48,23 @@ public final class Lucene90RWPostingsFormat extends PostingsFormat { private final int minTermBlockSize; private final int maxTermBlockSize; + private long blockHeapSizeLimitBytes; /** Creates {@code Lucene90RWPostingsFormat} with default settings. 
*/ public Lucene90RWPostingsFormat() { this( Lucene90BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, - Lucene90BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE); + Lucene90BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE, + Lucene90BlockTreeTermsWriter.DEFAULT_BLOCK_HEAP_LIMIT_BYTES); } - public Lucene90RWPostingsFormat(int minTermBlockSize, int maxTermBlockSize) { + public Lucene90RWPostingsFormat( + int minTermBlockSize, int maxTermBlockSize, long blockHeapSizeLimitBytes) { super("Lucene90"); Lucene90BlockTreeTermsWriter.validateSettings(minTermBlockSize, maxTermBlockSize); this.minTermBlockSize = minTermBlockSize; this.maxTermBlockSize = maxTermBlockSize; + this.blockHeapSizeLimitBytes = blockHeapSizeLimitBytes; } @Override @@ -79,7 +83,8 @@ public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException postingsWriter, minTermBlockSize, maxTermBlockSize, - Lucene90BlockTreeTermsReader.VERSION_START); + Lucene90BlockTreeTermsReader.VERSION_START, + blockHeapSizeLimitBytes); success = true; return ret; } finally { diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/TestLucene90PostingsFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/TestLucene90PostingsFormat.java index 7965bc9c7780..00500c1b8770 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/TestLucene90PostingsFormat.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/TestLucene90PostingsFormat.java @@ -26,6 +26,7 @@ import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.CompetitiveImpactAccumulator; import org.apache.lucene.codecs.lucene90.blocktree.FieldReader; +import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter; import org.apache.lucene.codecs.lucene90.blocktree.Stats; import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat; import org.apache.lucene.codecs.lucene99.Lucene99SkipWriter; @@ -45,7 +46,27 @@ import org.apache.lucene.tests.util.TestUtil; public class TestLucene90PostingsFormat extends BasePostingsFormatTestCase { - private final Codec codec = TestUtil.alwaysPostingsFormat(new Lucene90RWPostingsFormat()); + private final Codec codec = + TestUtil.alwaysPostingsFormat( + new Lucene90RWPostingsFormat( + Lucene90BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, + Lucene90BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE, + getBlockHeapSizeLimitBytes())); + + private static long getBlockHeapSizeLimitBytes() { + // randomize the block heap max size between 3 states: + // - 0, effectively disable on-heap FST and always use off-heap + // - DEFAULT_BLOCK_HEAP_LIMIT_BYTES + // - a random number between 0 and DEFAULT_BLOCK_HEAP_LIMIT_BYTES + int r = random().nextInt(2); + if (r == 0) { + return 0; + } + if (r == 1) { + return Lucene90BlockTreeTermsWriter.DEFAULT_BLOCK_HEAP_LIMIT_BYTES; + } + return random().nextLong(Lucene90BlockTreeTermsWriter.DEFAULT_BLOCK_HEAP_LIMIT_BYTES); + } @Override protected Codec getCodec() { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java index 55dba26b270e..08e22d375f6f 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java @@ -39,6 +39,7 @@ import 
org.apache.lucene.store.ByteArrayDataOutput; import org.apache.lucene.store.ByteBuffersDataOutput; import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; @@ -54,6 +55,7 @@ import org.apache.lucene.util.fst.BytesRefFSTEnum; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FSTCompiler; +import org.apache.lucene.util.fst.OffHeapFSTStore; import org.apache.lucene.util.fst.Util; import org.apache.lucene.util.packed.PackedInts; @@ -228,6 +230,9 @@ public final class Lucene90BlockTreeTermsWriter extends FieldsConsumer { */ public static final int DEFAULT_MAX_BLOCK_SIZE = 48; + /** Suggested default value for the {@code blockHeapSizeLimitBytes} parameter. */ + public static final long DEFAULT_BLOCK_HEAP_LIMIT_BYTES = 512 * 1024; // 512KB + // public static boolean DEBUG = false; // public static boolean DEBUG2 = false; @@ -246,6 +251,18 @@ public final class Lucene90BlockTreeTermsWriter extends FieldsConsumer { private final List fields = new ArrayList<>(); + // keep track of the temp IndexInput to close them + private final List tempIndexInputs = new ArrayList<>(); + + // keep track of the temp IndexInput name to delete them + private final List tempInputNames = new ArrayList<>(); + + // if the {@link PendingBlock} size is more than this, we will use off-heap FST + // setting to 0 means we will always use off-heap FST + private final long blockHeapSizeLimitBytes; + + private final SegmentWriteState state; + /** * Create a new writer. The number of items (terms or sub-blocks) per block will aim to be between * minItemsPerBlock and maxItemsPerBlock, though in some cases the blocks may be smaller than the @@ -273,10 +290,32 @@ public Lucene90BlockTreeTermsWriter( int maxItemsInBlock, int version) throws IOException { + this( + state, + postingsWriter, + minItemsInBlock, + maxItemsInBlock, + version, + DEFAULT_BLOCK_HEAP_LIMIT_BYTES); + } + + /** + * Expert constructor that allows configuring the version, used for bw tests. It also allows + * configuring of block heap max size + */ + public Lucene90BlockTreeTermsWriter( + SegmentWriteState state, + PostingsWriterBase postingsWriter, + int minItemsInBlock, + int maxItemsInBlock, + int version, + long blockHeapSizeLimitBytes) + throws IOException { validateSettings(minItemsInBlock, maxItemsInBlock); this.minItemsInBlock = minItemsInBlock; this.maxItemsInBlock = maxItemsInBlock; + this.blockHeapSizeLimitBytes = blockHeapSizeLimitBytes; if (version < Lucene90BlockTreeTermsReader.VERSION_START || version > Lucene90BlockTreeTermsReader.VERSION_CURRENT) { throw new IllegalArgumentException( @@ -288,6 +327,7 @@ public Lucene90BlockTreeTermsWriter( + version); } this.version = version; + this.state = state; this.maxDoc = state.segmentInfo.maxDoc(); this.fieldInfos = state.fieldInfos; @@ -438,6 +478,26 @@ public String toString() { } } + /** + * Create temp IndexOutput to write FST off-heap. The file name will be tracked to clean up later. + */ + private IndexOutput createTempOutput(String prefix) throws IOException { + IndexOutput tempOut = state.directory.createTempOutput(prefix, "temp", state.context); + tempInputNames.add(tempOut.getName()); + return tempOut; + } + + /** + * Close the temp IndexOutput and open IndexInput to read from. The IndexInput will be tracked to + * clean up later. 
+ */ + private IndexInput openTempInput(IndexOutput fstDataOutput) throws IOException { + IOUtils.close(fstDataOutput); // we need to close the DataOutput before reading from DataInput + IndexInput tempIn = state.directory.openInput(fstDataOutput.getName(), state.context); + tempIndexInputs.add(tempIn); + return tempIn; + } + /** * Encodes long value to variable length byte[], in MSB order. Use {@link * FieldReader#readMSBVLong} to decode. @@ -460,7 +520,10 @@ static void writeMSBVLong(long l, DataOutput scratchBytes) throws IOException { private final class PendingBlock extends PendingEntry { public final BytesRef prefix; public final long fp; - public FST index; + // the index's FST + public FST indexFST; + // the index's FST metadata + public FST.FSTMetadata indexMetadata; public List> subIndices; public final boolean hasTerms; public final boolean isFloor; @@ -555,7 +618,19 @@ public void compileIndex( } } - index = FST.fromFSTReader(fstCompiler.compile(), fstCompiler.getFSTReader()); + indexMetadata = fstCompiler.compile(); + + if (fstDataOutput == indexOut) { + // this is the root block, we don't need to read from it and Lucene doesn't allow to read + // from still-writing DataOutput either, hence we only store the FST metadata to save it + // later + } else if (fstDataOutput instanceof IndexOutput) { + // if we write to IndexOutput then we should open and read from IndexInput + IndexInput indexInput = openTempInput((IndexOutput) fstDataOutput); + indexFST = new FST<>(indexMetadata, indexInput, new OffHeapFSTStore()); + } else { + indexFST = FST.fromFSTReader(indexMetadata, fstCompiler.getFSTReader()); + } assert subIndices == null; @@ -767,7 +842,8 @@ void writeBlocks(int prefixLength, int count, boolean isRootBlock) throws IOExce assert firstBlock.isFloor || newBlocks.size() == 1; // Create a proper DataOutput for the FST. For root block, we will write to the IndexOut - // directly. For sub blocks, we will use the on-heap ReadWriteDataOutput + // directly. 
For sub blocks, if the size is smaller than blockHeapSizeLimitBytes then we + // will use the on-heap ReadWriteDataOutput, otherwise create a temp output DataOutput fstDataOutput = getFSTDataOutput(newBlocks, firstBlock.prefix.length, isRootBlock); firstBlock.compileIndex(newBlocks, scratchBytes, scratchIntsRef, fstDataOutput); @@ -782,7 +858,7 @@ void writeBlocks(int prefixLength, int count, boolean isRootBlock) throws IOExce } private DataOutput getFSTDataOutput( - List blocks, int prefixLength, boolean isRootBlock) { + List blocks, int prefixLength, boolean isRootBlock) throws IOException { if (isRootBlock) { return indexOut; } @@ -794,6 +870,12 @@ private DataOutput getFSTDataOutput( } } } + + // the size is larger than heap size limit, use off-heap writing instead + if (estimateSize > blockHeapSizeLimitBytes) { + return createTempOutput(fieldInfo.getName()); + } + int estimateBitsRequired = PackedInts.bitsRequired(estimateSize); int pageBits = Math.min(15, Math.max(6, estimateBitsRequired)); @@ -981,7 +1063,7 @@ private PendingBlock writeBlock( assert block.fp < startFP; suffixLengthsWriter.writeVLong(startFP - block.fp); - subIndices.add(block.index); + subIndices.add(block.indexFST); } } statsWriter.finish(); @@ -1183,7 +1265,7 @@ public void finish() throws IOException { : "pending.size()=" + pending.size() + " pending=" + pending; final PendingBlock root = (PendingBlock) pending.get(0); assert root.prefix.length == 0; - final BytesRef rootCode = root.index.getEmptyOutput(); + final BytesRef rootCode = root.indexMetadata.getEmptyOutput(); assert rootCode != null; ByteBuffersDataOutput metaOut = new ByteBuffersDataOutput(); @@ -1203,9 +1285,9 @@ public void finish() throws IOException { writeBytesRef(metaOut, new BytesRef(lastPendingTerm.termBytes)); // Write the address to the beginning of the FST. 
Note that the FST is already written to // indexOut by this point - metaOut.writeVLong(indexOut.getFilePointer() - root.index.numBytes()); + metaOut.writeVLong(indexOut.getFilePointer() - root.indexMetadata.getNumBytes()); // Write FST to index - root.index.saveMetadata(metaOut); + root.indexMetadata.save(metaOut); // System.out.println(" write FST " + indexStartFP + " field=" + fieldInfo.name); /* @@ -1260,8 +1342,17 @@ public void close() throws IOException { } finally { if (success) { IOUtils.close(metaOut, termsOut, indexOut, postingsWriter); + IOUtils.close(tempIndexInputs); } else { IOUtils.closeWhileHandlingException(metaOut, termsOut, indexOut, postingsWriter); + IOUtils.closeWhileHandlingException(tempIndexInputs); + } + for (String inputName : tempInputNames) { + try { + state.directory.deleteFile(inputName); + } catch (IOException ex) { + + } } } } diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java index 6bb5718d5c75..f2b54e5d7aaa 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java @@ -537,56 +537,10 @@ public FSTMetadata getMetadata() { * @param out the DataOutput to write the FST bytes to */ public void save(DataOutput metaOut, DataOutput out) throws IOException { - saveMetadata(metaOut); + metadata.save(metaOut); fstReader.writeTo(out); } - /** - * Save the metadata to a DataOutput - * - * @param metaOut the DataOutput to write the metadata to - */ - public void saveMetadata(DataOutput metaOut) throws IOException { - CodecUtil.writeHeader(metaOut, FILE_FORMAT_NAME, VERSION_CURRENT); - // TODO: really we should encode this as an arc, arriving - // to the root node, instead of special casing here: - if (metadata.emptyOutput != null) { - // Accepts empty string - metaOut.writeByte((byte) 1); - - // Serialize empty-string output: - ByteBuffersDataOutput ros = new ByteBuffersDataOutput(); - outputs.writeFinalOutput(metadata.emptyOutput, ros); - byte[] emptyOutputBytes = ros.toArrayCopy(); - int emptyLen = emptyOutputBytes.length; - - // reverse - final int stopAt = emptyLen / 2; - int upto = 0; - while (upto < stopAt) { - final byte b = emptyOutputBytes[upto]; - emptyOutputBytes[upto] = emptyOutputBytes[emptyLen - upto - 1]; - emptyOutputBytes[emptyLen - upto - 1] = b; - upto++; - } - metaOut.writeVInt(emptyLen); - metaOut.writeBytes(emptyOutputBytes, 0, emptyLen); - } else { - metaOut.writeByte((byte) 0); - } - final byte t; - if (metadata.inputType == INPUT_TYPE.BYTE1) { - t = 0; - } else if (metadata.inputType == INPUT_TYPE.BYTE2) { - t = 1; - } else { - t = 2; - } - metaOut.writeByte(t); - metaOut.writeVLong(metadata.startNode); - metaOut.writeVLong(numBytes()); - } - /** Writes an automaton to a file. 
*/ public void save(final Path path) throws IOException { try (OutputStream os = new BufferedOutputStream(Files.newOutputStream(path))) { @@ -1258,5 +1212,59 @@ public FSTMetadata( public int getVersion() { return version; } + + public T getEmptyOutput() { + return emptyOutput; + } + + public long getNumBytes() { + return numBytes; + } + + /** + * Save the metadata to a DataOutput + * + * @param metaOut the DataOutput to write the metadata to + */ + public void save(DataOutput metaOut) throws IOException { + CodecUtil.writeHeader(metaOut, FILE_FORMAT_NAME, VERSION_CURRENT); + // TODO: really we should encode this as an arc, arriving + // to the root node, instead of special casing here: + if (emptyOutput != null) { + // Accepts empty string + metaOut.writeByte((byte) 1); + + // Serialize empty-string output: + ByteBuffersDataOutput ros = new ByteBuffersDataOutput(); + outputs.writeFinalOutput(emptyOutput, ros); + byte[] emptyOutputBytes = ros.toArrayCopy(); + int emptyLen = emptyOutputBytes.length; + + // reverse + final int stopAt = emptyLen / 2; + int upto = 0; + while (upto < stopAt) { + final byte b = emptyOutputBytes[upto]; + emptyOutputBytes[upto] = emptyOutputBytes[emptyLen - upto - 1]; + emptyOutputBytes[emptyLen - upto - 1] = b; + upto++; + } + metaOut.writeVInt(emptyLen); + metaOut.writeBytes(emptyOutputBytes, 0, emptyLen); + } else { + metaOut.writeByte((byte) 0); + } + final byte t; + if (inputType == INPUT_TYPE.BYTE1) { + t = 0; + } else if (inputType == INPUT_TYPE.BYTE2) { + t = 1; + } else { + t = 2; + } + metaOut.writeByte(t); + metaOut.writeVLong(startNode); + metaOut.writeVLong(numBytes); + } } } From 62b242d2c4bc905b22c1bf5eb96b871cf6764bde Mon Sep 17 00:00:00 2001 From: dungba88 Date: Wed, 28 Feb 2024 09:35:17 +0900 Subject: [PATCH 08/10] Use IOUtils to delete files --- .../lucene90/blocktree/Lucene90BlockTreeTermsWriter.java | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java index 08e22d375f6f..5fd694704db6 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java @@ -1347,13 +1347,7 @@ public void close() throws IOException { IOUtils.closeWhileHandlingException(metaOut, termsOut, indexOut, postingsWriter); IOUtils.closeWhileHandlingException(tempIndexInputs); } - for (String inputName : tempInputNames) { - try { - state.directory.deleteFile(inputName); - } catch (IOException ex) { - - } - } + IOUtils.deleteFilesIgnoringExceptions(state.directory, tempInputNames); } } From 7ce989e3816d78eee9334fd94ff54812193f29d3 Mon Sep 17 00:00:00 2001 From: dungba88 Date: Wed, 3 Apr 2024 21:33:07 +0900 Subject: [PATCH 09/10] Update CHANGES.txt --- lucene/CHANGES.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index faf088db3244..8ae5f89528e7 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -159,7 +159,7 @@ Bug Fixes * GITHUB#12878: Fix the declared Exceptions of Expression#evaluate() to match those of DoubleValues#doubleValue(). 
(Uwe Schindler) - + Changes in Backwards Compatibility Policy ----------------------------------------- From c9de7664cee2494031c45b28880baf1bcfe58c2e Mon Sep 17 00:00:00 2001 From: dungba88 Date: Wed, 3 Apr 2024 21:38:10 +0900 Subject: [PATCH 10/10] Update CHANGES.txt --- lucene/CHANGES.txt | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 8ae5f89528e7..a398f9d9a46c 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -126,8 +126,6 @@ Improvements * GITHUB#12873: Expressions module now uses JEP 371 "Hidden Classes" with JEP 309 "Dynamic Class-File Constants" to implement Javascript expressions. (Uwe Schindler) -* GITHUB#12985: Make Lucene90BlockTreePostingsFormat to build FST off-heap for the root block. (Anh Dung Bui) - * GITHUB#11657, LUCENE-10621: Upgrade to OpenNLP 2.3.2. (Christine Poerschke, Eric Pugh) * GITHUB#13209: Upgrade snowball to 26db1ab9. (Robert Muir) @@ -149,6 +147,8 @@ Optimizations * GITHUB#12552: Make FSTPostingsFormat load FSTs off-heap. (Tony X) +* GITHUB#12985: Make Lucene90BlockTreePostingsFormat to build FST off-heap. (Anh Dung Bui) + Bug Fixes --------------------- @@ -159,7 +159,7 @@ Bug Fixes * GITHUB#12878: Fix the declared Exceptions of Expression#evaluate() to match those of DoubleValues#doubleValue(). (Uwe Schindler) - + Changes in Backwards Compatibility Policy ----------------------------------------- @@ -863,7 +863,7 @@ New Features closed while queries are running can no longer crash the JVM. To disable this feature, pass the following sysprop on Java command line: "-Dorg.apache.lucene.store.MMapDirectory.enableMemorySegments=false" (Uwe Schindler) - + * GITHUB#12252 Add function queries for computing similarity scores between knn vectors. (Elia Porciani, Alessandro Benedetti) Improvements @@ -1542,7 +1542,7 @@ New Features * LUCENE-10385: Implement Weight#count on IndexSortSortedNumericDocValuesRangeQuery to speed up computing the number of hits when possible. (Lu Xugang, Luca Cavanna, Adrien Grand) -* LUCENE-10422: Monitor Improvements: `Monitor` can use a custom `Directory` +* LUCENE-10422: Monitor Improvements: `Monitor` can use a custom `Directory` implementation. `Monitor` can be created with a readonly `QueryIndex` in order to have readonly `Monitor` instances. (Niko Usai) @@ -1601,7 +1601,7 @@ Optimizations term of each block as a dictionary when compressing suffixes of the other 63 terms of the block. (Adrien Grand) -* LUCENE-10411: Add nearest neighbors vectors support to ExitableDirectoryReader. +* LUCENE-10411: Add nearest neighbors vectors support to ExitableDirectoryReader. (Zach Chen, Adrien Grand, Julie Tibshirani, Tomoko Uchida) * LUCENE-10542: FieldSource exists implementations can avoid value retrieval (Kevin Risden) @@ -1766,7 +1766,7 @@ New Features points are indexed. (Quentin Pradet, Adrien Grand) -* LUCENE-10263: Added Weight#count to NormsFieldExistsQuery to speed up the query if all +* LUCENE-10263: Added Weight#count to NormsFieldExistsQuery to speed up the query if all documents have the field.. (Alan Woodward) * LUCENE-10248: Add SpanishPluralStemFilter, for precise stemming of Spanish plurals. @@ -1792,14 +1792,14 @@ New Features * LUCENE-10403: Add ArrayUtil#grow(T[]). 
(Greg Miller) -* LUCENE-10414: Add fn:fuzzyTerm interval function to flexible query parser (Dawid Weiss, +* LUCENE-10414: Add fn:fuzzyTerm interval function to flexible query parser (Dawid Weiss, Alan Woodward) - + * LUCENE-10378: Implement Weight#count for PointRangeQuery to provide a faster way to calculate the number of matching range docs when each doc has at-most one point and the points are 1-dimensional. (Gautam Worah, Ignacio Vera, Adrien Grand) -* LUCENE-10415: FunctionScoreQuery and IndexOrDocValuesQuery delegate Weight#count. (Ignacio Vera) +* LUCENE-10415: FunctionScoreQuery and IndexOrDocValuesQuery delegate Weight#count. (Ignacio Vera) * LUCENE-10382: Add support for filtering in KnnVectorQuery. This allows for finding the nearest k documents that also match a query. (Julie Tibshirani, Joel Bernstein) @@ -1816,10 +1816,10 @@ Improvements * LUCENE-10238: Upgrade icu4j dependency to 70.1. (Dawid Weiss) -* LUCENE-9820: Extract BKD tree interface and move intersecting logic to the +* LUCENE-9820: Extract BKD tree interface and move intersecting logic to the PointValues abstract class. (Ignacio Vera, Adrien Grand) - -* LUCENE-10262: Lift up restrictions for navigating PointValues#PointTree + +* LUCENE-10262: Lift up restrictions for navigating PointValues#PointTree added in LUCENE-9820 (Ignacio Vera) * LUCENE-9538: Detect polygon self-intersections in the Tessellator. (Ignacio Vera) @@ -1934,8 +1934,8 @@ Bug Fixes * LUCENE-10407: Containing intervals could sometimes yield incorrect matches when wrapped in a disjunction. (Alan Woodward, Dawid Weiss) - -* LUCENE-10405: When using the MemoryIndex, binary and Sorted doc values are stored + +* LUCENE-10405: When using the MemoryIndex, binary and Sorted doc values are stored as BytesRef instead of BytesRefHash so they don't have a limit on size. (Ignacio Vera) * LUCENE-10428: Queries with a misbehaving score function may no longer cause @@ -1967,7 +1967,7 @@ Other * LUCENE-10413: Make Ukrainian default stop words list available as a public getter. (Alan Woodward) -* LUCENE-10437: Polygon tessellator throws a more informative error message when the provided polygon +* LUCENE-10437: Polygon tessellator throws a more informative error message when the provided polygon does not contain enough no-collinear points. (Ignacio Vera) ======================= Lucene 9.0.0 ======================= @@ -2086,7 +2086,7 @@ API Changes only applicable for fields that are indexed with doc values only. (Mayya Sharipova, Adrien Grand, Simon Willnauer) -* LUCENE-9047: Directory API is now little endian. (Ignacio Vera, Adrien Grand) +* LUCENE-9047: Directory API is now little endian. (Ignacio Vera, Adrien Grand) * LUCENE-9948: No longer require the user to specify whether-or-not a field is multi-valued in LongValueFacetCounts (detect automatically based on what is indexed). (Greg Miller) @@ -2299,7 +2299,7 @@ Improvements (David Smiley) * LUCENE-10062: Switch taxonomy faceting to use numeric doc values for storing ordinals instead of binary doc values - with its own custom encoding. (Greg Miller) + with its own custom encoding. (Greg Miller) Bug fixes --------------------- @@ -2422,10 +2422,10 @@ Other * LUCENE-9822: Add assertion to PFOR exception encoding, documenting the BLOCK_SIZE assumption. (Greg Miller) * LUCENE-9883: Turn on ecj missingEnumCaseDespiteDefault setting. (Zach Chen) - -* LUCENE-9705: Make new versions of all index formats for the Lucene90 codec and move - the existing ones to the backwards codecs. 
(Julie Tibshirani, Ignacio Vera) - + +* LUCENE-9705: Make new versions of all index formats for the Lucene90 codec and move + the existing ones to the backwards codecs. (Julie Tibshirani, Ignacio Vera) + * LUCENE-9907: Remove dependency on PackedInts#getReader() from the current codecs and move the method to backwards codec. (Ignacio Vera)
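
Note (illustrative, not part of the patch series above): the on-heap vs. off-heap decision added in getFSTDataOutput reduces to comparing an estimated FST size against a heap limit, and sizing the on-heap buffer's pages from that same estimate. The self-contained sketch below mirrors only that arithmetic under stated assumptions; the constant and the bitsRequired helper are local stand-ins for DEFAULT_BLOCK_HEAP_LIMIT_BYTES and PackedInts.bitsRequired, and no Lucene classes are used.

// Minimal sketch, assuming a 512 KB heap limit and a bits-required helper
// equivalent to PackedInts.bitsRequired; both are stand-ins, not Lucene API.
public class FstOutputSizingSketch {

  // Mirrors the 512 KB default block heap limit introduced by the patches.
  static final long BLOCK_HEAP_LIMIT_BYTES = 512 * 1024;

  /** True when the estimated FST size should go to a temp (off-heap) output. */
  static boolean useOffHeapOutput(long estimatedBytes) {
    return estimatedBytes > BLOCK_HEAP_LIMIT_BYTES;
  }

  /** Bits needed to represent the value; stand-in for PackedInts.bitsRequired. */
  static int bitsRequired(long value) {
    return Math.max(1, 64 - Long.numberOfLeadingZeros(value));
  }

  /** Page size in bits for the on-heap buffer, clamped to [6, 15] as in the patch. */
  static int pageBits(long estimatedBytes) {
    return Math.min(15, Math.max(6, bitsRequired(estimatedBytes)));
  }

  public static void main(String[] args) {
    long estimate = 300 * 1024; // e.g. roughly 300 KB of sub-index bytes for a block
    System.out.println("off-heap=" + useOffHeapOutput(estimate)
        + " pageBits=" + pageBits(estimate));
    // prints: off-heap=false pageBits=15 (15 bits matches the default page size
    // the writer already passes to getOnHeapReaderWriter)
  }
}

The split of responsibilities follows from the diffs themselves: the root block writes straight to indexOut so the largest FST is never buffered, small sub-block FSTs stay on the heap to avoid temp-file churn, and only sub-block FSTs above the limit are routed through a tracked temp IndexOutput that is reopened as an IndexInput and cleaned up on close.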