From a6259fc48b49a62de8fa5b61241b418424e23c6a Mon Sep 17 00:00:00 2001 From: dungba88 Date: Tue, 14 Nov 2023 07:55:32 +0900 Subject: [PATCH 01/45] Move size() to FSTStore --- .../src/java/org/apache/lucene/util/fst/BytesStore.java | 5 ----- .../src/java/org/apache/lucene/util/fst/FSTCompiler.java | 2 +- .../src/java/org/apache/lucene/util/fst/FSTReader.java | 7 ------- .../core/src/java/org/apache/lucene/util/fst/FSTStore.java | 7 +++++++ 4 files changed, 8 insertions(+), 13 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java b/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java index 900822966a4f..e0e54a149a96 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java @@ -332,11 +332,6 @@ public long getPosition() { return ((long) blocks.size() - 1) * blockSize + nextWrite; } - @Override - public long size() { - return getPosition(); - } - /** * Pos must be less than the max position written so far! Ie, you cannot "grow" the file with * this! 
diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java index fe4316accfe4..6dfda3958ec0 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java @@ -849,7 +849,7 @@ void setEmptyOutput(T v) { } void finish(long newStartNode) { - assert newStartNode <= bytes.size(); + assert newStartNode <= bytes.getPosition(); if (fst.metadata.startNode != -1) { throw new IllegalStateException("already finished"); } diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FSTReader.java b/lucene/core/src/java/org/apache/lucene/util/fst/FSTReader.java index f299fcb75585..b13b24b1a554 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FSTReader.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FSTReader.java @@ -23,13 +23,6 @@ /** Abstraction for reading bytes necessary for FST. */ public interface FSTReader extends Accountable { - /** - * The raw size in bytes of the FST - * - * @return the FST size - */ - long size(); - /** * Get the reverse BytesReader for this FST * diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FSTStore.java b/lucene/core/src/java/org/apache/lucene/util/fst/FSTStore.java index 682503b2c44d..35fcd7f9afbd 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FSTStore.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FSTStore.java @@ -22,6 +22,13 @@ /** A type of {@link FSTReader} which needs data to be initialized before use */ public interface FSTStore extends FSTReader { + /** + * The raw size in bytes of the FST + * + * @return the FST size + */ + long size(); + /** * Initialize the FSTStore * From e0e1517a96faa0d2092c13ec0e1beb42f3cf6128 Mon Sep 17 00:00:00 2001 From: dungba88 Date: Tue, 14 Nov 2023 21:10:10 +0900 Subject: [PATCH 02/45] Remove size() completely --- .../src/java/org/apache/lucene/util/fst/FSTStore.java | 7 ------- 
.../java/org/apache/lucene/util/fst/OffHeapFSTStore.java | 1 - .../java/org/apache/lucene/util/fst/OnHeapFSTStore.java | 9 --------- 3 files changed, 17 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FSTStore.java b/lucene/core/src/java/org/apache/lucene/util/fst/FSTStore.java index 35fcd7f9afbd..682503b2c44d 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FSTStore.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FSTStore.java @@ -22,13 +22,6 @@ /** A type of {@link FSTReader} which needs data to be initialized before use */ public interface FSTStore extends FSTReader { - /** - * The raw size in bytes of the FST - * - * @return the FST size - */ - long size(); - /** * Initialize the FSTStore * diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/OffHeapFSTStore.java b/lucene/core/src/java/org/apache/lucene/util/fst/OffHeapFSTStore.java index 6c102dff451f..f88715b191cc 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/OffHeapFSTStore.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/OffHeapFSTStore.java @@ -56,7 +56,6 @@ public long ramBytesUsed() { return BASE_RAM_BYTES_USED; } - @Override public long size() { return numBytes; } diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/OnHeapFSTStore.java b/lucene/core/src/java/org/apache/lucene/util/fst/OnHeapFSTStore.java index b22c22ee4eb8..2113c93f4c3a 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/OnHeapFSTStore.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/OnHeapFSTStore.java @@ -64,15 +64,6 @@ public FSTStore init(DataInput in, long numBytes) throws IOException { return this; } - @Override - public long size() { - if (bytesArray != null) { - return bytesArray.length; - } else { - return bytes.getPosition(); - } - } - @Override public long ramBytesUsed() { long size = BASE_RAM_BYTES_USED; From f2d82340ba8e338e8052e9be4eff19ab6a51eb94 Mon Sep 17 00:00:00 2001 From: dungba88 Date: Wed, 15 Nov 2023 
14:11:47 +0900 Subject: [PATCH 03/45] Allow FST builder to use different DataOutput --- .../Lucene90BlockTreeTermsWriter.java | 4 +- .../lucene/util/fst/ByteBuffersFSTReader.java | 56 +++ .../apache/lucene/util/fst/BytesStore.java | 454 ++---------------- .../java/org/apache/lucene/util/fst/FST.java | 18 +- .../apache/lucene/util/fst/FSTCompiler.java | 223 ++++++--- .../org/apache/lucene/util/fst/NodeHash.java | 5 +- .../lucene/util/fst/OnHeapFSTStore.java | 20 +- .../lucene/util/fst/TestBytesStore.java | 182 +------ .../util/fst/TestFSTDataOutputWriter.java | 231 +++++++++ .../util/fst/TestFSTDirectAddressing.java | 2 +- .../org/apache/lucene/util/fst/TestFSTs.java | 3 +- .../lucene/tests/util/fst/FSTTester.java | 16 +- 12 files changed, 544 insertions(+), 670 deletions(-) create mode 100644 lucene/core/src/java/org/apache/lucene/util/fst/ByteBuffersFSTReader.java create mode 100644 lucene/core/src/test/org/apache/lucene/util/fst/TestFSTDataOutputWriter.java diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java index acd75092022f..56534db0a126 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java @@ -16,6 +16,8 @@ */ package org.apache.lucene.codecs.lucene90.blocktree; +import static org.apache.lucene.util.fst.FSTCompiler.getOnHeapDataOutput; + import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; @@ -525,7 +527,7 @@ public void compileIndex( // Disable suffixes sharing for block tree index because suffixes are mostly dropped // from the FST index and left in the term blocks. 
.suffixRAMLimitMB(0d) - .bytesPageBits(pageBits) + .dataOutput(getOnHeapDataOutput(pageBits)) .build(); // if (DEBUG) { // System.out.println(" compile index for prefix=" + prefix); diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/ByteBuffersFSTReader.java b/lucene/core/src/java/org/apache/lucene/util/fst/ByteBuffersFSTReader.java new file mode 100644 index 000000000000..4065c4987b8c --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/util/fst/ByteBuffersFSTReader.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.util.fst; + +import java.io.IOException; +import org.apache.lucene.store.ByteBuffersDataOutput; +import org.apache.lucene.store.DataOutput; + +/** An adapter class to use {@link ByteBuffersDataOutput} as a {@link FSTReader} */ +final class ByteBuffersFSTReader extends DataOutput implements FSTReader { + + private final ByteBuffersDataOutput dataOutput; + + public ByteBuffersFSTReader(ByteBuffersDataOutput dataOutput) { + this.dataOutput = dataOutput; + } + + @Override + public void writeByte(byte b) { + dataOutput.writeByte(b); + } + + @Override + public void writeBytes(byte[] b, int offset, int length) { + dataOutput.writeBytes(b, offset, length); + } + + @Override + public long ramBytesUsed() { + return dataOutput.ramBytesUsed(); + } + + @Override + public FST.BytesReader getReverseBytesReader() { + return new ReverseRandomAccessReader(dataOutput.toDataInput()); + } + + @Override + public void writeTo(DataOutput out) throws IOException { + dataOutput.copyTo(out); + } +} diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java b/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java index e0e54a149a96..5ff6adfac790 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java @@ -17,485 +17,129 @@ package org.apache.lucene.util.fst; import java.io.IOException; -import java.util.ArrayList; -import java.util.List; import org.apache.lucene.store.DataInput; import org.apache.lucene.store.DataOutput; +import org.apache.lucene.util.Accountable; +import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.RamUsageEstimator; -// TODO: merge with PagedBytes, except PagedBytes doesn't -// let you read while writing which FST needs - -class BytesStore extends DataOutput implements FSTReader { +// Storing a byte[] for the current node of the FST we are writing +class BytesStore extends DataOutput implements Accountable { 
private static final long BASE_RAM_BYTES_USED = - RamUsageEstimator.shallowSizeOfInstance(BytesStore.class) - + RamUsageEstimator.shallowSizeOfInstance(ArrayList.class); + RamUsageEstimator.shallowSizeOfInstance(BytesStore.class); - private final List blocks = new ArrayList<>(); + private static final int INITIAL_SIZE = 1 << 8; - private final int blockSize; - private final int blockBits; - private final int blockMask; + // holds an initial size of 256 bytes. this byte array will only grow, but not shrink + private byte[] bytes = new byte[INITIAL_SIZE]; - private byte[] current; private int nextWrite; - public BytesStore(int blockBits) { - this.blockBits = blockBits; - blockSize = 1 << blockBits; - blockMask = blockSize - 1; - nextWrite = blockSize; - } - /** Absolute write byte; you must ensure dest is < max position written so far. */ - public void writeByte(long dest, byte b) { - int blockIndex = (int) (dest >> blockBits); - byte[] block = blocks.get(blockIndex); - block[(int) (dest & blockMask)] = b; + public void writeByte(int dest, byte b) { + assert dest < nextWrite; + bytes[dest] = b; } @Override public void writeByte(byte b) { - if (nextWrite == blockSize) { - current = new byte[blockSize]; - blocks.add(current); - nextWrite = 0; - } - current[nextWrite++] = b; + ensureCapacity(1); + bytes[nextWrite++] = b; } @Override public void writeBytes(byte[] b, int offset, int len) { - while (len > 0) { - int chunk = blockSize - nextWrite; - if (len <= chunk) { - assert b != null; - assert current != null; - System.arraycopy(b, offset, current, nextWrite, len); - nextWrite += len; - break; - } else { - if (chunk > 0) { - System.arraycopy(b, offset, current, nextWrite, chunk); - offset += chunk; - len -= chunk; - } - current = new byte[blockSize]; - blocks.add(current); - nextWrite = 0; - } - } + ensureCapacity(len); + System.arraycopy(b, offset, bytes, nextWrite, len); + nextWrite += len; } - int getBlockBits() { - return blockBits; + /** + * Ensure we can write 
additional capacityToWrite bytes. The array will preferably grow x2 size. + * + * @param capacityToWrite the additional bytes to write + */ + private void ensureCapacity(int capacityToWrite) { + bytes = ArrayUtil.grow(bytes, nextWrite + capacityToWrite); } /** * Absolute writeBytes without changing the current position. Note: this cannot "grow" the bytes, * so you must only call it on already written parts. */ - void writeBytes(long dest, byte[] b, int offset, int len) { - // System.out.println(" BS.writeBytes dest=" + dest + " offset=" + offset + " len=" + len); + void writeBytes(int dest, byte[] b, int offset, int len) { assert dest + len <= getPosition() : "dest=" + dest + " pos=" + getPosition() + " len=" + len; - - // Note: weird: must go "backwards" because copyBytes - // calls us with overlapping src/dest. If we - // go forwards then we overwrite bytes before we can - // copy them: - - /* - int blockIndex = dest >> blockBits; - int upto = dest & blockMask; - byte[] block = blocks.get(blockIndex); - while (len > 0) { - int chunk = blockSize - upto; - System.out.println(" cycle chunk=" + chunk + " len=" + len); - if (len <= chunk) { - System.arraycopy(b, offset, block, upto, len); - break; - } else { - System.arraycopy(b, offset, block, upto, chunk); - offset += chunk; - len -= chunk; - blockIndex++; - block = blocks.get(blockIndex); - upto = 0; - } - } - */ - - final long end = dest + len; - int blockIndex = (int) (end >> blockBits); - int downTo = (int) (end & blockMask); - if (downTo == 0) { - blockIndex--; - downTo = blockSize; - } - byte[] block = blocks.get(blockIndex); - - while (len > 0) { - // System.out.println(" cycle downTo=" + downTo + " len=" + len); - if (len <= downTo) { - // System.out.println(" final: offset=" + offset + " len=" + len + " dest=" + - // (downTo-len)); - System.arraycopy(b, offset, block, downTo - len, len); - break; - } else { - len -= downTo; - // System.out.println(" partial: offset=" + (offset + len) + " len=" + downTo + " 
- // dest=0"); - System.arraycopy(b, offset + len, block, 0, downTo); - blockIndex--; - block = blocks.get(blockIndex); - downTo = blockSize; - } - } + System.arraycopy(b, offset, bytes, dest, len); } @Override public void copyBytes(DataInput input, long numBytes) throws IOException { assert numBytes >= 0 : "numBytes=" + numBytes; assert input != null; - long len = numBytes; - while (len > 0) { - int chunk = blockSize - nextWrite; - int l = (int) Math.min(chunk, len); - if (l > 0) { - assert current != null; - input.readBytes(current, nextWrite, l); - nextWrite += l; - len -= l; - } else { - current = new byte[blockSize]; - blocks.add(current); - nextWrite = 0; - } - } + int length = Math.toIntExact(numBytes); + ensureCapacity(length); + input.readBytes(bytes, nextWrite, length); + nextWrite += length; } /** * Absolute copy bytes self to self, without changing the position. Note: this cannot "grow" the * bytes, so must only call it on already written parts. */ - public void copyBytes(long src, long dest, int len) { - // System.out.println("BS.copyBytes src=" + src + " dest=" + dest + " len=" + len); + public void copyBytes(int src, int dest, int len) { assert src < dest; - - // Note: weird: must go "backwards" because copyBytes - // calls us with overlapping src/dest. 
If we - // go forwards then we overwrite bytes before we can - // copy them: - - /* - int blockIndex = src >> blockBits; - int upto = src & blockMask; - byte[] block = blocks.get(blockIndex); - while (len > 0) { - int chunk = blockSize - upto; - System.out.println(" cycle: chunk=" + chunk + " len=" + len); - if (len <= chunk) { - writeBytes(dest, block, upto, len); - break; - } else { - writeBytes(dest, block, upto, chunk); - blockIndex++; - block = blocks.get(blockIndex); - upto = 0; - len -= chunk; - dest += chunk; - } - } - */ - - long end = src + len; - - int blockIndex = (int) (end >> blockBits); - int downTo = (int) (end & blockMask); - if (downTo == 0) { - blockIndex--; - downTo = blockSize; - } - byte[] block = blocks.get(blockIndex); - - while (len > 0) { - // System.out.println(" cycle downTo=" + downTo); - if (len <= downTo) { - // System.out.println(" finish"); - writeBytes(dest, block, downTo - len, len); - break; - } else { - // System.out.println(" partial"); - len -= downTo; - writeBytes(dest + len, block, 0, downTo); - blockIndex--; - block = blocks.get(blockIndex); - downTo = blockSize; - } - } + writeBytes(dest, bytes, src, len); } /** Copies bytes from this store to a target byte array. */ - public void copyBytes(long src, byte[] dest, int offset, int len) { - int blockIndex = (int) (src >> blockBits); - int upto = (int) (src & blockMask); - byte[] block = blocks.get(blockIndex); - while (len > 0) { - int chunk = blockSize - upto; - if (len <= chunk) { - System.arraycopy(block, upto, dest, offset, len); - break; - } else { - System.arraycopy(block, upto, dest, offset, chunk); - blockIndex++; - block = blocks.get(blockIndex); - upto = 0; - len -= chunk; - offset += chunk; - } - } - } - - /** Writes an int at the absolute position without changing the current pointer. 
*/ - public void writeInt(long pos, int value) { - int blockIndex = (int) (pos >> blockBits); - int upto = (int) (pos & blockMask); - byte[] block = blocks.get(blockIndex); - int shift = 24; - for (int i = 0; i < 4; i++) { - block[upto++] = (byte) (value >> shift); - shift -= 8; - if (upto == blockSize) { - upto = 0; - blockIndex++; - block = blocks.get(blockIndex); - } - } + public void copyBytes(int src, byte[] dest, int offset, int len) { + System.arraycopy(bytes, src, dest, offset, len); } /** Reverse from srcPos, inclusive, to destPos, inclusive. */ - public void reverse(long srcPos, long destPos) { - assert srcPos < destPos; - assert destPos < getPosition(); - // System.out.println("reverse src=" + srcPos + " dest=" + destPos); - - int srcBlockIndex = (int) (srcPos >> blockBits); - int src = (int) (srcPos & blockMask); - byte[] srcBlock = blocks.get(srcBlockIndex); - - int destBlockIndex = (int) (destPos >> blockBits); - int dest = (int) (destPos & blockMask); - byte[] destBlock = blocks.get(destBlockIndex); - // System.out.println(" srcBlock=" + srcBlockIndex + " destBlock=" + destBlockIndex); - - int limit = (int) (destPos - srcPos + 1) / 2; + public void reverse() { + int src = 0; + int dest = nextWrite - 1; + int limit = (dest - src + 1) / 2; for (int i = 0; i < limit; i++) { - // System.out.println(" cycle src=" + src + " dest=" + dest); - byte b = srcBlock[src]; - srcBlock[src] = destBlock[dest]; - destBlock[dest] = b; - src++; - if (src == blockSize) { - srcBlockIndex++; - srcBlock = blocks.get(srcBlockIndex); - // System.out.println(" set destBlock=" + destBlock + " srcBlock=" + srcBlock); - src = 0; - } - - dest--; - if (dest == -1) { - destBlockIndex--; - destBlock = blocks.get(destBlockIndex); - // System.out.println(" set destBlock=" + destBlock + " srcBlock=" + srcBlock); - dest = blockSize - 1; - } + byte b = bytes[src + i]; + bytes[src + i] = bytes[dest - i]; + bytes[dest - i] = b; } } public void skipBytes(int len) { - while (len > 0) { - int 
chunk = blockSize - nextWrite; - if (len <= chunk) { - nextWrite += len; - break; - } else { - len -= chunk; - current = new byte[blockSize]; - blocks.add(current); - nextWrite = 0; - } - } + ensureCapacity(len); + nextWrite += len; } - public long getPosition() { - return ((long) blocks.size() - 1) * blockSize + nextWrite; + public int getPosition() { + return nextWrite; } /** * Pos must be less than the max position written so far! Ie, you cannot "grow" the file with * this! */ - public void truncate(long newLen) { + public void truncate(int newLen) { assert newLen <= getPosition(); assert newLen >= 0; - int blockIndex = (int) (newLen >> blockBits); - nextWrite = (int) (newLen & blockMask); - if (nextWrite == 0) { - blockIndex--; - nextWrite = blockSize; - } - blocks.subList(blockIndex + 1, blocks.size()).clear(); - if (newLen == 0) { - current = null; - } else { - current = blocks.get(blockIndex); - } - assert newLen == getPosition(); - } - - public void finish() { - if (current != null) { - byte[] lastBuffer = new byte[nextWrite]; - System.arraycopy(current, 0, lastBuffer, 0, nextWrite); - blocks.set(blocks.size() - 1, lastBuffer); - current = null; - } + nextWrite = newLen; } /** Writes all of our bytes to the target {@link DataOutput}. 
*/ - @Override public void writeTo(DataOutput out) throws IOException { - for (byte[] block : blocks) { - out.writeBytes(block, 0, block.length); - } - } - - public FST.BytesReader getForwardReader() { - if (blocks.size() == 1) { - return new ForwardBytesReader(blocks.get(0)); - } - return new FST.BytesReader() { - private byte[] current; - private int nextBuffer; - private int nextRead = blockSize; - - @Override - public byte readByte() { - if (nextRead == blockSize) { - current = blocks.get(nextBuffer++); - nextRead = 0; - } - return current[nextRead++]; - } - - @Override - public void skipBytes(long count) { - setPosition(getPosition() + count); - } - - @Override - public void readBytes(byte[] b, int offset, int len) { - while (len > 0) { - int chunkLeft = blockSize - nextRead; - if (len <= chunkLeft) { - System.arraycopy(current, nextRead, b, offset, len); - nextRead += len; - break; - } else { - if (chunkLeft > 0) { - System.arraycopy(current, nextRead, b, offset, chunkLeft); - offset += chunkLeft; - len -= chunkLeft; - } - current = blocks.get(nextBuffer++); - nextRead = 0; - } - } - } - - @Override - public long getPosition() { - return ((long) nextBuffer - 1) * blockSize + nextRead; - } - - @Override - public void setPosition(long pos) { - int bufferIndex = (int) (pos >> blockBits); - if (nextBuffer != bufferIndex + 1) { - nextBuffer = bufferIndex + 1; - current = blocks.get(bufferIndex); - } - nextRead = (int) (pos & blockMask); - assert getPosition() == pos; - } - }; - } - - @Override - public FST.BytesReader getReverseBytesReader() { - if (blocks.size() == 1) { - return new ReverseBytesReader(blocks.get(0)); - } - return new FST.BytesReader() { - private byte[] current = blocks.size() == 0 ? 
null : blocks.get(0); - private int nextBuffer = -1; - private int nextRead = 0; - - @Override - public byte readByte() { - if (nextRead == -1) { - current = blocks.get(nextBuffer--); - nextRead = blockSize - 1; - } - return current[nextRead--]; - } - - @Override - public void skipBytes(long count) { - setPosition(getPosition() - count); - } - - @Override - public void readBytes(byte[] b, int offset, int len) { - for (int i = 0; i < len; i++) { - b[offset + i] = readByte(); - } - } - - @Override - public long getPosition() { - return ((long) nextBuffer + 1) * blockSize + nextRead; - } - - @Override - public void setPosition(long pos) { - // NOTE: a little weird because if you - // setPosition(0), the next byte you read is - // bytes[0] ... but I would expect bytes[-1] (ie, - // EOF)...? - int bufferIndex = (int) (pos >> blockBits); - if (nextBuffer != bufferIndex - 1) { - nextBuffer = bufferIndex - 1; - current = blocks.get(bufferIndex); - } - nextRead = (int) (pos & blockMask); - assert getPosition() == pos : "pos=" + pos + " getPos()=" + getPosition(); - } - }; + out.writeBytes(bytes, 0, nextWrite); } @Override public long ramBytesUsed() { - long size = BASE_RAM_BYTES_USED; - for (byte[] block : blocks) { - size += RamUsageEstimator.sizeOf(block); - } - return size; + return BASE_RAM_BYTES_USED + RamUsageEstimator.sizeOf(bytes); } @Override public String toString() { - return getClass().getSimpleName() + "(numBlocks=" + blocks.size() + ")"; + return getClass().getSimpleName() + "(pos=" + nextWrite + ")"; } } diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java index 7bf6ba8358b5..77c26542ca8e 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java @@ -17,6 +17,7 @@ package org.apache.lucene.util.fst; import static org.apache.lucene.util.fst.FST.Arc.BitTable; +import static 
org.apache.lucene.util.fst.FSTCompiler.getOnHeapDataOutput; import java.io.BufferedInputStream; import java.io.BufferedOutputStream; @@ -435,6 +436,13 @@ public FST(FSTMetadata metadata, DataInput in, Outputs outputs, FSTStore f this.fstReader = fstReader; } + /** + * @return true if and only if this FST is readable (i.e. has a reverse BytesReader) + */ + public boolean hasReverseBytesReader() { + return fstReader.getReverseBytesReader() != null; + } + /** * Read the FST metadata from DataInput * @@ -453,12 +461,12 @@ public static FSTMetadata readMetadata(DataInput metaIn, Outputs outpu if (metaIn.readByte() == 1) { // accepts empty string // 1 KB blocks: - BytesStore emptyBytes = new BytesStore(10); + DataOutput emptyBytes = getOnHeapDataOutput(10); int numBytes = metaIn.readVInt(); emptyBytes.copyBytes(metaIn, numBytes); // De-serialize empty-string output: - BytesReader reader = emptyBytes.getReverseBytesReader(); + BytesReader reader = ((FSTReader) emptyBytes).getReverseBytesReader(); // NoOutputs uses 0 bytes when writing its output, // so we have to check here else BytesStore gets // angry: @@ -1181,7 +1189,11 @@ private void seekToNextNode(BytesReader in) throws IOException { } } - /** Returns a {@link BytesReader} for this FST, positioned at position 0. */ + /** + * Returns a {@link BytesReader} for this FST, positioned at position 0. 
+ * + * @see #hasReverseBytesReader() + */ public BytesReader getBytesReader() { return fstReader.getReverseBytesReader(); } diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java index 93ed5a4c7462..a59a6990b6ac 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java @@ -16,6 +16,8 @@ */ package org.apache.lucene.util.fst; +import static org.apache.lucene.store.ByteBuffersDataOutput.ALLOCATE_BB_ON_HEAP; +import static org.apache.lucene.store.ByteBuffersDataOutput.NO_REUSE; import static org.apache.lucene.util.fst.FST.ARCS_FOR_BINARY_SEARCH; import static org.apache.lucene.util.fst.FST.ARCS_FOR_CONTINUOUS; import static org.apache.lucene.util.fst.FST.ARCS_FOR_DIRECT_ADDRESSING; @@ -32,15 +34,13 @@ import java.io.IOException; import org.apache.lucene.store.ByteArrayDataOutput; +import org.apache.lucene.store.ByteBuffersDataOutput; import org.apache.lucene.store.DataOutput; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.fst.FST.INPUT_TYPE; // javadoc -// TODO: could we somehow stream an FST to disk while we -// build it? - /** * Builds a minimal FST (maps an IntsRef term to an arbitrary output) from pre-sorted terms with * outputs. The FST becomes an FSA if you use NoOutputs. 
The FST is written on-the-fly into a @@ -59,6 +59,8 @@ */ public class FSTCompiler { + static final int DEFAULT_BLOCK_BITS = 15; + static final float DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR = 1f; /** @@ -120,22 +122,45 @@ public class FSTCompiler { final float directAddressingMaxOversizingFactor; long directAddressingExpansionCredit; - final BytesStore bytes; + // the DataOutput to write the FST to + final DataOutput dataOutput; + + // buffer to store bytes for the one node we are currently writing + final BytesStore scratchBytes = new BytesStore(); + + private long numBytesWritten; + + /** + * Get an on-heap DataOutput that allows the FST to be read immediately after writing. + * + * @param blockBits how many bits wide to make each block of the DataOutput + * @return the DataOutput + */ + public static DataOutput getOnHeapDataOutput(int blockBits) { + return new ByteBuffersFSTReader( + new ByteBuffersDataOutput(blockBits, blockBits, ALLOCATE_BB_ON_HEAP, NO_REUSE)); + } private FSTCompiler( FST.INPUT_TYPE inputType, double suffixRAMLimitMB, Outputs outputs, boolean allowFixedLengthArcs, - int bytesPageBits, - float directAddressingMaxOversizingFactor) { + DataOutput dataOutput, + float directAddressingMaxOversizingFactor) + throws IOException { this.allowFixedLengthArcs = allowFixedLengthArcs; this.directAddressingMaxOversizingFactor = directAddressingMaxOversizingFactor; - bytes = new BytesStore(bytesPageBits); // pad: ensure no node gets address 0 which is reserved to mean // the stop state w/ no arcs - bytes.writeByte((byte) 0); - fst = new FST<>(new FST.FSTMetadata<>(inputType, null, -1, VERSION_CURRENT, 0), outputs, bytes); + dataOutput.writeByte((byte) 0); + numBytesWritten++; + this.dataOutput = dataOutput; + fst = + new FST<>( + new FST.FSTMetadata<>(inputType, null, -1, VERSION_CURRENT, 0), + outputs, + toFSTReader(dataOutput)); if (suffixRAMLimitMB < 0) { throw new IllegalArgumentException("ramLimitMB must be >= 0; got: " + suffixRAMLimitMB); } else if 
(suffixRAMLimitMB > 0) { @@ -153,6 +178,34 @@ private FSTCompiler( } } + // Get the respective FSTReader of the DataOutput. If the DataOutput is also a FSTReader then we + // will use it. Otherwise, we will use NullFSTReader, which does not allow reading. + private FSTReader toFSTReader(DataOutput dataOutput) { + if (dataOutput instanceof FSTReader) { + return (FSTReader) dataOutput; + } + return new NullFSTReader(); + } + + private static final class NullFSTReader implements FSTReader { + + @Override + public FST.BytesReader getReverseBytesReader() { + return null; + } + + @Override + public void writeTo(DataOutput out) { + throw new UnsupportedOperationException("writeTo(DataOutput) is not supported"); + } + + @Override + public long ramBytesUsed() { + return 0; + } + } + ; + /** * Fluent-style constructor for FST {@link FSTCompiler}. * @@ -165,7 +218,7 @@ public static class Builder { private final Outputs outputs; private double suffixRAMLimitMB = 32.0; private boolean allowFixedLengthArcs = true; - private int bytesPageBits = 15; + private DataOutput dataOutput = getOnHeapDataOutput(DEFAULT_BLOCK_BITS); private float directAddressingMaxOversizingFactor = DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR; /** @@ -218,13 +271,19 @@ public Builder allowFixedLengthArcs(boolean allowFixedLengthArcs) { } /** - * How many bits wide to make each byte[] block in the BytesStore; if you know the FST will be - * large then make this larger. For example 15 bits = 32768 byte pages. + * Set the {@link DataOutput} which is used for low-level writing of FST. If you want the FST to + * be readable, you need to use a DataOutput that also implements {@link FSTReader}, such as + * {@link FSTCompiler#getOnHeapDataOutput(int)}. * - *

Default = 15. + *

Otherwise you need to construct the corresponding {@link + * org.apache.lucene.store.DataInput} and use the FST constructor to read it. + * + * @param dataOutput the DataOutput + * @return this builder + * @see FSTCompiler#getOnHeapDataOutput(int) */ - public Builder bytesPageBits(int bytesPageBits) { - this.bytesPageBits = bytesPageBits; + public Builder dataOutput(DataOutput dataOutput) { + this.dataOutput = dataOutput; return this; } @@ -248,15 +307,17 @@ public Builder directAddressingMaxOversizingFactor(float factor) { /** Creates a new {@link FSTCompiler}. */ public FSTCompiler build() { - FSTCompiler fstCompiler = - new FSTCompiler<>( - inputType, - suffixRAMLimitMB, - outputs, - allowFixedLengthArcs, - bytesPageBits, - directAddressingMaxOversizingFactor); - return fstCompiler; + try { + return new FSTCompiler<>( + inputType, + suffixRAMLimitMB, + outputs, + allowFixedLengthArcs, + dataOutput, + directAddressingMaxOversizingFactor); + } catch (IOException e) { + throw new RuntimeException(e); + } } } @@ -277,9 +338,9 @@ public long getMappedStateCount() { return dedupHash == null ? 
0 : nodeCount; } - private CompiledNode compileNode(UnCompiledNode nodeIn, int tailLength) throws IOException { + private CompiledNode compileNode(UnCompiledNode nodeIn) throws IOException { final long node; - long bytesPosStart = bytes.getPosition(); + long bytesPosStart = numBytesWritten; if (dedupHash != null) { if (nodeIn.numArcs == 0) { node = addNode(nodeIn); @@ -290,9 +351,10 @@ private CompiledNode compileNode(UnCompiledNode nodeIn, int tailLength) throw } else { node = addNode(nodeIn); } + assert node != -2; - long bytesPosEnd = bytes.getPosition(); + long bytesPosEnd = numBytesWritten; if (bytesPosEnd != bytesPosStart) { // The FST added a new node: assert bytesPosEnd > bytesPosStart; @@ -317,8 +379,11 @@ long addNode(FSTCompiler.UnCompiledNode nodeIn) throws IOException { return NON_FINAL_END_NODE; } } - final long startAddress = bytes.getPosition(); - // System.out.println(" startAddr=" + startAddress); + // reset the scratch writer to prepare for new write + scratchBytes.truncate(0); + + // the scratch writer must be cleaned at this point + assert scratchBytes.getPosition() == 0; final boolean doFixedLengthArcs = shouldExpandNodeWithFixedLengthArcs(nodeIn); if (doFixedLengthArcs) { @@ -333,7 +398,7 @@ long addNode(FSTCompiler.UnCompiledNode nodeIn) throws IOException { final int lastArc = nodeIn.numArcs - 1; - long lastArcStart = bytes.getPosition(); + long lastArcStart = 0; int maxBytesPerArc = 0; int maxBytesPerArcWithoutLabel = 0; for (int arcIdx = 0; arcIdx < nodeIn.numArcs; arcIdx++) { @@ -373,38 +438,38 @@ long addNode(FSTCompiler.UnCompiledNode nodeIn) throws IOException { flags += BIT_ARC_HAS_OUTPUT; } - bytes.writeByte((byte) flags); - long labelStart = bytes.getPosition(); - writeLabel(bytes, arc.label); - int numLabelBytes = (int) (bytes.getPosition() - labelStart); + scratchBytes.writeByte((byte) flags); + long labelStart = scratchBytes.getPosition(); + writeLabel(scratchBytes, arc.label); + int numLabelBytes = (int) 
(scratchBytes.getPosition() - labelStart); // System.out.println(" write arc: label=" + (char) arc.label + " flags=" + flags + " // target=" + target.node + " pos=" + bytes.getPosition() + " output=" + // outputs.outputToString(arc.output)); if (arc.output != NO_OUTPUT) { - fst.outputs.write(arc.output, bytes); + fst.outputs.write(arc.output, scratchBytes); // System.out.println(" write output"); } if (arc.nextFinalOutput != NO_OUTPUT) { // System.out.println(" write final output"); - fst.outputs.writeFinalOutput(arc.nextFinalOutput, bytes); + fst.outputs.writeFinalOutput(arc.nextFinalOutput, scratchBytes); } if (targetHasArcs && (flags & BIT_TARGET_NEXT) == 0) { assert target.node > 0; // System.out.println(" write target"); - bytes.writeVLong(target.node); + scratchBytes.writeVLong(target.node); } // just write the arcs "like normal" on first pass, but record how many bytes each one took // and max byte size: if (doFixedLengthArcs) { - int numArcBytes = (int) (bytes.getPosition() - lastArcStart); + int numArcBytes = (int) (scratchBytes.getPosition() - lastArcStart); numBytesPerArc[arcIdx] = numArcBytes; numLabelBytesPerArc[arcIdx] = numLabelBytes; - lastArcStart = bytes.getPosition(); + lastArcStart = scratchBytes.getPosition(); maxBytesPerArc = Math.max(maxBytesPerArc, numArcBytes); maxBytesPerArcWithoutLabel = Math.max(maxBytesPerArcWithoutLabel, numArcBytes - numLabelBytes); @@ -441,23 +506,25 @@ long addNode(FSTCompiler.UnCompiledNode nodeIn) throws IOException { boolean continuousLabel = labelRange == nodeIn.numArcs; if (continuousLabel) { writeNodeForDirectAddressingOrContinuous( - nodeIn, startAddress, maxBytesPerArcWithoutLabel, labelRange, true); + nodeIn, maxBytesPerArcWithoutLabel, labelRange, true); continuousNodeCount++; } else if (shouldExpandNodeWithDirectAddressing( nodeIn, maxBytesPerArc, maxBytesPerArcWithoutLabel, labelRange)) { writeNodeForDirectAddressingOrContinuous( - nodeIn, startAddress, maxBytesPerArcWithoutLabel, labelRange, false); + 
nodeIn, maxBytesPerArcWithoutLabel, labelRange, false); directAddressingNodeCount++; } else { - writeNodeForBinarySearch(nodeIn, startAddress, maxBytesPerArc); + writeNodeForBinarySearch(nodeIn, maxBytesPerArc); binarySearchNodeCount++; } } - final long thisNodeAddress = bytes.getPosition() - 1; - bytes.reverse(startAddress, thisNodeAddress); + scratchBytes.reverse(); + scratchBytes.writeTo(dataOutput); + numBytesWritten += scratchBytes.getPosition(); + nodeCount++; - return thisNodeAddress; + return numBytesWritten - 1; } private void writeLabel(DataOutput out, int v) throws IOException { @@ -532,8 +599,7 @@ private boolean shouldExpandNodeWithDirectAddressing( return false; } - private void writeNodeForBinarySearch( - FSTCompiler.UnCompiledNode nodeIn, long startAddress, int maxBytesPerArc) { + private void writeNodeForBinarySearch(FSTCompiler.UnCompiledNode nodeIn, int maxBytesPerArc) { // Build the header in a buffer. // It is a false/special arc which is in fact a node header with node flags followed by node // metadata. @@ -545,11 +611,11 @@ private void writeNodeForBinarySearch( int headerLen = fixedLengthArcsBuffer.getPosition(); // Expand the arcs in place, backwards. - long srcPos = bytes.getPosition(); - long destPos = startAddress + headerLen + nodeIn.numArcs * (long) maxBytesPerArc; + int srcPos = scratchBytes.getPosition(); + int destPos = headerLen + nodeIn.numArcs * maxBytesPerArc; assert destPos >= srcPos; if (destPos > srcPos) { - bytes.skipBytes((int) (destPos - srcPos)); + scratchBytes.skipBytes(destPos - srcPos); for (int arcIdx = nodeIn.numArcs - 1; arcIdx >= 0; arcIdx--) { destPos -= maxBytesPerArc; int arcLen = numBytesPerArc[arcIdx]; @@ -568,18 +634,17 @@ private void writeNodeForBinarySearch( + arcLen + " nodeIn.numArcs=" + nodeIn.numArcs; - bytes.copyBytes(srcPos, destPos, arcLen); + scratchBytes.copyBytes(srcPos, destPos, arcLen); } } } // Write the header. 
- bytes.writeBytes(startAddress, fixedLengthArcsBuffer.getBytes(), 0, headerLen); + scratchBytes.writeBytes(0, fixedLengthArcsBuffer.getBytes(), 0, headerLen); } private void writeNodeForDirectAddressingOrContinuous( FSTCompiler.UnCompiledNode nodeIn, - long startAddress, int maxBytesPerArcWithoutLabel, int labelRange, boolean continuous) { @@ -590,7 +655,7 @@ private void writeNodeForDirectAddressingOrContinuous( // the presence bits, and the first label. Keep the first label. int headerMaxLen = 11; int numPresenceBytes = continuous ? 0 : getNumPresenceBytes(labelRange); - long srcPos = bytes.getPosition(); + int srcPos = scratchBytes.getPosition(); int totalArcBytes = numLabelBytesPerArc[0] + nodeIn.numArcs * maxBytesPerArcWithoutLabel; int bufferOffset = headerMaxLen + numPresenceBytes + totalArcBytes; byte[] buffer = fixedLengthArcsBuffer.ensureCapacity(bufferOffset).getBytes(); @@ -601,16 +666,16 @@ private void writeNodeForDirectAddressingOrContinuous( srcPos -= srcArcLen; int labelLen = numLabelBytesPerArc[arcIdx]; // Copy the flags. - bytes.copyBytes(srcPos, buffer, bufferOffset, 1); + scratchBytes.copyBytes(srcPos, buffer, bufferOffset, 1); // Skip the label, copy the remaining. int remainingArcLen = srcArcLen - 1 - labelLen; if (remainingArcLen != 0) { - bytes.copyBytes(srcPos + 1 + labelLen, buffer, bufferOffset + 1, remainingArcLen); + scratchBytes.copyBytes(srcPos + 1 + labelLen, buffer, bufferOffset + 1, remainingArcLen); } if (arcIdx == 0) { // Copy the label of the first arc only. bufferOffset -= labelLen; - bytes.copyBytes(srcPos + 1, buffer, bufferOffset, labelLen); + scratchBytes.copyBytes(srcPos + 1, buffer, bufferOffset, labelLen); } } assert bufferOffset == headerMaxLen + numPresenceBytes; @@ -627,19 +692,18 @@ private void writeNodeForDirectAddressingOrContinuous( int headerLen = fixedLengthArcsBuffer.getPosition(); // Prepare the builder byte store. Enlarge or truncate if needed. 
- long nodeEnd = startAddress + headerLen + numPresenceBytes + totalArcBytes; - long currentPosition = bytes.getPosition(); + int nodeEnd = headerLen + numPresenceBytes + totalArcBytes; + int currentPosition = scratchBytes.getPosition(); if (nodeEnd >= currentPosition) { - bytes.skipBytes((int) (nodeEnd - currentPosition)); + scratchBytes.skipBytes(nodeEnd - currentPosition); } else { - bytes.truncate(nodeEnd); + scratchBytes.truncate(nodeEnd); } - assert bytes.getPosition() == nodeEnd; + assert scratchBytes.getPosition() == nodeEnd; // Write the header. - long writeOffset = startAddress; - bytes.writeBytes(writeOffset, fixedLengthArcsBuffer.getBytes(), 0, headerLen); - writeOffset += headerLen; + scratchBytes.writeBytes(0, fixedLengthArcsBuffer.getBytes(), 0, headerLen); + int writeOffset = headerLen; // Write the presence bits if (continuous == false) { @@ -648,12 +712,13 @@ private void writeNodeForDirectAddressingOrContinuous( } // Write the first label and the arcs. - bytes.writeBytes(writeOffset, fixedLengthArcsBuffer.getBytes(), bufferOffset, totalArcBytes); + scratchBytes.writeBytes( + writeOffset, fixedLengthArcsBuffer.getBytes(), bufferOffset, totalArcBytes); } private void writePresenceBits( - FSTCompiler.UnCompiledNode nodeIn, long dest, int numPresenceBytes) { - long bytePos = dest; + FSTCompiler.UnCompiledNode nodeIn, int dest, int numPresenceBytes) { + int bytePos = dest; byte presenceBits = 1; // The first arc is always present. 
int presenceIndex = 0; int previousLabel = nodeIn.arcs[0].label; @@ -662,7 +727,7 @@ private void writePresenceBits( assert label > previousLabel; presenceIndex += label - previousLabel; while (presenceIndex >= Byte.SIZE) { - bytes.writeByte(bytePos++, presenceBits); + scratchBytes.writeByte(bytePos++, presenceBits); presenceBits = 0; presenceIndex -= Byte.SIZE; } @@ -673,7 +738,7 @@ private void writePresenceBits( assert presenceIndex == (nodeIn.arcs[nodeIn.numArcs - 1].label - nodeIn.arcs[0].label) % 8; assert presenceBits != 0; // The last byte is not 0. assert (presenceBits & (1 << presenceIndex)) != 0; // The last arc is always present. - bytes.writeByte(bytePos++, presenceBits); + scratchBytes.writeByte(bytePos++, presenceBits); assert bytePos - dest == numPresenceBytes; } @@ -700,11 +765,7 @@ private void freezeTail(int prefixLenPlus1) throws IOException { // this node makes it and we now compile it. first, // compile any targets that were previously // undecided: - parent.replaceLast( - lastInput.intAt(idx - 1), - compileNode(node, 1 + lastInput.length() - idx), - nextFinalOutput, - isFinal); + parent.replaceLast(lastInput.intAt(idx - 1), compileNode(node), nextFinalOutput, isFinal); } } @@ -840,7 +901,7 @@ void setEmptyOutput(T v) { } void finish(long newStartNode) { - assert newStartNode <= bytes.getPosition(); + assert newStartNode <= numBytesWritten; if (fst.metadata.startNode != -1) { throw new IllegalStateException("already finished"); } @@ -848,7 +909,8 @@ void finish(long newStartNode) { newStartNode = 0; } fst.metadata.startNode = newStartNode; - fst.metadata.numBytes = bytes.getPosition(); + fst.metadata.numBytes = numBytesWritten; + scratchBytes.truncate(0); } private boolean validOutput(T output) { @@ -856,6 +918,8 @@ private boolean validOutput(T output) { } /** Returns final FST. NOTE: this will return null if nothing is accepted by the FST. 
*/ + // TODO: make this method to only return the FSTMetadata and user needs to construct the FST + // themselves public FST compile() throws IOException { final UnCompiledNode root = frontier[0]; @@ -870,8 +934,7 @@ public FST compile() throws IOException { // if (DEBUG) System.out.println(" builder.finish root.isFinal=" + root.isFinal + " // root.output=" + root.output); - finish(compileNode(root, lastInput.length()).node); - bytes.finish(); + finish(compileNode(root).node); return fst; } @@ -894,7 +957,7 @@ interface Node { } public long fstRamBytesUsed() { - return fst.ramBytesUsed(); + return scratchBytes.ramBytesUsed(); } static final class CompiledNode implements Node { diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java b/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java index faf258afde06..873d7eacfa4c 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java @@ -129,7 +129,6 @@ public long add(FSTCompiler.UnCompiledNode nodeIn) throws IOException { } else { // not in fallback either -- freeze & add the incoming node - long startAddress = fstCompiler.bytes.getPosition(); // freeze & add nodeAddress = fstCompiler.addNode(nodeIn); @@ -137,8 +136,8 @@ public long add(FSTCompiler.UnCompiledNode nodeIn) throws IOException { // we use 0 as empty marker in hash table, so it better be impossible to get a frozen node // at 0: assert nodeAddress != FST.FINAL_END_NODE && nodeAddress != FST.NON_FINAL_END_NODE; - byte[] buf = new byte[Math.toIntExact(nodeAddress - startAddress + 1)]; - fstCompiler.bytes.copyBytes(startAddress, buf, 0, buf.length); + byte[] buf = new byte[Math.toIntExact(fstCompiler.scratchBytes.getPosition())]; + fstCompiler.scratchBytes.copyBytes(0, buf, 0, buf.length); primaryTable.setNodeAddress(hashSlot, nodeAddress); primaryTable.copyNodeBytes(hashSlot, buf); diff --git 
a/lucene/core/src/java/org/apache/lucene/util/fst/OnHeapFSTStore.java b/lucene/core/src/java/org/apache/lucene/util/fst/OnHeapFSTStore.java index 2113c93f4c3a..4aafd1f6af1c 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/OnHeapFSTStore.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/OnHeapFSTStore.java @@ -16,6 +16,8 @@ */ package org.apache.lucene.util.fst; +import static org.apache.lucene.util.fst.FSTCompiler.getOnHeapDataOutput; + import java.io.IOException; import org.apache.lucene.store.DataInput; import org.apache.lucene.store.DataOutput; @@ -32,10 +34,10 @@ public final class OnHeapFSTStore implements FSTStore { RamUsageEstimator.shallowSizeOfInstance(OnHeapFSTStore.class); /** - * A {@link BytesStore}, used during building, or during reading when the FST is very large (more - * than 1 GB). If the FST is less than 1 GB then bytesArray is set instead. + * A {@link ByteBuffersFSTReader}, used during reading when the FST is very large (more than 1 + * GB). If the FST is less than 1 GB then bytesArray is set instead. */ - private BytesStore bytes; + private ByteBuffersFSTReader byteBuffersReader; /** Used at read time when the FST fits into a single byte[]. 
*/ private byte[] bytesArray; @@ -54,8 +56,8 @@ public OnHeapFSTStore(int maxBlockBits) { public FSTStore init(DataInput in, long numBytes) throws IOException { if (numBytes > 1 << this.maxBlockBits) { // FST is big: we need multiple pages - bytes = new BytesStore(this.maxBlockBits); - bytes.copyBytes(in, numBytes); + byteBuffersReader = (ByteBuffersFSTReader) getOnHeapDataOutput(maxBlockBits); + byteBuffersReader.copyBytes(in, numBytes); } else { // FST fits into a single block: use ByteArrayBytesStoreReader for less overhead bytesArray = new byte[(int) numBytes]; @@ -70,7 +72,7 @@ public long ramBytesUsed() { if (bytesArray != null) { size += bytesArray.length; } else { - size += bytes.ramBytesUsed(); + size += byteBuffersReader.ramBytesUsed(); } return size; } @@ -80,14 +82,14 @@ public FST.BytesReader getReverseBytesReader() { if (bytesArray != null) { return new ReverseBytesReader(bytesArray); } else { - return bytes.getReverseBytesReader(); + return byteBuffersReader.getReverseBytesReader(); } } @Override public void writeTo(DataOutput out) throws IOException { - if (bytes != null) { - bytes.writeTo(out); + if (byteBuffersReader != null) { + byteBuffersReader.writeTo(out); } else { assert bytesArray != null; out.writeBytes(bytesArray, 0, bytesArray.length); diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestBytesStore.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestBytesStore.java index 6190b903c7d5..3182acc1f11a 100644 --- a/lucene/core/src/test/org/apache/lucene/util/fst/TestBytesStore.java +++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestBytesStore.java @@ -16,6 +16,7 @@ */ package org.apache.lucene.util.fst; +import java.io.ByteArrayOutputStream; import java.io.IOException; import java.util.Arrays; import org.apache.lucene.store.ByteArrayDataInput; @@ -23,6 +24,7 @@ import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; +import 
org.apache.lucene.store.OutputStreamDataOutput; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.util.ArrayUtil; @@ -36,16 +38,14 @@ public void testRandom() throws Exception { for (int iter = 0; iter < iters; iter++) { final int numBytes = TestUtil.nextInt(random(), 1, maxBytes); final byte[] expected = new byte[numBytes]; - final int blockBits = TestUtil.nextInt(random(), 8, 15); - final BytesStore bytes = new BytesStore(blockBits); + final BytesStore bytes = new BytesStore(); if (VERBOSE) { - System.out.println( - "TEST: iter=" + iter + " numBytes=" + numBytes + " blockBits=" + blockBits); + System.out.println("TEST: iter=" + iter + " numBytes=" + numBytes); } int pos = 0; while (pos < numBytes) { - int op = random().nextInt(8); + int op = random().nextInt(7); if (VERBOSE) { System.out.println(" cycle pos=" + pos); } @@ -79,40 +79,13 @@ public void testRandom() throws Exception { break; case 2: - { - // write int @ absolute pos - if (pos > 4) { - int x = random().nextInt(); - int randomPos = random().nextInt(pos - 4); - if (VERBOSE) { - System.out.println(" abs writeInt pos=" + randomPos + " x=" + x); - } - bytes.writeInt(randomPos, x); - expected[randomPos++] = (byte) (x >> 24); - expected[randomPos++] = (byte) (x >> 16); - expected[randomPos++] = (byte) (x >> 8); - expected[randomPos++] = (byte) x; - } - } - break; - - case 3: { // reverse bytes if (pos > 1) { - int len = TestUtil.nextInt(random(), 2, Math.min(100, pos)); - int start; - if (len == pos) { - start = 0; - } else { - start = random().nextInt(pos - len); - } - int end = start + len - 1; - if (VERBOSE) { - System.out.println( - " reverse start=" + start + " end=" + end + " len=" + len + " pos=" + pos); - } - bytes.reverse(start, end); + bytes.reverse(); + + int start = 0; + int end = bytes.getPosition() - 1; while (start <= end) { byte b = expected[end]; @@ -125,7 +98,7 @@ public void testRandom() throws Exception { } break; - 
case 4: + case 3: { // abs write random byte[] if (pos > 2) { @@ -148,7 +121,7 @@ public void testRandom() throws Exception { } break; - case 5: + case 4: { // copyBytes if (pos > 1) { @@ -164,7 +137,7 @@ public void testRandom() throws Exception { } break; - case 6: + case 5: { // skip int len = random().nextInt(Math.min(100, numBytes - pos)); @@ -185,7 +158,7 @@ public void testRandom() throws Exception { } break; - case 7: + case 6: { // absWriteByte if (pos > 0) { @@ -227,7 +200,7 @@ public void testRandom() throws Exception { bytes.writeTo(out); out.close(); IndexInput in = dir.openInput("bytes", IOContext.DEFAULT); - bytesToVerify = new BytesStore(TestUtil.nextInt(random(), 8, 20)); + bytesToVerify = new BytesStore(); bytesToVerify.copyBytes(in, numBytes); in.close(); dir.close(); @@ -246,8 +219,7 @@ public void testCopyBytesOnByteStore() throws IOException { int offset = TestUtil.nextInt(random(), 0, 100); int len = bytes.length - offset; ByteArrayDataInput in = new ByteArrayDataInput(bytes, offset, len); - final int blockBits = TestUtil.nextInt(random(), 8, 15); - final BytesStore o = new BytesStore(blockBits); + final BytesStore o = new BytesStore(); o.copyBytes(in, len); o.copyBytes(0, bytesout, 0, len); assertArrayEquals( @@ -265,128 +237,14 @@ private void verify(BytesStore bytes, byte[] expected, int totalLength) throws E } // First verify whole thing in one blast: - byte[] actual = new byte[totalLength]; - if (random().nextBoolean()) { - if (VERBOSE) { - System.out.println(" bulk: reversed"); - } - // reversed - FST.BytesReader r = bytes.getReverseBytesReader(); - r.setPosition(totalLength - 1); - r.readBytes(actual, 0, actual.length); - int start = 0; - int end = totalLength - 1; - while (start < end) { - byte b = actual[start]; - actual[start] = actual[end]; - actual[end] = b; - start++; - end--; - } - } else { - // forward - if (VERBOSE) { - System.out.println(" bulk: forward"); - } - FST.BytesReader r = bytes.getForwardReader(); - 
r.readBytes(actual, 0, actual.length); - } + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + bytes.writeTo(new OutputStreamDataOutput(baos)); + byte[] actual = baos.toByteArray(); + + assertEquals(totalLength, actual.length); for (int i = 0; i < totalLength; i++) { assertEquals("byte @ index=" + i, expected[i], actual[i]); } - - FST.BytesReader r; - - // Then verify ops: - boolean reversed = random().nextBoolean(); - if (reversed) { - if (VERBOSE) { - System.out.println(" ops: reversed"); - } - r = bytes.getReverseBytesReader(); - } else { - if (VERBOSE) { - System.out.println(" ops: forward"); - } - r = bytes.getForwardReader(); - } - - if (totalLength > 1) { - int numOps = TestUtil.nextInt(random(), 100, 200); - for (int op = 0; op < numOps; op++) { - - int numBytes = random().nextInt(Math.min(1000, totalLength - 1)); - int pos; - if (reversed) { - pos = TestUtil.nextInt(random(), numBytes, totalLength - 1); - } else { - pos = random().nextInt(totalLength - numBytes); - } - if (VERBOSE) { - System.out.println( - " op iter=" - + op - + " reversed=" - + reversed - + " numBytes=" - + numBytes - + " pos=" - + pos); - } - byte[] temp = new byte[numBytes]; - r.setPosition(pos); - assertEquals(pos, r.getPosition()); - r.readBytes(temp, 0, temp.length); - for (int i = 0; i < numBytes; i++) { - byte expectedByte; - if (reversed) { - expectedByte = expected[pos - i]; - } else { - expectedByte = expected[pos + i]; - } - assertEquals("byte @ index=" + i, expectedByte, temp[i]); - } - - int left; - int expectedPos; - - if (reversed) { - expectedPos = pos - numBytes; - left = (int) r.getPosition(); - } else { - expectedPos = pos + numBytes; - left = (int) (totalLength - r.getPosition()); - } - assertEquals(expectedPos, r.getPosition()); - - if (left > 4) { - int skipBytes = random().nextInt(left - 4); - - int expectedInt = 0; - if (reversed) { - expectedPos -= skipBytes; - expectedInt |= (expected[expectedPos--] & 0xFF); - expectedInt |= (expected[expectedPos--] & 
0xFF) << 8; - expectedInt |= (expected[expectedPos--] & 0xFF) << 16; - expectedInt |= (expected[expectedPos--] & 0xFF) << 24; - } else { - expectedPos += skipBytes; - expectedInt |= (expected[expectedPos++] & 0xFF); - expectedInt |= (expected[expectedPos++] & 0xFF) << 8; - expectedInt |= (expected[expectedPos++] & 0xFF) << 16; - expectedInt |= (expected[expectedPos++] & 0xFF) << 24; - } - - if (VERBOSE) { - System.out.println(" skip numBytes=" + skipBytes); - System.out.println(" readInt"); - } - - r.skipBytes(skipBytes); - assertEquals(expectedInt, r.readInt()); - } - } - } } } diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTDataOutputWriter.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTDataOutputWriter.java new file mode 100644 index 000000000000..130ba24f1d31 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTDataOutputWriter.java @@ -0,0 +1,231 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.util.fst; + +import static org.apache.lucene.tests.util.fst.FSTTester.toIntsRef; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Random; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.InputStreamDataInput; +import org.apache.lucene.store.OutputStreamDataOutput; +import org.apache.lucene.tests.store.MockDirectoryWrapper; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.tests.util.TestUtil; +import org.apache.lucene.tests.util.fst.FSTTester; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; + +public class TestFSTDataOutputWriter extends LuceneTestCase { + + private MockDirectoryWrapper dir; + + @Override + public void setUp() throws Exception { + super.setUp(); + dir = newMockDirectory(); + } + + @Override + public void tearDown() throws Exception { + // can be null if we force simpletext (funky, some kind of bug in test runner maybe) + if (dir != null) { + dir.close(); + } + super.tearDown(); + } + + public void testRandom() throws Exception { + + final int iters = atLeast(10); + final int maxBytes = TEST_NIGHTLY ? 
200000 : 20000; + for (int iter = 0; iter < iters; iter++) { + final int numBytes = TestUtil.nextInt(random(), 1, maxBytes); + final byte[] expected = new byte[numBytes]; + final ByteArrayOutputStream baos = new ByteArrayOutputStream(); + final DataOutput dataOutput = new OutputStreamDataOutput(baos); + if (VERBOSE) { + System.out.println("TEST: iter=" + iter + " numBytes=" + numBytes); + } + + int pos = 0; + while (pos < numBytes) { + int op = random().nextInt(2); + if (VERBOSE) { + System.out.println(" cycle pos=" + pos); + } + switch (op) { + case 0: + { + // write random byte + byte b = (byte) random().nextInt(256); + if (VERBOSE) { + System.out.println(" writeByte b=" + b); + } + + expected[pos++] = b; + dataOutput.writeByte(b); + } + break; + + case 1: + { + // write random byte[] + int len = random().nextInt(Math.min(numBytes - pos, 100)); + byte[] temp = new byte[len]; + random().nextBytes(temp); + if (VERBOSE) { + System.out.println(" writeBytes len=" + len + " bytes=" + Arrays.toString(temp)); + } + System.arraycopy(temp, 0, expected, pos, temp.length); + dataOutput.writeBytes(temp, 0, temp.length); + pos += len; + } + break; + } + + assertEquals(pos, baos.toByteArray().length); + } + for (int i = 0; i < numBytes; i++) { + assertEquals("byte @ index=" + i, expected[i], baos.toByteArray()[i]); + } + } + } + + public void testBasicFSA() throws IOException { + String[] strings2 = + new String[] { + "station", "commotion", "elation", "elastic", "plastic", "stop", "ftop", "ftation" + }; + IntsRef[] terms2 = new IntsRef[strings2.length]; + // we will also test writing multiple FST to a single byte array + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + for (int inputMode = 0; inputMode < 2; inputMode++) { + if (VERBOSE) { + System.out.println("TEST: inputMode=" + inputModeToString(inputMode)); + } + + for (int idx = 0; idx < strings2.length; idx++) { + terms2[idx] = toIntsRef(strings2[idx], inputMode); + } + Arrays.sort(terms2); + + // Test 
pre-determined FST sizes to make sure we haven't lost minimality (at least on this + // trivial set of terms): + + // FSA + { + final Outputs outputs = NoOutputs.getSingleton(); + final Object NO_OUTPUT = outputs.getNoOutput(); + final List> pairs = new ArrayList<>(terms2.length); + for (IntsRef term : terms2) { + pairs.add(new FSTTester.InputOutput<>(term, NO_OUTPUT)); + } + FSTTester tester = + new DataOutputFSTTester<>(random(), dir, inputMode, pairs, outputs, baos); + FST fst = tester.doTest(); + assertNotNull(fst); + assertEquals(22, tester.nodeCount); + assertEquals(27, tester.arcCount); + } + + // FST ord pos int + { + final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); + final List> pairs = new ArrayList<>(terms2.length); + for (int idx = 0; idx < terms2.length; idx++) { + pairs.add(new FSTTester.InputOutput<>(terms2[idx], (long) idx)); + } + FSTTester tester = + new DataOutputFSTTester<>(random(), dir, inputMode, pairs, outputs, baos); + final FST fst = tester.doTest(); + assertNotNull(fst); + assertEquals(22, tester.nodeCount); + assertEquals(27, tester.arcCount); + } + + // FST byte sequence ord + { + final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); + final List> pairs = new ArrayList<>(terms2.length); + for (int idx = 0; idx < terms2.length; idx++) { + final BytesRef output = newBytesRef(Integer.toString(idx)); + pairs.add(new FSTTester.InputOutput<>(terms2[idx], output)); + } + FSTTester tester = + new DataOutputFSTTester<>(random(), dir, inputMode, pairs, outputs, baos); + final FST fst = tester.doTest(); + assertNotNull(fst); + assertEquals(24, tester.nodeCount); + assertEquals(30, tester.arcCount); + } + } + } + + class DataOutputFSTTester extends FSTTester { + + private final ByteArrayOutputStream baos; + private int previousOffset; + + public DataOutputFSTTester( + Random random, + Directory dir, + int inputMode, + List> pairs, + Outputs outputs, + ByteArrayOutputStream baos) { + super(random, dir, 
inputMode, pairs, outputs); + this.baos = baos; + } + + @Override + protected FSTCompiler.Builder getFSTBuilder() { + // as the byte array could already contain another FST bytes, we should get the current offset + // to know where to start reading from + this.previousOffset = baos.size(); + return super.getFSTBuilder().dataOutput(new OutputStreamDataOutput(baos)); + } + + @Override + protected FST compile(FSTCompiler fstCompiler) throws IOException { + FST fst = fstCompiler.compile(); + assertFalse(fst.hasReverseBytesReader()); + + // the returned FST is not readable thus we need to reconstruct one with FSTStore + DataInput dataIn = + new InputStreamDataInput( + new ByteArrayInputStream( + baos.toByteArray(), previousOffset, baos.size() - previousOffset)); + return new FST<>(fst.getMetadata(), dataIn, outputs, new OnHeapFSTStore(5)); + } + } + + String inputModeToString(int mode) { + if (mode == 0) { + return "utf8"; + } else { + return "utf32"; + } + } +} diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTDirectAddressing.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTDirectAddressing.java index 0fc6cb9c7509..09f3d4f10392 100644 --- a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTDirectAddressing.java +++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTDirectAddressing.java @@ -65,7 +65,7 @@ public void testDeDupTails() throws Exception { } entries.add(new BytesRef(b)); } - long size = buildFST(entries).ramBytesUsed(); + long size = buildFST(entries).numBytes(); // Size is 1648 when we use only list-encoding. We were previously failing to ever de-dup // direct addressing, which led this case to blow up. // This test will fail if there is more than 1% size increase with direct addressing. 
diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java index 188774014b90..ea63e4233820 100644 --- a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java +++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java @@ -521,7 +521,8 @@ private abstract static class VisitTerms { private final FSTCompiler fstCompiler; public VisitTerms( - Path dirOut, Path wordsFileIn, int inputMode, Outputs outputs, boolean noArcArrays) { + Path dirOut, Path wordsFileIn, int inputMode, Outputs outputs, boolean noArcArrays) + throws IOException { this.dirOut = dirOut; this.wordsFileIn = wordsFileIn; this.inputMode = inputMode; diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/util/fst/FSTTester.java b/lucene/test-framework/src/java/org/apache/lucene/tests/util/fst/FSTTester.java index 974e575b3dcf..0e0019386221 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/util/fst/FSTTester.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/util/fst/FSTTester.java @@ -255,10 +255,7 @@ private T randomAcceptedWord(FST fst, IntsRefBuilder in) throws IOException { public FST doTest() throws IOException { - final FSTCompiler fstCompiler = - new FSTCompiler.Builder<>( - inputMode == 0 ? 
FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, outputs) - .build(); + final FSTCompiler fstCompiler = getFSTBuilder().build(); for (InputOutput pair : pairs) { if (pair.output instanceof List) { @@ -273,7 +270,7 @@ public FST doTest() throws IOException { fstCompiler.add(pair.input, pair.output); } } - FST fst = fstCompiler.compile(); + FST fst = compile(fstCompiler); if (random.nextBoolean() && fst != null) { IOContext context = LuceneTestCase.newIOContext(random); @@ -316,6 +313,15 @@ public FST doTest() throws IOException { return fst; } + protected FST compile(FSTCompiler fstCompiler) throws IOException { + return fstCompiler.compile(); + } + + protected FSTCompiler.Builder getFSTBuilder() { + return new FSTCompiler.Builder<>( + inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, outputs); + } + protected boolean outputsEqual(T a, T b) { return a.equals(b); } From 13c9359cd1470005607bb40b72989e1105efeab9 Mon Sep 17 00:00:00 2001 From: dungba88 Date: Wed, 15 Nov 2023 14:57:41 +0900 Subject: [PATCH 04/45] access BytesStore byte[] directly for copying --- .../java/org/apache/lucene/util/fst/BytesStore.java | 2 +- .../src/java/org/apache/lucene/util/fst/NodeHash.java | 10 ++++------ .../src/test/org/apache/lucene/util/fst/TestFSTs.java | 3 +-- .../test/org/apache/lucene/util/fst/TestNodeHash.java | 2 +- 4 files changed, 7 insertions(+), 10 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java b/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java index 5ff6adfac790..8842f7690260 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java @@ -32,7 +32,7 @@ class BytesStore extends DataOutput implements Accountable { private static final int INITIAL_SIZE = 1 << 8; // holds an initial size of 256 bytes. 
this byte array will only grow, but not shrink - private byte[] bytes = new byte[INITIAL_SIZE]; + byte[] bytes = new byte[INITIAL_SIZE]; private int nextWrite; diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java b/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java index 873d7eacfa4c..30af3353eb91 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java @@ -132,15 +132,13 @@ public long add(FSTCompiler.UnCompiledNode nodeIn) throws IOException { // freeze & add nodeAddress = fstCompiler.addNode(nodeIn); - // TODO: Write the bytes directly from BytesStore // we use 0 as empty marker in hash table, so it better be impossible to get a frozen node // at 0: assert nodeAddress != FST.FINAL_END_NODE && nodeAddress != FST.NON_FINAL_END_NODE; - byte[] buf = new byte[Math.toIntExact(fstCompiler.scratchBytes.getPosition())]; - fstCompiler.scratchBytes.copyBytes(0, buf, 0, buf.length); primaryTable.setNodeAddress(hashSlot, nodeAddress); - primaryTable.copyNodeBytes(hashSlot, buf); + primaryTable.copyNodeBytes( + hashSlot, fstCompiler.scratchBytes.bytes, fstCompiler.scratchBytes.getPosition()); // confirm frozen hash and unfrozen hash are the same assert primaryTable.hash(nodeAddress, hashSlot) == hash @@ -299,9 +297,9 @@ public void setNodeAddress(long hashSlot, long nodeAddress) { } /** copy the node bytes from the FST */ - void copyNodeBytes(long hashSlot, byte[] bytes) { + void copyNodeBytes(long hashSlot, byte[] bytes, int length) { assert copiedNodeAddress.get(hashSlot) == 0; - copiedNodes.append(bytes); + copiedNodes.append(bytes, 0, length); // write the offset, which points to the last byte of the node we copied since we later read // this node in reverse copiedNodeAddress.set(hashSlot, copiedNodes.getPosition() - 1); diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java 
b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java index ea63e4233820..188774014b90 100644 --- a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java +++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java @@ -521,8 +521,7 @@ private abstract static class VisitTerms { private final FSTCompiler fstCompiler; public VisitTerms( - Path dirOut, Path wordsFileIn, int inputMode, Outputs outputs, boolean noArcArrays) - throws IOException { + Path dirOut, Path wordsFileIn, int inputMode, Outputs outputs, boolean noArcArrays) { this.dirOut = dirOut; this.wordsFileIn = wordsFileIn; this.inputMode = inputMode; diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestNodeHash.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestNodeHash.java index 4c9b531279ac..8319f20efea3 100644 --- a/lucene/core/src/test/org/apache/lucene/util/fst/TestNodeHash.java +++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestNodeHash.java @@ -30,7 +30,7 @@ public void testCopyFallbackNodeBytes() { int nodeLength = atLeast(500); long fallbackHashSlot = 1; byte[] fallbackBytes = RandomBytes.randomBytesOfLength(random(), nodeLength); - fallbackHashTable.copyNodeBytes(fallbackHashSlot, fallbackBytes); + fallbackHashTable.copyNodeBytes(fallbackHashSlot, fallbackBytes, nodeLength); // check if the bytes we wrote are the same as the original bytes byte[] storedBytes = fallbackHashTable.getBytes(fallbackHashSlot, nodeLength); From fa08d515080284097c162e70a87cb10834e6e940 Mon Sep 17 00:00:00 2001 From: dungba88 Date: Wed, 15 Nov 2023 20:40:04 +0900 Subject: [PATCH 05/45] Rename BytesStore --- .../java/org/apache/lucene/util/fst/FST.java | 5 +-- .../apache/lucene/util/fst/FSTCompiler.java | 2 +- ....java => GrowableByteArrayDataOutput.java} | 37 ++++++++++--------- ...a => TestGrowableByteArrayDataOutput.java} | 13 ++++--- 4 files changed, 28 insertions(+), 29 deletions(-) rename lucene/core/src/java/org/apache/lucene/util/fst/{BytesStore.java => 
GrowableByteArrayDataOutput.java} (94%) rename lucene/core/src/test/org/apache/lucene/util/fst/{TestBytesStore.java => TestGrowableByteArrayDataOutput.java} (94%) diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java index 77c26542ca8e..96cd8a6d8694 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java @@ -124,10 +124,7 @@ public enum INPUT_TYPE { /** If arc has this label then that arc is final/accepted */ public static final int END_LABEL = -1; - /** - * A {@link BytesStore}, used during building, or during reading when the FST is very large (more - * than 1 GB). If the FST is less than 1 GB then bytesArray is set instead. - */ + /** The reader of the FST */ private final FSTReader fstReader; public final Outputs outputs; diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java index a59a6990b6ac..7c3eaae3a77b 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java @@ -126,7 +126,7 @@ public class FSTCompiler { final DataOutput dataOutput; // buffer to store bytes for the one node we are currently writing - final BytesStore scratchBytes = new BytesStore(); + final GrowableByteArrayDataOutput scratchBytes = new GrowableByteArrayDataOutput(); private long numBytesWritten; diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java b/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java similarity index 94% rename from lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java rename to lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java index 8842f7690260..e52421788247 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java +++ 
b/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java @@ -23,11 +23,12 @@ import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.RamUsageEstimator; -// Storing a byte[] for the current node of the FST we are writing -class BytesStore extends DataOutput implements Accountable { +// Storing a byte[] for the current node of the FST we are writing. The byte[] will only grow, never +// shrink. +class GrowableByteArrayDataOutput extends DataOutput implements Accountable { private static final long BASE_RAM_BYTES_USED = - RamUsageEstimator.shallowSizeOfInstance(BytesStore.class); + RamUsageEstimator.shallowSizeOfInstance(GrowableByteArrayDataOutput.class); private static final int INITIAL_SIZE = 1 << 8; @@ -36,12 +37,6 @@ class BytesStore extends DataOutput implements Accountable { private int nextWrite; - /** Absolute write byte; you must ensure dest is < max position written so far. */ - public void writeByte(int dest, byte b) { - assert dest < nextWrite; - bytes[dest] = b; - } - @Override public void writeByte(byte b) { ensureCapacity(1); @@ -64,15 +59,6 @@ private void ensureCapacity(int capacityToWrite) { bytes = ArrayUtil.grow(bytes, nextWrite + capacityToWrite); } - /** - * Absolute writeBytes without changing the current position. Note: this cannot "grow" the bytes, - * so you must only call it on already written parts. - */ - void writeBytes(int dest, byte[] b, int offset, int len) { - assert dest + len <= getPosition() : "dest=" + dest + " pos=" + getPosition() + " len=" + len; - System.arraycopy(b, offset, bytes, dest, len); - } - @Override public void copyBytes(DataInput input, long numBytes) throws IOException { assert numBytes >= 0 : "numBytes=" + numBytes; @@ -83,6 +69,21 @@ public void copyBytes(DataInput input, long numBytes) throws IOException { nextWrite += length; } + /** Absolute write byte; you must ensure dest is < max position written so far. 
*/ + public void writeByte(int dest, byte b) { + assert dest < nextWrite; + bytes[dest] = b; + } + + /** + * Absolute writeBytes without changing the current position. Note: this cannot "grow" the bytes, + * so you must only call it on already written parts. + */ + public void writeBytes(int dest, byte[] b, int offset, int len) { + assert dest + len <= getPosition() : "dest=" + dest + " pos=" + getPosition() + " len=" + len; + System.arraycopy(b, offset, bytes, dest, len); + } + /** * Absolute copy bytes self to self, without changing the position. Note: this cannot "grow" the * bytes, so must only call it on already written parts. diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestBytesStore.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestGrowableByteArrayDataOutput.java similarity index 94% rename from lucene/core/src/test/org/apache/lucene/util/fst/TestBytesStore.java rename to lucene/core/src/test/org/apache/lucene/util/fst/TestGrowableByteArrayDataOutput.java index 3182acc1f11a..573cd19beee1 100644 --- a/lucene/core/src/test/org/apache/lucene/util/fst/TestBytesStore.java +++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestGrowableByteArrayDataOutput.java @@ -29,7 +29,7 @@ import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.util.ArrayUtil; -public class TestBytesStore extends LuceneTestCase { +public class TestGrowableByteArrayDataOutput extends LuceneTestCase { public void testRandom() throws Exception { @@ -38,7 +38,7 @@ public void testRandom() throws Exception { for (int iter = 0; iter < iters; iter++) { final int numBytes = TestUtil.nextInt(random(), 1, maxBytes); final byte[] expected = new byte[numBytes]; - final BytesStore bytes = new BytesStore(); + final GrowableByteArrayDataOutput bytes = new GrowableByteArrayDataOutput(); if (VERBOSE) { System.out.println("TEST: iter=" + iter + " numBytes=" + numBytes); } @@ -189,7 +189,7 @@ public void testRandom() throws Exception { } } - BytesStore bytesToVerify; 
+ GrowableByteArrayDataOutput bytesToVerify; if (random().nextBoolean()) { if (VERBOSE) { @@ -200,7 +200,7 @@ public void testRandom() throws Exception { bytes.writeTo(out); out.close(); IndexInput in = dir.openInput("bytes", IOContext.DEFAULT); - bytesToVerify = new BytesStore(); + bytesToVerify = new GrowableByteArrayDataOutput(); bytesToVerify.copyBytes(in, numBytes); in.close(); dir.close(); @@ -219,7 +219,7 @@ public void testCopyBytesOnByteStore() throws IOException { int offset = TestUtil.nextInt(random(), 0, 100); int len = bytes.length - offset; ByteArrayDataInput in = new ByteArrayDataInput(bytes, offset, len); - final BytesStore o = new BytesStore(); + final GrowableByteArrayDataOutput o = new GrowableByteArrayDataOutput(); o.copyBytes(in, len); o.copyBytes(0, bytesout, 0, len); assertArrayEquals( @@ -227,7 +227,8 @@ public void testCopyBytesOnByteStore() throws IOException { ArrayUtil.copyOfSubArray(bytes, offset, offset + len)); } - private void verify(BytesStore bytes, byte[] expected, int totalLength) throws Exception { + private void verify(GrowableByteArrayDataOutput bytes, byte[] expected, int totalLength) + throws Exception { assertEquals(totalLength, bytes.getPosition()); if (totalLength == 0) { return; From c6fb4b5f2de760583f01fd49d9c249afdfd73b0d Mon Sep 17 00:00:00 2001 From: dungba88 Date: Wed, 15 Nov 2023 20:46:52 +0900 Subject: [PATCH 06/45] Change class to final --- .../org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java b/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java index e52421788247..c6e97cb0efff 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java @@ -25,7 +25,7 @@ // Storing a byte[] for the current node of the FST we 
are writing. The byte[] will only grow, never // shrink. -class GrowableByteArrayDataOutput extends DataOutput implements Accountable { +final class GrowableByteArrayDataOutput extends DataOutput implements Accountable { private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(GrowableByteArrayDataOutput.class); From f1e8f8996d8c8b02c3cd1366a69f6a72f08e038e Mon Sep 17 00:00:00 2001 From: dungba88 Date: Thu, 16 Nov 2023 11:14:18 +0900 Subject: [PATCH 07/45] Reorder methods --- .../util/fst/GrowableByteArrayDataOutput.java | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java b/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java index c6e97cb0efff..7cd560ea22eb 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java @@ -50,15 +50,6 @@ public void writeBytes(byte[] b, int offset, int len) { nextWrite += len; } - /** - * Ensure we can write additional capacityToWrite bytes. The array will preferably grow x2 size. - * - * @param capacityToWrite the additional bytes to write - */ - private void ensureCapacity(int capacityToWrite) { - bytes = ArrayUtil.grow(bytes, nextWrite + capacityToWrite); - } - @Override public void copyBytes(DataInput input, long numBytes) throws IOException { assert numBytes >= 0 : "numBytes=" + numBytes; @@ -69,6 +60,21 @@ public void copyBytes(DataInput input, long numBytes) throws IOException { nextWrite += length; } + /** Skip a number of bytes, increasing capacity if needed */ + public void skipBytes(int len) { + ensureCapacity(len); + nextWrite += len; + } + + /** + * Ensure we can write additional capacityToWrite bytes. 
+ * + * @param capacityToWrite the additional bytes to write + */ + private void ensureCapacity(int capacityToWrite) { + bytes = ArrayUtil.grow(bytes, nextWrite + capacityToWrite); + } + /** Absolute write byte; you must ensure dest is < max position written so far. */ public void writeByte(int dest, byte b) { assert dest < nextWrite; @@ -110,11 +116,6 @@ public void reverse() { } } - public void skipBytes(int len) { - ensureCapacity(len); - nextWrite += len; - } - public int getPosition() { return nextWrite; } From 2f9c7300c5f92711cc396d45591c5e4e795383bb Mon Sep 17 00:00:00 2001 From: dungba88 Date: Thu, 16 Nov 2023 11:23:22 +0900 Subject: [PATCH 08/45] Remove unused methods --- .../util/fst/GrowableByteArrayDataOutput.java | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java b/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java index 7cd560ea22eb..81bf3de73c46 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java @@ -50,16 +50,6 @@ public void writeBytes(byte[] b, int offset, int len) { nextWrite += len; } - @Override - public void copyBytes(DataInput input, long numBytes) throws IOException { - assert numBytes >= 0 : "numBytes=" + numBytes; - assert input != null; - int length = Math.toIntExact(numBytes); - ensureCapacity(length); - input.readBytes(bytes, nextWrite, length); - nextWrite += length; - } - /** Skip a number of bytes, increasing capacity if needed */ public void skipBytes(int len) { ensureCapacity(len); @@ -125,8 +115,7 @@ public int getPosition() { * this! 
*/ public void truncate(int newLen) { - assert newLen <= getPosition(); - assert newLen >= 0; + assert newLen >= 0 && newLen <= getPosition(); nextWrite = newLen; } From 847828d1bf38e4449a261d9bc7e4fa6127bef503 Mon Sep 17 00:00:00 2001 From: dungba88 Date: Thu, 16 Nov 2023 13:45:58 +0900 Subject: [PATCH 09/45] Rename truncate to setPosition() and remove skipBytes() --- .../apache/lucene/util/fst/FSTCompiler.java | 16 ++----- .../util/fst/GrowableByteArrayDataOutput.java | 48 ++++++++----------- .../fst/TestGrowableByteArrayDataOutput.java | 25 +--------- 3 files changed, 25 insertions(+), 64 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java index 7c3eaae3a77b..130e4a04ee3c 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java @@ -380,10 +380,7 @@ long addNode(FSTCompiler.UnCompiledNode nodeIn) throws IOException { } } // reset the scratch writer to prepare for new write - scratchBytes.truncate(0); - - // the scratch writer must be cleaned at this point - assert scratchBytes.getPosition() == 0; + scratchBytes.setPosition(0); final boolean doFixedLengthArcs = shouldExpandNodeWithFixedLengthArcs(nodeIn); if (doFixedLengthArcs) { @@ -615,7 +612,7 @@ private void writeNodeForBinarySearch(FSTCompiler.UnCompiledNode nodeIn, int int destPos = headerLen + nodeIn.numArcs * maxBytesPerArc; assert destPos >= srcPos; if (destPos > srcPos) { - scratchBytes.skipBytes(destPos - srcPos); + scratchBytes.setPosition(destPos); for (int arcIdx = nodeIn.numArcs - 1; arcIdx >= 0; arcIdx--) { destPos -= maxBytesPerArc; int arcLen = numBytesPerArc[arcIdx]; @@ -693,12 +690,7 @@ private void writeNodeForDirectAddressingOrContinuous( // Prepare the builder byte store. Enlarge or truncate if needed. 
int nodeEnd = headerLen + numPresenceBytes + totalArcBytes; - int currentPosition = scratchBytes.getPosition(); - if (nodeEnd >= currentPosition) { - scratchBytes.skipBytes(nodeEnd - currentPosition); - } else { - scratchBytes.truncate(nodeEnd); - } + scratchBytes.setPosition(nodeEnd); assert scratchBytes.getPosition() == nodeEnd; // Write the header. @@ -910,7 +902,7 @@ void finish(long newStartNode) { } fst.metadata.startNode = newStartNode; fst.metadata.numBytes = numBytesWritten; - scratchBytes.truncate(0); + scratchBytes.setPosition(0); } private boolean validOutput(T output) { diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java b/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java index 81bf3de73c46..0d2d25dd9567 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java @@ -17,7 +17,6 @@ package org.apache.lucene.util.fst; import java.io.IOException; -import org.apache.lucene.store.DataInput; import org.apache.lucene.store.DataOutput; import org.apache.lucene.util.Accountable; import org.apache.lucene.util.ArrayUtil; @@ -50,10 +49,19 @@ public void writeBytes(byte[] b, int offset, int len) { nextWrite += len; } - /** Skip a number of bytes, increasing capacity if needed */ - public void skipBytes(int len) { - ensureCapacity(len); - nextWrite += len; + public int getPosition() { + return nextWrite; + } + + /** + * Set the position of the byte[], increasing the capacity if needed + */ + public void setPosition(int newLen) { + assert newLen >= 0; + if (newLen > nextWrite) { + ensureCapacity(newLen - nextWrite); + } + nextWrite = newLen; } /** @@ -65,6 +73,11 @@ private void ensureCapacity(int capacityToWrite) { bytes = ArrayUtil.grow(bytes, nextWrite + capacityToWrite); } + /** Writes all of our bytes to the target {@link DataOutput}. 
*/ + public void writeTo(DataOutput out) throws IOException { + out.writeBytes(bytes, 0, nextWrite); + } + /** Absolute write byte; you must ensure dest is < max position written so far. */ public void writeByte(int dest, byte b) { assert dest < nextWrite; @@ -76,7 +89,7 @@ public void writeByte(int dest, byte b) { * so you must only call it on already written parts. */ public void writeBytes(int dest, byte[] b, int offset, int len) { - assert dest + len <= getPosition() : "dest=" + dest + " pos=" + getPosition() + " len=" + len; + assert dest + len <= nextWrite : "dest=" + dest + " pos=" + nextWrite + " len=" + len; System.arraycopy(b, offset, bytes, dest, len); } @@ -106,31 +119,8 @@ public void reverse() { } } - public int getPosition() { - return nextWrite; - } - - /** - * Pos must be less than the max position written so far! Ie, you cannot "grow" the file with - * this! - */ - public void truncate(int newLen) { - assert newLen >= 0 && newLen <= getPosition(); - nextWrite = newLen; - } - - /** Writes all of our bytes to the target {@link DataOutput}. 
*/ - public void writeTo(DataOutput out) throws IOException { - out.writeBytes(bytes, 0, nextWrite); - } - @Override public long ramBytesUsed() { return BASE_RAM_BYTES_USED + RamUsageEstimator.sizeOf(bytes); } - - @Override - public String toString() { - return getClass().getSimpleName() + "(pos=" + nextWrite + ")"; - } } diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestGrowableByteArrayDataOutput.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestGrowableByteArrayDataOutput.java index 573cd19beee1..ddca36ba5ce4 100644 --- a/lucene/core/src/test/org/apache/lucene/util/fst/TestGrowableByteArrayDataOutput.java +++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestGrowableByteArrayDataOutput.java @@ -45,7 +45,7 @@ public void testRandom() throws Exception { int pos = 0; while (pos < numBytes) { - int op = random().nextInt(7); + int op = random().nextInt(6); if (VERBOSE) { System.out.println(" cycle pos=" + pos); } @@ -138,27 +138,6 @@ public void testRandom() throws Exception { break; case 5: - { - // skip - int len = random().nextInt(Math.min(100, numBytes - pos)); - - if (VERBOSE) { - System.out.println(" skip len=" + len); - } - - pos += len; - bytes.skipBytes(len); - - // NOTE: must fill in zeros in case truncate was - // used, else we get false fails: - if (len > 0) { - byte[] zeros = new byte[len]; - bytes.writeBytes(pos - len, zeros, 0, len); - } - } - break; - - case 6: { // absWriteByte if (pos > 0) { @@ -176,7 +155,7 @@ public void testRandom() throws Exception { if (pos > 0 && random().nextInt(50) == 17) { // truncate int len = TestUtil.nextInt(random(), 1, Math.min(pos, 100)); - bytes.truncate(pos - len); + bytes.setPosition(pos - len); pos -= len; Arrays.fill(expected, pos, pos + len, (byte) 0); if (VERBOSE) { From 09130629c7cd3ec72c75f8e445928aa08604395e Mon Sep 17 00:00:00 2001 From: dungba88 Date: Thu, 16 Nov 2023 14:17:00 +0900 Subject: [PATCH 10/45] Simplify the writing operations --- 
.../apache/lucene/util/fst/FSTCompiler.java | 35 +++++++------------ .../util/fst/GrowableByteArrayDataOutput.java | 25 +++---------- .../fst/TestGrowableByteArrayDataOutput.java | 32 ++--------------- 3 files changed, 19 insertions(+), 73 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java index 130e4a04ee3c..e87bc557ef16 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java @@ -631,12 +631,11 @@ private void writeNodeForBinarySearch(FSTCompiler.UnCompiledNode nodeIn, int + arcLen + " nodeIn.numArcs=" + nodeIn.numArcs; - scratchBytes.copyBytes(srcPos, destPos, arcLen); + scratchBytes.writeBytes(destPos, scratchBytes.bytes, srcPos, arcLen); } } } - // Write the header. scratchBytes.writeBytes(0, fixedLengthArcsBuffer.getBytes(), 0, headerLen); } @@ -663,16 +662,16 @@ private void writeNodeForDirectAddressingOrContinuous( srcPos -= srcArcLen; int labelLen = numLabelBytesPerArc[arcIdx]; // Copy the flags. - scratchBytes.copyBytes(srcPos, buffer, bufferOffset, 1); + scratchBytes.writeTo(srcPos, buffer, bufferOffset, 1); // Skip the label, copy the remaining. int remainingArcLen = srcArcLen - 1 - labelLen; if (remainingArcLen != 0) { - scratchBytes.copyBytes(srcPos + 1 + labelLen, buffer, bufferOffset + 1, remainingArcLen); + scratchBytes.writeTo(srcPos + 1 + labelLen, buffer, bufferOffset + 1, remainingArcLen); } if (arcIdx == 0) { // Copy the label of the first arc only. bufferOffset -= labelLen; - scratchBytes.copyBytes(srcPos + 1, buffer, bufferOffset, labelLen); + scratchBytes.writeTo(srcPos + 1, buffer, bufferOffset, labelLen); } } assert bufferOffset == headerMaxLen + numPresenceBytes; @@ -688,29 +687,22 @@ private void writeNodeForDirectAddressingOrContinuous( maxBytesPerArcWithoutLabel); // maxBytesPerArcWithoutLabel instead of maxBytesPerArc. 
int headerLen = fixedLengthArcsBuffer.getPosition(); - // Prepare the builder byte store. Enlarge or truncate if needed. - int nodeEnd = headerLen + numPresenceBytes + totalArcBytes; - scratchBytes.setPosition(nodeEnd); - assert scratchBytes.getPosition() == nodeEnd; - // Write the header. - scratchBytes.writeBytes(0, fixedLengthArcsBuffer.getBytes(), 0, headerLen); - int writeOffset = headerLen; + scratchBytes.setPosition(0); + scratchBytes.writeBytes(fixedLengthArcsBuffer.getBytes(), 0, headerLen); // Write the presence bits if (continuous == false) { - writePresenceBits(nodeIn, writeOffset, numPresenceBytes); - writeOffset += numPresenceBytes; + writePresenceBits(nodeIn, numPresenceBytes); + assert scratchBytes.getPosition() - headerLen == numPresenceBytes; } // Write the first label and the arcs. - scratchBytes.writeBytes( - writeOffset, fixedLengthArcsBuffer.getBytes(), bufferOffset, totalArcBytes); + scratchBytes.writeBytes(fixedLengthArcsBuffer.getBytes(), bufferOffset, totalArcBytes); + assert scratchBytes.getPosition() == headerLen + numPresenceBytes + totalArcBytes; } - private void writePresenceBits( - FSTCompiler.UnCompiledNode nodeIn, int dest, int numPresenceBytes) { - int bytePos = dest; + private void writePresenceBits(FSTCompiler.UnCompiledNode nodeIn, int numPresenceBytes) { byte presenceBits = 1; // The first arc is always present. int presenceIndex = 0; int previousLabel = nodeIn.arcs[0].label; @@ -719,7 +711,7 @@ private void writePresenceBits( assert label > previousLabel; presenceIndex += label - previousLabel; while (presenceIndex >= Byte.SIZE) { - scratchBytes.writeByte(bytePos++, presenceBits); + scratchBytes.writeByte(presenceBits); presenceBits = 0; presenceIndex -= Byte.SIZE; } @@ -730,8 +722,7 @@ private void writePresenceBits( assert presenceIndex == (nodeIn.arcs[nodeIn.numArcs - 1].label - nodeIn.arcs[0].label) % 8; assert presenceBits != 0; // The last byte is not 0. 
assert (presenceBits & (1 << presenceIndex)) != 0; // The last arc is always present. - scratchBytes.writeByte(bytePos++, presenceBits); - assert bytePos - dest == numPresenceBytes; + scratchBytes.writeByte(presenceBits); } private void freezeTail(int prefixLenPlus1) throws IOException { diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java b/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java index 0d2d25dd9567..f2f02460a839 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java @@ -53,9 +53,7 @@ public int getPosition() { return nextWrite; } - /** - * Set the position of the byte[], increasing the capacity if needed - */ + /** Set the position of the byte[], increasing the capacity if needed */ public void setPosition(int newLen) { assert newLen >= 0; if (newLen > nextWrite) { @@ -78,10 +76,9 @@ public void writeTo(DataOutput out) throws IOException { out.writeBytes(bytes, 0, nextWrite); } - /** Absolute write byte; you must ensure dest is < max position written so far. */ - public void writeByte(int dest, byte b) { - assert dest < nextWrite; - bytes[dest] = b; + /** Copies bytes from this store to a target byte array. */ + public void writeTo(int src, byte[] dest, int offset, int len) { + System.arraycopy(bytes, src, dest, offset, len); } /** @@ -93,20 +90,6 @@ public void writeBytes(int dest, byte[] b, int offset, int len) { System.arraycopy(b, offset, bytes, dest, len); } - /** - * Absolute copy bytes self to self, without changing the position. Note: this cannot "grow" the - * bytes, so must only call it on already written parts. - */ - public void copyBytes(int src, int dest, int len) { - assert src < dest; - writeBytes(dest, bytes, src, len); - } - - /** Copies bytes from this store to a target byte array. 
*/ - public void copyBytes(int src, byte[] dest, int offset, int len) { - System.arraycopy(bytes, src, dest, offset, len); - } - /** Reverse from srcPos, inclusive, to destPos, inclusive. */ public void reverse() { int src = 0; diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestGrowableByteArrayDataOutput.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestGrowableByteArrayDataOutput.java index ddca36ba5ce4..46814bb20ede 100644 --- a/lucene/core/src/test/org/apache/lucene/util/fst/TestGrowableByteArrayDataOutput.java +++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestGrowableByteArrayDataOutput.java @@ -45,7 +45,7 @@ public void testRandom() throws Exception { int pos = 0; while (pos < numBytes) { - int op = random().nextInt(6); + int op = random().nextInt(4); if (VERBOSE) { System.out.println(" cycle pos=" + pos); } @@ -120,34 +120,6 @@ public void testRandom() throws Exception { } } break; - - case 4: - { - // copyBytes - if (pos > 1) { - int src = random().nextInt(pos - 1); - int dest = TestUtil.nextInt(random(), src + 1, pos - 1); - int len = TestUtil.nextInt(random(), 1, Math.min(300, pos - dest)); - if (VERBOSE) { - System.out.println(" copyBytes src=" + src + " dest=" + dest + " len=" + len); - } - System.arraycopy(expected, src, expected, dest, len); - bytes.copyBytes(src, dest, len); - } - } - break; - - case 5: - { - // absWriteByte - if (pos > 0) { - int dest = random().nextInt(pos); - byte b = (byte) random().nextInt(256); - expected[dest] = b; - bytes.writeByte(dest, b); - } - break; - } } assertEquals(pos, bytes.getPosition()); @@ -200,7 +172,7 @@ public void testCopyBytesOnByteStore() throws IOException { ByteArrayDataInput in = new ByteArrayDataInput(bytes, offset, len); final GrowableByteArrayDataOutput o = new GrowableByteArrayDataOutput(); o.copyBytes(in, len); - o.copyBytes(0, bytesout, 0, len); + o.writeTo(0, bytesout, 0, len); assertArrayEquals( ArrayUtil.copyOfSubArray(bytesout, 0, len), 
ArrayUtil.copyOfSubArray(bytes, offset, offset + len)); From 0de0d26675b966de80f11c66a87750379474a0ab Mon Sep 17 00:00:00 2001 From: dungba88 Date: Thu, 16 Nov 2023 14:30:43 +0900 Subject: [PATCH 11/45] Update comment --- .../core/src/java/org/apache/lucene/util/fst/FSTCompiler.java | 1 - .../org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java index e87bc557ef16..c92b372493f7 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java @@ -893,7 +893,6 @@ void finish(long newStartNode) { } fst.metadata.startNode = newStartNode; fst.metadata.numBytes = numBytesWritten; - scratchBytes.setPosition(0); } private boolean validOutput(T output) { diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java b/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java index f2f02460a839..405c9eb5bdcb 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java @@ -90,7 +90,7 @@ public void writeBytes(int dest, byte[] b, int offset, int len) { System.arraycopy(b, offset, bytes, dest, len); } - /** Reverse from srcPos, inclusive, to destPos, inclusive. */ + /** Reverse the written byte[]. 
*/ public void reverse() { int src = 0; int dest = nextWrite - 1; From ef3fdc6767da525fc1d64749401f7219d1712b68 Mon Sep 17 00:00:00 2001 From: dungba88 Date: Thu, 16 Nov 2023 14:44:33 +0900 Subject: [PATCH 12/45] remove unused parameter --- .../core/src/java/org/apache/lucene/util/fst/FSTCompiler.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java index c92b372493f7..9ecbdbed9d46 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java @@ -693,7 +693,7 @@ private void writeNodeForDirectAddressingOrContinuous( // Write the presence bits if (continuous == false) { - writePresenceBits(nodeIn, numPresenceBytes); + writePresenceBits(nodeIn); assert scratchBytes.getPosition() - headerLen == numPresenceBytes; } @@ -702,7 +702,7 @@ private void writeNodeForDirectAddressingOrContinuous( assert scratchBytes.getPosition() == headerLen + numPresenceBytes + totalArcBytes; } - private void writePresenceBits(FSTCompiler.UnCompiledNode nodeIn, int numPresenceBytes) { + private void writePresenceBits(FSTCompiler.UnCompiledNode nodeIn) { byte presenceBits = 1; // The first arc is always present. 
int presenceIndex = 0; int previousLabel = nodeIn.arcs[0].label; From f00d24f54ebc44ac771a6bb0b0c46f9ca5020ed6 Mon Sep 17 00:00:00 2001 From: dungba88 Date: Thu, 16 Nov 2023 15:01:37 +0900 Subject: [PATCH 13/45] Simplify BytesStore operation --- .../apache/lucene/util/fst/BytesStore.java | 20 +++++------ .../apache/lucene/util/fst/FSTCompiler.java | 33 ++++++------------- .../lucene/util/fst/TestBytesStore.java | 18 ++-------- 3 files changed, 21 insertions(+), 50 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java b/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java index 900822966a4f..766759d12070 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java @@ -48,13 +48,6 @@ public BytesStore(int blockBits) { nextWrite = blockSize; } - /** Absolute write byte; you must ensure dest is < max position written so far. */ - public void writeByte(long dest, byte b) { - int blockIndex = (int) (dest >> blockBits); - byte[] block = blocks.get(blockIndex); - block[(int) (dest & blockMask)] = b; - } - @Override public void writeByte(byte b) { if (nextWrite == blockSize) { @@ -313,7 +306,7 @@ public void reverse(long srcPos, long destPos) { } } - public void skipBytes(int len) { + private void skipBytes(int len) { while (len > 0) { int chunk = blockSize - nextWrite; if (len <= chunk) { @@ -338,12 +331,15 @@ public long size() { } /** - * Pos must be less than the max position written so far! Ie, you cannot "grow" the file with - * this! 
+ * Set the position of this BytesStore, truncating or expanding if needed */ - public void truncate(long newLen) { - assert newLen <= getPosition(); + public void setPosition(long newLen) { assert newLen >= 0; + long oldPosition = getPosition(); + if (newLen > oldPosition) { + skipBytes((int) (newLen - oldPosition)); + return; + } int blockIndex = (int) (newLen >> blockBits); nextWrite = (int) (newLen & blockMask); if (nextWrite == 0) { diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java index 56c8f18fc484..7c968c25ef59 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java @@ -549,7 +549,7 @@ private void writeNodeForBinarySearch( long destPos = startAddress + headerLen + nodeIn.numArcs * (long) maxBytesPerArc; assert destPos >= srcPos; if (destPos > srcPos) { - bytes.skipBytes((int) (destPos - srcPos)); + bytes.setPosition(destPos); for (int arcIdx = nodeIn.numArcs - 1; arcIdx >= 0; arcIdx--) { destPos -= maxBytesPerArc; int arcLen = numBytesPerArc[arcIdx]; @@ -626,34 +626,22 @@ private void writeNodeForDirectAddressingOrContinuous( maxBytesPerArcWithoutLabel); // maxBytesPerArcWithoutLabel instead of maxBytesPerArc. int headerLen = fixedLengthArcsBuffer.getPosition(); - // Prepare the builder byte store. Enlarge or truncate if needed. - long nodeEnd = startAddress + headerLen + numPresenceBytes + totalArcBytes; - long currentPosition = bytes.getPosition(); - if (nodeEnd >= currentPosition) { - bytes.skipBytes((int) (nodeEnd - currentPosition)); - } else { - bytes.truncate(nodeEnd); - } - assert bytes.getPosition() == nodeEnd; - // Write the header. 
- long writeOffset = startAddress; - bytes.writeBytes(writeOffset, fixedLengthArcsBuffer.getBytes(), 0, headerLen); - writeOffset += headerLen; + bytes.setPosition(startAddress); + bytes.writeBytes(fixedLengthArcsBuffer.getBytes(), 0, headerLen); // Write the presence bits if (continuous == false) { - writePresenceBits(nodeIn, writeOffset, numPresenceBytes); - writeOffset += numPresenceBytes; + writePresenceBits(nodeIn); + assert bytes.getPosition() == startAddress + headerLen + numPresenceBytes; } // Write the first label and the arcs. - bytes.writeBytes(writeOffset, fixedLengthArcsBuffer.getBytes(), bufferOffset, totalArcBytes); + bytes.writeBytes(fixedLengthArcsBuffer.getBytes(), bufferOffset, totalArcBytes); + assert bytes.getPosition() == startAddress + headerLen + numPresenceBytes + totalArcBytes; } - private void writePresenceBits( - FSTCompiler.UnCompiledNode nodeIn, long dest, int numPresenceBytes) { - long bytePos = dest; + private void writePresenceBits(FSTCompiler.UnCompiledNode nodeIn) { byte presenceBits = 1; // The first arc is always present. int presenceIndex = 0; int previousLabel = nodeIn.arcs[0].label; @@ -662,7 +650,7 @@ private void writePresenceBits( assert label > previousLabel; presenceIndex += label - previousLabel; while (presenceIndex >= Byte.SIZE) { - bytes.writeByte(bytePos++, presenceBits); + bytes.writeByte(presenceBits); presenceBits = 0; presenceIndex -= Byte.SIZE; } @@ -673,8 +661,7 @@ private void writePresenceBits( assert presenceIndex == (nodeIn.arcs[nodeIn.numArcs - 1].label - nodeIn.arcs[0].label) % 8; assert presenceBits != 0; // The last byte is not 0. assert (presenceBits & (1 << presenceIndex)) != 0; // The last arc is always present. 
- bytes.writeByte(bytePos++, presenceBits); - assert bytePos - dest == numPresenceBytes; + bytes.writeByte(presenceBits); } private void freezeTail(int prefixLenPlus1) throws IOException { diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestBytesStore.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestBytesStore.java index 6190b903c7d5..b1f91e94a8ea 100644 --- a/lucene/core/src/test/org/apache/lucene/util/fst/TestBytesStore.java +++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestBytesStore.java @@ -45,7 +45,7 @@ public void testRandom() throws Exception { int pos = 0; while (pos < numBytes) { - int op = random().nextInt(8); + int op = random().nextInt(7); if (VERBOSE) { System.out.println(" cycle pos=" + pos); } @@ -174,7 +174,7 @@ public void testRandom() throws Exception { } pos += len; - bytes.skipBytes(len); + bytes.setPosition(pos); // NOTE: must fill in zeros in case truncate was // used, else we get false fails: @@ -184,18 +184,6 @@ public void testRandom() throws Exception { } } break; - - case 7: - { - // absWriteByte - if (pos > 0) { - int dest = random().nextInt(pos); - byte b = (byte) random().nextInt(256); - expected[dest] = b; - bytes.writeByte(dest, b); - } - break; - } } assertEquals(pos, bytes.getPosition()); @@ -203,7 +191,7 @@ public void testRandom() throws Exception { if (pos > 0 && random().nextInt(50) == 17) { // truncate int len = TestUtil.nextInt(random(), 1, Math.min(pos, 100)); - bytes.truncate(pos - len); + bytes.setPosition(pos - len); pos -= len; Arrays.fill(expected, pos, pos + len, (byte) 0); if (VERBOSE) { From 12963df14a443055ae67d73655bff9ec665ec8ed Mon Sep 17 00:00:00 2001 From: dungba88 Date: Thu, 16 Nov 2023 15:20:20 +0900 Subject: [PATCH 14/45] tidy code --- .../core/src/java/org/apache/lucene/util/fst/BytesStore.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java 
b/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java index 766759d12070..14469774e47d 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java @@ -330,9 +330,7 @@ public long size() { return getPosition(); } - /** - * Set the position of this BytesStore, truncating or expanding if needed - */ + /** Set the position of this BytesStore, truncating or expanding if needed */ public void setPosition(long newLen) { assert newLen >= 0; long oldPosition = getPosition(); From f1e81b8e2f67013b8ce450d0777daede72f43fc2 Mon Sep 17 00:00:00 2001 From: dungba88 Date: Thu, 16 Nov 2023 16:53:37 +0900 Subject: [PATCH 15/45] Rename copyBytes to writeTo --- .../src/java/org/apache/lucene/util/fst/BytesStore.java | 4 ++-- .../src/java/org/apache/lucene/util/fst/FSTCompiler.java | 8 ++++---- .../src/java/org/apache/lucene/util/fst/NodeHash.java | 2 +- .../test/org/apache/lucene/util/fst/TestBytesStore.java | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java b/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java index 14469774e47d..1522dc103311 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java @@ -172,7 +172,7 @@ public void copyBytes(DataInput input, long numBytes) throws IOException { * Absolute copy bytes self to self, without changing the position. Note: this cannot "grow" the * bytes, so must only call it on already written parts. */ - public void copyBytes(long src, long dest, int len) { + public void writeTo(long src, long dest, int len) { // System.out.println("BS.copyBytes src=" + src + " dest=" + dest + " len=" + len); assert src < dest; @@ -230,7 +230,7 @@ public void copyBytes(long src, long dest, int len) { } /** Copies bytes from this store to a target byte array. 
*/ - public void copyBytes(long src, byte[] dest, int offset, int len) { + public void writeTo(long src, byte[] dest, int offset, int len) { int blockIndex = (int) (src >> blockBits); int upto = (int) (src & blockMask); byte[] block = blocks.get(blockIndex); diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java index 7c968c25ef59..20790fcb106a 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java @@ -568,7 +568,7 @@ private void writeNodeForBinarySearch( + arcLen + " nodeIn.numArcs=" + nodeIn.numArcs; - bytes.copyBytes(srcPos, destPos, arcLen); + bytes.writeTo(srcPos, destPos, arcLen); } } } @@ -601,16 +601,16 @@ private void writeNodeForDirectAddressingOrContinuous( srcPos -= srcArcLen; int labelLen = numLabelBytesPerArc[arcIdx]; // Copy the flags. - bytes.copyBytes(srcPos, buffer, bufferOffset, 1); + bytes.writeTo(srcPos, buffer, bufferOffset, 1); // Skip the label, copy the remaining. int remainingArcLen = srcArcLen - 1 - labelLen; if (remainingArcLen != 0) { - bytes.copyBytes(srcPos + 1 + labelLen, buffer, bufferOffset + 1, remainingArcLen); + bytes.writeTo(srcPos + 1 + labelLen, buffer, bufferOffset + 1, remainingArcLen); } if (arcIdx == 0) { // Copy the label of the first arc only. 
bufferOffset -= labelLen; - bytes.copyBytes(srcPos + 1, buffer, bufferOffset, labelLen); + bytes.writeTo(srcPos + 1, buffer, bufferOffset, labelLen); } } assert bufferOffset == headerMaxLen + numPresenceBytes; diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java b/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java index faf258afde06..53640f59ffd2 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java @@ -138,7 +138,7 @@ public long add(FSTCompiler.UnCompiledNode nodeIn) throws IOException { // at 0: assert nodeAddress != FST.FINAL_END_NODE && nodeAddress != FST.NON_FINAL_END_NODE; byte[] buf = new byte[Math.toIntExact(nodeAddress - startAddress + 1)]; - fstCompiler.bytes.copyBytes(startAddress, buf, 0, buf.length); + fstCompiler.bytes.writeTo(startAddress, buf, 0, buf.length); primaryTable.setNodeAddress(hashSlot, nodeAddress); primaryTable.copyNodeBytes(hashSlot, buf); diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestBytesStore.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestBytesStore.java index b1f91e94a8ea..2bbc79d60892 100644 --- a/lucene/core/src/test/org/apache/lucene/util/fst/TestBytesStore.java +++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestBytesStore.java @@ -159,7 +159,7 @@ public void testRandom() throws Exception { System.out.println(" copyBytes src=" + src + " dest=" + dest + " len=" + len); } System.arraycopy(expected, src, expected, dest, len); - bytes.copyBytes(src, dest, len); + bytes.writeTo(src, dest, len); } } break; @@ -237,7 +237,7 @@ public void testCopyBytesOnByteStore() throws IOException { final int blockBits = TestUtil.nextInt(random(), 8, 15); final BytesStore o = new BytesStore(blockBits); o.copyBytes(in, len); - o.copyBytes(0, bytesout, 0, len); + o.writeTo(0, bytesout, 0, len); assertArrayEquals( ArrayUtil.copyOfSubArray(bytesout, 0, len), ArrayUtil.copyOfSubArray(bytes, 
offset, offset + len)); From c8165ad58d2d4bdf6c7471dc9eb0550f1d5df33f Mon Sep 17 00:00:00 2001 From: dungba88 Date: Thu, 16 Nov 2023 17:01:29 +0900 Subject: [PATCH 16/45] Simplify BytesStore operations --- .../apache/lucene/util/fst/BytesStore.java | 24 +++++------- .../apache/lucene/util/fst/FSTCompiler.java | 39 +++++++------------ .../org/apache/lucene/util/fst/NodeHash.java | 2 +- .../lucene/util/fst/TestBytesStore.java | 22 +++-------- 4 files changed, 28 insertions(+), 59 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java b/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java index 900822966a4f..8f02a077c0e1 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java @@ -48,13 +48,6 @@ public BytesStore(int blockBits) { nextWrite = blockSize; } - /** Absolute write byte; you must ensure dest is < max position written so far. */ - public void writeByte(long dest, byte b) { - int blockIndex = (int) (dest >> blockBits); - byte[] block = blocks.get(blockIndex); - block[(int) (dest & blockMask)] = b; - } - @Override public void writeByte(byte b) { if (nextWrite == blockSize) { @@ -237,7 +230,7 @@ public void copyBytes(long src, long dest, int len) { } /** Copies bytes from this store to a target byte array. */ - public void copyBytes(long src, byte[] dest, int offset, int len) { + public void writeTo(long src, byte[] dest, int offset, int len) { int blockIndex = (int) (src >> blockBits); int upto = (int) (src & blockMask); byte[] block = blocks.get(blockIndex); @@ -313,7 +306,7 @@ public void reverse(long srcPos, long destPos) { } } - public void skipBytes(int len) { + private void skipBytes(int len) { while (len > 0) { int chunk = blockSize - nextWrite; if (len <= chunk) { @@ -337,13 +330,14 @@ public long size() { return getPosition(); } - /** - * Pos must be less than the max position written so far! 
Ie, you cannot "grow" the file with - * this! - */ - public void truncate(long newLen) { - assert newLen <= getPosition(); + /** Set the position of this BytesStore, truncating or expanding if needed */ + public void setPosition(long newLen) { assert newLen >= 0; + long oldPosition = getPosition(); + if (newLen > oldPosition) { + skipBytes((int) (newLen - oldPosition)); + return; + } int blockIndex = (int) (newLen >> blockBits); nextWrite = (int) (newLen & blockMask); if (nextWrite == 0) { diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java index 56c8f18fc484..725b14614fab 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java @@ -549,7 +549,7 @@ private void writeNodeForBinarySearch( long destPos = startAddress + headerLen + nodeIn.numArcs * (long) maxBytesPerArc; assert destPos >= srcPos; if (destPos > srcPos) { - bytes.skipBytes((int) (destPos - srcPos)); + bytes.setPosition(destPos); for (int arcIdx = nodeIn.numArcs - 1; arcIdx >= 0; arcIdx--) { destPos -= maxBytesPerArc; int arcLen = numBytesPerArc[arcIdx]; @@ -601,16 +601,16 @@ private void writeNodeForDirectAddressingOrContinuous( srcPos -= srcArcLen; int labelLen = numLabelBytesPerArc[arcIdx]; // Copy the flags. - bytes.copyBytes(srcPos, buffer, bufferOffset, 1); + bytes.writeTo(srcPos, buffer, bufferOffset, 1); // Skip the label, copy the remaining. int remainingArcLen = srcArcLen - 1 - labelLen; if (remainingArcLen != 0) { - bytes.copyBytes(srcPos + 1 + labelLen, buffer, bufferOffset + 1, remainingArcLen); + bytes.writeTo(srcPos + 1 + labelLen, buffer, bufferOffset + 1, remainingArcLen); } if (arcIdx == 0) { // Copy the label of the first arc only. 
bufferOffset -= labelLen; - bytes.copyBytes(srcPos + 1, buffer, bufferOffset, labelLen); + bytes.writeTo(srcPos + 1, buffer, bufferOffset, labelLen); } } assert bufferOffset == headerMaxLen + numPresenceBytes; @@ -626,34 +626,22 @@ private void writeNodeForDirectAddressingOrContinuous( maxBytesPerArcWithoutLabel); // maxBytesPerArcWithoutLabel instead of maxBytesPerArc. int headerLen = fixedLengthArcsBuffer.getPosition(); - // Prepare the builder byte store. Enlarge or truncate if needed. - long nodeEnd = startAddress + headerLen + numPresenceBytes + totalArcBytes; - long currentPosition = bytes.getPosition(); - if (nodeEnd >= currentPosition) { - bytes.skipBytes((int) (nodeEnd - currentPosition)); - } else { - bytes.truncate(nodeEnd); - } - assert bytes.getPosition() == nodeEnd; - // Write the header. - long writeOffset = startAddress; - bytes.writeBytes(writeOffset, fixedLengthArcsBuffer.getBytes(), 0, headerLen); - writeOffset += headerLen; + bytes.setPosition(startAddress); + bytes.writeBytes(fixedLengthArcsBuffer.getBytes(), 0, headerLen); // Write the presence bits if (continuous == false) { - writePresenceBits(nodeIn, writeOffset, numPresenceBytes); - writeOffset += numPresenceBytes; + writePresenceBits(nodeIn); + assert bytes.getPosition() == startAddress + headerLen + numPresenceBytes; } // Write the first label and the arcs. - bytes.writeBytes(writeOffset, fixedLengthArcsBuffer.getBytes(), bufferOffset, totalArcBytes); + bytes.writeBytes(fixedLengthArcsBuffer.getBytes(), bufferOffset, totalArcBytes); + assert bytes.getPosition() == startAddress + headerLen + numPresenceBytes + totalArcBytes; } - private void writePresenceBits( - FSTCompiler.UnCompiledNode nodeIn, long dest, int numPresenceBytes) { - long bytePos = dest; + private void writePresenceBits(FSTCompiler.UnCompiledNode nodeIn) { byte presenceBits = 1; // The first arc is always present. 
int presenceIndex = 0; int previousLabel = nodeIn.arcs[0].label; @@ -662,7 +650,7 @@ private void writePresenceBits( assert label > previousLabel; presenceIndex += label - previousLabel; while (presenceIndex >= Byte.SIZE) { - bytes.writeByte(bytePos++, presenceBits); + bytes.writeByte(presenceBits); presenceBits = 0; presenceIndex -= Byte.SIZE; } @@ -673,8 +661,7 @@ private void writePresenceBits( assert presenceIndex == (nodeIn.arcs[nodeIn.numArcs - 1].label - nodeIn.arcs[0].label) % 8; assert presenceBits != 0; // The last byte is not 0. assert (presenceBits & (1 << presenceIndex)) != 0; // The last arc is always present. - bytes.writeByte(bytePos++, presenceBits); - assert bytePos - dest == numPresenceBytes; + bytes.writeByte(presenceBits); } private void freezeTail(int prefixLenPlus1) throws IOException { diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java b/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java index faf258afde06..53640f59ffd2 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java @@ -138,7 +138,7 @@ public long add(FSTCompiler.UnCompiledNode nodeIn) throws IOException { // at 0: assert nodeAddress != FST.FINAL_END_NODE && nodeAddress != FST.NON_FINAL_END_NODE; byte[] buf = new byte[Math.toIntExact(nodeAddress - startAddress + 1)]; - fstCompiler.bytes.copyBytes(startAddress, buf, 0, buf.length); + fstCompiler.bytes.writeTo(startAddress, buf, 0, buf.length); primaryTable.setNodeAddress(hashSlot, nodeAddress); primaryTable.copyNodeBytes(hashSlot, buf); diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestBytesStore.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestBytesStore.java index 6190b903c7d5..2bbc79d60892 100644 --- a/lucene/core/src/test/org/apache/lucene/util/fst/TestBytesStore.java +++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestBytesStore.java @@ -45,7 +45,7 @@ public void testRandom() 
throws Exception { int pos = 0; while (pos < numBytes) { - int op = random().nextInt(8); + int op = random().nextInt(7); if (VERBOSE) { System.out.println(" cycle pos=" + pos); } @@ -159,7 +159,7 @@ public void testRandom() throws Exception { System.out.println(" copyBytes src=" + src + " dest=" + dest + " len=" + len); } System.arraycopy(expected, src, expected, dest, len); - bytes.copyBytes(src, dest, len); + bytes.writeTo(src, dest, len); } } break; @@ -174,7 +174,7 @@ public void testRandom() throws Exception { } pos += len; - bytes.skipBytes(len); + bytes.setPosition(pos); // NOTE: must fill in zeros in case truncate was // used, else we get false fails: @@ -184,18 +184,6 @@ public void testRandom() throws Exception { } } break; - - case 7: - { - // absWriteByte - if (pos > 0) { - int dest = random().nextInt(pos); - byte b = (byte) random().nextInt(256); - expected[dest] = b; - bytes.writeByte(dest, b); - } - break; - } } assertEquals(pos, bytes.getPosition()); @@ -203,7 +191,7 @@ public void testRandom() throws Exception { if (pos > 0 && random().nextInt(50) == 17) { // truncate int len = TestUtil.nextInt(random(), 1, Math.min(pos, 100)); - bytes.truncate(pos - len); + bytes.setPosition(pos - len); pos -= len; Arrays.fill(expected, pos, pos + len, (byte) 0); if (VERBOSE) { @@ -249,7 +237,7 @@ public void testCopyBytesOnByteStore() throws IOException { final int blockBits = TestUtil.nextInt(random(), 8, 15); final BytesStore o = new BytesStore(blockBits); o.copyBytes(in, len); - o.copyBytes(0, bytesout, 0, len); + o.writeTo(0, bytesout, 0, len); assertArrayEquals( ArrayUtil.copyOfSubArray(bytesout, 0, len), ArrayUtil.copyOfSubArray(bytes, offset, offset + len)); From 9a002c07f4337f2b9bcccfa953dba42e263f9611 Mon Sep 17 00:00:00 2001 From: dungba88 Date: Thu, 16 Nov 2023 17:27:59 +0900 Subject: [PATCH 17/45] Embed writeBytes() to FSTCompiler --- .../apache/lucene/util/fst/FSTCompiler.java | 17 +++++++++---- .../util/fst/GrowableByteArrayDataOutput.java | 15 
++++------- .../org/apache/lucene/util/fst/NodeHash.java | 4 ++- .../fst/TestGrowableByteArrayDataOutput.java | 25 +------------------ 4 files changed, 21 insertions(+), 40 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java index 9ecbdbed9d46..f70363aaf281 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java @@ -607,12 +607,17 @@ private void writeNodeForBinarySearch(FSTCompiler.UnCompiledNode nodeIn, int .writeVInt(maxBytesPerArc); int headerLen = fixedLengthArcsBuffer.getPosition(); - // Expand the arcs in place, backwards. int srcPos = scratchBytes.getPosition(); + + // First write the header + scratchBytes.setPosition(0); + scratchBytes.writeBytes(fixedLengthArcsBuffer.getBytes(), 0, headerLen); + + // Expand the arcs in place, backwards. int destPos = headerLen + nodeIn.numArcs * maxBytesPerArc; assert destPos >= srcPos; + scratchBytes.setPosition(destPos); if (destPos > srcPos) { - scratchBytes.setPosition(destPos); for (int arcIdx = nodeIn.numArcs - 1; arcIdx >= 0; arcIdx--) { destPos -= maxBytesPerArc; int arcLen = numBytesPerArc[arcIdx]; @@ -631,12 +636,14 @@ private void writeNodeForBinarySearch(FSTCompiler.UnCompiledNode nodeIn, int + arcLen + " nodeIn.numArcs=" + nodeIn.numArcs; - scratchBytes.writeBytes(destPos, scratchBytes.bytes, srcPos, arcLen); + assert destPos + arcLen <= scratchBytes.getPosition(); + // copy the bytes from srcPos to destPos, essentially expanding the arc from variable + // length to fixed length + System.arraycopy( + scratchBytes.getBytes(), srcPos, scratchBytes.getBytes(), destPos, arcLen); } } } - - scratchBytes.writeBytes(0, fixedLengthArcsBuffer.getBytes(), 0, headerLen); } private void writeNodeForDirectAddressingOrContinuous( diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java 
b/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java index 405c9eb5bdcb..44e7c6726881 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java @@ -32,7 +32,7 @@ final class GrowableByteArrayDataOutput extends DataOutput implements Accountabl private static final int INITIAL_SIZE = 1 << 8; // holds an initial size of 256 bytes. this byte array will only grow, but not shrink - byte[] bytes = new byte[INITIAL_SIZE]; + private byte[] bytes = new byte[INITIAL_SIZE]; private int nextWrite; @@ -53,6 +53,10 @@ public int getPosition() { return nextWrite; } + public byte[] getBytes() { + return bytes; + } + /** Set the position of the byte[], increasing the capacity if needed */ public void setPosition(int newLen) { assert newLen >= 0; @@ -81,15 +85,6 @@ public void writeTo(int src, byte[] dest, int offset, int len) { System.arraycopy(bytes, src, dest, offset, len); } - /** - * Absolute writeBytes without changing the current position. Note: this cannot "grow" the bytes, - * so you must only call it on already written parts. - */ - public void writeBytes(int dest, byte[] b, int offset, int len) { - assert dest + len <= nextWrite : "dest=" + dest + " pos=" + nextWrite + " len=" + len; - System.arraycopy(b, offset, bytes, dest, len); - } - /** Reverse the written byte[]. 
*/ public void reverse() { int src = 0; diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java b/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java index 30af3353eb91..7326fd77f73b 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java @@ -138,7 +138,9 @@ public long add(FSTCompiler.UnCompiledNode nodeIn) throws IOException { primaryTable.setNodeAddress(hashSlot, nodeAddress); primaryTable.copyNodeBytes( - hashSlot, fstCompiler.scratchBytes.bytes, fstCompiler.scratchBytes.getPosition()); + hashSlot, + fstCompiler.scratchBytes.getBytes(), + fstCompiler.scratchBytes.getPosition()); // confirm frozen hash and unfrozen hash are the same assert primaryTable.hash(nodeAddress, hashSlot) == hash diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestGrowableByteArrayDataOutput.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestGrowableByteArrayDataOutput.java index 46814bb20ede..6b461f3dbdae 100644 --- a/lucene/core/src/test/org/apache/lucene/util/fst/TestGrowableByteArrayDataOutput.java +++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestGrowableByteArrayDataOutput.java @@ -45,7 +45,7 @@ public void testRandom() throws Exception { int pos = 0; while (pos < numBytes) { - int op = random().nextInt(4); + int op = random().nextInt(3); if (VERBOSE) { System.out.println(" cycle pos=" + pos); } @@ -97,29 +97,6 @@ public void testRandom() throws Exception { } } break; - - case 3: - { - // abs write random byte[] - if (pos > 2) { - int randomPos = random().nextInt(pos - 1); - int len = TestUtil.nextInt(random(), 1, Math.min(pos - randomPos - 1, 100)); - byte[] temp = new byte[len]; - random().nextBytes(temp); - if (VERBOSE) { - System.out.println( - " abs writeBytes pos=" - + randomPos - + " len=" - + len - + " bytes=" - + Arrays.toString(temp)); - } - System.arraycopy(temp, 0, expected, randomPos, temp.length); - 
bytes.writeBytes(randomPos, temp, 0, temp.length); - } - } - break; } assertEquals(pos, bytes.getPosition()); From 1c201d448d5ce81c7e935bdad721437f1c0262bb Mon Sep 17 00:00:00 2001 From: dungba88 Date: Thu, 16 Nov 2023 19:25:32 +0900 Subject: [PATCH 18/45] Fix the write bytes method --- .../apache/lucene/util/fst/FSTCompiler.java | 27 ++++++++++++------- .../util/fst/GrowableByteArrayDataOutput.java | 10 +++---- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java index f70363aaf281..4be12f2f40d2 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java @@ -607,13 +607,8 @@ private void writeNodeForBinarySearch(FSTCompiler.UnCompiledNode nodeIn, int .writeVInt(maxBytesPerArc); int headerLen = fixedLengthArcsBuffer.getPosition(); - int srcPos = scratchBytes.getPosition(); - - // First write the header - scratchBytes.setPosition(0); - scratchBytes.writeBytes(fixedLengthArcsBuffer.getBytes(), 0, headerLen); - // Expand the arcs in place, backwards. 
+ int srcPos = scratchBytes.getPosition(); int destPos = headerLen + nodeIn.numArcs * maxBytesPerArc; assert destPos >= srcPos; scratchBytes.setPosition(destPos); @@ -636,14 +631,28 @@ private void writeNodeForBinarySearch(FSTCompiler.UnCompiledNode nodeIn, int + arcLen + " nodeIn.numArcs=" + nodeIn.numArcs; - assert destPos + arcLen <= scratchBytes.getPosition(); // copy the bytes from srcPos to destPos, essentially expanding the arc from variable // length to fixed length - System.arraycopy( - scratchBytes.getBytes(), srcPos, scratchBytes.getBytes(), destPos, arcLen); + writeScratchBytes(destPos, scratchBytes.getBytes(), srcPos, arcLen); } } } + + // Finally write the header + writeScratchBytes(0, fixedLengthArcsBuffer.getBytes(), 0, headerLen); + } + + /** + * Write bytes from a source byte[] to the scratch bytes + * + * @param destPos the position in the scratch bytes + * @param bytes the source byte[] + * @param offset the offset inside the source byte[] + * @param length the number of bytes to write + */ + private void writeScratchBytes(int destPos, byte[] bytes, int offset, int length) { + assert destPos + length <= scratchBytes.getPosition(); + System.arraycopy(bytes, offset, scratchBytes.getBytes(), destPos, length); } private void writeNodeForDirectAddressingOrContinuous( diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java b/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java index 44e7c6726881..4634dac711ec 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java @@ -87,13 +87,11 @@ public void writeTo(int src, byte[] dest, int offset, int len) { /** Reverse the written byte[]. 
*/ public void reverse() { - int src = 0; - int dest = nextWrite - 1; - int limit = (dest - src + 1) / 2; + int limit = nextWrite / 2; for (int i = 0; i < limit; i++) { - byte b = bytes[src + i]; - bytes[src + i] = bytes[dest - i]; - bytes[dest - i] = b; + byte b = bytes[i]; + bytes[i] = bytes[nextWrite - 1 - i]; + bytes[nextWrite - 1 - i] = b; } } From 7efcde0408857ebdcf187c2278972136bb2de349 Mon Sep 17 00:00:00 2001 From: dungba88 Date: Sat, 18 Nov 2023 21:43:28 +0900 Subject: [PATCH 19/45] Remove the default block bits constant --- .../core/src/java/org/apache/lucene/util/fst/FSTCompiler.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java index 4be12f2f40d2..4df85b19db93 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java @@ -59,8 +59,6 @@ */ public class FSTCompiler { - static final int DEFAULT_BLOCK_BITS = 15; - static final float DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR = 1f; /** @@ -218,7 +216,7 @@ public static class Builder { private final Outputs outputs; private double suffixRAMLimitMB = 32.0; private boolean allowFixedLengthArcs = true; - private DataOutput dataOutput = getOnHeapDataOutput(DEFAULT_BLOCK_BITS); + private DataOutput dataOutput = getOnHeapDataOutput(15); private float directAddressingMaxOversizingFactor = DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR; /** From b140b91ae80f09ca064758efd1a31fe4376045a1 Mon Sep 17 00:00:00 2001 From: dungba88 Date: Sat, 18 Nov 2023 21:55:38 +0900 Subject: [PATCH 20/45] add assertion --- .../org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java | 1 + 1 file changed, 1 insertion(+) diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java b/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java index 
4634dac711ec..26c0d199865c 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java @@ -82,6 +82,7 @@ public void writeTo(DataOutput out) throws IOException { /** Copies bytes from this store to a target byte array. */ public void writeTo(int src, byte[] dest, int offset, int len) { + assert src + len <= nextWrite; System.arraycopy(bytes, src, dest, offset, len); } From dbc1918fe438dc66b5bdad962db2271aaaa28414 Mon Sep 17 00:00:00 2001 From: dungba88 Date: Sun, 19 Nov 2023 15:31:24 +0900 Subject: [PATCH 21/45] Rename method parameter names --- .../apache/lucene/util/fst/GrowableByteArrayDataOutput.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java b/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java index 26c0d199865c..94ab93b063f0 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java @@ -81,9 +81,9 @@ public void writeTo(DataOutput out) throws IOException { } /** Copies bytes from this store to a target byte array. */ - public void writeTo(int src, byte[] dest, int offset, int len) { - assert src + len <= nextWrite; - System.arraycopy(bytes, src, dest, offset, len); + public void writeTo(int srcOffset, byte[] dest, int destOffset, int len) { + assert srcOffset + len <= nextWrite; + System.arraycopy(bytes, srcOffset, dest, destOffset, len); } /** Reverse the written byte[]. 
*/ From a5c7e144143f296d3884262c6c5a37f5c54675c1 Mon Sep 17 00:00:00 2001 From: dungba88 Date: Sun, 19 Nov 2023 21:53:28 +0900 Subject: [PATCH 22/45] Move reverse to FSTCompiler --- .../apache/lucene/util/fst/FSTCompiler.java | 14 +++++++++++- .../util/fst/GrowableByteArrayDataOutput.java | 10 --------- .../fst/TestGrowableByteArrayDataOutput.java | 22 +------------------ 3 files changed, 14 insertions(+), 32 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java index 4df85b19db93..a885fcfd03ce 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java @@ -514,7 +514,7 @@ long addNode(FSTCompiler.UnCompiledNode nodeIn) throws IOException { } } - scratchBytes.reverse(); + reverseScratchBytes(); scratchBytes.writeTo(dataOutput); numBytesWritten += scratchBytes.getPosition(); @@ -640,6 +640,18 @@ private void writeNodeForBinarySearch(FSTCompiler.UnCompiledNode nodeIn, int writeScratchBytes(0, fixedLengthArcsBuffer.getBytes(), 0, headerLen); } + /** Reverse the scratch bytes */ + private void reverseScratchBytes() { + int pos = scratchBytes.getPosition(); + byte[] bytes = scratchBytes.getBytes(); + int limit = pos / 2; + for (int i = 0; i < limit; i++) { + byte b = bytes[i]; + bytes[i] = bytes[pos - 1 - i]; + bytes[pos - 1 - i] = b; + } + } + /** * Write bytes from a source byte[] to the scratch bytes * diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java b/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java index 94ab93b063f0..8a5e7128ad5f 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java @@ -86,16 +86,6 @@ public void writeTo(int srcOffset, byte[] dest, int destOffset, int len) { 
System.arraycopy(bytes, srcOffset, dest, destOffset, len); } - /** Reverse the written byte[]. */ - public void reverse() { - int limit = nextWrite / 2; - for (int i = 0; i < limit; i++) { - byte b = bytes[i]; - bytes[i] = bytes[nextWrite - 1 - i]; - bytes[nextWrite - 1 - i] = b; - } - } - @Override public long ramBytesUsed() { return BASE_RAM_BYTES_USED + RamUsageEstimator.sizeOf(bytes); diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestGrowableByteArrayDataOutput.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestGrowableByteArrayDataOutput.java index 6b461f3dbdae..0d271cc49e95 100644 --- a/lucene/core/src/test/org/apache/lucene/util/fst/TestGrowableByteArrayDataOutput.java +++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestGrowableByteArrayDataOutput.java @@ -45,7 +45,7 @@ public void testRandom() throws Exception { int pos = 0; while (pos < numBytes) { - int op = random().nextInt(3); + int op = random().nextInt(2); if (VERBOSE) { System.out.println(" cycle pos=" + pos); } @@ -77,26 +77,6 @@ public void testRandom() throws Exception { pos += len; } break; - - case 2: - { - // reverse bytes - if (pos > 1) { - bytes.reverse(); - - int start = 0; - int end = bytes.getPosition() - 1; - - while (start <= end) { - byte b = expected[end]; - expected[end] = expected[start]; - expected[start] = b; - start++; - end--; - } - } - } - break; } assertEquals(pos, bytes.getPosition()); From 50de8f7dc2166674302943c97792fb1462878297 Mon Sep 17 00:00:00 2001 From: dungba88 Date: Mon, 20 Nov 2023 05:33:47 +0900 Subject: [PATCH 23/45] Revert setPosition call --- .../core/src/java/org/apache/lucene/util/fst/FSTCompiler.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java index a885fcfd03ce..7ba5a23d172f 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java +++ 
b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java @@ -609,8 +609,8 @@ private void writeNodeForBinarySearch(FSTCompiler.UnCompiledNode nodeIn, int int srcPos = scratchBytes.getPosition(); int destPos = headerLen + nodeIn.numArcs * maxBytesPerArc; assert destPos >= srcPos; - scratchBytes.setPosition(destPos); if (destPos > srcPos) { + scratchBytes.setPosition(destPos); for (int arcIdx = nodeIn.numArcs - 1; arcIdx >= 0; arcIdx--) { destPos -= maxBytesPerArc; int arcLen = numBytesPerArc[arcIdx]; From e88f452db1d2bb4d81e3a9ed5619a53e687b2a95 Mon Sep 17 00:00:00 2001 From: dungba88 Date: Tue, 21 Nov 2023 23:14:49 +0900 Subject: [PATCH 24/45] Address comments --- ...yteBuffersDataOutputFSTReaderAdapter.java} | 4 +- .../java/org/apache/lucene/util/fst/FST.java | 35 +++++++++++++---- .../apache/lucene/util/fst/FSTCompiler.java | 38 ++++++------------- .../util/fst/GrowableByteArrayDataOutput.java | 4 +- .../lucene/util/fst/OnHeapFSTStore.java | 8 ++-- .../util/fst/TestFSTDataOutputWriter.java | 2 +- 6 files changed, 49 insertions(+), 42 deletions(-) rename lucene/core/src/java/org/apache/lucene/util/fst/{ByteBuffersFSTReader.java => ByteBuffersDataOutputFSTReaderAdapter.java} (90%) diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/ByteBuffersFSTReader.java b/lucene/core/src/java/org/apache/lucene/util/fst/ByteBuffersDataOutputFSTReaderAdapter.java similarity index 90% rename from lucene/core/src/java/org/apache/lucene/util/fst/ByteBuffersFSTReader.java rename to lucene/core/src/java/org/apache/lucene/util/fst/ByteBuffersDataOutputFSTReaderAdapter.java index 4065c4987b8c..a51063474ff4 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/ByteBuffersFSTReader.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/ByteBuffersDataOutputFSTReaderAdapter.java @@ -21,11 +21,11 @@ import org.apache.lucene.store.DataOutput; /** An adapter class to use {@link ByteBuffersDataOutput} as a {@link FSTReader} */ -final class ByteBuffersFSTReader 
extends DataOutput implements FSTReader { +final class ByteBuffersDataOutputFSTReaderAdapter extends DataOutput implements FSTReader { private final ByteBuffersDataOutput dataOutput; - public ByteBuffersFSTReader(ByteBuffersDataOutput dataOutput) { + public ByteBuffersDataOutputFSTReaderAdapter(ByteBuffersDataOutput dataOutput) { this.dataOutput = dataOutput; } diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java index 96cd8a6d8694..e87fe7e5c758 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java @@ -124,7 +124,7 @@ public enum INPUT_TYPE { /** If arc has this label then that arc is final/accepted */ public static final int END_LABEL = -1; - /** The reader of the FST */ + /** The reader of the FST, used to read bytes from the underlying FST storage */ private final FSTReader fstReader; public final Outputs outputs; @@ -434,10 +434,10 @@ public FST(FSTMetadata metadata, DataInput in, Outputs outputs, FSTStore f } /** - * @return true if and only if this FST is readable (i.e. has a reverse BytesReader) + * @return true if and only if this FST is readable (e.g has a reverse BytesReader) */ - public boolean hasReverseBytesReader() { - return fstReader.getReverseBytesReader() != null; + public boolean isReadable() { + return fstReader != null; } /** @@ -496,7 +496,11 @@ public static FSTMetadata readMetadata(DataInput metaIn, Outputs outpu @Override public long ramBytesUsed() { - return BASE_RAM_BYTES_USED + fstReader.ramBytesUsed(); + long size = BASE_RAM_BYTES_USED; + if (isReadable()) { + size += fstReader.ramBytesUsed(); + } + return size; } @Override @@ -516,7 +520,18 @@ public FSTMetadata getMetadata() { return metadata; } + /** + * Save the FST to DataOutput. You should call {@link #isReadable()} to verify if the FST is + * readable first. 
+ * + * @param metaOut the DataOutput to write the metadata to + * @param out the DataOutput to write the FST bytes to + * @see #isReadable() + */ public void save(DataOutput metaOut, DataOutput out) throws IOException { + if (isReadable() == false) { + throw new IllegalStateException("This FST is non-readable and cannot be saved"); + } saveMetadata(metaOut); fstReader.writeTo(out); } @@ -524,7 +539,7 @@ public void save(DataOutput metaOut, DataOutput out) throws IOException { /** * Save the metadata to a DataOutput * - * @param metaOut the DataOutput to save + * @param metaOut the DataOutput to write the metadata to */ public void saveMetadata(DataOutput metaOut) throws IOException { CodecUtil.writeHeader(metaOut, FILE_FORMAT_NAME, VERSION_CURRENT); @@ -1187,11 +1202,15 @@ private void seekToNextNode(BytesReader in) throws IOException { } /** - * Returns a {@link BytesReader} for this FST, positioned at position 0. + * Returns a {@link BytesReader} for this FST, positioned at position 0. You should call {@link + * #isReadable()} to verify if the FST is readable first. 
* - * @see #hasReverseBytesReader() + * @see #isReadable() */ public BytesReader getBytesReader() { + if (isReadable() == false) { + throw new IllegalStateException("FST is not readable"); + } return fstReader.getReverseBytesReader(); } diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java index 64cbfa5be600..bd6cfceb1442 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java @@ -120,7 +120,7 @@ public class FSTCompiler { final float directAddressingMaxOversizingFactor; long directAddressingExpansionCredit; - // the DataOutput to write the FST to + // the DataOutput to stream the FST bytes to final DataOutput dataOutput; // buffer to store bytes for the one node we are currently writing @@ -135,7 +135,7 @@ public class FSTCompiler { * @return the DataOutput */ public static DataOutput getOnHeapDataOutput(int blockBits) { - return new ByteBuffersFSTReader( + return new ByteBuffersDataOutputFSTReaderAdapter( new ByteBuffersDataOutput(blockBits, blockBits, ALLOCATE_BB_ON_HEAP, NO_REUSE)); } @@ -182,28 +182,9 @@ private FSTReader toFSTReader(DataOutput dataOutput) { if (dataOutput instanceof FSTReader) { return (FSTReader) dataOutput; } - return new NullFSTReader(); + return null; } - private static final class NullFSTReader implements FSTReader { - - @Override - public FST.BytesReader getReverseBytesReader() { - return null; - } - - @Override - public void writeTo(DataOutput out) { - throw new UnsupportedOperationException("writeTo(DataOutput) is not supported"); - } - - @Override - public long ramBytesUsed() { - return 0; - } - } - ; - /** * Fluent-style constructor for FST {@link FSTCompiler}. * @@ -270,8 +251,8 @@ public Builder allowFixedLengthArcs(boolean allowFixedLengthArcs) { /** * Set the {@link DataOutput} which is used for low-level writing of FST. 
If you want the FST to - * be readable, you need to use a DataOutput that also implements {@link FSTReader}, such as - * {@link FSTCompiler#getOnHeapDataOutput(int)}. + * be immediately readable, you need to use a DataOutput that also implements {@link FSTReader}, + * such as {@link FSTCompiler#getOnHeapDataOutput(int)}. * *

Otherwise you need to construct the corresponding {@link * org.apache.lucene.store.DataInput} and use the FST constructor to read it. @@ -640,7 +621,9 @@ private void writeNodeForBinarySearch(FSTCompiler.UnCompiledNode nodeIn, int writeScratchBytes(0, fixedLengthArcsBuffer.getBytes(), 0, headerLen); } - /** Reverse the scratch bytes */ + /** + * Reverse the scratch bytes in place. This operation does not affect scratchBytes.getPosition(). + */ private void reverseScratchBytes() { int pos = scratchBytes.getPosition(); byte[] bytes = scratchBytes.getBytes(); @@ -653,7 +636,10 @@ private void reverseScratchBytes() { } /** - * Write bytes from a source byte[] to the scratch bytes + * Write bytes from a source byte[] to the scratch bytes. The written bytes must fit within what + * was already written in the scratch bytes. + * + *

This operation does not affect scratchBytes.getPosition(). * * @param destPos the position in the scratch bytes * @param bytes the source byte[] diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java b/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java index 8a5e7128ad5f..3b8f24f861a1 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java @@ -22,7 +22,8 @@ import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.RamUsageEstimator; -// Storing a byte[] for the current node of the FST we are writing. The byte[] will only grow, never +// Storing a single contiguous byte[] for the current node of the FST we are writing. The byte[] +// will only grow, never // shrink. final class GrowableByteArrayDataOutput extends DataOutput implements Accountable { @@ -72,6 +73,7 @@ public void setPosition(int newLen) { * @param capacityToWrite the additional bytes to write */ private void ensureCapacity(int capacityToWrite) { + assert capacityToWrite > 0; bytes = ArrayUtil.grow(bytes, nextWrite + capacityToWrite); } diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/OnHeapFSTStore.java b/lucene/core/src/java/org/apache/lucene/util/fst/OnHeapFSTStore.java index 4aafd1f6af1c..eeb7d9fa1166 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/OnHeapFSTStore.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/OnHeapFSTStore.java @@ -34,10 +34,10 @@ public final class OnHeapFSTStore implements FSTStore { RamUsageEstimator.shallowSizeOfInstance(OnHeapFSTStore.class); /** - * A {@link ByteBuffersFSTReader}, used during reading when the FST is very large (more than 1 - * GB). If the FST is less than 1 GB then bytesArray is set instead. + * A {@link ByteBuffersDataOutputFSTReaderAdapter}, used during reading when the FST is very large + * (more than 1 GB). 
If the FST is less than 1 GB then bytesArray is set instead. */ - private ByteBuffersFSTReader byteBuffersReader; + private ByteBuffersDataOutputFSTReaderAdapter byteBuffersReader; /** Used at read time when the FST fits into a single byte[]. */ private byte[] bytesArray; @@ -56,7 +56,7 @@ public OnHeapFSTStore(int maxBlockBits) { public FSTStore init(DataInput in, long numBytes) throws IOException { if (numBytes > 1 << this.maxBlockBits) { // FST is big: we need multiple pages - byteBuffersReader = (ByteBuffersFSTReader) getOnHeapDataOutput(maxBlockBits); + byteBuffersReader = (ByteBuffersDataOutputFSTReaderAdapter) getOnHeapDataOutput(maxBlockBits); byteBuffersReader.copyBytes(in, numBytes); } else { // FST fits into a single block: use ByteArrayBytesStoreReader for less overhead diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTDataOutputWriter.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTDataOutputWriter.java index 130ba24f1d31..c84fbbd3262c 100644 --- a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTDataOutputWriter.java +++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTDataOutputWriter.java @@ -210,7 +210,7 @@ protected FSTCompiler.Builder getFSTBuilder() { @Override protected FST compile(FSTCompiler fstCompiler) throws IOException { FST fst = fstCompiler.compile(); - assertFalse(fst.hasReverseBytesReader()); + assertFalse(fst.isReadable()); // the returned FST is not readable thus we need to reconstruct one with FSTStore DataInput dataIn = From 2587fc76c43bdeffd965e7e81081ca730fbeb4a1 Mon Sep 17 00:00:00 2001 From: dungba88 Date: Wed, 22 Nov 2023 08:34:08 +0900 Subject: [PATCH 25/45] Return immediately when writing 0 bytes --- .../apache/lucene/util/fst/GrowableByteArrayDataOutput.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java b/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java index 
3b8f24f861a1..e253d5129db6 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java @@ -45,6 +45,9 @@ public void writeByte(byte b) { @Override public void writeBytes(byte[] b, int offset, int len) { + if (len == 0) { + return; + } ensureCapacity(len); System.arraycopy(b, offset, bytes, nextWrite, len); nextWrite += len; From f0b78d20bfd3ead92efff4a165b6f70d32a1d4ff Mon Sep 17 00:00:00 2001 From: dungba88 Date: Wed, 22 Nov 2023 08:37:07 +0900 Subject: [PATCH 26/45] Add comment & --- .../core/src/java/org/apache/lucene/util/fst/FSTCompiler.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java index bd6cfceb1442..d4ea38f1f2b8 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java @@ -177,7 +177,7 @@ private FSTCompiler( } // Get the respective FSTReader of the DataOutput. If the DataOutput is also a FSTReader then we - // will use it. Otherwise, we will use NullFSTReader, which does not allow reading. + // will use it, otherwise we will return null. private FSTReader toFSTReader(DataOutput dataOutput) { if (dataOutput instanceof FSTReader) { return (FSTReader) dataOutput; @@ -286,6 +286,7 @@ public Builder directAddressingMaxOversizingFactor(float factor) { /** Creates a new {@link FSTCompiler}. 
*/ public FSTCompiler build() { + // TODO: throw the IOException instead of catching it try { return new FSTCompiler<>( inputType, From fd458c49bdcf2272e93ff53636b2936d5b948754 Mon Sep 17 00:00:00 2001 From: dungba88 Date: Wed, 22 Nov 2023 22:42:52 +0900 Subject: [PATCH 27/45] Rename variables --- .../org/apache/lucene/util/fst/OnHeapFSTStore.java | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/OnHeapFSTStore.java b/lucene/core/src/java/org/apache/lucene/util/fst/OnHeapFSTStore.java index 9c3fe782bb55..a26631e2b022 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/OnHeapFSTStore.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/OnHeapFSTStore.java @@ -37,7 +37,7 @@ public final class OnHeapFSTStore implements FSTStore { * A {@link ReadWriteDataOutput}, used during reading when the FST is very large (more than 1 GB). * If the FST is less than 1 GB then bytesArray is set instead. */ - private ReadWriteDataOutput byteBuffersReader; + private ReadWriteDataOutput dataOutput; /** Used at read time when the FST fits into a single byte[]. 
*/ private byte[] bytesArray; @@ -56,8 +56,8 @@ public OnHeapFSTStore(int maxBlockBits) { public FSTStore init(DataInput in, long numBytes) throws IOException { if (numBytes > 1 << this.maxBlockBits) { // FST is big: we need multiple pages - byteBuffersReader = (ReadWriteDataOutput) getOnHeapReaderWriter(maxBlockBits); - byteBuffersReader.copyBytes(in, numBytes); + dataOutput = (ReadWriteDataOutput) getOnHeapReaderWriter(maxBlockBits); + dataOutput.copyBytes(in, numBytes); } else { // FST fits into a single block: use ByteArrayBytesStoreReader for less overhead bytesArray = new byte[(int) numBytes]; @@ -72,7 +72,7 @@ public long ramBytesUsed() { if (bytesArray != null) { size += bytesArray.length; } else { - size += byteBuffersReader.ramBytesUsed(); + size += dataOutput.ramBytesUsed(); } return size; } @@ -82,14 +82,14 @@ public FST.BytesReader getReverseBytesReader() { if (bytesArray != null) { return new ReverseBytesReader(bytesArray); } else { - return byteBuffersReader.getReverseBytesReader(); + return dataOutput.getReverseBytesReader(); } } @Override public void writeTo(DataOutput out) throws IOException { - if (byteBuffersReader != null) { - byteBuffersReader.writeTo(out); + if (dataOutput != null) { + dataOutput.writeTo(out); } else { assert bytesArray != null; out.writeBytes(bytesArray, 0, bytesArray.length); From 606fe4595e3db31ae4c03d6249409099a5c24eec Mon Sep 17 00:00:00 2001 From: dungba88 Date: Wed, 22 Nov 2023 22:55:39 +0900 Subject: [PATCH 28/45] Fix the compile error --- .../org/apache/lucene/util/fst/TestFSTDataOutputWriter.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTDataOutputWriter.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTDataOutputWriter.java index c84fbbd3262c..6b12788a3852 100644 --- a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTDataOutputWriter.java +++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTDataOutputWriter.java 
@@ -217,7 +217,7 @@ protected FST compile(FSTCompiler fstCompiler) throws IOException { new InputStreamDataInput( new ByteArrayInputStream( baos.toByteArray(), previousOffset, baos.size() - previousOffset)); - return new FST<>(fst.getMetadata(), dataIn, outputs, new OnHeapFSTStore(5)); + return new FST<>(fst.getMetadata(), dataIn, new OnHeapFSTStore(5)); } } From dc45d9afd4d78e5544f09cfdd9e35c431492fbdd Mon Sep 17 00:00:00 2001 From: dungba88 Date: Thu, 23 Nov 2023 21:54:19 +0900 Subject: [PATCH 29/45] Remove isReadable() --- .../java/org/apache/lucene/util/fst/FST.java | 32 +++------------- .../apache/lucene/util/fst/FSTCompiler.java | 37 ++++++++++++++++--- 2 files changed, 36 insertions(+), 33 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java index e241b47894e6..a0c0b91001ac 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java @@ -417,18 +417,13 @@ public FST(FSTMetadata metadata, DataInput in, FSTStore fstStore) throws IOEx /** Create the FST with a metadata object and a FSTReader. */ FST(FSTMetadata metadata, FSTReader fstReader) { + assert metadata != null; + assert fstReader != null; this.metadata = metadata; this.outputs = metadata.outputs; this.fstReader = fstReader; } - /** - * @return true if and only if this FST is readable (e.g has a reverse BytesReader) - */ - public boolean isReadable() { - return fstReader != null; - } - /** * Read the FST metadata from DataInput * @@ -485,11 +480,7 @@ public static FSTMetadata readMetadata(DataInput metaIn, Outputs outpu @Override public long ramBytesUsed() { - long size = BASE_RAM_BYTES_USED; - if (isReadable()) { - size += fstReader.ramBytesUsed(); - } - return size; + return BASE_RAM_BYTES_USED + fstReader.ramBytesUsed(); } @Override @@ -510,17 +501,12 @@ public FSTMetadata getMetadata() { } /** - * Save the FST to DataOutput. 
You should call {@link #isReadable()} to verify if the FST is - * readable first. + * Save the FST to DataOutput. * * @param metaOut the DataOutput to write the metadata to * @param out the DataOutput to write the FST bytes to - * @see #isReadable() */ public void save(DataOutput metaOut, DataOutput out) throws IOException { - if (isReadable() == false) { - throw new IllegalStateException("This FST is non-readable and cannot be saved"); - } saveMetadata(metaOut); fstReader.writeTo(out); } @@ -1190,16 +1176,8 @@ private void seekToNextNode(BytesReader in) throws IOException { } } - /** - * Returns a {@link BytesReader} for this FST, positioned at position 0. You should call {@link - * #isReadable()} to verify if the FST is readable first. - * - * @see #isReadable() - */ + /** Returns a {@link BytesReader} for this FST, positioned at position 0. */ public BytesReader getBytesReader() { - if (isReadable() == false) { - throw new IllegalStateException("FST is not readable"); - } return fstReader.getReverseBytesReader(); } diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java index 624002f953ec..230512cfb6d3 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java @@ -85,6 +85,10 @@ public class FSTCompiler { */ private static final float DIRECT_ADDRESSING_MAX_OVERSIZE_WITH_CREDIT_FACTOR = 1.66f; + // a FSTReader used when a non-FSTReader DataOutput is configured. + // it will throw exceptions if attempt to call getReverseBytesReader() or writeTo(DataOutput) + private static final FSTReader NULL_FST_READER = new NullFSTReader(); + private final NodeHash dedupHash; final FST fst; private final T NO_OUTPUT; @@ -176,12 +180,37 @@ private FSTCompiler( } // Get the respective FSTReader of the DataOutput. 
If the DataOutput is also a FSTReader then we - // will use it, otherwise we will return null. + // will use it, otherwise we will return a NullFSTReader. Attempting to read from a FST with + // NullFSTReader + // will throw UnsupportedOperationException private FSTReader toFSTReader(DataOutput dataOutput) { if (dataOutput instanceof FSTReader) { return (FSTReader) dataOutput; } - return null; + return NULL_FST_READER; + } + + /** + * This class is used for FST backed by non-FSTReader DataOutput. It does not allow getting the + * reverse BytesReader nor writing to a DataOutput. + */ + private static final class NullFSTReader implements FSTReader { + + @Override + public long ramBytesUsed() { + return 0; + } + + @Override + public FST.BytesReader getReverseBytesReader() { + throw new UnsupportedOperationException( + "NullFSTReader does not support getReverseBytesReader()"); + } + + @Override + public void writeTo(DataOutput out) { + throw new UnsupportedOperationException("NullFSTReader does not support writeTo(DataOutput)"); + } } /** @@ -308,10 +337,6 @@ public long getArcCount() { return arcCount; } - public long getMappedStateCount() { - return dedupHash == null ? 
0 : nodeCount; - } - private CompiledNode compileNode(UnCompiledNode nodeIn) throws IOException { final long node; long bytesPosStart = numBytesWritten; From 36685cc41bf12a8bad3ba282633991a0b22ba7d4 Mon Sep 17 00:00:00 2001 From: dungba88 Date: Fri, 24 Nov 2023 15:58:11 +0900 Subject: [PATCH 30/45] Remove isReadable() --- .../test/org/apache/lucene/util/fst/TestFSTDataOutputWriter.java | 1 - 1 file changed, 1 deletion(-) diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTDataOutputWriter.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTDataOutputWriter.java index 6b12788a3852..8bb3f5e3629e 100644 --- a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTDataOutputWriter.java +++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTDataOutputWriter.java @@ -210,7 +210,6 @@ protected FSTCompiler.Builder getFSTBuilder() { @Override protected FST compile(FSTCompiler fstCompiler) throws IOException { FST fst = fstCompiler.compile(); - assertFalse(fst.isReadable()); // the returned FST is not readable thus we need to reconstruct one with FSTStore DataInput dataIn = From 1c4f68da4a1aec6bc3f503ee107fe2cd25bccf3c Mon Sep 17 00:00:00 2001 From: dungba88 Date: Tue, 28 Nov 2023 13:25:31 +0900 Subject: [PATCH 31/45] Optimize ReadWriteDataOutput --- .../java/org/apache/lucene/util/fst/FSTCompiler.java | 11 ++++++++++- .../apache/lucene/util/fst/ReadWriteDataOutput.java | 11 ++++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java index 230512cfb6d3..40d54f8783b4 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java @@ -36,6 +36,7 @@ import org.apache.lucene.store.ByteArrayDataOutput; import org.apache.lucene.store.ByteBuffersDataOutput; import org.apache.lucene.store.DataOutput; +import 
org.apache.lucene.util.Accountable; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.IntsRefBuilder; @@ -956,7 +957,15 @@ interface Node { } public long fstRamBytesUsed() { - return scratchBytes.ramBytesUsed(); + long ramBytesUsed = scratchBytes.ramBytesUsed(); + if (dataOutput instanceof Accountable) { + ramBytesUsed += ((Accountable) dataOutput).ramBytesUsed(); + } + return ramBytesUsed; + } + + public long fstSize() { + return numBytesWritten; } static final class CompiledNode implements Node { diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/ReadWriteDataOutput.java b/lucene/core/src/java/org/apache/lucene/util/fst/ReadWriteDataOutput.java index f7921c6f7c8b..3382a77e3ad2 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/ReadWriteDataOutput.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/ReadWriteDataOutput.java @@ -17,6 +17,10 @@ package org.apache.lucene.util.fst; import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.List; + +import org.apache.lucene.store.ByteBuffersDataInput; import org.apache.lucene.store.ByteBuffersDataOutput; import org.apache.lucene.store.DataOutput; @@ -49,7 +53,12 @@ public long ramBytesUsed() { @Override public FST.BytesReader getReverseBytesReader() { - return new ReverseRandomAccessReader(dataOutput.toDataInput()); + // we are using the writable buffers because we need to access the internal byte array + List buffers = dataOutput.toWriteableBufferList(); + if (buffers.size() == 1) { + return new ReverseBytesReader(buffers.get(0).array()); + } + return new ReverseRandomAccessReader(new ByteBuffersDataInput(buffers)); } @Override From 34aabf23319852b7210e3c9129f05727be687594 Mon Sep 17 00:00:00 2001 From: dungba88 Date: Tue, 28 Nov 2023 16:25:52 +0900 Subject: [PATCH 32/45] tidy code --- .../src/java/org/apache/lucene/util/fst/ReadWriteDataOutput.java | 1 - 1 file changed, 1 deletion(-) diff --git 
a/lucene/core/src/java/org/apache/lucene/util/fst/ReadWriteDataOutput.java b/lucene/core/src/java/org/apache/lucene/util/fst/ReadWriteDataOutput.java index 3382a77e3ad2..56465948ef42 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/ReadWriteDataOutput.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/ReadWriteDataOutput.java @@ -19,7 +19,6 @@ import java.io.IOException; import java.nio.ByteBuffer; import java.util.List; - import org.apache.lucene.store.ByteBuffersDataInput; import org.apache.lucene.store.ByteBuffersDataOutput; import org.apache.lucene.store.DataOutput; From 7ae16bb20a1f618d3bb815b66ece03fd455196a4 Mon Sep 17 00:00:00 2001 From: dungba88 Date: Tue, 28 Nov 2023 19:01:37 +0900 Subject: [PATCH 33/45] Freeze the DataOutput once finished() --- .../apache/lucene/util/fst/FSTCompiler.java | 4 ++++ .../org/apache/lucene/util/fst/Freezable.java | 24 +++++++++++++++++++ .../lucene/util/fst/ReadWriteDataOutput.java | 21 +++++++++++----- 3 files changed, 43 insertions(+), 6 deletions(-) create mode 100644 lucene/core/src/java/org/apache/lucene/util/fst/Freezable.java diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java index 40d54f8783b4..fcb0901c6363 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java @@ -911,6 +911,10 @@ void finish(long newStartNode) { } fst.metadata.startNode = newStartNode; fst.metadata.numBytes = numBytesWritten; + // freeze the dataOutput if applicable + if (dataOutput instanceof Freezable) { + ((Freezable) dataOutput).freeze(); + } } private boolean validOutput(T output) { diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/Freezable.java b/lucene/core/src/java/org/apache/lucene/util/fst/Freezable.java new file mode 100644 index 000000000000..28a07161791b --- /dev/null +++ 
b/lucene/core/src/java/org/apache/lucene/util/fst/Freezable.java @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.util.fst; + +/** Represent a datastructure that can be frozen (i.e., no longer modified). */ +interface Freezable { + + /** Freeze the datastructure */ + void freeze(); +} diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/ReadWriteDataOutput.java b/lucene/core/src/java/org/apache/lucene/util/fst/ReadWriteDataOutput.java index 56465948ef42..b49774b1e0da 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/ReadWriteDataOutput.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/ReadWriteDataOutput.java @@ -27,9 +27,11 @@ * An adapter class to use {@link ByteBuffersDataOutput} as a {@link FSTReader}. 
It allows the FST * to be readable immediately after writing */ -final class ReadWriteDataOutput extends DataOutput implements FSTReader { +final class ReadWriteDataOutput extends DataOutput implements FSTReader, Freezable { private final ByteBuffersDataOutput dataOutput; + private ByteBuffersDataInput dataInput; + private List byteBuffers; public ReadWriteDataOutput(ByteBuffersDataOutput dataOutput) { this.dataOutput = dataOutput; @@ -50,14 +52,21 @@ public long ramBytesUsed() { return dataOutput.ramBytesUsed(); } + @Override + public void freeze() { + // these operations are costly, so we want to compute it once and cache + byteBuffers = dataOutput.toWriteableBufferList(); + dataInput = new ByteBuffersDataInput(byteBuffers); + } + @Override public FST.BytesReader getReverseBytesReader() { - // we are using the writable buffers because we need to access the internal byte array - List buffers = dataOutput.toWriteableBufferList(); - if (buffers.size() == 1) { - return new ReverseBytesReader(buffers.get(0).array()); + assert dataInput != null; // freeze() must be called first + if (byteBuffers.size() == 1) { + // use a faster implementation for single-block case + return new ReverseBytesReader(byteBuffers.get(0).array()); } - return new ReverseRandomAccessReader(new ByteBuffersDataInput(buffers)); + return new ReverseRandomAccessReader(dataInput); } @Override From 4823bc1c31089952a145b5b875fe8f7889d0a356 Mon Sep 17 00:00:00 2001 From: dungba88 Date: Tue, 28 Nov 2023 21:13:14 +0900 Subject: [PATCH 34/45] Refactor --- .../lucene/util/fst/ReadWriteDataOutput.java | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/ReadWriteDataOutput.java b/lucene/core/src/java/org/apache/lucene/util/fst/ReadWriteDataOutput.java index b49774b1e0da..516f345a1e85 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/ReadWriteDataOutput.java +++ 
b/lucene/core/src/java/org/apache/lucene/util/fst/ReadWriteDataOutput.java @@ -30,8 +30,10 @@ final class ReadWriteDataOutput extends DataOutput implements FSTReader, Freezable { private final ByteBuffersDataOutput dataOutput; + // the DataInput to read from in case the DataOutput has multiple blocks private ByteBuffersDataInput dataInput; - private List byteBuffers; + // the ByteBuffers to read from in case the DataOutput has a single block + private ByteBuffer byteBuffer; public ReadWriteDataOutput(ByteBuffersDataOutput dataOutput) { this.dataOutput = dataOutput; @@ -55,17 +57,21 @@ public long ramBytesUsed() { @Override public void freeze() { // these operations are costly, so we want to compute it once and cache - byteBuffers = dataOutput.toWriteableBufferList(); - dataInput = new ByteBuffersDataInput(byteBuffers); + List byteBuffers = dataOutput.toWriteableBufferList(); + if (byteBuffers.size() == 1) { + byteBuffer = byteBuffers.get(0); + } else { + dataInput = new ByteBuffersDataInput(byteBuffers); + } } @Override public FST.BytesReader getReverseBytesReader() { - assert dataInput != null; // freeze() must be called first - if (byteBuffers.size() == 1) { + if (byteBuffer != null) { // use a faster implementation for single-block case - return new ReverseBytesReader(byteBuffers.get(0).array()); + return new ReverseBytesReader(byteBuffer.array()); } + assert dataInput != null; // freeze() must be called first return new ReverseRandomAccessReader(dataInput); } From 817ae0833784f7bdd7b02289c76c60dfeaf1b2fb Mon Sep 17 00:00:00 2001 From: dungba88 Date: Tue, 28 Nov 2023 21:25:22 +0900 Subject: [PATCH 35/45] freeze the DataOutput before use --- lucene/core/src/java/org/apache/lucene/util/fst/FST.java | 6 ++++-- .../src/java/org/apache/lucene/util/fst/OnHeapFSTStore.java | 1 + 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java index 
a0c0b91001ac..3209a87737f6 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java @@ -442,12 +442,14 @@ public static FSTMetadata readMetadata(DataInput metaIn, Outputs outpu if (metaIn.readByte() == 1) { // accepts empty string // 1 KB blocks: - DataOutput emptyBytes = getOnHeapReaderWriter(10); + ReadWriteDataOutput emptyBytes = (ReadWriteDataOutput) getOnHeapReaderWriter(10); int numBytes = metaIn.readVInt(); emptyBytes.copyBytes(metaIn, numBytes); + emptyBytes.freeze(); + // De-serialize empty-string output: - BytesReader reader = ((FSTReader) emptyBytes).getReverseBytesReader(); + BytesReader reader = emptyBytes.getReverseBytesReader(); // NoOutputs uses 0 bytes when writing its output, // so we have to check here else BytesStore gets // angry: diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/OnHeapFSTStore.java b/lucene/core/src/java/org/apache/lucene/util/fst/OnHeapFSTStore.java index a26631e2b022..949babdaa883 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/OnHeapFSTStore.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/OnHeapFSTStore.java @@ -58,6 +58,7 @@ public FSTStore init(DataInput in, long numBytes) throws IOException { // FST is big: we need multiple pages dataOutput = (ReadWriteDataOutput) getOnHeapReaderWriter(maxBlockBits); dataOutput.copyBytes(in, numBytes); + dataOutput.freeze(); } else { // FST fits into a single block: use ByteArrayBytesStoreReader for less overhead bytesArray = new byte[(int) numBytes]; From 14c977bae217ac809e732913a075c6af0334e5fa Mon Sep 17 00:00:00 2001 From: dungba88 Date: Mon, 4 Dec 2023 18:22:16 +0900 Subject: [PATCH 36/45] Improvement of ReadWriteDataOutput --- .../lucene/store/ByteBuffersDataOutput.java | 4 ++ .../lucene/util/fst/ReadWriteDataOutput.java | 68 +++++++++++++++---- 2 files changed, 57 insertions(+), 15 deletions(-) diff --git 
a/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataOutput.java b/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataOutput.java index eaa0929848db..147fc234129e 100644 --- a/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataOutput.java +++ b/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataOutput.java @@ -49,6 +49,10 @@ public final class ByteBuffersDataOutput extends DataOutput implements Accountab throw new RuntimeException("reset() is not allowed on this buffer."); }; + public int getBlockBits() { + return blockBits; + } + /** * An implementation of a {@link ByteBuffer} allocation and recycling policy. The blocks are * recycled if exactly the same size is requested, otherwise they're released to be GCed. diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/ReadWriteDataOutput.java b/lucene/core/src/java/org/apache/lucene/util/fst/ReadWriteDataOutput.java index 516f345a1e85..7100cf69061b 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/ReadWriteDataOutput.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/ReadWriteDataOutput.java @@ -19,7 +19,6 @@ import java.io.IOException; import java.nio.ByteBuffer; import java.util.List; -import org.apache.lucene.store.ByteBuffersDataInput; import org.apache.lucene.store.ByteBuffersDataOutput; import org.apache.lucene.store.DataOutput; @@ -30,13 +29,16 @@ final class ReadWriteDataOutput extends DataOutput implements FSTReader, Freezable { private final ByteBuffersDataOutput dataOutput; - // the DataInput to read from in case the DataOutput has multiple blocks - private ByteBuffersDataInput dataInput; - // the ByteBuffers to read from in case the DataOutput has a single block - private ByteBuffer byteBuffer; + private final int blockBits; + private final int blockSize; + private final int blockMask; + private List byteBuffers; public ReadWriteDataOutput(ByteBuffersDataOutput dataOutput) { this.dataOutput = dataOutput; + this.blockBits = 
dataOutput.getBlockBits(); + this.blockSize = 1 << blockBits; + this.blockMask = blockSize - 1; } @Override @@ -57,22 +59,58 @@ public long ramBytesUsed() { @Override public void freeze() { // these operations are costly, so we want to compute it once and cache - List byteBuffers = dataOutput.toWriteableBufferList(); - if (byteBuffers.size() == 1) { - byteBuffer = byteBuffers.get(0); - } else { - dataInput = new ByteBuffersDataInput(byteBuffers); - } + this.byteBuffers = dataOutput.toWriteableBufferList(); } @Override public FST.BytesReader getReverseBytesReader() { - if (byteBuffer != null) { + assert byteBuffers != null; // freeze() must be called first + if (byteBuffers.size() == 1) { // use a faster implementation for single-block case - return new ReverseBytesReader(byteBuffer.array()); + return new ReverseBytesReader(byteBuffers.get(0).array()); } - assert dataInput != null; // freeze() must be called first - return new ReverseRandomAccessReader(dataInput); + return new FST.BytesReader() { + private byte[] current = byteBuffers.get(0).array(); + private int nextBuffer = -1; + private int nextRead = 0; + + @Override + public byte readByte() { + if (nextRead == -1) { + current = byteBuffers.get(nextBuffer--).array(); + nextRead = blockSize - 1; + } + return current[nextRead--]; + } + + @Override + public void skipBytes(long count) { + setPosition(getPosition() - count); + } + + @Override + public void readBytes(byte[] b, int offset, int len) { + for (int i = 0; i < len; i++) { + b[offset + i] = readByte(); + } + } + + @Override + public long getPosition() { + return ((long) nextBuffer + 1) * blockSize + nextRead; + } + + @Override + public void setPosition(long pos) { + int bufferIndex = (int) (pos >> blockBits); + if (nextBuffer != bufferIndex - 1) { + nextBuffer = bufferIndex - 1; + current = byteBuffers.get(bufferIndex).array(); + } + nextRead = (int) (pos & blockMask); + assert getPosition() == pos : "pos=" + pos + " getPos()=" + getPosition(); + } + }; } 
@Override From b7b7b0b165f20c2f8cfb6d78ac4a45168911d302 Mon Sep 17 00:00:00 2001 From: dungba88 Date: Mon, 4 Dec 2023 19:51:15 +0900 Subject: [PATCH 37/45] tidy code --- .../java/org/apache/lucene/store/ByteBuffersDataOutput.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataOutput.java b/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataOutput.java index 147fc234129e..33daa51389da 100644 --- a/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataOutput.java +++ b/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataOutput.java @@ -49,9 +49,9 @@ public final class ByteBuffersDataOutput extends DataOutput implements Accountab throw new RuntimeException("reset() is not allowed on this buffer."); }; - public int getBlockBits() { - return blockBits; - } + public int getBlockBits() { + return blockBits; + } /** * An implementation of a {@link ByteBuffer} allocation and recycling policy. 
The blocks are From 6f7a8c58041de7ccd39924c4317c0e046dcb15f7 Mon Sep 17 00:00:00 2001 From: dungba88 Date: Wed, 6 Dec 2023 00:56:15 +0900 Subject: [PATCH 38/45] Address comments and add off-heap FST tests --- .../java/org/apache/lucene/util/fst/FST.java | 4 +- .../apache/lucene/util/fst/FSTCompiler.java | 26 +- .../org/apache/lucene/util/fst/Freezable.java | 24 -- .../util/fst/GrowableByteArrayDataOutput.java | 5 +- .../lucene/util/fst/ReadWriteDataOutput.java | 12 +- .../org/apache/lucene/util/fst/Test2BFST.java | 3 - .../lucene/util/fst/Test2BFSTOffHeap.java | 341 ++++++++++++++++++ .../util/fst/TestFSTDataOutputWriter.java | 230 ------------ .../lucene/tests/util/fst/FSTTester.java | 33 +- 9 files changed, 399 insertions(+), 279 deletions(-) delete mode 100644 lucene/core/src/java/org/apache/lucene/util/fst/Freezable.java create mode 100644 lucene/core/src/test/org/apache/lucene/util/fst/Test2BFSTOffHeap.java delete mode 100644 lucene/core/src/test/org/apache/lucene/util/fst/TestFSTDataOutputWriter.java diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java index 3209a87737f6..3833fc264bb2 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java @@ -503,7 +503,9 @@ public FSTMetadata getMetadata() { } /** - * Save the FST to DataOutput. + * Save the FST to DataOutput. If you use an {@link org.apache.lucene.store.IndexOutput} to build + * the FST, then you should not and do not need to call this method, as the FST is already saved. + * Doing so will throw an {@link UnsupportedOperationException}. 
* * @param metaOut the DataOutput to write the metadata to * @param out the DataOutput to write the FST bytes to diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java index ed6a4b83a598..0e16b1cf2b62 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java @@ -56,6 +56,17 @@ *

<p>FSTs larger than 2.1GB are now possible (as of Lucene 4.2). FSTs containing more than 2.1B * nodes are also now possible, however they cannot be packed. * + *

<p>It now supports 3 different workflows: + * + *

<p>- Build FST and use it immediately entirely in RAM and then discard it + * + *

<p>- Build FST and use it immediately entirely in RAM and also save it to other DataOutput, and + * load it later and use it + * + *

- Build FST but stream it immediately to disk (except the FSTMetaData, to be saved at the + * end). In order to use it, you need to construct the corresponding DataInput and use the FST + * constructor to read it. + * * @lucene.experimental */ public class FSTCompiler { @@ -205,12 +216,13 @@ public long ramBytesUsed() { @Override public FST.BytesReader getReverseBytesReader() { throw new UnsupportedOperationException( - "NullFSTReader does not support getReverseBytesReader()"); + "FST was not constructed with getOnHeapReaderWriter()"); } @Override public void writeTo(DataOutput out) { - throw new UnsupportedOperationException("NullFSTReader does not support writeTo(DataOutput)"); + throw new UnsupportedOperationException( + "FST was not constructed with getOnHeapReaderWriter()"); } } @@ -226,7 +238,7 @@ public static class Builder { private final Outputs outputs; private double suffixRAMLimitMB = 32.0; private boolean allowFixedLengthArcs = true; - private DataOutput dataOutput = getOnHeapReaderWriter(15); + private DataOutput dataOutput; private float directAddressingMaxOversizingFactor = DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR; /** @@ -315,6 +327,10 @@ public Builder directAddressingMaxOversizingFactor(float factor) { /** Creates a new {@link FSTCompiler}. 
*/ public FSTCompiler build() throws IOException { + // create a default DataOutput if not specified + if (dataOutput == null) { + dataOutput = getOnHeapReaderWriter(15); + } return new FSTCompiler<>( inputType, suffixRAMLimitMB, @@ -912,8 +928,8 @@ void finish(long newStartNode) { fst.metadata.startNode = newStartNode; fst.metadata.numBytes = numBytesWritten; // freeze the dataOutput if applicable - if (dataOutput instanceof Freezable) { - ((Freezable) dataOutput).freeze(); + if (dataOutput instanceof ReadWriteDataOutput) { + ((ReadWriteDataOutput) dataOutput).freeze(); } } diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/Freezable.java b/lucene/core/src/java/org/apache/lucene/util/fst/Freezable.java deleted file mode 100644 index 28a07161791b..000000000000 --- a/lucene/core/src/java/org/apache/lucene/util/fst/Freezable.java +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.util.fst; - -/** Represent a datastructure that can be frozen (i.e., no longer modified). 
*/ -interface Freezable { - - /** Freeze the datastructure */ - void freeze(); -} diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java b/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java index e253d5129db6..e6ea871aa06b 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/GrowableByteArrayDataOutput.java @@ -23,8 +23,9 @@ import org.apache.lucene.util.RamUsageEstimator; // Storing a single contiguous byte[] for the current node of the FST we are writing. The byte[] -// will only grow, never -// shrink. +// will only grow, never shrink. +// Note: This is only safe for usage that is bounded in the number of bytes written. Do not make +// this public! Public users should instead use ByteBuffersDataOutput final class GrowableByteArrayDataOutput extends DataOutput implements Accountable { private static final long BASE_RAM_BYTES_USED = diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/ReadWriteDataOutput.java b/lucene/core/src/java/org/apache/lucene/util/fst/ReadWriteDataOutput.java index 516f345a1e85..7b680b49be32 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/ReadWriteDataOutput.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/ReadWriteDataOutput.java @@ -27,13 +27,15 @@ * An adapter class to use {@link ByteBuffersDataOutput} as a {@link FSTReader}. 
It allows the FST * to be readable immediately after writing */ -final class ReadWriteDataOutput extends DataOutput implements FSTReader, Freezable { +final class ReadWriteDataOutput extends DataOutput implements FSTReader { private final ByteBuffersDataOutput dataOutput; // the DataInput to read from in case the DataOutput has multiple blocks private ByteBuffersDataInput dataInput; // the ByteBuffers to read from in case the DataOutput has a single block private ByteBuffer byteBuffer; + // whether this DataOutput is already frozen + private boolean frozen; public ReadWriteDataOutput(ByteBuffersDataOutput dataOutput) { this.dataOutput = dataOutput; @@ -41,11 +43,17 @@ public ReadWriteDataOutput(ByteBuffersDataOutput dataOutput) { @Override public void writeByte(byte b) { + if (frozen) { + throw new IllegalStateException("Already frozen"); + } dataOutput.writeByte(b); } @Override public void writeBytes(byte[] b, int offset, int length) { + if (frozen) { + throw new IllegalStateException("Already frozen"); + } dataOutput.writeBytes(b, offset, length); } @@ -54,8 +62,8 @@ public long ramBytesUsed() { return dataOutput.ramBytesUsed(); } - @Override public void freeze() { + frozen = true; // these operations are costly, so we want to compute it once and cache List byteBuffers = dataOutput.toWriteableBufferList(); if (byteBuffers.size() == 1) { diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java b/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java index 2e0aefbc06d8..b2758ca526e9 100644 --- a/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java +++ b/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java @@ -31,9 +31,6 @@ import org.apache.lucene.util.IntsRef; import org.junit.Ignore; -// TODO: soon we will be able to run this test with small heaps! 
Once #12633 and #12543 -// are done -// // Run something like this: // ./gradlew test --tests Test2BFST -Dtests.heapsize=32g -Dtests.verbose=true --max-workers=1 diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFSTOffHeap.java b/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFSTOffHeap.java new file mode 100644 index 000000000000..9f3cbaf98a62 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFSTOffHeap.java @@ -0,0 +1,341 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.util.fst; + +import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite; +import java.util.Arrays; +import java.util.Random; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.MMapDirectory; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.tests.util.LuceneTestCase.SuppressSysoutChecks; +import org.apache.lucene.tests.util.TimeUnits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; + +// Similar to Test2BFST but will build and read the FST off-heap and can be run with small heap + +// Run something like this: +// ./gradlew test --tests Test2BFSTOffHeap -Dtests.verbose=true --max-workers=1 + +// @Ignore("Requires tons of heap to run (30 GB hits OOME but 35 GB passes after ~4.5 hours)") +@SuppressSysoutChecks(bugUrl = "test prints helpful progress reports with time") +@TimeoutSuite(millis = 100 * TimeUnits.HOUR) +public class Test2BFSTOffHeap extends LuceneTestCase { + + private static long LIMIT = 3L * 1024 * 1024 * 1024; + + public void test() throws Exception { + assumeWorkingMMapOnWindows(); + + int[] ints = new int[7]; + IntsRef input = new IntsRef(ints, 0, ints.length); + long seed = random().nextLong(); + + Directory dir = new MMapDirectory(createTempDir("2BFSTOffHeap")); + + // Build FST w/ NoOutputs and stop when nodeCount > 2.2B + { + System.out.println("\nTEST: ~2.2B nodes; output=NO_OUTPUTS"); + Outputs outputs = NoOutputs.getSingleton(); + Object NO_OUTPUT = outputs.getNoOutput(); + IndexOutput indexOutput = dir.createOutput("fst", IOContext.DEFAULT); + final FSTCompiler fstCompiler = + new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).dataOutput(indexOutput).build(); + + int count = 0; + Random r = new Random(seed); + int[] ints2 = new int[200]; + IntsRef input2 = new IntsRef(ints2, 0, 
ints2.length); + long startTime = System.nanoTime(); + while (true) { + // System.out.println("add: " + input + " -> " + output); + for (int i = 10; i < ints2.length; i++) { + ints2[i] = r.nextInt(256); + } + fstCompiler.add(input2, NO_OUTPUT); + count++; + if (count % 100000 == 0) { + System.out.println( + count + + ": " + + fstCompiler.fstRamBytesUsed() + + " RAM bytes used; " + + fstCompiler.fstSizeInBytes() + + " FST bytes; " + + fstCompiler.getNodeCount() + + " nodes; took " + + (long) ((System.nanoTime() - startTime) / 1e9) + + " seconds"); + } + if (fstCompiler.getNodeCount() > Integer.MAX_VALUE + 100L * 1024 * 1024) { + break; + } + nextInput(r, ints2); + } + + FST fst = fstCompiler.compile(); + indexOutput.close(); + try (IndexInput indexInput = dir.openInput("fst", IOContext.DEFAULT)) { + fst = new FST<>(fst.getMetadata(), indexInput, new OffHeapFSTStore()); + + for (int verify = 0; verify < 2; verify++) { + System.out.println( + "\nTEST: now verify [fst size=" + + fst.numBytes() + + "; nodeCount=" + + fstCompiler.getNodeCount() + + "; arcCount=" + + fstCompiler.getArcCount() + + "]"); + + Arrays.fill(ints2, 0); + r = new Random(seed); + + startTime = System.nanoTime(); + for (int i = 0; i < count; i++) { + if (i % 1000000 == 0) { + System.out.println( + i + "...: took " + (long) ((System.nanoTime() - startTime) / 1e9) + " seconds"); + } + for (int j = 10; j < ints2.length; j++) { + ints2[j] = r.nextInt(256); + } + assertEquals(NO_OUTPUT, Util.get(fst, input2)); + nextInput(r, ints2); + } + + System.out.println("\nTEST: enum all input/outputs"); + IntsRefFSTEnum fstEnum = new IntsRefFSTEnum<>(fst); + + Arrays.fill(ints2, 0); + r = new Random(seed); + int upto = 0; + while (true) { + IntsRefFSTEnum.InputOutput pair = fstEnum.next(); + if (pair == null) { + break; + } + for (int j = 10; j < ints2.length; j++) { + ints2[j] = r.nextInt(256); + } + assertEquals(input2, pair.input); + assertEquals(NO_OUTPUT, pair.output); + upto++; + nextInput(r, ints2); + } + 
assertEquals(count, upto); + } + } finally { + dir.deleteFile("fst"); + } + } + + // Build FST w/ ByteSequenceOutputs and stop when FST + // size = 3GB + { + System.out.println("\nTEST: 3 GB size; outputs=bytes"); + IndexOutput indexOutput = dir.createOutput("fst", IOContext.DEFAULT); + Outputs outputs = ByteSequenceOutputs.getSingleton(); + final FSTCompiler fstCompiler = + new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).dataOutput(indexOutput).build(); + + byte[] outputBytes = new byte[20]; + BytesRef output = new BytesRef(outputBytes); + Arrays.fill(ints, 0); + int count = 0; + Random r = new Random(seed); + while (true) { + r.nextBytes(outputBytes); + // System.out.println("add: " + input + " -> " + output); + fstCompiler.add(input, BytesRef.deepCopyOf(output)); + count++; + if (count % 10000 == 0) { + long size = fstCompiler.fstSizeInBytes(); + if (count % 1000000 == 0) { + System.out.println(count + "...: " + size + " bytes"); + } + if (size > LIMIT) { + break; + } + } + nextInput(r, ints); + } + + FST fst = fstCompiler.compile(); + indexOutput.close(); + try (IndexInput indexInput = dir.openInput("fst", IOContext.DEFAULT)) { + fst = new FST<>(fst.getMetadata(), indexInput, new OffHeapFSTStore()); + for (int verify = 0; verify < 2; verify++) { + + System.out.println( + "\nTEST: now verify [fst size=" + + fst.numBytes() + + "; nodeCount=" + + fstCompiler.getNodeCount() + + "; arcCount=" + + fstCompiler.getArcCount() + + "]"); + + r = new Random(seed); + Arrays.fill(ints, 0); + + long startTime = System.nanoTime(); + + for (int i = 0; i < count; i++) { + if (i % 1000000 == 0) { + System.out.println( + i + "...: took " + (long) ((System.nanoTime() - startTime) / 1e9) + " seconds"); + } + r.nextBytes(outputBytes); + assertEquals(output, Util.get(fst, input)); + nextInput(r, ints); + } + + System.out.println("\nTEST: enum all input/outputs"); + IntsRefFSTEnum fstEnum = new IntsRefFSTEnum<>(fst); + + Arrays.fill(ints, 0); + r = new Random(seed); + int upto 
= 0; + while (true) { + IntsRefFSTEnum.InputOutput pair = fstEnum.next(); + if (pair == null) { + break; + } + assertEquals(input, pair.input); + r.nextBytes(outputBytes); + assertEquals(output, pair.output); + upto++; + nextInput(r, ints); + } + assertEquals(count, upto); + } + } finally { + dir.deleteFile("fst"); + } + } + + // Build FST w/ PositiveIntOutputs and stop when FST + // size = 3GB + { + IndexOutput indexOutput = dir.createOutput("fst", IOContext.DEFAULT); + System.out.println("\nTEST: 3 GB size; outputs=long"); + Outputs outputs = PositiveIntOutputs.getSingleton(); + final FSTCompiler fstCompiler = + new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).dataOutput(indexOutput).build(); + + long output = 1; + + Arrays.fill(ints, 0); + int count = 0; + Random r = new Random(seed); + while (true) { + // System.out.println("add: " + input + " -> " + output); + fstCompiler.add(input, output); + output += 1 + r.nextInt(10); + count++; + if (count % 10000 == 0) { + long size = fstCompiler.fstSizeInBytes(); + if (count % 1000000 == 0) { + System.out.println(count + "...: " + size + " bytes"); + } + if (size > LIMIT) { + break; + } + } + nextInput(r, ints); + } + + FST fst = fstCompiler.compile(); + indexOutput.close(); + try (IndexInput indexInput = dir.openInput("fst", IOContext.DEFAULT)) { + fst = new FST<>(fst.getMetadata(), indexInput, new OffHeapFSTStore()); + + for (int verify = 0; verify < 2; verify++) { + + System.out.println( + "\nTEST: now verify [fst size=" + + fst.numBytes() + + "; nodeCount=" + + fstCompiler.getNodeCount() + + "; arcCount=" + + fstCompiler.getArcCount() + + "]"); + + Arrays.fill(ints, 0); + + output = 1; + r = new Random(seed); + long startTime = System.nanoTime(); + for (int i = 0; i < count; i++) { + if (i % 1000000 == 0) { + System.out.println( + i + "...: took " + (long) ((System.nanoTime() - startTime) / 1e9) + " seconds"); + } + + assertEquals(output, Util.get(fst, input).longValue()); + + output += 1 + r.nextInt(10); + 
nextInput(r, ints); + } + + System.out.println("\nTEST: enum all input/outputs"); + IntsRefFSTEnum fstEnum = new IntsRefFSTEnum<>(fst); + + Arrays.fill(ints, 0); + r = new Random(seed); + int upto = 0; + output = 1; + while (true) { + IntsRefFSTEnum.InputOutput pair = fstEnum.next(); + if (pair == null) { + break; + } + assertEquals(input, pair.input); + assertEquals(output, pair.output.longValue()); + output += 1 + r.nextInt(10); + upto++; + nextInput(r, ints); + } + assertEquals(count, upto); + } + } finally { + dir.deleteFile("fst"); + } + } + dir.close(); + } + + private void nextInput(Random r, int[] ints) { + int downTo = 6; + while (downTo >= 0) { + // Must add random amounts (and not just 1) because + // otherwise FST outsmarts us and remains tiny: + ints[downTo] += 1 + r.nextInt(10); + if (ints[downTo] < 256) { + break; + } else { + ints[downTo] = 0; + downTo--; + } + } + } +} diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTDataOutputWriter.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTDataOutputWriter.java deleted file mode 100644 index 8bb3f5e3629e..000000000000 --- a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTDataOutputWriter.java +++ /dev/null @@ -1,230 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.util.fst; - -import static org.apache.lucene.tests.util.fst.FSTTester.toIntsRef; - -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.Random; -import org.apache.lucene.store.DataInput; -import org.apache.lucene.store.DataOutput; -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.InputStreamDataInput; -import org.apache.lucene.store.OutputStreamDataOutput; -import org.apache.lucene.tests.store.MockDirectoryWrapper; -import org.apache.lucene.tests.util.LuceneTestCase; -import org.apache.lucene.tests.util.TestUtil; -import org.apache.lucene.tests.util.fst.FSTTester; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.IntsRef; - -public class TestFSTDataOutputWriter extends LuceneTestCase { - - private MockDirectoryWrapper dir; - - @Override - public void setUp() throws Exception { - super.setUp(); - dir = newMockDirectory(); - } - - @Override - public void tearDown() throws Exception { - // can be null if we force simpletext (funky, some kind of bug in test runner maybe) - if (dir != null) { - dir.close(); - } - super.tearDown(); - } - - public void testRandom() throws Exception { - - final int iters = atLeast(10); - final int maxBytes = TEST_NIGHTLY ? 
200000 : 20000; - for (int iter = 0; iter < iters; iter++) { - final int numBytes = TestUtil.nextInt(random(), 1, maxBytes); - final byte[] expected = new byte[numBytes]; - final ByteArrayOutputStream baos = new ByteArrayOutputStream(); - final DataOutput dataOutput = new OutputStreamDataOutput(baos); - if (VERBOSE) { - System.out.println("TEST: iter=" + iter + " numBytes=" + numBytes); - } - - int pos = 0; - while (pos < numBytes) { - int op = random().nextInt(2); - if (VERBOSE) { - System.out.println(" cycle pos=" + pos); - } - switch (op) { - case 0: - { - // write random byte - byte b = (byte) random().nextInt(256); - if (VERBOSE) { - System.out.println(" writeByte b=" + b); - } - - expected[pos++] = b; - dataOutput.writeByte(b); - } - break; - - case 1: - { - // write random byte[] - int len = random().nextInt(Math.min(numBytes - pos, 100)); - byte[] temp = new byte[len]; - random().nextBytes(temp); - if (VERBOSE) { - System.out.println(" writeBytes len=" + len + " bytes=" + Arrays.toString(temp)); - } - System.arraycopy(temp, 0, expected, pos, temp.length); - dataOutput.writeBytes(temp, 0, temp.length); - pos += len; - } - break; - } - - assertEquals(pos, baos.toByteArray().length); - } - for (int i = 0; i < numBytes; i++) { - assertEquals("byte @ index=" + i, expected[i], baos.toByteArray()[i]); - } - } - } - - public void testBasicFSA() throws IOException { - String[] strings2 = - new String[] { - "station", "commotion", "elation", "elastic", "plastic", "stop", "ftop", "ftation" - }; - IntsRef[] terms2 = new IntsRef[strings2.length]; - // we will also test writing multiple FST to a single byte array - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - for (int inputMode = 0; inputMode < 2; inputMode++) { - if (VERBOSE) { - System.out.println("TEST: inputMode=" + inputModeToString(inputMode)); - } - - for (int idx = 0; idx < strings2.length; idx++) { - terms2[idx] = toIntsRef(strings2[idx], inputMode); - } - Arrays.sort(terms2); - - // Test 
pre-determined FST sizes to make sure we haven't lost minimality (at least on this - // trivial set of terms): - - // FSA - { - final Outputs outputs = NoOutputs.getSingleton(); - final Object NO_OUTPUT = outputs.getNoOutput(); - final List> pairs = new ArrayList<>(terms2.length); - for (IntsRef term : terms2) { - pairs.add(new FSTTester.InputOutput<>(term, NO_OUTPUT)); - } - FSTTester tester = - new DataOutputFSTTester<>(random(), dir, inputMode, pairs, outputs, baos); - FST fst = tester.doTest(); - assertNotNull(fst); - assertEquals(22, tester.nodeCount); - assertEquals(27, tester.arcCount); - } - - // FST ord pos int - { - final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); - final List> pairs = new ArrayList<>(terms2.length); - for (int idx = 0; idx < terms2.length; idx++) { - pairs.add(new FSTTester.InputOutput<>(terms2[idx], (long) idx)); - } - FSTTester tester = - new DataOutputFSTTester<>(random(), dir, inputMode, pairs, outputs, baos); - final FST fst = tester.doTest(); - assertNotNull(fst); - assertEquals(22, tester.nodeCount); - assertEquals(27, tester.arcCount); - } - - // FST byte sequence ord - { - final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); - final List> pairs = new ArrayList<>(terms2.length); - for (int idx = 0; idx < terms2.length; idx++) { - final BytesRef output = newBytesRef(Integer.toString(idx)); - pairs.add(new FSTTester.InputOutput<>(terms2[idx], output)); - } - FSTTester tester = - new DataOutputFSTTester<>(random(), dir, inputMode, pairs, outputs, baos); - final FST fst = tester.doTest(); - assertNotNull(fst); - assertEquals(24, tester.nodeCount); - assertEquals(30, tester.arcCount); - } - } - } - - class DataOutputFSTTester extends FSTTester { - - private final ByteArrayOutputStream baos; - private int previousOffset; - - public DataOutputFSTTester( - Random random, - Directory dir, - int inputMode, - List> pairs, - Outputs outputs, - ByteArrayOutputStream baos) { - super(random, dir, 
inputMode, pairs, outputs); - this.baos = baos; - } - - @Override - protected FSTCompiler.Builder getFSTBuilder() { - // as the byte array could already contain another FST bytes, we should get the current offset - // to know where to start reading from - this.previousOffset = baos.size(); - return super.getFSTBuilder().dataOutput(new OutputStreamDataOutput(baos)); - } - - @Override - protected FST compile(FSTCompiler fstCompiler) throws IOException { - FST fst = fstCompiler.compile(); - - // the returned FST is not readable thus we need to reconstruct one with FSTStore - DataInput dataIn = - new InputStreamDataInput( - new ByteArrayInputStream( - baos.toByteArray(), previousOffset, baos.size() - previousOffset)); - return new FST<>(fst.getMetadata(), dataIn, new OnHeapFSTStore(5)); - } - } - - String inputModeToString(int mode) { - if (mode == 0) { - return "utf8"; - } else { - return "utf32"; - } - } -} diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/util/fst/FSTTester.java b/lucene/test-framework/src/java/org/apache/lucene/tests/util/fst/FSTTester.java index 03f6f3502f56..8afbdae5b9d5 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/util/fst/FSTTester.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/util/fst/FSTTester.java @@ -255,7 +255,17 @@ private T randomAcceptedWord(FST fst, IntsRefBuilder in) throws IOException { public FST doTest() throws IOException { - final FSTCompiler fstCompiler = getFSTBuilder().build(); + IndexOutput indexOutput = null; + boolean useOffHeap = true; // random.nextBoolean(); + if (useOffHeap) { + indexOutput = dir.createOutput("fstOffHeap.bin", IOContext.DEFAULT); + } + + final FSTCompiler fstCompiler = + new FSTCompiler.Builder<>( + inputMode == 0 ? 
FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, outputs) + .dataOutput(indexOutput) + .build(); for (InputOutput pair : pairs) { if (pair.output instanceof List) { @@ -270,9 +280,17 @@ public FST doTest() throws IOException { fstCompiler.add(pair.input, pair.output); } } - FST fst = compile(fstCompiler); + FST fst = fstCompiler.compile(); + ; - if (random.nextBoolean() && fst != null) { + if (useOffHeap) { + indexOutput.close(); + try (IndexInput in = dir.openInput("fstOffHeap.bin", IOContext.DEFAULT)) { + fst = new FST<>(fst.getMetadata(), in); + } finally { + dir.deleteFile("fstOffHeap.bin"); + } + } else if (random.nextBoolean() && fst != null) { IOContext context = LuceneTestCase.newIOContext(random); try (IndexOutput out = dir.createOutput("fst.bin", context)) { fst.save(out, out); @@ -313,15 +331,6 @@ public FST doTest() throws IOException { return fst; } - protected FST compile(FSTCompiler fstCompiler) throws IOException { - return fstCompiler.compile(); - } - - protected FSTCompiler.Builder getFSTBuilder() { - return new FSTCompiler.Builder<>( - inputMode == 0 ? 
FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, outputs); - } - protected boolean outputsEqual(T a, T b) { return a.equals(b); } From dba57ff0c6d5115e5ed77e9e9423bb089394b7b1 Mon Sep 17 00:00:00 2001 From: dungba88 Date: Wed, 6 Dec 2023 00:58:18 +0900 Subject: [PATCH 39/45] Remove the hardcoded random --- .../src/java/org/apache/lucene/tests/util/fst/FSTTester.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/util/fst/FSTTester.java b/lucene/test-framework/src/java/org/apache/lucene/tests/util/fst/FSTTester.java index 8afbdae5b9d5..cfcd0f80d5e1 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/util/fst/FSTTester.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/util/fst/FSTTester.java @@ -256,7 +256,7 @@ private T randomAcceptedWord(FST fst, IntsRefBuilder in) throws IOException { public FST doTest() throws IOException { IndexOutput indexOutput = null; - boolean useOffHeap = true; // random.nextBoolean(); + boolean useOffHeap = random.nextBoolean(); if (useOffHeap) { indexOutput = dir.createOutput("fstOffHeap.bin", IOContext.DEFAULT); } From 55522715248a2d3d816286f876096f7d35ac3402 Mon Sep 17 00:00:00 2001 From: dungba88 Date: Wed, 6 Dec 2023 01:06:49 +0900 Subject: [PATCH 40/45] Ignore the Test2BFSTOffHeap test --- .../src/test/org/apache/lucene/util/fst/Test2BFSTOffHeap.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFSTOffHeap.java b/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFSTOffHeap.java index 9f3cbaf98a62..090c99716f6b 100644 --- a/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFSTOffHeap.java +++ b/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFSTOffHeap.java @@ -29,13 +29,14 @@ import org.apache.lucene.tests.util.TimeUnits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IntsRef; +import org.junit.Ignore; // Similar to 
Test2BFST but will build and read the FST off-heap and can be run with small heap // Run something like this: // ./gradlew test --tests Test2BFSTOffHeap -Dtests.verbose=true --max-workers=1 -// @Ignore("Requires tons of heap to run (30 GB hits OOME but 35 GB passes after ~4.5 hours)") +@Ignore("Will take long time to run (~4.5 hours)") @SuppressSysoutChecks(bugUrl = "test prints helpful progress reports with time") @TimeoutSuite(millis = 100 * TimeUnits.HOUR) public class Test2BFSTOffHeap extends LuceneTestCase { From 6cc31c9265374d6a6ab216c4d2905483af40d20b Mon Sep 17 00:00:00 2001 From: dungba88 Date: Wed, 6 Dec 2023 10:56:05 +0900 Subject: [PATCH 41/45] Simplify ReadWriteDataOutput --- .../lucene/util/fst/ReadWriteDataOutput.java | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/ReadWriteDataOutput.java b/lucene/core/src/java/org/apache/lucene/util/fst/ReadWriteDataOutput.java index 7b680b49be32..83706e511084 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/ReadWriteDataOutput.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/ReadWriteDataOutput.java @@ -17,8 +17,6 @@ package org.apache.lucene.util.fst; import java.io.IOException; -import java.nio.ByteBuffer; -import java.util.List; import org.apache.lucene.store.ByteBuffersDataInput; import org.apache.lucene.store.ByteBuffersDataOutput; import org.apache.lucene.store.DataOutput; @@ -30,10 +28,8 @@ final class ReadWriteDataOutput extends DataOutput implements FSTReader { private final ByteBuffersDataOutput dataOutput; - // the DataInput to read from in case the DataOutput has multiple blocks + // the DataInput to read from once we finish writing private ByteBuffersDataInput dataInput; - // the ByteBuffers to read from in case the DataOutput has a single block - private ByteBuffer byteBuffer; // whether this DataOutput is already frozen private boolean frozen; @@ -64,21 +60,12 @@ public long ramBytesUsed() { 
public void freeze() { frozen = true; - // these operations are costly, so we want to compute it once and cache - List byteBuffers = dataOutput.toWriteableBufferList(); - if (byteBuffers.size() == 1) { - byteBuffer = byteBuffers.get(0); - } else { - dataInput = new ByteBuffersDataInput(byteBuffers); - } + // this operation are costly, so we want to compute it once and cache + dataInput = dataOutput.toDataInput(); } @Override public FST.BytesReader getReverseBytesReader() { - if (byteBuffer != null) { - // use a faster implementation for single-block case - return new ReverseBytesReader(byteBuffer.array()); - } assert dataInput != null; // freeze() must be called first return new ReverseRandomAccessReader(dataInput); } From 34efa9699d0242dcbecd5fcf690e869b52cc3447 Mon Sep 17 00:00:00 2001 From: dungba88 Date: Sun, 10 Dec 2023 21:56:33 +0700 Subject: [PATCH 42/45] Do not expose blockBits --- .../org/apache/lucene/store/ByteBuffersDataOutput.java | 4 ---- .../src/java/org/apache/lucene/util/fst/FSTCompiler.java | 3 +-- .../org/apache/lucene/util/fst/ReadWriteDataOutput.java | 9 ++++++--- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataOutput.java b/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataOutput.java index 33daa51389da..eaa0929848db 100644 --- a/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataOutput.java +++ b/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataOutput.java @@ -49,10 +49,6 @@ public final class ByteBuffersDataOutput extends DataOutput implements Accountab throw new RuntimeException("reset() is not allowed on this buffer."); }; - public int getBlockBits() { - return blockBits; - } - /** * An implementation of a {@link ByteBuffer} allocation and recycling policy. The blocks are * recycled if exactly the same size is requested, otherwise they're released to be GCed. 
diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java index da1fff8ec15e..dd7e2e47a066 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java @@ -153,8 +153,7 @@ public class FSTCompiler { * @return the DataOutput */ public static DataOutput getOnHeapReaderWriter(int blockBits) { - return new ReadWriteDataOutput( - new ByteBuffersDataOutput(blockBits, blockBits, ALLOCATE_BB_ON_HEAP, NO_REUSE)); + return new ReadWriteDataOutput(blockBits); } private FSTCompiler( diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/ReadWriteDataOutput.java b/lucene/core/src/java/org/apache/lucene/util/fst/ReadWriteDataOutput.java index 82cabb8dc6fc..7beaa7812024 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/ReadWriteDataOutput.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/ReadWriteDataOutput.java @@ -22,6 +22,9 @@ import org.apache.lucene.store.ByteBuffersDataOutput; import org.apache.lucene.store.DataOutput; +import static org.apache.lucene.store.ByteBuffersDataOutput.ALLOCATE_BB_ON_HEAP; +import static org.apache.lucene.store.ByteBuffersDataOutput.NO_REUSE; + /** * An adapter class to use {@link ByteBuffersDataOutput} as a {@link FSTReader}. 
It allows the FST * to be readable immediately after writing @@ -36,9 +39,9 @@ final class ReadWriteDataOutput extends DataOutput implements FSTReader { // whether this DataOutput is already frozen private boolean frozen; - public ReadWriteDataOutput(ByteBuffersDataOutput dataOutput) { - this.dataOutput = dataOutput; - this.blockBits = dataOutput.getBlockBits(); + public ReadWriteDataOutput(int blockBits) { + this.dataOutput = new ByteBuffersDataOutput(blockBits, blockBits, ALLOCATE_BB_ON_HEAP, NO_REUSE); + this.blockBits = blockBits; this.blockSize = 1 << blockBits; this.blockMask = blockSize - 1; } From cbceb85eca836df4b36265671b198b1ebff5ee17 Mon Sep 17 00:00:00 2001 From: dungba88 Date: Sun, 10 Dec 2023 22:00:51 +0700 Subject: [PATCH 43/45] tidy code --- .../src/java/org/apache/lucene/util/fst/FSTCompiler.java | 3 --- .../org/apache/lucene/util/fst/ReadWriteDataOutput.java | 9 +++++---- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java index dd7e2e47a066..1282e4111cf3 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java @@ -16,8 +16,6 @@ */ package org.apache.lucene.util.fst; -import static org.apache.lucene.store.ByteBuffersDataOutput.ALLOCATE_BB_ON_HEAP; -import static org.apache.lucene.store.ByteBuffersDataOutput.NO_REUSE; import static org.apache.lucene.util.fst.FST.ARCS_FOR_BINARY_SEARCH; import static org.apache.lucene.util.fst.FST.ARCS_FOR_CONTINUOUS; import static org.apache.lucene.util.fst.FST.ARCS_FOR_DIRECT_ADDRESSING; @@ -35,7 +33,6 @@ import java.io.IOException; import java.util.Objects; import org.apache.lucene.store.ByteArrayDataOutput; -import org.apache.lucene.store.ByteBuffersDataOutput; import org.apache.lucene.store.DataOutput; import org.apache.lucene.util.Accountable; import 
org.apache.lucene.util.ArrayUtil; diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/ReadWriteDataOutput.java b/lucene/core/src/java/org/apache/lucene/util/fst/ReadWriteDataOutput.java index 7beaa7812024..eb6ce3d36bc1 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/ReadWriteDataOutput.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/ReadWriteDataOutput.java @@ -16,15 +16,15 @@ */ package org.apache.lucene.util.fst; +import static org.apache.lucene.store.ByteBuffersDataOutput.ALLOCATE_BB_ON_HEAP; +import static org.apache.lucene.store.ByteBuffersDataOutput.NO_REUSE; + import java.io.IOException; import java.nio.ByteBuffer; import java.util.List; import org.apache.lucene.store.ByteBuffersDataOutput; import org.apache.lucene.store.DataOutput; -import static org.apache.lucene.store.ByteBuffersDataOutput.ALLOCATE_BB_ON_HEAP; -import static org.apache.lucene.store.ByteBuffersDataOutput.NO_REUSE; - /** * An adapter class to use {@link ByteBuffersDataOutput} as a {@link FSTReader}. 
It allows the FST * to be readable immediately after writing @@ -40,7 +40,8 @@ final class ReadWriteDataOutput extends DataOutput implements FSTReader { private boolean frozen; public ReadWriteDataOutput(int blockBits) { - this.dataOutput = new ByteBuffersDataOutput(blockBits, blockBits, ALLOCATE_BB_ON_HEAP, NO_REUSE); + this.dataOutput = + new ByteBuffersDataOutput(blockBits, blockBits, ALLOCATE_BB_ON_HEAP, NO_REUSE); this.blockBits = blockBits; this.blockSize = 1 << blockBits; this.blockMask = blockSize - 1; From 9fccfbcdff9159a29c42ad192de301115d4b8d5c Mon Sep 17 00:00:00 2001 From: Dzung Bui Date: Sat, 23 Dec 2023 15:49:06 +0900 Subject: [PATCH 44/45] Remove 0 initialization --- .../java/org/apache/lucene/util/fst/ReadWriteDataOutput.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/ReadWriteDataOutput.java b/lucene/core/src/java/org/apache/lucene/util/fst/ReadWriteDataOutput.java index eb6ce3d36bc1..88b2838434f1 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/ReadWriteDataOutput.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/ReadWriteDataOutput.java @@ -80,7 +80,7 @@ public FST.BytesReader getReverseBytesReader() { return new FST.BytesReader() { private byte[] current = byteBuffers.get(0).array(); private int nextBuffer = -1; - private int nextRead = 0; + private int nextRead; @Override public byte readByte() { From 3c388fea405486cc4bbfd8e5113af95b47ce5cfe Mon Sep 17 00:00:00 2001 From: dungba88 Date: Fri, 5 Jan 2024 10:27:07 +0900 Subject: [PATCH 45/45] Add assertion and comment --- .../java/org/apache/lucene/util/fst/ReadWriteDataOutput.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/ReadWriteDataOutput.java b/lucene/core/src/java/org/apache/lucene/util/fst/ReadWriteDataOutput.java index 88b2838434f1..a43c2f4f04d4 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/ReadWriteDataOutput.java +++ 
b/lucene/core/src/java/org/apache/lucene/util/fst/ReadWriteDataOutput.java @@ -68,6 +68,9 @@ public void freeze() { frozen = true; // this operation is costly, so we want to compute it once and cache this.byteBuffers = dataOutput.toWriteableBufferList(); + // ensure the ByteBuffer internal array is accessible. The call to toWriteableBufferList() above + // would ensure that it is accessible. + assert byteBuffers.stream().allMatch(ByteBuffer::hasArray); } @Override