Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
61 commits
Select commit Hold shift + click to select a range
a6259fc
Move size() to FSTStore
dungba88 Nov 13, 2023
e0e1517
Remove size() completely
dungba88 Nov 14, 2023
9b986a8
Merge branch 'apache:main' into remove-size
dungba88 Nov 15, 2023
f2d8234
Allow FST builder to use different DataOutput
dungba88 Nov 15, 2023
13c9359
access BytesStore byte[] directly for copying
dungba88 Nov 15, 2023
fa08d51
Rename BytesStore
dungba88 Nov 15, 2023
c6fb4b5
Change class to final
dungba88 Nov 15, 2023
f1e8f89
Reorder methods
dungba88 Nov 16, 2023
2f9c730
Remove unused methods
dungba88 Nov 16, 2023
847828d
Rename truncate to setPosition() and remove skipBytes()
dungba88 Nov 16, 2023
0913062
Simplify the writing operations
dungba88 Nov 16, 2023
0de0d26
Update comment
dungba88 Nov 16, 2023
ef3fdc6
remove unused parameter
dungba88 Nov 16, 2023
f00d24f
Simplify BytesStore operation
dungba88 Nov 16, 2023
12963df
tidy code
dungba88 Nov 16, 2023
f1e81b8
Rename copyBytes to writeTo
dungba88 Nov 16, 2023
b7ceec9
Merge branch 'remove-size' into pr-12543-1
dungba88 Nov 16, 2023
c8165ad
Simplify BytesStore operations
dungba88 Nov 16, 2023
d8cb927
Merge branch 'simplify-bytesstore' into pr-12543-1
dungba88 Nov 16, 2023
096fa97
Merge branch 'pr-12543-1' into pr-12543
dungba88 Nov 16, 2023
9a002c0
Embed writeBytes() to FSTCompiler
dungba88 Nov 16, 2023
1c201d4
Fix the write bytes method
dungba88 Nov 16, 2023
cfdeeff
Merge branch 'apache:main' into pr-12543
dungba88 Nov 18, 2023
1420cfc
Merge branch 'apache:main' into pr-12543-1
dungba88 Nov 18, 2023
7efcde0
Remove the default block bits constant
dungba88 Nov 18, 2023
8f98e7b
Merge branch 'pr-12543-1' into pr-12543
dungba88 Nov 18, 2023
b140b91
add assertion
dungba88 Nov 18, 2023
dbc1918
Rename method parameter names
dungba88 Nov 19, 2023
a5c7e14
Move reverse to FSTCompiler
dungba88 Nov 19, 2023
50de8f7
Revert setPosition call
dungba88 Nov 19, 2023
d90902f
Merge branch 'main' into pr-12543
dungba88 Nov 21, 2023
5818fe2
Merge branch 'main' into pr-12543
dungba88 Nov 21, 2023
e88f452
Address comments
dungba88 Nov 21, 2023
2587fc7
Return immediately when writing 0 bytes
dungba88 Nov 21, 2023
f0b78d2
Add comment &
dungba88 Nov 21, 2023
94913bb
Merge branch 'main' into pr-12543
dungba88 Nov 22, 2023
fd458c4
Rename variables
dungba88 Nov 22, 2023
606fe45
Fix the compile error
dungba88 Nov 22, 2023
dc45d9a
Remove isReadable()
dungba88 Nov 23, 2023
36685cc
Remove isReadable()
dungba88 Nov 24, 2023
1c4f68d
Optimize ReadWriteDataOutput
dungba88 Nov 28, 2023
34aabf2
tidy code
dungba88 Nov 28, 2023
7ae16bb
Freeze the DataOutput once finished()
dungba88 Nov 28, 2023
4823bc1
Refactor
dungba88 Nov 28, 2023
817ae08
freeze the DataOutput before use
dungba88 Nov 28, 2023
e0d56f5
Merge branch 'main' into pr-12543
dungba88 Nov 29, 2023
14c977b
Improvement of ReadWriteDataOutput
dungba88 Dec 4, 2023
b7b7b0b
tidy code
dungba88 Dec 4, 2023
6f7a8c5
Address comments and add off-heap FST tests
dungba88 Dec 5, 2023
dba57ff
Remove the hardcoded random
dungba88 Dec 5, 2023
5552271
Ignore the Test2BFSTOffHeap test
dungba88 Dec 5, 2023
6cc31c9
Simplify ReadWriteDataOutput
dungba88 Dec 6, 2023
ea3d588
Merge branch 'pr-12543' into pr-12543-1
dungba88 Dec 6, 2023
e14a909
Merge branch 'pr-12543-1' into tmp-rwdo
dungba88 Dec 10, 2023
013b42b
Merge pull request #21 from dungba88/tmp-rwdo
dungba88 Dec 10, 2023
c9f230a
Merge branch 'apache:main' into pr-12543-1
dungba88 Dec 10, 2023
34efa96
Do not expose blockBits
dungba88 Dec 10, 2023
cbceb85
tidy code
dungba88 Dec 10, 2023
ef9eae6
Merge branch 'apache:main' into pr-12543-1
dungba88 Dec 23, 2023
9fccfbc
Remove 0 initialization
dungba88 Dec 23, 2023
3c388fe
Add assertion and comment
dungba88 Jan 5, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@
*/
package org.apache.lucene.util.fst;

import static org.apache.lucene.store.ByteBuffersDataOutput.ALLOCATE_BB_ON_HEAP;
import static org.apache.lucene.store.ByteBuffersDataOutput.NO_REUSE;
import static org.apache.lucene.util.fst.FST.ARCS_FOR_BINARY_SEARCH;
import static org.apache.lucene.util.fst.FST.ARCS_FOR_CONTINUOUS;
import static org.apache.lucene.util.fst.FST.ARCS_FOR_DIRECT_ADDRESSING;
Expand All @@ -34,7 +32,6 @@
import java.io.IOException;
import java.util.Objects;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.ArrayUtil;
Expand Down Expand Up @@ -153,8 +150,7 @@ public class FSTCompiler<T> {
* @return the DataOutput
*/
public static DataOutput getOnHeapReaderWriter(int blockBits) {
return new ReadWriteDataOutput(
new ByteBuffersDataOutput(blockBits, blockBits, ALLOCATE_BB_ON_HEAP, NO_REUSE));
return new ReadWriteDataOutput(blockBits);
}

private FSTCompiler(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,12 @@
*/
package org.apache.lucene.util.fst;

import static org.apache.lucene.store.ByteBuffersDataOutput.ALLOCATE_BB_ON_HEAP;
import static org.apache.lucene.store.ByteBuffersDataOutput.NO_REUSE;

import java.io.IOException;
import org.apache.lucene.store.ByteBuffersDataInput;
import java.nio.ByteBuffer;
import java.util.List;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.store.DataOutput;

Expand All @@ -28,13 +32,19 @@
final class ReadWriteDataOutput extends DataOutput implements FSTReader {

private final ByteBuffersDataOutput dataOutput;
// the DataInput to read from once we finish writing
private ByteBuffersDataInput dataInput;
private final int blockBits;
private final int blockSize;
private final int blockMask;
private List<ByteBuffer> byteBuffers;
// whether this DataOutput is already frozen
private boolean frozen;

public ReadWriteDataOutput(ByteBuffersDataOutput dataOutput) {
this.dataOutput = dataOutput;
public ReadWriteDataOutput(int blockBits) {
this.dataOutput =
new ByteBuffersDataOutput(blockBits, blockBits, ALLOCATE_BB_ON_HEAP, NO_REUSE);
this.blockBits = blockBits;
this.blockSize = 1 << blockBits;
this.blockMask = blockSize - 1;
}

@Override
Expand All @@ -56,14 +66,62 @@ public long ramBytesUsed() {

public void freeze() {
frozen = true;
// this operation are costly, so we want to compute it once and cache
dataInput = dataOutput.toDataInput();
// this operation is costly, so we want to compute it once and cache
this.byteBuffers = dataOutput.toWriteableBufferList();
// ensure the ByteBuffer internal array is accessible. The call to toWriteableBufferList() above
// would ensure that it is accessible.
assert byteBuffers.stream().allMatch(ByteBuffer::hasArray);
}

@Override
public FST.BytesReader getReverseBytesReader() {
assert dataInput != null; // freeze() must be called first
return new ReverseRandomAccessReader(dataInput);
assert byteBuffers != null; // freeze() must be called first
if (byteBuffers.size() == 1) {
// use a faster implementation for single-block case
return new ReverseBytesReader(byteBuffers.get(0).array());
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we assert hasArray() before this? .array() is optional (only valid for DirectByteBuffer I think).

Copy link
Contributor Author

@dungba88 dungba88 Jan 4, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since we call toWriteableBufferList the array should be accessible. I think if the array is somehow not accessible HeapByteBuffer (the one being used) would throw an exception and thus would be caught by tests

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah I realize the code is safe but it takes some thinking to confirm it -- maybe add the assert with your awesome above explanation? So future code readers know why this ByteBuffer is for sure backed by an array.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's a good idea. I've added the assertion. I decided to put it right after the ByteBuffer list is created (in freeze()) and assert all ByteBuffer in the list.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Awesome, thanks, looks great @dungba88!

}
return new FST.BytesReader() {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do tests exercise this path sometimes? We would need tests to either make large FSTs, or, randomize the block size so sometimes it's tiny?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe testRealFST did this sometimes

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm I'm still worried about test coverage of this. Conditionals like this are dangerous for Lucene, since our tests only test tiny FSTs, this code path would be rarely/never executed, likely even by our nightly benchmarks. Ideally we would find a way to randomize the page size in many tests, or at least, one test?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry I meant TestFSTs.testRealTerms. The default block size is 32KB and the FST is 55-80KB, thus 2-3 blocks.

private byte[] current = byteBuffers.get(0).array();
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

array() returns the internal array and there's no byte-copy here

private int nextBuffer = -1;
private int nextRead;

@Override
public byte readByte() {
if (nextRead == -1) {
current = byteBuffers.get(nextBuffer--).array();
nextRead = blockSize - 1;
}
return current[nextRead--];
}

@Override
public void skipBytes(long count) {
setPosition(getPosition() - count);
}

@Override
public void readBytes(byte[] b, int offset, int len) {
for (int i = 0; i < len; i++) {
b[offset + i] = readByte();
}
}

@Override
public long getPosition() {
return ((long) nextBuffer + 1) * blockSize + nextRead;
}

@Override
public void setPosition(long pos) {
int bufferIndex = (int) (pos >> blockBits);
if (nextBuffer != bufferIndex - 1) {
nextBuffer = bufferIndex - 1;
current = byteBuffers.get(bufferIndex).array();
}
nextRead = (int) (pos & blockMask);
assert getPosition() == pos : "pos=" + pos + " getPos()=" + getPosition();
}
};
}

@Override
Expand Down