Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ API Changes

* GITHUB#14844: Change IndexInput.updateReadAdvice to take an IOContext instead (Simon Cooper)

* GITHUB#12980: Make FSTPostingsFormat build its FST off-heap. This PostingsFormat now
creates two FST files (tfp.meta and tfp.data) instead of a single one. (Anh Dung Bui)

* GITHUB#15131: Restrict visibility of TieredMergePolicy.score() API (Trevor McCulloch)

New Features
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,10 @@
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;

/** FST term dict + Lucene50PBF */
/** FST term dict + Lucene99PBF */
public final class FSTPostingsFormat extends PostingsFormat {
public FSTPostingsFormat() {
super("FST50");
super("FST110");
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,55 +66,76 @@
public class FSTTermsReader extends FieldsProducer {
private final TreeMap<String, TermsReader> fields = new TreeMap<>();
private final PostingsReaderBase postingsReader;
private final IndexInput fstTermsInput;
// IndexInput for FST metadata
private final IndexInput fstMetaInput;
// IndexInput for FST data
private final IndexInput fstDataInput;

public FSTTermsReader(SegmentReadState state, PostingsReaderBase postingsReader)
throws IOException {
final String termsFileName =
final String termsMetaFileName =
IndexFileNames.segmentFileName(
state.segmentInfo.name, state.segmentSuffix, FSTTermsWriter.TERMS_EXTENSION);
state.segmentInfo.name, state.segmentSuffix, FSTTermsWriter.TERMS_META_EXTENSION);
final String termsDataFileName =
IndexFileNames.segmentFileName(
state.segmentInfo.name, state.segmentSuffix, FSTTermsWriter.TERMS_DATA_EXTENSION);

this.postingsReader = postingsReader;
this.fstTermsInput =
state.directory.openInput(
termsFileName, state.context.withHints(FileTypeHint.DATA, PreloadHint.INSTANCE));

IndexInput in = this.fstTermsInput;
IndexInput metaIn = null, dataIn = null;

try {
CodecUtil.checkIndexHeader(
in,
FSTTermsWriter.TERMS_CODEC_NAME,
FSTTermsWriter.TERMS_VERSION_START,
FSTTermsWriter.TERMS_VERSION_CURRENT,
state.segmentInfo.getId(),
state.segmentSuffix);
CodecUtil.checksumEntireFile(in);
this.postingsReader.init(in, state);
seekDir(in);
metaIn =
state.directory.openInput(
termsMetaFileName, state.context.withHints(FileTypeHint.DATA, PreloadHint.INSTANCE));
dataIn =
state.directory.openInput(
termsDataFileName, state.context.withHints(FileTypeHint.DATA, PreloadHint.INSTANCE));

verifyInput(state, metaIn);
verifyInput(state, dataIn);

this.postingsReader.init(metaIn, state);
seekDir(metaIn);

final FieldInfos fieldInfos = state.fieldInfos;
final int numFields = in.readVInt();
final int numFields = metaIn.readVInt();
for (int i = 0; i < numFields; i++) {
int fieldNumber = in.readVInt();
int fieldNumber = metaIn.readVInt();
FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldNumber);
long numTerms = in.readVLong();
long sumTotalTermFreq = in.readVLong();
long numTerms = metaIn.readVLong();
long sumTotalTermFreq = metaIn.readVLong();
// if frequencies are omitted, sumTotalTermFreq=sumDocFreq and we only write one value
long sumDocFreq =
fieldInfo.getIndexOptions() == IndexOptions.DOCS ? sumTotalTermFreq : in.readVLong();
int docCount = in.readVInt();
fieldInfo.getIndexOptions() == IndexOptions.DOCS
? sumTotalTermFreq
: metaIn.readVLong();
int docCount = metaIn.readVInt();
TermsReader current =
new TermsReader(fieldInfo, in, numTerms, sumTotalTermFreq, sumDocFreq, docCount);
new TermsReader(
fieldInfo, metaIn, dataIn, numTerms, sumTotalTermFreq, sumDocFreq, docCount);
TermsReader previous = fields.put(fieldInfo.name, current);
checkFieldSummary(state.segmentInfo, in, current, previous);
checkFieldSummary(state.segmentInfo, metaIn, current, previous);
}
this.fstMetaInput = metaIn;
this.fstDataInput = dataIn;
} catch (Throwable t) {
IOUtils.closeWhileSuppressingExceptions(t, in);
IOUtils.closeWhileSuppressingExceptions(t, metaIn, dataIn);
throw t;
}
}

/**
 * Validates one of the FST terms files before it is used.
 *
 * <p>First checks the index header of {@code in} against the {@code FSTTerms} codec name, the
 * supported version range, and the id and suffix of the segment being opened; then runs a
 * checksum verification over the entire file.
 *
 * @param state read state supplying the expected segment id and segment suffix
 * @param in the input to validate
 * @throws IOException if either check fails or the input cannot be read
 */
private static void verifyInput(SegmentReadState state, IndexInput in) throws IOException {
  // Pull the expected identity of the segment out up front so the header check reads cleanly.
  final var expectedSegmentId = state.segmentInfo.getId();
  final var expectedSuffix = state.segmentSuffix;

  CodecUtil.checkIndexHeader(
      in,
      FSTTermsWriter.TERMS_CODEC_NAME,
      FSTTermsWriter.TERMS_VERSION_START,
      FSTTermsWriter.TERMS_VERSION_CURRENT,
      expectedSegmentId,
      expectedSuffix);

  CodecUtil.checksumEntireFile(in);
}

private void seekDir(IndexInput in) throws IOException {
in.seek(in.length() - CodecUtil.footerLength() - 8);
in.seek(in.readLong());
Expand Down Expand Up @@ -165,7 +186,7 @@ public int size() {
@Override
public void close() throws IOException {
try {
IOUtils.close(postingsReader, fstTermsInput);
IOUtils.close(postingsReader, fstMetaInput, fstDataInput);
} finally {
fields.clear();
}
Expand All @@ -182,7 +203,8 @@ final class TermsReader extends Terms {

TermsReader(
FieldInfo fieldInfo,
IndexInput in,
IndexInput metaIn,
IndexInput dataIn,
long numTerms,
long sumTotalTermFreq,
long sumDocFreq,
Expand All @@ -194,10 +216,11 @@ final class TermsReader extends Terms {
this.sumDocFreq = sumDocFreq;
this.docCount = docCount;
FSTTermOutputs outputs = new FSTTermOutputs(fieldInfo);
final var fstMetadata = FST.readMetadata(in, outputs);
OffHeapFSTStore offHeapFSTStore = new OffHeapFSTStore(in, in.getFilePointer(), fstMetadata);
final var fstMetadata = FST.readMetadata(metaIn, outputs);
OffHeapFSTStore offHeapFSTStore =
new OffHeapFSTStore(dataIn, dataIn.getFilePointer(), fstMetadata);
this.dict = FST.fromFSTReader(fstMetadata, offHeapFSTStore);
in.skipBytes(offHeapFSTStore.size());
dataIn.skipBytes(offHeapFSTStore.size());
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -107,39 +107,58 @@
* @lucene.experimental
*/
public class FSTTermsWriter extends FieldsConsumer {
static final String TERMS_EXTENSION = "tfp";
static final String TERMS_META_EXTENSION = "tfp.meta";
static final String TERMS_DATA_EXTENSION = "tfp.data";
static final String TERMS_CODEC_NAME = "FSTTerms";
public static final int TERMS_VERSION_START = 2;
public static final int TERMS_VERSION_CURRENT = TERMS_VERSION_START;

final PostingsWriterBase postingsWriter;
final FieldInfos fieldInfos;
IndexOutput out;
// IndexOutput for FST metadata
IndexOutput metaOut;
// IndexOutput for FST data
IndexOutput dataOut;
final int maxDoc;
final List<FieldMetaData> fields = new ArrayList<>();

public FSTTermsWriter(SegmentWriteState state, PostingsWriterBase postingsWriter)
throws IOException {
final String termsFileName =
final String termsMetaFileName =
IndexFileNames.segmentFileName(
state.segmentInfo.name, state.segmentSuffix, TERMS_EXTENSION);
state.segmentInfo.name, state.segmentSuffix, TERMS_META_EXTENSION);
final String termsDataFileName =
IndexFileNames.segmentFileName(
state.segmentInfo.name, state.segmentSuffix, TERMS_DATA_EXTENSION);

this.postingsWriter = postingsWriter;
this.fieldInfos = state.fieldInfos;
this.out = state.directory.createOutput(termsFileName, state.context);
this.maxDoc = state.segmentInfo.maxDoc();

IndexOutput metaOut = null, dataOut = null;
try {
metaOut = state.directory.createOutput(termsMetaFileName, state.context);
dataOut = state.directory.createOutput(termsDataFileName, state.context);

CodecUtil.writeIndexHeader(
metaOut,
TERMS_CODEC_NAME,
TERMS_VERSION_CURRENT,
state.segmentInfo.getId(),
state.segmentSuffix);

CodecUtil.writeIndexHeader(
out,
dataOut,
TERMS_CODEC_NAME,
TERMS_VERSION_CURRENT,
state.segmentInfo.getId(),
state.segmentSuffix);

this.postingsWriter.init(out, state);
this.postingsWriter.init(metaOut, state);
this.metaOut = metaOut;
this.dataOut = dataOut;
} catch (Throwable t) {
IOUtils.closeWhileSuppressingExceptions(t, out);
IOUtils.closeWhileSuppressingExceptions(t, metaOut, dataOut);
throw t;
}
}
Expand Down Expand Up @@ -184,27 +203,30 @@ public void write(Fields fields, NormsProducer norms) throws IOException {

@Override
public void close() throws IOException {
if (out != null) {
try (IndexOutput _ = out;
postingsWriter) {
// write field summary
final long dirStart = out.getFilePointer();
if (metaOut != null) {
assert dataOut != null;
try (IndexOutput _ = metaOut;
IndexOutput _ = dataOut;
postingsWriter) { // write field summary
final long dirStart = metaOut.getFilePointer();

out.writeVInt(fields.size());
metaOut.writeVInt(fields.size());
for (FieldMetaData field : fields) {
out.writeVInt(field.fieldInfo.number);
out.writeVLong(field.numTerms);
metaOut.writeVInt(field.fieldInfo.number);
metaOut.writeVLong(field.numTerms);
if (field.fieldInfo.getIndexOptions() != IndexOptions.DOCS) {
out.writeVLong(field.sumTotalTermFreq);
metaOut.writeVLong(field.sumTotalTermFreq);
}
out.writeVLong(field.sumDocFreq);
out.writeVInt(field.docCount);
field.dict.save(out, out);
metaOut.writeVLong(field.sumDocFreq);
metaOut.writeVInt(field.docCount);
field.fstMetadata.save(metaOut);
}
writeTrailer(out, dirStart);
CodecUtil.writeFooter(out);
writeTrailer(metaOut, dirStart);
CodecUtil.writeFooter(metaOut);
CodecUtil.writeFooter(dataOut);
} finally {
out = null;
metaOut = null;
dataOut = null;
}
}
}
Expand All @@ -215,21 +237,21 @@ private static class FieldMetaData {
public final long sumTotalTermFreq;
public final long sumDocFreq;
public final int docCount;
public final FST<FSTTermOutputs.TermData> dict;
public final FST.FSTMetadata<FSTTermOutputs.TermData> fstMetadata;

public FieldMetaData(
FieldInfo fieldInfo,
long numTerms,
long sumTotalTermFreq,
long sumDocFreq,
int docCount,
FST<FSTTermOutputs.TermData> fst) {
FST.FSTMetadata<FSTTermOutputs.TermData> fstMetadata) {
this.fieldInfo = fieldInfo;
this.numTerms = numTerms;
this.sumTotalTermFreq = sumTotalTermFreq;
this.sumDocFreq = sumDocFreq;
this.docCount = docCount;
this.dict = fst;
this.fstMetadata = fstMetadata;
}
}

Expand All @@ -247,7 +269,8 @@ final class TermsWriter {
this.fieldInfo = fieldInfo;
postingsWriter.setField(fieldInfo);
this.outputs = new FSTTermOutputs(fieldInfo);
this.fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).build();
this.fstCompiler =
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).dataOutput(dataOut).build();
}

public void finishTerm(BytesRef text, BlockTermState state) throws IOException {
Expand All @@ -268,10 +291,14 @@ public void finishTerm(BytesRef text, BlockTermState state) throws IOException {
public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException {
// save FST dict
if (numTerms > 0) {
final FST<FSTTermOutputs.TermData> fst =
FST.fromFSTReader(fstCompiler.compile(), fstCompiler.getFSTReader());
fields.add(
new FieldMetaData(fieldInfo, numTerms, sumTotalTermFreq, sumDocFreq, docCount, fst));
new FieldMetaData(
fieldInfo,
numTerms,
sumTotalTermFreq,
sumDocFreq,
docCount,
fstCompiler.compile()));
}
}
}
Expand Down
Loading