diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 3ea1326b4608..3d20022b3b58 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -31,6 +31,9 @@ API Changes * GITHUB#14844: Change IndexInput.updateReadAdvice to take an IOContext instead (Simon Cooper) +* GITHUB#12980: Make FSTPostingsFormat build its FST off-heap. This PostingsFormat will now + create 2 FST files (tfp.meta and tfp.data) instead of a single one. (Anh Dung Bui) + * GITHUB#15131: Restrict visibility of TieredMergePolicy.score() API (Trevor McCulloch) New Features diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTPostingsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTPostingsFormat.java index a72fde421d61..2086fe3c586b 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTPostingsFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTPostingsFormat.java @@ -28,10 +28,10 @@ import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.util.IOUtils; -/** FST term dict + Lucene50PBF */ +/** FST term dict + Lucene99PBF */ public final class FSTPostingsFormat extends PostingsFormat { public FSTPostingsFormat() { - super("FST50"); + super("FST110"); } @Override diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsReader.java index 943298ffbf9d..c9eb4605a3fc 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsReader.java @@ -66,55 +66,76 @@ public class FSTTermsReader extends FieldsProducer { private final TreeMap fields = new TreeMap<>(); private final PostingsReaderBase postingsReader; - private final IndexInput fstTermsInput; + // IndexInput for FST metadata + private final IndexInput fstMetaInput; + // IndexInput for FST data + private final IndexInput fstDataInput; public 
FSTTermsReader(SegmentReadState state, PostingsReaderBase postingsReader) throws IOException { - final String termsFileName = + final String termsMetaFileName = IndexFileNames.segmentFileName( - state.segmentInfo.name, state.segmentSuffix, FSTTermsWriter.TERMS_EXTENSION); + state.segmentInfo.name, state.segmentSuffix, FSTTermsWriter.TERMS_META_EXTENSION); + final String termsDataFileName = + IndexFileNames.segmentFileName( + state.segmentInfo.name, state.segmentSuffix, FSTTermsWriter.TERMS_DATA_EXTENSION); this.postingsReader = postingsReader; - this.fstTermsInput = - state.directory.openInput( - termsFileName, state.context.withHints(FileTypeHint.DATA, PreloadHint.INSTANCE)); - IndexInput in = this.fstTermsInput; + IndexInput metaIn = null, dataIn = null; try { - CodecUtil.checkIndexHeader( - in, - FSTTermsWriter.TERMS_CODEC_NAME, - FSTTermsWriter.TERMS_VERSION_START, - FSTTermsWriter.TERMS_VERSION_CURRENT, - state.segmentInfo.getId(), - state.segmentSuffix); - CodecUtil.checksumEntireFile(in); - this.postingsReader.init(in, state); - seekDir(in); + metaIn = + state.directory.openInput( + termsMetaFileName, state.context.withHints(FileTypeHint.DATA, PreloadHint.INSTANCE)); + dataIn = + state.directory.openInput( + termsDataFileName, state.context.withHints(FileTypeHint.DATA, PreloadHint.INSTANCE)); + + verifyInput(state, metaIn); + verifyInput(state, dataIn); + + this.postingsReader.init(metaIn, state); + seekDir(metaIn); final FieldInfos fieldInfos = state.fieldInfos; - final int numFields = in.readVInt(); + final int numFields = metaIn.readVInt(); for (int i = 0; i < numFields; i++) { - int fieldNumber = in.readVInt(); + int fieldNumber = metaIn.readVInt(); FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldNumber); - long numTerms = in.readVLong(); - long sumTotalTermFreq = in.readVLong(); + long numTerms = metaIn.readVLong(); + long sumTotalTermFreq = metaIn.readVLong(); // if frequencies are omitted, sumTotalTermFreq=sumDocFreq and we only write one value long 
sumDocFreq = - fieldInfo.getIndexOptions() == IndexOptions.DOCS ? sumTotalTermFreq : in.readVLong(); - int docCount = in.readVInt(); + fieldInfo.getIndexOptions() == IndexOptions.DOCS + ? sumTotalTermFreq + : metaIn.readVLong(); + int docCount = metaIn.readVInt(); TermsReader current = - new TermsReader(fieldInfo, in, numTerms, sumTotalTermFreq, sumDocFreq, docCount); + new TermsReader( + fieldInfo, metaIn, dataIn, numTerms, sumTotalTermFreq, sumDocFreq, docCount); TermsReader previous = fields.put(fieldInfo.name, current); - checkFieldSummary(state.segmentInfo, in, current, previous); + checkFieldSummary(state.segmentInfo, metaIn, current, previous); } + this.fstMetaInput = metaIn; + this.fstDataInput = dataIn; } catch (Throwable t) { - IOUtils.closeWhileSuppressingExceptions(t, in); + IOUtils.closeWhileSuppressingExceptions(t, metaIn, dataIn); throw t; } } + private static void verifyInput(SegmentReadState state, IndexInput in) throws IOException { + CodecUtil.checkIndexHeader( + in, + FSTTermsWriter.TERMS_CODEC_NAME, + FSTTermsWriter.TERMS_VERSION_START, + FSTTermsWriter.TERMS_VERSION_CURRENT, + state.segmentInfo.getId(), + state.segmentSuffix); + CodecUtil.checksumEntireFile(in); + } + private void seekDir(IndexInput in) throws IOException { in.seek(in.length() - CodecUtil.footerLength() - 8); in.seek(in.readLong()); @@ -165,7 +186,7 @@ public int size() { @Override public void close() throws IOException { try { - IOUtils.close(postingsReader, fstTermsInput); + IOUtils.close(postingsReader, fstMetaInput, fstDataInput); } finally { fields.clear(); } @@ -182,7 +203,8 @@ final class TermsReader extends Terms { TermsReader( FieldInfo fieldInfo, - IndexInput in, + IndexInput metaIn, + IndexInput dataIn, long numTerms, long sumTotalTermFreq, long sumDocFreq, @@ -194,10 +216,11 @@ final class TermsReader extends Terms { this.sumDocFreq = sumDocFreq; this.docCount = docCount; FSTTermOutputs outputs = new FSTTermOutputs(fieldInfo); - final var fstMetadata = 
FST.readMetadata(in, outputs); - OffHeapFSTStore offHeapFSTStore = new OffHeapFSTStore(in, in.getFilePointer(), fstMetadata); + final var fstMetadata = FST.readMetadata(metaIn, outputs); + OffHeapFSTStore offHeapFSTStore = + new OffHeapFSTStore(dataIn, dataIn.getFilePointer(), fstMetadata); this.dict = FST.fromFSTReader(fstMetadata, offHeapFSTStore); - in.skipBytes(offHeapFSTStore.size()); + dataIn.skipBytes(offHeapFSTStore.size()); } @Override diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsWriter.java index 73d864d3579a..c9ff83d38253 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsWriter.java @@ -107,39 +107,58 @@ * @lucene.experimental */ public class FSTTermsWriter extends FieldsConsumer { - static final String TERMS_EXTENSION = "tfp"; + static final String TERMS_META_EXTENSION = "tfp.meta"; + static final String TERMS_DATA_EXTENSION = "tfp.data"; static final String TERMS_CODEC_NAME = "FSTTerms"; public static final int TERMS_VERSION_START = 2; public static final int TERMS_VERSION_CURRENT = TERMS_VERSION_START; final PostingsWriterBase postingsWriter; final FieldInfos fieldInfos; - IndexOutput out; + // IndexOutput for FST metadata + IndexOutput metaOut; + // IndexOutput for FST data + IndexOutput dataOut; final int maxDoc; final List fields = new ArrayList<>(); public FSTTermsWriter(SegmentWriteState state, PostingsWriterBase postingsWriter) throws IOException { - final String termsFileName = + final String termsMetaFileName = IndexFileNames.segmentFileName( - state.segmentInfo.name, state.segmentSuffix, TERMS_EXTENSION); + state.segmentInfo.name, state.segmentSuffix, TERMS_META_EXTENSION); + final String termsDataFileName = + IndexFileNames.segmentFileName( + state.segmentInfo.name, state.segmentSuffix, TERMS_DATA_EXTENSION); this.postingsWriter = 
postingsWriter; this.fieldInfos = state.fieldInfos; - this.out = state.directory.createOutput(termsFileName, state.context); this.maxDoc = state.segmentInfo.maxDoc(); + IndexOutput metaOut = null, dataOut = null; try { + metaOut = state.directory.createOutput(termsMetaFileName, state.context); + dataOut = state.directory.createOutput(termsDataFileName, state.context); + + CodecUtil.writeIndexHeader( + metaOut, + TERMS_CODEC_NAME, + TERMS_VERSION_CURRENT, + state.segmentInfo.getId(), + state.segmentSuffix); + CodecUtil.writeIndexHeader( - out, + dataOut, TERMS_CODEC_NAME, TERMS_VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); - this.postingsWriter.init(out, state); + this.postingsWriter.init(metaOut, state); + this.metaOut = metaOut; + this.dataOut = dataOut; } catch (Throwable t) { - IOUtils.closeWhileSuppressingExceptions(t, out); + IOUtils.closeWhileSuppressingExceptions(t, metaOut, dataOut); throw t; } } @@ -184,27 +203,30 @@ public void write(Fields fields, NormsProducer norms) throws IOException { @Override public void close() throws IOException { - if (out != null) { - try (IndexOutput _ = out; - postingsWriter) { - // write field summary - final long dirStart = out.getFilePointer(); + if (metaOut != null) { + assert dataOut != null; + try (IndexOutput _ = metaOut; + IndexOutput _ = dataOut; + postingsWriter) { // write field summary + final long dirStart = metaOut.getFilePointer(); - out.writeVInt(fields.size()); + metaOut.writeVInt(fields.size()); for (FieldMetaData field : fields) { - out.writeVInt(field.fieldInfo.number); - out.writeVLong(field.numTerms); + metaOut.writeVInt(field.fieldInfo.number); + metaOut.writeVLong(field.numTerms); if (field.fieldInfo.getIndexOptions() != IndexOptions.DOCS) { - out.writeVLong(field.sumTotalTermFreq); + metaOut.writeVLong(field.sumTotalTermFreq); } - out.writeVLong(field.sumDocFreq); - out.writeVInt(field.docCount); - field.dict.save(out, out); + metaOut.writeVLong(field.sumDocFreq); + 
metaOut.writeVInt(field.docCount); + field.fstMetadata.save(metaOut); } - writeTrailer(out, dirStart); - CodecUtil.writeFooter(out); + writeTrailer(metaOut, dirStart); + CodecUtil.writeFooter(metaOut); + CodecUtil.writeFooter(dataOut); } finally { - out = null; + metaOut = null; + dataOut = null; } } } @@ -215,7 +237,7 @@ private static class FieldMetaData { public final long sumTotalTermFreq; public final long sumDocFreq; public final int docCount; - public final FST dict; + public final FST.FSTMetadata fstMetadata; public FieldMetaData( FieldInfo fieldInfo, @@ -223,13 +245,13 @@ public FieldMetaData( long sumTotalTermFreq, long sumDocFreq, int docCount, - FST fst) { + FST.FSTMetadata fstMetadata) { this.fieldInfo = fieldInfo; this.numTerms = numTerms; this.sumTotalTermFreq = sumTotalTermFreq; this.sumDocFreq = sumDocFreq; this.docCount = docCount; - this.dict = fst; + this.fstMetadata = fstMetadata; } } @@ -247,7 +269,8 @@ final class TermsWriter { this.fieldInfo = fieldInfo; postingsWriter.setField(fieldInfo); this.outputs = new FSTTermOutputs(fieldInfo); - this.fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).build(); + this.fstCompiler = + new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).dataOutput(dataOut).build(); } public void finishTerm(BytesRef text, BlockTermState state) throws IOException { @@ -268,10 +291,14 @@ public void finishTerm(BytesRef text, BlockTermState state) throws IOException { public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException { // save FST dict if (numTerms > 0) { - final FST fst = - FST.fromFSTReader(fstCompiler.compile(), fstCompiler.getFSTReader()); fields.add( - new FieldMetaData(fieldInfo, numTerms, sumTotalTermFreq, sumDocFreq, docCount, fst)); + new FieldMetaData( + fieldInfo, + numTerms, + sumTotalTermFreq, + sumDocFreq, + docCount, + fstCompiler.compile())); } } }