lucene/CHANGES.txt (2 additions, 0 deletions)
@@ -56,6 +56,8 @@ API Changes

* GITHUB#14615: Remove unnecessary public methods in FuzzySet (Greg Miller)

+ * GITHUB#15295: Switched to a fixed CFS threshold: compound file settings now
+   live on the codec's CompoundFormat instead of MergePolicy (Shubham Sharma)
+
New Features
---------------------

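For orientation, here is a minimal before/after sketch of the API move this entry describes, assembled from the call sites changed below. The wrapper class and the analyzer choice are illustrative only (the PR's tests use MockAnalyzer from the test framework):

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.codecs.CompoundFormat;
    import org.apache.lucene.index.IndexWriterConfig;

    public class CfsMigrationSketch {
      public static void main(String[] args) {
        IndexWriterConfig conf = new IndexWriterConfig(new StandardAnalyzer());

        // Before this PR: compound-file policy hung off the merge policy.
        // conf.getMergePolicy().setNoCFSRatio(1.0);
        // conf.getMergePolicy().setMaxCFSSegmentSizeMB(Double.POSITIVE_INFINITY);

        // After this PR: the codec's CompoundFormat owns the decision.
        CompoundFormat cfs = conf.getCodec().compoundFormat();
        cfs.setShouldUseCompoundFile(true);
        cfs.setMaxCFSSegmentSizeMB(Double.POSITIVE_INFINITY);
      }
    }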
@@ -697,7 +697,8 @@ private IndexWriter createFastIndexWriter(Directory dir, int maxBufferedDocs) th
IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
conf.setMaxBufferedDocs(maxBufferedDocs);
conf.setRAMBufferSizeMB(-1);
- conf.setMergePolicy(newLogMergePolicy(random().nextBoolean()));
+ conf.setMergePolicy(newLogMergePolicy());
+ conf.getCodec().compoundFormat().setShouldUseCompoundFile(random().nextBoolean());
return new IndexWriter(dir, conf);
}

@@ -727,7 +728,8 @@ private void doTestSortedNumericBlocksOfVariousBitsPerValue(LongSupplier counts)
conf.setMaxBufferedDocs(atLeast(Lucene80DocValuesFormat.NUMERIC_BLOCK_SIZE));
conf.setRAMBufferSizeMB(-1);
// so Lucene docids are predictable / stay in order
- conf.setMergePolicy(newLogMergePolicy(random().nextBoolean()));
+ conf.setMergePolicy(newLogMergePolicy());
+ conf.getCodec().compoundFormat().setShouldUseCompoundFile(random().nextBoolean());
IndexWriter writer = new IndexWriter(dir, conf);

final int numDocs = atLeast(Lucene80DocValuesFormat.NUMERIC_BLOCK_SIZE * 3);
@@ -797,7 +799,8 @@ private void doTestSparseNumericBlocksOfVariousBitsPerValue(double density) thro
IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
conf.setMaxBufferedDocs(atLeast(Lucene80DocValuesFormat.NUMERIC_BLOCK_SIZE));
conf.setRAMBufferSizeMB(-1);
- conf.setMergePolicy(newLogMergePolicy(random().nextBoolean()));
+ conf.setMergePolicy(newLogMergePolicy());
+ conf.getCodec().compoundFormat().setShouldUseCompoundFile(random().nextBoolean());
IndexWriter writer = new IndexWriter(dir, conf);
Document doc = new Document();
Field storedField = newStringField("stored", "", Field.Store.YES);
@@ -53,7 +53,6 @@
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.KnnVectorValues;
import org.apache.lucene.index.LeafReaderContext;
- import org.apache.lucene.index.LogByteSizeMergePolicy;
import org.apache.lucene.index.MultiBits;
import org.apache.lucene.index.MultiDocValues;
import org.apache.lucene.index.MultiTerms;
@@ -125,15 +124,14 @@ protected void createIndex(Directory directory) throws IOException {
}

static void createIndex(Directory dir, boolean doCFS, boolean fullyMerged) throws IOException {
- LogByteSizeMergePolicy mp = new LogByteSizeMergePolicy();
- mp.setNoCFSRatio(doCFS ? 1.0 : 0.0);
- mp.setMaxCFSSegmentSizeMB(Double.POSITIVE_INFINITY);
// TODO: remove randomness
IndexWriterConfig conf =
new IndexWriterConfig(new MockAnalyzer(random()))
.setMaxBufferedDocs(10)
.setCodec(TestUtil.getDefaultCodec())
.setMergePolicy(NoMergePolicy.INSTANCE);
+ conf.getCodec().compoundFormat().setShouldUseCompoundFile(doCFS);
+ conf.getCodec().compoundFormat().setMaxCFSSegmentSizeMB(Double.POSITIVE_INFINITY);
IndexWriter writer = new IndexWriter(dir, conf);

for (int i = 0; i < DOCS_COUNT; i++) {
@@ -147,14 +145,13 @@ static void createIndex(Directory dir, boolean doCFS, boolean fullyMerged) throw

if (!fullyMerged) {
// open fresh writer so we get no prx file in the added segment
- mp = new LogByteSizeMergePolicy();
- mp.setNoCFSRatio(doCFS ? 1.0 : 0.0);
// TODO: remove randomness
conf =
new IndexWriterConfig(new MockAnalyzer(random()))
.setMaxBufferedDocs(10)
.setCodec(TestUtil.getDefaultCodec())
.setMergePolicy(NoMergePolicy.INSTANCE);
+ conf.getCodec().compoundFormat().setShouldUseCompoundFile(doCFS);
writer = new IndexWriter(dir, conf);
addNoProxDoc(writer);
writer.close();
@@ -153,15 +153,15 @@ public void testSortedIndex() throws Exception {
@Override
protected void createIndex(Directory directory) throws IOException {
LogByteSizeMergePolicy mp = new LogByteSizeMergePolicy();
- mp.setNoCFSRatio(1.0);
- mp.setMaxCFSSegmentSizeMB(Double.POSITIVE_INFINITY);
MockAnalyzer analyzer = new MockAnalyzer(random());

// Don't filter out tokens that are too short because we use those tokens in assertions (#14344)
analyzer.setMaxTokenLength(RandomizedTest.randomIntBetween(5, IndexWriter.MAX_TERM_LENGTH));

// TODO: remove randomness
IndexWriterConfig conf = new IndexWriterConfig(analyzer);
+ conf.getCodec().compoundFormat().setShouldUseCompoundFile(true);
+ conf.getCodec().compoundFormat().setMaxCFSSegmentSizeMB(Double.POSITIVE_INFINITY);
conf.setMergePolicy(mp);
conf.setUseCompoundFile(false);
conf.setCodec(TestUtil.getDefaultCodec());
@@ -57,8 +57,6 @@ public static Iterable<Object[]> testVersionsFactory() {
@Override
protected void createIndex(Directory directory) throws IOException {
LogByteSizeMergePolicy mp = new LogByteSizeMergePolicy();
- mp.setNoCFSRatio(1.0);
- mp.setMaxCFSSegmentSizeMB(Double.POSITIVE_INFINITY);
MockAnalyzer analyzer = new MockAnalyzer(random());
analyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH));

@@ -67,6 +65,8 @@ protected void createIndex(Directory directory) throws IOException {
.setMergePolicy(mp)
.setCodec(TestUtil.getDefaultCodec())
.setUseCompoundFile(false);
+ conf.getCodec().compoundFormat().setShouldUseCompoundFile(true);
+ conf.getCodec().compoundFormat().setMaxCFSSegmentSizeMB(Double.POSITIVE_INFINITY);
IndexWriter writer = new IndexWriter(directory, conf);
LineFileDocs docs = new LineFileDocs(new Random(0));
for (int i = 0; i < 50; i++) {
@@ -180,7 +180,7 @@ public PostingsFormat postingsFormat() {
throw new RuntimeException(
"unable to instantiate class '" + mergePolicy + "' as merge policy", e);
}
- iwConf.getMergePolicy().setNoCFSRatio(isCompound ? 1.0 : 0.0);
+ iwConf.getCodec().compoundFormat().setShouldUseCompoundFile(isCompound);
if (iwConf.getMergePolicy() instanceof LogMergePolicy) {
LogMergePolicy logMergePolicy = (LogMergePolicy) iwConf.getMergePolicy();
logMergePolicy.setMergeFactor(
@@ -642,7 +642,7 @@ public void testIndexWriterSettings() throws Exception {
assertEquals(
IndexWriterConfig.DISABLE_AUTO_FLUSH, (int) writer.getConfig().getRAMBufferSizeMB());
assertEquals(3, ((LogMergePolicy) writer.getConfig().getMergePolicy()).getMergeFactor());
- assertEquals(0.0d, writer.getConfig().getMergePolicy().getNoCFSRatio(), 0.0);
+ assertFalse(writer.getConfig().getCodec().compoundFormat().getShouldUseCompoundFile());
writer.close();
Directory dir = benchmark.getRunData().getDirectory();
IndexReader reader = DirectoryReader.open(dir);
lucene/core/src/java/org/apache/lucene/codecs/CompoundFormat.java (145 additions, 0 deletions)
@@ -17,6 +17,7 @@
package org.apache.lucene.codecs;

import java.io.IOException;
+ import org.apache.lucene.index.MergePolicy;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
@@ -34,6 +35,150 @@ protected CompoundFormat() {}
// TODO: this is very minimal. If we need more methods,
// we can add 'producer' classes.

+ /** Default document count threshold for using compound files with LogDocMergePolicy */
+ static final int DEFAULT_CFS_THRESHOLD_DOC_SIZE = 65536; // docs
+
+ /** Default byte size threshold for using compound files with other merge policies (64MB) */
+ static final long DEFAULT_CFS_THRESHOLD_BYTE_SIZE = 64L * 1024 * 1024; // 64MB
+
+ /** Default maximum segment size allowed for compound files (no limit) */
+ static final long DEFAULT_MAX_CFS_SEGMENT_SIZE = Long.MAX_VALUE;
+
+ /** Document count threshold for LogDocMergePolicy */
+ private int cfsThresholdDocSize = DEFAULT_CFS_THRESHOLD_DOC_SIZE;
+
+ /** Byte size threshold for other merge policies */
+ private long cfsThresholdByteSize = DEFAULT_CFS_THRESHOLD_BYTE_SIZE;
+
+ /** Whether compound files should be used at all */
+ private boolean shouldUseCompoundFile = true;
+
+ /** Maximum segment size that can be stored as compound file */
+ private long maxCFSSegmentSize = DEFAULT_MAX_CFS_SEGMENT_SIZE;
+
+ /**
+ * Sets the document count threshold for using compound files with LogDocMergePolicy. Segments
+ * with document count less than or equal to this threshold will use compound files.
+ *
+ * @param threshold the document count threshold
+ */
+ public void setCfsThresholdDocSize(int threshold) {
+ this.cfsThresholdDocSize = threshold;
+ }
+
+ /**
+ * Sets the byte size threshold for using compound files with merge policies other than
+ * LogDocMergePolicy. Segments with size less than or equal to this threshold will use compound
+ * files.
+ *
+ * @param thresholdBytes the byte size threshold in bytes
+ */
+ public void setCfsThresholdByteSize(long thresholdBytes) {
+ this.cfsThresholdByteSize = thresholdBytes;
+ }
+
+ /**
+ * Returns the current document count threshold for compound files.
+ *
+ * @return the document count threshold
+ */
+ public int getCfsThresholdDocSize() {
+ return this.cfsThresholdDocSize;
+ }
+
+ /**
+ * Returns the current byte size threshold for compound files.
+ *
+ * @return the byte size threshold in bytes
+ */
+ public long getCfsThresholdByteSize() {
+ return this.cfsThresholdByteSize;
+ }
+
+ /**
+ * Enables or disables the use of compound files entirely. When disabled, no segments will use
+ * compound files regardless of other settings.
+ *
+ * @param useCompoundFile true to enable compound files, false to disable
+ */
+ public void setShouldUseCompoundFile(boolean useCompoundFile) {
+ this.shouldUseCompoundFile = useCompoundFile;
+ }
+
+ /**
+ * Returns whether compound files are enabled.
+ *
+ * @return true if compound files are enabled, false otherwise
+ */
+ public boolean getShouldUseCompoundFile() {
+ return this.shouldUseCompoundFile;
+ }
+
+ /**
+ * Returns the largest size allowed for a compound file segment in megabytes. Segments larger than
+ * this size will not use compound files even if otherwise eligible.
+ *
+ * @return the maximum compound file segment size in MB
+ */
+ public double getMaxCFSSegmentSizeMB() {
+ return maxCFSSegmentSize / 1024. / 1024.;
+ }
+
+ /**
+ * Sets the maximum size limit for compound file segments in megabytes. If a merged segment will
+ * be larger than this value, it will be left as a non-compound file even if compound files are
+ * enabled. Set this to Double.POSITIVE_INFINITY (default) to always use CFS when other conditions
+ * are met.
+ *
+ * @param v the maximum segment size in MB (must be >= 0)
+ * @throws IllegalArgumentException if v is negative
+ */
+ public void setMaxCFSSegmentSizeMB(double v) {
+ if (v < 0.0) {
+ throw new IllegalArgumentException("maxCFSSegmentSizeMB must be >=0 (got " + v + ")");
+ }
+ v *= 1024 * 1024; // Convert MB to bytes
+ this.maxCFSSegmentSize = v > Long.MAX_VALUE ? Long.MAX_VALUE : (long) v;
+ }
+
+ /**
+ * Determines whether a segment should use the compound file format based on its size and merge
+ * policy.
+ *
+ * <p>The decision logic is as follows:
+ *
+ * <ol>
+ *   <li>If compound files are disabled globally, return false
+ *   <li>If segment size exceeds the maximum CFS segment size, return false
+ *   <li>For DOCS-based policies: use CFS if document count ≤ document threshold
+ *   <li>For BYTES-based policies: use CFS if byte size ≤ byte threshold
+ * </ol>
+ *
+ * @param mergedInfoSize the size of the segment (document count for DOCS-based policies, bytes
+ *     for BYTES-based policies)
+ * @param mergePolicy the merge policy being used
+ * @return true if the segment should use compound file format, false otherwise
+ * @throws IOException if an I/O error occurs
+ */
+ public boolean useCompoundFile(long mergedInfoSize, MergePolicy mergePolicy) throws IOException {
+ // Check if compound files are globally disabled
+ if (this.shouldUseCompoundFile == false) {
+ return false;
+ }
+
+ // Check if segment exceeds maximum allowed size for CFS
+ if (mergedInfoSize > maxCFSSegmentSize) {
+ return false;
+ }
+
+ // Apply appropriate threshold based on merge policy's size unit
+ if (mergePolicy.getSizeUnit() == MergePolicy.SizeUnit.DOCS) {
+ return mergedInfoSize <= this.cfsThresholdDocSize;
+ } else {
+ return mergedInfoSize <= this.cfsThresholdByteSize;
+ }
+ }
+
/** Returns a Directory view (read-only) for the compound files in this segment */
public abstract CompoundDirectory getCompoundReader(Directory dir, SegmentInfo si)
throws IOException;
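To make the threshold routing above concrete, here is a small sketch against the defaults defined in this file (65,536 docs for DOCS-based policies, 64 MB for BYTES-based ones). It leans on LogDocMergePolicy reporting SizeUnit.DOCS, added in the last hunk of this diff; the class wrapper is illustrative, not a test from the PR:

    import java.io.IOException;
    import org.apache.lucene.codecs.Codec;
    import org.apache.lucene.codecs.CompoundFormat;
    import org.apache.lucene.index.LogDocMergePolicy;

    public class CfsThresholdSketch {
      public static void main(String[] args) throws IOException {
        CompoundFormat cfs = Codec.getDefault().compoundFormat();
        LogDocMergePolicy byDocs = new LogDocMergePolicy(); // reports SizeUnit.DOCS

        // 50,000 docs <= the 65,536-doc default threshold: compound file.
        System.out.println(cfs.useCompoundFile(50_000, byDocs)); // true

        // 100,000 docs exceeds the doc threshold: plain files.
        System.out.println(cfs.useCompoundFile(100_000, byDocs)); // false
      }
    }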
@@ -75,36 +75,14 @@ public MergeSpecification findFullFlushMerges(
return in.findFullFlushMerges(mergeTrigger, segmentInfos, mergeContext);
}

- @Override
- public boolean useCompoundFile(
- SegmentInfos infos, SegmentCommitInfo mergedInfo, MergeContext mergeContext)
- throws IOException {
- return in.useCompoundFile(infos, mergedInfo, mergeContext);
- }
-
@Override
protected long size(SegmentCommitInfo info, MergeContext context) throws IOException {
return in.size(info, context);
}

- @Override
- public double getNoCFSRatio() {
- return in.getNoCFSRatio();
- }
-
- @Override
- public final void setNoCFSRatio(double noCFSRatio) {
- in.setNoCFSRatio(noCFSRatio);
- }
-
- @Override
- public final void setMaxCFSSegmentSizeMB(double v) {
- in.setMaxCFSSegmentSizeMB(v);
- }
-
@Override
- public final double getMaxCFSSegmentSizeMB() {
- return in.getMaxCFSSegmentSizeMB();
+ public SizeUnit getSizeUnit() {
+ return super.getSizeUnit();
}

@Override
lucene/core/src/java/org/apache/lucene/index/IndexWriter.java (14 additions, 2 deletions)
@@ -3481,7 +3481,13 @@ public void addIndexesReaderMerge(MergePolicy.OneMerge merge) throws IOException
boolean useCompoundFile;
synchronized (this) {
merge.checkAborted();
- useCompoundFile = mergePolicy.useCompoundFile(segmentInfos, merge.getMergeInfo(), this);
+ useCompoundFile =
+ merge
+ .getMergeInfo()
+ .info
+ .getCodec()
+ .compoundFormat()
+ .useCompoundFile(mergePolicy.size(merge.getMergeInfo(), this), mergePolicy);
}

// Now create the compound file if needed
@@ -5337,7 +5343,13 @@ public int length() {
// this segment:
boolean useCompoundFile;
synchronized (this) { // Guard segmentInfos
- useCompoundFile = mergePolicy.useCompoundFile(segmentInfos, merge.info, this);
+ useCompoundFile =
+ merge
+ .getMergeInfo()
+ .info
+ .getCodec()
+ .compoundFormat()
+ .useCompoundFile(mergePolicy.size(merge.getMergeInfo(), this), mergePolicy);
}

if (useCompoundFile) {
@@ -21,6 +21,7 @@
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.codecs.Codec;
+ import org.apache.lucene.codecs.CompoundFormat;
import org.apache.lucene.index.IndexWriter.IndexReaderWarmer;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.IndexSearcher;
@@ -368,9 +369,8 @@ public InfoStream getInfoStream() {
*
* <p>Use <code>false</code> for batch indexing with very large ram buffer settings.
*
- * <p><b>Note: To control compound file usage during segment merges see {@link
- * MergePolicy#setNoCFSRatio(double)} and {@link MergePolicy#setMaxCFSSegmentSizeMB(double)}. This
- * setting only applies to newly created segments.</b>
+ * <p><b>Note: To control compound file usage during segment merges, see {@link
+ * CompoundFormat}. This setting only applies to newly created segments.</b>
*/
public LiveIndexWriterConfig setUseCompoundFile(boolean useCompoundFile) {
this.useCompoundFile = useCompoundFile;
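The tests above pair conf.setUseCompoundFile(false) with compoundFormat().setShouldUseCompoundFile(true), which reads as two independent knobs: the IndexWriterConfig flag for newly flushed segments, the CompoundFormat settings for merged ones. A sketch of that split, mirroring the test setup rather than any documented guarantee (the class wrapper and analyzer choice are illustrative):

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.index.IndexWriterConfig;

    public class FlushVsMergeCfs {
      static IndexWriterConfig newConfig() {
        IndexWriterConfig conf = new IndexWriterConfig(new StandardAnalyzer());
        conf.setUseCompoundFile(false); // newly flushed segments: plain files
        // merged segments: compound files, with no size cap
        conf.getCodec().compoundFormat().setShouldUseCompoundFile(true);
        conf.getCodec().compoundFormat().setMaxCFSSegmentSizeMB(Double.POSITIVE_INFINITY);
        return conf;
      }
    }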
@@ -42,6 +42,11 @@ protected long size(SegmentCommitInfo info, MergeContext mergeContext) throws IO
return sizeDocs(info, mergeContext);
}

+ @Override
+ public SizeUnit getSizeUnit() {
+ return SizeUnit.DOCS;
+ }
+
/**
* Sets the minimum size for the lowest level segments. Any segments below this size are
* candidates for full-flush merges and merged more aggressively in order to avoid having a long
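LogDocMergePolicy reporting SizeUnit.DOCS is what routes it to the document-count threshold in CompoundFormat.useCompoundFile; policies that measure bytes presumably report SizeUnit.BYTES and hit the byte threshold (the base implementation is not shown in this diff). A sketch of tuning both thresholds; the values are purely illustrative:

    import org.apache.lucene.codecs.Codec;
    import org.apache.lucene.codecs.CompoundFormat;

    public class CfsTuningSketch {
      public static void main(String[] args) {
        CompoundFormat cfs = Codec.getDefault().compoundFormat();
        // Consulted when the merge policy reports SizeUnit.DOCS (e.g. LogDocMergePolicy).
        cfs.setCfsThresholdDocSize(10_000);
        // Consulted when the merge policy reports SizeUnit.BYTES; 128 MB here.
        cfs.setCfsThresholdByteSize(128L * 1024 * 1024);
      }
    }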