@@ -116,11 +116,16 @@ public final class Lucene99HnswVectorsFormat extends KnnVectorsFormat {
116116 public static final int DEFAULT_NUM_MERGE_WORKER = 1 ;
117117
118118 /**
119- * Threshold below which HNSW graph building is bypassed for tiny segments. Segments with fewer
120- * vectors will use flat storage only, improving indexing performance when having frequent
121- * flushes.
119+ * Threshold, expressed as a value of k for the HnswGraphSearcher#expectedVisitedNodes heuristic,
120+ * used to dynamically find the size below which HNSW graph building is bypassed. Here k is the
121+ * number of docs to match the query for a graph. So,
122+ *
123+ * <pre> k &lt;&lt; size / log(size) </pre>
124+ *
125+ * i.e. k is at least 1 order less than size / log(size), where size is the number of nodes in the
126+ * graph.
122127 */
123- public static final int HNSW_GRAPH_THRESHOLD = 10_000 ;
128+ public static final int HNSW_GRAPH_THRESHOLD = 10 ;
124129
125130 static final int DIRECT_MONOTONIC_BLOCK_SHIFT = 16 ;
126131
@@ -145,15 +150,16 @@ public final class Lucene99HnswVectorsFormat extends KnnVectorsFormat {
145150 private final TaskExecutor mergeExec ;
146151
147152 /**
148- * Whether to bypass HNSW graph building for tiny segments (below {@link #HNSW_GRAPH_THRESHOLD}).
149- * When enabled, segments with fewer than the threshold number of vectors will store only flat
150- * vectors, significantly improving indexing performance for workloads with frequent flushes .
153+ * The threshold used to bypass HNSW graph building for tiny segments, in terms of k for a graph,
154+ * i.e. the number of docs to match the query (default is {@link
155+ * Lucene99HnswVectorsFormat#HNSW_GRAPH_THRESHOLD}).
151156 */
152- private final boolean bypassTinySegments ;
157+ private final int tinySegmentsThreshold ;
153158
154159 /** Constructs a format using default graph construction parameters */
155160 public Lucene99HnswVectorsFormat () {
156- this (DEFAULT_MAX_CONN , DEFAULT_BEAM_WIDTH , DEFAULT_NUM_MERGE_WORKER , null , false );
161+ this (
162+ DEFAULT_MAX_CONN , DEFAULT_BEAM_WIDTH , DEFAULT_NUM_MERGE_WORKER , null , HNSW_GRAPH_THRESHOLD );
157163 }
158164
159165 /**
@@ -163,18 +169,19 @@ public Lucene99HnswVectorsFormat() {
163169 * @param beamWidth the size of the queue maintained during graph construction.
164170 */
165171 public Lucene99HnswVectorsFormat (int maxConn , int beamWidth ) {
166- this (maxConn , beamWidth , DEFAULT_NUM_MERGE_WORKER , null , false );
172+ this (maxConn , beamWidth , DEFAULT_NUM_MERGE_WORKER , null , HNSW_GRAPH_THRESHOLD );
167173 }
168174
169175 /**
170176 * Constructs a format using the given graph construction parameters.
171177 *
172178 * @param maxConn the maximum number of connections to a node in the HNSW graph
173179 * @param beamWidth the size of the queue maintained during graph construction.
174- * @param bypassTinySegments whether to bypass HNSW graph building for tiny segments
180+ * @param tinySegmentsThreshold the value of k for the expectedVisitedNodes heuristic, used to
181+ * determine the minimum required graph nodes
175182 */
176- public Lucene99HnswVectorsFormat (int maxConn , int beamWidth , boolean bypassTinySegments ) {
177- this (maxConn , beamWidth , DEFAULT_NUM_MERGE_WORKER , null , bypassTinySegments );
183+ public Lucene99HnswVectorsFormat (int maxConn , int beamWidth , int tinySegmentsThreshold ) {
184+ this (maxConn , beamWidth , DEFAULT_NUM_MERGE_WORKER , null , tinySegmentsThreshold );
178185 }
179186
180187 /**
@@ -190,7 +197,7 @@ public Lucene99HnswVectorsFormat(int maxConn, int beamWidth, boolean bypassTinyS
190197 */
191198 public Lucene99HnswVectorsFormat (
192199 int maxConn , int beamWidth , int numMergeWorkers , ExecutorService mergeExec ) {
193- this (maxConn , beamWidth , numMergeWorkers , mergeExec , false );
200+ this (maxConn , beamWidth , numMergeWorkers , mergeExec , HNSW_GRAPH_THRESHOLD );
194201 }
195202
196203 /**
@@ -203,16 +210,15 @@ public Lucene99HnswVectorsFormat(
203210 * @param mergeExec the {@link ExecutorService} that will be used by ALL vector writers that are
204211 * generated by this format to do the merge. If null, the configured {@link
205212 * MergeScheduler#getIntraMergeExecutor(MergePolicy.OneMerge)} is used.
206- * @param bypassTinySegments whether to bypass HNSW graph building for tiny segments (below {@link
207- * #HNSW_GRAPH_THRESHOLD} vectors). When enabled, improves indexing performance for workloads
208- * with frequent flushes.
213+ * @param tinySegmentsThreshold the value of k for the expectedVisitedNodes heuristic, used to
214+ * determine the minimum required graph nodes
209215 */
210216 public Lucene99HnswVectorsFormat (
211217 int maxConn ,
212218 int beamWidth ,
213219 int numMergeWorkers ,
214220 ExecutorService mergeExec ,
215- boolean bypassTinySegments ) {
221+ int tinySegmentsThreshold ) {
216222 super ("Lucene99HnswVectorsFormat" );
217223 if (maxConn <= 0 || maxConn > MAXIMUM_MAX_CONN ) {
218224 throw new IllegalArgumentException (
@@ -230,7 +236,7 @@ public Lucene99HnswVectorsFormat(
230236 }
231237 this .maxConn = maxConn ;
232238 this .beamWidth = beamWidth ;
233- this .bypassTinySegments = bypassTinySegments ;
239+ this .tinySegmentsThreshold = tinySegmentsThreshold ;
234240 if (numMergeWorkers == 1 && mergeExec != null ) {
235241 throw new IllegalArgumentException (
236242 "No executor service is needed as we'll use single thread to merge" );
@@ -252,13 +258,13 @@ public KnnVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException
252258 flatVectorsFormat .fieldsWriter (state ),
253259 numMergeWorkers ,
254260 mergeExec ,
255- bypassTinySegments );
261+ tinySegmentsThreshold );
256262 }
257263
258264 @ Override
259265 public KnnVectorsReader fieldsReader (SegmentReadState state ) throws IOException {
260266 return new Lucene99HnswVectorsReader (
261- state , flatVectorsFormat .fieldsReader (state ), bypassTinySegments );
267+ state , flatVectorsFormat .fieldsReader (state ), tinySegmentsThreshold );
262268 }
263269
264270 @ Override
@@ -272,8 +278,8 @@ public String toString() {
272278 + maxConn
273279 + ", beamWidth="
274280 + beamWidth
275- + ", bypassTinySegments ="
276- + bypassTinySegments
281+ + ", tinySegmentsThreshold ="
282+ + tinySegmentsThreshold
277283 + ", flatVectorFormat="
278284 + flatVectorsFormat
279285 + ")" ;
0 commit comments