Skip to content

Commit 210d593

Browse files
committed
Address some comments
1 parent e5b07d2 commit 210d593

File tree

8 files changed

+147
-82
lines changed

8 files changed

+147
-82
lines changed

lucene/core/src/java/org/apache/lucene/codecs/lucene102/Lucene102HnswBinaryQuantizedVectorsFormat.java

Lines changed: 52 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat.DEFAULT_BEAM_WIDTH;
2020
import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat.DEFAULT_MAX_CONN;
2121
import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat.DEFAULT_NUM_MERGE_WORKER;
22+
import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat.HNSW_GRAPH_THRESHOLD;
2223
import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat.MAXIMUM_BEAM_WIDTH;
2324
import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat.MAXIMUM_MAX_CONN;
2425

@@ -64,9 +65,17 @@ public class Lucene102HnswBinaryQuantizedVectorsFormat extends KnnVectorsFormat
6465
private final int numMergeWorkers;
6566
private final TaskExecutor mergeExec;
6667

68+
/**
69+
* The threshold used to bypass HNSW graph building for tiny segments, expressed in terms of k for
70+
* a graph, i.e. the number of docs to match the query (default is {@link
71+
* Lucene99HnswVectorsFormat#HNSW_GRAPH_THRESHOLD}).
72+
*/
73+
private final int tinySegmentsThreshold;
74+
6775
/** Constructs a format using default graph construction parameters */
6876
public Lucene102HnswBinaryQuantizedVectorsFormat() {
69-
this(DEFAULT_MAX_CONN, DEFAULT_BEAM_WIDTH, DEFAULT_NUM_MERGE_WORKER, null);
77+
this(
78+
DEFAULT_MAX_CONN, DEFAULT_BEAM_WIDTH, DEFAULT_NUM_MERGE_WORKER, null, HNSW_GRAPH_THRESHOLD);
7079
}
7180

7281
/**
@@ -76,7 +85,20 @@ public Lucene102HnswBinaryQuantizedVectorsFormat() {
7685
* @param beamWidth the size of the queue maintained during graph construction.
7786
*/
7887
public Lucene102HnswBinaryQuantizedVectorsFormat(int maxConn, int beamWidth) {
79-
this(maxConn, beamWidth, DEFAULT_NUM_MERGE_WORKER, null);
88+
this(maxConn, beamWidth, DEFAULT_NUM_MERGE_WORKER, null, HNSW_GRAPH_THRESHOLD);
89+
}
90+
91+
/**
92+
* Constructs a format using the given graph construction parameters.
93+
*
94+
* @param maxConn the maximum number of connections to a node in the HNSW graph
95+
* @param beamWidth the size of the queue maintained during graph construction.
96+
* @param tinySegmentsThreshold the value of k for the expectedVisitedNodes heuristic, used to
97+
* determine the minimum required graph nodes
98+
*/
99+
public Lucene102HnswBinaryQuantizedVectorsFormat(
100+
int maxConn, int beamWidth, int tinySegmentsThreshold) {
101+
this(maxConn, beamWidth, DEFAULT_NUM_MERGE_WORKER, null, tinySegmentsThreshold);
80102
}
81103

82104
/**
@@ -91,6 +113,27 @@ public Lucene102HnswBinaryQuantizedVectorsFormat(int maxConn, int beamWidth) {
91113
*/
92114
public Lucene102HnswBinaryQuantizedVectorsFormat(
93115
int maxConn, int beamWidth, int numMergeWorkers, ExecutorService mergeExec) {
116+
this(maxConn, beamWidth, numMergeWorkers, mergeExec, HNSW_GRAPH_THRESHOLD);
117+
}
118+
119+
/**
120+
* Constructs a format using the given graph construction parameters and binary quantization.
121+
*
122+
* @param maxConn the maximum number of connections to a node in the HNSW graph
123+
* @param beamWidth the size of the queue maintained during graph construction.
124+
* @param numMergeWorkers number of workers (threads) that will be used when doing merge. If
125+
* larger than 1, a non-null {@link ExecutorService} must be passed as mergeExec
126+
* @param mergeExec the {@link ExecutorService} that will be used by ALL vector writers that are
127+
* generated by this format to do the merge
128+
* @param tinySegmentsThreshold the value of k for the expectedVisitedNodes heuristic, used to
129+
* determine the minimum required graph nodes
130+
*/
131+
public Lucene102HnswBinaryQuantizedVectorsFormat(
132+
int maxConn,
133+
int beamWidth,
134+
int numMergeWorkers,
135+
ExecutorService mergeExec,
136+
int tinySegmentsThreshold) {
94137
super(NAME);
95138
if (maxConn <= 0 || maxConn > MAXIMUM_MAX_CONN) {
96139
throw new IllegalArgumentException(
@@ -107,6 +150,7 @@ public Lucene102HnswBinaryQuantizedVectorsFormat(
107150
+ beamWidth);
108151
}
109152
this.maxConn = maxConn;
153+
this.tinySegmentsThreshold = tinySegmentsThreshold;
110154
this.beamWidth = beamWidth;
111155
if (numMergeWorkers == 1 && mergeExec != null) {
112156
throw new IllegalArgumentException(
@@ -128,12 +172,14 @@ public KnnVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException
128172
beamWidth,
129173
flatVectorsFormat.fieldsWriter(state),
130174
numMergeWorkers,
131-
mergeExec);
175+
mergeExec,
176+
tinySegmentsThreshold);
132177
}
133178

134179
@Override
135180
public KnnVectorsReader fieldsReader(SegmentReadState state) throws IOException {
136-
return new Lucene99HnswVectorsReader(state, flatVectorsFormat.fieldsReader(state));
181+
return new Lucene99HnswVectorsReader(
182+
state, flatVectorsFormat.fieldsReader(state), tinySegmentsThreshold);
137183
}
138184

139185
@Override
@@ -147,6 +193,8 @@ public String toString() {
147193
+ maxConn
148194
+ ", beamWidth="
149195
+ beamWidth
196+
+ ", tinySegmentsThreshold="
197+
+ tinySegmentsThreshold
150198
+ ", flatVectorFormat="
151199
+ flatVectorsFormat
152200
+ ")";

lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswScalarQuantizedVectorsFormat.java

Lines changed: 29 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat.DEFAULT_BEAM_WIDTH;
2121
import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat.DEFAULT_MAX_CONN;
2222
import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat.DEFAULT_NUM_MERGE_WORKER;
23+
import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat.HNSW_GRAPH_THRESHOLD;
2324
import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat.MAXIMUM_BEAM_WIDTH;
2425
import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat.MAXIMUM_MAX_CONN;
2526

@@ -66,12 +67,11 @@ public class Lucene99HnswScalarQuantizedVectorsFormat extends KnnVectorsFormat {
6667
private final TaskExecutor mergeExec;
6768

6869
/**
69-
* Whether to bypass HNSW graph building for tiny segments (below {@link
70-
* Lucene99HnswVectorsFormat#HNSW_GRAPH_THRESHOLD}). When enabled, segments with fewer than the
71-
* threshold number of vectors will store only flat vectors with quantization, significantly
72-
* improving indexing performance for workloads with frequent flushes.
70+
* The threshold used to bypass HNSW graph building for tiny segments, expressed in terms of k for
71+
* a graph, i.e. the number of docs to match the query (default is {@link
72+
* Lucene99HnswVectorsFormat#HNSW_GRAPH_THRESHOLD}).
7373
*/
74-
private final boolean bypassTinySegments;
74+
private final int tinySegmentsThreshold;
7575

7676
/** Constructs a format using default graph construction parameters with 7 bit quantization */
7777
public Lucene99HnswScalarQuantizedVectorsFormat() {
@@ -83,7 +83,7 @@ public Lucene99HnswScalarQuantizedVectorsFormat() {
8383
false,
8484
null,
8585
null,
86-
false);
86+
HNSW_GRAPH_THRESHOLD);
8787
}
8888

8989
/**
@@ -93,19 +93,20 @@ public Lucene99HnswScalarQuantizedVectorsFormat() {
9393
* @param beamWidth the size of the queue maintained during graph construction.
9494
*/
9595
public Lucene99HnswScalarQuantizedVectorsFormat(int maxConn, int beamWidth) {
96-
this(maxConn, beamWidth, DEFAULT_NUM_MERGE_WORKER, 7, false, null, null, false);
96+
this(maxConn, beamWidth, DEFAULT_NUM_MERGE_WORKER, 7, false, null, null, HNSW_GRAPH_THRESHOLD);
9797
}
9898

9999
/**
100100
* Constructs a format using the given graph construction parameters with 7 bit quantization
101101
*
102102
* @param maxConn the maximum number of connections to a node in the HNSW graph
103103
* @param beamWidth the size of the queue maintained during graph construction.
104-
* @param bypassTinySegments whether to bypass HNSW graph building for tiny segments
104+
* @param tinySegmentsThreshold the value of k for the expectedVisitedNodes heuristic, used to
105+
* determine the minimum required graph nodes
105106
*/
106107
public Lucene99HnswScalarQuantizedVectorsFormat(
107-
int maxConn, int beamWidth, boolean bypassTinySegments) {
108-
this(maxConn, beamWidth, DEFAULT_NUM_MERGE_WORKER, 7, false, null, null, bypassTinySegments);
108+
int maxConn, int beamWidth, int tinySegmentsThreshold) {
109+
this(maxConn, beamWidth, DEFAULT_NUM_MERGE_WORKER, 7, false, null, null, tinySegmentsThreshold);
109110
}
110111

111112
/**
@@ -135,7 +136,15 @@ public Lucene99HnswScalarQuantizedVectorsFormat(
135136
boolean compress,
136137
Float confidenceInterval,
137138
ExecutorService mergeExec) {
138-
this(maxConn, beamWidth, numMergeWorkers, bits, compress, confidenceInterval, mergeExec, false);
139+
this(
140+
maxConn,
141+
beamWidth,
142+
numMergeWorkers,
143+
bits,
144+
compress,
145+
confidenceInterval,
146+
mergeExec,
147+
HNSW_GRAPH_THRESHOLD);
139148
}
140149

141150
/**
@@ -156,9 +165,8 @@ public Lucene99HnswScalarQuantizedVectorsFormat(
156165
* accurate pair.
157166
* @param mergeExec the {@link ExecutorService} that will be used by ALL vector writers that are
158167
* generated by this format to do the merge
159-
* @param bypassTinySegments whether to bypass HNSW graph building for tiny segments (below {@link
160-
* Lucene99HnswVectorsFormat#HNSW_GRAPH_THRESHOLD} vectors). When enabled, improves indexing
161-
* performance for workloads with frequent flushes.
168+
* @param tinySegmentsThreshold the value of k for the expectedVisitedNodes heuristic, used to
169+
* determine the minimum required graph nodes
162170
*/
163171
public Lucene99HnswScalarQuantizedVectorsFormat(
164172
int maxConn,
@@ -168,7 +176,7 @@ public Lucene99HnswScalarQuantizedVectorsFormat(
168176
boolean compress,
169177
Float confidenceInterval,
170178
ExecutorService mergeExec,
171-
boolean bypassTinySegments) {
179+
int tinySegmentsThreshold) {
172180
super(NAME);
173181
if (maxConn <= 0 || maxConn > MAXIMUM_MAX_CONN) {
174182
throw new IllegalArgumentException(
@@ -186,7 +194,7 @@ public Lucene99HnswScalarQuantizedVectorsFormat(
186194
}
187195
this.maxConn = maxConn;
188196
this.beamWidth = beamWidth;
189-
this.bypassTinySegments = bypassTinySegments;
197+
this.tinySegmentsThreshold = tinySegmentsThreshold;
190198
if (numMergeWorkers == 1 && mergeExec != null) {
191199
throw new IllegalArgumentException(
192200
"No executor service is needed as we'll use single thread to merge");
@@ -210,12 +218,13 @@ public KnnVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException
210218
flatVectorsFormat.fieldsWriter(state),
211219
numMergeWorkers,
212220
mergeExec,
213-
bypassTinySegments);
221+
tinySegmentsThreshold);
214222
}
215223

216224
@Override
217225
public KnnVectorsReader fieldsReader(SegmentReadState state) throws IOException {
218-
return new Lucene99HnswVectorsReader(state, flatVectorsFormat.fieldsReader(state));
226+
return new Lucene99HnswVectorsReader(
227+
state, flatVectorsFormat.fieldsReader(state), tinySegmentsThreshold);
219228
}
220229

221230
@Override
@@ -229,8 +238,8 @@ public String toString() {
229238
+ maxConn
230239
+ ", beamWidth="
231240
+ beamWidth
232-
+ ", bypassTinySegments="
233-
+ bypassTinySegments
241+
+ ", tinySegmentsThreshold="
242+
+ tinySegmentsThreshold
234243
+ ", flatVectorFormat="
235244
+ flatVectorsFormat
236245
+ ")";

lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java

Lines changed: 29 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -116,11 +116,16 @@ public final class Lucene99HnswVectorsFormat extends KnnVectorsFormat {
116116
public static final int DEFAULT_NUM_MERGE_WORKER = 1;
117117

118118
/**
119-
* Threshold below which HNSW graph building is bypassed for tiny segments. Segments with fewer
120-
* vectors will use flat storage only, improving indexing performance when having frequent
121-
* flushes.
119+
* Threshold which dynamically uses the HnswGraphSearcher#expectedVisitedNodes heuristic to find
120+
* the threshold below which HNSW graph building is bypassed. It is in terms of k for a
121+
* graph i.e. number of docs to match the query. So,
122+
*
123+
* <pre> k &lt;&lt; size / log(size) </pre>
124+
*
125+
* i.e. k is at least one order of magnitude smaller than size / log(size), where size is the number of nodes in the
126+
* graph
122127
*/
123-
public static final int HNSW_GRAPH_THRESHOLD = 10_000;
128+
public static final int HNSW_GRAPH_THRESHOLD = 10;
124129

125130
static final int DIRECT_MONOTONIC_BLOCK_SHIFT = 16;
126131

@@ -145,15 +150,16 @@ public final class Lucene99HnswVectorsFormat extends KnnVectorsFormat {
145150
private final TaskExecutor mergeExec;
146151

147152
/**
148-
* Whether to bypass HNSW graph building for tiny segments (below {@link #HNSW_GRAPH_THRESHOLD}).
149-
* When enabled, segments with fewer than the threshold number of vectors will store only flat
150-
* vectors, significantly improving indexing performance for workloads with frequent flushes.
153+
* The threshold used to bypass HNSW graph building for tiny segments, expressed in terms of k for
154+
* a graph, i.e. the number of docs to match the query (default is {@link
155+
* Lucene99HnswVectorsFormat#HNSW_GRAPH_THRESHOLD}).
151156
*/
152-
private final boolean bypassTinySegments;
157+
private final int tinySegmentsThreshold;
153158

154159
/** Constructs a format using default graph construction parameters */
155160
public Lucene99HnswVectorsFormat() {
156-
this(DEFAULT_MAX_CONN, DEFAULT_BEAM_WIDTH, DEFAULT_NUM_MERGE_WORKER, null, false);
161+
this(
162+
DEFAULT_MAX_CONN, DEFAULT_BEAM_WIDTH, DEFAULT_NUM_MERGE_WORKER, null, HNSW_GRAPH_THRESHOLD);
157163
}
158164

159165
/**
@@ -163,18 +169,19 @@ public Lucene99HnswVectorsFormat() {
163169
* @param beamWidth the size of the queue maintained during graph construction.
164170
*/
165171
public Lucene99HnswVectorsFormat(int maxConn, int beamWidth) {
166-
this(maxConn, beamWidth, DEFAULT_NUM_MERGE_WORKER, null, false);
172+
this(maxConn, beamWidth, DEFAULT_NUM_MERGE_WORKER, null, HNSW_GRAPH_THRESHOLD);
167173
}
168174

169175
/**
170176
* Constructs a format using the given graph construction parameters.
171177
*
172178
* @param maxConn the maximum number of connections to a node in the HNSW graph
173179
* @param beamWidth the size of the queue maintained during graph construction.
174-
* @param bypassTinySegments whether to bypass HNSW graph building for tiny segments
180+
* @param tinySegmentsThreshold the value of k for the expectedVisitedNodes heuristic, used to
181+
* determine the minimum required graph nodes
175182
*/
176-
public Lucene99HnswVectorsFormat(int maxConn, int beamWidth, boolean bypassTinySegments) {
177-
this(maxConn, beamWidth, DEFAULT_NUM_MERGE_WORKER, null, bypassTinySegments);
183+
public Lucene99HnswVectorsFormat(int maxConn, int beamWidth, int tinySegmentsThreshold) {
184+
this(maxConn, beamWidth, DEFAULT_NUM_MERGE_WORKER, null, tinySegmentsThreshold);
178185
}
179186

180187
/**
@@ -190,7 +197,7 @@ public Lucene99HnswVectorsFormat(int maxConn, int beamWidth, boolean bypassTinyS
190197
*/
191198
public Lucene99HnswVectorsFormat(
192199
int maxConn, int beamWidth, int numMergeWorkers, ExecutorService mergeExec) {
193-
this(maxConn, beamWidth, numMergeWorkers, mergeExec, false);
200+
this(maxConn, beamWidth, numMergeWorkers, mergeExec, HNSW_GRAPH_THRESHOLD);
194201
}
195202

196203
/**
@@ -203,16 +210,15 @@ public Lucene99HnswVectorsFormat(
203210
* @param mergeExec the {@link ExecutorService} that will be used by ALL vector writers that are
204211
* generated by this format to do the merge. If null, the configured {@link
205212
* MergeScheduler#getIntraMergeExecutor(MergePolicy.OneMerge)} is used.
206-
* @param bypassTinySegments whether to bypass HNSW graph building for tiny segments (below {@link
207-
* #HNSW_GRAPH_THRESHOLD} vectors). When enabled, improves indexing performance for workloads
208-
* with frequent flushes.
213+
* @param tinySegmentsThreshold the value of k for the expectedVisitedNodes heuristic, used to
214+
* determine the minimum required graph nodes
209215
*/
210216
public Lucene99HnswVectorsFormat(
211217
int maxConn,
212218
int beamWidth,
213219
int numMergeWorkers,
214220
ExecutorService mergeExec,
215-
boolean bypassTinySegments) {
221+
int tinySegmentsThreshold) {
216222
super("Lucene99HnswVectorsFormat");
217223
if (maxConn <= 0 || maxConn > MAXIMUM_MAX_CONN) {
218224
throw new IllegalArgumentException(
@@ -230,7 +236,7 @@ public Lucene99HnswVectorsFormat(
230236
}
231237
this.maxConn = maxConn;
232238
this.beamWidth = beamWidth;
233-
this.bypassTinySegments = bypassTinySegments;
239+
this.tinySegmentsThreshold = tinySegmentsThreshold;
234240
if (numMergeWorkers == 1 && mergeExec != null) {
235241
throw new IllegalArgumentException(
236242
"No executor service is needed as we'll use single thread to merge");
@@ -252,13 +258,13 @@ public KnnVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException
252258
flatVectorsFormat.fieldsWriter(state),
253259
numMergeWorkers,
254260
mergeExec,
255-
bypassTinySegments);
261+
tinySegmentsThreshold);
256262
}
257263

258264
@Override
259265
public KnnVectorsReader fieldsReader(SegmentReadState state) throws IOException {
260266
return new Lucene99HnswVectorsReader(
261-
state, flatVectorsFormat.fieldsReader(state), bypassTinySegments);
267+
state, flatVectorsFormat.fieldsReader(state), tinySegmentsThreshold);
262268
}
263269

264270
@Override
@@ -272,8 +278,8 @@ public String toString() {
272278
+ maxConn
273279
+ ", beamWidth="
274280
+ beamWidth
275-
+ ", bypassTinySegments="
276-
+ bypassTinySegments
281+
+ ", tinySegmentsThreshold="
282+
+ tinySegmentsThreshold
277283
+ ", flatVectorFormat="
278284
+ flatVectorsFormat
279285
+ ")";

0 commit comments

Comments
 (0)