Skip to content

Commit c646d43

Browse files
authored
Bulk score hnsw diversity check (#15607)
* Utilize bulk scoring interface during HNSW graph builder diversity check * iter * iter * iter * adding changes, adjusting bulk chunk size * iter * fixing chunk size * no tail needed, 🤦 * iter * iter
1 parent bea2f67 commit c646d43

File tree

2 files changed

+16
-3
lines changed

2 files changed

+16
-3
lines changed

lucene/CHANGES.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,9 @@ Optimizations
339339
* GITHUB#15632: Use a coarser-grained competitive iterator with lower construction costs for
340340
numeric sorts against fields with DocValuesSkippers. (Alan Woodward)
341341

342+
* GITHUB#15607: Utilize bulk scoring for diversity checking when building HNSW vector indices. This results
343+
in some performance improvements during indexing and segment merges. (Ben Trent)
344+
342345
Bug Fixes
343346
---------------------
344347
* GITHUB#14161: PointInSetQuery's constructor now throws IllegalArgumentException

lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphBuilder.java

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,9 +64,13 @@ public class HnswGraphBuilder implements HnswBuilder {
6464
@SuppressWarnings("NonFinalStaticField")
6565
public static long randSeed = DEFAULT_RAND_SEED;
6666

67+
private static final int MAX_BULK_SCORE_NODES = 8;
68+
6769
protected final int M; // max number of connections on upper layers
6870
private final double ml;
6971

72+
private final int[] bulkScoreNodes; // for bulk scoring
73+
private final float[] bulkScores; // for bulk scoring
7074
private final SplittableRandom random;
7175
protected final UpdateableRandomVectorScorer scorer;
7276
protected final HnswGraphSearcher graphSearcher;
@@ -156,6 +160,10 @@ protected HnswGraphBuilder(
156160
this.hnsw = hnsw;
157161
this.hnswLock = hnswLock;
158162
this.graphSearcher = graphSearcher;
163+
// scratch buffers for bulk scoring, capped at MAX_BULK_SCORE_NODES: small enough that the
164+
// diversity check does not score too many neighbors before exiting early, large enough to benefit from bulk scoring
165+
this.bulkScoreNodes = new int[MAX_BULK_SCORE_NODES];
166+
this.bulkScores = new float[MAX_BULK_SCORE_NODES];
159167
entryCandidates = new GraphBuilderKnnCollector(1);
160168
beamCandidates = new GraphBuilderKnnCollector(beamWidth);
161169
beamCandidates0 = new GraphBuilderKnnCollector(Math.min(beamWidth / 2, M * 3));
@@ -470,9 +478,11 @@ static void popToScratch(GraphBuilderKnnCollector candidates, NeighborArray scra
470478
*/
471479
private boolean diversityCheck(float score, NeighborArray neighbors, RandomVectorScorer scorer)
472480
throws IOException {
473-
for (int i = 0; i < neighbors.size(); i++) {
474-
float neighborSimilarity = scorer.score(neighbors.nodes()[i]);
475-
if (neighborSimilarity >= score) {
481+
final int bulkScoreChunk = Math.min((neighbors.size() + 1) / 2, bulkScoreNodes.length);
482+
for (int scored = 0; scored < neighbors.size(); scored += bulkScoreChunk) {
483+
int chunkSize = Math.min(bulkScoreChunk, neighbors.size() - scored);
484+
System.arraycopy(neighbors.nodes(), scored, bulkScoreNodes, 0, chunkSize);
485+
if (scorer.bulkScore(bulkScoreNodes, bulkScores, chunkSize) >= score) {
476486
return false;
477487
}
478488
}

0 commit comments

Comments
 (0)