Skip to content

Commit 39969e5

Browse files
committed
Bulk score hnsw diversity check (#15607)
* Utilize bulk scoring interface during HNSW graph builder diversity check
* iter
* iter
* iter
* adding changes, adjusting bulk chunk size
* iter
* fixing chunk size
* no tail needed, 🤦
* iter
* iter
1 parent 0bf85ad commit 39969e5

File tree

2 files changed

+16
-3
lines changed

2 files changed

+16
-3
lines changed

lucene/CHANGES.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,9 @@ Optimizations
174174
* GITHUB#15632: Use a coarser-grained competitive iterator with lower construction costs for
175175
numeric sorts against fields with DocValuesSkippers. (Alan Woodward)
176176

177+
* GITHUB#15607: Utilize bulk scoring for diversity checking when building HNSW vector indices. This results
178+
in some performance improvements during indexing and segment merges. (Ben Trent)
179+
177180
Bug Fixes
178181
---------------------
179182
* GITHUB#14161: PointInSetQuery's constructor now throws IllegalArgumentException

lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphBuilder.java

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,9 +64,13 @@ public class HnswGraphBuilder implements HnswBuilder {
6464
@SuppressWarnings("NonFinalStaticField")
6565
public static long randSeed = DEFAULT_RAND_SEED;
6666

67+
private static final int MAX_BULK_SCORE_NODES = 8;
68+
6769
protected final int M; // max number of connections on upper layers
6870
private final double ml;
6971

72+
private final int[] bulkScoreNodes; // for bulk scoring
73+
private final float[] bulkScores; // for bulk scoring
7074
private final SplittableRandom random;
7175
protected final UpdateableRandomVectorScorer scorer;
7276
protected final HnswGraphSearcher graphSearcher;
@@ -177,6 +181,10 @@ protected HnswGraphBuilder(
177181
this.hnsw = hnsw;
178182
this.hnswLock = hnswLock;
179183
this.graphSearcher = graphSearcher;
184+
// pick a number that keeps us from scoring TOO much for diversity checking
185+
// but enough to take advantage of bulk scoring
186+
this.bulkScoreNodes = new int[MAX_BULK_SCORE_NODES];
187+
this.bulkScores = new float[MAX_BULK_SCORE_NODES];
180188
entryCandidates = new GraphBuilderKnnCollector(1);
181189
beamCandidates = new GraphBuilderKnnCollector(beamWidth);
182190
beamCandidates0 = new GraphBuilderKnnCollector(Math.min(beamWidth / 2, M * 3));
@@ -491,9 +499,11 @@ static void popToScratch(GraphBuilderKnnCollector candidates, NeighborArray scra
491499
*/
492500
private boolean diversityCheck(float score, NeighborArray neighbors, RandomVectorScorer scorer)
493501
throws IOException {
494-
for (int i = 0; i < neighbors.size(); i++) {
495-
float neighborSimilarity = scorer.score(neighbors.nodes()[i]);
496-
if (neighborSimilarity >= score) {
502+
final int bulkScoreChunk = Math.min((neighbors.size() + 1) / 2, bulkScoreNodes.length);
503+
for (int scored = 0; scored < neighbors.size(); scored += bulkScoreChunk) {
504+
int chunkSize = Math.min(bulkScoreChunk, neighbors.size() - scored);
505+
System.arraycopy(neighbors.nodes(), scored, bulkScoreNodes, 0, chunkSize);
506+
if (scorer.bulkScore(bulkScoreNodes, bulkScores, chunkSize) >= score) {
497507
return false;
498508
}
499509
}

0 commit comments

Comments
 (0)