-
Notifications
You must be signed in to change notification settings - Fork 1.3k
Bulk score hnsw diversity check #15607
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 9 commits
f46d076
cb0dff2
f7b2d29
b37d52f
cb92c93
a80f79d
674dc98
6e71ac1
aca7509
2ce3ace
1a602af
1b606c6
929fb6c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -67,6 +67,8 @@ public class HnswGraphBuilder implements HnswBuilder { | |
| protected final int M; // max number of connections on upper layers | ||
| private final double ml; | ||
|
|
||
| private final int[] bulkScoreNodes; // for bulk scoring | ||
| private final float[] bulkScores; // for bulk scoring | ||
| private final SplittableRandom random; | ||
| protected final UpdateableRandomVectorScorer scorer; | ||
| protected final HnswGraphSearcher graphSearcher; | ||
|
|
@@ -156,6 +158,10 @@ protected HnswGraphBuilder( | |
| this.hnsw = hnsw; | ||
| this.hnswLock = hnswLock; | ||
| this.graphSearcher = graphSearcher; | ||
| // pick a number that keeps us from scoring TOO much for diversity checking | ||
| // but enough to take advantage of bulk scoring | ||
| this.bulkScoreNodes = new int[8]; | ||
| this.bulkScores = new float[8]; | ||
|
||
| entryCandidates = new GraphBuilderKnnCollector(1); | ||
| beamCandidates = new GraphBuilderKnnCollector(beamWidth); | ||
| beamCandidates0 = new GraphBuilderKnnCollector(Math.min(beamWidth / 2, M * 3)); | ||
|
|
@@ -470,9 +476,20 @@ static void popToScratch(GraphBuilderKnnCollector candidates, NeighborArray scra | |
| */ | ||
| private boolean diversityCheck(float score, NeighborArray neighbors, RandomVectorScorer scorer) | ||
| throws IOException { | ||
| for (int i = 0; i < neighbors.size(); i++) { | ||
| float neighborSimilarity = scorer.score(neighbors.nodes()[i]); | ||
| if (neighborSimilarity >= score) { | ||
| final int bulkScoreChunk = Math.min((neighbors.size() + 1) / 2, bulkScoreNodes.length); | ||
| int scored = 0; | ||
| for (scored = 0; scored < neighbors.size(); scored += bulkScoreChunk) { | ||
| int chunkSize = Math.min(bulkScoreChunk, neighbors.size() - scored); | ||
| System.arraycopy(neighbors.nodes(), scored, bulkScoreNodes, 0, chunkSize); | ||
| if (scorer.bulkScore(bulkScoreNodes, bulkScores, chunkSize) >= score) { | ||
| return false; | ||
| } | ||
| } | ||
| // handle a tail | ||
| if (scored < neighbors.size()) { | ||
| int chunkSize = neighbors.size() - scored; | ||
| System.arraycopy(neighbors.nodes(), scored, bulkScoreNodes, 0, chunkSize); | ||
| if (scorer.bulkScore(bulkScoreNodes, bulkScores, chunkSize) >= score) { | ||
| return false; | ||
| } | ||
|
||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I was worried about thread-safety of these arrays (given we have concurrent merging), but from this comment it looks like instances of this class are not shared across threads, but rather multiple instances of this class (across different threads) can operate on a single
HnswGraph?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@kaivalnp the scorer itself isn't threadsafe. I assumed that since we were using a scorer, we were OK.
I had the same threading concerns and looked up and it seems that each thread as a unique builder object instance (The worker of the thread) and they all work on the same graph.