Skip to content

Commit 68beb1a

Browse files
LUCENE-10054 Make HnswGraph hierarchical (#608) (#629)
Currently HNSW has only a single layer. This patch makes the HNSW graph multi-layered. This PR is based on the following PRs: #250, #267, #287, #315, #536, #416. Main changes: - Multiple layers are introduced into HnswGraph and HnswGraphBuilder. - A new Lucene91HnswVectorsFormat, with a new Lucene91HnswVectorsReader and Lucene91HnswVectorsWriter, is introduced to encode the graph layers' information. - Lucene90Codec, Lucene90HnswVectorsFormat, and the reading logic of Lucene90HnswVectorsReader and Lucene90HnswGraph are moved to backward_codecs to support reading and searching of graphs built in pre-9.1 versions. Lucene90HnswVectorsWriter is deleted. - For backwards-compatibility tests, the previous Lucene90 graph reading and writing logic was copied into the test files Lucene90RWHnswVectorsFormat, Lucene90HnswVectorsWriter, Lucene90HnswGraphBuilder and Lucene90HnswRWGraph. TODO: tests for KNN search on graphs built in pre-9.1 versions; tests for merging indices of pre-9.1 and current versions.
1 parent f561403 commit 68beb1a

39 files changed

+3011
-749
lines changed

lucene/CHANGES.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,10 @@ Improvements
108108
* LUCENE-10229: Unify behaviour of match offsets for interval queries on fields
109109
with or without offsets enabled. (Patrick Zhai)
110110

111+
* LUCENE-10054: Make HnswGraph hierarchical (Mayya Sharipova, Julie Tibshirani, Mike Sokolov,
112+
Adrien Grand)
113+
114+
111115
Optimizations
112116
---------------------
113117

lucene/backward-codecs/src/java/module-info.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
exports org.apache.lucene.backward_codecs.lucene84;
3030
exports org.apache.lucene.backward_codecs.lucene86;
3131
exports org.apache.lucene.backward_codecs.lucene87;
32+
exports org.apache.lucene.backward_codecs.lucene90;
3233
exports org.apache.lucene.backward_codecs.packed;
3334
exports org.apache.lucene.backward_codecs.store;
3435

@@ -38,10 +39,13 @@
3839
provides org.apache.lucene.codecs.PostingsFormat with
3940
org.apache.lucene.backward_codecs.lucene50.Lucene50PostingsFormat,
4041
org.apache.lucene.backward_codecs.lucene84.Lucene84PostingsFormat;
42+
provides org.apache.lucene.codecs.KnnVectorsFormat with
43+
org.apache.lucene.backward_codecs.lucene90.Lucene90HnswVectorsFormat;
4144
provides org.apache.lucene.codecs.Codec with
4245
org.apache.lucene.backward_codecs.lucene70.Lucene70Codec,
4346
org.apache.lucene.backward_codecs.lucene80.Lucene80Codec,
4447
org.apache.lucene.backward_codecs.lucene84.Lucene84Codec,
4548
org.apache.lucene.backward_codecs.lucene86.Lucene86Codec,
46-
org.apache.lucene.backward_codecs.lucene87.Lucene87Codec;
49+
org.apache.lucene.backward_codecs.lucene87.Lucene87Codec,
50+
org.apache.lucene.backward_codecs.lucene90.Lucene90Codec;
4751
}

lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90Codec.java renamed to lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90Codec.java

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
* See the License for the specific language governing permissions and
1515
* limitations under the License.
1616
*/
17-
package org.apache.lucene.codecs.lucene90;
17+
package org.apache.lucene.backward_codecs.lucene90;
1818

1919
import java.util.Objects;
2020
import org.apache.lucene.codecs.Codec;
@@ -30,6 +30,16 @@
3030
import org.apache.lucene.codecs.SegmentInfoFormat;
3131
import org.apache.lucene.codecs.StoredFieldsFormat;
3232
import org.apache.lucene.codecs.TermVectorsFormat;
33+
import org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat;
34+
import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat;
35+
import org.apache.lucene.codecs.lucene90.Lucene90FieldInfosFormat;
36+
import org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat;
37+
import org.apache.lucene.codecs.lucene90.Lucene90NormsFormat;
38+
import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat;
39+
import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat;
40+
import org.apache.lucene.codecs.lucene90.Lucene90SegmentInfoFormat;
41+
import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat;
42+
import org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat;
3343
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
3444
import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
3545
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
Lines changed: 216 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,216 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.lucene.backward_codecs.lucene90;
19+
20+
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
21+
22+
import java.io.IOException;
23+
import java.util.ArrayList;
24+
import java.util.List;
25+
import java.util.SplittableRandom;
26+
import org.apache.lucene.index.KnnGraphValues;
27+
import org.apache.lucene.index.RandomAccessVectorValues;
28+
import org.apache.lucene.index.VectorSimilarityFunction;
29+
import org.apache.lucene.util.Bits;
30+
import org.apache.lucene.util.SparseFixedBitSet;
31+
import org.apache.lucene.util.hnsw.BoundsChecker;
32+
import org.apache.lucene.util.hnsw.NeighborArray;
33+
import org.apache.lucene.util.hnsw.NeighborQueue;
34+
35+
/**
36+
* Navigable Small-world graph. Provides efficient approximate nearest neighbor search for high
37+
* dimensional vectors. See <a href="https://doi.org/10.1016/j.is.2013.10.006">Approximate nearest
38+
* neighbor algorithm based on navigable small world graphs [2014]</a> and <a
39+
* href="https://arxiv.org/abs/1603.09320">this paper [2018]</a> for details.
40+
*
41+
* <p>The nomenclature is a bit different here from what's used in those papers:
42+
*
43+
* <h2>Hyperparameters</h2>
44+
*
45+
* <ul>
46+
* <li><code>numSeed</code> is the equivalent of <code>m</code> in the 2014 paper; it controls the
47+
* number of random entry points to sample.
48+
* <li><code>beamWidth</code> in {@link Lucene90HnswGraphBuilder} has the same meaning as <code>
49+
* efConst </code> in the 2018 paper. It is the number of nearest neighbor candidates to track
50+
* while searching the graph for each newly inserted node.
51+
* <li><code>maxConn</code> has the same meaning as <code>M</code> in the later paper; it controls
52+
* how many of the <code>efConst</code> neighbors are connected to the new node
53+
* </ul>
54+
*
55+
* <p>Note: The graph may be searched by multiple threads concurrently, but updates are not
56+
* thread-safe. Also note: there is no notion of deletions. Document searching built on top of this
57+
* must do its own deletion-filtering.
58+
*
59+
* <p>Graph building logic is preserved here only for tests.
60+
*/
61+
public final class Lucene90HnswGraph extends KnnGraphValues {
62+
63+
// Value supplied to the constructor. addNode() passes maxConn + 1 when it creates the
// NeighborArray for each newly added node.
private final int maxConn;
64+
65+
// Each entry lists the top maxConn neighbors of a node. The nodes correspond to vectors added to
66+
// HnswBuilder, and the
67+
// node values are the ordinals of those vectors.
// The index of an entry in this list is the node's ordinal (see getNeighbors and addNode).
68+
private final List<NeighborArray> graph;
69+
70+
// KnnGraphValues iterator members: seek() points cur at the target node's neighbors and resets
// upto to -1; nextNeighbor() pre-increments upto to walk through cur.
71+
private int upto;
72+
private NeighborArray cur;
73+
74+
/**
 * Creates a graph that already contains one node: a single {@link NeighborArray} is added to the
 * node list here, so {@link #size()} is 1 immediately after construction.
 *
 * @param maxConn stored and later used by {@code addNode()} when creating further nodes
 */
Lucene90HnswGraph(int maxConn) {
75+
graph = new ArrayList<>();
76+
// Typically with diversity criteria we see nodes not fully occupied; average fanout seems to be
77+
// about 1/2 maxConn. There is some indexing time penalty for under-allocating, but saves RAM
// NOTE(review): this first node is created with max(32, maxConn / 4), whereas addNode() uses
// maxConn + 1 -- confirm the different sizing is intended.
78+
graph.add(new NeighborArray(Math.max(32, maxConn / 4)));
79+
this.maxConn = maxConn;
80+
}
81+
82+
/**
83+
* Searches for the nearest neighbors of a query vector.
84+
*
* <p>The search seeds a candidate queue with randomly drawn entry points, then repeatedly pops
* the best candidate and scores its not-yet-visited neighbors. The result queue is created with
* {@code numSeed} as its size argument and is trimmed down to at most {@code topK} entries
* before being returned.
*
85+
* @param query search query vector
86+
* @param topK the number of nodes to be returned
87+
* @param numSeed the size of the queue maintained while searching, and controls the number of
88+
* random entry points to sample
89+
* @param vectors vector values
* @param similarityFunction the function used to score {@code query} against stored vectors;
* its {@code reversed} flag selects the ordering of the result and candidate queues
90+
* @param graphValues the graph values. May represent the entire graph, or a level in a
91+
* hierarchical graph.
92+
* @param acceptOrds {@link Bits} that represents the allowed document ordinals to match, or
93+
* {@code null} if they are all allowed to match.
94+
* @param random a source of randomness, used for generating entry points to the graph
95+
* @return a priority queue holding the closest neighbors found
* @throws IOException presumably propagated from reading {@code vectors} or {@code graphValues}
* -- TODO confirm which calls can throw
96+
*/
97+
public static NeighborQueue search(
98+
float[] query,
99+
int topK,
100+
int numSeed,
101+
RandomAccessVectorValues vectors,
102+
VectorSimilarityFunction similarityFunction,
103+
KnnGraphValues graphValues,
104+
Bits acceptOrds,
105+
SplittableRandom random)
106+
throws IOException {
107+
int size = graphValues.size();
108+
109+
// MIN heap, holding the top results
110+
NeighborQueue results = new NeighborQueue(numSeed, similarityFunction.reversed);
111+
// MAX heap, from which to pull the candidate nodes
112+
NeighborQueue candidates = new NeighborQueue(numSeed, !similarityFunction.reversed);
113+
114+
// set of ordinals that have been visited by search on this layer, used to avoid backtracking
115+
SparseFixedBitSet visited = new SparseFixedBitSet(size);
116+
// get initial candidates at random
// At most min(numSeed, 2 * size) draws are made; a draw that hits an already-visited ordinal is
// skipped, so the number of distinct entry points can be smaller than the number of draws.
117+
int boundedNumSeed = Math.min(numSeed, 2 * size);
118+
for (int i = 0; i < boundedNumSeed; i++) {
119+
int entryPoint = random.nextInt(size);
120+
if (visited.getAndSet(entryPoint) == false) {
121+
// explore the topK starting points of some random numSeed probes
122+
float score = similarityFunction.compare(query, vectors.vectorValue(entryPoint));
// Every entry point becomes a candidate for traversal, but only accepted ordinals are
// recorded as results.
123+
candidates.add(entryPoint, score);
124+
if (acceptOrds == null || acceptOrds.get(entryPoint)) {
125+
results.add(entryPoint, score);
126+
}
127+
}
128+
}
129+
130+
// Set the bound to the worst current result and below reject any newly-generated candidates
131+
// failing
132+
// to exceed this bound
// NOTE(review): bound.check(score) appears to return true when the score fails to beat the
// bound -- confirm against BoundsChecker.
133+
BoundsChecker bound = BoundsChecker.create(similarityFunction.reversed);
134+
bound.set(results.topScore());
135+
while (candidates.size() > 0) {
136+
// get the best candidate (closest or best scoring)
137+
float topCandidateScore = candidates.topScore();
// Stop early only once at least topK results are held and the bound check fires for the best
// remaining candidate.
138+
if (results.size() >= topK) {
139+
if (bound.check(topCandidateScore)) {
140+
break;
141+
}
142+
}
143+
int topCandidateNode = candidates.pop();
// The level argument passed here is always 0.
144+
graphValues.seek(0, topCandidateNode);
145+
int friendOrd;
146+
while ((friendOrd = graphValues.nextNeighbor()) != NO_MORE_DOCS) {
147+
assert friendOrd < size : "friendOrd=" + friendOrd + "; size=" + size;
// getAndSet marks the neighbor as visited; neighbors seen before are skipped without scoring.
148+
if (visited.getAndSet(friendOrd)) {
149+
continue;
150+
}
151+
152+
float score = similarityFunction.compare(query, vectors.vectorValue(friendOrd));
// Keep the neighbor while fewer than numSeed results are held, or when the bound check does
// not reject its score.
153+
if (results.size() < numSeed || bound.check(score) == false) {
154+
candidates.add(friendOrd, score);
// Ordinals filtered out by acceptOrds are still traversed as candidates, but never enter
// the results or move the bound.
155+
if (acceptOrds == null || acceptOrds.get(friendOrd)) {
156+
results.insertWithOverflow(friendOrd, score);
157+
bound.set(results.topScore());
158+
}
159+
}
160+
}
161+
}
// Trim the result queue down to at most topK entries by popping from its top.
162+
while (results.size() > topK) {
163+
results.pop();
164+
}
// Record the approximate number of visited ordinals on the returned queue.
165+
results.setVisitedCount(visited.approximateCardinality());
166+
return results;
167+
}
168+
169+
/**
170+
* Returns the {@link NeighborArray} connected to the given node.
171+
*
172+
* @param node the node whose neighbors are returned
* @return the entry stored at index {@code node} in the graph's node list
173+
*/
174+
public NeighborArray getNeighbors(int node) {
175+
return graph.get(node);
176+
}
177+
178+
/** Returns the number of nodes in the graph, i.e. the length of the node list. */
@Override
179+
public int size() {
180+
return graph.size();
181+
}
182+
183+
/**
 * Appends a new node to the graph, creating its {@link NeighborArray} with {@code maxConn + 1}
 * (presumably the capacity -- confirm against NeighborArray).
 *
 * @return the ordinal of the new node, which is its index in the node list
 */
int addNode() {
184+
graph.add(new NeighborArray(maxConn + 1));
185+
return graph.size() - 1;
186+
}
187+
188+
/**
 * Positions the neighbor iterator on {@code targetNode}, so that the following calls to {@link
 * #nextNeighbor()} enumerate that node's neighbors from the start.
 *
 * @param level not used by this implementation
 * @param targetNode ordinal of the node whose neighbors will be iterated
 */
@Override
189+
public void seek(int level, int targetNode) {
190+
cur = getNeighbors(targetNode);
// Start one position before the first neighbor; nextNeighbor() pre-increments.
191+
upto = -1;
192+
}
193+
194+
/**
 * Advances the neighbor iterator and returns the next neighbor of the node last passed to {@link
 * #seek(int, int)}. {@code seek} must be called first; this method dereferences the neighbor
 * array that {@code seek} assigns.
 *
 * @return the next neighbor's ordinal, or {@code NO_MORE_DOCS} once all neighbors are consumed
 */
@Override
195+
public int nextNeighbor() {
196+
if (++upto < cur.size()) {
197+
return cur.node()[upto];
198+
}
199+
return NO_MORE_DOCS;
200+
}
201+
202+
/**
 * Not supported by this graph.
 *
 * @throws UnsupportedOperationException always
 */
@Override
203+
public int numLevels() {
204+
throw new UnsupportedOperationException();
205+
}
206+
207+
/**
 * Not supported by this graph.
 *
 * @throws UnsupportedOperationException always
 */
@Override
208+
public int entryNode() {
209+
throw new UnsupportedOperationException();
210+
}
211+
212+
/**
 * Not supported by this graph.
 *
 * @param level not used; the method throws before reading it
 * @throws UnsupportedOperationException always
 */
@Override
213+
public NodesIterator getNodesOnLevel(int level) {
214+
throw new UnsupportedOperationException();
215+
}
216+
}

0 commit comments

Comments
 (0)