Merge latest commits from hnsw-3 (#423)

marianotepper · web-flow · commit a6b004d7e017 · 2025-04-09T09:07:01.000-07:00
See #402
diff --git a/README.md b/README.md
@@ -10,18 +10,18 @@ There are two broad categories of ANN index:
 
 Graph-based indexes tend to be simpler to implement and faster, but more importantly they can be constructed and updated incrementally.  This makes them a much better fit for a general-purpose index than partitioning approaches that only work on static datasets that are completely specified up front.  That is why all the major commercial vector indexes use graph approaches.
 
-JVector is a graph index that takes a hybrid merging the the DiskANN and HNSW family trees.
+JVector is a graph index that merges the DiskANN and HNSW family trees.
 JVector borrows the hierarchical structure from HNSW, and uses Vamana (the algorithm behind DiskANN) within each layer.
 
 
 ## JVector Architecture
 
-JVector is a graph-based index that builds on the HNSW anD DiskANN designs with composable extensions.
+JVector is a graph-based index that builds on the HNSW and DiskANN designs with composable extensions.
 
 JVector implements a multi-layer graph with nonblocking concurrency control, allowing construction to scale linearly with the number of cores:
 ![JVector scales linearly as thread count increases](https://github.com/jbellis/jvector/assets/42158/f0127bfc-6c45-48b9-96ea-95b2120da0d9)
 
-The upper layers of the hierarchy are represnted by an in-memory adjacency list per node. This allows for quick navigation with no IOs.
+The upper layers of the hierarchy are represented by an in-memory adjacency list per node. This allows for quick navigation with no IOs.
 The bottom layer of the graph is represented by an on-disk adjacency list per node. JVector uses additional data stored inline to support two-pass searches, with the first pass powered by lossily compressed representations of the vectors kept in memory, and the second by a more accurate representation read from disk.  The first pass can be performed with
 * Product quantization (PQ), optionally with [anisotropic weighting](https://arxiv.org/abs/1908.10396)
 * [Binary quantization](https://huggingface.co/blog/embedding-quantization) (BQ)
diff --git a/UPGRADING.md b/UPGRADING.md
@@ -5,7 +5,7 @@
   in each vector with high accuracy by first applying a nonlinear transformation that is individually fit to each
   vector. These nonlinearities are designed to be lightweight and have a negligible impact on distance computation
   performance.
-- Support for hierarchical graph indices. These new type of indices blends HNSW and DiskANN in a novel way. An
+- Support for hierarchical graph indices. This new type of index blends HNSW and DiskANN in a novel way. An
   HNSW-like hierarchy resides in memory for quickly seeding the search. This also reduces the need for caching the
   DiskANN graph near the entrypoint. The base layer of the hierarchy is a DiskANN-like index and inherits its
   properties. This hierarchical structure can be disabled, ending up with just the base DiskANN layer.  
@@ -19,6 +19,8 @@
 - GraphSearcher can be configured to run pruned searches using GraphSearcher.usePruning. When this is set to true,
   we do early termination of the search. In certain cases, this can accelerate the search at the potential cost of some
   accuracy. It is set to false by default.
+- The constructors of GraphIndexBuilder allow specifying different maximum out-degrees for the graphs in each layer.
+  However, this feature does not work with FusedADC in this version.
 
 ### API changes in 3.0.6
 
diff --git a/jvector-base/src/main/java/io/github/jbellis/jvector/graph/GraphIndex.java b/jvector-base/src/main/java/io/github/jbellis/jvector/graph/GraphIndex.java
@@ -96,7 +96,7 @@ default boolean containsNode(int nodeId) {
     void close() throws IOException;
 
     /**
-     * @return The maximum (coarser) level with that contains a vector in the graph.
+     * @return The maximum (coarsest) level that contains a vector in the graph.
      */
     int getMaxLevel();
 
diff --git a/jvector-base/src/main/java/io/github/jbellis/jvector/graph/GraphIndexBuilder.java b/jvector-base/src/main/java/io/github/jbellis/jvector/graph/GraphIndexBuilder.java
@@ -279,33 +279,35 @@ public static GraphIndexBuilder rescore(GraphIndexBuilder other, BuildScoreProvi
                 other.parallelExecutor);
 
         // Copy each node and its neighbors from the old graph to the new one
-        IntStream.range(0, other.graph.getIdUpperBound()).parallel().forEach(i -> {
-            // Find the highest layer this node exists in
-            int maxLayer = -1;
-            for (int lvl = 0; lvl < other.graph.layers.size(); lvl++) {
-                if (other.graph.getNeighbors(lvl, i) == null) {
-                    break;
+        other.parallelExecutor.submit(() -> {
+            IntStream.range(0, other.graph.getIdUpperBound()).parallel().forEach(i -> {
+                // Find the highest layer this node exists in
+                int maxLayer = -1;
+                for (int lvl = 0; lvl < other.graph.layers.size(); lvl++) {
+                    if (other.graph.getNeighbors(lvl, i) == null) {
+                        break;
+                    }
+                    maxLayer = lvl;
+                }
+                if (maxLayer < 0) {
+                    return;
                 }
-                maxLayer = lvl;
-            }
-            if (maxLayer < 0) {
-                return;
-            }
 
-            // Loop over 0..maxLayer, re-score neighbors for each layer
-            var sf = newProvider.searchProviderFor(i).scoreFunction();
-            for (int lvl = 0; lvl <= maxLayer; lvl++) {
-                var oldNeighbors = other.graph.getNeighbors(lvl, i);
-                // Copy edges, compute new scores
-                var newNeighbors = new NodeArray(oldNeighbors.size());
-                for (var it = oldNeighbors.iterator(); it.hasNext();) {
-                    int neighbor = it.nextInt();
-                    // since we're using a different score provider, use insertSorted instead of addInOrder
-                    newNeighbors.insertSorted(neighbor, sf.similarityTo(neighbor));
+                // Loop over 0..maxLayer, re-score neighbors for each layer
+                var sf = newProvider.searchProviderFor(i).scoreFunction();
+                for (int lvl = 0; lvl <= maxLayer; lvl++) {
+                    var oldNeighbors = other.graph.getNeighbors(lvl, i);
+                    // Copy edges, compute new scores
+                    var newNeighbors = new NodeArray(oldNeighbors.size());
+                    for (var it = oldNeighbors.iterator(); it.hasNext();) {
+                        int neighbor = it.nextInt();
+                        // since we're using a different score provider, use insertSorted instead of addInOrder
+                        newNeighbors.insertSorted(neighbor, sf.similarityTo(neighbor));
+                    }
+                    newBuilder.graph.addNode(lvl, i, newNeighbors);
                 }
-                newBuilder.graph.addNode(lvl, i, newNeighbors);
-            }
-        });
+            });
+        }).join();
 
         // Set the entry node
         newBuilder.graph.updateEntryNode(other.graph.entry());
@@ -375,15 +377,24 @@ private void improveConnections(int node) {
         var bits = new ExcludingBits(node);
         try (var gs = searchers.get()) {
             gs.initializeInternal(ssp, graph.entry(), bits);
+            var acceptedBits = Bits.intersectionOf(bits, gs.getView().liveNodes());
 
-            // Move downward from entry.level to 1
+            // Move downward from entry.level to 0
             for (int lvl = graph.entry().level; lvl >= 0; lvl--) {
-                gs.searchOneLayer(ssp, 1, 0.0f, lvl, Bits.intersectionOf(bits, gs.getView().liveNodes()));
+                // This additional call seems redundant given that we have already initialized an ssp above.
+                // However, there is a subtle interplay between the ssp of the search and the ssp used in insertDiverse.
+                // Do not remove this line.
+                ssp = scoreProvider.searchProviderFor(node);
+
                 if (graph.layers.get(lvl).get(node) != null) {
+                    gs.searchOneLayer(ssp, beamWidth, 0.0f, lvl, acceptedBits);
+
                     var candidates = new NodeArray(gs.approximateResults.size());
                     gs.approximateResults.foreach(candidates::insertSorted);
                     var newNeighbors = graph.layers.get(lvl).insertDiverse(node, candidates);
                     graph.layers.get(lvl).backlink(newNeighbors, node, neighborOverflow);
+                } else {
+                    gs.searchOneLayer(ssp, 1, 0.0f, lvl, acceptedBits);
                 }
                 gs.setEntryPointsFromPreviousLayer();
             }
@@ -530,7 +541,7 @@ public synchronized long removeDeletedNodes() {
         if (nRemoved == 0) {
             return 0;
         }
-        // make a list of remaining live nodes 
+        // make a list of remaining live nodes
         var liveNodes = new IntArrayList();
         for (int i = 0; i < graph.getIdUpperBound(); i++) {
             if (graph.containsNode(i) && !toDelete.get(i)) {
@@ -627,14 +638,17 @@ public synchronized long removeDeletedNodes() {
             graph.updateEntryNode(newEntry >= 0 ? new NodeAtLevel(newLevel, newEntry) : null);
         }
 
+        long memorySize = 0;
+
         // Remove the deleted nodes from the graph
         assert toDelete.cardinality() == nRemoved : "cardinality changed";
-        int nodeLayers = 0;
         for (int i = toDelete.nextSetBit(0); i != NO_MORE_DOCS; i = toDelete.nextSetBit(i + 1)) {
-            nodeLayers += graph.removeNode(i);
+            int nDeletions = graph.removeNode(i);
+            for (var iLayer = 0; iLayer < nDeletions; iLayer++) {
+                memorySize += graph.ramBytesUsedOneNode(iLayer);
+            }
         }
-        // TODO this is not correct since different layers can use more or less ram due to different degrees
-        return nodeLayers * graph.ramBytesUsedOneNode(0);
+        return memorySize;
     }
 
     private void updateNeighbors(int layer, int nodeId, NodeArray natural, NodeArray concurrent) {
@@ -703,10 +717,30 @@ public void load(RandomAccessReader in) throws IOException {
             throw new IllegalStateException("Cannot load into a non-empty graph");
         }
 
+        int maybeMagic = in.readInt();
+        int version; // This is not used in V4 but may be useful in the future, putting it as a placeholder.
+        if (maybeMagic != OnHeapGraphIndex.MAGIC) {
+            // JVector 3 format, no magic or version, starts straight off with the number of nodes
+            version = 3;
+            int size = maybeMagic;
+            loadV3(in, size);
+        } else {
+            version = in.readInt();
+            loadV4(in);
+        }
+    }
+
+    private void loadV4(RandomAccessReader in) throws IOException {
+        if (graph.size(0) != 0) {
+            throw new IllegalStateException("Cannot load into a non-empty graph");
+        }
+
         int layerCount = in.readInt();
         int entryNode = in.readInt();
         var layerDegrees = new ArrayList<Integer>(layerCount);
 
+        Map<Integer, Integer> nodeLevelMap = new HashMap<>();
+
         // Read layer info
         for (int level = 0; level < layerCount; level++) {
             int layerSize = in.readInt();
@@ -721,19 +755,25 @@ public void load(RandomAccessReader in) throws IOException {
                     ca.addInOrder(neighbor, sf.similarityTo(neighbor));
                 }
                 graph.addNode(level, nodeId, ca);
+                nodeLevelMap.put(nodeId, level);
             }
         }
 
+        for (var k : nodeLevelMap.keySet()) {
+            NodeAtLevel nal = new NodeAtLevel(nodeLevelMap.get(k), k);
+            graph.markComplete(nal);
+        }
+
         graph.setDegrees(layerDegrees);
         graph.updateEntryNode(new NodeAtLevel(graph.getMaxLevel(), entryNode));
     }
 
-    public void loadV3(RandomAccessReader in) throws IOException {
+
+    private void loadV3(RandomAccessReader in, int size) throws IOException {
         if (graph.size() != 0) {
             throw new IllegalStateException("Cannot load into a non-empty graph");
         }
 
-        int size = in.readInt();
         int entryNode = in.readInt();
         int maxDegree = in.readInt();
 
@@ -747,6 +787,7 @@ public void loadV3(RandomAccessReader in) throws IOException {
                 ca.addInOrder(neighbor, sf.similarityTo(neighbor));
             }
             graph.addNode(0, nodeId, ca);
+            graph.markComplete(new NodeAtLevel(0, nodeId));
         }
 
         graph.updateEntryNode(new NodeAtLevel(0, entryNode));
diff --git a/jvector-base/src/main/java/io/github/jbellis/jvector/graph/GraphSearcher.java b/jvector-base/src/main/java/io/github/jbellis/jvector/graph/GraphSearcher.java
@@ -99,7 +99,7 @@ public GraphIndex.View getView() {
     }
 
     /**
-     * Whe using pruning, we are using a heuristic to terminate the search earlier.
+     * When using pruning, we are using a heuristic to terminate the search earlier.
      * In certain cases, it can lead to speedups. This is set to false by default.
      * @param usage a boolean that determines whether we do early termination or not.
      */
@@ -402,10 +402,8 @@ SearchResult resume(int topK, int rerankK, float threshold, float rerankFloor) {
         rerankedResults.setMaxSize(topK);
 
         // add evicted results from the last call back to the candidates
-        var previouslyEvicted = evictedResults.size() > 0 ? new SparseBits() : Bits.NONE;
         evictedResults.foreach((node, score) -> {
             candidates.push(node, score);
-            ((SparseBits) previouslyEvicted).set(node);
         });
         evictedResults.clear();
 
diff --git a/jvector-base/src/main/java/io/github/jbellis/jvector/graph/OnHeapGraphIndex.java b/jvector-base/src/main/java/io/github/jbellis/jvector/graph/OnHeapGraphIndex.java
@@ -56,6 +56,9 @@
  * For searching, use a view obtained from {@link #getView()} which supports level–aware operations.
  */
 public class OnHeapGraphIndex implements GraphIndex {
+    // Used for saving and loading OnHeapGraphIndex
+    public static final int MAGIC = 0x75EC4012; // JVECTOR, with some imagination
+
     // The current entry node for searches
     private final AtomicReference<NodeAtLevel> entryPoint;
 
@@ -448,6 +451,9 @@ public void save(DataOutput out) {
         }
 
         try (var view = getView()) {
+            out.writeInt(OnHeapGraphIndex.MAGIC); // the magic number
+            out.writeInt(4); // The version
+
             // Write graph-level properties.
             out.writeInt(layers.size());
             assert view.entryNode().level == getMaxLevel();
diff --git a/jvector-base/src/main/java/io/github/jbellis/jvector/graph/disk/OnDiskGraphIndex.java b/jvector-base/src/main/java/io/github/jbellis/jvector/graph/disk/OnDiskGraphIndex.java
@@ -78,7 +78,7 @@ public class OnDiskGraphIndex implements GraphIndex, AutoCloseable, Accountable
     // offset of L0 adjacency data
     private final long neighborsOffset;
     /** For layers > 0, store adjacency fully in memory. */
-    private volatile AtomicReference<List<Int2ObjectHashMap<int[]>>> inMemoryNeighbors;
+    private final AtomicReference<List<Int2ObjectHashMap<int[]>>> inMemoryNeighbors;
 
     OnDiskGraphIndex(ReaderSupplier readerSupplier, Header header, long neighborsOffset)
     {
@@ -202,17 +202,17 @@ public NodesIterator getNodes(int level) {
         }
 
         try (var reader = readerSupplier.get()) {
-            int[] valid_nodes = new int[size(level)];
+            int[] validNodes = new int[size(level)];
             int upperBound = level == 0 ? getIdUpperBound() : size(level);
             int pos = 0;
             for (int node = 0; node < upperBound; node++) {
-                long node_offset = layerOffset + (node * thisLayerNodeSide);
-                reader.seek(node_offset);
+                long nodeOffset = layerOffset + (node * thisLayerNodeSide);
+                reader.seek(nodeOffset);
                 if (reader.readInt() != -1) {
-                    valid_nodes[pos++] = node;
+                    validNodes[pos++] = node;
                 }
             }
-            return new NodesIterator.ArrayNodesIterator(valid_nodes, size);
+            return new NodesIterator.ArrayNodesIterator(validNodes, size);
         } catch (IOException e) {
             throw new UncheckedIOException(e);
         }
diff --git a/jvector-base/src/main/java/io/github/jbellis/jvector/graph/disk/OnDiskGraphIndexWriter.java b/jvector-base/src/main/java/io/github/jbellis/jvector/graph/disk/OnDiskGraphIndexWriter.java
@@ -205,7 +205,7 @@ public synchronized void write(Map<FeatureId, IntFunction<Feature.State>> featur
                     out.seek(out.position() + feature.featureSize());
                 }
                 out.writeInt(0);
-                for (int n = 0; n < graph.maxDegree(); n++) {
+                for (int n = 0; n < graph.getDegree(0); n++) {
                     out.writeInt(-1);
                 }
                 continue;
diff --git a/jvector-base/src/main/java/io/github/jbellis/jvector/graph/disk/feature/FusedADC.java b/jvector-base/src/main/java/io/github/jbellis/jvector/graph/disk/feature/FusedADC.java
@@ -103,7 +103,7 @@ public void writeInline(DataOutput out, Feature.State state_) throws IOException
         var state = (FusedADC.State) state_;
         var pqv = state.pqVectors;
 
-        var neighbors = state.view.getNeighborsIterator(0, state.nodeId); // TODO
+        var neighbors = state.view.getNeighborsIterator(0, state.nodeId);
         int n = 0;
         compressedNeighbors.zero();
         while (neighbors.hasNext()) {
diff --git a/jvector-tests/src/test/java/io/github/jbellis/jvector/graph/GraphIndexBuilderTest.java b/jvector-tests/src/test/java/io/github/jbellis/jvector/graph/GraphIndexBuilderTest.java
diff --git a/jvector-tests/src/test/java/io/github/jbellis/jvector/graph/TestDeletions.java b/jvector-tests/src/test/java/io/github/jbellis/jvector/graph/TestDeletions.java
diff --git a/jvector-tests/src/test/java/io/github/jbellis/jvector/quantization/TestADCGraphIndex.java b/jvector-tests/src/test/java/io/github/jbellis/jvector/quantization/TestADCGraphIndex.java

Original file line number	Diff line number	Diff line change
`@@ -78,7 +78,7 @@ public class OnDiskGraphIndex implements GraphIndex, AutoCloseable, Accountable`
`78`	`78`	`// offset of L0 adjacency data`
`79`	`79`	`private final long neighborsOffset;`
`80`	`80`	`/** For layers > 0, store adjacency fully in memory. */`
`81`		`- private volatile AtomicReference<List<Int2ObjectHashMap<int[]>>> inMemoryNeighbors;`
	`81`	`+ private final AtomicReference<List<Int2ObjectHashMap<int[]>>> inMemoryNeighbors;`
`82`	`82`
`83`	`83`	`OnDiskGraphIndex(ReaderSupplier readerSupplier, Header header, long neighborsOffset)`
`84`	`84`	`{`
`@@ -202,17 +202,17 @@ public NodesIterator getNodes(int level) {`
`202`	`202`	`}`
`203`	`203`
`204`	`204`	`try (var reader = readerSupplier.get()) {`
`205`		`- int[] valid_nodes = new int[size(level)];`
	`205`	`+ int[] validNodes = new int[size(level)];`
`206`	`206`	`int upperBound = level == 0 ? getIdUpperBound() : size(level);`
`207`	`207`	`int pos = 0;`
`208`	`208`	`for (int node = 0; node < upperBound; node++) {`
`209`		`- long node_offset = layerOffset + (node * thisLayerNodeSide);`
`210`		`- reader.seek(node_offset);`
	`209`	`+ long nodeOffset = layerOffset + (node * thisLayerNodeSide);`
	`210`	`+ reader.seek(nodeOffset);`
`211`	`211`	`if (reader.readInt() != -1) {`
`212`		`- valid_nodes[pos++] = node;`
	`212`	`+ validNodes[pos++] = node;`
`213`	`213`	`}`
`214`	`214`	`}`
`215`		`- return new NodesIterator.ArrayNodesIterator(valid_nodes, size);`
	`215`	`+ return new NodesIterator.ArrayNodesIterator(validNodes, size);`
`216`	`216`	`} catch (IOException e) {`
`217`	`217`	`throw new UncheckedIOException(e);`
`218`	`218`	`}`
Original file line number	Diff line number	Diff line change
`@@ -205,7 +205,7 @@ public synchronized void write(Map<FeatureId, IntFunction<Feature.State>> featur`
`205`	`205`	`out.seek(out.position() + feature.featureSize());`
`206`	`206`	`}`
`207`	`207`	`out.writeInt(0);`
`208`		`- for (int n = 0; n < graph.maxDegree(); n++) {`
	`208`	`+ for (int n = 0; n < graph.getDegree(0); n++) {`
`209`	`209`	`out.writeInt(-1);`
`210`	`210`	`}`
`211`	`211`	`continue;`