Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
05029df
Avoid reconstructing HNSW graph during singleton merging
Jul 29, 2025
47c4d1c
Made changes in HNSW util classes to accommodate deletes for segment m…
Aug 9, 2025
42e8c0a
Restored codec files to original state
Aug 9, 2025
a6c5272
Fixed getNewOrdMapping function in ConcurrentHnswMerger for handling …
Aug 9, 2025
215d038
Merge branch 'main' into singleton
Oct 30, 2025
bd6f1ab
Fixed graph disconnected issue by adding support for reconnecting and …
Oct 31, 2025
df0f219
Removed and ignored some test cases as we are breaking graph building…
Oct 31, 2025
01666ff
Made some functions and members in HnswGraphBuilder protected and pac…
Oct 31, 2025
a792f4d
Disabled TestHnswBitVectorsFormat.testMergeStability test as the fail…
Oct 31, 2025
3f23ac9
Use HnswGraphBuilder.ml instead of inverseMaxConn
Oct 31, 2025
cb934af
Revert "Use HnswGraphBuilder.ml instead of inverseMaxConn"
Nov 4, 2025
a0f5169
No need of calculating score here
Nov 5, 2025
c23f662
Use a different factor for checking disconnected nodes
Nov 7, 2025
ff647c5
Set 85% as threshold for disconnected nodes, 40% for graph deletes an…
Nov 7, 2025
59ebbdc
Fix fixDisconnectedNodes function where node with no connection was b…
Nov 7, 2025
5a0a73d
tidy
Nov 7, 2025
f2d4f4b
Added check to repair graph only when it has deletes
Nov 11, 2025
6ff2eed
Merge branch 'main' into singleton
Nov 11, 2025
fd7aa18
Fixed vectorCount for calculating test, added test case
Nov 14, 2025
6b80092
Merge branch 'main' into singleton
Nov 14, 2025
f29f9e8
Reverted test override in TestHnswBitVectorsFormat
Nov 17, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,8 @@ Optimizations

# GITHUB#15303: Speed up NumericUtils#{add,subtract} by operating on integers instead of bytes. (Kaival Parikh)

* GITHUB#15003: Avoid reconstructing HNSW graph during merging (Pulkit Gupta)

Bug Fixes
---------------------
* GITHUB#14161: PointInSetQuery's constructor now throws IllegalArgumentException
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;

import java.io.IOException;
import java.util.Comparator;
import java.util.Arrays;
import org.apache.lucene.codecs.KnnVectorsReader;
import org.apache.lucene.codecs.hnsw.HnswGraphProvider;
import org.apache.lucene.index.FieldInfo;
Expand Down Expand Up @@ -57,14 +57,12 @@ protected HnswBuilder createBuilder(KnnVectorValues mergedVectorValues, int maxO
OnHeapHnswGraph graph;
BitSet initializedNodes = null;

if (graphReaders.size() == 0) {
if (largestGraphReader == null) {
graph = new OnHeapHnswGraph(M, maxOrd);
} else {
graphReaders.sort(Comparator.comparingInt(GraphReader::graphSize).reversed());
GraphReader initGraphReader = graphReaders.get(0);
KnnVectorsReader initReader = initGraphReader.reader();
MergeState.DocMap initDocMap = initGraphReader.initDocMap();
int initGraphSize = initGraphReader.graphSize();
KnnVectorsReader initReader = largestGraphReader.reader();
MergeState.DocMap initDocMap = largestGraphReader.initDocMap();
int initGraphSize = largestGraphReader.graphSize();
HnswGraph initializerGraph = ((HnswGraphProvider) initReader).getGraph(fieldInfo.name);

if (initializerGraph.size() == 0) {
Expand All @@ -79,7 +77,9 @@ protected HnswBuilder createBuilder(KnnVectorValues mergedVectorValues, int maxO
initGraphSize,
mergedVectorValues,
initializedNodes);
graph = InitializedHnswGraphBuilder.initGraph(initializerGraph, oldToNewOrdinalMap, maxOrd);
graph =
InitializedHnswGraphBuilder.initGraph(
initializerGraph, oldToNewOrdinalMap, maxOrd, beamWidth, scorerSupplier);
}
}
return new HnswConcurrentMergeBuilder(
Expand Down Expand Up @@ -117,6 +117,9 @@ private static int[] getNewOrdMapping(
docId != NO_MORE_DOCS;
docId = initializerIterator.nextDoc()) {
int newId = initDocMap.get(docId);
if (newId == -1) {
continue;
}
maxNewDocID = Math.max(newId, maxNewDocID);
assert newIdToOldOrdinal.containsKey(newId) == false;
newIdToOldOrdinal.put(newId, initializerIterator.index());
Expand All @@ -126,6 +129,7 @@ private static int[] getNewOrdMapping(
return new int[0];
}
final int[] oldToNewOrdinalMap = new int[initGraphSize];
Arrays.fill(oldToNewOrdinalMap, -1);
KnnVectorValues.DocIndexIterator mergedVectorIterator = mergedVectorValues.iterator();
for (int newDocId = mergedVectorIterator.nextDoc();
newDocId <= maxNewDocID;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,14 +64,14 @@ public class HnswGraphBuilder implements HnswBuilder {
@SuppressWarnings("NonFinalStaticField")
public static long randSeed = DEFAULT_RAND_SEED;

private final int M; // max number of connections on upper layers
protected final int M; // max number of connections on upper layers
private final double ml;

private final SplittableRandom random;
private final UpdateableRandomVectorScorer scorer;
private final HnswGraphSearcher graphSearcher;
protected final UpdateableRandomVectorScorer scorer;
protected final HnswGraphSearcher graphSearcher;
private final GraphBuilderKnnCollector entryCandidates; // for upper levels of graph search
private final GraphBuilderKnnCollector
protected final GraphBuilderKnnCollector
beamCandidates; // for levels of graph where we add the node
private final GraphBuilderKnnCollector beamCandidates0;

Expand Down Expand Up @@ -288,7 +288,7 @@ private void addGraphNodeInternal(int node, UpdateableRandomVectorScorer scorer,

// then do connections from bottom up
for (int i = 0; i < scratchPerLevel.length; i++) {
addDiverseNeighbors(i + lowestUnsetLevel, node, scratchPerLevel[i], scorer);
addDiverseNeighbors(i + lowestUnsetLevel, node, scratchPerLevel[i], scorer, false);
}
lowestUnsetLevel += scratchPerLevel.length;
assert lowestUnsetLevel == Math.min(nodeLevel, curMaxLevel) + 1;
Expand Down Expand Up @@ -344,17 +344,22 @@ private long printGraphBuildStatus(int node, long start, long t) {
return now;
}

private void addDiverseNeighbors(
int level, int node, NeighborArray candidates, UpdateableRandomVectorScorer scorer)
void addDiverseNeighbors(
int level,
int node,
NeighborArray candidates,
UpdateableRandomVectorScorer scorer,
boolean outOfOrderInsertion)
throws IOException {
/* For each of the beamWidth nearest candidates (going from best to worst), select it only if it
* is closer to target than it is to any of the already-selected neighbors (ie selected in this method,
* since the node is new and has no prior neighbors).
*/
NeighborArray neighbors = hnsw.getNeighbors(level, node);
assert neighbors.size() == 0; // new node
int maxConnOnLevel = level == 0 ? M * 2 : M;
boolean[] mask = selectAndLinkDiverse(neighbors, candidates, maxConnOnLevel, scorer);
boolean[] mask =
selectAndLinkDiverse(
node, neighbors, candidates, maxConnOnLevel, scorer, outOfOrderInsertion);

// Link the selected nodes to the new node, and the new node to the selected nodes (again
// applying diversity heuristic)
Expand Down Expand Up @@ -386,31 +391,40 @@ private void addDiverseNeighbors(
* are selected
*/
private boolean[] selectAndLinkDiverse(
int node,
NeighborArray neighbors,
NeighborArray candidates,
int maxConnOnLevel,
UpdateableRandomVectorScorer scorer)
UpdateableRandomVectorScorer scorer,
boolean outOfOrderInsertion)
throws IOException {
boolean[] mask = new boolean[candidates.size()];
// Select the best maxConnOnLevel neighbors of the new node, applying the diversity heuristic
for (int i = candidates.size() - 1; neighbors.size() < maxConnOnLevel && i >= 0; i--) {
// compare each neighbor (in distance order) against the closer neighbors selected so far,
// only adding it if it is closer to the target than to any of the other selected neighbors
int cNode = candidates.nodes()[i];
if (node == cNode) {
continue;
}
float cScore = candidates.getScores(i);
assert cNode <= hnsw.maxNodeId();
scorer.setScoringOrdinal(cNode);
if (diversityCheck(cScore, neighbors, scorer)) {
mask[i] = true;
// here we don't need to lock, because there's no incoming link so no others is able to
// discover this node such that no others will modify this neighbor array as well
neighbors.addInOrder(cNode, cScore);
if (outOfOrderInsertion) {
neighbors.addOutOfOrder(cNode, cScore);
} else {
neighbors.addInOrder(cNode, cScore);
}
}
}
return mask;
}

private static void popToScratch(GraphBuilderKnnCollector candidates, NeighborArray scratch) {
static void popToScratch(GraphBuilderKnnCollector candidates, NeighborArray scratch) {
scratch.clear();
int candidateCount = candidates.size();
// extract all the Neighbors from the queue into an array; these will now be
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,16 @@

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.codecs.KnnVectorsReader;
import org.apache.lucene.codecs.hnsw.HnswGraphProvider;
import org.apache.lucene.index.ByteVectorValues;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FloatVectorValues;
import org.apache.lucene.index.KnnVectorValues;
import org.apache.lucene.index.MergeState;
import org.apache.lucene.internal.hppc.IntIntHashMap;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BitSet;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.FixedBitSet;
Expand All @@ -48,8 +48,20 @@ public class IncrementalHnswGraphMerger implements HnswGraphMerger {
protected final int beamWidth;

protected List<GraphReader> graphReaders = new ArrayList<>();
protected GraphReader largestGraphReader;

private int numReaders = 0;

/**
* The maximum acceptable deletion percentage for a graph to be considered as the base graph.
* Graphs with deletion percentages above this threshold are not used for initialization as they
* may have degraded connectivity.
*
* <p>A value of 40 means that if more than 40% of the graph's original vectors have been deleted,
* the graph will not be selected as the base.
*/
private final int DELETE_PCT_THRESHOLD = 40;

/** Represents a vector reader that contains graph info. */
protected record GraphReader(
KnnVectorsReader reader, MergeState.DocMap initDocMap, int graphSize) {}
Expand All @@ -67,38 +79,43 @@ public IncrementalHnswGraphMerger(

/**
* Adds a reader to the graph merger if it meets the following criteria: 1. does not contain any
* deleted docs 2. is a HnswGraphProvider
* deleted vector 2. is a HnswGraphProvider
*/
@Override
public IncrementalHnswGraphMerger addReader(
KnnVectorsReader reader, MergeState.DocMap docMap, Bits liveDocs) throws IOException {
numReaders++;
if (hasDeletes(liveDocs) || !(reader instanceof HnswGraphProvider)) {
if (!(reader instanceof HnswGraphProvider)) {
return this;
}
HnswGraph graph = ((HnswGraphProvider) reader).getGraph(fieldInfo.name);
if (graph == null || graph.size() == 0) {
return this;
}

int candidateVectorCount = 0;
switch (fieldInfo.getVectorEncoding()) {
case BYTE -> {
ByteVectorValues byteVectorValues = reader.getByteVectorValues(fieldInfo.name);
if (byteVectorValues == null) {
return this;
}
candidateVectorCount = byteVectorValues.size();
}
case FLOAT32 -> {
FloatVectorValues vectorValues = reader.getFloatVectorValues(fieldInfo.name);
if (vectorValues == null) {
return this;
}
candidateVectorCount = vectorValues.size();
}
KnnVectorValues knnVectorValues =
switch (fieldInfo.getVectorEncoding()) {
case BYTE -> reader.getByteVectorValues(fieldInfo.name);
case FLOAT32 -> reader.getFloatVectorValues(fieldInfo.name);
};

int candidateVectorCount = countLiveVectors(liveDocs, knnVectorValues);
int graphSize = graph.size();

GraphReader graphReader = new GraphReader(reader, docMap, graphSize);

int deletePct = ((graphSize - candidateVectorCount) * 100) / graphSize;

if (deletePct <= DELETE_PCT_THRESHOLD
&& (largestGraphReader == null || candidateVectorCount > largestGraphReader.graphSize)) {
largestGraphReader = graphReader;
}
graphReaders.add(new GraphReader(reader, docMap, candidateVectorCount));

// if graph has no deletes
if (candidateVectorCount == graphSize) {
graphReaders.add(graphReader);
}

return this;
}

Expand All @@ -112,11 +129,15 @@ public IncrementalHnswGraphMerger addReader(
*/
protected HnswBuilder createBuilder(KnnVectorValues mergedVectorValues, int maxOrd)
throws IOException {
if (graphReaders.size() == 0) {
if (largestGraphReader == null) {
return HnswGraphBuilder.create(
scorerSupplier, M, beamWidth, HnswGraphBuilder.randSeed, maxOrd);
}
graphReaders.sort(Comparator.comparingInt(GraphReader::graphSize).reversed());
if (!graphReaders.contains(largestGraphReader)) {
graphReaders.addFirst(largestGraphReader);
} else {
graphReaders.sort(Comparator.comparingInt(GraphReader::graphSize).reversed());
}

final BitSet initializedNodes =
graphReaders.size() == numReaders ? null : new FixedBitSet(maxOrd);
Expand Down Expand Up @@ -163,6 +184,7 @@ protected final int[][] getNewOrdMapping(
newDocIdToOldOrdinals[i].put(newDocId, vectorsIter.index());
}
oldToNewOrdinalMap[i] = new int[graphReaders.get(i).graphSize];
Arrays.fill(oldToNewOrdinalMap[i], -1);
}

KnnVectorValues.DocIndexIterator mergedVectorIterator = mergedVectorValues.iterator();
Expand Down Expand Up @@ -192,16 +214,21 @@ public OnHeapHnswGraph merge(
return builder.build(maxOrd);
}

private static boolean hasDeletes(Bits liveDocs) {
private static int countLiveVectors(Bits liveDocs, KnnVectorValues knnVectorValues)
throws IOException {
if (liveDocs == null) {
return false;
return knnVectorValues.size();
}

for (int i = 0; i < liveDocs.length(); i++) {
if (!liveDocs.get(i)) {
return true;
int count = 0;
DocIdSetIterator docIdSetIterator = knnVectorValues.iterator();
for (int doc = docIdSetIterator.nextDoc();
doc != NO_MORE_DOCS;
doc = docIdSetIterator.nextDoc()) {
if (liveDocs.get(doc)) {
count++;
}
}
return false;
return count;
}
}
Loading