Merged

@@ -10,9 +10,18 @@
package org.elasticsearch.index.codec.vectors.cluster;

import org.apache.lucene.index.FloatVectorValues;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.search.KnnCollector;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.VectorUtil;
import org.apache.lucene.util.hnsw.HnswGraphBuilder;
import org.apache.lucene.util.hnsw.HnswGraphSearcher;
import org.apache.lucene.util.hnsw.IntToIntFunction;
import org.apache.lucene.util.hnsw.OnHeapHnswGraph;
import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier;
import org.apache.lucene.util.hnsw.UpdateableRandomVectorScorer;
import org.elasticsearch.index.codec.vectors.SampleReader;
import org.elasticsearch.simdvec.ESVectorUtil;

@@ -210,9 +219,92 @@ private static int getBestCentroid(float[][] centroids, float[] vector, float[]
return bestCentroidOffset;
}

-    private NeighborHood[] computeNeighborhoods(float[][] centers, int clustersPerNeighborhood) {
+    private NeighborHood[] computeNeighborhoods(float[][] centers, int clustersPerNeighborhood) throws IOException {
assert centers.length > clustersPerNeighborhood;
        // experiments show that below 15k centroids we are better off with brute force; beyond that, HNSW gives us a nice speedup
if (centers.length < 15_000) {
Contributor: I think we can optimise the graph to work better for lower scale, but this is good as a first threshold. That's for segments greater than 1M with 64 vectors per centroid.

Member: Reducing the number of connections could make this threshold smaller.

Contributor Author (@iverase), Sep 4, 2025: Agree. I didn't spend too much time on it because it seems pretty fast for low values (a few seconds), so I wonder if there is a need to optimize those cases.

Member: I think just picking something "good enough" is alright. It provides a nice improvement, and any optimizations we make won't be "format breaking" :)

return computeNeighborhoodsBruteForce(centers, clustersPerNeighborhood);
} else {
return computeNeighborhoodsGraph(centers, clustersPerNeighborhood);
}
}

static NeighborHood[] computeNeighborhoodsGraph(float[][] centers, int clustersPerNeighborhood) throws IOException {
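        // Scores centroids against each other with Euclidean similarity; the scoring
        // ordinal tracks the centroid currently being queried.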
final UpdateableRandomVectorScorer scorer = new UpdateableRandomVectorScorer() {
int scoringOrdinal;

@Override
public float score(int node) {
return VectorSimilarityFunction.EUCLIDEAN.compare(centers[scoringOrdinal], centers[node]);
}

@Override
public int maxOrd() {
return centers.length;
}

@Override
public void setScoringOrdinal(int node) {
scoringOrdinal = node;
}
};
final RandomVectorScorerSupplier supplier = new RandomVectorScorerSupplier() {
@Override
public UpdateableRandomVectorScorer scorer() {
return scorer;
}

@Override
public RandomVectorScorerSupplier copy() {
return this;
}
};
final OnHeapHnswGraph graph = HnswGraphBuilder.create(supplier, 16, 100, 42L).build(centers.length);
Contributor: I think it's worth spending a bit more time on optimising this. In my testing, M=8 had the best ratio of recall to visited percentage, so it might be beneficial to publish your macro benchmark.

Contributor Author: I refactored the code in 5034bca to publish the benchmark.

Member: I do think that 8 with a larger beamwidth is worth trying. (8, 150)
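
A minimal sketch of that (8, 150) variant, assuming the same supplier as in the diff; the parameters are the reviewers' suggestion, not what this change ships:

// Hypothetical build call with fewer connections per node (M = 8) but a wider
// construction beam (beamWidth = 150), keeping the same seed for reproducibility.
final OnHeapHnswGraph tunedGraph = HnswGraphBuilder.create(supplier, 8, 150, 42L).build(centers.length);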

final NeighborHood[] neighborhoods = new NeighborHood[centers.length];
final SingleBit singleBit = new SingleBit(centers.length);
for (int i = 0; i < centers.length; i++) {
scorer.setScoringOrdinal(i);
singleBit.indexSet = i;
final KnnCollector collector = HnswGraphSearcher.search(scorer, clustersPerNeighborhood, graph, singleBit, Integer.MAX_VALUE);
Contributor: Did you test multiple sizes? I guess that recall is important here, so we should aim for a recall of 1?

Contributor Author: We always use 128 for clustersPerNeighborhood. While ideally recall should be close to 1, the test does not show a loss of quality in the centroids.

Contributor Author: Do you mean we should oversample here, for example use 2 * clustersPerNeighborhood to make sure we always get the top clustersPerNeighborhood?

Member: > Do you mean we should oversample here, for example use 2 * clustersPerNeighborhood to make sure we always get the top clustersPerNeighborhood?

Generally, your approximate measure for HNSW is efSearch, which in this case would be an oversample. I am not sure 2x is required, but possibly more than just the number of nearest neighbors we care about.

Contributor: Do we really need 128 no matter what number of centroids we have? Reducing this value when we have a small number of centroids could make the graph strategy applicable earlier.

Member: I do think it should be scaled down to a lower value when there are fewer centroids. I do not know what that value would be. The number is coupled to the recursive cluster splits to help capture potentially mis-assigned vectors along the split edges.
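
A minimal sketch of that efSearch-style oversample, reusing the scorer, graph, and singleBit filter set up above; the 2x factor and the truncation are illustrative assumptions, not part of this change:

// Ask the graph for more candidates than we intend to keep, then truncate to the
// top clustersPerNeighborhood hits.
final int efSearch = 2 * clustersPerNeighborhood;
final KnnCollector oversampled = HnswGraphSearcher.search(scorer, efSearch, graph, singleBit, Integer.MAX_VALUE);
final ScoreDoc[] top = oversampled.topDocs().scoreDocs;
final int keep = Math.min(clustersPerNeighborhood, top.length);
final int[] nearest = new int[keep];
for (int j = 0; j < keep; j++) {
    nearest[j] = top[j].doc;
}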

final ScoreDoc[] scoreDocs = collector.topDocs().scoreDocs;
if (scoreDocs.length == 0) {
// no neighbors, skip
neighborhoods[i] = NeighborHood.EMPTY;
continue;
}
final int[] neighbors = new int[scoreDocs.length];
for (int j = 0; j < neighbors.length; j++) {
neighbors[j] = scoreDocs[j].doc;
assert neighbors[j] != i;
}
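            // Lucene's EUCLIDEAN score is 1 / (1 + squaredDistance), so 1 / score - 1
            // recovers the squared distance to the furthest accepted neighbor.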
final float minCompetitiveSimilarity = (1f / scoreDocs[neighbors.length - 1].score) - 1;
neighborhoods[i] = new NeighborHood(neighbors, minCompetitiveSimilarity);
}
return neighborhoods;
}

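    /**
     * A Bits view over all ordinals that rejects exactly one index, used as the
     * accept-ords filter so a centroid is never returned as its own neighbor.
     */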
private static class SingleBit implements Bits {

private final int length;
private int indexSet;

SingleBit(int length) {
this.length = length;
}

@Override
public boolean get(int index) {
return index != indexSet;
}

@Override
public int length() {
return length;
}
}

static NeighborHood[] computeNeighborhoodsBruteForce(float[][] centers, int clustersPerNeighborhood) {
int k = centers.length;
assert k > clustersPerNeighborhood;
NeighborQueue[] neighborQueues = new NeighborQueue[k];
for (int i = 0; i < k; i++) {
neighborQueues[i] = new NeighborQueue(clustersPerNeighborhood, true);
@@ -15,9 +15,12 @@

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.greaterThan;

public class KMeansLocalTests extends ESTestCase {

@@ -141,4 +144,47 @@ private static FloatVectorValues generateData(int nSamples, int nDims, int nClus
}
return FloatVectorValues.fromFloats(vectors, nDims);
}

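    // Verifies that the HNSW-based neighborhoods approximate the brute-force ones
    // with acceptable recall, and that distances agree when recall is perfect.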
public void testComputeNeighbours() throws IOException {
int numCentroids = randomIntBetween(100, 10000);
int dims = randomIntBetween(10, 200);
float[][] vectors = new float[numCentroids][dims];
for (int i = 0; i < numCentroids; i++) {
for (int j = 0; j < dims; j++) {
vectors[i][j] = randomFloat();
}
}
int clustersPerNeighbour = randomIntBetween(6, 32);
KMeansLocal.NeighborHood[] neighborHoodsGraph = KMeansLocal.computeNeighborhoodsGraph(vectors, clustersPerNeighbour);
KMeansLocal.NeighborHood[] neighborHoodsBruteForce = KMeansLocal.computeNeighborhoodsBruteForce(vectors, clustersPerNeighbour);
assertEquals(neighborHoodsGraph.length, neighborHoodsBruteForce.length);
for (int i = 0; i < neighborHoodsGraph.length; i++) {
assertEquals(neighborHoodsBruteForce[i].neighbors().length, neighborHoodsGraph[i].neighbors().length);
int matched = compareNN(i, neighborHoodsBruteForce[i].neighbors(), neighborHoodsGraph[i].neighbors());
double recall = (double) matched / neighborHoodsGraph[i].neighbors().length;
assertThat(recall, greaterThan(0.4));
if (recall == 1.0) {
// we cannot assert on array equality as there can be small differences due to numerical errors
assertEquals(neighborHoodsBruteForce[i].maxIntraDistance(), neighborHoodsGraph[i].maxIntraDistance(), 1e-5f);
}
}
}

private static int compareNN(int currentId, int[] expected, int[] results) {
int matched = 0;
Set<Integer> expectedSet = new HashSet<>();
Set<Integer> alreadySeen = new HashSet<>();
for (int i : expected) {
assertNotEquals(currentId, i);
assertTrue(expectedSet.add(i));
}
for (int i : results) {
assertNotEquals(currentId, i);
assertTrue(alreadySeen.add(i));
if (expectedSet.contains(i)) {
++matched;
}
}
return matched;
}
}