cspann: add BestCentroids vector index test

andy-kimball · andy-kimball · commit 56287cca5103 · 2025-07-04T08:37:55.000-06:00
Add a new "best-centroids" test that prints out the partitions with the closest centroids for a query vector. This is useful when gauging the quality of the index. Also, update the "recall" test to only sample from vectors that are not part of the index. Achieving high recall is more challenging when searching for such vectors. Epic: CRDB-42943 Release note: None
diff --git a/pkg/sql/vecindex/cspann/BUILD.bazel b/pkg/sql/vecindex/cspann/BUILD.bazel
@@ -89,6 +89,7 @@ go_test(
         "@com_github_cockroachdb_errors//:errors",
         "@com_github_guptarohit_asciigraph//:asciigraph",
         "@com_github_stretchr_testify//require",
+        "@org_golang_x_exp//slices",
         "@org_gonum_v1_gonum//floats/scalar",
         "@org_gonum_v1_gonum//stat",
     ],
diff --git a/pkg/sql/vecindex/cspann/index_test.go b/pkg/sql/vecindex/cspann/index_test.go
@@ -34,6 +34,7 @@ import (
 	"github.com/cockroachdb/datadriven"
 	"github.com/cockroachdb/errors"
 	"github.com/stretchr/testify/require"
+	"golang.org/x/exp/slices"
 )
 
 func TestIndex(t *testing.T) {
@@ -108,6 +109,9 @@ func TestIndex(t *testing.T) {
 			case "recall":
 				result = state.Recall(d)
 
+			case "best-centroids":
+				result = state.BestCentroids(d)
+
 			case "validate-tree":
 				result = state.ValidateTree(d)
 
@@ -503,7 +507,7 @@ func (s *testState) Recall(d *datadriven.TestData) string {
 		rng := rand.New(rand.NewSource(int64(seed)))
 		remaining := make([]int, s.Dataset.Count-len(data))
 		for i := range remaining {
-			remaining[i] = i
+			remaining[i] = len(data) + i
 		}
 		rng.Shuffle(len(remaining), func(i, j int) {
 			remaining[i], remaining[j] = remaining[j], remaining[i]
@@ -522,6 +526,7 @@ func (s *testState) Recall(d *datadriven.TestData) string {
 		for i := range samples {
 			// Calculate truth set for the vector.
 			queryVector := s.Dataset.At(samples[i])
+
 			truth := testutils.CalculateTruth(searchSet.MaxResults,
 				s.Quantizer.GetDistanceMetric(), queryVector, dataVectors, dataKeys)
 
@@ -554,6 +559,90 @@ func (s *testState) Recall(d *datadriven.TestData) string {
 	return buf.String()
 }
 
+// BestCentroids implements the "best-centroids" test command. It walks the
+// index tree from the root, estimates the distance from a query vector to every
+// entry stored in each second-level partition, and prints the closest entries,
+// one per line, in the form:
+//
+//	<partition key>: <estimated distance> ± <error bound> (exact=<distance>)
+//
+// Supported arguments:
+//   - use-dataset: selects the query vector (parsed by parseUseDataset); the
+//     vector is passed through the index's TransformVector before use.
+//   - topk: maximum number of partitions to print (default 10).
+//
+// NOTE(review): if use-dataset is omitted, the query is left as the all-zero
+// vector of the dataset's dimensionality; confirm whether that is intended.
+func (s *testState) BestCentroids(d *datadriven.TestData) string {
+	randomized := make(vector.T, s.Dataset.Dims)
+	topk := 10
+	for _, arg := range d.CmdArgs {
+		switch arg.Key {
+		case "use-dataset":
+			original := s.parseUseDataset(arg)
+			s.Index.TransformVector(original, randomized)
+
+		case "topk":
+			topk = s.parseInt(arg)
+		}
+	}
+
+	// distances, errorBounds and partitionKeys are parallel slices: entry i of
+	// each describes the same child partition.
+	var w workspace.T
+	var distances, errorBounds []float32
+	var partitionKeys []cspann.PartitionKey
+
+	var findCentroids func(partitionKey cspann.PartitionKey)
+	findCentroids = func(partitionKey cspann.PartitionKey) {
+		partition, err := s.MemStore.TryGetPartition(s.Ctx, s.TreeKey, partitionKey)
+		require.NoError(s.T, err)
+		count := partition.Count()
+
+		switch partition.Level() {
+		case cspann.LeafLevel:
+			// Nothing to do.
+
+		case cspann.SecondLevel:
+			// Extend both slices by count entries so that the quantizer can
+			// write its estimates directly into the newly added tail.
+			distances = slices.Grow(distances, count)
+			distances = distances[:len(distances)+count]
+			errorBounds = slices.Grow(errorBounds, count)
+			errorBounds = errorBounds[:len(errorBounds)+count]
+
+			partition.Quantizer().EstimateDistances(&w, partition.QuantizedSet(), randomized,
+				distances[len(distances)-count:],
+				errorBounds[len(errorBounds)-count:])
+
+			// Record the child keys in the same order as the estimates.
+			// NOTE(review): this assumes ChildKeys() has exactly count
+			// entries, ordered like the quantized set; confirm.
+			for _, key := range partition.ChildKeys() {
+				partitionKeys = append(partitionKeys, key.PartitionKey)
+			}
+
+		default:
+			// Descend to next level.
+			for _, key := range partition.ChildKeys() {
+				findCentroids(key.PartitionKey)
+			}
+		}
+	}
+
+	findCentroids(cspann.RootKey)
+
+	// Create offsets for argsort.
+	offsets := make([]int, len(partitionKeys))
+	for i := range offsets {
+		offsets[i] = i
+	}
+
+	// Sort indices by distance (argsort).
+	slices.SortFunc(offsets, func(a, b int) int {
+		if distances[a] < distances[b] {
+			return -1
+		} else if distances[a] > distances[b] {
+			return 1
+		}
+		return 0
+	})
+
+	// Measure the exact distance with the index's configured metric, as Recall
+	// does for its truth set, so that the "exact" column is directly comparable
+	// to the quantizer's estimate rather than always being L2-squared.
+	metric := s.Quantizer.GetDistanceMetric()
+
+	// Print top results.
+	var buf strings.Builder
+	for i := range min(topk, len(offsets)) {
+		offset := offsets[i]
+
+		partition, err := s.MemStore.TryGetPartition(s.Ctx, s.TreeKey, partitionKeys[offset])
+		require.NoError(s.T, err)
+		exact := vecpb.MeasureDistance(metric, randomized, partition.Centroid())
+
+		fmt.Fprintf(&buf, "%d: %.4f ± %.4f (exact=%.4f)\n",
+			partitionKeys[offset], distances[offset], errorBounds[offset], exact)
+	}
+
+	return buf.String()
+}
+
 func (s *testState) ValidateTree(d *datadriven.TestData) string {
 	vectorCount := 0
 	partitionKeys := []cspann.PartitionKey{cspann.RootKey}
diff --git a/pkg/sql/vecindex/cspann/testdata/search-embeddings.ddt b/pkg/sql/vecindex/cspann/testdata/search-embeddings.ddt
@@ -91,31 +91,65 @@ recall topk=20 use-dataset=2717 beam-size=8
 40.00% recall@20
 90 leaf vectors, 143 vectors, 42 full vectors, 13 partitions
 
+# Show the nearest partitions to the "easy" vector, ordered by estimated
+# distance to their centroids. Notice that there are several partitions that are
+# very near, and yet the "spread" between centroids is fairly large, which makes
+# finding results easier.
+best-centroids topk=10 use-dataset=8601
+----
+151: 0.1696 ± 0.0098 (exact=0.1569)
+113: 0.2114 ± 0.0091 (exact=0.2164)
+150: 0.2365 ± 0.0089 (exact=0.2380)
+155: 0.2836 ± 0.0091 (exact=0.2778)
+154: 0.2943 ± 0.0108 (exact=0.2954)
+68: 0.2953 ± 0.0146 (exact=0.3056)
+97: 0.2988 ± 0.0097 (exact=0.3037)
+147: 0.2994 ± 0.0156 (exact=0.2853)
+159: 0.3001 ± 0.0120 (exact=0.2995)
+139: 0.3274 ± 0.0133 (exact=0.3368)
+
+# Show the nearest partitions to the "hard" vector, ordered by estimated
+# distance to their centroids. Notice that the partitions are relatively far
+# away and are bunched together, with low "spread". This makes finding results
+# more difficult.
+best-centroids topk=10 use-dataset=2717
+----
+197: 0.5183 ± 0.0161 (exact=0.5179)
+166: 0.5361 ± 0.0223 (exact=0.5644)
+170: 0.5403 ± 0.0156 (exact=0.5453)
+30: 0.5524 ± 0.0197 (exact=0.5515)
+196: 0.5546 ± 0.0206 (exact=0.5621)
+187: 0.5646 ± 0.0171 (exact=0.5625)
+135: 0.5674 ± 0.0234 (exact=0.6034)
+177: 0.5708 ± 0.0254 (exact=0.5674)
+61: 0.5755 ± 0.0211 (exact=0.5581)
+183: 0.5777 ± 0.0159 (exact=0.5915)
+
 # Test recall at different beam sizes.
 recall topk=10 beam-size=2 samples=64
 ----
-34.22% recall@10
+29.84% recall@10
 21 leaf vectors, 42 vectors, 15 full vectors, 4 partitions
 
 recall topk=10 beam-size=4 samples=64
 ----
-50.31% recall@10
-42 leaf vectors, 73 vectors, 19 full vectors, 7 partitions
+47.97% recall@10
+42 leaf vectors, 74 vectors, 19 full vectors, 7 partitions
 
 recall topk=10 beam-size=8 samples=64
 ----
-73.75% recall@10
-84 leaf vectors, 137 vectors, 23 full vectors, 13 partitions
+69.06% recall@10
+85 leaf vectors, 138 vectors, 24 full vectors, 13 partitions
 
 recall topk=10 beam-size=16 samples=64
 ----
-87.81% recall@10
-168 leaf vectors, 262 vectors, 26 full vectors, 25 partitions
+87.66% recall@10
+168 leaf vectors, 263 vectors, 27 full vectors, 25 partitions
 
 recall topk=10 beam-size=32 samples=64
 ----
-97.50% recall@10
-335 leaf vectors, 441 vectors, 29 full vectors, 42 partitions
+95.62% recall@10
+336 leaf vectors, 442 vectors, 30 full vectors, 42 partitions
 
 # ----------------------------------------------------------------------
 # Compare orderings of same dataset with different distance metrics.
@@ -255,23 +289,23 @@ CV stats:
 
 recall topk=10 beam-size=4 samples=50
 ----
-62.40% recall@10
-42 leaf vectors, 72 vectors, 18 full vectors, 7 partitions
+61.20% recall@10
+42 leaf vectors, 72 vectors, 20 full vectors, 7 partitions
 
 recall topk=10 beam-size=8 samples=50
 ----
-83.40% recall@10
+79.80% recall@10
 83 leaf vectors, 133 vectors, 21 full vectors, 13 partitions
 
 recall topk=10 beam-size=16 samples=50
 ----
-92.60% recall@10
-166 leaf vectors, 257 vectors, 24 full vectors, 25 partitions
+91.00% recall@10
+165 leaf vectors, 256 vectors, 24 full vectors, 25 partitions
 
 recall topk=10 beam-size=32 samples=50
 ----
-98.20% recall@10
-329 leaf vectors, 431 vectors, 25 full vectors, 42 partitions
+97.20% recall@10
+329 leaf vectors, 431 vectors, 26 full vectors, 42 partitions
 
 # ----------------------------------------------------------------------
 # Load 950 768-dimension image embeddings and search them using
@@ -288,20 +322,20 @@ CV stats:
 
 recall topk=10 beam-size=4 samples=50
 ----
-55.80% recall@10
-44 leaf vectors, 74 vectors, 19 full vectors, 7 partitions
+48.60% recall@10
+44 leaf vectors, 76 vectors, 20 full vectors, 7 partitions
 
 recall topk=10 beam-size=8 samples=50
 ----
-74.40% recall@10
-88 leaf vectors, 143 vectors, 23 full vectors, 13 partitions
+69.00% recall@10
+88 leaf vectors, 144 vectors, 25 full vectors, 13 partitions
 
 recall topk=10 beam-size=16 samples=50
 ----
-89.00% recall@10
-172 leaf vectors, 271 vectors, 27 full vectors, 25 partitions
+85.00% recall@10
+173 leaf vectors, 272 vectors, 30 full vectors, 25 partitions
 
 recall topk=10 beam-size=32 samples=50
 ----
-97.60% recall@10
-344 leaf vectors, 443 vectors, 30 full vectors, 41 partitions
+95.20% recall@10
+342 leaf vectors, 441 vectors, 33 full vectors, 41 partitions
diff --git a/pkg/sql/vecindex/cspann/utils/BUILD.bazel b/pkg/sql/vecindex/cspann/utils/BUILD.bazel
@@ -22,5 +22,8 @@ go_test(
     name = "utils_test",
     srcs = ["slice_test.go"],
     embed = [":utils"],
-    deps = ["@com_github_stretchr_testify//require"],
+    deps = [
+        "//pkg/util/buildutil",
+        "@com_github_stretchr_testify//require",
+    ],
 )