Skip to content

Commit f026464

Browse files
committed
cspann: use correct metric for assigning vectors during split
Previously, during a split, the BalancedKmeans struct was not initialized with the distance metric used by the index. This caused vectors to be assigned to partitions using a potentially incorrect metric, which can negatively impact accuracy.

Epic: CRDB-42943

Release note: None
1 parent a5ae112 commit f026464

File tree

3 files changed

+49
-14
lines changed

3 files changed

+49
-14
lines changed

pkg/sql/vecindex/cspann/fixup_split.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1023,7 +1023,11 @@ func (fw *fixupWorker) copyToSplitSubPartitions(
10231023
defer fw.workspace.FreeUint64s(tempAssignments)
10241024

10251025
// Assign vectors to the partition with the nearest centroid.
1026-
kmeans := BalancedKmeans{Workspace: &fw.workspace, Rand: fw.rng}
1026+
kmeans := BalancedKmeans{
1027+
Workspace: &fw.workspace,
1028+
Rand: fw.rng,
1029+
DistanceMetric: fw.index.quantizer.GetDistanceMetric(),
1030+
}
10271031
leftCount = kmeans.AssignPartitions(
10281032
vectors, leftMetadata.Centroid, rightMetadata.Centroid, tempAssignments)
10291033

pkg/sql/vecindex/cspann/testdata/search-embeddings.ddt

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,7 @@ vec420: 3781823
197197
new-index dataset=fashion-784d-1k.gob dataset-count=1000 distance-metric=Cosine min-partition-size=4 max-partition-size=16 quality-samples=8 beam-size=4 hide-tree
198198
----
199199
Created index with 1000 vectors with 784 dimensions.
200-
3 levels, 209 partitions.
200+
3 levels, 211 partitions.
201201
CV stats:
202202
level 2 - mean: 0.0000, stdev: 0.0000
203203
level 3 - mean: 0.0000, stdev: 0.0000
@@ -215,7 +215,7 @@ vec409: 0.1185
215215
vec144: 0.1197
216216
vec476: 0.124
217217
vec109: 0.1273
218-
1000 leaf vectors, 1108 vectors, 11 full vectors, 109 partitions
218+
1000 leaf vectors, 1109 vectors, 12 full vectors, 110 partitions
219219

220220
# Now use lower beam size.
221221
search max-results=10 use-dataset=999 beam-size=8
@@ -230,13 +230,13 @@ vec409: 0.1185
230230
vec144: 0.1197
231231
vec476: 0.124
232232
vec109: 0.1273
233-
84 leaf vectors, 135 vectors, 11 full vectors, 13 partitions
233+
91 leaf vectors, 134 vectors, 12 full vectors, 13 partitions
234234

235235
# InnerProduct.
236236
new-index dataset=fashion-784d-1k.gob dataset-count=1000 distance-metric=InnerProduct min-partition-size=4 max-partition-size=16 quality-samples=8 beam-size=4 hide-tree
237237
----
238238
Created index with 1000 vectors with 784 dimensions.
239-
3 levels, 239 partitions.
239+
3 levels, 245 partitions.
240240
CV stats:
241241
level 2 - mean: 0.0000, stdev: 0.0000
242242
level 3 - mean: 0.0000, stdev: 0.0000
@@ -257,22 +257,22 @@ vec312: -14063724
257257
vec197: -14040257
258258
vec476: -13816669
259259
vec311: -13589641
260-
1000 leaf vectors, 1123 vectors, 18 full vectors, 124 partitions
260+
1000 leaf vectors, 1125 vectors, 21 full vectors, 126 partitions
261261

262262
# Now use lower beam size.
263263
search max-results=10 use-dataset=999 beam-size=8
264264
----
265+
vec109: -14526173
265266
vec811: -14265605
266-
vec312: -14063724
267-
vec311: -13589641
268-
vec265: -13573769
267+
vec660: -13573067
269268
vec984: -13534513
270269
vec610: -13491291
271-
vec220: -13433810
270+
vec226: -13364679
271+
vec144: -13148124
272272
vec968: -13060514
273273
vec999: -12779612
274-
vec735: -12533078
275-
71 leaf vectors, 131 vectors, 12 full vectors, 13 partitions
274+
vec853: -12163027
275+
64 leaf vectors, 115 vectors, 10 full vectors, 13 partitions
276276

277277
# ----------------------------------------------------------------------
278278
# Load 950 1536-dimension image embeddings and search them using Cosine
@@ -322,13 +322,13 @@ CV stats:
322322

323323
recall topk=10 beam-size=4 samples=50
324324
----
325-
50.60% recall@10
325+
50.40% recall@10
326326
44 leaf vectors, 74 vectors, 18 full vectors, 7 partitions
327327

328328
recall topk=10 beam-size=8 samples=50
329329
----
330330
69.80% recall@10
331-
86 leaf vectors, 136 vectors, 21 full vectors, 13 partitions
331+
86 leaf vectors, 136 vectors, 22 full vectors, 13 partitions
332332

333333
recall topk=10 beam-size=16 samples=50
334334
----

pkg/sql/vecindex/cspann/testdata/split.ddt

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -865,3 +865,34 @@ force-split partition-key=3 parent-partition-key=2
865865
│ │
866866
│ └───• 5 (-9, 18)
867867
└───• 7 (5, 8)
868+
869+
# ----------------------------------------------------------------------
870+
# Ensure that the split operation groups vectors according to the
871+
# expected distance metric. This is a regression test for a bug where the
872+
# BalancedKmeans struct was always initialized with L2Squared. Test a
873+
# case where the expected grouping is different when InnerProduct is
874+
# used as the metric.
875+
# ----------------------------------------------------------------------
876+
load-index min-partition-size=1 max-partition-size=4 beam-size=2 distance-metric=InnerProduct
877+
• 1 (0, 0)
878+
879+
├───• vec1 (-1, 8)
880+
├───• vec2 (-1, 2)
881+
├───• vec3 (1, 10)
882+
└───• vec4 (1, 6)
883+
----
884+
Loaded 4 vectors.
885+
886+
force-split partition-key=1
887+
----
888+
• 1 (0, 0)
889+
890+
├───• 2 (-1, 5)
891+
│ │
892+
│ ├───• vec1 (-1, 8)
893+
│ └───• vec2 (-1, 2)
894+
895+
└───• 3 (1, 8)
896+
897+
├───• vec3 (1, 10)
898+
└───• vec4 (1, 6)

0 commit comments

Comments
 (0)