Skip to content

Commit a5ae112

Browse files
committed
quantize: set zero dot product for centroid data vector
Previously, the dot product between a data unit vector and its quantized form was not set when the data vector was equal to the centroid. This could cause a problem when a RaBitQuantizedVectorSet is reused and the dot product memory is not already zero. Fix this bug by explicitly storing a zero dot product in that case, and update the code to scribble undefined memory in test builds so that similar bugs are easier to detect.

Epic: CRDB-42943

Release note: None
1 parent f4fae60 commit a5ae112

File tree

3 files changed

+45
-13
lines changed

3 files changed

+45
-13
lines changed

pkg/sql/vecindex/cspann/quantize/rabitq.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -557,9 +557,11 @@ func (q *RaBitQuantizer) quantizeHelper(
557557
// Store the inverted dot product, which will be used to make distance
558558
// estimates. The dot product is only zero in the case where the data vector
559559
// is equal to the centroid vector. That case is handled separately in
560-
// EstimatedDistances.
560+
// EstimateDistances.
561561
if dotProduct != 0 {
562562
dotProducts[i] = 1.0 / dotProduct
563+
} else {
564+
dotProducts[i] = 0
563565
}
564566
}
565567
}

pkg/sql/vecindex/cspann/quantize/rabitq_test.go

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -150,19 +150,29 @@ func TestRaBitQuantizerEdge(t *testing.T) {
150150

151151
t.Run("add centroid to set", func(t *testing.T) {
152152
quantizer := NewRaBitQuantizer(2, 42, vecpb.L2SquaredDistance)
153+
quantizedSet := quantizer.NewSet(4, []float32{3, 9}).(*RaBitQuantizedVectorSet)
153154
vectors := vector.MakeSetFromRawData([]float32{1, 5, 5, 13}, 2)
154-
quantizedSet := quantizer.Quantize(&workspace, vectors).(*RaBitQuantizedVectorSet)
155-
require.Equal(t, []float32{3, 9}, quantizedSet.Centroid)
155+
quantizer.QuantizeInSet(&workspace, quantizedSet, vectors)
156156

157157
// Add centroid to the set along with another vector.
158158
vectors = vector.MakeSetFromRawData([]float32{1, 5, 3, 9}, 2)
159159
quantizer.QuantizeInSet(&workspace, quantizedSet, vectors)
160+
require.Equal(t, float32(0), quantizedSet.QuantizedDotProducts[3],
161+
"dot product for centroid should be zero")
162+
163+
// Estimate distances from a query vector not in the set.
160164
distances := make([]float32, 4)
161165
errorBounds := make([]float32, 4)
162166
quantizer.EstimateDistances(
163167
&workspace, quantizedSet, vector.T{3, 2}, distances, errorBounds)
164168
require.Equal(t, []float32{22.33, 115.67, 22.33, 49}, testutils.RoundFloats(distances, 2))
165169
require.Equal(t, []float32{44.27, 44.27, 44.27, 0}, testutils.RoundFloats(errorBounds, 2))
170+
171+
// Estimate distances when the query vector is the centroid.
172+
quantizer.EstimateDistances(
173+
&workspace, quantizedSet, vector.T{3, 9}, distances, errorBounds)
174+
require.Equal(t, []float32{20, 20, 20, 0}, testutils.RoundFloats(distances, 2))
175+
require.Equal(t, []float32{0, 0, 0, 0}, testutils.RoundFloats(errorBounds, 2))
166176
})
167177

168178
t.Run("query vector is centroid", func(t *testing.T) {

pkg/sql/vecindex/cspann/quantize/rabitqpb.go

Lines changed: 30 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,11 @@ func (cs *RaBitQCodeSet) AddUndefined(count int) {
9898
cs.Data = slices.Grow(cs.Data, count*cs.Width)
9999
cs.Count += count
100100
cs.Data = cs.Data[:cs.Count*cs.Width]
101+
if buildutil.CrdbTestBuild {
102+
for i := len(cs.Data) - count*cs.Width; i < len(cs.Data); i++ {
103+
cs.Data[i] = 0xBADF00D
104+
}
105+
}
101106
}
102107

103108
// ReplaceWithLast removes the code at the given offset from the set, replacing
@@ -148,16 +153,7 @@ func (vs *RaBitQuantizedVectorSet) Clear(centroid vector.T) {
148153
if vs.Centroid == nil {
149154
panic(errors.New("Clear cannot be called on an uninitialized vector set"))
150155
}
151-
for i := range len(vs.CodeCounts) {
152-
vs.CodeCounts[i] = 0xBADF00D
153-
}
154-
for i := range len(vs.CentroidDistances) {
155-
vs.CentroidDistances[i] = math.Pi
156-
}
157-
for i := range len(vs.QuantizedDotProducts) {
158-
vs.QuantizedDotProducts[i] = math.Pi
159-
}
160-
// RaBitQCodeSet.Clear takes care of scribbling memory for vs.Codes.
156+
vs.scribble(0, len(vs.CodeCounts))
161157
}
162158

163159
// Recompute the centroid norm for Cosine and InnerProduct metrics, but only
@@ -193,4 +189,28 @@ func (vs *RaBitQuantizedVectorSet) AddUndefined(count int) {
193189
vs.CentroidDotProducts = slices.Grow(vs.CentroidDotProducts, count)
194190
vs.CentroidDotProducts = vs.CentroidDotProducts[:newCount]
195191
}
192+
if buildutil.CrdbTestBuild {
193+
vs.scribble(newCount-count, newCount)
194+
}
195+
}
196+
197+
// scribble writes garbage values to undefined vector set values. This is only
198+
// called in test builds to make detecting bugs easier.
199+
func (vs *RaBitQuantizedVectorSet) scribble(start, end int) {
200+
for i := start; i < end; i++ {
201+
vs.CodeCounts[i] = 0xBADF00D
202+
}
203+
for i := start; i < end; i++ {
204+
vs.CentroidDistances[i] = math.Pi
205+
}
206+
for i := start; i < end; i++ {
207+
vs.QuantizedDotProducts[i] = math.Pi
208+
}
209+
if vs.Metric != vecpb.L2SquaredDistance {
210+
for i := start; i < end; i++ {
211+
vs.CentroidDotProducts[i] = math.Pi
212+
}
213+
}
214+
// RaBitQCodeSet Clear and AddUndefined methods take care of scribbling
215+
// memory for vs.Codes.
196216
}

0 commit comments

Comments
 (0)