
Commit 7d97799

craig[bot], Peter-Fayez95, andy-kimball, and miraradeva committed
149975: changefeedccl: emit warning when `resolved` or `min_checkpoint_frequency` is set too low r=asg0451 a=Peter-Fayez95

This change adds client-side notices to `CREATE CHANGEFEED` and `ALTER CHANGEFEED` statements when the `resolved` or `min_checkpoint_frequency` options are set below a recommended threshold (e.g., 500ms). These warnings aim to guide users toward more balanced configurations. Setting these options too low can significantly increase CPU usage due to more frequent checkpointing and resolved timestamp emissions, introducing performance trade-offs.

Epic: CRDB-52074

Fixes #149238

Release note (general change): A warning is now emitted when creating or altering a changefeed with `resolved` or `min_checkpoint_frequency` set below 500ms. This helps users understand the trade-off between message latency and cluster CPU usage.

150146: vecindex: fix Cosine/InnerProduct accuracy bugs r=drewkimball a=andy-kimball

#### cspann: use correct metric for assigning vectors during split

Previously, during a split, the BalancedKmeans struct was not initialized with the distance metric used by the index. This caused vectors to be assigned to partitions using a potentially incorrect metric, which can negatively impact accuracy.

#### quantize: set zero dot product for centroid data vector

Previously, the dot product between a data unit vector and its quantized form was not being set in the case where the data vector is equal to the centroid. This could cause an issue when a RaBitQuantizedVectorSet is reused and the dot product memory is not zero. Fix this buglet and update the code to scribble undefined memory in test builds.

#### quantize: recompute norm when centroid is updated

When RaBitQuantizedVectorSet.Clear was called with a new centroid, the norm was not being recomputed. This commit fixes that bug.

150147: roachtest: actually fail TPCC bench if reached max warehouses r=miraradeva a=miraradeva

In 1bfe55b, we attempted to fail the TPCC bench test run if the configured maximum warehouses were reached and the success criteria were met. The idea was that this would prompt us to increase the max warehouses. However, that commit failed only the specific line search run, not the full test run. This commit moves the max warehouses check out of the result handling and actually fatals the test.

Part of: #148235

Release note: None

Co-authored-by: Peter <[email protected]>
Co-authored-by: Andrew Kimball <[email protected]>
Co-authored-by: Mira Radeva <[email protected]>
4 parents: d7ce141 + 2d8fc98 + f026464 + 7c99d8f

File tree

12 files changed: +209 −52 lines

pkg/ccl/changefeedccl/changefeed_stmt.go

Lines changed: 14 additions & 0 deletions

@@ -875,6 +875,20 @@ func createChangefeedJobRecord(
 			"less frequently", resolved, resolvedStr, freqStr, freq))
 	}

+	const minRecommendedFrequency = 500 * time.Millisecond
+
+	if emit && resolvedOpt != nil && *resolvedOpt < minRecommendedFrequency {
+		p.BufferClientNotice(ctx, pgnotice.Newf(
+			"the 'resolved' timestamp interval (%s) is very low; consider increasing it to at least %s",
+			resolvedOpt, minRecommendedFrequency))
+	}
+
+	if freqOpt != nil && *freqOpt < minRecommendedFrequency {
+		p.BufferClientNotice(ctx, pgnotice.Newf(
+			"the 'min_checkpoint_frequency' timestamp interval (%s) is very low; consider increasing it to at least %s",
+			freqOpt, minRecommendedFrequency))
+	}
+
 	ptsExpiration, err := opts.GetPTSExpiration()
 	if err != nil {
 		return nil, err
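
For context, the new check is just a threshold comparison on an optional duration, surfaced as a notice rather than an error so the statement still succeeds. A minimal self-contained sketch of the same pattern, where the notify callback stands in for the planner's BufferClientNotice (an illustration, not the real API surface):

package main

import (
	"fmt"
	"time"
)

const minRecommendedFrequency = 500 * time.Millisecond

// warnIfTooLow mirrors the shape of the check above: a nil option means the
// user did not set it, and only explicitly set values below the threshold
// produce a notice.
func warnIfTooLow(name string, opt *time.Duration, notify func(string)) {
	if opt != nil && *opt < minRecommendedFrequency {
		notify(fmt.Sprintf(
			"the '%s' timestamp interval (%s) is very low; consider increasing it to at least %s",
			name, *opt, minRecommendedFrequency))
	}
}

func main() {
	resolved := 200 * time.Millisecond
	warnIfTooLow("resolved", &resolved, func(msg string) { fmt.Println("NOTICE:", msg) })
	// Prints: NOTICE: the 'resolved' timestamp interval (200ms) is very low;
	// consider increasing it to at least 500ms
}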

pkg/ccl/changefeedccl/changefeed_test.go

Lines changed: 63 additions & 3 deletions

@@ -5074,6 +5074,10 @@ func TestChangefeedResolvedNotice(t *testing.T) {
 	defer cleanup()
 	s := cluster.Server(1)

+	// Set the default min_checkpoint_frequency to 30 seconds for this test
+	restoreDefault := changefeedbase.TestingSetDefaultMinCheckpointFrequency(30 * time.Second)
+	defer restoreDefault()
+
 	pgURL, cleanup := pgurlutils.PGUrl(t, s.SQLAddr(), t.Name(), url.User(username.RootUser))
 	defer cleanup()
 	pgBase, err := pq.NewConnector(pgURL.String())
@@ -5103,10 +5107,9 @@
 	t.Run("resolved<min_checkpoint_frequency default", func(t *testing.T) {
 		actual = "(no notice)"
 		f := makeKafkaFeedFactory(t, s, dbWithHandler)
-		testFeed := feed(t, f, `CREATE CHANGEFEED FOR ☃ INTO 'kafka://does.not.matter/' WITH resolved='20ms'`)
+		testFeed := feed(t, f, `CREATE CHANGEFEED FOR ☃ INTO 'kafka://does.not.matter/' WITH resolved='5s'`)
 		defer closeFeed(t, testFeed)
-		// Note: default min_checkpoint_frequency is set to 100ms in startTestCluster.
-		require.Equal(t, `resolved (20ms) messages will not be emitted more frequently than the default min_checkpoint_frequency (100ms), but may be emitted less frequently`, actual)
+		require.Equal(t, `resolved (5s) messages will not be emitted more frequently than the default min_checkpoint_frequency (30s), but may be emitted less frequently`, actual)
 	})
 	t.Run("resolved=min_checkpoint_frequency", func(t *testing.T) {
 		actual = "(no notice)"
@@ -5131,6 +5134,63 @@
 	})
 }

+func TestChangefeedLowFrequencyNotices(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	defer log.Scope(t).Close(t)
+
+	cluster, _, cleanup := startTestCluster(t)
+	defer cleanup()
+	s := cluster.Server(1)
+
+	pgURL, cleanup := pgurlutils.PGUrl(t, s.SQLAddr(), t.Name(), url.User(username.RootUser))
+	defer cleanup()
+	pgBase, err := pq.NewConnector(pgURL.String())
+	if err != nil {
+		t.Fatal(err)
+	}
+	var actual string
+	connector := pq.ConnectorWithNoticeHandler(pgBase, func(n *pq.Error) {
+		actual = n.Message
+	})
+
+	dbWithHandler := gosql.OpenDB(connector)
+	defer dbWithHandler.Close()
+
+	sqlDB := sqlutils.MakeSQLRunner(dbWithHandler)
+
+	sqlDB.Exec(t, `CREATE TABLE ☃ (i INT PRIMARY KEY)`)
+	sqlDB.Exec(t, `INSERT INTO ☃ VALUES (0)`)
+
+	t.Run("no options specified", func(t *testing.T) {
+		actual = "(no notice)"
+		f := makeKafkaFeedFactory(t, s, dbWithHandler)
+		testFeed := feed(t, f, `CREATE CHANGEFEED FOR ☃ INTO 'kafka://does.not.matter/'`)
+		defer closeFeed(t, testFeed)
+		require.Equal(t, `changefeed will emit to topic _u2603_`, actual)
+	})
+	t.Run("normal resolved and min_checkpoint_frequency", func(t *testing.T) {
+		actual = "(no notice)"
+		f := makeKafkaFeedFactory(t, s, dbWithHandler)
+		testFeed := feed(t, f, `CREATE CHANGEFEED FOR ☃ INTO 'kafka://does.not.matter/' WITH resolved='10s', min_checkpoint_frequency='10s'`)
+		defer closeFeed(t, testFeed)
+		require.Equal(t, `changefeed will emit to topic _u2603_`, actual)
+	})
+	t.Run("low resolved timestamp", func(t *testing.T) {
+		actual = "(no notice)"
+		f := makeKafkaFeedFactory(t, s, dbWithHandler)
+		testFeed := feed(t, f, `CREATE CHANGEFEED FOR ☃ INTO 'kafka://does.not.matter/' WITH resolved='200ms'`)
+		defer closeFeed(t, testFeed)
+		require.Equal(t, `the 'resolved' timestamp interval (200ms) is very low; consider increasing it to at least 500ms`, actual)
+	})
+	t.Run("low min_checkpoint_frequency timestamp", func(t *testing.T) {
+		actual = "(no notice)"
+		f := makeKafkaFeedFactory(t, s, dbWithHandler)
+		testFeed := feed(t, f, `CREATE CHANGEFEED FOR ☃ INTO 'kafka://does.not.matter/' WITH min_checkpoint_frequency='200ms'`)
+		defer closeFeed(t, testFeed)
+		require.Equal(t, `the 'min_checkpoint_frequency' timestamp interval (200ms) is very low; consider increasing it to at least 500ms`, actual)
+	})
+}
+
 func TestChangefeedOutputTopics(t *testing.T) {
 	defer leaktest.AfterTest(t)()
 	defer log.Scope(t).Close(t)
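
A note on the test plumbing: the notice handler wiring comes from lib/pq, which lets a connector-level callback observe every notice the server sends. A condensed sketch of how client code could watch for the new warning outside of a test (the connection URL and table are placeholders):

package main

import (
	"database/sql"
	"fmt"
	"log"

	"github.com/lib/pq"
)

func main() {
	base, err := pq.NewConnector("postgresql://root@localhost:26257/defaultdb?sslmode=disable")
	if err != nil {
		log.Fatal(err)
	}
	// Every NOTICE from the server is delivered to this callback.
	connector := pq.ConnectorWithNoticeHandler(base, func(n *pq.Error) {
		fmt.Println("NOTICE:", n.Message)
	})
	db := sql.OpenDB(connector)
	defer db.Close()

	// With resolved='200ms', the low-frequency warning added above should
	// arrive through the handler.
	if _, err := db.Exec(`CREATE CHANGEFEED FOR t INTO 'kafka://host:9092' WITH resolved='200ms'`); err != nil {
		log.Fatal(err)
	}
}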
pkg/cmd/roachtest/tests/tpcc.go

Lines changed: 17 additions & 2 deletions

@@ -2250,7 +2250,22 @@ func runTPCCBench(ctx context.Context, t test.Test, c cluster.Cluster, b tpccBen
 			results = append(results, partial)
 		}
 		res = tpcc.MergeResults(results...)
-		failErr = res.FailureError(b.LoadWarehouses(c.Cloud()))
+		failErr = res.FailureError()
+		// If the active warehouses have reached the load warehouses, fail the test;
+		// it needs to be updated to allow for more warehouses. Note that the line
+		// search assumes that the test fails at the number of load warehouses, so it
+		// never attempts to reach it exactly. Therefore, active warehouses can be at
+		// most LoadWarehouses-1.
+		if res.ActiveWarehouses >= b.LoadWarehouses(c.Cloud())-1 {
+			err = errors.CombineErrors(
+				failErr,
+				errors.Errorf(
+					"the number of active warehouses (%d) reached the maximum number of "+
+						"warehouses; consider updating LoadWarehouses and EstimatedMax", res.ActiveWarehouses,
+				),
+			)
+			t.Fatal(err)
+		}
 	}

 	// Print the result.
@@ -2681,7 +2696,7 @@ func runTPCCPublished(
 			results = append(results, partial)
 		}
 		res = tpcc.MergeResults(results...)
-		failErr = res.FailureError(opts.LoadWarehousesGCE)
+		failErr = res.FailureError()
 	}

 	// Print result for current iteration
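
One detail worth calling out: errors.CombineErrors from github.com/cockroachdb/errors returns its second argument when the first is nil, so the test fatals even when the line search otherwise met its success criteria and failErr is nil. A small standalone sketch of that behavior (both messages are placeholders):

package main

import (
	"fmt"

	"github.com/cockroachdb/errors"
)

func main() {
	ceiling := errors.New("active warehouses reached the maximum number of warehouses")

	// failErr == nil: CombineErrors returns the ceiling error unchanged, so
	// t.Fatal still fires.
	fmt.Println(errors.CombineErrors(nil, ceiling))

	// failErr != nil: the ceiling error is attached as a secondary error;
	// its details are included in the verbose %+v rendering.
	failErr := errors.New("efficiency below threshold")
	fmt.Printf("%+v\n", errors.CombineErrors(failErr, ceiling))
}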

pkg/sql/vecindex/cspann/fixup_split.go

Lines changed: 5 additions & 1 deletion

@@ -1023,7 +1023,11 @@ func (fw *fixupWorker) copyToSplitSubPartitions(
 	defer fw.workspace.FreeUint64s(tempAssignments)

 	// Assign vectors to the partition with the nearest centroid.
-	kmeans := BalancedKmeans{Workspace: &fw.workspace, Rand: fw.rng}
+	kmeans := BalancedKmeans{
+		Workspace:      &fw.workspace,
+		Rand:           fw.rng,
+		DistanceMetric: fw.index.quantizer.GetDistanceMetric(),
+	}
 	leftCount = kmeans.AssignPartitions(
 		vectors, leftMetadata.Centroid, rightMetadata.Centroid, tempAssignments)
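
Why the missing field mattered: nearest-centroid assignment can genuinely disagree between metrics, so quantizing with one metric while assigning with another (presumably the struct's zero-value default before this fix) skews which partition a vector lands in. A self-contained illustration with made-up numbers, using distance functions written to match the usual L2-squared and negated-inner-product conventions (smaller means closer in both):

package main

import "fmt"

// l2sq returns the squared Euclidean distance between two vectors.
func l2sq(a, b []float32) float32 {
	var d float32
	for i := range a {
		diff := a[i] - b[i]
		d += diff * diff
	}
	return d
}

// innerProductDist returns the negated dot product, the usual "distance"
// form of the InnerProduct metric.
func innerProductDist(a, b []float32) float32 {
	var dot float32
	for i := range a {
		dot += a[i] * b[i]
	}
	return -dot
}

func main() {
	v := []float32{1, 0.9}
	left := []float32{3, 0}
	right := []float32{0, 1}

	// L2Squared: left=4.81, right=1.01 -> v is assigned to the right partition.
	fmt.Println("L2Squared:   ", l2sq(v, left), l2sq(v, right))

	// InnerProduct: left=-3, right=-0.9 -> v is assigned to the left partition.
	fmt.Println("InnerProduct:", innerProductDist(v, left), innerProductDist(v, right))
}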
pkg/sql/vecindex/cspann/quantize/rabitq.go

Lines changed: 3 additions & 1 deletion

@@ -557,9 +557,11 @@ func (q *RaBitQuantizer) quantizeHelper(
 		// Store the inverted dot product, which will be used to make distance
 		// estimates. The dot product is only zero in the case where the data vector
 		// is equal to the centroid vector. That case is handled separately in
-		// EstimatedDistances.
+		// EstimateDistances.
 		if dotProduct != 0 {
 			dotProducts[i] = 1.0 / dotProduct
+		} else {
+			dotProducts[i] = 0
 		}
 	}
 }
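
The else branch looks redundant until you remember that these sets reuse slice memory: once AddUndefined hands back recycled slots, a zero dot product must be written explicitly or the slot silently keeps whatever the previous occupant left behind. A minimal sketch of the failure mode, independent of the real quantizer:

package main

import "fmt"

// storeInverted writes 1/dot into out[i], treating dot == 0 (data vector
// equal to the centroid) as a case that must be explicitly zeroed, because
// out may be reused memory holding a stale value.
func storeInverted(out []float32, i int, dot float32) {
	if dot != 0 {
		out[i] = 1.0 / dot
	} else {
		out[i] = 0 // without this, a reused slot keeps its old value
	}
}

func main() {
	// Simulate slice reuse: shrink, then re-grow without clearing.
	dots := []float32{0.5, 0.25}
	dots = dots[:0]
	dots = dots[:2] // stale values 0.5 and 0.25 are still present

	storeInverted(dots, 0, 2.0) // overwritten either way: 1/2 = 0.5
	storeInverted(dots, 1, 0)   // the else branch prevents a stale 0.25
	fmt.Println(dots)           // [0.5 0]
}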

pkg/sql/vecindex/cspann/quantize/rabitq_test.go

Lines changed: 12 additions & 2 deletions

@@ -150,19 +150,29 @@ func TestRaBitQuantizerEdge(t *testing.T) {

 	t.Run("add centroid to set", func(t *testing.T) {
 		quantizer := NewRaBitQuantizer(2, 42, vecpb.L2SquaredDistance)
+		quantizedSet := quantizer.NewSet(4, []float32{3, 9}).(*RaBitQuantizedVectorSet)
 		vectors := vector.MakeSetFromRawData([]float32{1, 5, 5, 13}, 2)
-		quantizedSet := quantizer.Quantize(&workspace, vectors).(*RaBitQuantizedVectorSet)
-		require.Equal(t, []float32{3, 9}, quantizedSet.Centroid)
+		quantizer.QuantizeInSet(&workspace, quantizedSet, vectors)

 		// Add centroid to the set along with another vector.
 		vectors = vector.MakeSetFromRawData([]float32{1, 5, 3, 9}, 2)
 		quantizer.QuantizeInSet(&workspace, quantizedSet, vectors)
+		require.Equal(t, float32(0), quantizedSet.QuantizedDotProducts[3],
+			"dot product for centroid should be zero")
+
+		// Estimate distances from a query vector not in the set.
 		distances := make([]float32, 4)
 		errorBounds := make([]float32, 4)
 		quantizer.EstimateDistances(
 			&workspace, quantizedSet, vector.T{3, 2}, distances, errorBounds)
 		require.Equal(t, []float32{22.33, 115.67, 22.33, 49}, testutils.RoundFloats(distances, 2))
 		require.Equal(t, []float32{44.27, 44.27, 44.27, 0}, testutils.RoundFloats(errorBounds, 2))
+
+		// Estimate distances when the query vector is the centroid.
+		quantizer.EstimateDistances(
+			&workspace, quantizedSet, vector.T{3, 9}, distances, errorBounds)
+		require.Equal(t, []float32{20, 20, 20, 0}, testutils.RoundFloats(distances, 2))
+		require.Equal(t, []float32{0, 0, 0, 0}, testutils.RoundFloats(errorBounds, 2))
 	})

 	t.Run("query vector is centroid", func(t *testing.T) {

pkg/sql/vecindex/cspann/quantize/rabitqpb.go

Lines changed: 39 additions & 13 deletions

@@ -98,6 +98,11 @@ func (cs *RaBitQCodeSet) AddUndefined(count int) {
 	cs.Data = slices.Grow(cs.Data, count*cs.Width)
 	cs.Count += count
 	cs.Data = cs.Data[:cs.Count*cs.Width]
+	if buildutil.CrdbTestBuild {
+		for i := len(cs.Data) - count*cs.Width; i < len(cs.Data); i++ {
+			cs.Data[i] = 0xBADF00D
+		}
+	}
 }

 // ReplaceWithLast removes the code at the given offset from the set, replacing
@@ -145,16 +150,18 @@ func (vs *RaBitQuantizedVectorSet) Clone() QuantizedVectorSet {
 // Clear implements the QuantizedVectorSet interface
 func (vs *RaBitQuantizedVectorSet) Clear(centroid vector.T) {
 	if buildutil.CrdbTestBuild {
-		for i := range len(vs.CodeCounts) {
-			vs.CodeCounts[i] = 0xBADF00D
+		if vs.Centroid == nil {
+			panic(errors.New("Clear cannot be called on an uninitialized vector set"))
 		}
-		for i := range len(vs.CentroidDistances) {
-			vs.CentroidDistances[i] = math.Pi
-		}
-		for i := range len(vs.QuantizedDotProducts) {
-			vs.QuantizedDotProducts[i] = math.Pi
+		vs.scribble(0, len(vs.CodeCounts))
+	}
+
+	// Recompute the centroid norm for Cosine and InnerProduct metrics, but only
+	// if a new centroid is provided.
+	if vs.Metric != vecpb.L2SquaredDistance {
+		if &vs.Centroid[0] != &centroid[0] {
+			vs.CentroidNorm = num32.Norm(centroid)
 		}
-		// RaBitQCodeSet.Clear takes care of scribbling memory for vs.Codes.
 	}

 	// vs.Centroid is immutable, so do not try to reuse its memory.
@@ -164,11 +171,6 @@ func (vs *RaBitQuantizedVectorSet) Clear(centroid vector.T) {
 	vs.CentroidDistances = vs.CentroidDistances[:0]
 	vs.QuantizedDotProducts = vs.QuantizedDotProducts[:0]
 	vs.CentroidDotProducts = vs.CentroidDotProducts[:0]
-	if vs.Metric != vecpb.L2SquaredDistance {
-		if &vs.Centroid[0] != &centroid[0] {
-			vs.CentroidNorm = num32.Norm(centroid)
-		}
-	}
 }

 // AddUndefined adds the given number of quantized vectors to this set. The new
@@ -187,4 +189,28 @@ func (vs *RaBitQuantizedVectorSet) AddUndefined(count int) {
 		vs.CentroidDotProducts = slices.Grow(vs.CentroidDotProducts, count)
 		vs.CentroidDotProducts = vs.CentroidDotProducts[:newCount]
 	}
+	if buildutil.CrdbTestBuild {
+		vs.scribble(newCount-count, newCount)
+	}
+}
+
+// scribble writes garbage values to undefined vector set values. This is only
+// called in test builds to make detecting bugs easier.
+func (vs *RaBitQuantizedVectorSet) scribble(start, end int) {
+	for i := start; i < end; i++ {
+		vs.CodeCounts[i] = 0xBADF00D
+	}
+	for i := start; i < end; i++ {
+		vs.CentroidDistances[i] = math.Pi
+	}
+	for i := start; i < end; i++ {
+		vs.QuantizedDotProducts[i] = math.Pi
+	}
+	if vs.Metric != vecpb.L2SquaredDistance {
+		for i := start; i < end; i++ {
+			vs.CentroidDotProducts[i] = math.Pi
+		}
+	}
+	// RaBitQCodeSet Clear and AddUndefined methods take care of scribbling
+	// memory for vs.Codes.
 }
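
The scribble pattern consolidated here is a cheap defensive trick: in test builds (gated on buildutil.CrdbTestBuild), slots that are grown but left undefined are overwritten with conspicuous sentinels (0xBADF00D, math.Pi) so that any read-before-write produces obviously bogus values instead of plausible-looking stale data. A generic sketch of the idea, with a local constant standing in for the build flag:

package main

import (
	"fmt"
	"math"
)

// testBuild stands in for buildutil.CrdbTestBuild, which is true only in
// test builds.
const testBuild = true

// addUndefined extends vals by count elements whose contents the caller is
// expected to overwrite. In test builds the new slots are scribbled with a
// sentinel so a read-before-write yields an unmistakably wrong value.
func addUndefined(vals []float32, count int) []float32 {
	newCount := len(vals) + count
	vals = append(vals, make([]float32, count)...)
	if testBuild {
		for i := newCount - count; i < newCount; i++ {
			vals[i] = math.Pi // clearly not a real distance or dot product
		}
	}
	return vals
}

func main() {
	vals := addUndefined(nil, 3)
	fmt.Println(vals) // [3.1415927 3.1415927 3.1415927]
}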

pkg/sql/vecindex/cspann/quantize/rabitqpb_test.go

Lines changed: 10 additions & 1 deletion

@@ -105,10 +105,15 @@ func TestRaBitQuantizedVectorSet(t *testing.T) {
 	require.Equal(t, []float32{10}, cloned.CentroidDistances)
 	require.Equal(t, []float32{10}, cloned.QuantizedDotProducts)

+	// Clear the set and ensure that norm is not updated.
+	quantizedSet.Clear(quantizedSet.Centroid)
+	require.Equal(t, float32(0), quantizedSet.CentroidNorm)
+
 	// Test InnerProduct distance metric, which uses the CentroidDotProducts
 	// field (L2Squared does not use it).
-	quantizedSet.Clear(quantizedSet.Centroid)
 	quantizedSet.Metric = vecpb.InnerProductDistance
+	quantizedSet.Clear(quantizedSet.Centroid)
+	require.Equal(t, float32(0), quantizedSet.CentroidNorm)
 	quantizedSet.AddUndefined(2)
 	copy(quantizedSet.Codes.At(1), []uint64{1, 2, 3})
 	quantizedSet.CodeCounts[1] = 15
@@ -124,4 +129,8 @@ func TestRaBitQuantizedVectorSet(t *testing.T) {
 	require.Len(t, cloned.CentroidDotProducts, 2)
 	cloned.Clear(quantizedSet.Centroid)
 	require.Len(t, cloned.CentroidDotProducts, 0)
+
+	// Update the centroid and ensure that norm is updated.
+	quantizedSet.Clear([]float32{2, 3, 6})
+	require.Equal(t, float32(7), quantizedSet.CentroidNorm)
 }
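
The final assertion is easy to verify by hand: the norm of the new centroid is ‖(2, 3, 6)‖ = √(2² + 3² + 6²) = √49 = 7, so a CentroidNorm of exactly 7 confirms that Clear recomputed it from the new centroid rather than keeping the old value.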

pkg/sql/vecindex/cspann/testdata/search-embeddings.ddt

Lines changed: 13 additions & 13 deletions

@@ -197,7 +197,7 @@ vec420: 3781823
 new-index dataset=fashion-784d-1k.gob dataset-count=1000 distance-metric=Cosine min-partition-size=4 max-partition-size=16 quality-samples=8 beam-size=4 hide-tree
 ----
 Created index with 1000 vectors with 784 dimensions.
-3 levels, 209 partitions.
+3 levels, 211 partitions.
 CV stats:
   level 2 - mean: 0.0000, stdev: 0.0000
   level 3 - mean: 0.0000, stdev: 0.0000
@@ -215,7 +215,7 @@ vec409: 0.1185
 vec144: 0.1197
 vec476: 0.124
 vec109: 0.1273
-1000 leaf vectors, 1108 vectors, 11 full vectors, 109 partitions
+1000 leaf vectors, 1109 vectors, 12 full vectors, 110 partitions

 # Now use lower beam size.
 search max-results=10 use-dataset=999 beam-size=8
@@ -230,13 +230,13 @@ vec409: 0.1185
 vec144: 0.1197
 vec476: 0.124
 vec109: 0.1273
-84 leaf vectors, 135 vectors, 11 full vectors, 13 partitions
+91 leaf vectors, 134 vectors, 12 full vectors, 13 partitions

 # InnerProduct.
 new-index dataset=fashion-784d-1k.gob dataset-count=1000 distance-metric=InnerProduct min-partition-size=4 max-partition-size=16 quality-samples=8 beam-size=4 hide-tree
 ----
 Created index with 1000 vectors with 784 dimensions.
-3 levels, 239 partitions.
+3 levels, 245 partitions.
 CV stats:
   level 2 - mean: 0.0000, stdev: 0.0000
   level 3 - mean: 0.0000, stdev: 0.0000
@@ -257,22 +257,22 @@ vec312: -14063724
 vec197: -14040257
 vec476: -13816669
 vec311: -13589641
-1000 leaf vectors, 1123 vectors, 18 full vectors, 124 partitions
+1000 leaf vectors, 1125 vectors, 21 full vectors, 126 partitions

 # Now use lower beam size.
 search max-results=10 use-dataset=999 beam-size=8
 ----
+vec109: -14526173
 vec811: -14265605
-vec312: -14063724
-vec311: -13589641
-vec265: -13573769
+vec660: -13573067
 vec984: -13534513
 vec610: -13491291
-vec220: -13433810
+vec226: -13364679
+vec144: -13148124
 vec968: -13060514
 vec999: -12779612
-vec735: -12533078
-71 leaf vectors, 131 vectors, 12 full vectors, 13 partitions
+vec853: -12163027
+64 leaf vectors, 115 vectors, 10 full vectors, 13 partitions

 # ----------------------------------------------------------------------
 # Load 950 1536-dimension image embeddings and search them using Cosine
@@ -322,13 +322,13 @@ CV stats:

 recall topk=10 beam-size=4 samples=50
 ----
-50.60% recall@10
+50.40% recall@10
 44 leaf vectors, 74 vectors, 18 full vectors, 7 partitions

 recall topk=10 beam-size=8 samples=50
 ----
 69.80% recall@10
-86 leaf vectors, 136 vectors, 21 full vectors, 13 partitions
+86 leaf vectors, 136 vectors, 22 full vectors, 13 partitions

 recall topk=10 beam-size=16 samples=50
 ----