
Commit cbea4d2

craig[bot], KeithCh, yuzefovich, stevendanna, and rickystewart committed
154552: changefeedccl: improve parallel io metrics r=log-head,asg0451 a=KeithCh

**changefeedccl: improve parallel io metrics**

Rename the function parameter used to update the pending rows metric, and that metric's y-axis label, to be more accurate.

Release note (ops change): Fix the changefeed.parallel_io_pending_rows metric y-axis label to match the metric's definition.

Fixes: #147625

---

**changefeedccl: add parallel io workers metric**

Add a gauge metric to track the number of workers in ParallelIO.

Release note (ops change): Add the metric changefeed.parallel_io_workers to track the number of workers in ParallelIO.

Resolves: #147625

154651: opt/bench: improve BenchmarkEndToEnd for INSERTs r=yuzefovich a=yuzefovich

In `BenchmarkEndToEnd` we have 3 bench cases that run INSERT statements. Previously, we always used the same placeholder values, which forced us to do TRUNCATE TABLE after _every_ iteration, and that TRUNCATE was included in the operation time. We recently saw a supposed regression on this benchmark because the performance of TRUNCATE has regressed.

In my initial approach I tried simply stopping and starting the timer around the TRUNCATE, but that made the benchmark extremely long. (Timer operations require a stop-the-world pause, and since the time to perform TRUNCATE was no longer included in the benchmark time, every single iteration seemed very short, so we'd do thousands of iterations with the default `bench-time=1s`, and truncating the table would stretch the run to about 1 minute.)

To work around this, I refactored the three INSERT queries to generate slightly different arguments for each iteration so that we don't get PK duplicates, and then moved the TRUNCATE outside the benchmark loop (and also excluded it from the timer). Now these benchmark cases truly measure what they were supposed to. (A sketch of this pattern follows the commit header below.)

Fixes: #154597.

Release note: None

154750: dbconsole: custom metrics update when units change r=dhartunian a=stevendanna

This fixes a long-standing bug in which changing the axis units fails to update the graph unless you also make some other change.

This PR was generated by Claude Code. I asked it to write a test and it produced something with enough mocks that I wasn't sure of its value. I have manually tested this and confirmed that custom graphs are updated immediately when the axis units are changed.

Epic: none

Release note: None

154771: workflows: change name of provider r=rail a=rickystewart

Part of: DEVINFHD-1916

Co-authored-by: Keith Chow <[email protected]>
Co-authored-by: Yahor Yuzefovich <[email protected]>
Co-authored-by: Steven Danna <[email protected]>
Co-authored-by: Ricky Stewart <[email protected]>
5 parents 61a9d7d + e92277e + 978dc8d + 9bb8477 + dc7ef4e commit cbea4d2
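
For illustration, here is a minimal Go sketch of the benchmark pattern described in the #154651 commit message above: derive unique placeholder arguments from the iteration counter so repeated INSERTs never collide on the primary key, and run TRUNCATE once, outside the timed loop. This is not the actual BenchmarkEndToEnd code; the table name `kv`, the `*sql.DB` handle, and `benchmarkInsertUniqueArgs` are hypothetical stand-ins.

package bench

import (
	"database/sql"
	"testing"
)

// benchmarkInsertUniqueArgs illustrates the pattern from #154651: derive a
// unique primary key from the iteration counter so the same prepared INSERT
// can run on every iteration without PK collisions, and keep TRUNCATE (and
// any other cleanup) outside the timed loop.
//
// The table name "kv" and the *sql.DB handle are hypothetical; the real
// benchmark lives in pkg/sql/opt/bench.
func benchmarkInsertUniqueArgs(b *testing.B, db *sql.DB) {
	insertStmt, err := db.Prepare(`INSERT INTO kv (k, v) VALUES ($1, $2)`)
	if err != nil {
		b.Fatal(err)
	}
	defer insertStmt.Close()

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		// Unique key per iteration: no duplicates, so no per-iteration TRUNCATE.
		if _, err := insertStmt.Exec(i, i*10); err != nil {
			b.Fatal(err)
		}
	}
	b.StopTimer()

	// Cleanup runs once, after the loop, and is excluded from the timer.
	if _, err := db.Exec(`TRUNCATE TABLE kv`); err != nil {
		b.Fatal(err)
	}
}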

File tree

7 files changed: +457 −209 lines


.github/workflows/pr-analyzer-threestage.yml

Lines changed: 1 addition & 1 deletion
@@ -23,7 +23,7 @@ jobs:
         with:
           project_id: 'vertex-model-runners'
           service_account: '[email protected]'
-          workload_identity_provider: 'projects/72497726731/locations/global/workloadIdentityPools/ai-review/providers/github'
+          workload_identity_provider: 'projects/72497726731/locations/global/workloadIdentityPools/ai-review/providers/ai-review'
 
       - name: Stage 1 - Initial Bug Screening
         id: stage1

docs/generated/metrics/metrics.yaml

Lines changed: 9 additions & 1 deletion
@@ -1780,7 +1780,7 @@ layers:
   - name: changefeed.parallel_io_pending_rows
     exported_name: changefeed_parallel_io_pending_rows
     description: Number of rows which are blocked from being sent due to conflicting in-flight keys
-    y_axis_label: Keys
+    y_axis_label: Messages
     type: GAUGE
     unit: COUNT
     aggregation: AVG
@@ -1801,6 +1801,14 @@ layers:
     unit: NANOSECONDS
     aggregation: AVG
     derivative: NONE
+  - name: changefeed.parallel_io_workers
+    exported_name: changefeed_parallel_io_workers
+    description: The number of workers in the ParallelIO
+    y_axis_label: Workers
+    type: GAUGE
+    unit: COUNT
+    aggregation: AVG
+    derivative: NONE
   - name: changefeed.progress_skew.span
     exported_name: changefeed_progress_skew_span
     description: The time difference between the fastest and slowest span's resolved timestamp

pkg/ccl/changefeedccl/changefeed_test.go

Lines changed: 16 additions & 2 deletions
@@ -11712,9 +11712,9 @@ func TestParallelIOMetrics(t *testing.T) {
 	testFn := func(t *testing.T, s TestServer, f cdctest.TestFeedFactory) {
 		registry := s.Server.JobRegistry().(*jobs.Registry)
 		metrics := registry.MetricsStruct().Changefeed.(*Metrics).AggMetrics
-
+		numWorkers := 1
 		db := sqlutils.MakeSQLRunner(s.DB)
-		db.Exec(t, `SET CLUSTER SETTING changefeed.sink_io_workers = 1`)
+		db.Exec(t, fmt.Sprintf(`SET CLUSTER SETTING changefeed.sink_io_workers = %d`, numWorkers))
 		db.Exec(t, `
 		CREATE TABLE foo (a INT PRIMARY KEY);
 		`)
@@ -11743,6 +11743,7 @@ func TestParallelIOMetrics(t *testing.T) {
 		// Set the frequency to 1s. The default frequency at the time of writing is
 		foo, err := f.Feed("CREATE CHANGEFEED FOR TABLE foo WITH pubsub_sink_config=" +
 			"'{\"Flush\": {\"Frequency\": \"100ms\"}}'")
+		defer closeFeed(t, foo)
 		require.NoError(t, err)
 
 		testutils.SucceedsSoon(t, func() error {
@@ -11775,6 +11776,19 @@ func TestParallelIOMetrics(t *testing.T) {
 			}
 			return nil
 		})
+
+		assert.Equal(t, int64(numWorkers), metrics.ParallelIOWorkers.Value())
+		jobFeed := foo.(cdctest.EnterpriseTestFeed)
+		require.NoError(t, jobFeed.Pause())
+		db.Exec(t, fmt.Sprintf(`SET CLUSTER SETTING changefeed.sink_io_workers = %d`, numWorkers+1))
+		require.NoError(t, jobFeed.Resume())
+		testutils.SucceedsSoon(t, func() error {
+			if metrics.ParallelIOWorkers.Value() != int64(numWorkers+1) {
+				return errors.Newf("waiting for workers: %d", metrics.ParallelIOWorkers.Value())
+			}
+			return nil
+		})
+
 		close(done)
 		require.NoError(t, g.Wait())
 		require.NoError(t, foo.Close())

pkg/ccl/changefeedccl/metrics.go

Lines changed: 26 additions & 3 deletions
@@ -69,6 +69,7 @@ type AggMetrics struct {
 	ParallelIOPendingRows      *aggmetric.AggGauge
 	ParallelIOResultQueueNanos *aggmetric.AggHistogram
 	ParallelIOInFlightKeys     *aggmetric.AggGauge
+	ParallelIOWorkers          *aggmetric.AggGauge
 	SinkIOInflight             *aggmetric.AggGauge
 	SinkBackpressureNanos      *aggmetric.AggHistogram
 	CommitLatency              *aggmetric.AggHistogram
@@ -128,6 +129,7 @@ type metricsRecorder interface {
 	recordSizeBasedFlush()
 	newParallelIOMetricsRecorder() parallelIOMetricsRecorder
 	recordSinkIOInflightChange(int64)
+	recordParallelIOWorkers(int64)
 	recordSinkBackpressure(time.Duration)
 	makeCloudstorageFileAllocCallback() func(delta int64)
 	getKafkaThrottlingMetrics(*cluster.Settings) metrics.Histogram
@@ -158,6 +160,7 @@ type sliMetrics struct {
 	ParallelIOPendingRows      *aggmetric.Gauge
 	ParallelIOResultQueueNanos *aggmetric.Histogram
 	ParallelIOInFlightKeys     *aggmetric.Gauge
+	ParallelIOWorkers          *aggmetric.Gauge
 	SinkIOInflight             *aggmetric.Gauge
 	SinkBackpressureNanos      *aggmetric.Histogram
 	CommitLatency              *aggmetric.Histogram
@@ -554,8 +557,8 @@ func (k *kafkaHistogramAdapter) Variance() (_ float64) {
 }
 
 type parallelIOMetricsRecorder interface {
-	recordPendingQueuePush(numKeys int64)
-	recordPendingQueuePop(numKeys int64, latency time.Duration)
+	recordPendingQueuePush(numMessages int64)
+	recordPendingQueuePop(numMessages int64, latency time.Duration)
 	recordResultQueueLatency(latency time.Duration)
 	setInFlightKeys(n int64)
 }
@@ -626,6 +629,14 @@ func (m *sliMetrics) recordSinkIOInflightChange(delta int64) {
 	m.SinkIOInflight.Inc(delta)
 }
 
+func (m *sliMetrics) recordParallelIOWorkers(n int64) {
+	if m == nil {
+		return
+	}
+
+	m.ParallelIOWorkers.Update(n)
+}
+
 func (m *sliMetrics) recordSinkBackpressure(duration time.Duration) {
 	if m == nil {
 		return
@@ -712,6 +723,10 @@ func (w *wrappingCostController) recordSinkIOInflightChange(delta int64) {
 	w.inner.recordSinkIOInflightChange(delta)
 }
 
+func (w *wrappingCostController) recordParallelIOWorkers(n int64) {
+	w.inner.recordParallelIOWorkers(n)
+}
+
 func (w *wrappingCostController) recordSinkBackpressure(duration time.Duration) {
 	w.inner.recordSinkBackpressure(duration)
 }
@@ -976,7 +991,7 @@ func newAggregateMetrics(histogramWindow time.Duration, lookup *cidr.Lookup) *Ag
 	metaChangefeedParallelIOPendingRows := metric.Metadata{
 		Name:        "changefeed.parallel_io_pending_rows",
 		Help:        "Number of rows which are blocked from being sent due to conflicting in-flight keys",
-		Measurement: "Keys",
+		Measurement: "Messages",
 		Unit:        metric.Unit_COUNT,
 	}
 	metaChangefeedParallelIOResultQueueNanos := metric.Metadata{
@@ -992,6 +1007,12 @@ func newAggregateMetrics(histogramWindow time.Duration, lookup *cidr.Lookup) *Ag
 		Measurement: "Keys",
 		Unit:        metric.Unit_COUNT,
 	}
+	metaChangefeedParallelIOWorkers := metric.Metadata{
+		Name:        "changefeed.parallel_io_workers",
+		Help:        "The number of workers in the ParallelIO",
+		Measurement: "Workers",
+		Unit:        metric.Unit_COUNT,
+	}
 	metaChangefeedSinkIOInflight := metric.Metadata{
 		Name:        "changefeed.sink_io_inflight",
 		Help:        "The number of keys currently inflight as IO requests being sent to the sink",
@@ -1138,6 +1159,7 @@ func newAggregateMetrics(histogramWindow time.Duration, lookup *cidr.Lookup) *Ag
 			SigFigs:      2,
 			BucketConfig: metric.ChangefeedBatchLatencyBuckets,
 		}),
+		ParallelIOWorkers: b.Gauge(metaChangefeedParallelIOWorkers),
 		BatchHistNanos: b.Histogram(metric.HistogramOptions{
 			Metadata: metaChangefeedBatchHistNanos,
 			Duration: histogramWindow,
@@ -1245,6 +1267,7 @@ func (a *AggMetrics) getOrCreateScope(scope string) (*sliMetrics, error) {
 		ParallelIOPendingRows:      a.ParallelIOPendingRows.AddChild(scope),
 		ParallelIOResultQueueNanos: a.ParallelIOResultQueueNanos.AddChild(scope),
 		ParallelIOInFlightKeys:     a.ParallelIOInFlightKeys.AddChild(scope),
+		ParallelIOWorkers:          a.ParallelIOWorkers.AddChild(scope),
 		SinkIOInflight:             a.SinkIOInflight.AddChild(scope),
 		SinkBackpressureNanos:      a.SinkBackpressureNanos.AddChild(scope),
 		CommitLatency:              a.CommitLatency.AddChild(scope),
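
As an aside, the metrics.go diff above follows a small, repeating pattern: a gauge field on the per-scope metrics struct, a nil-receiver-safe recorder method that updates it, and a wrapper that forwards to an inner recorder. The following stand-alone Go sketch illustrates that pattern under simplified assumptions; simpleGauge, scopedMetrics, and wrappingRecorder are hypothetical stand-ins, not the aggmetric API used in CockroachDB.

package main

import (
	"fmt"
	"sync/atomic"
)

// simpleGauge is a stand-in for an aggregated gauge: last write wins.
type simpleGauge struct{ v atomic.Int64 }

func (g *simpleGauge) Update(n int64) { g.v.Store(n) }
func (g *simpleGauge) Value() int64   { return g.v.Load() }

// scopedMetrics plays the role of sliMetrics in the diff above.
type scopedMetrics struct {
	ParallelIOWorkers simpleGauge
}

// recordParallelIOWorkers mirrors the nil-receiver-safe recorder added in the
// diff: callers never have to check whether metrics are configured.
func (m *scopedMetrics) recordParallelIOWorkers(n int64) {
	if m == nil {
		return
	}
	m.ParallelIOWorkers.Update(n)
}

// wrappingRecorder plays the role of wrappingCostController: it simply
// forwards to the inner recorder.
type wrappingRecorder struct{ inner *scopedMetrics }

func (w *wrappingRecorder) recordParallelIOWorkers(n int64) {
	w.inner.recordParallelIOWorkers(n)
}

func main() {
	m := &scopedMetrics{}
	w := &wrappingRecorder{inner: m}
	// Set once when the worker pool is constructed, as NewParallelIO does below.
	w.recordParallelIOWorkers(4)
	fmt.Println(m.ParallelIOWorkers.Value()) // 4
}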

pkg/ccl/changefeedccl/parallel_io.go

Lines changed: 1 addition & 1 deletion
@@ -100,7 +100,7 @@ func NewParallelIO(
 	wg.GoCtx(func(ctx context.Context) error {
 		return io.processIO(ctx, numWorkers)
 	})
-
+	io.metrics.recordParallelIOWorkers(int64(numWorkers))
 	return io
 }
 
