Commit fdb417a
kvserver: improve tenant_id CPU observability
Motivated by cockroachlabs/support#3297. This commit

- adds a new `replicas.cpunanospersecond` metric, which aggregates the
  Replica ReqCPUNanosPerSecond at the tenant level, and
- adds a tenant_id tag to CPU profiles.

This should simplify investigations related to tenant-induced overload: the
new metric should often help pinpoint the set of hot tenants, and CPU
profiles can help dig into the specific code paths those tenants are
exercising. This can then be rounded out with the existing metrics for
request counts by tenant (both received by KV and sent by the tenant Pod)
for a comprehensive picture.

Release note: the `replicas.cpunanospersecond` metric was added. Notably,
when child labels are enabled, it exposes evaluation-related Replica CPU
usage by tenant.

Epic: none
1 parent c860bad commit fdb417a

7 files changed: +59 −13 lines changed

docs/generated/metrics/metrics.yaml

Lines changed: 8 additions & 0 deletions
@@ -15115,6 +15115,14 @@ layers:
     unit: COUNT
     aggregation: AVG
     derivative: NONE
+  - name: replicas.cpunanospersecond
+    exported_name: replicas_cpunanospersecond
+    description: Nanoseconds of CPU time in Replica request processing including evaluation but not replication
+    y_axis_label: Nanoseconds
+    type: COUNTER
+    unit: NANOSECONDS
+    aggregation: AVG
+    derivative: NON_NEGATIVE_DERIVATIVE
   - name: replicas.leaders
     exported_name: replicas_leaders
     description: Number of raft leaders

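Since the new entry is typed COUNTER with derivative NON_NEGATIVE_DERIVATIVE, dashboards are expected to chart its rate of change rather than the raw cumulative nanoseconds. Assuming a Prometheus-style scrape of the exported name and the per-tenant child labels mentioned in the release note, a per-tenant CPU rate could be charted as sum by (tenant_id) (rate(replicas_cpunanospersecond[5m])) (an illustrative query, not one shipped with this commit).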
pkg/kv/kvserver/metrics.go

Lines changed: 15 additions & 0 deletions
@@ -249,6 +249,13 @@ var (
 		Unit:        metric.Unit_COUNT,
 	}
 
+	metaReqCPUNanos = metric.Metadata{
+		Name:        "replicas.cpunanospersecond",
+		Help:        "Nanoseconds of CPU time in Replica request processing including evaluation but not replication",
+		Measurement: "Nanoseconds",
+		Unit:        metric.Unit_NANOSECONDS,
+	}
+
 	// Storage metrics.
 	metaLiveBytes = metric.Metadata{
 		Name: "livebytes",
@@ -3215,6 +3222,7 @@ type StoreMetrics struct {
 type TenantsStorageMetrics struct {
 	// NB: If adding more metrics to this struct, be sure to
 	// also update tenantsStorageMetricsSet().
+	ReqCPUNanos *aggmetric.AggCounterFloat64
 	LiveBytes   *aggmetric.AggGauge
 	KeyBytes    *aggmetric.AggGauge
 	ValBytes    *aggmetric.AggGauge
@@ -3251,6 +3259,7 @@ type TenantsStorageMetrics struct {
 // see kvbase.TenantsStorageMetricsSet for public access. Assigned in init().
 func tenantsStorageMetricsSet() map[string]struct{} {
 	return map[string]struct{}{
+		metaReqCPUNanos.Name: {},
 		metaLiveBytes.Name:   {},
 		metaKeyBytes.Name:    {},
 		metaValBytes.Name:    {},
@@ -3313,6 +3322,7 @@ func (sm *TenantsStorageMetrics) acquireTenant(tenantID roachpb.TenantID) *tenan
 	// Successfully stored a new instance, initialize it and then unlock it.
 	tenantIDStr := tenantID.String()
 	m.mu.refCount++
+	m.ReqCPUNanos = sm.ReqCPUNanos.AddChild(tenantIDStr)
 	m.LiveBytes = sm.LiveBytes.AddChild(tenantIDStr)
 	m.KeyBytes = sm.KeyBytes.AddChild(tenantIDStr)
 	m.ValBytes = sm.ValBytes.AddChild(tenantIDStr)
@@ -3360,6 +3370,8 @@ func (sm *TenantsStorageMetrics) releaseTenant(ctx context.Context, m *tenantSto
 	// The refCount is zero, delete this instance after destroying its metrics.
 	// Note that concurrent attempts to create an instance will detect the zero
 	// refCount value and construct a new instance.
+	m.ReqCPUNanos.Unlink() // counter
+	m.ReqCPUNanos = nil
 	for _, gptr := range []**aggmetric.Gauge{
 		&m.LiveBytes,
 		&m.KeyBytes,
@@ -3409,6 +3421,8 @@ type tenantStorageMetrics struct {
 		stack debugutil.SafeStack
 	}
 
+	ReqCPUNanos *aggmetric.CounterFloat64
+
 	LiveBytes *aggmetric.Gauge
 	KeyBytes  *aggmetric.Gauge
 	ValBytes  *aggmetric.Gauge
@@ -3442,6 +3456,7 @@ func (tm *tenantStorageMetrics) assert(ctx context.Context) {
 func newTenantsStorageMetrics() *TenantsStorageMetrics {
 	b := aggmetric.MakeBuilder(multitenant.TenantIDLabel)
 	sm := &TenantsStorageMetrics{
+		ReqCPUNanos: b.CounterFloat64(metaReqCPUNanos),
 		LiveBytes:   b.Gauge(metaLiveBytes),
 		KeyBytes:    b.Gauge(metaKeyBytes),
 		ValBytes:    b.Gauge(metaValBytes),

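The changes above all follow one parent/child pattern: the store registers a single aggregate metric, and each tenant gets a labeled child that feeds into it. A minimal sketch, assuming only the aggmetric APIs that already appear in this diff (MakeBuilder, CounterFloat64, AddChild, Inc, Unlink):

	func example() {
		b := aggmetric.MakeBuilder(multitenant.TenantIDLabel)
		parent := b.CounterFloat64(metaReqCPUNanos) // registered once, exported as the aggregate

		child := parent.AddChild("5") // adds a time series labeled tenant_id="5"
		child.Inc(1500.0)             // counted by both the child and the parent
		child.Unlink()                // stops exporting the child series
	}

Here acquireTenant wires up AddChild per tenant, and releaseTenant unlinks the child once the last reference to the tenant's metrics goes away.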
pkg/kv/kvserver/mvcc_gc_queue.go

Lines changed: 1 addition & 1 deletion
@@ -664,7 +664,7 @@ func (mgcq *mvccGCQueue) process(
 ) (processed bool, err error) {
 	// Record the CPU time processing the request for this replica. This is
 	// recorded regardless of errors that are encountered.
-	defer repl.MeasureReqCPUNanos(grunning.Time())
+	defer repl.MeasureReqCPUNanos(ctx, grunning.Time())
 
 	// Lookup the descriptor and GC policy for the zone containing this key range.
 	desc, conf := repl.DescAndSpanConfig()

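A Go subtlety makes the one-line change above work: arguments to a deferred call are evaluated at the defer statement itself, so grunning.Time() is read once on entry, while MeasureReqCPUNanos only runs when process returns and computes the elapsed on-CPU time from that start reading. A self-contained sketch of the same pattern, with nanosRunning as a hypothetical stand-in for grunning.Time():

	package main

	import (
		"fmt"
		"time"
	)

	// nanosRunning stands in for grunning.Time(); here it just returns a
	// wall-clock reading so the sketch runs anywhere.
	func nanosRunning() time.Duration {
		return time.Duration(time.Now().UnixNano())
	}

	// report receives the start reading captured at the defer statement and
	// takes the second reading itself, at function exit.
	func report(start time.Duration) {
		fmt.Printf("elapsed: %s\n", nanosRunning()-start)
	}

	func main() {
		defer report(nanosRunning())      // nanosRunning() evaluated now; report() at exit
		time.Sleep(10 * time.Millisecond) // stand-in for request processing
	}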
pkg/kv/kvserver/replica.go

Lines changed: 15 additions & 1 deletion
@@ -2768,9 +2768,23 @@ func init() {
 
 // MeasureReqCPUNanos measures the cpu time spent on this replica processing
 // requests.
-func (r *Replica) MeasureReqCPUNanos(start time.Duration) {
+func (r *Replica) MeasureReqCPUNanos(ctx context.Context, start time.Duration) {
 	r.measureNanosRunning(start, func(dur float64) {
 		r.loadStats.RecordReqCPUNanos(dur)
+		// NB: the caller also has a tenant ID, but we use the replica's here for
+		// simplicity. There is no established pattern for short-lived references
+		// to a specific tenant's metrics.
+		if r.tenantMetricsRef != nil {
+			// We can *not* use the tenant metrics directly because nothing in
+			// this code path prevents the surrounding replica from getting
+			// destroyed, which could zero the refcount and release the metrics
+			// object. Instead, we go through acquireTenant, which gives us an
+			// object that is and remains valid. This is not an expensive
+			// operation in the common case (the replica still exists).
+			tm := r.store.metrics.acquireTenant(r.tenantMetricsRef.tenantID)
+			tm.ReqCPUNanos.Inc(dur)
+			r.store.metrics.releaseTenant(ctx, tm)
+		}
 	})
 }
 

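The long comment above is about lifetime, not performance: the replica's cached tenantMetricsRef can be released by a concurrent replica destruction, so the increment must go through its own acquire/release pair. A minimal sketch of that refcounting pattern (not CockroachDB's implementation, just the shape the comment describes):

	package tenantmetrics

	import "sync"

	type tenantMetrics struct {
		refCount int
		cpuNanos float64 // the per-tenant counter being protected
	}

	type registry struct {
		mu      sync.Mutex
		tenants map[uint64]*tenantMetrics
	}

	func newRegistry() *registry {
		return &registry{tenants: map[uint64]*tenantMetrics{}}
	}

	// acquire pins the tenant's metrics object; a concurrent release cannot
	// free it while the caller holds the reference.
	func (r *registry) acquire(id uint64) *tenantMetrics {
		r.mu.Lock()
		defer r.mu.Unlock()
		m := r.tenants[id]
		if m == nil {
			m = &tenantMetrics{}
			r.tenants[id] = m
		}
		m.refCount++
		return m
	}

	// release drops the pin and deletes the instance once the last
	// reference disappears.
	func (r *registry) release(id uint64, m *tenantMetrics) {
		r.mu.Lock()
		defer r.mu.Unlock()
		m.refCount--
		if m.refCount == 0 {
			delete(r.tenants, id)
		}
	}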
pkg/kv/kvserver/replica_rate_limit.go

Lines changed: 4 additions & 3 deletions
@@ -18,12 +18,13 @@ import (
 // maybeRateLimitBatch may block the batch waiting to be rate-limited. Note that
 // the replica must be initialized and thus there is no synchronization issue
 // on the tenantRateLimiter.
-func (r *Replica) maybeRateLimitBatch(ctx context.Context, ba *kvpb.BatchRequest) error {
+func (r *Replica) maybeRateLimitBatch(
+	ctx context.Context, ba *kvpb.BatchRequest, tenantIDOrZero roachpb.TenantID,
+) error {
 	if r.tenantLimiter == nil {
 		return nil
 	}
-	tenantID, ok := roachpb.ClientTenantFromContext(ctx)
-	if !ok || tenantID == roachpb.SystemTenantID {
+	if !tenantIDOrZero.IsSet() || tenantIDOrZero.IsSystem() {
 		return nil
 	}
 

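The signature change pushes tenant extraction up to the caller and leans on a zero-value sentinel: an unset TenantID means "no client tenant", and the system tenant is exempt from limiting. A sketch of that convention, mirroring how IsSet and IsSystem are used above (the real type is roachpb.TenantID):

	// Sketch only: assumes ID 0 means "unset" and ID 1 is the system tenant.
	type TenantID struct{ id uint64 }

	func (t TenantID) IsSet() bool    { return t.id != 0 }
	func (t TenantID) IsSystem() bool { return t.id == 1 }

One consequence of the plumbing: ClientTenantFromContext is now consulted once per batch in SendWithWriteBytes, and the same value feeds both the rate limiter and the pprof labels.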
pkg/kv/kvserver/replica_send.go

Lines changed: 15 additions & 7 deletions
@@ -123,10 +123,23 @@ func (r *Replica) Send(
 func (r *Replica) SendWithWriteBytes(
 	ctx context.Context, ba *kvpb.BatchRequest,
 ) (*kvpb.BatchResponse, *kvadmission.StoreWriteBytes, *kvpb.Error) {
+	tenantIDOrZero, _ := roachpb.ClientTenantFromContext(ctx)
+
+	// Record the CPU time processing the request for this replica. This is
+	// recorded regardless of errors that are encountered.
+	startCPU := grunning.Time()
+	defer r.MeasureReqCPUNanos(ctx, startCPU)
+
 	if r.store.cfg.Settings.CPUProfileType() == cluster.CPUProfileWithLabels {
 		defer pprof.SetGoroutineLabels(ctx)
 		// Note: the defer statement captured the previous context.
-		ctx = pprof.WithLabels(ctx, pprof.Labels("range_str", r.rangeStr.ID()))
+		var lbls pprof.LabelSet
+		if tenantIDOrZero.IsSet() {
+			lbls = pprof.Labels("range_str", r.rangeStr.ID(), "tenant_id", tenantIDOrZero.String())
+		} else {
+			lbls = pprof.Labels("range_str", r.rangeStr.ID())
+		}
+		ctx = pprof.WithLabels(ctx, lbls)
 		pprof.SetGoroutineLabels(ctx)
 	}
 	if trace.IsEnabled() {
@@ -135,11 +148,6 @@ func (r *Replica) SendWithWriteBytes(
 	// Add the range log tag.
 	ctx = r.AnnotateCtx(ctx)
 
-	// Record the CPU time processing the request for this replica. This is
-	// recorded regardless of errors that are encountered.
-	startCPU := grunning.Time()
-	defer r.MeasureReqCPUNanos(startCPU)
-
 	isReadOnly := ba.IsReadOnly()
 	if err := r.checkBatchRequest(ba, isReadOnly); err != nil {
 		return nil, nil, kvpb.NewError(err)
@@ -148,7 +156,7 @@ func (r *Replica) SendWithWriteBytes(
 	if err := r.maybeBackpressureBatch(ctx, ba); err != nil {
 		return nil, nil, kvpb.NewError(err)
 	}
-	if err := r.maybeRateLimitBatch(ctx, ba); err != nil {
+	if err := r.maybeRateLimitBatch(ctx, ba, tenantIDOrZero); err != nil {
 		return nil, nil, kvpb.NewError(err)
 	}
 	if err := r.maybeCommitWaitBeforeCommitTrigger(ctx, ba); err != nil {

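The profiler-label block above uses the standard runtime/pprof API: labels attached to a goroutine tag every CPU sample taken while they are set, and the deferred SetGoroutineLabels(ctx) restores the pre-label state because defer captures ctx before it is reassigned. A minimal self-contained sketch of the same mechanics:

	package profiling

	import (
		"context"
		"runtime/pprof"
	)

	func handle(ctx context.Context, tenantID string) {
		// The deferred call captures the original ctx, restoring the prior
		// labels on exit.
		defer pprof.SetGoroutineLabels(ctx)
		ctx = pprof.WithLabels(ctx, pprof.Labels("tenant_id", tenantID))
		pprof.SetGoroutineLabels(ctx)
		// ... CPU samples taken here carry tenant_id as a profile tag ...
	}

A profile collected while such labels are set can then be narrowed to one tenant, for example with go tool pprof's -tagfocus=tenant_id=5 filter.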
pkg/kv/kvserver/replica_test.go

Lines changed: 1 addition & 1 deletion
@@ -14319,7 +14319,7 @@ func TestReplicaRateLimit(t *testing.T) {
 		ba.Add(&req)
 		ctx, cancel := context.WithTimeout(tenCtx, timeout)
 		defer cancel()
-		return tenRepl.maybeRateLimitBatch(ctx, ba)
+		return tenRepl.maybeRateLimitBatch(ctx, ba, ten123)
 	}
 
 	// Verify that first few writes succeed fast, but eventually requests start
