kvserver: allow logs from callbacks up to 15 replicas per updateReplicationGauges

wenyihu6 · wenyihu6 · commit 4dab0d8a3f38 · 2025-09-03T13:00:35.000-04:00
Previously, logs from the decommission nudger were not gated by a vmodule and
could become spammy when many replicas were decommissioned at a low nudger
frequency. This commit introduces a per-store budget, allowing logs from
callbacks for up to 15 replicas per updateReplicationGauges call.

Drawbacks of this approach:
- Replicas are not visited in a sorted order, so each updateReplicationGauges
  call may open the floodgates for a different set of 15 replicas.
- Once a replica is permitted to log, its future logs from callbacks are not
  restricted.
- If EnqueueProblemRangeInReplicateQueueInterval is set too low, the first two
  drawbacks may become worse.

For the first drawback, we could consider visiting the replica set with
WithReplicasInOrder. I'm not sure about the overhead here since
updateReplicationGauges is called periodically when collecting metrics.

Here are the reasons that I think this approach is acceptable for now:
- onEnqueueResult is unlikely to be reinvoked for replicas already in the queue
unless they are processing or in purgatory (both are short-lived states we want
visibility into). Once processed, replicas are removed from the set.
onProcessResult should be called at most twice. For replicas merely waiting in
the queue, the callback is not invoked, since their priority should not be
actively updated.
- We could cap logging per maybeEnqueueProblemRange, but granting full logging
permission for each replica simplifies reasoning and gives complete visibility
for specific replicas.
- In practice, escalations show that slow decommissioning usually involves <15
  ranges, and EnqueueProblemRangeInReplicateQueueInterval is typically large
  (~15 minutes).
diff --git a/pkg/kv/kvserver/replica.go b/pkg/kv/kvserver/replica.go
@@ -2918,8 +2918,9 @@ func (r *Replica) TestingRefreshLeaderlessWatcherUnavailableState(
 // manner via the replica scanner, see #130199. This functionality is disabled
 // by default for this reason.
 func (r *Replica) maybeEnqueueProblemRange(
-	ctx context.Context, now time.Time, leaseValid, isLeaseholder bool,
+	ctx context.Context, now time.Time, leaseValid, isLeaseholder bool, shouldLog bool,
 ) {
+
 	// The method expects the caller to provide whether the lease is valid and
 	// the replica is the leaseholder for the range, so that it can avoid
 	// unnecessary work. We expect this method to be called in the context of
@@ -2961,24 +2962,41 @@ func (r *Replica) maybeEnqueueProblemRange(
 	r.store.replicateQueue.AddAsyncWithCallback(ctx, r,
 		allocatorimpl.AllocatorReplaceDecommissioningVoter.Priority(), processCallback{
 			onEnqueueResult: func(indexOnHeap int, err error) {
-				if err != nil {
-					// TODO(wenyihu6): if we want to put these logs behind vmodule, move
-					// this function to another file so that we can avoid the spam on
-					// other logs.
-					log.KvDistribution.Infof(ctx,
-						"decommissioning nudger failed to enqueue range %v due to %v", r.Desc(), err)
+				if shouldLog {
+					if err != nil {
+						log.KvDistribution.Infof(ctx,
+							"decommissioning nudger failed to enqueue range %v due to %v", r.Desc(), err)
+					} else {
+						log.KvDistribution.Infof(ctx,
+							"decommissioning nudger successfully enqueued range %v at index %d", r.Desc(), indexOnHeap)
+					}
 				} else {
-					log.KvDistribution.Infof(ctx,
-						"decommissioning nudger successfully enqueued range %v at index %d", r.Desc(), indexOnHeap)
+					if err != nil {
+						log.KvDistribution.VInfof(ctx, 2,
+							"decommissioning nudger failed to enqueue range %v due to %v", r.Desc(), err)
+					} else {
+						log.KvDistribution.VInfof(ctx, 2,
+							"decommissioning nudger successfully enqueued range %v at index %d", r.Desc(), indexOnHeap)
+					}
 				}
 			},
 			onProcessResult: func(err error) {
-				if err != nil {
-					log.KvDistribution.Infof(ctx,
-						"decommissioning nudger failed to process range %v due to %v", r.Desc(), err)
+				if shouldLog {
+					if err != nil {
+						log.KvDistribution.Infof(ctx,
+							"decommissioning nudger failed to process range %v due to %v", r.Desc(), err)
+					} else {
+						log.KvDistribution.Infof(ctx,
+							"decommissioning nudger successfully processed replica %s", r.Desc())
+					}
 				} else {
-					log.KvDistribution.Infof(ctx,
-						"decommissioning nudger successfully processed replica %s", r.Desc())
+					if err != nil {
+						log.KvDistribution.VInfof(ctx, 2,
+							"decommissioning nudger failed to process range %v due to %v", r.Desc(), err)
+					} else {
+						log.KvDistribution.VInfof(ctx, 2,
+							"decommissioning nudger successfully processed replica %s", r.Desc())
+					}
 				}
 			},
 		})
diff --git a/pkg/kv/kvserver/store.go b/pkg/kv/kvserver/store.go
@@ -3418,6 +3418,12 @@ func (s *Store) updateReplicationGauges(ctx context.Context) error {
 	ioOverload, _ = s.ioThreshold.t.Score()
 	s.ioThreshold.Unlock()
 
+	// TODO(wenyihu6): it would be nicer if we can sort the replicas so that we
+	// can always get the nudger story on the same set of replicas, will this
+	// introduce a lot of overhead? For now, it seems fine since we usually see <
+	// 15 ranges on decommission stall.
+	var logBudgetOnDecommissioningNudger = 15
+
 	// We want to avoid having to read this multiple times during the replica
 	// visiting, so load it once up front for all nodes.
 	livenessMap := s.cfg.NodeLiveness.ScanNodeVitalityFromCache()
@@ -3488,7 +3494,11 @@ func (s *Store) updateReplicationGauges(ctx context.Context) error {
 			if metrics.Decommissioning {
 				// NB: Enqueue is disabled by default from here and throttled async if
 				// enabled.
-				rep.maybeEnqueueProblemRange(ctx, goNow, metrics.LeaseValid, metrics.Leaseholder)
+				shouldLog := logBudgetOnDecommissioningNudger > 0
+				if shouldLog {
+					logBudgetOnDecommissioningNudger--
+				}
+				rep.maybeEnqueueProblemRange(ctx, goNow, metrics.LeaseValid, metrics.Leaseholder, shouldLog)
 				decommissioningRangeCount++
 			}
 		}