Skip to content

Commit 02dfa8f

Browse files
committed
kvserver: improve observability with decommission nudger
Previously, we added the decommissioning nudger, which nudges the leaseholder replicas of decommissioning ranges to enqueue themselves into the replicate queue for decommissioning. However, we are still observing extended decommission stalls with the nudger enabled. Observability was limited, and we could not easily tell whether replicas were successfully enqueued or processed. This commit improves observability by adding four metrics to track the enqueue and processing results of the decommissioning nudger: ranges.decommissioning.nudger.{enqueue,process}.{success,failure}.
1 parent 131cc9e commit 02dfa8f

File tree

3 files changed

+42
-2
lines changed

3 files changed

+42
-2
lines changed

docs/generated/metrics/metrics.html

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -598,7 +598,11 @@
598598
<tr><td>STORAGE</td><td>ranges</td><td>Number of ranges</td><td>Ranges</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
599599
<tr><td>STORAGE</td><td>ranges.decommissioning</td><td>Number of ranges with at least one replica on a decommissioning node</td><td>Ranges</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
600600
<tr><td>STORAGE</td><td>ranges.decommissioning.nudger.enqueue</td><td>Number of attempted enqueues of a range for decommissioning by the decommissioning nudger. Note: This metric tracks when the nudger attempts to enqueue, but the replica might not end up being enqueued by the priority queue due to various filtering or failure conditions.</td><td>Ranges</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
601-
<tr><td>STORAGE</td><td>ranges.decommissioning.nudger.not_leaseholder_or_invalid_lease</td><td>Number of enqueues of a range for decommissioning by the decommissioning nudger that were not the leaseholder or had an invalid lease</td><td>Ranges</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
601+
<tr><td>STORAGE</td><td>ranges.decommissioning.nudger.enqueue.failure</td><td>Number of ranges that failed to enqueue at the replicate queue</td><td>Ranges</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
602+
<tr><td>STORAGE</td><td>ranges.decommissioning.nudger.enqueue.success</td><td>Number of ranges that were successfully enqueued by the decommissioning nudger</td><td>Ranges</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
603+
<tr><td>STORAGE</td><td>ranges.decommissioning.nudger.not_leaseholder_or_invalid_lease</td><td>Number of ranges that were not the leaseholder or had an invalid lease at the decommissioning nudger</td><td>Ranges</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
604+
<tr><td>STORAGE</td><td>ranges.decommissioning.nudger.process.failure</td><td>Number of ranges enqueued by the decommissioning nudger that failed to process by the replicate queue</td><td>Ranges</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
605+
<tr><td>STORAGE</td><td>ranges.decommissioning.nudger.process.success</td><td>Number of ranges enqueued by the decommissioning nudger that were successfully processed by the replicate queue</td><td>Ranges</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
602606
<tr><td>STORAGE</td><td>ranges.overreplicated</td><td>Number of ranges with more live replicas than the replication target</td><td>Ranges</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
603607
<tr><td>STORAGE</td><td>ranges.unavailable</td><td>Number of ranges with fewer live replicas than needed for quorum</td><td>Ranges</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
604608
<tr><td>STORAGE</td><td>ranges.underreplicated</td><td>Number of ranges with fewer live replicas than the replication target</td><td>Ranges</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>

pkg/kv/kvserver/metrics.go

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,9 +149,33 @@ var (
149149
Measurement: "Ranges",
150150
Unit: metric.Unit_COUNT,
151151
}
152+
metaDecommissioningNudgerEnqueueSuccess = metric.Metadata{
153+
Name: "ranges.decommissioning.nudger.enqueue.success",
154+
Help: "Number of ranges that were successfully enqueued by the decommisioning nudger",
155+
Measurement: "Ranges",
156+
Unit: metric.Unit_COUNT,
157+
}
158+
metaDecommissioningNudgerEnqueueFailure = metric.Metadata{
159+
Name: "ranges.decommissioning.nudger.enqueue.failure",
160+
Help: "Number of ranges that failed to enqueue at the replicate queue",
161+
Measurement: "Ranges",
162+
Unit: metric.Unit_COUNT,
163+
}
164+
metaDecommissioningNudgerProcessSuccess = metric.Metadata{
165+
Name: "ranges.decommissioning.nudger.process.success",
166+
Help: "Number of ranges enqueued by the decommissioning nudger that were successfully processed by the replicate queue",
167+
Measurement: "Ranges",
168+
Unit: metric.Unit_COUNT,
169+
}
170+
metaDecommissioningNudgerProcessFailure = metric.Metadata{
171+
Name: "ranges.decommissioning.nudger.process.failure",
172+
Help: "Number of ranges enqueued by the decommissioning nudger that failed to process by the replicate queue",
173+
Measurement: "Ranges",
174+
Unit: metric.Unit_COUNT,
175+
}
152176
metaDecommissioningNudgerNotLeaseholderOrInvalidLease = metric.Metadata{
153177
Name: "ranges.decommissioning.nudger.not_leaseholder_or_invalid_lease",
154-
Help: "Number of enqueues of a range for decommissioning by the decommissioning nudger that were not the leaseholder or had an invalid lease",
178+
Help: "Number of ranges that were not the leaseholder or had an invalid lease at the decommissioning nudger",
155179
Measurement: "Ranges",
156180
Unit: metric.Unit_COUNT,
157181
}
@@ -2653,6 +2677,10 @@ type StoreMetrics struct {
26532677

26542678
// Decommissioning nudger metrics.
26552679
DecommissioningNudgerEnqueue *metric.Counter
2680+
DecommissioningNudgerEnqueueSuccess *metric.Counter
2681+
DecommissioningNudgerEnqueueFailure *metric.Counter
2682+
DecommissioningNudgerProcessSuccess *metric.Counter
2683+
DecommissioningNudgerProcessFailure *metric.Counter
26562684
DecommissioningNudgerNotLeaseholderOrInvalidLease *metric.Counter
26572685

26582686
// Lease request metrics for successful and failed lease requests. These
@@ -3362,6 +3390,10 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {
33623390

33633391
// Decommissioning nudger metrics.
33643392
DecommissioningNudgerEnqueue: metric.NewCounter(metaDecommissioningNudgerEnqueue),
3393+
DecommissioningNudgerEnqueueSuccess: metric.NewCounter(metaDecommissioningNudgerEnqueueSuccess),
3394+
DecommissioningNudgerEnqueueFailure: metric.NewCounter(metaDecommissioningNudgerEnqueueFailure),
3395+
DecommissioningNudgerProcessSuccess: metric.NewCounter(metaDecommissioningNudgerProcessSuccess),
3396+
DecommissioningNudgerProcessFailure: metric.NewCounter(metaDecommissioningNudgerProcessFailure),
33653397
DecommissioningNudgerNotLeaseholderOrInvalidLease: metric.NewCounter(metaDecommissioningNudgerNotLeaseholderOrInvalidLease),
33663398

33673399
// Lease request metrics.

pkg/kv/kvserver/replica.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2618,18 +2618,22 @@ func (r *Replica) maybeEnqueueProblemRange(
26182618
if err != nil {
26192619
log.KvDistribution.VInfof(ctx, level,
26202620
"decommissioning nudger failed to enqueue range %v due to %v", r.Desc(), err)
2621+
r.store.metrics.DecommissioningNudgerEnqueueFailure.Inc(1)
26212622
} else {
26222623
log.KvDistribution.VInfof(ctx, level,
26232624
"decommissioning nudger successfully enqueued range %v at index %d", r.Desc(), indexOnHeap)
2625+
r.store.metrics.DecommissioningNudgerEnqueueSuccess.Inc(1)
26242626
}
26252627
},
26262628
onProcessResult: func(err error) {
26272629
if err != nil {
26282630
log.KvDistribution.VInfof(ctx, level,
26292631
"decommissioning nudger failed to process range %v due to %v", r.Desc(), err)
2632+
r.store.metrics.DecommissioningNudgerProcessFailure.Inc(1)
26302633
} else {
26312634
log.KvDistribution.VInfof(ctx, level,
26322635
"decommissioning nudger successfully processed replica %s", r.Desc())
2636+
r.store.metrics.DecommissioningNudgerProcessSuccess.Inc(1)
26332637
}
26342638
},
26352639
})

0 commit comments

Comments
 (0)