Skip to content

Commit efcf9ce

Browse files
committed
kvserver: improve observability with decommission nudger
Previously, we added the decommissioning nudger, which nudges the leaseholder replicas of decommissioning ranges to enqueue themselves into the replicate queue for decommissioning. However, we are still observing extended decommission stalls with the nudger enabled. Observability was limited, and we could not easily tell whether replicas were successfully enqueued or processed. This commit improves observability by adding four metrics to track the enqueue and processing results of the decommissioning nudger: ranges.decommissioning.nudger.{enqueue,process}.{success,failure}.
1 parent a9194dc commit efcf9ce

File tree

3 files changed

+46
-2
lines changed

3 files changed

+46
-2
lines changed

docs/generated/metrics/metrics.html

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -582,7 +582,11 @@
582582
<tr><td>STORAGE</td><td>ranges</td><td>Number of ranges</td><td>Ranges</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
583583
<tr><td>STORAGE</td><td>ranges.decommissioning</td><td>Number of ranges with at lease one replica on a decommissioning node</td><td>Ranges</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
584584
<tr><td>STORAGE</td><td>ranges.decommissioning.nudger.enqueue</td><td>Number of enqueued enqueues of a range for decommissioning by the decommissioning nudger. Note: This metric tracks when the nudger attempts to enqueue, but the replica might not end up being enqueued by the priority queue due to various filtering or failure conditions.</td><td>Ranges</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
585-
<tr><td>STORAGE</td><td>ranges.decommissioning.nudger.not_leaseholder_or_invalid_lease</td><td>Number of enqueues of a range for decommissioning by the decommissioning nudger that were not the leaseholder or had an invalid lease</td><td>Ranges</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
585+
<tr><td>STORAGE</td><td>ranges.decommissioning.nudger.enqueue.failure</td><td>Number of ranges that failed to enqueue at the replicate queue</td><td>Ranges</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
586+
<tr><td>STORAGE</td><td>ranges.decommissioning.nudger.enqueue.success</td><td>Number of ranges that were successfully enqueued by the decommisioning nudger</td><td>Ranges</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
587+
<tr><td>STORAGE</td><td>ranges.decommissioning.nudger.not_leaseholder_or_invalid_lease</td><td>Number of ranges that were not the leaseholder or had an invalid lease at the decommissioning nudger</td><td>Ranges</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
588+
<tr><td>STORAGE</td><td>ranges.decommissioning.nudger.process.failure</td><td>Number of ranges enqueued by the decommissioning nudger that failed to process by the replicate queue</td><td>Ranges</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
589+
<tr><td>STORAGE</td><td>ranges.decommissioning.nudger.process.success</td><td>Number of ranges enqueued by the decommissioning nudger that were successfully processed by the replicate queue</td><td>Ranges</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
586590
<tr><td>STORAGE</td><td>ranges.overreplicated</td><td>Number of ranges with more live replicas than the replication target</td><td>Ranges</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
587591
<tr><td>STORAGE</td><td>ranges.unavailable</td><td>Number of ranges with fewer live replicas than needed for quorum</td><td>Ranges</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
588592
<tr><td>STORAGE</td><td>ranges.underreplicated</td><td>Number of ranges with fewer live replicas than the replication target</td><td>Ranges</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>

pkg/kv/kvserver/metrics.go

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -161,9 +161,37 @@ var (
161161
Measurement: "Ranges",
162162
Unit: metric.Unit_COUNT,
163163
}
164+
metaDecommissioningNudgerEnqueueSuccess = metric.Metadata{
165+
Name: "ranges.decommissioning.nudger.enqueue.success",
166+
Help: "Number of ranges that were successfully enqueued by the decommisioning nudger",
167+
Measurement: "Ranges",
168+
Unit: metric.Unit_COUNT,
169+
LabeledName: "ranges.decommissioning.nudger.enqueue.success",
170+
}
171+
metaDecommissioningNudgerEnqueueFailure = metric.Metadata{
172+
Name: "ranges.decommissioning.nudger.enqueue.failure",
173+
Help: "Number of ranges that failed to enqueue at the replicate queue",
174+
Measurement: "Ranges",
175+
Unit: metric.Unit_COUNT,
176+
LabeledName: "ranges.decommissioning.nudger.enqueue.failure",
177+
}
178+
metaDecommissioningNudgerProcessSuccess = metric.Metadata{
179+
Name: "ranges.decommissioning.nudger.process.success",
180+
Help: "Number of ranges enqueued by the decommissioning nudger that were successfully processed by the replicate queue",
181+
Measurement: "Ranges",
182+
Unit: metric.Unit_COUNT,
183+
LabeledName: "ranges.decommissioning.nudger.process.success",
184+
}
185+
metaDecommissioningNudgerProcessFailure = metric.Metadata{
186+
Name: "ranges.decommissioning.nudger.process.failure",
187+
Help: "Number of ranges enqueued by the decommissioning nudger that failed to process by the replicate queue",
188+
Measurement: "Ranges",
189+
Unit: metric.Unit_COUNT,
190+
LabeledName: "ranges.decommissioning.nudger.process.failure",
191+
}
164192
metaDecommissioningNudgerNotLeaseholderOrInvalidLease = metric.Metadata{
165193
Name: "ranges.decommissioning.nudger.not_leaseholder_or_invalid_lease",
166-
Help: "Number of enqueues of a range for decommissioning by the decommissioning nudger that were not the leaseholder or had an invalid lease",
194+
Help: "Number of ranges that were not the leaseholder or had an invalid lease at the decommissioning nudger",
167195
Measurement: "Ranges",
168196
Unit: metric.Unit_COUNT,
169197
}
@@ -2679,6 +2707,10 @@ type StoreMetrics struct {
26792707

26802708
// Decommissioning nudger metrics.
26812709
DecommissioningNudgerEnqueue *metric.Counter
2710+
DecommissioningNudgerEnqueueSuccess *metric.Counter
2711+
DecommissioningNudgerEnqueueFailure *metric.Counter
2712+
DecommissioningNudgerProcessSuccess *metric.Counter
2713+
DecommissioningNudgerProcessFailure *metric.Counter
26822714
DecommissioningNudgerNotLeaseholderOrInvalidLease *metric.Counter
26832715

26842716
// Lease request metrics for successful and failed lease requests. These
@@ -3392,6 +3424,10 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {
33923424

33933425
// Decommissioning nudger metrics.
33943426
DecommissioningNudgerEnqueue: metric.NewCounter(metaDecommissioningNudgerEnqueue),
3427+
DecommissioningNudgerEnqueueSuccess: metric.NewCounter(metaDecommissioningNudgerEnqueueSuccess),
3428+
DecommissioningNudgerEnqueueFailure: metric.NewCounter(metaDecommissioningNudgerEnqueueFailure),
3429+
DecommissioningNudgerProcessSuccess: metric.NewCounter(metaDecommissioningNudgerProcessSuccess),
3430+
DecommissioningNudgerProcessFailure: metric.NewCounter(metaDecommissioningNudgerProcessFailure),
33953431
DecommissioningNudgerNotLeaseholderOrInvalidLease: metric.NewCounter(metaDecommissioningNudgerNotLeaseholderOrInvalidLease),
33963432

33973433
// Lease request metrics.

pkg/kv/kvserver/replica.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2918,18 +2918,22 @@ func (r *Replica) maybeEnqueueProblemRange(
29182918
if err != nil {
29192919
log.KvDistribution.VInfof(ctx, level,
29202920
"decommissioning nudger failed to enqueue range %v due to %v", r.Desc(), err)
2921+
r.store.metrics.DecommissioningNudgerEnqueueFailure.Inc(1)
29212922
} else {
29222923
log.KvDistribution.VInfof(ctx, level,
29232924
"decommissioning nudger successfully enqueued range %v at index %d", r.Desc(), indexOnHeap)
2925+
r.store.metrics.DecommissioningNudgerEnqueueSuccess.Inc(1)
29242926
}
29252927
},
29262928
onProcessResult: func(err error) {
29272929
if err != nil {
29282930
log.KvDistribution.VInfof(ctx, level,
29292931
"decommissioning nudger failed to process range %v due to %v", r.Desc(), err)
2932+
r.store.metrics.DecommissioningNudgerProcessFailure.Inc(1)
29302933
} else {
29312934
log.KvDistribution.VInfof(ctx, level,
29322935
"decommissioning nudger successfully processed replica %s", r.Desc())
2936+
r.store.metrics.DecommissioningNudgerProcessSuccess.Inc(1)
29332937
}
29342938
},
29352939
})

0 commit comments

Comments
 (0)