Skip to content

Commit fef65e5

Browse files
committed
kvserver: improve observability with decommission nudger
Previously, we added the decommissioning nudger, which nudges the leaseholder replicas of decommissioning ranges to enqueue themselves into the replicate queue for decommissioning. However, we are still observing extended decommission stalls even with the nudger enabled. Observability was limited, and we could not easily tell whether replicas were successfully enqueued or processed. This commit improves observability by adding four metrics to track the enqueue and processing results of the decommissioning nudger: ranges.decommissioning.nudger.{enqueue,process}.{success,failure}.
1 parent 4b3c782 commit fef65e5

File tree

3 files changed

+78
-2
lines changed

3 files changed

+78
-2
lines changed

docs/generated/metrics/metrics.yaml

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15342,10 +15342,46 @@ layers:
1534215342
unit: COUNT
1534315343
aggregation: AVG
1534415344
derivative: NON_NEGATIVE_DERIVATIVE
15345+
- name: ranges.decommissioning.nudger.enqueue.failure
15346+
exported_name: ranges_decommissioning_nudger_enqueue_failure
15347+
labeled_name: ranges.decommissioning.nudger.enqueue.failure
15348+
description: Number of ranges that failed to enqueue at the replicate queue
15349+
y_axis_label: Ranges
15350+
type: COUNTER
15351+
unit: COUNT
15352+
aggregation: AVG
15353+
derivative: NON_NEGATIVE_DERIVATIVE
15354+
- name: ranges.decommissioning.nudger.enqueue.success
15355+
exported_name: ranges_decommissioning_nudger_enqueue_success
15356+
labeled_name: ranges.decommissioning.nudger.enqueue.success
15357+
description: Number of ranges that were successfully enqueued by the decommissioning nudger
15358+
y_axis_label: Ranges
15359+
type: COUNTER
15360+
unit: COUNT
15361+
aggregation: AVG
15362+
derivative: NON_NEGATIVE_DERIVATIVE
1534515363
- name: ranges.decommissioning.nudger.not_leaseholder_or_invalid_lease
1534615364
exported_name: ranges_decommissioning_nudger_not_leaseholder_or_invalid_lease
1534715365
labeled_name: ranges.decommissioning.nudger.not_leaseholder_or_invalid_lease
15348-
description: Number of enqueues of a range for decommissioning by the decommissioning nudger that were not the leaseholder or had an invalid lease
15366+
description: Number of ranges that were not the leaseholder or had an invalid lease at the decommissioning nudger
15367+
y_axis_label: Ranges
15368+
type: COUNTER
15369+
unit: COUNT
15370+
aggregation: AVG
15371+
derivative: NON_NEGATIVE_DERIVATIVE
15372+
- name: ranges.decommissioning.nudger.process.failure
15373+
exported_name: ranges_decommissioning_nudger_process_failure
15374+
labeled_name: ranges.decommissioning.nudger.process.failure
15375+
description: Number of ranges enqueued by the decommissioning nudger that failed to process by the replicate queue
15376+
y_axis_label: Ranges
15377+
type: COUNTER
15378+
unit: COUNT
15379+
aggregation: AVG
15380+
derivative: NON_NEGATIVE_DERIVATIVE
15381+
- name: ranges.decommissioning.nudger.process.success
15382+
exported_name: ranges_decommissioning_nudger_process_success
15383+
labeled_name: ranges.decommissioning.nudger.process.success
15384+
description: Number of ranges enqueued by the decommissioning nudger that were successfully processed by the replicate queue
1534915385
y_axis_label: Ranges
1535015386
type: COUNTER
1535115387
unit: COUNT

pkg/kv/kvserver/metrics.go

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -180,9 +180,37 @@ var (
180180
LabeledName: "ranges.decommissioning.nudger.enqueue",
181181
StaticLabels: metric.MakeLabelPairs(metric.LabelStatus, "enqueue"),
182182
}
183+
metaDecommissioningNudgerEnqueueSuccess = metric.Metadata{
184+
Name: "ranges.decommissioning.nudger.enqueue.success",
185+
Help: "Number of ranges that were successfully enqueued by the decommissioning nudger",
186+
Measurement: "Ranges",
187+
Unit: metric.Unit_COUNT,
188+
LabeledName: "ranges.decommissioning.nudger.enqueue.success",
189+
}
190+
metaDecommissioningNudgerEnqueueFailure = metric.Metadata{
191+
Name: "ranges.decommissioning.nudger.enqueue.failure",
192+
Help: "Number of ranges that failed to enqueue at the replicate queue",
193+
Measurement: "Ranges",
194+
Unit: metric.Unit_COUNT,
195+
LabeledName: "ranges.decommissioning.nudger.enqueue.failure",
196+
}
197+
metaDecommissioningNudgerProcessSuccess = metric.Metadata{
198+
Name: "ranges.decommissioning.nudger.process.success",
199+
Help: "Number of ranges enqueued by the decommissioning nudger that were successfully processed by the replicate queue",
200+
Measurement: "Ranges",
201+
Unit: metric.Unit_COUNT,
202+
LabeledName: "ranges.decommissioning.nudger.process.success",
203+
}
204+
metaDecommissioningNudgerProcessFailure = metric.Metadata{
205+
Name: "ranges.decommissioning.nudger.process.failure",
206+
Help: "Number of ranges enqueued by the decommissioning nudger that failed to process by the replicate queue",
207+
Measurement: "Ranges",
208+
Unit: metric.Unit_COUNT,
209+
LabeledName: "ranges.decommissioning.nudger.process.failure",
210+
}
183211
metaDecommissioningNudgerNotLeaseholderOrInvalidLease = metric.Metadata{
184212
Name: "ranges.decommissioning.nudger.not_leaseholder_or_invalid_lease",
185-
Help: "Number of enqueues of a range for decommissioning by the decommissioning nudger that were not the leaseholder or had an invalid lease",
213+
Help: "Number of ranges that were not the leaseholder or had an invalid lease at the decommissioning nudger",
186214
Measurement: "Ranges",
187215
Unit: metric.Unit_COUNT,
188216
LabeledName: "ranges.decommissioning.nudger.not_leaseholder_or_invalid_lease",
@@ -2892,6 +2920,10 @@ type StoreMetrics struct {
28922920

28932921
// Decommissioning nudger metrics.
28942922
DecommissioningNudgerEnqueue *metric.Counter
2923+
DecommissioningNudgerEnqueueSuccess *metric.Counter
2924+
DecommissioningNudgerEnqueueFailure *metric.Counter
2925+
DecommissioningNudgerProcessSuccess *metric.Counter
2926+
DecommissioningNudgerProcessFailure *metric.Counter
28952927
DecommissioningNudgerNotLeaseholderOrInvalidLease *metric.Counter
28962928

28972929
// Lease request metrics for successful and failed lease requests. These
@@ -3616,6 +3648,10 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {
36163648

36173649
// Decommissioning nudger metrics.
36183650
DecommissioningNudgerEnqueue: metric.NewCounter(metaDecommissioningNudgerEnqueue),
3651+
DecommissioningNudgerEnqueueSuccess: metric.NewCounter(metaDecommissioningNudgerEnqueueSuccess),
3652+
DecommissioningNudgerEnqueueFailure: metric.NewCounter(metaDecommissioningNudgerEnqueueFailure),
3653+
DecommissioningNudgerProcessSuccess: metric.NewCounter(metaDecommissioningNudgerProcessSuccess),
3654+
DecommissioningNudgerProcessFailure: metric.NewCounter(metaDecommissioningNudgerProcessFailure),
36193655
DecommissioningNudgerNotLeaseholderOrInvalidLease: metric.NewCounter(metaDecommissioningNudgerNotLeaseholderOrInvalidLease),
36203656

36213657
// Lease request metrics.

pkg/kv/kvserver/replica.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2969,18 +2969,22 @@ func (r *Replica) maybeEnqueueProblemRange(
29692969
if err != nil {
29702970
log.KvDistribution.VInfof(ctx, level,
29712971
"decommissioning nudger failed to enqueue range %v due to %v", r.Desc(), err)
2972+
r.store.metrics.DecommissioningNudgerEnqueueFailure.Inc(1)
29722973
} else {
29732974
log.KvDistribution.VInfof(ctx, level,
29742975
"decommissioning nudger successfully enqueued range %v at index %d", r.Desc(), indexOnHeap)
2976+
r.store.metrics.DecommissioningNudgerEnqueueSuccess.Inc(1)
29752977
}
29762978
},
29772979
onProcessResult: func(err error) {
29782980
if err != nil {
29792981
log.KvDistribution.VInfof(ctx, level,
29802982
"decommissioning nudger failed to process range %v due to %v", r.Desc(), err)
2983+
r.store.metrics.DecommissioningNudgerProcessFailure.Inc(1)
29812984
} else {
29822985
log.KvDistribution.VInfof(ctx, level,
29832986
"decommissioning nudger successfully processed replica %s", r.Desc())
2987+
r.store.metrics.DecommissioningNudgerProcessSuccess.Inc(1)
29842988
}
29852989
},
29862990
})

0 commit comments

Comments
 (0)