Skip to content

Commit f328e00

Browse files
craig[bot]wenyihu6spilchen
committed
152787: kvserver: improve observability with decommission nudger r=tbg a=wenyihu6 Stacked on top of #152792 Resolves: #151847 Epic: none --- **kvserver: improve observability with decommission nudger** Previously, we added the decommissioning nudger which nudges the leaseholder replica of decommissioning ranges to enqueue themselves into the replicate queue for decommissioning. However, we are still observing extended decommission stall with the nudger enabled. Observability was limited, and we could not easily tell whether replicas were successfully enqueued or processed. This commit improves observability by adding four metrics to track the enqueue and processing results of the decommissioning nudger: ranges.decommissioning.nudger.{enqueue,process}.{success,failure}. --- **kvserver: add enqueue metrics to base queue** Previously, observability into base queue enqueuing was limited to pending queue length and process results. This commit adds enqueue-specific metrics for the replicate queue: - queue.replicate.enqueue.add: counts replicas successfully added to the queue - queue.replicate.enqueue.failedprecondition: counts replicas that failed the replicaCanBeProcessed precondition check - queue.replicate.enqueue.noaction: counts replicas skipped because ShouldQueue determined no action was needed - queue.replicate.enqueue.unexpectederror: counts replicas that were expected to be enqueued (ShouldQueue returned true or the caller attempted a direct enqueue) but failed due to unexpected errors --- **kvserver: move bq.enqueueAdd update to be outside of defer** Previously, we updated bq.enqueueAdd inside the defer statement of addInternal. This was incorrect because we may return queued = true for a replica already processing and was marked for requeue. That replica would later be requeued in finishProcessingReplica, incrementing the metric again, lead to double counting. --- **kvserver: test metrics in TestBaseQueueCallback* and TestReplicateQueueDecommissionScannerDisabled** his commit extends TestBaseQueueCallback* and TestReplicateQueueDecommissionScannerDisabled to also verify metric updates. 152984: sql/inspect: convert internal errors to inspect issues r=spilchen a=spilchen Previously, internal errors during index consistency checks would fail the entire job. Now these errors are converted to structured inspect issues with detailed context. Closes #148299 Release Notes: None Epic: None Co-authored-by: wenyihu6 <[email protected]> Co-authored-by: Matt Spilchen <[email protected]>
3 parents e06b5d1 + 28e1dc1 + ed8c021 commit f328e00

File tree

10 files changed

+338
-25
lines changed

10 files changed

+338
-25
lines changed

docs/generated/metrics/metrics.yaml

Lines changed: 69 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13903,6 +13903,38 @@ layers:
1390313903
unit: COUNT
1390413904
aggregation: AVG
1390513905
derivative: NON_NEGATIVE_DERIVATIVE
13906+
- name: queue.replicate.enqueue.add
13907+
exported_name: queue_replicate_enqueue_add
13908+
description: Number of replicas successfully added to the replicate queue
13909+
y_axis_label: Replicas
13910+
type: COUNTER
13911+
unit: COUNT
13912+
aggregation: AVG
13913+
derivative: NON_NEGATIVE_DERIVATIVE
13914+
- name: queue.replicate.enqueue.failedprecondition
13915+
exported_name: queue_replicate_enqueue_failedprecondition
13916+
description: Number of replicas that failed the precondition checks and were therefore not added to the replicate queue
13917+
y_axis_label: Replicas
13918+
type: COUNTER
13919+
unit: COUNT
13920+
aggregation: AVG
13921+
derivative: NON_NEGATIVE_DERIVATIVE
13922+
- name: queue.replicate.enqueue.noaction
13923+
exported_name: queue_replicate_enqueue_noaction
13924+
description: Number of replicas for which ShouldQueue determined no action was needed and were therefore not added to the replicate queue
13925+
y_axis_label: Replicas
13926+
type: COUNTER
13927+
unit: COUNT
13928+
aggregation: AVG
13929+
derivative: NON_NEGATIVE_DERIVATIVE
13930+
- name: queue.replicate.enqueue.unexpectederror
13931+
exported_name: queue_replicate_enqueue_unexpectederror
13932+
description: Number of replicas that were expected to be enqueued (ShouldQueue returned true or the caller decided to add to the replicate queue directly), but failed to be enqueued due to unexpected errors
13933+
y_axis_label: Replicas
13934+
type: COUNTER
13935+
unit: COUNT
13936+
aggregation: AVG
13937+
derivative: NON_NEGATIVE_DERIVATIVE
1390613938
- name: queue.replicate.nonvoterpromotions
1390713939
exported_name: queue_replicate_nonvoterpromotions
1390813940
description: Number of non-voters promoted to voters by the replicate queue
@@ -15342,10 +15374,46 @@ layers:
1534215374
unit: COUNT
1534315375
aggregation: AVG
1534415376
derivative: NON_NEGATIVE_DERIVATIVE
15377+
- name: ranges.decommissioning.nudger.enqueue.failure
15378+
exported_name: ranges_decommissioning_nudger_enqueue_failure
15379+
labeled_name: ranges.decommissioning.nudger.enqueue.failure
15380+
description: Number of ranges that failed to enqueue at the replicate queue
15381+
y_axis_label: Ranges
15382+
type: COUNTER
15383+
unit: COUNT
15384+
aggregation: AVG
15385+
derivative: NON_NEGATIVE_DERIVATIVE
15386+
- name: ranges.decommissioning.nudger.enqueue.success
15387+
exported_name: ranges_decommissioning_nudger_enqueue_success
15388+
labeled_name: ranges.decommissioning.nudger.enqueue.success
15389+
description: Number of ranges that were successfully enqueued by the decommisioning nudger
15390+
y_axis_label: Ranges
15391+
type: COUNTER
15392+
unit: COUNT
15393+
aggregation: AVG
15394+
derivative: NON_NEGATIVE_DERIVATIVE
1534515395
- name: ranges.decommissioning.nudger.not_leaseholder_or_invalid_lease
1534615396
exported_name: ranges_decommissioning_nudger_not_leaseholder_or_invalid_lease
1534715397
labeled_name: ranges.decommissioning.nudger.not_leaseholder_or_invalid_lease
15348-
description: Number of enqueues of a range for decommissioning by the decommissioning nudger that were not the leaseholder or had an invalid lease
15398+
description: Number of ranges that were not the leaseholder or had an invalid lease at the decommissioning nudger
15399+
y_axis_label: Ranges
15400+
type: COUNTER
15401+
unit: COUNT
15402+
aggregation: AVG
15403+
derivative: NON_NEGATIVE_DERIVATIVE
15404+
- name: ranges.decommissioning.nudger.process.failure
15405+
exported_name: ranges_decommissioning_nudger_process_failure
15406+
labeled_name: ranges.decommissioning.nudger.process.failure
15407+
description: Number of ranges enqueued by the decommissioning nudger that failed to process by the replicate queue
15408+
y_axis_label: Ranges
15409+
type: COUNTER
15410+
unit: COUNT
15411+
aggregation: AVG
15412+
derivative: NON_NEGATIVE_DERIVATIVE
15413+
- name: ranges.decommissioning.nudger.process.success
15414+
exported_name: ranges_decommissioning_nudger_process_success
15415+
labeled_name: ranges.decommissioning.nudger.process.success
15416+
description: Number of ranges enqueued by the decommissioning nudger that were successfully processed by the replicate queue
1534915417
y_axis_label: Ranges
1535015418
type: COUNTER
1535115419
unit: COUNT

pkg/kv/kvserver/metrics.go

Lines changed: 72 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -180,9 +180,37 @@ var (
180180
LabeledName: "ranges.decommissioning.nudger.enqueue",
181181
StaticLabels: metric.MakeLabelPairs(metric.LabelStatus, "enqueue"),
182182
}
183+
metaDecommissioningNudgerEnqueueSuccess = metric.Metadata{
184+
Name: "ranges.decommissioning.nudger.enqueue.success",
185+
Help: "Number of ranges that were successfully enqueued by the decommisioning nudger",
186+
Measurement: "Ranges",
187+
Unit: metric.Unit_COUNT,
188+
LabeledName: "ranges.decommissioning.nudger.enqueue.success",
189+
}
190+
metaDecommissioningNudgerEnqueueFailure = metric.Metadata{
191+
Name: "ranges.decommissioning.nudger.enqueue.failure",
192+
Help: "Number of ranges that failed to enqueue at the replicate queue",
193+
Measurement: "Ranges",
194+
Unit: metric.Unit_COUNT,
195+
LabeledName: "ranges.decommissioning.nudger.enqueue.failure",
196+
}
197+
metaDecommissioningNudgerProcessSuccess = metric.Metadata{
198+
Name: "ranges.decommissioning.nudger.process.success",
199+
Help: "Number of ranges enqueued by the decommissioning nudger that were successfully processed by the replicate queue",
200+
Measurement: "Ranges",
201+
Unit: metric.Unit_COUNT,
202+
LabeledName: "ranges.decommissioning.nudger.process.success",
203+
}
204+
metaDecommissioningNudgerProcessFailure = metric.Metadata{
205+
Name: "ranges.decommissioning.nudger.process.failure",
206+
Help: "Number of ranges enqueued by the decommissioning nudger that failed to process by the replicate queue",
207+
Measurement: "Ranges",
208+
Unit: metric.Unit_COUNT,
209+
LabeledName: "ranges.decommissioning.nudger.process.failure",
210+
}
183211
metaDecommissioningNudgerNotLeaseholderOrInvalidLease = metric.Metadata{
184212
Name: "ranges.decommissioning.nudger.not_leaseholder_or_invalid_lease",
185-
Help: "Number of enqueues of a range for decommissioning by the decommissioning nudger that were not the leaseholder or had an invalid lease",
213+
Help: "Number of ranges that were not the leaseholder or had an invalid lease at the decommissioning nudger",
186214
Measurement: "Ranges",
187215
Unit: metric.Unit_COUNT,
188216
LabeledName: "ranges.decommissioning.nudger.not_leaseholder_or_invalid_lease",
@@ -2144,6 +2172,33 @@ The messages are dropped to help these replicas to recover from I/O overload.`,
21442172
Measurement: "Processing Time",
21452173
Unit: metric.Unit_NANOSECONDS,
21462174
}
2175+
metaReplicateQueueEnqueueAdd = metric.Metadata{
2176+
Name: "queue.replicate.enqueue.add",
2177+
Help: "Number of replicas successfully added to the replicate queue",
2178+
Measurement: "Replicas",
2179+
Unit: metric.Unit_COUNT,
2180+
}
2181+
metaReplicateQueueEnqueueFailedPrecondition = metric.Metadata{
2182+
Name: "queue.replicate.enqueue.failedprecondition",
2183+
Help: "Number of replicas that failed the precondition checks and were therefore not added to the replicate " +
2184+
"queue",
2185+
Measurement: "Replicas",
2186+
Unit: metric.Unit_COUNT,
2187+
}
2188+
metaReplicateQueueEnqueueNoAction = metric.Metadata{
2189+
Name: "queue.replicate.enqueue.noaction",
2190+
Help: "Number of replicas for which ShouldQueue determined no action was needed and were therefore not " +
2191+
"added to the replicate queue",
2192+
Measurement: "Replicas",
2193+
Unit: metric.Unit_COUNT,
2194+
}
2195+
metaReplicateQueueEnqueueUnexpectedError = metric.Metadata{
2196+
Name: "queue.replicate.enqueue.unexpectederror",
2197+
Help: "Number of replicas that were expected to be enqueued (ShouldQueue returned true or the caller decided to " +
2198+
"add to the replicate queue directly), but failed to be enqueued due to unexpected errors",
2199+
Measurement: "Replicas",
2200+
Unit: metric.Unit_COUNT,
2201+
}
21472202
metaLeaseQueueSuccesses = metric.Metadata{
21482203
Name: "queue.lease.process.success",
21492204
Help: "Number of replicas successfully processed by the replica lease queue",
@@ -2892,6 +2947,10 @@ type StoreMetrics struct {
28922947

28932948
// Decommissioning nudger metrics.
28942949
DecommissioningNudgerEnqueue *metric.Counter
2950+
DecommissioningNudgerEnqueueSuccess *metric.Counter
2951+
DecommissioningNudgerEnqueueFailure *metric.Counter
2952+
DecommissioningNudgerProcessSuccess *metric.Counter
2953+
DecommissioningNudgerProcessFailure *metric.Counter
28952954
DecommissioningNudgerNotLeaseholderOrInvalidLease *metric.Counter
28962955

28972956
// Lease request metrics for successful and failed lease requests. These
@@ -3188,6 +3247,10 @@ type StoreMetrics struct {
31883247
ReplicaGCQueueFailures *metric.Counter
31893248
ReplicaGCQueuePending *metric.Gauge
31903249
ReplicaGCQueueProcessingNanos *metric.Counter
3250+
ReplicateQueueEnqueueAdd *metric.Counter
3251+
ReplicateQueueEnqueueFailedPrecondition *metric.Counter
3252+
ReplicateQueueEnqueueNoAction *metric.Counter
3253+
ReplicateQueueEnqueueUnexpectedError *metric.Counter
31913254
ReplicateQueueSuccesses *metric.Counter
31923255
ReplicateQueueFailures *metric.Counter
31933256
ReplicateQueuePending *metric.Gauge
@@ -3616,6 +3679,10 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {
36163679

36173680
// Decommissioning nuder metrics.
36183681
DecommissioningNudgerEnqueue: metric.NewCounter(metaDecommissioningNudgerEnqueue),
3682+
DecommissioningNudgerEnqueueSuccess: metric.NewCounter(metaDecommissioningNudgerEnqueueSuccess),
3683+
DecommissioningNudgerEnqueueFailure: metric.NewCounter(metaDecommissioningNudgerEnqueueFailure),
3684+
DecommissioningNudgerProcessSuccess: metric.NewCounter(metaDecommissioningNudgerProcessSuccess),
3685+
DecommissioningNudgerProcessFailure: metric.NewCounter(metaDecommissioningNudgerProcessFailure),
36193686
DecommissioningNudgerNotLeaseholderOrInvalidLease: metric.NewCounter(metaDecommissioningNudgerNotLeaseholderOrInvalidLease),
36203687

36213688
// Lease request metrics.
@@ -3978,6 +4045,10 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {
39784045
ReplicaGCQueueFailures: metric.NewCounter(metaReplicaGCQueueFailures),
39794046
ReplicaGCQueuePending: metric.NewGauge(metaReplicaGCQueuePending),
39804047
ReplicaGCQueueProcessingNanos: metric.NewCounter(metaReplicaGCQueueProcessingNanos),
4048+
ReplicateQueueEnqueueAdd: metric.NewCounter(metaReplicateQueueEnqueueAdd),
4049+
ReplicateQueueEnqueueFailedPrecondition: metric.NewCounter(metaReplicateQueueEnqueueFailedPrecondition),
4050+
ReplicateQueueEnqueueNoAction: metric.NewCounter(metaReplicateQueueEnqueueNoAction),
4051+
ReplicateQueueEnqueueUnexpectedError: metric.NewCounter(metaReplicateQueueEnqueueUnexpectedError),
39814052
ReplicateQueueSuccesses: metric.NewCounter(metaReplicateQueueSuccesses),
39824053
ReplicateQueueFailures: metric.NewCounter(metaReplicateQueueFailures),
39834054
ReplicateQueuePending: metric.NewGauge(metaReplicateQueuePending),

pkg/kv/kvserver/queue.go

Lines changed: 72 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -388,6 +388,20 @@ type queueConfig struct {
388388
processDestroyedReplicas bool
389389
// processTimeout returns the timeout for processing a replica.
390390
processTimeoutFunc queueProcessTimeoutFunc
391+
// enqueueAdd is a counter of replicas that were successfully added to the
392+
// queue.
393+
enqueueAdd *metric.Counter
394+
// enqueueFailedPrecondition is a counter of replicas that failed the
395+
// precondition checks and were therefore not added to the queue.
396+
enqueueFailedPrecondition *metric.Counter
397+
// enqueueNoAction is a counter of replicas that had ShouldQueue determine no
398+
// action was needed and were therefore not added to the queue.
399+
enqueueNoAction *metric.Counter
400+
// enqueueUnexpectedError is a counter of replicas that were expected to be
401+
// enqueued (either had ShouldQueue return true or the caller explicitly
402+
// requested to be added to the queue directly), but failed to be enqueued
403+
// during the enqueue process (such as Async was rated limited).
404+
enqueueUnexpectedError *metric.Counter
391405
// successes is a counter of replicas processed successfully.
392406
successes *metric.Counter
393407
// failures is a counter of replicas which failed processing.
@@ -733,16 +747,62 @@ func (bq *baseQueue) AddAsyncWithCallback(
733747
h.Add(ctx, repl, prio, cb)
734748
}); err != nil {
735749
cb.onEnqueueResult(-1 /*indexOnHeap*/, err)
750+
bq.updateMetricsOnEnqueueUnexpectedError()
736751
}
737752
}
738753

739754
// AddAsync adds the replica to the queue. Unlike MaybeAddAsync, it will wait
740755
// for other operations to finish instead of turning into a noop (because
741756
// unlikely MaybeAdd, Add is not subject to being called opportunistically).
742757
func (bq *baseQueue) AddAsync(ctx context.Context, repl replicaInQueue, prio float64) {
743-
_ = bq.Async(ctx, "Add", true /* wait */, func(ctx context.Context, h queueHelper) {
758+
if err := bq.Async(ctx, "Add", true /* wait */, func(ctx context.Context, h queueHelper) {
744759
h.Add(ctx, repl, prio, noopProcessCallback)
745-
})
760+
}); err != nil {
761+
// We don't update metrics in MaybeAddAsync because we don't know if the
762+
// replica should be queued at this point. We only count it as an unexpected
763+
// error when we're certain the replica should be enqueued. In this case,
764+
// the caller explicitly wants to add the replica to the queue directly, so
765+
// we do update the metrics on unexpected error.
766+
bq.updateMetricsOnEnqueueUnexpectedError()
767+
}
768+
}
769+
770+
// updateMetricsOnEnqueueFailedPrecondition updates the metrics when a replica
771+
// fails precondition checks (replicaCanBeProcessed) and should not be
772+
// considered for enqueueing. This may include cases where the replica does not
773+
// have a valid lease, is uninitialized, is destroyed, failed to retrieve span
774+
// conf reader, or unsplit ranges.
775+
func (bq *baseQueue) updateMetricsOnEnqueueFailedPrecondition() {
776+
if bq.enqueueFailedPrecondition != nil {
777+
bq.enqueueFailedPrecondition.Inc(1)
778+
}
779+
}
780+
781+
// updateMetricsOnEnqueueNoAction updates the metrics when shouldQueue
782+
// determines no action is needed and the replica is not added to the queue.
783+
func (bq *baseQueue) updateMetricsOnEnqueueNoAction() {
784+
if bq.enqueueNoAction != nil {
785+
bq.enqueueNoAction.Inc(1)
786+
}
787+
}
788+
789+
// updateMetricsOnEnqueueUnexpectedError updates the metrics when an unexpected
790+
// error occurs during enqueue operations. This should be called for replicas
791+
// that were expected to be enqueued (either had ShouldQueue return true or the
792+
// caller explicitly requested to be added to the queue directly), but failed to
793+
// be enqueued during the enqueue process (such as Async was rated limited).
794+
func (bq *baseQueue) updateMetricsOnEnqueueUnexpectedError() {
795+
if bq.enqueueUnexpectedError != nil {
796+
bq.enqueueUnexpectedError.Inc(1)
797+
}
798+
}
799+
800+
// updateMetricsOnEnqueueAdd updates the metrics when a replica is successfully
801+
// added to the queue.
802+
func (bq *baseQueue) updateMetricsOnEnqueueAdd() {
803+
if bq.enqueueAdd != nil {
804+
bq.enqueueAdd.Inc(1)
805+
}
746806
}
747807

748808
func (bq *baseQueue) maybeAdd(ctx context.Context, repl replicaInQueue, now hlc.ClockTimestamp) {
@@ -779,6 +839,7 @@ func (bq *baseQueue) maybeAdd(ctx context.Context, repl replicaInQueue, now hlc.
779839
// Load the system config if it's needed.
780840
confReader, err := bq.replicaCanBeProcessed(ctx, repl, false /* acquireLeaseIfNeeded */)
781841
if err != nil {
842+
bq.updateMetricsOnEnqueueFailedPrecondition()
782843
return
783844
}
784845

@@ -788,17 +849,20 @@ func (bq *baseQueue) maybeAdd(ctx context.Context, repl replicaInQueue, now hlc.
788849
realRepl, _ := repl.(*Replica)
789850
should, priority := bq.impl.shouldQueue(ctx, now, realRepl, confReader)
790851
if !should {
852+
bq.updateMetricsOnEnqueueNoAction()
791853
return
792854
}
793855

794856
extConf := bq.skipIfReplicaHasExternalFilesConfig
795857
if extConf != nil && extConf.Get(&bq.store.cfg.Settings.SV) {
796858
hasExternal, err := realRepl.HasExternalBytes()
797859
if err != nil {
860+
bq.updateMetricsOnEnqueueUnexpectedError()
798861
log.Dev.Warningf(ctx, "could not determine if %s has external bytes: %s", realRepl, err)
799862
return
800863
}
801864
if hasExternal {
865+
bq.updateMetricsOnEnqueueUnexpectedError()
802866
log.Dev.VInfof(ctx, 1, "skipping %s for %s because it has external bytes", bq.name, realRepl)
803867
return
804868
}
@@ -841,8 +905,10 @@ func (bq *baseQueue) addInternal(
841905
cb processCallback,
842906
) (added bool, err error) {
843907
defer func() {
908+
// INVARIANT: added => err == nil.
844909
if err != nil {
845910
cb.onEnqueueResult(-1 /* indexOnHeap */, err)
911+
bq.updateMetricsOnEnqueueUnexpectedError()
846912
}
847913
}()
848914
// NB: this is intentionally outside of bq.mu to avoid having to consider
@@ -935,6 +1001,10 @@ func (bq *baseQueue) addInternal(
9351001
default:
9361002
// No need to signal again.
9371003
}
1004+
// Note that we are bumping enqueueAdd here instead of during defer to avoid
1005+
// treating requeuing a processing replica as newly added. They will be
1006+
// re-added to the queue later which will double count them.
1007+
bq.updateMetricsOnEnqueueAdd()
9381008
// Note: it may already be dropped or dropped afterwards.
9391009
cb.onEnqueueResult(item.index /*indexOnHeap*/, nil)
9401010
return true, nil

0 commit comments

Comments
 (0)