Skip to content

Commit a93cbd8

Browse files
committed
kvserver: delete per action priority inversion metrics
This commit removes the per-action priority inversion metrics because of their high cardinality. We already have logging in place, which should provide sufficient observability. For now, the cases we care about most are priority inversions that end in a consider-rebalance action and those that lead to requeuing.
1 parent 7fcad47 commit a93cbd8

File tree

3 files changed

+7
-179
lines changed

3 files changed

+7
-179
lines changed

docs/generated/metrics/metrics.html

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -426,20 +426,6 @@
426426
<tr><td>STORAGE</td><td>queue.replicate.enqueue.unexpectederror</td><td>Number of replicas that were expected to be enqueued (ShouldQueue returned true or the caller decided to add to the replicate queue directly), but failed to be enqueued due to unexpected errors</td><td>Replicas</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
427427
<tr><td>STORAGE</td><td>queue.replicate.nonvoterpromotions</td><td>Number of non-voters promoted to voters by the replicate queue</td><td>Promotions of Non Voters to Voters</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
428428
<tr><td>STORAGE</td><td>queue.replicate.pending</td><td>Number of pending replicas in the replicate queue</td><td>Replicas</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
429-
<tr><td>STORAGE</td><td>queue.replicate.priority_inversion.addnonvoter</td><td>Number of priority inversions in the replicate queue that resulted in add non-voter action during processing</td><td>Replicas</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
430-
<tr><td>STORAGE</td><td>queue.replicate.priority_inversion.addvoter</td><td>Number of priority inversions in the replicate queue that resulted in add voter action during processing</td><td>Replicas</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
431-
<tr><td>STORAGE</td><td>queue.replicate.priority_inversion.considerrebalance</td><td>Number of priority inversions in the replicate queue that resulted in consider rebalance action during processing</td><td>Replicas</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
432-
<tr><td>STORAGE</td><td>queue.replicate.priority_inversion.noop</td><td>Number of priority inversions in the replicate queue that resulted in noop action during processing</td><td>Replicas</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
433-
<tr><td>STORAGE</td><td>queue.replicate.priority_inversion.rangeunavailable</td><td>Number of priority inversions in the replicate queue that resulted in range unavailable action during processing</td><td>Replicas</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
434-
<tr><td>STORAGE</td><td>queue.replicate.priority_inversion.removedeadnonvoter</td><td>Number of priority inversions in the replicate queue that resulted in remove dead non-voter action during processing</td><td>Replicas</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
435-
<tr><td>STORAGE</td><td>queue.replicate.priority_inversion.removedeadvoter</td><td>Number of priority inversions in the replicate queue that resulted in remove dead voter action during processing</td><td>Replicas</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
436-
<tr><td>STORAGE</td><td>queue.replicate.priority_inversion.removedecommissioningnonvoter</td><td>Number of priority inversions in the replicate queue that resulted in remove decommissioning non-voter action during processing</td><td>Replicas</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
437-
<tr><td>STORAGE</td><td>queue.replicate.priority_inversion.removedecommissioningvoter</td><td>Number of priority inversions in the replicate queue that resulted in remove decommissioning voter action during processing</td><td>Replicas</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
438-
<tr><td>STORAGE</td><td>queue.replicate.priority_inversion.removenonvoter</td><td>Number of priority inversions in the replicate queue that resulted in remove non-voter action during processing</td><td>Replicas</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
439-
<tr><td>STORAGE</td><td>queue.replicate.priority_inversion.removevoter</td><td>Number of priority inversions in the replicate queue that resulted in remove voter action during processing</td><td>Replicas</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
440-
<tr><td>STORAGE</td><td>queue.replicate.priority_inversion.replacedeadnonvoter</td><td>Number of priority inversions in the replicate queue that resulted in replace dead non-voter action during processing</td><td>Replicas</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
441-
<tr><td>STORAGE</td><td>queue.replicate.priority_inversion.replacedecommissioningnonvoter</td><td>Number of priority inversions in the replicate queue that resulted in replace decommissioning non-voter action during processing</td><td>Replicas</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
442-
<tr><td>STORAGE</td><td>queue.replicate.priority_inversion.replacedecommissioningvoter</td><td>Number of priority inversions in the replicate queue that resulted in replace decommissioning voter action during processing</td><td>Replicas</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
443429
<tr><td>STORAGE</td><td>queue.replicate.priority_inversion.requeue</td><td>Number of priority inversions in the replicate queue that resulted in requeuing of the replicas. A priority inversion occurs when the priority at processing time ends up being lower than at enqueue time. When the priority has changed from a high priority repair action to rebalance, the change is requeued to avoid unfairness.</td><td>Replicas</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
444430
<tr><td>STORAGE</td><td>queue.replicate.priority_inversion.total</td><td>Total number of priority inversions in the replicate queue. A priority inversion occurs when the priority at processing time ends up being lower than at enqueue time</td><td>Replicas</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
445431
<tr><td>STORAGE</td><td>queue.replicate.process.failure</td><td>Number of replicas which failed processing in the replicate queue</td><td>Replicas</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>

pkg/kv/kvserver/replicate_queue.go

Lines changed: 6 additions & 162 deletions
Original file line numberDiff line numberDiff line change
@@ -335,90 +335,6 @@ var (
335335
Measurement: "Replicas",
336336
Unit: metric.Unit_COUNT,
337337
}
338-
metaReplicateQueuePriorityInversionForAddVoterCount = metric.Metadata{
339-
Name: "queue.replicate.priority_inversion.addvoter",
340-
Help: "Number of priority inversions in the replicate queue that resulted in add voter action during processing",
341-
Measurement: "Replicas",
342-
Unit: metric.Unit_COUNT,
343-
}
344-
metaReplicateQueuePriorityInversionForReplaceDecommissioningVoterCount = metric.Metadata{
345-
Name: "queue.replicate.priority_inversion.replacedecommissioningvoter",
346-
Help: "Number of priority inversions in the replicate queue that resulted in replace decommissioning voter action during processing",
347-
Measurement: "Replicas",
348-
Unit: metric.Unit_COUNT,
349-
}
350-
metaReplicateQueuePriorityInversionForRemoveDeadVoterCount = metric.Metadata{
351-
Name: "queue.replicate.priority_inversion.removedeadvoter",
352-
Help: "Number of priority inversions in the replicate queue that resulted in remove dead voter action during processing",
353-
Measurement: "Replicas",
354-
Unit: metric.Unit_COUNT,
355-
}
356-
metaReplicateQueuePriorityInversionForRemoveDecommissioningVoterCount = metric.Metadata{
357-
Name: "queue.replicate.priority_inversion.removedecommissioningvoter",
358-
Help: "Number of priority inversions in the replicate queue that resulted in remove decommissioning voter action during processing",
359-
Measurement: "Replicas",
360-
Unit: metric.Unit_COUNT,
361-
}
362-
metaReplicateQueuePriorityInversionForRemoveVoterCount = metric.Metadata{
363-
Name: "queue.replicate.priority_inversion.removevoter",
364-
Help: "Number of priority inversions in the replicate queue that resulted in remove voter action during processing",
365-
Measurement: "Replicas",
366-
Unit: metric.Unit_COUNT,
367-
}
368-
metaReplicateQueuePriorityInversionForReplaceDeadNonVoterCount = metric.Metadata{
369-
Name: "queue.replicate.priority_inversion.replacedeadnonvoter",
370-
Help: "Number of priority inversions in the replicate queue that resulted in replace dead non-voter action during processing",
371-
Measurement: "Replicas",
372-
Unit: metric.Unit_COUNT,
373-
}
374-
metaReplicateQueuePriorityInversionForAddNonVoterCount = metric.Metadata{
375-
Name: "queue.replicate.priority_inversion.addnonvoter",
376-
Help: "Number of priority inversions in the replicate queue that resulted in add non-voter action during processing",
377-
Measurement: "Replicas",
378-
Unit: metric.Unit_COUNT,
379-
}
380-
metaReplicateQueuePriorityInversionForReplaceDecommissioningNonVoterCount = metric.Metadata{
381-
Name: "queue.replicate.priority_inversion.replacedecommissioningnonvoter",
382-
Help: "Number of priority inversions in the replicate queue that resulted in replace decommissioning non-voter action during processing",
383-
Measurement: "Replicas",
384-
Unit: metric.Unit_COUNT,
385-
}
386-
metaReplicateQueuePriorityInversionForRemoveDeadNonVoterCount = metric.Metadata{
387-
Name: "queue.replicate.priority_inversion.removedeadnonvoter",
388-
Help: "Number of priority inversions in the replicate queue that resulted in remove dead non-voter action during processing",
389-
Measurement: "Replicas",
390-
Unit: metric.Unit_COUNT,
391-
}
392-
metaReplicateQueuePriorityInversionForRemoveDecommissioningNonVoterCount = metric.Metadata{
393-
Name: "queue.replicate.priority_inversion.removedecommissioningnonvoter",
394-
Help: "Number of priority inversions in the replicate queue that resulted in remove decommissioning non-voter action during processing",
395-
Measurement: "Replicas",
396-
Unit: metric.Unit_COUNT,
397-
}
398-
metaReplicateQueuePriorityInversionForRemoveNonVoterCount = metric.Metadata{
399-
Name: "queue.replicate.priority_inversion.removenonvoter",
400-
Help: "Number of priority inversions in the replicate queue that resulted in remove non-voter action during processing",
401-
Measurement: "Replicas",
402-
Unit: metric.Unit_COUNT,
403-
}
404-
metaReplicateQueuePriorityInversionForConsiderRebalance = metric.Metadata{
405-
Name: "queue.replicate.priority_inversion.considerrebalance",
406-
Help: "Number of priority inversions in the replicate queue that resulted in consider rebalance action during processing",
407-
Measurement: "Replicas",
408-
Unit: metric.Unit_COUNT,
409-
}
410-
metaReplicateQueuePriorityInversionForRangeUnavailable = metric.Metadata{
411-
Name: "queue.replicate.priority_inversion.rangeunavailable",
412-
Help: "Number of priority inversions in the replicate queue that resulted in range unavailable action during processing",
413-
Measurement: "Replicas",
414-
Unit: metric.Unit_COUNT,
415-
}
416-
metaReplicateQueuePriorityInversionForNoop = metric.Metadata{
417-
Name: "queue.replicate.priority_inversion.noop",
418-
Help: "Number of priority inversions in the replicate queue that resulted in noop action during processing",
419-
Measurement: "Replicas",
420-
Unit: metric.Unit_COUNT,
421-
}
422338
)
423339

424340
// quorumError indicates a retryable error condition which sends replicas being
@@ -480,26 +396,9 @@ type ReplicateQueueMetrics struct {
480396
// AllocatorConsiderRebalance, and AllocatorFinalizeAtomicReplicationChange
481397
// allocator actions.
482398

483-
// Priority Inversion. Not tracked for
484-
// AllocatorFinalizeAtomicReplicationChange, AllocatorRemoveLearner,
485-
// AllocatorReplaceDeadVoter since they are the highest priority actions and
486-
// cannot be inverted. (17 total actions-3=14)
487-
RequeueDueToPriorityInversion *metric.Counter
488-
PriorityInversionTotal *metric.Counter
489-
PriorityInversionForAddVoterCount *metric.Counter
490-
PriorityInversionForReplaceDecommissioningVoterCount *metric.Counter
491-
PriorityInversionForRemoveDeadVoterCount *metric.Counter
492-
PriorityInversionForRemoveDecommissioningVoterCount *metric.Counter
493-
PriorityInversionForRemoveVoterCount *metric.Counter
494-
PriorityInversionForReplaceDeadNonVoterCount *metric.Counter
495-
PriorityInversionForAddNonVoterCount *metric.Counter
496-
PriorityInversionForReplaceDecommissioningNonVoterCount *metric.Counter
497-
PriorityInversionForRemoveDeadNonVoterCount *metric.Counter
498-
PriorityInversionForRemoveDecommissioningNonVoterCount *metric.Counter
499-
PriorityInversionForRemoveNonVoterCount *metric.Counter
500-
PriorityInversionForConsiderRebalance *metric.Counter
501-
PriorityInversionForRangeUnavailable *metric.Counter
502-
PriorityInversionForNoop *metric.Counter
399+
// Priority Inversion.
400+
RequeueDueToPriorityInversion *metric.Counter
401+
PriorityInversionTotal *metric.Counter
503402
}
504403

505404
func makeReplicateQueueMetrics() ReplicateQueueMetrics {
@@ -537,22 +436,8 @@ func makeReplicateQueueMetrics() ReplicateQueueMetrics {
537436
RemoveDecommissioningReplicaSuccessCount: metric.NewCounter(metaReplicateQueueRemoveDecommissioningReplicaSuccessCount),
538437
RemoveDecommissioningReplicaErrorCount: metric.NewCounter(metaReplicateQueueRemoveDecommissioningReplicaErrorCount),
539438

540-
RequeueDueToPriorityInversion: metric.NewCounter(metaReplicateQueueRequeueDueToPriorityInversion),
541-
PriorityInversionTotal: metric.NewCounter(metaReplicateQueuePriorityInversionTotal),
542-
PriorityInversionForAddVoterCount: metric.NewCounter(metaReplicateQueuePriorityInversionForAddVoterCount),
543-
PriorityInversionForReplaceDecommissioningVoterCount: metric.NewCounter(metaReplicateQueuePriorityInversionForReplaceDecommissioningVoterCount),
544-
PriorityInversionForRemoveDeadVoterCount: metric.NewCounter(metaReplicateQueuePriorityInversionForRemoveDeadVoterCount),
545-
PriorityInversionForRemoveDecommissioningVoterCount: metric.NewCounter(metaReplicateQueuePriorityInversionForRemoveDecommissioningVoterCount),
546-
PriorityInversionForRemoveVoterCount: metric.NewCounter(metaReplicateQueuePriorityInversionForRemoveVoterCount),
547-
PriorityInversionForReplaceDeadNonVoterCount: metric.NewCounter(metaReplicateQueuePriorityInversionForReplaceDeadNonVoterCount),
548-
PriorityInversionForAddNonVoterCount: metric.NewCounter(metaReplicateQueuePriorityInversionForAddNonVoterCount),
549-
PriorityInversionForReplaceDecommissioningNonVoterCount: metric.NewCounter(metaReplicateQueuePriorityInversionForReplaceDecommissioningNonVoterCount),
550-
PriorityInversionForRemoveDeadNonVoterCount: metric.NewCounter(metaReplicateQueuePriorityInversionForRemoveDeadNonVoterCount),
551-
PriorityInversionForRemoveDecommissioningNonVoterCount: metric.NewCounter(metaReplicateQueuePriorityInversionForRemoveDecommissioningNonVoterCount),
552-
PriorityInversionForRemoveNonVoterCount: metric.NewCounter(metaReplicateQueuePriorityInversionForRemoveNonVoterCount),
553-
PriorityInversionForConsiderRebalance: metric.NewCounter(metaReplicateQueuePriorityInversionForConsiderRebalance),
554-
PriorityInversionForRangeUnavailable: metric.NewCounter(metaReplicateQueuePriorityInversionForRangeUnavailable),
555-
PriorityInversionForNoop: metric.NewCounter(metaReplicateQueuePriorityInversionForNoop),
439+
RequeueDueToPriorityInversion: metric.NewCounter(metaReplicateQueueRequeueDueToPriorityInversion),
440+
PriorityInversionTotal: metric.NewCounter(metaReplicateQueuePriorityInversionTotal),
556441
}
557442
}
558443

@@ -676,47 +561,6 @@ func (metrics *ReplicateQueueMetrics) trackErrorByAllocatorAction(
676561

677562
}
678563

679-
// trackPriorityInversion tracks the action that the replicate queue ended up
680-
// processing when the priority at enqueue time was higher than the priority at
681-
// processing time.
682-
func (metrics *ReplicateQueueMetrics) trackPriorityInversion(
683-
actionAtProcessingTime allocatorimpl.AllocatorAction,
684-
) {
685-
metrics.PriorityInversionTotal.Inc(1)
686-
switch actionAtProcessingTime {
687-
case allocatorimpl.AllocatorAddVoter:
688-
metrics.PriorityInversionForAddVoterCount.Inc(1)
689-
case allocatorimpl.AllocatorReplaceDecommissioningVoter:
690-
metrics.PriorityInversionForReplaceDecommissioningVoterCount.Inc(1)
691-
case allocatorimpl.AllocatorRemoveDeadVoter:
692-
metrics.PriorityInversionForRemoveDeadVoterCount.Inc(1)
693-
case allocatorimpl.AllocatorRemoveDecommissioningVoter:
694-
metrics.PriorityInversionForRemoveDecommissioningVoterCount.Inc(1)
695-
case allocatorimpl.AllocatorRemoveVoter:
696-
metrics.PriorityInversionForRemoveVoterCount.Inc(1)
697-
case allocatorimpl.AllocatorReplaceDeadNonVoter:
698-
metrics.PriorityInversionForReplaceDeadNonVoterCount.Inc(1)
699-
case allocatorimpl.AllocatorAddNonVoter:
700-
metrics.PriorityInversionForAddNonVoterCount.Inc(1)
701-
case allocatorimpl.AllocatorReplaceDecommissioningNonVoter:
702-
metrics.PriorityInversionForReplaceDecommissioningNonVoterCount.Inc(1)
703-
case allocatorimpl.AllocatorRemoveDeadNonVoter:
704-
metrics.PriorityInversionForRemoveDeadNonVoterCount.Inc(1)
705-
case allocatorimpl.AllocatorRemoveDecommissioningNonVoter:
706-
metrics.PriorityInversionForRemoveDecommissioningNonVoterCount.Inc(1)
707-
case allocatorimpl.AllocatorRemoveNonVoter:
708-
metrics.PriorityInversionForRemoveNonVoterCount.Inc(1)
709-
case allocatorimpl.AllocatorConsiderRebalance:
710-
metrics.PriorityInversionForConsiderRebalance.Inc(1)
711-
case allocatorimpl.AllocatorRangeUnavailable:
712-
metrics.PriorityInversionForRangeUnavailable.Inc(1)
713-
case allocatorimpl.AllocatorNoop:
714-
metrics.PriorityInversionForNoop.Inc(1)
715-
default:
716-
panic("unhandled default case")
717-
}
718-
}
719-
720564
// trackProcessResult increases the corresponding success/error count metric for
721565
// processing a particular allocator action through the replicate queue.
722566
func (metrics *ReplicateQueueMetrics) trackResultByAllocatorAction(
@@ -1129,7 +973,7 @@ func (rq *replicateQueue) processOneChange(
1129973
// starving other higher priority work.
1130974
if PriorityInversionRequeue.Get(&rq.store.cfg.Settings.SV) {
1131975
if inversion, shouldRequeue := allocatorimpl.CheckPriorityInversion(priorityAtEnqueue, change.Action); inversion {
1132-
rq.metrics.trackPriorityInversion(change.Action)
976+
rq.metrics.PriorityInversionTotal.Inc(1)
1133977
if priorityInversionLogEveryN.ShouldLog() {
1134978
log.KvDistribution.Infof(ctx,
1135979
"priority inversion during process: shouldRequeue = %t action=%s, priority=%v, enqueuePriority=%v",

pkg/kv/kvserver/replicate_queue_test.go

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2567,6 +2567,7 @@ func TestReplicateQueueDecommissionScannerDisabled(t *testing.T) {
25672567
func TestPriorityInversionRequeue(t *testing.T) {
25682568
defer leaktest.AfterTest(t)()
25692569
defer log.Scope(t).Close(t)
2570+
skip.UnderDuress(t)
25702571

25712572
ctx := context.Background()
25722573
settings := cluster.MakeTestingClusterSettings()
@@ -2646,9 +2647,6 @@ func TestPriorityInversionRequeue(t *testing.T) {
26462647
if c := store.ReplicateQueueMetrics().PriorityInversionTotal.Count(); c == 0 {
26472648
return errors.New("expected non-zero priority inversion total count but got 0")
26482649
}
2649-
if c := store.ReplicateQueueMetrics().PriorityInversionForConsiderRebalance.Count(); c == 0 {
2650-
return errors.New("expected non-zero priority inversion count for consider rebalance but got 0")
2651-
}
26522650
if c := store.ReplicateQueueMetrics().RequeueDueToPriorityInversion.Count(); c == 0 {
26532651
return errors.New("expected to requeue due to priority inversion but got 0")
26542652
}

0 commit comments

Comments
 (0)