Skip to content

Commit a96cb74

Browse files
committed
kvserver: delete per action priority inversion metrics
This commit removes per-action priority inversion metrics due to their high cardinality. We already have logging in place, which should provide sufficient observability. For now, what we care about most is priority inversion that leads to a consider-rebalance action and requeuing.
1 parent 9315a46 commit a96cb74

File tree

3 files changed

+7
-277
lines changed

3 files changed

+7
-277
lines changed

docs/generated/metrics/metrics.yaml

Lines changed: 0 additions & 112 deletions
Original file line numberDiff line numberDiff line change
@@ -13951,118 +13951,6 @@ layers:
1395113951
unit: COUNT
1395213952
aggregation: AVG
1395313953
derivative: NONE
13954-
- name: queue.replicate.priority_inversion.addnonvoter
13955-
exported_name: queue_replicate_priority_inversion_addnonvoter
13956-
description: Number of priority inversions in the replicate queue that resulted in add non-voter action during processing
13957-
y_axis_label: Replicas
13958-
type: COUNTER
13959-
unit: COUNT
13960-
aggregation: AVG
13961-
derivative: NON_NEGATIVE_DERIVATIVE
13962-
- name: queue.replicate.priority_inversion.addvoter
13963-
exported_name: queue_replicate_priority_inversion_addvoter
13964-
description: Number of priority inversions in the replicate queue that resulted in add voter action during processing
13965-
y_axis_label: Replicas
13966-
type: COUNTER
13967-
unit: COUNT
13968-
aggregation: AVG
13969-
derivative: NON_NEGATIVE_DERIVATIVE
13970-
- name: queue.replicate.priority_inversion.considerrebalance
13971-
exported_name: queue_replicate_priority_inversion_considerrebalance
13972-
description: Number of priority inversions in the replicate queue that resulted in consider rebalance action during processing
13973-
y_axis_label: Replicas
13974-
type: COUNTER
13975-
unit: COUNT
13976-
aggregation: AVG
13977-
derivative: NON_NEGATIVE_DERIVATIVE
13978-
- name: queue.replicate.priority_inversion.noop
13979-
exported_name: queue_replicate_priority_inversion_noop
13980-
description: Number of priority inversions in the replicate queue that resulted in noop action during processing
13981-
y_axis_label: Replicas
13982-
type: COUNTER
13983-
unit: COUNT
13984-
aggregation: AVG
13985-
derivative: NON_NEGATIVE_DERIVATIVE
13986-
- name: queue.replicate.priority_inversion.rangeunavailable
13987-
exported_name: queue_replicate_priority_inversion_rangeunavailable
13988-
description: Number of priority inversions in the replicate queue that resulted in range unavailable action during processing
13989-
y_axis_label: Replicas
13990-
type: COUNTER
13991-
unit: COUNT
13992-
aggregation: AVG
13993-
derivative: NON_NEGATIVE_DERIVATIVE
13994-
- name: queue.replicate.priority_inversion.removedeadnonvoter
13995-
exported_name: queue_replicate_priority_inversion_removedeadnonvoter
13996-
description: Number of priority inversions in the replicate queue that resulted in remove dead non-voter action during processing
13997-
y_axis_label: Replicas
13998-
type: COUNTER
13999-
unit: COUNT
14000-
aggregation: AVG
14001-
derivative: NON_NEGATIVE_DERIVATIVE
14002-
- name: queue.replicate.priority_inversion.removedeadvoter
14003-
exported_name: queue_replicate_priority_inversion_removedeadvoter
14004-
description: Number of priority inversions in the replicate queue that resulted in remove dead voter action during processing
14005-
y_axis_label: Replicas
14006-
type: COUNTER
14007-
unit: COUNT
14008-
aggregation: AVG
14009-
derivative: NON_NEGATIVE_DERIVATIVE
14010-
- name: queue.replicate.priority_inversion.removedecommissioningnonvoter
14011-
exported_name: queue_replicate_priority_inversion_removedecommissioningnonvoter
14012-
description: Number of priority inversions in the replicate queue that resulted in remove decommissioning non-voter action during processing
14013-
y_axis_label: Replicas
14014-
type: COUNTER
14015-
unit: COUNT
14016-
aggregation: AVG
14017-
derivative: NON_NEGATIVE_DERIVATIVE
14018-
- name: queue.replicate.priority_inversion.removedecommissioningvoter
14019-
exported_name: queue_replicate_priority_inversion_removedecommissioningvoter
14020-
description: Number of priority inversions in the replicate queue that resulted in remove decommissioning voter action during processing
14021-
y_axis_label: Replicas
14022-
type: COUNTER
14023-
unit: COUNT
14024-
aggregation: AVG
14025-
derivative: NON_NEGATIVE_DERIVATIVE
14026-
- name: queue.replicate.priority_inversion.removenonvoter
14027-
exported_name: queue_replicate_priority_inversion_removenonvoter
14028-
description: Number of priority inversions in the replicate queue that resulted in remove non-voter action during processing
14029-
y_axis_label: Replicas
14030-
type: COUNTER
14031-
unit: COUNT
14032-
aggregation: AVG
14033-
derivative: NON_NEGATIVE_DERIVATIVE
14034-
- name: queue.replicate.priority_inversion.removevoter
14035-
exported_name: queue_replicate_priority_inversion_removevoter
14036-
description: Number of priority inversions in the replicate queue that resulted in remove voter action during processing
14037-
y_axis_label: Replicas
14038-
type: COUNTER
14039-
unit: COUNT
14040-
aggregation: AVG
14041-
derivative: NON_NEGATIVE_DERIVATIVE
14042-
- name: queue.replicate.priority_inversion.replacedeadnonvoter
14043-
exported_name: queue_replicate_priority_inversion_replacedeadnonvoter
14044-
description: Number of priority inversions in the replicate queue that resulted in replace dead non-voter action during processing
14045-
y_axis_label: Replicas
14046-
type: COUNTER
14047-
unit: COUNT
14048-
aggregation: AVG
14049-
derivative: NON_NEGATIVE_DERIVATIVE
14050-
- name: queue.replicate.priority_inversion.replacedecommissioningnonvoter
14051-
exported_name: queue_replicate_priority_inversion_replacedecommissioningnonvoter
14052-
description: Number of priority inversions in the replicate queue that resulted in replace decommissioning non-voter action during processing
14053-
y_axis_label: Replicas
14054-
type: COUNTER
14055-
unit: COUNT
14056-
aggregation: AVG
14057-
derivative: NON_NEGATIVE_DERIVATIVE
14058-
- name: queue.replicate.priority_inversion.replacedecommissioningvoter
14059-
exported_name: queue_replicate_priority_inversion_replacedecommissioningvoter
14060-
description: Number of priority inversions in the replicate queue that resulted in replace decommissioning voter action during processing
14061-
y_axis_label: Replicas
14062-
type: COUNTER
14063-
unit: COUNT
14064-
aggregation: AVG
14065-
derivative: NON_NEGATIVE_DERIVATIVE
1406613954
- name: queue.replicate.priority_inversion.requeue
1406713955
exported_name: queue_replicate_priority_inversion_requeue
1406813956
description: Number of priority inversions in the replicate queue that resulted in requeuing of the replicas. A priority inversion occurs when the priority at processing time ends up being lower than at enqueue time. When the priority has changed from a high priority repair action to rebalance, the change is requeued to avoid unfairness.

pkg/kv/kvserver/replicate_queue.go

Lines changed: 6 additions & 162 deletions
Original file line numberDiff line numberDiff line change
@@ -338,90 +338,6 @@ var (
338338
Measurement: "Replicas",
339339
Unit: metric.Unit_COUNT,
340340
}
341-
metaReplicateQueuePriorityInversionForAddVoterCount = metric.Metadata{
342-
Name: "queue.replicate.priority_inversion.addvoter",
343-
Help: "Number of priority inversions in the replicate queue that resulted in add voter action during processing",
344-
Measurement: "Replicas",
345-
Unit: metric.Unit_COUNT,
346-
}
347-
metaReplicateQueuePriorityInversionForReplaceDecommissioningVoterCount = metric.Metadata{
348-
Name: "queue.replicate.priority_inversion.replacedecommissioningvoter",
349-
Help: "Number of priority inversions in the replicate queue that resulted in replace decommissioning voter action during processing",
350-
Measurement: "Replicas",
351-
Unit: metric.Unit_COUNT,
352-
}
353-
metaReplicateQueuePriorityInversionForRemoveDeadVoterCount = metric.Metadata{
354-
Name: "queue.replicate.priority_inversion.removedeadvoter",
355-
Help: "Number of priority inversions in the replicate queue that resulted in remove dead voter action during processing",
356-
Measurement: "Replicas",
357-
Unit: metric.Unit_COUNT,
358-
}
359-
metaReplicateQueuePriorityInversionForRemoveDecommissioningVoterCount = metric.Metadata{
360-
Name: "queue.replicate.priority_inversion.removedecommissioningvoter",
361-
Help: "Number of priority inversions in the replicate queue that resulted in remove decommissioning voter action during processing",
362-
Measurement: "Replicas",
363-
Unit: metric.Unit_COUNT,
364-
}
365-
metaReplicateQueuePriorityInversionForRemoveVoterCount = metric.Metadata{
366-
Name: "queue.replicate.priority_inversion.removevoter",
367-
Help: "Number of priority inversions in the replicate queue that resulted in remove voter action during processing",
368-
Measurement: "Replicas",
369-
Unit: metric.Unit_COUNT,
370-
}
371-
metaReplicateQueuePriorityInversionForReplaceDeadNonVoterCount = metric.Metadata{
372-
Name: "queue.replicate.priority_inversion.replacedeadnonvoter",
373-
Help: "Number of priority inversions in the replicate queue that resulted in replace dead non-voter action during processing",
374-
Measurement: "Replicas",
375-
Unit: metric.Unit_COUNT,
376-
}
377-
metaReplicateQueuePriorityInversionForAddNonVoterCount = metric.Metadata{
378-
Name: "queue.replicate.priority_inversion.addnonvoter",
379-
Help: "Number of priority inversions in the replicate queue that resulted in add non-voter action during processing",
380-
Measurement: "Replicas",
381-
Unit: metric.Unit_COUNT,
382-
}
383-
metaReplicateQueuePriorityInversionForReplaceDecommissioningNonVoterCount = metric.Metadata{
384-
Name: "queue.replicate.priority_inversion.replacedecommissioningnonvoter",
385-
Help: "Number of priority inversions in the replicate queue that resulted in replace decommissioning non-voter action during processing",
386-
Measurement: "Replicas",
387-
Unit: metric.Unit_COUNT,
388-
}
389-
metaReplicateQueuePriorityInversionForRemoveDeadNonVoterCount = metric.Metadata{
390-
Name: "queue.replicate.priority_inversion.removedeadnonvoter",
391-
Help: "Number of priority inversions in the replicate queue that resulted in remove dead non-voter action during processing",
392-
Measurement: "Replicas",
393-
Unit: metric.Unit_COUNT,
394-
}
395-
metaReplicateQueuePriorityInversionForRemoveDecommissioningNonVoterCount = metric.Metadata{
396-
Name: "queue.replicate.priority_inversion.removedecommissioningnonvoter",
397-
Help: "Number of priority inversions in the replicate queue that resulted in remove decommissioning non-voter action during processing",
398-
Measurement: "Replicas",
399-
Unit: metric.Unit_COUNT,
400-
}
401-
metaReplicateQueuePriorityInversionForRemoveNonVoterCount = metric.Metadata{
402-
Name: "queue.replicate.priority_inversion.removenonvoter",
403-
Help: "Number of priority inversions in the replicate queue that resulted in remove non-voter action during processing",
404-
Measurement: "Replicas",
405-
Unit: metric.Unit_COUNT,
406-
}
407-
metaReplicateQueuePriorityInversionForConsiderRebalance = metric.Metadata{
408-
Name: "queue.replicate.priority_inversion.considerrebalance",
409-
Help: "Number of priority inversions in the replicate queue that resulted in consider rebalance action during processing",
410-
Measurement: "Replicas",
411-
Unit: metric.Unit_COUNT,
412-
}
413-
metaReplicateQueuePriorityInversionForRangeUnavailable = metric.Metadata{
414-
Name: "queue.replicate.priority_inversion.rangeunavailable",
415-
Help: "Number of priority inversions in the replicate queue that resulted in range unavailable action during processing",
416-
Measurement: "Replicas",
417-
Unit: metric.Unit_COUNT,
418-
}
419-
metaReplicateQueuePriorityInversionForNoop = metric.Metadata{
420-
Name: "queue.replicate.priority_inversion.noop",
421-
Help: "Number of priority inversions in the replicate queue that resulted in noop action during processing",
422-
Measurement: "Replicas",
423-
Unit: metric.Unit_COUNT,
424-
}
425341
)
426342

427343
// quorumError indicates a retryable error condition which sends replicas being
@@ -483,26 +399,9 @@ type ReplicateQueueMetrics struct {
483399
// AllocatorConsiderRebalance, and AllocatorFinalizeAtomicReplicationChange
484400
// allocator actions.
485401

486-
// Priority Inversion. Not tracked for
487-
// AllocatorFinalizeAtomicReplicationChange, AllocatorRemoveLearner,
488-
// AllocatorReplaceDeadVoter since they are the highest priority actions and
489-
// cannot be inverted. (17 total actions-3=14)
490-
RequeueDueToPriorityInversion *metric.Counter
491-
PriorityInversionTotal *metric.Counter
492-
PriorityInversionForAddVoterCount *metric.Counter
493-
PriorityInversionForReplaceDecommissioningVoterCount *metric.Counter
494-
PriorityInversionForRemoveDeadVoterCount *metric.Counter
495-
PriorityInversionForRemoveDecommissioningVoterCount *metric.Counter
496-
PriorityInversionForRemoveVoterCount *metric.Counter
497-
PriorityInversionForReplaceDeadNonVoterCount *metric.Counter
498-
PriorityInversionForAddNonVoterCount *metric.Counter
499-
PriorityInversionForReplaceDecommissioningNonVoterCount *metric.Counter
500-
PriorityInversionForRemoveDeadNonVoterCount *metric.Counter
501-
PriorityInversionForRemoveDecommissioningNonVoterCount *metric.Counter
502-
PriorityInversionForRemoveNonVoterCount *metric.Counter
503-
PriorityInversionForConsiderRebalance *metric.Counter
504-
PriorityInversionForRangeUnavailable *metric.Counter
505-
PriorityInversionForNoop *metric.Counter
402+
// Priority Inversion.
403+
RequeueDueToPriorityInversion *metric.Counter
404+
PriorityInversionTotal *metric.Counter
506405
}
507406

508407
func makeReplicateQueueMetrics() ReplicateQueueMetrics {
@@ -540,22 +439,8 @@ func makeReplicateQueueMetrics() ReplicateQueueMetrics {
540439
RemoveDecommissioningReplicaSuccessCount: metric.NewCounter(metaReplicateQueueRemoveDecommissioningReplicaSuccessCount),
541440
RemoveDecommissioningReplicaErrorCount: metric.NewCounter(metaReplicateQueueRemoveDecommissioningReplicaErrorCount),
542441

543-
RequeueDueToPriorityInversion: metric.NewCounter(metaReplicateQueueRequeueDueToPriorityInversion),
544-
PriorityInversionTotal: metric.NewCounter(metaReplicateQueuePriorityInversionTotal),
545-
PriorityInversionForAddVoterCount: metric.NewCounter(metaReplicateQueuePriorityInversionForAddVoterCount),
546-
PriorityInversionForReplaceDecommissioningVoterCount: metric.NewCounter(metaReplicateQueuePriorityInversionForReplaceDecommissioningVoterCount),
547-
PriorityInversionForRemoveDeadVoterCount: metric.NewCounter(metaReplicateQueuePriorityInversionForRemoveDeadVoterCount),
548-
PriorityInversionForRemoveDecommissioningVoterCount: metric.NewCounter(metaReplicateQueuePriorityInversionForRemoveDecommissioningVoterCount),
549-
PriorityInversionForRemoveVoterCount: metric.NewCounter(metaReplicateQueuePriorityInversionForRemoveVoterCount),
550-
PriorityInversionForReplaceDeadNonVoterCount: metric.NewCounter(metaReplicateQueuePriorityInversionForReplaceDeadNonVoterCount),
551-
PriorityInversionForAddNonVoterCount: metric.NewCounter(metaReplicateQueuePriorityInversionForAddNonVoterCount),
552-
PriorityInversionForReplaceDecommissioningNonVoterCount: metric.NewCounter(metaReplicateQueuePriorityInversionForReplaceDecommissioningNonVoterCount),
553-
PriorityInversionForRemoveDeadNonVoterCount: metric.NewCounter(metaReplicateQueuePriorityInversionForRemoveDeadNonVoterCount),
554-
PriorityInversionForRemoveDecommissioningNonVoterCount: metric.NewCounter(metaReplicateQueuePriorityInversionForRemoveDecommissioningNonVoterCount),
555-
PriorityInversionForRemoveNonVoterCount: metric.NewCounter(metaReplicateQueuePriorityInversionForRemoveNonVoterCount),
556-
PriorityInversionForConsiderRebalance: metric.NewCounter(metaReplicateQueuePriorityInversionForConsiderRebalance),
557-
PriorityInversionForRangeUnavailable: metric.NewCounter(metaReplicateQueuePriorityInversionForRangeUnavailable),
558-
PriorityInversionForNoop: metric.NewCounter(metaReplicateQueuePriorityInversionForNoop),
442+
RequeueDueToPriorityInversion: metric.NewCounter(metaReplicateQueueRequeueDueToPriorityInversion),
443+
PriorityInversionTotal: metric.NewCounter(metaReplicateQueuePriorityInversionTotal),
559444
}
560445
}
561446

@@ -679,47 +564,6 @@ func (metrics *ReplicateQueueMetrics) trackErrorByAllocatorAction(
679564

680565
}
681566

682-
// trackPriorityInversion tracks the action that the replicate queue ended up
683-
// processing when the priority at enqueue time was higher than the priority at
684-
// processing time.
685-
func (metrics *ReplicateQueueMetrics) trackPriorityInversion(
686-
actionAtProcessingTime allocatorimpl.AllocatorAction,
687-
) {
688-
metrics.PriorityInversionTotal.Inc(1)
689-
switch actionAtProcessingTime {
690-
case allocatorimpl.AllocatorAddVoter:
691-
metrics.PriorityInversionForAddVoterCount.Inc(1)
692-
case allocatorimpl.AllocatorReplaceDecommissioningVoter:
693-
metrics.PriorityInversionForReplaceDecommissioningVoterCount.Inc(1)
694-
case allocatorimpl.AllocatorRemoveDeadVoter:
695-
metrics.PriorityInversionForRemoveDeadVoterCount.Inc(1)
696-
case allocatorimpl.AllocatorRemoveDecommissioningVoter:
697-
metrics.PriorityInversionForRemoveDecommissioningVoterCount.Inc(1)
698-
case allocatorimpl.AllocatorRemoveVoter:
699-
metrics.PriorityInversionForRemoveVoterCount.Inc(1)
700-
case allocatorimpl.AllocatorReplaceDeadNonVoter:
701-
metrics.PriorityInversionForReplaceDeadNonVoterCount.Inc(1)
702-
case allocatorimpl.AllocatorAddNonVoter:
703-
metrics.PriorityInversionForAddNonVoterCount.Inc(1)
704-
case allocatorimpl.AllocatorReplaceDecommissioningNonVoter:
705-
metrics.PriorityInversionForReplaceDecommissioningNonVoterCount.Inc(1)
706-
case allocatorimpl.AllocatorRemoveDeadNonVoter:
707-
metrics.PriorityInversionForRemoveDeadNonVoterCount.Inc(1)
708-
case allocatorimpl.AllocatorRemoveDecommissioningNonVoter:
709-
metrics.PriorityInversionForRemoveDecommissioningNonVoterCount.Inc(1)
710-
case allocatorimpl.AllocatorRemoveNonVoter:
711-
metrics.PriorityInversionForRemoveNonVoterCount.Inc(1)
712-
case allocatorimpl.AllocatorConsiderRebalance:
713-
metrics.PriorityInversionForConsiderRebalance.Inc(1)
714-
case allocatorimpl.AllocatorRangeUnavailable:
715-
metrics.PriorityInversionForRangeUnavailable.Inc(1)
716-
case allocatorimpl.AllocatorNoop:
717-
metrics.PriorityInversionForNoop.Inc(1)
718-
default:
719-
panic("unhandled default case")
720-
}
721-
}
722-
723567
// trackProcessResult increases the corresponding success/error count metric for
724568
// processing a particular allocator action through the replicate queue.
725569
func (metrics *ReplicateQueueMetrics) trackResultByAllocatorAction(
@@ -1135,7 +979,7 @@ func (rq *replicateQueue) processOneChange(
1135979
// starving other higher priority work.
1136980
if PriorityInversionRequeue.Get(&rq.store.cfg.Settings.SV) {
1137981
if inversion, shouldRequeue := allocatorimpl.CheckPriorityInversion(priorityAtEnqueue, change.Action); inversion {
1138-
rq.metrics.trackPriorityInversion(change.Action)
982+
rq.metrics.PriorityInversionTotal.Inc(1)
1139983
if priorityInversionLogEveryN.ShouldLog() {
1140984
log.KvDistribution.Infof(ctx,
1141985
"priority inversion during process: shouldRequeue = %t action=%s, priority=%v, enqueuePriority=%v",

pkg/kv/kvserver/replicate_queue_test.go

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2568,6 +2568,7 @@ func TestReplicateQueueDecommissionScannerDisabled(t *testing.T) {
25682568
func TestPriorityInversionRequeue(t *testing.T) {
25692569
defer leaktest.AfterTest(t)()
25702570
defer log.Scope(t).Close(t)
2571+
skip.UnderDuress(t)
25712572

25722573
ctx := context.Background()
25732574
settings := cluster.MakeTestingClusterSettings()
@@ -2647,9 +2648,6 @@ func TestPriorityInversionRequeue(t *testing.T) {
26472648
if c := store.ReplicateQueueMetrics().PriorityInversionTotal.Count(); c == 0 {
26482649
return errors.New("expected non-zero priority inversion total count but got 0")
26492650
}
2650-
if c := store.ReplicateQueueMetrics().PriorityInversionForConsiderRebalance.Count(); c == 0 {
2651-
return errors.New("expected non-zero priority inversion count for consider rebalance but got 0")
2652-
}
26532651
if c := store.ReplicateQueueMetrics().RequeueDueToPriorityInversion.Count(); c == 0 {
26542652
return errors.New("expected to requeue due to priority inversion but got 0")
26552653
}

0 commit comments

Comments
 (0)