Commit 3509826

Merge pull request #153052 from wenyihu6/backportrelease-25.2.6-rc-151898-152508-152512-152675-152507-152699-152596-152792-152885-152787-152697
release-25.2.6-rc: kvserver: requeue on priority inversion for replicate queue
2 parents: 68a653b + a93cbd8

30 files changed: +1564 −144 lines

docs/generated/metrics/metrics.html

Lines changed: 13 additions & 0 deletions

@@ -420,12 +420,19 @@
 <tr><td>STORAGE</td><td>queue.replicate.addreplica.error</td><td>Number of failed replica additions processed by the replicate queue</td><td>Replicas</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
 <tr><td>STORAGE</td><td>queue.replicate.addreplica.success</td><td>Number of successful replica additions processed by the replicate queue</td><td>Replicas</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
 <tr><td>STORAGE</td><td>queue.replicate.addvoterreplica</td><td>Number of voter replica additions attempted by the replicate queue</td><td>Replica Additions</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
+<tr><td>STORAGE</td><td>queue.replicate.enqueue.add</td><td>Number of replicas successfully added to the replicate queue</td><td>Replicas</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
+<tr><td>STORAGE</td><td>queue.replicate.enqueue.failedprecondition</td><td>Number of replicas that failed the precondition checks and were therefore not added to the replicate queue</td><td>Replicas</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
+<tr><td>STORAGE</td><td>queue.replicate.enqueue.noaction</td><td>Number of replicas for which ShouldQueue determined no action was needed and which were therefore not added to the replicate queue</td><td>Replicas</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
+<tr><td>STORAGE</td><td>queue.replicate.enqueue.unexpectederror</td><td>Number of replicas that were expected to be enqueued (ShouldQueue returned true or the caller decided to add to the replicate queue directly) but failed to be enqueued due to unexpected errors</td><td>Replicas</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
 <tr><td>STORAGE</td><td>queue.replicate.nonvoterpromotions</td><td>Number of non-voters promoted to voters by the replicate queue</td><td>Promotions of Non Voters to Voters</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
 <tr><td>STORAGE</td><td>queue.replicate.pending</td><td>Number of pending replicas in the replicate queue</td><td>Replicas</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
+<tr><td>STORAGE</td><td>queue.replicate.priority_inversion.requeue</td><td>Number of priority inversions in the replicate queue that resulted in requeuing of the replicas. A priority inversion occurs when the priority at processing time ends up being lower than at enqueue time. When the priority has changed from a high-priority repair action to a rebalance, the change is requeued to avoid unfairness.</td><td>Replicas</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
+<tr><td>STORAGE</td><td>queue.replicate.priority_inversion.total</td><td>Total number of priority inversions in the replicate queue. A priority inversion occurs when the priority at processing time ends up being lower than at enqueue time.</td><td>Replicas</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
 <tr><td>STORAGE</td><td>queue.replicate.process.failure</td><td>Number of replicas which failed processing in the replicate queue</td><td>Replicas</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
 <tr><td>STORAGE</td><td>queue.replicate.process.success</td><td>Number of replicas successfully processed by the replicate queue</td><td>Replicas</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
 <tr><td>STORAGE</td><td>queue.replicate.processingnanos</td><td>Nanoseconds spent processing replicas in the replicate queue</td><td>Processing Time</td><td>COUNTER</td><td>NANOSECONDS</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
 <tr><td>STORAGE</td><td>queue.replicate.purgatory</td><td>Number of replicas in the replicate queue's purgatory, awaiting allocation options</td><td>Replicas</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
+<tr><td>STORAGE</td><td>queue.replicate.queue_full</td><td>Number of times a replica was dropped from the queue due to queue fullness</td><td>Replicas</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
 <tr><td>STORAGE</td><td>queue.replicate.rebalancenonvoterreplica</td><td>Number of non-voter replica rebalancer-initiated additions attempted by the replicate queue</td><td>Replica Additions</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
 <tr><td>STORAGE</td><td>queue.replicate.rebalancereplica</td><td>Number of replica rebalancer-initiated additions attempted by the replicate queue</td><td>Replica Additions</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
 <tr><td>STORAGE</td><td>queue.replicate.rebalancevoterreplica</td><td>Number of voter replica rebalancer-initiated additions attempted by the replicate queue</td><td>Replica Additions</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>

@@ -580,6 +587,12 @@
 <tr><td>STORAGE</td><td>rangekeycount</td><td>Count of all range keys (e.g. MVCC range tombstones)</td><td>Keys</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
 <tr><td>STORAGE</td><td>ranges</td><td>Number of ranges</td><td>Ranges</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
 <tr><td>STORAGE</td><td>ranges.decommissioning</td><td>Number of ranges with at least one replica on a decommissioning node</td><td>Ranges</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
+<tr><td>STORAGE</td><td>ranges.decommissioning.nudger.enqueue</td><td>Number of times the decommissioning nudger attempted to enqueue a range. Note: this metric tracks when the nudger attempts to enqueue, but the replica might not end up being enqueued by the priority queue due to various filtering or failure conditions.</td><td>Ranges</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
+<tr><td>STORAGE</td><td>ranges.decommissioning.nudger.enqueue.failure</td><td>Number of ranges that failed to be enqueued at the replicate queue</td><td>Ranges</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
+<tr><td>STORAGE</td><td>ranges.decommissioning.nudger.enqueue.success</td><td>Number of ranges that were successfully enqueued by the decommissioning nudger</td><td>Ranges</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
+<tr><td>STORAGE</td><td>ranges.decommissioning.nudger.not_leaseholder_or_invalid_lease</td><td>Number of ranges skipped by the decommissioning nudger because the replica was not the leaseholder or the lease was invalid</td><td>Ranges</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
+<tr><td>STORAGE</td><td>ranges.decommissioning.nudger.process.failure</td><td>Number of ranges enqueued by the decommissioning nudger that failed to be processed by the replicate queue</td><td>Ranges</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
+<tr><td>STORAGE</td><td>ranges.decommissioning.nudger.process.success</td><td>Number of ranges enqueued by the decommissioning nudger that were successfully processed by the replicate queue</td><td>Ranges</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
 <tr><td>STORAGE</td><td>ranges.overreplicated</td><td>Number of ranges with more live replicas than the replication target</td><td>Ranges</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
 <tr><td>STORAGE</td><td>ranges.unavailable</td><td>Number of ranges with fewer live replicas than needed for quorum</td><td>Ranges</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
 <tr><td>STORAGE</td><td>ranges.underreplicated</td><td>Number of ranges with fewer live replicas than the replication target</td><td>Ranges</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
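Each row in this generated file corresponds to a metric.Metadata registered in the storage layer. As a rough sketch of how one of the new counters could be declared with pkg/util/metric (the Go identifiers and the abbreviated help string here are assumptions for illustration, not the actual declarations in this patch):

package example

import "github.com/cockroachdb/cockroach/pkg/util/metric"

// metaPriorityInversionRequeue mirrors the table row above; the identifier
// name and shortened help text are assumptions.
var metaPriorityInversionRequeue = metric.Metadata{
	Name:        "queue.replicate.priority_inversion.requeue",
	Help:        "Number of priority inversions in the replicate queue that resulted in requeuing of the replicas",
	Measurement: "Replicas",
	Unit:        metric.Unit_COUNT,
}

// The generated HTML above is produced from registrations like this one.
var priorityInversionRequeue = metric.NewCounter(metaPriorityInversionRequeue)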

pkg/kv/kvserver/allocator/allocatorimpl/BUILD.bazel

Lines changed: 1 addition & 0 deletions

@@ -28,6 +28,7 @@ go_library(
         "//pkg/settings",
         "//pkg/settings/cluster",
         "//pkg/util/admission/admissionpb",
+        "//pkg/util/buildutil",
         "//pkg/util/log",
         "//pkg/util/metric",
         "//pkg/util/stop",

pkg/kv/kvserver/allocator/allocatorimpl/allocator.go

Lines changed: 150 additions & 2 deletions

@@ -27,6 +27,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/roachpb"
 	"github.com/cockroachdb/cockroach/pkg/settings"
 	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
+	"github.com/cockroachdb/cockroach/pkg/util/buildutil"
 	"github.com/cockroachdb/cockroach/pkg/util/log"
 	"github.com/cockroachdb/cockroach/pkg/util/metric"
 	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
@@ -138,6 +139,7 @@ const (
 	AllocatorConsiderRebalance
 	AllocatorRangeUnavailable
 	AllocatorFinalizeAtomicReplicationChange
+	AllocatorMaxPriority
 )

 // Add indicates an action adding a replica.
@@ -163,6 +165,15 @@ func (a AllocatorAction) Remove() bool {
 		a == AllocatorRemoveDecommissioningNonVoter
 }

+// Decommissioning indicates an action replacing or removing a decommissioning
+// replica.
+func (a AllocatorAction) Decommissioning() bool {
+	return a == AllocatorRemoveDecommissioningVoter ||
+		a == AllocatorRemoveDecommissioningNonVoter ||
+		a == AllocatorReplaceDecommissioningVoter ||
+		a == AllocatorReplaceDecommissioningNonVoter
+}
+
 // TargetReplicaType returns whether the action applies to a voter or a
 // non-voter replica.
 func (a AllocatorAction) TargetReplicaType() TargetReplicaType {
 	var t TargetReplicaType
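Since the new predicate groups four enum values, a caller-side sketch may help; everything here except AllocatorAction.Decommissioning() is illustrative (the helper name, counter, and package layout are assumptions, not part of this patch):

package example

import (
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/allocator/allocatorimpl"
	"github.com/cockroachdb/cockroach/pkg/util/metric"
)

// countDecommissioningAction bumps a counter only for actions that replace or
// remove a decommissioning replica, mirroring how the decommissioning nudger
// metrics above could be fed.
func countDecommissioningAction(action allocatorimpl.AllocatorAction, c *metric.Counter) {
	if action.Decommissioning() {
		c.Inc(1)
	}
}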
@@ -242,10 +253,27 @@ func (a AllocatorAction) SafeValue() {}
 // range. Within a given range, the ordering of the various checks inside
 // `Allocator.computeAction` determines which repair/rebalancing actions are
 // taken before the others.
+//
+// NB: Priorities should be non-negative and should be spaced in multiples of
+// 100 unless you believe they belong to the same priority category.
+// AllocatorNoop should have the lowest priority. CheckPriorityInversion
+// depends on this contract. In most cases, the allocator returns a priority
+// that matches the definitions below. For AllocatorAddVoter,
+// AllocatorRemoveDeadVoter, and AllocatorRemoveVoter, the priority may be
+// adjusted (see ComputeAction for details), but the adjustment is expected to
+// be small (<49).
+//
+// Exceptions: AllocatorFinalizeAtomicReplicationChange, AllocatorRemoveLearner,
+// and AllocatorReplaceDeadVoter violate the 100-point spacing. These cases
+// predate this comment, so we allow them as they belong to the same general
+// priority category.
 func (a AllocatorAction) Priority() float64 {
+	const maxPriority = 12002
 	switch a {
+	case AllocatorMaxPriority:
+		return maxPriority
 	case AllocatorFinalizeAtomicReplicationChange:
-		return 12002
+		return maxPriority
 	case AllocatorRemoveLearner:
 		return 12001
 	case AllocatorReplaceDeadVoter:
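Because AllocatorMaxPriority is a distinct action that aliases the top priority value, the contract stated in the comment can be pinned down in a test. A minimal test-style sketch, asserting only values visible in this diff and assuming it lives in the allocatorimpl package:

package allocatorimpl

import "testing"

// TestPriorityContractSketch asserts what the diff above makes explicit: the
// sentinel AllocatorMaxPriority shares the top priority (12002) with
// AllocatorFinalizeAtomicReplicationChange, and AllocatorRemoveLearner sits
// one point below rather than a full 100-point category apart (one of the
// documented exceptions).
func TestPriorityContractSketch(t *testing.T) {
	if got := AllocatorMaxPriority.Priority(); got != 12002 {
		t.Fatalf("AllocatorMaxPriority.Priority() = %v, want 12002", got)
	}
	if AllocatorFinalizeAtomicReplicationChange.Priority() != AllocatorMaxPriority.Priority() {
		t.Fatal("expected the atomic-change finalizer to share the max priority")
	}
	if got := AllocatorRemoveLearner.Priority(); got != 12001 {
		t.Fatalf("AllocatorRemoveLearner.Priority() = %v, want 12001", got)
	}
}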
@@ -946,10 +974,68 @@ func (a *Allocator) ComputeAction(
 		return action, action.Priority()
 	}

-	return a.computeAction(ctx, storePool, conf, desc.Replicas().VoterDescriptors(),
+	action, priority = a.computeAction(ctx, storePool, conf, desc.Replicas().VoterDescriptors(),
 		desc.Replicas().NonVoterDescriptors())
+	// Ensure that priority is never -1. Typically, computeAction returns
+	// action.Priority(), but we sometimes modify the priority for specific
+	// actions like AllocatorAddVoter, AllocatorRemoveDeadVoter, and
+	// AllocatorRemoveVoter. A priority of -1 is a special case, indicating
+	// that the caller expects the processing logic to be invoked even if
+	// there is a priority inversion. If the priority is not -1, the range
+	// might be re-queued to be processed with the correct priority.
+	if priority == -1 {
+		if buildutil.CrdbTestBuild {
+			log.Fatalf(ctx, "allocator returned -1 priority for range %s: %v", desc, action)
+		} else {
+			log.Warningf(ctx, "allocator returned -1 priority for range %s: %v", desc, action)
+		}
+	}
+	return action, priority
 }

+// computeAction determines the action to take on a range along with its
+// priority.
+//
+// NB: The returned priority may include a small adjustment and therefore might
+// not exactly match action.Priority(). See AllocatorAddVoter,
+// AllocatorRemoveDeadVoter, and AllocatorRemoveVoter below. The adjustment
+// should be <49 under the two assumptions below. New uses of this contract
+// should be avoided since the assumptions are not strong guarantees
+// (especially the second one).
+//
+// The claim that the adjustment is <49 rests on two assumptions:
+// 1. min(num_replicas,total_nodes) in the zone configuration is < 98.
+// 2. when ranges are not under-replicated, the difference between
+//    min(num_replicas,total_nodes)/2-1 and existing_replicas is < 49.
+//
+// neededVoters <= min(num_replicas,total_nodes)
+// desiredQuorum = neededVoters/2-1
+// quorum = haveVoters/2-1
+//
+// For AllocatorAddVoter, we know haveVoters < neededVoters:
+// adjustment = desiredQuorum-haveVoters = neededVoters/2-1-haveVoters
+// To find the worst case (largest adjustment):
+// 1. haveVoters = neededVoters-1:
+//    adjustment = neededVoters/2-1-(neededVoters-1)
+//               = neededVoters/2-neededVoters = -neededVoters/2
+// 2. haveVoters = 0:
+//    adjustment = neededVoters/2-1
+//
+// In order for the adjustment to be <49, neededVoters/2<49 => neededVoters<98.
+// Hence the first assumption.
+//
+// For AllocatorRemoveDeadVoter, we know haveVoters >= neededVoters:
+// adjustment = desiredQuorum-haveVoters = neededVoters/2-1-haveVoters
+// To find the worst case (largest adjustment):
+// 1. neededVoters/2-1 is much larger than haveVoters: given haveVoters >=
+//    neededVoters, haveVoters/2-1 >= neededVoters/2-1, so this case is
+//    impossible.
+// 2. neededVoters/2-1 is much smaller than haveVoters: since ranges can be
+//    over-replicated, there is theoretically no upper bound on haveVoters. In
+//    order for the adjustment to be <49, we can only assume that the
+//    difference between neededVoters/2-1 and haveVoters is never >= 49 in
+//    this case.
+//
+// For AllocatorRemoveVoter, the adjustment is haveVoters%2 = 0 or 1, which is <49.
 func (a *Allocator) computeAction(
 	ctx context.Context,
 	storePool storepool.AllocatorStorePool,
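To make the worst-case arithmetic in the comment concrete, here is a small standalone program that evaluates the AllocatorAddVoter adjustment formula with Go's integer division; addVoterAdjustment is an illustrative name, not a function in this patch:

package main

import "fmt"

// addVoterAdjustment mirrors the comment's AllocatorAddVoter formula:
// adjustment = desiredQuorum - haveVoters = neededVoters/2 - 1 - haveVoters.
func addVoterAdjustment(neededVoters, haveVoters int) int {
	return neededVoters/2 - 1 - haveVoters
}

func main() {
	fmt.Println(addVoterAdjustment(5, 4))  // under-replicated by one: 5/2-1-4 = -3
	fmt.Println(addVoterAdjustment(5, 0))  // no voters yet: 5/2-1 = 1
	fmt.Println(addVoterAdjustment(97, 0)) // near the assumed bound: 97/2-1 = 47 (< 49)
}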
@@ -3220,3 +3306,65 @@ func replDescsToStoreIDs(descs []roachpb.ReplicaDescriptor) []roachpb.StoreID {
 	}
 	return ret
 }
+
+// roundToNearestPriorityCategory rounds a priority to the nearest 100. n
+// should be non-negative.
+func roundToNearestPriorityCategory(n float64) float64 {
+	return math.Round(n/100.0) * 100
+}
+
+// CheckPriorityInversion returns whether there was a priority inversion (and
+// the range should not be processed at this time, since doing so could starve
+// higher-priority items), and whether the caller should re-add the range to
+// the queue (presumably under its new priority). A priority inversion happens
+// if the priority at enqueue time is higher than the priority corresponding
+// to the action computed at processing time. The caller should re-add the
+// range to the queue if it has gone from a repair action to the lowest
+// priority (AllocatorConsiderRebalance).
+//
+// Note: Changing from AllocatorRangeUnavailable/AllocatorNoop to
+// AllocatorConsiderRebalance is not treated as a priority inversion. Going
+// from a repair action to AllocatorRangeUnavailable/AllocatorNoop is
+// considered a priority inversion, but shouldRequeue = false.
+//
+// INVARIANT: shouldRequeue => isInversion
+func CheckPriorityInversion(
+	priorityAtEnqueue float64, actionAtProcessing AllocatorAction,
+) (isInversion bool, shouldRequeue bool) {
+	// NB: priorityAtEnqueue is -1 for callers such as scatter, dry runs, and
+	// manual queue runs. Priority inversion does not apply to these calls.
+	if priorityAtEnqueue == -1 {
+		return false, false
+	}
+
+	// NB: we need to check whether priorityAtEnqueue falls within the range
+	// of the allocator actions because store.Enqueue might enqueue things
+	// with a very high priority (1e5). In those cases, we do not want to
+	// requeue these actions or count them as inversions.
+	withinPriorityRange := func(priority float64) bool {
+		return AllocatorNoop.Priority() <= priority && priority <= AllocatorMaxPriority.Priority()
+	}
+	if !withinPriorityRange(priorityAtEnqueue) {
+		return false, false
+	}
+
+	if priorityAtEnqueue > AllocatorConsiderRebalance.Priority() && actionAtProcessing == AllocatorConsiderRebalance {
+		return true, true
+	}
+
+	// NB: Usually, the priority at enqueue time corresponds to
+	// action.Priority(). However, for AllocatorAddVoter,
+	// AllocatorRemoveDeadVoter, and AllocatorRemoveVoter, the priority can be
+	// adjusted at enqueue time (see ComputeAction for more details). We
+	// expect the adjustment to be relatively small (<49), so we round the
+	// priority to the nearest 100 before comparing against
+	// actionAtProcessing.Priority(). Without this rounding, we might treat an
+	// enqueue-time priority of 10001 (a base of 10000 plus a small
+	// adjustment) processed as an action with priority 10000 as an inversion,
+	// even though the difference is only the adjustment. Note that priorities
+	// for AllocatorFinalizeAtomicReplicationChange, AllocatorRemoveLearner,
+	// and AllocatorReplaceDeadVoter round to the same category. They are so
+	// close to each other that we do not count inversions among them.
+	normPriorityAtEnqueue := roundToNearestPriorityCategory(priorityAtEnqueue)
+	isInversion = normPriorityAtEnqueue > actionAtProcessing.Priority()
+	return isInversion, false
+}
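Below is a sketch of how a processing loop might consume the two return values; only CheckPriorityInversion and the metric names in the comments come from this patch, while the helper, counters, and requeue callback are assumptions (the real replicate queue plumbing differs):

package example

import (
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/allocator/allocatorimpl"
	"github.com/cockroachdb/cockroach/pkg/util/metric"
)

// processOne decides whether to go ahead with processing a replica whose
// enqueue-time priority was priorityAtEnqueue and whose freshly computed
// action is actionAtProcessing.
func processOne(
	priorityAtEnqueue float64,
	actionAtProcessing allocatorimpl.AllocatorAction,
	inversionTotal, inversionRequeue *metric.Counter,
	requeue func(),
) (process bool) {
	isInversion, shouldRequeue := allocatorimpl.CheckPriorityInversion(priorityAtEnqueue, actionAtProcessing)
	if isInversion {
		inversionTotal.Inc(1) // queue.replicate.priority_inversion.total
	}
	if shouldRequeue {
		inversionRequeue.Inc(1) // queue.replicate.priority_inversion.requeue
		requeue()               // re-add under the new, lower priority
		return false
	}
	return true
}

As a side note on the rounding helper: math.Round rounds half away from zero, so roundToNearestPriorityCategory(949) yields 900 while roundToNearestPriorityCategory(950) yields 1000.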
