@@ -252,6 +252,20 @@ func (a AllocatorAction) SafeValue() {}
252
252
// range. Within a given range, the ordering of the various checks inside
253
253
// `Allocator.computeAction` determines which repair/rebalancing actions are
254
254
// taken before the others.
255
+ //
256
+ // NB: Priorities should be non-negative and should be spaced in multiples of
257
+ // 100 unless you believe they should belong to the same priority category.
258
+ // AllocatorNoop should have the lowest priority. CheckPriorityInversion depends
259
+ // on this contract. In most cases, the allocator returns a priority that
260
+ // matches the definitions below. For AllocatorAddVoter,
261
+ // AllocatorRemoveDeadVoter, and AllocatorRemoveVoter, the priority may be
262
+ // adjusted (see ComputeAction for details), but the adjustment is expected to
263
+ // be small (<49).
264
+ //
265
+ // Exceptions: AllocatorFinalizeAtomicReplicationChange, AllocatorRemoveLearner,
266
+ // and AllocatorReplaceDeadVoter violates the spacing of 100. These cases
267
+ // predate this comment, so we allow them as they belong to the same general
268
+ // priority category.
255
269
func (a AllocatorAction ) Priority () float64 {
256
270
switch a {
257
271
case AllocatorFinalizeAtomicReplicationChange :
@@ -975,6 +989,49 @@ func (a *Allocator) ComputeAction(
975
989
return action , priority
976
990
}
977
991
992
+ // computeAction determines the action to take on a range along with its
993
+ // priority.
994
+ //
995
+ // NB: The returned priority may include a small adjustment and therefore might
996
+ // not exactly match action.Priority(). See AllocatorAddVoter,
997
+ // AllocatorRemoveDeadVoter, AllocatorRemoveVoter below. The adjustment should
998
+ // be <49 with two assumptions below. New uses on this contract should be
999
+ // avoided since the assumptions are not strong guarantees (especially the
1000
+ // second one).
1001
+ //
1002
+ // The claim that the adjustment is < 49 has two assumptions:
1003
+ // 1. min(num_replicas,total_nodes) in zone configuration is < 98.
1004
+ // 2. when ranges are not under-replicated, the difference between
1005
+ // min(num_replicas,total_nodes)/2-1 and existing_replicas is < 49.
1006
+ //
1007
+ // neededVoters <= min(num_replicas,total_nodes)
1008
+ // desiredQuorum = neededVoters/2-1
1009
+ // quorum = haveVoters/2-1
1010
+ //
1011
+ // For AllocatorAddVoter, we know haveVoters < neededVoters
1012
+ // adjustment = desiredQuorum-haveVoters = neededVoters/2-1-haveVoters
1013
+ // To find the worst case (largest adjustment),
1014
+ // 1. haveVoters = neededVoters-1,
1015
+ // adjustment = neededVoters/2-1-(neededVoters-1)
1016
+ // = neededVoters/2-neededVoters = -neededVoters/2
1017
+ // 2. haveVoters = 0
1018
+ // adjustement = neededVoters/2-1
1019
+ //
1020
+ // In order for adjustment to be <49, neededVoters/2<49 => neededVoters<98.
1021
+ // Hence the first assumption.
1022
+ //
1023
+ // For AllocatorRemoveDeadVoter, we know haveVoters >= neededVoters
1024
+ // adjustment = desiredQuorum-haveVoters = neededVoters/2-1-haveVoters
1025
+ // To find the worst case (largest adjustment),
1026
+ // 1. neededVoters/2-1 is much larger than haveVoters: given haveVoters >=
1027
+ // neededVoters, haveVoters/2-1 >= neededVoters/2-1. So this case is impossible.
1028
+ // 2. neededVoters/2-1 is much smaller than haveVoters: since ranges could be
1029
+ // over-replicated, theoretically speaking, there may be no upper bounds on
1030
+ // haveVoters. In order for adjustment to be < 49, we can only make an
1031
+ // assumption here that the difference between neededVoters/2-1 and haveVoters
1032
+ // cannot be >= 49 in this case.
1033
+ //
1034
+ // For AllocatorRemoveVoter, adjustment is haveVoters%2 = 0 or 1 < 49.
978
1035
func (a * Allocator ) computeAction (
979
1036
ctx context.Context ,
980
1037
storePool storepool.AllocatorStorePool ,
@@ -3246,7 +3303,8 @@ func replDescsToStoreIDs(descs []roachpb.ReplicaDescriptor) []roachpb.StoreID {
3246
3303
return ret
3247
3304
}
3248
3305
3249
- // roundToNearestPriorityCategory rounds a priority to the nearest 100. n should be non-negative.
3306
+ // roundToNearestPriorityCategory rounds a priority to the nearest 100. n should
3307
+ // be non-negative.
3250
3308
func roundToNearestPriorityCategory (n float64 ) float64 {
3251
3309
return math .Round (n / 100.0 ) * 100
3252
3310
}
@@ -3257,15 +3315,21 @@ func withinPriorityRange(priority float64) bool {
3257
3315
return AllocatorNoop .Priority () <= priority && priority <= AllocatorFinalizeAtomicReplicationChange .Priority ()
3258
3316
}
3259
3317
3260
- // CheckPriorityInversion checks if the priority at enqueue time is higher than
3261
- // the priority corresponding to the action computed at processing time. It
3262
- // returns whether there was a priority inversion and whether the caller should
3263
- // skip the processing of the range since the inversion is considered unfair.
3264
- // Currently, we only consider the inversion as unfair if it has gone from a
3265
- // repair action to lowest priority (AllocatorConsiderRebalance). We let
3266
- // AllocatorRangeUnavailable, AllocatorNoop pass through since they are noop.
3318
+ // CheckPriorityInversion returns whether there was a priority inversion (and
3319
+ // the range should not be processed at this time, since doing so could starve
3320
+ // higher-priority items), and whether the caller should re-add the range to the
3321
+ // queue (presumably under its new priority). A priority inversion happens if
3322
+ // the priority at enqueue time is higher than the priority corresponding to the
3323
+ // action computed at processing time. Caller should re-add the range to the
3324
+ // queue if it has gone from a repair action to lowest priority
3325
+ // (AllocatorConsiderRebalance).
3326
+ //
3327
+ // Note: Changing from AllocatorRangeUnavailable/AllocatorNoop to
3328
+ // AllocatorConsiderRebalance is not treated as a priority inversion. Going from
3329
+ // a repair action to AllocatorRangeUnavailable/AllocatorNoop is considered a
3330
+ // priority inversion but shouldRequeue = false.
3267
3331
//
3268
- // NB: If shouldRequeue is true, isInversion must be true.
3332
+ // INVARIANT: shouldRequeue => isInversion
3269
3333
func CheckPriorityInversion (
3270
3334
priorityAtEnqueue float64 , actionAtProcessing AllocatorAction ,
3271
3335
) (isInversion bool , shouldRequeue bool ) {
@@ -3292,7 +3356,7 @@ func CheckPriorityInversion(
3292
3356
// action.Priority(). However, for AllocatorAddVoter,
3293
3357
// AllocatorRemoveDeadVoter, AllocatorRemoveVoter, the priority can be
3294
3358
// adjusted at enqueue time (See ComputeAction for more details). However, we
3295
- // expect the adjustment to be relatively small (<100 ). So we round the
3359
+ // expect the adjustment to be relatively small (<49 ). So we round the
3296
3360
// priority to the nearest 100 to compare against
3297
3361
// actionAtProcessing.Priority(). Without this rounding, we might treat going
3298
3362
// from 10000 to 999 as an inversion, but it was just due to the adjustment.
0 commit comments