Skip to content

Commit dc03efb

Browse files
committed
mmaprototype: small cleanup related to negative adjusted loads
Epic: CRDB-55052 Release note: None
1 parent 2d5eaef commit dc03efb

File tree

2 files changed

+31
-20
lines changed

2 files changed

+31
-20
lines changed

pkg/kv/kvserver/allocator/mmaprototype/cluster_state.go

Lines changed: 25 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -772,10 +772,9 @@ type storeState struct {
772772
storeLoad
773773
StoreAttributesAndLocality
774774
adjusted struct {
775-
// TODO: these adjusted load values can become negative due to applying
776-
// pending changes. We need to let them be negative to retain the ability
777-
// to undo pending changes. Audit the mean computation and rebalancing
778-
// code to ensure that we bump up to a lower bound of zero.
775+
// NB: these load values can become negative due to applying pending
776+
// changes. We need to let them be negative to retain the ability to undo
777+
// pending changes.
779778
load LoadVector
780779
secondaryLoad SecondaryLoadVector
781780
// Pending changes for computing loadReplicas and load.
@@ -977,6 +976,7 @@ func newStoreState() *storeState {
977976
type nodeState struct {
978977
stores []roachpb.StoreID
979978
NodeLoad
979+
// NB: adjustedCPU can be negative.
980980
adjustedCPU LoadValue
981981
}
982982

@@ -1780,10 +1780,14 @@ func (cs *clusterState) processStoreLeaseholderMsgInternal(
17801780
// will start shedding replicas, so this is just a heuristic.
17811781
fraction = minLeaseLoadFraction
17821782
}
1783-
threshold := LoadValue(float64(ss.adjusted.load[topk.dim]) * fraction)
1783+
// The max(0, ...) is defensive, in case the adjusted load is negative.
1784+
// Given that this is a most overloaded dim, the likelihood of the
1785+
// adjusted load being negative is very low.
1786+
adjustedStoreLoadValue := max(0, ss.adjusted.load[topk.dim])
1787+
threshold := LoadValue(float64(adjustedStoreLoadValue) * fraction)
17841788
if ss.reportedSecondaryLoad[ReplicaCount] > 0 {
17851789
// Allow all ranges above 90% of the mean. This is quite arbitrary.
1786-
meanLoad := (ss.adjusted.load[topk.dim] * 9) / (ss.reportedSecondaryLoad[ReplicaCount] * 10)
1790+
meanLoad := (adjustedStoreLoadValue * 9) / (ss.reportedSecondaryLoad[ReplicaCount] * 10)
17871791
threshold = min(meanLoad, threshold)
17881792
}
17891793
topk.threshold = threshold
@@ -2209,11 +2213,6 @@ func (cs *clusterState) undoReplicaChange(change ReplicaChange) {
22092213
cs.undoChangeLoadDelta(change)
22102214
}
22112215

2212-
// TODO(kvoli,sumeerbhola): The load of the store and node can become negative
2213-
// when applying or undoing load adjustments. For load adjustments to be
2214-
// reversible quickly, we aren't able to zero out the value when negative. We
2215-
// should handle the negative values when using them.
2216-
22172216
// applyChangeLoadDelta adds the change load delta to the adjusted load of the
22182217
// store and node affected.
22192218
func (cs *clusterState) applyChangeLoadDelta(change ReplicaChange) {
@@ -2416,6 +2415,11 @@ func (cs *clusterState) canShedAndAddLoad(
24162415
if targetSS.adjusted.load[overloadedDim] > 0 {
24172416
overloadedDimFractionIncrease = float64(deltaToAdd[overloadedDim]) /
24182417
float64(targetSS.adjusted.load[overloadedDim])
2418+
} else {
2419+
// Else, the adjusted load on the overloadedDim is zero or negative, which
2420+
// is possible, but extremely rare in practice. We arbitrarily set the
2421+
// fraction increase to 1.0 in this case.
2422+
overloadedDimFractionIncrease = 1.0
24192423
}
24202424
otherDimensionsBecameWorseInTarget := false
24212425
for i := range targetSLS.dimSummary {
@@ -2428,16 +2432,18 @@ func (cs *clusterState) canShedAndAddLoad(
24282432
}
24292433
// This is an overloaded dimension in the target. Only allow small
24302434
// increases along this dimension.
2431-
dimFractionIncrease := math.MaxFloat64
24322435
if targetSS.adjusted.load[dim] > 0 {
2433-
dimFractionIncrease = float64(deltaToAdd[dim]) / float64(targetSS.adjusted.load[dim])
2434-
}
2435-
// The use of 33% is arbitrary.
2436-
if dimFractionIncrease > overloadedDimFractionIncrease/3 {
2437-
log.KvDistribution.Infof(ctx, "%v: %f > %f/3", dim, dimFractionIncrease, overloadedDimFractionIncrease)
2438-
otherDimensionsBecameWorseInTarget = true
2439-
break
2436+
dimFractionIncrease := float64(deltaToAdd[dim]) / float64(targetSS.adjusted.load[dim])
2437+
// The use of 33% is arbitrary.
2438+
if dimFractionIncrease > overloadedDimFractionIncrease/3 {
2439+
log.KvDistribution.Infof(ctx, "%v: %f > %f/3", dim, dimFractionIncrease, overloadedDimFractionIncrease)
2440+
otherDimensionsBecameWorseInTarget = true
2441+
break
2442+
}
24402443
}
2444+
// Else the adjusted load in dimension dim is zero or negative, which is
2445+
// possible, but extremely rare in practice. We ignore this dimension in
2446+
// that case.
24412447
}
24422448
canAddLoad = overloadedDimPermitsChange && !otherDimensionsBecameWorseInTarget &&
24432449
targetSLS.maxFractionPendingIncrease < epsilon &&
@@ -2527,7 +2533,6 @@ func computeLoadSummary(
25272533
var dimSummary [NumLoadDimensions]loadSummary
25282534
var worstDim LoadDimension
25292535
for i := range msl.load {
2530-
// TODO(kvoli,sumeerbhola): Handle negative adjusted store/node loads.
25312536
const nodeIDForLogging = 0
25322537
ls := loadSummaryForDimension(ctx, ss.StoreID, nodeIDForLogging, LoadDimension(i), ss.adjusted.load[i], ss.capacity[i],
25332538
msl.load[i], msl.util[i])

pkg/kv/kvserver/allocator/mmaprototype/load.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -370,6 +370,8 @@ func computeMeansForStoreSet(
370370
clear(scratchStores)
371371
n := 0
372372
for _, storeID := range stores {
373+
// NB: using reported load and not adjusted load, so cannot be
374+
// negative.
373375
nodeID, sload := loadProvider.getStoreReportedLoad(storeID)
374376
if _, ok := scratchStores[storeID]; ok {
375377
continue
@@ -389,6 +391,8 @@ func computeMeansForStoreSet(
389391
}
390392
nLoad := scratchNodes[nodeID]
391393
if nLoad == nil {
394+
// NB: using reported load and not adjusted load, so cannot be
395+
// negative.
392396
scratchNodes[nodeID] = loadProvider.getNodeReportedLoad(nodeID)
393397
}
394398
}
@@ -467,6 +471,8 @@ func (ls loadSummary) SafeFormat(w redact.SafePrinter, _ rune) {
467471
}
468472

469473
// Computes the loadSummary for a particular load dimension.
474+
//
475+
// NB: load can be negative since it may be adjusted load.
470476
func loadSummaryForDimension(
471477
ctx context.Context,
472478
storeID roachpb.StoreID,

0 commit comments

Comments
 (0)