
Commit 288bc25

craig[bot] and tbg committed
Merge #160615
160615: mmaprototype: fix panic when sls <= loadNoChange but nls > loadNoChange r=wenyihu6 a=tbg

When deciding whether a store should shed load, computeCandidatesForReplicaTransfer was checking both the sls (store-level) and nls (node-level) load summaries. If nls indicated overload but sls did not, shedding would proceed anyway. This caused a panic in sortTargetCandidateSetAndPick, which requires loadThreshold > loadNoChange.

The fix is to check only sls when deciding whether a store should shed. If sls <= loadNoChange, the store itself isn't overloaded relative to the candidates that could receive the load. A high nls with a low sls means other stores on the node are causing the node-level overload, so shedding from this store wouldn't help.

Fixes #160569

Co-authored-by: Tobias Grieger <tobias.b.grieger@gmail.com>
2 parents 905a2a4 + 1ae5f84 commit 288bc25

File tree

3 files changed: +121 -3 lines changed


pkg/kv/kvserver/allocator/mmaprototype/allocator_state.go

Lines changed: 4 additions & 2 deletions
@@ -947,8 +947,10 @@ func (cs *clusterState) computeCandidatesForReplicaTransfer(
 	}
 
 	sheddingSLS = cs.computeLoadSummary(ctx, loadSheddingStore, &effectiveMeans.storeLoad, &effectiveMeans.nodeLoad)
-	if sheddingSLS.sls <= loadNoChange && sheddingSLS.nls <= loadNoChange {
-		// In this set of stores, this store no longer looks overloaded.
+	if sheddingSLS.sls <= loadNoChange {
+		// In this set of stores, this store no longer looks overloaded. Note that
+		// we don't consider nls here: if nls is high but sls is low, it means other
+		// stores on the node are causing node-level overload, not this store.
 		passObs.replicaShed(notOverloaded)
 		return candidateSet{}, sheddingSLS
 	}
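For readers outside the allocator, here is a self-contained toy of the decision rule the hunk above changes. The enum values and helper names are illustrative only (the real constants, computeCandidatesForReplicaTransfer, and sortTargetCandidateSetAndPick live in pkg/kv/kvserver/allocator/mmaprototype); the point is simply that only the store-level summary now gates shedding:

```go
package main

import "fmt"

// Illustrative load-summary ordering: higher means more overloaded. The real
// enum lives in mmaprototype; only the relative ordering matters here.
type loadSummary int

const (
	loadLow loadSummary = iota
	loadNormal
	loadNoChange
	overloadSlow
	overloadUrgent
)

// shouldShed mirrors the fixed guard: only the store-level summary (sls)
// decides whether this store sheds. A high node-level summary (nls) with a
// low sls means sibling stores on the node are causing the node-level
// overload, so shedding from this store would not help; proceeding anyway is
// what used to hand sortTargetCandidateSetAndPick a loadThreshold that was
// not > loadNoChange, triggering the panic.
func shouldShed(sls, nls loadSummary) bool {
	_ = nls // intentionally ignored after the fix
	return sls > loadNoChange
}

func main() {
	// The #160569 scenario: the store looks fine in its candidate set (sls at
	// loadNoChange) while its node is overloaded because of a sibling store.
	fmt.Println(shouldShed(loadNoChange, overloadSlow)) // false: bail out early
	fmt.Println(shouldShed(overloadSlow, loadLow))      // true: store itself is overloaded
}
```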

pkg/kv/kvserver/allocator/mmaprototype/cluster_state_test.go

Lines changed: 8 additions & 1 deletion
@@ -413,7 +413,14 @@ func TestClusterState(t *testing.T) {
 
 	// Recursively invoked in `include` directive.
 	var invokeFn func(t *testing.T, d *datadriven.TestData) string
-	invokeFn = func(t *testing.T, d *datadriven.TestData) string {
+	invokeFn = func(t *testing.T, d *datadriven.TestData) (output string) {
+		// Catch panics and return them as output instead of failing the test.
+		// This allows us to write regression tests for panics.
+		defer func() {
+			if r := recover(); r != nil {
+				output = fmt.Sprintf("panic: %v", r)
+			}
+		}()
 		// Start a recording span for each command. Commands that want to
 		// include the trace in their output can call finishAndGet().
 		ctx, finishAndGet := tracing.ContextWithRecordingSpan(
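The `(output string)` named return is what makes the deferred recover useful here: the deferred closure runs after the panic unwinds and can still overwrite the return value, so a datadriven case reports `panic: ...` as its output instead of killing the test binary. A standalone sketch of the same pattern (illustrative, not the test harness itself):

```go
package main

import "fmt"

// runCase converts a panic raised by fn into an ordinary string result, using
// the same defer/recover-with-named-return pattern as the test wrapper above.
func runCase(fn func() string) (output string) {
	defer func() {
		if r := recover(); r != nil {
			output = fmt.Sprintf("panic: %v", r)
		}
	}()
	return fn()
}

func main() {
	fmt.Println(runCase(func() string { return "ok" }))
	// The panic message below is made up for illustration.
	fmt.Println(runCase(func() string { panic("loadThreshold must be greater than loadNoChange") }))
}
```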
Lines changed: 109 additions & 0 deletions
@@ -0,0 +1,109 @@
+# Regression test for issue #160569: panic in sortTargetCandidateSetAndPick
+# when sls <= loadNoChange but nls > loadNoChange.
+#
+# Setup: Node n1 has two stores:
+# - s1: low CPU, moderate writes
+# - s2: very high CPU, replica disposition set to "refusing" (excluded from candidate set)
+# Node n2 has s3.
+#
+# When we try to shed from s1:
+# - Store mean over {s1, s3} (excluding s2 due to disposition)
+# - s1's store-level load relative to this mean is <= loadNoChange
+# - But n1's node CPU (s1+s2) is high, making nls > loadNoChange
+#
+# Before fix: computeCandidatesForReplicaTransfer doesn't bail out when
+# sls <= loadNoChange && nls > loadNoChange, then passes sls as loadThreshold
+# to sortTargetCandidateSetAndPick which requires loadThreshold > loadNoChange.
+
+set-store
+store-id=1 node-id=1
+store-id=2 node-id=1
+store-id=3 node-id=2
+----
+node-id=1 locality-tiers=node=1
+store-id=1 attrs=
+store-id=2 attrs=
+node-id=2 locality-tiers=node=2
+store-id=3 attrs=
+
+# s2 refuses replicas, so it will be filtered out of the candidate set
+# but its CPU still contributes to n1's node load.
+set-store-status store-id=2 replicas=refusing
+----
+ok refusing=replicas
+
+# 8-vCPU machines: capacity = 8e9 ns/s
+# s1: 1 vCPU (1B ns/s), 50% write bandwidth - at cluster level, WriteBandwidth is overloaded
+# (because cluster mean includes s2's low writes, bringing mean down)
+# s2: 6 vCPU (6B ns/s), 10% writes - makes n1's total CPU = 7 vCPU
+# s3: 1 vCPU (1B ns/s), 50% write bandwidth
+#
+# Cluster store write mean = (50M + 10M + 50M) / 3 = 36.7M
+# s1's 50M is ~36% above mean => overloadSlow for WriteBandwidth
+#
+# Candidate set {s1, s3} store write mean = (50M + 50M) / 2 = 50M
+# s1's 50M equals mean => loadNoChange for WriteBandwidth
+#
+# Node CPU: n1 = 7B (s1+s2), n2 = 1B
+# Node mean = (7B + 1B) / 2 = 4B
+# n1's 7B is 75% above mean => overloadSlow for node CPU
+store-load-msg
+store-id=1 node-id=1 load=[1000000000,50000000,0] capacity=[8000000000,100000000,1000000000] secondary-load=0 load-time=0s
+store-id=2 node-id=1 load=[6000000000,10000000,0] capacity=[8000000000,100000000,1000000000] secondary-load=0 load-time=0s
+store-id=3 node-id=2 load=[1000000000,50000000,0] capacity=[8000000000,100000000,1000000000] secondary-load=0 load-time=0s
+----
+
+# Single-replica range on s1.
+store-leaseholder-msg
+store-id=1
+range-id=1 load=[100000000,10000000,0] raft-cpu=50000000
+store-id=1 replica-id=1 type=VOTER_FULL leaseholder=true
+config=num_replicas=1 constraints={} voter_constraints={}
+----
+
+# Before fix: panics in sortTargetCandidateSetAndPick
+# After fix: s1 should not try to shed since sls <= loadNoChange
+rebalance-stores store-id=1
+----
+[mmaid=1] rebalanceStores begins
+[mmaid=1] cluster means: (stores-load [cpu:2.7s/s, write-bandwidth:37 MB/s, byte-size:0 B]) (stores-capacity [cpu:8s/s, write-bandwidth:100 MB/s, byte-size:1.0 GB]) (nodes-cpu-load 4000000000) (nodes-cpu-capacity 12000000000)
+[mmaid=1] load summary for dim=CPURate (s1): loadLow, reason: load is >10% below mean [load=1000000000 meanLoad=2666666666 fractionUsed=12.50% meanUtil=33.33% capacity=8000000000]
+[mmaid=1] load summary for dim=WriteBandwidth (s1): overloadSlow, reason: fractionUsed < 75% [load=50000000 meanLoad=36666666 fractionUsed=50.00% meanUtil=36.67% capacity=100000000]
+[mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000000000]
+[mmaid=1] load summary for dim=CPURate (n1): overloadSlow, reason: fractionUsed < 75% [load=7000000000 meanLoad=4000000000 fractionUsed=43.75% meanUtil=33.33% capacity=16000000000]
+[mmaid=1] evaluating s1: node load overloadSlow, store load overloadSlow, worst dim WriteBandwidth
+[mmaid=1] overload-continued s1 ((store=overloadSlow worst=WriteBandwidth cpu=loadLow writes=overloadSlow bytes=loadNormal node=overloadSlow high_disk=false frac_pending=0.00,0.00(true))) - within grace period
+[mmaid=1] store s1 was added to shedding store list
+[mmaid=1] load summary for dim=CPURate (s2): overloadSlow, reason: fractionUsed < 75% and >1.75x meanUtil [load=6000000000 meanLoad=2666666666 fractionUsed=75.00% meanUtil=33.33% capacity=8000000000]
+[mmaid=1] load summary for dim=WriteBandwidth (s2): loadLow, reason: load is >10% below mean [load=10000000 meanLoad=36666666 fractionUsed=10.00% meanUtil=36.67% capacity=100000000]
+[mmaid=1] load summary for dim=ByteSize (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000000000]
+[mmaid=1] load summary for dim=CPURate (n1): overloadSlow, reason: fractionUsed < 75% [load=7000000000 meanLoad=4000000000 fractionUsed=43.75% meanUtil=33.33% capacity=16000000000]
+[mmaid=1] evaluating s2: node load overloadSlow, store load overloadSlow, worst dim CPURate
+[mmaid=1] overload-continued s2 ((store=overloadSlow worst=CPURate cpu=overloadSlow writes=loadLow bytes=loadNormal node=overloadSlow high_disk=false frac_pending=0.00,0.00(true))) - within grace period
+[mmaid=1] store s2 was added to shedding store list
+[mmaid=1] load summary for dim=CPURate (s3): loadLow, reason: load is >10% below mean [load=1000000000 meanLoad=2666666666 fractionUsed=12.50% meanUtil=33.33% capacity=8000000000]
+[mmaid=1] load summary for dim=WriteBandwidth (s3): overloadSlow, reason: fractionUsed < 75% [load=50000000 meanLoad=36666666 fractionUsed=50.00% meanUtil=36.67% capacity=100000000]
+[mmaid=1] load summary for dim=ByteSize (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000000000]
+[mmaid=1] load summary for dim=CPURate (n2): loadLow, reason: load is >10% below mean [load=1000000000 meanLoad=4000000000 fractionUsed=12.50% meanUtil=33.33% capacity=8000000000]
+[mmaid=1] evaluating s3: node load loadLow, store load overloadSlow, worst dim WriteBandwidth
+[mmaid=1] overload-continued s3 ((store=overloadSlow worst=WriteBandwidth cpu=loadLow writes=overloadSlow bytes=loadNormal node=loadLow high_disk=false frac_pending=0.00,0.00(true))) - within grace period
+[mmaid=1] store s3 was added to shedding store list
+[mmaid=1] start processing shedding store s1: cpu node load overloadSlow, store load overloadSlow, worst dim WriteBandwidth
+[mmaid=1] top-K[WriteBandwidth] ranges for s1 with lease on local s1: r1:[cpu:100ms/s, write-bandwidth:10 MB/s, byte-size:0 B]
+[mmaid=1] skipping lease shedding for calling store s1: not cpu overloaded: loadLow
+[mmaid=1] attempting to shed replicas next
+[mmaid=1] skipping s2 for replica transfer: replica disposition refusing (health ok)
+[mmaid=1] pre-means filtered 1 stores → remaining [1 3], means: store={[1000000000 50000000 0] [8000000000 100000000 1000000000] [0.125 0.5 0] [0 0]} node={4000000000 12000000000 0.3333333333333333}
+[mmaid=1] load summary for dim=CPURate (s1): loadNormal, reason: load is within 5% of mean [load=1000000000 meanLoad=1000000000 fractionUsed=12.50% meanUtil=12.50% capacity=8000000000]
+[mmaid=1] load summary for dim=WriteBandwidth (s1): loadNormal, reason: load is within 5% of mean [load=50000000 meanLoad=50000000 fractionUsed=50.00% meanUtil=50.00% capacity=100000000]
+[mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000000000]
+[mmaid=1] load summary for dim=CPURate (n1): overloadSlow, reason: fractionUsed < 75% [load=7000000000 meanLoad=4000000000 fractionUsed=43.75% meanUtil=33.33% capacity=16000000000]
+[mmaid=1] considering replica-transfer r1 from s1: store load [cpu:1s/s, write-bandwidth:50 MB/s, byte-size:0 B]
+[mmaid=1] candidates are:
+[mmaid=1] result(failed): no candidates found for r1 after exclusions
+[mmaid=1] start processing shedding store s2: cpu node load overloadSlow, store load overloadSlow, worst dim CPURate
+[mmaid=1] no top-K[CPURate] ranges found for s2 with lease on local s1
+[mmaid=1] start processing shedding store s3: cpu node load loadLow, store load overloadSlow, worst dim WriteBandwidth
+[mmaid=1] no top-K[WriteBandwidth] ranges found for s3 with lease on local s1
+[mmaid=1] rebalancing pass failures (store,reason:count): (s1,not-overloaded:1)
+pending(0)
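The means quoted in the testdata comments can be reproduced with a quick back-of-the-envelope calculation (throwaway arithmetic, not allocator code):

```go
package main

import "fmt"

func main() {
	// Cluster-wide store write-bandwidth mean over {s1, s2, s3}.
	clusterWriteMean := (50e6 + 10e6 + 50e6) / 3 // ≈ 36.7 MB/s
	fmt.Printf("cluster write mean: %.1f MB/s, s1 vs mean: +%.0f%%\n",
		clusterWriteMean/1e6, (50e6/clusterWriteMean-1)*100) // s1 is ~36% above the mean

	// Candidate-set mean over {s1, s3} once s2 is filtered out by its disposition.
	candidateWriteMean := (50e6 + 50e6) / 2 // 50 MB/s, so s1 sits exactly at the mean
	fmt.Printf("candidate write mean: %.0f MB/s\n", candidateWriteMean/1e6)

	// Node CPU: n1 carries s1+s2 (7B ns/s), n2 carries s3 (1B ns/s).
	nodeCPUMean := (7e9 + 1e9) / 2 // 4B ns/s
	fmt.Printf("node CPU mean: %.0f ns/s, n1 vs mean: +%.0f%%\n",
		nodeCPUMean, (7e9/nodeCPUMean-1)*100) // n1 is 75% above the mean
}
```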
