
Commit 288bc25

craig[bot] and tbg committed
Merge #160615
160615: mmaprototype: fix panic when sls <= loadNoChange but nls > loadNoChange r=wenyihu6 a=tbg

When deciding whether a store should shed load, computeCandidatesForReplicaTransfer was checking both the sls (store-level) and nls (node-level) load summaries. If nls indicated overload but sls did not, shedding would proceed anyway. This caused a panic in sortTargetCandidateSetAndPick, which requires loadThreshold > loadNoChange.

The fix is to check only sls when deciding whether a store should shed. If sls <= loadNoChange, the store itself isn't overloaded relative to the candidates that could receive the load. A high nls with a low sls means other stores on the node are causing the node-level overload, so shedding from this store wouldn't help.

Fixes #160569

Co-authored-by: Tobias Grieger <tobias.b.grieger@gmail.com>
2 parents 905a2a4 + 1ae5f84 commit 288bc25

File tree

3 files changed: +121 -3 lines changed


pkg/kv/kvserver/allocator/mmaprototype/allocator_state.go

Lines changed: 4 additions & 2 deletions
@@ -947,8 +947,10 @@ func (cs *clusterState) computeCandidatesForReplicaTransfer(
 	}
 
 	sheddingSLS = cs.computeLoadSummary(ctx, loadSheddingStore, &effectiveMeans.storeLoad, &effectiveMeans.nodeLoad)
-	if sheddingSLS.sls <= loadNoChange && sheddingSLS.nls <= loadNoChange {
-		// In this set of stores, this store no longer looks overloaded.
+	if sheddingSLS.sls <= loadNoChange {
+		// In this set of stores, this store no longer looks overloaded. Note that
+		// we don't consider nls here: if nls is high but sls is low, it means other
+		// stores on the node are causing node-level overload, not this store.
 		passObs.replicaShed(notOverloaded)
 		return candidateSet{}, sheddingSLS
 	}
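For readers outside the allocator, here is a self-contained toy of the decision rule the hunk above changes. The enum values and helper names are illustrative only (the real constants, computeCandidatesForReplicaTransfer, and sortTargetCandidateSetAndPick live in pkg/kv/kvserver/allocator/mmaprototype); the point is simply that only the store-level summary now gates shedding:

```go
package main

import "fmt"

// Illustrative load-summary ordering: higher means more overloaded. The real
// enum lives in mmaprototype; only the relative ordering matters here.
type loadSummary int

const (
	loadLow loadSummary = iota
	loadNormal
	loadNoChange
	overloadSlow
	overloadUrgent
)

// shouldShed mirrors the fixed guard: only the store-level summary (sls)
// decides whether this store sheds. A high node-level summary (nls) with a
// low sls means sibling stores on the node are causing the node-level
// overload, so shedding from this store would not help; proceeding anyway is
// what used to hand sortTargetCandidateSetAndPick a loadThreshold that was
// not > loadNoChange, triggering the panic.
func shouldShed(sls, nls loadSummary) bool {
	_ = nls // intentionally ignored after the fix
	return sls > loadNoChange
}

func main() {
	// The #160569 scenario: the store looks fine in its candidate set (sls at
	// loadNoChange) while its node is overloaded because of a sibling store.
	fmt.Println(shouldShed(loadNoChange, overloadSlow)) // false: bail out early
	fmt.Println(shouldShed(overloadSlow, loadLow))      // true: store itself is overloaded
}
```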

pkg/kv/kvserver/allocator/mmaprototype/cluster_state_test.go

Lines changed: 8 additions & 1 deletion
@@ -413,7 +413,14 @@ func TestClusterState(t *testing.T) {
 
 	// Recursively invoked in `include` directive.
 	var invokeFn func(t *testing.T, d *datadriven.TestData) string
-	invokeFn = func(t *testing.T, d *datadriven.TestData) string {
+	invokeFn = func(t *testing.T, d *datadriven.TestData) (output string) {
+		// Catch panics and return them as output instead of failing the test.
+		// This allows us to write regression tests for panics.
+		defer func() {
+			if r := recover(); r != nil {
+				output = fmt.Sprintf("panic: %v", r)
+			}
+		}()
 		// Start a recording span for each command. Commands that want to
 		// include the trace in their output can call finishAndGet().
 		ctx, finishAndGet := tracing.ContextWithRecordingSpan(
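The `(output string)` named return is what makes the deferred recover useful here: the deferred closure runs after the panic unwinds and can still overwrite the return value, so a datadriven case reports `panic: ...` as its output instead of killing the test binary. A standalone sketch of the same pattern (illustrative, not the test harness itself):

```go
package main

import "fmt"

// runCase converts a panic raised by fn into an ordinary string result, using
// the same defer/recover-with-named-return pattern as the test wrapper above.
func runCase(fn func() string) (output string) {
	defer func() {
		if r := recover(); r != nil {
			output = fmt.Sprintf("panic: %v", r)
		}
	}()
	return fn()
}

func main() {
	fmt.Println(runCase(func() string { return "ok" }))
	// The panic message below is made up for illustration.
	fmt.Println(runCase(func() string { panic("loadThreshold must be greater than loadNoChange") }))
}
```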
Lines changed: 109 additions & 0 deletions
@@ -0,0 +1,109 @@
+# Regression test for issue #160569: panic in sortTargetCandidateSetAndPick
+# when sls <= loadNoChange but nls > loadNoChange.
+#
+# Setup: Node n1 has two stores:
+# - s1: low CPU, moderate writes
+# - s2: very high CPU, replica disposition set to "refusing" (excluded from candidate set)
+# Node n2 has s3.
+#
+# When we try to shed from s1:
+# - Store mean over {s1, s3} (excluding s2 due to disposition)
+# - s1's store-level load relative to this mean is <= loadNoChange
+# - But n1's node CPU (s1+s2) is high, making nls > loadNoChange
+#
+# Before fix: computeCandidatesForReplicaTransfer doesn't bail out when
+# sls <= loadNoChange && nls > loadNoChange, then passes sls as loadThreshold
+# to sortTargetCandidateSetAndPick which requires loadThreshold > loadNoChange.
+
+set-store
+store-id=1 node-id=1
+store-id=2 node-id=1
+store-id=3 node-id=2
+----
+node-id=1 locality-tiers=node=1
+store-id=1 attrs=
+store-id=2 attrs=
+node-id=2 locality-tiers=node=2
+store-id=3 attrs=
+
+# s2 refuses replicas, so it will be filtered out of the candidate set
+# but its CPU still contributes to n1's node load.
+set-store-status store-id=2 replicas=refusing
+----
+ok refusing=replicas
+
+# 8-vCPU machines: capacity = 8e9 ns/s
+# s1: 1 vCPU (1B ns/s), 50% write bandwidth - at cluster level, WriteBandwidth is overloaded
+# (because cluster mean includes s2's low writes, bringing mean down)
+# s2: 6 vCPU (6B ns/s), 10% writes - makes n1's total CPU = 7 vCPU
+# s3: 1 vCPU (1B ns/s), 50% write bandwidth
+#
+# Cluster store write mean = (50M + 10M + 50M) / 3 = 36.7M
+# s1's 50M is ~36% above mean => overloadSlow for WriteBandwidth
+#
+# Candidate set {s1, s3} store write mean = (50M + 50M) / 2 = 50M
+# s1's 50M equals mean => loadNoChange for WriteBandwidth
+#
+# Node CPU: n1 = 7B (s1+s2), n2 = 1B
+# Node mean = (7B + 1B) / 2 = 4B
+# n1's 7B is 75% above mean => overloadSlow for node CPU
+store-load-msg
+store-id=1 node-id=1 load=[1000000000,50000000,0] capacity=[8000000000,100000000,1000000000] secondary-load=0 load-time=0s
+store-id=2 node-id=1 load=[6000000000,10000000,0] capacity=[8000000000,100000000,1000000000] secondary-load=0 load-time=0s
+store-id=3 node-id=2 load=[1000000000,50000000,0] capacity=[8000000000,100000000,1000000000] secondary-load=0 load-time=0s
+----
+
+# Single-replica range on s1.
+store-leaseholder-msg
+store-id=1
+range-id=1 load=[100000000,10000000,0] raft-cpu=50000000
+store-id=1 replica-id=1 type=VOTER_FULL leaseholder=true
+config=num_replicas=1 constraints={} voter_constraints={}
+----
+
+# Before fix: panics in sortTargetCandidateSetAndPick
+# After fix: s1 should not try to shed since sls <= loadNoChange
+rebalance-stores store-id=1
+----
+[mmaid=1] rebalanceStores begins
+[mmaid=1] cluster means: (stores-load [cpu:2.7s/s, write-bandwidth:37 MB/s, byte-size:0 B]) (stores-capacity [cpu:8s/s, write-bandwidth:100 MB/s, byte-size:1.0 GB]) (nodes-cpu-load 4000000000) (nodes-cpu-capacity 12000000000)
+[mmaid=1] load summary for dim=CPURate (s1): loadLow, reason: load is >10% below mean [load=1000000000 meanLoad=2666666666 fractionUsed=12.50% meanUtil=33.33% capacity=8000000000]
+[mmaid=1] load summary for dim=WriteBandwidth (s1): overloadSlow, reason: fractionUsed < 75% [load=50000000 meanLoad=36666666 fractionUsed=50.00% meanUtil=36.67% capacity=100000000]
+[mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000000000]
+[mmaid=1] load summary for dim=CPURate (n1): overloadSlow, reason: fractionUsed < 75% [load=7000000000 meanLoad=4000000000 fractionUsed=43.75% meanUtil=33.33% capacity=16000000000]
+[mmaid=1] evaluating s1: node load overloadSlow, store load overloadSlow, worst dim WriteBandwidth
+[mmaid=1] overload-continued s1 ((store=overloadSlow worst=WriteBandwidth cpu=loadLow writes=overloadSlow bytes=loadNormal node=overloadSlow high_disk=false frac_pending=0.00,0.00(true))) - within grace period
+[mmaid=1] store s1 was added to shedding store list
+[mmaid=1] load summary for dim=CPURate (s2): overloadSlow, reason: fractionUsed < 75% and >1.75x meanUtil [load=6000000000 meanLoad=2666666666 fractionUsed=75.00% meanUtil=33.33% capacity=8000000000]
+[mmaid=1] load summary for dim=WriteBandwidth (s2): loadLow, reason: load is >10% below mean [load=10000000 meanLoad=36666666 fractionUsed=10.00% meanUtil=36.67% capacity=100000000]
+[mmaid=1] load summary for dim=ByteSize (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000000000]
+[mmaid=1] load summary for dim=CPURate (n1): overloadSlow, reason: fractionUsed < 75% [load=7000000000 meanLoad=4000000000 fractionUsed=43.75% meanUtil=33.33% capacity=16000000000]
+[mmaid=1] evaluating s2: node load overloadSlow, store load overloadSlow, worst dim CPURate
+[mmaid=1] overload-continued s2 ((store=overloadSlow worst=CPURate cpu=overloadSlow writes=loadLow bytes=loadNormal node=overloadSlow high_disk=false frac_pending=0.00,0.00(true))) - within grace period
+[mmaid=1] store s2 was added to shedding store list
+[mmaid=1] load summary for dim=CPURate (s3): loadLow, reason: load is >10% below mean [load=1000000000 meanLoad=2666666666 fractionUsed=12.50% meanUtil=33.33% capacity=8000000000]
+[mmaid=1] load summary for dim=WriteBandwidth (s3): overloadSlow, reason: fractionUsed < 75% [load=50000000 meanLoad=36666666 fractionUsed=50.00% meanUtil=36.67% capacity=100000000]
+[mmaid=1] load summary for dim=ByteSize (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000000000]
+[mmaid=1] load summary for dim=CPURate (n2): loadLow, reason: load is >10% below mean [load=1000000000 meanLoad=4000000000 fractionUsed=12.50% meanUtil=33.33% capacity=8000000000]
+[mmaid=1] evaluating s3: node load loadLow, store load overloadSlow, worst dim WriteBandwidth
+[mmaid=1] overload-continued s3 ((store=overloadSlow worst=WriteBandwidth cpu=loadLow writes=overloadSlow bytes=loadNormal node=loadLow high_disk=false frac_pending=0.00,0.00(true))) - within grace period
+[mmaid=1] store s3 was added to shedding store list
+[mmaid=1] start processing shedding store s1: cpu node load overloadSlow, store load overloadSlow, worst dim WriteBandwidth
+[mmaid=1] top-K[WriteBandwidth] ranges for s1 with lease on local s1: r1:[cpu:100ms/s, write-bandwidth:10 MB/s, byte-size:0 B]
+[mmaid=1] skipping lease shedding for calling store s1: not cpu overloaded: loadLow
+[mmaid=1] attempting to shed replicas next
+[mmaid=1] skipping s2 for replica transfer: replica disposition refusing (health ok)
+[mmaid=1] pre-means filtered 1 stores → remaining [1 3], means: store={[1000000000 50000000 0] [8000000000 100000000 1000000000] [0.125 0.5 0] [0 0]} node={4000000000 12000000000 0.3333333333333333}
+[mmaid=1] load summary for dim=CPURate (s1): loadNormal, reason: load is within 5% of mean [load=1000000000 meanLoad=1000000000 fractionUsed=12.50% meanUtil=12.50% capacity=8000000000]
+[mmaid=1] load summary for dim=WriteBandwidth (s1): loadNormal, reason: load is within 5% of mean [load=50000000 meanLoad=50000000 fractionUsed=50.00% meanUtil=50.00% capacity=100000000]
+[mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000000000]
+[mmaid=1] load summary for dim=CPURate (n1): overloadSlow, reason: fractionUsed < 75% [load=7000000000 meanLoad=4000000000 fractionUsed=43.75% meanUtil=33.33% capacity=16000000000]
+[mmaid=1] considering replica-transfer r1 from s1: store load [cpu:1s/s, write-bandwidth:50 MB/s, byte-size:0 B]
+[mmaid=1] candidates are:
+[mmaid=1] result(failed): no candidates found for r1 after exclusions
+[mmaid=1] start processing shedding store s2: cpu node load overloadSlow, store load overloadSlow, worst dim CPURate
+[mmaid=1] no top-K[CPURate] ranges found for s2 with lease on local s1
+[mmaid=1] start processing shedding store s3: cpu node load loadLow, store load overloadSlow, worst dim WriteBandwidth
+[mmaid=1] no top-K[WriteBandwidth] ranges found for s3 with lease on local s1
+[mmaid=1] rebalancing pass failures (store,reason:count): (s1,not-overloaded:1)
+pending(0)
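The means quoted in the testdata comments can be reproduced with a quick back-of-the-envelope calculation (throwaway arithmetic, not allocator code):

```go
package main

import "fmt"

func main() {
	// Cluster-wide store write-bandwidth mean over {s1, s2, s3}.
	clusterWriteMean := (50e6 + 10e6 + 50e6) / 3 // ≈ 36.7 MB/s
	fmt.Printf("cluster write mean: %.1f MB/s, s1 vs mean: +%.0f%%\n",
		clusterWriteMean/1e6, (50e6/clusterWriteMean-1)*100) // s1 is ~36% above the mean

	// Candidate-set mean over {s1, s3} once s2 is filtered out by its disposition.
	candidateWriteMean := (50e6 + 50e6) / 2 // 50 MB/s, so s1 sits exactly at the mean
	fmt.Printf("candidate write mean: %.0f MB/s\n", candidateWriteMean/1e6)

	// Node CPU: n1 carries s1+s2 (7B ns/s), n2 carries s3 (1B ns/s).
	nodeCPUMean := (7e9 + 1e9) / 2 // 4B ns/s
	fmt.Printf("node CPU mean: %.0f ns/s, n1 vs mean: +%.0f%%\n",
		nodeCPUMean, (7e9/nodeCPUMean-1)*100) // n1 is 75% above the mean
}
```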
