# This test verifies mma's candidate exclusion and shedding behavior based on
# store status. Note that this test does not verify the correctness of the
# actual translation from store pool status to mma status. That is tested in
# the mmaintegration package.
#
# Setup: 3 stores
# - s1: source store (overloaded, wants to shed)
# - s2: always available (good target)
# - s3: test store (status changes to test each scenario)
set-store
 store-id=1 node-id=1
 store-id=2 node-id=2
 store-id=3 node-id=3
----
node-id=1 locality-tiers=node=1
 store-id=1 attrs=
node-id=2 locality-tiers=node=2
 store-id=2 attrs=
node-id=3 locality-tiers=node=3
 store-id=3 attrs=

# s1 is overloaded, s2 and s3 are low load
store-load-msg
 store-id=1 node-id=1 load=[1000,0,0] capacity=[1000,1000,1000] secondary-load=0 load-time=0s
 store-id=2 node-id=2 load=[100,0,0] capacity=[1000,1000,1000] secondary-load=0 load-time=0s
 store-id=3 node-id=3 load=[100,0,0] capacity=[1000,1000,1000] secondary-load=0 load-time=0s
----

# Range r1: lease on s1, replicas on s1, s2, s3
store-leaseholder-msg
store-id=1
 range-id=1 load=[100,0,0] raft-cpu=100
 store-id=1 replica-id=1 type=VOTER_FULL leaseholder=true
 store-id=2 replica-id=2 type=VOTER_FULL
 store-id=3 replica-id=3 type=VOTER_FULL
 config=num_replicas=3 constraints={} voter_constraints={}
----

# Baseline: all stores available
retain-ready-replica-target-stores-only in=(2,3)
----
[2 3]

retain-ready-lease-target-stores-only in=(2,3) range-id=1
----
[2 3]

# Dead: excluded from all targets (shedding leases, shedding replicas)
set-store-status store-id=3 health=dead leases=shedding replicas=shedding
----
dead shedding=leases,replicas

retain-ready-replica-target-stores-only in=(2,3)
----
skipping s3 for replica transfer: replica disposition shedding (health dead)
[2]

retain-ready-lease-target-stores-only in=(2,3) range-id=1
----
skipping s3 for lease transfer: lease disposition shedding (health dead)
[2]

# Unknown: excluded from all targets (refusing leases, refusing replicas)
set-store-status store-id=3 health=unknown leases=refusing replicas=refusing
----
unknown refusing=leases,replicas

retain-ready-replica-target-stores-only in=(2,3)
----
skipping s3 for replica transfer: replica disposition refusing (health unknown)
[2]

retain-ready-lease-target-stores-only in=(2,3) range-id=1
----
skipping s3 for lease transfer: lease disposition refusing (health unknown)
[2]

# Decommissioning: excluded from all targets (shedding leases, shedding replicas)
set-store-status store-id=3 health=ok leases=shedding replicas=shedding
----
ok shedding=leases,replicas

retain-ready-replica-target-stores-only in=(2,3)
----
skipping s3 for replica transfer: replica disposition shedding (health ok)
[2]

retain-ready-lease-target-stores-only in=(2,3) range-id=1
----
skipping s3 for lease transfer: lease disposition shedding (health ok)
[2]

# Draining: excluded from all targets (shedding leases, refusing replicas)
set-store-status store-id=3 health=ok leases=shedding replicas=refusing
----
ok refusing=replicas shedding=leases

retain-ready-replica-target-stores-only in=(2,3)
----
skipping s3 for replica transfer: replica disposition refusing (health ok)
[2]

retain-ready-lease-target-stores-only in=(2,3) range-id=1
----
skipping s3 for lease transfer: lease disposition shedding (health ok)
[2]

# Suspect: excluded from all targets (shedding leases, refusing replicas)
set-store-status store-id=3 health=unhealthy leases=shedding replicas=refusing
----
unhealthy refusing=replicas shedding=leases

retain-ready-replica-target-stores-only in=(2,3)
----
skipping s3 for replica transfer: replica disposition refusing (health unhealthy)
[2]

retain-ready-lease-target-stores-only in=(2,3) range-id=1
----
skipping s3 for lease transfer: lease disposition shedding (health unhealthy)
[2]

# Throttled: excluded from replica targets, can receive leases
set-store-status store-id=3 health=ok leases=ok replicas=refusing
----
ok refusing=replicas

retain-ready-replica-target-stores-only in=(2,3)
----
skipping s3 for replica transfer: replica disposition refusing (health ok)
[2]

retain-ready-lease-target-stores-only in=(2,3) range-id=1
----
[2 3]

# Available: accepts everything
set-store-status store-id=3 health=ok leases=ok replicas=ok
----
ok accepting all

retain-ready-replica-target-stores-only in=(2,3)
----
[2 3]

retain-ready-lease-target-stores-only in=(2,3) range-id=1
----
[2 3]

# Rebalance test: verify s3 (dead) is excluded during actual rebalance
set-store-status store-id=3 health=dead leases=shedding replicas=shedding
----
dead shedding=leases,replicas

rebalance-stores store-id=1
----
[mmaid=1] rebalanceStores begins
[mmaid=1] cluster means: (stores-load [cpu:400ns/s, write-bandwidth:0 B/s, byte-size:0 B]) (stores-capacity [cpu:1µs/s, write-bandwidth:1.0 kB/s, byte-size:1.0 kB]) (nodes-cpu-load 400) (nodes-cpu-capacity 1000)
[mmaid=1] load summary for dim=CPURate (s1): overloadUrgent, reason: fractionUsed > 90% [load=1000 meanLoad=400 fractionUsed=100.00% meanUtil=40.00% capacity=1000]
[mmaid=1] load summary for dim=WriteBandwidth (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000]
[mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000]
[mmaid=1] load summary for dim=CPURate (n1): overloadUrgent, reason: fractionUsed > 90% [load=1000 meanLoad=400 fractionUsed=100.00% meanUtil=40.00% capacity=1000]
[mmaid=1] evaluating s1: node load overloadUrgent, store load overloadUrgent, worst dim CPURate
[mmaid=1] overload-continued s1 ((store=overloadUrgent worst=CPURate cpu=overloadUrgent writes=loadNormal bytes=loadNormal node=overloadUrgent high_disk=false frac_pending=0.00,0.00(true))) - within grace period
[mmaid=1] store s1 was added to shedding store list
[mmaid=1] load summary for dim=CPURate (s2): loadLow, reason: load is >10% below mean [load=100 meanLoad=400 fractionUsed=10.00% meanUtil=40.00% capacity=1000]
[mmaid=1] load summary for dim=WriteBandwidth (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000]
[mmaid=1] load summary for dim=ByteSize (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000]
[mmaid=1] load summary for dim=CPURate (n2): loadLow, reason: load is >10% below mean [load=100 meanLoad=400 fractionUsed=10.00% meanUtil=40.00% capacity=1000]
[mmaid=1] evaluating s2: node load loadLow, store load loadNormal, worst dim WriteBandwidth
[mmaid=1] load summary for dim=CPURate (s3): loadLow, reason: load is >10% below mean [load=100 meanLoad=400 fractionUsed=10.00% meanUtil=40.00% capacity=1000]
[mmaid=1] load summary for dim=WriteBandwidth (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000]
[mmaid=1] load summary for dim=ByteSize (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000]
[mmaid=1] load summary for dim=CPURate (n3): loadLow, reason: load is >10% below mean [load=100 meanLoad=400 fractionUsed=10.00% meanUtil=40.00% capacity=1000]
[mmaid=1] evaluating s3: node load loadLow, store load loadNormal, worst dim WriteBandwidth
[mmaid=1] start processing shedding store s1: cpu node load overloadUrgent, store load overloadUrgent, worst dim CPURate
[mmaid=1] top-K[CPURate] ranges for s1 with lease on local s1: r1:[cpu:100ns/s, write-bandwidth:0 B/s, byte-size:0 B]
[mmaid=1] local store s1 is CPU overloaded (overloadUrgent >= overloadSlow), attempting lease transfers first
[mmaid=1] considering lease-transfer r1 from s1: candidates are [2 3]
[mmaid=1] skipping s3 for lease transfer: lease disposition shedding (health dead)
[mmaid=1] load summary for dim=CPURate (s1): overloadUrgent, reason: fractionUsed > 90% [load=1000 meanLoad=550 fractionUsed=100.00% meanUtil=55.00% capacity=1000]
[mmaid=1] load summary for dim=WriteBandwidth (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000]
[mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000]
[mmaid=1] load summary for dim=CPURate (n1): overloadUrgent, reason: fractionUsed > 90% [load=1000 meanLoad=550 fractionUsed=100.00% meanUtil=55.00% capacity=1000]
[mmaid=1] load summary for dim=CPURate (s2): loadLow, reason: load is >10% below mean [load=100 meanLoad=550 fractionUsed=10.00% meanUtil=55.00% capacity=1000]
[mmaid=1] load summary for dim=WriteBandwidth (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000]
[mmaid=1] load summary for dim=ByteSize (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000]
[mmaid=1] load summary for dim=CPURate (n2): loadLow, reason: load is >10% below mean [load=100 meanLoad=550 fractionUsed=10.00% meanUtil=55.00% capacity=1000]
[mmaid=1] sortTargetCandidateSetAndPick: candidates: s2(SLS:loadNormal, overloadedDimLoadSummary:loadLow), overloadedDim:CPURate, picked s2
[mmaid=1] load summary for dim=CPURate (s2): loadLow, reason: load is >10% below mean [load=100 meanLoad=550 fractionUsed=10.00% meanUtil=55.00% capacity=1000]
[mmaid=1] load summary for dim=WriteBandwidth (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000]
[mmaid=1] load summary for dim=ByteSize (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000]
[mmaid=1] load summary for dim=CPURate (n2): loadLow, reason: load is >10% below mean [load=100 meanLoad=550 fractionUsed=10.00% meanUtil=55.00% capacity=1000]
[mmaid=1] load summary for dim=CPURate (s1): overloadUrgent, reason: fractionUsed > 90% [load=1000 meanLoad=550 fractionUsed=100.00% meanUtil=55.00% capacity=1000]
[mmaid=1] load summary for dim=WriteBandwidth (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000]
[mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000]
[mmaid=1] load summary for dim=CPURate (n1): overloadUrgent, reason: fractionUsed > 90% [load=1000 meanLoad=550 fractionUsed=100.00% meanUtil=55.00% capacity=1000]
[mmaid=1] can add load to n2s2: true targetSLS[(store=loadNormal worst=WriteBandwidth cpu=loadLow writes=loadNormal bytes=loadNormal node=loadLow high_disk=false frac_pending=0.00,0.00(true))] srcSLS[(store=overloadUrgent worst=CPURate cpu=overloadUrgent writes=loadNormal bytes=loadNormal node=overloadUrgent high_disk=false frac_pending=0.00,0.00(true))]
[mmaid=1] result(success): shedding r1 lease from s1 to s2 [change:r1=[transfer_to=2 cids=1,2]] with resulting loads source:[cpu:1µs/s, write-bandwidth:0 B/s, byte-size:0 B] target:[cpu:100ns/s, write-bandwidth:0 B/s, byte-size:0 B] (means: [cpu:550ns/s, write-bandwidth:0 B/s, byte-size:0 B]) (frac_pending: (src:0.00,target:0.00) (src:0.00,target:0.00))
[mmaid=1] skipping replica transfers for s1 to try more leases next time
[mmaid=1] rebalancing pass shed: {s1}
pending(2)
change-id=1 store-id=1 node-id=1 range-id=1 load-delta=[cpu:0s/s, write-bandwidth:0 B/s, byte-size:0 B] start=0s gc=1m0s
 prev=(replica-id=1 type=VOTER_FULL leaseholder=true)
 next=(replica-id=1 type=VOTER_FULL)
change-id=2 store-id=2 node-id=2 range-id=1 load-delta=[cpu:0s/s, write-bandwidth:0 B/s, byte-size:0 B] start=0s gc=1m0s
 prev=(replica-id=2 type=VOTER_FULL)
 next=(replica-id=2 type=VOTER_FULL leaseholder=true)