|
| 1 | +# This test verifies the filtering of candidate stores for lease transfers in |
| 2 | +# cluster_state_rebalance_stores.go using retainReadyLeaseTargetStoresOnly, and |
| 3 | +# serves as a regression test for |
| 4 | +# https://github.com/cockroachdb/cockroach/issues/159536, where stores were |
| 5 | +# filtered only during initial candidate selection but not during the |
| 6 | +# sortTargetCandidateSetAndPick stage. |
| 7 | +# |
| 8 | +# Setup: s1 holds the lease for r1, with replicas on s1, s2, s3. |
| 9 | +# - s1: overloaded (wants to shed leases) |
| 10 | +# - s2: less loaded than s1 (not overloaded), but the replica on s2 for r1 has |
| 11 | +# lease-disposition=refusing (not store-level) |
| 12 | +# - s3: less loaded than s1 (not overloaded) |
| 13 | +# |
| 14 | +# Expected: s2 is filtered out due to per-replica lease disposition. Lease |
| 15 | +# transfer from s1 to s3 occurs. |
| 16 | +# Buggy output: s2 is considered and picked for lease transfer. |
| 17 | +set-store |
| 18 | + store-id=1 node-id=1 |
| 19 | + store-id=2 node-id=2 |
| 20 | + store-id=3 node-id=3 |
| 21 | +---- |
| 22 | +node-id=1 locality-tiers=node=1 |
| 23 | + store-id=1 attrs= |
| 24 | +node-id=2 locality-tiers=node=2 |
| 25 | + store-id=2 attrs= |
| 26 | +node-id=3 locality-tiers=node=3 |
| 27 | + store-id=3 attrs= |
| 28 | + |
| 29 | +store-load-msg |
| 30 | + store-id=1 node-id=1 load=[1000,0,0] capacity=[1000,1000,1000] secondary-load=0 load-time=0s |
| 31 | + store-id=2 node-id=2 load=[800,0,0] capacity=[1000,1000,1000] secondary-load=0 load-time=0s |
| 32 | + store-id=3 node-id=3 load=[800,0,0] capacity=[1000,1000,1000] secondary-load=0 load-time=0s |
| 33 | +---- |
| 34 | + |
| 35 | +# Key difference from rebalance_stores_cpu_lease_refusing_target.txt: |
| 36 | +# Here we set lease-disposition=refusing on the replica itself (s2's replica for r1), |
| 37 | +# rather than on the store status. The store s2 is healthy and accepting leases |
| 38 | +# at the store level, but this specific replica is refusing leases. |
| 39 | +store-leaseholder-msg |
| 40 | +store-id=1 |
| 41 | + range-id=1 load=[100,0,0] raft-cpu=10 |
| 42 | + store-id=1 replica-id=1 type=VOTER_FULL leaseholder=true |
| 43 | + store-id=2 replica-id=2 type=VOTER_FULL lease-disposition=refusing |
| 44 | + store-id=3 replica-id=3 type=VOTER_FULL |
| 45 | + config=num_replicas=3 constraints={} voter_constraints={} |
| 46 | +---- |
| 47 | + |
| 48 | +# Verify s2's store-level status is OK (not refusing). |
| 49 | +# This confirms we're testing per-replica disposition, not store-level. |
| 50 | +set-store-status store-id=2 health=ok leases=ok replicas=ok |
| 51 | +---- |
| 52 | +ok accepting all |
| 53 | + |
| 54 | +# s1 tries to shed leases. s2 is filtered out due to per-replica disposition, |
| 55 | +# leaving only [1 3]. Since s3 is less loaded than s1, the r1 lease is transferred from s1 to s3. |
| 56 | +rebalance-stores store-id=1 |
| 57 | +---- |
| 58 | +[mmaid=1] rebalanceStores begins |
| 59 | +[mmaid=1] cluster means: (stores-load [cpu:866ns/s, write-bandwidth:0 B/s, byte-size:0 B]) (stores-capacity [cpu:1µs/s, write-bandwidth:1.0 kB/s, byte-size:1.0 kB]) (nodes-cpu-load 866) (nodes-cpu-capacity 1000) |
| 60 | +[mmaid=1] load summary for dim=CPURate (s1): overloadUrgent, reason: fractionUsed > 90% [load=1000 meanLoad=866 fractionUsed=100.00% meanUtil=86.67% capacity=1000] |
| 61 | +[mmaid=1] load summary for dim=WriteBandwidth (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] |
| 62 | +[mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] |
| 63 | +[mmaid=1] load summary for dim=CPURate (n1): overloadUrgent, reason: fractionUsed > 90% [load=1000 meanLoad=866 fractionUsed=100.00% meanUtil=86.67% capacity=1000] |
| 64 | +[mmaid=1] evaluating s1: node load overloadUrgent, store load overloadUrgent, worst dim CPURate |
| 65 | +[mmaid=1] overload-continued s1 ((store=overloadUrgent worst=CPURate cpu=overloadUrgent writes=loadNormal bytes=loadNormal node=overloadUrgent high_disk=false frac_pending=0.00,0.00(true))) - within grace period |
| 66 | +[mmaid=1] store s1 was added to shedding store list |
| 67 | +[mmaid=1] load summary for dim=CPURate (s2): loadNormal, reason: load is within 5% of mean [load=800 meanLoad=866 fractionUsed=80.00% meanUtil=86.67% capacity=1000] |
| 68 | +[mmaid=1] load summary for dim=WriteBandwidth (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] |
| 69 | +[mmaid=1] load summary for dim=ByteSize (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] |
| 70 | +[mmaid=1] load summary for dim=CPURate (n2): loadNormal, reason: load is within 5% of mean [load=800 meanLoad=866 fractionUsed=80.00% meanUtil=86.67% capacity=1000] |
| 71 | +[mmaid=1] evaluating s2: node load loadNormal, store load loadNormal, worst dim CPURate |
| 72 | +[mmaid=1] load summary for dim=CPURate (s3): loadNormal, reason: load is within 5% of mean [load=800 meanLoad=866 fractionUsed=80.00% meanUtil=86.67% capacity=1000] |
| 73 | +[mmaid=1] load summary for dim=WriteBandwidth (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] |
| 74 | +[mmaid=1] load summary for dim=ByteSize (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] |
| 75 | +[mmaid=1] load summary for dim=CPURate (n3): loadNormal, reason: load is within 5% of mean [load=800 meanLoad=866 fractionUsed=80.00% meanUtil=86.67% capacity=1000] |
| 76 | +[mmaid=1] evaluating s3: node load loadNormal, store load loadNormal, worst dim CPURate |
| 77 | +[mmaid=1] start processing shedding store s1: cpu node load overloadUrgent, store load overloadUrgent, worst dim CPURate |
| 78 | +[mmaid=1] top-K[CPURate] ranges for s1 with lease on local s1: r1:[cpu:100ns/s, write-bandwidth:0 B/s, byte-size:0 B] |
| 79 | +[mmaid=1] local store s1 is CPU overloaded (overloadUrgent >= overloadSlow), attempting lease transfers first |
| 80 | +[mmaid=1] skipping s2 for lease transfer: replica lease disposition refusing (health ok) |
| 81 | +[mmaid=1] load summary for dim=CPURate (s1): overloadUrgent, reason: fractionUsed > 90% [load=1000 meanLoad=900 fractionUsed=100.00% meanUtil=90.00% capacity=1000] |
| 82 | +[mmaid=1] load summary for dim=WriteBandwidth (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] |
| 83 | +[mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] |
| 84 | +[mmaid=1] load summary for dim=CPURate (n1): overloadUrgent, reason: fractionUsed > 90% [load=1000 meanLoad=900 fractionUsed=100.00% meanUtil=90.00% capacity=1000] |
| 85 | +[mmaid=1] considering lease-transfer r1 from s1: candidates are [1 3] |
| 86 | +[mmaid=1] load summary for dim=CPURate (s3): loadLow, reason: load is >10% below mean [load=800 meanLoad=900 fractionUsed=80.00% meanUtil=90.00% capacity=1000] |
| 87 | +[mmaid=1] load summary for dim=WriteBandwidth (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] |
| 88 | +[mmaid=1] load summary for dim=ByteSize (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] |
| 89 | +[mmaid=1] load summary for dim=CPURate (n3): loadLow, reason: load is >10% below mean [load=800 meanLoad=900 fractionUsed=80.00% meanUtil=90.00% capacity=1000] |
| 90 | +[mmaid=1] sortTargetCandidateSetAndPick: candidates: s3(SLS:loadNormal, overloadedDimLoadSummary:loadLow), overloadedDim:CPURate, picked s3 |
| 91 | +[mmaid=1] load summary for dim=CPURate (s3): loadNormal, reason: load is within 5% of mean [load=899 meanLoad=900 fractionUsed=89.90% meanUtil=90.00% capacity=1000] |
| 92 | +[mmaid=1] load summary for dim=WriteBandwidth (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] |
| 93 | +[mmaid=1] load summary for dim=ByteSize (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] |
| 94 | +[mmaid=1] load summary for dim=CPURate (n3): loadNormal, reason: load is within 5% of mean [load=899 meanLoad=900 fractionUsed=89.90% meanUtil=90.00% capacity=1000] |
| 95 | +[mmaid=1] load summary for dim=CPURate (s1): loadNormal, reason: load is within 5% of mean [load=910 meanLoad=900 fractionUsed=91.00% meanUtil=90.00% capacity=1000] |
| 96 | +[mmaid=1] load summary for dim=WriteBandwidth (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] |
| 97 | +[mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] |
| 98 | +[mmaid=1] load summary for dim=CPURate (n1): loadNormal, reason: load is within 5% of mean [load=910 meanLoad=900 fractionUsed=91.00% meanUtil=90.00% capacity=1000] |
| 99 | +[mmaid=1] can add load to n3s3: true targetSLS[(store=loadNormal worst=CPURate cpu=loadNormal writes=loadNormal bytes=loadNormal node=loadNormal high_disk=false frac_pending=0.00,0.00(true))] srcSLS[(store=loadNormal worst=CPURate cpu=loadNormal writes=loadNormal bytes=loadNormal node=loadNormal high_disk=false frac_pending=0.00,0.00(true))] |
| 100 | +[mmaid=1] result(success): shedding r1 lease from s1 to s3 [change:r1=[transfer_to=3 cids=1,2]] with resulting loads source:[cpu:910ns/s, write-bandwidth:0 B/s, byte-size:0 B] target:[cpu:899ns/s, write-bandwidth:0 B/s, byte-size:0 B] (means: [cpu:900ns/s, write-bandwidth:0 B/s, byte-size:0 B]) (frac_pending: (src:0.00,target:0.09) (src:0.12,target:0.00)) |
| 101 | +[mmaid=1] skipping replica transfers for s1 to try more leases next time |
| 102 | +[mmaid=1] rebalancing pass shed: {s1} |
| 103 | +pending(2) |
| 104 | +change-id=1 store-id=1 node-id=1 range-id=1 load-delta=[cpu:-90ns/s, write-bandwidth:0 B/s, byte-size:0 B] start=0s gc=1m0s |
| 105 | + prev=(replica-id=1 type=VOTER_FULL leaseholder=true) |
| 106 | + next=(replica-id=1 type=VOTER_FULL) |
| 107 | +change-id=2 store-id=3 node-id=3 range-id=1 load-delta=[cpu:99ns/s, write-bandwidth:0 B/s, byte-size:0 B] start=0s gc=1m0s |
| 108 | + prev=(replica-id=3 type=VOTER_FULL) |
| 109 | + next=(replica-id=3 type=VOTER_FULL leaseholder=true) |
0 commit comments