Skip to content

Commit b01ea0d

Browse files
committed
asim: make store rebalancer refresh store status
This commit updates asim's mma store rebalancer to refresh store status before calling ComputeChanges(), matching production behavior.
1 parent e866982 commit b01ea0d

File tree

3 files changed

+14
-24
lines changed

3 files changed

+14
-24
lines changed

pkg/kv/kvserver/allocator/mmaprototype/cluster_state.go

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2180,16 +2180,10 @@ func (cs *clusterState) setStore(sal storeAttributesAndLocalityWithNodeTier) {
21802180
if !ok {
21812181
// This is the first time seeing this store.
21822182
ss := newStoreState()
2183-
// TODO(tbg): below is what we should be doing once asim and production code actually
2184-
// have a way to update the health status. For now, we just set it to healthy initially
2185-
// and that's where it will stay (outside of unit tests).
2186-
//
2187-
// At this point, the store's health is unknown. It will need to be marked
2188-
// as healthy separately. Until we know more, we won't place leases or
2189-
// replicas on it (nor will we try to shed any that are already reported to
2190-
// have replicas on it).
2191-
// ss.status = MakeStatus(HealthUnknown, LeaseDispositionRefusing, ReplicaDispositionRefusing)
2192-
ss.status = MakeStatus(HealthOK, LeaseDispositionOK, ReplicaDispositionOK)
2183+
// At this point, the store's health is unknown. It will be updated by cs.updateStoreStatuses. Until we know more, we
2184+
// won't place leases or replicas on it (nor will we try to shed any that
2185+
// are already reported to have replicas on it).
2186+
ss.status = MakeStatus(HealthUnknown, LeaseDispositionRefusing, ReplicaDispositionRefusing)
21932187
ss.overloadStartTime = cs.ts.Now()
21942188
ss.overloadEndTime = cs.ts.Now()
21952189
cs.stores[sal.StoreID] = ss

pkg/kv/kvserver/asim/mmaintegration/mma_store_rebalancer.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,11 @@ func (msr *MMAStoreRebalancer) Tick(ctx context.Context, tick time.Time, s state
152152
msr.pendingChangeIdx = 0
153153
msr.lastRebalanceTime = tick
154154
log.KvDistribution.VInfof(ctx, 1, "no more pending changes to process, will call compute changes again")
155+
// Refresh store status from StorePool before computing changes.
156+
// This uses the real production RefreshStoreStatus, which queries
157+
// StorePool (backed by StatusTracker via NodeLivenessFn) and
158+
// translates to MMA's status model.
159+
msr.allocator.UpdateStoreStatus(ctx, msr.as.GetMMAStoreStatuses())
155160
storeLeaseholderMsg := MakeStoreLeaseholderMsgFromState(s, msr.localStoreID)
156161
pendingChanges := msr.allocator.ComputeChanges(ctx, &storeLeaseholderMsg, mmaprototype.ChangeOptions{
157162
LocalStoreID: roachpb.StoreID(msr.localStoreID),

pkg/kv/kvserver/asim/tests/testdata/non_rand/mma/store_status_shedding.txt

Lines changed: 5 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -42,21 +42,12 @@ assertion type=stat stat=replicas ticks=6 exact_bound=0 stores=(4)
4242
eval duration=6m seed=42 metrics=(replicas,leases,write_bytes_per_second) cfgs=(mma-only)
4343
----
4444
leases#1: first: [s1=40, s2=0, s3=0, s4=0] (stddev=17.32, mean=10.00, sum=40)
45-
leases#1: last: [s1=35, s2=0, s3=0, s4=5] (stddev=14.58, mean=10.00, sum=40)
45+
leases#1: last: [s1=40, s2=0, s3=0, s4=0] (stddev=17.32, mean=10.00, sum=40)
4646
leases#1: thrash_pct: [s1=0%, s2=0%, s3=0%, s4=0%] (sum=0%)
4747
replicas#1: first: [s1=40, s2=40, s3=40, s4=0] (stddev=17.32, mean=30.00, sum=120)
48-
replicas#1: last: [s1=35, s2=40, s3=40, s4=5] (stddev=14.58, mean=30.00, sum=120)
48+
replicas#1: last: [s1=40, s2=40, s3=40, s4=0] (stddev=17.32, mean=30.00, sum=120)
4949
replicas#1: thrash_pct: [s1=0%, s2=0%, s3=0%, s4=0%] (sum=0%)
50-
write_bytes_per_second#1: last: [s1=17489677, s2=19989215, s3=19989215, s4=2499537] (stddev=7284303.98, mean=14991911.00, sum=59967644)
51-
write_bytes_per_second#1: thrash_pct: [s1=1%, s2=27%, s3=27%, s4=2%] (sum=57%)
52-
artifacts[mma-only]: 50eedac335e03fb8
53-
failed assertion sample 1
54-
stat=replicas value=(=0.00) ticks=6
55-
store=4 stat=5.00
56-
store=4 stat=5.00
57-
store=4 stat=5.00
58-
store=4 stat=5.00
59-
store=4 stat=5.00
60-
store=4 stat=5.00
61-
store=4 stat=5.00
50+
write_bytes_per_second#1: last: [s1=19999999, s2=19999999, s3=19999999, s4=0] (stddev=8660253.60, mean=14999999.25, sum=59999997)
51+
write_bytes_per_second#1: thrash_pct: [s1=27020%, s2=27020%, s3=27020%, s4=0%] (sum=81060%)
52+
artifacts[mma-only]: 9bd6f2a583adcd74
6253
==========================

0 commit comments

Comments
 (0)