
Commit cfb998a

sumeerbholatbg authored and committed

mmaprototype: clean up some commentary around freshness and top-k

Epic: CRDB-55052
Release note: None

1 parent a59328e, commit cfb998a

2 files changed: +44 -32 lines changed


pkg/kv/kvserver/allocator/mmaprototype/cluster_state.go

Lines changed: 41 additions & 27 deletions
@@ -209,21 +209,13 @@ type ReplicaChange struct {
 // both since a promoted replica can't have been the leaseholder and a
 // replica being demoted cannot retain the lease).
 //
-// NB: The prev value is always the state before the change. Typically, this
-// will be the source of truth provided by the leaseholder in the RangeMsg,
-// so will have real ReplicaIDs (if already a replica) and real ReplicaTypes
-// (including types beyond VOTER_FULL and NON_VOTER). However, because of
-// AdjustPendingChangeDisposition, we can remove pending changes and
-// rangeState.replicas can have replicas with unknownReplicaID. Technically,
-// there is nothing preventing someone from trying to construct another
-// change to the range, where the prev state would include unknownReplicaID
-// -- the data-structures allow for this. However, we do expect integration
-// code to typically provide a new source of truth for the range from the
-// leaseholder, before constructing more changes. Currently, this happens
-// because the rebalancer atomically provide a new StoreLeaseholderMsg and
-// then constructs more changes. We expect that the replicate and lease
-// queues, when trying to construct a change for range, will first provide
-// the source of truth for the range.
+// NB: The prev value is always the state before the change. This is the
+// source of truth provided by the leaseholder in the RangeMsg, so will
+// have real ReplicaIDs (if already a replica) and real ReplicaTypes
+// (including types beyond VOTER_FULL and NON_VOTER). This source-of-truth
+// claim is guaranteed by REQUIREMENT(change-computation) documented
+// elsewhere, and the fact that new changes are computed only when there
+// are no pending changes for a range.
 //
 // The ReplicaType in next is either the zero value (for removals), or
 // {VOTER_FULL, NON_VOTER} for additions/change, i.e., it represents the
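The new comment hangs the source-of-truth claim on new changes being computed only when a range has no pending changes. The following standalone Go sketch restates that precondition; the types and the prevFor helper are invented for illustration and are not the mmaprototype API, only the invariant itself comes from the comment above.

```go
package main

import "fmt"

// Simplified stand-ins for the prototype's range bookkeeping.
type storeID int

type replicaState struct {
	replicaID int // a real ID supplied by the leaseholder, never a placeholder
}

type rangeState struct {
	// Authoritative replica set from the latest leaseholder message.
	replicas map[storeID]replicaState
	// Changes proposed but not yet enacted or discarded.
	pendingChanges []string
}

// prevFor returns the "prev" state for a proposed change on store s. It
// refuses to compute anything while changes are still pending, mirroring the
// precondition the comment relies on.
func prevFor(rs *rangeState, s storeID) (replicaState, error) {
	if len(rs.pendingChanges) > 0 {
		return replicaState{}, fmt.Errorf(
			"range has %d pending changes; wait for a fresh leaseholder message",
			len(rs.pendingChanges))
	}
	prev, ok := rs.replicas[s]
	if !ok {
		return replicaState{}, fmt.Errorf("store %d holds no replica of this range", s)
	}
	return prev, nil
}

func main() {
	rs := &rangeState{replicas: map[storeID]replicaState{1: {replicaID: 7}}}
	fmt.Println(prevFor(rs, 1)) // {7} <nil>
	rs.pendingChanges = append(rs.pendingChanges, "add-voter-s3")
	fmt.Println(prevFor(rs, 1)) // error: pending changes exist
}
```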
@@ -808,24 +800,30 @@ type storeState struct {
 // This is consistent with the union of state in clusterState.ranges,
 // filtered for replicas that are on this store.
 //
-// NB: this can include LEARNER and VOTER_DEMOTING_LEARNER replicas.
+// NB: this can include roachpb.ReplicaTypes other than VOTER_FULL and
+// NON_VOTER, e.g. LEARNER, VOTER_DEMOTING_LEARNER etc.
 replicas map[roachpb.RangeID]ReplicaState
 // topKRanges along some load dimension. If the store is overloaded along
 // one resource dimension, that is the dimension chosen when picking the
 // top-k.
 //
 // It includes ranges whose replicas are being removed via pending
-// changes, or lease transfers. That is, it does not account for pending
-// or enacted changes made since the last time top-k was computed.
+// changes, or lease transfers. That is, it does not account for
+// pending or enacted changes made since the last time top-k was
+// computed. It does account for pending changes that were pending
+// when the top-k was computed.
 //
 // The key in this map is a local store-id.
 //
-// NB: this does not include LEARNER and VOTER_DEMOTING_LEARNER replicas.
+// NB: this only includes replicas that satisfy the isVoter() or
+// isNonVoter() methods, i.e., {VOTER_FULL, VOTER_INCOMING, NON_VOTER,
+// VOTER_DEMOTING_NON_VOTER}. This is justified in that these are the
+// only replica types where the allocator wants to explicitly consider
+// shedding, since the other states are transient states, that are
+// either going away, or will soon transition to a full-fledged state.
 //
 // We may decide to keep this top-k up-to-date incrementally instead of
-// recomputing it from scratch on each StoreLeaseholderMsg. Since
-// StoreLeaseholderMsg is incremental about the ranges it reports, that
-// may provide a building block for the incremental computation.
+// recomputing it from scratch on each StoreLeaseholderMsg.
 //
 // Example:
 // Assume the local node has two stores, s1 and s2.
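To make the replica-type filter above concrete, here is a standalone sketch of the eligibility test the new comment describes. The ReplicaType constants mirror the roachpb names but are redefined locally so the example compiles on its own, and eligibleForTopK is an illustrative helper, not the prototype's actual isVoter()/isNonVoter() pair.

```go
package main

import "fmt"

// Local stand-in for roachpb.ReplicaType; only the names matter here.
type ReplicaType int

const (
	VOTER_FULL ReplicaType = iota
	VOTER_INCOMING
	VOTER_OUTGOING
	VOTER_DEMOTING_LEARNER
	VOTER_DEMOTING_NON_VOTER
	LEARNER
	NON_VOTER
)

var replicaTypeName = map[ReplicaType]string{
	VOTER_FULL:               "VOTER_FULL",
	VOTER_INCOMING:           "VOTER_INCOMING",
	VOTER_OUTGOING:           "VOTER_OUTGOING",
	VOTER_DEMOTING_LEARNER:   "VOTER_DEMOTING_LEARNER",
	VOTER_DEMOTING_NON_VOTER: "VOTER_DEMOTING_NON_VOTER",
	LEARNER:                  "LEARNER",
	NON_VOTER:                "NON_VOTER",
}

// eligibleForTopK reports whether a replica of the given type is considered
// for the top-k, i.e., whether it is one of the four types the comment lists
// as satisfying isVoter() or isNonVoter(). The remaining types are transient
// and are excluded.
func eligibleForTopK(t ReplicaType) bool {
	switch t {
	case VOTER_FULL, VOTER_INCOMING, NON_VOTER, VOTER_DEMOTING_NON_VOTER:
		return true
	default:
		return false
	}
}

func main() {
	for t, name := range replicaTypeName {
		fmt.Printf("%-26s eligible=%v\n", name, eligibleForTopK(t))
	}
}
```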
@@ -1324,6 +1322,12 @@ func (rs *rangeState) removePendingChangeTracking(changeID changeID) {
 // nodes, and even though we have rebalancing components per store, we want to
 // reduce the sub-optimal decisions they make -- having a single clusterState
 // is important for that.
+//
+// REQUIREMENT(change-computation): Any request to compute a change
+// (rebalancing stores, or a change specifically for a range), must be atomic
+// with the leaseholder providing the current authoritative state for all its
+// ranges (for rebalancing) or for the specific range (for a range-specific
+// change).
 type clusterState struct {
 ts timeutil.TimeSource
 nodes map[roachpb.NodeID]*nodeState
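The requirement above is about atomicity between ingesting the leaseholder's authoritative state and computing changes against it. The schematic sketch below shows one way such atomicity can be expressed; the clusterSketch type, its mutex, and the method name are invented for illustration and say nothing about how the prototype actually sequences these steps.

```go
package main

import (
	"fmt"
	"sync"
)

// clusterSketch is an illustrative stand-in for a shared cluster-state
// object. The single mutex is held across both steps so that change
// computation never observes state older than the message that triggered it.
type clusterSketch struct {
	mu     sync.Mutex
	ranges map[int]string // rangeID -> authoritative description (simplified)
}

// applyStateAndCompute atomically (1) installs the leaseholder-provided
// authoritative state and (2) computes changes against exactly that state.
func (c *clusterSketch) applyStateAndCompute(update map[int]string) []string {
	c.mu.Lock()
	defer c.mu.Unlock()
	for id, desc := range update {
		c.ranges[id] = desc // step 1: ingest the authoritative state
	}
	var changes []string
	for id := range update {
		// step 2: compute changes only for ranges we were just told about
		changes = append(changes, fmt.Sprintf("consider rebalancing r%d", id))
	}
	return changes
}

func main() {
	cs := &clusterSketch{ranges: map[int]string{}}
	fmt.Println(cs.applyStateAndCompute(map[int]string{1: "replicas on s1,s2,s3"}))
}
```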
@@ -1788,10 +1792,13 @@ func (cs *clusterState) processStoreLeaseholderMsgInternal(
 }
 topk.threshold = threshold
 }
-// TODO: replica is already adjusted for some ongoing changes, which may be
-// undone. So if s10 is a replica for range r1 whose leaseholder is the
-// local store s1 that is trying to transfer the lease away, s10 will not
-// see r1 below.
+// NB: localss.adjusted.replicas is already adjusted for ongoing changes,
+// which may be undone. For example, if s1 is the local store, and it is
+// transferring its replica for range r1 to remote store s2, then
+// localss.adjusted.replicas will not have r1. This is fine in that s1
+// should not be making more decisions about r1. If the change is undone
+// later, we will again compute the top-k, which will consider r1, before
+// computing new changes (due to REQUIREMENT(change-computation)).
 for rangeID, state := range localss.adjusted.replicas {
 if !state.IsLeaseholder {
 // We may have transferred the lease away previously but still have a
@@ -1807,7 +1814,14 @@ func (cs *clusterState) processStoreLeaseholderMsgInternal(
 continue
 }
 rs := cs.ranges[rangeID]
-// TODO: replicas is also already adjusted.
+// NB: rs.replicas is already adjusted for ongoing changes. For
+// example, if s10 is a remote replica for range r1, which is being
+// transferred to another store s11, s10 will not see r1 below. We
+// make this choice to avoid cluttering the top-k for s10 with
+// replicas that are going away. If it is undone, r1 will not be in
+// the top-k for s10, but due to REQUIREMENT(change-computation), a
+// new authoritative state will be provided and the top-k recomputed,
+// before computing any new changes.
 for _, replica := range rs.replicas {
 typ := replica.ReplicaState.ReplicaType.ReplicaType
 if isVoter(typ) || isNonVoter(typ) {
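Pulling the two notes together, the loop being annotated builds per-store top-k candidates only from ranges the local store still leaseholds and only from replicas whose type passes the voter/non-voter filter. The sketch below restates that control flow with made-up types (replicaInfo, topKCandidate, buildTopKCandidates) and a trivial load model; it illustrates the filtering, not the prototype's implementation.

```go
package main

import (
	"fmt"
	"sort"
)

type storeID int
type rangeID int

// replicaInfo is an invented, flattened view of one replica of a range, as
// seen after adjusting for pending changes.
type replicaInfo struct {
	rng           rangeID
	store         storeID
	isLeaseholder bool // lease held by the local store for this range
	eligibleType  bool // passes the isVoter()/isNonVoter() style filter
	load          float64
}

type topKCandidate struct {
	rng  rangeID
	load float64
}

// buildTopKCandidates applies the two filters described in the comments and
// returns, per store, candidates sorted by descending load (take the first k
// to get a top-k).
func buildTopKCandidates(replicas []replicaInfo) map[storeID][]topKCandidate {
	out := map[storeID][]topKCandidate{}
	for _, r := range replicas {
		if !r.isLeaseholder {
			continue // lease already (being) transferred away; skip
		}
		if !r.eligibleType {
			continue // transient replica types are not shedding candidates
		}
		out[r.store] = append(out[r.store], topKCandidate{rng: r.rng, load: r.load})
	}
	for s := range out {
		sort.Slice(out[s], func(i, j int) bool { return out[s][i].load > out[s][j].load })
	}
	return out
}

func main() {
	replicas := []replicaInfo{
		{rng: 1, store: 10, isLeaseholder: true, eligibleType: true, load: 5},
		{rng: 2, store: 10, isLeaseholder: true, eligibleType: false, load: 9}, // e.g. LEARNER
		{rng: 3, store: 10, isLeaseholder: false, eligibleType: true, load: 7}, // lease moving away
		{rng: 4, store: 10, isLeaseholder: true, eligibleType: true, load: 8},
	}
	fmt.Println(buildTopKCandidates(replicas)) // map[10:[{4 8} {1 5}]]
}
```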

pkg/kv/kvserver/allocator/mmaprototype/cluster_state_rebalance_stores.go

Lines changed: 3 additions & 5 deletions
@@ -291,11 +291,9 @@ func (re *rebalanceEnv) rebalanceReplicas(
 }
 isVoter, isNonVoter := rstate.constraints.replicaRole(store.StoreID)
 if !isVoter && !isNonVoter {
-// We should not panic here since the replicateQueue may have shed the
-// lease and informed MMA, since the last time MMA computed the top-k
-// ranges. This is useful for debugging in the prototype, due to the
-// lack of unit tests.
-panic(fmt.Sprintf("internal state inconsistency: "+
+// Due to REQUIREMENT(change-computation), the top-k is up to date, so
+// this must never happen.
+panic(errors.AssertionFailedf("internal state inconsistency: "+
 "store=%v range_id=%v pending-changes=%v "+
 "rstate_replicas=%v rstate_constraints=%v",
 store.StoreID, rangeID, rstate.pendingChanges, rstate.replicas, rstate.constraints))
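The change above also swaps panic(fmt.Sprintf(...)) for panic(errors.AssertionFailedf(...)). The snippet below is a small, self-contained illustration of that pattern using github.com/cockroachdb/errors (it requires that module to build); mustBeVoterOrNonVoter is a hypothetical helper, not code from the rebalancer. AssertionFailedf returns an error value carrying an assertion marker and stack trace, which is more useful in error reports than a bare formatted string.

```go
package main

import (
	"fmt"

	"github.com/cockroachdb/errors"
)

// mustBeVoterOrNonVoter panics with an assertion-failure error when a store
// is neither a voter nor a non-voter for a range, mirroring the shape of the
// check in rebalanceReplicas.
func mustBeVoterOrNonVoter(isVoter, isNonVoter bool, storeID int) {
	if !isVoter && !isNonVoter {
		panic(errors.AssertionFailedf("internal state inconsistency: store=%d", storeID))
	}
}

func main() {
	defer func() {
		if r := recover(); r != nil {
			fmt.Printf("recovered: %v\n", r)
		}
	}()
	mustBeVoterOrNonVoter(false, false, 7)
}
```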
