Skip to content

Commit 72e5734

Browse files
craig[bot] and sumeerbhola committed
Merge #157772
157772: mmaprototype: clean up some commentary around freshness and top-k r=tbg a=sumeerbhola Epic: CRDB-55052 Release note: None Co-authored-by: sumeerbhola <[email protected]>
2 parents 60165a6 + cfb998a commit 72e5734

File tree

2 files changed

+44
-32
lines changed

2 files changed

+44
-32
lines changed

pkg/kv/kvserver/allocator/mmaprototype/cluster_state.go

Lines changed: 41 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -209,21 +209,13 @@ type ReplicaChange struct {
209209
// both since a promoted replica can't have been the leaseholder and a
210210
// replica being demoted cannot retain the lease).
211211
//
212-
// NB: The prev value is always the state before the change. Typically, this
213-
// will be the source of truth provided by the leaseholder in the RangeMsg,
214-
// so will have real ReplicaIDs (if already a replica) and real ReplicaTypes
215-
// (including types beyond VOTER_FULL and NON_VOTER). However, because of
216-
// AdjustPendingChangeDisposition, we can remove pending changes and
217-
// rangeState.replicas can have replicas with unknownReplicaID. Technically,
218-
// there is nothing preventing someone from trying to construct another
219-
// change to the range, where the prev state would include unknownReplicaID
220-
// -- the data-structures allow for this. However, we do expect integration
221-
// code to typically provide a new source of truth for the range from the
222-
// leaseholder, before constructing more changes. Currently, this happens
223-
// because the rebalancer atomically provide a new StoreLeaseholderMsg and
224-
// then constructs more changes. We expect that the replicate and lease
225-
// queues, when trying to construct a change for range, will first provide
226-
// the source of truth for the range.
212+
// NB: The prev value is always the state before the change. This is the
213+
// source of truth provided by the leaseholder in the RangeMsg, so will
214+
// have real ReplicaIDs (if already a replica) and real ReplicaTypes
215+
// (including types beyond VOTER_FULL and NON_VOTER). This source-of-truth
216+
// claim is guaranteed by REQUIREMENT(change-computation) documented
217+
// elsewhere, and the fact that new changes are computed only when there
218+
// are no pending changes for a range.
227219
//
228220
// The ReplicaType in next is either the zero value (for removals), or
229221
// {VOTER_FULL, NON_VOTER} for additions/change, i.e., it represents the
@@ -807,24 +799,30 @@ type storeState struct {
807799
// This is consistent with the union of state in clusterState.ranges,
808800
// filtered for replicas that are on this store.
809801
//
810-
// NB: this can include LEARNER and VOTER_DEMOTING_LEARNER replicas.
802+
// NB: this can include roachpb.ReplicaTypes other than VOTER_FULL and
803+
// NON_VOTER, e.g. LEARNER, VOTER_DEMOTING_LEARNER etc.
811804
replicas map[roachpb.RangeID]ReplicaState
812805
// topKRanges along some load dimension. If the store is overloaded along
813806
// one resource dimension, that is the dimension chosen when picking the
814807
// top-k.
815808
//
816809
// It includes ranges whose replicas are being removed via pending
817-
// changes, or lease transfers. That is, it does not account for pending
818-
// or enacted changes made since the last time top-k was computed.
810+
// changes, or lease transfers. That is, it does not account for
811+
// pending or enacted changes made since the last time top-k was
812+
// computed. It does account for pending changes that were pending
813+
// when the top-k was computed.
819814
//
820815
// The key in this map is a local store-id.
821816
//
822-
// NB: this does not include LEARNER and VOTER_DEMOTING_LEARNER replicas.
817+
// NB: this only includes replicas that satisfy the isVoter() or
818+
// isNonVoter() methods, i.e., {VOTER_FULL, VOTER_INCOMING, NON_VOTER,
819+
// VOTER_DEMOTING_NON_VOTER}. This is justified in that these are the
820+
// only replica types where the allocator wants to explicitly consider
821+
// shedding, since the other states are transient states, that are
822+
// either going away, or will soon transition to a full-fledged state.
823823
//
824824
// We may decide to keep this top-k up-to-date incrementally instead of
825-
// recomputing it from scratch on each StoreLeaseholderMsg. Since
826-
// StoreLeaseholderMsg is incremental about the ranges it reports, that
827-
// may provide a building block for the incremental computation.
825+
// recomputing it from scratch on each StoreLeaseholderMsg.
828826
//
829827
// Example:
830828
// Assume the local node has two stores, s1 and s2.
@@ -1324,6 +1322,12 @@ func (rs *rangeState) removePendingChangeTracking(changeID changeID) {
13241322
// nodes, and even though we have rebalancing components per store, we want to
13251323
// reduce the sub-optimal decisions they make -- having a single clusterState
13261324
// is important for that.
1325+
//
1326+
// REQUIREMENT(change-computation): Any request to compute a change
1327+
// (rebalancing stores, or a change specifically for a range), must be atomic
1328+
// with the leaseholder providing the current authoritative state for all its
1329+
// ranges (for rebalancing) or for the specific range (for a range-specific
1330+
// change).
13271331
type clusterState struct {
13281332
ts timeutil.TimeSource
13291333
nodes map[roachpb.NodeID]*nodeState
@@ -1792,10 +1796,13 @@ func (cs *clusterState) processStoreLeaseholderMsgInternal(
17921796
}
17931797
topk.threshold = threshold
17941798
}
1795-
// TODO: replica is already adjusted for some ongoing changes, which may be
1796-
// undone. So if s10 is a replica for range r1 whose leaseholder is the
1797-
// local store s1 that is trying to transfer the lease away, s10 will not
1798-
// see r1 below.
1799+
// NB: localss.adjusted.replicas is already adjusted for ongoing changes,
1800+
// which may be undone. For example, if s1 is the local store, and it is
1801+
// transferring its replica for range r1 to remote store s2, then
1802+
// localss.adjusted.replicas will not have r1. This is fine in that s1
1803+
// should not be making more decisions about r1. If the change is undone
1804+
// later, we will again compute the top-k, which will consider r1, before
1805+
// computing new changes (due to REQUIREMENT(change-computation)).
17991806
for rangeID, state := range localss.adjusted.replicas {
18001807
if !state.IsLeaseholder {
18011808
// We may have transferred the lease away previously but still have a
@@ -1811,7 +1818,14 @@ func (cs *clusterState) processStoreLeaseholderMsgInternal(
18111818
continue
18121819
}
18131820
rs := cs.ranges[rangeID]
1814-
// TODO: replicas is also already adjusted.
1821+
// NB: rs.replicas is already adjusted for ongoing changes. For
1822+
// example, if s10 is a remote replica for range r1, which is being
1823+
// transferred to another store s11, s10 will not see r1 below. We
1824+
// make this choice to avoid cluttering the top-k for s10 with
1825+
// replicas that are going away. If it is undone, r1 will not be in
1826+
// the top-k for s10, but due to REQUIREMENT(change-computation), a
1827+
// new authoritative state will be provided and the top-k recomputed,
1828+
// before computing any new changes.
18151829
for _, replica := range rs.replicas {
18161830
typ := replica.ReplicaState.ReplicaType.ReplicaType
18171831
if isVoter(typ) || isNonVoter(typ) {

pkg/kv/kvserver/allocator/mmaprototype/cluster_state_rebalance_stores.go

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -291,11 +291,9 @@ func (re *rebalanceEnv) rebalanceReplicas(
291291
}
292292
isVoter, isNonVoter := rstate.constraints.replicaRole(store.StoreID)
293293
if !isVoter && !isNonVoter {
294-
// We should not panic here since the replicateQueue may have shed the
295-
// lease and informed MMA, since the last time MMA computed the top-k
296-
// ranges. This is useful for debugging in the prototype, due to the
297-
// lack of unit tests.
298-
panic(fmt.Sprintf("internal state inconsistency: "+
294+
// Due to REQUIREMENT(change-computation), the top-k is up to date, so
295+
// this must never happen.
296+
panic(errors.AssertionFailedf("internal state inconsistency: "+
299297
"store=%v range_id=%v pending-changes=%v "+
300298
"rstate_replicas=%v rstate_constraints=%v",
301299
store.StoreID, rangeID, rstate.pendingChanges, rstate.replicas, rstate.constraints))

0 commit comments

Comments (0)