@@ -209,21 +209,13 @@ type ReplicaChange struct {
209209 // both since a promoted replica can't have been the leaseholder and a
210210 // replica being demoted cannot retain the lease).
211211 //
212- // NB: The prev value is always the state before the change. Typically, this
213- // will be the source of truth provided by the leaseholder in the RangeMsg,
214- // so will have real ReplicaIDs (if already a replica) and real ReplicaTypes
215- // (including types beyond VOTER_FULL and NON_VOTER). However, because of
216- // AdjustPendingChangeDisposition, we can remove pending changes and
217- // rangeState.replicas can have replicas with unknownReplicaID. Technically,
218- // there is nothing preventing someone from trying to construct another
219- // change to the range, where the prev state would include unknownReplicaID
220- // -- the data-structures allow for this. However, we do expect integration
221- // code to typically provide a new source of truth for the range from the
222- // leaseholder, before constructing more changes. Currently, this happens
223- // because the rebalancer atomically provide a new StoreLeaseholderMsg and
224- // then constructs more changes. We expect that the replicate and lease
225- // queues, when trying to construct a change for range, will first provide
226- // the source of truth for the range.
212+ // NB: The prev value is always the state before the change. This is the
213+ // source of truth provided by the leaseholder in the RangeMsg, so will
214+ // have real ReplicaIDs (if already a replica) and real ReplicaTypes
215+ // (including types beyond VOTER_FULL and NON_VOTER). This source-of-truth
216+ // claim is guaranteed by REQUIREMENT(change-computation) documented
217+ // elsewhere, and the fact that new changes are computed only when there
218+ // are no pending changes for a range.
227219 //
228220 // The ReplicaType in next is either the zero value (for removals), or
229221 // {VOTER_FULL, NON_VOTER} for additions/change, i.e., it represents the
@@ -807,24 +799,30 @@ type storeState struct {
807799 // This is consistent with the union of state in clusterState.ranges,
808800 // filtered for replicas that are on this store.
809801 //
810- // NB: this can include LEARNER and VOTER_DEMOTING_LEARNER replicas.
802+ // NB: this can include roachpb.ReplicaTypes other than VOTER_FULL and
803+ // NON_VOTER, e.g. LEARNER, VOTER_DEMOTING_LEARNER etc.
811804 replicas map [roachpb.RangeID ]ReplicaState
812805 // topKRanges along some load dimension. If the store is overloaded along
813806 // one resource dimension, that is the dimension chosen when picking the
814807 // top-k.
815808 //
816809 // It includes ranges whose replicas are being removed via pending
817- // changes, or lease transfers. That is, it does not account for pending
818- // or enacted changes made since the last time top-k was computed.
810+ // changes, or lease transfers. That is, it does not account for
811+ // pending or enacted changes made since the last time top-k was
812+ // computed. It does account for pending changes that were pending
813+ // when the top-k was computed.
819814 //
820815 // The key in this map is a local store-id.
821816 //
822- // NB: this does not include LEARNER and VOTER_DEMOTING_LEARNER replicas.
817+ // NB: this only includes replicas that satisfy the isVoter() or
818+ // isNonVoter() methods, i.e., {VOTER_FULL, VOTER_INCOMING, NON_VOTER,
819+ // VOTER_DEMOTING_NON_VOTER}. This is justified in that these are the
820+ // only replica types where the allocator wants to explicitly consider
821+ // shedding, since the other states are transient states, that are
822+ // either going away, or will soon transition to a full-fledged state.
823823 //
824824 // We may decide to keep this top-k up-to-date incrementally instead of
825- // recomputing it from scratch on each StoreLeaseholderMsg. Since
826- // StoreLeaseholderMsg is incremental about the ranges it reports, that
827- // may provide a building block for the incremental computation.
825+ // recomputing it from scratch on each StoreLeaseholderMsg.
828826 //
829827 // Example:
830828 // Assume the local node has two stores, s1 and s2.
@@ -1324,6 +1322,12 @@ func (rs *rangeState) removePendingChangeTracking(changeID changeID) {
13241322// nodes, and even though we have rebalancing components per store, we want to
13251323// reduce the sub-optimal decisions they make -- having a single clusterState
13261324// is important for that.
1325+ //
1326+ // REQUIREMENT(change-computation): Any request to compute a change
1327+ // (rebalancing stores, or a change specifically for a range), must be atomic
1328+ // with the leaseholder providing the current authoritative state for all its
1329+ // ranges (for rebalancing) or for the specific range (for a range-specific
1330+ // change).
13271331type clusterState struct {
13281332 ts timeutil.TimeSource
13291333 nodes map [roachpb.NodeID ]* nodeState
@@ -1792,10 +1796,13 @@ func (cs *clusterState) processStoreLeaseholderMsgInternal(
17921796 }
17931797 topk .threshold = threshold
17941798 }
1795- // TODO: replica is already adjusted for some ongoing changes, which may be
1796- // undone. So if s10 is a replica for range r1 whose leaseholder is the
1797- // local store s1 that is trying to transfer the lease away, s10 will not
1798- // see r1 below.
1799+ // NB: localss.adjusted.replicas is already adjusted for ongoing changes,
1800+ // which may be undone. For example, if s1 is the local store, and it is
1801+ // transferring its replica for range r1 to remote store s2, then
1802+ // localss.adjusted.replicas will not have r1. This is fine in that s1
1803+ // should not be making more decisions about r1. If the change is undone
1804+ // later, we will again compute the top-k, which will consider r1, before
1805+ // computing new changes (due to REQUIREMENT(change-computation)).
17991806 for rangeID , state := range localss .adjusted .replicas {
18001807 if ! state .IsLeaseholder {
18011808 // We may have transferred the lease away previously but still have a
@@ -1811,7 +1818,14 @@ func (cs *clusterState) processStoreLeaseholderMsgInternal(
18111818 continue
18121819 }
18131820 rs := cs .ranges [rangeID ]
1814- // TODO: replicas is also already adjusted.
1821+ // NB: rs.replicas is already adjusted for ongoing changes. For
1822+ // example, if s10 is a remote replica for range r1, which is being
1823+ // transferred to another store s11, s10 will not see r1 below. We
1824+ // make this choice to avoid cluttering the top-k for s10 with
1825+ // replicas that are going away. If it is undone, r1 will not be in
1826+ // the top-k for s10, but due to REQUIREMENT(change-computation), a
1827+ // new authoritative state will be provided and the top-k recomputed,
1828+ // before computing any new changes.
18151829 for _ , replica := range rs .replicas {
18161830 typ := replica .ReplicaState .ReplicaType .ReplicaType
18171831 if isVoter (typ ) || isNonVoter (typ ) {
0 commit comments