@@ -209,21 +209,13 @@ type ReplicaChange struct {
209209 // both since a promoted replica can't have been the leaseholder and a
210210 // replica being demoted cannot retain the lease).
211211 //
212- // NB: The prev value is always the state before the change. Typically, this
213- // will be the source of truth provided by the leaseholder in the RangeMsg,
214- // so will have real ReplicaIDs (if already a replica) and real ReplicaTypes
215- // (including types beyond VOTER_FULL and NON_VOTER). However, because of
216- // AdjustPendingChangeDisposition, we can remove pending changes and
217- // rangeState.replicas can have replicas with unknownReplicaID. Technically,
218- // there is nothing preventing someone from trying to construct another
219- // change to the range, where the prev state would include unknownReplicaID
220- // -- the data-structures allow for this. However, we do expect integration
221- // code to typically provide a new source of truth for the range from the
222- // leaseholder, before constructing more changes. Currently, this happens
223- // because the rebalancer atomically provide a new StoreLeaseholderMsg and
224- // then constructs more changes. We expect that the replicate and lease
225- // queues, when trying to construct a change for range, will first provide
226- // the source of truth for the range.
212+ // NB: The prev value is always the state before the change. This is the
213+ // source of truth provided by the leaseholder in the RangeMsg, so will
214+ // have real ReplicaIDs (if already a replica) and real ReplicaTypes
215+ // (including types beyond VOTER_FULL and NON_VOTER). This source-of-truth
216+ // claim is guaranteed by REQUIREMENT(change-computation) documented
217+ // elsewhere, and the fact that new changes are computed only when there
218+ // are no pending changes for a range.
227219 //
228220 // The ReplicaType in next is either the zero value (for removals), or
229221 // {VOTER_FULL, NON_VOTER} for additions/change, i.e., it represents the
@@ -808,24 +800,30 @@ type storeState struct {
808800 // This is consistent with the union of state in clusterState.ranges,
809801 // filtered for replicas that are on this store.
810802 //
811- // NB: this can include LEARNER and VOTER_DEMOTING_LEARNER replicas.
803+ // NB: this can include roachpb.ReplicaTypes other than VOTER_FULL and
804+ // NON_VOTER, e.g. LEARNER, VOTER_DEMOTING_LEARNER etc.
812805 replicas map [roachpb.RangeID ]ReplicaState
813806 // topKRanges along some load dimension. If the store is overloaded along
814807 // one resource dimension, that is the dimension chosen when picking the
815808 // top-k.
816809 //
817810 // It includes ranges whose replicas are being removed via pending
818- // changes, or lease transfers. That is, it does not account for pending
819- // or enacted changes made since the last time top-k was computed.
811+ // changes, or lease transfers. That is, it does not account for
812+ // pending or enacted changes made since the last time top-k was
813+ // computed. It does account for pending changes that were pending
814+ // when the top-k was computed.
820815 //
821816 // The key in this map is a local store-id.
822817 //
823- // NB: this does not include LEARNER and VOTER_DEMOTING_LEARNER replicas.
818+ // NB: this only includes replicas that satisfy the isVoter() or
819+ // isNonVoter() methods, i.e., {VOTER_FULL, VOTER_INCOMING, NON_VOTER,
820+ // VOTER_DEMOTING_NON_VOTER}. This is justified in that these are the
821+ // only replica types where the allocator wants to explicitly consider
822+ // shedding, since the other states are transient states, that are
823+ // either going away, or will soon transition to a full-fledged state.
824824 //
825825 // We may decide to keep this top-k up-to-date incrementally instead of
826- // recomputing it from scratch on each StoreLeaseholderMsg. Since
827- // StoreLeaseholderMsg is incremental about the ranges it reports, that
828- // may provide a building block for the incremental computation.
826+ // recomputing it from scratch on each StoreLeaseholderMsg.
829827 //
830828 // Example:
831829 // Assume the local node has two stores, s1 and s2.
@@ -1324,6 +1322,12 @@ func (rs *rangeState) removePendingChangeTracking(changeID changeID) {
13241322// nodes, and even though we have rebalancing components per store, we want to
13251323// reduce the sub-optimal decisions they make -- having a single clusterState
13261324// is important for that.
1325+ //
1326+ // REQUIREMENT(change-computation): Any request to compute a change
1327+ // (rebalancing stores, or a change specifically for a range), must be atomic
1328+ // with the leaseholder providing the current authoritative state for all its
1329+ // ranges (for rebalancing) or for the specific range (for a range-specific
1330+ // change).
13271331type clusterState struct {
13281332 ts timeutil.TimeSource
13291333 nodes map [roachpb.NodeID ]* nodeState
@@ -1788,10 +1792,13 @@ func (cs *clusterState) processStoreLeaseholderMsgInternal(
17881792 }
17891793 topk .threshold = threshold
17901794 }
1791- // TODO: replica is already adjusted for some ongoing changes, which may be
1792- // undone. So if s10 is a replica for range r1 whose leaseholder is the
1793- // local store s1 that is trying to transfer the lease away, s10 will not
1794- // see r1 below.
1795+ // NB: localss.adjusted.replicas is already adjusted for ongoing changes,
1796+ // which may be undone. For example, if s1 is the local store, and it is
1797+ // transferring its replica for range r1 to remote store s2, then
1798+ // localss.adjusted.replicas will not have r1. This is fine in that s1
1799+ // should not be making more decisions about r1. If the change is undone
1800+ // later, we will again compute the top-k, which will consider r1, before
1801+ // computing new changes (due to REQUIREMENT(change-computation)).
17951802 for rangeID , state := range localss .adjusted .replicas {
17961803 if ! state .IsLeaseholder {
17971804 // We may have transferred the lease away previously but still have a
@@ -1807,7 +1814,14 @@ func (cs *clusterState) processStoreLeaseholderMsgInternal(
18071814 continue
18081815 }
18091816 rs := cs .ranges [rangeID ]
1810- // TODO: replicas is also already adjusted.
1817+ // NB: rs.replicas is already adjusted for ongoing changes. For
1818+ // example, if s10 is a remote replica for range r1, which is being
1819+ // transferred to another store s11, s10 will not see r1 below. We
1820+ // make this choice to avoid cluttering the top-k for s10 with
1821+ // replicas that are going away. If it is undone, r1 will not be in
1822+ // the top-k for s10, but due to REQUIREMENT(change-computation), a
1823+ // new authoritative state will be provided and the top-k recomputed,
1824+ // before computing any new changes.
18111825 for _ , replica := range rs .replicas {
18121826 typ := replica .ReplicaState .ReplicaType .ReplicaType
18131827 if isVoter (typ ) || isNonVoter (typ ) {
0 commit comments