@@ -204,12 +204,12 @@ type ReplicaChange struct {
204204 // replica being demoted cannot retain the lease).
205205 //
206206 // NB: The prev value is always the state before the change. This is the
207- // source of truth provided by the leaseholder in the RangeMsg, so will
208- // have real ReplicaIDs (if already a replica) and real ReplicaTypes
207+ // latest source of truth provided by the leaseholder in the RangeMsg, so
208+ // will have real ReplicaIDs (if already a replica) and real ReplicaTypes
209209 // (including types beyond VOTER_FULL and NON_VOTER). This source-of-truth
210210 // claim is guaranteed by REQUIREMENT(change-computation) documented
211- // elsewhere, and the fact that new changes are computed only when there
212- // are no pending changes for a range.
211+ // elsewhere, and the fact that new changes are computed only when there are
212+ // no pending changes for a range.
213213 //
214214 // The ReplicaType in next is either the zero value (for removals), or
215215 // {VOTER_FULL, NON_VOTER} for additions/change, i.e., it represents the
@@ -218,6 +218,9 @@ type ReplicaChange struct {
218218 // TODO(tbg): in MakeLeaseTransferChanges, next.ReplicaType.ReplicaType is
219219 // simply the current value, and not necessarily {VOTER_FULL, NON_VOTER}.
220220 // So the above comment is incorrect. We should clean this up.
221+ //
222+ // The prev field is mutable after creation, to ensure that an undo restores
223+ // the state to the latest source of truth from the leaseholder.
221224 prev ReplicaState
222225 next ReplicaIDAndType
223226
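To make the prev/next semantics concrete, here is a minimal, hypothetical sketch (simplified stand-in types, not the package's actual ReplicaState/ReplicaChange): undo restores prev, so overwriting prev with the leaseholder's latest view makes a later undo converge to that view rather than to a stale snapshot.

package main

import "fmt"

// Simplified stand-ins for illustration only.
type replicaState struct {
	hasReplica    bool
	isLeaseholder bool
}

type replicaChange struct {
	prev replicaState // state restored on undo; mutable after creation
	next replicaState // intended state once the change is enacted
}

func (c *replicaChange) apply(cur *replicaState) { *cur = c.next }
func (c *replicaChange) undo(cur *replicaState)  { *cur = c.prev }

func main() {
	// Pending change: add a replica on s4 and give it the lease.
	s4 := replicaState{}
	change := &replicaChange{
		prev: s4, // no replica, no lease
		next: replicaState{hasReplica: true, isLeaseholder: true},
	}
	change.apply(&s4)

	// A later leaseholder message reports that s4 already has a replica but
	// not the lease. Overwriting prev with this latest source of truth means
	// a subsequent undo (e.g. GC) rolls back to that state, not to "no
	// replica at all".
	change.prev = replicaState{hasReplica: true, isLeaseholder: false}
	change.undo(&s4)
	fmt.Printf("%+v\n", s4) // {hasReplica:true isLeaseholder:false}
}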
@@ -459,21 +462,14 @@ func mapReplicaTypeToVoterOrNonVoter(rType roachpb.ReplicaType) roachpb.ReplicaT
459462// replicas, or transferring the lease. There is at most one change per store
460463// in the set.
461464//
462- // NB: pendingReplicaChanges is not visible outside the package, so we can be
463- // certain that callers outside this package that hold a PendingRangeChange
464- // cannot mutate the internals other than clearing the state.
465- //
466- // Additionally, for a PendingRangeChange returned outside the package, we
467- // ensure that the pendingReplicaChanges slice itself is not shared with the
468- // rangeState.pendingChanges slice since the rangeState.pendingChanges slice
469- // can have entries removed from it (and swapped around as part of removal).
470- //
471- // Some the state inside each *pendingReplicaChange is mutable at arbitrary
472- // points in time by the code inside this package (with the relevant locking,
473- // of course). Currently, this state is gcTime, enactedAtTime. Neither of it
474- // is read by the public methods on PendingRangeChange.
465+ // NB: pendingReplicaChanges is not visible outside the package.
475466//
476- // TODO(sumeer): when we expand the set of mutable fields, make a deep copy.
467+ // The *pendingReplicaChange objects are pointers since the clusterState
468+ // struct has multiple slices and maps that point to the same
469+ // *pendingReplicaChange object, which is mutable. To prevent race conditions
470+ // with exported functions on PendingRangeChange called from outside the
471+ // package, the *pendingReplicaChange objects returned outside the package
472+ // are copies that will not be mutated.
477473type PendingRangeChange struct {
478474 RangeID roachpb.RangeID
479475 pendingReplicaChanges []*pendingReplicaChange
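
A minimal sketch of the copy-on-return pattern described in the NB above, using simplified, hypothetical fields rather than the real clusterState plumbing: internal bookkeeping keeps the shared mutable pointers, while anything handed outside the package is backed by copies the package never mutates.

package main

import "fmt"

type pendingReplicaChange struct {
	changeID int
	gcTime   int64 // mutable after creation in internal bookkeeping
}

type PendingRangeChange struct {
	changes []*pendingReplicaChange
}

// externalView copies each change so that later internal mutation of the
// originals cannot race with a caller holding the returned value.
func externalView(internal []*pendingReplicaChange) PendingRangeChange {
	out := make([]*pendingReplicaChange, len(internal))
	for i, c := range internal {
		cp := *c // value copy; all fields here are value types
		out[i] = &cp
	}
	return PendingRangeChange{changes: out}
}

func main() {
	internal := []*pendingReplicaChange{{changeID: 1, gcTime: 100}}
	external := externalView(internal)
	internal[0].gcTime = 200                // internal mutation
	fmt.Println(external.changes[0].gcTime) // still 100
}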
@@ -727,9 +723,9 @@ type pendingReplicaChange struct {
727723 // expiry. All replica changes in a PendingRangeChange have the same
728724 // startTime.
729725 startTime time.Time
730- // gcTime represents a time when the unenacted change should be GC'd, either
731- // using the normal GC undo path, or if rangeState.pendingChangeNoRollback
732- // is true, when processing a RangeMsg from the leaseholder .
726+ // gcTime represents a time when the unenacted change should be GC'd.
727+ //
728+ // Mutable after creation.
733729 gcTime time.Time
734730
735731 // TODO(kvoli,sumeerbhola): Consider adopting an explicit expiration time,
@@ -742,6 +738,8 @@ type pendingReplicaChange struct {
742738 // information received from the leaseholder, this value is set, so that even
743739 // if the store with a replica affected by this pending change does not tell
744740 // us about the enactment, we can garbage collect this change.
741+ //
742+ // Mutable after creation.
745743 enactedAtTime time.Time
746744}
747745
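A rough, hypothetical sketch of how the two mutable timestamps can drive GC (helper name, durations, and behavior are illustrative only, not the actual gcPendingChanges logic): an unenacted change whose gcTime has passed is a candidate for rollback, while a change already marked enacted is eventually dropped without an undo.

package main

import (
	"fmt"
	"time"
)

type pendingChange struct {
	id            int
	gcTime        time.Time // mutable: may be pushed out, e.g. after partial enactment
	enactedAtTime time.Time // zero until the change is observed as enacted
}

// gcPartition splits expired changes into those to roll back (never enacted)
// and those to simply drop (enacted, past a grace period).
func gcPartition(now time.Time, grace time.Duration, pending []*pendingChange) (toUndo, toDrop []int) {
	for _, c := range pending {
		switch {
		case !c.enactedAtTime.IsZero():
			if !now.Before(c.enactedAtTime.Add(grace)) {
				toDrop = append(toDrop, c.id)
			}
		case !now.Before(c.gcTime):
			toUndo = append(toUndo, c.id)
		}
	}
	return toUndo, toDrop
}

func main() {
	now := time.Now()
	pending := []*pendingChange{
		{id: 1, gcTime: now.Add(-time.Minute)},             // expired, never enacted
		{id: 2, enactedAtTime: now.Add(-10 * time.Second)}, // enacted a while ago
		{id: 3, gcTime: now.Add(time.Minute)},              // not yet due
	}
	toUndo, toDrop := gcPartition(now, 5*time.Second, pending)
	fmt.Println(toUndo, toDrop) // [1] [2]
}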
@@ -1052,7 +1050,7 @@ type rangeState struct {
10521050 // that are still at the initial state, or an intermediate state, it can
10531051 // continue anticipating that these pending changes will happen. Tracking
10541052 // what is pending also allows for undo in the case of explicit failure,
1055- // notified by AdjustPendingChangesDisposition.
1053+ // notified by AdjustPendingChangesDisposition, or by GC.
10561054 //
10571055 // 2. Lifecycle
10581056 // pendingChanges track proposed modifications to a range's replicas or
@@ -1078,17 +1076,9 @@ type rangeState struct {
10781076 // has been enacted in this case.
10791077 //
10801078 // 2. Undone as failed: corresponding replica and load change is rolled back.
1081- // Note that for replica changes that originate from one action, all changes
1082- // would be undone together.
1083- // NB: pending changes of a range state originate from one decision.
1084- // Therefore, when one pending change is enacted successfully, we mark this
1085- // range state's pending changes as no rollback (read more about this in 3).
1086- // If we are here trying to undo a pending change but the range state has
1087- // already been marked as no rollback, we do not undo the remaining pending
1088- // changes. Instead, we wait for a StoreLeaseholderMsg to discard the pending
1089- // changes and revert the load adjustments after the
1090- // partiallyEnactedGCDuration has elapsed since the first enacted change. The
1091- // modeling here is imperfect (read more about this in 3).
1079+ // Note that for replica changes that originate from one action, some changes
1080+ // can be considered done because of the leaseholder msg, and others can be
1081+ // rolled back (say due to GC).
10921082 //
10931083 // This happens when:
10941084 // - The pending change failed to apply via
@@ -1149,14 +1139,10 @@ type rangeState struct {
11491139 // the replica and leaseholder to s4. An intermediate state that can be
11501140 // observed is {s1, s2, s3, s4} with the lease still at s3. But the pending
11511141 // change for adding s4 includes both that it has a replica, and it has the
1152- // lease, so we will not mark it done, and keep pretending that the whole
1153- // change is pending. Since lease transfers are fast, we accept this
1154- // imperfect modeling fidelity. One consequence of this imperfect modeling
1155- // is that if in this example there are no further changes observed until
1156- // GC, the allocator will undo both changes and go back to the state {s1,
1157- // s2, s3} with s3 as the leaseholder. That is, it has forgotten that s4 was
1158- // added. This is unavoidable and will be fixed by the first
1159- // StoreLeaseholderMsg post-GC.
1142+ // lease, so we will not mark it done, and keep pretending that the change
1143+ // is pending. However, we will change the prev state to indicate that s4
1144+ // has a replica, so that undo (say due to GC) rolls back to the latest
1145+ // source-of-truth from the leaseholder.
11601146 //
11611147 // 4. Non Atomicity Hazard
11621148 //
@@ -1165,20 +1151,19 @@ type rangeState struct {
11651151 // to contend with the hazard of having two leaseholders or no leaseholders.
11661152 // In the earlier example, say s3 and s4 were both local stores (a
11671153 // multi-store node), it may be possible to observe an intermediate state
1168- // {s1, s2, s3, s4} where s4 is the leaseholder. If we subsequently get a
1169- // spurious AdjustPendingChangesDisposition(success=false) call, or
1170- // time-based GC causes the s3 removal to be undone, there will be two
1171- // replicas marked as the leaseholder. The other extreme is believing that
1172- // the s3 transfer is done and the s4 incoming replica (and lease) failed
1173- // (this may not actually be possible because of the surrounding code).
1154+ // {s1, s2, s3, s4} where s4 is the leaseholder. We need to ensure that if
1155+ // the s3 removal is subsequently undone, whether via a spurious
1156+ // AdjustPendingChangesDisposition(success=false) call or via time-based GC,
1157+ // we do not end up with two replicas marked as the leaseholder. The other
1158+ // extreme is believing that the s3 transfer is done and the s4 incoming
1159+ // replica (and lease) failed (this may not actually be possible because of
1160+ // the surrounding code).
11741161 //
1175- // We deal with this hazard by observing that we've constructed multiple
1176- // pending changes in order to observe intermediate changes in the common
1177- // case of success. Once one change in the set of changes is considered
1178- // enacted, we mark the whole remaining group as no-rollback. In the above
1179- // case, if we see s4 has become the leaseholder, the s1 removal can't undo
1180- // itself -- it can be dropped if it is considered subsumed when processing
1181- // a RangeMsg, or it can be GC'd.
1162+ // This hazard is dealt with in the same way as outlined in the earlier
1163+ // example: when the leaseholder msg from s4 arrives listing {s1, s2, s3,
1164+ // s4} as replicas, the prev state for the s3 change is updated to indicate
1165+ // that s3 is no longer the leaseholder. This means that if the change is
1166+ // undone, s3 returns to a state where it has a replica but not the lease.
11821167 //
11831168 // Additionally, when processing a RangeMsg, if any of the pending changes
11841169 // is considered inconsistent, all the pending changes are discarded. This
@@ -1198,11 +1183,6 @@ type rangeState struct {
11981183 // rangeState.pendingChanges across all ranges in clusterState.ranges will
11991184 // be identical to clusterState.pendingChanges.
12001185 pendingChanges []*pendingReplicaChange
1201- // When set, the pendingChanges can not be rolled back anymore. They have
1202- // to be enacted, or discarded wholesale in favor of the latest RangeMsg
1203- // from the leaseholder. It is reset to false when pendingChanges
1204- // transitions from empty to non-empty.
1205- pendingChangeNoRollback bool
12061186
12071187 // If non-nil, it is up-to-date. Typically, non-nil for a range that has no
12081188 // pendingChanges and is not satisfying some constraint, since we don't want
@@ -1534,27 +1514,15 @@ func (cs *clusterState) processStoreLeaseholderMsgInternal(
15341514 // The change has been enacted according to the leaseholder.
15351515 enactedChanges = append (enactedChanges , change )
15361516 } else {
1517+ // Not subsumed. Replace the prev with the latest source of truth from
1518+ // the leaseholder. Note that this can be the noReplicaID case from above.
1519+ change.prev = adjustedReplica
15381521 remainingChanges = append(remainingChanges, change)
15381521 }
15391522 }
1540- gcRemainingChanges := false
1541- if rs .pendingChangeNoRollback {
1542- // A previous StoreLeaseholderMsg has enacted some changes, so the
1543- // remainingChanges may be GC'able. All of them share the same GC time.
1544- // Note that normal GC will not GC these, since normal GC needs to undo,
1545- // and we are not allowed to undo these.
1546- if len (remainingChanges ) > 0 {
1547- gcTime := remainingChanges [0 ].gcTime
1548- if gcTime .Before (now ) {
1549- gcRemainingChanges = true
1550- }
1551- }
1552- } else if len (enactedChanges ) > 0 && len (remainingChanges ) > 0 {
1553- // First time this set of changes is seeing something enacted, and there
1554- // are remaining changes.
1523+ if len(enactedChanges) > 0 && len(remainingChanges) > 0 {
1524+ // There are remaining changes, so potentially update their gcTime.
15551525 //
1556- // No longer permitted to rollback.
1557- rs .pendingChangeNoRollback = true
15581526 // All remaining changes have the same gcTime.
15591527 curGCTime := remainingChanges [0 ].gcTime
15601528 revisedGCTime := now .Add (partiallyEnactedGCDuration )
@@ -1598,27 +1566,19 @@ func (cs *clusterState) processStoreLeaseholderMsgInternal(
15981566 // preCheckOnApplyReplicaChanges returns false if there are any pending
15991567 // changes, and these are the changes that are pending. This is hacky
16001568 // and should be cleaned up.
1601- var valid bool
1602- var reason redact.RedactableString
1603- if gcRemainingChanges {
1604- reason = "GCing remaining changes after partial enactment"
1605- } else {
1606- // NB: rs.pendingChanges contains the same changes as
1607- // remainingChanges, but they are not the same slice.
1608- rc := rs .pendingChanges
1609- rs .pendingChanges = nil
1610- err := cs .preCheckOnApplyReplicaChanges (PendingRangeChange {
1611- RangeID : rangeMsg .RangeID ,
1612- pendingReplicaChanges : remainingChanges ,
1613- })
1614- valid = err == nil
1615- if err != nil {
1616- reason = redact .Sprint (err )
1617- }
1618- // Restore it.
1619- rs .pendingChanges = rc
1620- }
1621- if valid {
1569+ //
1570+ // NB: rs.pendingChanges contains the same changes as
1571+ // remainingChanges, but they are not the same slice.
1572+ rc := rs.pendingChanges
1573+ rs.pendingChanges = nil
1574+ err := cs.preCheckOnApplyReplicaChanges(PendingRangeChange{
1575+ RangeID: rangeMsg.RangeID,
1576+ pendingReplicaChanges: remainingChanges,
1577+ })
1578+ // Restore it.
1579+ rs.pendingChanges = rc
1580+
1581+ if err == nil {
16221582 // Re-apply the remaining changes. Note that the load change was not
16231583 // undone above, so we pass !applyLoadChange, to avoid applying it
16241584 // again. Also note that applyReplicaChange does not add to the various
@@ -1628,6 +1588,7 @@ func (cs *clusterState) processStoreLeaseholderMsgInternal(
16281588 cs .applyReplicaChange (change .ReplicaChange , false )
16291589 }
16301590 } else {
1591+ reason := redact.Sprint(err)
16311592 // The current state provided by the leaseholder does not permit these
16321593 // changes, so we need to drop them. This should be rare, but can happen
16331594 // if the leaseholder executed a change that MMA was completely unaware
@@ -1861,7 +1822,6 @@ func (cs *clusterState) processStoreLeaseholderMsgInternal(
18611822 topk := ss .adjusted .topKRanges [msg .StoreID ]
18621823 topk .doneInit ()
18631824 }
1864-
18651825}
18661826
18671827// If the pending replica change does not happen within this GC duration, we
@@ -1895,15 +1855,6 @@ func (cs *clusterState) gcPendingChanges(now time.Time) {
18951855 if ! ok {
18961856 panic (errors .AssertionFailedf ("range %v not found in cluster state" , rangeID ))
18971857 }
1898-
1899- // Unlike normal GC that reverts changes, we want to discard these pending
1900- // changes. Do nothing here; processStoreLeaseholderMsgInternal will later
1901- // detect and discard these pending changes. Note that
1902- // processStoreLeaseholderMsgInternal will not revert the pending load
1903- // change.
1904- if rs .pendingChangeNoRollback {
1905- continue
1906- }
19071858 if len (rs .pendingChanges ) == 0 {
19081859 panic (errors .AssertionFailedf ("no pending changes in range %v" , rangeID ))
19091860 }
@@ -1945,8 +1896,6 @@ func (cs *clusterState) pendingChangeEnacted(cid changeID, enactedAt time.Time)
19451896}
19461897
19471898// undoPendingChange reverses the change with ID cid.
1948- //
1949- // REQUIRES: the change is not marked as no-rollback.
19501899func (cs * clusterState ) undoPendingChange (cid changeID ) {
19511900 change , ok := cs .pendingChanges [cid ]
19521901 if ! ok {
@@ -1956,10 +1905,6 @@ func (cs *clusterState) undoPendingChange(cid changeID) {
19561905 if ! ok {
19571906 panic (errors .AssertionFailedf ("range %v not found in cluster state" , change .rangeID ))
19581907 }
1959- if rs .pendingChangeNoRollback {
1960- // One cannot undo changes once no-rollback is true.
1961- panic (errors .AssertionFailedf ("pending change is marked as no-rollback" ))
1962- }
19631908 // Wipe the analyzed constraints, as the range has changed.
19641909 rs .constraints = nil
19651910 rs .lastFailedChange = cs .ts .Now ()
@@ -1992,6 +1937,10 @@ func printMapPendingChanges(changes map[changeID]*pendingReplicaChange) string {
19921937// adjusted load, tracked pending changes and changeIDs to reflect the pending
19931938// application. It updates the *pendingReplicaChanges inside the change.
19941939//
1940+ // The change contains replica changes that will be returned outside the
1941+ // package, so a copy is made for package-internal use (see the comment on
1942+ // PendingRangeChange about mutability).
1943+ //
19951944// REQUIRES: all the replica changes are to the same range, and that the range
19961945// has no pending changes.
19971946func (cs * clusterState ) addPendingRangeChange (change PendingRangeChange ) {
@@ -2018,26 +1967,26 @@ func (cs *clusterState) addPendingRangeChange(change PendingRangeChange) {
20181967 // Only the lease is being transferred.
20191968 gcDuration = pendingLeaseTransferGCDuration
20201969 }
2021- pendingChanges := change .pendingReplicaChanges
20221970 now := cs .ts .Now ()
2023- for _ , pendingChange := range pendingChanges {
2024- cs .applyReplicaChange (pendingChange .ReplicaChange , true )
1971+ for _, origPendingChange := range change.pendingReplicaChanges {
1972+ cs.applyReplicaChange(origPendingChange.ReplicaChange, true)
20251973 cs .changeSeqGen ++
20261974 cid := cs .changeSeqGen
2027- pendingChange .changeID = cid
2028- pendingChange .startTime = now
2029- pendingChange .gcTime = now .Add (gcDuration )
2030- pendingChange .enactedAtTime = time.Time {}
1975+ origPendingChange.changeID = cid
1976+ origPendingChange.startTime = now
1977+ origPendingChange.gcTime = now.Add(gcDuration)
1978+ origPendingChange.enactedAtTime = time.Time{}
1979+ // Make a copy for internal tracking, since the internal state is mutable.
1980+ pendingChange := &pendingReplicaChange{}
1981+ *pendingChange = *origPendingChange
20311982 storeState := cs .stores [pendingChange .target .StoreID ]
20321983 rangeState := cs .ranges [rangeID ]
20331984 cs .pendingChanges [cid ] = pendingChange
20341985 storeState .adjusted .loadPendingChanges [cid ] = pendingChange
20351986 rangeState .pendingChanges = append (rangeState .pendingChanges , pendingChange )
2036- rangeState .pendingChangeNoRollback = false
20371987 log .KvDistribution .VInfof (context .Background (), 3 ,
20381988 "addPendingRangeChange: change_id=%v, range_id=%v, change=%v" ,
20391989 cid , rangeID , pendingChange .ReplicaChange )
2040- pendingChanges = append (pendingChanges , pendingChange )
20411990 }
20421991}
20431992
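The loop above registers each internal copy under several indexes. A rough, hypothetical sketch of that shape (simplified fields and names, not the real clusterState): the same *pendingReplicaChange pointer is shared by the by-ID map, the per-store map, and the per-range slice, which is why a mutation made through one index is visible through the others.

package main

import "fmt"

type changeID int

type pendingReplicaChange struct {
	changeID changeID
	storeID  int
	rangeID  int
	gcTime   int64
}

// Simplified registry: one mutable object shared across all three indexes.
type registry struct {
	seq     changeID
	byID    map[changeID]*pendingReplicaChange
	byStore map[int]map[changeID]*pendingReplicaChange
	byRange map[int][]*pendingReplicaChange
}

func (r *registry) add(c *pendingReplicaChange) changeID {
	r.seq++
	c.changeID = r.seq
	r.byID[c.changeID] = c
	if r.byStore[c.storeID] == nil {
		r.byStore[c.storeID] = map[changeID]*pendingReplicaChange{}
	}
	r.byStore[c.storeID][c.changeID] = c
	r.byRange[c.rangeID] = append(r.byRange[c.rangeID], c)
	return c.changeID
}

func main() {
	r := &registry{
		byID:    map[changeID]*pendingReplicaChange{},
		byStore: map[int]map[changeID]*pendingReplicaChange{},
		byRange: map[int][]*pendingReplicaChange{},
	}
	id := r.add(&pendingReplicaChange{storeID: 4, rangeID: 7})
	r.byID[id].gcTime = 100             // mutate via one index...
	fmt.Println(r.byRange[7][0].gcTime) // ...visible via another: 100
}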
@@ -2105,8 +2054,6 @@ func (cs *clusterState) preCheckOnApplyReplicaChanges(rangeChange PendingRangeCh
21052054// preCheckOnUndoReplicaChanges does some validation of the changes being
21062055// proposed for undo.
21072056//
2108- // REQUIRES: the rangeState.pendingChangeNoRollback is false.
2109- //
21102057// This method is defensive since if we always check against the current state
21112058// before allowing a change to be added (including re-addition after a
21122059// StoreLeaseholderMsg), we should never have invalidity during an undo, if