Skip to content

Commit 2ed96a3

Browse files
committed
mma: eliminate pendingChangeNoRollback
Instead, the ReplicaChange.prev field is updated to reflect the latest state reported by the leaseholder. In addition to simplifyin the code, it fixes an existing issue where an undo could rollback to a state preceding the latest leaseholder state. Informs #157049 Epic: CRDB-55052 Release note: None
1 parent a0b2049 commit 2ed96a3

File tree

6 files changed

+326
-202
lines changed

6 files changed

+326
-202
lines changed

pkg/kv/kvserver/allocator/mmaprototype/allocator_state.go

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -262,18 +262,14 @@ func (a *allocatorState) ProcessStoreLoadMsg(ctx context.Context, msg *StoreLoad
262262
func (a *allocatorState) AdjustPendingChangeDisposition(change PendingRangeChange, success bool) {
263263
a.mu.Lock()
264264
defer a.mu.Unlock()
265-
rs, ok := a.cs.ranges[change.RangeID]
265+
_, ok := a.cs.ranges[change.RangeID]
266266
if !ok {
267267
// Range no longer exists. This can happen if the StoreLeaseholderMsg
268268
// which included the effect of the change that transferred the lease away
269269
// was already processed, causing the range to no longer be tracked by the
270270
// allocator.
271271
return
272272
}
273-
if !success && rs.pendingChangeNoRollback {
274-
// Not allowed to undo.
275-
return
276-
}
277273
// NB: It is possible that some of the changes have already been enacted via
278274
// StoreLeaseholderMsg, and even been garbage collected. So no assumption
279275
// can be made about whether these changes will be found in the allocator's
@@ -284,6 +280,9 @@ func (a *allocatorState) AdjustPendingChangeDisposition(change PendingRangeChang
284280
if !ok {
285281
continue
286282
}
283+
// NB: the ch and c pointers are not identical even though they have the
284+
// same changeID. We create two copies in
285+
// clusterState.addPendingRangeChange, since the internal copy is mutable.
287286
changes = append(changes, ch)
288287
}
289288
if len(changes) == 0 {

pkg/kv/kvserver/allocator/mmaprototype/cluster_state.go

Lines changed: 71 additions & 124 deletions
Original file line numberDiff line numberDiff line change
@@ -204,12 +204,12 @@ type ReplicaChange struct {
204204
// replica being demoted cannot retain the lease).
205205
//
206206
// NB: The prev value is always the state before the change. This is the
207-
// source of truth provided by the leaseholder in the RangeMsg, so will
208-
// have real ReplicaIDs (if already a replica) and real ReplicaTypes
207+
// latest source of truth provided by the leaseholder in the RangeMsg, so
208+
// will have real ReplicaIDs (if already a replica) and real ReplicaTypes
209209
// (including types beyond VOTER_FULL and NON_VOTER). This source-of-truth
210210
// claim is guaranteed by REQUIREMENT(change-computation) documented
211-
// elsewhere, and the fact that new changes are computed only when there
212-
// are no pending changes for a range.
211+
// elsewhere, and the fact that new changes are computed only when there are
212+
// no pending changes for a range.
213213
//
214214
// The ReplicaType in next is either the zero value (for removals), or
215215
// {VOTER_FULL, NON_VOTER} for additions/change, i.e., it represents the
@@ -218,6 +218,9 @@ type ReplicaChange struct {
218218
// TODO(tbg): in MakeLeaseTransferChanges, next.ReplicaType.ReplicaType is
219219
// simply the current value, and not necessarily {VOTER_FULL, NON_VOTER}.
220220
// So the above comment is incorrect. We should clean this up.
221+
//
222+
// The prev field is mutable after creation, to ensure that an undo restores
223+
// the state to the latest source of truth from the leaseholder.
221224
prev ReplicaState
222225
next ReplicaIDAndType
223226

@@ -459,21 +462,14 @@ func mapReplicaTypeToVoterOrNonVoter(rType roachpb.ReplicaType) roachpb.ReplicaT
459462
// replicas, or transferring the lease. There is at most one change per store
460463
// in the set.
461464
//
462-
// NB: pendingReplicaChanges is not visible outside the package, so we can be
463-
// certain that callers outside this package that hold a PendingRangeChange
464-
// cannot mutate the internals other than clearing the state.
465-
//
466-
// Additionally, for a PendingRangeChange returned outside the package, we
467-
// ensure that the pendingReplicaChanges slice itself is not shared with the
468-
// rangeState.pendingChanges slice since the rangeState.pendingChanges slice
469-
// can have entries removed from it (and swapped around as part of removal).
470-
//
471-
// Some the state inside each *pendingReplicaChange is mutable at arbitrary
472-
// points in time by the code inside this package (with the relevant locking,
473-
// of course). Currently, this state is gcTime, enactedAtTime. Neither of it
474-
// is read by the public methods on PendingRangeChange.
465+
// NB: pendingReplicaChanges is not visible outside the package.
475466
//
476-
// TODO(sumeer): when we expand the set of mutable fields, make a deep copy.
467+
// The *pendingReplicaChange objects are pointers since the clusterState
468+
// struct has multiple slices and maps that point to the same
469+
// *pendingReplicaChange object, which is mutable. To prevent race conditions
470+
// with exported functions on PendingRangeChange called from outside the
471+
// package, the *pendingReplicaChange objects returned outside the package are
472+
// a copy that will not be mutated.
477473
type PendingRangeChange struct {
478474
RangeID roachpb.RangeID
479475
pendingReplicaChanges []*pendingReplicaChange
@@ -727,9 +723,9 @@ type pendingReplicaChange struct {
727723
// expiry. All replica changes in a PendingRangeChange have the same
728724
// startTime.
729725
startTime time.Time
730-
// gcTime represents a time when the unenacted change should be GC'd, either
731-
// using the normal GC undo path, or if rangeState.pendingChangeNoRollback
732-
// is true, when processing a RangeMsg from the leaseholder.
726+
// gcTime represents a time when the unenacted change should be GC'd.
727+
//
728+
// Mutable after creation.
733729
gcTime time.Time
734730

735731
// TODO(kvoli,sumeerbhola): Consider adopting an explicit expiration time,
@@ -742,6 +738,8 @@ type pendingReplicaChange struct {
742738
// information received from the leaseholder, this value is set, so that even
743739
// if the store with a replica affected by this pending change does not tell
744740
// us about the enactment, we can garbage collect this change.
741+
//
742+
// Mutable after creation.
745743
enactedAtTime time.Time
746744
}
747745

@@ -1052,7 +1050,7 @@ type rangeState struct {
10521050
// that are still at the initial state, or an intermediate state, it can
10531051
// continue anticipating that these pending changes will happen. Tracking
10541052
// what is pending also allows for undo in the case of explicit failure,
1055-
// notified by AdjustPendingChangesDisposition.
1053+
// notified by AdjustPendingChangesDisposition, or GC.
10561054
//
10571055
// 2. Lifecycle
10581056
// pendingChanges track proposed modifications to a range's replicas or
@@ -1078,17 +1076,9 @@ type rangeState struct {
10781076
// has been enacted in this case.
10791077
//
10801078
// 2. Undone as failed: corresponding replica and load change is rolled back.
1081-
// Note that for replica changes that originate from one action, all changes
1082-
// would be undone together.
1083-
// NB: pending changes of a range state originate from one decision.
1084-
// Therefore, when one pending change is enacted successfully, we mark this
1085-
// range state's pending changes as no rollback (read more about this in 3).
1086-
// If we are here trying to undo a pending change but the range state has
1087-
// already been marked as no rollback, we do not undo the remaining pending
1088-
// changes. Instead, we wait for a StoreLeaseholderMsg to discard the pending
1089-
// changes and revert the load adjustments after the
1090-
// partiallyEnactedGCDuration has elapsed since the first enacted change. The
1091-
// modeling here is imperfect (read more about this in 3).
1079+
// Note that for replica changes that originate from one action, some changes
1080+
// can be considered done because of the leaseholder msg, and others can be
1081+
// rolled back (say due to GC).
10921082
//
10931083
// This happens when:
10941084
// - The pending change failed to apply via
@@ -1149,14 +1139,10 @@ type rangeState struct {
11491139
// the replica and leaseholder to s4. An intermediate state that can be
11501140
// observed is {s1, s2, s3, s4} with the lease still at s3. But the pending
11511141
// change for adding s4 includes both that it has a replica, and it has the
1152-
// lease, so we will not mark it done, and keep pretending that the whole
1153-
// change is pending. Since lease transfers are fast, we accept this
1154-
// imperfect modeling fidelity. One consequence of this imperfect modeling
1155-
// is that if in this example there are no further changes observed until
1156-
// GC, the allocator will undo both changes and go back to the state {s1,
1157-
// s2, s3} with s3 as the leaseholder. That is, it has forgotten that s4 was
1158-
// added. This is unavoidable and will be fixed by the first
1159-
// StoreLeaseholderMsg post-GC.
1142+
// lease, so we will not mark it done, and keep pretending that the change
1143+
// is pending. However, we will change the prev state to indicate that s4
1144+
// has a replica, so that undo (say due to GC) rolls back to the latest
1145+
// source-of-truth from the leaseholder.
11601146
//
11611147
// 4. Non Atomicity Hazard
11621148
//
@@ -1165,20 +1151,19 @@ type rangeState struct {
11651151
// to contend with the hazard of having two leaseholders or no leaseholders.
11661152
// In the earlier example, say s3 and s4 were both local stores (a
11671153
// multi-store node), it may be possible to observe an intermediate state
1168-
// {s1, s2, s3, s4} where s4 is the leaseholder. If we subsequently get a
1169-
// spurious AdjustPendingChangesDisposition(success=false) call, or
1170-
// time-based GC causes the s3 removal to be undone, there will be two
1171-
// replicas marked as the leaseholder. The other extreme is believing that
1172-
// the s3 transfer is done and the s4 incoming replica (and lease) failed
1173-
// (this may not actually be possible because of the surrounding code).
1154+
// {s1, s2, s3, s4} where s4 is the leaseholder. We need to ensure that if
1155+
// we subsequently get a spurious
1156+
// AdjustPendingChangesDisposition(success=false) call, or time-based GC
1157+
// causes the s3 removal to be undone, there will not be two replicas marked
1158+
// as the leaseholder. The other extreme is believing that the s3 transfer
1159+
// is done and the s4 incoming replica (and lease) failed (this may not
1160+
// actually be possible because of the surrounding code).
11741161
//
1175-
// We deal with this hazard by observing that we've constructed multiple
1176-
// pending changes in order to observe intermediate changes in the common
1177-
// case of success. Once one change in the set of changes is considered
1178-
// enacted, we mark the whole remaining group as no-rollback. In the above
1179-
// case, if we see s4 has become the leaseholder, the s1 removal can't undo
1180-
// itself -- it can be dropped if it is considered subsumed when processing
1181-
// a RangeMsg, or it can be GC'd.
1162+
// This hazard is dealt with in the same way outlined in the earlier
1163+
// example: when the leaseholder msg from s4 arrives that lists {s1, s2, s3,
1164+
// s4} as replicas, the prev state for the s3 change is updated to indicate
1165+
// that it is not the leaseholder. This means that if the change is undone,
1166+
// it will return to a prev state where it has a replica but not the lease.
11821167
//
11831168
// Additionally, when processing a RangeMsg, if any of the pending changes
11841169
// is considered inconsistent, all the pending changes are discarded. This
@@ -1198,11 +1183,6 @@ type rangeState struct {
11981183
// rangeState.pendingChanges across all ranges in clusterState.ranges will
11991184
// be identical to clusterState.pendingChanges.
12001185
pendingChanges []*pendingReplicaChange
1201-
// When set, the pendingChanges can not be rolled back anymore. They have
1202-
// to be enacted, or discarded wholesale in favor of the latest RangeMsg
1203-
// from the leaseholder. It is reset to false when pendingChanges
1204-
// transitions from empty to non-empty.
1205-
pendingChangeNoRollback bool
12061186

12071187
// If non-nil, it is up-to-date. Typically, non-nil for a range that has no
12081188
// pendingChanges and is not satisfying some constraint, since we don't want
@@ -1534,27 +1514,15 @@ func (cs *clusterState) processStoreLeaseholderMsgInternal(
15341514
// The change has been enacted according to the leaseholder.
15351515
enactedChanges = append(enactedChanges, change)
15361516
} else {
1517+
// Not subsumed. Replace the prev with the latest source of truth from
1518+
// the leaseholder. Note, this can be the noReplicaID case from above.
1519+
change.prev = adjustedReplica
15371520
remainingChanges = append(remainingChanges, change)
15381521
}
15391522
}
1540-
gcRemainingChanges := false
1541-
if rs.pendingChangeNoRollback {
1542-
// A previous StoreLeaseholderMsg has enacted some changes, so the
1543-
// remainingChanges may be GC'able. All of them share the same GC time.
1544-
// Note that normal GC will not GC these, since normal GC needs to undo,
1545-
// and we are not allowed to undo these.
1546-
if len(remainingChanges) > 0 {
1547-
gcTime := remainingChanges[0].gcTime
1548-
if gcTime.Before(now) {
1549-
gcRemainingChanges = true
1550-
}
1551-
}
1552-
} else if len(enactedChanges) > 0 && len(remainingChanges) > 0 {
1553-
// First time this set of changes is seeing something enacted, and there
1554-
// are remaining changes.
1523+
if len(enactedChanges) > 0 && len(remainingChanges) > 0 {
1524+
// There are remaining changes, so potentially update their gcTime.
15551525
//
1556-
// No longer permitted to rollback.
1557-
rs.pendingChangeNoRollback = true
15581526
// All remaining changes have the same gcTime.
15591527
curGCTime := remainingChanges[0].gcTime
15601528
revisedGCTime := now.Add(partiallyEnactedGCDuration)
@@ -1598,27 +1566,19 @@ func (cs *clusterState) processStoreLeaseholderMsgInternal(
15981566
// preCheckOnApplyReplicaChanges returns false if there are any pending
15991567
// changes, and these are the changes that are pending. This is hacky
16001568
// and should be cleaned up.
1601-
var valid bool
1602-
var reason redact.RedactableString
1603-
if gcRemainingChanges {
1604-
reason = "GCing remaining changes after partial enactment"
1605-
} else {
1606-
// NB: rs.pendingChanges contains the same changes as
1607-
// remainingChanges, but they are not the same slice.
1608-
rc := rs.pendingChanges
1609-
rs.pendingChanges = nil
1610-
err := cs.preCheckOnApplyReplicaChanges(PendingRangeChange{
1611-
RangeID: rangeMsg.RangeID,
1612-
pendingReplicaChanges: remainingChanges,
1613-
})
1614-
valid = err == nil
1615-
if err != nil {
1616-
reason = redact.Sprint(err)
1617-
}
1618-
// Restore it.
1619-
rs.pendingChanges = rc
1620-
}
1621-
if valid {
1569+
//
1570+
// NB: rs.pendingChanges contains the same changes as
1571+
// remainingChanges, but they are not the same slice.
1572+
rc := rs.pendingChanges
1573+
rs.pendingChanges = nil
1574+
err := cs.preCheckOnApplyReplicaChanges(PendingRangeChange{
1575+
RangeID: rangeMsg.RangeID,
1576+
pendingReplicaChanges: remainingChanges,
1577+
})
1578+
// Restore it.
1579+
rs.pendingChanges = rc
1580+
1581+
if err == nil {
16221582
// Re-apply the remaining changes. Note that the load change was not
16231583
// undone above, so we pass !applyLoadChange, to avoid applying it
16241584
// again. Also note that applyReplicaChange does not add to the various
@@ -1628,6 +1588,7 @@ func (cs *clusterState) processStoreLeaseholderMsgInternal(
16281588
cs.applyReplicaChange(change.ReplicaChange, false)
16291589
}
16301590
} else {
1591+
reason := redact.Sprint(err)
16311592
// The current state provided by the leaseholder does not permit these
16321593
// changes, so we need to drop them. This should be rare, but can happen
16331594
// if the leaseholder executed a change that MMA was completely unaware
@@ -1861,7 +1822,6 @@ func (cs *clusterState) processStoreLeaseholderMsgInternal(
18611822
topk := ss.adjusted.topKRanges[msg.StoreID]
18621823
topk.doneInit()
18631824
}
1864-
18651825
}
18661826

18671827
// If the pending replica change does not happen within this GC duration, we
@@ -1895,15 +1855,6 @@ func (cs *clusterState) gcPendingChanges(now time.Time) {
18951855
if !ok {
18961856
panic(errors.AssertionFailedf("range %v not found in cluster state", rangeID))
18971857
}
1898-
1899-
// Unlike normal GC that reverts changes, we want to discard these pending
1900-
// changes. Do nothing here; processStoreLeaseholderMsgInternal will later
1901-
// detect and discard these pending changes. Note that
1902-
// processStoreLeaseholderMsgInternal will not revert the pending load
1903-
// change.
1904-
if rs.pendingChangeNoRollback {
1905-
continue
1906-
}
19071858
if len(rs.pendingChanges) == 0 {
19081859
panic(errors.AssertionFailedf("no pending changes in range %v", rangeID))
19091860
}
@@ -1945,8 +1896,6 @@ func (cs *clusterState) pendingChangeEnacted(cid changeID, enactedAt time.Time)
19451896
}
19461897

19471898
// undoPendingChange reverses the change with ID cid.
1948-
//
1949-
// REQUIRES: the change is not marked as no-rollback.
19501899
func (cs *clusterState) undoPendingChange(cid changeID) {
19511900
change, ok := cs.pendingChanges[cid]
19521901
if !ok {
@@ -1956,10 +1905,6 @@ func (cs *clusterState) undoPendingChange(cid changeID) {
19561905
if !ok {
19571906
panic(errors.AssertionFailedf("range %v not found in cluster state", change.rangeID))
19581907
}
1959-
if rs.pendingChangeNoRollback {
1960-
// One cannot undo changes once no-rollback is true.
1961-
panic(errors.AssertionFailedf("pending change is marked as no-rollback"))
1962-
}
19631908
// Wipe the analyzed constraints, as the range has changed.
19641909
rs.constraints = nil
19651910
rs.lastFailedChange = cs.ts.Now()
@@ -1992,6 +1937,10 @@ func printMapPendingChanges(changes map[changeID]*pendingReplicaChange) string {
19921937
// adjusted load, tracked pending changes and changeIDs to reflect the pending
19931938
// application. It updates the *pendingReplicaChanges inside the change.
19941939
//
1940+
// The change contains replica changes that will be returned outside the
1941+
// package, so a copy is made for package internal use (see the comment on
1942+
// PendingRangeChange about mutability).
1943+
//
19951944
// REQUIRES: all the replica changes are to the same range, and that the range
19961945
// has no pending changes.
19971946
func (cs *clusterState) addPendingRangeChange(change PendingRangeChange) {
@@ -2018,26 +1967,26 @@ func (cs *clusterState) addPendingRangeChange(change PendingRangeChange) {
20181967
// Only the lease is being transferred.
20191968
gcDuration = pendingLeaseTransferGCDuration
20201969
}
2021-
pendingChanges := change.pendingReplicaChanges
20221970
now := cs.ts.Now()
2023-
for _, pendingChange := range pendingChanges {
2024-
cs.applyReplicaChange(pendingChange.ReplicaChange, true)
1971+
for _, origPendingChange := range change.pendingReplicaChanges {
1972+
cs.applyReplicaChange(origPendingChange.ReplicaChange, true)
20251973
cs.changeSeqGen++
20261974
cid := cs.changeSeqGen
2027-
pendingChange.changeID = cid
2028-
pendingChange.startTime = now
2029-
pendingChange.gcTime = now.Add(gcDuration)
2030-
pendingChange.enactedAtTime = time.Time{}
1975+
origPendingChange.changeID = cid
1976+
origPendingChange.startTime = now
1977+
origPendingChange.gcTime = now.Add(gcDuration)
1978+
origPendingChange.enactedAtTime = time.Time{}
1979+
// Make a copy for internal tracking, since the internal state is mutable.
1980+
pendingChange := &pendingReplicaChange{}
1981+
*pendingChange = *origPendingChange
20311982
storeState := cs.stores[pendingChange.target.StoreID]
20321983
rangeState := cs.ranges[rangeID]
20331984
cs.pendingChanges[cid] = pendingChange
20341985
storeState.adjusted.loadPendingChanges[cid] = pendingChange
20351986
rangeState.pendingChanges = append(rangeState.pendingChanges, pendingChange)
2036-
rangeState.pendingChangeNoRollback = false
20371987
log.KvDistribution.VInfof(context.Background(), 3,
20381988
"addPendingRangeChange: change_id=%v, range_id=%v, change=%v",
20391989
cid, rangeID, pendingChange.ReplicaChange)
2040-
pendingChanges = append(pendingChanges, pendingChange)
20411990
}
20421991
}
20431992

@@ -2105,8 +2054,6 @@ func (cs *clusterState) preCheckOnApplyReplicaChanges(rangeChange PendingRangeCh
21052054
// preCheckOnUndoReplicaChanges does some validation of the changes being
21062055
// proposed for undo.
21072056
//
2108-
// REQUIRES: the rangeState.pendingChangeNoRollback is false.
2109-
//
21102057
// This method is defensive since if we always check against the current state
21112058
// before allowing a change to be added (including re-addition after a
21122059
// StoreLeaseholderMsg), we should never have invalidity during an undo, if

0 commit comments

Comments
 (0)