From 2ed96a346bb41c84093492a12bc7670e07deb2c2 Mon Sep 17 00:00:00 2001
From: sumeerbhola
Date: Tue, 18 Nov 2025 15:55:15 -0500
Subject: [PATCH] mma: eliminate pendingChangeNoRollback

Instead, the ReplicaChange.prev field is updated to reflect the latest
state reported by the leaseholder.

In addition to simplifying the code, it fixes an existing issue where an
undo could roll back to a state preceding the latest leaseholder state.

Informs #157049

Epic: CRDB-55052

Release note: None
---
 .../allocator/mmaprototype/allocator_state.go |   9 +-
 .../allocator/mmaprototype/cluster_state.go   | 195 +++++++-----------
 .../testdata/cluster_state/rebalance_replica  |   8 +-
 .../rebalance_replica_local_stores            | 163 +++++++++----
 .../rebalance_replica_local_stores_enacted_gc |  76 ++++---
 ...e_replica_local_stores_fail_lease_transfer |  77 +++++++
 6 files changed, 326 insertions(+), 202 deletions(-)
 create mode 100644 pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_replica_local_stores_fail_lease_transfer

diff --git a/pkg/kv/kvserver/allocator/mmaprototype/allocator_state.go b/pkg/kv/kvserver/allocator/mmaprototype/allocator_state.go
index a27832b4af9c..f5f986550f38 100644
--- a/pkg/kv/kvserver/allocator/mmaprototype/allocator_state.go
+++ b/pkg/kv/kvserver/allocator/mmaprototype/allocator_state.go
@@ -262,7 +262,7 @@ func (a *allocatorState) ProcessStoreLoadMsg(ctx context.Context, msg *StoreLoad
 func (a *allocatorState) AdjustPendingChangeDisposition(change PendingRangeChange, success bool) {
 	a.mu.Lock()
 	defer a.mu.Unlock()
-	rs, ok := a.cs.ranges[change.RangeID]
+	_, ok := a.cs.ranges[change.RangeID]
 	if !ok {
 		// Range no longer exists. This can happen if the StoreLeaseholderMsg
 		// which included the effect of the change that transferred the lease away
@@ -270,10 +270,6 @@ func (a *allocatorState) AdjustPendingChangeDisposition(change PendingRangeChang
 		// allocator.
 		return
 	}
-	if !success && rs.pendingChangeNoRollback {
-		// Not allowed to undo.
-		return
-	}
 	// NB: It is possible that some of the changes have already been enacted via
 	// StoreLeaseholderMsg, and even been garbage collected. So no assumption
 	// can be made about whether these changes will be found in the allocator's
@@ -284,6 +280,9 @@ func (a *allocatorState) AdjustPendingChangeDisposition(change PendingRangeChang
 		if !ok {
 			continue
 		}
+		// NB: the ch and c pointers are not identical even though they have the
+		// same changeID. We create two copies in
+		// clusterState.addPendingRangeChange, since the internal copy is mutable.
 		changes = append(changes, ch)
 	}
 	if len(changes) == 0 {
diff --git a/pkg/kv/kvserver/allocator/mmaprototype/cluster_state.go b/pkg/kv/kvserver/allocator/mmaprototype/cluster_state.go
index eb65f83a71f3..ece48c1bd57e 100644
--- a/pkg/kv/kvserver/allocator/mmaprototype/cluster_state.go
+++ b/pkg/kv/kvserver/allocator/mmaprototype/cluster_state.go
@@ -204,12 +204,12 @@ type ReplicaChange struct {
 	// replica being demoted cannot retain the lease).
 	//
 	// NB: The prev value is always the state before the change. This is the
-	// source of truth provided by the leaseholder in the RangeMsg, so will
-	// have real ReplicaIDs (if already a replica) and real ReplicaTypes
+	// latest source of truth provided by the leaseholder in the RangeMsg, so
+	// will have real ReplicaIDs (if already a replica) and real ReplicaTypes
 	// (including types beyond VOTER_FULL and NON_VOTER).
This source-of-truth // claim is guaranteed by REQUIREMENT(change-computation) documented - // elsewhere, and the fact that new changes are computed only when there - // are no pending changes for a range. + // elsewhere, and the fact that new changes are computed only when there are + // no pending changes for a range. // // The ReplicaType in next is either the zero value (for removals), or // {VOTER_FULL, NON_VOTER} for additions/change, i.e., it represents the @@ -218,6 +218,9 @@ type ReplicaChange struct { // TODO(tbg): in MakeLeaseTransferChanges, next.ReplicaType.ReplicaType is // simply the current value, and not necessarily {VOTER_FULL, NON_VOTER}. // So the above comment is incorrect. We should clean this up. + // + // The prev field is mutable after creation, to ensure that an undo restores + // the state to the latest source of truth from the leaseholder. prev ReplicaState next ReplicaIDAndType @@ -459,21 +462,14 @@ func mapReplicaTypeToVoterOrNonVoter(rType roachpb.ReplicaType) roachpb.ReplicaT // replicas, or transferring the lease. There is at most one change per store // in the set. // -// NB: pendingReplicaChanges is not visible outside the package, so we can be -// certain that callers outside this package that hold a PendingRangeChange -// cannot mutate the internals other than clearing the state. -// -// Additionally, for a PendingRangeChange returned outside the package, we -// ensure that the pendingReplicaChanges slice itself is not shared with the -// rangeState.pendingChanges slice since the rangeState.pendingChanges slice -// can have entries removed from it (and swapped around as part of removal). -// -// Some the state inside each *pendingReplicaChange is mutable at arbitrary -// points in time by the code inside this package (with the relevant locking, -// of course). Currently, this state is gcTime, enactedAtTime. Neither of it -// is read by the public methods on PendingRangeChange. +// NB: pendingReplicaChanges is not visible outside the package. // -// TODO(sumeer): when we expand the set of mutable fields, make a deep copy. +// The *pendingReplicaChange objects are pointers since the clusterState +// struct has multiple slices and maps that point to the same +// *pendingReplicaChange object, which is mutable. To prevent race conditions +// with exported functions on PendingRangeChange called from outside the +// package, the *pendingReplicaChange objects returned outside the package are +// a copy that will not be mutated. type PendingRangeChange struct { RangeID roachpb.RangeID pendingReplicaChanges []*pendingReplicaChange @@ -727,9 +723,9 @@ type pendingReplicaChange struct { // expiry. All replica changes in a PendingRangeChange have the same // startTime. startTime time.Time - // gcTime represents a time when the unenacted change should be GC'd, either - // using the normal GC undo path, or if rangeState.pendingChangeNoRollback - // is true, when processing a RangeMsg from the leaseholder. + // gcTime represents a time when the unenacted change should be GC'd. + // + // Mutable after creation. gcTime time.Time // TODO(kvoli,sumeerbhola): Consider adopting an explicit expiration time, @@ -742,6 +738,8 @@ type pendingReplicaChange struct { // information received from the leaseholder, this value is set, so that even // if the store with a replica affected by this pending change does not tell // us about the enactment, we can garbage collect this change. + // + // Mutable after creation. 
enactedAtTime time.Time } @@ -1052,7 +1050,7 @@ type rangeState struct { // that are still at the initial state, or an intermediate state, it can // continue anticipating that these pending changes will happen. Tracking // what is pending also allows for undo in the case of explicit failure, - // notified by AdjustPendingChangesDisposition. + // notified by AdjustPendingChangesDisposition, or GC. // // 2. Lifecycle // pendingChanges track proposed modifications to a range's replicas or @@ -1078,17 +1076,9 @@ type rangeState struct { // has been enacted in this case. // // 2. Undone as failed: corresponding replica and load change is rolled back. - // Note that for replica changes that originate from one action, all changes - // would be undone together. - // NB: pending changes of a range state originate from one decision. - // Therefore, when one pending change is enacted successfully, we mark this - // range state's pending changes as no rollback (read more about this in 3). - // If we are here trying to undo a pending change but the range state has - // already been marked as no rollback, we do not undo the remaining pending - // changes. Instead, we wait for a StoreLeaseholderMsg to discard the pending - // changes and revert the load adjustments after the - // partiallyEnactedGCDuration has elapsed since the first enacted change. The - // modeling here is imperfect (read more about this in 3). + // Note that for replica changes that originate from one action, some changes + // can be considered done because of the leaseholder msg, and others can be + // rolled back (say due to GC). // // This happens when: // - The pending change failed to apply via @@ -1149,14 +1139,10 @@ type rangeState struct { // the replica and leaseholder to s4. An intermediate state that can be // observed is {s1, s2, s3, s4} with the lease still at s3. But the pending // change for adding s4 includes both that it has a replica, and it has the - // lease, so we will not mark it done, and keep pretending that the whole - // change is pending. Since lease transfers are fast, we accept this - // imperfect modeling fidelity. One consequence of this imperfect modeling - // is that if in this example there are no further changes observed until - // GC, the allocator will undo both changes and go back to the state {s1, - // s2, s3} with s3 as the leaseholder. That is, it has forgotten that s4 was - // added. This is unavoidable and will be fixed by the first - // StoreLeaseholderMsg post-GC. + // lease, so we will not mark it done, and keep pretending that the change + // is pending. However, we will change the prev state to indicate that s4 + // has a replica, so that undo (say due to GC) rolls back to the latest + // source-of-truth from the leaseholder. // // 4. Non Atomicity Hazard // @@ -1165,20 +1151,19 @@ type rangeState struct { // to contend with the hazard of having two leaseholders or no leaseholders. // In the earlier example, say s3 and s4 were both local stores (a // multi-store node), it may be possible to observe an intermediate state - // {s1, s2, s3, s4} where s4 is the leaseholder. If we subsequently get a - // spurious AdjustPendingChangesDisposition(success=false) call, or - // time-based GC causes the s3 removal to be undone, there will be two - // replicas marked as the leaseholder. The other extreme is believing that - // the s3 transfer is done and the s4 incoming replica (and lease) failed - // (this may not actually be possible because of the surrounding code). 
+ // {s1, s2, s3, s4} where s4 is the leaseholder. We need to ensure that if + // we subsequently get a spurious + // AdjustPendingChangesDisposition(success=false) call, or time-based GC + // causes the s3 removal to be undone, there will not be two replicas marked + // as the leaseholder. The other extreme is believing that the s3 transfer + // is done and the s4 incoming replica (and lease) failed (this may not + // actually be possible because of the surrounding code). // - // We deal with this hazard by observing that we've constructed multiple - // pending changes in order to observe intermediate changes in the common - // case of success. Once one change in the set of changes is considered - // enacted, we mark the whole remaining group as no-rollback. In the above - // case, if we see s4 has become the leaseholder, the s1 removal can't undo - // itself -- it can be dropped if it is considered subsumed when processing - // a RangeMsg, or it can be GC'd. + // This hazard is dealt with in the same way outlined in the earlier + // example: when the leaseholder msg from s4 arrives that lists {s1, s2, s3, + // s4} as replicas, the prev state for the s3 change is updated to indicate + // that it is not the leaseholder. This means that if the change is undone, + // it will return to a prev state where it has a replica but not the lease. // // Additionally, when processing a RangeMsg, if any of the pending changes // is considered inconsistent, all the pending changes are discarded. This @@ -1198,11 +1183,6 @@ type rangeState struct { // rangeState.pendingChanges across all ranges in clusterState.ranges will // be identical to clusterState.pendingChanges. pendingChanges []*pendingReplicaChange - // When set, the pendingChanges can not be rolled back anymore. They have - // to be enacted, or discarded wholesale in favor of the latest RangeMsg - // from the leaseholder. It is reset to false when pendingChanges - // transitions from empty to non-empty. - pendingChangeNoRollback bool // If non-nil, it is up-to-date. Typically, non-nil for a range that has no // pendingChanges and is not satisfying some constraint, since we don't want @@ -1534,27 +1514,15 @@ func (cs *clusterState) processStoreLeaseholderMsgInternal( // The change has been enacted according to the leaseholder. enactedChanges = append(enactedChanges, change) } else { + // Not subsumed. Replace the prev with the latest source of truth from + // the leaseholder. Note, this can be the noReplicaID case from above. + change.prev = adjustedReplica remainingChanges = append(remainingChanges, change) } } - gcRemainingChanges := false - if rs.pendingChangeNoRollback { - // A previous StoreLeaseholderMsg has enacted some changes, so the - // remainingChanges may be GC'able. All of them share the same GC time. - // Note that normal GC will not GC these, since normal GC needs to undo, - // and we are not allowed to undo these. - if len(remainingChanges) > 0 { - gcTime := remainingChanges[0].gcTime - if gcTime.Before(now) { - gcRemainingChanges = true - } - } - } else if len(enactedChanges) > 0 && len(remainingChanges) > 0 { - // First time this set of changes is seeing something enacted, and there - // are remaining changes. + if len(enactedChanges) > 0 && len(remainingChanges) > 0 { + // There are remaining changes, so potentially update their gcTime. // - // No longer permitted to rollback. - rs.pendingChangeNoRollback = true // All remaining changes have the same gcTime. 
curGCTime := remainingChanges[0].gcTime revisedGCTime := now.Add(partiallyEnactedGCDuration) @@ -1598,27 +1566,19 @@ func (cs *clusterState) processStoreLeaseholderMsgInternal( // preCheckOnApplyReplicaChanges returns false if there are any pending // changes, and these are the changes that are pending. This is hacky // and should be cleaned up. - var valid bool - var reason redact.RedactableString - if gcRemainingChanges { - reason = "GCing remaining changes after partial enactment" - } else { - // NB: rs.pendingChanges contains the same changes as - // remainingChanges, but they are not the same slice. - rc := rs.pendingChanges - rs.pendingChanges = nil - err := cs.preCheckOnApplyReplicaChanges(PendingRangeChange{ - RangeID: rangeMsg.RangeID, - pendingReplicaChanges: remainingChanges, - }) - valid = err == nil - if err != nil { - reason = redact.Sprint(err) - } - // Restore it. - rs.pendingChanges = rc - } - if valid { + // + // NB: rs.pendingChanges contains the same changes as + // remainingChanges, but they are not the same slice. + rc := rs.pendingChanges + rs.pendingChanges = nil + err := cs.preCheckOnApplyReplicaChanges(PendingRangeChange{ + RangeID: rangeMsg.RangeID, + pendingReplicaChanges: remainingChanges, + }) + // Restore it. + rs.pendingChanges = rc + + if err == nil { // Re-apply the remaining changes. Note that the load change was not // undone above, so we pass !applyLoadChange, to avoid applying it // again. Also note that applyReplicaChange does not add to the various @@ -1628,6 +1588,7 @@ func (cs *clusterState) processStoreLeaseholderMsgInternal( cs.applyReplicaChange(change.ReplicaChange, false) } } else { + reason := redact.Sprint(err) // The current state provided by the leaseholder does not permit these // changes, so we need to drop them. This should be rare, but can happen // if the leaseholder executed a change that MMA was completely unaware @@ -1861,7 +1822,6 @@ func (cs *clusterState) processStoreLeaseholderMsgInternal( topk := ss.adjusted.topKRanges[msg.StoreID] topk.doneInit() } - } // If the pending replica change does not happen within this GC duration, we @@ -1895,15 +1855,6 @@ func (cs *clusterState) gcPendingChanges(now time.Time) { if !ok { panic(errors.AssertionFailedf("range %v not found in cluster state", rangeID)) } - - // Unlike normal GC that reverts changes, we want to discard these pending - // changes. Do nothing here; processStoreLeaseholderMsgInternal will later - // detect and discard these pending changes. Note that - // processStoreLeaseholderMsgInternal will not revert the pending load - // change. - if rs.pendingChangeNoRollback { - continue - } if len(rs.pendingChanges) == 0 { panic(errors.AssertionFailedf("no pending changes in range %v", rangeID)) } @@ -1945,8 +1896,6 @@ func (cs *clusterState) pendingChangeEnacted(cid changeID, enactedAt time.Time) } // undoPendingChange reverses the change with ID cid. -// -// REQUIRES: the change is not marked as no-rollback. func (cs *clusterState) undoPendingChange(cid changeID) { change, ok := cs.pendingChanges[cid] if !ok { @@ -1956,10 +1905,6 @@ func (cs *clusterState) undoPendingChange(cid changeID) { if !ok { panic(errors.AssertionFailedf("range %v not found in cluster state", change.rangeID)) } - if rs.pendingChangeNoRollback { - // One cannot undo changes once no-rollback is true. - panic(errors.AssertionFailedf("pending change is marked as no-rollback")) - } // Wipe the analyzed constraints, as the range has changed. 
rs.constraints = nil rs.lastFailedChange = cs.ts.Now() @@ -1992,6 +1937,10 @@ func printMapPendingChanges(changes map[changeID]*pendingReplicaChange) string { // adjusted load, tracked pending changes and changeIDs to reflect the pending // application. It updates the *pendingReplicaChanges inside the change. // +// The change contains replica changes that will be returned outside the +// package, so a copy is made for package internal use (see the comment on +// PendingRangeChange about mutability). +// // REQUIRES: all the replica changes are to the same range, and that the range // has no pending changes. func (cs *clusterState) addPendingRangeChange(change PendingRangeChange) { @@ -2018,26 +1967,26 @@ func (cs *clusterState) addPendingRangeChange(change PendingRangeChange) { // Only the lease is being transferred. gcDuration = pendingLeaseTransferGCDuration } - pendingChanges := change.pendingReplicaChanges now := cs.ts.Now() - for _, pendingChange := range pendingChanges { - cs.applyReplicaChange(pendingChange.ReplicaChange, true) + for _, origPendingChange := range change.pendingReplicaChanges { + cs.applyReplicaChange(origPendingChange.ReplicaChange, true) cs.changeSeqGen++ cid := cs.changeSeqGen - pendingChange.changeID = cid - pendingChange.startTime = now - pendingChange.gcTime = now.Add(gcDuration) - pendingChange.enactedAtTime = time.Time{} + origPendingChange.changeID = cid + origPendingChange.startTime = now + origPendingChange.gcTime = now.Add(gcDuration) + origPendingChange.enactedAtTime = time.Time{} + // Make a copy for internal tracking, since the internal state is mutable. + pendingChange := &pendingReplicaChange{} + *pendingChange = *origPendingChange storeState := cs.stores[pendingChange.target.StoreID] rangeState := cs.ranges[rangeID] cs.pendingChanges[cid] = pendingChange storeState.adjusted.loadPendingChanges[cid] = pendingChange rangeState.pendingChanges = append(rangeState.pendingChanges, pendingChange) - rangeState.pendingChangeNoRollback = false log.KvDistribution.VInfof(context.Background(), 3, "addPendingRangeChange: change_id=%v, range_id=%v, change=%v", cid, rangeID, pendingChange.ReplicaChange) - pendingChanges = append(pendingChanges, pendingChange) } } @@ -2105,8 +2054,6 @@ func (cs *clusterState) preCheckOnApplyReplicaChanges(rangeChange PendingRangeCh // preCheckOnUndoReplicaChanges does some validation of the changes being // proposed for undo. // -// REQUIRES: the rangeState.pendingChangeNoRollback is false. 
-// // This method is defensive since if we always check against the current state // before allowing a change to be added (including re-addition after a // StoreLeaseholderMsg), we should never have invalidity during an undo, if diff --git a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_replica b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_replica index 153b25b510a3..08300e0c0cb1 100644 --- a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_replica +++ b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_replica @@ -90,7 +90,7 @@ store-id=2 node-id=2 status=ok accepting all reported=[cpu:0, write-bandwidth:0, store-leaseholder-msg store-id=1 range-id=1 load=[80,80,80] raft-cpu=20 config=(num_replicas=3 constraints={'+region=us-west-1:1'} voter_constraints={'+region=us-west-1:1'}) - store-id=1 replica-id=2 type=VOTER_FULL leaseholder=true + store-id=1 replica-id=1 type=VOTER_FULL leaseholder=true store-id=2 replica-id=2 type=VOTER_FULL ---- @@ -99,7 +99,7 @@ get-pending-changes ---- pending(2) change-id=1 store-id=2 node-id=2 range-id=1 load-delta=[cpu:88, write-bandwidth:88, byte-size:88] start=0s gc=5m0s - prev=(replica-id=none type=VOTER_FULL) + prev=(replica-id=2 type=VOTER_FULL) next=(replica-id=unknown type=VOTER_FULL leaseholder=true) change-id=2 store-id=1 node-id=1 range-id=1 load-delta=[cpu:-80, write-bandwidth:-80, byte-size:-80] start=0s gc=5m0s prev=(replica-id=1 type=VOTER_FULL leaseholder=true) @@ -120,7 +120,7 @@ get-pending-changes ---- pending(2) change-id=1 store-id=2 node-id=2 range-id=1 load-delta=[cpu:88, write-bandwidth:88, byte-size:88] start=0s gc=5m0s enacted=5s - prev=(replica-id=none type=VOTER_FULL) + prev=(replica-id=2 type=VOTER_FULL) next=(replica-id=unknown type=VOTER_FULL leaseholder=true) change-id=2 store-id=1 node-id=1 range-id=1 load-delta=[cpu:-80, write-bandwidth:-80, byte-size:-80] start=0s gc=5m0s enacted=5s prev=(replica-id=1 type=VOTER_FULL leaseholder=true) @@ -160,7 +160,7 @@ get-pending-changes ---- pending(2) change-id=1 store-id=2 node-id=2 range-id=1 load-delta=[cpu:88, write-bandwidth:88, byte-size:88] start=0s gc=5m0s enacted=5s - prev=(replica-id=none type=VOTER_FULL) + prev=(replica-id=2 type=VOTER_FULL) next=(replica-id=unknown type=VOTER_FULL leaseholder=true) change-id=2 store-id=1 node-id=1 range-id=1 load-delta=[cpu:-80, write-bandwidth:-80, byte-size:-80] start=0s gc=5m0s enacted=5s prev=(replica-id=1 type=VOTER_FULL leaseholder=true) diff --git a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_replica_local_stores b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_replica_local_stores index b6a2f972abbc..88467731ceb6 100644 --- a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_replica_local_stores +++ b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_replica_local_stores @@ -12,14 +12,11 @@ # s1) remains pending until s1 is removed from the replica set. # 3. Verifies enacted changes cannot be rejected (they're removed from # pendingChanges). -# 4. Verifies no-rollback changes cannot be rejected once any change in the -# set is enacted. -# 5. Tracks load adjustments: pending changes adjust load until store-reported +# 4. Tracks load adjustments: pending changes adjust load until store-reported # load reflects the change (after 10s of enactment). After that, the change # is removed from load tracking but may remain for GC. -# 6. 
Tests GC: changes marked no-rollback cannot be GC'd via normal GC (which -# requires undo). They're only removed once store-reported load reflects the -# change. +# 5. Tests GC and update of gcTime. +# 6. Test update of ReplicaChange.prev based on latest leaseholder info. # 7. Rebalances back from s2 to s1 and verifies changes are GC'd after the # normal GC duration. # @@ -95,7 +92,7 @@ t=5s store-leaseholder-msg store-id=1 range-id=1 load=[80,80,80] raft-cpu=20 config=(num_replicas=3 constraints={'+region=us-west-1:1'} voter_constraints={'+region=us-west-1:1'}) - store-id=1 replica-id=2 type=VOTER_FULL leaseholder=true + store-id=1 replica-id=1 type=VOTER_FULL leaseholder=true store-id=2 replica-id=2 type=VOTER_FULL ---- @@ -104,7 +101,7 @@ get-pending-changes ---- pending(2) change-id=1 store-id=2 node-id=1 range-id=1 load-delta=[cpu:88, write-bandwidth:88, byte-size:88] start=0s gc=5m0s - prev=(replica-id=none type=VOTER_FULL) + prev=(replica-id=2 type=VOTER_FULL) next=(replica-id=unknown type=VOTER_FULL leaseholder=true) change-id=2 store-id=1 node-id=1 range-id=1 load-delta=[cpu:-80, write-bandwidth:-80, byte-size:-80] start=0s gc=5m0s prev=(replica-id=1 type=VOTER_FULL leaseholder=true) @@ -114,7 +111,7 @@ change-id=2 store-id=1 node-id=1 range-id=1 load-delta=[cpu:-80, write-bandwidth store-leaseholder-msg store-id=2 range-id=1 load=[80,80,80] raft-cpu=20 config=(num_replicas=3 constraints={'+region=us-west-1:1'} voter_constraints={'+region=us-west-1:1'}) - store-id=1 replica-id=2 type=VOTER_FULL + store-id=1 replica-id=1 type=VOTER_FULL store-id=2 replica-id=2 type=VOTER_FULL leaseholder=true ---- @@ -125,18 +122,25 @@ range-id=1 local-store=2 load=[cpu:80, write-bandwidth:80, byte-size:80] raft-cp store-id=2 replica-id=2 type=VOTER_FULL leaseholder=true # The addition of replica and lease on s2 is considered enacted. The removal -# of replica and lease from s1 is not yet enacted. The gc time is changes to -# be 30s after the enactment. +# of replica and lease from s1 is not yet enacted. The gc time is changed to +# be 30s after the enactment. Note that the prev state of change 2 no longer +# shows leaseholder=true since the lease has already been transferred. 
get-pending-changes ---- pending(2) change-id=1 store-id=2 node-id=1 range-id=1 load-delta=[cpu:88, write-bandwidth:88, byte-size:88] start=0s gc=5m0s enacted=5s - prev=(replica-id=none type=VOTER_FULL) + prev=(replica-id=2 type=VOTER_FULL) next=(replica-id=unknown type=VOTER_FULL leaseholder=true) change-id=2 store-id=1 node-id=1 range-id=1 load-delta=[cpu:-80, write-bandwidth:-80, byte-size:-80] start=0s gc=35s - prev=(replica-id=1 type=VOTER_FULL leaseholder=true) + prev=(replica-id=1 type=VOTER_FULL) next=(replica-id=none type=VOTER_FULL) +get-load-info +---- +store-id=1 node-id=1 status=ok accepting all reported=[cpu:80, write-bandwidth:80, byte-size:80] adjusted=[cpu:0, write-bandwidth:0, byte-size:0] node-reported-cpu=80 node-adjusted-cpu=88 seq=2 +store-id=2 node-id=1 status=ok accepting all reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:88, write-bandwidth:88, byte-size:88] node-reported-cpu=80 node-adjusted-cpu=88 seq=1 + top-k-ranges (local-store-id=2) dim=ByteSize: r1 + tick seconds=5 ---- t=10s @@ -146,21 +150,10 @@ reject-pending-changes change-ids=(1) expect-panic ---- pending(2) change-id=1 store-id=2 node-id=1 range-id=1 load-delta=[cpu:88, write-bandwidth:88, byte-size:88] start=0s gc=5m0s enacted=5s - prev=(replica-id=none type=VOTER_FULL) - next=(replica-id=unknown type=VOTER_FULL leaseholder=true) -change-id=2 store-id=1 node-id=1 range-id=1 load-delta=[cpu:-80, write-bandwidth:-80, byte-size:-80] start=0s gc=35s - prev=(replica-id=1 type=VOTER_FULL leaseholder=true) - next=(replica-id=none type=VOTER_FULL) - -# Change 2 is found, but is no-rollback, so can't be rejected. -reject-pending-changes change-ids=(2) expect-panic ----- -pending(2) -change-id=1 store-id=2 node-id=1 range-id=1 load-delta=[cpu:88, write-bandwidth:88, byte-size:88] start=0s gc=5m0s enacted=5s - prev=(replica-id=none type=VOTER_FULL) + prev=(replica-id=2 type=VOTER_FULL) next=(replica-id=unknown type=VOTER_FULL leaseholder=true) change-id=2 store-id=1 node-id=1 range-id=1 load-delta=[cpu:-80, write-bandwidth:-80, byte-size:-80] start=0s gc=35s - prev=(replica-id=1 type=VOTER_FULL leaseholder=true) + prev=(replica-id=1 type=VOTER_FULL) next=(replica-id=none type=VOTER_FULL) # Store load msg from s2, showing its new load. @@ -188,7 +181,7 @@ get-pending-changes ---- pending(1) change-id=2 store-id=1 node-id=1 range-id=1 load-delta=[cpu:-80, write-bandwidth:-80, byte-size:-80] start=0s gc=35s - prev=(replica-id=1 type=VOTER_FULL leaseholder=true) + prev=(replica-id=1 type=VOTER_FULL) next=(replica-id=none type=VOTER_FULL) # The adjusted load on s2 no longer reflects the addition of the replica and @@ -200,33 +193,37 @@ store-id=1 node-id=1 status=ok accepting all reported=[cpu:80, write-bandwidth:8 store-id=2 node-id=1 status=ok accepting all reported=[cpu:80, write-bandwidth:80, byte-size:80] adjusted=[cpu:80, write-bandwidth:80, byte-size:80] node-reported-cpu=160 node-adjusted-cpu=80 seq=4 top-k-ranges (local-store-id=2) dim=ByteSize: r1 -# Advance time enough for change 2 to be eligible for undo GC. -tick seconds=300 +# Advance time, but not enough to GC change 2. +tick seconds=10 ---- -t=5m10s +t=20s -# Even after the undo based GC time has elapsed change 2 cannot be undone -# since it is no-rollback. +# Change 2 is not undone. 
gc-pending-changes ---- pending(1) change-id=2 store-id=1 node-id=1 range-id=1 load-delta=[cpu:-80, write-bandwidth:-80, byte-size:-80] start=0s gc=35s - prev=(replica-id=1 type=VOTER_FULL leaseholder=true) + prev=(replica-id=1 type=VOTER_FULL) next=(replica-id=none type=VOTER_FULL) -# Store leaseholder msg from s2, showing that s1 is no longer a replica. +ranges +---- +range-id=1 local-store=2 load=[cpu:80, write-bandwidth:80, byte-size:80] raft-cpu=20 + store-id=2 replica-id=2 type=VOTER_FULL leaseholder=true + +# Store leaseholder msg from s2, showing that s1 is no longer a replica, so +# change 2 is considered enacted. store-leaseholder-msg store-id=2 range-id=1 load=[80,80,80] raft-cpu=20 config=(num_replicas=3 constraints={'+region=us-west-1:1'} voter_constraints={'+region=us-west-1:1'}) store-id=2 replica-id=2 type=VOTER_FULL leaseholder=true ---- -# Now change 2 is considered enacted. get-pending-changes ---- pending(1) -change-id=2 store-id=1 node-id=1 range-id=1 load-delta=[cpu:-80, write-bandwidth:-80, byte-size:-80] start=0s gc=35s enacted=5m10s - prev=(replica-id=1 type=VOTER_FULL leaseholder=true) +change-id=2 store-id=1 node-id=1 range-id=1 load-delta=[cpu:-80, write-bandwidth:-80, byte-size:-80] start=0s gc=35s enacted=20s + prev=(replica-id=1 type=VOTER_FULL) next=(replica-id=none type=VOTER_FULL) # Change 2 is still serving the purpose of adjusting the load on s1. @@ -236,10 +233,9 @@ store-id=1 node-id=1 status=ok accepting all reported=[cpu:80, write-bandwidth:8 store-id=2 node-id=1 status=ok accepting all reported=[cpu:80, write-bandwidth:80, byte-size:80] adjusted=[cpu:80, write-bandwidth:80, byte-size:80] node-reported-cpu=160 node-adjusted-cpu=80 seq=4 top-k-ranges (local-store-id=2) dim=CPURate: r1 -# Store load msg from s2, showing its new load. The enacted change is still -# needed because the enactment time is equal to the load-time. +# Store load msg from s2, showing its new load. store-load-msg - store-id=2 node-id=1 load=[85,85,85] capacity=[100,100,100] secondary-load=1 load-time=310s + store-id=2 node-id=1 load=[85,85,85] capacity=[100,100,100] secondary-load=1 load-time=20s ---- get-load-info @@ -250,13 +246,13 @@ store-id=2 node-id=1 status=ok accepting all reported=[cpu:85, write-bandwidth:8 tick seconds=20 ---- -t=5m30s +t=40s # Store load msg from s1, showing its new load. The enacted change is no # longer needed for load adjustment since load-time is 20s after the enactment # time (and lagForChangeReflectedInLoad is 10s). store-load-msg - store-id=1 node-id=1 load=[5,5,5] capacity=[100,100,100] secondary-load=1 load-time=330s + store-id=1 node-id=1 load=[5,5,5] capacity=[100,100,100] secondary-load=1 load-time=40s ---- # No longer tracking change 2 for the sake of load. 
@@ -275,10 +271,10 @@ make-pending-changes range-id=1 rebalance-replica: remove-store-id=2 add-store-id=1 ---- pending(2) -change-id=3 store-id=1 node-id=1 range-id=1 load-delta=[cpu:88, write-bandwidth:88, byte-size:88] start=5m30s gc=10m30s +change-id=3 store-id=1 node-id=1 range-id=1 load-delta=[cpu:88, write-bandwidth:88, byte-size:88] start=40s gc=5m40s prev=(replica-id=none type=VOTER_FULL) next=(replica-id=unknown type=VOTER_FULL leaseholder=true) -change-id=4 store-id=2 node-id=1 range-id=1 load-delta=[cpu:-80, write-bandwidth:-80, byte-size:-80] start=5m30s gc=10m30s +change-id=4 store-id=2 node-id=1 range-id=1 load-delta=[cpu:-80, write-bandwidth:-80, byte-size:-80] start=40s gc=5m40s prev=(replica-id=2 type=VOTER_FULL leaseholder=true) next=(replica-id=none type=VOTER_FULL) @@ -292,7 +288,7 @@ store-id=2 node-id=1 status=ok accepting all reported=[cpu:85, write-bandwidth:8 # Enough time elapses to GC the changes. tick seconds=310 ---- -t=10m40s +t=5m50s gc-pending-changes ---- @@ -309,3 +305,80 @@ get-load-info store-id=1 node-id=1 status=ok accepting all reported=[cpu:5, write-bandwidth:5, byte-size:5] adjusted=[cpu:5, write-bandwidth:5, byte-size:5] node-reported-cpu=90 node-adjusted-cpu=90 seq=5 store-id=2 node-id=1 status=ok accepting all reported=[cpu:85, write-bandwidth:85, byte-size:85] adjusted=[cpu:85, write-bandwidth:85, byte-size:85] node-reported-cpu=90 node-adjusted-cpu=90 seq=7 top-k-ranges (local-store-id=2) dim=CPURate: r1 + +# Transfer the replica from s2 back to s1. The lease will also be transferred. +make-pending-changes range-id=1 + rebalance-replica: remove-store-id=2 add-store-id=1 +---- +pending(2) +change-id=5 store-id=1 node-id=1 range-id=1 load-delta=[cpu:88, write-bandwidth:88, byte-size:88] start=5m50s gc=10m50s + prev=(replica-id=none type=VOTER_FULL) + next=(replica-id=unknown type=VOTER_FULL leaseholder=true) +change-id=6 store-id=2 node-id=1 range-id=1 load-delta=[cpu:-80, write-bandwidth:-80, byte-size:-80] start=5m50s gc=10m50s + prev=(replica-id=2 type=VOTER_FULL leaseholder=true) + next=(replica-id=none type=VOTER_FULL) + +ranges +---- +range-id=1 local-store=2 load=[cpu:80, write-bandwidth:80, byte-size:80] raft-cpu=20 + store-id=1 replica-id=unknown type=VOTER_FULL leaseholder=true + +get-load-info +---- +store-id=1 node-id=1 status=ok accepting all reported=[cpu:5, write-bandwidth:5, byte-size:5] adjusted=[cpu:93, write-bandwidth:93, byte-size:93] node-reported-cpu=90 node-adjusted-cpu=98 seq=6 +store-id=2 node-id=1 status=ok accepting all reported=[cpu:85, write-bandwidth:85, byte-size:85] adjusted=[cpu:5, write-bandwidth:5, byte-size:5] node-reported-cpu=90 node-adjusted-cpu=98 seq=8 + top-k-ranges (local-store-id=2) dim=CPURate: r1 + +# Store leaseholder msg from s1, showing that s1 is a replica and has the lease, so +# change 5 is considered enacted. +store-leaseholder-msg +store-id=1 + range-id=1 load=[80,80,80] raft-cpu=20 config=(num_replicas=3 constraints={'+region=us-west-1:1'} voter_constraints={'+region=us-west-1:1'}) + store-id=1 replica-id=3 type=VOTER_FULL leaseholder=true + store-id=2 replica-id=2 type=VOTER_FULL +---- + +# Change 6 is not yet enacted, but the prev state no longer shows +# leaseholder=true. 
+get-pending-changes +---- +pending(2) +change-id=5 store-id=1 node-id=1 range-id=1 load-delta=[cpu:88, write-bandwidth:88, byte-size:88] start=5m50s gc=10m50s enacted=5m50s + prev=(replica-id=none type=VOTER_FULL) + next=(replica-id=unknown type=VOTER_FULL leaseholder=true) +change-id=6 store-id=2 node-id=1 range-id=1 load-delta=[cpu:-80, write-bandwidth:-80, byte-size:-80] start=5m50s gc=6m20s + prev=(replica-id=2 type=VOTER_FULL) + next=(replica-id=none type=VOTER_FULL) + +ranges +---- +range-id=1 local-store=1 load=[cpu:80, write-bandwidth:80, byte-size:80] raft-cpu=20 + store-id=1 replica-id=3 type=VOTER_FULL leaseholder=true + +# store-id=2 still has top-k-ranges for local store s2 populated, since we +# haven't yet received a store leaseholder msg from s2 indicating it no longer +# is the leaseholder for r1. +get-load-info +---- +store-id=1 node-id=1 status=ok accepting all reported=[cpu:5, write-bandwidth:5, byte-size:5] adjusted=[cpu:93, write-bandwidth:93, byte-size:93] node-reported-cpu=90 node-adjusted-cpu=98 seq=6 + top-k-ranges (local-store-id=1) dim=ByteSize: r1 +store-id=2 node-id=1 status=ok accepting all reported=[cpu:85, write-bandwidth:85, byte-size:85] adjusted=[cpu:5, write-bandwidth:5, byte-size:5] node-reported-cpu=90 node-adjusted-cpu=98 seq=8 + top-k-ranges (local-store-id=2) dim=CPURate: r1 + +# Change 6 is garbage collected since enough time has elapsed. +gc-pending-changes +---- +pending(2) +change-id=5 store-id=1 node-id=1 range-id=1 load-delta=[cpu:88, write-bandwidth:88, byte-size:88] start=5m50s gc=10m50s enacted=5m50s + prev=(replica-id=none type=VOTER_FULL) + next=(replica-id=unknown type=VOTER_FULL leaseholder=true) +change-id=6 store-id=2 node-id=1 range-id=1 load-delta=[cpu:-80, write-bandwidth:-80, byte-size:-80] start=5m50s gc=6m20s + prev=(replica-id=2 type=VOTER_FULL) + next=(replica-id=none type=VOTER_FULL) + +# So one change enacted and one was rolled back. This one leaseholder +# invariant is still satisfied. +ranges +---- +range-id=1 local-store=1 load=[cpu:80, write-bandwidth:80, byte-size:80] raft-cpu=20 + store-id=1 replica-id=3 type=VOTER_FULL leaseholder=true diff --git a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_replica_local_stores_enacted_gc b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_replica_local_stores_enacted_gc index 59218c97e625..7d2790133568 100644 --- a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_replica_local_stores_enacted_gc +++ b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_replica_local_stores_enacted_gc @@ -5,13 +5,13 @@ # - The range initially initially sits on (n1s1,n3s3), and is then rebalanced to # (n1s2,n3s4), i.e. moved between n1's two stores. # - The second half of the pending change (removal of s1) never gets enacted. -# - That change is pending is no-rollback due to the first half having been enacted. -# After 30s (partiallyEnactedGCDuration) has elapsed since the partial -# enactment, the removal is GC'd with the next leaseholder message. In -# particular, this change does not have to wait for the much longer regular GC -# timeout. +# - That pending change is gc'd after 30s (partiallyEnactedGCDuration) has +# elapsed since the partial enactment. In particular, this change does not +# have to wait for the much longer regular GC timeout. # - A second rebalance is carried out on the same range, while an enacted remnant # of the first operation remains. 
+# - A third rebalance is carried out between two non-leaseholder stores, and +# is partially enacted and partially gc'd. set-store store-id=1 node-id=1 attrs=purple locality-tiers=region=us-west-1,zone=us-west-1a store-id=2 node-id=1 attrs=yellow locality-tiers=region=us-east-1,zone=us-east-1a @@ -91,7 +91,8 @@ store-id=2 store-id=2 replica-id=2 type=VOTER_FULL leaseholder=true ---- -# One change is considered enacted. +# One change is considered enacted. The prev value of the second change is +# updated, since the source-of-truth is the latest store leaseholder message. get-pending-changes ---- pending(2) @@ -99,7 +100,7 @@ change-id=1 store-id=2 node-id=1 range-id=1 load-delta=[cpu:2, write-bandwidth:2 prev=(replica-id=none type=VOTER_FULL) next=(replica-id=unknown type=VOTER_FULL leaseholder=true) change-id=2 store-id=1 node-id=1 range-id=1 load-delta=[cpu:-2, write-bandwidth:-2, byte-size:-2] start=0s gc=35s - prev=(replica-id=1 type=VOTER_FULL leaseholder=true) + prev=(replica-id=1 type=VOTER_FULL) next=(replica-id=none type=VOTER_FULL) ranges @@ -108,6 +109,20 @@ range-id=1 local-store=2 load=[cpu:3, write-bandwidth:3, byte-size:3] raft-cpu=2 store-id=2 replica-id=2 type=VOTER_FULL leaseholder=true store-id=3 replica-id=3 type=VOTER_FULL +# store-id={1,3} still have top-k-ranges for local store s1 populated with r1, +# since we haven't yet received a store leaseholder msg from s1 indicating it +# no longer is the leaseholder for r1. +get-load-info +---- +store-id=1 node-id=1 status=ok accepting all reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:-2, write-bandwidth:-2, byte-size:-2] node-reported-cpu=0 node-adjusted-cpu=0 seq=1 + top-k-ranges (local-store-id=1) dim=CPURate: r1 +store-id=2 node-id=1 status=ok accepting all reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:2, write-bandwidth:2, byte-size:2] node-reported-cpu=0 node-adjusted-cpu=0 seq=1 + top-k-ranges (local-store-id=2) dim=ByteSize: r1 +store-id=3 node-id=3 status=ok accepting all reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:0, write-bandwidth:0, byte-size:0] node-reported-cpu=0 node-adjusted-cpu=0 seq=0 + top-k-ranges (local-store-id=1) dim=WriteBandwidth: r1 + top-k-ranges (local-store-id=2) dim=WriteBandwidth: r1 +store-id=4 node-id=4 status=ok accepting all reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:0, write-bandwidth:0, byte-size:0] node-reported-cpu=0 node-adjusted-cpu=0 seq=0 + tick seconds=35 ---- t=40s @@ -119,7 +134,7 @@ change-id=1 store-id=2 node-id=1 range-id=1 load-delta=[cpu:2, write-bandwidth:2 prev=(replica-id=none type=VOTER_FULL) next=(replica-id=unknown type=VOTER_FULL leaseholder=true) change-id=2 store-id=1 node-id=1 range-id=1 load-delta=[cpu:-2, write-bandwidth:-2, byte-size:-2] start=0s gc=35s - prev=(replica-id=1 type=VOTER_FULL leaseholder=true) + prev=(replica-id=1 type=VOTER_FULL) next=(replica-id=none type=VOTER_FULL) # Same store leaseholder msg from s2. The pending change for s1 is gc'd because @@ -147,6 +162,9 @@ change-id=1 store-id=2 node-id=1 range-id=1 load-delta=[cpu:2, write-bandwidth:2 prev=(replica-id=none type=VOTER_FULL) next=(replica-id=unknown type=VOTER_FULL leaseholder=true) +# store-id={1,3} still have top-k-ranges for local store s1 populated with r1, +# since we haven't yet received a store leaseholder msg from s1 indicating it +# no longer is the leaseholder for r1. 
get-load-info ---- store-id=1 node-id=1 status=ok accepting all reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:0, write-bandwidth:0, byte-size:0] node-reported-cpu=0 node-adjusted-cpu=2 seq=2 @@ -159,6 +177,22 @@ store-id=3 node-id=3 status=ok accepting all reported=[cpu:0, write-bandwidth:0, top-k-ranges (local-store-id=2) dim=WriteBandwidth: r1 store-id=4 node-id=4 status=ok accepting all reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:0, write-bandwidth:0, byte-size:0] node-reported-cpu=0 node-adjusted-cpu=0 seq=0 +# Store leaseholder msg from s1, indicating no leases. +store-leaseholder-msg +store-id=1 +---- + +# No top-k ranges for local store s1 anymore. +get-load-info +---- +store-id=1 node-id=1 status=ok accepting all reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:0, write-bandwidth:0, byte-size:0] node-reported-cpu=0 node-adjusted-cpu=2 seq=2 + top-k-ranges (local-store-id=2) dim=WriteBandwidth: r1 +store-id=2 node-id=1 status=ok accepting all reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:2, write-bandwidth:2, byte-size:2] node-reported-cpu=0 node-adjusted-cpu=2 seq=1 + top-k-ranges (local-store-id=2) dim=ByteSize: r1 +store-id=3 node-id=3 status=ok accepting all reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:0, write-bandwidth:0, byte-size:0] node-reported-cpu=0 node-adjusted-cpu=0 seq=0 + top-k-ranges (local-store-id=2) dim=WriteBandwidth: r1 +store-id=4 node-id=4 status=ok accepting all reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:0, write-bandwidth:0, byte-size:0] node-reported-cpu=0 node-adjusted-cpu=0 seq=0 + # Make another pending change to transfer from s3 to s4. make-pending-changes range-id=1 rebalance-replica: remove-store-id=3 add-store-id=4 @@ -181,12 +215,10 @@ t=45s get-load-info ---- store-id=1 node-id=1 status=ok accepting all reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:0, write-bandwidth:0, byte-size:0] node-reported-cpu=0 node-adjusted-cpu=2 seq=2 - top-k-ranges (local-store-id=1) dim=CPURate: r1 top-k-ranges (local-store-id=2) dim=WriteBandwidth: r1 store-id=2 node-id=1 status=ok accepting all reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:2, write-bandwidth:2, byte-size:2] node-reported-cpu=0 node-adjusted-cpu=2 seq=1 top-k-ranges (local-store-id=2) dim=ByteSize: r1 store-id=3 node-id=3 status=ok accepting all reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:-2, write-bandwidth:-3, byte-size:-3] node-reported-cpu=0 node-adjusted-cpu=-2 seq=1 - top-k-ranges (local-store-id=1) dim=WriteBandwidth: r1 top-k-ranges (local-store-id=2) dim=WriteBandwidth: r1 store-id=4 node-id=4 status=ok accepting all reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:2, write-bandwidth:3, byte-size:3] node-reported-cpu=0 node-adjusted-cpu=2 seq=1 @@ -231,17 +263,8 @@ change-id=4 store-id=3 node-id=3 range-id=1 load-delta=[cpu:-2, write-bandwidth: prev=(replica-id=3 type=VOTER_FULL) next=(replica-id=none type=VOTER_FULL) -# Same store leaseholder msg from s2. The pending change for s3 is gc'd. -store-leaseholder-msg -store-id=2 - range-id=1 load=[3,3,3] raft-cpu=2 config=(num_replicas=3 constraints={'+region=us-west-1:1'} voter_constraints={'+region=us-west-1:1'}) - store-id=1 replica-id=1 type=VOTER_FULL - store-id=3 replica-id=3 type=VOTER_FULL - store-id=4 replica-id=4 type=VOTER_FULL - store-id=2 replica-id=2 type=VOTER_FULL leaseholder=true ----- - -get-pending-changes +# The pending change for s3 is gc'd. 
+gc-pending-changes ---- pending(2) change-id=1 store-id=2 node-id=1 range-id=1 load-delta=[cpu:2, write-bandwidth:2, byte-size:2] start=0s gc=5m0s enacted=5s @@ -251,15 +274,20 @@ change-id=3 store-id=4 node-id=4 range-id=1 load-delta=[cpu:2, write-bandwidth:3 prev=(replica-id=none type=VOTER_FULL) next=(replica-id=unknown type=VOTER_FULL) +ranges +---- +range-id=1 local-store=2 load=[cpu:3, write-bandwidth:3, byte-size:3] raft-cpu=2 + store-id=1 replica-id=1 type=VOTER_FULL + store-id=2 replica-id=2 type=VOTER_FULL leaseholder=true + store-id=4 replica-id=4 type=VOTER_FULL + store-id=3 replica-id=3 type=VOTER_FULL + get-load-info ---- store-id=1 node-id=1 status=ok accepting all reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:0, write-bandwidth:0, byte-size:0] node-reported-cpu=0 node-adjusted-cpu=2 seq=2 - top-k-ranges (local-store-id=1) dim=CPURate: r1 top-k-ranges (local-store-id=2) dim=WriteBandwidth: r1 store-id=2 node-id=1 status=ok accepting all reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:2, write-bandwidth:2, byte-size:2] node-reported-cpu=0 node-adjusted-cpu=2 seq=1 top-k-ranges (local-store-id=2) dim=ByteSize: r1 store-id=3 node-id=3 status=ok accepting all reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:0, write-bandwidth:0, byte-size:0] node-reported-cpu=0 node-adjusted-cpu=0 seq=2 - top-k-ranges (local-store-id=1) dim=WriteBandwidth: r1 - top-k-ranges (local-store-id=2) dim=WriteBandwidth: r1 store-id=4 node-id=4 status=ok accepting all reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:2, write-bandwidth:3, byte-size:3] node-reported-cpu=0 node-adjusted-cpu=2 seq=1 top-k-ranges (local-store-id=2) dim=ByteSize: r1 diff --git a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_replica_local_stores_fail_lease_transfer b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_replica_local_stores_fail_lease_transfer new file mode 100644 index 000000000000..0b4a92abdd55 --- /dev/null +++ b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_replica_local_stores_fail_lease_transfer @@ -0,0 +1,77 @@ +set-store + store-id=1 node-id=1 attrs=purple locality-tiers=region=us-west-1,zone=us-west-1a + store-id=2 node-id=1 attrs=yellow locality-tiers=region=us-east-1,zone=us-east-1a +---- +node-id=1 locality-tiers=region=us-west-1,zone=us-west-1a,node=1 + store-id=1 attrs=purple locality-code=1:2:3: + store-id=2 attrs=yellow locality-code=4:5:3: + +# Store s1 is the leaseholder for range r1. +store-leaseholder-msg +store-id=1 + range-id=1 load=[2,2,2] raft-cpu=1 config=(num_replicas=3 constraints={'+region=us-west-1:1'} voter_constraints={'+region=us-west-1:1'}) + store-id=1 replica-id=1 type=VOTER_FULL leaseholder=true +---- + +ranges +---- +range-id=1 local-store=1 load=[cpu:2, write-bandwidth:2, byte-size:2] raft-cpu=1 + store-id=1 replica-id=1 type=VOTER_FULL leaseholder=true + +# Transfer the replica from s1 to s2. The lease will also be transferred. 
+make-pending-changes range-id=1 + rebalance-replica: remove-store-id=1 add-store-id=2 +---- +pending(2) +change-id=1 store-id=2 node-id=1 range-id=1 load-delta=[cpu:2, write-bandwidth:2, byte-size:2] start=0s gc=5m0s + prev=(replica-id=none type=VOTER_FULL) + next=(replica-id=unknown type=VOTER_FULL leaseholder=true) +change-id=2 store-id=1 node-id=1 range-id=1 load-delta=[cpu:-2, write-bandwidth:-2, byte-size:-2] start=0s gc=5m0s + prev=(replica-id=1 type=VOTER_FULL leaseholder=true) + next=(replica-id=none type=VOTER_FULL) + +# Store leaseholder msg from s1, showing that s1 still has the replica and +# lease, and s2 also has a replica. +store-leaseholder-msg +store-id=1 + range-id=1 load=[3,3,3] raft-cpu=2 config=(num_replicas=3 constraints={'+region=us-west-1:1'} voter_constraints={'+region=us-west-1:1'}) + store-id=1 replica-id=1 type=VOTER_FULL leaseholder=true + store-id=2 replica-id=2 type=VOTER_FULL +---- + +# Both pending changes are still not enacted. The prev state for change-id=1 +# reflects that s2 has a voter replica. +get-pending-changes +---- +pending(2) +change-id=1 store-id=2 node-id=1 range-id=1 load-delta=[cpu:2, write-bandwidth:2, byte-size:2] start=0s gc=5m0s + prev=(replica-id=2 type=VOTER_FULL) + next=(replica-id=unknown type=VOTER_FULL leaseholder=true) +change-id=2 store-id=1 node-id=1 range-id=1 load-delta=[cpu:-2, write-bandwidth:-2, byte-size:-2] start=0s gc=5m0s + prev=(replica-id=1 type=VOTER_FULL leaseholder=true) + next=(replica-id=none type=VOTER_FULL) + +get-load-info +---- +store-id=1 node-id=1 status=ok accepting all reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:-2, write-bandwidth:-2, byte-size:-2] node-reported-cpu=0 node-adjusted-cpu=0 seq=1 +store-id=2 node-id=1 status=ok accepting all reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:2, write-bandwidth:2, byte-size:2] node-reported-cpu=0 node-adjusted-cpu=0 seq=1 + +tick seconds=330 +---- +t=5m30s + +gc-pending-changes +---- +pending(0) + +get-pending-changes +---- +pending(0) + +# The replica from store s2 is removed because of GC. This is not correct. +# +# TODO: fix after cleanup PR is merged and this rebases on top. +ranges +---- +range-id=1 local-store=1 load=[cpu:3, write-bandwidth:3, byte-size:3] raft-cpu=2 + store-id=1 replica-id=1 type=VOTER_FULL leaseholder=true
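
A minimal, self-contained Go sketch (not part of the patch) of the idea the
commit message describes: instead of a no-rollback flag, a pending change's
prev is refreshed with the latest state reported by the leaseholder, so that
an undo (from an explicit failure or from GC) restores the newest source of
truth rather than a stale snapshot. The types and function below (replicaState,
pendingChange, onLeaseholderReport) are hypothetical stand-ins, not the real
mmaprototype API, and the enactment check is simplified relative to the real
subsumption logic.

package main

import "fmt"

// replicaState is a hypothetical stand-in for a store's replica state: which
// replica (if any) it holds and whether it has the lease.
type replicaState struct {
	replicaID   int
	leaseholder bool
}

// pendingChange is a hypothetical stand-in for pendingReplicaChange: prev is
// what an undo restores, next is the state the change is driving toward.
type pendingChange struct {
	prev replicaState
	next replicaState
}

// onLeaseholderReport is a simplified version of the prev-refresh step: if the
// reported state already matches next, the change is considered enacted;
// otherwise prev is overwritten with the reported state so that a later undo
// rolls back to the latest leaseholder-provided truth.
func onLeaseholderReport(c *pendingChange, reported replicaState) (enacted bool) {
	if reported == c.next {
		return true
	}
	c.prev = reported
	return false
}

func main() {
	// s3 currently holds the lease; the pending change wants to remove its
	// replica (and with it, the lease).
	c := &pendingChange{
		prev: replicaState{replicaID: 3, leaseholder: true},
		next: replicaState{}, // removal: no replica, no lease
	}
	// The leaseholder later reports that s3 still has a replica but no longer
	// holds the lease (the lease moved first).
	enacted := onLeaseholderReport(c, replicaState{replicaID: 3, leaseholder: false})
	// If this change is now undone (e.g. GC'd), it restores the lease-less
	// state, so the allocator never reconstructs a second leaseholder.
	fmt.Println(enacted, c.prev) // false {3 false}
}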