diff --git a/pkg/kv/kvserver/allocator/mmaprototype/allocator_state.go b/pkg/kv/kvserver/allocator/mmaprototype/allocator_state.go index a27832b4af9c..f5f986550f38 100644 --- a/pkg/kv/kvserver/allocator/mmaprototype/allocator_state.go +++ b/pkg/kv/kvserver/allocator/mmaprototype/allocator_state.go @@ -262,7 +262,7 @@ func (a *allocatorState) ProcessStoreLoadMsg(ctx context.Context, msg *StoreLoad func (a *allocatorState) AdjustPendingChangeDisposition(change PendingRangeChange, success bool) { a.mu.Lock() defer a.mu.Unlock() - rs, ok := a.cs.ranges[change.RangeID] + _, ok := a.cs.ranges[change.RangeID] if !ok { // Range no longer exists. This can happen if the StoreLeaseholderMsg // which included the effect of the change that transferred the lease away @@ -270,10 +270,6 @@ func (a *allocatorState) AdjustPendingChangeDisposition(change PendingRangeChang // allocator. return } - if !success && rs.pendingChangeNoRollback { - // Not allowed to undo. - return - } // NB: It is possible that some of the changes have already been enacted via // StoreLeaseholderMsg, and even been garbage collected. So no assumption // can be made about whether these changes will be found in the allocator's @@ -284,6 +280,9 @@ func (a *allocatorState) AdjustPendingChangeDisposition(change PendingRangeChang if !ok { continue } + // NB: the ch and c pointers are not identical even though they have the + // same changeID. We create two copies in + // clusterState.addPendingRangeChange, since the internal copy is mutable. changes = append(changes, ch) } if len(changes) == 0 { diff --git a/pkg/kv/kvserver/allocator/mmaprototype/cluster_state.go b/pkg/kv/kvserver/allocator/mmaprototype/cluster_state.go index eb65f83a71f3..ece48c1bd57e 100644 --- a/pkg/kv/kvserver/allocator/mmaprototype/cluster_state.go +++ b/pkg/kv/kvserver/allocator/mmaprototype/cluster_state.go @@ -204,12 +204,12 @@ type ReplicaChange struct { // replica being demoted cannot retain the lease). // // NB: The prev value is always the state before the change. This is the - // source of truth provided by the leaseholder in the RangeMsg, so will - // have real ReplicaIDs (if already a replica) and real ReplicaTypes + // latest source of truth provided by the leaseholder in the RangeMsg, so + // will have real ReplicaIDs (if already a replica) and real ReplicaTypes // (including types beyond VOTER_FULL and NON_VOTER). This source-of-truth // claim is guaranteed by REQUIREMENT(change-computation) documented - // elsewhere, and the fact that new changes are computed only when there - // are no pending changes for a range. + // elsewhere, and the fact that new changes are computed only when there are + // no pending changes for a range. // // The ReplicaType in next is either the zero value (for removals), or // {VOTER_FULL, NON_VOTER} for additions/change, i.e., it represents the @@ -218,6 +218,9 @@ type ReplicaChange struct { // TODO(tbg): in MakeLeaseTransferChanges, next.ReplicaType.ReplicaType is // simply the current value, and not necessarily {VOTER_FULL, NON_VOTER}. // So the above comment is incorrect. We should clean this up. + // + // The prev field is mutable after creation, to ensure that an undo restores + // the state to the latest source of truth from the leaseholder. prev ReplicaState next ReplicaIDAndType @@ -459,21 +462,14 @@ func mapReplicaTypeToVoterOrNonVoter(rType roachpb.ReplicaType) roachpb.ReplicaT // replicas, or transferring the lease. There is at most one change per store // in the set. 
// -// NB: pendingReplicaChanges is not visible outside the package, so we can be -// certain that callers outside this package that hold a PendingRangeChange -// cannot mutate the internals other than clearing the state. -// -// Additionally, for a PendingRangeChange returned outside the package, we -// ensure that the pendingReplicaChanges slice itself is not shared with the -// rangeState.pendingChanges slice since the rangeState.pendingChanges slice -// can have entries removed from it (and swapped around as part of removal). -// -// Some the state inside each *pendingReplicaChange is mutable at arbitrary -// points in time by the code inside this package (with the relevant locking, -// of course). Currently, this state is gcTime, enactedAtTime. Neither of it -// is read by the public methods on PendingRangeChange. +// NB: pendingReplicaChanges is not visible outside the package. // -// TODO(sumeer): when we expand the set of mutable fields, make a deep copy. +// The *pendingReplicaChange objects are pointers since the clusterState +// struct has multiple slices and maps that point to the same +// *pendingReplicaChange object, which is mutable. To prevent race conditions +// with exported functions on PendingRangeChange called from outside the +// package, the *pendingReplicaChange objects returned outside the package are +// a copy that will not be mutated. type PendingRangeChange struct { RangeID roachpb.RangeID pendingReplicaChanges []*pendingReplicaChange @@ -727,9 +723,9 @@ type pendingReplicaChange struct { // expiry. All replica changes in a PendingRangeChange have the same // startTime. startTime time.Time - // gcTime represents a time when the unenacted change should be GC'd, either - // using the normal GC undo path, or if rangeState.pendingChangeNoRollback - // is true, when processing a RangeMsg from the leaseholder. + // gcTime represents a time when the unenacted change should be GC'd. + // + // Mutable after creation. gcTime time.Time // TODO(kvoli,sumeerbhola): Consider adopting an explicit expiration time, @@ -742,6 +738,8 @@ type pendingReplicaChange struct { // information received from the leaseholder, this value is set, so that even // if the store with a replica affected by this pending change does not tell // us about the enactment, we can garbage collect this change. + // + // Mutable after creation. enactedAtTime time.Time } @@ -1052,7 +1050,7 @@ type rangeState struct { // that are still at the initial state, or an intermediate state, it can // continue anticipating that these pending changes will happen. Tracking // what is pending also allows for undo in the case of explicit failure, - // notified by AdjustPendingChangesDisposition. + // notified by AdjustPendingChangesDisposition, or GC. // // 2. Lifecycle // pendingChanges track proposed modifications to a range's replicas or @@ -1078,17 +1076,9 @@ type rangeState struct { // has been enacted in this case. // // 2. Undone as failed: corresponding replica and load change is rolled back. - // Note that for replica changes that originate from one action, all changes - // would be undone together. - // NB: pending changes of a range state originate from one decision. - // Therefore, when one pending change is enacted successfully, we mark this - // range state's pending changes as no rollback (read more about this in 3). - // If we are here trying to undo a pending change but the range state has - // already been marked as no rollback, we do not undo the remaining pending - // changes. 
Instead, we wait for a StoreLeaseholderMsg to discard the pending - // changes and revert the load adjustments after the - // partiallyEnactedGCDuration has elapsed since the first enacted change. The - // modeling here is imperfect (read more about this in 3). + // Note that for replica changes that originate from one action, some changes + // can be considered done because of the leaseholder msg, and others can be + // rolled back (say due to GC). // // This happens when: // - The pending change failed to apply via @@ -1149,14 +1139,10 @@ type rangeState struct { // the replica and leaseholder to s4. An intermediate state that can be // observed is {s1, s2, s3, s4} with the lease still at s3. But the pending // change for adding s4 includes both that it has a replica, and it has the - // lease, so we will not mark it done, and keep pretending that the whole - // change is pending. Since lease transfers are fast, we accept this - // imperfect modeling fidelity. One consequence of this imperfect modeling - // is that if in this example there are no further changes observed until - // GC, the allocator will undo both changes and go back to the state {s1, - // s2, s3} with s3 as the leaseholder. That is, it has forgotten that s4 was - // added. This is unavoidable and will be fixed by the first - // StoreLeaseholderMsg post-GC. + // lease, so we will not mark it done, and keep pretending that the change + // is pending. However, we will change the prev state to indicate that s4 + // has a replica, so that undo (say due to GC) rolls back to the latest + // source-of-truth from the leaseholder. // // 4. Non Atomicity Hazard // @@ -1165,20 +1151,19 @@ type rangeState struct { // to contend with the hazard of having two leaseholders or no leaseholders. // In the earlier example, say s3 and s4 were both local stores (a // multi-store node), it may be possible to observe an intermediate state - // {s1, s2, s3, s4} where s4 is the leaseholder. If we subsequently get a - // spurious AdjustPendingChangesDisposition(success=false) call, or - // time-based GC causes the s3 removal to be undone, there will be two - // replicas marked as the leaseholder. The other extreme is believing that - // the s3 transfer is done and the s4 incoming replica (and lease) failed - // (this may not actually be possible because of the surrounding code). + // {s1, s2, s3, s4} where s4 is the leaseholder. We need to ensure that if + // we subsequently get a spurious + // AdjustPendingChangesDisposition(success=false) call, or time-based GC + // causes the s3 removal to be undone, there will not be two replicas marked + // as the leaseholder. The other extreme is believing that the s3 transfer + // is done and the s4 incoming replica (and lease) failed (this may not + // actually be possible because of the surrounding code). // - // We deal with this hazard by observing that we've constructed multiple - // pending changes in order to observe intermediate changes in the common - // case of success. Once one change in the set of changes is considered - // enacted, we mark the whole remaining group as no-rollback. In the above - // case, if we see s4 has become the leaseholder, the s1 removal can't undo - // itself -- it can be dropped if it is considered subsumed when processing - // a RangeMsg, or it can be GC'd. 
+ // This hazard is dealt with in the same way outlined in the earlier + // example: when the leaseholder msg from s4 arrives that lists {s1, s2, s3, + // s4} as replicas, the prev state for the s3 change is updated to indicate + // that it is not the leaseholder. This means that if the change is undone, + // it will return to a prev state where it has a replica but not the lease. // // Additionally, when processing a RangeMsg, if any of the pending changes // is considered inconsistent, all the pending changes are discarded. This @@ -1198,11 +1183,6 @@ type rangeState struct { // rangeState.pendingChanges across all ranges in clusterState.ranges will // be identical to clusterState.pendingChanges. pendingChanges []*pendingReplicaChange - // When set, the pendingChanges can not be rolled back anymore. They have - // to be enacted, or discarded wholesale in favor of the latest RangeMsg - // from the leaseholder. It is reset to false when pendingChanges - // transitions from empty to non-empty. - pendingChangeNoRollback bool // If non-nil, it is up-to-date. Typically, non-nil for a range that has no // pendingChanges and is not satisfying some constraint, since we don't want @@ -1534,27 +1514,15 @@ func (cs *clusterState) processStoreLeaseholderMsgInternal( // The change has been enacted according to the leaseholder. enactedChanges = append(enactedChanges, change) } else { + // Not subsumed. Replace the prev with the latest source of truth from + // the leaseholder. Note, this can be the noReplicaID case from above. + change.prev = adjustedReplica remainingChanges = append(remainingChanges, change) } } - gcRemainingChanges := false - if rs.pendingChangeNoRollback { - // A previous StoreLeaseholderMsg has enacted some changes, so the - // remainingChanges may be GC'able. All of them share the same GC time. - // Note that normal GC will not GC these, since normal GC needs to undo, - // and we are not allowed to undo these. - if len(remainingChanges) > 0 { - gcTime := remainingChanges[0].gcTime - if gcTime.Before(now) { - gcRemainingChanges = true - } - } - } else if len(enactedChanges) > 0 && len(remainingChanges) > 0 { - // First time this set of changes is seeing something enacted, and there - // are remaining changes. + if len(enactedChanges) > 0 && len(remainingChanges) > 0 { + // There are remaining changes, so potentially update their gcTime. // - // No longer permitted to rollback. - rs.pendingChangeNoRollback = true // All remaining changes have the same gcTime. curGCTime := remainingChanges[0].gcTime revisedGCTime := now.Add(partiallyEnactedGCDuration) @@ -1598,27 +1566,19 @@ func (cs *clusterState) processStoreLeaseholderMsgInternal( // preCheckOnApplyReplicaChanges returns false if there are any pending // changes, and these are the changes that are pending. This is hacky // and should be cleaned up. - var valid bool - var reason redact.RedactableString - if gcRemainingChanges { - reason = "GCing remaining changes after partial enactment" - } else { - // NB: rs.pendingChanges contains the same changes as - // remainingChanges, but they are not the same slice. - rc := rs.pendingChanges - rs.pendingChanges = nil - err := cs.preCheckOnApplyReplicaChanges(PendingRangeChange{ - RangeID: rangeMsg.RangeID, - pendingReplicaChanges: remainingChanges, - }) - valid = err == nil - if err != nil { - reason = redact.Sprint(err) - } - // Restore it. 
- rs.pendingChanges = rc - } - if valid { + // + // NB: rs.pendingChanges contains the same changes as + // remainingChanges, but they are not the same slice. + rc := rs.pendingChanges + rs.pendingChanges = nil + err := cs.preCheckOnApplyReplicaChanges(PendingRangeChange{ + RangeID: rangeMsg.RangeID, + pendingReplicaChanges: remainingChanges, + }) + // Restore it. + rs.pendingChanges = rc + + if err == nil { // Re-apply the remaining changes. Note that the load change was not // undone above, so we pass !applyLoadChange, to avoid applying it // again. Also note that applyReplicaChange does not add to the various @@ -1628,6 +1588,7 @@ func (cs *clusterState) processStoreLeaseholderMsgInternal( cs.applyReplicaChange(change.ReplicaChange, false) } } else { + reason := redact.Sprint(err) // The current state provided by the leaseholder does not permit these // changes, so we need to drop them. This should be rare, but can happen // if the leaseholder executed a change that MMA was completely unaware @@ -1861,7 +1822,6 @@ func (cs *clusterState) processStoreLeaseholderMsgInternal( topk := ss.adjusted.topKRanges[msg.StoreID] topk.doneInit() } - } // If the pending replica change does not happen within this GC duration, we @@ -1895,15 +1855,6 @@ func (cs *clusterState) gcPendingChanges(now time.Time) { if !ok { panic(errors.AssertionFailedf("range %v not found in cluster state", rangeID)) } - - // Unlike normal GC that reverts changes, we want to discard these pending - // changes. Do nothing here; processStoreLeaseholderMsgInternal will later - // detect and discard these pending changes. Note that - // processStoreLeaseholderMsgInternal will not revert the pending load - // change. - if rs.pendingChangeNoRollback { - continue - } if len(rs.pendingChanges) == 0 { panic(errors.AssertionFailedf("no pending changes in range %v", rangeID)) } @@ -1945,8 +1896,6 @@ func (cs *clusterState) pendingChangeEnacted(cid changeID, enactedAt time.Time) } // undoPendingChange reverses the change with ID cid. -// -// REQUIRES: the change is not marked as no-rollback. func (cs *clusterState) undoPendingChange(cid changeID) { change, ok := cs.pendingChanges[cid] if !ok { @@ -1956,10 +1905,6 @@ func (cs *clusterState) undoPendingChange(cid changeID) { if !ok { panic(errors.AssertionFailedf("range %v not found in cluster state", change.rangeID)) } - if rs.pendingChangeNoRollback { - // One cannot undo changes once no-rollback is true. - panic(errors.AssertionFailedf("pending change is marked as no-rollback")) - } // Wipe the analyzed constraints, as the range has changed. rs.constraints = nil rs.lastFailedChange = cs.ts.Now() @@ -1992,6 +1937,10 @@ func printMapPendingChanges(changes map[changeID]*pendingReplicaChange) string { // adjusted load, tracked pending changes and changeIDs to reflect the pending // application. It updates the *pendingReplicaChanges inside the change. // +// The change contains replica changes that will be returned outside the +// package, so a copy is made for package internal use (see the comment on +// PendingRangeChange about mutability). +// // REQUIRES: all the replica changes are to the same range, and that the range // has no pending changes. func (cs *clusterState) addPendingRangeChange(change PendingRangeChange) { @@ -2018,26 +1967,26 @@ func (cs *clusterState) addPendingRangeChange(change PendingRangeChange) { // Only the lease is being transferred. 
gcDuration = pendingLeaseTransferGCDuration } - pendingChanges := change.pendingReplicaChanges now := cs.ts.Now() - for _, pendingChange := range pendingChanges { - cs.applyReplicaChange(pendingChange.ReplicaChange, true) + for _, origPendingChange := range change.pendingReplicaChanges { + cs.applyReplicaChange(origPendingChange.ReplicaChange, true) cs.changeSeqGen++ cid := cs.changeSeqGen - pendingChange.changeID = cid - pendingChange.startTime = now - pendingChange.gcTime = now.Add(gcDuration) - pendingChange.enactedAtTime = time.Time{} + origPendingChange.changeID = cid + origPendingChange.startTime = now + origPendingChange.gcTime = now.Add(gcDuration) + origPendingChange.enactedAtTime = time.Time{} + // Make a copy for internal tracking, since the internal state is mutable. + pendingChange := &pendingReplicaChange{} + *pendingChange = *origPendingChange storeState := cs.stores[pendingChange.target.StoreID] rangeState := cs.ranges[rangeID] cs.pendingChanges[cid] = pendingChange storeState.adjusted.loadPendingChanges[cid] = pendingChange rangeState.pendingChanges = append(rangeState.pendingChanges, pendingChange) - rangeState.pendingChangeNoRollback = false log.KvDistribution.VInfof(context.Background(), 3, "addPendingRangeChange: change_id=%v, range_id=%v, change=%v", cid, rangeID, pendingChange.ReplicaChange) - pendingChanges = append(pendingChanges, pendingChange) } } @@ -2105,8 +2054,6 @@ func (cs *clusterState) preCheckOnApplyReplicaChanges(rangeChange PendingRangeCh // preCheckOnUndoReplicaChanges does some validation of the changes being // proposed for undo. // -// REQUIRES: the rangeState.pendingChangeNoRollback is false. -// // This method is defensive since if we always check against the current state // before allowing a change to be added (including re-addition after a // StoreLeaseholderMsg), we should never have invalidity during an undo, if diff --git a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_replica b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_replica index 153b25b510a3..08300e0c0cb1 100644 --- a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_replica +++ b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_replica @@ -90,7 +90,7 @@ store-id=2 node-id=2 status=ok accepting all reported=[cpu:0, write-bandwidth:0, store-leaseholder-msg store-id=1 range-id=1 load=[80,80,80] raft-cpu=20 config=(num_replicas=3 constraints={'+region=us-west-1:1'} voter_constraints={'+region=us-west-1:1'}) - store-id=1 replica-id=2 type=VOTER_FULL leaseholder=true + store-id=1 replica-id=1 type=VOTER_FULL leaseholder=true store-id=2 replica-id=2 type=VOTER_FULL ---- @@ -99,7 +99,7 @@ get-pending-changes ---- pending(2) change-id=1 store-id=2 node-id=2 range-id=1 load-delta=[cpu:88, write-bandwidth:88, byte-size:88] start=0s gc=5m0s - prev=(replica-id=none type=VOTER_FULL) + prev=(replica-id=2 type=VOTER_FULL) next=(replica-id=unknown type=VOTER_FULL leaseholder=true) change-id=2 store-id=1 node-id=1 range-id=1 load-delta=[cpu:-80, write-bandwidth:-80, byte-size:-80] start=0s gc=5m0s prev=(replica-id=1 type=VOTER_FULL leaseholder=true) @@ -120,7 +120,7 @@ get-pending-changes ---- pending(2) change-id=1 store-id=2 node-id=2 range-id=1 load-delta=[cpu:88, write-bandwidth:88, byte-size:88] start=0s gc=5m0s enacted=5s - prev=(replica-id=none type=VOTER_FULL) + prev=(replica-id=2 type=VOTER_FULL) next=(replica-id=unknown type=VOTER_FULL leaseholder=true) change-id=2 store-id=1 node-id=1 
range-id=1 load-delta=[cpu:-80, write-bandwidth:-80, byte-size:-80] start=0s gc=5m0s enacted=5s prev=(replica-id=1 type=VOTER_FULL leaseholder=true) @@ -160,7 +160,7 @@ get-pending-changes ---- pending(2) change-id=1 store-id=2 node-id=2 range-id=1 load-delta=[cpu:88, write-bandwidth:88, byte-size:88] start=0s gc=5m0s enacted=5s - prev=(replica-id=none type=VOTER_FULL) + prev=(replica-id=2 type=VOTER_FULL) next=(replica-id=unknown type=VOTER_FULL leaseholder=true) change-id=2 store-id=1 node-id=1 range-id=1 load-delta=[cpu:-80, write-bandwidth:-80, byte-size:-80] start=0s gc=5m0s enacted=5s prev=(replica-id=1 type=VOTER_FULL leaseholder=true) diff --git a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_replica_local_stores b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_replica_local_stores index b6a2f972abbc..88467731ceb6 100644 --- a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_replica_local_stores +++ b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_replica_local_stores @@ -12,14 +12,11 @@ # s1) remains pending until s1 is removed from the replica set. # 3. Verifies enacted changes cannot be rejected (they're removed from # pendingChanges). -# 4. Verifies no-rollback changes cannot be rejected once any change in the -# set is enacted. -# 5. Tracks load adjustments: pending changes adjust load until store-reported +# 4. Tracks load adjustments: pending changes adjust load until store-reported # load reflects the change (after 10s of enactment). After that, the change # is removed from load tracking but may remain for GC. -# 6. Tests GC: changes marked no-rollback cannot be GC'd via normal GC (which -# requires undo). They're only removed once store-reported load reflects the -# change. +# 5. Tests GC and update of gcTime. +# 6. Test update of ReplicaChange.prev based on latest leaseholder info. # 7. Rebalances back from s2 to s1 and verifies changes are GC'd after the # normal GC duration. # @@ -95,7 +92,7 @@ t=5s store-leaseholder-msg store-id=1 range-id=1 load=[80,80,80] raft-cpu=20 config=(num_replicas=3 constraints={'+region=us-west-1:1'} voter_constraints={'+region=us-west-1:1'}) - store-id=1 replica-id=2 type=VOTER_FULL leaseholder=true + store-id=1 replica-id=1 type=VOTER_FULL leaseholder=true store-id=2 replica-id=2 type=VOTER_FULL ---- @@ -104,7 +101,7 @@ get-pending-changes ---- pending(2) change-id=1 store-id=2 node-id=1 range-id=1 load-delta=[cpu:88, write-bandwidth:88, byte-size:88] start=0s gc=5m0s - prev=(replica-id=none type=VOTER_FULL) + prev=(replica-id=2 type=VOTER_FULL) next=(replica-id=unknown type=VOTER_FULL leaseholder=true) change-id=2 store-id=1 node-id=1 range-id=1 load-delta=[cpu:-80, write-bandwidth:-80, byte-size:-80] start=0s gc=5m0s prev=(replica-id=1 type=VOTER_FULL leaseholder=true) @@ -114,7 +111,7 @@ change-id=2 store-id=1 node-id=1 range-id=1 load-delta=[cpu:-80, write-bandwidth store-leaseholder-msg store-id=2 range-id=1 load=[80,80,80] raft-cpu=20 config=(num_replicas=3 constraints={'+region=us-west-1:1'} voter_constraints={'+region=us-west-1:1'}) - store-id=1 replica-id=2 type=VOTER_FULL + store-id=1 replica-id=1 type=VOTER_FULL store-id=2 replica-id=2 type=VOTER_FULL leaseholder=true ---- @@ -125,18 +122,25 @@ range-id=1 local-store=2 load=[cpu:80, write-bandwidth:80, byte-size:80] raft-cp store-id=2 replica-id=2 type=VOTER_FULL leaseholder=true # The addition of replica and lease on s2 is considered enacted. 
The removal -# of replica and lease from s1 is not yet enacted. The gc time is changes to -# be 30s after the enactment. +# of replica and lease from s1 is not yet enacted. The gc time is changed to +# be 30s after the enactment. Note that the prev state of change 2 no longer +# shows leaseholder=true since the lease has already been transferred. get-pending-changes ---- pending(2) change-id=1 store-id=2 node-id=1 range-id=1 load-delta=[cpu:88, write-bandwidth:88, byte-size:88] start=0s gc=5m0s enacted=5s - prev=(replica-id=none type=VOTER_FULL) + prev=(replica-id=2 type=VOTER_FULL) next=(replica-id=unknown type=VOTER_FULL leaseholder=true) change-id=2 store-id=1 node-id=1 range-id=1 load-delta=[cpu:-80, write-bandwidth:-80, byte-size:-80] start=0s gc=35s - prev=(replica-id=1 type=VOTER_FULL leaseholder=true) + prev=(replica-id=1 type=VOTER_FULL) next=(replica-id=none type=VOTER_FULL) +get-load-info +---- +store-id=1 node-id=1 status=ok accepting all reported=[cpu:80, write-bandwidth:80, byte-size:80] adjusted=[cpu:0, write-bandwidth:0, byte-size:0] node-reported-cpu=80 node-adjusted-cpu=88 seq=2 +store-id=2 node-id=1 status=ok accepting all reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:88, write-bandwidth:88, byte-size:88] node-reported-cpu=80 node-adjusted-cpu=88 seq=1 + top-k-ranges (local-store-id=2) dim=ByteSize: r1 + tick seconds=5 ---- t=10s @@ -146,21 +150,10 @@ reject-pending-changes change-ids=(1) expect-panic ---- pending(2) change-id=1 store-id=2 node-id=1 range-id=1 load-delta=[cpu:88, write-bandwidth:88, byte-size:88] start=0s gc=5m0s enacted=5s - prev=(replica-id=none type=VOTER_FULL) - next=(replica-id=unknown type=VOTER_FULL leaseholder=true) -change-id=2 store-id=1 node-id=1 range-id=1 load-delta=[cpu:-80, write-bandwidth:-80, byte-size:-80] start=0s gc=35s - prev=(replica-id=1 type=VOTER_FULL leaseholder=true) - next=(replica-id=none type=VOTER_FULL) - -# Change 2 is found, but is no-rollback, so can't be rejected. -reject-pending-changes change-ids=(2) expect-panic ----- -pending(2) -change-id=1 store-id=2 node-id=1 range-id=1 load-delta=[cpu:88, write-bandwidth:88, byte-size:88] start=0s gc=5m0s enacted=5s - prev=(replica-id=none type=VOTER_FULL) + prev=(replica-id=2 type=VOTER_FULL) next=(replica-id=unknown type=VOTER_FULL leaseholder=true) change-id=2 store-id=1 node-id=1 range-id=1 load-delta=[cpu:-80, write-bandwidth:-80, byte-size:-80] start=0s gc=35s - prev=(replica-id=1 type=VOTER_FULL leaseholder=true) + prev=(replica-id=1 type=VOTER_FULL) next=(replica-id=none type=VOTER_FULL) # Store load msg from s2, showing its new load. @@ -188,7 +181,7 @@ get-pending-changes ---- pending(1) change-id=2 store-id=1 node-id=1 range-id=1 load-delta=[cpu:-80, write-bandwidth:-80, byte-size:-80] start=0s gc=35s - prev=(replica-id=1 type=VOTER_FULL leaseholder=true) + prev=(replica-id=1 type=VOTER_FULL) next=(replica-id=none type=VOTER_FULL) # The adjusted load on s2 no longer reflects the addition of the replica and @@ -200,33 +193,37 @@ store-id=1 node-id=1 status=ok accepting all reported=[cpu:80, write-bandwidth:8 store-id=2 node-id=1 status=ok accepting all reported=[cpu:80, write-bandwidth:80, byte-size:80] adjusted=[cpu:80, write-bandwidth:80, byte-size:80] node-reported-cpu=160 node-adjusted-cpu=80 seq=4 top-k-ranges (local-store-id=2) dim=ByteSize: r1 -# Advance time enough for change 2 to be eligible for undo GC. -tick seconds=300 +# Advance time, but not enough to GC change 2. 
+tick seconds=10 ---- -t=5m10s +t=20s -# Even after the undo based GC time has elapsed change 2 cannot be undone -# since it is no-rollback. +# Change 2 is not undone. gc-pending-changes ---- pending(1) change-id=2 store-id=1 node-id=1 range-id=1 load-delta=[cpu:-80, write-bandwidth:-80, byte-size:-80] start=0s gc=35s - prev=(replica-id=1 type=VOTER_FULL leaseholder=true) + prev=(replica-id=1 type=VOTER_FULL) next=(replica-id=none type=VOTER_FULL) -# Store leaseholder msg from s2, showing that s1 is no longer a replica. +ranges +---- +range-id=1 local-store=2 load=[cpu:80, write-bandwidth:80, byte-size:80] raft-cpu=20 + store-id=2 replica-id=2 type=VOTER_FULL leaseholder=true + +# Store leaseholder msg from s2, showing that s1 is no longer a replica, so +# change 2 is considered enacted. store-leaseholder-msg store-id=2 range-id=1 load=[80,80,80] raft-cpu=20 config=(num_replicas=3 constraints={'+region=us-west-1:1'} voter_constraints={'+region=us-west-1:1'}) store-id=2 replica-id=2 type=VOTER_FULL leaseholder=true ---- -# Now change 2 is considered enacted. get-pending-changes ---- pending(1) -change-id=2 store-id=1 node-id=1 range-id=1 load-delta=[cpu:-80, write-bandwidth:-80, byte-size:-80] start=0s gc=35s enacted=5m10s - prev=(replica-id=1 type=VOTER_FULL leaseholder=true) +change-id=2 store-id=1 node-id=1 range-id=1 load-delta=[cpu:-80, write-bandwidth:-80, byte-size:-80] start=0s gc=35s enacted=20s + prev=(replica-id=1 type=VOTER_FULL) next=(replica-id=none type=VOTER_FULL) # Change 2 is still serving the purpose of adjusting the load on s1. @@ -236,10 +233,9 @@ store-id=1 node-id=1 status=ok accepting all reported=[cpu:80, write-bandwidth:8 store-id=2 node-id=1 status=ok accepting all reported=[cpu:80, write-bandwidth:80, byte-size:80] adjusted=[cpu:80, write-bandwidth:80, byte-size:80] node-reported-cpu=160 node-adjusted-cpu=80 seq=4 top-k-ranges (local-store-id=2) dim=CPURate: r1 -# Store load msg from s2, showing its new load. The enacted change is still -# needed because the enactment time is equal to the load-time. +# Store load msg from s2, showing its new load. store-load-msg - store-id=2 node-id=1 load=[85,85,85] capacity=[100,100,100] secondary-load=1 load-time=310s + store-id=2 node-id=1 load=[85,85,85] capacity=[100,100,100] secondary-load=1 load-time=20s ---- get-load-info @@ -250,13 +246,13 @@ store-id=2 node-id=1 status=ok accepting all reported=[cpu:85, write-bandwidth:8 tick seconds=20 ---- -t=5m30s +t=40s # Store load msg from s1, showing its new load. The enacted change is no # longer needed for load adjustment since load-time is 20s after the enactment # time (and lagForChangeReflectedInLoad is 10s). store-load-msg - store-id=1 node-id=1 load=[5,5,5] capacity=[100,100,100] secondary-load=1 load-time=330s + store-id=1 node-id=1 load=[5,5,5] capacity=[100,100,100] secondary-load=1 load-time=40s ---- # No longer tracking change 2 for the sake of load. 
@@ -275,10 +271,10 @@ make-pending-changes range-id=1 rebalance-replica: remove-store-id=2 add-store-id=1 ---- pending(2) -change-id=3 store-id=1 node-id=1 range-id=1 load-delta=[cpu:88, write-bandwidth:88, byte-size:88] start=5m30s gc=10m30s +change-id=3 store-id=1 node-id=1 range-id=1 load-delta=[cpu:88, write-bandwidth:88, byte-size:88] start=40s gc=5m40s prev=(replica-id=none type=VOTER_FULL) next=(replica-id=unknown type=VOTER_FULL leaseholder=true) -change-id=4 store-id=2 node-id=1 range-id=1 load-delta=[cpu:-80, write-bandwidth:-80, byte-size:-80] start=5m30s gc=10m30s +change-id=4 store-id=2 node-id=1 range-id=1 load-delta=[cpu:-80, write-bandwidth:-80, byte-size:-80] start=40s gc=5m40s prev=(replica-id=2 type=VOTER_FULL leaseholder=true) next=(replica-id=none type=VOTER_FULL) @@ -292,7 +288,7 @@ store-id=2 node-id=1 status=ok accepting all reported=[cpu:85, write-bandwidth:8 # Enough time elapses to GC the changes. tick seconds=310 ---- -t=10m40s +t=5m50s gc-pending-changes ---- @@ -309,3 +305,80 @@ get-load-info store-id=1 node-id=1 status=ok accepting all reported=[cpu:5, write-bandwidth:5, byte-size:5] adjusted=[cpu:5, write-bandwidth:5, byte-size:5] node-reported-cpu=90 node-adjusted-cpu=90 seq=5 store-id=2 node-id=1 status=ok accepting all reported=[cpu:85, write-bandwidth:85, byte-size:85] adjusted=[cpu:85, write-bandwidth:85, byte-size:85] node-reported-cpu=90 node-adjusted-cpu=90 seq=7 top-k-ranges (local-store-id=2) dim=CPURate: r1 + +# Transfer the replica from s2 back to s1. The lease will also be transferred. +make-pending-changes range-id=1 + rebalance-replica: remove-store-id=2 add-store-id=1 +---- +pending(2) +change-id=5 store-id=1 node-id=1 range-id=1 load-delta=[cpu:88, write-bandwidth:88, byte-size:88] start=5m50s gc=10m50s + prev=(replica-id=none type=VOTER_FULL) + next=(replica-id=unknown type=VOTER_FULL leaseholder=true) +change-id=6 store-id=2 node-id=1 range-id=1 load-delta=[cpu:-80, write-bandwidth:-80, byte-size:-80] start=5m50s gc=10m50s + prev=(replica-id=2 type=VOTER_FULL leaseholder=true) + next=(replica-id=none type=VOTER_FULL) + +ranges +---- +range-id=1 local-store=2 load=[cpu:80, write-bandwidth:80, byte-size:80] raft-cpu=20 + store-id=1 replica-id=unknown type=VOTER_FULL leaseholder=true + +get-load-info +---- +store-id=1 node-id=1 status=ok accepting all reported=[cpu:5, write-bandwidth:5, byte-size:5] adjusted=[cpu:93, write-bandwidth:93, byte-size:93] node-reported-cpu=90 node-adjusted-cpu=98 seq=6 +store-id=2 node-id=1 status=ok accepting all reported=[cpu:85, write-bandwidth:85, byte-size:85] adjusted=[cpu:5, write-bandwidth:5, byte-size:5] node-reported-cpu=90 node-adjusted-cpu=98 seq=8 + top-k-ranges (local-store-id=2) dim=CPURate: r1 + +# Store leaseholder msg from s1, showing that s1 is a replica and has the lease, so +# change 5 is considered enacted. +store-leaseholder-msg +store-id=1 + range-id=1 load=[80,80,80] raft-cpu=20 config=(num_replicas=3 constraints={'+region=us-west-1:1'} voter_constraints={'+region=us-west-1:1'}) + store-id=1 replica-id=3 type=VOTER_FULL leaseholder=true + store-id=2 replica-id=2 type=VOTER_FULL +---- + +# Change 6 is not yet enacted, but the prev state no longer shows +# leaseholder=true. 
+get-pending-changes
+----
+pending(2)
+change-id=5 store-id=1 node-id=1 range-id=1 load-delta=[cpu:88, write-bandwidth:88, byte-size:88] start=5m50s gc=10m50s enacted=5m50s
+  prev=(replica-id=none type=VOTER_FULL)
+  next=(replica-id=unknown type=VOTER_FULL leaseholder=true)
+change-id=6 store-id=2 node-id=1 range-id=1 load-delta=[cpu:-80, write-bandwidth:-80, byte-size:-80] start=5m50s gc=6m20s
+  prev=(replica-id=2 type=VOTER_FULL)
+  next=(replica-id=none type=VOTER_FULL)
+
+ranges
+----
+range-id=1 local-store=1 load=[cpu:80, write-bandwidth:80, byte-size:80] raft-cpu=20
+  store-id=1 replica-id=3 type=VOTER_FULL leaseholder=true
+
+# store-id=2 still has top-k-ranges for local store s2 populated, since we
+# haven't yet received a store leaseholder msg from s2 indicating it no longer
+# is the leaseholder for r1.
+get-load-info
+----
+store-id=1 node-id=1 status=ok accepting all reported=[cpu:5, write-bandwidth:5, byte-size:5] adjusted=[cpu:93, write-bandwidth:93, byte-size:93] node-reported-cpu=90 node-adjusted-cpu=98 seq=6
+  top-k-ranges (local-store-id=1) dim=ByteSize: r1
+store-id=2 node-id=1 status=ok accepting all reported=[cpu:85, write-bandwidth:85, byte-size:85] adjusted=[cpu:5, write-bandwidth:5, byte-size:5] node-reported-cpu=90 node-adjusted-cpu=98 seq=8
+  top-k-ranges (local-store-id=2) dim=CPURate: r1
+
+# Change 6 is not yet GC'd, since its revised gc time (6m20s) has not elapsed.
+gc-pending-changes
+----
+pending(2)
+change-id=5 store-id=1 node-id=1 range-id=1 load-delta=[cpu:88, write-bandwidth:88, byte-size:88] start=5m50s gc=10m50s enacted=5m50s
+  prev=(replica-id=none type=VOTER_FULL)
+  next=(replica-id=unknown type=VOTER_FULL leaseholder=true)
+change-id=6 store-id=2 node-id=1 range-id=1 load-delta=[cpu:-80, write-bandwidth:-80, byte-size:-80] start=5m50s gc=6m20s
+  prev=(replica-id=2 type=VOTER_FULL)
+  next=(replica-id=none type=VOTER_FULL)
+
+# So one change was enacted and the other is still pending. The
+# one-leaseholder invariant is still satisfied.
+ranges
+----
+range-id=1 local-store=1 load=[cpu:80, write-bandwidth:80, byte-size:80] raft-cpu=20
+  store-id=1 replica-id=3 type=VOTER_FULL leaseholder=true
diff --git a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_replica_local_stores_enacted_gc b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_replica_local_stores_enacted_gc
index 59218c97e625..7d2790133568 100644
--- a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_replica_local_stores_enacted_gc
+++ b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_replica_local_stores_enacted_gc
@@ -5,13 +5,13 @@
 # - The range initially initially sits on (n1s1,n3s3), and is then rebalanced to
 #   (n1s2,n3s4), i.e. moved between n1's two stores.
 # - The second half of the pending change (removal of s1) never gets enacted.
-# - That change is pending is no-rollback due to the first half having been enacted.
-#   After 30s (partiallyEnactedGCDuration) has elapsed since the partial
-#   enactment, the removal is GC'd with the next leaseholder message. In
-#   particular, this change does not have to wait for the much longer regular GC
-#   timeout.
+# - That pending change is gc'd after 30s (partiallyEnactedGCDuration) has
+#   elapsed since the partial enactment. In particular, this change does not
+#   have to wait for the much longer regular GC timeout.
 # - A second rebalance is carried out on the same range, while an enacted remnant
 #   of the first operation remains.
+# - A third rebalance is carried out between two non-leaseholder stores, and +# is partially enacted and partially gc'd. set-store store-id=1 node-id=1 attrs=purple locality-tiers=region=us-west-1,zone=us-west-1a store-id=2 node-id=1 attrs=yellow locality-tiers=region=us-east-1,zone=us-east-1a @@ -91,7 +91,8 @@ store-id=2 store-id=2 replica-id=2 type=VOTER_FULL leaseholder=true ---- -# One change is considered enacted. +# One change is considered enacted. The prev value of the second change is +# updated, since the source-of-truth is the latest store leaseholder message. get-pending-changes ---- pending(2) @@ -99,7 +100,7 @@ change-id=1 store-id=2 node-id=1 range-id=1 load-delta=[cpu:2, write-bandwidth:2 prev=(replica-id=none type=VOTER_FULL) next=(replica-id=unknown type=VOTER_FULL leaseholder=true) change-id=2 store-id=1 node-id=1 range-id=1 load-delta=[cpu:-2, write-bandwidth:-2, byte-size:-2] start=0s gc=35s - prev=(replica-id=1 type=VOTER_FULL leaseholder=true) + prev=(replica-id=1 type=VOTER_FULL) next=(replica-id=none type=VOTER_FULL) ranges @@ -108,6 +109,20 @@ range-id=1 local-store=2 load=[cpu:3, write-bandwidth:3, byte-size:3] raft-cpu=2 store-id=2 replica-id=2 type=VOTER_FULL leaseholder=true store-id=3 replica-id=3 type=VOTER_FULL +# store-id={1,3} still have top-k-ranges for local store s1 populated with r1, +# since we haven't yet received a store leaseholder msg from s1 indicating it +# no longer is the leaseholder for r1. +get-load-info +---- +store-id=1 node-id=1 status=ok accepting all reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:-2, write-bandwidth:-2, byte-size:-2] node-reported-cpu=0 node-adjusted-cpu=0 seq=1 + top-k-ranges (local-store-id=1) dim=CPURate: r1 +store-id=2 node-id=1 status=ok accepting all reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:2, write-bandwidth:2, byte-size:2] node-reported-cpu=0 node-adjusted-cpu=0 seq=1 + top-k-ranges (local-store-id=2) dim=ByteSize: r1 +store-id=3 node-id=3 status=ok accepting all reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:0, write-bandwidth:0, byte-size:0] node-reported-cpu=0 node-adjusted-cpu=0 seq=0 + top-k-ranges (local-store-id=1) dim=WriteBandwidth: r1 + top-k-ranges (local-store-id=2) dim=WriteBandwidth: r1 +store-id=4 node-id=4 status=ok accepting all reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:0, write-bandwidth:0, byte-size:0] node-reported-cpu=0 node-adjusted-cpu=0 seq=0 + tick seconds=35 ---- t=40s @@ -119,7 +134,7 @@ change-id=1 store-id=2 node-id=1 range-id=1 load-delta=[cpu:2, write-bandwidth:2 prev=(replica-id=none type=VOTER_FULL) next=(replica-id=unknown type=VOTER_FULL leaseholder=true) change-id=2 store-id=1 node-id=1 range-id=1 load-delta=[cpu:-2, write-bandwidth:-2, byte-size:-2] start=0s gc=35s - prev=(replica-id=1 type=VOTER_FULL leaseholder=true) + prev=(replica-id=1 type=VOTER_FULL) next=(replica-id=none type=VOTER_FULL) # Same store leaseholder msg from s2. The pending change for s1 is gc'd because @@ -147,6 +162,9 @@ change-id=1 store-id=2 node-id=1 range-id=1 load-delta=[cpu:2, write-bandwidth:2 prev=(replica-id=none type=VOTER_FULL) next=(replica-id=unknown type=VOTER_FULL leaseholder=true) +# store-id={1,3} still have top-k-ranges for local store s1 populated with r1, +# since we haven't yet received a store leaseholder msg from s1 indicating it +# no longer is the leaseholder for r1. 
get-load-info ---- store-id=1 node-id=1 status=ok accepting all reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:0, write-bandwidth:0, byte-size:0] node-reported-cpu=0 node-adjusted-cpu=2 seq=2 @@ -159,6 +177,22 @@ store-id=3 node-id=3 status=ok accepting all reported=[cpu:0, write-bandwidth:0, top-k-ranges (local-store-id=2) dim=WriteBandwidth: r1 store-id=4 node-id=4 status=ok accepting all reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:0, write-bandwidth:0, byte-size:0] node-reported-cpu=0 node-adjusted-cpu=0 seq=0 +# Store leaseholder msg from s1, indicating no leases. +store-leaseholder-msg +store-id=1 +---- + +# No top-k ranges for local store s1 anymore. +get-load-info +---- +store-id=1 node-id=1 status=ok accepting all reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:0, write-bandwidth:0, byte-size:0] node-reported-cpu=0 node-adjusted-cpu=2 seq=2 + top-k-ranges (local-store-id=2) dim=WriteBandwidth: r1 +store-id=2 node-id=1 status=ok accepting all reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:2, write-bandwidth:2, byte-size:2] node-reported-cpu=0 node-adjusted-cpu=2 seq=1 + top-k-ranges (local-store-id=2) dim=ByteSize: r1 +store-id=3 node-id=3 status=ok accepting all reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:0, write-bandwidth:0, byte-size:0] node-reported-cpu=0 node-adjusted-cpu=0 seq=0 + top-k-ranges (local-store-id=2) dim=WriteBandwidth: r1 +store-id=4 node-id=4 status=ok accepting all reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:0, write-bandwidth:0, byte-size:0] node-reported-cpu=0 node-adjusted-cpu=0 seq=0 + # Make another pending change to transfer from s3 to s4. make-pending-changes range-id=1 rebalance-replica: remove-store-id=3 add-store-id=4 @@ -181,12 +215,10 @@ t=45s get-load-info ---- store-id=1 node-id=1 status=ok accepting all reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:0, write-bandwidth:0, byte-size:0] node-reported-cpu=0 node-adjusted-cpu=2 seq=2 - top-k-ranges (local-store-id=1) dim=CPURate: r1 top-k-ranges (local-store-id=2) dim=WriteBandwidth: r1 store-id=2 node-id=1 status=ok accepting all reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:2, write-bandwidth:2, byte-size:2] node-reported-cpu=0 node-adjusted-cpu=2 seq=1 top-k-ranges (local-store-id=2) dim=ByteSize: r1 store-id=3 node-id=3 status=ok accepting all reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:-2, write-bandwidth:-3, byte-size:-3] node-reported-cpu=0 node-adjusted-cpu=-2 seq=1 - top-k-ranges (local-store-id=1) dim=WriteBandwidth: r1 top-k-ranges (local-store-id=2) dim=WriteBandwidth: r1 store-id=4 node-id=4 status=ok accepting all reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:2, write-bandwidth:3, byte-size:3] node-reported-cpu=0 node-adjusted-cpu=2 seq=1 @@ -231,17 +263,8 @@ change-id=4 store-id=3 node-id=3 range-id=1 load-delta=[cpu:-2, write-bandwidth: prev=(replica-id=3 type=VOTER_FULL) next=(replica-id=none type=VOTER_FULL) -# Same store leaseholder msg from s2. The pending change for s3 is gc'd. -store-leaseholder-msg -store-id=2 - range-id=1 load=[3,3,3] raft-cpu=2 config=(num_replicas=3 constraints={'+region=us-west-1:1'} voter_constraints={'+region=us-west-1:1'}) - store-id=1 replica-id=1 type=VOTER_FULL - store-id=3 replica-id=3 type=VOTER_FULL - store-id=4 replica-id=4 type=VOTER_FULL - store-id=2 replica-id=2 type=VOTER_FULL leaseholder=true ----- - -get-pending-changes +# The pending change for s3 is gc'd. 
+gc-pending-changes ---- pending(2) change-id=1 store-id=2 node-id=1 range-id=1 load-delta=[cpu:2, write-bandwidth:2, byte-size:2] start=0s gc=5m0s enacted=5s @@ -251,15 +274,20 @@ change-id=3 store-id=4 node-id=4 range-id=1 load-delta=[cpu:2, write-bandwidth:3 prev=(replica-id=none type=VOTER_FULL) next=(replica-id=unknown type=VOTER_FULL) +ranges +---- +range-id=1 local-store=2 load=[cpu:3, write-bandwidth:3, byte-size:3] raft-cpu=2 + store-id=1 replica-id=1 type=VOTER_FULL + store-id=2 replica-id=2 type=VOTER_FULL leaseholder=true + store-id=4 replica-id=4 type=VOTER_FULL + store-id=3 replica-id=3 type=VOTER_FULL + get-load-info ---- store-id=1 node-id=1 status=ok accepting all reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:0, write-bandwidth:0, byte-size:0] node-reported-cpu=0 node-adjusted-cpu=2 seq=2 - top-k-ranges (local-store-id=1) dim=CPURate: r1 top-k-ranges (local-store-id=2) dim=WriteBandwidth: r1 store-id=2 node-id=1 status=ok accepting all reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:2, write-bandwidth:2, byte-size:2] node-reported-cpu=0 node-adjusted-cpu=2 seq=1 top-k-ranges (local-store-id=2) dim=ByteSize: r1 store-id=3 node-id=3 status=ok accepting all reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:0, write-bandwidth:0, byte-size:0] node-reported-cpu=0 node-adjusted-cpu=0 seq=2 - top-k-ranges (local-store-id=1) dim=WriteBandwidth: r1 - top-k-ranges (local-store-id=2) dim=WriteBandwidth: r1 store-id=4 node-id=4 status=ok accepting all reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:2, write-bandwidth:3, byte-size:3] node-reported-cpu=0 node-adjusted-cpu=2 seq=1 top-k-ranges (local-store-id=2) dim=ByteSize: r1 diff --git a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_replica_local_stores_fail_lease_transfer b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_replica_local_stores_fail_lease_transfer new file mode 100644 index 000000000000..0b4a92abdd55 --- /dev/null +++ b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_replica_local_stores_fail_lease_transfer @@ -0,0 +1,77 @@ +set-store + store-id=1 node-id=1 attrs=purple locality-tiers=region=us-west-1,zone=us-west-1a + store-id=2 node-id=1 attrs=yellow locality-tiers=region=us-east-1,zone=us-east-1a +---- +node-id=1 locality-tiers=region=us-west-1,zone=us-west-1a,node=1 + store-id=1 attrs=purple locality-code=1:2:3: + store-id=2 attrs=yellow locality-code=4:5:3: + +# Store s1 is the leaseholder for range r1. +store-leaseholder-msg +store-id=1 + range-id=1 load=[2,2,2] raft-cpu=1 config=(num_replicas=3 constraints={'+region=us-west-1:1'} voter_constraints={'+region=us-west-1:1'}) + store-id=1 replica-id=1 type=VOTER_FULL leaseholder=true +---- + +ranges +---- +range-id=1 local-store=1 load=[cpu:2, write-bandwidth:2, byte-size:2] raft-cpu=1 + store-id=1 replica-id=1 type=VOTER_FULL leaseholder=true + +# Transfer the replica from s1 to s2. The lease will also be transferred. 
+make-pending-changes range-id=1 + rebalance-replica: remove-store-id=1 add-store-id=2 +---- +pending(2) +change-id=1 store-id=2 node-id=1 range-id=1 load-delta=[cpu:2, write-bandwidth:2, byte-size:2] start=0s gc=5m0s + prev=(replica-id=none type=VOTER_FULL) + next=(replica-id=unknown type=VOTER_FULL leaseholder=true) +change-id=2 store-id=1 node-id=1 range-id=1 load-delta=[cpu:-2, write-bandwidth:-2, byte-size:-2] start=0s gc=5m0s + prev=(replica-id=1 type=VOTER_FULL leaseholder=true) + next=(replica-id=none type=VOTER_FULL) + +# Store leaseholder msg from s1, showing that s1 still has the replica and +# lease, and s2 also has a replica. +store-leaseholder-msg +store-id=1 + range-id=1 load=[3,3,3] raft-cpu=2 config=(num_replicas=3 constraints={'+region=us-west-1:1'} voter_constraints={'+region=us-west-1:1'}) + store-id=1 replica-id=1 type=VOTER_FULL leaseholder=true + store-id=2 replica-id=2 type=VOTER_FULL +---- + +# Both pending changes are still not enacted. The prev state for change-id=1 +# reflects that s2 has a voter replica. +get-pending-changes +---- +pending(2) +change-id=1 store-id=2 node-id=1 range-id=1 load-delta=[cpu:2, write-bandwidth:2, byte-size:2] start=0s gc=5m0s + prev=(replica-id=2 type=VOTER_FULL) + next=(replica-id=unknown type=VOTER_FULL leaseholder=true) +change-id=2 store-id=1 node-id=1 range-id=1 load-delta=[cpu:-2, write-bandwidth:-2, byte-size:-2] start=0s gc=5m0s + prev=(replica-id=1 type=VOTER_FULL leaseholder=true) + next=(replica-id=none type=VOTER_FULL) + +get-load-info +---- +store-id=1 node-id=1 status=ok accepting all reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:-2, write-bandwidth:-2, byte-size:-2] node-reported-cpu=0 node-adjusted-cpu=0 seq=1 +store-id=2 node-id=1 status=ok accepting all reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:2, write-bandwidth:2, byte-size:2] node-reported-cpu=0 node-adjusted-cpu=0 seq=1 + +tick seconds=330 +---- +t=5m30s + +gc-pending-changes +---- +pending(0) + +get-pending-changes +---- +pending(0) + +# The replica from store s2 is removed because of GC. This is not correct. +# +# TODO: fix after cleanup PR is merged and this rebases on top. +ranges +---- +range-id=1 local-store=1 load=[cpu:3, write-bandwidth:3, byte-size:3] raft-cpu=2 + store-id=1 replica-id=1 type=VOTER_FULL leaseholder=true