Commit 0e33c11

mmaprototype: more test and comment improvements etc.
1 parent 7e006d5 commit 0e33c11

7 files changed: +160, -103 lines

pkg/kv/kvserver/allocator/mmaprototype/allocator_state.go

Lines changed: 20 additions & 5 deletions

@@ -818,7 +818,12 @@ func (a *allocatorState) ProcessStoreLoadMsg(ctx context.Context, msg *StoreLoad
 func (a *allocatorState) AdjustPendingChangesDisposition(changeIDs []ChangeID, success bool) {
 	a.mu.Lock()
 	defer a.mu.Unlock()
+	// NB: It is possible that some of the changeIDs have already been enacted
+	// via StoreLeaseholderMsg, and even been garbage collected. So no
+	// assumption can be made about whether these changeIDs will be found in the
+	// allocator's state.
 	if !success {
+		// Gather the changes that are found and need to be undone.
 		replicaChanges := make([]ReplicaChange, 0, len(changeIDs))
 		for _, changeID := range changeIDs {
 			change, ok := a.cs.pendingChanges[changeID]
@@ -834,25 +839,35 @@ func (a *allocatorState) AdjustPendingChangesDisposition(changeIDs []ChangeID, s
 				return
 			}
 			replicaChanges = append(replicaChanges, change.ReplicaChange)
-			// Else ignore this change. We don't want to pass this change to
-			// pre-check since it will likely violate an invariant and cause us to
-			// emit a noisy log message.
 		}
 		if len(replicaChanges) == 0 {
 			return
 		}
+		// Check that we can undo these changes. If not, log and return.
 		if err := a.cs.preCheckOnUndoReplicaChanges(replicaChanges); err != nil {
+			// TODO(sumeer): we should be able to panic here, once the interface
+			// contract says that all the proposed changes must be included in
+			// changeIDs. Without that contract, there may be a pair of changes
+			// (remove replica and lease from s1), (add replica and lease to s2),
+			// and the caller can provide the first changeID only, and the undo
+			// would cause two leaseholders. The pre-check would catch that here.
			log.KvDistribution.Infof(context.Background(), "did not undo change %v: due to %v", changeIDs, err)
 			return
 		}
 	}

 	for _, changeID := range changeIDs {
-		// We set !requireFound, since a StoreLeaseholderMsg that happened after
-		// the pending change was created and before this call to
+		// We set !requireFound, since some of these pending changes may no longer
+		// exist in the allocator's state. For example, a StoreLeaseholderMsg that
+		// happened after the pending change was created and before this call to
 		// AdjustPendingChangesDisposition may have already removed the pending
 		// change.
 		if success {
+			// TODO(sumeer): this code is implicitly assuming that all the changes
+			// on the rangeState are being enacted. And that is true of the current
+			// callers. We should explicitly state the assumption in the interface.
+			// Because if only some are being enacted, we ought to set
+			// pendingChangeNoRollback, and we don't bother to.
 			a.cs.pendingChangeEnacted(changeID, a.cs.ts.Now(), false)
 		} else {
 			a.cs.undoPendingChange(changeID, false)
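
A minimal caller-side sketch (not part of this commit, written as if inside the mmaprototype package) of the contract the TODOs above describe: every ChangeID belonging to one proposed change is reported together, so a failed change is undone as a unit. Only AdjustPendingChangesDisposition appears in the diff; the helper name reportDisposition and the enactErr plumbing are illustrative assumptions.

// reportDisposition is a hypothetical helper showing how a caller might
// report the outcome of a complex change (e.g. a rebalance consisting of a
// remove on s1 and an add on s2).
func reportDisposition(a *allocatorState, changeIDs []ChangeID, enactErr error) {
	if enactErr != nil {
		// Pass all the ChangeIDs of the proposed change together. Undoing only
		// one half of a (remove replica and lease from s1, add replica and
		// lease to s2) pair could leave two leaseholders, which is exactly what
		// the pre-check above guards against.
		a.AdjustPendingChangesDisposition(changeIDs, false /* success */)
		return
	}
	a.AdjustPendingChangesDisposition(changeIDs, true /* success */)
}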

pkg/kv/kvserver/allocator/mmaprototype/cluster_state.go

Lines changed: 45 additions & 18 deletions

@@ -536,6 +536,12 @@ func (prc PendingRangeChange) LeaseTransferFrom() roachpb.StoreID {
 	panic("unreachable")
 }

+// TODO(sumeer): we have various methods that take slices of either ChangeIDs
+// or pendingReplicaChanges or ReplicaChange, and have callers that already
+// have or could first construct a slice of pendingReplicaChanges, and avoid
+// various temporary slice construction and repeated map lookups. Clean this
+// up.
+
 // pendingReplicaChange is a proposed change to a single replica. Some
 // external entity (the leaseholder of the range) may choose to enact this
 // change. It may not be enacted if it will cause some invariant (like the
@@ -559,6 +565,10 @@ type pendingReplicaChange struct {
 	// earlier GC. It is used to hasten GC (for the remaining changes) when some
 	// subset of changes corresponding to the same complex change have been
 	// observed to be enacted.
+	//
+	// The GC of these changes happens on a different path than the usual GC,
+	// which can undo the changes -- this GC happens only when processing a
+	// RangeMsg from the leaseholder.
 	revisedGCTime time.Time

 	// TODO(kvoli,sumeerbhola): Consider adopting an explicit expiration time,
@@ -902,7 +912,12 @@ type rangeState struct {
 	// change for adding s4 includes both that it has a replica, and it has the
 	// lease, so we will not mark it done, and keep pretending that the whole
 	// change is pending. Since lease transfers are fast, we accept this
-	// imperfect modeling fidelity.
+	// imperfect modeling fidelity. One consequence of this imperfect modeling
+	// is that if in this example there are no further changes observed until
+	// GC, the allocator will undo both changes and go back to the state {s1,
+	// s2, s3} with s3 as the leaseholder. That is, it has forgotten that s4 was
+	// added. This is unavoidable and will be fixed by the first
+	// StoreLeaseholderMsg post-GC.
 	//
 	// 3. Non Atomicity Hazard
 	//
@@ -1627,9 +1642,7 @@ func (cs *clusterState) gcPendingChanges(now time.Time) {
 			changeIDs = append(changeIDs, pendingChange.ChangeID)
 		}
 		if err := cs.preCheckOnUndoReplicaChanges(replicaChanges); err != nil {
-			log.KvDistribution.Infof(context.Background(),
-				"did not undo changes to range %v: due to %v", rangeID, err)
-			continue
+			panic(err)
 		}
 		for _, changeID := range changeIDs {
 			cs.undoPendingChange(changeID, true)
@@ -1773,6 +1786,13 @@ func (cs *clusterState) createPendingChanges(changes ...ReplicaChange) []*pendin

 // preCheckOnApplyReplicaChanges does some validation of the changes being
 // proposed. It ensures the range is known and has no pending changes already.
+//
+// It only needs to be called for (a) new changes that are being proposed, or
+// (b) when we have reset the rangeState.replicas using a StoreLeaseholderMsg
+// and we have some previously proposed pending changes that have not been
+// enacted yet, and we want to re-validate them before adjusting
+// rangeState.replicas.
+//
 // For a removal, it validates that the replica exists. For non-removal, it
 // blind applies the change without validating whether the current state is
 // ReplicaChange.prev -- this blind application allows this pre-check to
@@ -1783,6 +1803,20 @@ func (cs *clusterState) createPendingChanges(changes ...ReplicaChange) []*pendin
 //
 // REQUIRES: all the changes are to the same range; there are 1, 2 or 4
 // changes.
+//
+// TODO(sumeer): the 4 changes part is a hack because the asim conformance
+// test produces a change (when running under SMA) which is:
+//
+//	r10 type: RemoveReplica target store n3,s3 (replica-id=5 type=NON_VOTER)->(replica-id=none type=VOTER_FULL)
+//	r10 type: RemoveReplica target store n2,s2 (replica-id=2 type=VOTER_FULL)->(replica-id=none type=VOTER_FULL)
+//	r10 type: AddReplica target store n3,s3 (replica-id=none type=VOTER_FULL)->(replica-id=unknown type=VOTER_FULL)
+//	r10 type: AddReplica target store n2,s2 (replica-id=none type=VOTER_FULL)->(replica-id=unknown type=NON_VOTER)]
+//
+// This change violates the requirement that there should be a single change
+// per store. Fix how this is modeled and disallow 4 changes.
+//
+// TODO(sumeer): allow arbitrary number of changes, but validate that at most
+// one change per store.
 func (cs *clusterState) preCheckOnApplyReplicaChanges(changes []ReplicaChange) error {
 	// preApplyReplicaChange is called before applying a change to the cluster
 	// state.
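
A hedged sketch (not from this commit, written as if inside the mmaprototype package) of the validation the second TODO above asks for: accept any number of changes but require at most one per store. The accessor for a change's target store is an assumption here (written as a hypothetical storeID helper); errors.AssertionFailedf and roachpb.StoreID are identifiers already used in this package.

// checkOneChangePerStore is an illustrative validation: it rejects a batch
// of changes that touches the same store more than once.
func checkOneChangePerStore(changes []ReplicaChange) error {
	seen := make(map[roachpb.StoreID]struct{}, len(changes))
	for _, c := range changes {
		sid := storeID(c) // hypothetical accessor for the change's target store
		if _, ok := seen[sid]; ok {
			return errors.AssertionFailedf("multiple changes for store s%d", sid)
		}
		seen[sid] = struct{}{}
	}
	return nil
}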
@@ -1833,23 +1867,16 @@ func (cs *clusterState) preCheckOnApplyReplicaChanges(changes []ReplicaChange) e
 	return replicaSetIsValid(copiedCurr.replicas)
 }

-// TODO: this is unnecessary since if we always check against the current
-// state before allowing a chang to be added (including re-addition after a
-// StoreLeaseholderMsg), we should never have invalidity during an undo.
-// Which is why this function now panics except for the trivial cases of no
-// changes or the range not existing in the cluster state.
-//
-// This is also justified by the current callers. If this were to return false
-// in non-trivial cases, what is the caller supposed to do? These changes have
-// been reflected on both the membership and load information. Undoing the
-// latter is trivial since it is just subtraction of numbers. But it can't
-// undo the membership changes. So we presumably have left membership in an
-// inconsistent state.
-
 // preCheckOnUndoReplicaChanges does some validation of the changes being
 // proposed for undo.
 //
-// REQUIRES: changes is non-empty, and all changes are to the same range.
+// REQUIRES: changes is non-empty; all changes are to the same range; the
+// rangeState.pendingChangeNoRollback is false.
+//
+// This method is defensive since if we always check against the current state
+// before allowing a change to be added (including re-addition after a
+// StoreLeaseholderMsg), we should never have invalidity during an undo, if
+// all the changes are being undone.
 func (cs *clusterState) preCheckOnUndoReplicaChanges(changes []ReplicaChange) error {
 	if len(changes) == 0 {
 		panic(errors.AssertionFailedf("no changes to undo"))

pkg/kv/kvserver/allocator/mmaprototype/cluster_state_test.go

Lines changed: 5 additions & 0 deletions

@@ -428,6 +428,11 @@ func TestClusterState(t *testing.T) {
 			return ss.status.String()

 		case "store-load-msg":
+			// TODO(sumeer): the load-time is passed as an argument, and is
+			// independent of ts. This is by necessity, since the load-time can
+			// be in the past, indicating gossip delay. However, having it be
+			// some arbitrary value can be confusing for the test reader.
+			// Consider making it relative to ts.
 			msg := parseStoreLoadMsg(t, d.Input)
 			cs.processStoreLoadMsg(context.Background(), &msg)
 			return ""

pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/multiple_ranges

Lines changed: 4 additions & 2 deletions

@@ -100,8 +100,9 @@ range-id=3 local-store=1 load=[cpu:30, write-bandwidth:10, byte-size:10] raft-cp
 store-id=1 replica-id=1 type=VOTER_FULL leaseholder=true
 store-id=2 replica-id=2 type=VOTER_FULL

-# Exercise the path that ensures consistency even though not-populated is
-# true.
+# Change the membership, while MaybeSpanConfIsPopulated=false, to ensure that
+# we notice the change in replicas. One of these ranges has changed the number
+# of replicas, and the other the replica-id of one of the store's replicas.
 store-leaseholder-msg
 store-id=1
 range-id=2 not-populated
@@ -111,6 +112,7 @@ store-id=1
 store-id=1 replica-id=1 type=VOTER_FULL leaseholder=true
 ----

+# The ranges reflect the latest state from the StoreLeaseholderMsg.
 ranges
 ----
 range-id=2 local-store=1 load=[cpu:20, write-bandwidth:10, byte-size:15] raft-cpu=10

pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_replica

Lines changed: 29 additions & 24 deletions

@@ -21,19 +21,19 @@ set-store
 store-id=1 node-id=1 attrs=purple locality-tiers=region=us-west-1,zone=us-west-1a
 store-id=2 node-id=2 attrs=yellow locality-tiers=region=us-east-1,zone=us-east-1a
 ----
-node-id=1 failure-summary=ok locality-tiers=region=us-west-1,zone=us-west-1a,node=1
-store-id=1 membership=full attrs=purple locality-code=1:2:3:
-node-id=2 failure-summary=ok locality-tiers=region=us-east-1,zone=us-east-1a,node=2
-store-id=2 membership=full attrs=yellow locality-code=4:5:6:
+node-id=1 locality-tiers=region=us-west-1,zone=us-west-1a,node=1
+store-id=1 attrs=purple locality-code=1:2:3:
+node-id=2 locality-tiers=region=us-east-1,zone=us-east-1a,node=2
+store-id=2 attrs=yellow locality-code=4:5:6:

 store-load-msg
 store-id=1 node-id=1 load=[80,80,80] capacity=[100,100,100] secondary-load=0 load-time=0s
 ----

 get-load-info
 ----
-store-id=1 node-id=1 reported=[cpu:80, write-bandwidth:80, byte-size:80] adjusted=[cpu:80, write-bandwidth:80, byte-size:80] node-reported-cpu=80 node-adjusted-cpu=80 seq=1
-store-id=2 node-id=2 reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:0, write-bandwidth:0, byte-size:0] node-reported-cpu=0 node-adjusted-cpu=0 seq=0
+store-id=1 node-id=1 status=ok accepting all reported=[cpu:80, write-bandwidth:80, byte-size:80] adjusted=[cpu:80, write-bandwidth:80, byte-size:80] node-reported-cpu=80 node-adjusted-cpu=80 seq=1
+store-id=2 node-id=2 status=ok accepting all reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:0, write-bandwidth:0, byte-size:0] node-reported-cpu=0 node-adjusted-cpu=0 seq=0

 store-leaseholder-msg
 store-id=1
@@ -49,9 +49,9 @@ range-id=1 local-store=1 load=[cpu:80, write-bandwidth:80, byte-size:80] raft-cp

 get-load-info
 ----
-store-id=1 node-id=1 reported=[cpu:80, write-bandwidth:80, byte-size:80] adjusted=[cpu:80, write-bandwidth:80, byte-size:80] node-reported-cpu=80 node-adjusted-cpu=80 seq=1
+store-id=1 node-id=1 status=ok accepting all reported=[cpu:80, write-bandwidth:80, byte-size:80] adjusted=[cpu:80, write-bandwidth:80, byte-size:80] node-reported-cpu=80 node-adjusted-cpu=80 seq=1
 top-k-ranges (local-store-id=1) dim=CPURate: r1
-store-id=2 node-id=2 reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:0, write-bandwidth:0, byte-size:0] node-reported-cpu=0 node-adjusted-cpu=0 seq=0
+store-id=2 node-id=2 status=ok accepting all reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:0, write-bandwidth:0, byte-size:0] node-reported-cpu=0 node-adjusted-cpu=0 seq=0

 make-pending-changes range-id=1
 rebalance-replica: remove-store-id=1 add-store-id=2
@@ -71,9 +71,9 @@ range-id=1 local-store=1 load=[cpu:80, write-bandwidth:80, byte-size:80] raft-cp

 get-load-info
 ----
-store-id=1 node-id=1 reported=[cpu:80, write-bandwidth:80, byte-size:80] adjusted=[cpu:0, write-bandwidth:0, byte-size:0] node-reported-cpu=80 node-adjusted-cpu=0 seq=2
+store-id=1 node-id=1 status=ok accepting all reported=[cpu:80, write-bandwidth:80, byte-size:80] adjusted=[cpu:0, write-bandwidth:0, byte-size:0] node-reported-cpu=80 node-adjusted-cpu=0 seq=2
 top-k-ranges (local-store-id=1) dim=CPURate: r1
-store-id=2 node-id=2 reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:88, write-bandwidth:88, byte-size:88] node-reported-cpu=0 node-adjusted-cpu=88 seq=1
+store-id=2 node-id=2 status=ok accepting all reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:88, write-bandwidth:88, byte-size:88] node-reported-cpu=0 node-adjusted-cpu=88 seq=1

 # Same store load from s1. Results in no change.
 store-load-msg
@@ -82,9 +82,9 @@ store-load-msg

 get-load-info
 ----
-store-id=1 node-id=1 reported=[cpu:80, write-bandwidth:80, byte-size:80] adjusted=[cpu:0, write-bandwidth:0, byte-size:0] node-reported-cpu=80 node-adjusted-cpu=0 seq=4
+store-id=1 node-id=1 status=ok accepting all reported=[cpu:80, write-bandwidth:80, byte-size:80] adjusted=[cpu:0, write-bandwidth:0, byte-size:0] node-reported-cpu=80 node-adjusted-cpu=0 seq=4
 top-k-ranges (local-store-id=1) dim=CPURate: r1
-store-id=2 node-id=2 reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:88, write-bandwidth:88, byte-size:88] node-reported-cpu=0 node-adjusted-cpu=88 seq=1
+store-id=2 node-id=2 status=ok accepting all reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:88, write-bandwidth:88, byte-size:88] node-reported-cpu=0 node-adjusted-cpu=88 seq=1

 # Store leaseholder msg from s1 showing that s2 has a replica but not the lease.
 store-leaseholder-msg
@@ -105,6 +105,7 @@ change-id=2 store-id=1 node-id=1 range-id=1 load-delta=[cpu:-80, write-bandwidth
 prev=(replica-id=1 type=VOTER_FULL leaseholder=true)
 next=(replica-id=none type=VOTER_FULL)

+# Advance just to simulate some passage of time.
 tick seconds=5
 ----
 t=5s
@@ -131,8 +132,8 @@ ranges
 # The enacted changes are still adjusting the load.
 get-load-info
 ----
-store-id=1 node-id=1 reported=[cpu:80, write-bandwidth:80, byte-size:80] adjusted=[cpu:0, write-bandwidth:0, byte-size:0] node-reported-cpu=80 node-adjusted-cpu=0 seq=4
-store-id=2 node-id=2 reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:88, write-bandwidth:88, byte-size:88] node-reported-cpu=0 node-adjusted-cpu=88 seq=1
+store-id=1 node-id=1 status=ok accepting all reported=[cpu:80, write-bandwidth:80, byte-size:80] adjusted=[cpu:0, write-bandwidth:0, byte-size:0] node-reported-cpu=80 node-adjusted-cpu=0 seq=4
+store-id=2 node-id=2 status=ok accepting all reported=[cpu:0, write-bandwidth:0, byte-size:0] adjusted=[cpu:88, write-bandwidth:88, byte-size:88] node-reported-cpu=0 node-adjusted-cpu=88 seq=1

 # Store load msg from s2 showing updated load.
 store-load-msg
@@ -141,16 +142,18 @@ store-load-msg

 # Store load msg from s1 showing updated load.
 store-load-msg
-store-id=1 node-id=2 load=[5,5,5] capacity=[100,100,100] secondary-load=1 load-time=14s
+store-id=1 node-id=1 load=[5,5,5] capacity=[100,100,100] secondary-load=1 load-time=14s
 ----

-# Neither load is recent enough (computePendingChangesReflectedInLatestLoad
-# timeout) to be considered as accounting for the enacted changes. So s2
-# adjusted load appears very high and s1 adjusted load becomes negative.
+# Both of the load msgs had load-time=14s, while the enacted time was 5s.
+# Neither is recent enough, since lagForChangeReflectedInLoad is 10s (see
+# computePendingChangesReflectedInLatestLoad) to be considered as accounting
+# for the enacted changes. So s2 adjusted load appears very high and s1
+# adjusted load becomes negative.
 get-load-info
 ----
-store-id=1 node-id=1 reported=[cpu:5, write-bandwidth:5, byte-size:5] adjusted=[cpu:-75, write-bandwidth:-75, byte-size:-75] node-reported-cpu=80 node-adjusted-cpu=-80 seq=6
-store-id=2 node-id=2 reported=[cpu:80, write-bandwidth:80, byte-size:80] adjusted=[cpu:168, write-bandwidth:168, byte-size:168] node-reported-cpu=5 node-adjusted-cpu=173 seq=3
+store-id=1 node-id=1 status=ok accepting all reported=[cpu:5, write-bandwidth:5, byte-size:5] adjusted=[cpu:-75, write-bandwidth:-75, byte-size:-75] node-reported-cpu=5 node-adjusted-cpu=-75 seq=6
+store-id=2 node-id=2 status=ok accepting all reported=[cpu:80, write-bandwidth:80, byte-size:80] adjusted=[cpu:168, write-bandwidth:168, byte-size:168] node-reported-cpu=80 node-adjusted-cpu=168 seq=3

 # The enacted changes are still adjusting the load.
 get-pending-changes
@@ -170,15 +173,17 @@ store-load-msg

 # Store load msg from s1 showing updated load.
 store-load-msg
-store-id=1 node-id=2 load=[5,5,5] capacity=[100,100,100] secondary-load=1 load-time=16s
+store-id=1 node-id=1 load=[5,5,5] capacity=[100,100,100] secondary-load=1 load-time=16s
 ----

-# The enacted changes are no longer adjusting the load.
+# Both of the load msgs had load-time=16s, while the enacted time was 5s. So
+# they are recent enough to be considered as accounting for the enacted
+# changes. The enacted changes are no longer adjusting the load.
 get-pending-changes
 ----
 pending(0)

 get-load-info
 ----
-store-id=1 node-id=1 reported=[cpu:5, write-bandwidth:5, byte-size:5] adjusted=[cpu:5, write-bandwidth:5, byte-size:5] node-reported-cpu=80 node-adjusted-cpu=-80 seq=7
-store-id=2 node-id=2 reported=[cpu:80, write-bandwidth:80, byte-size:80] adjusted=[cpu:80, write-bandwidth:80, byte-size:80] node-reported-cpu=5 node-adjusted-cpu=165 seq=4
+store-id=1 node-id=1 status=ok accepting all reported=[cpu:5, write-bandwidth:5, byte-size:5] adjusted=[cpu:5, write-bandwidth:5, byte-size:5] node-reported-cpu=5 node-adjusted-cpu=5 seq=7
+store-id=2 node-id=2 status=ok accepting all reported=[cpu:80, write-bandwidth:80, byte-size:80] adjusted=[cpu:80, write-bandwidth:80, byte-size:80] node-reported-cpu=80 node-adjusted-cpu=80 seq=4
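
A small sketch (not part of the test file, written as if inside the mmaprototype package) of the timing rule the comments above rely on: a store's reported load is treated as already reflecting an enacted change only once its load-time is sufficiently past the enactment time, where the lag is lagForChangeReflectedInLoad (10s per the comment). The function name below is illustrative; only the constant name and computePendingChangesReflectedInLatestLoad come from the test comments, and the exact boundary behavior is an assumption (the test only shows 14s failing and 16s passing with enactment at 5s).

// loadReflectsChange sketches the comparison: with the change enacted at
// t=5s and a 10s lag, load-time=14s is too early, so the load adjustments
// stay, while load-time=16s is late enough and they are dropped.
func loadReflectsChange(loadTime, enactedTime time.Time) bool {
	const lagForChangeReflectedInLoad = 10 * time.Second
	return !loadTime.Before(enactedTime.Add(lagForChangeReflectedInLoad))
}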
