Merge #143689

craig[bot] · pav-kv · craig[bot] · commit 0cbfd4a7b627 · 2025-04-02T21:43:38.000Z
143689: kvserver: load committed entries after mu.Unlock() r=tbg a=pav-kv This commit makes `handleRaftReadyRaftMuLocked` load the `Ready.Committed` entries after releasing `Replica.mu`, while still holding `Replica.raftMu`. This ensures that we don't have this IO under `Replica.mu`. Also, this storage interaction is placed after `r.sendRaftMessages`, which reduces messaging latency. This also fixes a bug. Previously, `detachRaftEntriesMonitorRaftMuLocked` was called too soon, right after `Ready` had been generated. Instead, the raft entries monitor should stay attached for the entire `Ready` handling scope, which includes applying the entries to the state machine. Part of #140235 Related to #143652, #143615 Obsoletes #125842 Co-authored-by: Pavel Kalinnikov <pavel@cockroachlabs.com>
diff --git a/pkg/kv/kvserver/logstore/logstore.go b/pkg/kv/kvserver/logstore/logstore.go
@@ -680,6 +680,9 @@ func LoadTerm(
 // which is a serious issue. But if the caller is unsure, they can check the
 // LastIndex to distinguish.
 //
+// The bytesAccount is used to account for and limit the loaded bytes. It can be
+// nil when the accounting / limiting is not needed.
+//
 // TODO(#132114): eliminate both ErrCompacted and ErrUnavailable.
 // TODO(pavelkalinnikov): return all entries we've read, consider maxSize a
 // target size. Currently we may read one extra entry and drop it.
diff --git a/pkg/kv/kvserver/replica_raft.go b/pkg/kv/kvserver/replica_raft.go
@@ -946,14 +946,13 @@ func (r *Replica) handleRaftReadyRaftMuLocked(
 	}
 
 	var hasReady bool
-	var outboundMsgs []raftpb.Message
-	var msgStorageAppend raftpb.Message
-	var toApply []raftpb.Entry
+	var ready raft.Ready
+	var logSnapshot raft.LogSnapshot
+
 	rac2ModeToUse := r.replicationAdmissionControlModeToUse(ctx)
 	// Replication AC v2 state that is initialized while holding Replica.mu.
 	replicaStateInfoMap := r.raftMu.replicaStateScratchForFlowControl
 	var raftNodeBasicState replica_rac2.RaftNodeBasicState
-	var logSnapshot raft.LogSnapshot
 
 	rac2ModeForReady := r.shMu.currentRACv2Mode
 	leaderID := r.shMu.leaderID
@@ -981,31 +980,7 @@ func (r *Replica) handleRaftReadyRaftMuLocked(
 		}
 		logSnapshot = raftGroup.LogSnapshot()
 		if hasReady = raftGroup.HasReady(); hasReady {
-			// Since we are holding raftMu, only the Slice() call below will use
-			// raftMu.bytesAccount. It tracks memory usage that this Ready incurs.
-			r.attachRaftEntriesMonitorRaftMuLocked()
-			// TODO(pav-kv): currently, Slice() only accounts for entry bytes loaded
-			// from log storage, and ignores the in-memory unstable entries. Pass a
-			// flow control struct down the stack, and do a more complete accounting
-			// in raft. This will also eliminate the "side channel" plumbing hack with
-			// this bytesAccount.
-			rd := raftGroup.Ready()
-			if !rd.Committed.Empty() {
-				// TODO(pav-kv): do this loading when Replica.mu is released. We don't
-				// want IO under Replica.mu.
-				if toApply, err = logSnapshot.Slice(
-					rd.Committed, r.store.cfg.RaftMaxCommittedSizePerReady,
-				); err != nil {
-					return false, err
-				}
-			}
-			// We apply committed entries during this handleRaftReady, so it is ok to
-			// release the corresponding memory tokens at the end of this func. Next
-			// time we enter this function, the account will be empty again.
-			defer r.detachRaftEntriesMonitorRaftMuLocked()
-
-			logRaftReady(ctx, rd)
-			outboundMsgs, msgStorageAppend = splitLocalStorageMsgs(rd.Messages)
+			ready = raftGroup.Ready()
 		}
 		if switchToPullModeAfterReady {
 			raftGroup.SetLazyReplication(true)
@@ -1028,7 +1003,7 @@ func (r *Replica) handleRaftReadyRaftMuLocked(
 		unquiesceAndWakeLeader := hasReady || numFlushed > 0 || len(r.mu.proposals) > 0
 		return unquiesceAndWakeLeader, nil
 	})
-	r.mu.applyingEntries = len(toApply) != 0
+	r.mu.applyingEntries = !ready.Committed.Empty()
 	pausedFollowers := r.mu.pausedFollowers
 	r.mu.Unlock()
 	if errors.Is(err, errRemoved) {
@@ -1037,6 +1012,13 @@ func (r *Replica) handleRaftReadyRaftMuLocked(
 	} else if err != nil {
 		return stats, errors.Wrap(err, "checking raft group for Ready")
 	}
+
+	var outboundMsgs []raftpb.Message
+	var msgStorageAppend raftpb.Message
+	if hasReady {
+		logRaftReady(ctx, ready)
+		outboundMsgs, msgStorageAppend = splitLocalStorageMsgs(ready.Messages)
+	}
 	// Even if we don't have a Ready, or entries in Ready,
 	// replica_rac2.Processor may need to do some work.
 	raftEvent := rac2.RaftEventFromMsgStorageAppendAndMsgApps(
@@ -1074,6 +1056,30 @@ func (r *Replica) handleRaftReadyRaftMuLocked(
 	r.traceMessageSends(outboundMsgs, "sending messages")
 	r.sendRaftMessages(ctx, outboundMsgs, pausedFollowers, true /* willDeliverLocal */)
 
+	// Load the committed entries to be applied after releasing Replica.mu, to
+	// ensure that we don't have IO under this narrow/lightweight mutex. The
+	// RawNode can be making progress in the meantime, but it will never overwrite
+	// the committed entries it has been observing during the Ready() call.
+	//
+	// Also, do this loading after r.sendRaftMessages so that the outgoing
+	// messages don't need to wait for the storage interaction.
+	var toApply []raftpb.Entry
+	if !ready.Committed.Empty() {
+		// TODO(pav-kv): currently, Slice() only accounts for entry bytes loaded
+		// from log storage, and ignores the in-memory unstable entries. Consider a
+		// more complete flow control mechanism here, and eliminating the plumbing
+		// hack with the bytesAccount.
+		r.attachRaftEntriesMonitorRaftMuLocked()
+		// We apply committed entries during this handleRaftReady, so it is ok to
+		// release the corresponding memory tokens at the end of this func. Next
+		// time we enter this function, the account will be empty again.
+		defer r.detachRaftEntriesMonitorRaftMuLocked()
+		if toApply, err = logSnapshot.Slice(
+			ready.Committed, r.store.cfg.RaftMaxCommittedSizePerReady,
+		); err != nil {
+			return stats, errors.Wrap(err, "loading committed entries")
+		}
+	}
 	// If the ready struct includes entries that have been committed, these
 	// entries will be applied to the Replica's replicated state machine down
 	// below, after appending new entries to the raft log and sending messages
diff --git a/pkg/kv/kvserver/replica_raftlog.go b/pkg/kv/kvserver/replica_raftlog.go
@@ -77,12 +77,11 @@ func (r *replicaLogStorage) entriesLocked(
 	//
 	// TODO(pav-kv): we need better safety guardrails here. The log storage type
 	// can remember the readable bounds, and assert that reads do not cross them.
-	// TODO(pav-kv): r.raftMu.bytesAccount is broken - can't rely on raftMu here.
 	entries, _, loadedSize, err := logstore.LoadEntries(
 		r.AnnotateCtx(context.TODO()),
 		r.mu.stateLoader.StateLoader, r.store.TODOEngine(), r.RangeID,
 		r.store.raftEntryCache, r.raftMu.sideloaded, lo, hi, maxBytes,
-		&r.raftMu.bytesAccount,
+		nil, // bytesAccount is not used when reading under Replica.mu
 	)
 	r.store.metrics.RaftStorageReadBytes.Inc(int64(loadedSize))
 	return entries, err