
Commit 36b30ff

craig[bot] and tbg committed
Merge #145947
145947: kvserver: use vanilla truncation in applySnapshotRaftMuLocked r=tbg a=tbg

Previously, applySnapshotRaftMuLocked performed the log truncation that comes with snapshot application in an "ad-hoc" way. In particular, it wasn't clearing the sideloaded storage. We now call the same methods {stage,finalize}TruncationRaftMuLocked that are also called in the regular truncation code, which also clears the sideloaded storage. We also ensure that the raft log size is zero and trusted after the snapshot.

Follow-up to #145328.

Epic: CRDB-46488

Co-authored-by: Tobias Grieger <[email protected]>
2 parents 7cc372f + 45ac58b commit 36b30ff
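
For orientation, below is a minimal, self-contained sketch of the stage/finalize shape this commit gives to snapshot-induced truncation. The types and names here are hypothetical stand-ins, not CockroachDB's actual kvserver types: in-memory state is staged to an empty log before the snapshot becomes visible, and the sideloaded storage is cleared once it is.

package main

import "fmt"

type truncatedState struct{ index, term uint64 }

// logStorage is a toy stand-in for the replica's log storage state.
type logStorage struct {
	trunc       truncatedState
	size        int64
	sizeTrusted bool
	sideloaded  map[uint64][]byte // sideloaded entries keyed by log index
}

// stageApplySnapshot updates the in-memory view to an empty log at the
// snapshot index, before the on-disk entries disappear. After a snapshot
// the log is known empty, so its size is zero and trusted.
func (ls *logStorage) stageApplySnapshot(ts truncatedState) {
	ls.trunc = ts
	ls.size = 0
	ls.sizeTrusted = true
}

// finalizeApplySnapshot clears the sideloaded storage entirely once the
// snapshot is visible. A snapshot may regress the last index, so clearing
// everything (rather than only up to the new index) avoids leaking entries.
func (ls *logStorage) finalizeApplySnapshot() {
	ls.sideloaded = map[uint64][]byte{}
}

func main() {
	ls := &logStorage{size: 4096, sideloaded: map[uint64][]byte{190: {0x1}}}
	ls.stageApplySnapshot(truncatedState{index: 151, term: 3})
	ls.finalizeApplySnapshot()
	fmt.Println(ls.size, ls.sizeTrusted, len(ls.sideloaded)) // 0 true 0
}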

File tree

6 files changed: +97 -47 lines changed

pkg/kv/kvserver/raft_log_truncator.go

Lines changed: 1 addition & 1 deletion
@@ -162,7 +162,7 @@ type pendingTruncation struct {
 	// ReplicatedEvalResult.RaftLogDelta, this is <= 0.
 	logDeltaBytes  int64
 	isDeltaTrusted bool
-	// hasSideloaded is true if the truncated interval contains at least one
+	// hasSideloaded is true if the truncated interval could contain at least one
 	// sideloaded entry.
 	hasSideloaded bool
 }

pkg/kv/kvserver/replica_application_result.go

Lines changed: 64 additions & 1 deletion
@@ -11,6 +11,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/kv/kvpb"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
+	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/logstore"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/raftlog"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/readsummary/rspb"
 	"github.com/cockroachdb/cockroach/pkg/raft/raftpb"
@@ -505,6 +506,69 @@ func (r *Replica) stagePendingTruncationRaftMuLocked(pt pendingTruncation) {
 	r.asLogStorage().stagePendingTruncationRaftMuLocked(pt)
 }
 
+func (r *replicaLogStorage) stageApplySnapshotRaftMuLocked(
+	truncState kvserverpb.RaftTruncatedState,
+) {
+	r.raftMu.AssertHeld()
+
+	// A snapshot application implies a log truncation to the snapshot's index,
+	// and we apply the resulting memory state here (before the snapshot takes
+	// effect, i.e. the log entries disappear from storage). This avoids
+	// situations in which entries were already removed, but the in-mem state
+	// indicates that they ought to still exist.
+	//
+	// The truncation is finalized below, after the snapshot is visible.
+
+	// Clear the raft entry cache at the end of this method (after mu has been
+	// released). Any reader that obtains their log bounds after the critical
+	// section but before the clear will see an empty log anyway, since the
+	// in-memory state is already updated to reflect the truncation, even if
+	// entries are still present in the cache.
+	defer r.cache.Drop(r.ls.RangeID)
+
+	r.mu.Lock()
+	defer r.mu.Unlock()
+
+	// On snapshots, the entire log is cleared. This is safe:
+	// - log entries preceding the entry represented by the snapshot are durable
+	//   via the snapshot itself, and
+	// - committed log entries ahead of the snapshot index were not acked by this
+	//   replica, or raft would not have accepted this snapshot.
+	//
+	// Here, we update the in-memory state to reflect this before making the
+	// corresponding change to on-disk state. This makes sure that concurrent
+	// readers don't try to access entries no longer present in the log.
+	r.updateStateRaftMuLockedMuLocked(logstore.RaftState{
+		LastIndex: truncState.Index,
+		LastTerm:  truncState.Term,
+		ByteSize:  0,
+	})
+	r.shMu.trunc = truncState
+	r.shMu.lastCheckSize = 0
+	r.shMu.sizeTrusted = true
+}
+
+func (r *replicaLogStorage) finalizeApplySnapshotRaftMuLocked(ctx context.Context) {
+	r.raftMu.AssertHeld()
+	// This mirrors finalizeTruncationRaftMuLocked, but a snapshot may regress the last
+	// index (to discard a divergent log). For example:
+	//
+	// Raft log (before snapshot):
+	// - entry 100-150: term 1 [committed]
+	// - entry 151-200: term 2
+	// Committed raft log (on leader):
+	// - entry 100-150: term 1
+	// - entry 151: term 3
+	//
+	// The replica may receive a snapshot at index 151. If we don't clear the
+	// sideloaded storage all the way up to the *old* last index, we may leak
+	// sideloaded entries. Rather than remember the old last index, we instead
+	// clear the sideloaded storage entirely. This is equivalent.
+	if err := r.ls.Sideload.Clear(ctx); err != nil {
+		log.Errorf(ctx, "while clearing sideloaded storage after snapshot: %+v", err)
+	}
+}
+
 func (r *replicaLogStorage) stagePendingTruncationRaftMuLocked(pt pendingTruncation) {
 	r.raftMu.AssertHeld()
 	// NB: The expected first index can be zero if this proposal is from before
@@ -513,7 +577,6 @@ func (r *replicaLogStorage) stagePendingTruncationRaftMuLocked(pt pendingTruncat
 	// this doesn't need any special casing.
 	pt.isDeltaTrusted = pt.isDeltaTrusted && r.shMu.trunc.Index+1 == pt.expectedFirstIndex
 
-	// TODO(pav-kv): move this logic to replicaLogStorage type.
 	func() {
 		r.mu.Lock()
 		defer r.mu.Unlock()
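
To make the divergent-log example in finalizeApplySnapshotRaftMuLocked concrete, here is a toy, runnable illustration (a hypothetical helper, not kvserver code) of what would leak if the sideloaded storage were only cleared up to the snapshot index rather than entirely:

package main

import "fmt"

// leakedIfClearedThrough returns the sideloaded entry indexes that would
// survive if sideloaded storage were only cleared up to clearThrough.
func leakedIfClearedThrough(sideloadedIndexes []uint64, clearThrough uint64) []uint64 {
	var leaked []uint64
	for _, idx := range sideloadedIndexes {
		if idx > clearThrough {
			leaked = append(leaked, idx)
		}
	}
	return leaked
}

func main() {
	// Divergent log with sideloaded entries at indexes 120, 160, and 190;
	// a snapshot arrives at index 151, regressing the last index from 200.
	sideloaded := []uint64{120, 160, 190}
	fmt.Println(leakedIfClearedThrough(sideloaded, 151)) // [160 190] would leak
}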

pkg/kv/kvserver/replica_application_state_machine.go

Lines changed: 3 additions & 8 deletions
@@ -42,7 +42,7 @@ type applyCommittedEntriesStats struct {
 	appBatchStats
 	followerStoreWriteBytes kvadmission.FollowerStoreWriteBytes
 	numBatchesProcessed     int // TODO(sep-raft-log): numBatches
-	stateAssertions         int
+	assertionsRequested     int
 	numConfChangeEntries    int
 }
 
@@ -206,14 +206,9 @@ func (sm *replicaStateMachine) ApplySideEffects(
 		// Some tests (TestRangeStatsInit) assumes that once the store has started
 		// and the first range has a lease that there will not be a later hard-state.
 		if shouldAssert {
-			// Assert that the on-disk state doesn't diverge from the in-memory
+			// Queue a check that the on-disk state doesn't diverge from the in-memory
 			// state as a result of the side effects.
-			sm.r.mu.RLock()
-			// TODO(sep-raft-log): either check only statemachine invariants or
-			// pass both engines in.
-			sm.r.assertStateRaftMuLockedReplicaMuRLocked(ctx, sm.r.store.TODOEngine())
-			sm.r.mu.RUnlock()
-			sm.applyStats.stateAssertions++
+			sm.applyStats.assertionsRequested++
 		}
 	} else if res := cmd.ReplicatedResult(); !res.IsZero() {
 		log.Fatalf(ctx, "failed to handle all side-effects of ReplicatedEvalResult: %v", res)
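
The shape of this change, sketched below with simplified stand-in types (not the actual kvserver ones): apply-time code now only requests an assertion by bumping a counter, and the expensive congruence check runs once per Ready cycle (see the replica_raft.go diff below) instead of inline for every applied batch.

package main

import "fmt"

type applyCommittedEntriesStats struct{ assertionsRequested int }

// applySideEffects is a toy stand-in for per-batch side-effect handling:
// instead of asserting state congruence inline, it queues a check.
func applySideEffects(stats *applyCommittedEntriesStats, shouldAssert bool) {
	if shouldAssert {
		stats.assertionsRequested++
	}
}

func main() {
	var stats applyCommittedEntriesStats
	for i := 0; i < 3; i++ {
		applySideEffects(&stats, true)
	}
	// At the end of the Ready cycle, one (expensive) check covers all batches.
	if stats.assertionsRequested > 0 {
		fmt.Println("asserting in-memory vs on-disk state once")
	}
}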

pkg/kv/kvserver/replica_raft.go

Lines changed: 17 additions & 1 deletion
@@ -823,7 +823,7 @@ func (s handleRaftReadyStats) SafeFormat(p redact.SafePrinter, _ rune) {
 	}
 	p.SafeString("]")
 
-	if n := s.apply.stateAssertions; n > 0 {
+	if n := s.apply.assertionsRequested; n > 0 {
 		p.Printf(", state_assertions=%d", n)
 	}
 	if s.snap.offered {
@@ -1082,6 +1082,11 @@ func (r *Replica) handleRaftReadyRaftMuLocked(
 		}
 	}
 
+	// If this field is set, by the end of the method (after snapshot, append,
+	// apply handling), we will verify invariants including checking that
+	// in-memory state is congruent with disk state.
+	var shouldAssert bool
+
 	// Grab the known leaseholder before applying to the state machine.
 	startingLeaseholderID := r.shMu.state.Lease.Replica.ReplicaID
 	refreshReason := noReason
@@ -1111,6 +1116,7 @@ func (r *Replica) handleRaftReadyRaftMuLocked(
 	}
 
 	if app.Snapshot != nil {
+		shouldAssert = true
 		if inSnap.Desc == nil {
 			// If we didn't expect Raft to have a snapshot but it has one
 			// regardless, that is unexpected and indicates a programming
@@ -1236,6 +1242,7 @@ func (r *Replica) handleRaftReadyRaftMuLocked(
 		// it is now marked as destroyed.
 		return stats, err
 	}
+	shouldAssert = shouldAssert || stats.apply.assertionsRequested > 0
 
 	if r.store.cfg.KVAdmissionController != nil &&
 		stats.apply.followerStoreWriteBytes.NumEntries > 0 {
@@ -1268,6 +1275,15 @@ func (r *Replica) handleRaftReadyRaftMuLocked(
 	if r.store.TestingKnobs().EnableUnconditionalRefreshesInRaftReady {
 		refreshReason = reasonNewLeaderOrConfigChange
 	}
+
+	if shouldAssert {
+		sm.r.mu.RLock()
+		// TODO(sep-raft-log): either check only statemachine invariants or
+		// pass both engines in.
+		sm.r.assertStateRaftMuLockedReplicaMuRLocked(ctx, sm.r.store.TODOEngine())
+		sm.r.mu.RUnlock()
+	}
+
 	if refreshReason != noReason {
 		r.mu.Lock()
 		r.refreshProposalsLocked(ctx, 0 /* refreshAtDelta */, refreshReason)

pkg/kv/kvserver/replica_raft_test.go

Lines changed: 1 addition & 1 deletion
@@ -103,7 +103,7 @@ func Test_handleRaftReadyStats_SafeFormat(t *testing.T) {
 				numAddSST:       3,
 				numAddSSTCopies: 1,
 			},
-			stateAssertions:      4,
+			assertionsRequested:  4,
 			numConfChangeEntries: 6,
 		},
 		append: logstore.AppendStats{

pkg/kv/kvserver/replica_raftstorage.go

Lines changed: 11 additions & 35 deletions
@@ -13,7 +13,6 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/kv/kvpb"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvstorage/snaprecv"
-	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/logstore"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/rditer"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/readsummary"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/spanset"
@@ -607,15 +606,11 @@ func (r *Replica) applySnapshotRaftMuLocked(
 	clearedSpans = append(clearedSpans, clearedUnreplicatedSpan)
 	clearedSpans = append(clearedSpans, clearedSubsumedSpans...)
 
-	// Drop the entry cache before ingestion, like a real truncation would.
-	//
-	// TODO(sep-raft-log): like a real truncation, we should also bump the
-	// in-memory truncated state to the snapshot index. We should also assert
-	// that this leads to a (logically) empty log (otherwise we wouldn't have
-	// accepted the snapshot).
-	//
-	// See: https://github.com/cockroachdb/cockroach/pull/145328#discussion_r2068209588
-	r.store.raftEntryCache.Drop(r.RangeID)
+	ls := r.asLogStorage()
+
+	// Stage the truncation, so that in-memory state reflects an
+	// empty log.
+	ls.stageApplySnapshotRaftMuLocked(truncState)
 
 	stats.subsumedReplicas = timeutil.Now()
 
@@ -655,6 +650,9 @@ func (r *Replica) applySnapshotRaftMuLocked(
 		// snapshot.
 		writeBytes = uint64(inSnap.SSTSize)
 	}
+	// The snapshot is visible, so finalize the truncation.
+	ls.finalizeApplySnapshotRaftMuLocked(ctx)
+
 	// The "ignored" here is to ignore the writes to create the AC linear models
 	// for LSM writes. Since these writes typically correspond to actual writes
 	// onto the disk, we account for them separately in
@@ -683,6 +681,9 @@ func (r *Replica) applySnapshotRaftMuLocked(
 		log.Fatalf(ctx, "snapshot RaftAppliedIndexTerm %d doesn't match its metadata term %d",
 			state.RaftAppliedIndexTerm, nonemptySnap.Metadata.Term)
 	}
+	if ls.shMu.size != 0 {
+		log.Fatalf(ctx, "expected empty raft log after snapshot, got %d", ls.shMu.size)
+	}
 
 	// Read the prior read summary for this range, which was included in the
 	// snapshot. We may need to use it to bump our timestamp cache if we
@@ -737,23 +738,6 @@ func (r *Replica) applySnapshotRaftMuLocked(
 	// without risking a lock-ordering deadlock.
 	r.store.mu.Unlock()
 
-	// The log has been cleared and reset to start at the snapshot's applied
-	// index/term. Update the in-memory metadata accordingly.
-	r.asLogStorage().updateStateRaftMuLockedMuLocked(logstore.RaftState{
-		LastIndex: truncState.Index,
-		LastTerm:  truncState.Term,
-		ByteSize:  0, // the log is empty now
-	})
-	ls := r.asLogStorage()
-	ls.shMu.trunc = truncState
-	// Snapshots typically have fewer log entries than the leaseholder. The next
-	// time we hold the lease, recompute the log size before making decisions.
-	//
-	// TODO(pav-kv): does this assume that snapshots can contain log entries,
-	// which is no longer true? The comment needs an update, and the decision to
-	// set this flag to false revisited.
-	ls.shMu.sizeTrusted = false
-
 	// Update the store stats for the data in the snapshot.
 	r.store.metrics.subtractMVCCStats(ctx, r.tenantMetricsRef, *r.shMu.state.Stats)
 	r.store.metrics.addMVCCStats(ctx, r.tenantMetricsRef, *state.Stats)
@@ -791,14 +775,6 @@ func (r *Replica) applySnapshotRaftMuLocked(
 
 	r.mu.Unlock()
 
-	// Assert that the in-memory and on-disk states of the Replica are congruent
-	// after the application of the snapshot. Do so under a read lock, as this
-	// operation can be expensive. This is safe, as we hold the Replica.raftMu
-	// across both Replica.mu critical sections.
-	r.mu.RLock()
-	r.assertStateRaftMuLockedReplicaMuRLocked(ctx, r.store.TODOEngine())
-	r.mu.RUnlock()
-
 	// The rangefeed processor is listening for the logical ops attached to
 	// each raft command. These will be lost during a snapshot, so disconnect
 	// the rangefeed, if one exists.
