
Commit 6f850a3
kvserver: consolidate raft handling assertion checks
We now:

- avoid queueing multiple assertion checks in an apply cycle
- queue the snapshot-induced assertion check at the end, giving us more flexibility over when the state is updated (and allowing future refactoring that streamlines these updates)
1 parent e7220a6 commit 6f850a3
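
To make the new control flow concrete, here is a minimal, self-contained Go sketch of the pattern this commit adopts: code paths that previously asserted inline (snapshot application, command side effects) now only request a check, and the Ready-handling loop runs the expensive invariant assertion at most once per cycle. The type and function names below are illustrative stand-ins, not the real kvserver APIs.

    package main

    import "fmt"

    // applyStats models, in spirit only, the stats struct threaded through
    // command application; assertionsRequested is the one field sketched here.
    type applyStats struct {
    	assertionsRequested int
    }

    // applyCommands stands in for the state-machine apply step: instead of
    // asserting after every command with side effects, it records that a
    // check was requested.
    func applyCommands(stats *applyStats, sideEffectCmds int) {
    	for i := 0; i < sideEffectCmds; i++ {
    		stats.assertionsRequested++ // request a check, don't assert inline
    	}
    }

    // handleReady stands in for handleRaftReadyRaftMuLocked: it folds all
    // sources (snapshot handling, applied commands) into one shouldAssert
    // flag and runs the expensive check at most once per cycle.
    func handleReady(hasSnapshot bool, sideEffectCmds int) {
    	var shouldAssert bool
    	var stats applyStats

    	if hasSnapshot {
    		shouldAssert = true
    	}
    	applyCommands(&stats, sideEffectCmds)
    	shouldAssert = shouldAssert || stats.assertionsRequested > 0

    	if shouldAssert {
    		fmt.Println("asserting in-memory/on-disk state congruence once")
    	}
    }

    func main() {
    	handleReady(true, 3) // a snapshot plus 3 commands: still exactly one check
    }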

File tree: 3 files changed (+18, -14)

pkg/kv/kvserver/replica_application_state_machine.go (1 addition, 6 deletions)

@@ -206,13 +206,8 @@ func (sm *replicaStateMachine) ApplySideEffects(
 		// Some tests (TestRangeStatsInit) assumes that once the store has started
 		// and the first range has a lease that there will not be a later hard-state.
 		if shouldAssert {
-			// Assert that the on-disk state doesn't diverge from the in-memory
+			// Queue a check that the on-disk state doesn't diverge from the in-memory
 			// state as a result of the side effects.
-			sm.r.mu.RLock()
-			// TODO(sep-raft-log): either check only statemachine invariants or
-			// pass both engines in.
-			sm.r.assertStateRaftMuLockedReplicaMuRLocked(ctx, sm.r.store.TODOEngine())
-			sm.r.mu.RUnlock()
 			sm.applyStats.assertionsRequested++
 		}
 	} else if res := cmd.ReplicatedResult(); !res.IsZero() {
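
In place of the inline assertion, command application now only bumps a counter; the caller, handleRaftReadyRaftMuLocked (next file), folds stats.apply.assertionsRequested into its shouldAssert flag and performs the check at most once per apply cycle.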

pkg/kv/kvserver/replica_raft.go (17 additions, 0 deletions)

@@ -1082,6 +1082,11 @@ func (r *Replica) handleRaftReadyRaftMuLocked(
 		}
 	}

+	// If this field is set, by the end of the method (after snapshot, append,
+	// apply handling), we will verify invariants including checking that
+	// in-memory state is congruent with disk state.
+	var shouldAssert bool
+
 	// Grab the known leaseholder before applying to the state machine.
 	startingLeaseholderID := r.shMu.state.Lease.Replica.ReplicaID
 	refreshReason := noReason

@@ -1111,6 +1116,7 @@ func (r *Replica) handleRaftReadyRaftMuLocked(
 	}

 	if app.Snapshot != nil {
+		shouldAssert = true
 		if inSnap.Desc == nil {
 			// If we didn't expect Raft to have a snapshot but it has one
 			// regardless, that is unexpected and indicates a programming

@@ -1236,6 +1242,7 @@ func (r *Replica) handleRaftReadyRaftMuLocked(
 		// it is now marked as destroyed.
 		return stats, err
 	}
+	shouldAssert = shouldAssert || stats.apply.assertionsRequested > 0

 	if r.store.cfg.KVAdmissionController != nil &&
 		stats.apply.followerStoreWriteBytes.NumEntries > 0 {

@@ -1268,6 +1275,16 @@ func (r *Replica) handleRaftReadyRaftMuLocked(
 	if r.store.TestingKnobs().EnableUnconditionalRefreshesInRaftReady {
 		refreshReason = reasonNewLeaderOrConfigChange
 	}
+
+	if shouldAssert {
+		sm.r.mu.RLock()
+		// TODO(sep-raft-log): either check only statemachine invariants or
+		// pass both engines in.
+		sm.r.assertStateRaftMuLockedReplicaMuRLocked(ctx, sm.r.store.TODOEngine())
+		sm.r.mu.RUnlock()
+	}
+
 	if refreshReason != noReason {
 		r.mu.Lock()
 		r.refreshProposalsLocked(ctx, 0 /* refreshAtDelta */, refreshReason)
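
The consolidated assertion runs under Replica.mu's read lock while Replica.raftMu is still held, mirroring the locking discipline that the deleted snapshot-path comment (next file) described. A stripped-down Go sketch of that two-mutex arrangement; the names are placeholders, not the actual kvserver types:

    package main

    import (
    	"fmt"
    	"sync"
    )

    // replica sketches the two-mutex layout implied by the diff: raftMu
    // serializes raft handling, mu guards in-memory replica state.
    type replica struct {
    	raftMu sync.Mutex
    	mu     sync.RWMutex
    	state  string
    }

    // assertState models assertStateRaftMuLockedReplicaMuRLocked: as the
    // name encodes, callers must hold raftMu and hold mu for reading.
    func (r *replica) assertState() {
    	fmt.Println("checking in-memory state against disk:", r.state)
    }

    // handleReady models handleRaftReadyRaftMuLocked: raftMu is held across
    // the whole cycle, so taking mu.RLock again at the end (after apply
    // handling released it) still observes a consistent replica.
    func (r *replica) handleReady(shouldAssert bool) {
    	r.raftMu.Lock()
    	defer r.raftMu.Unlock()

    	// ... snapshot / append / apply handling would run here ...

    	if shouldAssert {
    		r.mu.RLock()
    		r.assertState()
    		r.mu.RUnlock()
    	}
    }

    func main() {
    	r := &replica{state: "applied through index 42"}
    	r.handleReady(true)
    }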

pkg/kv/kvserver/replica_raftstorage.go (0 additions, 8 deletions)

@@ -775,14 +775,6 @@ func (r *Replica) applySnapshotRaftMuLocked(

 	r.mu.Unlock()

-	// Assert that the in-memory and on-disk states of the Replica are congruent
-	// after the application of the snapshot. Do so under a read lock, as this
-	// operation can be expensive. This is safe, as we hold the Replica.raftMu
-	// across both Replica.mu critical sections.
-	r.mu.RLock()
-	r.assertStateRaftMuLockedReplicaMuRLocked(ctx, r.store.TODOEngine())
-	r.mu.RUnlock()
-
 	// The rangefeed processor is listening for the logical ops attached to
 	// each raft command. These will be lost during a snapshot, so disconnect
 	// the rangefeed, if one exists.
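
With the inline check removed from the snapshot path, coverage now comes from handleRaftReadyRaftMuLocked setting shouldAssert when app.Snapshot != nil, so the congruence check still runs after every applied snapshot, just once, at the end of the Ready cycle.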
