Skip to content

Commit 517d0c7

Browse files
craig[bot]pav-kv
andcommitted
Merge #152205
152205: kvserver: rm unnecessary Replica.mu locking r=arulajmani a=pav-kv This PR removes the unnecessary `Replica.mu.RLock()` in `Store.tryGetReplica()`. This function is on the critical path - every raft message [processing](https://github.com/cockroachdb/cockroach/blob/d1af030c8b9894306710132a253bd7c2c9145623/pkg/kv/kvserver/store_raft.go#L699) calls it. So this should have a non-zero performance impact. Presumably, `Replica.mu` used to be locked for reading `destroyStatus`, but the latter can be read with only `raftMu` held. To emphasize that this field can be read under either mutex, this PR also moves `destroyStatus` from the `Replica.mu` to `Replica.shMu` section. Technically, `destroyStatus` doesn't fit either of `Replica.{mu,shMu}` sections, because it must be mutated with also `readOnlyCmdMu` locked. But `shMu` is strictly closer by semantics. Epic: none Release note: none Co-authored-by: Pavel Kalinnikov <[email protected]>
2 parents 86c4728 + cf25e7d commit 517d0c7

16 files changed

+69
-53
lines changed

pkg/kv/kvserver/replica.go

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -571,6 +571,14 @@ type Replica struct {
571571
//
572572
// TODO(pav-kv): audit all other fields and include here.
573573
shMu struct {
574+
// The destroyed status of a replica indicating if it's alive, corrupt,
575+
// scheduled for destruction or has been GCed. destroyStatus should only be
576+
// set while also holding the raftMu and readOnlyCmdMu.
577+
//
578+
// When this replica is being removed, the destroyStatus is updated and
579+
// RangeTombstone is written in the same raftMu critical section.
580+
destroyStatus
581+
574582
// The state of the Raft state machine.
575583
// Invariant: state.TruncatedState == nil. The field is being phased out in
576584
// favour of the one contained in logStorage.
@@ -587,14 +595,6 @@ type Replica struct {
587595
mu struct {
588596
// Protects all fields in the mu struct.
589597
ReplicaMutex
590-
// The destroyed status of a replica indicating if it's alive, corrupt,
591-
// scheduled for destruction or has been GCed.
592-
// destroyStatus should only be set while also holding the raftMu and
593-
// readOnlyCmdMu.
594-
//
595-
// When this replica is being removed, the destroyStatus is updated and
596-
// RangeTombstone is written in the same raftMu critical section.
597-
destroyStatus
598598
// Is the range quiescent? Quiescent ranges are not Tick()'d and unquiesce
599599
// whenever a Raft operation is performed.
600600
//
@@ -1250,7 +1250,7 @@ func (r *Replica) IsDestroyed() (DestroyReason, error) {
12501250
}
12511251

12521252
func (r *Replica) isDestroyedRLocked() (DestroyReason, error) {
1253-
return r.mu.destroyStatus.reason, r.mu.destroyStatus.err
1253+
return r.shMu.destroyStatus.reason, r.shMu.destroyStatus.err
12541254
}
12551255

12561256
// IsQuiescent returns whether the replica is quiescent or not.
@@ -2672,11 +2672,11 @@ func (r *Replica) maybeWatchForMergeLocked(ctx context.Context) (bool, error) {
26722672
r.raftMu.Lock()
26732673
r.readOnlyCmdMu.Lock()
26742674
r.mu.Lock()
2675-
if mergeCommitted && r.mu.destroyStatus.IsAlive() {
2675+
if mergeCommitted && r.shMu.destroyStatus.IsAlive() {
26762676
// The merge committed but the left-hand replica on this store hasn't
26772677
// subsumed this replica yet. Mark this replica as destroyed so it
26782678
// doesn't serve requests when we close the mergeCompleteCh below.
2679-
r.mu.destroyStatus.Set(kvpb.NewRangeNotFoundError(r.RangeID, r.store.StoreID()), destroyReasonMergePending)
2679+
r.shMu.destroyStatus.Set(kvpb.NewRangeNotFoundError(r.RangeID, r.store.StoreID()), destroyReasonMergePending)
26802680
}
26812681
// Unblock pending requests. If the merge committed, the requests will
26822682
// notice that the replica has been destroyed and return an appropriate

pkg/kv/kvserver/replica_app_batch.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -353,7 +353,7 @@ func (b *replicaAppBatch) runPostAddTriggersReplicaOnly(
353353
// commits by handleMergeResult() to finish the removal.
354354
rhsRepl.readOnlyCmdMu.Lock()
355355
rhsRepl.mu.Lock()
356-
rhsRepl.mu.destroyStatus.Set(
356+
rhsRepl.shMu.destroyStatus.Set(
357357
kvpb.NewRangeNotFoundError(rhsRepl.RangeID, rhsRepl.store.StoreID()),
358358
destroyReasonRemoved)
359359
rhsRepl.mu.Unlock()
@@ -443,7 +443,7 @@ func (b *replicaAppBatch) runPostAddTriggersReplicaOnly(
443443
// application.
444444
b.r.readOnlyCmdMu.Lock()
445445
b.r.mu.Lock()
446-
b.r.mu.destroyStatus.Set(
446+
b.r.shMu.destroyStatus.Set(
447447
kvpb.NewRangeNotFoundError(b.r.RangeID, b.r.store.StoreID()),
448448
destroyReasonRemoved)
449449
span := b.r.descRLocked().RSpan()

pkg/kv/kvserver/replica_application_state_machine_test.go

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -149,9 +149,11 @@ func TestReplicaStateMachineChangeReplicas(t *testing.T) {
149149
// should there be a command in the raft log (i.e. some errant lease request
150150
// or whatnot) this will fire assertions because it will conflict with the
151151
// log index that we pulled out of thin air above.
152+
r.readOnlyCmdMu.Lock()
153+
defer r.readOnlyCmdMu.Unlock()
152154
r.mu.Lock()
153155
defer r.mu.Unlock()
154-
r.mu.destroyStatus.Set(errors.New("test done"), destroyReasonRemoved)
156+
r.shMu.destroyStatus.Set(errors.New("test done"), destroyReasonRemoved)
155157
})
156158
}
157159

@@ -231,14 +233,16 @@ func TestReplicaStateMachineRaftLogTruncationStronglyCoupled(t *testing.T) {
231233
_, err = sm.ApplySideEffects(checkedCmd.Ctx(), checkedCmd)
232234
require.NoError(t, err)
233235
func() {
234-
r.mu.Lock()
235-
defer r.mu.Unlock()
236236
// Set a destroyStatus to make sure there won't be any raft processing once
237237
// we release raftMu. We applied a command but not one from the raft log, so
238238
// should there be a command in the raft log (i.e. some errant lease request
239239
// or whatnot) this will fire assertions because it will conflict with the
240240
// log index that we pulled out of thin air above.
241-
r.mu.destroyStatus.Set(errors.New("test done"), destroyReasonRemoved)
241+
r.readOnlyCmdMu.Lock()
242+
r.mu.Lock()
243+
defer r.mu.Unlock()
244+
r.shMu.destroyStatus.Set(errors.New("test done"), destroyReasonRemoved)
245+
r.readOnlyCmdMu.Unlock()
242246

243247
require.Equal(t, raftAppliedIndex+1, r.shMu.state.RaftAppliedIndex)
244248
require.Equal(t, truncatedIndex+1, ls.shMu.trunc.Index)
@@ -427,7 +431,13 @@ func TestReplicaStateMachineEphemeralAppBatchRejection(t *testing.T) {
427431
defer r.raftMu.Unlock()
428432
// Avoid additional raft processing after we're done with this replica because
429433
// we've applied entries that aren't in the log.
430-
defer r.mu.destroyStatus.Set(errors.New("boom"), destroyReasonRemoved)
434+
defer func() {
435+
r.readOnlyCmdMu.Lock()
436+
defer r.readOnlyCmdMu.Unlock()
437+
r.mu.Lock()
438+
defer r.mu.Unlock()
439+
r.shMu.destroyStatus.Set(errors.New("boom"), destroyReasonRemoved)
440+
}()
431441

432442
sm := r.getStateMachine()
433443

pkg/kv/kvserver/replica_command_test.go

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -677,9 +677,13 @@ func TestWaitForLeaseAppliedIndex(t *testing.T) {
677677
stopper.Stop(ctx)
678678

679679
destroyErr := errors.New("destroyed")
680-
tc.repl.mu.Lock()
681-
tc.repl.mu.destroyStatus.Set(destroyErr, destroyReasonRemoved)
682-
tc.repl.mu.Unlock()
680+
func() {
681+
tc.repl.readOnlyCmdMu.Lock()
682+
defer tc.repl.readOnlyCmdMu.Unlock()
683+
tc.repl.mu.Lock()
684+
defer tc.repl.mu.Unlock()
685+
tc.repl.shMu.destroyStatus.Set(destroyErr, destroyReasonRemoved)
686+
}()
683687

684688
_, err = tc.repl.WaitForLeaseAppliedIndex(ctx, maxLAI)
685689
require.Error(t, err)

pkg/kv/kvserver/replica_corruption.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ func (r *Replica) setCorruptRaftMuLocked(
4343

4444
log.Dev.ErrorfDepth(ctx, 1, "stalling replica due to: %s", cErr.ErrorMsg)
4545
cErr.Processed = true
46-
r.mu.destroyStatus.Set(cErr, destroyReasonRemoved)
46+
r.shMu.destroyStatus.Set(cErr, destroyReasonRemoved)
4747

4848
auxDir := r.store.TODOEngine().GetAuxiliaryDir()
4949
_ = r.store.TODOEngine().Env().MkdirAll(auxDir, os.ModePerm)

pkg/kv/kvserver/replica_destroy.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,7 @@ func (r *Replica) disconnectReplicationRaftMuLocked(ctx context.Context) {
167167
p.finishApplication(ctx, makeProposalResultErr(kvpb.NewAmbiguousResultError(apply.ErrRemoved)))
168168
}
169169

170-
if !r.mu.destroyStatus.Removed() {
170+
if !r.shMu.destroyStatus.Removed() {
171171
log.Dev.Fatalf(ctx, "removing raft group before destroying replica %s", r)
172172
}
173173
r.mu.internalRaftGroup = nil

pkg/kv/kvserver/replica_proposal_buf.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1116,7 +1116,7 @@ func (rp *replicaProposer) getReplicaID() roachpb.ReplicaID {
11161116
}
11171117

11181118
func (rp *replicaProposer) destroyed() destroyStatus {
1119-
return rp.mu.destroyStatus
1119+
return rp.shMu.destroyStatus
11201120
}
11211121

11221122
func (rp *replicaProposer) leaseAppliedIndex() kvpb.LeaseAppliedIndex {

pkg/kv/kvserver/replica_raft.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1648,7 +1648,7 @@ func (r *Replica) refreshProposalsLocked(
16481648
}
16491649

16501650
r.mu.slowProposalCount = slowProposalCount
1651-
destroyed := r.mu.destroyStatus.Removed()
1651+
destroyed := r.shMu.destroyStatus.Removed()
16521652

16531653
// If the breaker isn't tripped yet but we've detected commands that have
16541654
// taken too long to replicate, and the replica is not destroyed, trip the
@@ -2222,7 +2222,7 @@ func (s pendingCmdSlice) Less(i, j int) bool {
22222222
func (r *Replica) withRaftGroupLocked(
22232223
f func(r *raft.RawNode) (unquiesceAndWakeLeader bool, _ error),
22242224
) error {
2225-
if r.mu.destroyStatus.Removed() {
2225+
if r.shMu.destroyStatus.Removed() {
22262226
// Callers know to detect errRemoved as non-fatal.
22272227
return errRemoved
22282228
}

pkg/kv/kvserver/replica_raftstorage.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -577,7 +577,7 @@ func (r *Replica) applySnapshotRaftMuLocked(
577577
// erroneously return empty data.
578578
sr.readOnlyCmdMu.Lock()
579579
sr.mu.Lock()
580-
sr.mu.destroyStatus.Set(
580+
sr.shMu.destroyStatus.Set(
581581
kvpb.NewRangeNotFoundError(sr.RangeID, sr.store.StoreID()),
582582
destroyReasonRemoved)
583583
sr.mu.Unlock()

pkg/kv/kvserver/replica_test.go

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13781,9 +13781,13 @@ func TestAdminScatterDestroyedReplica(t *testing.T) {
1378113781
tc.Start(ctx, t, stopper)
1378213782

1378313783
errBoom := errors.New("boom")
13784-
tc.repl.mu.Lock()
13785-
tc.repl.mu.destroyStatus.Set(errBoom, destroyReasonMergePending)
13786-
tc.repl.mu.Unlock()
13784+
func() {
13785+
tc.repl.readOnlyCmdMu.Lock()
13786+
defer tc.repl.readOnlyCmdMu.Unlock()
13787+
tc.repl.mu.Lock()
13788+
defer tc.repl.mu.Unlock()
13789+
tc.repl.shMu.destroyStatus.Set(errBoom, destroyReasonMergePending)
13790+
}()
1378713791

1378813792
desc := tc.repl.Desc()
1378913793
resp, err := tc.repl.adminScatter(ctx, kvpb.AdminScatterRequest{

0 commit comments

Comments
 (0)