Skip to content

Commit 1f63750

Browse files
committed
kvserver: do not cancel while in HandleRaftResponse
If the context is canceled in the middle of HandleRaftResponse, a Replica can be left in a broken state, e.g. its destruction can be partially completed. This can brick a RangeID, e.g. cause a subsequent call to getOrCreateReplica to be stuck in an infinite loop. Epic: none Release note: none
1 parent 4273ead commit 1f63750

File tree

1 file changed

+7
-8
lines changed

1 file changed

+7
-8
lines changed

pkg/kv/kvserver/raft_transport.go

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -599,25 +599,24 @@ func (t *RaftTransport) StopOutgoingMessage(storeID roachpb.StoreID) {
599599
func (t *RaftTransport) processQueue(
600600
ctx context.Context, q *raftSendQueue, client RPCMultiRaftClient, _ rpcbase.ConnectionClass,
601601
) error {
602-
batchCtx, cancel := context.WithCancel(ctx)
602+
streamCtx, cancel := context.WithCancel(ctx)
603603
defer cancel()
604-
stream, err := client.RaftMessageBatch(batchCtx) // closed via cancellation
604+
stream, err := client.RaftMessageBatch(streamCtx) // closed via cancellation
605605
if err != nil {
606606
return errors.Wrapf(err, "creating batch client")
607607
}
608608

609609
errCh := make(chan error, 1)
610610

611611
// NB: the stream context is canceled when this func returns, and causes the
612-
// response handling loop to terminate asynchronously.
613-
//
614-
// TODO(#140958): the context cancellation in the middle of HandleRaftResponse
615-
// can lead to broken state, such as a replica marked as destroyed but this
616-
// not being reflected in storage.
612+
// response handling loop to terminate asynchronously. Note though that the
613+
// task uses the parent context for HandleRaftResponse calls, to let it finish
614+
// gracefully. Previously, context cancellation inside HandleRaftResponse
615+
// could cause incorrect / half-done Replica destruction (see #140958).
617616
//
618617
// TODO(pav-kv): wait for the task termination to prevent subsequent
619618
// processQueue calls from piling up concurrent tasks.
620-
goCtx, hdl, err := t.stopper.GetHandle(stream.Context(), stop.TaskOpts{
619+
goCtx, hdl, err := t.stopper.GetHandle(ctx, stop.TaskOpts{
621620
TaskName: "storage.RaftTransport: processing queue",
622621
})
623622
if err != nil {

0 commit comments

Comments
 (0)