Commit 7029127

sql: fix rare race around concurrent remote flows setup
A few years ago in 0c1095e we changed the way we set up distributed query plans. Namely, we now start by setting up the gateway (i.e. local) flow first, and then we issue SetupFlowRequest RPCs to set up the remote flows concurrently, without blocking the gateway flow until that setup is complete.

We have seen about 5 occurrences where the protobuf marshaling code crashed while handling those concurrent RPCs. My hypothesis is that this is due to the main goroutine of the gateway flow not waiting until the RPCs are done. In particular, we put `PhysicalInfrastructure` objects into a `sync.Pool`, and they are released by executing the `PlanningCtx.getCleanupFunc` function. That function is executed in a defer after `Run`ning the local flow completes. However, it's possible for it to be executed _before_ the concurrent SetupFlowRequest RPCs (evaluated via the distsql worker goroutines) have been performed, and I'm guessing the flow specs might get corrupted because of that.

In order to prevent this race, we now block execution of the `Flow.Cleanup` function of the gateway flow until all concurrent RPCs are done.

I tried injecting a sleep right before executing the concurrent RPCs but was still unable to reproduce the problem on the gceworker. Given that we've only seen this a handful of times, I decided to omit the release note.

Release note: None
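The core of the fix is a channel that the RPC runner closes when it is done, which the gateway flow's cleanup hook then waits on. Below is a minimal, self-contained sketch of that pattern under stated assumptions: the names `infraPool`, `pooledInfra`, and `cleanup` are hypothetical stand-ins, not the actual CockroachDB APIs.

package main

import (
	"fmt"
	"sync"
	"time"
)

// infraPool stands in for the sync.Pool that holds PhysicalInfrastructure
// objects in the real code; all names in this sketch are illustrative only.
var infraPool = sync.Pool{New: func() any { return &pooledInfra{} }}

type pooledInfra struct {
	spec string // imagine this references pooled planning memory
}

func main() {
	infra := infraPool.Get().(*pooledInfra)
	infra.spec = "remote flow spec"

	// runnerDone is closed once all concurrent "SetupFlowRequest RPCs" have
	// been performed, mirroring the runnerDone channel added by this commit.
	runnerDone := make(chan struct{})
	go func() {
		defer close(runnerDone)
		// Simulate the distsql runner goroutines that still read infra while
		// marshaling the remote flow specs.
		time.Sleep(10 * time.Millisecond)
		fmt.Println("marshaled:", infra.spec)
	}()

	// Cleanup hook of the gateway flow: without the <-runnerDone below we
	// could return infra to the pool while the goroutine above still reads it.
	cleanup := func() {
		<-runnerDone // block until the concurrent work is finished
		infra.spec = ""
		infraPool.Put(infra)
	}
	cleanup()
	fmt.Println("cleanup done only after all RPCs completed")
}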
1 parent 8e5af1e commit 7029127

File tree

1 file changed: +27 -31 lines changed

pkg/sql/distsql_running.go

Lines changed: 27 additions & 31 deletions
@@ -568,10 +568,14 @@ func (dsp *DistSQLPlanner) setupFlows(
 	var runnerSpan *tracing.Span
 	// This span is necessary because it can outlive its parent.
 	runnerCtx, runnerSpan = tracing.ChildSpan(runnerCtx, "setup-flow-async" /* opName */)
+	// runnerDone will be closed whenever all concurrent SetupFlowRequest RPCs
+	// are complete.
+	runnerDone := make(chan struct{})
 	// runnerCleanup can only be executed _after_ all issued RPCs are complete.
 	runnerCleanup := func() {
 		cancelRunnerCtx()
 		runnerSpan.Finish()
+		close(runnerDone)
 	}
 	// Make sure that we call runnerCleanup unless a new goroutine takes that
 	// responsibility.
@@ -642,24 +646,20 @@ func (dsp *DistSQLPlanner) setupFlows(
 	// setup of a remote flow fails, we want to eagerly cancel all the flows,
 	// and we do so in a separate goroutine.
 	//
-	// We need to synchronize the new goroutine with flow.Cleanup() being called
-	// since flow.Cleanup() is the last thing before DistSQLPlanner.Run returns
-	// at which point the rowResultWriter is no longer protected by the mutex of
-	// the DistSQLReceiver.
-	// TODO(yuzefovich): think through this comment - we no longer have mutex
-	// protection in place, so perhaps this can be simplified.
-	cleanupCalledMu := struct {
-		syncutil.Mutex
-		called bool
-	}{}
+	// We need to ensure that the local flow Cleanup() is blocked until all RPCs
+	// are performed. This is needed since we release the PhysicalInfrastructure
+	// back into the sync.Pool, and FlowSpecs for concurrent RPCs might
+	// reference that infra.
+	var cleanupCalled atomic.Bool
 	flow.AddOnCleanupStart(func() {
-		cleanupCalledMu.Lock()
-		defer cleanupCalledMu.Unlock()
-		cleanupCalledMu.called = true
-		// Cancel any outstanding RPCs while holding the lock to protect from
-		// the context canceled error (the result of the RPC) being set on the
-		// DistSQLReceiver by the listener goroutine below.
+		// Indicate that the local flow Cleanup has started - at this point we
+		// can ignore any errors communicated by the listener goroutine below.
+		cleanupCalled.Store(true)
+		// Cancel any outstanding RPCs.
 		cancelRunnerCtx()
+		// Now block the cleanup of the local flow until all concurrent RPCs are
+		// complete.
+		<-runnerDone
 	})
 	err = dsp.stopper.RunAsyncTask(origCtx, "distsql-remote-flows-setup-listener", func(ctx context.Context) {
 		// Note that in the loop below we always receive from the result channel
@@ -673,21 +673,17 @@ func (dsp *DistSQLPlanner) setupFlows(
 			if res.err != nil && !seenError {
 				// The setup of at least one remote flow failed.
 				seenError = true
-				func() {
-					cleanupCalledMu.Lock()
-					defer cleanupCalledMu.Unlock()
-					if cleanupCalledMu.called {
-						// Cleanup of the local flow has already been performed,
-						// so there is nothing to do.
-						return
-					}
-					// First, we send the error to the DistSQL receiver to be
-					// returned to the client eventually (which will happen on
-					// the next Push or PushBatch call).
-					recv.concurrentErrorCh <- res.err
-					// Now explicitly cancel the local flow.
-					flow.Cancel()
-				}()
+				if cleanupCalled.Load() {
+					// Cleanup of the local flow has already started, so there
+					// is nothing to communicate to the local flow.
+					continue
+				}
+				// First, we send the error to the DistSQL receiver to be
+				// returned to the client eventually (which will happen on the
+				// next Push or PushBatch call).
+				recv.concurrentErrorCh <- res.err
+				// Now explicitly cancel the local flow.
+				flow.Cancel()
 			}
 		}
 	})
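To make the failure mode concrete, here is a small standalone program (hypothetical types, not CockroachDB code) showing the kind of hazard the commit message describes: an object is returned to a sync.Pool and reused while a concurrent goroutine is still reading it, which the Go race detector typically flags.

package main

import (
	"fmt"
	"sync"
)

// spec stands in for a FlowSpec that references pooled infrastructure.
// All names here are hypothetical and exist only to demonstrate the hazard.
type spec struct {
	payload []byte
}

var specPool = sync.Pool{New: func() any { return &spec{} }}

func main() {
	s := specPool.Get().(*spec)
	s.payload = []byte("flow spec for a remote node")

	var wg sync.WaitGroup
	wg.Add(1)
	go func() {
		defer wg.Done()
		// Simulates marshaling the spec for a SetupFlowRequest RPC.
		_ = append([]byte(nil), s.payload...)
	}()

	// BUG (intentional): the object is returned to the pool without waiting
	// for the reader above. The write below and the read in the goroutine
	// are unsynchronized, so `go run -race` typically reports a data race.
	s.payload = nil
	specPool.Put(s)

	// Simulate immediate reuse by a different query.
	reused := specPool.Get().(*spec)
	reused.payload = []byte("unrelated query's spec")

	wg.Wait()
	fmt.Println("done (run with -race to see the report)")
}

Blocking the gateway flow's cleanup on runnerDone, as in the diff above, removes exactly this window.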
