cmd/go: test barrier actions

jsm · matloob · commit bca3e98b8aa4 · 2025-08-15T08:14:16.000-07:00
Add a barrier action between test run action and it's dependencies. Run will depend on this barrier action, and the barrier action will depend on: 1. The run action's dependencies 2. The previous barrier action This will force internal/work to schedule test run actions in-order, preventing potential goroutine starvation from the channel locking mechanism created to force test run actions to start in-order. Fixes golang#73106 golang#61233 Change-Id: I72e9f752f7521093f3c875eef7f2f29b2393fce9 Reviewed-on: https://go-review.googlesource.com/c/go/+/668035 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Michael Matloob <matloob@golang.org> Reviewed-by: David Chase <drchase@google.com> Reviewed-by: Michael Matloob <matloob@google.com>
diff --git a/src/cmd/go/internal/test/test.go b/src/cmd/go/internal/test/test.go
@@ -1044,11 +1044,36 @@ func runTest(ctx context.Context, cmd *base.Command, args []string) {
 		prints = append(prints, printTest)
 	}
 
-	// Order runs for coordinating start JSON prints.
+	// Order runs for coordinating start JSON prints via two mechanisms:
+	// 1. Channel locking forces runTest actions to start in-order.
+	// 2. Barrier tasks force runTest actions to be scheduled in-order.
+	// We need both for performant behavior, as channel locking without the barrier tasks starves the worker pool,
+	// and barrier tasks without channel locking doesn't guarantee start in-order behavior alone.
+	var prevBarrier *work.Action
 	ch := make(chan struct{})
 	close(ch)
 	for _, a := range runs {
 		if r, ok := a.Actor.(*runTestActor); ok {
+			// Inject a barrier task between the run action and its dependencies.
+			// This barrier task wil also depend on the previous barrier task.
+			// This prevents the run task from being scheduled until all previous run dependencies have finished.
+			// The build graph will be augmented to look roughly like this:
+			//	build("a")           build("b")           build("c")
+			//	    |                   |                     |
+			//	barrier("a.test") -> barrier("b.test") -> barrier("c.test")
+			//	    |                   |                     |
+			//	run("a.test")        run("b.test")        run("c.test")
+
+			barrier := &work.Action{
+				Mode: "test barrier",
+				Deps: slices.Clip(a.Deps),
+			}
+			if prevBarrier != nil {
+				barrier.Deps = append(barrier.Deps, prevBarrier)
+			}
+			a.Deps = []*work.Action{barrier}
+			prevBarrier = barrier
+
 			r.prev = ch
 			ch = make(chan struct{})
 			r.next = ch
@@ -1400,6 +1425,8 @@ func (lockedStdout) Write(b []byte) (int, error) {
 
 func (r *runTestActor) Act(b *work.Builder, ctx context.Context, a *work.Action) error {
 	sh := b.Shell(a)
+	barrierAction := a.Deps[0]
+	buildAction := barrierAction.Deps[0]
 
 	// Wait for previous test to get started and print its first json line.
 	select {
@@ -1530,7 +1557,7 @@ func (r *runTestActor) Act(b *work.Builder, ctx context.Context, a *work.Action)
 		// we have different link inputs but the same final binary,
 		// we still reuse the cached test result.
 		// c.saveOutput will store the result under both IDs.
-		r.c.tryCacheWithID(b, a, a.Deps[0].BuildContentID())
+		r.c.tryCacheWithID(b, a, buildAction.BuildContentID())
 	}
 	if r.c.buf != nil {
 		if stdout != &buf {
@@ -1581,7 +1608,7 @@ func (r *runTestActor) Act(b *work.Builder, ctx context.Context, a *work.Action)
 		// fresh copies of tools to test as part of the testing.
 		addToEnv = "GOCOVERDIR=" + gcd
 	}
-	args := str.StringList(execCmd, a.Deps[0].BuiltTarget(), testlogArg, panicArg, fuzzArg, coverdirArg, testArgs)
+	args := str.StringList(execCmd, buildAction.BuiltTarget(), testlogArg, panicArg, fuzzArg, coverdirArg, testArgs)
 
 	if testCoverProfile != "" {
 		// Write coverage to temporary profile, for merging later.
@@ -1741,8 +1768,8 @@ func (r *runTestActor) Act(b *work.Builder, ctx context.Context, a *work.Action)
 // tryCache is called just before the link attempt,
 // to see if the test result is cached and therefore the link is unneeded.
 // It reports whether the result can be satisfied from cache.
-func (c *runCache) tryCache(b *work.Builder, a *work.Action) bool {
-	return c.tryCacheWithID(b, a, a.Deps[0].BuildActionID())
+func (c *runCache) tryCache(b *work.Builder, a *work.Action, linkAction *work.Action) bool {
+	return c.tryCacheWithID(b, a, linkAction.BuildActionID())
 }
 
 func (c *runCache) tryCacheWithID(b *work.Builder, a *work.Action, id string) bool {
diff --git a/src/cmd/go/internal/work/action.go b/src/cmd/go/internal/work/action.go
@@ -92,7 +92,7 @@ type Action struct {
 
 	buggyInstall bool // is this a buggy install (see -linkshared)?
 
-	TryCache func(*Builder, *Action) bool // callback for cache bypass
+	TryCache func(*Builder, *Action, *Action) bool // callback for cache bypass
 
 	CacheExecutable bool // Whether to cache executables produced by link steps
 
diff --git a/src/cmd/go/internal/work/buildid.go b/src/cmd/go/internal/work/buildid.go
@@ -401,6 +401,25 @@ var (
 	stdlibRecompiledIncOnce = sync.OnceFunc(stdlibRecompiled.Inc)
 )
 
+// testRunAction returns the run action for a test given the link action
+// for the test binary, if the only (non-test-barrier) action that depend
+// on the link action is the run action.
+func testRunAction(a *Action) *Action {
+	if len(a.triggers) != 1 || a.triggers[0].Mode != "test barrier" {
+		return nil
+	}
+	var runAction *Action
+	for _, t := range a.triggers[0].triggers {
+		if t.Mode == "test run" {
+			if runAction != nil {
+				return nil
+			}
+			runAction = t
+		}
+	}
+	return runAction
+}
+
 // useCache tries to satisfy the action a, which has action ID actionHash,
 // by using a cached result from an earlier build.
 // If useCache decides that the cache can be used, it sets a.buildID
@@ -526,7 +545,7 @@ func (b *Builder) useCache(a *Action, actionHash cache.ActionID, target string,
 	// then to avoid the link step, report the link as up-to-date.
 	// We avoid the nested build ID problem in the previous special case
 	// by recording the test results in the cache under the action ID half.
-	if len(a.triggers) == 1 && a.triggers[0].TryCache != nil && a.triggers[0].TryCache(b, a.triggers[0]) {
+	if ra := testRunAction(a); ra != nil && ra.TryCache != nil && ra.TryCache(b, ra, a) {
 		// Best effort attempt to display output from the compile and link steps.
 		// If it doesn't work, it doesn't work: reusing the test result is more
 		// important than reprinting diagnostic information.
diff --git a/src/cmd/go/internal/work/cover.go b/src/cmd/go/internal/work/cover.go
@@ -36,8 +36,9 @@ func (b *Builder) CovData(a *Action, cmdargs ...any) ([]byte, error) {
 // but will be empty; in this case the return is an empty string.
 func BuildActionCoverMetaFile(runAct *Action) (string, error) {
 	p := runAct.Package
-	for i := range runAct.Deps {
-		pred := runAct.Deps[i]
+	barrierAct := runAct.Deps[0]
+	for i := range barrierAct.Deps {
+		pred := barrierAct.Deps[i]
 		if pred.Mode != "build" || pred.Package == nil {
 			continue
 		}
diff --git a/src/cmd/go/internal/work/exec.go b/src/cmd/go/internal/work/exec.go
@@ -183,7 +183,21 @@ func (b *Builder) Do(ctx context.Context, root *Action) {
 
 		for _, a0 := range a.triggers {
 			if a.Failed != nil {
-				a0.Failed = a.Failed
+				if a0.Mode == "test barrier" {
+					// If this action was triggered by a test, there
+					// will be a test barrier action in between the test
+					// and the true trigger. But there will be other
+					// triggers that are other barriers that are waiting
+					// for this one. Propagate the failure to the true
+					// trigger, but not to the other barriers.
+					for _, bt := range a0.triggers {
+						if bt.Mode != "test barrier" {
+							bt.Failed = a.Failed
+						}
+					}
+				} else {
+					a0.Failed = a.Failed
+				}
 			}
 			if a0.pending--; a0.pending == 0 {
 				b.ready.push(a0)