runtime: move TestReadMetricsSched to testprog

mknyszek · gopherbot · commit 16ae11a9e1ca · 2025-09-26T10:55:30.000-07:00
There are just too many flakes resulting from background pollution by the testing package and other tests. Run the test in a subprocess, where the environment can at least be more tightly controlled.

Fixes golang#75049.

Change-Id: Iad59edaaf31268f1fcb77273f01317d963708fa6
Reviewed-on: https://go-review.googlesource.com/c/go/+/707155
Reviewed-by: Michael Pratt <mpratt@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Auto-Submit: Michael Knyszek <mknyszek@google.com>
diff --git a/src/runtime/metrics_test.go b/src/runtime/metrics_test.go
@@ -22,7 +22,6 @@ import (
 	"strings"
 	"sync"
 	"sync/atomic"
-	"syscall"
 	"testing"
 	"time"
 	"unsafe"
@@ -1578,211 +1577,10 @@ func TestReadMetricsFinalizers(t *testing.T) {
 }
 
 func TestReadMetricsSched(t *testing.T) {
-	const (
-		notInGo = iota
-		runnable
-		running
-		waiting
-		created
-		threads
-		numSamples
-	)
-	var s [numSamples]metrics.Sample
-	s[notInGo].Name = "/sched/goroutines/not-in-go:goroutines"
-	s[runnable].Name = "/sched/goroutines/runnable:goroutines"
-	s[running].Name = "/sched/goroutines/running:goroutines"
-	s[waiting].Name = "/sched/goroutines/waiting:goroutines"
-	s[created].Name = "/sched/goroutines-created:goroutines"
-	s[threads].Name = "/sched/threads/total:threads"
-
-	logMetrics := func(t *testing.T, s []metrics.Sample) {
-		for i := range s {
-			t.Logf("%s: %d", s[i].Name, s[i].Value.Uint64())
-		}
-	}
-
-	// generalSlack is the amount of goroutines we allow ourselves to be
-	// off by in any given category, either due to background system
-	// goroutines or testing package goroutines.
-	const generalSlack = 4
-
-	// waitingSlack is the max number of blocked goroutines left
-	// from other tests, the testing package, or system
-	// goroutines.
-	const waitingSlack = 100
-
-	// threadsSlack is the maximum number of threads left over
-	// from other tests and the runtime (sysmon, the template thread, etc.)
-	const threadsSlack = 20
-
-	// Make sure GC isn't running, since GC workers interfere with
-	// expected counts.
-	defer debug.SetGCPercent(debug.SetGCPercent(-1))
-	runtime.GC()
-
-	check := func(t *testing.T, s *metrics.Sample, min, max uint64) {
-		val := s.Value.Uint64()
-		if val < min {
-			t.Errorf("%s too low; %d < %d", s.Name, val, min)
-		}
-		if val > max {
-			t.Errorf("%s too high; %d > %d", s.Name, val, max)
-		}
-	}
-	checkEq := func(t *testing.T, s *metrics.Sample, value uint64) {
-		check(t, s, value, value)
+	// This test is run in a subprocess to prevent other tests from polluting the metrics.
+	output := runTestProg(t, "testprog", "SchedMetrics")
+	want := "OK\n"
+	if output != want {
+		t.Fatalf("output:\n%s\n\nwanted:\n%s", output, want)
 	}
-	spinUntil := func(f func() bool) bool {
-		for {
-			if f() {
-				return true
-			}
-			time.Sleep(50 * time.Millisecond)
-		}
-	}
-
-	// Check base values.
-	t.Run("base", func(t *testing.T) {
-		defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(1))
-		metrics.Read(s[:])
-		logMetrics(t, s[:])
-		check(t, &s[notInGo], 0, generalSlack)
-		check(t, &s[runnable], 0, generalSlack)
-		checkEq(t, &s[running], 1)
-		check(t, &s[waiting], 0, waitingSlack)
-	})
-
-	metrics.Read(s[:])
-	createdAfterBase := s[created].Value.Uint64()
-
-	// Force Running count to be high. We'll use these goroutines
-	// for Runnable, too.
-	const count = 10
-	var ready, exit atomic.Uint32
-	for i := 0; i < count-1; i++ {
-		go func() {
-			ready.Add(1)
-			for exit.Load() == 0 {
-				// Spin to get us and keep us running, but check
-				// the exit condition so we exit out early if we're
-				// done.
-				start := time.Now()
-				for time.Since(start) < 10*time.Millisecond && exit.Load() == 0 {
-				}
-				runtime.Gosched()
-			}
-		}()
-	}
-	for ready.Load() < count-1 {
-		runtime.Gosched()
-	}
-
-	// Be careful. We've entered a dangerous state for platforms
-	// that do not return back to the underlying system unless all
-	// goroutines are blocked, like js/wasm, since we have a bunch
-	// of runnable goroutines all spinning. We cannot write anything
-	// out.
-	if testenv.HasParallelism() {
-		t.Run("created", func(t *testing.T) {
-			metrics.Read(s[:])
-			logMetrics(t, s[:])
-			checkEq(t, &s[created], createdAfterBase+count)
-		})
-		t.Run("running", func(t *testing.T) {
-			defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(count + 4))
-			// It can take a little bit for the scheduler to
-			// distribute the goroutines to Ps, so retry until
-			// we see the count we expect or the test times out.
-			spinUntil(func() bool {
-				metrics.Read(s[:])
-				return s[running].Value.Uint64() >= count
-			})
-			logMetrics(t, s[:])
-			check(t, &s[running], count, count+4)
-			check(t, &s[threads], count, count+4+threadsSlack)
-		})
-
-		// Force runnable count to be high.
-		t.Run("runnable", func(t *testing.T) {
-			defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(1))
-			metrics.Read(s[:])
-			logMetrics(t, s[:])
-			checkEq(t, &s[running], 1)
-			check(t, &s[runnable], count-1, count+generalSlack)
-		})
-
-		// Done with the running/runnable goroutines.
-		exit.Store(1)
-	} else {
-		// Read metrics and then exit all the other goroutines,
-		// so that system calls may proceed.
-		metrics.Read(s[:])
-
-		// Done with the running/runnable goroutines.
-		exit.Store(1)
-
-		// Now we can check our invariants.
-		t.Run("created", func(t *testing.T) {
-			// Look for count-1 goroutines because we read metrics
-			// *before* t.Run goroutine was created for this sub-test.
-			checkEq(t, &s[created], createdAfterBase+count-1)
-		})
-		t.Run("running", func(t *testing.T) {
-			logMetrics(t, s[:])
-			checkEq(t, &s[running], 1)
-			checkEq(t, &s[threads], 1)
-		})
-		t.Run("runnable", func(t *testing.T) {
-			logMetrics(t, s[:])
-			check(t, &s[runnable], count-1, count+generalSlack)
-		})
-	}
-
-	// Force not-in-go count to be high. This is a little tricky since
-	// we try really hard not to let things block in system calls.
-	// We have to drop to the syscall package to do this reliably.
-	t.Run("not-in-go", func(t *testing.T) {
-		// Block a bunch of goroutines on an OS pipe.
-		pr, pw, err := pipe()
-		if err != nil {
-			switch runtime.GOOS {
-			case "js", "wasip1":
-				t.Skip("creating pipe:", err)
-			}
-			t.Fatal("creating pipe:", err)
-		}
-		for i := 0; i < count; i++ {
-			go syscall.Read(pr, make([]byte, 1))
-		}
-
-		// Let the goroutines block.
-		spinUntil(func() bool {
-			metrics.Read(s[:])
-			return s[notInGo].Value.Uint64() >= count
-		})
-		logMetrics(t, s[:])
-		check(t, &s[notInGo], count, count+generalSlack)
-
-		syscall.Close(pw)
-		syscall.Close(pr)
-	})
-
-	t.Run("waiting", func(t *testing.T) {
-		// Force waiting count to be high.
-		const waitingCount = 1000
-		stop := make(chan bool)
-		for i := 0; i < waitingCount; i++ {
-			go func() { <-stop }()
-		}
-
-		// Let the goroutines block.
-		spinUntil(func() bool {
-			metrics.Read(s[:])
-			return s[waiting].Value.Uint64() >= waitingCount
-		})
-		logMetrics(t, s[:])
-		check(t, &s[waiting], waitingCount, waitingCount+waitingSlack)
-
-		close(stop)
-	})
 }
diff --git a/src/runtime/testdata/testprog/pipe_unix.go b/src/runtime/testdata/testprog/pipe_unix.go
@@ -4,7 +4,7 @@
 
 //go:build !windows
 
-package runtime_test
+package main
 
 import "syscall"
 
diff --git a/src/runtime/testdata/testprog/pipe_windows.go b/src/runtime/testdata/testprog/pipe_windows.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-package runtime_test
+package main
 
 import "syscall"
 
diff --git a/src/runtime/testdata/testprog/schedmetrics.go b/src/runtime/testdata/testprog/schedmetrics.go