Commit 22c043b
roachtest: global monitor expects specific process deaths
The test monitor works by asserting that no unexpected process deaths occurred. Previously, a process death could be marked as expected by specifying the number of deaths the cluster expected to see; each observed death decremented that count.

This approach has a few drawbacks. We don't keep track of which nodes we expect to die, even though almost all of the time we are stopping a specific process and can make the stronger assertion that that specific process is the one expected to die. Additionally, some code paths accidentally called ExpectDeaths too many times, e.g. once in a helper that stops a node and again in the caller of that helper, which could mask failures where more nodes died than expected.

This change makes the global test monitor track which processes are expected to die rather than how many.
1 parent 7ca9db8 commit 22c043b
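To make the new contract concrete, here is a minimal sketch of how a test-side restart helper might look after this change. The helper name and parameter list are illustrative and not part of the commit; Stop/Start, option.DefaultStopOpts, and the monitor bookkeeping are the ones touched in this diff, while option.DefaultStartOpts and install.MakeClusterSettings are assumed to be the usual roachtest defaults (the standard roachtest imports for test, cluster, option, install, and logger are also assumed).

// Illustrative helper, not part of this commit: restart node n and rely on
// the global monitor's per-process expectations.
func restartNode(
	ctx context.Context, t test.Test, c cluster.Cluster, l *logger.Logger, n int,
) {
	// Previously the test (or a helper) had to call t.Monitor().ExpectDeaths(1)
	// here; double-counting between helper and caller could mask an extra,
	// unrelated death.

	// StopE now registers that exactly this process is expected to die ...
	c.Stop(ctx, l, option.DefaultStopOpts(), c.Node(n))

	// ... and StartE marks it as expected to be alive again once startup
	// succeeds. Any other process death in between still fails the test.
	c.Start(ctx, l, option.DefaultStartOpts(), install.MakeClusterSettings(), c.Node(n))
}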

File tree

7 files changed: +131 −17 lines changed


pkg/cmd/roachtest/cluster.go

Lines changed: 20 additions & 2 deletions

@@ -2221,7 +2221,10 @@ func (c *clusterImpl) StartE(
 			return errors.Wrap(err, "failed to wait for replication after starting cockroach")
 		}
 	}
-
+	// If starting the cluster was successful, mark the nodes as healthy. N.B. we must wait
+	// until cluster startup succeeds as we may have tests that purposely inject failures into
+	// cluster startup.
+	c.t.Monitor().ExpectProcessAlive(nodes)
 	return nil
 }

@@ -2264,6 +2267,15 @@ func (c *clusterImpl) StartServiceForVirtualClusterE(
 			return err
 		}
 	}
+
+	// If we are starting a separate process virtual cluster, we need to
+	// mark each SQL instance as healthy.
+	if len(startOpts.SeparateProcessNodes) > 0 {
+		nodes := startOpts.SeparateProcessNodes
+		virtualClusterName := startOpts.RoachprodOpts.VirtualClusterName
+		sqlInstance := startOpts.RoachprodOpts.SQLInstance
+		c.t.Monitor().ExpectProcessAlive(nodes, option.VirtualClusterName(virtualClusterName), option.SQLInstance(sqlInstance))
+	}
 	return nil
 }

@@ -2290,6 +2302,9 @@ func (c *clusterImpl) StopServiceForVirtualClusterE(
 	nodes := c.All()
 	if len(stopOpts.SeparateProcessNodes) > 0 {
 		nodes = stopOpts.SeparateProcessNodes
+		virtualClusterName := stopOpts.RoachprodOpts.VirtualClusterName
+		sqlInstance := stopOpts.RoachprodOpts.SQLInstance
+		c.t.Monitor().ExpectProcessDead(nodes, option.VirtualClusterName(virtualClusterName), option.SQLInstance(sqlInstance))
 	}

 	return roachprod.StopServiceForVirtualCluster(

@@ -2396,6 +2411,7 @@ func (c *clusterImpl) StopE(
 		stopOpts.RoachprodOpts.Wait = true
 		stopOpts.RoachprodOpts.GracePeriod = 10
 	}
+	c.t.Monitor().ExpectProcessDead(selectedNodesOrDefault(nodes, c.All()))
 	return errors.Wrap(roachprod.Stop(ctx, l, c.MakeNodes(nodes...), stopOpts.RoachprodOpts), "cluster.StopE")
 }

@@ -2423,6 +2439,7 @@ func (c *clusterImpl) SignalE(
 	if c.spec.NodeCount == 0 {
 		return nil // unit tests
 	}
+	c.t.Monitor().ExpectProcessDead(selectedNodesOrDefault(nodes, c.All()))
 	return errors.Wrap(roachprod.Signal(ctx, l, c.MakeNodes(nodes...), sig), "cluster.Signal")
 }

@@ -2454,6 +2471,7 @@ func (c *clusterImpl) WipeE(
 	}
 	c.setStatusForClusterOpt("wiping", false, nodes...)
 	defer c.clearStatusForClusterOpt(false)
+	c.t.Monitor().ExpectProcessDead(selectedNodesOrDefault(nodes, c.All()))
 	return roachprod.Wipe(ctx, l, c.MakeNodes(nodes...), c.IsSecure())
 }

@@ -3016,7 +3034,7 @@ func (c *clusterImpl) Extend(ctx context.Context, d time.Duration, l *logger.Log
 // monitor's semantics around handling expected node deaths breaks down if it's
 // monitoring a workload node.
 func (c *clusterImpl) NewMonitor(ctx context.Context, opts ...option.Option) cluster.Monitor {
-	return newMonitor(ctx, c.t, c, opts...)
+	return newMonitor(ctx, c.t, c, false /* expectExactProcessDeath */, opts...)
 }

 func (c *clusterImpl) StartGrafana(
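For separate-process virtual clusters, the process identity includes the virtual cluster name and SQL instance, so a test that kills a SQL server out of band could register the expectation explicitly. The fragment below is hypothetical: the virtual cluster name "app", instance 0, and the node range are made up, while the option helpers are the ones used in the diff above.

// Hypothetical fragment: nodes 4-5 run a separate-process SQL server for a
// virtual cluster named "app" (instance 0), and the test is about to kill it
// in a way the cluster helpers don't cover.
t.Monitor().ExpectProcessDead(
	c.Range(4, 5),
	option.VirtualClusterName("app"),
	option.SQLInstance(0),
)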

pkg/cmd/roachtest/monitor.go

Lines changed: 104 additions & 8 deletions

@@ -17,10 +17,66 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/roachprod"
 	"github.com/cockroachdb/cockroach/pkg/roachprod/install"
 	"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
+	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
 	"github.com/cockroachdb/errors"
 	"golang.org/x/sync/errgroup"
 )

+// monitorProcess represents a single process that the monitor monitors.
+type monitorProcess struct {
+	node               install.Node
+	virtualClusterName string
+	sqlInstance        int
+}
+
+// MonitorExpectedProcessHealth represents the expected health of a process.
+type MonitorExpectedProcessHealth string
+
+const (
+	ExpectedAlive = MonitorExpectedProcessHealth("process alive")
+	ExpectedDead  = MonitorExpectedProcessHealth("process dead")
+)
+
+// expectedProcessHealth is a concurrent map that stores the expected health of
+// each registered monitorProcess. It is a thin wrapper over syncutil.Map that ensures
+// consistent naming of the system interface.
+type expectedProcessHealth struct {
+	syncutil.Map[monitorProcess, MonitorExpectedProcessHealth]
+}
+
+func newProcess(node install.Node, virtualClusterName string, sqlInstance int) monitorProcess {
+	if virtualClusterName == "" {
+		virtualClusterName = install.SystemInterfaceName
+	}
+	return monitorProcess{
+		node:               node,
+		virtualClusterName: virtualClusterName,
+		sqlInstance:        sqlInstance,
+	}
+}
+
+func (m *expectedProcessHealth) get(
+	node install.Node, virtualClusterName string, sqlInstance int,
+) MonitorExpectedProcessHealth {
+	val, ok := m.Load(newProcess(node, virtualClusterName, sqlInstance))
+	if !ok {
+		// If the process has no expected state, assume it should be healthy.
+		return ExpectedAlive
+	}
+	return *val
+}
+
+func (m *expectedProcessHealth) set(
+	nodes install.Nodes,
+	virtualClusterName string,
+	sqlInstance int,
+	health MonitorExpectedProcessHealth,
+) {
+	for _, node := range nodes {
+		m.Store(newProcess(node, virtualClusterName, sqlInstance), &health)
+	}
+}
+
 // monitorImpl implements the Monitor interface. A monitor both
 // manages "user tasks" -- goroutines provided by tests -- as well as
 // checks that every node in the cluster is still running. A failure

@@ -46,7 +102,16 @@ type monitorImpl struct {
 	monitorGroup *errgroup.Group // monitor goroutine
 	monitorOnce  sync.Once       // guarantees monitor goroutine is only started once

-	expDeaths int32 // atomically
+	// expExactProcessDeath, if true, indicates that the monitor should expect that a
+	// specified process, as denoted by the triple in expProcessHealth.get, is dead.
+	// Otherwise, the monitor only expects a certain number of process deaths.
+	// The former is a stronger assertion used in the new global roachtest monitor,
+	// while the latter should be removed when the deprecated cluster monitor is removed.
+	expExactProcessDeath bool
+	// Deprecated: This field is used by the deprecated cluster monitor to track the number
+	// of expected process deaths, and should be removed when the cluster monitor is removed.
+	expDeaths        int32 // atomically
+	expProcessHealth expectedProcessHealth
 }

 func newMonitor(

@@ -58,19 +123,46 @@ func newMonitor(
 		L() *logger.Logger
 	},
 	c cluster.Cluster,
+	expectExactProcessDeath bool,
 	opts ...option.Option,
 ) *monitorImpl {
 	m := &monitorImpl{
-		t:     t,
-		l:     t.L(),
-		nodes: c.MakeNodes(opts...),
+		t:                    t,
+		l:                    t.L(),
+		nodes:                c.MakeNodes(opts...),
+		expExactProcessDeath: expectExactProcessDeath,
+		expProcessHealth:     expectedProcessHealth{},
 	}
 	m.ctx, m.cancel = context.WithCancel(ctx)
 	m.userGroup, _ = errgroup.WithContext(m.ctx)
 	m.monitorGroup, _ = errgroup.WithContext(m.ctx)
 	return m
 }

+func (m *monitorImpl) ExpectProcessHealth(
+	nodes install.Nodes, health MonitorExpectedProcessHealth, opts ...option.OptionFunc,
+) {
+	var virtualClusterOptions option.VirtualClusterOptions
+	if err := option.Apply(&virtualClusterOptions, opts...); err != nil {
+		m.t.Fatal(err)
+	}
+	m.expProcessHealth.set(nodes, virtualClusterOptions.VirtualClusterName, virtualClusterOptions.SQLInstance, health)
+}
+
+// ExpectProcessDead lets the monitor know that a set of processes are about
+// to be killed, and that their deaths should be ignored. Virtual cluster
+// options can be passed to denote a separate process.
+func (m *monitorImpl) ExpectProcessDead(nodes option.NodeListOption, opts ...option.OptionFunc) {
+	m.ExpectProcessHealth(nodes.InstallNodes(), ExpectedDead, opts...)
+}
+
+// ExpectProcessAlive lets the monitor know that a set of processes are
+// expected to be healthy. Virtual cluster options can be passed to denote
+// a separate process.
+func (m *monitorImpl) ExpectProcessAlive(nodes option.NodeListOption, opts ...option.OptionFunc) {
+	m.ExpectProcessHealth(nodes.InstallNodes(), ExpectedAlive, opts...)
+}
+
 // ExpectDeath lets the monitor know that a node is about to be killed, and that
 // this should be ignored.
 func (m *monitorImpl) ExpectDeath() {

@@ -190,12 +282,16 @@ func (m *monitorImpl) startNodeMonitor() {
 				)
 			}
 		case install.MonitorProcessDead:
-			isExpectedDeath := atomic.AddInt32(&m.expDeaths, -1) >= 0
-			if isExpectedDeath {
-				expectedDeathStr = ": expected"
+			var isExpectedDeath bool
+			if m.expExactProcessDeath {
+				isExpectedDeath = m.expProcessHealth.get(info.Node, e.VirtualClusterName, e.SQLInstance) == ExpectedDead
+			} else {
+				isExpectedDeath = atomic.AddInt32(&m.expDeaths, -1) >= 0
			}

-			if !isExpectedDeath {
+			if isExpectedDeath {
+				expectedDeathStr = ": expected"
+			} else {
 				retErr = fmt.Errorf("unexpected node event: %s", info)
 			}
 		}
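The core of the change is the per-process expectation map above. Below is a minimal, standalone sketch of that bookkeeping: it substitutes the standard library's sync.Map for CockroachDB's generic syncutil.Map and plain ints for install.Node, but the key shape and the default-to-alive lookup follow the diff.

package main

import (
	"fmt"
	"sync"
)

// processKey mirrors monitorProcess from this commit: a process is identified
// by node, virtual cluster name, and SQL instance.
type processKey struct {
	node               int
	virtualClusterName string
	sqlInstance        int
}

type processHealth string

const (
	expectedAlive = processHealth("process alive")
	expectedDead  = processHealth("process dead")
)

// expectedHealth plays the role of expectedProcessHealth, with the standard
// library's sync.Map standing in for syncutil.Map.
type expectedHealth struct{ m sync.Map }

func (e *expectedHealth) set(nodes []int, vc string, inst int, h processHealth) {
	for _, n := range nodes {
		e.m.Store(processKey{n, vc, inst}, h)
	}
}

func (e *expectedHealth) get(node int, vc string, inst int) processHealth {
	if v, ok := e.m.Load(processKey{node, vc, inst}); ok {
		return v.(processHealth)
	}
	// Processes with no registered expectation are assumed healthy, so an
	// unannounced death is reported as a test failure.
	return expectedAlive
}

func main() {
	var e expectedHealth
	// The equivalent of StopE on nodes 1-2 of the system tenant.
	e.set([]int{1, 2}, "system", 0, expectedDead)

	fmt.Println(e.get(1, "system", 0)) // process dead  -> an observed death is expected
	fmt.Println(e.get(3, "system", 0)) // process alive -> a death here fails the test
}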

pkg/cmd/roachtest/test/BUILD.bazel

Lines changed: 1 addition & 0 deletions

@@ -9,6 +9,7 @@ go_library(
     importpath = "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test",
     visibility = ["//visibility:public"],
     deps = [
+        "//pkg/cmd/roachtest/option",
         "//pkg/cmd/roachtest/roachtestutil/task",
         "//pkg/roachprod/logger",
         "@com_github_cockroachdb_version//:version",

pkg/cmd/roachtest/test/test_monitor.go

Lines changed: 4 additions & 3 deletions

@@ -5,9 +5,10 @@

 package test

+import "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
+
 // Monitor is an interface for monitoring cockroach processes during a test.
 type Monitor interface {
-	ExpectDeath()
-	ExpectDeaths(count int32)
-	ResetDeaths()
+	ExpectProcessDead(nodes option.NodeListOption, opts ...option.OptionFunc)
+	ExpectProcessAlive(nodes option.NodeListOption, opts ...option.OptionFunc)
 }

pkg/cmd/roachtest/test_monitor.go

Lines changed: 1 addition & 1 deletion

@@ -19,7 +19,7 @@ type testMonitorImpl struct {

 func newTestMonitor(ctx context.Context, t test.Test, c *clusterImpl) *testMonitorImpl {
 	return &testMonitorImpl{
-		monitorImpl: newMonitor(ctx, t, c),
+		monitorImpl: newMonitor(ctx, t, c, true /* expectExactProcessDeath */),
 	}
 }

pkg/cmd/roachtest/tests/process_lock.go

Lines changed: 0 additions & 2 deletions

@@ -142,10 +142,8 @@ func registerProcessLock(r registry.Registry) {
 				// other operations that were performed
 				// concurrently with the running process did not
 				// corrupt the on-disk state.
-				t.Monitor().ExpectDeath()
 				c.Stop(ctx, l, option.DefaultStopOpts(), c.Node(n))
 				c.Start(ctx, l, startOpts, startSettings, c.Node(n))
-				t.Monitor().ResetDeaths()
 			},
 		}
 		ops[randutil.RandIntInRange(rng, 0, len(ops))]()

pkg/roachprod/install/monitor.go

Lines changed: 1 addition & 1 deletion

@@ -316,7 +316,7 @@ func (m *monitorNode) monitorNode(ctx context.Context, l *logger.Logger) {
 		if err := sess.Wait(); err != nil {
 			// If we got an error waiting for the session but the context
 			// is already canceled, do not send an error through the
-			// channel; context cancelation happens at the user's request
+			// channel; context cancellation happens at the user's request
 			// or when the test finishes. In either case, the monitor
 			// should quiesce. Reporting the error is confusing and can be
 			// noisy in the case of multiple monitors.
