Commit 22c043b
roachtest: global monitor expects specific process deaths
The test monitor works by asserting that no unexpected process deaths occurred. Previously, a process death could be marked as expected by specifying the number of deaths the cluster expected to see; each observed death decremented that count.

This approach has a few drawbacks. We don't keep track of which nodes we expect to die, even though almost all of the time we are stopping a specific process and can make the stronger assertion that that specific process is the one expected to die. Additionally, some code paths accidentally called ExpectDeaths too many times, e.g. once in a helper that stops a node and again in the caller of that helper, which could mask failures where more nodes died than expected.

This change makes the global test monitor track which processes are expected to die rather than how many.
1 parent 7ca9db8 commit 22c043b
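To make the new contract concrete, here is a minimal sketch of how a test-side restart helper might look after this change. The helper name and parameter list are illustrative and not part of the commit; Stop/Start, option.DefaultStopOpts, and the monitor bookkeeping are the ones touched in this diff, while option.DefaultStartOpts and install.MakeClusterSettings are assumed to be the usual roachtest defaults (the standard roachtest imports for test, cluster, option, install, and logger are also assumed).

// Illustrative helper, not part of this commit: restart node n and rely on
// the global monitor's per-process expectations.
func restartNode(
	ctx context.Context, t test.Test, c cluster.Cluster, l *logger.Logger, n int,
) {
	// Previously the test (or a helper) had to call t.Monitor().ExpectDeaths(1)
	// here; double-counting between helper and caller could mask an extra,
	// unrelated death.

	// StopE now registers that exactly this process is expected to die ...
	c.Stop(ctx, l, option.DefaultStopOpts(), c.Node(n))

	// ... and StartE marks it as expected to be alive again once startup
	// succeeds. Any other process death in between still fails the test.
	c.Start(ctx, l, option.DefaultStartOpts(), install.MakeClusterSettings(), c.Node(n))
}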

File tree

7 files changed: +131 −17 lines changed


pkg/cmd/roachtest/cluster.go

Lines changed: 20 additions & 2 deletions

@@ -2221,7 +2221,10 @@ func (c *clusterImpl) StartE(
 			return errors.Wrap(err, "failed to wait for replication after starting cockroach")
 		}
 	}
-
+	// If starting the cluster was successful, mark the nodes as healthy. N.B. we must wait
+	// until cluster startup succeeds as we may have tests that purposely inject failures into
+	// cluster startup.
+	c.t.Monitor().ExpectProcessAlive(nodes)
 	return nil
 }

@@ -2264,6 +2267,15 @@ func (c *clusterImpl) StartServiceForVirtualClusterE(
 			return err
 		}
 	}
+
+	// If we are starting a separate process virtual cluster, we need to
+	// mark each SQL instance as healthy.
+	if len(startOpts.SeparateProcessNodes) > 0 {
+		nodes := startOpts.SeparateProcessNodes
+		virtualClusterName := startOpts.RoachprodOpts.VirtualClusterName
+		sqlInstance := startOpts.RoachprodOpts.SQLInstance
+		c.t.Monitor().ExpectProcessAlive(nodes, option.VirtualClusterName(virtualClusterName), option.SQLInstance(sqlInstance))
+	}
 	return nil
 }

@@ -2290,6 +2302,9 @@ func (c *clusterImpl) StopServiceForVirtualClusterE(
 	nodes := c.All()
 	if len(stopOpts.SeparateProcessNodes) > 0 {
 		nodes = stopOpts.SeparateProcessNodes
+		virtualClusterName := stopOpts.RoachprodOpts.VirtualClusterName
+		sqlInstance := stopOpts.RoachprodOpts.SQLInstance
+		c.t.Monitor().ExpectProcessDead(nodes, option.VirtualClusterName(virtualClusterName), option.SQLInstance(sqlInstance))
 	}

 	return roachprod.StopServiceForVirtualCluster(

@@ -2396,6 +2411,7 @@ func (c *clusterImpl) StopE(
 		stopOpts.RoachprodOpts.Wait = true
 		stopOpts.RoachprodOpts.GracePeriod = 10
 	}
+	c.t.Monitor().ExpectProcessDead(selectedNodesOrDefault(nodes, c.All()))
 	return errors.Wrap(roachprod.Stop(ctx, l, c.MakeNodes(nodes...), stopOpts.RoachprodOpts), "cluster.StopE")
 }

@@ -2423,6 +2439,7 @@ func (c *clusterImpl) SignalE(
 	if c.spec.NodeCount == 0 {
 		return nil // unit tests
 	}
+	c.t.Monitor().ExpectProcessDead(selectedNodesOrDefault(nodes, c.All()))
 	return errors.Wrap(roachprod.Signal(ctx, l, c.MakeNodes(nodes...), sig), "cluster.Signal")
 }

@@ -2454,6 +2471,7 @@ func (c *clusterImpl) WipeE(
 	}
 	c.setStatusForClusterOpt("wiping", false, nodes...)
 	defer c.clearStatusForClusterOpt(false)
+	c.t.Monitor().ExpectProcessDead(selectedNodesOrDefault(nodes, c.All()))
 	return roachprod.Wipe(ctx, l, c.MakeNodes(nodes...), c.IsSecure())
 }

@@ -3016,7 +3034,7 @@ func (c *clusterImpl) Extend(ctx context.Context, d time.Duration, l *logger.Log
 // monitor's semantics around handling expected node deaths breaks down if it's
 // monitoring a workload node.
 func (c *clusterImpl) NewMonitor(ctx context.Context, opts ...option.Option) cluster.Monitor {
-	return newMonitor(ctx, c.t, c, opts...)
+	return newMonitor(ctx, c.t, c, false /* expectExactProcessDeath */, opts...)
 }

 func (c *clusterImpl) StartGrafana(
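For separate-process virtual clusters, the process identity includes the virtual cluster name and SQL instance, so a test that kills a SQL server out of band could register the expectation explicitly. The fragment below is hypothetical: the virtual cluster name "app", instance 0, and the node range are made up, while the option helpers are the ones used in the diff above.

// Hypothetical fragment: nodes 4-5 run a separate-process SQL server for a
// virtual cluster named "app" (instance 0), and the test is about to kill it
// in a way the cluster helpers don't cover.
t.Monitor().ExpectProcessDead(
	c.Range(4, 5),
	option.VirtualClusterName("app"),
	option.SQLInstance(0),
)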

pkg/cmd/roachtest/monitor.go

Lines changed: 104 additions & 8 deletions

@@ -17,10 +17,66 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/roachprod"
 	"github.com/cockroachdb/cockroach/pkg/roachprod/install"
 	"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
+	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
 	"github.com/cockroachdb/errors"
 	"golang.org/x/sync/errgroup"
 )

+// monitorProcess represents a single process that the monitor monitors.
+type monitorProcess struct {
+	node               install.Node
+	virtualClusterName string
+	sqlInstance        int
+}
+
+// MonitorExpectedProcessHealth represents the expected health of a process.
+type MonitorExpectedProcessHealth string
+
+const (
+	ExpectedAlive = MonitorExpectedProcessHealth("process alive")
+	ExpectedDead  = MonitorExpectedProcessHealth("process dead")
+)
+
+// expectedProcessHealth is a concurrent map that stores the expected health of
+// each registered monitorProcess. It is a thin wrapper over syncutil.Map that ensures
+// consistent naming of the system interface.
+type expectedProcessHealth struct {
+	syncutil.Map[monitorProcess, MonitorExpectedProcessHealth]
+}
+
+func newProcess(node install.Node, virtualClusterName string, sqlInstance int) monitorProcess {
+	if virtualClusterName == "" {
+		virtualClusterName = install.SystemInterfaceName
+	}
+	return monitorProcess{
+		node:               node,
+		virtualClusterName: virtualClusterName,
+		sqlInstance:        sqlInstance,
+	}
+}
+
+func (m *expectedProcessHealth) get(
+	node install.Node, virtualClusterName string, sqlInstance int,
+) MonitorExpectedProcessHealth {
+	val, ok := m.Load(newProcess(node, virtualClusterName, sqlInstance))
+	if !ok {
+		// If the process has no expected state, assume it should be healthy.
+		return ExpectedAlive
+	}
+	return *val
+}
+
+func (m *expectedProcessHealth) set(
+	nodes install.Nodes,
+	virtualClusterName string,
+	sqlInstance int,
+	health MonitorExpectedProcessHealth,
+) {
+	for _, node := range nodes {
+		m.Store(newProcess(node, virtualClusterName, sqlInstance), &health)
+	}
+}
+
 // monitorImpl implements the Monitor interface. A monitor both
 // manages "user tasks" -- goroutines provided by tests -- as well as
 // checks that every node in the cluster is still running. A failure

@@ -46,7 +102,16 @@ type monitorImpl struct {
 	monitorGroup *errgroup.Group // monitor goroutine
 	monitorOnce  sync.Once       // guarantees monitor goroutine is only started once

-	expDeaths int32 // atomically
+	// expExactProcessDeath, if true, indicates that the monitor should expect that a
+	// specified process, as denoted by the triple in expProcessHealth.get, is dead.
+	// Otherwise, the monitor only expects a certain number of process deaths.
+	// The former is a stronger assertion used in the new global roachtest monitor,
+	// while the latter should be removed when the deprecated cluster monitor is removed.
+	expExactProcessDeath bool
+	// Deprecated: This field is used by the deprecated cluster monitor to track the number
+	// of expected process deaths, and should be removed when the cluster monitor is removed.
+	expDeaths        int32 // atomically
+	expProcessHealth expectedProcessHealth
 }

 func newMonitor(

@@ -58,19 +123,46 @@ func newMonitor(
 		L() *logger.Logger
 	},
 	c cluster.Cluster,
+	expectExactProcessDeath bool,
 	opts ...option.Option,
 ) *monitorImpl {
 	m := &monitorImpl{
-		t:     t,
-		l:     t.L(),
-		nodes: c.MakeNodes(opts...),
+		t:                    t,
+		l:                    t.L(),
+		nodes:                c.MakeNodes(opts...),
+		expExactProcessDeath: expectExactProcessDeath,
+		expProcessHealth:     expectedProcessHealth{},
 	}
 	m.ctx, m.cancel = context.WithCancel(ctx)
 	m.userGroup, _ = errgroup.WithContext(m.ctx)
 	m.monitorGroup, _ = errgroup.WithContext(m.ctx)
 	return m
 }

+func (m *monitorImpl) ExpectProcessHealth(
+	nodes install.Nodes, health MonitorExpectedProcessHealth, opts ...option.OptionFunc,
+) {
+	var virtualClusterOptions option.VirtualClusterOptions
+	if err := option.Apply(&virtualClusterOptions, opts...); err != nil {
+		m.t.Fatal(err)
+	}
+	m.expProcessHealth.set(nodes, virtualClusterOptions.VirtualClusterName, virtualClusterOptions.SQLInstance, health)
+}
+
+// ExpectProcessDead lets the monitor know that a set of processes are about
+// to be killed, and that their deaths should be ignored. Virtual cluster
+// options can be passed to denote a separate process.
+func (m *monitorImpl) ExpectProcessDead(nodes option.NodeListOption, opts ...option.OptionFunc) {
+	m.ExpectProcessHealth(nodes.InstallNodes(), ExpectedDead, opts...)
+}
+
+// ExpectProcessAlive lets the monitor know that a set of processes are
+// expected to be healthy. Virtual cluster options can be passed to denote
+// a separate process.
+func (m *monitorImpl) ExpectProcessAlive(nodes option.NodeListOption, opts ...option.OptionFunc) {
+	m.ExpectProcessHealth(nodes.InstallNodes(), ExpectedAlive, opts...)
+}
+
 // ExpectDeath lets the monitor know that a node is about to be killed, and that
 // this should be ignored.
 func (m *monitorImpl) ExpectDeath() {

@@ -190,12 +282,16 @@ func (m *monitorImpl) startNodeMonitor() {
 				)
 			}
 		case install.MonitorProcessDead:
-			isExpectedDeath := atomic.AddInt32(&m.expDeaths, -1) >= 0
-			if isExpectedDeath {
-				expectedDeathStr = ": expected"
+			var isExpectedDeath bool
+			if m.expExactProcessDeath {
+				isExpectedDeath = m.expProcessHealth.get(info.Node, e.VirtualClusterName, e.SQLInstance) == ExpectedDead
+			} else {
+				isExpectedDeath = atomic.AddInt32(&m.expDeaths, -1) >= 0
			}

-			if !isExpectedDeath {
+			if isExpectedDeath {
+				expectedDeathStr = ": expected"
+			} else {
 				retErr = fmt.Errorf("unexpected node event: %s", info)
 			}
 		}
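The core of the change is the per-process expectation map above. Below is a minimal, standalone sketch of that bookkeeping: it substitutes the standard library's sync.Map for CockroachDB's generic syncutil.Map and plain ints for install.Node, but the key shape and the default-to-alive lookup follow the diff.

package main

import (
	"fmt"
	"sync"
)

// processKey mirrors monitorProcess from this commit: a process is identified
// by node, virtual cluster name, and SQL instance.
type processKey struct {
	node               int
	virtualClusterName string
	sqlInstance        int
}

type processHealth string

const (
	expectedAlive = processHealth("process alive")
	expectedDead  = processHealth("process dead")
)

// expectedHealth plays the role of expectedProcessHealth, with the standard
// library's sync.Map standing in for syncutil.Map.
type expectedHealth struct{ m sync.Map }

func (e *expectedHealth) set(nodes []int, vc string, inst int, h processHealth) {
	for _, n := range nodes {
		e.m.Store(processKey{n, vc, inst}, h)
	}
}

func (e *expectedHealth) get(node int, vc string, inst int) processHealth {
	if v, ok := e.m.Load(processKey{node, vc, inst}); ok {
		return v.(processHealth)
	}
	// Processes with no registered expectation are assumed healthy, so an
	// unannounced death is reported as a test failure.
	return expectedAlive
}

func main() {
	var e expectedHealth
	// The equivalent of StopE on nodes 1-2 of the system tenant.
	e.set([]int{1, 2}, "system", 0, expectedDead)

	fmt.Println(e.get(1, "system", 0)) // process dead  -> an observed death is expected
	fmt.Println(e.get(3, "system", 0)) // process alive -> a death here fails the test
}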

pkg/cmd/roachtest/test/BUILD.bazel

Lines changed: 1 addition & 0 deletions

@@ -9,6 +9,7 @@ go_library(
     importpath = "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test",
     visibility = ["//visibility:public"],
     deps = [
+        "//pkg/cmd/roachtest/option",
         "//pkg/cmd/roachtest/roachtestutil/task",
         "//pkg/roachprod/logger",
         "@com_github_cockroachdb_version//:version",

pkg/cmd/roachtest/test/test_monitor.go

Lines changed: 4 additions & 3 deletions

@@ -5,9 +5,10 @@

 package test

+import "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
+
 // Monitor is an interface for monitoring cockroach processes during a test.
 type Monitor interface {
-	ExpectDeath()
-	ExpectDeaths(count int32)
-	ResetDeaths()
+	ExpectProcessDead(nodes option.NodeListOption, opts ...option.OptionFunc)
+	ExpectProcessAlive(nodes option.NodeListOption, opts ...option.OptionFunc)
 }

pkg/cmd/roachtest/test_monitor.go

Lines changed: 1 addition & 1 deletion

@@ -19,7 +19,7 @@ type testMonitorImpl struct {

 func newTestMonitor(ctx context.Context, t test.Test, c *clusterImpl) *testMonitorImpl {
 	return &testMonitorImpl{
-		monitorImpl: newMonitor(ctx, t, c),
+		monitorImpl: newMonitor(ctx, t, c, true /* expectExactProcessDeath */),
 	}
 }

pkg/cmd/roachtest/tests/process_lock.go

Lines changed: 0 additions & 2 deletions

@@ -142,10 +142,8 @@ func registerProcessLock(r registry.Registry) {
 				// other operations that were performed
 				// concurrently with the running process did not
 				// corrupt the on-disk state.
-				t.Monitor().ExpectDeath()
 				c.Stop(ctx, l, option.DefaultStopOpts(), c.Node(n))
 				c.Start(ctx, l, startOpts, startSettings, c.Node(n))
-				t.Monitor().ResetDeaths()
 			},
 		}
 		ops[randutil.RandIntInRange(rng, 0, len(ops))]()

pkg/roachprod/install/monitor.go

Lines changed: 1 addition & 1 deletion

@@ -316,7 +316,7 @@ func (m *monitorNode) monitorNode(ctx context.Context, l *logger.Logger) {
 		if err := sess.Wait(); err != nil {
 			// If we got an error waiting for the session but the context
 			// is already canceled, do not send an error through the
-			// channel; context cancelation happens at the user's request
+			// channel; context cancellation happens at the user's request
 			// or when the test finishes. In either case, the monitor
 			// should quiesce. Reporting the error is confusing and can be
 			// noisy in the case of multiple monitors.
