Skip to content

Commit 1250b4e

Browse files
craig[bot]DarrylWongstevendannafqazi
committed
148444: roachtest: global monitor expects specific process deaths r=herkolategan,srosenberg a=DarrylWong This change refactors the global test monitor to expect specific process deaths rather than a number of any process deaths. This change is to support the failure injection framework (#138958) as well as failure injection in mixed version (#148084). In the future, we will want to be able to tell the state of the cluster/nodes from the monitor. This change also refactors the mixed version tests to use the global monitor, as well as deprecates the old cluster monitor. See individual commits for details. Informs: #118214 Release note: none 148697: allocator: hoist some calls to Locality r=wenyihu6 a=stevendanna store.Locality() allocates. Previously we were calling this inside an double-nested loop in diversityRebalanceScore which itself is called in a loop. This is probably just a drop in the bucket given how much is going on in the callers of this function, but perhaps it helps a bit: BenchmarkRebalanceToDiversityScore 427075 2734 ns/op 1920 B/op 12 allocs/op BenchmarkRebalanceToDiversityScore 904476 1443 ns/op 160 B/op 1 allocs/op Informs #147800 Release note: None 148764: workload/schemachanger: fix invalid function syntax errors r=fqazi a=fqazi Previously, the random schema changer workload used $$ as a delimiter for function definitions. Unfortunately, randomly generated strings used by functions could easily contain $$ causing the definition to terminator to show up. To address this patch modifies the delimiter to be `$FUNC_BODY$`. Fixes: #148555 Release note: None Co-authored-by: DarrylWong <[email protected]> Co-authored-by: Steven Danna <[email protected]> Co-authored-by: Faizan Qazi <[email protected]>
4 parents 5d590da + e2f1d38 + bc9fdcd + 80c70ee commit 1250b4e

File tree

115 files changed

+371
-289
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

115 files changed

+371
-289
lines changed

pkg/cmd/roachtest/cluster.go

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2221,7 +2221,10 @@ func (c *clusterImpl) StartE(
22212221
return errors.Wrap(err, "failed to wait for replication after starting cockroach")
22222222
}
22232223
}
2224-
2224+
// If starting the cluster was successful, mark the nodes as healthy. N.B. we must wait
2225+
// until cluster startup succeeds as we may have tests that purposely inject failures into
2226+
// cluster startup.
2227+
c.t.Monitor().ExpectProcessAlive(nodes)
22252228
return nil
22262229
}
22272230

@@ -2264,6 +2267,15 @@ func (c *clusterImpl) StartServiceForVirtualClusterE(
22642267
return err
22652268
}
22662269
}
2270+
2271+
// If we are starting a separate process virtual cluster, we need to
2272+
// mark each SQL instance as healthy.
2273+
if len(startOpts.SeparateProcessNodes) > 0 {
2274+
nodes := startOpts.SeparateProcessNodes
2275+
virtualClusterName := startOpts.RoachprodOpts.VirtualClusterName
2276+
sqlInstance := startOpts.RoachprodOpts.SQLInstance
2277+
c.t.Monitor().ExpectProcessAlive(nodes, option.VirtualClusterName(virtualClusterName), option.SQLInstance(sqlInstance))
2278+
}
22672279
return nil
22682280
}
22692281

@@ -2290,6 +2302,9 @@ func (c *clusterImpl) StopServiceForVirtualClusterE(
22902302
nodes := c.All()
22912303
if len(stopOpts.SeparateProcessNodes) > 0 {
22922304
nodes = stopOpts.SeparateProcessNodes
2305+
virtualClusterName := stopOpts.RoachprodOpts.VirtualClusterName
2306+
sqlInstance := stopOpts.RoachprodOpts.SQLInstance
2307+
c.t.Monitor().ExpectProcessDead(nodes, option.VirtualClusterName(virtualClusterName), option.SQLInstance(sqlInstance))
22932308
}
22942309

22952310
return roachprod.StopServiceForVirtualCluster(
@@ -2396,6 +2411,7 @@ func (c *clusterImpl) StopE(
23962411
stopOpts.RoachprodOpts.Wait = true
23972412
stopOpts.RoachprodOpts.GracePeriod = 10
23982413
}
2414+
c.t.Monitor().ExpectProcessDead(selectedNodesOrDefault(nodes, c.All()))
23992415
return errors.Wrap(roachprod.Stop(ctx, l, c.MakeNodes(nodes...), stopOpts.RoachprodOpts), "cluster.StopE")
24002416
}
24012417

@@ -2423,6 +2439,7 @@ func (c *clusterImpl) SignalE(
24232439
if c.spec.NodeCount == 0 {
24242440
return nil // unit tests
24252441
}
2442+
c.t.Monitor().ExpectProcessDead(selectedNodesOrDefault(nodes, c.All()))
24262443
return errors.Wrap(roachprod.Signal(ctx, l, c.MakeNodes(nodes...), sig), "cluster.Signal")
24272444
}
24282445

@@ -2454,6 +2471,7 @@ func (c *clusterImpl) WipeE(
24542471
}
24552472
c.setStatusForClusterOpt("wiping", false, nodes...)
24562473
defer c.clearStatusForClusterOpt(false)
2474+
c.t.Monitor().ExpectProcessDead(selectedNodesOrDefault(nodes, c.All()))
24572475
return roachprod.Wipe(ctx, l, c.MakeNodes(nodes...), c.IsSecure())
24582476
}
24592477

@@ -3009,14 +3027,16 @@ func (c *clusterImpl) Extend(ctx context.Context, d time.Duration, l *logger.Log
30093027
return nil
30103028
}
30113029

3012-
// NewMonitor creates a monitor that can watch for unexpected crdb node deaths on m.Wait()
3030+
// NewDeprecatedMonitor creates a monitor that can watch for unexpected crdb node deaths on m.Wait()
30133031
// and provide roachtest safe goroutines.
30143032
//
30153033
// As a general rule, if the user has a workload node, do not monitor it. A
30163034
// monitor's semantics around handling expected node deaths breaks down if it's
30173035
// monitoring a workload node.
3018-
func (c *clusterImpl) NewMonitor(ctx context.Context, opts ...option.Option) cluster.Monitor {
3019-
return newMonitor(ctx, c.t, c, opts...)
3036+
func (c *clusterImpl) NewDeprecatedMonitor(
3037+
ctx context.Context, opts ...option.Option,
3038+
) cluster.Monitor {
3039+
return newMonitor(ctx, c.t, c, false /* expectExactProcessDeath */, opts...)
30203040
}
30213041

30223042
func (c *clusterImpl) StartGrafana(

pkg/cmd/roachtest/cluster/cluster_interface.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,11 @@ type Cluster interface {
6666
Stop(ctx context.Context, l *logger.Logger, stopOpts option.StopOpts, opts ...option.Option)
6767
SignalE(ctx context.Context, l *logger.Logger, sig int, opts ...option.Option) error
6868
Signal(ctx context.Context, l *logger.Logger, sig int, opts ...option.Option)
69-
NewMonitor(context.Context, ...option.Option) Monitor
69+
70+
// NewDeprecatedMonitor is deprecated: See #118214
71+
// Instead, use task.Tasker for goroutine management (e.g. test.Go) and test.Monitor
72+
// for monitoring unexpected process deaths (e.g. through the test spec Monitor option)
73+
NewDeprecatedMonitor(context.Context, ...option.Option) Monitor
7074

7175
// Starting virtual clusters.
7276

pkg/cmd/roachtest/clusterstats/mock_cluster_generated_test.go

Lines changed: 6 additions & 6 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/cmd/roachtest/monitor.go

Lines changed: 104 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,66 @@ import (
1717
"github.com/cockroachdb/cockroach/pkg/roachprod"
1818
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
1919
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
20+
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
2021
"github.com/cockroachdb/errors"
2122
"golang.org/x/sync/errgroup"
2223
)
2324

25+
// monitorProcess represents a single process that the monitor monitors.
26+
type monitorProcess struct {
27+
node install.Node
28+
virtualClusterName string
29+
sqlInstance int
30+
}
31+
32+
// MonitorExpectedProcessHealth represents the expected health of a process.
33+
type MonitorExpectedProcessHealth string
34+
35+
const (
36+
ExpectedAlive = MonitorExpectedProcessHealth("process alive")
37+
ExpectedDead = MonitorExpectedProcessHealth("process dead")
38+
)
39+
40+
// expectedProcessHealth is a concurrent map that stores the expected health of
41+
// each registered monitorProcess. It is a thin wrapper over syncutil.Map that ensures
42+
// consistent naming of the system interface.
43+
type expectedProcessHealth struct {
44+
syncutil.Map[monitorProcess, MonitorExpectedProcessHealth]
45+
}
46+
47+
func newProcess(node install.Node, virtualClusterName string, sqlInstance int) monitorProcess {
48+
if virtualClusterName == "" {
49+
virtualClusterName = install.SystemInterfaceName
50+
}
51+
return monitorProcess{
52+
node: node,
53+
virtualClusterName: virtualClusterName,
54+
sqlInstance: sqlInstance,
55+
}
56+
}
57+
58+
func (m *expectedProcessHealth) get(
59+
node install.Node, virtualClusterName string, sqlInstance int,
60+
) MonitorExpectedProcessHealth {
61+
val, ok := m.Load(newProcess(node, virtualClusterName, sqlInstance))
62+
if !ok {
63+
// If the process has no expected state, assume it should be healthy.
64+
return ExpectedAlive
65+
}
66+
return *val
67+
}
68+
69+
func (m *expectedProcessHealth) set(
70+
nodes install.Nodes,
71+
virtualClusterName string,
72+
sqlInstance int,
73+
health MonitorExpectedProcessHealth,
74+
) {
75+
for _, node := range nodes {
76+
m.Store(newProcess(node, virtualClusterName, sqlInstance), &health)
77+
}
78+
}
79+
2480
// monitorImpl implements the Monitor interface. A monitor both
2581
// manages "user tasks" -- goroutines provided by tests -- as well as
2682
// checks that every node in the cluster is still running. A failure
@@ -46,7 +102,16 @@ type monitorImpl struct {
46102
monitorGroup *errgroup.Group // monitor goroutine
47103
monitorOnce sync.Once // guarantees monitor goroutine is only started once
48104

49-
expDeaths int32 // atomically
105+
// expExactProcessDeath if true indicates that the monitor should expect that a
106+
// specified process, as denoted by the triple in expProcessHealth.get, is dead.
107+
// Otherwise, the monitor will expect that only a certain number of process deaths.
108+
// The former is a stronger assertion used in the new global roachtest monitor,
109+
// while the latter should be removed when the deprecated cluster monitor is removed.
110+
expExactProcessDeath bool
111+
// Deprecated: This field is used by the deprecated cluster monitor to track the number
112+
// of expected process deaths, and should be removed when the cluster monitor is removed.
113+
expDeaths int32 // atomically
114+
expProcessHealth expectedProcessHealth
50115
}
51116

52117
func newMonitor(
@@ -58,19 +123,46 @@ func newMonitor(
58123
L() *logger.Logger
59124
},
60125
c cluster.Cluster,
126+
expectExactProcessDeath bool,
61127
opts ...option.Option,
62128
) *monitorImpl {
63129
m := &monitorImpl{
64-
t: t,
65-
l: t.L(),
66-
nodes: c.MakeNodes(opts...),
130+
t: t,
131+
l: t.L(),
132+
nodes: c.MakeNodes(opts...),
133+
expExactProcessDeath: expectExactProcessDeath,
134+
expProcessHealth: expectedProcessHealth{},
67135
}
68136
m.ctx, m.cancel = context.WithCancel(ctx)
69137
m.userGroup, _ = errgroup.WithContext(m.ctx)
70138
m.monitorGroup, _ = errgroup.WithContext(m.ctx)
71139
return m
72140
}
73141

142+
func (m *monitorImpl) ExpectProcessHealth(
143+
nodes install.Nodes, health MonitorExpectedProcessHealth, opts ...option.OptionFunc,
144+
) {
145+
var virtualClusterOptions option.VirtualClusterOptions
146+
if err := option.Apply(&virtualClusterOptions, opts...); err != nil {
147+
m.t.Fatal(err)
148+
}
149+
m.expProcessHealth.set(nodes, virtualClusterOptions.VirtualClusterName, virtualClusterOptions.SQLInstance, health)
150+
}
151+
152+
// ExpectProcessDeath lets the monitor know that a set of processes are about
153+
// to be killed, and that their deaths should be ignored. Virtual cluster
154+
// options can be passed to denote a separate process.
155+
func (m *monitorImpl) ExpectProcessDead(nodes option.NodeListOption, opts ...option.OptionFunc) {
156+
m.ExpectProcessHealth(nodes.InstallNodes(), ExpectedDead, opts...)
157+
}
158+
159+
// ExpectProcessAlive lets the monitor know that a set of processes are
160+
// expected to be healthy. Virtual cluster options can be passed to denote
161+
// a separate process.
162+
func (m *monitorImpl) ExpectProcessAlive(nodes option.NodeListOption, opts ...option.OptionFunc) {
163+
m.ExpectProcessHealth(nodes.InstallNodes(), ExpectedAlive, opts...)
164+
}
165+
74166
// ExpectDeath lets the monitor know that a node is about to be killed, and that
75167
// this should be ignored.
76168
func (m *monitorImpl) ExpectDeath() {
@@ -190,12 +282,16 @@ func (m *monitorImpl) startNodeMonitor() {
190282
)
191283
}
192284
case install.MonitorProcessDead:
193-
isExpectedDeath := atomic.AddInt32(&m.expDeaths, -1) >= 0
194-
if isExpectedDeath {
195-
expectedDeathStr = ": expected"
285+
var isExpectedDeath bool
286+
if m.expExactProcessDeath {
287+
isExpectedDeath = m.expProcessHealth.get(info.Node, e.VirtualClusterName, e.SQLInstance) == ExpectedDead
288+
} else {
289+
isExpectedDeath = atomic.AddInt32(&m.expDeaths, -1) >= 0
196290
}
197291

198-
if !isExpectedDeath {
292+
if isExpectedDeath {
293+
expectedDeathStr = ": expected"
294+
} else {
199295
retErr = fmt.Errorf("unexpected node event: %s", info)
200296
}
201297
}

pkg/cmd/roachtest/registry/test_spec.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -191,7 +191,7 @@ type TestSpec struct {
191191
Randomized bool
192192

193193
// Monitor specifies whether the test initiates a process monitor. Eventually,
194-
// this should replace all instances of `cluster.NewMonitor`. To make this
194+
// this should replace all instances of `cluster.NewDeprecatedMonitor`. To make this
195195
// transition, tests should be modified to utilize the `test.Monitor` and
196196
// `roachtestutil.Task` interfaces provided with each test.
197197
Monitor bool

pkg/cmd/roachtest/roachtestutil/mixedversion/helper.go

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -283,20 +283,6 @@ func (h *Helper) GoCommand(cmd string, nodes option.NodeListOption) context.Canc
283283
}, task.Name(desc))
284284
}
285285

286-
// ExpectDeath alerts the testing infrastructure that a node is
287-
// expected to die. Regular restarts as part of the mixedversion
288-
// testing are already taken into account. This function should only
289-
// be used by tests that perform their own node restarts or chaos
290-
// events.
291-
func (h *Helper) ExpectDeath() {
292-
h.ExpectDeaths(1)
293-
}
294-
295-
// ExpectDeaths is the general version of `ExpectDeath()`.
296-
func (h *Helper) ExpectDeaths(n int) {
297-
h.runner.monitor.ExpectDeaths(n)
298-
}
299-
300286
// ClusterVersion returns the currently active cluster version. Avoids
301287
// querying the database if we are not running migrations, since the
302288
// test runner has cached version of the cluster versions.

pkg/cmd/roachtest/roachtestutil/mixedversion/mixedversion.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -642,6 +642,10 @@ func NewTest(
642642
crdbNodes option.NodeListOption,
643643
options ...CustomOption,
644644
) *Test {
645+
if !t.Spec().(*registry.TestSpec).Monitor {
646+
t.Fatal("mixedversion tests require enabling the global test monitor in the test spec")
647+
}
648+
645649
opts := defaultTestOptions()
646650
for _, fn := range options {
647651
fn(&opts)
@@ -858,7 +862,7 @@ func (t *Test) Run() {
858862
}
859863

860864
func (t *Test) run(plan *TestPlan) error {
861-
return newTestRunner(t.ctx, t.cancel, plan, t.options.tag, t.logger, t.cluster).run()
865+
return newTestRunner(t.ctx, t.cancel, plan, t.rt, t.options.tag, t.logger, t.cluster).run()
862866
}
863867

864868
func (t *Test) plan() (plan *TestPlan, retErr error) {

0 commit comments

Comments
 (0)