Skip to content

Commit 4c34b48

Browse files
craig[bot]Renato Costa
andcommitted
107548: roachtest/mixedversion: monitor nodes in every test r=herkolategan,srosenberg a=renatolabs This PR includes a few changes related to the roachprod/roachtest monitor itself, and integrates it with the `mixedversion` package. Specifically, the changes are as follows (each bullet point corresponds to a commit): * emit structured events from the roachprod monitor; this removes the need to perform string parsing on callers. * change `mixedversion` framework to monitor nodes by default. An unexpected node death immediately fails the test. * move public functions on the `*mixedversion.Helper` struct to its own file, for ease of browsing. * update the README with instructions on background tasks; most importantly, mixedversion tests can't use `cluster.NewMonitor` like other roachtests. Epic: CRDB-19321 Release notes: None Co-authored-by: Renato Costa <[email protected]>
2 parents a3a389e + 3609615 commit 4c34b48

File tree

12 files changed

+465
-210
lines changed

12 files changed

+465
-210
lines changed

pkg/cmd/roachprod/main.go

Lines changed: 5 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -650,21 +650,15 @@ of nodes, outputting a line whenever a change is detected:
650650
`,
651651
Args: cobra.ExactArgs(1),
652652
Run: wrap(func(cmd *cobra.Command, args []string) error {
653-
messages, err := roachprod.Monitor(context.Background(), config.Logger, args[0], monitorOpts)
653+
eventChan, err := roachprod.Monitor(context.Background(), config.Logger, args[0], monitorOpts)
654654
if err != nil {
655655
return err
656656
}
657-
for msg := range messages {
658-
if msg.Err != nil {
659-
msg.Msg += "error: " + msg.Err.Error()
660-
}
661-
thisError := errors.Newf("%d: %s", msg.Node, msg.Msg)
662-
if msg.Err != nil || strings.Contains(msg.Msg, "dead") {
663-
err = errors.CombineErrors(err, thisError)
664-
}
665-
fmt.Println(thisError.Error())
657+
for info := range eventChan {
658+
fmt.Println(info.String())
666659
}
667-
return err
660+
661+
return nil
668662
}),
669663
}
670664

pkg/cmd/roachtest/cluster.go

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1528,22 +1528,20 @@ func (c *clusterImpl) assertNoDeadNode(ctx context.Context, t test.Test) error {
15281528
}
15291529

15301530
t.L().Printf("checking for dead nodes")
1531-
ch, err := roachprod.Monitor(ctx, t.L(), c.name, install.MonitorOpts{OneShot: true, IgnoreEmptyNodes: true})
1531+
eventsCh, err := roachprod.Monitor(ctx, t.L(), c.name, install.MonitorOpts{OneShot: true, IgnoreEmptyNodes: true})
15321532

15331533
// An error here means there was a problem initialising a SyncedCluster.
15341534
if err != nil {
15351535
return err
15361536
}
15371537

15381538
deadNodes := 0
1539-
for n := range ch {
1540-
// If there's an error, it means either that the monitor command failed
1541-
// completely, or that it found a dead node worth complaining about.
1542-
if n.Err != nil || strings.HasPrefix(n.Msg, "dead") {
1539+
for info := range eventsCh {
1540+
t.L().Printf("%s", info)
1541+
1542+
if _, isDeath := info.Event.(install.MonitorNodeDead); isDeath {
15431543
deadNodes++
15441544
}
1545-
1546-
t.L().Printf("n%d: err=%v,msg=%s", n.Node, n.Err, n.Msg)
15471545
}
15481546

15491547
if deadNodes > 0 {

pkg/cmd/roachtest/monitor.go

Lines changed: 35 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@ package main
1313
import (
1414
"context"
1515
"fmt"
16-
"strings"
1716
"sync"
1817
"sync/atomic"
1918

@@ -32,11 +31,13 @@ type monitorImpl struct {
3231
Failed() bool
3332
WorkerStatus(...interface{})
3433
}
35-
l *logger.Logger
36-
nodes string
37-
ctx context.Context
38-
cancel func()
39-
g *errgroup.Group
34+
l *logger.Logger
35+
nodes string
36+
ctx context.Context
37+
cancel func()
38+
g *errgroup.Group
39+
40+
numTasks int32 // atomically
4041
expDeaths int32 // atomically
4142
}
4243

@@ -80,6 +81,8 @@ func (m *monitorImpl) ResetDeaths() {
8081
var errTestFatal = errors.New("t.Fatal() was called")
8182

8283
func (m *monitorImpl) Go(fn func(context.Context) error) {
84+
atomic.AddInt32(&m.numTasks, 1)
85+
8386
m.g.Go(func() (err error) {
8487
defer func() {
8588
r := recover()
@@ -171,15 +174,21 @@ func (m *monitorImpl) wait() error {
171174
}
172175

173176
// 1. The first goroutine waits for the worker errgroup to exit.
177+
// Note that this only happens if the caller created at least one
178+
// task for the monitor. This check enables the roachtest monitor to
179+
// be used in cases where we just want to monitor events in the
180+
// cluster without running any background tasks through the monitor.
174181
var wg sync.WaitGroup
175-
wg.Add(1)
176-
go func() {
177-
defer func() {
178-
m.cancel()
179-
wg.Done()
182+
if atomic.LoadInt32(&m.numTasks) > 0 {
183+
wg.Add(1)
184+
go func() {
185+
defer func() {
186+
m.cancel()
187+
wg.Done()
188+
}()
189+
setErr(errors.Wrap(m.g.Wait(), "function passed to monitor.Go failed"))
180190
}()
181-
setErr(errors.Wrap(m.g.Wait(), "monitor task failed"))
182-
}()
191+
}
183192

184193
// 2. The second goroutine reads from the monitoring channel, watching for any
185194
// unexpected death events.
@@ -190,28 +199,24 @@ func (m *monitorImpl) wait() error {
190199
wg.Done()
191200
}()
192201

193-
messagesChannel, err := roachprod.Monitor(m.ctx, m.l, m.nodes, install.MonitorOpts{})
202+
eventsCh, err := roachprod.Monitor(m.ctx, m.l, m.nodes, install.MonitorOpts{})
194203
if err != nil {
195204
setErr(errors.Wrap(err, "monitor command failure"))
196205
return
197206
}
198-
var monitorErr error
199-
for msg := range messagesChannel {
200-
if msg.Err != nil {
201-
msg.Msg += "error: " + msg.Err.Error()
202-
}
203-
thisError := errors.Newf("%d: %s", msg.Node, msg.Msg)
204-
if msg.Err != nil || strings.Contains(msg.Msg, "dead") {
205-
monitorErr = errors.CombineErrors(monitorErr, thisError)
207+
208+
for info := range eventsCh {
209+
_, isDeath := info.Event.(install.MonitorNodeDead)
210+
isExpectedDeath := isDeath && atomic.AddInt32(&m.expDeaths, -1) >= 0
211+
var expectedDeathStr string
212+
if isExpectedDeath {
213+
expectedDeathStr = ": expected"
206214
}
207-
var id int
208-
var s string
209-
newMsg := thisError.Error()
210-
if n, _ := fmt.Sscanf(newMsg, "%d: %s", &id, &s); n == 2 {
211-
if strings.Contains(s, "dead") && atomic.AddInt32(&m.expDeaths, -1) < 0 {
212-
setErr(errors.Wrap(fmt.Errorf("unexpected node event: %s", newMsg), "monitor command failure"))
213-
return
214-
}
215+
m.l.Printf("Monitor event: %s%s", info, expectedDeathStr)
216+
217+
if isDeath && !isExpectedDeath {
218+
setErr(fmt.Errorf("unexpected node event: %s", info))
219+
return
215220
}
216221
}
217222
}()

pkg/cmd/roachtest/roachtestutil/mixedversion/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
33
go_library(
44
name = "mixedversion",
55
srcs = [
6+
"helper.go",
67
"mixedversion.go",
78
"planner.go",
89
"runner.go",

pkg/cmd/roachtest/roachtestutil/mixedversion/README.md

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,18 @@ for {
255255
}
256256
```
257257

258+
#### Use test helpers
259+
260+
Every test hook receives as parameter a `*mixedversion.Helper` instance. This helper struct contains convenience functions that make it easy to perform certain operations while automatically implementing some of the best practices described here. For a full list of helpers, check out the [`helper.go`](https://github.com/cockroachdb/cockroach/blob/master/pkg/cmd/roachtest/roachtestutil/mixedversion/helper.go) file in the framework's source tree.
261+
262+
##### Aside: background functions
263+
264+
One particularly important helper functionality to highlight relates to the management of functions that need to run in the background during a test. Typically, roachtests are expected to use a [monitor](https://github.com/cockroachdb/cockroach/blob/master/pkg/cmd/roachtest/monitor.go) for that purpose; not only does the monitor protect from panics inadvertently crashing the test, it also preempts their execution (via context cancelation) if a node dies unexpectedly.
265+
266+
However, if a mixedversion test needs to perform a task in the background, they **must not use the roachtest monitor**. The reason for this is that mixedversion tests are [randomized](#embrace-randomness); as such, user-created monitors would not be able to predict when a death is expected since it does not know, by design, when a node restarts.
267+
268+
To run functions in the background, use the API provided by the mixedversion framework. Long running tasks that run throughout the test can be defined with `(*mixedversion.Test).BackgroundFunc`. If a test hook needs to perform a task in the background, the `*mixedversion.Helper` instance has a `Background` function that can be used for that purpose. As usual, check the documentation for the public API for more details on usage and behaviour of these functions.
269+
258270
#### Log progress
259271

260272
Logging events in the test as it runs can make debugging failures a lot easier. All hooks passed to the `mixedversion` API receive a `*logger.Logger` instance as parameter. **Functions should use that logger instead of the test logger** (`t.L()`). Doing so has two main advantages:
@@ -398,6 +410,6 @@ $ COCKROACH_RANDOM_SEED=7357315251706229449 roachtest run --versions-binary-over
398410

399411
### Final Notes
400412

401-
* This is a high level document and does not include API documentation. The `mixedversion` package includes a lot of documentation in the form of source code comments, and that should be the source of truth when it comes to finding out what functionality is available and how to use it.
413+
* This is a high level document and does not include API documentation. The `mixedversion` package includes a lot of documentation in the form of source code comments, and that should be the source of truth when it comes to finding out what functionality is available and how to use it. Most of the public API is in the [`mixedversion.go`](https://github.com/cockroachdb/cockroach/blob/master/pkg/cmd/roachtest/roachtestutil/mixedversion/mixedversion.go) and [`helper.go`](https://github.com/cockroachdb/cockroach/blob/master/pkg/cmd/roachtest/roachtestutil/mixedversion/helper.go) files.
402414
* For a simple application of the `mixedversion` framework, check out the `acceptance/version-upgrade` roachtest. For a more complex example, see `backup-restore/mixed-version`.
403415
* For any other questions, please reach out to `#test-eng`!
Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
// Copyright 2023 The Cockroach Authors.
2+
//
3+
// Use of this software is governed by the Business Source License
4+
// included in the file licenses/BSL.txt.
5+
//
6+
// As of the Change Date specified in that file, in accordance with
7+
// the Business Source License, use of this software will be governed
8+
// by the Apache License, Version 2.0, included in the file
9+
// licenses/APL.txt.
10+
11+
package mixedversion
12+
13+
import (
14+
"context"
15+
gosql "database/sql"
16+
"fmt"
17+
"math/rand"
18+
"path"
19+
"strings"
20+
"sync/atomic"
21+
22+
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
23+
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
24+
)
25+
26+
func (h *Helper) RandomNode(prng *rand.Rand, nodes option.NodeListOption) int {
27+
return nodes[prng.Intn(len(nodes))]
28+
}
29+
30+
// RandomDB returns a (nodeID, connection) tuple for a randomly picked
31+
// cockroach node according to the parameters passed.
32+
func (h *Helper) RandomDB(prng *rand.Rand, nodes option.NodeListOption) (int, *gosql.DB) {
33+
node := h.RandomNode(prng, nodes)
34+
return node, h.Connect(node)
35+
}
36+
37+
// QueryRow performs `db.QueryRowContext` on a randomly picked
38+
// database node. The query and the node picked are logged in the logs
39+
// of the step that calls this function.
40+
func (h *Helper) QueryRow(rng *rand.Rand, query string, args ...interface{}) *gosql.Row {
41+
node, db := h.RandomDB(rng, h.runner.crdbNodes)
42+
h.stepLogger.Printf("running SQL statement:\n%s\nArgs: %v\nNode: %d", query, args, node)
43+
return db.QueryRowContext(h.ctx, query, args...)
44+
}
45+
46+
// Exec performs `db.ExecContext` on a randomly picked database node.
47+
// The query and the node picked are logged in the logs of the step
48+
// that calls this function.
49+
func (h *Helper) Exec(rng *rand.Rand, query string, args ...interface{}) error {
50+
node, db := h.RandomDB(rng, h.runner.crdbNodes)
51+
h.stepLogger.Printf("running SQL statement:\n%s\nArgs: %v\nNode: %d", query, args, node)
52+
_, err := db.ExecContext(h.ctx, query, args...)
53+
return err
54+
}
55+
56+
func (h *Helper) Connect(node int) *gosql.DB {
57+
return h.runner.conn(node)
58+
}
59+
60+
// SetContext should be called by steps that need access to the test
61+
// context, as that is only visible to them.
62+
func (h *Helper) SetContext(c *Context) {
63+
h.testContext = c
64+
}
65+
66+
// Context returns the test context associated with a certain step. It
67+
// is made available for user-functions (see runHookStep).
68+
func (h *Helper) Context() *Context {
69+
return h.testContext
70+
}
71+
72+
// Background allows test authors to create functions that run in the
73+
// background in mixed-version hooks.
74+
func (h *Helper) Background(
75+
name string, fn func(context.Context, *logger.Logger) error,
76+
) context.CancelFunc {
77+
return h.runner.background.Start(name, func(ctx context.Context) error {
78+
bgLogger, err := h.loggerFor(name)
79+
if err != nil {
80+
return fmt.Errorf("failed to create logger for background function %q: %w", name, err)
81+
}
82+
83+
err = panicAsError(bgLogger, func() error { return fn(ctx, bgLogger) })
84+
if err != nil {
85+
if isContextCanceled(ctx) {
86+
return err
87+
}
88+
89+
desc := fmt.Sprintf("error in background function %s: %s", name, err)
90+
return h.runner.testFailure(desc, bgLogger)
91+
}
92+
93+
return nil
94+
})
95+
}
96+
97+
// BackgroundCommand has the same semantics of `Background()`; the
98+
// command passed will run and the test will fail if the command is
99+
// not successful.
100+
func (h *Helper) BackgroundCommand(cmd string, nodes option.NodeListOption) context.CancelFunc {
101+
desc := fmt.Sprintf("run command: %q", cmd)
102+
return h.Background(desc, func(ctx context.Context, l *logger.Logger) error {
103+
l.Printf("running command `%s` on nodes %v in the background", cmd, nodes)
104+
return h.runner.cluster.RunE(ctx, nodes, cmd)
105+
})
106+
}
107+
108+
// ExpectDeath alerts the testing infrastructure that a node is
109+
// expected to die. Regular restarts as part of the mixedversion
110+
// testing are already taken into account. This function should only
111+
// be used by tests that perform their own node restarts or chaos
112+
// events.
113+
func (h *Helper) ExpectDeath() {
114+
h.ExpectDeaths(1)
115+
}
116+
117+
// ExpectDeaths is the general version of `ExpectDeath()`.
118+
func (h *Helper) ExpectDeaths(n int) {
119+
h.runner.monitor.ExpectDeaths(n)
120+
}
121+
122+
// loggerFor creates a logger instance to be used by background
123+
// functions (created by calling `Background` on the helper
124+
// instance). It is similar to the logger instances created for
125+
// mixed-version steps, but with the `background_` prefix.
126+
func (h *Helper) loggerFor(name string) (*logger.Logger, error) {
127+
atomic.AddInt64(&h.bgCount, 1)
128+
129+
fileName := invalidChars.ReplaceAllString(strings.ToLower(name), "")
130+
fileName = fmt.Sprintf("background_%s_%d", fileName, h.bgCount)
131+
fileName = path.Join(logPrefix, fileName)
132+
133+
return prefixedLogger(h.runner.logger, fileName)
134+
}

pkg/cmd/roachtest/roachtestutil/mixedversion/mixedversion.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -653,6 +653,7 @@ func (s restartWithNewBinaryStep) Description() string {
653653
func (s restartWithNewBinaryStep) Run(
654654
ctx context.Context, l *logger.Logger, c cluster.Cluster, h *Helper,
655655
) error {
656+
h.ExpectDeath()
656657
return clusterupgrade.RestartNodesWithNewBinary(
657658
ctx,
658659
s.rt,

0 commit comments

Comments
 (0)