knz
diff --git a/‎pkg/cmd/roachprod/main.go‎
Lines changed: 5 additions & 11 deletions b/‎pkg/cmd/roachprod/main.go‎
Lines changed: 5 additions & 11 deletions
diff --git a/‎pkg/cmd/roachtest/cluster.go‎
Lines changed: 5 additions & 7 deletions b/‎pkg/cmd/roachtest/cluster.go‎
Lines changed: 5 additions & 7 deletions
diff --git a/‎pkg/cmd/roachtest/monitor.go‎
Lines changed: 35 additions & 30 deletions b/‎pkg/cmd/roachtest/monitor.go‎
Lines changed: 35 additions & 30 deletions
diff --git a/‎pkg/cmd/roachtest/roachtestutil/mixedversion/BUILD.bazel‎
Lines changed: 1 addition & 0 deletions b/‎pkg/cmd/roachtest/roachtestutil/mixedversion/BUILD.bazel‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎pkg/cmd/roachtest/roachtestutil/mixedversion/README.md‎
Lines changed: 13 additions & 1 deletion b/‎pkg/cmd/roachtest/roachtestutil/mixedversion/README.md‎
Lines changed: 13 additions & 1 deletion
diff --git a/‎pkg/cmd/roachtest/roachtestutil/mixedversion/helper.go‎
Lines changed: 134 additions & 0 deletions b/‎pkg/cmd/roachtest/roachtestutil/mixedversion/helper.go‎
Lines changed: 134 additions & 0 deletions
diff --git a/‎pkg/cmd/roachtest/roachtestutil/mixedversion/mixedversion.go‎
Lines changed: 1 addition & 0 deletions b/‎pkg/cmd/roachtest/roachtestutil/mixedversion/mixedversion.go‎
Lines changed: 1 addition & 0 deletions
@@ -650,21 +650,15 @@ of nodes, outputting a line whenever a change is detected:
 `,
 	Args: cobra.ExactArgs(1),
 	Run: wrap(func(cmd *cobra.Command, args []string) error {
-		messages, err := roachprod.Monitor(context.Background(), config.Logger, args[0], monitorOpts)
+		eventChan, err := roachprod.Monitor(context.Background(), config.Logger, args[0], monitorOpts)
 		if err != nil {
 			return err
 		}
-		for msg := range messages {
-			if msg.Err != nil {
-				msg.Msg += "error: " + msg.Err.Error()
-			}
-			thisError := errors.Newf("%d: %s", msg.Node, msg.Msg)
-			if msg.Err != nil || strings.Contains(msg.Msg, "dead") {
-				err = errors.CombineErrors(err, thisError)
-			}
-			fmt.Println(thisError.Error())
+		for info := range eventChan {
+			fmt.Println(info.String())
 		}
-		return err
+
+		return nil
 	}),
 }
 
 
@@ -1528,22 +1528,20 @@ func (c *clusterImpl) assertNoDeadNode(ctx context.Context, t test.Test) error {
 	}
 
 	t.L().Printf("checking for dead nodes")
-	ch, err := roachprod.Monitor(ctx, t.L(), c.name, install.MonitorOpts{OneShot: true, IgnoreEmptyNodes: true})
+	eventsCh, err := roachprod.Monitor(ctx, t.L(), c.name, install.MonitorOpts{OneShot: true, IgnoreEmptyNodes: true})
 
 	// An error here means there was a problem initialising a SyncedCluster.
 	if err != nil {
 		return err
 	}
 
 	deadNodes := 0
-	for n := range ch {
-		// If there's an error, it means either that the monitor command failed
-		// completely, or that it found a dead node worth complaining about.
-		if n.Err != nil || strings.HasPrefix(n.Msg, "dead") {
+	for info := range eventsCh {
+		t.L().Printf("%s", info)
+
+		if _, isDeath := info.Event.(install.MonitorNodeDead); isDeath {
 			deadNodes++
 		}
-
-		t.L().Printf("n%d: err=%v,msg=%s", n.Node, n.Err, n.Msg)
 	}
 
 	if deadNodes > 0 {
 
@@ -13,7 +13,6 @@ package main
 import (
 	"context"
 	"fmt"
-	"strings"
 	"sync"
 	"sync/atomic"
 
@@ -32,11 +31,13 @@ type monitorImpl struct {
 		Failed() bool
 		WorkerStatus(...interface{})
 	}
-	l         *logger.Logger
-	nodes     string
-	ctx       context.Context
-	cancel    func()
-	g         *errgroup.Group
+	l      *logger.Logger
+	nodes  string
+	ctx    context.Context
+	cancel func()
+	g      *errgroup.Group
+
+	numTasks  int32 // atomically
 	expDeaths int32 // atomically
 }
 
@@ -80,6 +81,8 @@ func (m *monitorImpl) ResetDeaths() {
 var errTestFatal = errors.New("t.Fatal() was called")
 
 func (m *monitorImpl) Go(fn func(context.Context) error) {
+	atomic.AddInt32(&m.numTasks, 1)
+
 	m.g.Go(func() (err error) {
 		defer func() {
 			r := recover()
@@ -171,15 +174,21 @@ func (m *monitorImpl) wait() error {
 	}
 
 	// 1. The first goroutine waits for the worker errgroup to exit.
+	// Note that this only happens if the caller created at least one
+	// task for the monitor. This check enables the roachtest monitor to
+	// be used in cases where we just want to monitor events in the
+	// cluster without running any background tasks through the monitor.
 	var wg sync.WaitGroup
-	wg.Add(1)
-	go func() {
-		defer func() {
-			m.cancel()
-			wg.Done()
+	if atomic.LoadInt32(&m.numTasks) > 0 {
+		wg.Add(1)
+		go func() {
+			defer func() {
+				m.cancel()
+				wg.Done()
+			}()
+			setErr(errors.Wrap(m.g.Wait(), "function passed to monitor.Go failed"))
 		}()
-		setErr(errors.Wrap(m.g.Wait(), "monitor task failed"))
-	}()
+	}
 
 	// 2. The second goroutine reads from the monitoring channel, watching for any
 	// unexpected death events.
@@ -190,28 +199,24 @@ func (m *monitorImpl) wait() error {
 			wg.Done()
 		}()
 
-		messagesChannel, err := roachprod.Monitor(m.ctx, m.l, m.nodes, install.MonitorOpts{})
+		eventsCh, err := roachprod.Monitor(m.ctx, m.l, m.nodes, install.MonitorOpts{})
 		if err != nil {
 			setErr(errors.Wrap(err, "monitor command failure"))
 			return
 		}
-		var monitorErr error
-		for msg := range messagesChannel {
-			if msg.Err != nil {
-				msg.Msg += "error: " + msg.Err.Error()
-			}
-			thisError := errors.Newf("%d: %s", msg.Node, msg.Msg)
-			if msg.Err != nil || strings.Contains(msg.Msg, "dead") {
-				monitorErr = errors.CombineErrors(monitorErr, thisError)
+
+		for info := range eventsCh {
+			_, isDeath := info.Event.(install.MonitorNodeDead)
+			isExpectedDeath := isDeath && atomic.AddInt32(&m.expDeaths, -1) >= 0
+			var expectedDeathStr string
+			if isExpectedDeath {
+				expectedDeathStr = ": expected"
 			}
-			var id int
-			var s string
-			newMsg := thisError.Error()
-			if n, _ := fmt.Sscanf(newMsg, "%d: %s", &id, &s); n == 2 {
-				if strings.Contains(s, "dead") && atomic.AddInt32(&m.expDeaths, -1) < 0 {
-					setErr(errors.Wrap(fmt.Errorf("unexpected node event: %s", newMsg), "monitor command failure"))
-					return
-				}
+			m.l.Printf("Monitor event: %s%s", info, expectedDeathStr)
+
+			if isDeath && !isExpectedDeath {
+				setErr(fmt.Errorf("unexpected node event: %s", info))
+				return
 			}
 		}
 	}()
 
@@ -3,6 +3,7 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
 go_library(
     name = "mixedversion",
     srcs = [
+        "helper.go",
         "mixedversion.go",
         "planner.go",
         "runner.go",
 
@@ -255,6 +255,18 @@ for {
 }
 ```
 
+#### Use test helpers
+
+Every test hook receives as parameter a `*mixedversion.Helper` instance. This helper struct contains convenience functions that make it easy to perform certain operations while automatically implementing some of the best practices described here. For a full list of helpers, check out the [`helper.go`](https://github.com/cockroachdb/cockroach/blob/master/pkg/cmd/roachtest/roachtestutil/mixedversion/helper.go) file in the framework's source tree.
+
+##### Aside: background functions
+
+One particularly important helper functionality to highlight relates to the management of functions that need to run in the background during a test. Typically, roachtests are expected to use a [monitor](https://github.com/cockroachdb/cockroach/blob/master/pkg/cmd/roachtest/monitor.go) for that purpose; not only does the monitor protect from panics inadvertently crashing the test, it also preempts their execution (via context cancelation) if a node dies unexpectedly.
+
+However, if a mixedversion test needs to perform a task in the background, they **must not use the roachtest monitor**. The reason for this is that mixedversion tests are [randomized](#embrace-randomness); as such, user-created monitors would not be able to predict when a death is expected since it does not know, by design, when a node restarts.
+
+To run functions in the background, use the API provided by the mixedversion framework. Long running tasks that run throughout the test can be defined with `(*mixedversion.Test).BackgroundFunc`. If a test hook needs to perform a task in the background, the `*mixedversion.Helper` instance has a `Background` function that can be used for that purpose. As usual, check the documentation for the public API for more details on usage and behaviour of these functions.
+
 #### Log progress
 
 Logging events in the test as it runs can make debugging failures a lot easier. All hooks passed to the `mixedversion` API receive a `*logger.Logger` instance as parameter. **Functions should use that logger instead of the test logger** (`t.L()`). Doing so has two main advantages:
@@ -398,6 +410,6 @@ $ COCKROACH_RANDOM_SEED=7357315251706229449 roachtest run --versions-binary-over
 
 ### Final Notes
 
-* This is a high level document and does not include API documentation. The `mixedversion` package includes a lot of documentation in the form of source code comments, and that should be the source of truth when it comes to finding out what functionality is available and how to use it.
+* This is a high level document and does not include API documentation. The `mixedversion` package includes a lot of documentation in the form of source code comments, and that should be the source of truth when it comes to finding out what functionality is available and how to use it. Most of the public API is in the [`mixedversion.go`](https://github.com/cockroachdb/cockroach/blob/master/pkg/cmd/roachtest/roachtestutil/mixedversion/mixedversion.go) and [`helper.go`](https://github.com/cockroachdb/cockroach/blob/master/pkg/cmd/roachtest/roachtestutil/mixedversion/helper.go) files.
 * For a simple application of the `mixedversion` framework, check out the `acceptance/version-upgrade` roachtest. For a more complex example, see `backup-restore/mixed-version`.
 * For any other questions, please reach out to `#test-eng`!
@@ -0,0 +1,134 @@
+// Copyright 2023 The Cockroach Authors.
+//
+// Use of this software is governed by the Business Source License
+// included in the file licenses/BSL.txt.
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0, included in the file
+// licenses/APL.txt.
+
+package mixedversion
+
+import (
+	"context"
+	gosql "database/sql"
+	"fmt"
+	"math/rand"
+	"path"
+	"strings"
+	"sync/atomic"
+
+	"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
+	"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
+)
+
+func (h *Helper) RandomNode(prng *rand.Rand, nodes option.NodeListOption) int {
+	return nodes[prng.Intn(len(nodes))]
+}
+
+// RandomDB returns a (nodeID, connection) tuple for a randomly picked
+// cockroach node according to the parameters passed.
+func (h *Helper) RandomDB(prng *rand.Rand, nodes option.NodeListOption) (int, *gosql.DB) {
+	node := h.RandomNode(prng, nodes)
+	return node, h.Connect(node)
+}
+
+// QueryRow performs `db.QueryRowContext` on a randomly picked
+// database node. The query and the node picked are logged in the logs
+// of the step that calls this function.
+func (h *Helper) QueryRow(rng *rand.Rand, query string, args ...interface{}) *gosql.Row {
+	node, db := h.RandomDB(rng, h.runner.crdbNodes)
+	h.stepLogger.Printf("running SQL statement:\n%s\nArgs: %v\nNode: %d", query, args, node)
+	return db.QueryRowContext(h.ctx, query, args...)
+}
+
+// Exec performs `db.ExecContext` on a randomly picked database node.
+// The query and the node picked are logged in the logs of the step
+// that calls this function.
+func (h *Helper) Exec(rng *rand.Rand, query string, args ...interface{}) error {
+	node, db := h.RandomDB(rng, h.runner.crdbNodes)
+	h.stepLogger.Printf("running SQL statement:\n%s\nArgs: %v\nNode: %d", query, args, node)
+	_, err := db.ExecContext(h.ctx, query, args...)
+	return err
+}
+
+func (h *Helper) Connect(node int) *gosql.DB {
+	return h.runner.conn(node)
+}
+
+// SetContext should be called by steps that need access to the test
+// context, as that is only visible to them.
+func (h *Helper) SetContext(c *Context) {
+	h.testContext = c
+}
+
+// Context returns the test context associated with a certain step. It
+// is made available for user-functions (see runHookStep).
+func (h *Helper) Context() *Context {
+	return h.testContext
+}
+
+// Background allows test authors to create functions that run in the
+// background in mixed-version hooks.
+func (h *Helper) Background(
+	name string, fn func(context.Context, *logger.Logger) error,
+) context.CancelFunc {
+	return h.runner.background.Start(name, func(ctx context.Context) error {
+		bgLogger, err := h.loggerFor(name)
+		if err != nil {
+			return fmt.Errorf("failed to create logger for background function %q: %w", name, err)
+		}
+
+		err = panicAsError(bgLogger, func() error { return fn(ctx, bgLogger) })
+		if err != nil {
+			if isContextCanceled(ctx) {
+				return err
+			}
+
+			desc := fmt.Sprintf("error in background function %s: %s", name, err)
+			return h.runner.testFailure(desc, bgLogger)
+		}
+
+		return nil
+	})
+}
+
+// BackgroundCommand has the same semantics of `Background()`; the
+// command passed will run and the test will fail if the command is
+// not successful.
+func (h *Helper) BackgroundCommand(cmd string, nodes option.NodeListOption) context.CancelFunc {
+	desc := fmt.Sprintf("run command: %q", cmd)
+	return h.Background(desc, func(ctx context.Context, l *logger.Logger) error {
+		l.Printf("running command `%s` on nodes %v in the background", cmd, nodes)
+		return h.runner.cluster.RunE(ctx, nodes, cmd)
+	})
+}
+
+// ExpectDeath alerts the testing infrastructure that a node is
+// expected to die. Regular restarts as part of the mixedversion
+// testing are already taken into account. This function should only
+// be used by tests that perform their own node restarts or chaos
+// events.
+func (h *Helper) ExpectDeath() {
+	h.ExpectDeaths(1)
+}
+
+// ExpectDeaths is the general version of `ExpectDeath()`.
+func (h *Helper) ExpectDeaths(n int) {
+	h.runner.monitor.ExpectDeaths(n)
+}
+
+// loggerFor creates a logger instance to be used by background
+// functions (created by calling `Background` on the helper
+// instance). It is similar to the logger instances created for
+// mixed-version steps, but with the `background_` prefix.
+func (h *Helper) loggerFor(name string) (*logger.Logger, error) {
+	atomic.AddInt64(&h.bgCount, 1)
+
+	fileName := invalidChars.ReplaceAllString(strings.ToLower(name), "")
+	fileName = fmt.Sprintf("background_%s_%d", fileName, h.bgCount)
+	fileName = path.Join(logPrefix, fileName)
+
+	return prefixedLogger(h.runner.logger, fileName)
+}
@@ -653,6 +653,7 @@ func (s restartWithNewBinaryStep) Description() string {
 func (s restartWithNewBinaryStep) Run(
 	ctx context.Context, l *logger.Logger, c cluster.Cluster, h *Helper,
 ) error {
+	h.ExpectDeath()
 	return clusterupgrade.RestartNodesWithNewBinary(
 		ctx,
 		s.rt,
Original file line number	Diff line number	Diff line change
`@@ -1528,22 +1528,20 @@ func (c *clusterImpl) assertNoDeadNode(ctx context.Context, t test.Test) error {`
`1528`	`1528`	`}`
`1529`	`1529`
`1530`	`1530`	`t.L().Printf("checking for dead nodes")`
`1531`		`- ch, err := roachprod.Monitor(ctx, t.L(), c.name, install.MonitorOpts{OneShot: true, IgnoreEmptyNodes: true})`
	`1531`	`+ eventsCh, err := roachprod.Monitor(ctx, t.L(), c.name, install.MonitorOpts{OneShot: true, IgnoreEmptyNodes: true})`
`1532`	`1532`
`1533`	`1533`	`// An error here means there was a problem initialising a SyncedCluster.`
`1534`	`1534`	`if err != nil {`
`1535`	`1535`	`return err`
`1536`	`1536`	`}`
`1537`	`1537`
`1538`	`1538`	`deadNodes := 0`
`1539`		`- for n := range ch {`
`1540`		`- // If there's an error, it means either that the monitor command failed`
`1541`		`- // completely, or that it found a dead node worth complaining about.`
`1542`		`- if n.Err != nil \|\| strings.HasPrefix(n.Msg, "dead") {`
	`1539`	`+ for info := range eventsCh {`
	`1540`	`+ t.L().Printf("%s", info)`
	`1541`	`+`
	`1542`	`+ if _, isDeath := info.Event.(install.MonitorNodeDead); isDeath {`
`1543`	`1543`	`deadNodes++`
`1544`	`1544`	`}`
`1545`		`-`
`1546`		`- t.L().Printf("n%d: err=%v,msg=%s", n.Node, n.Err, n.Msg)`
`1547`	`1545`	`}`
`1548`	`1546`
`1549`	`1547`	`if deadNodes > 0 {`