Skip to content

Commit f75ebe7

Browse files
author
Renato Costa
committed
roachtest: monitor nodes in mixedversion
This updates the `mixedversion` runner to monitor the status of the cockroach processes on the cluster during a mixed-version test. This allows the test to fail immediately if a node crashes unexpectedly. Previously, if a node crashed, the test would only fail if the crash resulted in an assertion failure. There is a chance that this would not happen because the framework could decide to restart the node shortly after its crash. With this change, an unexpected node death causes the test to fail immediately. This commit also exposes the ability for test authors to use `ExpectDeath()` in their tests, in case the test performs its own restarts or chaos events. Epic: CRDB-19321 Release note: None
1 parent bf86880 commit f75ebe7

File tree

5 files changed

+198
-52
lines changed

5 files changed

+198
-52
lines changed

pkg/cmd/roachtest/monitor.go

Lines changed: 22 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -31,11 +31,13 @@ type monitorImpl struct {
3131
Failed() bool
3232
WorkerStatus(...interface{})
3333
}
34-
l *logger.Logger
35-
nodes string
36-
ctx context.Context
37-
cancel func()
38-
g *errgroup.Group
34+
l *logger.Logger
35+
nodes string
36+
ctx context.Context
37+
cancel func()
38+
g *errgroup.Group
39+
40+
numTasks int32 // atomically
3941
expDeaths int32 // atomically
4042
}
4143

@@ -79,6 +81,8 @@ func (m *monitorImpl) ResetDeaths() {
7981
var errTestFatal = errors.New("t.Fatal() was called")
8082

8183
func (m *monitorImpl) Go(fn func(context.Context) error) {
84+
atomic.AddInt32(&m.numTasks, 1)
85+
8286
m.g.Go(func() (err error) {
8387
defer func() {
8488
r := recover()
@@ -170,15 +174,21 @@ func (m *monitorImpl) wait() error {
170174
}
171175

172176
// 1. The first goroutine waits for the worker errgroup to exit.
177+
// Note that this only happens if the caller created at least one
178+
// task for the monitor. This check enables the roachtest monitor to
179+
// be used in cases where we just want to monitor events in the
180+
// cluster without running any background tasks through the monitor.
173181
var wg sync.WaitGroup
174-
wg.Add(1)
175-
go func() {
176-
defer func() {
177-
m.cancel()
178-
wg.Done()
182+
if atomic.LoadInt32(&m.numTasks) > 0 {
183+
wg.Add(1)
184+
go func() {
185+
defer func() {
186+
m.cancel()
187+
wg.Done()
188+
}()
189+
setErr(errors.Wrap(m.g.Wait(), "function passed to monitor.Go failed"))
179190
}()
180-
setErr(errors.Wrap(m.g.Wait(), "function passed to monitor.Go failed"))
181-
}()
191+
}
182192

183193
// 2. The second goroutine reads from the monitoring channel, watching for any
184194
// unexpected death events.

pkg/cmd/roachtest/roachtestutil/mixedversion/mixedversion.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -653,6 +653,7 @@ func (s restartWithNewBinaryStep) Description() string {
653653
func (s restartWithNewBinaryStep) Run(
654654
ctx context.Context, l *logger.Logger, c cluster.Cluster, h *Helper,
655655
) error {
656+
h.ExpectDeath()
656657
return clusterupgrade.RestartNodesWithNewBinary(
657658
ctx,
658659
s.rt,

0 commit comments

Comments
 (0)