Skip to content

Commit 400105c

Browse files
craig[bot]Renato Costa
andcommitted
107529: roachtest: make upgrade timeouts configurable r=herkolategan a=renatolabs This commit makes the timeout applied when waiting for a cluster upgrade configurable. Previously, a fixed 10 minute timeout would apply regardless of the test. A new option is added to `mixedversion` tests to specify custom upgrade timeouts; this is especially useful in tests that operate on larger volumes of data and upgrades that may include expensive migrations. Informs: cockroachdb#107414 Release note: None Co-authored-by: Renato Costa <[email protected]>
2 parents c566dbc + de9539a commit 400105c

File tree

7 files changed

+109
-16
lines changed

7 files changed

+109
-16
lines changed

pkg/cmd/roachtest/roachtestutil/clusterupgrade/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,5 +14,6 @@ go_library(
1414
"//pkg/roachprod/logger",
1515
"//pkg/util/retry",
1616
"//pkg/util/version",
17+
"@com_github_cockroachdb_errors//:errors",
1718
],
1819
)

pkg/cmd/roachtest/roachtestutil/clusterupgrade/clusterupgrade.go

Lines changed: 46 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ import (
2727
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
2828
"github.com/cockroachdb/cockroach/pkg/util/retry"
2929
"github.com/cockroachdb/cockroach/pkg/util/version"
30+
"github.com/cockroachdb/errors"
3031
)
3132

3233
const (
@@ -233,34 +234,71 @@ func RestartNodesWithNewBinary(
233234
return nil
234235
}
235236

237+
// DefaultUpgradeTimeout is the default timeout used when waiting for
238+
// an upgrade to finish (i.e., for all migrations to run and for the
239+
// cluster version to propagate). This timeout should be sufficient
240+
// for simple tests where there isn't a lot of data; in other
241+
// situations, a custom timeout can be passed to
242+
// `WaitForClusterUpgrade`.
243+
var DefaultUpgradeTimeout = 10 * time.Minute
244+
236245
// WaitForClusterUpgrade waits for the cluster version to reach the
237246
// first node's binary version. This function should only be called if
238247
// every node in the cluster has been restarted to run the same binary
239248
// version. We rely on the cluster's internal self-upgrading
240249
// mechanism to update the underlying cluster version.
241250
func WaitForClusterUpgrade(
242-
ctx context.Context, l *logger.Logger, nodes option.NodeListOption, dbFunc func(int) *gosql.DB,
251+
ctx context.Context,
252+
l *logger.Logger,
253+
nodes option.NodeListOption,
254+
dbFunc func(int) *gosql.DB,
255+
timeout time.Duration,
243256
) error {
244-
newVersion, err := BinaryVersion(dbFunc(nodes[0]))
257+
firstNode := nodes[0]
258+
newVersion, err := BinaryVersion(dbFunc(firstNode))
245259
if err != nil {
246260
return err
247261
}
248262

249-
l.Printf("waiting for cluster to auto-upgrade to %s", newVersion)
250-
for _, node := range nodes {
251-
err := retry.ForDuration(10*time.Minute, func() error {
263+
// waitForUpgrade will wait for the given `node` to have the
264+
// expected cluster version within the given timeout.
265+
waitForUpgrade := func(node int, timeout time.Duration) error {
266+
var latestVersion roachpb.Version
267+
err := retry.ForDuration(timeout, func() error {
252268
currentVersion, err := ClusterVersion(ctx, dbFunc(node))
253269
if err != nil {
254270
return err
255271
}
272+
273+
latestVersion = currentVersion
256274
if currentVersion != newVersion {
257-
return fmt.Errorf("%d: expected cluster version %s, got %s", node, newVersion, currentVersion)
275+
return fmt.Errorf("not upgraded yet")
258276
}
259-
l.Printf("%s: acked by n%d", currentVersion, node)
260277
return nil
261278
})
262279
if err != nil {
263-
return err
280+
return errors.Wrapf(err,
281+
"timed out after %s: expected n%d to be at cluster version %s, but is still at %s",
282+
timeout, node, newVersion, latestVersion,
283+
)
284+
}
285+
l.Printf("%s: acked by n%d", newVersion, node)
286+
return nil
287+
}
288+
289+
l.Printf("waiting for cluster to auto-upgrade to %s for %s", newVersion, timeout)
290+
if err := waitForUpgrade(firstNode, timeout); err != nil {
291+
return err
292+
}
293+
294+
// Wait for `propagationTimeout` for all other nodes to also
295+
// acknowledge the same cluster version as the first node. This
296+
// should happen much faster, as migrations should already have
297+
// finished at this point.
298+
propagationTimeout := 3 * time.Minute
299+
for _, node := range nodes[1:] {
300+
if err := waitForUpgrade(node, propagationTimeout); err != nil {
301+
return fmt.Errorf("n%d is already at %s: %w", firstNode, newVersion, err)
264302
}
265303
}
266304

pkg/cmd/roachtest/roachtestutil/mixedversion/mixedversion.go

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,7 @@ var (
138138
// We use fixtures more often than not as they are more likely to
139139
// detect bugs, especially in migrations.
140140
useFixturesProbability: 0.7,
141+
upgradeTimeout: clusterupgrade.DefaultUpgradeTimeout,
141142
}
142143
)
143144

@@ -232,9 +233,10 @@ type (
232233
}
233234

234235
// testOptions contains some options that can be changed by the user
235-
// that expose some control over the generated test plan.
236+
// that expose some control over the generated test plan and behaviour.
236237
testOptions struct {
237238
useFixturesProbability float64
239+
upgradeTimeout time.Duration
238240
}
239241

240242
customOption func(*testOptions)
@@ -298,6 +300,14 @@ func AlwaysUseFixtures(opts *testOptions) {
298300
opts.useFixturesProbability = 1
299301
}
300302

303+
// UpgradeTimeout allows test authors to provide a different timeout
304+
// to apply when waiting for an upgrade to finish.
305+
func UpgradeTimeout(timeout time.Duration) customOption {
306+
return func(opts *testOptions) {
307+
opts.upgradeTimeout = timeout
308+
}
309+
}
310+
301311
// NewTest creates a Test struct that users can use to create and run
302312
// a mixed-version roachtest.
303313
func NewTest(
@@ -605,8 +615,9 @@ func (s uploadCurrentVersionStep) Run(
605615
// the cluster and equal to the binary version of the first node in
606616
// the `nodes` field.
607617
type waitForStableClusterVersionStep struct {
608-
id int
609-
nodes option.NodeListOption
618+
id int
619+
nodes option.NodeListOption
620+
timeout time.Duration
610621
}
611622

612623
func (s waitForStableClusterVersionStep) ID() int { return s.id }
@@ -622,7 +633,7 @@ func (s waitForStableClusterVersionStep) Description() string {
622633
func (s waitForStableClusterVersionStep) Run(
623634
ctx context.Context, l *logger.Logger, c cluster.Cluster, h *Helper,
624635
) error {
625-
return clusterupgrade.WaitForClusterUpgrade(ctx, l, s.nodes, h.Connect)
636+
return clusterupgrade.WaitForClusterUpgrade(ctx, l, s.nodes, h.Connect, s.timeout)
626637
}
627638

628639
// preserveDowngradeOptionStep sets the `preserve_downgrade_option`

pkg/cmd/roachtest/roachtestutil/mixedversion/planner.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ func (p *testPlanner) initSteps() []testStep {
133133
return append(
134134
append(steps,
135135
uploadCurrentVersionStep{id: p.nextID(), rt: p.rt, crdbNodes: p.crdbNodes, dest: CurrentCockroachPath},
136-
waitForStableClusterVersionStep{id: p.nextID(), nodes: p.crdbNodes},
136+
waitForStableClusterVersionStep{id: p.nextID(), nodes: p.crdbNodes, timeout: p.options.upgradeTimeout},
137137
preserveDowngradeOptionStep{id: p.nextID(), prng: p.newRNG(), crdbNodes: p.crdbNodes},
138138
),
139139
p.hooks.StartupSteps(p.nextID, p.initialContext())...,
@@ -146,7 +146,7 @@ func (p *testPlanner) initSteps() []testStep {
146146
// user may have provided.
147147
func (p *testPlanner) finalSteps() []testStep {
148148
return append([]testStep{
149-
waitForStableClusterVersionStep{id: p.nextID(), nodes: p.crdbNodes},
149+
waitForStableClusterVersionStep{id: p.nextID(), nodes: p.crdbNodes, timeout: p.options.upgradeTimeout},
150150
}, p.hooks.AfterUpgradeFinalizedSteps(p.nextID, p.finalContext(false /* finalizing */))...)
151151
}
152152

pkg/cmd/roachtest/roachtestutil/mixedversion/planner_test.go

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import (
1616
"io"
1717
"math/rand"
1818
"testing"
19+
"time"
1920

2021
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster"
2122
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
@@ -258,6 +259,39 @@ func Test_startClusterID(t *testing.T) {
258259
require.Equal(t, 2, plan.startClusterID)
259260
}
260261

262+
// Test_upgradeTimeout tests the behaviour of upgrade timeouts in
263+
// mixedversion tests. If no custom value is passed, the default
264+
// timeout in the clusterupgrade package is used; otherwise, the
265+
// custom value is enforced.
266+
func Test_upgradeTimeout(t *testing.T) {
267+
findUpgradeWaitSteps := func(plan *TestPlan) []waitForStableClusterVersionStep {
268+
var steps []waitForStableClusterVersionStep
269+
for _, s := range plan.steps {
270+
if step, isUpgrade := s.(waitForStableClusterVersionStep); isUpgrade {
271+
steps = append(steps, step)
272+
}
273+
}
274+
if len(steps) == 0 {
275+
require.Fail(t, "could not find any waitForStableClusterVersionStep in the plan")
276+
}
277+
return steps
278+
}
279+
280+
assertTimeout := func(expectedTimeout time.Duration, opts ...customOption) {
281+
mvt := newTest(opts...)
282+
plan, err := mvt.plan()
283+
require.NoError(t, err)
284+
waitUpgrades := findUpgradeWaitSteps(plan)
285+
286+
for _, s := range waitUpgrades {
287+
require.Equal(t, expectedTimeout, s.timeout)
288+
}
289+
}
290+
291+
assertTimeout(10 * time.Minute) // using default settings, the default timeout applies
292+
assertTimeout(30*time.Minute, UpgradeTimeout(30*time.Minute)) // custom timeout applies.
293+
}
294+
261295
func newTest(options ...customOption) *Test {
262296
testOptions := defaultTestOptions
263297
for _, fn := range options {

pkg/cmd/roachtest/tests/mixed_version_backup.go

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2101,7 +2101,14 @@ func registerBackupMixedVersion(r registry.Registry) {
21012101

21022102
roachNodes := c.Range(1, c.Spec().NodeCount-1)
21032103
workloadNode := c.Node(c.Spec().NodeCount)
2104-
mvt := mixedversion.NewTest(ctx, t, t.L(), c, roachNodes)
2104+
mvt := mixedversion.NewTest(
2105+
ctx, t, t.L(), c, roachNodes,
2106+
// We use a longer upgrade timeout in this test to give the
2107+
// migrations enough time to finish considering all the data
2108+
// that might exist in the cluster by the time the upgrade is
2109+
// attempted.
2110+
mixedversion.UpgradeTimeout(30*time.Minute),
2111+
)
21052112
testRNG := mvt.RNG()
21062113

21072114
uploadVersion(ctx, t, c, workloadNode, clusterupgrade.MainVersion)

pkg/cmd/roachtest/tests/versionupgrade.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -317,7 +317,9 @@ func allowAutoUpgradeStep(node int) versionStep {
317317
func waitForUpgradeStep(nodes option.NodeListOption) versionStep {
318318
return func(ctx context.Context, t test.Test, u *versionUpgradeTest) {
319319
dbFunc := func(node int) *gosql.DB { return u.conn(ctx, t, node) }
320-
if err := clusterupgrade.WaitForClusterUpgrade(ctx, t.L(), nodes, dbFunc); err != nil {
320+
if err := clusterupgrade.WaitForClusterUpgrade(
321+
ctx, t.L(), nodes, dbFunc, clusterupgrade.DefaultUpgradeTimeout,
322+
); err != nil {
321323
t.Fatal(err)
322324
}
323325
}

0 commit comments

Comments
 (0)