Skip to content

Commit adfec3c

Browse files
committed
roachtest: add helper to retry cluster settings
We occasionally see cluster setting statements time out due to overloaded clusters (e.g. mutators in mixed version tests, post test assertions). This change adds a small helper to retry those cluster setting statements. Fixes: #143791
1 parent 1c99d43 commit adfec3c

File tree

5 files changed

+78
-9
lines changed

5 files changed

+78
-9
lines changed

pkg/cmd/roachtest/roachtestutil/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ go_library(
3737
"//pkg/util/httputil",
3838
"//pkg/util/humanizeutil",
3939
"//pkg/util/protoutil",
40+
"//pkg/util/retry",
4041
"//pkg/util/syncutil",
4142
"//pkg/util/timeutil",
4243
"//pkg/workload/histogram/exporter",

pkg/cmd/roachtest/roachtestutil/mixedversion/helper.go

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,14 @@ import (
1515
"sync/atomic"
1616

1717
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
18+
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil"
1819
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade"
1920
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/task"
2021
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
2122
"github.com/cockroachdb/cockroach/pkg/roachpb"
2223
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
2324
"github.com/cockroachdb/cockroach/pkg/testutils/release"
25+
"github.com/cockroachdb/cockroach/pkg/util/retry"
2426
"github.com/cockroachdb/errors"
2527
)
2628

@@ -152,6 +154,22 @@ func (s *Service) ExecWithGateway(
152154
return err
153155
}
154156

157+
func (s *Service) ExecWithRetry(
158+
rng *rand.Rand,
159+
nodes option.NodeListOption,
160+
retryOpts retry.Options,
161+
query string,
162+
args ...interface{},
163+
) error {
164+
db, err := s.prepareQuery(rng, nodes, query, args...)
165+
if err != nil {
166+
return err
167+
}
168+
169+
_, err = roachtestutil.ExecWithRetry(s.ctx, s.stepLogger, db, retryOpts, query, args...)
170+
return err
171+
}
172+
155173
func (s *Service) ClusterVersion(rng *rand.Rand) (roachpb.Version, error) {
156174
if s.Finalizing {
157175
n, db := s.RandomDB(rng)
@@ -249,6 +267,14 @@ func (h *Helper) ExecWithGateway(
249267
return h.DefaultService().ExecWithGateway(rng, nodes, query, args...)
250268
}
251269

270+
// ExecWithRetry is like ExecWithGateway, but retries the execution of
271+
// the statement on errors, using the retry options provided.
272+
func (h *Helper) ExecWithRetry(
273+
rng *rand.Rand, nodes option.NodeListOption, query string, args ...interface{},
274+
) error {
275+
return h.DefaultService().ExecWithGateway(rng, nodes, query, args...)
276+
}
277+
252278
// defaultTaskOptions returns the default options that are passed to all tasks
253279
// started by the helper.
254280
func (h *Helper) defaultTaskOptions() []task.Option {

pkg/cmd/roachtest/roachtestutil/mixedversion/steps.go

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -311,7 +311,9 @@ func (s preserveDowngradeOptionStep) Run(
311311
return err
312312
}
313313

314-
return service.Exec(rng, "SET CLUSTER SETTING cluster.preserve_downgrade_option = $1", bv.String())
314+
return service.ExecWithRetry(rng, service.Descriptor.Nodes, roachtestutil.ClusterSettingRetryOpts,
315+
"SET CLUSTER SETTING cluster.preserve_downgrade_option = $1", bv.String(),
316+
)
315317
}
316318

317319
func (s preserveDowngradeOptionStep) ConcurrencyDisabled() bool {
@@ -408,8 +410,10 @@ func (s allowUpgradeStep) Description() string {
408410
func (s allowUpgradeStep) Run(
409411
ctx context.Context, l *logger.Logger, rng *rand.Rand, h *Helper,
410412
) error {
411-
return serviceByName(h, s.virtualClusterName).Exec(
412-
rng, "RESET CLUSTER SETTING cluster.preserve_downgrade_option",
413+
service := serviceByName(h, s.virtualClusterName)
414+
return service.ExecWithRetry(
415+
rng, service.Descriptor.Nodes, roachtestutil.ClusterSettingRetryOpts,
416+
"RESET CLUSTER SETTING cluster.preserve_downgrade_option",
413417
)
414418
}
415419

@@ -515,8 +519,8 @@ func (s setClusterSettingStep) Run(
515519
args = []interface{}{val}
516520
}
517521

518-
return serviceByName(h, serviceName).ExecWithGateway(
519-
rng, nodesRunningAtLeast(s.virtualClusterName, s.minVersion, h), stmt, args...,
522+
return serviceByName(h, serviceName).ExecWithRetry(
523+
rng, nodesRunningAtLeast(s.virtualClusterName, s.minVersion, h), roachtestutil.ClusterSettingRetryOpts, stmt, args...,
520524
)
521525
}
522526

@@ -563,7 +567,9 @@ func (s setClusterVersionStep) Run(
563567
}
564568

565569
l.Printf("setting cluster version to '%s'", binaryVersion)
566-
return service.Exec(rng, "SET CLUSTER SETTING version = $1", binaryVersion)
570+
return service.ExecWithRetry(rng, service.Descriptor.Nodes, roachtestutil.ClusterSettingRetryOpts,
571+
"SET CLUSTER SETTING version = $1", binaryVersion,
572+
)
567573
}
568574

569575
func (s setClusterVersionStep) ConcurrencyDisabled() bool {
@@ -587,8 +593,8 @@ func (s resetClusterSettingStep) Run(
587593
ctx context.Context, l *logger.Logger, rng *rand.Rand, h *Helper,
588594
) error {
589595
stmt := fmt.Sprintf("RESET CLUSTER SETTING %s", s.name)
590-
return serviceByName(h, s.virtualClusterName).ExecWithGateway(
591-
rng, nodesRunningAtLeast(s.virtualClusterName, s.minVersion, h), stmt,
596+
return serviceByName(h, s.virtualClusterName).ExecWithRetry(
597+
rng, nodesRunningAtLeast(s.virtualClusterName, s.minVersion, h), roachtestutil.ClusterSettingRetryOpts, stmt,
592598
)
593599
}
594600

pkg/cmd/roachtest/roachtestutil/utils.go

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ import (
2727
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
2828
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
2929
"github.com/cockroachdb/cockroach/pkg/util"
30+
"github.com/cockroachdb/cockroach/pkg/util/retry"
3031
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
3132
"github.com/cockroachdb/errors"
3233
"github.com/stretchr/testify/require"
@@ -336,3 +337,38 @@ func SimulateMultiRegionCluster(
336337

337338
return cleanupFunc, nil
338339
}
340+
341+
// ExecWithRetry executes the given SQL statement with the specified retry logic.
342+
func ExecWithRetry(
343+
ctx context.Context,
344+
l *logger.Logger,
345+
db *gosql.DB,
346+
retryOpts retry.Options,
347+
query string,
348+
args ...any,
349+
) (gosql.Result, error) {
350+
var result gosql.Result
351+
err := retryOpts.Do(ctx, func(ctx context.Context) error {
352+
var err error
353+
result, err = db.ExecContext(ctx, query, args...)
354+
if err != nil {
355+
l.Printf("%s failed (retrying): %v", query, err)
356+
return err
357+
}
358+
return nil
359+
})
360+
361+
return result, err
362+
}
363+
364+
// ClusterSettingRetryOpts are retry options intended for cluster setting operations.
365+
//
366+
// We use relatively high backoff parameters with the assumption that:
367+
// 1. If we fail, it's likely due to cluster overload, and we want to give
368+
// the cluster adequate time to recover.
369+
// 2. Setting a cluster setting in roachtest is not latency sensitive.
370+
var ClusterSettingRetryOpts = retry.Options{
371+
InitialBackoff: 3 * time.Second,
372+
MaxBackoff: 5 * time.Second,
373+
MaxRetries: 5,
374+
}

pkg/cmd/roachtest/roachtestutil/validation_check.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ func CheckReplicaDivergenceOnDB(ctx context.Context, l *logger.Logger, db *gosql
4242
defer cancel()
4343

4444
// Speed up consistency checks. The test is done, so let's go full throttle.
45-
_, err := db.ExecContext(ctx, "SET CLUSTER SETTING server.consistency_check.max_rate = '1GB'")
45+
_, err := ExecWithRetry(ctx, l, db, ClusterSettingRetryOpts, "SET CLUSTER SETTING server.consistency_check.max_rate = '1GB'")
4646
if err != nil {
4747
return errors.Wrap(err, "unable to set 'server.consistency_check.max_rate'")
4848
}

0 commit comments

Comments
 (0)