Skip to content

Commit 0a22b4b

Browse files
committed
roachtest: retry fetching cluster and binary version
1 parent 46a42da commit 0a22b4b

File tree

8 files changed

+53
-21
lines changed

8 files changed

+53
-21
lines changed

pkg/cmd/roachtest/roachtestutil/clusterupgrade/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ go_library(
99
"//pkg/build",
1010
"//pkg/cmd/roachtest/cluster",
1111
"//pkg/cmd/roachtest/option",
12+
"//pkg/cmd/roachtest/roachtestutil",
1213
"//pkg/cmd/roachtest/test",
1314
"//pkg/roachpb",
1415
"//pkg/roachprod/install",

pkg/cmd/roachtest/roachtestutil/clusterupgrade/clusterupgrade.go

Lines changed: 31 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ import (
1919
"github.com/cockroachdb/cockroach/pkg/build"
2020
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster"
2121
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
22+
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil"
2223
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
2324
"github.com/cockroachdb/cockroach/pkg/roachpb"
2425
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
@@ -156,10 +157,22 @@ func LatestPatchRelease(series string) (*Version, error) {
156157
// associated with the given database connection.
157158
// NB: version means major.minor[-internal]; the patch level isn't
158159
// returned. For example, a binary of version 19.2.4 will return 19.2.
159-
func BinaryVersion(ctx context.Context, db *gosql.DB) (roachpb.Version, error) {
160+
func BinaryVersion(ctx context.Context, l *logger.Logger, db *gosql.DB) (roachpb.Version, error) {
160161
zero := roachpb.Version{}
161162
var sv string
162-
if err := db.QueryRowContext(ctx, `SELECT crdb_internal.node_executable_version();`).Scan(&sv); err != nil {
163+
rows, err := roachtestutil.QueryWithRetry(
164+
ctx, l, db, roachtestutil.ClusterSettingRetryOpts, `SELECT crdb_internal.node_executable_version();`,
165+
)
166+
if err != nil {
167+
return zero, err
168+
}
169+
defer rows.Close()
170+
171+
if !rows.Next() {
172+
return zero, fmt.Errorf("no rows returned")
173+
}
174+
175+
if err := rows.Scan(&sv); err != nil {
163176
return zero, err
164177
}
165178

@@ -176,10 +189,22 @@ func BinaryVersion(ctx context.Context, db *gosql.DB) (roachpb.Version, error) {
176189
// in the background plus gossip asynchronicity.
177190
// NB: cluster versions are always major.minor[-internal]; there isn't
178191
// a patch level.
179-
func ClusterVersion(ctx context.Context, db *gosql.DB) (roachpb.Version, error) {
192+
func ClusterVersion(ctx context.Context, l *logger.Logger, db *gosql.DB) (roachpb.Version, error) {
180193
zero := roachpb.Version{}
181194
var sv string
182-
if err := db.QueryRowContext(ctx, `SHOW CLUSTER SETTING version`).Scan(&sv); err != nil {
195+
rows, err := roachtestutil.QueryWithRetry(
196+
ctx, l, db, roachtestutil.ClusterSettingRetryOpts, `SHOW CLUSTER SETTING version`,
197+
)
198+
if err != nil {
199+
return zero, err
200+
}
201+
defer rows.Close()
202+
203+
if !rows.Next() {
204+
return zero, fmt.Errorf("no rows returned")
205+
}
206+
207+
if err := rows.Scan(&sv); err != nil {
183208
return zero, err
184209
}
185210

@@ -477,7 +502,7 @@ func WaitForClusterUpgrade(
477502
timeout time.Duration,
478503
) error {
479504
firstNode := nodes[0]
480-
newVersion, err := BinaryVersion(ctx, dbFunc(firstNode))
505+
newVersion, err := BinaryVersion(ctx, l, dbFunc(firstNode))
481506
if err != nil {
482507
return err
483508
}
@@ -490,7 +515,7 @@ func WaitForClusterUpgrade(
490515
retryCtx, cancel := context.WithTimeout(ctx, timeout)
491516
defer cancel()
492517
err := opts.Do(retryCtx, func(ctx context.Context) error {
493-
currentVersion, err := ClusterVersion(ctx, dbFunc(node))
518+
currentVersion, err := ClusterVersion(ctx, l, dbFunc(node))
494519
if err != nil {
495520
return err
496521
}

pkg/cmd/roachtest/roachtestutil/mixedversion/helper.go

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,7 @@ func (s *Service) ClusterVersion(rng *rand.Rand) (roachpb.Version, error) {
174174
if s.Finalizing {
175175
n, db := s.RandomDB(rng)
176176
s.stepLogger.Printf("querying cluster version through node %d", n)
177-
cv, err := clusterupgrade.ClusterVersion(s.ctx, db)
177+
cv, err := clusterupgrade.ClusterVersion(s.ctx, s.stepLogger, db)
178178
if err != nil {
179179
return roachpb.Version{}, fmt.Errorf("failed to query cluster version: %w", err)
180180
}
@@ -270,9 +270,13 @@ func (h *Helper) ExecWithGateway(
270270
// ExecWithRetry is like ExecWithGateway, but retries the execution of
271271
// the statement on errors, using the retry options provided.
272272
func (h *Helper) ExecWithRetry(
273-
rng *rand.Rand, nodes option.NodeListOption, query string, args ...interface{},
273+
rng *rand.Rand,
274+
nodes option.NodeListOption,
275+
retryOpts retry.Options,
276+
query string,
277+
args ...interface{},
274278
) error {
275-
return h.DefaultService().ExecWithGateway(rng, nodes, query, args...)
279+
return h.DefaultService().ExecWithRetry(rng, nodes, retryOpts, query, args...)
276280
}
277281

278282
// defaultTaskOptions returns the default options that are passed to all tasks

pkg/cmd/roachtest/roachtestutil/mixedversion/runner.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ var (
7979
// for an internal query (i.e., performed by the framework) to
8080
// complete. These queries are typically associated with gathering
8181
// upgrade state data to be displayed during execution.
82-
internalQueryTimeout = 30 * time.Second
82+
internalQueryTimeout = 90 * time.Second
8383
)
8484

8585
func newServiceRuntime(desc *ServiceDescriptor) *serviceRuntime {
@@ -527,7 +527,7 @@ func (tr *testRunner) refreshBinaryVersions(ctx context.Context, service *servic
527527
group := ctxgroup.WithContext(connectionCtx)
528528
for j, node := range tr.getAvailableNodes(service.descriptor) {
529529
group.GoCtx(func(ctx context.Context) error {
530-
bv, err := clusterupgrade.BinaryVersion(ctx, tr.conn(node, service.descriptor.Name))
530+
bv, err := clusterupgrade.BinaryVersion(ctx, tr.logger, tr.conn(node, service.descriptor.Name))
531531
if err != nil {
532532
return fmt.Errorf(
533533
"failed to get binary version for node %d (%s): %w",
@@ -558,7 +558,7 @@ func (tr *testRunner) refreshClusterVersions(ctx context.Context, service *servi
558558
group := ctxgroup.WithContext(connectionCtx)
559559
for j, node := range tr.getAvailableNodes(service.descriptor) {
560560
group.GoCtx(func(ctx context.Context) error {
561-
cv, err := clusterupgrade.ClusterVersion(ctx, tr.conn(node, service.descriptor.Name))
561+
cv, err := clusterupgrade.ClusterVersion(ctx, tr.logger, tr.conn(node, service.descriptor.Name))
562562
if err != nil {
563563
return fmt.Errorf(
564564
"failed to get cluster version for node %d (%s): %w",

pkg/cmd/roachtest/roachtestutil/mixedversion/steps.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -306,7 +306,7 @@ func (s preserveDowngradeOptionStep) Run(
306306
service := serviceByName(h, s.virtualClusterName)
307307
node, db := service.RandomDB(rng)
308308
l.Printf("checking binary version (via node %d)", node)
309-
bv, err := clusterupgrade.BinaryVersion(ctx, db)
309+
bv, err := clusterupgrade.BinaryVersion(ctx, l, db)
310310
if err != nil {
311311
return err
312312
}
@@ -558,7 +558,7 @@ func (s setClusterVersionStep) Run(
558558
node, db := service.RandomDB(rng)
559559
l.Printf("fetching binary version via n%d", node)
560560

561-
bv, err := clusterupgrade.BinaryVersion(ctx, db)
561+
bv, err := clusterupgrade.BinaryVersion(ctx, l, db)
562562
if err != nil {
563563
return errors.Wrapf(err, "getting binary version on n%d", node)
564564
}

pkg/cmd/roachtest/tests/mixed_version_backup.go

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -216,8 +216,10 @@ func sanitizeVersionForBackup(v *clusterupgrade.Version) string {
216216
// hasInternalSystemJobs returns true if the cluster is expected to
217217
// have the `crdb_internal.system_jobs` vtable. If so, it should be
218218
// used instead of `system.jobs` when querying job status.
219-
func hasInternalSystemJobs(ctx context.Context, rng *rand.Rand, db *gosql.DB) (bool, error) {
220-
cv, err := clusterupgrade.ClusterVersion(ctx, db)
219+
func hasInternalSystemJobs(
220+
ctx context.Context, l *logger.Logger, rng *rand.Rand, db *gosql.DB,
221+
) (bool, error) {
222+
cv, err := clusterupgrade.ClusterVersion(ctx, l, db)
221223
if err != nil {
222224
return false, fmt.Errorf("failed to query cluster version: %w", err)
223225
}
@@ -2211,7 +2213,7 @@ func (mvb *mixedVersionBackup) createBackupCollection(
22112213
backupNamePrefix := mvb.backupNamePrefix(h, label)
22122214
n, db := h.System.RandomDB(rng)
22132215
l.Printf("checking existence of crdb_internal.system_jobs via node %d", n)
2214-
internalSystemJobs, err := hasInternalSystemJobs(ctx, rng, db)
2216+
internalSystemJobs, err := hasInternalSystemJobs(ctx, l, rng, db)
22152217
if err != nil {
22162218
return err
22172219
}
@@ -2742,7 +2744,7 @@ func (mvb *mixedVersionBackup) verifySomeBackups(
27422744

27432745
n, db := h.System.RandomDB(rng)
27442746
l.Printf("checking existence of crdb_internal.system_jobs via node %d", n)
2745-
internalSystemJobs, err := hasInternalSystemJobs(ctx, rng, db)
2747+
internalSystemJobs, err := hasInternalSystemJobs(ctx, l, rng, db)
27462748
if err != nil {
27472749
return err
27482750
}
@@ -2823,7 +2825,7 @@ func (mvb *mixedVersionBackup) verifyAllBackups(
28232825

28242826
n, db := h.System.RandomDB(rng)
28252827
l.Printf("checking existence of crdb_internal.system_jobs via node %d", n)
2826-
internalSystemJobs, err := hasInternalSystemJobs(ctx, rng, db)
2828+
internalSystemJobs, err := hasInternalSystemJobs(ctx, l, rng, db)
28272829
if err != nil {
28282830
restoreErrors = append(restoreErrors, fmt.Errorf("error checking for internal system jobs: %w", err))
28292831
return

pkg/cmd/roachtest/tests/multitenant_upgrade.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -284,7 +284,7 @@ func runMultitenantUpgrade(ctx context.Context, t test.Test, c cluster.Cluster)
284284
}
285285
defer tenantDB.Close()
286286

287-
binaryVersion, err := clusterupgrade.BinaryVersion(ctx, tenantDB)
287+
binaryVersion, err := clusterupgrade.BinaryVersion(ctx, l, tenantDB)
288288
if err != nil {
289289
return errors.Wrapf(err, "failed to query binary version for %s on n%d", tenant.name, n)
290290
}

pkg/cmd/roachtest/tests/versionupgrade.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -263,7 +263,7 @@ func makeVersionFixtureAndFatal(
263263
// cluster version might be 2.0, so we can only use the 2.0 or
264264
// 2.1 binary, but not the 19.1 binary (as 19.1 and 2.0 are not
265265
// compatible).
266-
binaryVersion, err := clusterupgrade.BinaryVersion(ctx, dbFunc(1))
266+
binaryVersion, err := clusterupgrade.BinaryVersion(ctx, t.L(), dbFunc(1))
267267
if err != nil {
268268
t.Fatalf("fetching binary version on n1: %v", err)
269269
}

0 commit comments

Comments
 (0)