Skip to content

Commit 46a42da

Browse files
committed
roachtest: retry invalid descriptors check
This check occasionally times out due to the cluster being overloaded. Wrap it in a retry loop to reduce flakes. Fixes: #145092
1 parent adfec3c commit 46a42da

File tree

4 files changed

+31
-4
lines changed

4 files changed

+31
-4
lines changed

pkg/cmd/roachtest/roachtestutil/utils.go

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -361,6 +361,29 @@ func ExecWithRetry(
361361
return result, err
362362
}
363363

364+
// QueryWithRetry queries the given SQL statement with the specified retry logic.
365+
func QueryWithRetry(
366+
ctx context.Context,
367+
l *logger.Logger,
368+
db *gosql.DB,
369+
retryOpts retry.Options,
370+
query string,
371+
args ...any,
372+
) (*gosql.Rows, error) {
373+
var rows *gosql.Rows
374+
err := retryOpts.Do(ctx, func(ctx context.Context) error {
375+
var err error
376+
rows, err = db.QueryContext(ctx, query, args...)
377+
if err != nil {
378+
l.Printf("%s failed (retrying): %v", query, err)
379+
return err
380+
}
381+
return nil
382+
})
383+
384+
return rows, err
385+
}
386+
364387
// ClusterSettingRetryOpts are retry options intended for cluster setting operations.
365388
//
366389
// We use relatively high backoff parameters with the assumption that:

pkg/cmd/roachtest/roachtestutil/validation_check.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -103,14 +103,18 @@ FROM crdb_internal.check_consistency(false, '', '') as t;`)
103103

104104
// CheckInvalidDescriptors returns an error if there exists any descriptors in
105105
// the crdb_internal.invalid_objects virtual table.
106-
func CheckInvalidDescriptors(ctx context.Context, db *gosql.DB) error {
106+
func CheckInvalidDescriptors(ctx context.Context, l *logger.Logger, db *gosql.DB) error {
107107
var invalidIDs string
108+
l.Printf("checking for invalid descriptors")
108109
if err := timeutil.RunWithTimeout(ctx, "descriptor validation", time.Minute, func(ctx context.Context) error {
109110
// Because crdb_internal.invalid_objects is a virtual table, by default, the
110111
// query will take a lease on the database sqlDB is connected to and only run
111112
// the query on the given database. The "" prefix prevents this lease
112113
// acquisition and allows the query to fetch all descriptors in the cluster.
113-
rows, err := db.QueryContext(ctx, `SELECT id, obj_name, error FROM "".crdb_internal.invalid_objects`)
114+
//
115+
// This often times out when the cluster is overloaded. Since this is just
116+
// for validation, we retry with a relatively high backoff.
117+
rows, err := QueryWithRetry(ctx, l, db, ClusterSettingRetryOpts, `SELECT id, obj_name, error FROM "".crdb_internal.invalid_objects`)
114118
if err != nil {
115119
return err
116120
}

pkg/cmd/roachtest/test_runner.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1638,7 +1638,7 @@ func (r *testRunner) postTestAssertions(
16381638
// NB: the invalid description checks should run at the system tenant level.
16391639
db := c.Conn(ctx, t.L(), validationNode, option.VirtualClusterName(install.SystemInterfaceName))
16401640
defer db.Close()
1641-
if err := roachtestutil.CheckInvalidDescriptors(ctx, db); err != nil {
1641+
if err := roachtestutil.CheckInvalidDescriptors(ctx, t.L(), db); err != nil {
16421642
postAssertionErr(errors.WithDetail(err, "invalid descriptors check failed"))
16431643
}
16441644
}()

pkg/cmd/roachtest/tests/mixed_version_backup.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1289,7 +1289,7 @@ func (d *BackupRestoreTestDriver) verifyBackupCollection(
12891289
}
12901290
}
12911291
_, db := d.testUtils.RandomDB(rng, d.testUtils.roachNodes)
1292-
if err := roachtestutil.CheckInvalidDescriptors(ctx, db); err != nil {
1292+
if err := roachtestutil.CheckInvalidDescriptors(ctx, l, db); err != nil {
12931293
return fmt.Errorf("failed descriptor check: %w", err)
12941294
}
12951295

0 commit comments

Comments
 (0)