Skip to content

Commit 4fbdf6d

Browse files
craig[bot]kev-cao
andcommitted
Merge #147922
147922: roachtest: set pausepoint before backups in OR recovery roachtest r=msbutler a=kev-cao The OR recovery roachtests were encountering intermittent failures where the download job was found in a succeeded state when the test was waiting for it to pause. It was determined this was occurring whenever a full cluster backup/restore was performed. Because the pausepoint was set _after_ the backups were taken, this meant that the link phase of OR in a full cluster restore would wipe the pausepoints before the download phase. This patch moves the pausepoint setting so that it is set before the backups and therefore will be preserved after a full cluster OR. Fixes: #147557 Release note: None Co-authored-by: Kevin Cao <[email protected]>
2 parents bc50ec9 + bc26572 commit 4fbdf6d

File tree

2 files changed

+16
-16
lines changed

2 files changed

+16
-16
lines changed

pkg/cmd/roachtest/tests/backup_restore_roundtrip.go

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -478,6 +478,20 @@ func testOnlineRestoreRecovery(ctx context.Context, t test.Test, c cluster.Clust
478478
Execute: allNodes,
479479
}
480480

481+
// We set this pausepoint so that the download job is paused before it
482+
// begins downloading external files. This allows us to guarantee that when
483+
// we delete an SST file, it won't have already been downloaded by the
484+
// download phase and therefore not trigger a failure.
485+
// We also must set this BEFORE taking backups. In the event of a full
486+
// cluster backup, the link phase of OR will reset the pausepoints back to
487+
// the point of the backup, which means this pausepoint will be wiped if it
488+
// is set after the backups.
489+
if _, err := dbConn.ExecContext(
490+
ctx, "SET CLUSTER SETTING jobs.debug.pausepoints = 'restore.before_download'",
491+
); err != nil {
492+
return errors.Wrap(err, "failed to set pausepoint for online restore")
493+
}
494+
481495
t.L().Printf("starting backup")
482496
collection, err := d.createBackupCollection(
483497
ctx, t.L(), t, testRNG, bspec, bspec, "online-restore-recovery-backup",
@@ -509,16 +523,6 @@ func testOnlineRestoreRecovery(ctx context.Context, t test.Test, c cluster.Clust
509523
}
510524
}
511525

512-
// We set this pausepoint so that the download job is paused before it
513-
// begins downloading external files. This allows us to guarantee that when
514-
// we delete an SST file, it won't have already been downloaded by the
515-
// download phase and therefore not trigger a failure.
516-
if _, err := dbConn.ExecContext(
517-
ctx, "SET CLUSTER SETTING jobs.debug.pausepoints = 'restore.before_download'",
518-
); err != nil {
519-
return errors.Wrap(err, "failed to set pausepoint for online restore")
520-
}
521-
522526
t.L().Printf("performing online restore of backup")
523527
if _, _, err := d.runRestore(
524528
ctx, t.L(), testRNG, collection, false /* checkFiles */, true, /* internalSystemJobs */
@@ -541,7 +545,6 @@ func testOnlineRestoreRecovery(ctx context.Context, t test.Test, c cluster.Clust
541545
return errors.Wrap(err, "failed to remove pausepoint for online restore")
542546
}
543547

544-
t.L().Printf("deleting sst from backup")
545548
if err := d.deleteRandomSST(ctx, t.L(), dbConn, collection); err != nil {
546549
return err
547550
}
@@ -555,10 +558,7 @@ func testOnlineRestoreRecovery(ctx context.Context, t test.Test, c cluster.Clust
555558
err, "waiting for download job %v to reach resumed state", downloadJobID,
556559
)
557560
}
558-
if err := errors.Wrapf(
559-
WaitForFailed(ctx, dbConn, jobspb.JobID(downloadJobID), jobStatusWait),
560-
"waiting for download job %v to reach failed state", downloadJobID,
561-
); err != nil {
561+
if err := WaitForFailed(ctx, dbConn, jobspb.JobID(downloadJobID), jobStatusWait); err != nil {
562562
return errors.Wrapf(
563563
err, "waiting for download job %v to reach failed state", downloadJobID,
564564
)

pkg/cmd/roachtest/tests/jobs_util.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,7 @@ func WaitForState(
198198
return nil
199199
}
200200
if timeutil.Since(startTime) > maxWait {
201-
return errors.Newf("job %d did not reach state %s after %s", jobID, state, maxWait)
201+
return errors.Newf("job %d still in state %s after %s", jobID, state, maxWait)
202202
}
203203
ticker.Reset(5 * time.Second)
204204
case <-ctx.Done():

0 commit comments

Comments
 (0)