Skip to content

Commit 54ce3dc

Browse files
committed
backup: reset retry counter for download job when progress is detected
In #1448821, we added a retry loop to the download phase of online restore, but the retry policy allows at most 5 retries. In the event of download progress, we want to be more lenient with the retry policy, so this commit teaches the download phase to reset the retry counter if progress is detected. Epic: CRDB-51394 Fixes: #149893
1 parent 9e36b27 commit 54ce3dc

File tree

4 files changed

+81
-5
lines changed

4 files changed

+81
-5
lines changed

pkg/backup/restore_job.go

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -672,10 +672,11 @@ func loadBackupSQLDescs(
672672
type restoreResumer struct {
673673
job *jobs.Job
674674

675-
settings *cluster.Settings
676-
execCfg *sql.ExecutorConfig
677-
restoreStats roachpb.RowCount
678-
downloadJobID jobspb.JobID
675+
settings *cluster.Settings
676+
execCfg *sql.ExecutorConfig
677+
restoreStats roachpb.RowCount
678+
downloadJobID jobspb.JobID
679+
downloadJobProg float32
679680

680681
mu struct {
681682
syncutil.Mutex

pkg/backup/restore_online.go

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -705,6 +705,7 @@ func (r *restoreResumer) waitForDownloadToComplete(
705705
// Download is already complete or there is nothing to be downloaded, in
706706
// either case we can mark the job as done.
707707
if total == 0 {
708+
r.downloadJobProg = 1.0
708709
return r.job.NoTxn().FractionProgressed(ctx, func(ctx context.Context, details jobspb.ProgressDetails) float32 {
709710
return 1.0
710711
})
@@ -731,6 +732,7 @@ func (r *restoreResumer) waitForDownloadToComplete(
731732
log.VInfof(ctx, 1, "restore download phase, %s downloaded, %s remaining of %s total (%.2f complete)",
732733
sz(total-remaining), sz(remaining), sz(total), fractionComplete,
733734
)
735+
r.downloadJobProg = fractionComplete
734736

735737
if remaining == 0 {
736738
r.notifyStatsRefresherOfNewTables()
@@ -808,6 +810,7 @@ func (r *restoreResumer) doDownloadFilesWithRetry(
808810
ctx context.Context, execCtx sql.JobExecContext,
809811
) error {
810812
var err error
813+
var lastProgress float32
811814
for rt := retry.StartWithCtx(ctx, retry.Options{
812815
InitialBackoff: time.Millisecond * 100,
813816
MaxBackoff: time.Second,
@@ -817,7 +820,12 @@ func (r *restoreResumer) doDownloadFilesWithRetry(
817820
if err == nil {
818821
return nil
819822
}
820-
log.Warningf(ctx, "failed attempt #%d to download files: %v", rt.CurrentAttempt(), err)
823+
log.Warningf(ctx, "failed attempt to download files: %v", err)
824+
if lastProgress != r.downloadJobProg {
825+
lastProgress = r.downloadJobProg
826+
rt.Reset()
827+
log.Infof(ctx, "download progress has advanced since last retry, resetting retry counter")
828+
}
821829
}
822830
return errors.Wrapf(err, "retries exhausted for downloading files")
823831
}
@@ -843,6 +851,13 @@ func (r *restoreResumer) doDownloadFiles(ctx context.Context, execCtx sql.JobExe
843851
}
844852
return errors.Wrap(err, "failed to generate and send download spans")
845853
}
854+
855+
testingKnobs := execCtx.ExecCfg().BackupRestoreTestingKnobs
856+
if testingKnobs != nil && testingKnobs.RunBeforeDownloadCleanup != nil {
857+
if err := testingKnobs.RunBeforeDownloadCleanup(); err != nil {
858+
return errors.Wrap(err, "testing knob RunBeforeDownloadCleanup failed")
859+
}
860+
}
846861
return r.cleanupAfterDownload(ctx, details)
847862
}
848863

pkg/backup/restore_online_test.go

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -642,6 +642,62 @@ func TestOnlineRestoreRetryingDownloadRequests(t *testing.T) {
642642
}
643643
}
644644

645+
func TestOnlineRestoreDownloadRetryReset(t *testing.T) {
646+
defer leaktest.AfterTest(t)()
647+
defer log.Scope(t).Close(t)
648+
649+
defer nodelocal.ReplaceNodeLocalForTesting(t.TempDir())()
650+
651+
var attemptCount int
652+
clusterArgs := base.TestClusterArgs{
653+
ServerArgs: base.TestServerArgs{
654+
DefaultTestTenant: base.TestIsSpecificToStorageLayerAndNeedsASystemTenant,
655+
Knobs: base.TestingKnobs{
656+
BackupRestore: &sql.BackupRestoreTestingKnobs{
657+
// We want the retry loop to fail until its final attempt, and then
658+
// succeed on the last attempt. This will allow the download job to
659+
// make progress, in which case the retry loop _should_ reset. Then
660+
// we continue allowing the retry loop to fail until its last
661+
// attempt, in which case it will succeed again.
662+
RunBeforeSendingDownloadSpan: func() error {
663+
attemptCount++
664+
if attemptCount < maxDownloadAttempts {
665+
return errors.Newf("injected download request failure")
666+
}
667+
return nil
668+
},
669+
RunBeforeDownloadCleanup: func() error {
670+
if attemptCount < maxDownloadAttempts*2 {
671+
return errors.Newf("injected download cleanup failure")
672+
}
673+
return nil
674+
},
675+
},
676+
},
677+
},
678+
}
679+
const numAccounts = 1
680+
_, sqlDB, _, cleanupFn := backupRestoreTestSetupWithParams(
681+
t, singleNode, numAccounts, InitManualReplication, clusterArgs,
682+
)
683+
defer cleanupFn()
684+
685+
externalStorage := "nodelocal://1/backup"
686+
sqlDB.Exec(t, fmt.Sprintf("BACKUP INTO '%s'", externalStorage))
687+
sqlDB.Exec(
688+
t,
689+
fmt.Sprintf(`
690+
RESTORE DATABASE data FROM LATEST IN '%s'
691+
WITH EXPERIMENTAL DEFERRED COPY, new_db_name=data2
692+
`, externalStorage),
693+
)
694+
695+
var downloadJobID jobspb.JobID
696+
sqlDB.QueryRow(t, latestDownloadJobIDQuery).Scan(&downloadJobID)
697+
jobutils.WaitForJobToSucceed(t, sqlDB, downloadJobID)
698+
require.Equal(t, maxDownloadAttempts*2, attemptCount)
699+
}
700+
645701
func bankOnlineRestore(
646702
t *testing.T,
647703
sqlDB *sqlutils.SQLRunner,

pkg/sql/exec_util_backup.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,10 @@ type BackupRestoreTestingKnobs struct {
7373
// RunBeforeSendingDownloadSpan is called within the retry loop of the
7474
// download span worker before sending the download span request.
7575
RunBeforeSendingDownloadSpan func() error
76+
77+
// RunBeforeDownloadCleanup is called before we cleanup after all external
78+
// files have been download.
79+
RunBeforeDownloadCleanup func() error
7680
}
7781

7882
var _ base.ModuleTestingKnobs = &BackupRestoreTestingKnobs{}

0 commit comments

Comments
 (0)