Skip to content

Commit 99d44a0

Browse files
craig[bot]kev-cao
andcommitted
Merge #149996
149996: backup: reset retry counter for download job when progress is detected r=dt a=kev-cao In #1448821, we added a retry loop to the download phase of online restore, but the retry policy allows at most 5 retries. In the event of download progress, we want to be more lenient with the retry policy, so this commit teaches the download phase to reset the retry counter if progress is detected. Epic: CRDB-51394 Fixes: #149893 Co-authored-by: Kevin Cao <[email protected]>
2 parents cad9544 + 54ce3dc commit 99d44a0

File tree

4 files changed

+81
-5
lines changed

4 files changed

+81
-5
lines changed

pkg/backup/restore_job.go

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -707,10 +707,11 @@ func loadBackupSQLDescs(
707707
type restoreResumer struct {
708708
job *jobs.Job
709709

710-
settings *cluster.Settings
711-
execCfg *sql.ExecutorConfig
712-
restoreStats roachpb.RowCount
713-
downloadJobID jobspb.JobID
710+
settings *cluster.Settings
711+
execCfg *sql.ExecutorConfig
712+
restoreStats roachpb.RowCount
713+
downloadJobID jobspb.JobID
714+
downloadJobProg float32
714715

715716
mu struct {
716717
syncutil.Mutex

pkg/backup/restore_online.go

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -705,6 +705,7 @@ func (r *restoreResumer) waitForDownloadToComplete(
705705
// Download is already complete or there is nothing to be downloaded, in
706706
// either case we can mark the job as done.
707707
if total == 0 {
708+
r.downloadJobProg = 1.0
708709
return r.job.NoTxn().FractionProgressed(ctx, func(ctx context.Context, details jobspb.ProgressDetails) float32 {
709710
return 1.0
710711
})
@@ -731,6 +732,7 @@ func (r *restoreResumer) waitForDownloadToComplete(
731732
log.VInfof(ctx, 1, "restore download phase, %s downloaded, %s remaining of %s total (%.2f complete)",
732733
sz(total-remaining), sz(remaining), sz(total), fractionComplete,
733734
)
735+
r.downloadJobProg = fractionComplete
734736

735737
if remaining == 0 {
736738
r.notifyStatsRefresherOfNewTables()
@@ -809,6 +811,7 @@ func (r *restoreResumer) doDownloadFilesWithRetry(
809811
ctx context.Context, execCtx sql.JobExecContext,
810812
) error {
811813
var err error
814+
var lastProgress float32
812815
for rt := retry.StartWithCtx(ctx, retry.Options{
813816
InitialBackoff: time.Millisecond * 100,
814817
MaxBackoff: time.Second,
@@ -818,7 +821,12 @@ func (r *restoreResumer) doDownloadFilesWithRetry(
818821
if err == nil {
819822
return nil
820823
}
821-
log.Warningf(ctx, "failed attempt #%d to download files: %v", rt.CurrentAttempt(), err)
824+
log.Warningf(ctx, "failed attempt to download files: %v", err)
825+
if lastProgress != r.downloadJobProg {
826+
lastProgress = r.downloadJobProg
827+
rt.Reset()
828+
log.Infof(ctx, "download progress has advanced since last retry, resetting retry counter")
829+
}
822830
}
823831
return errors.Wrapf(err, "retries exhausted for downloading files")
824832
}
@@ -844,6 +852,13 @@ func (r *restoreResumer) doDownloadFiles(ctx context.Context, execCtx sql.JobExe
844852
}
845853
return errors.Wrap(err, "failed to generate and send download spans")
846854
}
855+
856+
testingKnobs := execCtx.ExecCfg().BackupRestoreTestingKnobs
857+
if testingKnobs != nil && testingKnobs.RunBeforeDownloadCleanup != nil {
858+
if err := testingKnobs.RunBeforeDownloadCleanup(); err != nil {
859+
return errors.Wrap(err, "testing knob RunBeforeDownloadCleanup failed")
860+
}
861+
}
847862
return r.cleanupAfterDownload(ctx, details)
848863
}
849864

pkg/backup/restore_online_test.go

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -642,6 +642,62 @@ func TestOnlineRestoreRetryingDownloadRequests(t *testing.T) {
642642
}
643643
}
644644

645+
func TestOnlineRestoreDownloadRetryReset(t *testing.T) {
646+
defer leaktest.AfterTest(t)()
647+
defer log.Scope(t).Close(t)
648+
649+
defer nodelocal.ReplaceNodeLocalForTesting(t.TempDir())()
650+
651+
var attemptCount int
652+
clusterArgs := base.TestClusterArgs{
653+
ServerArgs: base.TestServerArgs{
654+
DefaultTestTenant: base.TestIsSpecificToStorageLayerAndNeedsASystemTenant,
655+
Knobs: base.TestingKnobs{
656+
BackupRestore: &sql.BackupRestoreTestingKnobs{
657+
// We want the retry loop to fail until its final attempt, and then
658+
// succeed on the last attempt. This will allow the download job to
659+
// make progress, in which case the retry loop _should_ reset. Then
660+
// we continue allowing the retry loop to fail until its last
661+
// attempt, in which case it will succeed again.
662+
RunBeforeSendingDownloadSpan: func() error {
663+
attemptCount++
664+
if attemptCount < maxDownloadAttempts {
665+
return errors.Newf("injected download request failure")
666+
}
667+
return nil
668+
},
669+
RunBeforeDownloadCleanup: func() error {
670+
if attemptCount < maxDownloadAttempts*2 {
671+
return errors.Newf("injected download cleanup failure")
672+
}
673+
return nil
674+
},
675+
},
676+
},
677+
},
678+
}
679+
const numAccounts = 1
680+
_, sqlDB, _, cleanupFn := backupRestoreTestSetupWithParams(
681+
t, singleNode, numAccounts, InitManualReplication, clusterArgs,
682+
)
683+
defer cleanupFn()
684+
685+
externalStorage := "nodelocal://1/backup"
686+
sqlDB.Exec(t, fmt.Sprintf("BACKUP INTO '%s'", externalStorage))
687+
sqlDB.Exec(
688+
t,
689+
fmt.Sprintf(`
690+
RESTORE DATABASE data FROM LATEST IN '%s'
691+
WITH EXPERIMENTAL DEFERRED COPY, new_db_name=data2
692+
`, externalStorage),
693+
)
694+
695+
var downloadJobID jobspb.JobID
696+
sqlDB.QueryRow(t, latestDownloadJobIDQuery).Scan(&downloadJobID)
697+
jobutils.WaitForJobToSucceed(t, sqlDB, downloadJobID)
698+
require.Equal(t, maxDownloadAttempts*2, attemptCount)
699+
}
700+
645701
func bankOnlineRestore(
646702
t *testing.T,
647703
sqlDB *sqlutils.SQLRunner,

pkg/sql/exec_util_backup.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,10 @@ type BackupRestoreTestingKnobs struct {
7777
// RunBeforeSendingDownloadSpan is called within the retry loop of the
7878
// download span worker before sending the download span request.
7979
RunBeforeSendingDownloadSpan func() error
80+
81+
// RunBeforeDownloadCleanup is called before we cleanup after all external
82+
// files have been download.
83+
RunBeforeDownloadCleanup func() error
8084
}
8185

8286
var _ base.ModuleTestingKnobs = &BackupRestoreTestingKnobs{}

0 commit comments

Comments
 (0)