Skip to content

Commit a546326

Browse files
committed
backup: move retry loop to entire download phase
The retry loop introduced in #148821 specifically would retry the `DownloadSpan` requests. Moving the retry loop to handle the entire download phase would also allow us to retry the cleanup step as well. Epic: CRDB-51394 Release note: None
1 parent 55d7fbb commit a546326

File tree

2 files changed

+25
-21
lines changed

2 files changed

+25
-21
lines changed

pkg/backup/restore_job.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1793,7 +1793,7 @@ func (r *restoreResumer) doResume(ctx context.Context, execCtx interface{}) erro
17931793
if err := p.ExecCfg().JobRegistry.CheckPausepoint("restore.before_do_download_files"); err != nil {
17941794
return err
17951795
}
1796-
return r.doDownloadFiles(ctx, p)
1796+
return r.doDownloadFilesWithRetry(ctx, p)
17971797
}
17981798

17991799
if err := p.ExecCfg().JobRegistry.CheckPausepoint("restore.before_load_descriptors_from_backup"); err != nil {
@@ -2026,7 +2026,7 @@ func (r *restoreResumer) doResume(ctx context.Context, execCtx interface{}) erro
20262026
// TODO(msbutler): ideally doDownloadFiles would not depend on job details
20272027
// and is instead passed an execCfg and the download spans and anything else
20282028
// it needs. If that occured, we would not need to update details above.
2029-
if err := r.doDownloadFiles(ctx, p); err != nil {
2029+
if err := r.doDownloadFilesWithRetry(ctx, p); err != nil {
20302030
return err
20312031
}
20322032
}

pkg/backup/restore_online.go

Lines changed: 23 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -563,28 +563,14 @@ func (r *restoreResumer) sendDownloadWorker(
563563
return err
564564
}
565565

566-
var err error
567-
for r := retry.StartWithCtx(ctx, retry.Options{
568-
InitialBackoff: time.Millisecond * 100,
569-
MaxBackoff: time.Second,
570-
MaxRetries: maxDownloadAttempts - 1,
571-
}); r.Next(); {
572-
err = func() error {
573-
if testingKnobs != nil && testingKnobs.RunBeforeSendingDownloadSpan != nil {
574-
if err := testingKnobs.RunBeforeSendingDownloadSpan(); err != nil {
575-
return err
576-
}
577-
}
578-
return sendDownloadSpan(ctx, execCtx, spans)
579-
}()
580-
if err == nil {
581-
break
566+
if testingKnobs != nil && testingKnobs.RunBeforeSendingDownloadSpan != nil {
567+
if err := testingKnobs.RunBeforeSendingDownloadSpan(); err != nil {
568+
return err
582569
}
583-
log.VInfof(ctx, 1, "attempt %d failed to download spans: %v", r.CurrentAttempt(), err)
584570
}
585571

586-
if err != nil {
587-
return errors.Wrapf(err, "retries exhausted for sending download spans")
572+
if err := sendDownloadSpan(ctx, execCtx, spans); err != nil {
573+
return err
588574
}
589575

590576
// Wait for the completion poller to signal that it has checked our work.
@@ -818,6 +804,24 @@ func getRemainingExternalFileBytes(
818804
return remaining, nil
819805
}
820806

807+
func (r *restoreResumer) doDownloadFilesWithRetry(
808+
ctx context.Context, execCtx sql.JobExecContext,
809+
) error {
810+
var err error
811+
for rt := retry.StartWithCtx(ctx, retry.Options{
812+
InitialBackoff: time.Millisecond * 100,
813+
MaxBackoff: time.Second,
814+
MaxRetries: maxDownloadAttempts - 1,
815+
}); rt.Next(); {
816+
err = r.doDownloadFiles(ctx, execCtx)
817+
if err == nil {
818+
return nil
819+
}
820+
log.Warningf(ctx, "failed attempt #%d to download files: %v", rt.CurrentAttempt(), err)
821+
}
822+
return errors.Wrapf(err, "retries exhausted for downloading files")
823+
}
824+
821825
func (r *restoreResumer) doDownloadFiles(ctx context.Context, execCtx sql.JobExecContext) error {
822826
details := r.job.Details().(jobspb.RestoreDetails)
823827

0 commit comments

Comments
 (0)