Skip to content

Commit 2a6e468

Browse files
craig[bot]alyshanjahani-crlkev-cao
committed
149571: roachtest: Skip flaky endpoint and fix retry logic for db-console/endpoints r=alyshanjahani-crl a=alyshanjahani-crl This commit skips the rangelog endpoint on the admin server. This failure mode is described in: #148112 (comment) This commit also sets an initial backoff when retrying endpoints. The default of 50ms resulted in the 10 retries being exhausted too quickly. In mixed version tests nodes are taken down and brought up, and so we observed many 404 failures due to this. Fixes: #148187 Release note: None 149615: backup: move retry loop to entire download phase r=msbutler a=kev-cao The retry loop introduced in #148821 specifically would retry the `DownloadSpan` requests. Moving the retry loop to handle the entire download phase would also allow us to retry the cleanup step as well. Epic: CRDB-51394 Release note: None Co-authored-by: Alyshan Jahani <[email protected]> Co-authored-by: Kevin Cao <[email protected]>
3 parents e25ecb8 + 98c956c + a546326 commit 2a6e468

File tree

4 files changed

+27
-22
lines changed

4 files changed

+27
-22
lines changed

pkg/backup/restore_job.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1793,7 +1793,7 @@ func (r *restoreResumer) doResume(ctx context.Context, execCtx interface{}) erro
17931793
if err := p.ExecCfg().JobRegistry.CheckPausepoint("restore.before_do_download_files"); err != nil {
17941794
return err
17951795
}
1796-
return r.doDownloadFiles(ctx, p)
1796+
return r.doDownloadFilesWithRetry(ctx, p)
17971797
}
17981798

17991799
if err := p.ExecCfg().JobRegistry.CheckPausepoint("restore.before_load_descriptors_from_backup"); err != nil {
@@ -2026,7 +2026,7 @@ func (r *restoreResumer) doResume(ctx context.Context, execCtx interface{}) erro
20262026
// TODO(msbutler): ideally doDownloadFiles would not depend on job details
20272027
// and is instead passed an execCfg and the download spans and anything else
20282028
// it needs. If that occured, we would not need to update details above.
2029-
if err := r.doDownloadFiles(ctx, p); err != nil {
2029+
if err := r.doDownloadFilesWithRetry(ctx, p); err != nil {
20302030
return err
20312031
}
20322032
}

pkg/backup/restore_online.go

Lines changed: 23 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -563,28 +563,14 @@ func (r *restoreResumer) sendDownloadWorker(
563563
return err
564564
}
565565

566-
var err error
567-
for r := retry.StartWithCtx(ctx, retry.Options{
568-
InitialBackoff: time.Millisecond * 100,
569-
MaxBackoff: time.Second,
570-
MaxRetries: maxDownloadAttempts - 1,
571-
}); r.Next(); {
572-
err = func() error {
573-
if testingKnobs != nil && testingKnobs.RunBeforeSendingDownloadSpan != nil {
574-
if err := testingKnobs.RunBeforeSendingDownloadSpan(); err != nil {
575-
return err
576-
}
577-
}
578-
return sendDownloadSpan(ctx, execCtx, spans)
579-
}()
580-
if err == nil {
581-
break
566+
if testingKnobs != nil && testingKnobs.RunBeforeSendingDownloadSpan != nil {
567+
if err := testingKnobs.RunBeforeSendingDownloadSpan(); err != nil {
568+
return err
582569
}
583-
log.VInfof(ctx, 1, "attempt %d failed to download spans: %v", r.CurrentAttempt(), err)
584570
}
585571

586-
if err != nil {
587-
return errors.Wrapf(err, "retries exhausted for sending download spans")
572+
if err := sendDownloadSpan(ctx, execCtx, spans); err != nil {
573+
return err
588574
}
589575

590576
// Wait for the completion poller to signal that it has checked our work.
@@ -818,6 +804,24 @@ func getRemainingExternalFileBytes(
818804
return remaining, nil
819805
}
820806

807+
func (r *restoreResumer) doDownloadFilesWithRetry(
808+
ctx context.Context, execCtx sql.JobExecContext,
809+
) error {
810+
var err error
811+
for rt := retry.StartWithCtx(ctx, retry.Options{
812+
InitialBackoff: time.Millisecond * 100,
813+
MaxBackoff: time.Second,
814+
MaxRetries: maxDownloadAttempts - 1,
815+
}); rt.Next(); {
816+
err = r.doDownloadFiles(ctx, execCtx)
817+
if err == nil {
818+
return nil
819+
}
820+
log.Warningf(ctx, "failed attempt #%d to download files: %v", rt.CurrentAttempt(), err)
821+
}
822+
return errors.Wrapf(err, "retries exhausted for downloading files")
823+
}
824+
821825
func (r *restoreResumer) doDownloadFiles(ctx context.Context, execCtx sql.JobExecContext) error {
822826
details := r.job.Details().(jobspb.RestoreDetails)
823827

pkg/cmd/roachtest/tests/db-console/admin_endpoints.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@
7878
{
7979
"url": "/_admin/v1/rangelog",
8080
"method": "GET"
81+
"skip": "https://github.com/cockroachdb/cockroach/pull/148112#issuecomment-2960322577"
8182
},
8283
{
8384
"url": "/_admin/v1/rangelog/{range_id}",

pkg/cmd/roachtest/tests/db_console_endpoints.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,7 @@ func testEndpoint(
248248
return nil
249249
}
250250

251-
return withRetries(ctx, retry.Options{MaxRetries: 10}, f)
251+
return withRetries(ctx, retry.Options{InitialBackoff: time.Second, MaxRetries: 10}, f)
252252
}
253253

254254
// withRetries runs the given function f with the provided retry options.

0 commit comments

Comments
 (0)