
Commit 6f79180

craig[bot] and kev-cao committed
Merge #149440
149440: restore: add secondary retry policy to restore r=dt a=kev-cao

This commit teaches restore to use a more dynamic retry policy that depends on the progress of the restore. Previously, restore would attempt a retry a set number of times, with a small maximum backoff, before failing. That approach works relatively well when a restore fails early in a job: presumably some issue is blocking the restore, and we want to fail early. However, for a restore that has been running and is near completion, it becomes increasingly likely that the error is transient and due to some temporary cluster unhealthiness. In these cases, it is beneficial to retry with a longer backoff and for a longer period of time before reporting a failure.

We now update restore such that, once restore progress has exceeded some predefined threshold, we switch from an attempt-limited retry mechanism to a duration-limited mechanism with larger backoffs.

Epic: CRDB-51362

Fixes: #148028

Release note: None

Co-authored-by: Kevin Cao <[email protected]>
2 parents 2f72cf4 + 3416430 commit 6f79180
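For orientation before the diff, the snippet below is a minimal, stdlib-only Go sketch of the retry shape this change introduces: one long, duration-limited retry loop with a growing backoff, plus a fast-fail escape that fires only while the job has made essentially no progress. It is an illustration of the idea, not the CockroachDB implementation; the names runWithRetry, attempt, and progress, as well as the initial backoff, are invented for the sketch, while the constants mirror values that appear in the diff below.

package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

const (
	maxFastFailAttempts = 5                      // mirrors maxRestoreRetryFastFail in the diff
	progressThreshold   = 0.0                    // progress fraction that must be exceeded to skip the fast-fail
	maxRetryDuration    = 72 * time.Hour         // mirrors the restore.retry_max_duration default
	maxBackoff          = 5 * time.Minute        // mirrors the MaxBackoff of the new policy
	initialBackoff      = 100 * time.Millisecond // invented for the sketch
)

// runWithRetry retries attempt until it succeeds, the wall-clock budget is
// exhausted, or it fast-fails because several early attempts saw no progress.
// progress reports the job's completed fraction in [0, 1].
func runWithRetry(
	ctx context.Context,
	attempt func(context.Context) error,
	progress func() float64,
) error {
	deadline := time.Now().Add(maxRetryDuration)
	backoff := initialBackoff
	var err error
	for attempts := 0; time.Now().Before(deadline); attempts++ {
		if err = attempt(ctx); err == nil {
			return nil
		}
		// Fast-fail: repeated failures with (almost) no progress suggest a
		// persistent problem rather than transient cluster unhealthiness.
		if attempts+1 >= maxFastFailAttempts && progress() <= progressThreshold {
			return fmt.Errorf("exhausted retries without making progress: %w", err)
		}
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(backoff):
		}
		if backoff *= 2; backoff > maxBackoff {
			backoff = maxBackoff
		}
	}
	// Out of wall-clock budget; the real job pauses here instead of failing.
	return fmt.Errorf("retry duration exhausted: %w", err)
}

func main() {
	calls := 0
	err := runWithRetry(context.Background(),
		func(context.Context) error {
			calls++
			if calls < 3 {
				return errors.New("transient: connection reset") // simulate a flaky attempt
			}
			return nil
		},
		func() float64 { return 0.9 }, // pretend the job is 90% complete
	)
	fmt.Println("attempts:", calls, "err:", err)
}

In the actual change, the corresponding pieces are the duration-limited retry.Options built in getRetryOptionsAndProgressThreshold, the restore.retry_max_duration cluster setting, and the maxRestoreRetryFastFail and restoreRetryProgressThreshold constants; when the duration budget runs out after progress has been made, the job is paused rather than failed.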

File tree

7 files changed: +225 −78 lines


pkg/backup/backup_test.go

Lines changed: 12 additions & 11 deletions
@@ -1404,26 +1404,27 @@ func TestRestoreJobRetryReset(t *testing.T) {
   }{}
   waitForProgress := make(chan struct{})
 
-  maxRetries := 4
-
   params := base.TestClusterArgs{}
   knobs := base.TestingKnobs{
     BackupRestore: &sql.BackupRestoreTestingKnobs{
       RestoreDistSQLRetryPolicy: &retry.Options{
         InitialBackoff: time.Microsecond,
         Multiplier:     2,
         MaxBackoff:     2 * time.Microsecond,
-        MaxRetries:     maxRetries,
+        MaxDuration:    time.Second,
       },
+      // Disable switching to the secondary retry policy for this test since it
+      // is not relevant to the test. Set to an unachievable value.
+      RestoreRetryProgressThreshold: 1.1,
       RunBeforeRestoreFlow: func() error {
         mu.Lock()
         defer mu.Unlock()
-        if mu.retryCount >= maxRetries-1 {
-          return nil
+        if mu.retryCount < maxRestoreRetryFastFail {
+          mu.retryCount++
+          // Send a retryable error
+          return syscall.ECONNRESET
         }
-        mu.retryCount++
-        // Send a retryable error
-        return syscall.ECONNRESET
+        return nil
       },
       RunAfterRestoreFlow: func() error {
         mu.Lock()
@@ -1454,9 +1455,9 @@ func TestRestoreJobRetryReset(t *testing.T) {
   })
   close(waitForProgress)
 
-  jobutils.WaitForJobToPause(t, sqlDB, restoreJobId)
+  jobutils.WaitForJobToFail(t, sqlDB, restoreJobId)
 
-  require.Greater(t, mu.retryCount, maxRetries+2)
+  require.Greater(t, mu.retryCount, maxRestoreRetryFastFail+2)
 }
 
 // TestRestoreRetryProcErr tests that the restore data processor will mark
@@ -1493,7 +1494,7 @@ func TestRestoreRetryProcErr(t *testing.T) {
         InitialBackoff: time.Microsecond,
         Multiplier:     2,
         MaxBackoff:     2 * time.Microsecond,
-        MaxRetries:     4,
+        MaxDuration:    time.Second,
       },
       RunBeforeRestoreFlow: func() error {
         mu.Lock()

pkg/backup/full_cluster_backup_restore_test.go

Lines changed: 2 additions & 8 deletions
@@ -689,19 +689,13 @@ func TestClusterRestoreFailCleanup(t *testing.T) {
   sqlDB.Exec(t, `BACKUP data.bank INTO 'nodelocal://1/throwawayjob'`)
   sqlDB.Exec(t, `BACKUP INTO $1`, localFoo)
 
-  waitForJobPauseCancel := func(db *sqlutils.SQLRunner, jobID jobspb.JobID) {
-    db.Exec(t, `USE system;`)
-    jobutils.WaitForJobToPause(t, db, jobID)
-    db.Exec(t, `CANCEL JOB $1`, jobID)
-    jobutils.WaitForJobToCancel(t, db, jobID)
-  }
-
   t.Run("during restoration of data", func(t *testing.T) {
     _, sqlDBRestore, cleanupEmptyCluster := backupRestoreTestSetupEmpty(t, singleNode, tempDir, InitManualReplication, base.TestClusterArgs{})
     defer cleanupEmptyCluster()
     var jobID jobspb.JobID
     sqlDBRestore.QueryRow(t, `RESTORE FROM LATEST IN 'nodelocal://1/missing-ssts' WITH detached`).Scan(&jobID)
-    waitForJobPauseCancel(sqlDBRestore, jobID)
+    sqlDBRestore.Exec(t, "USE system")
+    jobutils.WaitForJobToFail(t, sqlDBRestore, jobID)
     // Verify the failed RESTORE added some DROP tables.
     // Note that the system tables here correspond to the temporary tables
     // imported, not the system tables themselves.

pkg/backup/restore_data_processor.go

Lines changed: 0 additions & 7 deletions
@@ -119,8 +119,6 @@ var defaultNumWorkers = metamorphic.ConstantWithTestRange(
   1, /* metamorphic min */
   8, /* metamorphic max */
 )
-var retryableRestoreProcError = errors.New("restore processor error after forward progress")
-var restoreProcError = errors.New("restore processor error without forward progress")
 
 // TODO(pbardea): It may be worthwhile to combine this setting with the one that
 // controls the number of concurrent AddSSTable requests if each restore worker
@@ -647,11 +645,6 @@ func (rd *restoreDataProcessor) Next() (rowenc.EncDatumRow, *execinfrapb.Produce
   if !ok {
     // Done. Check if any phase exited early with an error.
     err := rd.phaseGroup.Wait()
-    if rd.progressMade {
-      err = errors.Mark(err, retryableRestoreProcError)
-    } else {
-      err = errors.Mark(err, restoreProcError)
-    }
     rd.MoveToDraining(err)
     return nil, rd.DrainHelper()
   }

pkg/backup/restore_job.go

Lines changed: 79 additions & 45 deletions
@@ -22,7 +22,6 @@ import (
   "github.com/cockroachdb/cockroach/pkg/cloud/cloudpb"
   "github.com/cockroachdb/cockroach/pkg/clusterversion"
   "github.com/cockroachdb/cockroach/pkg/jobs"
-  "github.com/cockroachdb/cockroach/pkg/jobs/joberror"
   "github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
   "github.com/cockroachdb/cockroach/pkg/jobs/jobsprotectedts"
   "github.com/cockroachdb/cockroach/pkg/keys"
@@ -81,10 +80,30 @@ import (
   "github.com/cockroachdb/errors"
 )
 
+var (
+  restoreRetryMaxDuration = settings.RegisterDurationSetting(
+    settings.ApplicationLevel,
+    "restore.retry_max_duration",
+    "maximum duration a restore job will retry before terminating",
+    72*time.Hour,
+    settings.WithVisibility(settings.Reserved),
+    settings.PositiveDuration,
+  )
+)
+
 // restoreStatsInsertBatchSize is an arbitrarily chosen value of the number of
 // tables we process in a single txn when restoring their table statistics.
 const restoreStatsInsertBatchSize = 10
 
+// maxRestoreRetryFastFail is the maximum number of times we will retry without
+// seeing any progress before fast-failing the restore job.
+const maxRestoreRetryFastFail = 5
+
+// restoreRetryProgressThreshold is the fraction of the job that must
+// be _exceeded_ before we no longer fast fail the restore job after hitting the
+// maxRestoreRetryFastFail threshold.
+const restoreRetryProgressThreshold = 0
+
 var restoreStatsInsertionConcurrency = settings.RegisterIntSetting(
   settings.ApplicationLevel,
   "bulkio.restore.insert_stats_workers",
@@ -154,42 +173,30 @@ func rewriteBackupSpanKey(
   return newKey, nil
 }
 
+// restoreWithRetry attempts to run restore with retry logic and logs retries
+// accordingly.
 func restoreWithRetry(
-  restoreCtx context.Context,
+  ctx context.Context,
   execCtx sql.JobExecContext,
+  resumer *restoreResumer,
   backupManifests []backuppb.BackupManifest,
   backupLocalityInfo []jobspb.RestoreDetails_BackupLocalityInfo,
   endTime hlc.Timestamp,
   dataToRestore restorationData,
-  resumer *restoreResumer,
   encryption *jobspb.BackupEncryptionOptions,
   kmsEnv cloud.KMSEnv,
 ) (roachpb.RowCount, error) {
-
-  // We retry on pretty generic failures -- any rpc error. If a worker node were
-  // to restart, it would produce this kind of error, but there may be other
-  // errors that are also rpc errors. Don't retry to aggressively.
-  retryOpts := retry.Options{
-    MaxBackoff: 1 * time.Second,
-    MaxRetries: 5,
-  }
-  if execCtx.ExecCfg().BackupRestoreTestingKnobs != nil &&
-    execCtx.ExecCfg().BackupRestoreTestingKnobs.RestoreDistSQLRetryPolicy != nil {
-    retryOpts = *execCtx.ExecCfg().BackupRestoreTestingKnobs.RestoreDistSQLRetryPolicy
-  }
-
   // We want to retry a restore if there are transient failures (i.e. worker nodes
-  // dying), so if we receive a retryable error, re-plan and retry the backup.
+  // dying), so if we receive a retryable error, re-plan and retry the restore.
+  retryOpts, progThreshold := getRetryOptionsAndProgressThreshold(execCtx)
   var (
-    res                    roachpb.RowCount
-    err                    error
-    previousPersistedSpans jobspb.RestoreFrontierEntries
-    currentPersistedSpans  jobspb.RestoreFrontierEntries
+    res                                    roachpb.RowCount
+    err                                    error
+    currPersistedSpans, prevPersistedSpans jobspb.RestoreFrontierEntries
   )
-
-  for r := retry.StartWithCtx(restoreCtx, retryOpts); r.Next(); {
+  for r := retry.StartWithCtx(ctx, retryOpts); r.Next(); {
     res, err = restore(
-      restoreCtx,
+      ctx,
       execCtx,
       backupManifests,
       backupLocalityInfo,
@@ -204,30 +211,34 @@ func restoreWithRetry(
       break
     }
 
-    if errors.HasType(err, &kvpb.InsufficientSpaceError{}) || errors.Is(err, restoreProcError) {
+    if errors.HasType(err, &kvpb.InsufficientSpaceError{}) {
       return roachpb.RowCount{}, jobs.MarkPauseRequestError(errors.UnwrapAll(err))
     }
-
-    if joberror.IsPermanentBulkJobError(err) && !errors.Is(err, retryableRestoreProcError) {
-      return roachpb.RowCount{}, err
-    }
-
     // If we are draining, it is unlikely we can start a
     // new DistSQL flow. Exit with a retryable error so
     // that another node can pick up the job.
     if execCtx.ExecCfg().JobRegistry.IsDraining() {
      return roachpb.RowCount{}, jobs.MarkAsRetryJobError(errors.Wrapf(err, "job encountered retryable error on draining node"))
     }
 
-    log.Warningf(restoreCtx, "encountered retryable error: %+v", err)
-    currentPersistedSpans = resumer.job.Progress().Details.(*jobspb.Progress_Restore).Restore.Checkpoint
-    if !currentPersistedSpans.Equal(previousPersistedSpans) {
+    log.Warningf(ctx, "encountered retryable error: %+v", err)
+
+    // Check if retry counter should be reset if progress was made.
+    currPersistedSpans = resumer.job.
+      Progress().Details.(*jobspb.Progress_Restore).Restore.Checkpoint
+    if !currPersistedSpans.Equal(prevPersistedSpans) {
       // If the previous persisted spans are different than the current, it
       // implies that further progress has been persisted.
       r.Reset()
-      log.Infof(restoreCtx, "restored frontier has advanced since last retry, resetting retry counter")
+      log.Infof(ctx, "restored frontier has advanced since last retry, resetting retry counter")
+    }
+    prevPersistedSpans = currPersistedSpans
+
+    // Fail fast if no progress has been made after a certain number of retries.
+    if r.CurrentAttempt() >= maxRestoreRetryFastFail &&
+      resumer.job.FractionCompleted() <= progThreshold {
+      return roachpb.RowCount{}, errors.Wrap(err, "restore job exhausted max retries without making progress")
     }
-    previousPersistedSpans = currentPersistedSpans
 
     testingKnobs := execCtx.ExecCfg().BackupRestoreTestingKnobs
     if testingKnobs != nil && testingKnobs.RunAfterRetryIteration != nil {
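One detail worth noting in the loop above: the retry counter is reset whenever the persisted restore checkpoint has advanced between attempts, so only attempts that add no durable progress count toward the fast-fail limit. Below is a small, hedged illustration of that bookkeeping; the checkpoint type and the staleAttempts helper are invented stand-ins, not jobspb.RestoreFrontierEntries or code from this commit.

package main

import (
	"fmt"
	"slices"
)

// checkpoint is a stand-in for the persisted restore frontier
// (jobspb.RestoreFrontierEntries in the real code).
type checkpoint []string

// staleAttempts returns the updated count of consecutive attempts that made no
// durable progress: it resets to zero when the checkpoint advanced and
// increments otherwise, mirroring the r.Reset() call in the loop above.
func staleAttempts(prev, curr checkpoint, stale int) int {
	if !slices.Equal(prev, curr) {
		// The frontier moved since the last attempt: treat it as progress.
		return 0
	}
	return stale + 1
}

func main() {
	prev := checkpoint{"a-c"}
	curr := checkpoint{"a-c", "c-f"}           // the frontier advanced
	fmt.Println(staleAttempts(prev, curr, 3)) // 0: counter reset
	fmt.Println(staleAttempts(curr, curr, 0)) // 1: still stalled
}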
@@ -237,18 +248,41 @@ func restoreWithRetry(
     }
   }
 
-  // We have exhausted retries, but we have not seen a "PermanentBulkJobError" so
-  // it is possible that this is a transient error that is taking longer than
-  // our configured retry to go away.
-  //
-  // Let's pause the job instead of failing it so that the user can decide
-  // whether to resume it or cancel it.
+  // Since the restore was able to make some progress before exhausting the
+  // retry counter, we will pause the job and allow the user to determine
+  // whether or not to resume the job or discard all progress and cancel.
   if err != nil {
     return res, jobs.MarkPauseRequestError(errors.Wrap(err, "exhausted retries"))
   }
   return res, nil
 }
 
+// getRetryOptionsAndProgressThreshold returns the restore retry options and
+// progress threshold for fast failure, taking into consideration any testing
+// knobs and cluster settings.
+func getRetryOptionsAndProgressThreshold(execCtx sql.JobExecContext) (retry.Options, float32) {
+  // In the event that the job is failing early without any progress, we will
+  // manually quit out of the retry loop prematurely. As such, we set a long max
+  // duration and backoff to allow for the job to retry for a long time in the
+  // event that some progress has been made.
+  maxDuration := restoreRetryMaxDuration.Get(&execCtx.ExecCfg().Settings.SV)
+  retryOpts := retry.Options{
+    MaxBackoff:  5 * time.Minute,
+    MaxDuration: maxDuration,
+  }
+  var progThreshold float32 = restoreRetryProgressThreshold
+  if knobs := execCtx.ExecCfg().BackupRestoreTestingKnobs; knobs != nil {
+    if knobs.RestoreDistSQLRetryPolicy != nil {
+      retryOpts = *knobs.RestoreDistSQLRetryPolicy
+    }
+    if knobs.RestoreRetryProgressThreshold > 0 {
+      progThreshold = knobs.RestoreRetryProgressThreshold
+    }
+  }
+
+  return retryOpts, progThreshold
+}
+
 type storeByLocalityKV map[string]cloudpb.ExternalStorage
 
 func makeBackupLocalityMap(
@@ -490,7 +524,7 @@ func restore(
     case <-timer.C:
       // Replan the restore job if it has been 10 minutes since the last
       // processor completed working.
-      return errors.Mark(laggingRestoreProcErr, retryableRestoreProcError)
+      return laggingRestoreProcErr
     }
   }
 }
@@ -1929,11 +1963,11 @@ func (r *restoreResumer) doResume(ctx context.Context, execCtx interface{}) erro
   res, err := restoreWithRetry(
     ctx,
     p,
+    r,
     backupManifests,
     details.BackupLocalityInfo,
     details.EndTime,
     preData,
-    r,
     details.Encryption,
     &kmsEnv,
   )
@@ -1969,11 +2003,11 @@ func (r *restoreResumer) doResume(ctx context.Context, execCtx interface{}) erro
   res, err := restoreWithRetry(
     ctx,
     p,
+    r,
     backupManifests,
     details.BackupLocalityInfo,
     details.EndTime,
     preValidateData,
-    r,
     details.Encryption,
     &kmsEnv,
   )
@@ -1990,11 +2024,11 @@ func (r *restoreResumer) doResume(ctx context.Context, execCtx interface{}) erro
   res, err := restoreWithRetry(
     ctx,
     p,
+    r,
     backupManifests,
     details.BackupLocalityInfo,
     details.EndTime,
     mainData,
-    r,
     details.Encryption,
     &kmsEnv,
   )
