@@ -22,7 +22,6 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/cloud/cloudpb"
 	"github.com/cockroachdb/cockroach/pkg/clusterversion"
 	"github.com/cockroachdb/cockroach/pkg/jobs"
-	"github.com/cockroachdb/cockroach/pkg/jobs/joberror"
 	"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
 	"github.com/cockroachdb/cockroach/pkg/jobs/jobsprotectedts"
 	"github.com/cockroachdb/cockroach/pkg/keys"
@@ -81,10 +80,30 @@ import (
 	"github.com/cockroachdb/errors"
 )
 
+var (
+	restoreRetryMaxDuration = settings.RegisterDurationSetting(
+		settings.ApplicationLevel,
+		"restore.retry_max_duration",
+		"maximum duration a restore job will retry before terminating",
+		72*time.Hour,
+		settings.WithVisibility(settings.Reserved),
+		settings.PositiveDuration,
+	)
+)
+
 // restoreStatsInsertBatchSize is an arbitrarily chosen value of the number of
 // tables we process in a single txn when restoring their table statistics.
 const restoreStatsInsertBatchSize = 10
 
+// maxRestoreRetryFastFail is the maximum number of times we will retry without
+// seeing any progress before fast-failing the restore job.
+const maxRestoreRetryFastFail = 5
+
+// restoreRetryProgressThreshold is the fraction of the job that must
+// be _exceeded_ before we no longer fast fail the restore job after hitting the
+// maxRestoreRetryFastFail threshold.
+const restoreRetryProgressThreshold = 0
+
 var restoreStatsInsertionConcurrency = settings.RegisterIntSetting(
 	settings.ApplicationLevel,
 	"bulkio.restore.insert_stats_workers",
@@ -154,42 +173,30 @@ func rewriteBackupSpanKey(
 	return newKey, nil
 }
 
+// restoreWithRetry attempts to run restore with retry logic and logs retries
+// accordingly.
 func restoreWithRetry(
-	restoreCtx context.Context,
+	ctx context.Context,
 	execCtx sql.JobExecContext,
+	resumer *restoreResumer,
 	backupManifests []backuppb.BackupManifest,
 	backupLocalityInfo []jobspb.RestoreDetails_BackupLocalityInfo,
 	endTime hlc.Timestamp,
 	dataToRestore restorationData,
-	resumer *restoreResumer,
 	encryption *jobspb.BackupEncryptionOptions,
 	kmsEnv cloud.KMSEnv,
 ) (roachpb.RowCount, error) {
-
-	// We retry on pretty generic failures -- any rpc error. If a worker node were
-	// to restart, it would produce this kind of error, but there may be other
-	// errors that are also rpc errors. Don't retry to aggressively.
-	retryOpts := retry.Options{
-		MaxBackoff: 1 * time.Second,
-		MaxRetries: 5,
-	}
-	if execCtx.ExecCfg().BackupRestoreTestingKnobs != nil &&
-		execCtx.ExecCfg().BackupRestoreTestingKnobs.RestoreDistSQLRetryPolicy != nil {
-		retryOpts = *execCtx.ExecCfg().BackupRestoreTestingKnobs.RestoreDistSQLRetryPolicy
-	}
-
 	// We want to retry a restore if there are transient failures (i.e. worker nodes
-	// dying), so if we receive a retryable error, re-plan and retry the backup.
+	// dying), so if we receive a retryable error, re-plan and retry the restore.
+	retryOpts, progThreshold := getRetryOptionsAndProgressThreshold(execCtx)
 	var (
-		res                    roachpb.RowCount
-		err                    error
-		previousPersistedSpans jobspb.RestoreFrontierEntries
-		currentPersistedSpans  jobspb.RestoreFrontierEntries
+		res                                    roachpb.RowCount
+		err                                    error
+		currPersistedSpans, prevPersistedSpans jobspb.RestoreFrontierEntries
 	)
-
-	for r := retry.StartWithCtx(restoreCtx, retryOpts); r.Next(); {
+	for r := retry.StartWithCtx(ctx, retryOpts); r.Next(); {
 		res, err = restore(
-			restoreCtx,
+			ctx,
 			execCtx,
 			backupManifests,
 			backupLocalityInfo,
@@ -204,30 +211,34 @@ func restoreWithRetry(
 			break
 		}
 
-		if errors.HasType(err, &kvpb.InsufficientSpaceError{}) || errors.Is(err, restoreProcError) {
+		if errors.HasType(err, &kvpb.InsufficientSpaceError{}) {
 			return roachpb.RowCount{}, jobs.MarkPauseRequestError(errors.UnwrapAll(err))
 		}
-
-		if joberror.IsPermanentBulkJobError(err) && !errors.Is(err, retryableRestoreProcError) {
-			return roachpb.RowCount{}, err
-		}
-
 		// If we are draining, it is unlikely we can start a
 		// new DistSQL flow. Exit with a retryable error so
 		// that another node can pick up the job.
 		if execCtx.ExecCfg().JobRegistry.IsDraining() {
 			return roachpb.RowCount{}, jobs.MarkAsRetryJobError(errors.Wrapf(err, "job encountered retryable error on draining node"))
 		}
 
-		log.Warningf(restoreCtx, "encountered retryable error: %+v", err)
-		currentPersistedSpans = resumer.job.Progress().Details.(*jobspb.Progress_Restore).Restore.Checkpoint
-		if !currentPersistedSpans.Equal(previousPersistedSpans) {
+		log.Warningf(ctx, "encountered retryable error: %+v", err)
+
+		// Check if retry counter should be reset if progress was made.
+		currPersistedSpans = resumer.job.
+			Progress().Details.(*jobspb.Progress_Restore).Restore.Checkpoint
+		if !currPersistedSpans.Equal(prevPersistedSpans) {
 			// If the previous persisted spans are different than the current, it
 			// implies that further progress has been persisted.
 			r.Reset()
-			log.Infof(restoreCtx, "restored frontier has advanced since last retry, resetting retry counter")
+			log.Infof(ctx, "restored frontier has advanced since last retry, resetting retry counter")
+		}
+		prevPersistedSpans = currPersistedSpans
+
+		// Fail fast if no progress has been made after a certain number of retries.
+		if r.CurrentAttempt() >= maxRestoreRetryFastFail &&
+			resumer.job.FractionCompleted() <= progThreshold {
+			return roachpb.RowCount{}, errors.Wrap(err, "restore job exhausted max retries without making progress")
 		}
-		previousPersistedSpans = currentPersistedSpans
 
 		testingKnobs := execCtx.ExecCfg().BackupRestoreTestingKnobs
 		if testingKnobs != nil && testingKnobs.RunAfterRetryIteration != nil {
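The loop above pairs a time-bounded retry budget (72 hours by default, rather than a fixed five retries) with a counter that resets whenever the persisted restore frontier advances, plus a fast-fail exit when nothing has been persisted after several attempts. A self-contained sketch of that pattern using only the standard library; `retryWithProgress`, the checkpoint string, and the demo backoff values are illustrative stand-ins, not the CockroachDB `retry` package:

```go
package main

import (
	"errors"
	"fmt"
	"time"
)

const (
	maxBackoff     = 5 * time.Minute
	maxDuration    = 72 * time.Hour
	maxFastFail    = 5
	progressCutoff = float32(0)
)

// retryWithProgress re-runs attempt until it succeeds, the overall retry budget
// is spent, or the fast-fail budget is exhausted without any observed progress.
func retryWithProgress(attempt func() (checkpoint string, fraction float32, err error)) error {
	deadline := time.Now().Add(maxDuration)
	backoff := 50 * time.Millisecond // the real loop starts higher and caps at maxBackoff
	var prevCheckpoint string
	attempts := 0
	for time.Now().Before(deadline) {
		checkpoint, fraction, err := attempt()
		if err == nil {
			return nil
		}
		attempts++
		if checkpoint != prevCheckpoint {
			// The persisted frontier advanced since the last failure, so the
			// job is making progress: reset the fast-fail counter.
			attempts = 0
		}
		prevCheckpoint = checkpoint
		if attempts >= maxFastFail && fraction <= progressCutoff {
			return fmt.Errorf("exhausted max retries without making progress: %w", err)
		}
		time.Sleep(backoff)
		if backoff *= 2; backoff > maxBackoff {
			backoff = maxBackoff
		}
	}
	return errors.New("exhausted retries")
}

func main() {
	calls := 0
	err := retryWithProgress(func() (string, float32, error) {
		calls++
		return "", 0, errors.New("transient failure") // never checkpoints any progress
	})
	fmt.Println(calls, err) // 5 attempts, then the fast-fail error
}
```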
@@ -237,18 +248,41 @@ func restoreWithRetry(
 		}
 	}
 
-	// We have exhausted retries, but we have not seen a "PermanentBulkJobError" so
-	// it is possible that this is a transient error that is taking longer than
-	// our configured retry to go away.
-	//
-	// Let's pause the job instead of failing it so that the user can decide
-	// whether to resume it or cancel it.
+	// Since the restore was able to make some progress before exhausting the
+	// retry counter, we will pause the job and allow the user to determine
+	// whether or not to resume the job or discard all progress and cancel.
 	if err != nil {
 		return res, jobs.MarkPauseRequestError(errors.Wrap(err, "exhausted retries"))
 	}
 	return res, nil
 }
 
+// getRetryOptionsAndProgressThreshold returns the restore retry options and
+// progress threshold for fast failure, taking into consideration any testing
+// knobs and cluster settings.
+func getRetryOptionsAndProgressThreshold(execCtx sql.JobExecContext) (retry.Options, float32) {
+	// In the event that the job is failing early without any progress, we will
+	// manually quit out of the retry loop prematurely. As such, we set a long max
+	// duration and backoff to allow for the job to retry for a long time in the
+	// event that some progress has been made.
+	maxDuration := restoreRetryMaxDuration.Get(&execCtx.ExecCfg().Settings.SV)
+	retryOpts := retry.Options{
+		MaxBackoff:  5 * time.Minute,
+		MaxDuration: maxDuration,
+	}
+	var progThreshold float32 = restoreRetryProgressThreshold
+	if knobs := execCtx.ExecCfg().BackupRestoreTestingKnobs; knobs != nil {
+		if knobs.RestoreDistSQLRetryPolicy != nil {
+			retryOpts = *knobs.RestoreDistSQLRetryPolicy
+		}
+		if knobs.RestoreRetryProgressThreshold > 0 {
+			progThreshold = knobs.RestoreRetryProgressThreshold
+		}
+	}
+
+	return retryOpts, progThreshold
+}
+
 type storeByLocalityKV map[string]cloudpb.ExternalStorage
 
 func makeBackupLocalityMap(
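The new helper follows a defaults-then-overrides pattern: the cluster setting supplies the production retry budget, and the testing knobs, when present, win. A stripped-down sketch of that precedence with placeholder types (not the real `sql.JobExecContext` or `BackupRestoreTestingKnobs`), plus a production-path and test-path usage:

```go
package main

import (
	"fmt"
	"time"
)

// retryOptions mirrors the fields the restore retry loop cares about.
type retryOptions struct {
	MaxBackoff  time.Duration
	MaxDuration time.Duration
}

// testingKnobs is a placeholder for the relevant testing-knob fields.
type testingKnobs struct {
	RetryPolicy       *retryOptions
	ProgressThreshold float32
}

// retryConfig builds retry options and the fast-fail progress threshold,
// starting from defaults (the cluster setting) and letting knobs override them.
func retryConfig(maxDuration time.Duration, knobs *testingKnobs) (retryOptions, float32) {
	opts := retryOptions{MaxBackoff: 5 * time.Minute, MaxDuration: maxDuration}
	var threshold float32 // defaults to 0: any persisted progress defeats fast fail
	if knobs != nil {
		if knobs.RetryPolicy != nil {
			opts = *knobs.RetryPolicy
		}
		if knobs.ProgressThreshold > 0 {
			threshold = knobs.ProgressThreshold
		}
	}
	return opts, threshold
}

func main() {
	// Production path: no knobs, 72h budget from the cluster setting.
	opts, thr := retryConfig(72*time.Hour, nil)
	fmt.Println(opts, thr)

	// Test path: a tight retry policy and a 50% progress threshold.
	opts, thr = retryConfig(72*time.Hour, &testingKnobs{
		RetryPolicy:       &retryOptions{MaxBackoff: time.Second, MaxDuration: time.Minute},
		ProgressThreshold: 0.5,
	})
	fmt.Println(opts, thr)
}
```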
@@ -490,7 +524,7 @@ func restore(
 		case <-timer.C:
 			// Replan the restore job if it has been 10 minutes since the last
 			// processor completed working.
-			return errors.Mark(laggingRestoreProcErr, retryableRestoreProcError)
+			return laggingRestoreProcErr
 		}
 	}
 }
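For context on this hunk: the surrounding select re-plans the restore when no processor has completed work within a fixed window, and the change simply stops marking that error with a separate retryable-error type. A standard-library sketch of the watchdog shape; `watchProgress`, the channel names, and the short demo window are illustrative, not the actual restore code:

```go
package main

import (
	"errors"
	"fmt"
	"time"
)

var errLaggingProc = errors.New("restore processor lagging; replanning")

// watchProgress returns nil when done closes, or errLaggingProc if no progress
// event arrives within window; the caller treats that error as retryable and
// re-plans the distributed flow.
func watchProgress(progress <-chan struct{}, done <-chan struct{}, window time.Duration) error {
	timer := time.NewTimer(window)
	defer timer.Stop()
	for {
		select {
		case <-done:
			return nil
		case <-progress:
			// A processor completed some work: push the deadline out again.
			if !timer.Stop() {
				<-timer.C
			}
			timer.Reset(window)
		case <-timer.C:
			// No processor has completed work within the window; give up on
			// this flow and let the retry loop re-plan it.
			return errLaggingProc
		}
	}
}

func main() {
	progress := make(chan struct{})
	done := make(chan struct{})
	go func() {
		progress <- struct{}{} // one completion, then silence
	}()
	err := watchProgress(progress, done, 20*time.Millisecond)
	fmt.Println(err) // replan error after the quiet period
}
```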
@@ -1929,11 +1963,11 @@ func (r *restoreResumer) doResume(ctx context.Context, execCtx interface{}) erro
 		res, err := restoreWithRetry(
 			ctx,
 			p,
+			r,
 			backupManifests,
 			details.BackupLocalityInfo,
 			details.EndTime,
 			preData,
-			r,
 			details.Encryption,
 			&kmsEnv,
 		)
@@ -1969,11 +2003,11 @@ func (r *restoreResumer) doResume(ctx context.Context, execCtx interface{}) erro
 		res, err := restoreWithRetry(
 			ctx,
 			p,
+			r,
 			backupManifests,
 			details.BackupLocalityInfo,
 			details.EndTime,
 			preValidateData,
-			r,
 			details.Encryption,
 			&kmsEnv,
 		)
@@ -1990,11 +2024,11 @@ func (r *restoreResumer) doResume(ctx context.Context, execCtx interface{}) erro
 		res, err := restoreWithRetry(
 			ctx,
 			p,
+			r,
 			backupManifests,
 			details.BackupLocalityInfo,
 			details.EndTime,
 			mainData,
-			r,
 			details.Encryption,
 			&kmsEnv,
 		)