Skip to content

Commit 952bafc

Browse files
committed
restore: prevent retry loop after failed full cluster online restore
Previously, if online restore failed on a full cluster restore, it would get stuck in a retry loop in its attempt to set all descriptors offline. Those descriptors would include the temporary system tables, which were dropped at the end of the link phase. This patch updates online restore recovery to ignore these dropped descriptors instead of failing. Epic: CRDB-51394 Fixes: #148088 Release note: Online restore no longer gets stuck in a retry loop when reverting after a failed or canceled download job.
1 parent d740cff commit 952bafc

File tree

2 files changed

+70
-9
lines changed

2 files changed

+70
-9
lines changed

pkg/backup/restore_online.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -892,11 +892,11 @@ func setDescriptorsOffline(
892892
return nil
893893
}
894894

895-
for i := range details.TableDescs {
896-
mutableTable, err := descCol.MutableByID(txn.KV()).Table(ctx, details.TableDescs[i].ID)
897-
if err != nil {
898-
return err
899-
}
895+
mutableTables, err := getUndroppedTablesFromRestore(ctx, txn.KV(), details, descCol)
896+
if err != nil {
897+
return errors.Wrapf(err, "set descriptors offline: getting undropped tables from restore")
898+
}
899+
for _, mutableTable := range mutableTables {
900900
if err := writeDesc(mutableTable); err != nil {
901901
return err
902902
}

pkg/backup/restore_online_test.go

Lines changed: 65 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -204,12 +204,73 @@ func TestOnlineRestoreRecovery(t *testing.T) {
204204
})
205205
}
206206

207+
// We run full cluster online restore recovery in a separate environment since
208+
// it requires dropping all databases and will impact other tests.
209+
func TestFullClusterOnlineRestoreRecovery(t *testing.T) {
210+
defer leaktest.AfterTest(t)()
211+
defer log.Scope(t).Close(t)
212+
213+
tmpDir := t.TempDir()
214+
215+
defer nodelocal.ReplaceNodeLocalForTesting(tmpDir)()
216+
217+
const numAccounts = 1000
218+
219+
externalStorage := "nodelocal://1/backup"
220+
221+
_, sqlDB, _, cleanupFn := backupRestoreTestSetupWithParams(t, singleNode, numAccounts, InitManualReplication, orParams)
222+
defer cleanupFn()
223+
224+
sqlDB.Exec(t, "SET CLUSTER SETTING jobs.debug.pausepoints = 'restore.before_download'")
225+
sqlDB.Exec(t, fmt.Sprintf("BACKUP INTO '%s'", externalStorage))
226+
227+
// Reset cluster for full cluster restore.
228+
dbs := sqlDB.QueryStr(
229+
t, "SELECT database_name FROM [SHOW DATABASES] WHERE database_name != 'system'",
230+
)
231+
sqlDB.Exec(t, "USE system")
232+
for _, db := range dbs {
233+
sqlDB.Exec(t, fmt.Sprintf("DROP DATABASE %s CASCADE", db[0]))
234+
}
235+
236+
var linkJobID jobspb.JobID
237+
sqlDB.QueryRow(
238+
t,
239+
fmt.Sprintf(
240+
"RESTORE FROM LATEST IN '%s' WITH EXPERIMENTAL DEFERRED COPY, detached", externalStorage,
241+
),
242+
).Scan(&linkJobID)
243+
jobutils.WaitForJobToSucceed(t, sqlDB, linkJobID)
244+
var downloadJobID jobspb.JobID
245+
sqlDB.QueryRow(t, latestDownloadJobIDQuery).Scan(&downloadJobID)
246+
jobutils.WaitForJobToPause(t, sqlDB, downloadJobID)
247+
corruptBackup(t, sqlDB, tmpDir, externalStorage)
248+
sqlDB.Exec(t, "SET CLUSTER SETTING jobs.debug.pausepoints = ''")
249+
sqlDB.Exec(t, fmt.Sprintf("RESUME JOB %d", downloadJobID))
250+
jobutils.WaitForJobToFail(t, sqlDB, downloadJobID)
251+
}
252+
207253
func corruptBackup(t *testing.T, sqlDB *sqlutils.SQLRunner, ioDir string, uri string) {
208-
var filePath string
209-
filePathQuery := fmt.Sprintf("SELECT path FROM [SHOW BACKUP FILES FROM LATEST IN '%s'] LIMIT 1", uri)
210-
parsedURI, err := url.Parse(strings.Replace(uri, "'", "", -1))
211-
sqlDB.QueryRow(t, filePathQuery).Scan(&filePath)
254+
var filePath, spanStart, spanEnd string
255+
// We delete the last SST file in SHOW BACKUP to ensure the deletion of an SST
256+
// file that backs a user-table.
257+
// https://github.com/cockroachdb/cockroach/issues/148408 illustrates how
258+
// deleting the backing SST file of a system table will not necessarily cause
259+
// a download job to fail.
260+
filePathQuery := fmt.Sprintf(
261+
`SELECT path, start_pretty, end_pretty FROM
262+
(
263+
SELECT row_number() OVER (), *
264+
FROM [SHOW BACKUP FILES FROM LATEST IN '%s']
265+
)
266+
ORDER BY row_number DESC
267+
LIMIT 1`,
268+
uri,
269+
)
270+
parsedURI, err := url.Parse(strings.ReplaceAll(uri, "'", ""))
271+
sqlDB.QueryRow(t, filePathQuery).Scan(&filePath, &spanStart, &spanEnd)
212272
fullPath := filepath.Join(ioDir, parsedURI.Path, filePath)
273+
t.Logf("deleting backup file %s covering span [%s, %s)", fullPath, spanStart, spanEnd)
213274
require.NoError(t, err)
214275
require.NoError(t, os.Remove(fullPath))
215276
}

0 commit comments

Comments
 (0)