Skip to content

Commit dbe89ca

Browse files
craig[bot]kev-cao
andcommitted
Merge #148098
148098: restore: gracefully handle lookup of dropped temp system tables r=msbutler a=kev-cao Previously, if a lookup was attempted on a dropped system table (e.g. during cleanup after a failed restore), an error would be surfaced. This caused the restore jobs in the reverting state to be stuck in a retry loop. However, if we are attempting to do cleanup, we can simply skip any dropped descriptors, which this commit teaches restore to do. Epic: CRDB-51394 Fixes: #148088 Release note: Restore no longer gets stuck in the reverting state after failed cleanup of dropped temporary system tables. Co-authored-by: Kevin Cao <[email protected]>
2 parents 70f1be6 + 952bafc commit dbe89ca

File tree

4 files changed

+170
-26
lines changed

4 files changed

+170
-26
lines changed

pkg/backup/restore_job.go

Lines changed: 73 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,8 @@ import (
5555
"github.com/cockroachdb/cockroach/pkg/sql/catalog/typedesc"
5656
"github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
5757
"github.com/cockroachdb/cockroach/pkg/sql/isql"
58+
"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode"
59+
"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror"
5860
"github.com/cockroachdb/cockroach/pkg/sql/schemachanger/scbackup"
5961
"github.com/cockroachdb/cockroach/pkg/sql/sem/catconstants"
6062
"github.com/cockroachdb/cockroach/pkg/sql/sem/catid"
@@ -2080,6 +2082,12 @@ func (r *restoreResumer) doResume(ctx context.Context, execCtx interface{}) erro
20802082
}
20812083
}
20822084

2085+
if err := p.ExecCfg().JobRegistry.CheckPausepoint(
2086+
"restore.after_cleanup_temp_system_tables",
2087+
); err != nil {
2088+
return err
2089+
}
2090+
20832091
if details.DescriptorCoverage != tree.RequestedDescriptors {
20842092
// Bump the version of the role membership table so that the cache is
20852093
// invalidated.
@@ -2766,24 +2774,20 @@ func (r *restoreResumer) dropDescriptors(
27662774
b := txn.KV().NewBatch()
27672775
const kvTrace = false
27682776
// Collect the tables into mutable versions.
2769-
mutableTables := make([]*tabledesc.Mutable, len(details.TableDescs))
2770-
for i := range details.TableDescs {
2771-
var err error
2772-
mutableTables[i], err = descsCol.MutableByID(txn.KV()).Table(ctx, details.TableDescs[i].ID)
2773-
if err != nil {
2774-
return err
2775-
}
2776-
// Ensure that the version matches what we expect. In the case that it
2777-
// doesn't, it's not really clear what to do. Just log and carry on. If the
2778-
// descriptors have already been published, then there's nothing to fuss
2779-
// about so we only do this check if they have not been published.
2780-
if !details.DescriptorsPublished {
2781-
if got, exp := mutableTables[i].Version, details.TableDescs[i].Version; got != exp {
2782-
log.Errorf(ctx, "version changed for restored descriptor %d before "+
2783-
"drop: got %d, expected %d", mutableTables[i].GetID(), got, exp)
2784-
}
2777+
mutableTables, err := getUndroppedTablesFromRestore(
2778+
ctx, txn.KV(), details, descsCol,
2779+
)
2780+
if err != nil {
2781+
return errors.Wrap(err, "getting mutable tables from restore")
2782+
}
2783+
// Ensure that the table versions matches what we expect. In the case that it
2784+
// doesn't, it's not really clear what to do. Just log and carry on. If the
2785+
// descriptors have already been published, then there's nothing to fuss
2786+
// about so we only do this check if they have not been published.
2787+
if !details.DescriptorsPublished {
2788+
if err := checkRestoredTableDescriptorVersions(details, mutableTables); err != nil {
2789+
log.Errorf(ctx, "table version mismatch during drop: %v", err)
27852790
}
2786-
27872791
}
27882792

27892793
// Remove any back references installed from existing types to tables being restored.
@@ -3363,6 +3367,58 @@ func (r *restoreResumer) cleanupTempSystemTables(ctx context.Context) error {
33633367
return nil
33643368
}
33653369

3370+
// getUndroppedTablesFromRestore retrieves all table descriptors, offline or
3371+
// online, as listed in the restore details, excluding any tables that have been
3372+
// dropped or marked as dropped. This helps avoid situations where the temporary
3373+
// system database tables have already been copied over to the real system
3374+
// database and dropped but attempting to load those temporary tables again
3375+
// would result in an error.
3376+
func getUndroppedTablesFromRestore(
3377+
ctx context.Context, txn *kv.Txn, details jobspb.RestoreDetails, descCol *descs.Collection,
3378+
) ([]*tabledesc.Mutable, error) {
3379+
var tables []*tabledesc.Mutable
3380+
for _, desc := range details.TableDescs {
3381+
mutableTable, err := descCol.MutableByID(txn).Table(ctx, desc.ID)
3382+
if err != nil {
3383+
if pgerror.GetPGCode(err) == pgcode.UndefinedTable {
3384+
continue
3385+
}
3386+
return nil, err
3387+
}
3388+
if mutableTable.Dropped() {
3389+
continue
3390+
}
3391+
tables = append(tables, mutableTable)
3392+
}
3393+
return tables, nil
3394+
}
3395+
3396+
// checkeRestoredTableDescriptorVersions compares the versions of descriptors at
3397+
// the time of restore with the versions of the restored tables. It returns an
3398+
// error if any of the restored tables have a version that does not match the
3399+
// version that was recorded at the time of restore in the job details.
3400+
func checkRestoredTableDescriptorVersions(
3401+
details jobspb.RestoreDetails, restoredTables []*tabledesc.Mutable,
3402+
) error {
3403+
versionsAtRestoreTime := make(map[descpb.ID]descpb.DescriptorVersion)
3404+
for _, desc := range details.TableDescs {
3405+
versionsAtRestoreTime[desc.ID] = desc.Version
3406+
}
3407+
for _, table := range restoredTables {
3408+
if expVersion, ok := versionsAtRestoreTime[table.GetID()]; ok {
3409+
if table.Version != expVersion {
3410+
return errors.Errorf(
3411+
"version mismatch for restored descriptor %d, expected version %d, got %d",
3412+
table.GetID(), expVersion, table.Version,
3413+
)
3414+
}
3415+
} else {
3416+
return errors.Errorf("restored table %d not found in restore details", table.GetID())
3417+
}
3418+
}
3419+
return nil
3420+
}
3421+
33663422
var _ jobs.Resumer = &restoreResumer{}
33673423

33683424
func init() {

pkg/backup/restore_online.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -892,11 +892,11 @@ func setDescriptorsOffline(
892892
return nil
893893
}
894894

895-
for i := range details.TableDescs {
896-
mutableTable, err := descCol.MutableByID(txn.KV()).Table(ctx, details.TableDescs[i].ID)
897-
if err != nil {
898-
return err
899-
}
895+
mutableTables, err := getUndroppedTablesFromRestore(ctx, txn.KV(), details, descCol)
896+
if err != nil {
897+
return errors.Wrapf(err, "set descriptors offline: getting undropped tables from restore")
898+
}
899+
for _, mutableTable := range mutableTables {
900900
if err := writeDesc(mutableTable); err != nil {
901901
return err
902902
}

pkg/backup/restore_online_test.go

Lines changed: 65 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -204,12 +204,73 @@ func TestOnlineRestoreRecovery(t *testing.T) {
204204
})
205205
}
206206

207+
// We run full cluster online restore recovery in a separate environment since
208+
// it requires dropping all databases and will impact other tests.
209+
func TestFullClusterOnlineRestoreRecovery(t *testing.T) {
210+
defer leaktest.AfterTest(t)()
211+
defer log.Scope(t).Close(t)
212+
213+
tmpDir := t.TempDir()
214+
215+
defer nodelocal.ReplaceNodeLocalForTesting(tmpDir)()
216+
217+
const numAccounts = 1000
218+
219+
externalStorage := "nodelocal://1/backup"
220+
221+
_, sqlDB, _, cleanupFn := backupRestoreTestSetupWithParams(t, singleNode, numAccounts, InitManualReplication, orParams)
222+
defer cleanupFn()
223+
224+
sqlDB.Exec(t, "SET CLUSTER SETTING jobs.debug.pausepoints = 'restore.before_download'")
225+
sqlDB.Exec(t, fmt.Sprintf("BACKUP INTO '%s'", externalStorage))
226+
227+
// Reset cluster for full cluster restore.
228+
dbs := sqlDB.QueryStr(
229+
t, "SELECT database_name FROM [SHOW DATABASES] WHERE database_name != 'system'",
230+
)
231+
sqlDB.Exec(t, "USE system")
232+
for _, db := range dbs {
233+
sqlDB.Exec(t, fmt.Sprintf("DROP DATABASE %s CASCADE", db[0]))
234+
}
235+
236+
var linkJobID jobspb.JobID
237+
sqlDB.QueryRow(
238+
t,
239+
fmt.Sprintf(
240+
"RESTORE FROM LATEST IN '%s' WITH EXPERIMENTAL DEFERRED COPY, detached", externalStorage,
241+
),
242+
).Scan(&linkJobID)
243+
jobutils.WaitForJobToSucceed(t, sqlDB, linkJobID)
244+
var downloadJobID jobspb.JobID
245+
sqlDB.QueryRow(t, latestDownloadJobIDQuery).Scan(&downloadJobID)
246+
jobutils.WaitForJobToPause(t, sqlDB, downloadJobID)
247+
corruptBackup(t, sqlDB, tmpDir, externalStorage)
248+
sqlDB.Exec(t, "SET CLUSTER SETTING jobs.debug.pausepoints = ''")
249+
sqlDB.Exec(t, fmt.Sprintf("RESUME JOB %d", downloadJobID))
250+
jobutils.WaitForJobToFail(t, sqlDB, downloadJobID)
251+
}
252+
207253
func corruptBackup(t *testing.T, sqlDB *sqlutils.SQLRunner, ioDir string, uri string) {
208-
var filePath string
209-
filePathQuery := fmt.Sprintf("SELECT path FROM [SHOW BACKUP FILES FROM LATEST IN '%s'] LIMIT 1", uri)
210-
parsedURI, err := url.Parse(strings.Replace(uri, "'", "", -1))
211-
sqlDB.QueryRow(t, filePathQuery).Scan(&filePath)
254+
var filePath, spanStart, spanEnd string
255+
// We delete the last SST file in SHOW BACKUP to ensure the deletion of an SST
256+
// file that backs a user-table.
257+
// https://github.com/cockroachdb/cockroach/issues/148408 illustrates how
258+
// deleting the backing SST file of a system table will not necessarily cause
259+
// a download job to fail.
260+
filePathQuery := fmt.Sprintf(
261+
`SELECT path, start_pretty, end_pretty FROM
262+
(
263+
SELECT row_number() OVER (), *
264+
FROM [SHOW BACKUP FILES FROM LATEST IN '%s']
265+
)
266+
ORDER BY row_number DESC
267+
LIMIT 1`,
268+
uri,
269+
)
270+
parsedURI, err := url.Parse(strings.ReplaceAll(uri, "'", ""))
271+
sqlDB.QueryRow(t, filePathQuery).Scan(&filePath, &spanStart, &spanEnd)
212272
fullPath := filepath.Join(ioDir, parsedURI.Path, filePath)
273+
t.Logf("deleting backup file %s covering span [%s, %s)", fullPath, spanStart, spanEnd)
213274
require.NoError(t, err)
214275
require.NoError(t, os.Remove(fullPath))
215276
}

pkg/backup/restore_test.go

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ import (
1111
"time"
1212

1313
"github.com/cockroachdb/cockroach/pkg/backup/backuptestutils"
14+
"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
15+
"github.com/cockroachdb/cockroach/pkg/testutils/jobutils"
1416
"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
1517
"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
1618
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
@@ -58,3 +60,28 @@ func TestRestoreWithOpenTransaction(t *testing.T) {
5860

5961
userConn.Exec(t, "COMMIT")
6062
}
63+
64+
// This test verifies that restore cleanup does not fail due to dropped
65+
// temporary system tables as described in #148088.
66+
func TestFailAfterCleanupSystemTables(t *testing.T) {
67+
defer leaktest.AfterTest(t)()
68+
defer log.Scope(t).Close(t)
69+
70+
clusterSize := 1
71+
_, sqlDB, _, cleanupFn := backuptestutils.StartBackupRestoreTestCluster(t, clusterSize)
72+
defer cleanupFn()
73+
74+
// Must set cluster setting before backup to ensure the setting is preserved.
75+
sqlDB.Exec(
76+
t, "SET CLUSTER SETTING jobs.debug.pausepoints = 'restore.after_cleanup_temp_system_tables'",
77+
)
78+
sqlDB.Exec(t, "BACKUP INTO 'nodelocal://1/backup'")
79+
80+
var jobID jobspb.JobID
81+
sqlDB.QueryRow(t, "RESTORE FROM LATEST IN 'nodelocal://1/backup' WITH detached").Scan(&jobID)
82+
sqlDB.Exec(t, "USE system")
83+
jobutils.WaitForJobToPause(t, sqlDB, jobID)
84+
85+
sqlDB.Exec(t, "CANCEL JOB $1", jobID)
86+
jobutils.WaitForJobToCancel(t, sqlDB, jobID)
87+
}

0 commit comments

Comments
 (0)