Skip to content

Commit 82a0890

Browse files
craig[bot]kev-cao
andcommitted
Merge #160027
160027: roachtest: deflake online-restore recovery r=msbutler a=kev-cao The OR recovery roachtest was failing due to the schemachange workload truncating a table and the roachtest only deleting from the full backup. In certain scenarios, the SST deleted from the full backup would end up being skipped in the restore due to the span it covered never being covered after truncation. This commit teaches OR to instead delete an SST from every layer of the backup collection, which should make the test resilient to truncation. Fixes: #159503 Release note: None Co-authored-by: Kevin Cao <[email protected]>
2 parents 91c3f2c + 1051aac commit 82a0890

File tree

2 files changed

+58
-42
lines changed

2 files changed

+58
-42
lines changed

pkg/cmd/roachtest/tests/backup_restore_roundtrip.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -568,7 +568,7 @@ func testOnlineRestoreRecovery(ctx context.Context, t test.Test, c cluster.Clust
568568
return errors.Wrap(err, "failed to remove pausepoint for online restore")
569569
}
570570

571-
if err := d.deleteUserTableSST(ctx, t.L(), dbConn, collection); err != nil {
571+
if err := d.deleteSSTFromBackupLayers(ctx, t.L(), dbConn, collection); err != nil {
572572
return err
573573
}
574574
if _, err := dbConn.ExecContext(ctx, "RESUME JOB $1", downloadJobID); err != nil {

pkg/cmd/roachtest/tests/mixed_version_backup.go

Lines changed: 57 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -2368,50 +2368,57 @@ func (d *BackupRestoreTestDriver) createBackupCollection(
23682368
return d.saveContents(ctx, l, rng, &collection, fingerprintAOST)
23692369
}
23702370

2371-
// deleteUserTableSST deletes an SST that covers a user table from the full
2372-
// backup in a collection. This is used to test online restore recovery. We
2373-
// delete from the full backup specifically to avoid deleting an SST from an
2374-
// incremental that is skipped due to a compacted backup. This ensures that
2375-
// deleting an SST will cause the download job to fail (assuming it hasn't
2376-
// already been downloaded).
2377-
// Note: We delete specifically a user-table SST as opposed to choosing a random
2378-
// SST because the temporary system table SSTs may be GC'd before the download
2379-
// phase attempts to download the SST. This would cause the download job to
2380-
// succeed instead of failing as expected. See
2381-
// https://github.com/cockroachdb/cockroach/issues/148408 for more details.
2382-
// TODO (kev-cao): Investigate if blocking storage level compaction would
2383-
// prevent GC of temporary system tables and allow us to delete a random SST
2384-
// instead. Technically, it is still possible, but unlikely, for storage to
2385-
// compact away a user-table SST before the download job attempts to download
2386-
// it, which would result in false positives in the test.
2387-
func (d *BackupRestoreTestDriver) deleteUserTableSST(
2371+
// deleteSSTFromBackupLayers deletes one SST from each layer of a backup
2372+
// collection. This is used to test online restore recovery. Deleting an SST
2373+
// from each layer ensures that at least one SST required for a restore will be
2374+
// missing.
2375+
func (d *BackupRestoreTestDriver) deleteSSTFromBackupLayers(
23882376
ctx context.Context, l *logger.Logger, db *gosql.DB, collection *backupCollection,
23892377
) error {
2390-
var sstPath string
2391-
var startPretty, endPretty string
2392-
var sizeBytes, numRows int
2393-
if err := db.QueryRowContext(
2378+
type sstInfo struct {
2379+
path, start, end string
2380+
bytes, rows int
2381+
}
2382+
rows, err := db.QueryContext(
23942383
ctx,
2395-
`SELECT path, start_pretty, end_pretty, size_bytes, rows FROM
2396-
(
2397-
SELECT row_number() OVER (), * FROM
2398-
[SHOW BACKUP FILES FROM LATEST IN $1]
2384+
// We delete the last SST in each directory just to ensure that we always
2385+
// delete user data. If a system table data is deleted, it is possible for
2386+
// the temporary system table to be GC'd before the download phase detects
2387+
// the missing file.
2388+
`WITH with_dir AS (
2389+
SELECT *, regexp_replace(path, '/[^/]+\.sst$', '') AS dir
2390+
FROM [SHOW BACKUP FILES FROM LATEST IN $1]
2391+
)
2392+
SELECT path, start_pretty, end_pretty, size_bytes, rows
2393+
FROM (
2394+
SELECT *, row_number() OVER (PARTITION BY dir ORDER BY path DESC) AS rn
2395+
FROM with_dir
23992396
)
2400-
WHERE backup_type = 'full'
2401-
ORDER BY row_number DESC
2402-
LIMIT 1`,
2397+
WHERE rn = 1`,
24032398
collection.uri(),
2404-
).Scan(&sstPath, &startPretty, &endPretty, &sizeBytes, &numRows); err != nil {
2405-
return errors.Wrapf(err, "failed to get SST path from %s", collection.uri())
2399+
)
2400+
if err != nil {
2401+
return errors.Wrapf(err, "failed to query for backup files in %q", collection.uri())
2402+
}
2403+
defer rows.Close()
2404+
var ssts []sstInfo
2405+
for rows.Next() {
2406+
var sst sstInfo
2407+
if err := rows.Scan(&sst.path, &sst.start, &sst.end, &sst.bytes, &sst.rows); err != nil {
2408+
return errors.Wrap(err, "error scanning SHOW BACKUP FILES row")
2409+
}
2410+
ssts = append(ssts, sst)
2411+
}
2412+
if rows.Err() != nil {
2413+
return errors.Wrap(rows.Err(), "error iterating over SHOW BACKUP FILES")
2414+
}
2415+
if len(ssts) == 0 {
2416+
return errors.Newf("unexpectedly found no SST files to delete in %q", collection.uri())
24062417
}
24072418
uri, err := url.Parse(collection.uri())
24082419
if err != nil {
24092420
return errors.Wrapf(err, "failed to parse backup collection URI %q", collection.uri())
24102421
}
2411-
l.Printf(
2412-
"deleting sst %s at %s: covering %d bytes and %d rows in [%s, %s)",
2413-
sstPath, uri, sizeBytes, numRows, startPretty, endPretty,
2414-
)
24152422
storage, err := cloud.ExternalStorageFromURI(
24162423
ctx,
24172424
collection.uri(),
@@ -2427,13 +2434,22 @@ func (d *BackupRestoreTestDriver) deleteUserTableSST(
24272434
return errors.Wrapf(err, "failed to create external storage for %q", collection.uri())
24282435
}
24292436
defer storage.Close()
2430-
return errors.Wrapf(
2431-
storage.Delete(ctx, sstPath),
2432-
"failed to delete sst %s from backup %s at %s",
2433-
sstPath,
2434-
collection.name,
2435-
uri,
2436-
)
2437+
for _, sst := range ssts {
2438+
l.Printf(
2439+
"deleting sst %s at %s: covering %d bytes and %d rows in [%s, %s)",
2440+
sst.path, uri, sst.bytes, sst.rows, sst.start, sst.end,
2441+
)
2442+
if err := storage.Delete(ctx, sst.path); err != nil {
2443+
return errors.Wrapf(
2444+
err,
2445+
"failed to delete sst %s from backup %s at %s",
2446+
sst.path,
2447+
collection.name,
2448+
uri,
2449+
)
2450+
}
2451+
}
2452+
return nil
24372453
}
24382454

24392455
// sentinelFilePath returns the path to the file that prevents job

0 commit comments

Comments
 (0)