Skip to content

Commit f533418

Browse files
committed
roachtest: guarantee deleting user-table sst in OR recovery
As illuminated in #148408, deleting an SST that backs a temporary system table before the download phase completes may still result in a successful download phase. Our current OR recovery roachtest deletes SSTs at random, so it is possible for the test to flake because of this. This updates the test to always delete the last SST in `SHOW BACKUP`, which should always cover a user table instead. Epic: CRDB-51394 Release note: None
1 parent 24d55be commit f533418

File tree

2 files changed

+25
-10
lines changed

2 files changed

+25
-10
lines changed

pkg/cmd/roachtest/tests/backup_restore_roundtrip.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -545,7 +545,7 @@ func testOnlineRestoreRecovery(ctx context.Context, t test.Test, c cluster.Clust
545545
return errors.Wrap(err, "failed to remove pausepoint for online restore")
546546
}
547547

548-
if err := d.deleteRandomSST(ctx, t.L(), dbConn, collection); err != nil {
548+
if err := d.deleteUserTableSST(ctx, t.L(), dbConn, collection); err != nil {
549549
return err
550550
}
551551
if _, err := dbConn.ExecContext(ctx, "RESUME JOB $1", downloadJobID); err != nil {

pkg/cmd/roachtest/tests/mixed_version_backup.go

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2316,24 +2316,39 @@ func (d *BackupRestoreTestDriver) createBackupCollection(
23162316
return d.saveContents(ctx, l, rng, &collection, fingerprintAOST)
23172317
}
23182318

2319-
// deleteRandomSSTs deletes an SST from the full backup in a collection. This is
2320-
// used to test online restore recovery. We delete from the full backup
2321-
// specifically to avoid deleting an SST from an incremental that is skipped due
2322-
// to a compacted backup. This ensures that deleting an SST will cause the
2323-
// download job to fail (assuming it hasn't already been downloaded).
2324-
func (d *BackupRestoreTestDriver) deleteRandomSST(
2319+
// deleteUserTableSST deletes an SST that covers a user table from the full
2320+
// backup in a collection. This is used to test online restore recovery. We
2321+
// delete from the full backup specifically to avoid deleting an SST from an
2322+
// incremental that is skipped due to a compacted backup. This ensures that
2323+
// deleting an SST will cause the download job to fail (assuming it hasn't
2324+
// already been downloaded).
2325+
// Note: We delete specifically a user-table SST as opposed to choosing a random
2326+
// SST because the temporary system table SSTs may be GC'd before the download
2327+
// phase attempts to download the SST. This would cause the download job to
2328+
// succeed instead of failing as expected. See
2329+
// https://github.com/cockroachdb/cockroach/issues/148408 for more details.
2330+
// TODO (kev-cao): Investigate if blocking storage level compaction would
2331+
// prevent GC of temporary system tables and allow us to delete a random SST
2332+
// instead. Technically, it is still possible, but unlikely, for storage to
2333+
// compact away a user-table SST before the download job attempts to download
2334+
// it, which would result in false positives in the test.
2335+
func (d *BackupRestoreTestDriver) deleteUserTableSST(
23252336
ctx context.Context, l *logger.Logger, db *gosql.DB, collection *backupCollection,
23262337
) error {
23272338
var sstPath string
23282339
if err := db.QueryRowContext(
23292340
ctx,
23302341
`SELECT path FROM
2331-
[SHOW BACKUP FILES FROM LATEST IN $1]
2342+
(
2343+
SELECT row_number() OVER (), * FROM
2344+
[SHOW BACKUP FILES FROM LATEST IN $1]
2345+
)
23322346
WHERE backup_type = 'full'
2333-
ORDER BY random() LIMIT 1 `,
2347+
ORDER BY row_number DESC
2348+
LIMIT 1`,
23342349
collection.uri(),
23352350
).Scan(&sstPath); err != nil {
2336-
return errors.Wrapf(err, "failed to get random SST path from %s", collection.uri())
2351+
return errors.Wrapf(err, "failed to get SST path from %s", collection.uri())
23372352
}
23382353
uri, err := url.Parse(collection.uri())
23392354
if err != nil {

0 commit comments

Comments
 (0)