Skip to content

Commit dfd17d1

Browse files
craig[bot]kev-caofqazijbowens
committed
148448: roachtest: guarantee deleting user-table sst in OR recovery r=msbutler a=kev-cao As illuminated in #148408, deleting an SST that backs a temporary system table before the download phase completes may still result in a successful download phase. Our current OR recovery roachtest deletes SSTs at random, so it is possible for the test to flake because of this. This updates the test to always delete the last SST in `SHOW BACKUP`, which should always cover a user table instead. Epic: CRDB-51394 Release note: None 148507: roachtest: deflake gopg r=fqazi a=fqazi Release note: None Fixes: #147750 148512: sql/schemachangerccl: deflake multi-region backup tests r=fqazi a=fqazi Increase the CPU cost per test so that fewer multi-region schema changer backup / restore tests run concurrently. This should reduce some flakes. Fixes: #148192 Release note: None 148515: storage: set format major verison for v25.3 r=RaduBerinde a=jbowens Set the storage engine's format major version for cluster version v25.3. Epic: none Release note: None Co-authored-by: Kevin Cao <[email protected]> Co-authored-by: Faizan Qazi <[email protected]> Co-authored-by: Jackson Owens <[email protected]>
5 parents 9364bf1 + f533418 + cada03d + 43e581f + fb7349a commit dfd17d1

File tree

6 files changed

+36
-18
lines changed

6 files changed

+36
-18
lines changed

pkg/ccl/schemachangerccl/sctestbackupccl/BUILD.bazel

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ go_test(
1515
],
1616
exec_properties = {"test.Pool": "heavy"},
1717
shard_count = 48,
18-
tags = ["cpu:3"],
18+
tags = ["cpu:4"],
1919
deps = [
2020
"//pkg/ccl",
2121
"//pkg/ccl/schemachangerccl",

pkg/cli/testdata/ear-list

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,13 @@ list
88
000004.log:
99
env type: Data, AES128_CTR
1010
keyID: bbb65a9d114c2a18740f27b6933b74f61018bd5adf545c153b48ffe6473336ef
11-
nonce: f9 35 f1 b1 73 c8 bc 8f bd f3 c0 1b
12-
counter: 3210085521
11+
nonce: 31 d3 cd 5a 69 e2 13 64 21 53 57 64
12+
counter: 3952287331
1313
000005.sst:
1414
env type: Data, AES128_CTR
1515
keyID: bbb65a9d114c2a18740f27b6933b74f61018bd5adf545c153b48ffe6473336ef
16-
nonce: 6e 34 f4 3c 11 43 1a f5 69 ce 33 f1
17-
counter: 2398097086
16+
nonce: 23 d9 b2 e1 39 b0 87 ed f9 6d 49 20
17+
counter: 3481614039
1818
COCKROACHDB_DATA_KEYS_000001_monolith:
1919
env type: Store, AES128_CTR
2020
keyID: f594229216d81add7811c4360212eb7629b578ef4eab6e5d05679b3c5de48867
@@ -35,11 +35,11 @@ marker.datakeys.000001.COCKROACHDB_DATA_KEYS_000001_monolith:
3535
keyID: f594229216d81add7811c4360212eb7629b578ef4eab6e5d05679b3c5de48867
3636
nonce: 55 d7 d4 27 6c 97 9b dd f1 5d 40 c8
3737
counter: 467030050
38-
marker.format-version.000008.021:
38+
marker.format-version.000011.024:
3939
env type: Data, AES128_CTR
4040
keyID: bbb65a9d114c2a18740f27b6933b74f61018bd5adf545c153b48ffe6473336ef
41-
nonce: df 3a 47 28 5e fb 88 e3 9e b3 9b a3
42-
counter: 1505393531
41+
nonce: c3 6d b2 7b 3f 3e 67 b9 28 b9 81 b1
42+
counter: 3050109376
4343
marker.manifest.000001.MANIFEST-000001:
4444
env type: Data, AES128_CTR
4545
keyID: bbb65a9d114c2a18740f27b6933b74f61018bd5adf545c153b48ffe6473336ef

pkg/cmd/roachtest/tests/backup_restore_roundtrip.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -545,7 +545,7 @@ func testOnlineRestoreRecovery(ctx context.Context, t test.Test, c cluster.Clust
545545
return errors.Wrap(err, "failed to remove pausepoint for online restore")
546546
}
547547

548-
if err := d.deleteRandomSST(ctx, t.L(), dbConn, collection); err != nil {
548+
if err := d.deleteUserTableSST(ctx, t.L(), dbConn, collection); err != nil {
549549
return err
550550
}
551551
if _, err := dbConn.ExecContext(ctx, "RESUME JOB $1", downloadJobID); err != nil {

pkg/cmd/roachtest/tests/gopg_blocklist.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,4 +52,6 @@ var gopgIgnoreList = blocklist{
5252
`pg | ORM | relation with no results does not panic`: "unknown",
5353
// This test flakes sometimes because of connection reuse.
5454
`v10.TestColumnReuse`: "unknown",
55+
// This test is flaky sometimes due to the use of temp tables.
56+
`pg | soft delete with int column nil model ForceDelete | deletes the model`: "unknown",
5557
}

pkg/cmd/roachtest/tests/mixed_version_backup.go

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2316,24 +2316,39 @@ func (d *BackupRestoreTestDriver) createBackupCollection(
23162316
return d.saveContents(ctx, l, rng, &collection, fingerprintAOST)
23172317
}
23182318

2319-
// deleteRandomSSTs deletes an SST from the full backup in a collection. This is
2320-
// used to test online restore recovery. We delete from the full backup
2321-
// specifically to avoid deleting an SST from an incremental that is skipped due
2322-
// to a compacted backup. This ensures that deleting an SST will cause the
2323-
// download job to fail (assuming it hasn't already been downloaded).
2324-
func (d *BackupRestoreTestDriver) deleteRandomSST(
2319+
// deleteUserTableSST deletes an SST that covers a user table from the full
2320+
// backup in a collection. This is used to test online restore recovery. We
2321+
// delete from the full backup specifically to avoid deleting an SST from an
2322+
// incremental that is skipped due to a compacted backup. This ensures that
2323+
// deleting an SST will cause the download job to fail (assuming it hasn't
2324+
// already been downloaded).
2325+
// Note: We delete specifically a user-table SST as opposed to choosing a random
2326+
// SST because the temporary system table SSTs may be GC'd before the download
2327+
// phase attempts to download the SST. This would cause the download job to
2328+
// succeed instead of failing as expected. See
2329+
// https://github.com/cockroachdb/cockroach/issues/148408 for more details.
2330+
// TODO (kev-cao): Investigate if blocking storage level compaction would
2331+
// prevent GC of temporary system tables and allow us to delete a random SST
2332+
// instead. Technically, it is still possible, but unlikely, for storage to
2333+
// compact away a user-table SST before the download job attempts to download
2334+
// it, which would result in false positives in the test.
2335+
func (d *BackupRestoreTestDriver) deleteUserTableSST(
23252336
ctx context.Context, l *logger.Logger, db *gosql.DB, collection *backupCollection,
23262337
) error {
23272338
var sstPath string
23282339
if err := db.QueryRowContext(
23292340
ctx,
23302341
`SELECT path FROM
2331-
[SHOW BACKUP FILES FROM LATEST IN $1]
2342+
(
2343+
SELECT row_number() OVER (), * FROM
2344+
[SHOW BACKUP FILES FROM LATEST IN $1]
2345+
)
23322346
WHERE backup_type = 'full'
2333-
ORDER BY random() LIMIT 1 `,
2347+
ORDER BY row_number DESC
2348+
LIMIT 1`,
23342349
collection.uri(),
23352350
).Scan(&sstPath); err != nil {
2336-
return errors.Wrapf(err, "failed to get random SST path from %s", collection.uri())
2351+
return errors.Wrapf(err, "failed to get SST path from %s", collection.uri())
23372352
}
23382353
uri, err := url.Parse(collection.uri())
23392354
if err != nil {

pkg/storage/pebble.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2168,6 +2168,7 @@ func (p *Pebble) CreateCheckpoint(dir string, spans []roachpb.Span) error {
21682168
// named version, it can be assumed all *nodes* have ratcheted to the pebble
21692169
// version associated with it, since they did so during the fence version.
21702170
var pebbleFormatVersionMap = map[clusterversion.Key]pebble.FormatMajorVersion{
2171+
clusterversion.V25_3: pebble.FormatValueSeparation,
21712172
clusterversion.V25_2: pebble.FormatTableFormatV6,
21722173
}
21732174

0 commit comments

Comments
 (0)