Skip to content

Commit b311a60

Browse files
committed
roachtest: prevent GC on failed fixture restores
This commit teaches our fixture roachtests to flag fixtures for GC protection. This gives developers a larger grace period to investigate failed restore roachtests before GC deletes the fixture. Resolves: #160073 Release note: None
1 parent f966445 commit b311a60

File tree

2 files changed

+26
-3
lines changed

2 files changed

+26
-3
lines changed

pkg/cmd/roachtest/tests/online_restore.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -586,7 +586,7 @@ func exportStats(ctx context.Context, rd restoreDriver, restoreStats restoreStat
586586
}
587587

588588
func waitForDownloadJob(
589-
ctx context.Context, c cluster.Cluster, l *logger.Logger,
589+
ctx context.Context, c cluster.Cluster, l *logger.Logger, rd restoreDriver,
590590
) (time.Time, error) {
591591
l.Printf(`Begin tracking online restore download phase completion`)
592592
// Wait for the job to succeed.
@@ -625,6 +625,7 @@ func waitForDownloadJob(
625625
} else if status == string(jobs.StateRunning) {
626626
l.Printf("Download job still running")
627627
} else {
628+
rd.markFixtureWithFailure(ctx)
628629
return downloadJobEndTimeLowerBound, errors.Newf("job unexpectedly found in %s state", status)
629630
}
630631
}
@@ -740,6 +741,7 @@ func executeTestRestorePhase(
740741
restoreStartTime := timeutil.Now()
741742
restoreCmd := rd.restoreCmd(ctx, fmt.Sprintf("DATABASE %s", sp.backup.fixture.DatabaseName()), opts)
742743
if _, err = db.ExecContext(ctx, restoreCmd); err != nil {
744+
rd.markFixtureWithFailure(ctx)
743745
return time.Time{}, time.Time{}, err
744746
}
745747
restoreEndTime := timeutil.Now()
@@ -795,7 +797,7 @@ func executeTestDownloadPhase(
795797
defer workloadCancel()
796798
if runOnline {
797799
var err error
798-
if downloadEndTimeLowerBound, err = waitForDownloadJob(ctx, c, logger); err != nil {
800+
if downloadEndTimeLowerBound, err = waitForDownloadJob(ctx, c, logger, rd); err != nil {
799801
return err
800802
}
801803
downloadTime := downloadEndTimeLowerBound.Sub(downloadStartTime)

pkg/cmd/roachtest/tests/restore.go

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ import (
3737
"github.com/cockroachdb/cockroach/pkg/workload/histogram"
3838
"github.com/cockroachdb/errors"
3939
"github.com/prometheus/client_golang/prometheus"
40+
"github.com/stretchr/testify/assert"
4041
"github.com/stretchr/testify/require"
4142
)
4243

@@ -266,6 +267,7 @@ func registerRestore(r registry.Registry) {
266267
if status == string(jobs.StateSucceeded) {
267268
isJobComplete = true
268269
} else if status == string(jobs.StateFailed) || status == string(jobs.StateCanceled) {
270+
rd.markFixtureWithFailure(ctx)
269271
t.Fatalf("job unexpectedly found in %s state", status)
270272
}
271273
}
@@ -873,6 +875,9 @@ func (rd *restoreDriver) run(ctx context.Context, target string) error {
873875
_, err = conn.ExecContext(
874876
ctx, rd.restoreCmd(ctx, target, "WITH unsafe_restore_incompatible_version"),
875877
)
878+
if err != nil {
879+
rd.markFixtureWithFailure(ctx)
880+
}
876881
return err
877882
}
878883

@@ -945,7 +950,23 @@ func (rd *restoreDriver) maybeValidateFingerprint(ctx context.Context) {
945950
fingerprint := fingerprintDatabase(
946951
rd.t, conn, rd.sp.backup.fixture.DatabaseName(), "", /* aost */
947952
)
948-
require.Equal(rd.t, rd.fixtureMetadata.Fingerprint, fingerprint, "fingerprint mismatch after restore")
953+
if !assert.Equal(
954+
rd.t, rd.fixtureMetadata.Fingerprint, fingerprint, "fingerprint mismatch before restore",
955+
) {
956+
rd.markFixtureWithFailure(ctx)
957+
rd.t.FailNow()
958+
}
959+
}
960+
961+
// markFixtureWithFailure flags the fixture being restored to have caused a test
962+
// failure. This is useful when a test fails and we want to preserve the backup
963+
// for investigation. This is a best-effort operation.
964+
func (rd *restoreDriver) markFixtureWithFailure(ctx context.Context) {
965+
registry := GetFixtureRegistry(ctx, rd.t, rd.sp.backup.cloud)
966+
err := registry.MarkFailure(ctx, rd.t.L(), rd.fixtureMetadata.MetadataPath)
967+
if err != nil {
968+
rd.t.L().ErrorfCtx(ctx, "failed to mark fixture with failure: %v", err)
969+
}
949970
}
950971

951972
// exportToRoachperf exports a single perf metric for the given test to roachperf.

0 commit comments

Comments
 (0)