Skip to content

Commit b56401c

Browse files
committed
restore: deflake TestRestoreCheckpointing
The current `TestRestoreCheckpointing` waits for job progress to be updated by sleeping for a duration greater than the checkpoint interval. While this works the vast majority of the time, if the test cluster is overloaded, it is not sufficient and we can end up resuming the job before the progress was updated. This results in more spans being processed than expected. This commit updates the test to instead check the contents of the job progress checkpoint and wait until the checkpoint contains all processed spans from before the pause. Fixes: #153848 Release note: None
1 parent 552cfcb commit b56401c

File tree

1 file changed

+34
-5
lines changed

1 file changed

+34
-5
lines changed

pkg/backup/backup_test.go

Lines changed: 34 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1301,8 +1301,12 @@ func TestRestoreCheckpointing(t *testing.T) {
13011301
defer jobs.TestingSetProgressThresholds()()
13021302

13031303
// totalEntries represents the number of entries to appear in the persisted frontier.
1304-
totalEntries := 7
1305-
entriesBeforePause := 4
1304+
const totalEntries = 7
1305+
const entriesBeforePause = 4
1306+
processedSpans := struct {
1307+
syncutil.Mutex
1308+
spans roachpb.Spans
1309+
}{}
13061310
entriesCount := 0
13071311
var alreadyPaused atomic.Bool
13081312
postResumeCount := 0
@@ -1313,7 +1317,7 @@ func TestRestoreCheckpointing(t *testing.T) {
13131317
knobs := base.TestingKnobs{
13141318
DistSQL: &execinfra.TestingKnobs{
13151319
BackupRestoreTestingKnobs: &sql.BackupRestoreTestingKnobs{
1316-
RunAfterProcessingRestoreSpanEntry: func(_ context.Context, _ *execinfrapb.RestoreSpanEntry) error {
1320+
RunAfterProcessingRestoreSpanEntry: func(_ context.Context, entry *execinfrapb.RestoreSpanEntry) error {
13171321
// Because the restore processor has several workers that
13181322
// concurrently send addsstable requests and because all workers will
13191323
// wait on the lock below, when one flush gets blocked on the
@@ -1325,12 +1329,20 @@ func TestRestoreCheckpointing(t *testing.T) {
13251329
// checking if the job was paused in each request before it began
13261330
// waiting for the lock.
13271331
wasPausedBeforeWaiting := alreadyPaused.Load()
1332+
13281333
mu.Lock()
13291334
defer mu.Unlock()
13301335
if entriesCount == entriesBeforePause {
13311336
close(waitForProgress)
13321337
<-blockDBRestore
1338+
} else if entriesCount < entriesBeforePause {
1339+
// We save all spans from before the pause to ensure that they have
1340+
// been checkpointed and saved in the job progress.
1341+
processedSpans.Lock()
1342+
defer processedSpans.Unlock()
1343+
processedSpans.spans = append(processedSpans.spans, entry.Span)
13331344
}
1345+
13341346
entriesCount++
13351347
if wasPausedBeforeWaiting {
13361348
postResumeCount++
@@ -1373,8 +1385,25 @@ func TestRestoreCheckpointing(t *testing.T) {
13731385
// Pause the job after some progress has been logged.
13741386
<-waitForProgress
13751387

1376-
// To ensure that progress gets persisted, sleep well beyond the test only job update interval.
1377-
time.Sleep(time.Second)
1388+
// To ensure that progress has been persisted, we wait until all processed
1389+
// spans from before the pause are stored in the job progress.
1390+
testutils.SucceedsSoon(t, func() error {
1391+
jobProgress := jobutils.GetJobProgress(t, sqlDB, jobID)
1392+
checkpointedSpans := jobProgress.GetRestore().Checkpoint
1393+
checkpointedSpanGroup := roachpb.SpanGroup{}
1394+
for _, span := range checkpointedSpans {
1395+
checkpointedSpanGroup.Add(span.Span)
1396+
}
1397+
1398+
processedSpans.Lock()
1399+
defer processedSpans.Unlock()
1400+
for _, span := range processedSpans.spans {
1401+
if !checkpointedSpanGroup.Encloses(span) {
1402+
return errors.Newf("span %s was processed but not saved in job progress yet")
1403+
}
1404+
}
1405+
return nil
1406+
})
13781407

13791408
sqlDB.Exec(t, `PAUSE JOB $1`, &jobID)
13801409
jobutils.WaitForJobToPause(t, sqlDB, jobID)

0 commit comments

Comments
 (0)