@@ -7,6 +7,7 @@ package metamorphic
77import (
88 "fmt"
99 "io"
10+ "math/rand/v2"
1011 "os"
1112 "path"
1213 "runtime/debug"
@@ -321,10 +322,11 @@ func (t *Test) minFMV() pebble.FormatMajorVersion {
321322 return minVersion
322323}
323324
324- func (t * Test ) restartDB (dbID objID ) error {
325+ func (t * Test ) restartDB (dbID objID , shouldCrashDuringOpen bool ) error {
325326 db := t .getDB (dbID )
326- // If strictFS is not used, we use pebble.NoSync for writeOpts, so we can't
327- // restart the database (even if we don't revert to synced data).
327+ // If strictFS is not used, no-op since we end up using pebble.NoSync for
328+ // writeOpts. In the case of pebble.NoSync, we can't restart the database
329+ // even if we don't revert to synced data.
328330 if ! t .testOpts .strictFS {
329331 return nil
330332 }
@@ -348,15 +350,26 @@ func (t *Test) restartDB(dbID objID) error {
348350 }
349351 }
350352 t .opts .FS = crashFS
353+ var slowFS * errorfs.FS
354+ // If we should crash during Open, inject some latency into the filesystem
355+ // so that the first Open is slow enough for us to capture some arbitrary
356+ // intermediate state.
357+ if shouldCrashDuringOpen {
358+ seed := time .Now ().UnixNano ()
359+ t .opts .Logger .Infof ("seed %d" , seed )
360+ mean := time .Duration (rand .IntN (20 ) + 10 * int (time .Millisecond ))
361+ t .opts .Logger .Infof ("Injecting mean %s of latency with p=%.3f" , mean , 1.0 )
362+ slowFS = errorfs .Wrap (crashFS ,
363+ errorfs .RandomLatency (errorfs .Randomly (1.0 , seed ), mean , seed , time .Second ))
364+ t .opts .FS = slowFS
365+ }
351366 t .opts .WithFSDefaults ()
352367 // We want to set the new FS in testOpts too, so they are propagated to the
353368 // TestOptions that were used with metamorphic.New().
354369 t .testOpts .Opts .FS = t .opts .FS
355- if t .opts .WALFailover != nil {
356- t .opts .WALFailover .Secondary .FS = t .opts .FS
357- t .testOpts .Opts .WALFailover .Secondary .FS = t .opts .FS
358- }
359370
371+ firstOpenDone := make (chan struct {})
372+ secondOpenDone := make (chan struct {})
360373 // TODO(jackson): Audit errorRate and ensure custom options' hooks semantics
361374 // are well defined within the context of retries.
362375 err := t .withRetries (func () (err error ) {
@@ -373,15 +386,90 @@ func (t *Test) restartDB(dbID objID) error {
373386 dir = path .Join (dir , fmt .Sprintf ("db%d" , dbID .slot ()))
374387 }
375388 o := t .finalizeOptions ()
389+ if shouldCrashDuringOpen {
390+ go func () {
391+ err = t .simulateCrashDuringOpen (dbID , slowFS , secondOpenDone , firstOpenDone )
392+ }()
393+ if err != nil {
394+ return err
395+ }
396+ }
376397 t .dbs [dbID .slot ()- 1 ], err = pebble .Open (dir , & o )
377- if err != nil {
378- return err
398+ if shouldCrashDuringOpen {
399+ firstOpenDone <- struct {}{}
379400 }
380401 return err
381402 })
403+ if shouldCrashDuringOpen {
404+ <- secondOpenDone
405+ }
382406 return err
383407}
384408
409+ func (t * Test ) simulateCrashDuringOpen (
410+ dbID objID , slowFS * errorfs.FS , secondOpenDone , firstOpenDone chan struct {},
411+ ) error {
412+ defer func () { secondOpenDone <- struct {}{} }()
413+
414+ // Wait a bit for the first Open to make some progress.
415+ time .Sleep (30 * time .Millisecond )
416+
417+ // Create a crash clone of the current filesystem state.
418+ rng := rand .New (rand .NewPCG (0 , uint64 (time .Now ().UnixNano ())))
419+ crashCloneFS , err := slowFS .CrashClone (vfs.CrashCloneCfg {
420+ UnsyncedDataPercent : rng .IntN (101 ),
421+ RNG : rng ,
422+ })
423+ if err != nil {
424+ return err
425+ }
426+
427+ // After the first Open has completed, close the resulting DB and open the
428+ // second DB.
429+ <- firstOpenDone
430+ err = t .dbs [dbID .slot ()- 1 ].Close ()
431+ if err != nil {
432+ return err
433+ }
434+ // Release any resources held by custom options. This may be used, for
435+ // example, by the encryption-at-rest custom option (within the Cockroach
436+ // repository) to close the file registry.
437+ for i := range t .testOpts .CustomOpts {
438+ if err := t .testOpts .CustomOpts [i ].Close (t .opts ); err != nil {
439+ return err
440+ }
441+ }
442+ t .opts .FS = crashCloneFS
443+ if t .opts .WALFailover != nil {
444+ ccsmemFS := t .opts .WALFailover .Secondary .FS .(* vfs.MemFS )
445+ crashCloneSecondaryFS := ccsmemFS .CrashClone (vfs.CrashCloneCfg {
446+ UnsyncedDataPercent : rng .IntN (101 ),
447+ RNG : rng ,
448+ })
449+ t .testOpts .Opts .WALFailover .Secondary .FS = crashCloneSecondaryFS
450+ t .opts .WALFailover .Secondary .FS = crashCloneSecondaryFS
451+ }
452+ // Reacquire any resources required by custom options. This may be used, for
453+ // example, by the encryption-at-rest custom option (within the Cockroach
454+ // repository) to reopen the file registry.
455+ for i := range t .testOpts .CustomOpts {
456+ if err := t .testOpts .CustomOpts [i ].Open (t .opts ); err != nil {
457+ return err
458+ }
459+ }
460+ // Create a copy of options for the second DB.
461+ dir := t .dir
462+ if len (t .dbs ) > 1 {
463+ dir = path .Join (dir , fmt .Sprintf ("db%d" , dbID .slot ()))
464+ }
465+ o := t .finalizeOptions ()
466+ t .dbs [dbID .slot ()- 1 ], err = pebble .Open (dir , & o )
467+ if err != nil {
468+ return err
469+ }
470+ return nil
471+ }
472+
385473func (t * Test ) saveInMemoryDataInternal () error {
386474 if rootFS := vfs .Root (t .opts .FS ); rootFS != vfs .Default {
387475 // t.opts.FS is an in-memory system; copy it to disk.
0 commit comments