@@ -2106,3 +2106,155 @@ func TestWALCorruptionBitFlip(t *testing.T) {
21062106 }
21072107 checkBitFlipErr (err , t )
21082108}
2109+
2110+ // TestCrashDuringOpenRandomized is a randomized test that simulates a hard crash
2111+ // during database opening. It creates a database with some data, then simulates
2112+ // opening it with injected filesystem slowness and crashes during the open
2113+ // process. It ensures that the resulting DB state opens successfully, and the
2114+ // contents of the DB match the expectations based on the keys written.
2115+ func TestCrashDuringOpenRandomized (t * testing.T ) {
2116+ seed := time .Now ().UnixNano ()
2117+ t .Logf ("seed %d" , seed )
2118+ rng := rand .New (rand .NewPCG (0 , uint64 (seed )))
2119+
2120+ // Create initial database with some data.
2121+ mem := vfs .NewCrashableMem ()
2122+ failoverOpts := WALFailoverOptions {
2123+ Secondary : wal.Dir {FS : mem , Dirname : "secondary" },
2124+ FailoverOptions : wal.FailoverOptions {
2125+ PrimaryDirProbeInterval : 100 * time .Microsecond ,
2126+ HealthyProbeLatencyThreshold : 5 * time .Millisecond ,
2127+ HealthyInterval : 50 * time .Microsecond ,
2128+ UnhealthySamplingInterval : 10 * time .Microsecond ,
2129+ UnhealthyOperationLatencyThreshold : func () (time.Duration , bool ) {
2130+ return 50 * time .Microsecond , true
2131+ },
2132+ ElevatedWriteStallThresholdLag : 100 * time .Microsecond ,
2133+ },
2134+ }
2135+ opts := & Options {
2136+ FS : mem ,
2137+ FormatMajorVersion : internalFormatNewest ,
2138+ Logger : testutils.Logger {T : t },
2139+ MemTableSize : 128 << 10 , // 128 KiB
2140+ MemTableStopWritesThreshold : 4 ,
2141+ WALFailover : & failoverOpts ,
2142+ }
2143+
2144+ // Create and populate initial database.
2145+ d , err := Open ("testdb" , opts )
2146+ require .NoError (t , err )
2147+
2148+ testData := make (map [string ][]byte )
2149+ for i := range 50 {
2150+ key := fmt .Sprintf ("key-%d" , i )
2151+ value := make ([]byte , 100 + rng .IntN (900 )) // 100-1000 bytes
2152+ for j := range value {
2153+ value [j ] = byte (i + j )
2154+ }
2155+ testData [key ] = value
2156+ require .NoError (t , d .Set ([]byte (key ), value , Sync ))
2157+ }
2158+ require .NoError (t , d .Close ())
2159+
2160+ // Now simulate opening with a crash clone we have taken during open.
2161+ // Create options with latency injection and WAL failover for the slow
2162+ // open process.
2163+ mean := time .Duration (rng .ExpFloat64 () * float64 (time .Microsecond ))
2164+ p := 1.0
2165+ t .Logf ("injecting mean %s of latency with p=%.3f" , mean , p )
2166+ slowFS := errorfs .Wrap (mem , errorfs .RandomLatency (
2167+ errorfs .Randomly (p , seed ),
2168+ mean ,
2169+ seed ,
2170+ 10 * time .Millisecond ,
2171+ ))
2172+
2173+ // Create WAL failover options for the slow open.
2174+ slowFailoverOpts := failoverOpts
2175+ slowFailoverOpts .Secondary = wal.Dir {FS : slowFS , Dirname : "secondary" }
2176+ slowOpts := & Options {
2177+ FS : mem ,
2178+ FormatMajorVersion : internalFormatNewest ,
2179+ Logger : testutils.Logger {T : t },
2180+ MemTableSize : 128 << 10 ,
2181+ MemTableStopWritesThreshold : 4 ,
2182+ WALFailover : & slowFailoverOpts ,
2183+ }
2184+
2185+ // Start opening the database in a goroutine.
2186+ type openResult struct {
2187+ db * DB
2188+ err error
2189+ }
2190+ openResultChan := make (chan openResult , 1 )
2191+ go func () {
2192+ t .Log ("opening database" )
2193+ db , err := Open ("testdb" , slowOpts )
2194+ t .Log ("opened database" )
2195+ openResultChan <- openResult {db : db , err : err }
2196+ }()
2197+
2198+ // Wait a bit to let the open process make some progress.
2199+ time .Sleep (time .Millisecond * time .Duration (5 + rng .IntN (10 )))
2200+
2201+ // Take crash clone while the open process is still running.
2202+ t .Log ("taking crash clone during open process" )
2203+ crashClone := mem .CrashClone (vfs.CrashCloneCfg {
2204+ UnsyncedDataPercent : rng .IntN (101 ),
2205+ RNG : rng ,
2206+ })
2207+
2208+ // Wait for the original open to complete (it might succeed or fail).
2209+ result := <- openResultChan
2210+ openedDB := result .db
2211+ if result .err != nil {
2212+ t .Errorf ("open failed: %v" , result .err )
2213+ }
2214+ if openedDB != nil {
2215+ if err := openedDB .Close (); err != nil {
2216+ t .Errorf ("failed to close openedDB: %v" , err )
2217+ }
2218+
2219+ }
2220+ t .Log ("using crashed filesystem for recovery" )
2221+ // Create WAL failover options for the crashed filesystem recovery.
2222+ crashedFailoverOpts := failoverOpts
2223+ crashedFailoverOpts .Secondary = wal.Dir {FS : crashClone , Dirname : "secondary" }
2224+
2225+ // Now try to open the crashed filesystem with WAL failover.
2226+ crashedOpts := & Options {
2227+ FS : crashClone ,
2228+ FormatMajorVersion : internalFormatNewest ,
2229+ Logger : testutils.Logger {T : t },
2230+ MemTableSize : 128 << 10 ,
2231+ MemTableStopWritesThreshold : 4 ,
2232+ WALFailover : & crashedFailoverOpts ,
2233+ }
2234+
2235+ recoveredDB , err := Open ("testdb" , crashedOpts )
2236+ require .NoError (t , err )
2237+
2238+ // Verify that we can read some of the expected data.
2239+ iter , err := recoveredDB .NewIter (nil )
2240+ require .NoError (t , err )
2241+
2242+ foundKeys := make (map [string ][]byte )
2243+ for valid := iter .First (); valid ; valid = iter .Next () {
2244+ key := string (iter .Key ())
2245+ value := slices .Clone (iter .Value ())
2246+ foundKeys [key ] = value
2247+ }
2248+ require .NoError (t , iter .Close ())
2249+
2250+ // Verify that found data matches expected data.
2251+ require .NotEmpty (t , foundKeys , "no keys found after crash" )
2252+
2253+ // Check that all found keys match expected data.
2254+ for key , foundValue := range foundKeys {
2255+ expectedValue , exists := testData [key ]
2256+ require .True (t , exists , "found unexpected key: %s" , key )
2257+ require .Equal (t , expectedValue , foundValue , "mismatch for key %s" , key )
2258+ }
2259+ require .NoError (t , recoveredDB .Close ())
2260+ }
0 commit comments