@@ -68,6 +68,7 @@ import (
6868 "github.com/cockroachdb/cockroach/pkg/util/log"
6969 "github.com/cockroachdb/cockroach/pkg/util/randutil"
7070 "github.com/cockroachdb/cockroach/pkg/util/retry"
71+ "github.com/cockroachdb/cockroach/pkg/util/stop"
7172 "github.com/cockroachdb/cockroach/pkg/util/syncutil"
7273 "github.com/cockroachdb/cockroach/pkg/util/timeutil"
7374 "github.com/cockroachdb/cockroach/pkg/util/tracing"
@@ -2788,7 +2789,6 @@ func TestLossQuorumCauseLeaderWatcherToSignalUnavailable(t *testing.T) {
 	require.NoError(t, log.SetVModule("replica_range_lease=3,raft=4"))
 
 	ctx := context.Background()
-	manualClock := hlc.NewHybridManualClock()
 	stickyVFSRegistry := fs.NewStickyRegistry()
 	lisReg := listenerutil.NewListenerRegistry()
 	defer lisReg.Close()
@@ -2851,12 +2851,8 @@ func TestLossQuorumCauseLeaderWatcherToSignalUnavailable(t *testing.T) {
 		return nil
 	})
 
-	// Increment the clock by the leaderlessWatcher unavailable threshold.
-	manualClock.Increment(threshold.Nanoseconds())
-
 	// Wait for the leaderlessWatcher to indicate that the range is unavailable.
 	testutils.SucceedsSoon(t, func() error {
-		tc.GetFirstStoreFromServer(t, aliveNodeIdx).LookupReplica(roachpb.RKey(key))
 		if !repl.LeaderlessWatcher.IsUnavailable() {
 			return errors.New("range is still available")
 		}
@@ -2915,6 +2911,77 @@ func TestLossQuorumCauseLeaderWatcherToSignalUnavailable(t *testing.T) {
 	})
 }
 
+// TestLeaderlessWatcherErrorRefreshedOnUnavailabilityTransition ensures that
+// the leaderless watcher constructs a new error every time it transitions to
+// the unavailable state. In particular, the descriptor used in the error
+// should be the latest descriptor.
+// Serves as a regression test for
+// https://github.com/cockroachdb/cockroach/issues/144639.
+func TestLeaderlessWatcherErrorRefreshedOnUnavailabilityTransition(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	defer log.Scope(t).Close(t)
+	ctx := context.Background()
+	stopper := stop.NewStopper()
+	defer stopper.Stop(ctx)
+
+	manual := hlc.NewHybridManualClock()
+	st := cluster.MakeTestingClusterSettings()
+	// Set the leaderless threshold to 10 seconds.
+	kvserver.ReplicaLeaderlessUnavailableThreshold.Override(ctx, &st.SV, 10*time.Second)
+
+	tc := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{
+		ReplicationMode: base.ReplicationManual,
+		ServerArgs: base.TestServerArgs{
+			Settings: st,
+			Knobs: base.TestingKnobs{
+				Server: &server.TestingKnobs{
+					WallClock: manual,
+				},
+			},
+		},
+	})
+	defer tc.Stopper().Stop(ctx)
+	key := tc.ScratchRange(t)
+	tc.AddVotersOrFatal(t, key, tc.Targets(1)...)
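+	// Grab the replica on the second node; the test exercises its leaderless watcher.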
+	repl := tc.GetFirstStoreFromServer(t, 1).LookupReplica(roachpb.RKey(key))
+
+	// The leaderlessWatcher starts off as available.
+	require.False(t, repl.LeaderlessWatcher.IsUnavailable())
+	// Let it know it's leaderless.
+	repl.RefreshLeaderlessWatcherUnavailableStateForTesting(ctx, raft.None, manual.Now(), st)
+	// Even though the replica is leaderless, enough time hasn't passed for it
+	// to be considered unavailable.
+	require.False(t, repl.LeaderlessWatcher.IsUnavailable())
+	// The error should be nil as we're not considered unavailable at this point.
+	require.NoError(t, repl.LeaderlessWatcher.Err())
+	// Let enough time pass.
+	manual.Increment(10 * time.Second.Nanoseconds())
+	repl.RefreshLeaderlessWatcherUnavailableStateForTesting(ctx, raft.None, manual.Now(), st)
+	// Now the replica is considered unavailable.
+	require.True(t, repl.LeaderlessWatcher.IsUnavailable())
+	require.Error(t, repl.LeaderlessWatcher.Err())
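+	// Each replication change bumps the range descriptor's generation, hence the gen assertions.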
+	// Regex to ensure we've got a replica unavailable error with n1 and n2 in
+	// the range descriptor.
+	require.Regexp(t, "replica unavailable.*n1.*n2.*gen=3", repl.LeaderlessWatcher.Err().Error())
+
+	// Next up, let the replica know there's a leader. This should make it
+	// available again.
+	repl.RefreshLeaderlessWatcherUnavailableStateForTesting(ctx, 1, manual.Now(), st)
+	require.False(t, repl.LeaderlessWatcher.IsUnavailable())
+	// Change the range descriptor. Mark it leaderless and let enough time pass
+	// for it to be considered unavailable again.
+	tc.AddVotersOrFatal(t, key, tc.Targets(2)...)
+	repl.RefreshLeaderlessWatcherUnavailableStateForTesting(ctx, raft.None, manual.Now(), st)
+	manual.Increment(10 * time.Second.Nanoseconds())
+	repl.RefreshLeaderlessWatcherUnavailableStateForTesting(ctx, raft.None, manual.Now(), st)
+	// The replica should now be considered unavailable again.
+	require.True(t, repl.LeaderlessWatcher.IsUnavailable())
+	require.Error(t, repl.LeaderlessWatcher.Err())
+	// Ensure that the range descriptor now contains n1, n2, and n3 -- i.e., we're
+	// updating the error with the latest descriptor on the latest transition.
+	require.Regexp(t, "replica unavailable.*n1.*n2.*n3.*gen=5", repl.LeaderlessWatcher.Err().Error())
+}
+
 func TestClearRange(t *testing.T) {
 	defer leaktest.AfterTest(t)()
 	defer log.Scope(t).Close(t)