@@ -26,16 +26,17 @@ const (
2626 memberIsLeaderTimeout = 10 * time .Minute
2727 memberRejoinedLearnerTimeout = 10 * time .Minute
2828 memberPromotedVotingTimeout = 15 * time .Minute
29+ networkDisruptionDuration = 15 * time .Second
2930 pollInterval = 5 * time .Second
3031)
3132
3233var _ = g .Describe ("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:DualReplica][Suite:openshift/two-node][Disruptive] Two Node with Fencing etcd recovery" , func () {
3334 defer g .GinkgoRecover ()
3435
3536 var (
36- oc = util .NewCLIWithoutNamespace ("" ).AsAdmin ()
37- etcdClientFactory * helpers.EtcdClientFactoryImpl
38- survivedNode , targetNode corev1.Node
37+ oc = util .NewCLIWithoutNamespace ("" ).AsAdmin ()
38+ etcdClientFactory * helpers.EtcdClientFactoryImpl
39+ peerNode , targetNode corev1.Node
3940 )
4041
4142 g .BeforeEach (func () {
@@ -52,25 +53,29 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
5253
5354 // Select the first index randomly
5455 randomIndex := rand .Intn (len (nodes .Items ))
55- survivedNode = nodes .Items [randomIndex ]
56+ peerNode = nodes .Items [randomIndex ]
5657 // Select the remaining index
5758 targetNode = nodes .Items [(randomIndex + 1 )% len (nodes .Items )]
58- g .GinkgoT ().Printf ("Randomly selected %s (%s) to be shut down and %s (%s) to take the lead\n " , targetNode .Name , targetNode .Status .Addresses [0 ].Address , survivedNode .Name , survivedNode .Status .Addresses [0 ].Address )
5959
6060 kubeClient := oc .KubeClient ()
6161 etcdClientFactory = helpers .NewEtcdClientFactory (kubeClient )
6262
6363 g .GinkgoT ().Printf ("Ensure both nodes are healthy before starting the test\n " )
6464 o .Eventually (func () error {
65- return helpers .EnsureHealthyMember (g .GinkgoT (), etcdClientFactory , survivedNode .Name )
66- }, nodeIsHealthyTimeout , pollInterval ).ShouldNot (o .HaveOccurred (), "expect to ensure Node A healthy without error" )
65+ return helpers .EnsureHealthyMember (g .GinkgoT (), etcdClientFactory , peerNode .Name )
66+ }, nodeIsHealthyTimeout , pollInterval ).ShouldNot (o .HaveOccurred (), fmt . Sprintf ( "expect to ensure Node '%s' healthiness without errors" , peerNode . Name ) )
6767
6868 o .Eventually (func () error {
6969 return helpers .EnsureHealthyMember (g .GinkgoT (), etcdClientFactory , targetNode .Name )
70- }, nodeIsHealthyTimeout , pollInterval ).ShouldNot (o .HaveOccurred (), "expect to ensure Node B healthy without error" )
70+ }, nodeIsHealthyTimeout , pollInterval ).ShouldNot (o .HaveOccurred (), fmt . Sprintf ( "expect to ensure Node '%s' healthiness without errors" , targetNode . Name ) )
7171 })
7272
73- g .It ("Should recover from graceful node shutdown with etcd member re-addition" , func () {
73+ g .It ("should recover from graceful node shutdown with etcd member re-addition" , func () {
74+ // Note: In graceful shutdown, the targetNode is deliberately shut down while
75+ // the peerNode remains running and becomes the etcd leader.
76+ survivedNode := peerNode
77+ g .GinkgoT ().Printf ("Randomly selected %s (%s) to be shut down and %s (%s) to take the lead\n " ,
78+ targetNode .Name , targetNode .Status .Addresses [0 ].Address , peerNode .Name , peerNode .Status .Addresses [0 ].Address )
7479 g .By (fmt .Sprintf ("Shutting down %s gracefully in 1 minute" , targetNode .Name ))
7580 err := util .TriggerNodeRebootGraceful (oc .KubeClient (), targetNode .Name )
7681 o .Expect (err ).To (o .BeNil (), "Expected to gracefully shutdown the node without errors" )
@@ -81,7 +86,7 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
8186 return helpers .EnsureMemberRemoved (g .GinkgoT (), etcdClientFactory , targetNode .Name )
8287 }, memberHasLeftTimeout , pollInterval ).ShouldNot (o .HaveOccurred ())
8388
84- g .By (fmt .Sprintf ("Ensuring that %s is a healthy voting member and adds %s back as learner" , survivedNode .Name , targetNode .Name ))
89+ g .By (fmt .Sprintf ("Ensuring that %s is a healthy voting member and adds %s back as learner" , peerNode .Name , targetNode .Name ))
8590 validateEtcdRecoveryState (etcdClientFactory ,
8691 & survivedNode , true , false , // survivedNode expected started == true, learner == false
8792 & targetNode , false , true , // targetNode expected started == false, learner == true
@@ -100,13 +105,18 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
100105 memberPromotedVotingTimeout , pollInterval )
101106 })
102107
103- g .It ("Should recover from ungraceful node shutdown with etcd member re-addition" , func () {
108+ g .It ("should recover from ungraceful node shutdown with etcd member re-addition" , func () {
109+ // Note: In ungraceful shutdown, the targetNode is forcibly shut down while
110+ // the peerNode remains running and becomes the etcd leader.
111+ survivedNode := peerNode
112+ g .GinkgoT ().Printf ("Randomly selected %s (%s) to be shut down and %s (%s) to take the lead\n " ,
113+ targetNode .Name , targetNode .Status .Addresses [0 ].Address , peerNode .Name , peerNode .Status .Addresses [0 ].Address )
104114 g .By (fmt .Sprintf ("Shutting down %s ungracefully in 1 minute" , targetNode .Name ))
105115 err := util .TriggerNodeRebootUngraceful (oc .KubeClient (), targetNode .Name )
106116 o .Expect (err ).To (o .BeNil (), "Expected to ungracefully shutdown the node without errors" , targetNode .Name , err )
107117 time .Sleep (1 * time .Minute )
108118
109- g .By (fmt .Sprintf ("Ensuring that %s added %s back as learner" , survivedNode .Name , targetNode .Name ))
119+ g .By (fmt .Sprintf ("Ensuring that %s added %s back as learner" , peerNode .Name , targetNode .Name ))
110120 validateEtcdRecoveryState (etcdClientFactory ,
111121 & survivedNode , true , false , // survivedNode expected started == true, learner == false
112122 & targetNode , false , true , // targetNode expected started == false, learner == true
@@ -124,6 +134,41 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
124134 & targetNode , true , false , // targetNode expected started == true, learner == false
125135 memberPromotedVotingTimeout , pollInterval )
126136 })
137+
138+ g .It ("should recover from network disruption with etcd member re-addition" , func () {
139+ // Note: In network disruption, the targetNode runs the disruption command that
140+ // isolates the nodes from each other, creating a split-brain where pacemaker
141+ // determines which node gets fenced and which becomes the etcd leader.
142+ g .GinkgoT ().Printf ("Randomly selected %s (%s) to run the network disruption command\n " , targetNode .Name , targetNode .Status .Addresses [0 ].Address )
143+ g .By (fmt .Sprintf ("Blocking network communication between %s and %s for %v " , targetNode .Name , peerNode .Name , networkDisruptionDuration ))
144+ command , err := util .TriggerNetworkDisruption (oc .KubeClient (), & targetNode , & peerNode , networkDisruptionDuration )
145+ o .Expect (err ).To (o .BeNil (), "Expected to disrupt network without errors" )
146+ g .GinkgoT ().Printf ("command: '%s'\n " , command )
147+
148+ g .By ("Ensuring cluster recovery with proper leader/learner roles after network disruption" )
149+ // Note: The fenced node may recover quickly and already be started when we get
150+ // the first etcd membership. This is valid behavior, so we capture the learner's
151+ // state and adapt the test accordingly.
152+ leaderNode , learnerNode , learnerStarted := validateEtcdRecoveryStateWithoutAssumingLeader (etcdClientFactory ,
153+ & peerNode , & targetNode , memberIsLeaderTimeout , pollInterval )
154+
155+ if learnerStarted {
156+ g .GinkgoT ().Printf ("Learner node '%s' already started as learner\n " , learnerNode .Name )
157+ } else {
158+ g .By (fmt .Sprintf ("Ensuring '%s' rejoins as learner" , learnerNode .Name ))
159+ validateEtcdRecoveryState (etcdClientFactory ,
160+ leaderNode , true , false , // leaderNode expected started == true, learner == false
161+ learnerNode , true , true , // learnerNode expected started == true, learner == true
162+ memberRejoinedLearnerTimeout , pollInterval )
163+ }
164+
165+ g .By (fmt .Sprintf ("Ensuring learner node '%s' is promoted back as voting member" , learnerNode .Name ))
166+ validateEtcdRecoveryState (etcdClientFactory ,
167+ leaderNode , true , false , // leaderNode expected started == true, learner == false
168+ learnerNode , true , false , // learnerNode expected started == true, learner == false
169+ memberPromotedVotingTimeout , pollInterval )
170+
171+ })
127172})
128173
129174func getMembers (etcdClientFactory helpers.EtcdClientCreator ) ([]* etcdserverpb.Member , error ) {
@@ -226,7 +271,10 @@ func findClusterOperatorCondition(conditions []v1.ClusterOperatorStatusCondition
226271 return nil
227272}
228273
229- func validateEtcdRecoveryState (e * helpers.EtcdClientFactoryImpl , survivedNode * corev1.Node , isSurvivedNodeStartedExpected , isSurvivedNodeLearnerExpected bool , targetNode * corev1.Node , isTargetNodeStartedExpected , isTargetNodeLearnerExpected bool , timeout , pollInterval time.Duration ) {
274+ func validateEtcdRecoveryState (e * helpers.EtcdClientFactoryImpl ,
275+ survivedNode * corev1.Node , isSurvivedNodeStartedExpected , isSurvivedNodeLearnerExpected bool ,
276+ targetNode * corev1.Node , isTargetNodeStartedExpected , isTargetNodeLearnerExpected bool ,
277+ timeout , pollInterval time.Duration ) {
230278 o .EventuallyWithOffset (1 , func () error {
231279 members , err := getMembers (e )
232280 if err != nil {
@@ -251,7 +299,76 @@ func validateEtcdRecoveryState(e *helpers.EtcdClientFactoryImpl, survivedNode *c
251299 targetNode .Name , isTargetNodeStartedExpected , isTargetNodeLearnerExpected , members )
252300 }
253301
254- g .GinkgoT ().Logf ("current membership: %+v" , members )
302+ g .GinkgoT ().Logf ("SUCCESS: got membership: %+v" , members )
303+ return nil
304+ }, timeout , pollInterval ).ShouldNot (o .HaveOccurred ())
305+ }
306+
307+ func validateEtcdRecoveryStateWithoutAssumingLeader (e * helpers.EtcdClientFactoryImpl ,
308+ nodeA , nodeB * corev1.Node ,
309+ timeout , pollInterval time.Duration ) (leaderNode , learnerNode * corev1.Node , learnerStarted bool ) {
310+
311+ o .EventuallyWithOffset (1 , func () error {
312+ members , err := getMembers (e )
313+ if err != nil {
314+ return err
315+ }
316+ if len (members ) != 2 {
317+ return fmt .Errorf ("expected 2 members, got %d" , len (members ))
318+ }
319+
320+ // Get state for both nodes first
321+ startedA , learnerA , err := getMemberState (nodeA , members )
322+ if err != nil {
323+ return fmt .Errorf ("failed to get state for node %s: %v" , nodeA .Name , err )
324+ }
325+
326+ startedB , learnerB , err := getMemberState (nodeB , members )
327+ if err != nil {
328+ return fmt .Errorf ("failed to get state for node %s: %v" , nodeB .Name , err )
329+ }
330+
331+ // Then, evaluate the possible combinations
332+ if ! startedA && ! startedB {
333+ return fmt .Errorf ("etcd members have not started yet" )
334+ }
335+
336+ // This should not happen
337+ if learnerA && learnerB {
338+ o .Expect (fmt .Errorf ("both nodes are learners! %s(started=%v, learner=%v), %s(started=%v, learner=%v)" ,
339+ nodeA .Name , startedA , learnerA , nodeB .Name , startedB , learnerB )).ToNot (o .HaveOccurred ())
340+ }
341+
342+ // This might happen if the disruption hasn't occurred yet, or if we take this snapshot after the learner has already been promoted
343+ if ! learnerA && ! learnerB {
344+ return fmt .Errorf ("both nodes are non-learners (should have exactly one learner): %s(started=%v, learner=%v), %s(started=%v, learner=%v)" ,
345+ nodeA .Name , startedA , learnerA , nodeB .Name , startedB , learnerB )
346+ }
347+
348+ // Once we have one leader and one learner, we don't care whether the learner has started yet, but the leader must
349+ // already have been started
350+ leaderStarted := (startedA && ! learnerA ) || (startedB && ! learnerB )
351+ if ! leaderStarted {
352+ return fmt .Errorf ("leader node is not started: %s(started=%v, learner=%v), %s(started=%v, learner=%v)" ,
353+ nodeA .Name , startedA , learnerA , nodeB .Name , startedB , learnerB )
354+ }
355+
356+ // Set return values based on actual roles
357+ if learnerA {
358+ leaderNode = nodeB
359+ learnerNode = nodeA
360+ learnerStarted = startedA
361+ } else {
362+ leaderNode = nodeA
363+ learnerNode = nodeB
364+ learnerStarted = startedB
365+ }
366+
367+ g .GinkgoT ().Logf ("SUCCESS: Leader is %s, learner is %s (started=%v)" ,
368+ leaderNode .Name , learnerNode .Name , learnerStarted )
369+
255370 return nil
256371 }, timeout , pollInterval ).ShouldNot (o .HaveOccurred ())
372+
373+ return leaderNode , learnerNode , learnerStarted
257374}
0 commit comments