@@ -12,7 +12,7 @@ import (
 	o "github.com/onsi/gomega"
 	v1 "github.com/openshift/api/config/v1"
 	"github.com/openshift/origin/test/extended/etcd/helpers"
-	exutil "github.com/openshift/origin/test/extended/util"
+	"github.com/openshift/origin/test/extended/util"
 	"github.com/pkg/errors"
 	"go.etcd.io/etcd/api/v3/etcdserverpb"
 	corev1 "k8s.io/api/core/v1"
@@ -21,147 +21,108 @@ import (
 
 const (
 	nodeIsHealthyTimeout         = time.Minute
+	etcdOperatorIsHealthyTimeout = time.Minute
 	memberHasLeftTimeout         = 5 * time.Minute
-	memberIsLeaderTimeout        = 2 * time.Minute
+	memberIsLeaderTimeout        = 10 * time.Minute
 	memberRejoinedLearnerTimeout = 10 * time.Minute
-	memberPromotedVotingTimeout  = 10 * time.Minute
+	memberPromotedVotingTimeout  = 15 * time.Minute
 	pollInterval                 = 5 * time.Second
 )
 
 var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:DualReplica][Suite:openshift/two-node][Disruptive] Two Node with Fencing etcd recovery", func() {
 	defer g.GinkgoRecover()
 
 	var (
-		oc                = exutil.NewCLIWithoutNamespace("").AsAdmin()
-		etcdClientFactory *helpers.EtcdClientFactoryImpl
-		nodeA, nodeB      corev1.Node
+		oc                       = util.NewCLIWithoutNamespace("").AsAdmin()
+		etcdClientFactory        *helpers.EtcdClientFactoryImpl
+		survivedNode, targetNode corev1.Node
 	)
 
 	g.BeforeEach(func() {
 		skipIfNotTopology(oc, v1.DualReplicaTopologyMode)
 
+		g.By("Verifying etcd cluster operator is healthy before starting test")
+		o.Eventually(func() error {
+			return ensureEtcdOperatorHealthy(oc)
+		}, etcdOperatorIsHealthyTimeout, pollInterval).ShouldNot(o.HaveOccurred(), "etcd cluster operator should be healthy before starting test")
+
 		nodes, err := oc.AdminKubeClient().CoreV1().Nodes().List(context.Background(), metav1.ListOptions{})
 		o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to retrieve nodes without error")
 		o.Expect(len(nodes.Items)).To(o.BeNumerically("==", 2), "Expected to find 2 Nodes only")
 
 		// Select the first index randomly
 		randomIndex := rand.Intn(len(nodes.Items))
-		nodeA = nodes.Items[randomIndex]
+		survivedNode = nodes.Items[randomIndex]
 		// Select the remaining index
-		nodeB = nodes.Items[(randomIndex+1)%len(nodes.Items)]
-		g.GinkgoT().Printf("Randomly selected %s (%s) to be gracefully shut down and %s (%s) to take the lead\n", nodeB.Name, nodeB.Status.Addresses[0].Address, nodeA.Name, nodeA.Status.Addresses[0].Address)
+		targetNode = nodes.Items[(randomIndex+1)%len(nodes.Items)]
+		g.GinkgoT().Printf("Randomly selected %s (%s) to be shut down and %s (%s) to take the lead\n", targetNode.Name, targetNode.Status.Addresses[0].Address, survivedNode.Name, survivedNode.Status.Addresses[0].Address)
 
 		kubeClient := oc.KubeClient()
 		etcdClientFactory = helpers.NewEtcdClientFactory(kubeClient)
 
 		g.GinkgoT().Printf("Ensure both nodes are healthy before starting the test\n")
 		o.Eventually(func() error {
-			return helpers.EnsureHealthyMember(g.GinkgoT(), etcdClientFactory, nodeA.Name)
+			return helpers.EnsureHealthyMember(g.GinkgoT(), etcdClientFactory, survivedNode.Name)
 		}, nodeIsHealthyTimeout, pollInterval).ShouldNot(o.HaveOccurred(), "expect to ensure Node A healthy without error")
 
 		o.Eventually(func() error {
-			return helpers.EnsureHealthyMember(g.GinkgoT(), etcdClientFactory, nodeB.Name)
+			return helpers.EnsureHealthyMember(g.GinkgoT(), etcdClientFactory, targetNode.Name)
 		}, nodeIsHealthyTimeout, pollInterval).ShouldNot(o.HaveOccurred(), "expect to ensure Node B healthy without error")
 	})
 
-	g.It("Should support a graceful node shutdown", func() {
-		msg := fmt.Sprintf("Shutting down %s gracefully in 1 minute", nodeB.Name)
-		g.By(msg)
-		// NOTE: Using `shutdown` alone would cause the node to be permanently removed from the cluster.
-		// To prevent this, we use the `--reboot` flag, which ensures a graceful shutdown and allows the
-		// node to rejoin the cluster upon restart. A one-minute delay is added to give the debug node
-		// sufficient time to cleanly exit before the shutdown process completes.
-		_, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, nodeB.Name, "openshift-etcd", "shutdown", "--reboot", "+1")
+	g.It("Should recover from graceful node shutdown with etcd member re-addition", func() {
+		g.By(fmt.Sprintf("Shutting down %s gracefully in 1 minute", targetNode.Name))
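+		// Trigger a reboot rather than a permanent shutdown so the node can rejoin the cluster
+		// once it comes back up; the test then waits a minute for the reboot to take effect.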
+		err := util.TriggerNodeRebootGraceful(oc.KubeClient(), targetNode.Name)
 		o.Expect(err).To(o.BeNil(), "Expected to gracefully shutdown the node without errors")
 		time.Sleep(time.Minute)
 
-		msg = fmt.Sprintf("Ensuring %s leaves the member list", nodeB.Name)
-		g.By(msg)
+		g.By(fmt.Sprintf("Ensuring %s leaves the member list", targetNode.Name))
 		o.Eventually(func() error {
-			return helpers.EnsureMemberRemoved(g.GinkgoT(), etcdClientFactory, nodeB.Name)
+			return helpers.EnsureMemberRemoved(g.GinkgoT(), etcdClientFactory, targetNode.Name)
 		}, memberHasLeftTimeout, pollInterval).ShouldNot(o.HaveOccurred())
 
-		msg = fmt.Sprintf("Ensuring that %s is a healthy voting member and adds %s back as learner", nodeA.Name, nodeB.Name)
-		g.By(msg)
-		o.Eventually(func() error {
-			members, err := getMembers(etcdClientFactory)
-			if err != nil {
-				return err
-			}
-			if len(members) != 2 {
-				return fmt.Errorf("Not enough members")
-			}
-
-			if started, learner, err := getMemberState(&nodeA, members); err != nil {
-				return err
-			} else if !started || learner {
-				return fmt.Errorf("Expected node: %s to be a started and voting member. Membership: %+v", nodeA.Name, members)
-			}
-
-			// Ensure GNS node is unstarted and a learner member
-			if started, learner, err := getMemberState(&nodeB, members); err != nil {
-				return err
-			} else if started || !learner {
-				return fmt.Errorf("Expected node: %s to be a unstarted and learning member. Membership: %+v", nodeB.Name, members)
-			}
-
-			g.GinkgoT().Logf("membership: %+v", members)
-			return nil
-		}, memberIsLeaderTimeout, pollInterval).ShouldNot(o.HaveOccurred())
-
-		msg = fmt.Sprintf("Ensuring %s rejoins as learner", nodeB.Name)
-		g.By(msg)
-		o.Eventually(func() error {
-			members, err := getMembers(etcdClientFactory)
-			if err != nil {
-				return err
-			}
-			if len(members) != 2 {
-				return fmt.Errorf("Not enough members")
-			}
-
-			if started, learner, err := getMemberState(&nodeA, members); err != nil {
-				return err
-			} else if !started || learner {
-				return fmt.Errorf("Expected node: %s to be a started and voting member. Membership: %+v", nodeA.Name, members)
-			}
-
-			if started, learner, err := getMemberState(&nodeB, members); err != nil {
-				return err
-			} else if !started || !learner {
-				return fmt.Errorf("Expected node: %s to be a started and learner member. Membership: %+v", nodeB.Name, members)
-			}
-
-			g.GinkgoT().Logf("membership: %+v", members)
-			return nil
-		}, memberRejoinedLearnerTimeout, pollInterval).ShouldNot(o.HaveOccurred())
-
-		msg = fmt.Sprintf("Ensuring %s node is promoted back as voting member", nodeB.Name)
-		g.By(msg)
-		o.Eventually(func() error {
-			members, err := getMembers(etcdClientFactory)
-			if err != nil {
-				return err
-			}
-			if len(members) != 2 {
-				return fmt.Errorf("Not enough members")
-			}
-
-			if started, learner, err := getMemberState(&nodeA, members); err != nil {
-				return err
-			} else if !started || learner {
-				return fmt.Errorf("Expected node: %s to be a started and voting member. Membership: %+v", nodeA.Name, members)
-			}
-
-			if started, learner, err := getMemberState(&nodeB, members); err != nil {
-				return err
-			} else if !started || learner {
-				return fmt.Errorf("Expected node: %s to be a started and voting member. Membership: %+v", nodeB.Name, members)
-			}
-
-			g.GinkgoT().Logf("membership: %+v", members)
-			return nil
-		}, memberPromotedVotingTimeout, pollInterval).ShouldNot(o.HaveOccurred())
+		g.By(fmt.Sprintf("Ensuring that %s is a healthy voting member and adds %s back as learner", survivedNode.Name, targetNode.Name))
+		validateEtcdRecoveryState(etcdClientFactory,
+			&survivedNode, true, false, // survivedNode expected started == true, learner == false
+			&targetNode, false, true, // targetNode expected started == false, learner == true
+			memberIsLeaderTimeout, pollInterval)
+
+		g.By(fmt.Sprintf("Ensuring %s rejoins as learner", targetNode.Name))
+		validateEtcdRecoveryState(etcdClientFactory,
+			&survivedNode, true, false, // survivedNode expected started == true, learner == false
+			&targetNode, true, true, // targetNode expected started == true, learner == true
+			memberRejoinedLearnerTimeout, pollInterval)
+
+		g.By(fmt.Sprintf("Ensuring %s node is promoted back as voting member", targetNode.Name))
+		validateEtcdRecoveryState(etcdClientFactory,
+			&survivedNode, true, false, // survivedNode expected started == true, learner == false
+			&targetNode, true, false, // targetNode expected started == true, learner == false
+			memberPromotedVotingTimeout, pollInterval)
+	})
+
+	g.It("Should recover from ungraceful node shutdown with etcd member re-addition", func() {
+		g.By(fmt.Sprintf("Shutting down %s ungracefully in 1 minute", targetNode.Name))
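+		// An ungraceful reboot gives etcd no chance to leave the cluster cleanly; the surviving
+		// node is expected to re-add the member as a learner and promote it, as validated below.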
+		err := util.TriggerNodeRebootUngraceful(oc.KubeClient(), targetNode.Name)
+		o.Expect(err).To(o.BeNil(), "Expected to ungracefully shutdown node %s without errors", targetNode.Name)
+		time.Sleep(1 * time.Minute)
+
+		g.By(fmt.Sprintf("Ensuring that %s added %s back as learner", survivedNode.Name, targetNode.Name))
+		validateEtcdRecoveryState(etcdClientFactory,
+			&survivedNode, true, false, // survivedNode expected started == true, learner == false
+			&targetNode, false, true, // targetNode expected started == false, learner == true
+			memberIsLeaderTimeout, pollInterval)
+
+		g.By(fmt.Sprintf("Ensuring %s rejoins as learner", targetNode.Name))
+		validateEtcdRecoveryState(etcdClientFactory,
+			&survivedNode, true, false, // survivedNode expected started == true, learner == false
+			&targetNode, true, true, // targetNode expected started == true, learner == true
+			memberRejoinedLearnerTimeout, pollInterval)
+
+		g.By(fmt.Sprintf("Ensuring %s node is promoted back as voting member", targetNode.Name))
+		validateEtcdRecoveryState(etcdClientFactory,
+			&survivedNode, true, false, // survivedNode expected started == true, learner == false
+			&targetNode, true, false, // targetNode expected started == true, learner == false
+			memberPromotedVotingTimeout, pollInterval)
 	})
 })
 
@@ -205,3 +166,92 @@ func getMemberState(node *corev1.Node, members []*etcdserverpb.Member) (started,
 	}
 	return started, learner, nil
 }
+
+// ensureEtcdOperatorHealthy checks if the cluster-etcd-operator is healthy before running etcd tests
+func ensureEtcdOperatorHealthy(oc *util.CLI) error {
+	g.By("Checking etcd ClusterOperator status")
+	etcdOperator, err := oc.AdminConfigClient().ConfigV1().ClusterOperators().Get(context.Background(), "etcd", metav1.GetOptions{})
+	if err != nil {
+		return fmt.Errorf("failed to retrieve etcd ClusterOperator: %v", err)
+	}
+
+	// Check if etcd operator is Available
+	available := findClusterOperatorCondition(etcdOperator.Status.Conditions, v1.OperatorAvailable)
+	if available == nil || available.Status != v1.ConditionTrue {
+		return fmt.Errorf("etcd ClusterOperator is not Available: %v", available)
+	}
+
+	// Check if etcd operator is not Degraded
+	degraded := findClusterOperatorCondition(etcdOperator.Status.Conditions, v1.OperatorDegraded)
+	if degraded != nil && degraded.Status == v1.ConditionTrue {
+		return fmt.Errorf("etcd ClusterOperator is Degraded: %s", degraded.Message)
+	}
+
+	// Progressing is only logged as a warning, since it can be expected during normal operations
+	progressing := findClusterOperatorCondition(etcdOperator.Status.Conditions, v1.OperatorProgressing)
+	if progressing != nil && progressing.Status == v1.ConditionTrue {
+		g.GinkgoT().Logf("Warning: etcd ClusterOperator is Progressing: %s", progressing.Message)
+	}
+
+	g.By("Checking etcd pods are running")
+	etcdPods, err := oc.AdminKubeClient().CoreV1().Pods("openshift-etcd").List(context.Background(), metav1.ListOptions{
+		LabelSelector: "app=etcd",
+	})
+	if err != nil {
+		return fmt.Errorf("failed to retrieve etcd pods: %v", err)
+	}
+
+	runningPods := 0
+	for _, pod := range etcdPods.Items {
+		if pod.Status.Phase == corev1.PodRunning {
+			runningPods++
+		}
+	}
+
+	if runningPods < 2 {
+		return fmt.Errorf("expected at least 2 etcd pods running, found %d", runningPods)
+	}
+
+	g.GinkgoT().Logf("etcd cluster operator is healthy: Available=True, Degraded=False, %d pods running", runningPods)
+	return nil
+}
+
+// findClusterOperatorCondition finds a condition in ClusterOperator status
+func findClusterOperatorCondition(conditions []v1.ClusterOperatorStatusCondition, conditionType v1.ClusterStatusConditionType) *v1.ClusterOperatorStatusCondition {
+	for i := range conditions {
+		if conditions[i].Type == conditionType {
+			return &conditions[i]
+		}
+	}
+	return nil
+}
+
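+// validateEtcdRecoveryState polls the etcd membership until both the survived node and the target
+// node report the expected started/learner state, failing the calling test if the timeout expires.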
+func validateEtcdRecoveryState(e *helpers.EtcdClientFactoryImpl, survivedNode *corev1.Node, isSurvivedNodeStartedExpected, isSurvivedNodeLearnerExpected bool, targetNode *corev1.Node, isTargetNodeStartedExpected, isTargetNodeLearnerExpected bool, timeout, pollInterval time.Duration) {
+	o.EventuallyWithOffset(1, func() error {
+		members, err := getMembers(e)
+		if err != nil {
+			return err
+		}
+		if len(members) != 2 {
+			return fmt.Errorf("Not enough members")
+		}
+
+		if started, learner, err := getMemberState(survivedNode, members); err != nil {
+			return err
+		} else if isSurvivedNodeStartedExpected != started || isSurvivedNodeLearnerExpected != learner {
+			return fmt.Errorf("expected node %s to have started==%v and learner==%v, got this membership instead: %+v",
+				survivedNode.Name, isSurvivedNodeStartedExpected, isSurvivedNodeLearnerExpected, members)
+		}
+
+		// Check the target node against its expected started/learner state
+		if started, learner, err := getMemberState(targetNode, members); err != nil {
+			return err
+		} else if isTargetNodeStartedExpected != started || isTargetNodeLearnerExpected != learner {
+			return fmt.Errorf("expected node %s to have started==%v and learner==%v, got this membership instead: %+v",
+				targetNode.Name, isTargetNodeStartedExpected, isTargetNodeLearnerExpected, members)
+		}
+
+		g.GinkgoT().Logf("current membership: %+v", members)
+		return nil
+	}, timeout, pollInterval).ShouldNot(o.HaveOccurred())
+}