Commit acd5b84

Improve etcd recovery tests for dual-replica
This commit extends the etcd recovery tests in the two_node suite with a new ungraceful shutdown test, and improves validation and logging. Key changes include:

- New ungraceful node shutdown test.
- Refactor the validation logic into a dedicated `validateEtcdRecoveryState` function, improving code readability and reusability.
- Improve logging and error messages to provide more context during test failures.
1 parent 26a3b2f commit acd5b84
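
At a glance, the refactor collapses three near-identical o.Eventually blocks per test into one parameterized helper: both the graceful and the ungraceful test now walk the same three recovery phases, differing only in the expected (started, learner) pair for each node. Condensed from the diff below (Go, names as in the test file):

// Phase 1: the survivor stays a started voting member; the target has been
// re-added as a not-yet-started learner.
validateEtcdRecoveryState(etcdClientFactory,
	&survivedNode, true, false,
	&targetNode, false, true,
	memberIsLeaderTimeout, pollInterval)

// Phase 2: the target has rejoined and is a started learner.
validateEtcdRecoveryState(etcdClientFactory,
	&survivedNode, true, false,
	&targetNode, true, true,
	memberRejoinedLearnerTimeout, pollInterval)

// Phase 3: the target has been promoted back to a voting member.
validateEtcdRecoveryState(etcdClientFactory,
	&survivedNode, true, false,
	&targetNode, true, false,
	memberPromotedVotingTimeout, pollInterval)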

3 files changed: +161 -105 lines changed

test/extended/two_node/tnf_recovery.go

Lines changed: 153 additions & 103 deletions
@@ -12,7 +12,7 @@ import (
 	o "github.com/onsi/gomega"
 	v1 "github.com/openshift/api/config/v1"
 	"github.com/openshift/origin/test/extended/etcd/helpers"
-	exutil "github.com/openshift/origin/test/extended/util"
+	"github.com/openshift/origin/test/extended/util"
 	"github.com/pkg/errors"
 	"go.etcd.io/etcd/api/v3/etcdserverpb"
 	corev1 "k8s.io/api/core/v1"
@@ -21,147 +21,108 @@ import (

 const (
 	nodeIsHealthyTimeout         = time.Minute
+	etcdOperatorIsHealthyTimeout = time.Minute
 	memberHasLeftTimeout         = 5 * time.Minute
-	memberIsLeaderTimeout        = 2 * time.Minute
+	memberIsLeaderTimeout        = 10 * time.Minute
 	memberRejoinedLearnerTimeout = 10 * time.Minute
-	memberPromotedVotingTimeout  = 10 * time.Minute
+	memberPromotedVotingTimeout  = 15 * time.Minute
 	pollInterval                 = 5 * time.Second
 )

 var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:DualReplica][Suite:openshift/two-node][Disruptive] Two Node with Fencing etcd recovery", func() {
 	defer g.GinkgoRecover()

 	var (
-		oc                = exutil.NewCLIWithoutNamespace("").AsAdmin()
-		etcdClientFactory *helpers.EtcdClientFactoryImpl
-		nodeA, nodeB      corev1.Node
+		oc                       = util.NewCLIWithoutNamespace("").AsAdmin()
+		etcdClientFactory        *helpers.EtcdClientFactoryImpl
+		survivedNode, targetNode corev1.Node
 	)

 	g.BeforeEach(func() {
 		skipIfNotTopology(oc, v1.DualReplicaTopologyMode)

+		g.By("Verifying etcd cluster operator is healthy before starting test")
+		o.Eventually(func() error {
+			return ensureEtcdOperatorHealthy(oc)
+		}, etcdOperatorIsHealthyTimeout, pollInterval).ShouldNot(o.HaveOccurred(), "etcd cluster operator should be healthy before starting test")
+
 		nodes, err := oc.AdminKubeClient().CoreV1().Nodes().List(context.Background(), metav1.ListOptions{})
 		o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to retrieve nodes without error")
 		o.Expect(len(nodes.Items)).To(o.BeNumerically("==", 2), "Expected to find 2 Nodes only")

 		// Select the first index randomly
 		randomIndex := rand.Intn(len(nodes.Items))
-		nodeA = nodes.Items[randomIndex]
+		survivedNode = nodes.Items[randomIndex]
 		// Select the remaining index
-		nodeB = nodes.Items[(randomIndex+1)%len(nodes.Items)]
-		g.GinkgoT().Printf("Randomly selected %s (%s) to be gracefully shut down and %s (%s) to take the lead\n", nodeB.Name, nodeB.Status.Addresses[0].Address, nodeA.Name, nodeA.Status.Addresses[0].Address)
+		targetNode = nodes.Items[(randomIndex+1)%len(nodes.Items)]
+		g.GinkgoT().Printf("Randomly selected %s (%s) to be shut down and %s (%s) to take the lead\n", targetNode.Name, targetNode.Status.Addresses[0].Address, survivedNode.Name, survivedNode.Status.Addresses[0].Address)

 		kubeClient := oc.KubeClient()
 		etcdClientFactory = helpers.NewEtcdClientFactory(kubeClient)

 		g.GinkgoT().Printf("Ensure both nodes are healthy before starting the test\n")
 		o.Eventually(func() error {
-			return helpers.EnsureHealthyMember(g.GinkgoT(), etcdClientFactory, nodeA.Name)
+			return helpers.EnsureHealthyMember(g.GinkgoT(), etcdClientFactory, survivedNode.Name)
 		}, nodeIsHealthyTimeout, pollInterval).ShouldNot(o.HaveOccurred(), "expect to ensure Node A healthy without error")

 		o.Eventually(func() error {
-			return helpers.EnsureHealthyMember(g.GinkgoT(), etcdClientFactory, nodeB.Name)
+			return helpers.EnsureHealthyMember(g.GinkgoT(), etcdClientFactory, targetNode.Name)
 		}, nodeIsHealthyTimeout, pollInterval).ShouldNot(o.HaveOccurred(), "expect to ensure Node B healthy without error")
 	})

-	g.It("Should support a graceful node shutdown", func() {
-		msg := fmt.Sprintf("Shutting down %s gracefully in 1 minute", nodeB.Name)
-		g.By(msg)
-		// NOTE: Using `shutdown` alone would cause the node to be permanently removed from the cluster.
-		// To prevent this, we use the `--reboot` flag, which ensures a graceful shutdown and allows the
-		// node to rejoin the cluster upon restart. A one-minute delay is added to give the debug node
-		// sufficient time to cleanly exit before the shutdown process completes.
-		_, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, nodeB.Name, "openshift-etcd", "shutdown", "--reboot", "+1")
+	g.It("Should recover from graceful node shutdown with etcd member re-addition", func() {
+		g.By(fmt.Sprintf("Shutting down %s gracefully in 1 minute", targetNode.Name))
+		err := util.TriggerNodeRebootGraceful(oc.KubeClient(), targetNode.Name)
 		o.Expect(err).To(o.BeNil(), "Expected to gracefully shutdown the node without errors")
 		time.Sleep(time.Minute)

-		msg = fmt.Sprintf("Ensuring %s leaves the member list", nodeB.Name)
-		g.By(msg)
+		g.By(fmt.Sprintf("Ensuring %s leaves the member list", targetNode.Name))
 		o.Eventually(func() error {
-			return helpers.EnsureMemberRemoved(g.GinkgoT(), etcdClientFactory, nodeB.Name)
+			return helpers.EnsureMemberRemoved(g.GinkgoT(), etcdClientFactory, targetNode.Name)
 		}, memberHasLeftTimeout, pollInterval).ShouldNot(o.HaveOccurred())

-		msg = fmt.Sprintf("Ensuring that %s is a healthy voting member and adds %s back as learner", nodeA.Name, nodeB.Name)
-		g.By(msg)
-		o.Eventually(func() error {
-			members, err := getMembers(etcdClientFactory)
-			if err != nil {
-				return err
-			}
-			if len(members) != 2 {
-				return fmt.Errorf("Not enough members")
-			}
-
-			if started, learner, err := getMemberState(&nodeA, members); err != nil {
-				return err
-			} else if !started || learner {
-				return fmt.Errorf("Expected node: %s to be a started and voting member. Membership: %+v", nodeA.Name, members)
-			}
-
-			// Ensure GNS node is unstarted and a learner member
-			if started, learner, err := getMemberState(&nodeB, members); err != nil {
-				return err
-			} else if started || !learner {
-				return fmt.Errorf("Expected node: %s to be a unstarted and learning member. Membership: %+v", nodeB.Name, members)
-			}
-
-			g.GinkgoT().Logf("membership: %+v", members)
-			return nil
-		}, memberIsLeaderTimeout, pollInterval).ShouldNot(o.HaveOccurred())
-
-		msg = fmt.Sprintf("Ensuring %s rejoins as learner", nodeB.Name)
-		g.By(msg)
-		o.Eventually(func() error {
-			members, err := getMembers(etcdClientFactory)
-			if err != nil {
-				return err
-			}
-			if len(members) != 2 {
-				return fmt.Errorf("Not enough members")
-			}
-
-			if started, learner, err := getMemberState(&nodeA, members); err != nil {
-				return err
-			} else if !started || learner {
-				return fmt.Errorf("Expected node: %s to be a started and voting member. Membership: %+v", nodeA.Name, members)
-			}
-
-			if started, learner, err := getMemberState(&nodeB, members); err != nil {
-				return err
-			} else if !started || !learner {
-				return fmt.Errorf("Expected node: %s to be a started and learner member. Membership: %+v", nodeB.Name, members)
-			}
-
-			g.GinkgoT().Logf("membership: %+v", members)
-			return nil
-		}, memberRejoinedLearnerTimeout, pollInterval).ShouldNot(o.HaveOccurred())
-
-		msg = fmt.Sprintf("Ensuring %s node is promoted back as voting member", nodeB.Name)
-		g.By(msg)
-		o.Eventually(func() error {
-			members, err := getMembers(etcdClientFactory)
-			if err != nil {
-				return err
-			}
-			if len(members) != 2 {
-				return fmt.Errorf("Not enough members")
-			}
-
-			if started, learner, err := getMemberState(&nodeA, members); err != nil {
-				return err
-			} else if !started || learner {
-				return fmt.Errorf("Expected node: %s to be a started and voting member. Membership: %+v", nodeA.Name, members)
-			}
-
-			if started, learner, err := getMemberState(&nodeB, members); err != nil {
-				return err
-			} else if !started || learner {
-				return fmt.Errorf("Expected node: %s to be a started and voting member. Membership: %+v", nodeB.Name, members)
-			}
-
-			g.GinkgoT().Logf("membership: %+v", members)
-			return nil
-		}, memberPromotedVotingTimeout, pollInterval).ShouldNot(o.HaveOccurred())
+		g.By(fmt.Sprintf("Ensuring that %s is a healthy voting member and adds %s back as learner", survivedNode.Name, targetNode.Name))
+		validateEtcdRecoveryState(etcdClientFactory,
+			&survivedNode, true, false, // survivedNode expected started == true, learner == false
+			&targetNode, false, true, // targetNode expected started == false, learner == true
+			memberIsLeaderTimeout, pollInterval)
+
+		g.By(fmt.Sprintf("Ensuring %s rejoins as learner", targetNode.Name))
+		validateEtcdRecoveryState(etcdClientFactory,
+			&survivedNode, true, false, // survivedNode expected started == true, learner == false
+			&targetNode, true, true, // targetNode expected started == true, learner == true
+			memberRejoinedLearnerTimeout, pollInterval)
+
+		g.By(fmt.Sprintf("Ensuring %s node is promoted back as voting member", targetNode.Name))
+		validateEtcdRecoveryState(etcdClientFactory,
+			&survivedNode, true, false, // survivedNode expected started == true, learner == false
+			&targetNode, true, false, // targetNode expected started == true, learner == false
+			memberPromotedVotingTimeout, pollInterval)
+	})
+
+	g.It("Should recover from ungraceful node shutdown with etcd member re-addition", func() {
+		g.By(fmt.Sprintf("Shutting down %s ungracefully in 1 minute", targetNode.Name))
+		err := util.TriggerNodeRebootUngraceful(oc.KubeClient(), targetNode.Name)
+		o.Expect(err).To(o.BeNil(), "Expected to ungracefully shutdown the node without errors", targetNode.Name, err)
+		time.Sleep(1 * time.Minute)
+
+		g.By(fmt.Sprintf("Ensuring that %s added %s back as learner", survivedNode.Name, targetNode.Name))
+		validateEtcdRecoveryState(etcdClientFactory,
+			&survivedNode, true, false, // survivedNode expected started == true, learner == false
+			&targetNode, false, true, // targetNode expected started == false, learner == true
+			memberIsLeaderTimeout, pollInterval)
+
+		g.By(fmt.Sprintf("Ensuring %s rejoins as learner", targetNode.Name))
+		validateEtcdRecoveryState(etcdClientFactory,
+			&survivedNode, true, false, // survivedNode expected started == true, learner == false
+			&targetNode, true, true, // targetNode expected started == true, learner == true
+			memberRejoinedLearnerTimeout, pollInterval)
+
+		g.By(fmt.Sprintf("Ensuring %s node is promoted back as voting member", targetNode.Name))
+		validateEtcdRecoveryState(etcdClientFactory,
+			&survivedNode, true, false, // survivedNode expected started == true, learner == false
+			&targetNode, true, false, // targetNode expected started == true, learner == false
+			memberPromotedVotingTimeout, pollInterval)
 	})
 })

@@ -205,3 +166,92 @@ func getMemberState(node *corev1.Node, members []*etcdserverpb.Member) (started,
 	}
 	return started, learner, nil
 }
+
+// ensureEtcdOperatorHealthy checks if the cluster-etcd-operator is healthy before running etcd tests
+func ensureEtcdOperatorHealthy(oc *util.CLI) error {
+	g.By("Checking etcd ClusterOperator status")
+	etcdOperator, err := oc.AdminConfigClient().ConfigV1().ClusterOperators().Get(context.Background(), "etcd", metav1.GetOptions{})
+	if err != nil {
+		return fmt.Errorf("failed to retrieve etcd ClusterOperator: %v", err)
+	}
+
+	// Check if etcd operator is Available
+	available := findClusterOperatorCondition(etcdOperator.Status.Conditions, v1.OperatorAvailable)
+	if available == nil || available.Status != v1.ConditionTrue {
+		return fmt.Errorf("etcd ClusterOperator is not Available: %v", available)
+	}
+
+	// Check if etcd operator is not Degraded
+	degraded := findClusterOperatorCondition(etcdOperator.Status.Conditions, v1.OperatorDegraded)
+	if degraded != nil && degraded.Status == v1.ConditionTrue {
+		return fmt.Errorf("etcd ClusterOperator is Degraded: %s", degraded.Message)
+	}
+
+	// Check if etcd operator is not Progressing (optional - might be ok during normal operations)
+	progressing := findClusterOperatorCondition(etcdOperator.Status.Conditions, v1.OperatorProgressing)
+	if progressing != nil && progressing.Status == v1.ConditionTrue {
+		g.GinkgoT().Logf("Warning: etcd ClusterOperator is Progressing: %s", progressing.Message)
+	}
+
+	g.By("Checking etcd pods are running")
+	etcdPods, err := oc.AdminKubeClient().CoreV1().Pods("openshift-etcd").List(context.Background(), metav1.ListOptions{
+		LabelSelector: "app=etcd",
+	})
+	if err != nil {
+		return fmt.Errorf("failed to retrieve etcd pods: %v", err)
+	}
+
+	runningPods := 0
+	for _, pod := range etcdPods.Items {
+		if pod.Status.Phase == corev1.PodRunning {
+			runningPods++
+		}
+	}
+
+	if runningPods < 2 {
+		return fmt.Errorf("expected at least 2 etcd pods running, found %d", runningPods)
+	}
+
+	g.GinkgoT().Logf("etcd cluster operator is healthy: Available=True, Degraded=False, %d pods running", runningPods)
+	return nil
+}
+
+// findClusterOperatorCondition finds a condition in ClusterOperator status
+func findClusterOperatorCondition(conditions []v1.ClusterOperatorStatusCondition, conditionType v1.ClusterStatusConditionType) *v1.ClusterOperatorStatusCondition {
+	for i := range conditions {
+		if conditions[i].Type == conditionType {
+			return &conditions[i]
+		}
+	}
+	return nil
+}
+
+func validateEtcdRecoveryState(e *helpers.EtcdClientFactoryImpl, survivedNode *corev1.Node, isSurvivedNodeStartedExpected, isSurvivedNodeLearnerExpected bool, targetNode *corev1.Node, isTargetNodeStartedExpected, isTargetNodeLearnerExpected bool, timeout, pollInterval time.Duration) {
+	o.EventuallyWithOffset(1, func() error {
+		members, err := getMembers(e)
+		if err != nil {
+			return err
+		}
+		if len(members) != 2 {
+			return fmt.Errorf("Not enough members")
+		}
+
+		if started, learner, err := getMemberState(survivedNode, members); err != nil {
+			return err
+		} else if isSurvivedNodeStartedExpected != started || isSurvivedNodeLearnerExpected != learner {
+			return fmt.Errorf("Expected node: %s to be started==%v and learner==%v, got this membership instead: %+v",
+				survivedNode.Name, isSurvivedNodeStartedExpected, isSurvivedNodeLearnerExpected, members)
+		}
+
+		// Check the target node against its expected state
+		if started, learner, err := getMemberState(targetNode, members); err != nil {
+			return err
+		} else if isTargetNodeStartedExpected != started || isTargetNodeLearnerExpected != learner {
+			return fmt.Errorf("Expected node: %s to be started==%v and learner==%v, got this membership instead: %+v",
+				targetNode.Name, isTargetNodeStartedExpected, isTargetNodeLearnerExpected, members)
+		}
+
+		g.GinkgoT().Logf("current membership: %+v", members)
+		return nil
+	}, timeout, pollInterval).ShouldNot(o.HaveOccurred())
+}

test/extended/util/annotate/generated/zz_generated.annotations.go

Lines changed: 3 additions & 1 deletion
Some generated files are not rendered by default.

zz_generated.manifests/test-reporting.yaml

Lines changed: 5 additions & 1 deletion
@@ -70,7 +70,11 @@ spec:
       Two Node with Fencing pods and podman containers Should verify the number
       of podman-etcd containers as configured'
   - testName: '[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:DualReplica][Suite:openshift/two-node][Disruptive]
-      Two Node with Fencing etcd recovery Should support a graceful node shutdown'
+      Two Node with Fencing etcd recovery Should recover from graceful node shutdown
+      with etcd member re-addition'
+  - testName: '[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:DualReplica][Suite:openshift/two-node][Disruptive]
+      Two Node with Fencing etcd recovery Should recover from ungraceful node shutdown
+      with etcd member re-addition'
   - testName: '[sig-node][apigroup:config.openshift.io][OCPFeatureGate:DualReplica][Suite:openshift/two-node]
       Two Node with Fencing topology Should validate the number of control-planes,
       arbiters as configured'
