
Commit c897529

Merge pull request #30110 from clobrano/feature/tnf-e2e-network-failure
OCPEDGE-1483: Add TNF E2E tests for network failure
2 parents 91bb880 + 79b0a98 commit c897529

4 files changed: +181, -24 lines changed

test/extended/two_node/tnf_recovery.go

Lines changed: 131 additions & 14 deletions
@@ -26,16 +26,17 @@ const (
 	memberIsLeaderTimeout        = 10 * time.Minute
 	memberRejoinedLearnerTimeout = 10 * time.Minute
 	memberPromotedVotingTimeout  = 15 * time.Minute
+	networkDisruptionDuration    = 15 * time.Second
 	pollInterval                 = 5 * time.Second
 )
 
 var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:DualReplica][Suite:openshift/two-node][Disruptive] Two Node with Fencing etcd recovery", func() {
 	defer g.GinkgoRecover()
 
 	var (
-		oc                       = util.NewCLIWithoutNamespace("").AsAdmin()
-		etcdClientFactory        *helpers.EtcdClientFactoryImpl
-		survivedNode, targetNode corev1.Node
+		oc                   = util.NewCLIWithoutNamespace("").AsAdmin()
+		etcdClientFactory    *helpers.EtcdClientFactoryImpl
+		peerNode, targetNode corev1.Node
 	)
 
 	g.BeforeEach(func() {
@@ -52,25 +53,29 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
 
 		// Select the first index randomly
 		randomIndex := rand.Intn(len(nodes.Items))
-		survivedNode = nodes.Items[randomIndex]
+		peerNode = nodes.Items[randomIndex]
 		// Select the remaining index
 		targetNode = nodes.Items[(randomIndex+1)%len(nodes.Items)]
-		g.GinkgoT().Printf("Randomly selected %s (%s) to be shut down and %s (%s) to take the lead\n", targetNode.Name, targetNode.Status.Addresses[0].Address, survivedNode.Name, survivedNode.Status.Addresses[0].Address)
 
 		kubeClient := oc.KubeClient()
 		etcdClientFactory = helpers.NewEtcdClientFactory(kubeClient)
 
 		g.GinkgoT().Printf("Ensure both nodes are healthy before starting the test\n")
 		o.Eventually(func() error {
-			return helpers.EnsureHealthyMember(g.GinkgoT(), etcdClientFactory, survivedNode.Name)
-		}, nodeIsHealthyTimeout, pollInterval).ShouldNot(o.HaveOccurred(), "expect to ensure Node A healthy without error")
+			return helpers.EnsureHealthyMember(g.GinkgoT(), etcdClientFactory, peerNode.Name)
+		}, nodeIsHealthyTimeout, pollInterval).ShouldNot(o.HaveOccurred(), fmt.Sprintf("expect to ensure Node '%s' healthiness without errors", peerNode.Name))
 
 		o.Eventually(func() error {
 			return helpers.EnsureHealthyMember(g.GinkgoT(), etcdClientFactory, targetNode.Name)
-		}, nodeIsHealthyTimeout, pollInterval).ShouldNot(o.HaveOccurred(), "expect to ensure Node B healthy without error")
+		}, nodeIsHealthyTimeout, pollInterval).ShouldNot(o.HaveOccurred(), fmt.Sprintf("expect to ensure Node '%s' healthiness without errors", targetNode.Name))
 	})
 
-	g.It("Should recover from graceful node shutdown with etcd member re-addition", func() {
+	g.It("should recover from graceful node shutdown with etcd member re-addition", func() {
+		// Note: In graceful shutdown, the targetNode is deliberately shut down while
+		// the peerNode remains running and becomes the etcd leader.
+		survivedNode := peerNode
+		g.GinkgoT().Printf("Randomly selected %s (%s) to be shut down and %s (%s) to take the lead\n",
+			targetNode.Name, targetNode.Status.Addresses[0].Address, peerNode.Name, peerNode.Status.Addresses[0].Address)
 		g.By(fmt.Sprintf("Shutting down %s gracefully in 1 minute", targetNode.Name))
 		err := util.TriggerNodeRebootGraceful(oc.KubeClient(), targetNode.Name)
 		o.Expect(err).To(o.BeNil(), "Expected to gracefully shutdown the node without errors")
@@ -81,7 +86,7 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
 			return helpers.EnsureMemberRemoved(g.GinkgoT(), etcdClientFactory, targetNode.Name)
 		}, memberHasLeftTimeout, pollInterval).ShouldNot(o.HaveOccurred())
 
-		g.By(fmt.Sprintf("Ensuring that %s is a healthy voting member and adds %s back as learner", survivedNode.Name, targetNode.Name))
+		g.By(fmt.Sprintf("Ensuring that %s is a healthy voting member and adds %s back as learner", peerNode.Name, targetNode.Name))
 		validateEtcdRecoveryState(etcdClientFactory,
 			&survivedNode, true, false, // survivedNode expected started == true, learner == false
 			&targetNode, false, true, // targetNode expected started == false, learner == true
@@ -100,13 +105,18 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
 			memberPromotedVotingTimeout, pollInterval)
 	})
 
-	g.It("Should recover from ungraceful node shutdown with etcd member re-addition", func() {
+	g.It("should recover from ungraceful node shutdown with etcd member re-addition", func() {
+		// Note: In ungraceful shutdown, the targetNode is forcibly shut down while
+		// the peerNode remains running and becomes the etcd leader.
+		survivedNode := peerNode
+		g.GinkgoT().Printf("Randomly selected %s (%s) to be shut down and %s (%s) to take the lead\n",
+			targetNode.Name, targetNode.Status.Addresses[0].Address, peerNode.Name, peerNode.Status.Addresses[0].Address)
 		g.By(fmt.Sprintf("Shutting down %s ungracefully in 1 minute", targetNode.Name))
 		err := util.TriggerNodeRebootUngraceful(oc.KubeClient(), targetNode.Name)
 		o.Expect(err).To(o.BeNil(), "Expected to ungracefully shutdown the node without errors", targetNode.Name, err)
 		time.Sleep(1 * time.Minute)
 
-		g.By(fmt.Sprintf("Ensuring that %s added %s back as learner", survivedNode.Name, targetNode.Name))
+		g.By(fmt.Sprintf("Ensuring that %s added %s back as learner", peerNode.Name, targetNode.Name))
 		validateEtcdRecoveryState(etcdClientFactory,
 			&survivedNode, true, false, // survivedNode expected started == true, learner == false
 			&targetNode, false, true, // targetNode expected started == false, learner == true
@@ -124,6 +134,41 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
 			&targetNode, true, false, // targetNode expected started == true, learner == false
 			memberPromotedVotingTimeout, pollInterval)
 	})
+
+	g.It("should recover from network disruption with etcd member re-addition", func() {
+		// Note: In network disruption, the targetNode runs the disruption command that
+		// isolates the nodes from each other, creating a split-brain where pacemaker
+		// determines which node gets fenced and which becomes the etcd leader.
+		g.GinkgoT().Printf("Randomly selected %s (%s) to run the network disruption command\n", targetNode.Name, targetNode.Status.Addresses[0].Address)
+		g.By(fmt.Sprintf("Blocking network communication between %s and %s for %v ", targetNode.Name, peerNode.Name, networkDisruptionDuration))
+		command, err := util.TriggerNetworkDisruption(oc.KubeClient(), &targetNode, &peerNode, networkDisruptionDuration)
+		o.Expect(err).To(o.BeNil(), "Expected to disrupt network without errors")
+		g.GinkgoT().Printf("command: '%s'\n", command)
+
+		g.By("Ensuring cluster recovery with proper leader/learner roles after network disruption")
+		// Note: The fenced node may recover quickly and already be started when we get
+		// the first etcd membership. This is valid behavior, so we capture the learner's
+		// state and adapt the test accordingly.
+		leaderNode, learnerNode, learnerStarted := validateEtcdRecoveryStateWithoutAssumingLeader(etcdClientFactory,
+			&peerNode, &targetNode, memberIsLeaderTimeout, pollInterval)
+
+		if learnerStarted {
+			g.GinkgoT().Printf("Learner node '%s' already started as learner\n", learnerNode.Name)
+		} else {
+			g.By(fmt.Sprintf("Ensuring '%s' rejoins as learner", learnerNode.Name))
+			validateEtcdRecoveryState(etcdClientFactory,
+				leaderNode, true, false, // survivedNode expected started == true, learner == false
+				learnerNode, true, true, // targetNode expected started == true, learner == true
+				memberRejoinedLearnerTimeout, pollInterval)
+		}
+
+		g.By(fmt.Sprintf("Ensuring learner node '%s' is promoted back as voting member", learnerNode.Name))
+		validateEtcdRecoveryState(etcdClientFactory,
+			leaderNode, true, false, // survivedNode expected started == true, learner == false
+			learnerNode, true, false, // targetNode expected started == true, learner == false
+			memberPromotedVotingTimeout, pollInterval)
+
+	})
 })
 
 func getMembers(etcdClientFactory helpers.EtcdClientCreator) ([]*etcdserverpb.Member, error) {
@@ -226,7 +271,10 @@ func findClusterOperatorCondition(conditions []v1.ClusterOperatorStatusCondition
 	return nil
 }
 
-func validateEtcdRecoveryState(e *helpers.EtcdClientFactoryImpl, survivedNode *corev1.Node, isSurvivedNodeStartedExpected, isSurvivedNodeLearnerExpected bool, targetNode *corev1.Node, isTargetNodeStartedExpected, isTargetNodeLearnerExpected bool, timeout, pollInterval time.Duration) {
+func validateEtcdRecoveryState(e *helpers.EtcdClientFactoryImpl,
+	survivedNode *corev1.Node, isSurvivedNodeStartedExpected, isSurvivedNodeLearnerExpected bool,
+	targetNode *corev1.Node, isTargetNodeStartedExpected, isTargetNodeLearnerExpected bool,
+	timeout, pollInterval time.Duration) {
 	o.EventuallyWithOffset(1, func() error {
 		members, err := getMembers(e)
 		if err != nil {
@@ -251,7 +299,76 @@ func validateEtcdRecoveryState(e *helpers.EtcdClientFactoryImpl, survivedNode *c
 				targetNode.Name, isTargetNodeStartedExpected, isTargetNodeLearnerExpected, members)
 		}
 
-		g.GinkgoT().Logf("current membership: %+v", members)
+		g.GinkgoT().Logf("SUCCESS: got membership: %+v", members)
+		return nil
+	}, timeout, pollInterval).ShouldNot(o.HaveOccurred())
+}
+
+func validateEtcdRecoveryStateWithoutAssumingLeader(e *helpers.EtcdClientFactoryImpl,
+	nodeA, nodeB *corev1.Node,
+	timeout, pollInterval time.Duration) (leaderNode, learnerNode *corev1.Node, learnerStarted bool) {
+
+	o.EventuallyWithOffset(1, func() error {
+		members, err := getMembers(e)
+		if err != nil {
+			return err
+		}
+		if len(members) != 2 {
+			return fmt.Errorf("expected 2 members, got %d", len(members))
+		}
+
+		// Get state for both nodes first
+		startedA, learnerA, err := getMemberState(nodeA, members)
+		if err != nil {
+			return fmt.Errorf("failed to get state for node %s: %v", nodeA.Name, err)
+		}
+
+		startedB, learnerB, err := getMemberState(nodeB, members)
+		if err != nil {
+			return fmt.Errorf("failed to get state for node %s: %v", nodeB.Name, err)
+		}
+
+		// Then, evaluate the possible combinations
+		if !startedA && !startedB {
+			return fmt.Errorf("etcd members have not started yet")
+		}
+
+		// This should not happen
+		if learnerA && learnerB {
+			o.Expect(fmt.Errorf("both nodes are learners! %s(started=%v, learner=%v), %s(started=%v, learner=%v)",
+				nodeA.Name, startedA, learnerA, nodeB.Name, startedB, learnerB)).ToNot(o.HaveOccurred())
+		}
+
+		// This might happen if the disruption didn't occurred yet, or we get this snapshot when the learner has been already promoted
+		if !learnerA && !learnerB {
+			return fmt.Errorf("both nodes are non-learners (should have exactly one learner): %s(started=%v, learner=%v), %s(started=%v, learner=%v)",
+				nodeA.Name, startedA, learnerA, nodeB.Name, startedB, learnerB)
+		}
+
+		// Once we get one leader and one learner, we don't care if the latter has started already, but the first must
+		// already been started
+		leaderStarted := (startedA && !learnerA) || (startedB && !learnerB)
+		if !leaderStarted {
+			return fmt.Errorf("leader node is not started: %s(started=%v, learner=%v), %s(started=%v, learner=%v)",
+				nodeA.Name, startedA, learnerA, nodeB.Name, startedB, learnerB)
+		}
+
+		// Set return values based on actual roles
+		if learnerA {
+			leaderNode = nodeB
+			learnerNode = nodeA
+			learnerStarted = startedA
+		} else {
+			leaderNode = nodeA
+			learnerNode = nodeB
+			learnerStarted = startedB
+		}
+
+		g.GinkgoT().Logf("SUCCESS: Leader is %s, learner is %s (started=%v)",
+			leaderNode.Name, learnerNode.Name, learnerStarted)
+
 		return nil
 	}, timeout, pollInterval).ShouldNot(o.HaveOccurred())
+
+	return leaderNode, learnerNode, learnerStarted
 }
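
For reference, the decision rules that validateEtcdRecoveryStateWithoutAssumingLeader applies on every poll can be condensed into a small standalone sketch; the types, node names, and member states below are hypothetical illustrations, not part of the change:

package main

import (
	"errors"
	"fmt"
)

type memberState struct {
	name    string
	started bool
	learner bool
}

// resolveRoles mirrors the helper's decision order: keep polling while neither
// member has started, treat two learners as a hard failure, keep polling while
// there is no learner yet (disruption pending or learner already promoted) or
// while the leader has not started, otherwise report the resolved roles.
func resolveRoles(a, b memberState) (leader, learner memberState, learnerStarted bool, err error) {
	switch {
	case !a.started && !b.started:
		err = errors.New("retry: etcd members have not started yet")
	case a.learner && b.learner:
		err = errors.New("fatal: both members are learners")
	case !a.learner && !b.learner:
		err = errors.New("retry: no learner found (disruption pending or learner already promoted)")
	case !((a.started && !a.learner) || (b.started && !b.learner)):
		err = errors.New("retry: leader member has not started")
	case a.learner:
		leader, learner, learnerStarted = b, a, a.started
	default:
		leader, learner, learnerStarted = a, b, b.started
	}
	return
}

func main() {
	// Hypothetical snapshot: the fenced node has already rejoined as a started learner.
	leader, learner, started, err := resolveRoles(
		memberState{name: "master-0", started: true, learner: false},
		memberState{name: "master-1", started: true, learner: true},
	)
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Printf("leader=%s learner=%s learnerStarted=%v\n", leader.name, learner.name, started)
}

In the test itself this resolution runs inside o.EventuallyWithOffset, so the "retry" outcomes simply trigger another poll until memberIsLeaderTimeout expires.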

test/extended/util/annotate/generated/zz_generated.annotations.go

Lines changed: 4 additions & 2 deletions
Some generated files are not rendered by default.

test/extended/util/nodes.go

Lines changed: 41 additions & 6 deletions
@@ -14,6 +14,16 @@ import (
 	"k8s.io/client-go/kubernetes"
 )
 
+// networkMode represents the networking mode for disruption pods
+type networkMode int
+
+const (
+	// hostNetworkMode enables host networking for the disruption pod
+	hostNetworkMode networkMode = iota
+	// podNetworkMode disables host networking for the disruption pod
+	podNetworkMode
+)
+
 // GetClusterNodesByRole returns the cluster nodes by role
 func GetClusterNodesByRole(oc *CLI, role string) ([]string, error) {
 	nodes, err := oc.AsAdmin().WithoutNamespace().Run("get").Args("node", "-l", "node-role.kubernetes.io/"+role, "-o", "jsonpath='{.items[*].metadata.name}'").Output()
@@ -95,19 +105,43 @@ func DebugAllNodesRetryWithOptionsAndChroot(oc *CLI, debugNodeNamespace string,
 // TriggerNodeRebootGraceful initiates a graceful node reboot which allows the system to terminate processes cleanly before rebooting.
 func TriggerNodeRebootGraceful(kubeClient kubernetes.Interface, nodeName string) error {
 	command := "echo 'reboot in 1 minute'; exec chroot /host shutdown -r 1"
-	return triggerNodeReboot(kubeClient, nodeName, 0, command)
+	return createNodeDisruptionPod(kubeClient, nodeName, 0, podNetworkMode, command)
 }
 
 // TriggerNodeRebootUngraceful initiates an ungraceful node reboot which does not allow the system to terminate processes cleanly before rebooting.
 func TriggerNodeRebootUngraceful(kubeClient kubernetes.Interface, nodeName string) error {
 	command := "echo 'reboot in 1 minute'; exec chroot /host sudo systemd-run sh -c 'sleep 60 && reboot --force --force'"
-	return triggerNodeReboot(kubeClient, nodeName, 0, command)
+	return createNodeDisruptionPod(kubeClient, nodeName, 0, podNetworkMode, command)
+}
+
+// TriggerNetworkDisruption blocks network traffic between the target and peer nodes for a given duration.
+func TriggerNetworkDisruption(kubeClient kubernetes.Interface, target, peer *corev1.Node, disruptionDuration time.Duration) (string, error) {
+	preambleCmd := fmt.Sprintf("echo 'temporarily disabling network connection between %s and %s for %v'; exec chroot /host sh -c ", target.Name, peer.Name, disruptionDuration)
+
+	peerIP := getNodeInternalAddress(peer)
+
+	blockTrafficCmd := fmt.Sprintf("sudo iptables -I INPUT -j DROP -s %s && sudo iptables -I OUTPUT -j DROP -d %s", peerIP, peerIP)
+	cleanupCmd := fmt.Sprintf("sudo iptables -D INPUT -j DROP -s %s; sudo iptables -D OUTPUT -j DROP -d %s", peerIP, peerIP)
+	sleepCmd := fmt.Sprintf("sleep %d", int(disruptionDuration.Seconds()))
+	disruptionCmd := fmt.Sprintf("%s 'trap \"%s\" EXIT; %s ; %s'", preambleCmd, cleanupCmd, blockTrafficCmd, sleepCmd)
+
+	return disruptionCmd, createNodeDisruptionPod(kubeClient, target.Name, 0, hostNetworkMode, disruptionCmd)
+}
+
+func getNodeInternalAddress(node *corev1.Node) string {
+	for _, addr := range node.Status.Addresses {
+		if addr.Type == corev1.NodeInternalIP {
+			return addr.Address
+		}
+	}
+	// fallback
+	return node.Status.Addresses[0].Address
 }
 
-func triggerNodeReboot(kubeClient kubernetes.Interface, nodeName string, attempt int, command string) error {
+func createNodeDisruptionPod(kubeClient kubernetes.Interface, nodeName string, attempt int, networkMode networkMode, command string) error {
 	isTrue := true
 	zero := int64(0)
-	name := fmt.Sprintf("reboot-%s-%d", nodeName, attempt)
+	name := fmt.Sprintf("disrupt-%s-%d", nodeName, attempt)
 	_, err := kubeClient.CoreV1().Pods("kube-system").Create(context.Background(), &corev1.Pod{
 		ObjectMeta: metav1.ObjectMeta{
 			Name: name,
@@ -116,6 +150,7 @@ func triggerNodeReboot(kubeClient kubernetes.Interface, nodeName string, attempt
 			},
 		},
 		Spec: corev1.PodSpec{
+			HostNetwork:   networkMode == hostNetworkMode,
 			HostPID:       true,
 			RestartPolicy: corev1.RestartPolicyNever,
 			NodeName:      nodeName,
@@ -131,7 +166,7 @@ func triggerNodeReboot(kubeClient kubernetes.Interface, nodeName string, attempt
 			},
 			Containers: []corev1.Container{
 				{
-					Name: "reboot",
+					Name: "disruption",
 					SecurityContext: &corev1.SecurityContext{
 						RunAsUser:  &zero,
 						Privileged: &isTrue,
@@ -154,7 +189,7 @@ func triggerNodeReboot(kubeClient kubernetes.Interface, nodeName string, attempt
 		},
 	}, metav1.CreateOptions{})
 	if errors.IsAlreadyExists(err) {
-		return triggerNodeReboot(kubeClient, nodeName, attempt+1, command)
+		return createNodeDisruptionPod(kubeClient, nodeName, attempt+1, hostNetworkMode, command)
 	}
 	return err
 }
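
To illustrate what TriggerNetworkDisruption ends up executing on the target node, here is a minimal standalone sketch that assembles the same shell command; the node names and the peer InternalIP (192.0.2.11) are hypothetical:

package main

import (
	"fmt"
	"time"
)

func main() {
	// Hypothetical inputs: in the test these come from the randomly selected nodes.
	targetName, peerName := "master-0", "master-1"
	peerIP := "192.0.2.11"
	disruption := 15 * time.Second // matches networkDisruptionDuration

	preamble := fmt.Sprintf("echo 'temporarily disabling network connection between %s and %s for %v'; exec chroot /host sh -c ", targetName, peerName, disruption)
	block := fmt.Sprintf("sudo iptables -I INPUT -j DROP -s %s && sudo iptables -I OUTPUT -j DROP -d %s", peerIP, peerIP)
	cleanup := fmt.Sprintf("sudo iptables -D INPUT -j DROP -s %s; sudo iptables -D OUTPUT -j DROP -d %s", peerIP, peerIP)
	sleep := fmt.Sprintf("sleep %d", int(disruption.Seconds()))

	// Same composition as the helper: drop traffic in both directions, hold for
	// the disruption window, and always delete the rules when the shell exits.
	fmt.Printf("%s 'trap \"%s\" EXIT; %s ; %s'\n", preamble, cleanup, block, sleep)
}

The trap ... EXIT makes the disruption self-healing: the iptables DROP rules are removed once the sleep window ends or the shell exits early. Host networking (hostNetworkMode) matters for this case because iptables rules are scoped to a network namespace, so the pod must share the node's network namespace for the rules to affect the node's own traffic.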

zz_generated.manifests/test-reporting.yaml

Lines changed: 5 additions & 2 deletions
@@ -189,10 +189,13 @@ spec:
   - testName: '[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:DualReplica]
     Two Node with Fencing should have podman etcd containers running on each node'
   - testName: '[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:DualReplica][Suite:openshift/two-node][Disruptive]
-    Two Node with Fencing etcd recovery Should recover from graceful node shutdown
+    Two Node with Fencing etcd recovery should recover from graceful node shutdown
     with etcd member re-addition'
   - testName: '[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:DualReplica][Suite:openshift/two-node][Disruptive]
-    Two Node with Fencing etcd recovery Should recover from ungraceful node shutdown
+    Two Node with Fencing etcd recovery should recover from network disruption
+    with etcd member re-addition'
+  - testName: '[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:DualReplica][Suite:openshift/two-node][Disruptive]
+    Two Node with Fencing etcd recovery should recover from ungraceful node shutdown
     with etcd member re-addition'
   - testName: '[sig-node][apigroup:config.openshift.io][OCPFeatureGate:DualReplica]
     Two Node with Fencing topology should have BareMetalHost operational status
