
Commit b36acb8

Merge pull request #30022 from Neilhamza/OCPEDGE-1969
OCPEDGE-1969: add recovery tests for TNA
2 parents 9849104 + 9881db2 commit b36acb8

File tree: 5 files changed (+201 / -37 lines)


test/extended/two_node/arbiter_topology.go

Lines changed: 14 additions & 37 deletions
@@ -21,21 +21,19 @@ import (
 	"k8s.io/apimachinery/pkg/util/wait"
 )
 
-var (
-	expectedPods = map[string]int{
-		"openshift-cluster-node-tuning-operator": 1,
-		"openshift-dns":                          1,
-		"openshift-etcd":                         2,
-		"openshift-image-registry":               1,
-		"openshift-kni-infra":                    3,
-		"openshift-machine-config-operator":      2,
-		"openshift-monitoring":                   1,
-		"openshift-multus":                       3,
-		"openshift-network-diagnostics":          1,
-		"openshift-network-operator":             1,
-		"openshift-ovn-kubernetes":               1,
-	}
-)
+var expectedPods = map[string]int{
+	"openshift-cluster-node-tuning-operator": 1,
+	"openshift-dns":                          1,
+	"openshift-etcd":                         2,
+	"openshift-image-registry":               1,
+	"openshift-kni-infra":                    3,
+	"openshift-machine-config-operator":      2,
+	"openshift-monitoring":                   1,
+	"openshift-multus":                       3,
+	"openshift-network-diagnostics":          1,
+	"openshift-network-operator":             1,
+	"openshift-ovn-kubernetes":               1,
+}
 
 var _ = g.Describe("[sig-node][apigroup:config.openshift.io][OCPFeatureGate:HighlyAvailableArbiter] expected Master and Arbiter node counts", func() {
 	defer g.GinkgoRecover()
@@ -70,15 +68,12 @@ var _ = g.Describe("[sig-node][apigroup:config.openshift.io][OCPFeatureGate:High
 var _ = g.Describe("[sig-node][apigroup:config.openshift.io][OCPFeatureGate:HighlyAvailableArbiter] required pods on the Arbiter node", func() {
 	defer g.GinkgoRecover()
 
-	var (
-		oc = exutil.NewCLIWithoutNamespace("")
-	)
+	oc := exutil.NewCLIWithoutNamespace("")
 
 	g.BeforeEach(func() {
 		skipIfNotTopology(oc, v1.HighlyAvailableArbiterMode)
 	})
 	g.It("Should verify that the correct number of pods are running on the Arbiter node", func() {
-
 		g.By("Retrieving the Arbiter node name")
 		nodes, err := oc.AdminKubeClient().CoreV1().Nodes().List(context.Background(), metav1.ListOptions{
 			LabelSelector: labelNodeRoleArbiter,
@@ -405,21 +400,3 @@ func createDaemonSetDeployment(oc *exutil.CLI) (*appv1.DaemonSet, error) {
 func isPodRunning(pod corev1.Pod) bool {
 	return pod.Status.Phase == corev1.PodRunning
 }
-
-func isClusterOperatorAvailable(operator *v1.ClusterOperator) bool {
-	for _, cond := range operator.Status.Conditions {
-		if cond.Type == v1.OperatorAvailable && cond.Status == v1.ConditionTrue {
-			return true
-		}
-	}
-	return false
-}
-
-func isClusterOperatorDegraded(operator *v1.ClusterOperator) bool {
-	for _, cond := range operator.Status.Conditions {
-		if cond.Type == v1.OperatorDegraded && cond.Status == v1.ConditionTrue {
-			return true
-		}
-	}
-	return false
-}

test/extended/two_node/common.go

Lines changed: 18 additions & 0 deletions
@@ -24,3 +24,21 @@ func skipIfNotTopology(oc *exutil.CLI, wanted v1.TopologyMode) {
 		e2eskipper.Skip(fmt.Sprintf("Cluster is not in %v topology, skipping test", wanted))
 	}
 }
+
+func isClusterOperatorAvailable(operator *v1.ClusterOperator) bool {
+	for _, cond := range operator.Status.Conditions {
+		if cond.Type == v1.OperatorAvailable && cond.Status == v1.ConditionTrue {
+			return true
+		}
+	}
+	return false
+}
+
+func isClusterOperatorDegraded(operator *v1.ClusterOperator) bool {
+	for _, cond := range operator.Status.Conditions {
+		if cond.Type == v1.OperatorDegraded && cond.Status == v1.ConditionTrue {
+			return true
+		}
+	}
+	return false
+}
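
The two condition helpers moved into common.go are shared by the new recovery tests below, which only call isClusterOperatorAvailable inside their polls. As a rough sketch of how both helpers could be combined into a single health check (a hypothetical waitForEtcdHealthy wrapper, not part of this change; it only reuses API calls already present in the diff):

// Hypothetical wrapper, not part of this PR: polls the etcd ClusterOperator
// until it reports Available=True and not Degraded, reusing the helpers above.
func waitForEtcdHealthy(ctx context.Context, oc *exutil.CLI, timeout time.Duration) error {
	return wait.PollUntilContextTimeout(ctx, 15*time.Second, timeout, true, func(ctx context.Context) (bool, error) {
		operator, err := oc.AdminConfigClient().ConfigV1().ClusterOperators().Get(ctx, "etcd", metav1.GetOptions{})
		if err != nil {
			// Tolerate transient API errors (e.g. while nodes reboot) and keep polling.
			return false, nil
		}
		return isClusterOperatorAvailable(operator) && !isClusterOperatorDegraded(operator), nil
	})
}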
Lines changed: 159 additions & 0 deletions
@@ -0,0 +1,159 @@
+package two_node
+
+import (
+	"context"
+	"fmt"
+	"strings"
+	"time"
+
+	g "github.com/onsi/ginkgo/v2"
+	o "github.com/onsi/gomega"
+
+	v1 "github.com/openshift/api/config/v1"
+	exutil "github.com/openshift/origin/test/extended/util"
+
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/wait"
+)
+
+const (
+	statusReady    = "Ready"
+	statusNotReady = "NotReady"
+	statusUnknown  = "Unknown"
+)
+
+var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:HighlyAvailableArbiter][Suite:openshift/two-node][Disruptive] One master node outage is handled seamlessly", func() {
+	defer g.GinkgoRecover()
+	oc := exutil.NewCLIWithoutNamespace("").AsAdmin()
+
+	g.BeforeEach(func() {
+		skipIfNotTopology(oc, v1.HighlyAvailableArbiterMode)
+	})
+
+	g.It("should maintain etcd quorum and workloads with one master node down", func() {
+		ctx := context.Background()
+
+		g.By("Identifying one master node to simulate failure")
+		masterNodes, err := oc.AdminKubeClient().CoreV1().Nodes().List(ctx, metav1.ListOptions{
+			LabelSelector: labelNodeRoleMaster,
+		})
+		o.Expect(err).To(o.BeNil())
+		o.Expect(masterNodes.Items).To(o.HaveLen(2))
+		targetNode := masterNodes.Items[0].Name
+
+		g.By(fmt.Sprintf("Gracefully rebooting %s to simulate failure", targetNode))
+		shutdownOrRebootNode(oc, targetNode, "openshift-etcd", "shutdown", "-r", "+1")
+
+		g.By("Waiting for the node to become NotReady")
+		waitForNodeCondition(oc, targetNode, corev1.NodeReady, corev1.ConditionFalse, statusNotReady, 10*time.Minute)
+
+		g.By("Validating etcd quorum is met while the node is still NotReady")
+		err = wait.PollUntilContextTimeout(ctx, 15*time.Second, 15*time.Minute, true, func(ctx context.Context) (bool, error) {
+			operator, err := oc.AdminConfigClient().ConfigV1().ClusterOperators().Get(ctx, "etcd", metav1.GetOptions{})
+			if err != nil {
+				return false, nil
+			}
+			return isClusterOperatorAvailable(operator), nil
+		})
+		o.Expect(err).To(o.BeNil(), "Expected etcd operator to remain healthy while one master node is NotReady")
+	})
+	g.AfterEach(func() {
+		ctx := context.Background()
+		g.By("Ensuring all cluster nodes are back to Ready state")
+
+		nodeList, err := oc.AdminKubeClient().CoreV1().Nodes().List(ctx, metav1.ListOptions{})
+		o.Expect(err).To(o.BeNil(), "Failed to list cluster nodes")
+
+		for _, node := range nodeList.Items {
+			waitForNodeCondition(oc, node.Name, corev1.NodeReady, corev1.ConditionTrue, statusReady, 15*time.Minute)
+		}
+	})
+})
+
+var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:HighlyAvailableArbiter][Suite:openshift/two-node][Disruptive] Recovery when arbiter node is down and master nodes restart", func() {
+	defer g.GinkgoRecover()
+	oc := exutil.NewCLIWithoutNamespace("").AsAdmin()
+	var arbiterNodeName string
+	g.BeforeEach(func() {
+		skipIfNotTopology(oc, v1.HighlyAvailableArbiterMode)
+	})
+	g.It("should regain quorum after arbiter down and master nodes restart", func() {
+		ctx := context.Background()
+
+		g.By("Getting arbiter node")
+		arbiterNodes, err := oc.AdminKubeClient().CoreV1().Nodes().List(ctx, metav1.ListOptions{
+			LabelSelector: labelNodeRoleArbiter,
+		})
+		o.Expect(err).To(o.BeNil())
+		o.Expect(arbiterNodes.Items).To(o.HaveLen(1))
+		arbiterNode := arbiterNodes.Items[0]
+		arbiterNodeName = arbiterNode.Name
+
+		g.By("Triggering 15-minute simulated shutdown on arbiter node by stopping kubelet")
+		_, err = exutil.DebugNodeRetryWithOptionsAndChroot(oc, arbiterNodeName, "openshift-etcd",
+			"bash", "-c", `systemd-run --on-active=10s --unit=delayed-reboot.service bash -c "sleep 5; systemctl stop kubelet; sleep 900; reboot"`)
+		o.Expect(err).To(o.BeNil(), "Expected arbiter shutdown simulation to succeed")
+
+		g.By("Waiting for arbiter to become status Unknown due to kubelet being stopped")
+		waitForNodeCondition(oc, arbiterNodeName, corev1.NodeReady, corev1.ConditionUnknown, statusUnknown, 5*time.Minute)
+
+		g.By("Rebooting both master nodes")
+		masterNodes, err := oc.AdminKubeClient().CoreV1().Nodes().List(ctx, metav1.ListOptions{
+			LabelSelector: labelNodeRoleMaster,
+		})
+		o.Expect(err).To(o.BeNil())
+		for _, node := range masterNodes.Items {
+			shutdownOrRebootNode(oc, node.Name, "openshift-etcd", "shutdown", "-r", "+1")
+		}
+
+		g.By("Waiting for master nodes to become NotReady")
+		for _, node := range masterNodes.Items {
+			waitForNodeCondition(oc, node.Name, corev1.NodeReady, corev1.ConditionFalse, statusNotReady, 10*time.Minute)
+		}
+
+		g.By("Waiting for master nodes to become Ready")
+		for _, node := range masterNodes.Items {
+			waitForNodeCondition(oc, node.Name, corev1.NodeReady, corev1.ConditionTrue, statusReady, 15*time.Minute)
+		}
+
+		g.By("Waiting for etcd quorum to be restored")
+		err = wait.PollUntilContextTimeout(ctx, 15*time.Second, 15*time.Minute, true, func(ctx context.Context) (bool, error) {
+			operator, err := oc.AdminConfigClient().ConfigV1().ClusterOperators().Get(ctx, "etcd", metav1.GetOptions{})
+			if err != nil {
+				return false, nil
+			}
+			return isClusterOperatorAvailable(operator), nil
+		})
+		o.Expect(err).To(o.BeNil(), "Expected etcd operator to become available again")
+	})
+	g.AfterEach(func() {
+		g.By("Ensuring arbiter node becomes ready again")
+		waitForNodeCondition(oc, arbiterNodeName, corev1.NodeReady, corev1.ConditionTrue, statusReady, 15*time.Minute)
+	})
+})
+
+func shutdownOrRebootNode(oc *exutil.CLI, nodeName, component string, args ...string) {
+	_, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, nodeName, component, args...)
+	action := strings.Join(args, " ")
+	o.Expect(err).To(o.BeNil(), fmt.Sprintf("Expected node %s to execute '%s' successfully", nodeName, action))
+}
+
+func waitForNodeCondition(oc *exutil.CLI, nodeName string, conditionType corev1.NodeConditionType, expectStatus corev1.ConditionStatus, statusName string, timeout time.Duration) {
+	ctx, cancel := context.WithTimeout(context.Background(), timeout)
+	defer cancel()
+
+	err := wait.PollUntilContextTimeout(ctx, 10*time.Second, timeout, true, func(ctx context.Context) (bool, error) {
+		node, err := oc.AdminKubeClient().CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
+		if err != nil {
+			return false, nil
+		}
+		for _, cond := range node.Status.Conditions {
+			if cond.Type == conditionType && cond.Status == expectStatus {
+				return true, nil
+			}
+		}
+		return false, nil
+	})
+	o.Expect(err).To(o.BeNil(), fmt.Sprintf("Expected node %s to become %s", nodeName, statusName))
}
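
Both recovery scenarios are built from the same two helpers at the bottom of the file: shutdownOrRebootNode schedules a graceful reboot with shutdown -r +1 (one minute out), so the oc debug session can return cleanly before the node goes down, and waitForNodeCondition polls the node's Ready condition until it reaches the expected status. As an illustration of how they compose, a hypothetical additional case (not part of this PR, assumed to live inside the second Describe block where oc and arbiterNodeName are defined) that gracefully reboots the arbiter node itself could look like:

// Hypothetical extra scenario, sketched only to show how shutdownOrRebootNode
// and waitForNodeCondition compose; it is not part of this PR.
g.It("should recover after a graceful arbiter reboot", func() {
	g.By("Rebooting the arbiter node")
	shutdownOrRebootNode(oc, arbiterNodeName, "openshift-etcd", "shutdown", "-r", "+1")

	g.By("Waiting for the arbiter to drop out of Ready and come back")
	waitForNodeCondition(oc, arbiterNodeName, corev1.NodeReady, corev1.ConditionFalse, statusNotReady, 10*time.Minute)
	waitForNodeCondition(oc, arbiterNodeName, corev1.NodeReady, corev1.ConditionTrue, statusReady, 15*time.Minute)
})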

test/extended/util/annotate/generated/zz_generated.annotations.go

Lines changed: 4 additions & 0 deletions
Some generated files are not rendered by default.

zz_generated.manifests/test-reporting.yaml

Lines changed: 6 additions & 0 deletions
@@ -443,6 +443,12 @@ spec:
   - testName: '[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:HighlyAvailableArbiter]
     Ensure etcd health and quorum in HighlyAvailableArbiterMode should have all
     etcd pods running and quorum met'
+  - testName: '[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:HighlyAvailableArbiter][Suite:openshift/two-node][Disruptive]
+    One master node outage is handled seamlessly should maintain etcd quorum and
+    workloads with one master node down'
+  - testName: '[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:HighlyAvailableArbiter][Suite:openshift/two-node][Disruptive]
+    Recovery when arbiter node is down and master nodes restart should regain
+    quorum after arbiter down and master nodes restart'
   - testName: '[sig-node][apigroup:config.openshift.io][OCPFeatureGate:HighlyAvailableArbiter]
     expected Master and Arbiter node counts Should validate that there are Master
     and Arbiter nodes as specified in the cluster'
