|
| 1 | +package two_node |
| 2 | + |
| 3 | +import ( |
| 4 | + "context" |
| 5 | + "fmt" |
| 6 | + "strings" |
| 7 | + "time" |
| 8 | + |
| 9 | + g "github.com/onsi/ginkgo/v2" |
| 10 | + o "github.com/onsi/gomega" |
| 11 | + |
| 12 | + v1 "github.com/openshift/api/config/v1" |
| 13 | + exutil "github.com/openshift/origin/test/extended/util" |
| 14 | + |
| 15 | + corev1 "k8s.io/api/core/v1" |
| 16 | + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" |
| 17 | + "k8s.io/apimachinery/pkg/util/wait" |
| 18 | +) |
| 19 | + |
// Human-readable node readiness names used in test progress and
// failure messages; they mirror the kubelet's reported node status.
const (
	statusReady    = "Ready"
	statusNotReady = "NotReady"
	statusUnknown  = "Unknown"
)
| 25 | + |
| 26 | +var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:HighlyAvailableArbiter][Suite:openshift/two-node][Disruptive] One master node outage is handled seamlessly", func() { |
| 27 | + defer g.GinkgoRecover() |
| 28 | + oc := exutil.NewCLIWithoutNamespace("").AsAdmin() |
| 29 | + |
| 30 | + g.BeforeEach(func() { |
| 31 | + skipIfNotTopology(oc, v1.HighlyAvailableArbiterMode) |
| 32 | + }) |
| 33 | + |
| 34 | + g.It("should maintain etcd quorum and workloads with one master node down", func() { |
| 35 | + ctx := context.Background() |
| 36 | + |
| 37 | + g.By("Identifying one master node to simulate failure") |
| 38 | + masterNodes, err := oc.AdminKubeClient().CoreV1().Nodes().List(ctx, metav1.ListOptions{ |
| 39 | + LabelSelector: labelNodeRoleMaster, |
| 40 | + }) |
| 41 | + o.Expect(err).To(o.BeNil()) |
| 42 | + o.Expect(masterNodes.Items).To(o.HaveLen(2)) |
| 43 | + targetNode := masterNodes.Items[0].Name |
| 44 | + |
| 45 | + g.By(fmt.Sprintf("Gracefully rebooting %s to simulate failure", targetNode)) |
| 46 | + shutdownOrRebootNode(oc, targetNode, "openshift-etcd", "shutdown", "-r", "+1") |
| 47 | + |
| 48 | + g.By("Waiting for the node to become NotReady") |
| 49 | + waitForNodeCondition(oc, targetNode, corev1.NodeReady, corev1.ConditionFalse, statusNotReady, 10*time.Minute) |
| 50 | + |
| 51 | + g.By("Validating etcd quorum is met while the node is still NotReady") |
| 52 | + err = wait.PollUntilContextTimeout(ctx, 15*time.Second, 15*time.Minute, true, func(ctx context.Context) (bool, error) { |
| 53 | + operator, err := oc.AdminConfigClient().ConfigV1().ClusterOperators().Get(ctx, "etcd", metav1.GetOptions{}) |
| 54 | + if err != nil { |
| 55 | + return false, nil |
| 56 | + } |
| 57 | + return isClusterOperatorAvailable(operator), nil |
| 58 | + }) |
| 59 | + o.Expect(err).To(o.BeNil(), "Expected etcd operator to remain healthy while one master node is NotReady") |
| 60 | + }) |
| 61 | + g.AfterEach(func() { |
| 62 | + ctx := context.Background() |
| 63 | + g.By("Ensuring all cluster nodes are back to Ready state") |
| 64 | + |
| 65 | + nodeList, err := oc.AdminKubeClient().CoreV1().Nodes().List(ctx, metav1.ListOptions{}) |
| 66 | + o.Expect(err).To(o.BeNil(), "Failed to list cluster nodes") |
| 67 | + |
| 68 | + for _, node := range nodeList.Items { |
| 69 | + waitForNodeCondition(oc, node.Name, corev1.NodeReady, corev1.ConditionTrue, statusReady, 15*time.Minute) |
| 70 | + } |
| 71 | + }) |
| 72 | +}) |
| 73 | + |
| 74 | +var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:HighlyAvailableArbiter][Suite:openshift/two-node][Disruptive] Recovery when arbiter node is down and master nodes restart", func() { |
| 75 | + defer g.GinkgoRecover() |
| 76 | + oc := exutil.NewCLIWithoutNamespace("").AsAdmin() |
| 77 | + var arbiterNodeName string |
| 78 | + g.BeforeEach(func() { |
| 79 | + skipIfNotTopology(oc, v1.HighlyAvailableArbiterMode) |
| 80 | + }) |
| 81 | + g.It("should regain quorum after arbiter down and master nodes restart", func() { |
| 82 | + ctx := context.Background() |
| 83 | + |
| 84 | + g.By("Getting arbiter node") |
| 85 | + arbiterNodes, err := oc.AdminKubeClient().CoreV1().Nodes().List(ctx, metav1.ListOptions{ |
| 86 | + LabelSelector: labelNodeRoleArbiter, |
| 87 | + }) |
| 88 | + o.Expect(err).To(o.BeNil()) |
| 89 | + o.Expect(arbiterNodes.Items).To(o.HaveLen(1)) |
| 90 | + arbiterNode := arbiterNodes.Items[0] |
| 91 | + arbiterNodeName = arbiterNode.Name |
| 92 | + |
| 93 | + g.By("Triggering 15-minute simulated shutdown on arbiter node by stopping kubelet") |
| 94 | + _, err = exutil.DebugNodeRetryWithOptionsAndChroot(oc, arbiterNodeName, "openshift-etcd", |
| 95 | + "bash", "-c", `systemd-run --on-active=10s --unit=delayed-reboot.service bash -c "sleep 5; systemctl stop kubelet; sleep 900; reboot"`) |
| 96 | + o.Expect(err).To(o.BeNil(), "Expected arbiter shutdown simulation to succeed") |
| 97 | + |
| 98 | + g.By("Waiting for arbiter to become status uknown due to kubelet stopped") |
| 99 | + waitForNodeCondition(oc, arbiterNodeName, corev1.NodeReady, corev1.ConditionUnknown, statusUnknown, 5*time.Minute) |
| 100 | + |
| 101 | + g.By("Rebooting both master nodes") |
| 102 | + masterNodes, err := oc.AdminKubeClient().CoreV1().Nodes().List(ctx, metav1.ListOptions{ |
| 103 | + LabelSelector: labelNodeRoleMaster, |
| 104 | + }) |
| 105 | + o.Expect(err).To(o.BeNil()) |
| 106 | + for _, node := range masterNodes.Items { |
| 107 | + shutdownOrRebootNode(oc, node.Name, "openshift-etcd", "shutdown", "-r", "+1") |
| 108 | + } |
| 109 | + |
| 110 | + g.By("Waiting for master nodes to become NotReady") |
| 111 | + for _, node := range masterNodes.Items { |
| 112 | + waitForNodeCondition(oc, node.Name, corev1.NodeReady, corev1.ConditionFalse, statusNotReady, 10*time.Minute) |
| 113 | + } |
| 114 | + |
| 115 | + g.By("Waiting for master nodes to become Ready") |
| 116 | + for _, node := range masterNodes.Items { |
| 117 | + waitForNodeCondition(oc, node.Name, corev1.NodeReady, corev1.ConditionTrue, statusReady, 15*time.Minute) |
| 118 | + } |
| 119 | + |
| 120 | + g.By("Waiting for etcd quorum to be restored") |
| 121 | + err = wait.PollUntilContextTimeout(ctx, 15*time.Second, 15*time.Minute, true, func(ctx context.Context) (bool, error) { |
| 122 | + operator, err := oc.AdminConfigClient().ConfigV1().ClusterOperators().Get(ctx, "etcd", metav1.GetOptions{}) |
| 123 | + if err != nil { |
| 124 | + return false, nil |
| 125 | + } |
| 126 | + return isClusterOperatorAvailable(operator), nil |
| 127 | + }) |
| 128 | + o.Expect(err).To(o.BeNil(), "Expected etcd operator to become available again") |
| 129 | + }) |
| 130 | + g.AfterEach(func() { |
| 131 | + g.By("Ensuring arbiter node becomes ready again") |
| 132 | + waitForNodeCondition(oc, arbiterNodeName, corev1.NodeReady, corev1.ConditionTrue, statusReady, 15*time.Minute) |
| 133 | + }) |
| 134 | +}) |
| 135 | + |
| 136 | +func shutdownOrRebootNode(oc *exutil.CLI, nodeName, component string, args ...string) { |
| 137 | + _, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, nodeName, component, args...) |
| 138 | + action := strings.Join(args, " ") |
| 139 | + o.Expect(err).To(o.BeNil(), fmt.Sprintf("Expected node %s to execute '%s' successfully", nodeName, action)) |
| 140 | +} |
| 141 | + |
| 142 | +func waitForNodeCondition(oc *exutil.CLI, nodeName string, conditionType corev1.NodeConditionType, expectStatus corev1.ConditionStatus, statusName string, timeout time.Duration) { |
| 143 | + ctx, cancel := context.WithTimeout(context.Background(), timeout) |
| 144 | + defer cancel() |
| 145 | + |
| 146 | + err := wait.PollUntilContextTimeout(ctx, 10*time.Second, timeout, true, func(ctx context.Context) (bool, error) { |
| 147 | + node, err := oc.AdminKubeClient().CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) |
| 148 | + if err != nil { |
| 149 | + return false, nil |
| 150 | + } |
| 151 | + for _, cond := range node.Status.Conditions { |
| 152 | + if cond.Type == conditionType && cond.Status == expectStatus { |
| 153 | + return true, nil |
| 154 | + } |
| 155 | + } |
| 156 | + return false, nil |
| 157 | + }) |
| 158 | + o.Expect(err).To(o.BeNil(), fmt.Sprintf("Expected node %s to become %s", nodeName, statusName)) |
| 159 | +} |
0 commit comments