Skip to content

Commit c03284a

Browse files
(chore): cluster shows not ready when node unschedulable
Signed-off-by: abhijith-darshan <abhijith.ravindra@sap.com>

(chore): set ready false when all nodes are not ready

Signed-off-by: abhijith-darshan <abhijith.ravindra@sap.com>

(chore): e2e test node not ready

Signed-off-by: abhijith-darshan <abhijith.ravindra@sap.com>

(chore): set not ready only when all nodes are not ready

Signed-off-by: abhijith-darshan <abhijith.ravindra@sap.com>
1 parent 511db1e commit c03284a

File tree

4 files changed

+102
-11
lines changed

4 files changed

+102
-11
lines changed

cmd/greenhouse/controllers.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,9 +42,9 @@ var knownControllers = map[string]func(controllerName string, mgr ctrl.Manager)
4242
"clusterPluginDefinition": (&plugindefinitioncontroller.ClusterPluginDefinitionReconciler{}).SetupWithManager,
4343

4444
// Cluster controllers
45-
"bootStrap": (&clustercontrollers.BootstrapReconciler{}).SetupWithManager,
46-
"clusterReconciler": startClusterReconciler,
47-
"kubeconfig": (&clustercontrollers.KubeconfigReconciler{}).SetupWithManager,
45+
"bootstrap": (&clustercontrollers.BootstrapReconciler{}).SetupWithManager,
46+
"cluster": startClusterReconciler,
47+
"kubeconfig": (&clustercontrollers.KubeconfigReconciler{}).SetupWithManager,
4848
}
4949

5050
// knownControllers lists the name of known controllers.

e2e/cluster/e2e_test.go

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -251,4 +251,36 @@ var _ = Describe("Cluster E2E", Ordered, func() {
251251
shared.ClusterIsReady(ctx, adminClient, remoteOIDCClusterCName, env.TestNamespace)
252252
})
253253
})
254+
255+
// Exercises the cluster Ready condition round-trip: onboard a remote
// cluster, cordon every worker node so the controller reports NotReady,
// then un-cordon and expect Ready again.
Context("Cluster Node Not Ready / Ready 😵 🤖", Ordered, func() {
	It("should onboard remote cluster", func() {
		By("onboarding remote cluster")
		shared.OnboardRemoteCluster(ctx, adminClient, env.RemoteKubeConfigBytes, remoteClusterFName, env.TestNamespace, team.Name)
	})
	It("should have a cluster resource created", func() {
		By("verifying if the cluster resource is created")
		// Poll until the Cluster object exists; the inner Gomega assertion
		// retries the Get until it stops erroring.
		Eventually(func(g Gomega) bool {
			err := adminClient.Get(ctx, client.ObjectKey{Name: remoteClusterFName, Namespace: env.TestNamespace}, &greenhousev1alpha1.Cluster{})
			g.Expect(err).ToNot(HaveOccurred())
			return true
		}).Should(BeTrue(), "cluster resource should be created")

		By("verifying the cluster status is ready")
		shared.ClusterIsReady(ctx, adminClient, remoteClusterFName, env.TestNamespace)
	})
	It("should reach not ready state when all nodes are not ready", func() {
		By("cordoning all nodes in the remote cluster")
		expect.CordonRemoteNodes(ctx, remoteClient)

		By("verifying the cluster status is not ready")
		expect.ReconcileReadyNotReady(ctx, adminClient, remoteClusterFName, env.TestNamespace, false)
	})
	It("should reach ready state when nodes are ready again", func() {
		By("un-cordoning all nodes in the remote cluster")
		expect.UnCordonRemoteNodes(ctx, remoteClient)

		By("verifying the cluster status is ready again")
		expect.ReconcileReadyNotReady(ctx, adminClient, remoteClusterFName, env.TestNamespace, true)
	})
})
254286
})

e2e/cluster/expect/expect.go

Lines changed: 39 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ package expect
66
import (
77
"context"
88
"fmt"
9-
"strconv"
109
"time"
1110

1211
rbacv1 "k8s.io/api/rbac/v1"
@@ -123,12 +122,12 @@ func RevokingRemoteClusterAccess(ctx context.Context, adminClient, remoteClient
123122
func ReconcileReadyNotReady(ctx context.Context, adminClient client.Client, clusterName, namespace string, readyStatus bool) {
124123
err := shared.WaitUntilResourceReadyOrNotReady(ctx, adminClient, &greenhousev1alpha1.Cluster{}, clusterName, namespace, func(resource lifecycle.RuntimeObject) error {
125124
By("triggering a reconcile of the cluster resource")
126-
resourceLabels := resource.GetLabels()
125+
resourceLabels := resource.GetAnnotations()
127126
if resourceLabels == nil {
128127
resourceLabels = make(map[string]string)
129128
}
130-
resourceLabels["greenhouse.sap/last-apply"] = strconv.FormatInt(time.Now().Unix(), 10)
131-
resource.SetLabels(resourceLabels)
129+
resourceLabels[lifecycle.ReconcileAnnotation] = metav1.Now().Format(time.DateTime)
130+
resource.SetAnnotations(resourceLabels)
132131
return adminClient.Update(ctx, resource)
133132
}, readyStatus)
134133
Expect(err).NotTo(HaveOccurred(), "cluster should be in desired status")
@@ -139,3 +138,39 @@ func GetRestConfig(restClientGetter *clientutil.RestClientGetter) *rest.Config {
139138
Expect(err).NotTo(HaveOccurred(), "there should be no error creating the remote REST config")
140139
return restConfig
141140
}
141+
142+
func CordonRemoteNodes(ctx context.Context, remoteClient client.Client) {
143+
nodes := &corev1.NodeList{}
144+
err := remoteClient.List(ctx, nodes)
145+
Expect(err).NotTo(HaveOccurred(), "there should be no error while listing nodes")
146+
for _, node := range nodes.Items {
147+
labels := node.GetLabels()
148+
_, ok := labels["node-role.kubernetes.io/control-plane"]
149+
if ok {
150+
By("skipping cordoning control plane node: " + node.Name)
151+
continue
152+
}
153+
if node.Spec.Unschedulable {
154+
continue
155+
}
156+
base := node.DeepCopy()
157+
node.Spec.Unschedulable = true
158+
err = remoteClient.Patch(ctx, &node, client.MergeFrom(base))
159+
Expect(err).NotTo(HaveOccurred(), "there should be no error while patching node")
160+
}
161+
}
162+
163+
func UnCordonRemoteNodes(ctx context.Context, remoteClient client.Client) {
164+
nodes := &corev1.NodeList{}
165+
err := remoteClient.List(ctx, nodes)
166+
Expect(err).NotTo(HaveOccurred(), "there should be no error while listing nodes")
167+
for _, node := range nodes.Items {
168+
if !node.Spec.Unschedulable {
169+
continue
170+
}
171+
base := node.DeepCopy()
172+
node.Spec.Unschedulable = false
173+
err = remoteClient.Patch(ctx, &node, client.MergeFrom(base))
174+
Expect(err).NotTo(HaveOccurred(), "there should be no error while patching node")
175+
}
176+
}

internal/controller/cluster/cluster_status.go

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,10 @@ import (
3030
"github.com/cloudoperators/greenhouse/pkg/lifecycle"
3131
)
3232

33-
const clusterK8sVersionUnknown = "unknown"
33+
const (
34+
clusterK8sVersionUnknown = "unknown"
35+
controlPlaneNodeLabel = "node-role.kubernetes.io/control-plane"
36+
)
3437

3538
func (r *RemoteClusterReconciler) setConditions() lifecycle.Conditioner {
3639
return func(ctx context.Context, resource lifecycle.RuntimeObject) {
@@ -51,7 +54,7 @@ func (r *RemoteClusterReconciler) setConditions() lifecycle.Conditioner {
5154
kubeConfigValidCondition, k8sVersion = r.reconcileKubeConfigValid(restClientGetter)
5255
}
5356

54-
var allNodesReadyCondition, clusterAccessibleCondition, resourcesDeployedCondition greenhousemetav1alpha1.Condition
57+
var allNodesReadyCondition, clusterAccessibleCondition, resourcesDeployedCondition, readyCondition greenhousemetav1alpha1.Condition
5558
var nodes *greenhousev1alpha1.Nodes
5659
// Can only reconcile detailed status if kubeconfig is valid.
5760
if kubeConfigValidCondition.IsFalse() {
@@ -64,7 +67,11 @@ func (r *RemoteClusterReconciler) setConditions() lifecycle.Conditioner {
6467
resourcesDeployedCondition = r.reconcileBootstrapResources(ctx, restClientGetter, clusterSecret)
6568
}
6669

67-
readyCondition := r.reconcileReadyStatus(kubeConfigValidCondition, resourcesDeployedCondition)
70+
if nodes != nil && int32(len(nodes.NotReady)) == nodes.Total {
71+
readyCondition = greenhousemetav1alpha1.FalseCondition(greenhousemetav1alpha1.ReadyCondition, "", "all nodes are not ready")
72+
} else {
73+
readyCondition = r.reconcileReadyStatus(kubeConfigValidCondition, resourcesDeployedCondition)
74+
}
6875

6976
ownerLabelCondition := util.ComputeOwnerLabelCondition(ctx, r.Client, cluster)
7077
util.UpdateOwnedByLabelMissingMetric(cluster, ownerLabelCondition.IsFalse())
@@ -223,11 +230,14 @@ func (r *RemoteClusterReconciler) reconcileNodeStatus(
223230
return greenhousemetav1alpha1.FalseCondition(greenhousev1alpha1.AllNodesReady, "", err.Error()), nil
224231
}
225232

226-
notReadyNodes := []greenhousev1alpha1.NodeStatus{}
233+
var notReadyNodes []greenhousev1alpha1.NodeStatus
227234
var totalNodes, readyNodes int32
228235
allNodesReadyCondition = greenhousemetav1alpha1.TrueCondition(greenhousev1alpha1.AllNodesReady, "", "")
229236

230237
for _, node := range nodeList.Items {
238+
if checkIfControlPlaneNode(node.Labels) {
239+
continue
240+
}
231241
totalNodes++
232242

233243
nodeStatus := getNodeStatusIfNodeNotReady(node)
@@ -251,6 +261,9 @@ func (r *RemoteClusterReconciler) reconcileNodeStatus(
251261

252262
// getNodeStatusIfNodeNotReady returns a NodeStatus when the NodeReady condition is explicitly False; otherwise nil.
253263
func getNodeStatusIfNodeNotReady(node corev1.Node) *greenhousev1alpha1.NodeStatus {
264+
if node.Spec.Unschedulable {
265+
return &greenhousev1alpha1.NodeStatus{Name: node.Name, Message: "Node is unschedulable", LastTransitionTime: metav1.Now()}
266+
}
254267
var readyCondition *corev1.NodeCondition
255268
for i := range node.Status.Conditions {
256269
if node.Status.Conditions[i].Type == corev1.NodeReady {
@@ -265,3 +278,14 @@ func getNodeStatusIfNodeNotReady(node corev1.Node) *greenhousev1alpha1.NodeStatu
265278

266279
return &greenhousev1alpha1.NodeStatus{Name: node.Name, Message: readyCondition.Message, LastTransitionTime: readyCondition.LastTransitionTime}
267280
}
281+
282+
func checkIfControlPlaneNode(labels map[string]string) (control bool) {
283+
control = false
284+
for key, _ := range labels {
285+
if key == controlPlaneNodeLabel {
286+
control = true
287+
break
288+
}
289+
}
290+
return
291+
}

0 commit comments

Comments
 (0)