Skip to content

Commit b609df8

Browse files
(chore): cluster shows not ready when node unschedulable
Signed-off-by: abhijith-darshan <abhijith.ravindra@sap.com>

(chore): set ready false when all nodes are not ready
Signed-off-by: abhijith-darshan <abhijith.ravindra@sap.com>

(chore): e2e test node not ready
Signed-off-by: abhijith-darshan <abhijith.ravindra@sap.com>

(chore): set not ready only when all nodes are not ready
Signed-off-by: abhijith-darshan <abhijith.ravindra@sap.com>

(chore): set PayloadSchedulable condition
Signed-off-by: abhijith-darshan <abhijith.ravindra@sap.com>

Automatic generation of CRD API Docs

(chore): fix eventually
Signed-off-by: abhijith-darshan <abhijith.ravindra@sap.com>
1 parent de35b70 commit b609df8

File tree

10 files changed

+139
-27
lines changed

10 files changed

+139
-27
lines changed

api/v1alpha1/cluster_types.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,9 @@ const (
4040
// AllNodesReady reflects the readiness status of all nodes of a cluster.
4141
AllNodesReady greenhousemetav1alpha1.ConditionType = "AllNodesReady"
4242

43+
// PayloadSchedulable reflects whether workloads can be scheduled on the cluster.
44+
PayloadSchedulable greenhousemetav1alpha1.ConditionType = "PayloadSchedulable"
45+
4346
// KubeConfigValid reflects the validity of the kubeconfig of a cluster.
4447
KubeConfigValid greenhousemetav1alpha1.ConditionType = "KubeConfigValid"
4548

@@ -74,9 +77,9 @@ type ClusterConditionType string
7477
// Nodes contains node count metrics and tracks non-ready nodes in a cluster.
7578
type Nodes struct {
7679
// Total represent the number of all the nodes in the cluster.
77-
Total int32 `json:"total,omitempty"`
80+
Total int `json:"total,omitempty"`
7881
// ReadyNodes represent the number of ready nodes in the cluster.
79-
Ready int32 `json:"ready,omitempty"`
82+
Ready int `json:"ready,omitempty"`
8083
// NotReady is slice of non-ready nodes status details.
8184
// +listType="map"
8285
// +listMapKey=name

charts/manager/crds/greenhouse.sap_clusters.yaml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -119,12 +119,10 @@ spec:
119119
ready:
120120
description: ReadyNodes represent the number of ready nodes in
121121
the cluster.
122-
format: int32
123122
type: integer
124123
total:
125124
description: Total represent the number of all the nodes in the
126125
cluster.
127-
format: int32
128126
type: integer
129127
type: object
130128
statusConditions:

cmd/greenhouse/controllers.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,9 +42,9 @@ var knownControllers = map[string]func(controllerName string, mgr ctrl.Manager)
4242
"clusterPluginDefinition": (&plugindefinitioncontroller.ClusterPluginDefinitionReconciler{}).SetupWithManager,
4343

4444
// Cluster controllers
45-
"bootStrap": (&clustercontrollers.BootstrapReconciler{}).SetupWithManager,
46-
"clusterReconciler": startClusterReconciler,
47-
"kubeconfig": (&clustercontrollers.KubeconfigReconciler{}).SetupWithManager,
45+
"bootstrap": (&clustercontrollers.BootstrapReconciler{}).SetupWithManager,
46+
"cluster": startClusterReconciler,
47+
"kubeconfig": (&clustercontrollers.KubeconfigReconciler{}).SetupWithManager,
4848
}
4949

5050
// knownControllers lists the name of known controllers.

docs/reference/api/index.html

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1879,7 +1879,7 @@ <h3 id="greenhouse.sap/v1alpha1.Nodes">Nodes
18791879
<td>
18801880
<code>total</code><br>
18811881
<em>
1882-
int32
1882+
int
18831883
</em>
18841884
</td>
18851885
<td>
@@ -1890,7 +1890,7 @@ <h3 id="greenhouse.sap/v1alpha1.Nodes">Nodes
18901890
<td>
18911891
<code>ready</code><br>
18921892
<em>
1893-
int32
1893+
int
18941894
</em>
18951895
</td>
18961896
<td>

docs/reference/api/openapi.yaml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -592,11 +592,9 @@ components:
592592
x-kubernetes-list-type: map
593593
ready:
594594
description: ReadyNodes represent the number of ready nodes in the cluster.
595-
format: int32
596595
type: integer
597596
total:
598597
description: Total represent the number of all the nodes in the cluster.
599-
format: int32
600598
type: integer
601599
type: object
602600
statusConditions:

e2e/cluster/e2e_test.go

Lines changed: 51 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ import (
3333
const (
3434
remoteClusterHName = "remote-int-h-cluster"
3535
remoteClusterFName = "remote-int-f-cluster"
36+
remoteClusterNodeName = "remote-int-n-cluster"
3637
remoteOIDCClusterHName = "remote-int-oidc-h-cluster"
3738
remoteOIDCClusterCName = "remote-int-oidc-c-cluster"
3839
remoteOIDCClusterFName = "remote-int-oidc-f-cluster"
@@ -69,13 +70,14 @@ var _ = BeforeSuite(func() {
6970
env = env.WithOrganization(ctx, adminClient, "./testdata/organization.yaml")
7071
team = test.NewTeam(ctx, "test-cluster-e2e-team", env.TestNamespace, test.WithTeamLabel(greenhouseapis.LabelKeySupportGroup, "true"), test.WithMappedIDPGroup("SOME_IDP_GROUP_NAME"))
7172
err = adminClient.Create(ctx, team)
72-
Expect(err).ToNot(HaveOccurred(), "there should be no error creating a Team")
73+
Expect(client.IgnoreAlreadyExists(err)).ToNot(HaveOccurred(), "there should be no error creating a Team")
7374
testStartTime = time.Now().UTC()
7475
})
7576

7677
var _ = AfterSuite(func() {
7778
shared.OffBoardRemoteCluster(ctx, adminClient, remoteClient, testStartTime, remoteClusterHName, env.TestNamespace)
7879
shared.OffBoardRemoteCluster(ctx, adminClient, remoteClient, testStartTime, remoteClusterFName, env.TestNamespace)
80+
shared.OffBoardRemoteCluster(ctx, adminClient, remoteClient, testStartTime, remoteClusterNodeName, env.TestNamespace)
7981
shared.OffBoardRemoteCluster(ctx, adminClient, remoteClient, testStartTime, remoteOIDCClusterHName, env.TestNamespace)
8082
shared.OffBoardRemoteCluster(ctx, adminClient, remoteClient, testStartTime, remoteOIDCClusterFName, env.TestNamespace)
8183
shared.OffBoardRemoteCluster(ctx, adminClient, remoteClient, testStartTime, remoteOIDCClusterCName, env.TestNamespace)
@@ -251,4 +253,52 @@ var _ = Describe("Cluster E2E", Ordered, func() {
251253
shared.ClusterIsReady(ctx, adminClient, remoteOIDCClusterCName, env.TestNamespace)
252254
})
253255
})
256+
257+
Context("Cluster Node Not Ready / Ready 😵 🤖", Ordered, func() {
258+
It("should onboard remote cluster", func() {
259+
By("onboarding remote cluster")
260+
shared.OnboardRemoteCluster(ctx, adminClient, env.RemoteKubeConfigBytes, remoteClusterNodeName, env.TestNamespace, team.Name)
261+
})
262+
It("should have a cluster resource created", func() {
263+
By("verifying if the cluster resource is created")
264+
Eventually(func(g Gomega) bool {
265+
err := adminClient.Get(ctx, client.ObjectKey{Name: remoteClusterNodeName, Namespace: env.TestNamespace}, &greenhousev1alpha1.Cluster{})
266+
g.Expect(err).ToNot(HaveOccurred())
267+
return true
268+
}).Should(BeTrue(), "cluster resource should be created")
269+
270+
By("verifying the cluster status is ready")
271+
shared.ClusterIsReady(ctx, adminClient, remoteClusterNodeName, env.TestNamespace)
272+
})
273+
It("should reach not ready state when all nodes are not ready", func() {
274+
By("cordoning all nodes in the remote cluster")
275+
expect.CordonRemoteNodes(ctx, remoteClient)
276+
277+
By("verifying the cluster payload is not schedulable")
278+
Eventually(func(g Gomega) {
279+
cluster := &greenhousev1alpha1.Cluster{}
280+
err := adminClient.Get(ctx, client.ObjectKey{Name: remoteClusterNodeName, Namespace: env.TestNamespace}, cluster)
281+
g.Expect(err).ToNot(HaveOccurred(), "there should be no error getting the cluster resource")
282+
conditions := cluster.GetConditions()
283+
payloadCondition := conditions.GetConditionByType(greenhousev1alpha1.PayloadSchedulable)
284+
g.Expect(payloadCondition).ToNot(BeNil(), "cluster should have PayloadSchedulable condition")
285+
g.Expect(payloadCondition.IsTrue()).To(BeFalse(), "cluster should not be able to schedule payloads")
286+
}).Should(Succeed(), "cluster should not be able to schedule payloads when all nodes are cordoned")
287+
})
288+
It("should reach ready state when nodes are ready again", func() {
289+
By("un-cordoning all nodes in the remote cluster")
290+
expect.UnCordonRemoteNodes(ctx, remoteClient)
291+
292+
By("verifying the cluster status is ready again")
293+
Eventually(func(g Gomega) {
294+
cluster := &greenhousev1alpha1.Cluster{}
295+
err := adminClient.Get(ctx, client.ObjectKey{Name: remoteClusterNodeName, Namespace: env.TestNamespace}, cluster)
296+
g.Expect(err).ToNot(HaveOccurred(), "there should be no error getting the cluster resource")
297+
conditions := cluster.GetConditions()
298+
payloadCondition := conditions.GetConditionByType(greenhousev1alpha1.PayloadSchedulable)
299+
g.Expect(payloadCondition).ToNot(BeNil(), "cluster should have PayloadSchedulable condition")
300+
g.Expect(payloadCondition.IsTrue()).To(BeTrue(), "cluster be able to schedule payloads")
301+
}).Should(Succeed(), "cluster should be able to schedule payloads when nodes are uncordoned")
302+
})
303+
})
254304
})

e2e/cluster/expect/expect.go

Lines changed: 39 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ package expect
66
import (
77
"context"
88
"fmt"
9-
"strconv"
109
"time"
1110

1211
rbacv1 "k8s.io/api/rbac/v1"
@@ -123,12 +122,12 @@ func RevokingRemoteClusterAccess(ctx context.Context, adminClient, remoteClient
123122
func ReconcileReadyNotReady(ctx context.Context, adminClient client.Client, clusterName, namespace string, readyStatus bool) {
124123
err := shared.WaitUntilResourceReadyOrNotReady(ctx, adminClient, &greenhousev1alpha1.Cluster{}, clusterName, namespace, func(resource lifecycle.RuntimeObject) error {
125124
By("triggering a reconcile of the cluster resource")
126-
resourceLabels := resource.GetLabels()
125+
resourceLabels := resource.GetAnnotations()
127126
if resourceLabels == nil {
128127
resourceLabels = make(map[string]string)
129128
}
130-
resourceLabels["greenhouse.sap/last-apply"] = strconv.FormatInt(time.Now().Unix(), 10)
131-
resource.SetLabels(resourceLabels)
129+
resourceLabels[lifecycle.ReconcileAnnotation] = metav1.Now().Format(time.DateTime)
130+
resource.SetAnnotations(resourceLabels)
132131
return adminClient.Update(ctx, resource)
133132
}, readyStatus)
134133
Expect(err).NotTo(HaveOccurred(), "cluster should be in desired status")
@@ -139,3 +138,39 @@ func GetRestConfig(restClientGetter *clientutil.RestClientGetter) *rest.Config {
139138
Expect(err).NotTo(HaveOccurred(), "there should be no error creating the remote REST config")
140139
return restConfig
141140
}
141+
142+
func CordonRemoteNodes(ctx context.Context, remoteClient client.Client) {
143+
nodes := &corev1.NodeList{}
144+
err := remoteClient.List(ctx, nodes)
145+
Expect(err).NotTo(HaveOccurred(), "there should be no error while listing nodes")
146+
for _, node := range nodes.Items {
147+
labels := node.GetLabels()
148+
_, ok := labels["node-role.kubernetes.io/control-plane"]
149+
if ok {
150+
By("skipping cordoning control plane node: " + node.Name)
151+
continue
152+
}
153+
if node.Spec.Unschedulable {
154+
continue
155+
}
156+
base := node.DeepCopy()
157+
node.Spec.Unschedulable = true
158+
err = remoteClient.Patch(ctx, &node, client.MergeFrom(base))
159+
Expect(err).NotTo(HaveOccurred(), "there should be no error while patching node")
160+
}
161+
}
162+
163+
func UnCordonRemoteNodes(ctx context.Context, remoteClient client.Client) {
164+
nodes := &corev1.NodeList{}
165+
err := remoteClient.List(ctx, nodes)
166+
Expect(err).NotTo(HaveOccurred(), "there should be no error while listing nodes")
167+
for _, node := range nodes.Items {
168+
if !node.Spec.Unschedulable {
169+
continue
170+
}
171+
base := node.DeepCopy()
172+
node.Spec.Unschedulable = false
173+
err = remoteClient.Patch(ctx, &node, client.MergeFrom(base))
174+
Expect(err).NotTo(HaveOccurred(), "there should be no error while patching node")
175+
}
176+
}

e2e/shared/cluster.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,9 @@ func OffBoardRemoteCluster(ctx context.Context, adminClient, remoteClient client
8282
Eventually(func(g Gomega) {
8383
crb := &rbacv1.ClusterRoleBinding{}
8484
err := remoteClient.Get(ctx, client.ObjectKey{Name: ManagedResourceName}, crb)
85-
GinkgoWriter.Printf("crb err: %v\n", err)
85+
if err != nil {
86+
GinkgoWriter.Printf("crb err: %v\n", err)
87+
}
8688
g.Expect(apierrors.IsNotFound(err)).To(BeTrue(), "cluster role binding should be deleted")
8789
managedSA := &corev1.ServiceAccount{}
8890
err = remoteClient.Get(ctx, client.ObjectKey{Name: ManagedResourceName, Namespace: namespace}, managedSA)

internal/controller/cluster/cluster_status.go

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,10 @@ import (
3030
"github.com/cloudoperators/greenhouse/pkg/lifecycle"
3131
)
3232

33-
const clusterK8sVersionUnknown = "unknown"
33+
const (
	// clusterK8sVersionUnknown is reported when the remote cluster's
	// Kubernetes version cannot be determined.
	clusterK8sVersionUnknown = "unknown"
	// controlPlaneNodeLabel is the well-known label key identifying control
	// plane nodes; nodes carrying it are skipped during node-readiness
	// accounting (see checkIfControlPlaneNode).
	controlPlaneNodeLabel = "node-role.kubernetes.io/control-plane"
)
3437

3538
func (r *RemoteClusterReconciler) setConditions() lifecycle.Conditioner {
3639
return func(ctx context.Context, resource lifecycle.RuntimeObject) {
@@ -51,17 +54,22 @@ func (r *RemoteClusterReconciler) setConditions() lifecycle.Conditioner {
5154
kubeConfigValidCondition, k8sVersion = r.reconcileKubeConfigValid(restClientGetter)
5255
}
5356

54-
var allNodesReadyCondition, clusterAccessibleCondition, resourcesDeployedCondition greenhousemetav1alpha1.Condition
57+
var allNodesReadyCondition, clusterAccessibleCondition, resourcesDeployedCondition, payloadSchedulable greenhousemetav1alpha1.Condition
58+
payloadSchedulable = greenhousemetav1alpha1.TrueCondition(greenhousev1alpha1.PayloadSchedulable, "", "")
5559
var nodes *greenhousev1alpha1.Nodes
5660
// Can only reconcile detailed status if kubeconfig is valid.
5761
if kubeConfigValidCondition.IsFalse() {
62+
payloadSchedulable = greenhousemetav1alpha1.FalseCondition(greenhousev1alpha1.PayloadSchedulable, "", "kubeconfig not valid - payloads cannot be scheduled")
5863
allNodesReadyCondition = greenhousemetav1alpha1.UnknownCondition(greenhousev1alpha1.AllNodesReady, "", "kubeconfig not valid - cannot know node status")
5964
clusterAccessibleCondition = greenhousemetav1alpha1.UnknownCondition(greenhousev1alpha1.PermissionsVerified, "", "kubeconfig not valid - cannot validate cluster access")
6065
resourcesDeployedCondition = greenhousemetav1alpha1.UnknownCondition(greenhousev1alpha1.ManagedResourcesDeployed, "", "kubeconfig not valid - cannot validate managed resources")
6166
} else {
6267
allNodesReadyCondition, nodes = r.reconcileNodeStatus(ctx, restClientGetter)
6368
clusterAccessibleCondition = r.reconcilePermissions(ctx, restClientGetter)
6469
resourcesDeployedCondition = r.reconcileBootstrapResources(ctx, restClientGetter, clusterSecret)
70+
if nodes != nil && len(nodes.NotReady) == nodes.Total {
71+
payloadSchedulable = greenhousemetav1alpha1.FalseCondition(greenhousev1alpha1.PayloadSchedulable, "", "all nodes are not ready")
72+
}
6573
}
6674

6775
readyCondition := r.reconcileReadyStatus(kubeConfigValidCondition, resourcesDeployedCondition)
@@ -76,6 +84,7 @@ func (r *RemoteClusterReconciler) setConditions() lifecycle.Conditioner {
7684
ownerLabelCondition,
7785
clusterAccessibleCondition,
7886
resourcesDeployedCondition,
87+
payloadSchedulable,
7988
}
8089
deletionCondition := r.checkDeletionSchedule(logger, cluster)
8190
if !deletionCondition.IsUnknown() {
@@ -223,11 +232,14 @@ func (r *RemoteClusterReconciler) reconcileNodeStatus(
223232
return greenhousemetav1alpha1.FalseCondition(greenhousev1alpha1.AllNodesReady, "", err.Error()), nil
224233
}
225234

226-
notReadyNodes := []greenhousev1alpha1.NodeStatus{}
227-
var totalNodes, readyNodes int32
235+
var notReadyNodes []greenhousev1alpha1.NodeStatus
236+
var totalNodes, readyNodes int
228237
allNodesReadyCondition = greenhousemetav1alpha1.TrueCondition(greenhousev1alpha1.AllNodesReady, "", "")
229238

230239
for _, node := range nodeList.Items {
240+
if checkIfControlPlaneNode(node.Labels) {
241+
continue
242+
}
231243
totalNodes++
232244

233245
nodeStatus := getNodeStatusIfNodeNotReady(node)
@@ -251,6 +263,9 @@ func (r *RemoteClusterReconciler) reconcileNodeStatus(
251263

252264
// getNodeStatusIfNodeNotReady returns a NodeStatus when the NodeReady condition is explicitly False; otherwise nil.
253265
func getNodeStatusIfNodeNotReady(node corev1.Node) *greenhousev1alpha1.NodeStatus {
266+
if node.Spec.Unschedulable {
267+
return &greenhousev1alpha1.NodeStatus{Name: node.Name, Message: "Node is unschedulable", LastTransitionTime: metav1.Now()}
268+
}
254269
var readyCondition *corev1.NodeCondition
255270
for i := range node.Status.Conditions {
256271
if node.Status.Conditions[i].Type == corev1.NodeReady {
@@ -265,3 +280,14 @@ func getNodeStatusIfNodeNotReady(node corev1.Node) *greenhousev1alpha1.NodeStatu
265280

266281
return &greenhousev1alpha1.NodeStatus{Name: node.Name, Message: readyCondition.Message, LastTransitionTime: readyCondition.LastTransitionTime}
267282
}
283+
284+
func checkIfControlPlaneNode(labels map[string]string) (control bool) {
285+
control = false
286+
for key := range labels {
287+
if key == controlPlaneNodeLabel {
288+
control = true
289+
break
290+
}
291+
}
292+
return
293+
}

internal/controller/cluster/status_test.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -155,9 +155,9 @@ var _ = Describe("Cluster status", Ordered, func() {
155155
g.Expect(validCluster.Status.GetConditionByType(greenhousev1alpha1.AllNodesReady).Status).To(Equal(metav1.ConditionFalse))
156156
g.Expect(validCluster.Status.GetConditionByType(greenhousev1alpha1.AllNodesReady).Message).To(ContainSubstring("test-node not ready, test-node-3 not ready"))
157157
// Validate the counts of total and ready nodes.
158-
g.Expect(validCluster.Status.Nodes).ToNot((BeNil()))
159-
g.Expect(validCluster.Status.Nodes.Total).To(Equal(int32(3)))
160-
g.Expect(validCluster.Status.Nodes.Ready).To(Equal(int32(1)))
158+
g.Expect(validCluster.Status.Nodes).ToNot(BeNil())
159+
g.Expect(validCluster.Status.Nodes.Total).To(Equal(3))
160+
g.Expect(validCluster.Status.Nodes.Ready).To(Equal(1))
161161
g.Expect(validCluster.Status.Nodes.NotReady).To(HaveLen(2))
162162
}).Should(Succeed())
163163

@@ -220,9 +220,9 @@ var _ = Describe("Cluster status", Ordered, func() {
220220
g.Expect(validCluster.Status.GetConditionByType(greenhousev1alpha1.AllNodesReady).Status).To(Equal(metav1.ConditionTrue), "The AllNodesReady condition should be true")
221221
g.Expect(validCluster.Status.GetConditionByType(greenhousev1alpha1.AllNodesReady).Message).To(BeEmpty())
222222
// Validate the counts of total and ready nodes.
223-
g.Expect(validCluster.Status.Nodes).ToNot((BeNil()))
224-
g.Expect(validCluster.Status.Nodes.Total).To(Equal(int32(3)))
225-
g.Expect(validCluster.Status.Nodes.Ready).To(Equal(int32(3)))
223+
g.Expect(validCluster.Status.Nodes).ToNot(BeNil())
224+
g.Expect(validCluster.Status.Nodes.Total).To(Equal(3))
225+
g.Expect(validCluster.Status.Nodes.Ready).To(Equal(3))
226226
g.Expect(validCluster.Status.Nodes.NotReady).To(BeEmpty())
227227
}).Should(Succeed())
228228

0 commit comments

Comments
 (0)