Skip to content

Commit f422a1a

Browse files
ServerMaintenance managed-by label
1 parent ba12c39 commit f422a1a

File tree

4 files changed

+96
-54
lines changed

4 files changed

+96
-54
lines changed

Makefile

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -80,8 +80,7 @@ export-kubeconfig:
8080

8181
.PHONY: tilt-up
8282
tilt-up: kind-setup export-kubeconfig
83-
KUBECONFIG=./config/kind/mgmt-kubeconfig:./config/kind/worker-kubeconfig
84-
tilt up
83+
KUBECONFIG=./config/kind/mgmt-kubeconfig:./config/kind/worker-kubeconfig tilt up
8584

8685
##@ Build
8786

Tiltfile

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,13 @@ def mgmt_kubectl(args):
1212
def worker_kubectl(args):
1313
return local('kubectl --kubeconfig=' + worker_kubeconfig + ' --context=' + worker_ctx + ' ' + args)
1414

15-
mgmt_kubectl('apply -f https://raw.githubusercontent.com/ironcore-dev/metal-operator/main/config/crd/bases/metal.ironcore.dev_serverclaims.yaml')
16-
mgmt_kubectl('apply -f https://raw.githubusercontent.com/ironcore-dev/metal-operator/main/config/crd/bases/metal.ironcore.dev_servermaintenances.yaml')
17-
mgmt_kubectl('apply -f https://raw.githubusercontent.com/ironcore-dev/metal-operator/main/config/crd/bases/metal.ironcore.dev_servers.yaml')
15+
METAL_OPERATOR_REF = "v0.3.0"
16+
mgmt_kubectl('apply -f https://raw.githubusercontent.com/ironcore-dev/metal-operator/' + METAL_OPERATOR_REF + '/config/crd/bases/metal.ironcore.dev_serverclaims.yaml')
17+
mgmt_kubectl('apply -f https://raw.githubusercontent.com/ironcore-dev/metal-operator/' + METAL_OPERATOR_REF + '/config/crd/bases/metal.ironcore.dev_servermaintenances.yaml')
18+
mgmt_kubectl('apply -f https://raw.githubusercontent.com/ironcore-dev/metal-operator/' + METAL_OPERATOR_REF + '/config/crd/bases/metal.ironcore.dev_servers.yaml')
19+
mgmt_kubectl('wait --for=condition=Established --timeout=60s crd/serverclaims.metal.ironcore.dev')
20+
mgmt_kubectl('wait --for=condition=Established --timeout=60s crd/servermaintenances.metal.ironcore.dev')
21+
mgmt_kubectl('wait --for=condition=Established --timeout=60s crd/servers.metal.ironcore.dev')
1822
mgmt_kubectl('apply -f config/kind/crs/server.yaml')
1923
mgmt_kubectl('apply -f config/kind/crs/serverclaim.yaml')
2024

pkg/cloudprovider/metal/node_controller.go

Lines changed: 49 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,14 @@ import (
1010
"net"
1111
"strings"
1212

13-
"github.com/google/uuid"
1413
metalv1alpha1 "github.com/ironcore-dev/metal-operator/api/v1alpha1"
1514
corev1 "k8s.io/api/core/v1"
1615
apierrors "k8s.io/apimachinery/pkg/api/errors"
1716
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1817
"k8s.io/apimachinery/pkg/types"
1918
"k8s.io/client-go/tools/cache"
2019
"k8s.io/client-go/util/workqueue"
20+
"k8s.io/klog/v2"
2121
ctrl "sigs.k8s.io/controller-runtime"
2222
ctrlcache "sigs.k8s.io/controller-runtime/pkg/cache"
2323
"sigs.k8s.io/controller-runtime/pkg/client"
@@ -54,21 +54,19 @@ func NewNodeReconciler(targetClient client.Client, metalClient client.Client, no
5454
func (r *NodeReconciler) Start(ctx context.Context) error {
5555
defer r.queue.ShutDown()
5656

57-
l := ctrl.LoggerFrom(ctx).WithName("node-controller")
58-
5957
_, err := r.informer.AddEventHandler(cache.ResourceEventHandlerFuncs{
6058
AddFunc: func(obj any) {
6159
node, ok := obj.(*corev1.Node)
6260
if !ok {
63-
l.Error(nil, "unexpected object type in AddFunc", "type", fmt.Sprintf("%T", obj))
61+
klog.Errorf("unexpected object type: %T", obj)
6462
return
6563
}
6664
r.queue.Add(client.ObjectKeyFromObject(node))
6765
},
6866
UpdateFunc: func(oldObj, newObj any) {
6967
node, ok := newObj.(*corev1.Node)
7068
if !ok {
71-
l.Error(nil, "unexpected object type in UpdateFunc", "type", fmt.Sprintf("%T", newObj))
69+
klog.Errorf("unexpected object type: %T", newObj)
7270
return
7371
}
7472
r.queue.Add(client.ObjectKeyFromObject(node))
@@ -79,7 +77,7 @@ func (r *NodeReconciler) Start(ctx context.Context) error {
7977
}
8078
node, ok := obj.(*corev1.Node)
8179
if !ok {
82-
l.Error(nil, "unexpected object type in DeleteFunc", "type", fmt.Sprintf("%T", obj))
80+
klog.Errorf("unexpected object type: %T", obj)
8381
return
8482
}
8583
key := client.ObjectKeyFromObject(node)
@@ -99,11 +97,8 @@ func (r *NodeReconciler) Start(ctx context.Context) error {
9997
func() {
10098
defer r.queue.Done(key)
10199

102-
reqL := l.WithValues("namespace", key.Namespace, "node-name", key.Name, "reconcile-id", uuid.NewString())
103-
reqCtx := ctrl.LoggerInto(ctx, reqL)
104-
105-
if err = r.Reconcile(reqCtx, ctrl.Request{NamespacedName: key}); err != nil {
106-
reqL.Error(err, "Failed to reconcile Node")
100+
if err = r.Reconcile(ctx, ctrl.Request{NamespacedName: key}); err != nil {
101+
klog.Errorf("Failed to reconcile Node %s: %v", key, err)
107102
r.queue.AddRateLimited(key)
108103
return
109104
}
@@ -113,45 +108,32 @@ func (r *NodeReconciler) Start(ctx context.Context) error {
113108
}
114109
}()
115110
<-ctx.Done()
116-
l.Info("Stopping Node reconciler")
111+
klog.Info("Stopping Node reconciler")
117112
return nil
118113
}
119114

120115
func (r *NodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) error {
121-
l := ctrl.LoggerFrom(ctx)
122-
l.V(2).Info("Reconciling Node")
116+
klog.V(2).Infof("Reconciling Node %s", req.NamespacedName)
123117

124118
node := &corev1.Node{}
125119
if err := r.targetClient.Get(ctx, req.NamespacedName, node); err != nil {
126120
if client.IgnoreNotFound(err) != nil {
127121
return err
128122
}
129-
l.V(2).Info("Node not found, skipping reconciliation")
130-
return nil
131-
}
132-
133-
if node.Spec.ProviderID == "" {
134-
l.Info("Node has empty spec.providerID, skipping reconciliation")
123+
klog.V(2).Infof("Node %s not found, skipping reconciliation", req.NamespacedName)
135124
return nil
136125
}
137126

138-
serverClaimKey, err := parseProviderID(node.Spec.ProviderID)
139-
if err != nil {
140-
return fmt.Errorf("unable to parse provider ID: %w", err)
141-
}
142-
143-
l = l.WithValues("server-claim-name", serverClaimKey.Name, "server-claim-namespace", serverClaimKey.Namespace)
144-
145127
if !node.DeletionTimestamp.IsZero() {
146-
l.Info("Node is deleting, reconciling delete flow")
147-
return r.reconcileDelete(ctx, node, serverClaimKey)
128+
klog.V(2).Infof("Node %s is deleting, reconciling delete flow", req.NamespacedName)
129+
return r.reconcileDelete(ctx, node)
148130
}
149131

150-
if err = r.reconcilePodCIDR(ctx, node); err != nil {
132+
if err := r.reconcilePodCIDR(ctx, node); err != nil {
151133
return fmt.Errorf("unable to reconcile PodCIDR: %w", err)
152134
}
153135

154-
if err = r.reconcileMaintenance(ctx, node, serverClaimKey); err != nil {
136+
if err := r.reconcileMaintenance(ctx, node); err != nil {
155137
return fmt.Errorf("unable to reconcile maintenance: %w", err)
156138
}
157139

@@ -180,12 +162,26 @@ func parseProviderID(providerID string) (types.NamespacedName, error) {
180162
return types.NamespacedName{Namespace: parts[0], Name: parts[1]}, nil
181163
}
182164

183-
func (r *NodeReconciler) reconcileDelete(ctx context.Context, node *corev1.Node, serverClaimKey types.NamespacedName) error {
165+
func (r *NodeReconciler) reconcileDelete(ctx context.Context, node *corev1.Node) error {
184166
if !controllerutil.ContainsFinalizer(node, nodeMaintenanceFinalizer) {
185167
return nil
186168
}
187169

188-
if err := r.ensureServerMaintenanceNotExists(ctx, serverClaimKey); err != nil {
170+
serverClaimKey, err := parseProviderID(node.Spec.ProviderID)
171+
if err != nil {
172+
klog.Errorf("Node %s has empty/invalid spec.providerID during node deletion: %v. Skipping CR cleanup to unblock node deletion", node.Name, err)
173+
174+
base := node.DeepCopy()
175+
if removed := controllerutil.RemoveFinalizer(node, nodeMaintenanceFinalizer); removed {
176+
if patchErr := r.targetClient.Patch(ctx, node, client.MergeFrom(base)); patchErr != nil {
177+
return fmt.Errorf("unable to remove finalizer: %w", patchErr)
178+
}
179+
}
180+
181+
return nil
182+
}
183+
184+
if err = r.ensureServerMaintenanceNotExists(ctx, serverClaimKey); err != nil {
189185
return fmt.Errorf("unable to cleanup ServerMaintenance: %w", err)
190186
}
191187

@@ -200,11 +196,15 @@ func (r *NodeReconciler) reconcileDelete(ctx context.Context, node *corev1.Node,
200196
}
201197

202198
func (r *NodeReconciler) ensureServerMaintenanceNotExists(ctx context.Context, key types.NamespacedName) error {
203-
maintenance := &metalv1alpha1.ServerMaintenance{
204-
ObjectMeta: metav1.ObjectMeta{
205-
Name: key.Name,
206-
Namespace: key.Namespace,
207-
},
199+
200+
maintenance := &metalv1alpha1.ServerMaintenance{}
201+
if err := r.metalClient.Get(ctx, key, maintenance); err != nil {
202+
return client.IgnoreNotFound(err)
203+
}
204+
205+
if maintenance.Labels == nil || maintenance.Labels[labelKeyManagedBy] != cloudProviderMetalName {
206+
klog.Infof("ServerMaintenance %s exists but is not managed by CCM, skipping deletion", key)
207+
return nil
208208
}
209209

210210
if err := r.metalClient.Delete(ctx, maintenance); err != nil {
@@ -215,15 +215,13 @@ func (r *NodeReconciler) ensureServerMaintenanceNotExists(ctx context.Context, k
215215
}
216216

217217
func (r *NodeReconciler) reconcilePodCIDR(ctx context.Context, node *corev1.Node) error {
218-
l := ctrl.LoggerFrom(ctx)
219-
220218
if PodPrefixSize <= 0 {
221219
// <= 0 disables automatic assignment of pod CIDR.
222220
return nil
223221
}
224222

225223
if node.Spec.PodCIDR != "" {
226-
l.Info("PodCIDR is already populated; patch was not done", "PodCIDR", node.Spec.PodCIDR)
224+
klog.InfoS("PodCIDR is already populated; patch was not done", "Node", node.Name, "PodCIDR", node.Spec.PodCIDR)
227225
return nil
228226
}
229227

@@ -248,13 +246,13 @@ func (r *NodeReconciler) reconcilePodCIDR(ctx context.Context, node *corev1.Node
248246
return fmt.Errorf("failed to patch Node's PodCIDR with error %w", err)
249247
}
250248

251-
l.Info("Patched Node's PodCIDR and PodCIDRs", "PodCIDR", podCIDR)
249+
klog.InfoS("Patched Node's PodCIDR and PodCIDRs", "Node", node.Name, "PodCIDR", podCIDR)
252250

253251
return nil
254252
}
255253
}
256254

257-
l.Info("Node does not have a NodeInternalIP, not setting podCIDR")
255+
klog.Info("Node does not have a NodeInternalIP, not setting podCIDR")
258256
return nil
259257
}
260258

@@ -268,20 +266,23 @@ func zeroHostBits(ip net.IP, maskSize int) net.IP {
268266
return ip.Mask(mask)
269267
}
270268

271-
func (r *NodeReconciler) reconcileMaintenance(ctx context.Context, node *corev1.Node, serverClaimKey types.NamespacedName) error {
272-
l := ctrl.LoggerFrom(ctx)
269+
func (r *NodeReconciler) reconcileMaintenance(ctx context.Context, node *corev1.Node) error {
270+
serverClaimKey, err := parseProviderID(node.Spec.ProviderID)
271+
if err != nil {
272+
return fmt.Errorf("unable to parse provider ID: %w", err)
273+
}
273274

274275
serverClaim := &metalv1alpha1.ServerClaim{}
275-
if err := r.metalClient.Get(ctx, serverClaimKey, serverClaim); err != nil {
276+
if err = r.metalClient.Get(ctx, serverClaimKey, serverClaim); err != nil {
276277
if apierrors.IsNotFound(err) {
277-
l.Info("ServerClaim not found, skipping reconciliation")
278+
klog.V(2).Infof("ServerClaim %s not found, skipping reconciliation", serverClaimKey)
278279
return nil
279280
}
280281
return fmt.Errorf("unable to get ServerClaim: %w", err)
281282
}
282283

283284
if serverClaim.Spec.ServerRef == nil {
284-
l.Info("ServerClaim has empty ServerRef, skipping maintenance logic")
285+
klog.V(2).Infof("ServerClaim %s has empty ServerRef, skipping maintenance logic", serverClaimKey)
285286
return nil
286287
}
287288

pkg/cloudprovider/metal/node_controller_test.go

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,7 @@ var _ = Describe("NodeReconciler", func() {
207207
Expect(maintenanceCR.Spec.Priority).To(Equal(serverMaintenancePriority))
208208
Expect(maintenanceCR.Spec.ServerRef).NotTo(BeNil())
209209
Expect(maintenanceCR.Spec.ServerRef.Name).To(Equal(serverClaim.Spec.ServerRef.Name))
210+
Expect(maintenanceCR.Labels).To(HaveKeyWithValue(labelKeyManagedBy, cloudProviderMetalName))
210211

211212
By("Verifying the finalizer is added to the Node")
212213
Eventually(Object(node)).Should(HaveField("Finalizers", ContainElement(nodeMaintenanceFinalizer)))
@@ -220,7 +221,6 @@ var _ = Describe("NodeReconciler", func() {
220221
Labels: map[string]string{
221222
"test-marker": "do-not-overwrite",
222223
},
223-
Finalizers: []string{nodeMaintenanceFinalizer},
224224
},
225225
Spec: metalv1alpha1.ServerMaintenanceSpec{
226226
Policy: metalv1alpha1.ServerMaintenancePolicyOwnerApproval,
@@ -249,6 +249,7 @@ var _ = Describe("NodeReconciler", func() {
249249
}).Should(Succeed())
250250

251251
By("Verifying the finalizer is present in the Node")
252+
Eventually(Object(node)).Should(HaveField("Finalizers", ContainElement(nodeMaintenanceFinalizer)))
252253
Consistently(Object(node)).Should(HaveField("Finalizers", ContainElement(nodeMaintenanceFinalizer)))
253254
})
254255

@@ -348,6 +349,43 @@ var _ = Describe("NodeReconciler", func() {
348349
})
349350
})
350351

352+
It("should NOT delete the ServerMaintenance CR if it is not managed by the controller", func(ctx SpecContext) {
353+
const managedBy = "some-other-controller-or-admin"
354+
unownedCR := &metalv1alpha1.ServerMaintenance{
355+
ObjectMeta: metav1.ObjectMeta{
356+
Name: serverClaim.Name,
357+
Namespace: serverClaim.Namespace,
358+
Labels: map[string]string{
359+
labelKeyManagedBy: managedBy,
360+
},
361+
},
362+
Spec: metalv1alpha1.ServerMaintenanceSpec{
363+
Policy: metalv1alpha1.ServerMaintenancePolicyOwnerApproval,
364+
Priority: serverMaintenancePriority,
365+
ServerRef: &corev1.LocalObjectReference{
366+
Name: serverClaim.Spec.ServerRef.Name,
367+
},
368+
},
369+
}
370+
Expect(k8sClient.Create(ctx, unownedCR)).To(Succeed())
371+
DeferCleanup(k8sClient.Delete, unownedCR)
372+
373+
originalNode := node.DeepCopy()
374+
if node.Annotations == nil {
375+
node.Annotations = make(map[string]string)
376+
}
377+
node.Annotations["dummy-trigger"] = "trigger-reconcile"
378+
Expect(k8sClient.Patch(ctx, node, client.MergeFrom(originalNode))).To(Succeed())
379+
380+
By("Ensuring the unowned ServerMaintenance CR remains untouched")
381+
Consistently(func(g Gomega) {
382+
checkCR := &metalv1alpha1.ServerMaintenance{}
383+
err := k8sClient.Get(ctx, client.ObjectKeyFromObject(unownedCR), checkCR)
384+
g.Expect(err).NotTo(HaveOccurred())
385+
g.Expect(checkCR.Labels).To(HaveKeyWithValue(labelKeyManagedBy, managedBy))
386+
}).Should(Succeed())
387+
})
388+
351389
Context("Maintenance Approval Handshake", func() {
352390
It("should add the approval label to the ServerClaim when Node is approved", func(ctx SpecContext) {
353391
originalClaim := serverClaim.DeepCopy()

0 commit comments

Comments
 (0)