Skip to content

Commit f4cf5a6

Browse files
committed
Add an expiry mechanism to out-of-service taint.
Signed-off-by: Michael Shitrit <[email protected]>
1 parent 96dfd8f commit f4cf5a6

File tree

2 files changed

+180
-18
lines changed

2 files changed

+180
-18
lines changed

controllers/selfnoderemediation_controller.go

Lines changed: 92 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -44,24 +44,26 @@ import (
4444
)
4545

4646
const (
47-
SNRFinalizer = "self-node-remediation.medik8s.io/snr-finalizer"
48-
nhcTimeOutAnnotation = "remediation.medik8s.io/nhc-timed-out"
49-
excludeRemediationLabel = "remediation.medik8s.io/exclude-from-remediation"
47+
SNRFinalizer = "self-node-remediation.medik8s.io/snr-finalizer"
48+
nhcTimeOutAnnotation = "remediation.medik8s.io/nhc-timed-out"
49+
excludeRemediationLabel = "remediation.medik8s.io/exclude-from-remediation"
50+
outOfServiceTimestampAnnotation = "remediation.medik8s.io/oos-timestamp"
5051

5152
eventReasonRemediationSkipped = "RemediationSkipped"
5253

5354
// remediation
54-
eventReasonAddFinalizer = "AddFinalizer"
55-
eventReasonMarkUnschedulable = "MarkUnschedulable"
56-
eventReasonAddNoExecute = "AddNoExecute"
57-
eventReasonAddOutOfService = "AddOutOfService"
58-
eventReasonUpdateTimeAssumedRebooted = "UpdateTimeAssumedRebooted"
59-
eventReasonDeleteResources = "DeleteResources"
60-
eventReasonMarkSchedulable = "MarkNodeSchedulable"
61-
eventReasonRemoveFinalizer = "RemoveFinalizer"
62-
eventReasonRemoveNoExecute = "RemoveNoExecuteTaint"
63-
eventReasonRemoveOutOfService = "RemoveOutOfService"
64-
eventReasonNodeReboot = "NodeReboot"
55+
eventReasonAddFinalizer = "AddFinalizer"
56+
eventReasonMarkUnschedulable = "MarkUnschedulable"
57+
eventReasonAddNoExecute = "AddNoExecute"
58+
eventReasonAddOutOfService = "AddOutOfService"
59+
eventReasonUpdateTimeAssumedRebooted = "UpdateTimeAssumedRebooted"
60+
eventReasonDeleteResources = "DeleteResources"
61+
eventReasonMarkSchedulable = "MarkNodeSchedulable"
62+
eventReasonRemoveFinalizer = "RemoveFinalizer"
63+
eventReasonRemoveNoExecute = "RemoveNoExecuteTaint"
64+
eventReasonRemoveOutOfService = "RemoveOutOfService"
65+
eventReasonNodeReboot = "NodeReboot"
66+
eventReasonOutOfServiceTimestampExpired = "OutOfServiceTimestampExpired"
6567
)
6668

6769
var (
@@ -81,6 +83,9 @@ var (
8183
Value: "nodeshutdown",
8284
Effect: v1.TaintEffectNoExecute,
8385
}
86+
87+
// OutOfServiceTimeoutDuration - time after which out-of-service taint is automatically removed
88+
OutOfServiceTimeoutDuration = 1 * time.Minute
8489
)
8590

8691
type conditionReason string
@@ -424,6 +429,16 @@ func (r *SelfNodeRemediationReconciler) useOutOfServiceTaint(node *v1.Node, snr
424429
return 0, err
425430
}
426431

432+
isTaintRemoved, err := r.checkAndHandleExpiredOutOfServiceTaint(node)
433+
if err != nil {
434+
r.logger.Error(err, "Failed to check/handle expired out-of-service taint", "node name", node.Name)
435+
return 0, err
436+
}
437+
438+
if isTaintRemoved {
439+
return 0, nil
440+
}
441+
427442
// We can not control to delete node resources by the "out-of-service" taint
428443
// So timer is used to avoid to keep waiting to complete
429444
if !r.isResourceDeletionCompleted(node) {
@@ -839,12 +854,19 @@ func (r *SelfNodeRemediationReconciler) addOutOfServiceTaint(node *v1.Node) erro
839854
now := metav1.Now()
840855
taint.TimeAdded = &now
841856
node.Spec.Taints = append(node.Spec.Taints, taint)
857+
858+
// Add out-of-service timestamp annotation
859+
if node.Annotations == nil {
860+
node.Annotations = make(map[string]string)
861+
}
862+
node.Annotations[outOfServiceTimestampAnnotation] = now.Format(time.RFC3339)
863+
842864
if err := r.Client.Patch(context.Background(), node, patch); err != nil {
843-
r.logger.Error(err, "Failed to add out-of-service taint on node", "node name", node.Name)
865+
r.logger.Error(err, "Failed to add out-of-service taint and timestamp annotation on node", "node name", node.Name)
844866
return err
845867
}
846868
events.NormalEvent(r.Recorder, node, eventReasonAddOutOfService, "Remediation process - add out-of-service taint to unhealthy node")
847-
r.logger.Info("out-of-service taint added", "new taints", node.Spec.Taints)
869+
r.logger.Info("out-of-service taint and timestamp annotation added", "new taints", node.Spec.Taints, "timestamp", now.Format(time.RFC3339))
848870
return nil
849871
}
850872

@@ -861,15 +883,67 @@ func (r *SelfNodeRemediationReconciler) removeOutOfServiceTaint(node *v1.Node) e
861883
} else {
862884
node.Spec.Taints = taints
863885
}
886+
887+
delete(node.Annotations, outOfServiceTimestampAnnotation)
888+
864889
if err := r.Client.Patch(context.Background(), node, patch); err != nil {
865-
r.logger.Error(err, "Failed to remove taint from node,", "node name", node.Name, "taint key", OutOfServiceTaint.Key, "taint effect", OutOfServiceTaint.Effect)
890+
r.logger.Error(err, "Failed to remove taint and timestamp annotation from node,", "node name", node.Name, "taint key", OutOfServiceTaint.Key, "taint effect", OutOfServiceTaint.Effect)
866891
return err
867892
}
868893
events.NormalEvent(r.Recorder, node, eventReasonRemoveOutOfService, "Remediation process - remove out-of-service taint from node")
869-
r.logger.Info("out-of-service taint removed", "new taints", node.Spec.Taints)
894+
r.logger.Info("out-of-service taint and timestamp annotation removed", "new taints", node.Spec.Taints)
870895
return nil
871896
}
872897

898+
// checkAndHandleExpiredOutOfServiceTaint checks if the out-of-service taint has been on the node for longer than OutOfServiceTimeoutDuration
899+
// and removes it if expired
900+
func (r *SelfNodeRemediationReconciler) checkAndHandleExpiredOutOfServiceTaint(node *v1.Node) (bool, error) {
901+
// Check if node has the timestamp annotation
902+
if node.Annotations == nil {
903+
return false, nil
904+
}
905+
906+
timestampStr, exists := node.Annotations[outOfServiceTimestampAnnotation]
907+
if !exists {
908+
r.logger.Info("out-of-service taint exists but no timestamp annotation found", "node name", node.Name)
909+
return false, nil
910+
}
911+
912+
// Parse the timestamp
913+
timestamp, err := time.Parse(time.RFC3339, timestampStr)
914+
if err != nil {
915+
r.logger.Error(err, "Failed to parse out-of-service timestamp annotation", "node name", node.Name, "timestamp", timestampStr)
916+
return false, err
917+
}
918+
919+
// Check if the timeout has expired
920+
if time.Since(timestamp) >= OutOfServiceTimeoutDuration {
921+
r.logger.Info("out-of-service taint timeout expired, removing taint and annotation",
922+
"node name", node.Name,
923+
"timestamp", timestampStr,
924+
"elapsed", time.Since(timestamp),
925+
"timeout", OutOfServiceTimeoutDuration)
926+
927+
// Remove the out-of-service taint and annotation
928+
if err := r.removeOutOfServiceTaint(node); err != nil {
929+
return false, err
930+
}
931+
932+
// Emit an event about the timeout expiration
933+
events.WarningEvent(r.Recorder, node, eventReasonOutOfServiceTimestampExpired,
934+
"Out-of-service taint automatically removed due to timeout expiration")
935+
936+
return true, nil
937+
}
938+
939+
r.logger.Info("out-of-service taint timeout not yet expired",
940+
"node name", node.Name,
941+
"elapsed", time.Since(timestamp),
942+
"timeout", OutOfServiceTimeoutDuration)
943+
944+
return false, nil
945+
}
946+
873947
func (r *SelfNodeRemediationReconciler) isResourceDeletionCompleted(node *v1.Node) bool {
874948
pods := &v1.PodList{}
875949
if err := r.Client.List(context.Background(), pods); err != nil {

controllers/tests/controller/selfnoderemediation_controller_test.go

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,94 @@ var _ = Describe("SNR Controller", func() {
352352

353353
verifySNRDoesNotExists(snr)
354354
})
355+
356+
It("should automatically remove out-of-service taint after timeout when workloads are not deleted", func() {
357+
// Temporarily set a much shorter timeout for testing (instead of 1 minute)
358+
originalTimeout := controllers.OutOfServiceTimeoutDuration
359+
controllers.OutOfServiceTimeoutDuration = 3 * time.Second
360+
DeferCleanup(func() {
361+
controllers.OutOfServiceTimeoutDuration = originalTimeout
362+
})
363+
364+
createSNR(snr, remediationStrategy)
365+
366+
node := verifyNodeIsUnschedulable()
367+
addUnschedulableTaint(node)
368+
369+
verifyTypeConditions(snr, metav1.ConditionTrue, metav1.ConditionUnknown, "RemediationStarted")
370+
371+
// Create terminating pod but DON'T delete it - this simulates workloads stuck in terminating state
372+
createTerminatingPod()
373+
374+
verifyTimeHasBeenRebootedExists(snr)
375+
verifyWatchdogTriggered()
376+
verifyFinalizerExists(snr)
377+
verifyNoExecuteTaintExist()
378+
verifyOutOfServiceTaintExist()
379+
380+
// Verify timestamp annotation was added
381+
nodeKey := types.NamespacedName{Name: shared.UnhealthyNodeName}
382+
updatedNode := &v1.Node{}
383+
Eventually(func() bool {
384+
err := k8sClient.Get(context.Background(), nodeKey, updatedNode)
385+
if err != nil {
386+
return false
387+
}
388+
if updatedNode.Annotations == nil {
389+
return false
390+
}
391+
timestampStr, exists := updatedNode.Annotations["remediation.medik8s.io/oos-timestamp"]
392+
if !exists || timestampStr == "" {
393+
return false
394+
}
395+
// Verify timestamp is valid RFC3339 format
396+
_, err = time.Parse(time.RFC3339, timestampStr)
397+
return err == nil
398+
}, 10*time.Second, 250*time.Millisecond).Should(BeTrue(), "out-of-service timestamp annotation should exist and be valid")
399+
400+
verifyEvent("Normal", "AddOutOfService", "Remediation process - add out-of-service taint to unhealthy node")
401+
402+
// Wait for less than timeout (1 second) and verify taint still exists
403+
time.Sleep(1 * time.Second)
404+
verifyOutOfServiceTaintExist()
405+
406+
// Wait for timeout to expire (3 seconds) and verify taint is automatically removed
407+
Eventually(func() bool {
408+
err := k8sClient.Get(context.Background(), nodeKey, updatedNode)
409+
if err != nil {
410+
return false
411+
}
412+
return !utils.TaintExists(updatedNode.Spec.Taints, &v1.Taint{
413+
Key: "node.kubernetes.io/out-of-service",
414+
Value: "nodeshutdown",
415+
Effect: v1.TaintEffectNoExecute,
416+
})
417+
}, 10*time.Second, 250*time.Millisecond).Should(BeTrue(), "out-of-service taint should be automatically removed after 3 second timeout")
418+
419+
// Verify timestamp annotation was also removed
420+
Eventually(func() bool {
421+
err := k8sClient.Get(context.Background(), nodeKey, updatedNode)
422+
if err != nil {
423+
return false
424+
}
425+
if updatedNode.Annotations == nil {
426+
return true
427+
}
428+
_, exists := updatedNode.Annotations["remediation.medik8s.io/oos-timestamp"]
429+
return !exists
430+
}, 10*time.Second, 250*time.Millisecond).Should(BeTrue(), "out-of-service timestamp annotation should be removed")
431+
432+
// Verify warning event was emitted for timeout expiration
433+
verifyEvent("Warning", "OutOfServiceTimestampExpired", "Out-of-service taint automatically removed due to timeout expiration")
434+
435+
// Clean up: manually delete the terminating pod and SNR to complete the test
436+
deleteTerminatingPod()
437+
deleteSNR(snr)
438+
verifyNodeIsSchedulable()
439+
removeUnschedulableTaint()
440+
verifyNoExecuteTaintRemoved()
441+
verifySNRDoesNotExists(snr)
442+
})
355443
})
356444

357445
Context("Remediation has a Machine Owner Ref", func() {

0 commit comments

Comments
 (0)