Skip to content

Commit 0bfa3d8

Browse files
Merge pull request #270 from mshitrit/oos-expiry
Add an expiry mechanism to out-of-service taint.
2 parents 96dfd8f + e85c4d2 commit 0bfa3d8

File tree

2 files changed

+86
-14
lines changed

2 files changed

+86
-14
lines changed

controllers/selfnoderemediation_controller.go

Lines changed: 27 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -51,17 +51,18 @@ const (
5151
eventReasonRemediationSkipped = "RemediationSkipped"
5252

5353
// remediation
54-
eventReasonAddFinalizer = "AddFinalizer"
55-
eventReasonMarkUnschedulable = "MarkUnschedulable"
56-
eventReasonAddNoExecute = "AddNoExecute"
57-
eventReasonAddOutOfService = "AddOutOfService"
58-
eventReasonUpdateTimeAssumedRebooted = "UpdateTimeAssumedRebooted"
59-
eventReasonDeleteResources = "DeleteResources"
60-
eventReasonMarkSchedulable = "MarkNodeSchedulable"
61-
eventReasonRemoveFinalizer = "RemoveFinalizer"
62-
eventReasonRemoveNoExecute = "RemoveNoExecuteTaint"
63-
eventReasonRemoveOutOfService = "RemoveOutOfService"
64-
eventReasonNodeReboot = "NodeReboot"
54+
eventReasonAddFinalizer = "AddFinalizer"
55+
eventReasonMarkUnschedulable = "MarkUnschedulable"
56+
eventReasonAddNoExecute = "AddNoExecute"
57+
eventReasonAddOutOfService = "AddOutOfService"
58+
eventReasonUpdateTimeAssumedRebooted = "UpdateTimeAssumedRebooted"
59+
eventReasonDeleteResources = "DeleteResources"
60+
eventReasonMarkSchedulable = "MarkNodeSchedulable"
61+
eventReasonRemoveFinalizer = "RemoveFinalizer"
62+
eventReasonRemoveNoExecute = "RemoveNoExecuteTaint"
63+
eventReasonRemoveOutOfService = "RemoveOutOfService"
64+
eventReasonNodeReboot = "NodeReboot"
65+
eventReasonOutOfServiceTimestampExpired = "OutOfServiceTimestampExpired"
6566
)
6667

6768
var (
@@ -81,6 +82,9 @@ var (
8182
Value: "nodeshutdown",
8283
Effect: v1.TaintEffectNoExecute,
8384
}
85+
86+
// OutOfServiceTimeoutDuration - time after which out-of-service taint is automatically removed
87+
OutOfServiceTimeoutDuration = 1 * time.Minute
8488
)
8589

8690
type conditionReason string
@@ -430,9 +434,18 @@ func (r *SelfNodeRemediationReconciler) useOutOfServiceTaint(node *v1.Node, snr
430434
isExpired, timeLeft := r.isResourceDeletionExpired(snr)
431435
if !isExpired {
432436
return timeLeft, nil
437+
} else if snr.GetDeletionTimestamp() != nil { // Time expired and node is healthy
438+
err := r.removeOutOfServiceTaint(node)
439+
if err != nil {
440+
return 0, err
441+
}
442+
// Emit an event about the timeout expiration
443+
events.WarningEvent(r.Recorder, node, eventReasonOutOfServiceTimestampExpired,
444+
"Out-of-service taint automatically removed due to timeout expiration on a healthy node")
445+
return 0, nil
446+
} else { // if the timer is expired, but the node is still unhealthy exponential backoff is triggered
447+
return 0, errors.New("Not ready to delete out-of-service taint")
433448
}
434-
// if the timer is expired, exponential backoff is triggered
435-
return 0, errors.New("Not ready to delete out-of-service taint")
436449
}
437450

438451
if err := r.removeOutOfServiceTaint(node); err != nil {
@@ -902,7 +915,7 @@ func (r *SelfNodeRemediationReconciler) isPodTerminating(pod *v1.Pod) bool {
902915
}
903916

904917
func (r *SelfNodeRemediationReconciler) isResourceDeletionExpired(snr *v1alpha1.SelfNodeRemediation) (bool, time.Duration) {
905-
waitTime := snr.Status.TimeAssumedRebooted.Add(300 * time.Second)
918+
waitTime := snr.Status.TimeAssumedRebooted.Add(OutOfServiceTimeoutDuration)
906919

907920
if waitTime.After(time.Now()) {
908921
return false, 5 * time.Second

controllers/tests/controller/selfnoderemediation_controller_test.go

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,65 @@ var _ = Describe("SNR Controller", func() {
352352

353353
verifySNRDoesNotExists(snr)
354354
})
355+
356+
It("should automatically remove out-of-service taint after timeout when workloads are not deleted", func() {
357+
// Temporarily set a much shorter timeout for testing (instead of 1 minute)
358+
originalTimeout := controllers.OutOfServiceTimeoutDuration
359+
controllers.OutOfServiceTimeoutDuration = 3 * time.Second
360+
DeferCleanup(func() {
361+
controllers.OutOfServiceTimeoutDuration = originalTimeout
362+
})
363+
364+
createSNR(snr, remediationStrategy)
365+
366+
node := verifyNodeIsUnschedulable()
367+
addUnschedulableTaint(node)
368+
369+
verifyTypeConditions(snr, metav1.ConditionTrue, metav1.ConditionUnknown, "RemediationStarted")
370+
371+
// Create terminating pod but DON'T delete it - this simulates workloads stuck in terminating state
372+
createTerminatingPod()
373+
374+
verifyTimeHasBeenRebootedExists(snr)
375+
verifyWatchdogTriggered()
376+
verifyFinalizerExists(snr)
377+
verifyNoExecuteTaintExist()
378+
verifyOutOfServiceTaintExist()
379+
380+
// Verify timestamp annotation was added
381+
nodeKey := types.NamespacedName{Name: shared.UnhealthyNodeName}
382+
updatedNode := &v1.Node{}
383+
verifyEvent("Normal", "AddOutOfService", "Remediation process - add out-of-service taint to unhealthy node")
384+
385+
// Simulate NHC trying to delete SNR because the node is healthy
386+
deleteSNR(snr)
387+
// Wait for less than timeout (1 second) and verify taint still exists
388+
time.Sleep(1 * time.Second)
389+
verifyOutOfServiceTaintExist()
390+
391+
// Wait for timeout to expire (3 seconds) and verify taint is automatically removed
392+
Eventually(func() bool {
393+
err := k8sClient.Get(context.Background(), nodeKey, updatedNode)
394+
if err != nil {
395+
return false
396+
}
397+
return !utils.TaintExists(updatedNode.Spec.Taints, &v1.Taint{
398+
Key: "node.kubernetes.io/out-of-service",
399+
Value: "nodeshutdown",
400+
Effect: v1.TaintEffectNoExecute,
401+
})
402+
}, 10*time.Second, 250*time.Millisecond).Should(BeTrue(), "out-of-service taint should be automatically removed after 3 second timeout")
403+
404+
// Verify warning event was emitted for timeout expiration
405+
verifyEvent("Warning", "OutOfServiceTimestampExpired", "Out-of-service taint automatically removed due to timeout expiration on a healthy node")
406+
407+
// Clean up: manually delete the terminating pod and SNR to complete the test
408+
deleteTerminatingPod()
409+
verifyNodeIsSchedulable()
410+
removeUnschedulableTaint()
411+
verifyNoExecuteTaintRemoved()
412+
verifySNRDoesNotExists(snr)
413+
})
355414
})
356415

357416
Context("Remediation has a Machine Owner Ref", func() {

0 commit comments

Comments
 (0)