@@ -44,24 +44,26 @@ import (
4444)
4545
4646const (
47- SNRFinalizer = "self-node-remediation.medik8s.io/snr-finalizer"
48- nhcTimeOutAnnotation = "remediation.medik8s.io/nhc-timed-out"
49- excludeRemediationLabel = "remediation.medik8s.io/exclude-from-remediation"
47+ SNRFinalizer = "self-node-remediation.medik8s.io/snr-finalizer"
48+ nhcTimeOutAnnotation = "remediation.medik8s.io/nhc-timed-out"
49+ excludeRemediationLabel = "remediation.medik8s.io/exclude-from-remediation"
50+ outOfServiceTimestampAnnotation = "remediation.medik8s.io/oos-timestamp"
5051
5152 eventReasonRemediationSkipped = "RemediationSkipped"
5253
5354 // remediation
54- eventReasonAddFinalizer = "AddFinalizer"
55- eventReasonMarkUnschedulable = "MarkUnschedulable"
56- eventReasonAddNoExecute = "AddNoExecute"
57- eventReasonAddOutOfService = "AddOutOfService"
58- eventReasonUpdateTimeAssumedRebooted = "UpdateTimeAssumedRebooted"
59- eventReasonDeleteResources = "DeleteResources"
60- eventReasonMarkSchedulable = "MarkNodeSchedulable"
61- eventReasonRemoveFinalizer = "RemoveFinalizer"
62- eventReasonRemoveNoExecute = "RemoveNoExecuteTaint"
63- eventReasonRemoveOutOfService = "RemoveOutOfService"
64- eventReasonNodeReboot = "NodeReboot"
55+ eventReasonAddFinalizer = "AddFinalizer"
56+ eventReasonMarkUnschedulable = "MarkUnschedulable"
57+ eventReasonAddNoExecute = "AddNoExecute"
58+ eventReasonAddOutOfService = "AddOutOfService"
59+ eventReasonUpdateTimeAssumedRebooted = "UpdateTimeAssumedRebooted"
60+ eventReasonDeleteResources = "DeleteResources"
61+ eventReasonMarkSchedulable = "MarkNodeSchedulable"
62+ eventReasonRemoveFinalizer = "RemoveFinalizer"
63+ eventReasonRemoveNoExecute = "RemoveNoExecuteTaint"
64+ eventReasonRemoveOutOfService = "RemoveOutOfService"
65+ eventReasonNodeReboot = "NodeReboot"
66+ eventReasonOutOfServiceTimestampExpired = "OutOfServiceTimestampExpired"
6567)
6668
6769var (
8183 Value : "nodeshutdown" ,
8284 Effect : v1 .TaintEffectNoExecute ,
8385 }
86+
87+ // OutOfServiceTimeoutDuration - time after which out-of-service taint is automatically removed
88+ OutOfServiceTimeoutDuration = 1 * time .Minute
8489)
8590
8691type conditionReason string
@@ -424,6 +429,16 @@ func (r *SelfNodeRemediationReconciler) useOutOfServiceTaint(node *v1.Node, snr
424429 return 0 , err
425430 }
426431
432+ isTaintRemoved , err := r .checkAndHandleExpiredOutOfServiceTaint (node )
433+ if err != nil {
434+ r .logger .Error (err , "Failed to check/handle expired out-of-service taint" , "node name" , node .Name )
435+ return 0 , err
436+ }
437+
438+ if isTaintRemoved {
439+ return 0 , nil
440+ }
441+
427442 // We can not control to delete node resources by the "out-of-service" taint
428443 // So timer is used to avoid to keep waiting to complete
429444 if ! r .isResourceDeletionCompleted (node ) {
@@ -839,12 +854,19 @@ func (r *SelfNodeRemediationReconciler) addOutOfServiceTaint(node *v1.Node) erro
839854 now := metav1 .Now ()
840855 taint .TimeAdded = & now
841856 node .Spec .Taints = append (node .Spec .Taints , taint )
857+
858+ // Add out-of-service timestamp annotation
859+ if node .Annotations == nil {
860+ node .Annotations = make (map [string ]string )
861+ }
862+ node .Annotations [outOfServiceTimestampAnnotation ] = now .Format (time .RFC3339 )
863+
842864 if err := r .Client .Patch (context .Background (), node , patch ); err != nil {
843- r .logger .Error (err , "Failed to add out-of-service taint on node" , "node name" , node .Name )
865+ r .logger .Error (err , "Failed to add out-of-service taint and timestamp annotation on node" , "node name" , node .Name )
844866 return err
845867 }
846868 events .NormalEvent (r .Recorder , node , eventReasonAddOutOfService , "Remediation process - add out-of-service taint to unhealthy node" )
847- r .logger .Info ("out-of-service taint added" , "new taints" , node .Spec .Taints )
869+ r .logger .Info ("out-of-service taint and timestamp annotation added" , "new taints" , node .Spec .Taints , "timestamp" , now . Format ( time . RFC3339 ) )
848870 return nil
849871}
850872
@@ -861,15 +883,67 @@ func (r *SelfNodeRemediationReconciler) removeOutOfServiceTaint(node *v1.Node) e
861883 } else {
862884 node .Spec .Taints = taints
863885 }
886+
887+ delete (node .Annotations , outOfServiceTimestampAnnotation )
888+
864889 if err := r .Client .Patch (context .Background (), node , patch ); err != nil {
865- r .logger .Error (err , "Failed to remove taint from node," , "node name" , node .Name , "taint key" , OutOfServiceTaint .Key , "taint effect" , OutOfServiceTaint .Effect )
890+ r .logger .Error (err , "Failed to remove taint and timestamp annotation from node," , "node name" , node .Name , "taint key" , OutOfServiceTaint .Key , "taint effect" , OutOfServiceTaint .Effect )
866891 return err
867892 }
868893 events .NormalEvent (r .Recorder , node , eventReasonRemoveOutOfService , "Remediation process - remove out-of-service taint from node" )
869- r .logger .Info ("out-of-service taint removed" , "new taints" , node .Spec .Taints )
894+ r .logger .Info ("out-of-service taint and timestamp annotation removed" , "new taints" , node .Spec .Taints )
870895 return nil
871896}
872897
898+ // checkAndHandleExpiredOutOfServiceTaint checks if the out-of-service taint has been on the node for longer than OutOfServiceTimeoutDuration
899+ // and removes it if expired
900+ func (r * SelfNodeRemediationReconciler ) checkAndHandleExpiredOutOfServiceTaint (node * v1.Node ) (bool , error ) {
901+ // Check if node has the timestamp annotation
902+ if node .Annotations == nil {
903+ return false , nil
904+ }
905+
906+ timestampStr , exists := node .Annotations [outOfServiceTimestampAnnotation ]
907+ if ! exists {
908+ r .logger .Info ("out-of-service taint exists but no timestamp annotation found" , "node name" , node .Name )
909+ return false , nil
910+ }
911+
912+ // Parse the timestamp
913+ timestamp , err := time .Parse (time .RFC3339 , timestampStr )
914+ if err != nil {
915+ r .logger .Error (err , "Failed to parse out-of-service timestamp annotation" , "node name" , node .Name , "timestamp" , timestampStr )
916+ return false , err
917+ }
918+
919+ // Check if the timeout has expired
920+ if time .Since (timestamp ) >= OutOfServiceTimeoutDuration {
921+ r .logger .Info ("out-of-service taint timeout expired, removing taint and annotation" ,
922+ "node name" , node .Name ,
923+ "timestamp" , timestampStr ,
924+ "elapsed" , time .Since (timestamp ),
925+ "timeout" , OutOfServiceTimeoutDuration )
926+
927+ // Remove the out-of-service taint and annotation
928+ if err := r .removeOutOfServiceTaint (node ); err != nil {
929+ return false , err
930+ }
931+
932+ // Emit an event about the timeout expiration
933+ events .WarningEvent (r .Recorder , node , eventReasonOutOfServiceTimestampExpired ,
934+ "Out-of-service taint automatically removed due to timeout expiration" )
935+
936+ return true , nil
937+ }
938+
939+ r .logger .Info ("out-of-service taint timeout not yet expired" ,
940+ "node name" , node .Name ,
941+ "elapsed" , time .Since (timestamp ),
942+ "timeout" , OutOfServiceTimeoutDuration )
943+
944+ return false , nil
945+ }
946+
873947func (r * SelfNodeRemediationReconciler ) isResourceDeletionCompleted (node * v1.Node ) bool {
874948 pods := & v1.PodList {}
875949 if err := r .Client .List (context .Background (), pods ); err != nil {
0 commit comments