@@ -44,10 +44,9 @@ import (
4444)
4545
4646const (
47- SNRFinalizer = "self-node-remediation.medik8s.io/snr-finalizer"
48- nhcTimeOutAnnotation = "remediation.medik8s.io/nhc-timed-out"
49- excludeRemediationLabel = "remediation.medik8s.io/exclude-from-remediation"
50- outOfServiceTimestampAnnotation = "remediation.medik8s.io/oos-timestamp"
47+ SNRFinalizer = "self-node-remediation.medik8s.io/snr-finalizer"
48+ nhcTimeOutAnnotation = "remediation.medik8s.io/nhc-timed-out"
49+ excludeRemediationLabel = "remediation.medik8s.io/exclude-from-remediation"
5150
5251 eventReasonRemediationSkipped = "RemediationSkipped"
5352
@@ -429,25 +428,24 @@ func (r *SelfNodeRemediationReconciler) useOutOfServiceTaint(node *v1.Node, snr
429428 return 0 , err
430429 }
431430
432- isTaintRemoved , err := r .checkAndHandleExpiredOutOfServiceTaint (node )
433- if err != nil {
434- r .logger .Error (err , "Failed to check/handle expired out-of-service taint" , "node name" , node .Name )
435- return 0 , err
436- }
437-
438- if isTaintRemoved {
439- return 0 , nil
440- }
441-
442431 // We cannot control the deletion of node resources triggered by the "out-of-service" taint,
443432 // so a timer is used to avoid waiting indefinitely for that deletion to complete
444433 if ! r .isResourceDeletionCompleted (node ) {
445434 isExpired , timeLeft := r .isResourceDeletionExpired (snr )
446435 if ! isExpired {
447436 return timeLeft , nil
437+ } else if snr .GetDeletionTimestamp () != nil { // Time expired and node is healthy
438+ err := r .removeOutOfServiceTaint (node )
439+ if err != nil {
440+ return 0 , err
441+ }
442+ // Emit an event about the timeout expiration
443+ events .WarningEvent (r .Recorder , node , eventReasonOutOfServiceTimestampExpired ,
444+ "Out-of-service taint automatically removed due to timeout expiration on a healthy node" )
445+ return 0 , nil
446+ } else { // if the timer is expired, but the node is still unhealthy exponential backoff is triggered
447+ return 0 , errors .New ("Not ready to delete out-of-service taint" )
448448 }
449- // if the timer is expired, exponential backoff is triggered
450- return 0 , errors .New ("Not ready to delete out-of-service taint" )
451449 }
452450
453451 if err := r .removeOutOfServiceTaint (node ); err != nil {
@@ -854,19 +852,12 @@ func (r *SelfNodeRemediationReconciler) addOutOfServiceTaint(node *v1.Node) erro
854852 now := metav1 .Now ()
855853 taint .TimeAdded = & now
856854 node .Spec .Taints = append (node .Spec .Taints , taint )
857-
858- // Add out-of-service timestamp annotation
859- if node .Annotations == nil {
860- node .Annotations = make (map [string ]string )
861- }
862- node .Annotations [outOfServiceTimestampAnnotation ] = now .Format (time .RFC3339 )
863-
864855 if err := r .Client .Patch (context .Background (), node , patch ); err != nil {
865- r .logger .Error (err , "Failed to add out-of-service taint and timestamp annotation on node" , "node name" , node .Name )
856+ r .logger .Error (err , "Failed to add out-of-service taint on node" , "node name" , node .Name )
866857 return err
867858 }
868859 events .NormalEvent (r .Recorder , node , eventReasonAddOutOfService , "Remediation process - add out-of-service taint to unhealthy node" )
869- r .logger .Info ("out-of-service taint and timestamp annotation added" , "new taints" , node .Spec .Taints , "timestamp" , now . Format ( time . RFC3339 ) )
860+ r .logger .Info ("out-of-service taint added" , "new taints" , node .Spec .Taints )
870861 return nil
871862}
872863
@@ -883,67 +874,15 @@ func (r *SelfNodeRemediationReconciler) removeOutOfServiceTaint(node *v1.Node) e
883874 } else {
884875 node .Spec .Taints = taints
885876 }
886-
887- delete (node .Annotations , outOfServiceTimestampAnnotation )
888-
889877 if err := r .Client .Patch (context .Background (), node , patch ); err != nil {
890- r .logger .Error (err , "Failed to remove taint and timestamp annotation from node," , "node name" , node .Name , "taint key" , OutOfServiceTaint .Key , "taint effect" , OutOfServiceTaint .Effect )
878+ r .logger .Error (err , "Failed to remove taint from node," , "node name" , node .Name , "taint key" , OutOfServiceTaint .Key , "taint effect" , OutOfServiceTaint .Effect )
891879 return err
892880 }
893881 events .NormalEvent (r .Recorder , node , eventReasonRemoveOutOfService , "Remediation process - remove out-of-service taint from node" )
894- r .logger .Info ("out-of-service taint and timestamp annotation removed" , "new taints" , node .Spec .Taints )
882+ r .logger .Info ("out-of-service taint removed" , "new taints" , node .Spec .Taints )
895883 return nil
896884}
897885
898- // checkAndHandleExpiredOutOfServiceTaint checks if the out-of-service taint has been on the node for longer than OutOfServiceTimeoutDuration
899- // and removes it if expired
900- func (r * SelfNodeRemediationReconciler ) checkAndHandleExpiredOutOfServiceTaint (node * v1.Node ) (bool , error ) {
901- // Check if node has the timestamp annotation
902- if node .Annotations == nil {
903- return false , nil
904- }
905-
906- timestampStr , exists := node .Annotations [outOfServiceTimestampAnnotation ]
907- if ! exists {
908- r .logger .Info ("out-of-service taint exists but no timestamp annotation found" , "node name" , node .Name )
909- return false , nil
910- }
911-
912- // Parse the timestamp
913- timestamp , err := time .Parse (time .RFC3339 , timestampStr )
914- if err != nil {
915- r .logger .Error (err , "Failed to parse out-of-service timestamp annotation" , "node name" , node .Name , "timestamp" , timestampStr )
916- return false , err
917- }
918-
919- // Check if the timeout has expired
920- if time .Since (timestamp ) >= OutOfServiceTimeoutDuration {
921- r .logger .Info ("out-of-service taint timeout expired, removing taint and annotation" ,
922- "node name" , node .Name ,
923- "timestamp" , timestampStr ,
924- "elapsed" , time .Since (timestamp ),
925- "timeout" , OutOfServiceTimeoutDuration )
926-
927- // Remove the out-of-service taint and annotation
928- if err := r .removeOutOfServiceTaint (node ); err != nil {
929- return false , err
930- }
931-
932- // Emit an event about the timeout expiration
933- events .WarningEvent (r .Recorder , node , eventReasonOutOfServiceTimestampExpired ,
934- "Out-of-service taint automatically removed due to timeout expiration" )
935-
936- return true , nil
937- }
938-
939- r .logger .Info ("out-of-service taint timeout not yet expired" ,
940- "node name" , node .Name ,
941- "elapsed" , time .Since (timestamp ),
942- "timeout" , OutOfServiceTimeoutDuration )
943-
944- return false , nil
945- }
946-
947886func (r * SelfNodeRemediationReconciler ) isResourceDeletionCompleted (node * v1.Node ) bool {
948887 pods := & v1.PodList {}
949888 if err := r .Client .List (context .Background (), pods ); err != nil {
@@ -976,7 +915,7 @@ func (r *SelfNodeRemediationReconciler) isPodTerminating(pod *v1.Pod) bool {
976915}
977916
978917func (r * SelfNodeRemediationReconciler ) isResourceDeletionExpired (snr * v1alpha1.SelfNodeRemediation ) (bool , time.Duration ) {
979- waitTime := snr .Status .TimeAssumedRebooted .Add (300 * time . Second )
918+ waitTime := snr .Status .TimeAssumedRebooted .Add (OutOfServiceTimeoutDuration )
980919
981920 if waitTime .After (time .Now ()) {
982921 return false , 5 * time .Second
0 commit comments