Skip to content

Commit e85c4d2

Browse files
committed
Apply Feedback
- remove oos taint only when the node is healthy - reuse existing expiry mechanism - phrasing and other nits Signed-off-by: Michael Shitrit <[email protected]>
1 parent f4cf5a6 commit e85c4d2

File tree

2 files changed

+22
-112
lines changed

2 files changed

+22
-112
lines changed

controllers/selfnoderemediation_controller.go

Lines changed: 19 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -44,10 +44,9 @@ import (
4444
)
4545

4646
const (
47-
SNRFinalizer = "self-node-remediation.medik8s.io/snr-finalizer"
48-
nhcTimeOutAnnotation = "remediation.medik8s.io/nhc-timed-out"
49-
excludeRemediationLabel = "remediation.medik8s.io/exclude-from-remediation"
50-
outOfServiceTimestampAnnotation = "remediation.medik8s.io/oos-timestamp"
47+
SNRFinalizer = "self-node-remediation.medik8s.io/snr-finalizer"
48+
nhcTimeOutAnnotation = "remediation.medik8s.io/nhc-timed-out"
49+
excludeRemediationLabel = "remediation.medik8s.io/exclude-from-remediation"
5150

5251
eventReasonRemediationSkipped = "RemediationSkipped"
5352

@@ -429,25 +428,24 @@ func (r *SelfNodeRemediationReconciler) useOutOfServiceTaint(node *v1.Node, snr
429428
return 0, err
430429
}
431430

432-
isTaintRemoved, err := r.checkAndHandleExpiredOutOfServiceTaint(node)
433-
if err != nil {
434-
r.logger.Error(err, "Failed to check/handle expired out-of-service taint", "node name", node.Name)
435-
return 0, err
436-
}
437-
438-
if isTaintRemoved {
439-
return 0, nil
440-
}
441-
442431
// We can not control to delete node resources by the "out-of-service" taint
443432
// So timer is used to avoid to keep waiting to complete
444433
if !r.isResourceDeletionCompleted(node) {
445434
isExpired, timeLeft := r.isResourceDeletionExpired(snr)
446435
if !isExpired {
447436
return timeLeft, nil
437+
} else if snr.GetDeletionTimestamp() != nil { // Time expired and node is healthy
438+
err := r.removeOutOfServiceTaint(node)
439+
if err != nil {
440+
return 0, err
441+
}
442+
// Emit an event about the timeout expiration
443+
events.WarningEvent(r.Recorder, node, eventReasonOutOfServiceTimestampExpired,
444+
"Out-of-service taint automatically removed due to timeout expiration on a healthy node")
445+
return 0, nil
446+
} else { // if the timer is expired, but the node is still unhealthy exponential backoff is triggered
447+
return 0, errors.New("Not ready to delete out-of-service taint")
448448
}
449-
// if the timer is expired, exponential backoff is triggered
450-
return 0, errors.New("Not ready to delete out-of-service taint")
451449
}
452450

453451
if err := r.removeOutOfServiceTaint(node); err != nil {
@@ -854,19 +852,12 @@ func (r *SelfNodeRemediationReconciler) addOutOfServiceTaint(node *v1.Node) erro
854852
now := metav1.Now()
855853
taint.TimeAdded = &now
856854
node.Spec.Taints = append(node.Spec.Taints, taint)
857-
858-
// Add out-of-service timestamp annotation
859-
if node.Annotations == nil {
860-
node.Annotations = make(map[string]string)
861-
}
862-
node.Annotations[outOfServiceTimestampAnnotation] = now.Format(time.RFC3339)
863-
864855
if err := r.Client.Patch(context.Background(), node, patch); err != nil {
865-
r.logger.Error(err, "Failed to add out-of-service taint and timestamp annotation on node", "node name", node.Name)
856+
r.logger.Error(err, "Failed to add out-of-service taint on node", "node name", node.Name)
866857
return err
867858
}
868859
events.NormalEvent(r.Recorder, node, eventReasonAddOutOfService, "Remediation process - add out-of-service taint to unhealthy node")
869-
r.logger.Info("out-of-service taint and timestamp annotation added", "new taints", node.Spec.Taints, "timestamp", now.Format(time.RFC3339))
860+
r.logger.Info("out-of-service taint added", "new taints", node.Spec.Taints)
870861
return nil
871862
}
872863

@@ -883,67 +874,15 @@ func (r *SelfNodeRemediationReconciler) removeOutOfServiceTaint(node *v1.Node) e
883874
} else {
884875
node.Spec.Taints = taints
885876
}
886-
887-
delete(node.Annotations, outOfServiceTimestampAnnotation)
888-
889877
if err := r.Client.Patch(context.Background(), node, patch); err != nil {
890-
r.logger.Error(err, "Failed to remove taint and timestamp annotation from node,", "node name", node.Name, "taint key", OutOfServiceTaint.Key, "taint effect", OutOfServiceTaint.Effect)
878+
r.logger.Error(err, "Failed to remove taint from node,", "node name", node.Name, "taint key", OutOfServiceTaint.Key, "taint effect", OutOfServiceTaint.Effect)
891879
return err
892880
}
893881
events.NormalEvent(r.Recorder, node, eventReasonRemoveOutOfService, "Remediation process - remove out-of-service taint from node")
894-
r.logger.Info("out-of-service taint and timestamp annotation removed", "new taints", node.Spec.Taints)
882+
r.logger.Info("out-of-service taint removed", "new taints", node.Spec.Taints)
895883
return nil
896884
}
897885

898-
// checkAndHandleExpiredOutOfServiceTaint checks if the out-of-service taint has been on the node for longer than OutOfServiceTimeoutDuration
899-
// and removes it if expired
900-
func (r *SelfNodeRemediationReconciler) checkAndHandleExpiredOutOfServiceTaint(node *v1.Node) (bool, error) {
901-
// Check if node has the timestamp annotation
902-
if node.Annotations == nil {
903-
return false, nil
904-
}
905-
906-
timestampStr, exists := node.Annotations[outOfServiceTimestampAnnotation]
907-
if !exists {
908-
r.logger.Info("out-of-service taint exists but no timestamp annotation found", "node name", node.Name)
909-
return false, nil
910-
}
911-
912-
// Parse the timestamp
913-
timestamp, err := time.Parse(time.RFC3339, timestampStr)
914-
if err != nil {
915-
r.logger.Error(err, "Failed to parse out-of-service timestamp annotation", "node name", node.Name, "timestamp", timestampStr)
916-
return false, err
917-
}
918-
919-
// Check if the timeout has expired
920-
if time.Since(timestamp) >= OutOfServiceTimeoutDuration {
921-
r.logger.Info("out-of-service taint timeout expired, removing taint and annotation",
922-
"node name", node.Name,
923-
"timestamp", timestampStr,
924-
"elapsed", time.Since(timestamp),
925-
"timeout", OutOfServiceTimeoutDuration)
926-
927-
// Remove the out-of-service taint and annotation
928-
if err := r.removeOutOfServiceTaint(node); err != nil {
929-
return false, err
930-
}
931-
932-
// Emit an event about the timeout expiration
933-
events.WarningEvent(r.Recorder, node, eventReasonOutOfServiceTimestampExpired,
934-
"Out-of-service taint automatically removed due to timeout expiration")
935-
936-
return true, nil
937-
}
938-
939-
r.logger.Info("out-of-service taint timeout not yet expired",
940-
"node name", node.Name,
941-
"elapsed", time.Since(timestamp),
942-
"timeout", OutOfServiceTimeoutDuration)
943-
944-
return false, nil
945-
}
946-
947886
func (r *SelfNodeRemediationReconciler) isResourceDeletionCompleted(node *v1.Node) bool {
948887
pods := &v1.PodList{}
949888
if err := r.Client.List(context.Background(), pods); err != nil {
@@ -976,7 +915,7 @@ func (r *SelfNodeRemediationReconciler) isPodTerminating(pod *v1.Pod) bool {
976915
}
977916

978917
func (r *SelfNodeRemediationReconciler) isResourceDeletionExpired(snr *v1alpha1.SelfNodeRemediation) (bool, time.Duration) {
979-
waitTime := snr.Status.TimeAssumedRebooted.Add(300 * time.Second)
918+
waitTime := snr.Status.TimeAssumedRebooted.Add(OutOfServiceTimeoutDuration)
980919

981920
if waitTime.After(time.Now()) {
982921
return false, 5 * time.Second

controllers/tests/controller/selfnoderemediation_controller_test.go

Lines changed: 3 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -380,25 +380,10 @@ var _ = Describe("SNR Controller", func() {
380380
// Verify timestamp annotation was added
381381
nodeKey := types.NamespacedName{Name: shared.UnhealthyNodeName}
382382
updatedNode := &v1.Node{}
383-
Eventually(func() bool {
384-
err := k8sClient.Get(context.Background(), nodeKey, updatedNode)
385-
if err != nil {
386-
return false
387-
}
388-
if updatedNode.Annotations == nil {
389-
return false
390-
}
391-
timestampStr, exists := updatedNode.Annotations["remediation.medik8s.io/oos-timestamp"]
392-
if !exists || timestampStr == "" {
393-
return false
394-
}
395-
// Verify timestamp is valid RFC3339 format
396-
_, err = time.Parse(time.RFC3339, timestampStr)
397-
return err == nil
398-
}, 10*time.Second, 250*time.Millisecond).Should(BeTrue(), "out-of-service timestamp annotation should exist and be valid")
399-
400383
verifyEvent("Normal", "AddOutOfService", "Remediation process - add out-of-service taint to unhealthy node")
401384

385+
// Simulate NHC trying to delete SNR because the node is healthy
386+
deleteSNR(snr)
402387
// Wait for less than timeout (1 second) and verify taint still exists
403388
time.Sleep(1 * time.Second)
404389
verifyOutOfServiceTaintExist()
@@ -416,25 +401,11 @@ var _ = Describe("SNR Controller", func() {
416401
})
417402
}, 10*time.Second, 250*time.Millisecond).Should(BeTrue(), "out-of-service taint should be automatically removed after 3 second timeout")
418403

419-
// Verify timestamp annotation was also removed
420-
Eventually(func() bool {
421-
err := k8sClient.Get(context.Background(), nodeKey, updatedNode)
422-
if err != nil {
423-
return false
424-
}
425-
if updatedNode.Annotations == nil {
426-
return true
427-
}
428-
_, exists := updatedNode.Annotations["remediation.medik8s.io/oos-timestamp"]
429-
return !exists
430-
}, 10*time.Second, 250*time.Millisecond).Should(BeTrue(), "out-of-service timestamp annotation should be removed")
431-
432404
// Verify warning event was emitted for timeout expiration
433-
verifyEvent("Warning", "OutOfServiceTimestampExpired", "Out-of-service taint automatically removed due to timeout expiration")
405+
verifyEvent("Warning", "OutOfServiceTimestampExpired", "Out-of-service taint automatically removed due to timeout expiration on a healthy node")
434406

435407
// Clean up: manually delete the terminating pod and SNR to complete the test
436408
deleteTerminatingPod()
437-
deleteSNR(snr)
438409
verifyNodeIsSchedulable()
439410
removeUnschedulableTaint()
440411
verifyNoExecuteTaintRemoved()

0 commit comments

Comments
 (0)