Skip to content

Commit e7f87de

Browse files
committed
nfd-master: retry node updates indefinitely
Treat node updates like a reconciliation loop: keep retrying a node update for as long as it fails. A node update that fails permanently likely indicates a bug in the nfd code (there should be no reason for it to fail forever), and it is better to see that clearly in the logs/metrics than to give up after a few retries.
1 parent 4790962 commit e7f87de

File tree

1 file changed

+6
-5
lines changed

1 file changed

+6
-5
lines changed

pkg/nfd-master/node-updater-pool.go

Lines changed: 6 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -56,14 +56,15 @@ func (u *nodeUpdaterPool) processNodeUpdateRequest(queue workqueue.RateLimitingI
5656
if _, err := u.nfdMaster.getNode(nodeName); apierrors.IsNotFound(err) {
5757
klog.InfoS("node not found, skip update", "nodeName", nodeName)
5858
} else if err := u.nfdMaster.nfdAPIUpdateOneNode(nodeName); err != nil {
59-
if queue.NumRequeues(nodeName) < 15 {
60-
klog.InfoS("retrying node update", "nodeName", nodeName, "lastError", err)
61-
queue.AddRateLimited(nodeName)
62-
return true
59+
if n := queue.NumRequeues(nodeName); n < 15 {
60+
klog.InfoS("retrying node update", "nodeName", nodeName, "lastError", err, "numRetries", n)
6361
} else {
64-
klog.ErrorS(err, "failed to update node", "nodeName", nodeName)
62+
klog.ErrorS(err, "node update failed, queuing for retry ", "nodeName", nodeName, "numRetries", n)
63+
// Count only long-failing attempts
6564
nodeUpdateFailures.Inc()
6665
}
66+
queue.AddRateLimited(nodeName)
67+
return true
6768
}
6869
queue.Forget(nodeName)
6970
return true

0 commit comments

Comments
 (0)