Do not retry pods that are not scheduled

trozet · trozet · commit 4650923b5bf1 · 2025-05-20T15:20:12.000-04:00
In our pod handlers we have code that checks if a pod is scheduled. If
the pod is not scheduled, then we do not add the pod to our retry
framework. However, there are times where we automatically enqueue all
pods on a node or across the cluster. This can happen when a new node
comes up (pre-IC) and we need to add all pods on that node. Another use
case is when a new UDN/NAD is created and the controller spins up; we
add all pods then as well.

We shouldn't be queuing pods to the retry framework that are not
scheduled. It's a waste of operations. However, even if we do enqueue
them, we definitely should not be treating a non-scheduled resource as
an error and retrying it again later.

This commit changes the retry framework to detect the above case and
log an error. It does not trigger a retry of the resource, which could
otherwise perpetually fail and then cause OVNKubernetesResourceRetryFailure.

Signed-off-by: Tim Rozet &lt;trozet@redhat.com&gt;
diff --git a/go-controller/pkg/retry/obj_retry.go b/go-controller/pkg/retry/obj_retry.go
@@ -320,7 +320,11 @@ func (r *RetryFramework) resourceRetry(objKey string, now time.Time) {
 		}
 		if r.ResourceHandler.NeedsUpdateDuringRetry && entry.config != nil && entry.newObj != nil {
 			klog.Infof("%v retry: updating object %s", r.ResourceHandler.ObjType, objKey)
-			if err := r.ResourceHandler.UpdateResource(entry.config, entry.newObj, true); err != nil {
+			if !r.ResourceHandler.IsResourceScheduled(entry.newObj) {
+				// unscheduled resources (pods) will be retried again later we do not track these as failures, and should not retry.
+				// we should avoid queuing objects to the retry handler that are not scheduled. Thus treat this as an error.
+				klog.Errorf("%v retry: cannot update object that is not scheduled: %s", r.ResourceHandler.ObjType, objKey)
+			} else if err := r.ResourceHandler.UpdateResource(entry.config, entry.newObj, true); err != nil {
 				entry.timeStamp = time.Now()
 				entry.failedAttempts++
 				if entry.failedAttempts >= MaxFailedAttempts {
@@ -336,14 +340,12 @@ func (r *RetryFramework) resourceRetry(objKey string, now time.Time) {
 		} else {
 			// delete old object if needed
 			if entry.oldObj != nil {
-				klog.Infof("Removing old object: %s %s (failed: %v)",
-					r.ResourceHandler.ObjType, objKey, entry.failedAttempts)
+				klog.Infof("Removing old object: %s %s (failed: %v)", r.ResourceHandler.ObjType, objKey, entry.failedAttempts)
 				if !r.ResourceHandler.IsResourceScheduled(entry.oldObj) {
-					klog.V(5).Infof("Retry: %s %s not scheduled", r.ResourceHandler.ObjType, objKey)
-					entry.failedAttempts++
-					return
-				}
-				if err := r.ResourceHandler.DeleteResource(entry.oldObj, entry.config); err != nil {
+					// unscheduled resources (pods) will be retried again later we do not track these as failures, and should not retry.
+					// we should avoid queuing objects to the retry handler that are not scheduled. Thus treat this as an error.
+					klog.Errorf("%v retry: cannot delete object that was not scheduled %s", r.ResourceHandler.ObjType, objKey)
+				} else if err := r.ResourceHandler.DeleteResource(entry.oldObj, entry.config); err != nil {
 					entry.timeStamp = time.Now()
 					entry.failedAttempts++
 					if entry.failedAttempts >= MaxFailedAttempts {
@@ -363,11 +365,10 @@ func (r *RetryFramework) resourceRetry(objKey string, now time.Time) {
 			if entry.newObj != nil {
 				klog.Infof("Adding new object: %s %s", r.ResourceHandler.ObjType, objKey)
 				if !r.ResourceHandler.IsResourceScheduled(entry.newObj) {
-					klog.V(5).Infof("Retry: %s %s not scheduled", r.ResourceHandler.ObjType, objKey)
-					entry.failedAttempts++
-					return
-				}
-				if err := r.ResourceHandler.AddResource(entry.newObj, true); err != nil {
+					// unscheduled resources (pods) will be retried again later we do not track these as failures, and should not retry.
+					// we should avoid queuing objects to the retry handler that are not scheduled. Thus treat this as an error.
+					klog.Errorf("%v retry: cannot create object that is not scheduled %s", r.ResourceHandler.ObjType, objKey)
+				} else if err := r.ResourceHandler.AddResource(entry.newObj, true); err != nil {
 					entry.timeStamp = time.Now()
 					entry.failedAttempts++
 					if entry.failedAttempts >= MaxFailedAttempts {