@@ -182,15 +182,9 @@ func testPodSandboxCreation(events monitorapi.Intervals, clientConfig *rest.Conf
 		}
 
 		partialLocator := monitorapi.NonUniquePodLocatorFrom(event.Locator)
+		progressingOperatorName := getProgressingOperatorName(event, operatorsProgressing)
 		if deletionTime := getPodDeletionTime(eventsForPods[partialLocator], event.Locator); deletionTime == nil {
-			var progressingOperatorName string
-			for _, operatorProgressingInterval := range operatorsProgressing {
-				if event.From.After(operatorProgressingInterval.From) &&
-					event.To.Before(operatorProgressingInterval.To) {
-					progressingOperatorName = operatorProgressingInterval.Locator.Keys[monitorapi.LocatorClusterOperatorKey]
-					break
-				}
-			}
+			// Pod was never deleted
 			if len(progressingOperatorName) > 0 {
 				flakes = append(flakes, fmt.Sprintf(
 					"%v - never deleted - operator:%s was progressing which may cause pod sandbox creation errors - %v",
@@ -201,19 +195,28 @@ func testPodSandboxCreation(events monitorapi.Intervals, clientConfig *rest.Conf
 					event.Locator.OldLocator(), event.Message.OldMessage()))
 			}
 		} else {
+			// Pod was deleted - check timing and operator status
 			timeBetweenDeleteAndFailure := event.From.Sub(*deletionTime)
-			switch {
-			case timeBetweenDeleteAndFailure < 1*time.Second:
-				// nothing here, one second is close enough to be ok, the kubelet and CNI just didn't know
-			case timeBetweenDeleteAndFailure < 5*time.Second:
-				// withing five seconds, it ought to be long enough to know, but it's close enough to flake and not fail
-				flakes = append(flakes, fmt.Sprintf("%v - %0.2f seconds after deletion - %v", event.Locator.OldLocator(), timeBetweenDeleteAndFailure.Seconds(), event.Message.OldMessage()))
-			case deletionTime.Before(event.From):
-				// something went wrong. More than five seconds after the pod was deleted, the CNI is trying to set up pod sandboxes and can't
-				failures = append(failures, fmt.Sprintf("%v - %0.2f seconds after deletion - %v", event.Locator.OldLocator(), timeBetweenDeleteAndFailure.Seconds(), event.Message.OldMessage()))
-			default:
-				// something went wrong. deletion happened after we had a failure to create the pod sandbox
-				failures = append(failures, fmt.Sprintf("%v - deletion came AFTER sandbox failure - %v", event.Locator.OldLocator(), event.Message.OldMessage()))
+			if len(progressingOperatorName) > 0 {
+				// If an operator was progressing, treat as flake regardless of timing
+				flakes = append(flakes, fmt.Sprintf(
+					"%v - %0.2f seconds after deletion - operator:%s was progressing which may cause pod sandbox creation errors - %v",
+					event.Locator.OldLocator(), timeBetweenDeleteAndFailure.Seconds(), progressingOperatorName, event.Message.OldMessage()))
+			} else {
+				// No operator progressing, apply timing-based logic
+				switch {
+				case timeBetweenDeleteAndFailure < 1*time.Second:
+					// nothing here, one second is close enough to be ok, the kubelet and CNI just didn't know
+				case timeBetweenDeleteAndFailure < 5*time.Second:
+					// withing five seconds, it ought to be long enough to know, but it's close enough to flake and not fail
+					flakes = append(flakes, fmt.Sprintf("%v - %0.2f seconds after deletion - %v", event.Locator.OldLocator(), timeBetweenDeleteAndFailure.Seconds(), event.Message.OldMessage()))
+				case deletionTime.Before(event.From):
+					// something went wrong. More than five seconds after the pod was deleted, the CNI is trying to set up pod sandboxes and can't
+					failures = append(failures, fmt.Sprintf("%v - %0.2f seconds after deletion - %v", event.Locator.OldLocator(), timeBetweenDeleteAndFailure.Seconds(), event.Message.OldMessage()))
+				default:
+					// something went wrong. deletion happened after we had a failure to create the pod sandbox
+					failures = append(failures, fmt.Sprintf("%v - deletion came AFTER sandbox failure - %v", event.Locator.OldLocator(), event.Message.OldMessage()))
+				}
 			}
 		}
 	}
@@ -318,6 +321,18 @@ func getPodDeletionTime(events monitorapi.Intervals, podLocator monitorapi.Locat
 	return nil
 }
 
+// getProgressingOperatorName checks if an event occurred during any operator's Progressing interval
+// and returns the name of the progressing operator, or empty string if none found.
+func getProgressingOperatorName(event monitorapi.Interval, operatorsProgressing monitorapi.Intervals) string {
+	for _, operatorProgressingInterval := range operatorsProgressing {
+		if event.From.After(operatorProgressingInterval.From) &&
+			event.To.Before(operatorProgressingInterval.To) {
+			return operatorProgressingInterval.Locator.Keys[monitorapi.LocatorClusterOperatorKey]
+		}
+	}
+	return ""
+}
+
 // bug is tracked here: https://bugzilla.redhat.com/show_bug.cgi?id=2057181
 // It was closed working as designed.
 func testOvnNodeReadinessProbe(events monitorapi.Intervals, kubeClientConfig *rest.Config) []*junitapi.JUnitTestCase {
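For readers skimming the refactor: the containment check in the extracted helper is strict on both ends, so a sandbox-failure event is only attributed to an operator when the event starts after the Progressing interval begins and ends before it ends. Below is a minimal, self-contained sketch of that classification logic; it uses simplified stand-in types and a hypothetical `progressingOperatorFor` function rather than the real `monitorapi` package, purely to illustrate the behavior.

```go
package main

import (
	"fmt"
	"time"
)

// interval is a simplified stand-in for monitorapi.Interval: a time window
// attributed to a cluster operator's Progressing condition.
type interval struct {
	operator string
	from, to time.Time
}

// progressingOperatorFor mirrors the helper's logic: return the first operator
// whose Progressing window strictly contains the event, or "" if none does.
func progressingOperatorFor(eventFrom, eventTo time.Time, progressing []interval) string {
	for _, p := range progressing {
		if eventFrom.After(p.from) && eventTo.Before(p.to) {
			return p.operator
		}
	}
	return ""
}

func main() {
	base := time.Now()
	progressing := []interval{
		{operator: "network", from: base, to: base.Add(10 * time.Minute)},
	}

	// Failure entirely inside the network operator's Progressing window: reported as a flake.
	fmt.Println(progressingOperatorFor(base.Add(1*time.Minute), base.Add(2*time.Minute), progressing)) // "network"

	// Failure outside any Progressing window: falls through to the timing-based switch.
	fmt.Println(progressingOperatorFor(base.Add(11*time.Minute), base.Add(12*time.Minute), progressing)) // ""
}
```

Note that with this strict After/Before check, an event that only partially overlaps a Progressing window (or touches its endpoints exactly) gets no operator name and is handled by the existing timing-based switch.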