Commit 606e09f

Merge pull request #30457 from weliang1/check-operators-for-pod-deletion
OCPBUGS-63478: Check operators for pod deletion
2 parents (4f47dc4 + 5f9cd4d), commit 606e09f

File tree

1 file changed: +35 -20 lines changed

pkg/monitortests/network/legacynetworkmonitortests/networking.go

Lines changed: 35 additions & 20 deletions
@@ -182,15 +182,9 @@ func testPodSandboxCreation(events monitorapi.Intervals, clientConfig *rest.Conf
 		}
 
 		partialLocator := monitorapi.NonUniquePodLocatorFrom(event.Locator)
+		progressingOperatorName := getProgressingOperatorName(event, operatorsProgressing)
 		if deletionTime := getPodDeletionTime(eventsForPods[partialLocator], event.Locator); deletionTime == nil {
-			var progressingOperatorName string
-			for _, operatorProgressingInterval := range operatorsProgressing {
-				if event.From.After(operatorProgressingInterval.From) &&
-					event.To.Before(operatorProgressingInterval.To) {
-					progressingOperatorName = operatorProgressingInterval.Locator.Keys[monitorapi.LocatorClusterOperatorKey]
-					break
-				}
-			}
+			// Pod was never deleted
 			if len(progressingOperatorName) > 0 {
 				flakes = append(flakes, fmt.Sprintf(
 					"%v - never deleted - operator:%s was progressing which may cause pod sandbox creation errors - %v",
@@ -201,19 +195,28 @@ func testPodSandboxCreation(events monitorapi.Intervals, clientConfig *rest.Conf
 					event.Locator.OldLocator(), event.Message.OldMessage()))
 			}
 		} else {
+			// Pod was deleted - check timing and operator status
 			timeBetweenDeleteAndFailure := event.From.Sub(*deletionTime)
-			switch {
-			case timeBetweenDeleteAndFailure < 1*time.Second:
-				// nothing here, one second is close enough to be ok, the kubelet and CNI just didn't know
-			case timeBetweenDeleteAndFailure < 5*time.Second:
-				// withing five seconds, it ought to be long enough to know, but it's close enough to flake and not fail
-				flakes = append(flakes, fmt.Sprintf("%v - %0.2f seconds after deletion - %v", event.Locator.OldLocator(), timeBetweenDeleteAndFailure.Seconds(), event.Message.OldMessage()))
-			case deletionTime.Before(event.From):
-				// something went wrong. More than five seconds after the pod was deleted, the CNI is trying to set up pod sandboxes and can't
-				failures = append(failures, fmt.Sprintf("%v - %0.2f seconds after deletion - %v", event.Locator.OldLocator(), timeBetweenDeleteAndFailure.Seconds(), event.Message.OldMessage()))
-			default:
-				// something went wrong. deletion happened after we had a failure to create the pod sandbox
-				failures = append(failures, fmt.Sprintf("%v - deletion came AFTER sandbox failure - %v", event.Locator.OldLocator(), event.Message.OldMessage()))
+			if len(progressingOperatorName) > 0 {
+				// If an operator was progressing, treat as flake regardless of timing
+				flakes = append(flakes, fmt.Sprintf(
+					"%v - %0.2f seconds after deletion - operator:%s was progressing which may cause pod sandbox creation errors - %v",
+					event.Locator.OldLocator(), timeBetweenDeleteAndFailure.Seconds(), progressingOperatorName, event.Message.OldMessage()))
+			} else {
+				// No operator progressing, apply timing-based logic
+				switch {
+				case timeBetweenDeleteAndFailure < 1*time.Second:
+					// nothing here, one second is close enough to be ok, the kubelet and CNI just didn't know
+				case timeBetweenDeleteAndFailure < 5*time.Second:
+					// withing five seconds, it ought to be long enough to know, but it's close enough to flake and not fail
+					flakes = append(flakes, fmt.Sprintf("%v - %0.2f seconds after deletion - %v", event.Locator.OldLocator(), timeBetweenDeleteAndFailure.Seconds(), event.Message.OldMessage()))
+				case deletionTime.Before(event.From):
+					// something went wrong. More than five seconds after the pod was deleted, the CNI is trying to set up pod sandboxes and can't
+					failures = append(failures, fmt.Sprintf("%v - %0.2f seconds after deletion - %v", event.Locator.OldLocator(), timeBetweenDeleteAndFailure.Seconds(), event.Message.OldMessage()))
+				default:
+					// something went wrong. deletion happened after we had a failure to create the pod sandbox
+					failures = append(failures, fmt.Sprintf("%v - deletion came AFTER sandbox failure - %v", event.Locator.OldLocator(), event.Message.OldMessage()))
+				}
 			}
 		}
 	}
@@ -318,6 +321,18 @@ func getPodDeletionTime(events monitorapi.Intervals, podLocator monitorapi.Locat
 	return nil
 }
 
+// getProgressingOperatorName checks if an event occurred during any operator's Progressing interval
+// and returns the name of the progressing operator, or empty string if none found.
+func getProgressingOperatorName(event monitorapi.Interval, operatorsProgressing monitorapi.Intervals) string {
+	for _, operatorProgressingInterval := range operatorsProgressing {
+		if event.From.After(operatorProgressingInterval.From) &&
+			event.To.Before(operatorProgressingInterval.To) {
+			return operatorProgressingInterval.Locator.Keys[monitorapi.LocatorClusterOperatorKey]
+		}
+	}
+	return ""
+}
+
 // bug is tracked here: https://bugzilla.redhat.com/show_bug.cgi?id=2057181
 // It was closed working as designed.
 func testOvnNodeReadinessProbe(events monitorapi.Intervals, kubeClientConfig *rest.Config) []*junitapi.JUnitTestCase {
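
Below is a minimal, self-contained sketch of the interval-containment check that getProgressingOperatorName performs, and of how its result feeds the flake-versus-failure decision in the hunks above. The interval type and the progressingOperatorFor function are simplified stand-ins for illustration only; they are not part of the monitorapi package.

package main

import (
	"fmt"
	"time"
)

// interval is a simplified stand-in for a monitor interval: a time window
// plus the name of the cluster operator that was Progressing during it.
type interval struct {
	From, To time.Time
	Operator string
}

// progressingOperatorFor returns the name of the first operator whose
// Progressing window fully contains the event window, or "" if none does.
// This mirrors the strict After/Before containment check in the diff above.
func progressingOperatorFor(eventFrom, eventTo time.Time, progressing []interval) string {
	for _, op := range progressing {
		if eventFrom.After(op.From) && eventTo.Before(op.To) {
			return op.Operator
		}
	}
	return ""
}

func main() {
	base := time.Now()
	progressing := []interval{
		{From: base, To: base.Add(10 * time.Minute), Operator: "network"},
	}

	// Sandbox failure inside the network operator's Progressing window:
	// the new logic records a flake regardless of how long after pod
	// deletion the failure occurred.
	fmt.Println(progressingOperatorFor(base.Add(1*time.Minute), base.Add(2*time.Minute), progressing)) // network

	// Failure outside any Progressing window: the empty result sends the
	// event through the timing-based switch (flake under 5s, failure after).
	fmt.Println(progressingOperatorFor(base.Add(20*time.Minute), base.Add(21*time.Minute), progressing)) // prints an empty line
}

Note that the containment check is strict on both ends (After and Before), so an event that starts exactly when an operator begins progressing, or that outlasts the Progressing window, is not attributed to that operator and still falls through to the timing-based classification.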
