Skip to content

Commit 85008d1

Browse files
committed
fixup! fixup! fixup! fixup! fixup! OCPBUGS-62703: Relax duplicate events detection for Prometheus
1 parent ed24ec4 commit 85008d1

File tree

2 files changed

+37
-38
lines changed

2 files changed

+37
-38
lines changed

pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go

Lines changed: 36 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -502,43 +502,6 @@ func NewUniversalPathologicalEventMatchers(kubeConfig *rest.Config, finalInterva
502502
twoNodeEtcdEndpointsMatcher := newTwoNodeEtcdEndpointsConfigMissingEventMatcher(finalIntervals)
503503
registry.AddPathologicalEventMatcherOrDie(twoNodeEtcdEndpointsMatcher)
504504

505-
registry.AddPathologicalEventMatcherOrDie(&SimplePathologicalEventMatcher{
506-
name: "PrometheusReadinessProbeErrors",
507-
locatorKeyRegexes: map[monitorapi.LocatorKey]*regexp.Regexp{
508-
monitorapi.LocatorNamespaceKey: regexp.MustCompile(`^openshift-monitoring$`),
509-
monitorapi.LocatorPodKey: regexp.MustCompile(`^prometheus-k8s-[0,1]$`),
510-
},
511-
messageReasonRegex: regexp.MustCompile(`^` + string(monitorapi.UnhealthyReason) + `$`),
512-
messageHumanRegex: regexp.MustCompile("Readiness probe errored"),
513-
jira: "https://issues.redhat.com/browse/OCPBUGS-62703",
514-
/*
515-
05:50:32 openshift-monitoring kubelet prometheus-k8s-1
516-
Unhealthy
517-
Readiness probe errored: rpc error: code = NotFound desc = container is not created or running: checking if PID of 58577e7deb7b8ae87b8029b9988fa268613748d0743ce989748f27e52b199ef5 is running failed: container process not found
518-
519-
05:53:52 (x25) openshift-monitoring kubelet prometheus-k8s-0
520-
Unhealthy
521-
Readiness probe errored: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1
522-
523-
11:44:16 (x56) openshift-monitoring kubelet prometheus-k8s-0
524-
Unhealthy
525-
Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1
526-
527-
Readiness probes run during the lifecycle of the container, including termination.
528-
Prometheus pods may take some time to stop, and thus result in more kubelet pings than permitted by default (20).
529-
With a termination grace period of 600s, these pods may lead to probe errors (e.g. the web service is stopped but the process is still running), which is expected during upgrades.
530-
531-
To address this, set the threshold to 100 (approximately 600 (termination period) / 5 (probe interval)), to allow for a high number of readiness probe errors during the upgrade, but not so high that we would miss a real problem.
532-
The job below hit ~60 readiness errors during the upgrade:
533-
https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.20-upgrade-from-stable-4.19-e2e-aws-ovn-upgrade/1977094149035266048, which makes sense to ignore.
534-
However, the job below hit readiness errors 774 times during the upgrade:
535-
https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.19-upgrade-from-stable-4.18-e2e-metal-ovn-single-node-rt-upgrade-test/1975691393640697856, which should be caught.
536-
537-
Also, do note that these events were exhibited outside of upgrades as well, so we need to allow them in general.
538-
*/
539-
repeatThresholdOverride: 100,
540-
})
541-
542505
return registry
543506
}
544507

@@ -601,6 +564,42 @@ func NewUpgradePathologicalEventMatchers(kubeConfig *rest.Config, finalIntervals
601564
m := newFailedSchedulingDuringNodeUpdatePathologicalEventMatcher(finalIntervals)
602565
registry.AddPathologicalEventMatcherOrDie(m)
603566

567+
// Prometheus pods may have readiness probe errors during upgrades.
568+
registry.AddPathologicalEventMatcherOrDie(&SimplePathologicalEventMatcher{
569+
name: "PrometheusReadinessProbeErrors",
570+
locatorKeyRegexes: map[monitorapi.LocatorKey]*regexp.Regexp{
571+
monitorapi.LocatorNamespaceKey: regexp.MustCompile(`^openshift-monitoring$`),
572+
monitorapi.LocatorPodKey: regexp.MustCompile(`^prometheus-k8s-[0,1]$`),
573+
},
574+
messageReasonRegex: regexp.MustCompile(`^` + string(monitorapi.UnhealthyReason) + `$`),
575+
messageHumanRegex: regexp.MustCompile("Readiness probe errored"),
576+
jira: "https://issues.redhat.com/browse/OCPBUGS-62703",
577+
/*
578+
05:50:32 openshift-monitoring kubelet prometheus-k8s-1
579+
Unhealthy
580+
Readiness probe errored: rpc error: code = NotFound desc = container is not created or running: checking if PID of 58577e7deb7b8ae87b8029b9988fa268613748d0743ce989748f27e52b199ef5 is running failed: container process not found
581+
582+
05:53:52 (x25) openshift-monitoring kubelet prometheus-k8s-0
583+
Unhealthy
584+
Readiness probe errored: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1
585+
586+
11:44:16 (x56) openshift-monitoring kubelet prometheus-k8s-0
587+
Unhealthy
588+
Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1
589+
590+
Readiness probes run during the lifecycle of the container, including termination.
591+
Prometheus pods may take some time to stop, and thus result in more kubelet pings than permitted by default (20).
592+
With a termination grace period of 600s, these pods may lead to probe errors (e.g. the web service is stopped but the process is still running), which is expected during upgrades.
593+
594+
To address this, set the threshold to 100 (approximately 600 (termination period) / 5 (probe interval)), to allow for a high number of readiness probe errors during the upgrade, but not so high that we would miss a real problem.
595+
The job below hit ~60 readiness errors during the upgrade:
596+
https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.20-upgrade-from-stable-4.19-e2e-aws-ovn-upgrade/1977094149035266048, which makes sense to ignore.
597+
However, the job below hit readiness errors 774 times during the upgrade:
598+
https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.19-upgrade-from-stable-4.18-e2e-metal-ovn-single-node-rt-upgrade-test/1975691393640697856, which should be caught.
599+
*/
600+
repeatThresholdOverride: 100,
601+
})
602+
604603
return registry
605604
}
606605

pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_events_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -761,7 +761,7 @@ func TestPathologicalEventsPrometheusReadinessProbeErrors(t *testing.T) {
761761
},
762762
})
763763
evaluator := duplicateEventsEvaluator{
764-
registry: NewUniversalPathologicalEventMatchers(nil, events),
764+
registry: NewUpgradePathologicalEventMatchers(nil, events),
765765
}
766766

767767
testName := "events should not repeat"

0 commit comments

Comments
 (0)