
Commit 94655c9

Merge pull request #30372 from rexagod/OCPBUGS-62703
OCPBUGS-62703: Relax duplicate events detection for Prometheus
2 parents: 606e09f + 85008d1

3 files changed (+154, -0 lines)

pkg/monitor/monitorapi/types.go

Lines changed: 2 additions & 0 deletions
@@ -251,6 +251,8 @@ const (
 	FailedToAuthenticateWithOpenShiftUser IntervalReason = "FailedToAuthenticateWithOpenShiftUser"
 	FailedContactingAPIReason             IntervalReason = "FailedContactingAPI"
 
+	UnhealthyReason IntervalReason = "Unhealthy"
+
 	UpgradeStartedReason  IntervalReason = "UpgradeStarted"
 	UpgradeVersionReason  IntervalReason = "UpgradeVersion"
 	UpgradeRollbackReason IntervalReason = "UpgradeRollback"

pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go

Lines changed: 36 additions & 0 deletions
@@ -564,6 +564,42 @@ func NewUpgradePathologicalEventMatchers(kubeConfig *rest.Config, finalIntervals
 	m := newFailedSchedulingDuringNodeUpdatePathologicalEventMatcher(finalIntervals)
 	registry.AddPathologicalEventMatcherOrDie(m)
 
+	// Prometheus pods may have readiness probe errors during upgrades.
+	registry.AddPathologicalEventMatcherOrDie(&SimplePathologicalEventMatcher{
+		name: "PrometheusReadinessProbeErrors",
+		locatorKeyRegexes: map[monitorapi.LocatorKey]*regexp.Regexp{
+			monitorapi.LocatorNamespaceKey: regexp.MustCompile(`^openshift-monitoring$`),
+			monitorapi.LocatorPodKey:       regexp.MustCompile(`^prometheus-k8s-[0,1]$`),
+		},
+		messageReasonRegex: regexp.MustCompile(`^` + string(monitorapi.UnhealthyReason) + `$`),
+		messageHumanRegex:  regexp.MustCompile("Readiness probe errored"),
+		jira:               "https://issues.redhat.com/browse/OCPBUGS-62703",
+		/*
+			05:50:32 openshift-monitoring kubelet prometheus-k8s-1
+			Unhealthy
+			Readiness probe errored: rpc error: code = NotFound desc = container is not created or running: checking if PID of 58577e7deb7b8ae87b8029b9988fa268613748d0743ce989748f27e52b199ef5 is running failed: container process not found
+
+			05:53:52 (x25) openshift-monitoring kubelet prometheus-k8s-0
+			Unhealthy
+			Readiness probe errored: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1
+
+			11:44:16 (x56) openshift-monitoring kubelet prometheus-k8s-0
+			Unhealthy
+			Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1
+
+			Readiness probes run during the lifecycle of the container, including termination.
+			Prometheus pods may take some time to stop, and thus result in more kubelet pings than permitted by default (20).
+			With a termination grace period of 600s, these pods may lead to probe errors (e.g. the web service is stopped but the process is still running), which is expected during upgrades.
+
+			To address this, set the threshold to 100 (approximately 600 (termination period) / 5 (probe interval)), to allow for a high number of readiness probe errors during the upgrade, but not so high that we would miss a real problem.
+			The job below hit ~60 readiness errors during the upgrade:
+			https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.20-upgrade-from-stable-4.19-e2e-aws-ovn-upgrade/1977094149035266048, which makes sense to ignore,
+			However, the job below hit readiness errors 774 times during the upgrade:
+			https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.19-upgrade-from-stable-4.18-e2e-metal-ovn-single-node-rt-upgrade-test/1975691393640697856, which should be caught.
+		*/
+		repeatThresholdOverride: 100,
+	})
+
 	return registry
 }
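
The threshold arithmetic quoted in the code comment above can be sanity-checked in isolation. Below is a minimal, self-contained sketch that uses only the figures from that comment (600s termination grace period, 5s probe interval, default limit of 20, override of 100); the constant names are illustrative and not identifiers from this repository.

package main

import "fmt"

// Back-of-the-envelope check for repeatThresholdOverride: with a 600s
// termination grace period and a readiness probe firing every 5s (per the
// comment above), up to ~120 probes can run while a Prometheus pod drains.
// That is far above the default pathological-event limit of 20 and slightly
// above the chosen override of 100.
func main() {
	const (
		terminationGracePeriodSeconds = 600 // termination grace period quoted above
		probePeriodSeconds            = 5   // probe interval quoted above
		defaultRepeatThreshold        = 20  // default limit mentioned above
		repeatThresholdOverride       = 100 // value set on the matcher above
	)
	maxProbesDuringShutdown := terminationGracePeriodSeconds / probePeriodSeconds // 120
	fmt.Printf("probes possible during shutdown: ~%d (default limit %d, override %d)\n",
		maxProbesDuringShutdown, defaultRepeatThreshold, repeatThresholdOverride)
}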

pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_events_test.go

Lines changed: 116 additions & 0 deletions
@@ -2,6 +2,7 @@ package pathologicaleventlibrary
 
 import (
 	_ "embed"
+	"fmt"
 	"testing"
 	"time"
 
@@ -666,3 +667,118 @@ func TestPathologicalEventsTopologyAwareHintsDisabled(t *testing.T) {
 		})
 	}
 }
+
+func TestPathologicalEventsPrometheusReadinessProbeErrors(t *testing.T) {
+	const namespace = "openshift-monitoring"
+
+	unhealthyReasonPathologicalMessageWithHumanMessage := func(humanMessage string, repetitionCount int) monitorapi.Message {
+		return monitorapi.Message{
+			Reason:       monitorapi.UnhealthyReason,
+			HumanMessage: humanMessage,
+			Annotations: map[monitorapi.AnnotationKey]string{
+				monitorapi.AnnotationCount:        fmt.Sprintf("%d", repetitionCount),
+				monitorapi.AnnotationPathological: "true",
+			},
+		}
+	}
+
+	nsLocatorWithPodKey := func(pod, ns string) monitorapi.Locator {
+		return monitorapi.Locator{
+			Type: monitorapi.LocatorTypePod,
+			Keys: map[monitorapi.LocatorKey]string{
+				monitorapi.LocatorNamespaceKey: ns,
+				monitorapi.LocatorPodKey:       pod,
+			},
+		}
+	}
+
+	tests := []struct {
+		name            string
+		expectedMessage string
+		pod             string
+		ns              string
+		humanMessage    string
+		repetitionCount int
+	}{
+		{
+			name:            "Readiness probe error (stopping container) on first Prometheus pod",
+			expectedMessage: "",
+			pod:             "prometheus-k8s-0",
+			ns:              namespace,
+			humanMessage:    "Readiness probe errored: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1",
+			repetitionCount: 100,
+		},
+		{
+			name:            "Readiness probe error (terminated container) on second Prometheus pod",
+			expectedMessage: "",
+			pod:             "prometheus-k8s-1",
+			ns:              namespace,
+			humanMessage:    "Readiness probe errored: rpc error: code = NotFound desc = container is not created or running: checking if PID of 58577e7deb7b8ae87b8029b9988fa268613748d0743ce989748f27e52b199ef5 is running failed: container process not found",
+			repetitionCount: 100,
+		},
+		{
+			name:            "Readiness probe error (stopping container, different human message) on second Prometheus pod",
+			expectedMessage: "",
+			pod:             "prometheus-k8s-1",
+			ns:              namespace,
+			humanMessage:    "Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1",
+			repetitionCount: 100,
+		},
+		{
+			name:            "Readiness probe error (stopping container) on a Prometheus pod in a different namespace should not be ignored",
+			expectedMessage: "1 events happened too frequently\n\nevent happened 100 times, something is wrong: namespace/foo pod/prometheus-k8s-1 - reason/Unhealthy Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1 (00:00:00Z) result=reject ",
+			pod:             "prometheus-k8s-1",
+			ns:              "foo",
+			humanMessage:    "Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1",
+			repetitionCount: 100,
+		},
+		{
+			name:            "Readiness probe error (stopping container) on non-existent Prometheus pod should not be ignored",
+			expectedMessage: "1 events happened too frequently\n\nevent happened 100 times, something is wrong: namespace/openshift-monitoring pod/prometheus-k8s-2 - reason/Unhealthy Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1 (00:00:00Z) result=reject ",
+			pod:             "prometheus-k8s-2",
+			ns:              namespace,
+			humanMessage:    "Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1",
+			repetitionCount: 100,
+		},
+		{
+			name:            "Readiness probe error (stopping container, different human message) on a Prometheus pod should not be ignored above the acceptable limit",
+			expectedMessage: "1 events happened too frequently\n\nevent happened 101 times, something is wrong: namespace/openshift-monitoring pod/prometheus-k8s-1 - reason/Unhealthy Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1 (00:00:00Z) result=reject ",
+			pod:             "prometheus-k8s-1",
+			ns:              namespace,
+			humanMessage:    "Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1",
+			repetitionCount: 101,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			events := monitorapi.Intervals([]monitorapi.Interval{
+				{
+					Condition: monitorapi.Condition{
+						Locator: nsLocatorWithPodKey(test.pod, test.ns),
+						Message: unhealthyReasonPathologicalMessageWithHumanMessage(test.humanMessage, test.repetitionCount),
+					},
+				},
+			})
+			evaluator := duplicateEventsEvaluator{
+				registry: NewUpgradePathologicalEventMatchers(nil, events),
+			}
+
+			testName := "events should not repeat"
+			junits := evaluator.testDuplicatedEvents(testName, false, events, nil, false)
+			jUnitName := getJUnitName(testName, test.ns)
+			for _, junit := range junits {
+				if junit.Name == jUnitName {
+					if test.expectedMessage != "" {
+						require.NotNil(t, junit.FailureOutput, "expected junit to have failure output")
+						require.Equal(t, test.expectedMessage, junit.FailureOutput.Output)
+					} else {
+						require.Nil(t, junit.FailureOutput, "expected success but got failure output for junit: %s", junit.Name)
+					}
+
+					break
+				}
+			}
+		})
+	}
+}
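
For reference, the new test added in this file can be exercised on its own from the repository root with a standard go test invocation (the -run pattern is simply the test function name above):

go test ./pkg/monitortestlibrary/pathologicaleventlibrary/ -run TestPathologicalEventsPrometheusReadinessProbeErrors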
