
Commit 785a37c

fixup! fixup! OCPBUGS-62703: Relax duplicate events detection for Prometheus
1 parent 0d1ade8 commit 785a37c

File tree: 2 files changed, +126 −7 lines changed

pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go

Lines changed: 11 additions & 7 deletions
@@ -1219,13 +1219,17 @@ func newPrometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher(fin
 	})
 
 	if len(testIntervals) > 0 {
-		// Readiness probe errors are expected during upgrades, allow a higher threshold.
-		// Set the threshold to 100 to allow for a high number of readiness probe errors
-		// during the upgrade, but not so high that we would miss a real problem, i.e.,
-		// the job below (and usually) hit ~60 readiness errors during the upgrade,
-		// https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.20-upgrade-from-stable-4.19-e2e-aws-ovn-upgrade/1977094149035266048,
-		// However, the job below hit readiness errors 774 times during the upgrade,
-		// https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.19-upgrade-from-stable-4.18-e2e-metal-ovn-single-node-rt-upgrade-test/1975691393640697856.
+		/*
+			Readiness probes run during the lifecycle of the container, including termination.
+			Prometheus pods may take some time to stop, and thus result in more kubelet pings than permitted by default (20).
+			With a termination grace period of 600s, these pods may lead to probe errors (e.g. the web service is stopped but the process is still running), which is expected during upgrades.
+
+			To address this, set the threshold to 100 (approximately 600 (termination period) / 5 (probe interval)) to allow for a high number of readiness probe errors during the upgrade, but not so high that we would miss a real problem.
+			The job below hit ~60 readiness errors during the upgrade:
+			https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.20-upgrade-from-stable-4.19-e2e-aws-ovn-upgrade/1977094149035266048, which makes sense to ignore.
+			However, the job below hit readiness errors 774 times during the upgrade:
+			https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.19-upgrade-from-stable-4.18-e2e-metal-ovn-single-node-rt-upgrade-test/1975691393640697856, which should be caught.
+		*/
 		matcher.repeatThresholdOverride = 100
 	} else {
 		matcher.neverAllow = true
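
The threshold in the new comment is essentially back-of-the-envelope arithmetic. Below is a minimal, self-contained sketch of that calculation, assuming the 600s termination grace period and the 5s probe interval quoted in the comment; the constant and variable names are illustrative only and are not identifiers from the repository.

```go
package main

import "fmt"

func main() {
	const terminationGracePeriodSeconds = 600 // Prometheus pod termination grace period (from the comment)
	const probePeriodSeconds = 5              // readiness probe interval assumed by the comment

	// Upper bound on how many readiness probes the kubelet could fire while the
	// pod is still shutting down gracefully.
	maxProbesDuringTermination := terminationGracePeriodSeconds / probePeriodSeconds // 120

	// The override chosen in the patch: roughly that bound, and well above the
	// default of 20, but low enough that a genuinely stuck pod (e.g. the
	// 774-repeat single-node job linked above) still fails the check.
	const repeatThresholdOverride = 100

	fmt.Printf("probes during graceful shutdown <= %d, allowed repeats = %d\n",
		maxProbesDuringTermination, repeatThresholdOverride)
}
```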

pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_events_test.go

Lines changed: 115 additions & 0 deletions
@@ -2,6 +2,7 @@ package pathologicaleventlibrary
 
 import (
 	_ "embed"
+	"fmt"
 	"testing"
 	"time"
 
@@ -666,3 +667,117 @@ func TestPathologicalEventsTopologyAwareHintsDisabled(t *testing.T) {
 		})
 	}
 }
+
+func TestPathologicalEventsPrometheusReadinessProbeErrorsDuringUpgrades(t *testing.T) {
+	const namespace = "openshift-monitoring"
+
+	unhealthyReasonPathologicalMessageWithHumanMessage := func(humanMessage string, repetitionCount int) monitorapi.Message {
+		return monitorapi.Message{
+			Reason:       monitorapi.UnhealthyReason,
+			HumanMessage: humanMessage,
+			Annotations: map[monitorapi.AnnotationKey]string{
+				monitorapi.AnnotationCount:        fmt.Sprintf("%d", repetitionCount),
+				monitorapi.AnnotationPathological: "true",
+			},
+		}
+	}
+
+	openshiftMonitoringNsLocatorWithPodKey := func(pod string) monitorapi.Locator {
+		return monitorapi.Locator{
+			Type: monitorapi.LocatorTypePod,
+			Keys: map[monitorapi.LocatorKey]string{
+				monitorapi.LocatorNamespaceKey: "openshift-monitoring",
+				monitorapi.LocatorPodKey:       pod,
+			},
+		}
+	}
+
+	tests := []struct {
+		name            string
+		intervals       []monitorapi.Interval
+		expectedMessage string
+	}{
+		{
+			name: "Readiness probe error (stopping container) on first Prometheus pod",
+			intervals: []monitorapi.Interval{
+				{
+					Condition: monitorapi.Condition{
+						Locator: openshiftMonitoringNsLocatorWithPodKey("prometheus-k8s-0"),
+						Message: unhealthyReasonPathologicalMessageWithHumanMessage("Readiness probe errored: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1", 100),
+					},
+				},
+			},
+		},
+		{
+			name: "Readiness probe error (terminated container) on second Prometheus pod",
+			intervals: []monitorapi.Interval{
+				{
+					Condition: monitorapi.Condition{
+						Locator: openshiftMonitoringNsLocatorWithPodKey("prometheus-k8s-1"),
+						Message: unhealthyReasonPathologicalMessageWithHumanMessage("Readiness probe errored: rpc error: code = NotFound desc = container is not created or running: checking if PID of 58577e7deb7b8ae87b8029b9988fa268613748d0743ce989748f27e52b199ef5 is running failed: container process not found", 100),
+					},
+				},
+			},
+		},
+		{
+			name: "Readiness probe error (stopping container, different human message) on second Prometheus pod",
+			intervals: []monitorapi.Interval{
+				{
+					Condition: monitorapi.Condition{
+						Locator: openshiftMonitoringNsLocatorWithPodKey("prometheus-k8s-1"),
+						Message: unhealthyReasonPathologicalMessageWithHumanMessage("Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1", 100),
+					},
+				},
+			},
+		},
+		{
+			name: "Readiness probe error (stopping container, different human message) on non-existent Prometheus pod should not be ignored",
+			intervals: []monitorapi.Interval{
+				{
+					Condition: monitorapi.Condition{
+						Locator: openshiftMonitoringNsLocatorWithPodKey("prometheus-k8s-2"),
+						Message: unhealthyReasonPathologicalMessageWithHumanMessage("Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1", 100),
+					},
+				},
+			},
+			expectedMessage: "1 events happened too frequently\n\nevent happened 100 times, something is wrong: namespace/openshift-monitoring pod/prometheus-k8s-2 - reason/Unhealthy Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1 (00:00:00Z) result=reject ",
+		},
+		{
+			name: "Readiness probe error (stopping container, different human message) on a Prometheus pod should not be ignored above the acceptable limit",
+			intervals: []monitorapi.Interval{
+				{
+					Condition: monitorapi.Condition{
+						Locator: openshiftMonitoringNsLocatorWithPodKey("prometheus-k8s-1"),
+						Message: unhealthyReasonPathologicalMessageWithHumanMessage("Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1", 101),
+					},
+				},
+			},
+			expectedMessage: "1 events happened too frequently\n\nevent happened 101 times, something is wrong: namespace/openshift-monitoring pod/prometheus-k8s-1 - reason/Unhealthy Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1 (00:00:00Z) result=reject ",
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			events := monitorapi.Intervals(test.intervals)
+			evaluator := duplicateEventsEvaluator{
+				registry: NewUpgradePathologicalEventMatchers(nil, events),
+			}
+
+			testName := "events should not repeat"
+			junits := evaluator.testDuplicatedEvents(testName, false, events, nil, false)
+			jUnitName := getJUnitName(testName, namespace)
+			for _, junit := range junits {
+				if junit.Name == jUnitName {
+					if test.expectedMessage != "" {
+						require.NotNil(t, junit.FailureOutput, "expected junit to have failure output")
+						require.Equal(t, test.expectedMessage, junit.FailureOutput.Output)
+					} else {
+						require.Nil(t, junit.FailureOutput, "expected success but got failure output for junit: %s", junit.Name)
+					}
+
+					break
+				}
+			}
+		})
+	}
+}
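
For reference, the pass/fail boundary the table above encodes can be summarized in a tiny sketch. All names here are hypothetical, not code under test, and only the repeat-count dimension is modelled; the real matcher also restricts which pods and messages it applies to, which is why the prometheus-k8s-2 case is rejected even at 100 repeats.

```go
package main

import "fmt"

// exceedsThreshold is a hypothetical helper illustrating the count check only.
func exceedsThreshold(repeatCount, thresholdOverride int) bool {
	return repeatCount > thresholdOverride
}

func main() {
	const override = 100
	fmt.Println(exceedsThreshold(100, override)) // false: tolerated, as in the first three table cases
	fmt.Println(exceedsThreshold(101, override)) // true: flagged, as in the last table case
}
```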
