
Commit 785a37c

fixup! fixup! OCPBUGS-62703: Relax duplicate events detection for Prometheus
1 parent 0d1ade8 commit 785a37c

File tree: 2 files changed, +126 −7 lines changed

pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go

Lines changed: 11 additions & 7 deletions
@@ -1219,13 +1219,17 @@ func newPrometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher(fin
 	})
 
 	if len(testIntervals) > 0 {
-		// Readiness probe errors are expected during upgrades, allow a higher threshold.
-		// Set the threshold to 100 to allow for a high number of readiness probe errors
-		// during the upgrade, but not so high that we would miss a real problem, i.e.,
-		// the job below (and usually) hit ~60 readiness errors during the upgrade,
-		// https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.20-upgrade-from-stable-4.19-e2e-aws-ovn-upgrade/1977094149035266048,
-		// However, the job below hit readiness errors 774 times during the upgrade,
-		// https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.19-upgrade-from-stable-4.18-e2e-metal-ovn-single-node-rt-upgrade-test/1975691393640697856.
+		/*
+			Readiness probes run during the lifecycle of the container, including termination.
+			Prometheus pods may take some time to stop, and thus result in more kubelet pings than permitted by default (20).
+			With a termination grace period of 600s, these pods may lead to probe errors (e.g. the web service is stopped but the process is still running), which is expected during upgrades.
+
+			To address this, set the threshold to 100 (approximately 600 (termination period) / 5 (probe interval)) to allow for a high number of readiness probe errors during the upgrade, but not so high that we would miss a real problem.
+			The job below hit ~60 readiness errors during the upgrade:
+			https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.20-upgrade-from-stable-4.19-e2e-aws-ovn-upgrade/1977094149035266048, which makes sense to ignore.
+			However, the job below hit readiness errors 774 times during the upgrade:
+			https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.19-upgrade-from-stable-4.18-e2e-metal-ovn-single-node-rt-upgrade-test/1975691393640697856, which should be caught.
+		*/
 		matcher.repeatThresholdOverride = 100
 	} else {
 		matcher.neverAllow = true
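
The threshold in the new comment is essentially back-of-the-envelope arithmetic. Below is a minimal, self-contained sketch of that calculation, assuming the 600s termination grace period and the 5s probe interval quoted in the comment; the constant and variable names are illustrative only and are not identifiers from the repository.

```go
package main

import "fmt"

func main() {
	const terminationGracePeriodSeconds = 600 // Prometheus pod termination grace period (from the comment)
	const probePeriodSeconds = 5              // readiness probe interval assumed by the comment

	// Upper bound on how many readiness probes the kubelet could fire while the
	// pod is still shutting down gracefully.
	maxProbesDuringTermination := terminationGracePeriodSeconds / probePeriodSeconds // 120

	// The override chosen in the patch: roughly that bound, and well above the
	// default of 20, but low enough that a genuinely stuck pod (e.g. the
	// 774-repeat single-node job linked above) still fails the check.
	const repeatThresholdOverride = 100

	fmt.Printf("probes during graceful shutdown <= %d, allowed repeats = %d\n",
		maxProbesDuringTermination, repeatThresholdOverride)
}
```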

pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_events_test.go

Lines changed: 115 additions & 0 deletions
@@ -2,6 +2,7 @@ package pathologicaleventlibrary
 
 import (
 	_ "embed"
+	"fmt"
 	"testing"
 	"time"
 
@@ -666,3 +667,117 @@ func TestPathologicalEventsTopologyAwareHintsDisabled(t *testing.T) {
 		})
 	}
 }
+
+func TestPathologicalEventsPrometheusReadinessProbeErrorsDuringUpgrades(t *testing.T) {
+	const namespace = "openshift-monitoring"
+
+	unhealthyReasonPathologicalMessageWithHumanMessage := func(humanMessage string, repetitionCount int) monitorapi.Message {
+		return monitorapi.Message{
+			Reason:       monitorapi.UnhealthyReason,
+			HumanMessage: humanMessage,
+			Annotations: map[monitorapi.AnnotationKey]string{
+				monitorapi.AnnotationCount:        fmt.Sprintf("%d", repetitionCount),
+				monitorapi.AnnotationPathological: "true",
+			},
+		}
+	}
+
+	openshiftMonitoringNsLocatorWithPodKey := func(pod string) monitorapi.Locator {
+		return monitorapi.Locator{
+			Type: monitorapi.LocatorTypePod,
+			Keys: map[monitorapi.LocatorKey]string{
+				monitorapi.LocatorNamespaceKey: "openshift-monitoring",
+				monitorapi.LocatorPodKey:       pod,
+			},
+		}
+	}
+
+	tests := []struct {
+		name            string
+		intervals       []monitorapi.Interval
+		expectedMessage string
+	}{
+		{
+			name: "Readiness probe error (stopping container) on first Prometheus pod",
+			intervals: []monitorapi.Interval{
+				{
+					Condition: monitorapi.Condition{
+						Locator: openshiftMonitoringNsLocatorWithPodKey("prometheus-k8s-0"),
+						Message: unhealthyReasonPathologicalMessageWithHumanMessage("Readiness probe errored: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1", 100),
+					},
+				},
+			},
+		},
+		{
+			name: "Readiness probe error (terminated container) on second Prometheus pod",
+			intervals: []monitorapi.Interval{
+				{
+					Condition: monitorapi.Condition{
+						Locator: openshiftMonitoringNsLocatorWithPodKey("prometheus-k8s-1"),
+						Message: unhealthyReasonPathologicalMessageWithHumanMessage("Readiness probe errored: rpc error: code = NotFound desc = container is not created or running: checking if PID of 58577e7deb7b8ae87b8029b9988fa268613748d0743ce989748f27e52b199ef5 is running failed: container process not found", 100),
+					},
+				},
+			},
+		},
+		{
+			name: "Readiness probe error (stopping container, different human message) on second Prometheus pod",
+			intervals: []monitorapi.Interval{
+				{
+					Condition: monitorapi.Condition{
+						Locator: openshiftMonitoringNsLocatorWithPodKey("prometheus-k8s-1"),
+						Message: unhealthyReasonPathologicalMessageWithHumanMessage("Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1", 100),
+					},
+				},
+			},
+		},
+		{
+			name: "Readiness probe error (stopping container, different human message) on non-existent Prometheus pod should not be ignored",
+			intervals: []monitorapi.Interval{
+				{
+					Condition: monitorapi.Condition{
+						Locator: openshiftMonitoringNsLocatorWithPodKey("prometheus-k8s-2"),
+						Message: unhealthyReasonPathologicalMessageWithHumanMessage("Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1", 100),
+					},
+				},
+			},
+			expectedMessage: "1 events happened too frequently\n\nevent happened 100 times, something is wrong: namespace/openshift-monitoring pod/prometheus-k8s-2 - reason/Unhealthy Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1 (00:00:00Z) result=reject ",
+		},
+		{
+			name: "Readiness probe error (stopping container, different human message) on a Prometheus pod should not be ignored above the acceptable limit",
+			intervals: []monitorapi.Interval{
+				{
+					Condition: monitorapi.Condition{
+						Locator: openshiftMonitoringNsLocatorWithPodKey("prometheus-k8s-1"),
+						Message: unhealthyReasonPathologicalMessageWithHumanMessage("Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1", 101),
+					},
+				},
+			},
+			expectedMessage: "1 events happened too frequently\n\nevent happened 101 times, something is wrong: namespace/openshift-monitoring pod/prometheus-k8s-1 - reason/Unhealthy Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1 (00:00:00Z) result=reject ",
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			events := monitorapi.Intervals(test.intervals)
+			evaluator := duplicateEventsEvaluator{
+				registry: NewUpgradePathologicalEventMatchers(nil, events),
+			}
+
+			testName := "events should not repeat"
+			junits := evaluator.testDuplicatedEvents(testName, false, events, nil, false)
+			jUnitName := getJUnitName(testName, namespace)
+			for _, junit := range junits {
+				if junit.Name == jUnitName {
+					if test.expectedMessage != "" {
+						require.NotNil(t, junit.FailureOutput, "expected junit to have failure output")
+						require.Equal(t, test.expectedMessage, junit.FailureOutput.Output)
+					} else {
+						require.Nil(t, junit.FailureOutput, "expected success but got failure output for junit: %s", junit.Name)
+					}
+
+					break
+				}
+			}
+		})
+	}
+}
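
For reference, the pass/fail boundary the table above encodes can be summarized in a tiny sketch. All names here are hypothetical, not code under test, and only the repeat-count dimension is modelled; the real matcher also restricts which pods and messages it applies to, which is why the prometheus-k8s-2 case is rejected even at 100 repeats.

```go
package main

import "fmt"

// exceedsThreshold is a hypothetical helper illustrating the count check only.
func exceedsThreshold(repeatCount, thresholdOverride int) bool {
	return repeatCount > thresholdOverride
}

func main() {
	const override = 100
	fmt.Println(exceedsThreshold(100, override)) // false: tolerated, as in the first three table cases
	fmt.Println(exceedsThreshold(101, override)) // true: flagged, as in the last table case
}
```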
