Skip to content

Commit ed24ec4

Browse files
committed
fixup! fixup! fixup! fixup! OCPBUGS-62703: Relax duplicate events detection for Prometheus
1 parent 8753df3 commit ed24ec4

File tree

1 file changed

+53
-52
lines changed

1 file changed

+53
-52
lines changed

pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_events_test.go

Lines changed: 53 additions & 52 deletions
Original file line number | Diff line number | Diff line change
@@ -668,7 +668,7 @@ func TestPathologicalEventsTopologyAwareHintsDisabled(t *testing.T) {
668668
}
669669
}
670670

671-
func TestPathologicalEventsPrometheusReadinessProbeErrorsDuringUpgrades(t *testing.T) {
671+
func TestPathologicalEventsPrometheusReadinessProbeErrors(t *testing.T) {
672672
const namespace = "openshift-monitoring"
673673

674674
unhealthyReasonPathologicalMessageWithHumanMessage := func(humanMessage string, repetitionCount int) monitorapi.Message {
@@ -682,90 +682,91 @@ func TestPathologicalEventsPrometheusReadinessProbeErrorsDuringUpgrades(t *testi
682682
}
683683
}
684684

685-
openshiftMonitoringNsLocatorWithPodKey := func(pod string) monitorapi.Locator {
685+
nsLocatorWithPodKey := func(pod, ns string) monitorapi.Locator {
686686
return monitorapi.Locator{
687687
Type: monitorapi.LocatorTypePod,
688688
Keys: map[monitorapi.LocatorKey]string{
689-
monitorapi.LocatorNamespaceKey: "openshift-monitoring",
689+
monitorapi.LocatorNamespaceKey: ns,
690690
monitorapi.LocatorPodKey: pod,
691691
},
692692
}
693693
}
694694

695695
tests := []struct {
696696
name string
697-
intervals []monitorapi.Interval
698697
expectedMessage string
698+
pod string
699+
ns string
700+
humanMessage string
701+
repetitionCount int
699702
}{
700703
{
701-
name: "Readiness probe error (stopping container) on first Prometheus pod",
702-
intervals: []monitorapi.Interval{
703-
{
704-
Condition: monitorapi.Condition{
705-
Locator: openshiftMonitoringNsLocatorWithPodKey("prometheus-k8s-0"),
706-
Message: unhealthyReasonPathologicalMessageWithHumanMessage("Readiness probe errored: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1", 100),
707-
},
708-
},
709-
},
704+
name: "Readiness probe error (stopping container) on first Prometheus pod",
705+
expectedMessage: "",
706+
pod: "prometheus-k8s-0",
707+
ns: namespace,
708+
humanMessage: "Readiness probe errored: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1",
709+
repetitionCount: 100,
710710
},
711711
{
712-
name: "Readiness probe error (terminated container) on second Prometheus pod",
713-
intervals: []monitorapi.Interval{
714-
{
715-
Condition: monitorapi.Condition{
716-
Locator: openshiftMonitoringNsLocatorWithPodKey("prometheus-k8s-1"),
717-
Message: unhealthyReasonPathologicalMessageWithHumanMessage("Readiness probe errored: rpc error: code = NotFound desc = container is not created or running: checking if PID of 58577e7deb7b8ae87b8029b9988fa268613748d0743ce989748f27e52b199ef5 is running failed: container process not found", 100),
718-
},
719-
},
720-
},
712+
name: "Readiness probe error (terminated container) on second Prometheus pod",
713+
expectedMessage: "",
714+
pod: "prometheus-k8s-1",
715+
ns: namespace,
716+
humanMessage: "Readiness probe errored: rpc error: code = NotFound desc = container is not created or running: checking if PID of 58577e7deb7b8ae87b8029b9988fa268613748d0743ce989748f27e52b199ef5 is running failed: container process not found",
717+
repetitionCount: 100,
721718
},
722719
{
723-
name: "Readiness probe error (stopping container, different human message) on second Prometheus pod",
724-
intervals: []monitorapi.Interval{
725-
{
726-
Condition: monitorapi.Condition{
727-
Locator: openshiftMonitoringNsLocatorWithPodKey("prometheus-k8s-1"),
728-
Message: unhealthyReasonPathologicalMessageWithHumanMessage("Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1", 100),
729-
},
730-
},
731-
},
720+
name: "Readiness probe error (stopping container, different human message) on second Prometheus pod",
721+
expectedMessage: "",
722+
pod: "prometheus-k8s-1",
723+
ns: namespace,
724+
humanMessage: "Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1",
725+
repetitionCount: 100,
732726
},
733727
{
734-
name: "Readiness probe error (stopping container, different human message) on non-existent Prometheus pod should not be ignored",
735-
intervals: []monitorapi.Interval{
736-
{
737-
Condition: monitorapi.Condition{
738-
Locator: openshiftMonitoringNsLocatorWithPodKey("prometheus-k8s-2"),
739-
Message: unhealthyReasonPathologicalMessageWithHumanMessage("Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1", 100),
740-
},
741-
},
742-
},
728+
name: "Readiness probe error (stopping container) on a Prometheus pod in a different namespace should not be ignored",
729+
expectedMessage: "1 events happened too frequently\n\nevent happened 100 times, something is wrong: namespace/foo pod/prometheus-k8s-1 - reason/Unhealthy Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1 (00:00:00Z) result=reject ",
730+
pod: "prometheus-k8s-1",
731+
ns: "foo",
732+
humanMessage: "Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1",
733+
repetitionCount: 100,
734+
},
735+
{
736+
name: "Readiness probe error (stopping container) on non-existent Prometheus pod should not be ignored",
743737
expectedMessage: "1 events happened too frequently\n\nevent happened 100 times, something is wrong: namespace/openshift-monitoring pod/prometheus-k8s-2 - reason/Unhealthy Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1 (00:00:00Z) result=reject ",
738+
pod: "prometheus-k8s-2",
739+
ns: namespace,
740+
humanMessage: "Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1",
741+
repetitionCount: 100,
744742
},
745743
{
746-
name: "Readiness probe error (stopping container, different human message) on a Prometheus pod should not be ignored above the acceptable limit",
747-
intervals: []monitorapi.Interval{
748-
{
749-
Condition: monitorapi.Condition{
750-
Locator: openshiftMonitoringNsLocatorWithPodKey("prometheus-k8s-1"),
751-
Message: unhealthyReasonPathologicalMessageWithHumanMessage("Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1", 101),
752-
},
753-
},
754-
},
744+
name: "Readiness probe error (stopping container, different human message) on a Prometheus pod should not be ignored above the acceptable limit",
755745
expectedMessage: "1 events happened too frequently\n\nevent happened 101 times, something is wrong: namespace/openshift-monitoring pod/prometheus-k8s-1 - reason/Unhealthy Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1 (00:00:00Z) result=reject ",
746+
pod: "prometheus-k8s-1",
747+
ns: namespace,
748+
humanMessage: "Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1",
749+
repetitionCount: 101,
756750
},
757751
}
758752

759753
for _, test := range tests {
760754
t.Run(test.name, func(t *testing.T) {
761-
events := monitorapi.Intervals(test.intervals)
755+
events := monitorapi.Intervals([]monitorapi.Interval{
756+
{
757+
Condition: monitorapi.Condition{
758+
Locator: nsLocatorWithPodKey(test.pod, test.ns),
759+
Message: unhealthyReasonPathologicalMessageWithHumanMessage(test.humanMessage, test.repetitionCount),
760+
},
761+
},
762+
})
762763
evaluator := duplicateEventsEvaluator{
763-
registry: NewUpgradePathologicalEventMatchers(nil, events),
764+
registry: NewUniversalPathologicalEventMatchers(nil, events),
764765
}
765766

766767
testName := "events should not repeat"
767768
junits := evaluator.testDuplicatedEvents(testName, false, events, nil, false)
768-
jUnitName := getJUnitName(testName, namespace)
769+
jUnitName := getJUnitName(testName, test.ns)
769770
for _, junit := range junits {
770771
if junit.Name == jUnitName {
771772
if test.expectedMessage != "" {

0 commit comments

Comments
 (0)