
Commit 94655c9

Merge pull request #30372 from rexagod/OCPBUGS-62703
OCPBUGS-62703: Relax duplicate events detection for Prometheus
2 parents: 606e09f + 85008d1

3 files changed (+154, -0 lines)

pkg/monitor/monitorapi/types.go

Lines changed: 2 additions & 0 deletions
@@ -251,6 +251,8 @@ const (
 	FailedToAuthenticateWithOpenShiftUser IntervalReason = "FailedToAuthenticateWithOpenShiftUser"
 	FailedContactingAPIReason             IntervalReason = "FailedContactingAPI"
 
+	UnhealthyReason IntervalReason = "Unhealthy"
+
 	UpgradeStartedReason  IntervalReason = "UpgradeStarted"
 	UpgradeVersionReason  IntervalReason = "UpgradeVersion"
 	UpgradeRollbackReason IntervalReason = "UpgradeRollback"

pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go

Lines changed: 36 additions & 0 deletions
@@ -564,6 +564,42 @@ func NewUpgradePathologicalEventMatchers(kubeConfig *rest.Config, finalIntervals
 	m := newFailedSchedulingDuringNodeUpdatePathologicalEventMatcher(finalIntervals)
 	registry.AddPathologicalEventMatcherOrDie(m)
 
+	// Prometheus pods may have readiness probe errors during upgrades.
+	registry.AddPathologicalEventMatcherOrDie(&SimplePathologicalEventMatcher{
+		name: "PrometheusReadinessProbeErrors",
+		locatorKeyRegexes: map[monitorapi.LocatorKey]*regexp.Regexp{
+			monitorapi.LocatorNamespaceKey: regexp.MustCompile(`^openshift-monitoring$`),
+			monitorapi.LocatorPodKey:       regexp.MustCompile(`^prometheus-k8s-[0,1]$`),
+		},
+		messageReasonRegex: regexp.MustCompile(`^` + string(monitorapi.UnhealthyReason) + `$`),
+		messageHumanRegex:  regexp.MustCompile("Readiness probe errored"),
+		jira:               "https://issues.redhat.com/browse/OCPBUGS-62703",
+		/*
+			05:50:32 openshift-monitoring kubelet prometheus-k8s-1
+			Unhealthy
+			Readiness probe errored: rpc error: code = NotFound desc = container is not created or running: checking if PID of 58577e7deb7b8ae87b8029b9988fa268613748d0743ce989748f27e52b199ef5 is running failed: container process not found
+
+			05:53:52 (x25) openshift-monitoring kubelet prometheus-k8s-0
+			Unhealthy
+			Readiness probe errored: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1
+
+			11:44:16 (x56) openshift-monitoring kubelet prometheus-k8s-0
+			Unhealthy
+			Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1
+
+			Readiness probes run during the lifecycle of the container, including termination.
+			Prometheus pods may take some time to stop, and thus result in more kubelet pings than permitted by default (20).
+			With a termination grace period of 600s, these pods may lead to probe errors (e.g. the web service is stopped but the process is still running), which is expected during upgrades.
+
+			To address this, set the threshold to 100 (approximately 600 (termination period) / 5 (probe interval)), to allow for a high number of readiness probe errors during the upgrade, but not so high that we would miss a real problem.
+			The job below hit ~60 readiness errors during the upgrade:
+			https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.20-upgrade-from-stable-4.19-e2e-aws-ovn-upgrade/1977094149035266048, which makes sense to ignore,
+			However, the job below hit readiness errors 774 times during the upgrade:
+			https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.19-upgrade-from-stable-4.18-e2e-metal-ovn-single-node-rt-upgrade-test/1975691393640697856, which should be caught.
+		*/
+		repeatThresholdOverride: 100,
+	})
+
 	return registry
 }
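
The threshold arithmetic quoted in the code comment above can be sanity-checked in isolation. Below is a minimal, self-contained sketch that uses only the figures from that comment (600s termination grace period, 5s probe interval, default limit of 20, override of 100); the constant names are illustrative and not identifiers from this repository.

package main

import "fmt"

// Back-of-the-envelope check for repeatThresholdOverride: with a 600s
// termination grace period and a readiness probe firing every 5s (per the
// comment above), up to ~120 probes can run while a Prometheus pod drains.
// That is far above the default pathological-event limit of 20 and slightly
// above the chosen override of 100.
func main() {
	const (
		terminationGracePeriodSeconds = 600 // termination grace period quoted above
		probePeriodSeconds            = 5   // probe interval quoted above
		defaultRepeatThreshold        = 20  // default limit mentioned above
		repeatThresholdOverride       = 100 // value set on the matcher above
	)
	maxProbesDuringShutdown := terminationGracePeriodSeconds / probePeriodSeconds // 120
	fmt.Printf("probes possible during shutdown: ~%d (default limit %d, override %d)\n",
		maxProbesDuringShutdown, defaultRepeatThreshold, repeatThresholdOverride)
}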

pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_events_test.go

Lines changed: 116 additions & 0 deletions
@@ -2,6 +2,7 @@ package pathologicaleventlibrary
 
 import (
 	_ "embed"
+	"fmt"
 	"testing"
 	"time"
 
@@ -666,3 +667,118 @@ func TestPathologicalEventsTopologyAwareHintsDisabled(t *testing.T) {
 		})
 	}
 }
+
+func TestPathologicalEventsPrometheusReadinessProbeErrors(t *testing.T) {
+	const namespace = "openshift-monitoring"
+
+	unhealthyReasonPathologicalMessageWithHumanMessage := func(humanMessage string, repetitionCount int) monitorapi.Message {
+		return monitorapi.Message{
+			Reason:       monitorapi.UnhealthyReason,
+			HumanMessage: humanMessage,
+			Annotations: map[monitorapi.AnnotationKey]string{
+				monitorapi.AnnotationCount:        fmt.Sprintf("%d", repetitionCount),
+				monitorapi.AnnotationPathological: "true",
+			},
+		}
+	}
+
+	nsLocatorWithPodKey := func(pod, ns string) monitorapi.Locator {
+		return monitorapi.Locator{
+			Type: monitorapi.LocatorTypePod,
+			Keys: map[monitorapi.LocatorKey]string{
+				monitorapi.LocatorNamespaceKey: ns,
+				monitorapi.LocatorPodKey:       pod,
+			},
+		}
+	}
+
+	tests := []struct {
+		name            string
+		expectedMessage string
+		pod             string
+		ns              string
+		humanMessage    string
+		repetitionCount int
+	}{
+		{
+			name:            "Readiness probe error (stopping container) on first Prometheus pod",
+			expectedMessage: "",
+			pod:             "prometheus-k8s-0",
+			ns:              namespace,
+			humanMessage:    "Readiness probe errored: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1",
+			repetitionCount: 100,
+		},
+		{
+			name:            "Readiness probe error (terminated container) on second Prometheus pod",
+			expectedMessage: "",
+			pod:             "prometheus-k8s-1",
+			ns:              namespace,
+			humanMessage:    "Readiness probe errored: rpc error: code = NotFound desc = container is not created or running: checking if PID of 58577e7deb7b8ae87b8029b9988fa268613748d0743ce989748f27e52b199ef5 is running failed: container process not found",
+			repetitionCount: 100,
+		},
+		{
+			name:            "Readiness probe error (stopping container, different human message) on second Prometheus pod",
+			expectedMessage: "",
+			pod:             "prometheus-k8s-1",
+			ns:              namespace,
+			humanMessage:    "Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1",
+			repetitionCount: 100,
+		},
+		{
+			name:            "Readiness probe error (stopping container) on a Prometheus pod in a different namespace should not be ignored",
+			expectedMessage: "1 events happened too frequently\n\nevent happened 100 times, something is wrong: namespace/foo pod/prometheus-k8s-1 - reason/Unhealthy Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1 (00:00:00Z) result=reject ",
+			pod:             "prometheus-k8s-1",
+			ns:              "foo",
+			humanMessage:    "Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1",
+			repetitionCount: 100,
+		},
+		{
+			name:            "Readiness probe error (stopping container) on non-existent Prometheus pod should not be ignored",
+			expectedMessage: "1 events happened too frequently\n\nevent happened 100 times, something is wrong: namespace/openshift-monitoring pod/prometheus-k8s-2 - reason/Unhealthy Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1 (00:00:00Z) result=reject ",
+			pod:             "prometheus-k8s-2",
+			ns:              namespace,
+			humanMessage:    "Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1",
+			repetitionCount: 100,
+		},
+		{
+			name:            "Readiness probe error (stopping container, different human message) on a Prometheus pod should not be ignored above the acceptable limit",
+			expectedMessage: "1 events happened too frequently\n\nevent happened 101 times, something is wrong: namespace/openshift-monitoring pod/prometheus-k8s-1 - reason/Unhealthy Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1 (00:00:00Z) result=reject ",
+			pod:             "prometheus-k8s-1",
+			ns:              namespace,
+			humanMessage:    "Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1",
+			repetitionCount: 101,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			events := monitorapi.Intervals([]monitorapi.Interval{
+				{
+					Condition: monitorapi.Condition{
+						Locator: nsLocatorWithPodKey(test.pod, test.ns),
+						Message: unhealthyReasonPathologicalMessageWithHumanMessage(test.humanMessage, test.repetitionCount),
+					},
+				},
+			})
+			evaluator := duplicateEventsEvaluator{
+				registry: NewUpgradePathologicalEventMatchers(nil, events),
+			}
+
+			testName := "events should not repeat"
+			junits := evaluator.testDuplicatedEvents(testName, false, events, nil, false)
+			jUnitName := getJUnitName(testName, test.ns)
+			for _, junit := range junits {
+				if junit.Name == jUnitName {
+					if test.expectedMessage != "" {
+						require.NotNil(t, junit.FailureOutput, "expected junit to have failure output")
+						require.Equal(t, test.expectedMessage, junit.FailureOutput.Output)
+					} else {
+						require.Nil(t, junit.FailureOutput, "expected success but got failure output for junit: %s", junit.Name)
+					}
+
+					break
+				}
+			}
+		})
+	}
+}
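
For reference, the new test added in this file can be exercised on its own from the repository root with a standard go test invocation (the -run pattern is simply the test function name above):

go test ./pkg/monitortestlibrary/pathologicaleventlibrary/ -run TestPathologicalEventsPrometheusReadinessProbeErrors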
