Skip to content

Commit 85008d1

Browse files
committed
fixup! fixup! fixup! fixup! fixup! OCPBUGS-62703: Relax duplicate events detection for Prometheus
1 parent ed24ec4 commit 85008d1

File tree

2 files changed

+37
-38
lines changed

2 files changed

+37
-38
lines changed

pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go

Lines changed: 36 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -502,43 +502,6 @@ func NewUniversalPathologicalEventMatchers(kubeConfig *rest.Config, finalInterva
502502
twoNodeEtcdEndpointsMatcher := newTwoNodeEtcdEndpointsConfigMissingEventMatcher(finalIntervals)
503503
registry.AddPathologicalEventMatcherOrDie(twoNodeEtcdEndpointsMatcher)
504504

505-
registry.AddPathologicalEventMatcherOrDie(&SimplePathologicalEventMatcher{
506-
name: "PrometheusReadinessProbeErrors",
507-
locatorKeyRegexes: map[monitorapi.LocatorKey]*regexp.Regexp{
508-
monitorapi.LocatorNamespaceKey: regexp.MustCompile(`^openshift-monitoring$`),
509-
monitorapi.LocatorPodKey: regexp.MustCompile(`^prometheus-k8s-[0,1]$`),
510-
},
511-
messageReasonRegex: regexp.MustCompile(`^` + string(monitorapi.UnhealthyReason) + `$`),
512-
messageHumanRegex: regexp.MustCompile("Readiness probe errored"),
513-
jira: "https://issues.redhat.com/browse/OCPBUGS-62703",
514-
/*
515-
05:50:32 openshift-monitoring kubelet prometheus-k8s-1
516-
Unhealthy
517-
Readiness probe errored: rpc error: code = NotFound desc = container is not created or running: checking if PID of 58577e7deb7b8ae87b8029b9988fa268613748d0743ce989748f27e52b199ef5 is running failed: container process not found
518-
519-
05:53:52 (x25) openshift-monitoring kubelet prometheus-k8s-0
520-
Unhealthy
521-
Readiness probe errored: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1
522-
523-
11:44:16 (x56) openshift-monitoring kubelet prometheus-k8s-0
524-
Unhealthy
525-
Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1
526-
527-
Readiness probes run during the lifecycle of the container, including termination.
528-
Prometheus pods may take some time to stop, and thus result in more kubelet pings than permitted by default (20).
529-
With a termination grace period of 600s, these pods may lead to probe errors (e.g. the web service is stopped but the process is still running), which is expected during upgrades.
530-
531-
To address this, set the threshold to 100 (approximately 600 (termination period) / 5 (probe interval)), to allow for a high number of readiness probe errors during the upgrade, but not so high that we would miss a real problem.
532-
The job below hit ~60 readiness errors during the upgrade:
533-
https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.20-upgrade-from-stable-4.19-e2e-aws-ovn-upgrade/1977094149035266048, which makes sense to ignore.
534-
However, the job below hit readiness errors 774 times during the upgrade:
535-
https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.19-upgrade-from-stable-4.18-e2e-metal-ovn-single-node-rt-upgrade-test/1975691393640697856, which should be caught.
536-
537-
Also, do note that these events were exhibited outside of upgrades as well, so we need to allow them in general.
538-
*/
539-
repeatThresholdOverride: 100,
540-
})
541-
542505
return registry
543506
}
544507

@@ -601,6 +564,42 @@ func NewUpgradePathologicalEventMatchers(kubeConfig *rest.Config, finalIntervals
601564
m := newFailedSchedulingDuringNodeUpdatePathologicalEventMatcher(finalIntervals)
602565
registry.AddPathologicalEventMatcherOrDie(m)
603566

567+
// Prometheus pods may have readiness probe errors during upgrades.
568+
registry.AddPathologicalEventMatcherOrDie(&SimplePathologicalEventMatcher{
569+
name: "PrometheusReadinessProbeErrors",
570+
locatorKeyRegexes: map[monitorapi.LocatorKey]*regexp.Regexp{
571+
monitorapi.LocatorNamespaceKey: regexp.MustCompile(`^openshift-monitoring$`),
572+
monitorapi.LocatorPodKey: regexp.MustCompile(`^prometheus-k8s-[0,1]$`),
573+
},
574+
messageReasonRegex: regexp.MustCompile(`^` + string(monitorapi.UnhealthyReason) + `$`),
575+
messageHumanRegex: regexp.MustCompile("Readiness probe errored"),
576+
jira: "https://issues.redhat.com/browse/OCPBUGS-62703",
577+
/*
578+
05:50:32 openshift-monitoring kubelet prometheus-k8s-1
579+
Unhealthy
580+
Readiness probe errored: rpc error: code = NotFound desc = container is not created or running: checking if PID of 58577e7deb7b8ae87b8029b9988fa268613748d0743ce989748f27e52b199ef5 is running failed: container process not found
581+
582+
05:53:52 (x25) openshift-monitoring kubelet prometheus-k8s-0
583+
Unhealthy
584+
Readiness probe errored: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1
585+
586+
11:44:16 (x56) openshift-monitoring kubelet prometheus-k8s-0
587+
Unhealthy
588+
Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1
589+
590+
Readiness probes run during the lifecycle of the container, including termination.
591+
Prometheus pods may take some time to stop, and thus result in more kubelet pings than permitted by default (20).
592+
With a termination grace period of 600s, these pods may lead to probe errors (e.g. the web service is stopped but the process is still running), which is expected during upgrades.
593+
594+
To address this, set the threshold to 100 (approximately 600 (termination period) / 5 (probe interval)), to allow for a high number of readiness probe errors during the upgrade, but not so high that we would miss a real problem.
595+
The job below hit ~60 readiness errors during the upgrade:
596+
https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.20-upgrade-from-stable-4.19-e2e-aws-ovn-upgrade/1977094149035266048, which makes sense to ignore.
597+
However, the job below hit readiness errors 774 times during the upgrade:
598+
https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.19-upgrade-from-stable-4.18-e2e-metal-ovn-single-node-rt-upgrade-test/1975691393640697856, which should be caught.
599+
*/
600+
repeatThresholdOverride: 100,
601+
})
602+
604603
return registry
605604
}
606605

pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_events_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -761,7 +761,7 @@ func TestPathologicalEventsPrometheusReadinessProbeErrors(t *testing.T) {
761761
},
762762
})
763763
evaluator := duplicateEventsEvaluator{
764-
registry: NewUniversalPathologicalEventMatchers(nil, events),
764+
registry: NewUpgradePathologicalEventMatchers(nil, events),
765765
}
766766

767767
testName := "events should not repeat"

0 commit comments

Comments
 (0)