Skip to content

Commit 8753df3

Browse files
committed
fixup! fixup! fixup! OCPBUGS-62703: Relax duplicate events detection for Prometheus
1 parent 785a37c commit 8753df3

File tree

1 file changed

+36
-65
lines changed

1 file changed

+36
-65
lines changed

pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go

Lines changed: 36 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -502,8 +502,42 @@ func NewUniversalPathologicalEventMatchers(kubeConfig *rest.Config, finalInterva
502502
twoNodeEtcdEndpointsMatcher := newTwoNodeEtcdEndpointsConfigMissingEventMatcher(finalIntervals)
503503
registry.AddPathologicalEventMatcherOrDie(twoNodeEtcdEndpointsMatcher)
504504

505-
prometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher := newPrometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher(finalIntervals)
506-
registry.AddPathologicalEventMatcherOrDie(prometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher)
505+
registry.AddPathologicalEventMatcherOrDie(&SimplePathologicalEventMatcher{
506+
name: "PrometheusReadinessProbeErrors",
507+
locatorKeyRegexes: map[monitorapi.LocatorKey]*regexp.Regexp{
508+
monitorapi.LocatorNamespaceKey: regexp.MustCompile(`^openshift-monitoring$`),
509+
monitorapi.LocatorPodKey: regexp.MustCompile(`^prometheus-k8s-[0,1]$`),
510+
},
511+
messageReasonRegex: regexp.MustCompile(`^` + string(monitorapi.UnhealthyReason) + `$`),
512+
messageHumanRegex: regexp.MustCompile("Readiness probe errored"),
513+
jira: "https://issues.redhat.com/browse/OCPBUGS-62703",
514+
/*
515+
05:50:32 openshift-monitoring kubelet prometheus-k8s-1
516+
Unhealthy
517+
Readiness probe errored: rpc error: code = NotFound desc = container is not created or running: checking if PID of 58577e7deb7b8ae87b8029b9988fa268613748d0743ce989748f27e52b199ef5 is running failed: container process not found
518+
519+
05:53:52 (x25) openshift-monitoring kubelet prometheus-k8s-0
520+
Unhealthy
521+
Readiness probe errored: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1
522+
523+
11:44:16 (x56) openshift-monitoring kubelet prometheus-k8s-0
524+
Unhealthy
525+
Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1
526+
527+
Readiness probes run during the lifecycle of the container, including termination.
528+
Prometheus pods may take some time to stop, and thus result in more kubelet pings than permitted by default (20).
529+
With a termination grace period of 600s, these pods may lead to probe errors (e.g. the web service is stopped but the process is still running), which is expected during upgrades.
530+
531+
To address this, set the threshold to 100 (approximately 600 (termination period) / 5 (probe interval)), to allow for a high number of readiness probe errors during the upgrade, but not so high that we would miss a real problem.
532+
The job below hit ~60 readiness errors during the upgrade:
533+
https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.20-upgrade-from-stable-4.19-e2e-aws-ovn-upgrade/1977094149035266048, which makes sense to ignore.
534+
However, the job below hit readiness errors 774 times during the upgrade:
535+
https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.19-upgrade-from-stable-4.18-e2e-metal-ovn-single-node-rt-upgrade-test/1975691393640697856, which should be caught.
536+
537+
Also, do note that these events were exhibited outside of upgrades as well, so we need to allow them in general.
538+
*/
539+
repeatThresholdOverride: 100,
540+
})
507541

508542
return registry
509543
}
@@ -1174,66 +1208,3 @@ func newCrioReloadedTooOftenEventMatcher(finalInternals monitorapi.Intervals) Ev
11741208
allowIfWithinIntervals: crioReloadedIntervals,
11751209
}
11761210
}
1177-
1178-
func newPrometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher(finalIntervals monitorapi.Intervals) EventMatcher {
1179-
podNamePrefix := "prometheus-k8s"
1180-
podNamespace := "openshift-monitoring"
1181-
messageHumanizedSubstring := "Readiness probe errored"
1182-
messageReason := monitorapi.UnhealthyReason
1183-
matcher := &SimplePathologicalEventMatcher{
1184-
name: "PrometheusReadinessProbeErrorsDuringUpgrades",
1185-
locatorKeyRegexes: map[monitorapi.LocatorKey]*regexp.Regexp{
1186-
monitorapi.LocatorNamespaceKey: regexp.MustCompile(`^` + podNamespace + `$`),
1187-
monitorapi.LocatorPodKey: regexp.MustCompile(`^` + podNamePrefix + `-[0,1]$`),
1188-
},
1189-
messageReasonRegex: regexp.MustCompile(`^` + string(messageReason) + `$`),
1190-
messageHumanRegex: regexp.MustCompile(messageHumanizedSubstring),
1191-
jira: "https://issues.redhat.com/browse/OCPBUGS-62703",
1192-
}
1193-
1194-
// Sanity check in case no `finalIntervals` are provided.
1195-
if finalIntervals == nil || len(finalIntervals) == 0 {
1196-
matcher.neverAllow = true
1197-
return matcher
1198-
}
1199-
1200-
/*
1201-
05:50:32 openshift-monitoring kubelet prometheus-k8s-1
1202-
Unhealthy
1203-
Readiness probe errored: rpc error: code = NotFound desc = container is not created or running: checking if PID of 58577e7deb7b8ae87b8029b9988fa268613748d0743ce989748f27e52b199ef5 is running failed: container process not found
1204-
1205-
05:53:52 (x25) openshift-monitoring kubelet prometheus-k8s-0
1206-
Unhealthy
1207-
Readiness probe errored: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1
1208-
1209-
11:44:16 (x56) openshift-monitoring kubelet prometheus-k8s-0
1210-
Unhealthy
1211-
Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1
1212-
*/
1213-
testIntervals := finalIntervals.Filter(func(eventInterval monitorapi.Interval) bool {
1214-
return eventInterval.Locator.Type == monitorapi.LocatorTypePod &&
1215-
eventInterval.Locator.Keys[monitorapi.LocatorNamespaceKey] == podNamespace &&
1216-
strings.HasPrefix(eventInterval.Locator.Keys[monitorapi.LocatorPodKey], podNamePrefix) &&
1217-
eventInterval.Message.Reason == messageReason &&
1218-
strings.Contains(eventInterval.Message.HumanMessage, messageHumanizedSubstring)
1219-
})
1220-
1221-
if len(testIntervals) > 0 {
1222-
/*
1223-
Readiness probes run during the lifecycle of the container, including termination.
1224-
Prometheus pods may take some time to stop, and thus result in more kubelet pings than permitted by default (20).
1225-
With a termination grace period of 600s, these pods may lead to probe errors (e.g. the web service is stopped but the process is still running), which is expected during upgrades.
1226-
1227-
To address this, set the threshold to 100 (approximately 600 (termination period) / 5 (probe interval)), to allow for a high number of readiness probe errors during the upgrade, but not so high that we would miss a real problem.
1228-
The job below hit ~60 readiness errors during the upgrade:
1229-
https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.20-upgrade-from-stable-4.19-e2e-aws-ovn-upgrade/1977094149035266048, which makes sense to ignore,
1230-
However, the job below hit readiness errors 774 times during the upgrade:
1231-
https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.19-upgrade-from-stable-4.18-e2e-metal-ovn-single-node-rt-upgrade-test/1975691393640697856, which should be caught.
1232-
*/
1233-
matcher.repeatThresholdOverride = 100
1234-
} else {
1235-
matcher.neverAllow = true
1236-
}
1237-
1238-
return matcher
1239-
}

0 commit comments

Comments
 (0)