
Commit b5321f7
OCPBUGS-62703: Relax duplicate events detection for Prometheus
Overrides the duplicate readiness-error event limit for Prometheus during upgrades. Prometheus needs some time to wind down (see [1]), which caused the kubelet to emit readiness error events for the span it took to terminate. This change ignores those events up to a limit of 100.

[1]: https://github.com/prometheus-operator/prometheus-operator/blob/d0ae00fdedc656a5a1a290d9839b84d860f15428/pkg/prometheus/common.go#L56-L59

Signed-off-by: Pranshu Srivastava <[email protected]>
1 parent d3b6fa6 commit b5321f7
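
As context for the diff below: the fix works through a per-matcher repeat-threshold override rather than a change to the global duplicate-event limit. Here is a minimal, self-contained sketch of that mechanism, assuming a default threshold of 20; the constant, struct, and method names are illustrative, not origin's actual API.

package main

import "fmt"

// defaultRepeatThreshold is an assumed default duplicate-event limit;
// origin's real value may differ.
const defaultRepeatThreshold = 20

// matcher is a stand-in for a pathological-event matcher that may carry
// a per-matcher override, like repeatThresholdOverride in the diff below.
type matcher struct {
	repeatThresholdOverride int
}

// allowed reports whether an event repeated count times stays under the
// effective threshold for this matcher.
func (m matcher) allowed(count int) bool {
	threshold := defaultRepeatThreshold
	if m.repeatThresholdOverride > 0 {
		threshold = m.repeatThresholdOverride
	}
	return count <= threshold
}

func main() {
	prom := matcher{repeatThresholdOverride: 100}
	fmt.Println(prom.allowed(60))  // true: a typical upgrade's ~60 readiness errors pass
	fmt.Println(prom.allowed(774)) // false: the 774-repeat single-node case would still flag
}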

File tree

1 file changed: +58 −0 lines

pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go

@@ -502,6 +502,9 @@ func NewUniversalPathologicalEventMatchers(kubeConfig *rest.Config, finalInterva
 	twoNodeEtcdEndpointsMatcher := newTwoNodeEtcdEndpointsConfigMissingEventMatcher(finalIntervals)
 	registry.AddPathologicalEventMatcherOrDie(twoNodeEtcdEndpointsMatcher)
 
+	prometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher := newPrometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher(finalIntervals)
+	registry.AddPathologicalEventMatcherOrDie(prometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher)
+
 	return registry
 }
 
@@ -1171,3 +1174,58 @@ func newCrioReloadedTooOftenEventMatcher(finalInternals monitorapi.Intervals) Ev
 		allowIfWithinIntervals: crioReloadedIntervals,
 	}
 }
+
+func newPrometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher(finalIntervals monitorapi.Intervals) EventMatcher {
+	statefulSetName := "prometheus-k8s"
+	statefulSetNamespace := "openshift-monitoring"
+	messageHumanizedSubstring := "Readiness probe errored: rpc error"
+	messageReason := "Unhealthy"
+	matcher := &SimplePathologicalEventMatcher{
+		name: "PrometheusReadinessProbeErrorsDuringUpgrades",
+		locatorKeyRegexes: map[monitorapi.LocatorKey]*regexp.Regexp{
+			monitorapi.LocatorNamespaceKey:   regexp.MustCompile(`^` + statefulSetNamespace + `$`),
+			monitorapi.LocatorStatefulSetKey: regexp.MustCompile(`^` + statefulSetName + `$`),
+		},
+		messageReasonRegex: regexp.MustCompile(`^` + messageReason + `$`),
+		messageHumanRegex:  regexp.MustCompile(messageHumanizedSubstring),
+		jira:               "https://issues.redhat.com/browse/OCPBUGS-62703",
+	}
+
+	// Sanity check in case no finalIntervals are provided.
+	if len(finalIntervals) == 0 {
+		matcher.neverAllow = true
+		return matcher
+	}
+
+	/*
+		Sample events this matcher targets:
+
+		05:50:32 openshift-monitoring kubelet prometheus-k8s-1
+		Unhealthy
+		Readiness probe errored: rpc error: code = NotFound desc = container is not created or running: checking if PID of 58577e7deb7b8ae87b8029b9988fa268613748d0743ce989748f27e52b199ef5 is running failed: container process not found
+
+		05:53:52 (x25) openshift-monitoring kubelet prometheus-k8s-0
+		Unhealthy
+		Readiness probe errored: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1
+	*/
+	testIntervals := finalIntervals.Filter(func(eventInterval monitorapi.Interval) bool {
+		return eventInterval.Locator.Type == monitorapi.LocatorTypeStatefulSet &&
+			eventInterval.Locator.Keys[monitorapi.LocatorNamespaceKey] == statefulSetNamespace &&
+			eventInterval.Locator.Keys[monitorapi.LocatorStatefulSetKey] == statefulSetName &&
+			eventInterval.Message.Reason == monitorapi.IntervalReason(messageReason) &&
+			strings.Contains(eventInterval.Message.HumanMessage, messageHumanizedSubstring)
+	})
+
+	if len(testIntervals) > 0 {
+		// Readiness probe errors are expected during upgrades, so allow a higher
+		// threshold. 100 tolerates a noisy upgrade without masking a real problem:
+		// a typical upgrade job hit ~60 readiness errors,
+		// https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.20-upgrade-from-stable-4.19-e2e-aws-ovn-upgrade/1977094149035266048,
+		// whereas a pathological single-node job hit them 774 times,
+		// https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.19-upgrade-from-stable-4.18-e2e-metal-ovn-single-node-rt-upgrade-test/1975691393640697856.
+		matcher.repeatThresholdOverride = 100
+	} else {
+		matcher.neverAllow = true
+	}
+
+	return matcher
+}
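
The deduplicated events in the comment above carry their repeat count as an "(x25)"-style marker. A small standalone sketch, using a hypothetical helper that is not part of the repo, of pulling that count out of an event line so it can be compared against the 100-repeat override:

package main

import (
	"fmt"
	"regexp"
	"strconv"
)

// repeatRE matches the "(xN)" repeat marker seen in deduplicated event
// lines such as "05:53:52 (x25) openshift-monitoring kubelet prometheus-k8s-0".
var repeatRE = regexp.MustCompile(`\(x(\d+)\)`)

// repeatCount is a hypothetical helper: it returns the repeat count encoded
// in an event line, or 1 when no "(xN)" marker is present.
func repeatCount(line string) int {
	m := repeatRE.FindStringSubmatch(line)
	if m == nil {
		return 1
	}
	n, err := strconv.Atoi(m[1])
	if err != nil {
		return 1
	}
	return n
}

func main() {
	fmt.Println(repeatCount("05:53:52 (x25) openshift-monitoring kubelet prometheus-k8s-0")) // 25
	fmt.Println(repeatCount("05:50:32 openshift-monitoring kubelet prometheus-k8s-1"))       // 1
}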

0 commit comments