@@ -502,6 +502,9 @@ func NewUniversalPathologicalEventMatchers(kubeConfig *rest.Config, finalInterva
502502 twoNodeEtcdEndpointsMatcher := newTwoNodeEtcdEndpointsConfigMissingEventMatcher (finalIntervals )
503503 registry .AddPathologicalEventMatcherOrDie (twoNodeEtcdEndpointsMatcher )
504504
505+ prometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher := newPrometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher (finalIntervals )
506+ registry .AddPathologicalEventMatcherOrDie (prometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher )
507+
505508 return registry
506509}
507510
@@ -1171,3 +1174,58 @@ func newCrioReloadedTooOftenEventMatcher(finalInternals monitorapi.Intervals) Ev
11711174 allowIfWithinIntervals : crioReloadedIntervals ,
11721175 }
11731176}
1177+
1178+ func newPrometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher (finalIntervals monitorapi.Intervals ) EventMatcher {
1179+ statefulSetName := "prometheus-k8s"
1180+ statefulSetNamespace := "openshift-monitoring"
1181+ messageHumanizedSubstring := "Readiness probe errored: rpc error"
1182+ messageReason := "Unhealthy"
1183+ matcher := & SimplePathologicalEventMatcher {
1184+ name : "PrometheusReadinessProbeErrorsDuringUpgrades" ,
1185+ locatorKeyRegexes : map [monitorapi.LocatorKey ]* regexp.Regexp {
1186+ monitorapi .LocatorNamespaceKey : regexp .MustCompile (`^` + statefulSetNamespace + `$` ),
1187+ monitorapi .LocatorStatefulSetKey : regexp .MustCompile (`^` + statefulSetName + `$` ),
1188+ },
1189+ messageReasonRegex : regexp .MustCompile (`^` + messageReason + `$` ),
1190+ messageHumanRegex : regexp .MustCompile (messageHumanizedSubstring ),
1191+ jira : "https://issues.redhat.com/browse/OCPBUGS-62703" ,
1192+ }
1193+
1194+ // Sanity check in case no `finalIntervals` are provided.
1195+ if finalIntervals == nil || len (finalIntervals ) == 0 {
1196+ matcher .neverAllow = true
1197+ return matcher
1198+ }
1199+
1200+ /*
1201+ 05:50:32 openshift-monitoring kubelet prometheus-k8s-1
1202+ Unhealthy
1203+ Readiness probe errored: rpc error: code = NotFound desc = container is not created or running: checking if PID of 58577e7deb7b8ae87b8029b9988fa268613748d0743ce989748f27e52b199ef5 is running failed: container process not found
1204+
1205+ 05:53:52 (x25) openshift-monitoring kubelet prometheus-k8s-0
1206+ Unhealthy
1207+ Readiness probe errored: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1
1208+ */
1209+ testIntervals := finalIntervals .Filter (func (eventInterval monitorapi.Interval ) bool {
1210+ return eventInterval .Locator .Type == monitorapi .LocatorTypeStatefulSet &&
1211+ eventInterval .Locator .Keys [monitorapi .LocatorNamespaceKey ] == statefulSetNamespace &&
1212+ eventInterval .Locator .Keys [monitorapi .LocatorStatefulSetKey ] == statefulSetName &&
1213+ eventInterval .Message .Reason == monitorapi .IntervalReason (messageReason ) &&
1214+ strings .Contains (eventInterval .Message .HumanMessage , messageHumanizedSubstring )
1215+ })
1216+
1217+ if len (testIntervals ) > 0 {
1218+ // Readiness probe errors are expected during upgrades, allow a higher threshold.
1219+ // Set the threshold to 100 to allow for a high number of readiness probe errors
1220+ // during the upgrade, but not so high that we would miss a real problem, i.e.,
1221+ // the job below (and usually) hit ~60 readiness errors during the upgrade,
1222+ // https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.20-upgrade-from-stable-4.19-e2e-aws-ovn-upgrade/1977094149035266048,
1223+ // However, the job below hit readiness errors 774 times during the upgrade,
1224+ // https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.19-upgrade-from-stable-4.18-e2e-metal-ovn-single-node-rt-upgrade-test/1975691393640697856.
1225+ matcher .repeatThresholdOverride = 100
1226+ } else {
1227+ matcher .neverAllow = true
1228+ }
1229+
1230+ return matcher
1231+ }
0 commit comments