Skip to content

Commit 0d1ade8

Browse files
committed
fixup! OCPBUGS-62703: Relax duplicate events detection for Prometheus
1 parent b5321f7 commit 0d1ade8

File tree

2 files changed

+17
-11
lines changed

2 files changed

+17
-11
lines changed

pkg/monitor/monitorapi/types.go

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -251,6 +251,8 @@ const (
251251
FailedToAuthenticateWithOpenShiftUser IntervalReason = "FailedToAuthenticateWithOpenShiftUser"
252252
FailedContactingAPIReason IntervalReason = "FailedContactingAPI"
253253

254+
UnhealthyReason IntervalReason = "Unhealthy"
255+
254256
UpgradeStartedReason IntervalReason = "UpgradeStarted"
255257
UpgradeVersionReason IntervalReason = "UpgradeVersion"
256258
UpgradeRollbackReason IntervalReason = "UpgradeRollback"

pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go

Lines changed: 15 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -1176,17 +1176,17 @@ func newCrioReloadedTooOftenEventMatcher(finalInternals monitorapi.Intervals) Ev
11761176
}
11771177

11781178
func newPrometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher(finalIntervals monitorapi.Intervals) EventMatcher {
1179-
statefulSetName := "prometheus-k8s"
1180-
statefulSetNamespace := "openshift-monitoring"
1181-
messageHumanizedSubstring := "Readiness probe errored: rpc error"
1182-
messageReason := "Unhealthy"
1179+
podNamePrefix := "prometheus-k8s"
1180+
podNamespace := "openshift-monitoring"
1181+
messageHumanizedSubstring := "Readiness probe errored"
1182+
messageReason := monitorapi.UnhealthyReason
11831183
matcher := &SimplePathologicalEventMatcher{
11841184
name: "PrometheusReadinessProbeErrorsDuringUpgrades",
11851185
locatorKeyRegexes: map[monitorapi.LocatorKey]*regexp.Regexp{
1186-
monitorapi.LocatorNamespaceKey: regexp.MustCompile(`^` + statefulSetNamespace + `$`),
1187-
monitorapi.LocatorStatefulSetKey: regexp.MustCompile(`^` + statefulSetName + `$`),
1186+
monitorapi.LocatorNamespaceKey: regexp.MustCompile(`^` + podNamespace + `$`),
1187+
monitorapi.LocatorPodKey: regexp.MustCompile(`^` + podNamePrefix + `-[0,1]$`),
11881188
},
1189-
messageReasonRegex: regexp.MustCompile(`^` + messageReason + `$`),
1189+
messageReasonRegex: regexp.MustCompile(`^` + string(messageReason) + `$`),
11901190
messageHumanRegex: regexp.MustCompile(messageHumanizedSubstring),
11911191
jira: "https://issues.redhat.com/browse/OCPBUGS-62703",
11921192
}
@@ -1205,12 +1205,16 @@ func newPrometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher(fin
12051205
05:53:52 (x25) openshift-monitoring kubelet prometheus-k8s-0
12061206
Unhealthy
12071207
Readiness probe errored: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1
1208+
1209+
11:44:16 (x56) openshift-monitoring kubelet prometheus-k8s-0
1210+
Unhealthy
1211+
Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1
12081212
*/
12091213
testIntervals := finalIntervals.Filter(func(eventInterval monitorapi.Interval) bool {
1210-
return eventInterval.Locator.Type == monitorapi.LocatorTypeStatefulSet &&
1211-
eventInterval.Locator.Keys[monitorapi.LocatorNamespaceKey] == statefulSetNamespace &&
1212-
eventInterval.Locator.Keys[monitorapi.LocatorStatefulSetKey] == statefulSetName &&
1213-
eventInterval.Message.Reason == monitorapi.IntervalReason(messageReason) &&
1214+
return eventInterval.Locator.Type == monitorapi.LocatorTypePod &&
1215+
eventInterval.Locator.Keys[monitorapi.LocatorNamespaceKey] == podNamespace &&
1216+
strings.HasPrefix(eventInterval.Locator.Keys[monitorapi.LocatorPodKey], podNamePrefix) &&
1217+
eventInterval.Message.Reason == messageReason &&
12141218
strings.Contains(eventInterval.Message.HumanMessage, messageHumanizedSubstring)
12151219
})
12161220

0 commit comments

Comments
 (0)