Skip to content

Commit 432ba32

Browse files
Merge pull request #506 from Makdaam/SREP-1056
Adding more cases to the ClusterMonitoringEBB alert investigation.
2 parents 2c5f33d + 90bc8cd commit 432ba32

File tree

2 files changed

+102
-10
lines changed

2 files changed

+102
-10
lines changed

pkg/investigations/clustermonitoringerrorbudgetburn/clustermonitoringerrorbudgetburn.go

Lines changed: 68 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,32 @@ import (
1717
"sigs.k8s.io/controller-runtime/pkg/client"
1818
)
1919

20-
var uwmMisconfiguredSL = ocm.ServiceLog{
20+
var uwmConfigMapMisconfiguredSL = ocm.ServiceLog{
2121
Severity: "Major",
2222
Summary: "Action required: review user-workload-monitoring configuration",
2323
ServiceName: "SREManualAction",
2424
Description: "Your cluster's user workload monitoring is misconfigured: please review the user-workload-monitoring-config ConfigMap in the openshift-user-workload-monitoring namespace. For more information, please refer to the product documentation: https://access.redhat.com/documentation/en-us/red_hat_openshift_service_on_aws/4/html/monitoring/configuring-the-monitoring-stack#.",
2525
InternalOnly: false,
2626
}
2727

28+
var uwmAMMisconfiguredSL = ocm.ServiceLog{
29+
Severity: "Major",
30+
Summary: "Action required: review user-workload-monitoring configuration",
31+
ServiceName: "SREManualAction",
32+
Description: "Your cluster's user workload monitoring is misconfigured: please review the Alert Manager configuration in the opennshift-user-workload-monitoring namespace. For more information, please refer to the product documentation: https://access.redhat.com/documentation/en-us/red_hat_openshift_service_on_aws/4/html/monitoring/configuring-the-monitoring-stack#.",
33+
InternalOnly: false,
34+
}
35+
36+
var uwmGenericMisconfiguredSL = ocm.ServiceLog{
37+
Severity: "Major",
38+
Summary: "Action required: review user-workload-monitoring configuration",
39+
ServiceName: "SREManualAction",
40+
Description: "Your cluster's user workload monitoring is misconfigured: please review the cluster operator status and correct the configuration in the opennshift-user-workload-monitoring namespace. For more information, please refer to the product documentation: https://access.redhat.com/documentation/en-us/red_hat_openshift_service_on_aws/4/html/monitoring/configuring-the-monitoring-stack#.",
41+
InternalOnly: false,
42+
}
43+
44+
// available is the cluster operator condition type whose message the
// isUWM* symptom checks inspect.
const available = "Available"
45+
2846
// Investigation is the stateless receiver type for this package's Run method.
type Investigation struct{}
2947

3048
func (c *Investigation) Run(r *investigation.Resources) (result investigation.InvestigationResult, err error) {
@@ -72,7 +90,31 @@ func (c *Investigation) Run(r *investigation.Resources) (result investigation.In
7290
// If it is, send a service log and silence the alert.
7391
if isUWMConfigInvalid(&monitoringCo) {
7492
notes.AppendAutomation("Customer misconfigured the UWM configmap, sending service log and silencing the alert")
75-
err = r.OcmClient.PostServiceLog(r.Cluster.ID(), &uwmMisconfiguredSL)
93+
err = r.OcmClient.PostServiceLog(r.Cluster.ID(), &uwmConfigMapMisconfiguredSL)
94+
if err != nil {
95+
return result, fmt.Errorf("failed posting servicelog: %w", err)
96+
}
97+
// XXX: No metric before
98+
result.ServiceLogSent = investigation.InvestigationStep{Performed: true, Labels: nil}
99+
100+
return result, r.PdClient.SilenceIncidentWithNote(notes.String())
101+
}
102+
103+
if isUWMAlertManagerBroken(&monitoringCo) {
104+
notes.AppendAutomation("Customer misconfigured the UWM (UpdatingUserWorkloadAlertmanager), sending service log and silencing the alert")
105+
err = r.OcmClient.PostServiceLog(r.Cluster.ID(), &uwmAMMisconfiguredSL)
106+
if err != nil {
107+
return result, fmt.Errorf("failed posting servicelog: %w", err)
108+
}
109+
// XXX: No metric before
110+
result.ServiceLogSent = investigation.InvestigationStep{Performed: true, Labels: nil}
111+
112+
return result, r.PdClient.SilenceIncidentWithNote(notes.String())
113+
}
114+
115+
if isUWMPrometheusBroken(&monitoringCo) {
116+
notes.AppendAutomation("Customer misconfigured the UWM (UpdatingUserWorkloadPrometheus), sending service log and silencing the alert")
117+
err = r.OcmClient.PostServiceLog(r.Cluster.ID(), &uwmGenericMisconfiguredSL)
76118
if err != nil {
77119
return result, fmt.Errorf("failed posting servicelog: %w", err)
78120
}
@@ -84,7 +126,7 @@ func (c *Investigation) Run(r *investigation.Resources) (result investigation.In
84126

85127
// The UWM configmap is valid, an SRE will need to manually investigate this alert.
86128
// Escalate the alert with our findings.
87-
notes.AppendSuccess("Monitoring CO not degraded due to a broken UWM configmap")
129+
notes.AppendSuccess("Monitoring CO not degraded due to UWM misconfiguration")
88130
return result, r.PdClient.EscalateIncidentWithNote(notes.String())
89131
}
90132

@@ -109,7 +151,29 @@ func isUWMConfigInvalid(monitoringCo *configv1.ClusterOperator) bool {
109151
symptomStatusString := `the User Workload Configuration from "config.yaml" key in the "openshift-user-workload-monitoring/user-workload-monitoring-config" ConfigMap could not be parsed`
110152

111153
for _, condition := range monitoringCo.Status.Conditions {
112-
if condition.Type == "Available" {
154+
if condition.Type == available {
155+
return strings.Contains(condition.Message, symptomStatusString)
156+
}
157+
}
158+
return false
159+
}
160+
161+
func isUWMAlertManagerBroken(monitoringCo *configv1.ClusterOperator) bool {
162+
symptomStatusString := `UpdatingUserWorkloadAlertmanager: waiting for Alertmanager User Workload object changes failed: waiting for Alertmanager openshift-user-workload-monitoring/user-workload`
163+
164+
for _, condition := range monitoringCo.Status.Conditions {
165+
if condition.Type == available {
166+
return strings.Contains(condition.Message, symptomStatusString)
167+
}
168+
}
169+
return false
170+
}
171+
172+
func isUWMPrometheusBroken(monitoringCo *configv1.ClusterOperator) bool {
173+
symptomStatusString := `UpdatingUserWorkloadPrometheus: Prometheus "openshift-user-workload-monitoring/user-workload": NoPodReady`
174+
175+
for _, condition := range monitoringCo.Status.Conditions {
176+
if condition.Type == available {
113177
return strings.Contains(condition.Message, symptomStatusString)
114178
}
115179
}

pkg/investigations/clustermonitoringerrorbudgetburn/clustermonitoringerrorbudgetburn_test.go

Lines changed: 34 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,31 +8,59 @@ import (
88
)
99

1010
var (
11-
statusConditionAvailable = configv1.ClusterOperatorStatusCondition{Type: "Available", Status: "True"}
12-
statusConditionUpgradeable = configv1.ClusterOperatorStatusCondition{Type: "Upgradeable", Status: "True"}
13-
statusConditionUnavailableSymptomsMatch = configv1.ClusterOperatorStatusCondition{Type: "Available", Status: "False", Message: `the User Workload Configuration from "config.yaml" key in the "openshift-user-workload-monitoring/user-workload-monitoring-config" ConfigMap could not be parsed`}
11+
statusConditionAvailable = configv1.ClusterOperatorStatusCondition{Type: "Available", Status: "True"}
12+
statusConditionUpgradeable = configv1.ClusterOperatorStatusCondition{Type: "Upgradeable", Status: "True"}
13+
statusConditionUnavailableConfigMapSymptomsMatch = configv1.ClusterOperatorStatusCondition{Type: "Available", Status: "False", Message: `the User Workload Configuration from "config.yaml" key in the "openshift-user-workload-monitoring/user-workload-monitoring-config" ConfigMap could not be parsed`}
14+
statusConditionUnavailableAMSymptomsMatch = configv1.ClusterOperatorStatusCondition{Type: "Available", Status: "False", Message: `UpdatingUserWorkloadAlertmanager: waiting for Alertmanager User Workload object changes failed: waiting for Alertmanager openshift-user-workload-monitoring/user-workload: context deadline exceeded: condition Reconciled: status False: reason ReconciliationFailed: provision alertmanager configuration: failed to initialize from secret: address ${SMTP_HOST:-smtp.gmail.com:587}: too many colons in address`}
15+
statusConditionUnavailablePMSymptomsMatch = configv1.ClusterOperatorStatusCondition{Type: "Available", Status: "False", Message: `UpdatingUserWorkloadPrometheus: Prometheus "openshift-user-workload-monitoring/user-workload": NoPodReady: shard 0: pod prometheus-user-workload-0: containers with unready status: [prometheus] shard 0: pod prometheus-user-workload-1: containers with unready status: [prometheus]`}
1416
)
1517

16-
func TestSymptomMatches(t *testing.T) {
18+
func TestSymptomMatchesConfigMap(t *testing.T) {
1719
monitoringCo := configv1.ClusterOperator{
1820
ObjectMeta: v1.ObjectMeta{Name: "monitoring"},
1921
Status: configv1.ClusterOperatorStatus{
20-
Conditions: []configv1.ClusterOperatorStatusCondition{statusConditionUnavailableSymptomsMatch, statusConditionUpgradeable},
22+
Conditions: []configv1.ClusterOperatorStatusCondition{statusConditionUnavailableConfigMapSymptomsMatch, statusConditionUpgradeable},
2123
},
2224
}
2325
if !isUWMConfigInvalid(&monitoringCo) {
2426
t.Fatal("expected symptoms to match")
2527
}
2628
}
2729

30+
func TestSymptomMatchesAM(t *testing.T) {
31+
monitoringCo := configv1.ClusterOperator{
32+
ObjectMeta: v1.ObjectMeta{Name: "monitoring"},
33+
Status: configv1.ClusterOperatorStatus{
34+
Conditions: []configv1.ClusterOperatorStatusCondition{statusConditionUnavailableAMSymptomsMatch, statusConditionUpgradeable},
35+
},
36+
}
37+
if !isUWMAlertManagerBroken(&monitoringCo) {
38+
t.Fatal("expected symptoms to match")
39+
}
40+
}
41+
42+
func TestSymptomMatchesPrometheus(t *testing.T) {
43+
monitoringCo := configv1.ClusterOperator{
44+
ObjectMeta: v1.ObjectMeta{Name: "monitoring"},
45+
Status: configv1.ClusterOperatorStatus{
46+
Conditions: []configv1.ClusterOperatorStatusCondition{statusConditionUnavailablePMSymptomsMatch, statusConditionUpgradeable},
47+
},
48+
}
49+
if !isUWMPrometheusBroken(&monitoringCo) {
50+
t.Fatal("expected symptoms to match")
51+
}
52+
}
53+
2854
func TestSymptomNoMatch(t *testing.T) {
2955
monitoringCo := configv1.ClusterOperator{
3056
ObjectMeta: v1.ObjectMeta{Name: "monitoring"},
3157
Status: configv1.ClusterOperatorStatus{
3258
Conditions: []configv1.ClusterOperatorStatusCondition{statusConditionAvailable, statusConditionUpgradeable},
3359
},
3460
}
35-
if isUWMConfigInvalid(&monitoringCo) {
61+
if isUWMConfigInvalid(&monitoringCo) ||
62+
isUWMAlertManagerBroken(&monitoringCo) ||
63+
isUWMPrometheusBroken(&monitoringCo) {
3664
t.Fatal("expected symptoms to not match")
3765
}
3866
}

0 commit comments

Comments
 (0)