Skip to content

Commit 0aca6cd

Browse files
committed
Feat: remove escalation policy and replace it with escalation to the next level
1 parent 2b844d8 commit 0aca6cd

File tree

14 files changed

+143
-411
lines changed

14 files changed

+143
-411
lines changed

README.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,6 @@ Grafana dashboard configmaps are stored in the [Dashboards](./dashboards/) direc
137137
* `AWS_SECRET_ACCESS_KEY`: refers to the secret access key of the base AWS account used by CAD
138138
* `CAD_AWS_CSS_JUMPROLE`: refers to the ARN of the RH-SRE-CCS-Access jumprole
139139
* `CAD_AWS_SUPPORT_JUMPROLE`: refers to the ARN of the RH-Technical-Support-Access jumprole
140-
* `CAD_ESCALATION_POLICY`: refers to the escalation policy CAD should use to escalate the incident to
141140
* `CAD_PD_EMAIL`: refers to the email for a login via mail/pw credentials
142141
* `CAD_PD_PW`: refers to the password for a login via mail/pw credentials
143142
* `CAD_PD_TOKEN`: refers to the generated private access token for token-based authentication

cadctl/cmd/investigate/investigate.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ func run(_ *cobra.Command, _ []string) error {
7575

7676
// Escalate all unsupported alerts
7777
if alertInvestigation == nil {
78-
err = pdClient.EscalateAlert()
78+
err = pdClient.EscalateIncident()
7979
if err != nil {
8080
return fmt.Errorf("could not escalate unsupported alert: %w", err)
8181
}
@@ -152,12 +152,12 @@ func GetOCMClient() (*ocm.SdkClient, error) {
152152
func clusterRequiresInvestigation(cluster *cmv1.Cluster, pdClient *pagerduty.SdkClient, ocmClient *ocm.SdkClient) (bool, error) {
153153
if cluster.State() == cmv1.ClusterStateUninstalling {
154154
logging.Info("Cluster is uninstalling and requires no investigation. Silencing alert.")
155-
return false, pdClient.SilenceAlertWithNote("CAD: Cluster is already uninstalling, silencing alert.")
155+
return false, pdClient.SilenceIncidentWithNote("CAD: Cluster is already uninstalling, silencing alert.")
156156
}
157157

158158
if cluster.AWS() == nil {
159159
logging.Info("Cloud provider unsupported, forwarding to primary.")
160-
return false, pdClient.EscalateAlertWithNote("CAD could not run an automated investigation on this cluster: unsupported cloud provider.")
160+
return false, pdClient.EscalateIncidentWithNote("CAD could not run an automated investigation on this cluster: unsupported cloud provider.")
161161
}
162162

163163
isAccessProtected, err := ocmClient.IsAccessProtected(cluster)
@@ -166,7 +166,7 @@ func clusterRequiresInvestigation(cluster *cmv1.Cluster, pdClient *pagerduty.Sdk
166166
}
167167
if isAccessProtected {
168168
logging.Info("Cluster is access protected. Escalating alert.")
169-
return false, pdClient.EscalateAlertWithNote("CAD is unable to run against access protected clusters. Please investigate.")
169+
return false, pdClient.EscalateIncidentWithNote("CAD is unable to run against access protected clusters. Please investigate.")
170170
}
171171
return true, nil
172172
}

deploy/task-cad-checks-secrets-pd.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@ metadata:
55
name: cad-pd-token
66
type: Opaque
77
stringData:
8-
CAD_ESCALATION_POLICY: CHANGEME # refers to the escalation policy CAD should use to escalate the incident to
98
CAD_PD_EMAIL: CHANGEME # refers to the email for a login via mail/pw credentials
109
CAD_PD_PW: CHANGEME # refers to the password for a login via mail/pw credentials
1110
CAD_PD_TOKEN: CHANGEME # refers to the generated private access token for token-based authentication

interceptor/pkg/interceptor/pdinterceptor.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ func (pdi *PagerDutyInterceptor) Process(ctx context.Context, r *triggersv1.Inte
111111
// and escalate the alert to SRE
112112
if investigation == nil {
113113
pdi.Logger.Infof("Incident %s is not mapped to an investigation, escalating incident and returning InterceptorResponse `Continue: false`.", pdClient.GetIncidentID())
114-
err = pdClient.EscalateAlert()
114+
err = pdClient.EscalateIncident()
115115
if err != nil {
116116
pdi.Logger.Errorf("failed to escalate incident '%s': %w", pdClient.GetIncidentID(), err)
117117
}

interceptor/test/e2e.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ temp_log_file=$(mktemp)
1818
# Function to send an interceptor request and check the response
1919
function test_interceptor {
2020
# Run the interceptor and print logs to temporary log file
21-
CAD_PD_TOKEN=$(echo $pd_test_token) CAD_ESCALATION_POLICY=$(echo $pd_test_escalation_policy) CAD_SILENT_POLICY=$(echo $pd_test_silence_policy) ./../bin/interceptor > $temp_log_file 2>&1 &
21+
CAD_PD_TOKEN=$(echo $pd_test_token) CAD_SILENT_POLICY=$(echo $pd_test_silence_policy) ./../bin/interceptor > $temp_log_file 2>&1 &
2222

2323
# Store the PID of the interceptor process
2424
INTERCEPTOR_PID=$!

openshift/template.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -279,7 +279,6 @@ objects:
279279
metadata:
280280
name: cad-pd-token
281281
stringData:
282-
CAD_ESCALATION_POLICY: CHANGEME
283282
CAD_PD_EMAIL: CHANGEME
284283
CAD_PD_PW: CHANGEME
285284
CAD_PD_TOKEN: CHANGEME

pkg/investigations/ccam/ccam.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,14 +43,14 @@ func Evaluate(cluster *cmv1.Cluster, bpError error, ocmClient ocm.Client, pdClie
4343
return fmt.Errorf("could not post limited support reason for %s: %w", cluster.Name(), err)
4444
}
4545

46-
return pdClient.SilenceAlertWithNote(fmt.Sprintf("Added the following Limited Support reason to cluster: %#v. Silencing alert.\n", ccamLimitedSupport))
46+
return pdClient.SilenceIncidentWithNote(fmt.Sprintf("Added the following Limited Support reason to cluster: %#v. Silencing alert.\n", ccamLimitedSupport))
4747
case cmv1.ClusterStateUninstalling:
4848
// A cluster in uninstalling state should not alert primary - we just skip this
49-
return pdClient.SilenceAlertWithNote(fmt.Sprintf("Skipped adding limited support reason '%s': cluster is already uninstalling.", ccamLimitedSupport.Summary))
49+
return pdClient.SilenceIncidentWithNote(fmt.Sprintf("Skipped adding limited support reason '%s': cluster is already uninstalling.", ccamLimitedSupport.Summary))
5050
default:
5151
// Anything else is an unknown state to us and/or requires investigation.
5252
// E.g. we land here if we run into a CPD alert where credentials were removed (installing state) and don't want to put it in LS yet.
53-
return pdClient.EscalateAlertWithNote(fmt.Sprintf("Cluster has invalid cloud credentials (support role/policy is missing) and the cluster is in state '%s'. Please investigate.", cluster.State()))
53+
return pdClient.EscalateIncidentWithNote(fmt.Sprintf("Cluster has invalid cloud credentials (support role/policy is missing) and the cluster is in state '%s'. Please investigate.", cluster.State()))
5454
}
5555
}
5656

pkg/investigations/chgm/chgm.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ func Investigate(r *investigation.Resources) error {
4646
// 1. Check if the user stopped instances
4747
res, err := investigateStoppedInstances(r.Cluster, r.ClusterDeployment, r.AwsClient, r.OcmClient)
4848
if err != nil {
49-
return r.PdClient.EscalateAlertWithNote(fmt.Sprintf("InvestigateInstances failed: %s\n", err.Error()))
49+
return r.PdClient.EscalateIncidentWithNote(fmt.Sprintf("InvestigateInstances failed: %s\n", err.Error()))
5050
}
5151
logging.Debugf("the investigation returned: [infras running: %d] - [masters running: %d]", res.RunningInstances.Infra, res.RunningInstances.Master)
5252

@@ -95,7 +95,7 @@ func Investigate(r *investigation.Resources) error {
9595
metrics.Inc(metrics.LimitedSupportSet, investigationName, "EgressBlocked")
9696

9797
notes.AppendAutomation("Egress `nosnch.in` blocked, sent limited support.")
98-
return r.PdClient.SilenceAlertWithNote(notes.String())
98+
return r.PdClient.SilenceIncidentWithNote(notes.String())
9999
}
100100

101101
err := r.OcmClient.PostServiceLog(r.Cluster.ID(), createEgressSL(failureReason))
@@ -112,7 +112,7 @@ func Investigate(r *investigation.Resources) error {
112112
}
113113

114114
// Found no issues that CAD can handle by itself - forward notes to SRE.
115-
return r.PdClient.EscalateAlertWithNote(notes.String())
115+
return r.PdClient.EscalateIncidentWithNote(notes.String())
116116
}
117117

118118
// investigateHibernation checks if the cluster was recently woken up from
@@ -439,5 +439,5 @@ func postChgmSLAndSilence(clusterID string, ocmCli ocm.Client, pdCli pagerduty.C
439439
return fmt.Errorf("failed sending service log: %w", err)
440440
}
441441

442-
return pdCli.SilenceAlertWithNote("Customer stopped instances. Sent SL and silencing alert.")
442+
return pdCli.SilenceIncidentWithNote("Customer stopped instances. Sent SL and silencing alert.")
443443
}

0 commit comments

Comments
 (0)