Skip to content

Commit 2db698e

Browse files
Merge pull request #397 from MateSaary/OSD-28224-post-failures
OSD-28224 - cad should always post failures to the incident notes
2 parents 9e27b78 + a79eb74 commit 2db698e

File tree

6 files changed

+32
-4
lines changed

6 files changed

+32
-4
lines changed

cadctl/cmd/investigate/investigate.go

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,14 @@ func run(cmd *cobra.Command, _ []string) error {
7676

7777
logging.Infof("Incident link: %s", pdClient.GetIncidentRef())
7878

79+
var investigationResources *investigation.Resources
80+
81+
defer func() {
82+
if err != nil {
83+
handleCADFailure(err, investigationResources)
84+
}
85+
}()
86+
7987
_, cadExperimentalEnabled := os.LookupEnv("CAD_EXPERIMENTAL_ENABLED")
8088
alertInvestigation := investigations.GetInvestigation(pdClient.GetTitle(), cadExperimentalEnabled)
8189

@@ -131,21 +139,36 @@ func run(cmd *cobra.Command, _ []string) error {
131139

132140
customerAwsClient, err := managedcloud.CreateCustomerAWSClient(cluster, ocmClient)
133141
if err != nil {
134-
ccamResources := &investigation.Resources{Name: "ccam", Cluster: cluster, ClusterDeployment: clusterDeployment, AwsClient: customerAwsClient, OcmClient: ocmClient, PdClient: pdClient, AdditionalResources: map[string]interface{}{"error": err}}
142+
ccamResources := &investigation.Resources{Name: "ccam", Cluster: cluster, ClusterDeployment: clusterDeployment, AwsClient: customerAwsClient, OcmClient: ocmClient, PdClient: pdClient, Notes: nil, AdditionalResources: map[string]interface{}{"error": err}}
135143
inv := ccam.Investigation{}
136144
result, err := inv.Run(ccamResources)
137145
updateMetrics(alertInvestigation.Name(), &result)
138146
return err
139147
}
140148

141-
investigationResources := &investigation.Resources{Name: alertInvestigation.Name(), Cluster: cluster, ClusterDeployment: clusterDeployment, AwsClient: customerAwsClient, OcmClient: ocmClient, PdClient: pdClient}
149+
investigationResources = &investigation.Resources{Name: alertInvestigation.Name(), Cluster: cluster, ClusterDeployment: clusterDeployment, AwsClient: customerAwsClient, OcmClient: ocmClient, PdClient: pdClient, Notes: nil}
142150

143151
logging.Infof("Starting investigation for %s", alertInvestigation.Name())
144152
result, err := alertInvestigation.Run(investigationResources)
145153
updateMetrics(alertInvestigation.Name(), &result)
154+
146155
return err
147156
}
148157

158+
func handleCADFailure(err error, resources *investigation.Resources) {
159+
logging.Errorf("CAD investigation failed: %v", err)
160+
161+
notes := resources.Notes
162+
163+
notes.AppendWarning("🚨 CAD investigation failed, CAD team has been notified. Please investigate manually. 🚨")
164+
pdErr := resources.PdClient.EscalateIncidentWithNote(notes.String())
165+
if pdErr != nil {
166+
logging.Errorf("Failed to escalate notes to PagerDuty: %v", pdErr)
167+
} else {
168+
logging.Info("CAD failure & incident notes added to PagerDuty")
169+
}
170+
}
171+
149172
// GetOCMClient will retrieve the OcmClient from the 'ocm' package
150173
func GetOCMClient() (*ocm.SdkClient, error) {
151174
cadOcmFilePath := os.Getenv("CAD_OCM_FILE_PATH")

hack/bootstrap-investigation.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ func (c *Investigation) Run(r *investigation.Resources) (investigation.Investiga
9696
9797
// Initialize PagerDuty note writer
9898
notes := notewriter.New(r.Name, logging.RawLogger)
99+
defer func() { r.Notes = notes }()
99100
100101
// TODO: Implement investigation logic here
101102

pkg/investigations/chgm/chgm.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ type Investiation struct{}
4242
func (c *Investiation) Run(r *investigation.Resources) (investigation.InvestigationResult, error) {
4343
result := investigation.InvestigationResult{}
4444
notes := notewriter.New("CHGM", logging.RawLogger)
45+
defer func() { r.Notes = notes }()
4546

4647
// 1. Check if the user stopped instances
4748
res, err := investigateStoppedInstances(r.Cluster, r.ClusterDeployment, r.AwsClient, r.OcmClient)

pkg/investigations/clustermonitoringerrorbudgetburn/clustermonitoringerrorbudgetburn.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,14 +27,13 @@ var uwmMisconfiguredSL = ocm.ServiceLog{
2727

2828
type Investigation struct{}
2929

30-
func (c *Investigation) Run(r *investigation.Resources) (investigation.InvestigationResult, error) {
30+
func (c *Investigation) Run(r *investigation.Resources) (result investigation.InvestigationResult, err error) {
3131
// Initialize k8s client
3232
// This would be better suited to be passed in with the investigation resources
3333
// In turn we would need to split out ccam and k8sclient, as those are tied to a cluster
3434
// Failing the cleanup call is not critical as there is garbage collection for the RBAC within MCC https://issues.redhat.com/browse/OSD-27692
3535
// We can revisit backplane-apis remediation implementation to improve this behavior, by e.g.
3636
// patching the existing RBAC etc...
37-
result := investigation.InvestigationResult{}
3837
k8scli, err := k8sclient.New(r.Cluster.ID(), r.OcmClient, r.Name)
3938
if err != nil {
4039
return result, fmt.Errorf("unable to initialize k8s cli: %w", err)
@@ -49,6 +48,7 @@ func (c *Investigation) Run(r *investigation.Resources) (investigation.Investiga
4948

5049
// Initialize PagerDuty note writer
5150
notes := notewriter.New(r.Name, logging.RawLogger)
51+
defer func() { r.Notes = notes }()
5252

5353
// List the monitoring cluster operator
5454
coList := &configv1.ClusterOperatorList{}

pkg/investigations/cpd/cpd.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ var byovpcRoutingSL = &ocm.ServiceLog{Severity: "Major", Summary: "Installation
3030
func (c *Investigation) Run(r *investigation.Resources) (investigation.InvestigationResult, error) {
3131
result := investigation.InvestigationResult{}
3232
notes := notewriter.New("CPD", logging.RawLogger)
33+
defer func() { r.Notes = notes }()
3334

3435
if r.Cluster.Status().State() == "ready" {
3536
// We are unsure when this happens; in theory, if the cluster is ready, the alert shouldn't fire or should autoresolve.

pkg/investigations/investigation/investigation.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ package investigation
44
import (
55
cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1"
66
"github.com/openshift/configuration-anomaly-detection/pkg/aws"
7+
"github.com/openshift/configuration-anomaly-detection/pkg/notewriter"
78
"github.com/openshift/configuration-anomaly-detection/pkg/ocm"
89
"github.com/openshift/configuration-anomaly-detection/pkg/pagerduty"
910
hivev1 "github.com/openshift/hive/apis/hive/v1"
@@ -38,5 +39,6 @@ type Resources struct {
3839
AwsClient aws.Client
3940
OcmClient ocm.Client
4041
PdClient pagerduty.Client
42+
Notes *notewriter.NoteWriter
4143
AdditionalResources map[string]interface{}
4244
}

0 commit comments

Comments
 (0)