Skip to content

Commit b8af331

Browse files
committed
Add caching to the builder.
Use the builder to report the error on failures.
1 parent bf29319 commit b8af331

File tree

2 files changed

+74
-55
lines changed

2 files changed

+74
-55
lines changed

cadctl/cmd/investigate/investigate.go

Lines changed: 24 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -78,14 +78,6 @@ func run(cmd *cobra.Command, _ []string) error {
7878

7979
logging.Infof("Incident link: %s", pdClient.GetIncidentRef())
8080

81-
var investigationResources *investigation.Resources
82-
83-
defer func() {
84-
if err != nil {
85-
handleCADFailure(err, investigationResources, pdClient)
86-
}
87-
}()
88-
8981
experimentalEnabledVar := os.Getenv("CAD_EXPERIMENTAL_ENABLED")
9082
cadExperimentalEnabled, _ := strconv.ParseBool(experimentalEnabledVar)
9183
alertInvestigation := investigations.GetInvestigation(pdClient.GetTitle(), cadExperimentalEnabled)
@@ -115,36 +107,54 @@ func run(cmd *cobra.Command, _ []string) error {
115107
}
116108

117109
builder := &investigation.ResourceBuilderT{}
110+
defer func() {
111+
if err != nil {
112+
handleCADFailure(err, builder, pdClient)
113+
}
114+
}()
115+
118116
// Prime the builder with information required for all investigations.
119117
builder.WithName(alertInvestigation.Name()).WithCluster(clusterID).WithPagerDutyClient(pdClient).WithOcmClient(ocmClient).WithLogger(logLevelFlag, pipelineNameEnv)
120118

119+
// handleClusterNotFound centralizes the logic for this specific error case.
120+
handleClusterNotFound := func(investigationErr error) error {
121+
if investigationErr != nil && strings.Contains(investigationErr.Error(), "no cluster found") {
122+
logging.Warnf("No cluster found with ID '%s'. Escalating and exiting.", clusterID)
123+
return pdClient.EscalateIncidentWithNote("CAD was unable to find the incident cluster in OCM. An alert for a non-existing cluster is unexpected. Please investigate manually.")
124+
}
125+
return investigationErr
126+
}
127+
121128
precheck := precheck.Investigation{}
122129
result, err := precheck.Run(builder)
123-
if err != nil || result.StopInvestigations {
130+
if err = handleClusterNotFound(err); err != nil || result.StopInvestigations {
124131
return err
125132
}
126133

127134
inv := ccam.Investigation{}
128135
result, err = inv.Run(builder)
129-
updateMetrics(alertInvestigation.Name(), &result)
130-
if err != nil {
136+
if err = handleClusterNotFound(err); err != nil {
131137
return err
132138
}
139+
updateMetrics(alertInvestigation.Name(), &result)
133140

134141
logging.Infof("Starting investigation for %s", alertInvestigation.Name())
135142
result, err = alertInvestigation.Run(builder)
136-
updateMetrics(alertInvestigation.Name(), &result)
137-
if err != nil {
143+
if err = handleClusterNotFound(err); err != nil {
138144
return err
139145
}
146+
updateMetrics(alertInvestigation.Name(), &result)
140147

141148
return updateIncidentTitle(pdClient)
142149
}
143150

144-
func handleCADFailure(err error, resources *investigation.Resources, pdClient *pagerduty.SdkClient) {
151+
func handleCADFailure(err error, builder *investigation.ResourceBuilderT, pdClient *pagerduty.SdkClient) {
145152
logging.Errorf("CAD investigation failed: %v", err)
146153

147154
var notes string
155+
// The builder caches resources, so we can access them here even if a later step failed.
156+
// We ignore the error here because we just want to get any notes that were created.
157+
resources, _ := builder.Build()
148158
if resources != nil && resources.Notes != nil {
149159
resources.Notes.AppendWarning("🚨 CAD investigation failed, CAD team has been notified. Please investigate manually. 🚨")
150160
notes = resources.Notes.String()

pkg/investigations/investigation/investigation.go

Lines changed: 50 additions & 41 deletions
Original file line number | Diff line number | Diff line change
@@ -77,6 +77,10 @@ type ResourceBuilderT struct {
7777
name string
7878
logLevel string
7979
pipelineName string
80+
81+
// cache
82+
builtResources *Resources
83+
buildErr error
8084
}
8185

8286
func (r *ResourceBuilderT) WithCluster(clusterId string) ResourceBuilder {
@@ -123,66 +127,71 @@ func (r *ResourceBuilderT) WithLogger(logLevel string, pipelineName string) Reso
123127
}
124128

125129
func (r *ResourceBuilderT) Build() (*Resources, error) {
126-
var awsClient aws.Client
127-
var cluster *cmv1.Cluster
128-
var clusterDeployment *hivev1.ClusterDeployment
129-
var notes *notewriter.NoteWriter
130-
var internalClusterId string
130+
if r.buildErr != nil {
131+
return nil, r.buildErr
132+
}
133+
134+
if r.builtResources == nil {
135+
r.builtResources = &Resources{
136+
Name: r.name,
137+
OcmClient: r.ocmClient,
138+
PdClient: r.pdClient,
139+
}
140+
}
141+
131142
var err error
132143

133144
if r.buildClusterDeployment && !r.buildCluster {
134-
return nil, errors.New("can not build ClusterDeployment without Cluster")
145+
r.buildErr = errors.New("cannot build ClusterDeployment without Cluster")
146+
return nil, r.buildErr
147+
}
148+
if r.buildAwsClient && !r.buildCluster {
149+
r.buildErr = errors.New("cannot build AwsClient without Cluster")
150+
return nil, r.buildErr
135151
}
136152

137-
if r.buildAwsClient {
138-
awsClient, err = managedcloud.CreateCustomerAWSClient(cluster, r.ocmClient)
153+
if r.buildCluster && r.builtResources.Cluster == nil {
154+
r.builtResources.Cluster, err = r.ocmClient.GetClusterInfo(r.clusterId)
139155
if err != nil {
156+
// Let the caller handle how to respond to this error.
157+
err = fmt.Errorf("could not retrieve cluster info for %s: %w", r.clusterId, err)
158+
r.buildErr = err
140159
return nil, err
141160
}
142161
}
143162

144-
if r.buildCluster {
145-
cluster, err = r.ocmClient.GetClusterInfo(r.clusterId)
146-
if err != nil {
147-
if strings.Contains(err.Error(), "no cluster found") {
148-
logging.Warnf("No cluster found with ID '%s'. Exiting.", r.clusterId)
149-
err = r.pdClient.EscalateIncidentWithNote("CAD was unable to find the incident cluster in OCM. An alert for a non-existing cluster is unexpected. Please investigate manually.")
150-
logging.Errorf("Could not escalate via PagerDuty: ", err)
151-
return nil, errors.New("unable to find incident cluster in OCM")
163+
// Dependent resources can only be built if a cluster object exists.
164+
if r.builtResources.Cluster != nil {
165+
internalClusterId := r.builtResources.Cluster.ID()
166+
167+
if r.buildAwsClient && r.builtResources.AwsClient == nil {
168+
r.builtResources.AwsClient, err = managedcloud.CreateCustomerAWSClient(r.builtResources.Cluster, r.ocmClient)
169+
if err != nil {
170+
r.buildErr = err
171+
return nil, err
152172
}
153-
return nil, fmt.Errorf("could not retrieve cluster info for %s: %w", r.clusterId, err)
154173
}
155174

156-
// From this point on, we normalize to internal ID, as this ID always exists.
157-
// For installing clusters, externalID can be empty.
158-
internalClusterId = cluster.ID()
159-
}
160-
161-
if r.buildClusterDeployment {
162-
clusterDeployment, err = r.ocmClient.GetClusterDeployment(internalClusterId)
163-
if err != nil {
164-
return nil, fmt.Errorf("could not retrieve Cluster Deployment for %s: %w", internalClusterId, err)
175+
if r.buildClusterDeployment && r.builtResources.ClusterDeployment == nil {
176+
r.builtResources.ClusterDeployment, err = r.ocmClient.GetClusterDeployment(internalClusterId)
177+
if err != nil {
178+
err = fmt.Errorf("could not retrieve Cluster Deployment for %s: %w", internalClusterId, err)
179+
r.buildErr = err
180+
return nil, err
181+
}
165182
}
166-
}
167183

168-
if r.buildLogger {
169-
logging.RawLogger = logging.InitLogger(r.logLevel, "", internalClusterId)
184+
if r.buildLogger {
185+
// Re-initialize the logger with the cluster ID.
186+
logging.RawLogger = logging.InitLogger(r.logLevel, "", internalClusterId)
187+
}
170188
}
171189

172-
if r.buildNotes {
173-
// Initialize NoteWriter with sane defaults
174-
notes = notewriter.New(r.name, logging.RawLogger)
190+
if r.buildNotes && r.builtResources.Notes == nil {
191+
r.builtResources.Notes = notewriter.New(r.name, logging.RawLogger)
175192
}
176193

177-
return &Resources{
178-
Name: r.name,
179-
Cluster: cluster,
180-
ClusterDeployment: clusterDeployment,
181-
AwsClient: awsClient,
182-
OcmClient: r.ocmClient,
183-
PdClient: r.pdClient,
184-
Notes: notes,
185-
}, nil
194+
return r.builtResources, nil
186195
}
187196

188197
// This is an implementation to be used in tests, but putting it into a _test.go file will make it not resolvable.

0 commit comments

Comments
 (0)