Skip to content

Commit bf29319

Browse files
committed
Use a builder pattern to initialize resources used by investigations.
Not every investigation needs every resource, so the resources should not all be prefilled up front. This commit introduces a Builder pattern that is prefilled with a subset of fields (OCM, PD, Cluster, Logging) and can be extended by each investigation. Once everything that is supposed to be built is set, calling Build() returns the requested resources.
1 parent 726fa09 commit bf29319

File tree

16 files changed

+3505
-318
lines changed

16 files changed

+3505
-318
lines changed

cadctl/cmd/investigate/investigate.go

Lines changed: 11 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,11 @@ import (
2222
"strconv"
2323
"strings"
2424

25-
cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1"
2625
investigations "github.com/openshift/configuration-anomaly-detection/pkg/investigations"
2726
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/ccam"
2827
investigation "github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation"
28+
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/precheck"
2929
"github.com/openshift/configuration-anomaly-detection/pkg/logging"
30-
"github.com/openshift/configuration-anomaly-detection/pkg/managedcloud"
3130
"github.com/openshift/configuration-anomaly-detection/pkg/metrics"
3231
ocm "github.com/openshift/configuration-anomaly-detection/pkg/ocm"
3332
"github.com/openshift/configuration-anomaly-detection/pkg/pagerduty"
@@ -115,45 +114,25 @@ func run(cmd *cobra.Command, _ []string) error {
115114
return fmt.Errorf("could not initialize ocm client: %w", err)
116115
}
117116

118-
cluster, err := ocmClient.GetClusterInfo(clusterID)
119-
if err != nil {
120-
if strings.Contains(err.Error(), "no cluster found") {
121-
logging.Warnf("No cluster found with ID '%s'. Exiting.", clusterID)
122-
return pdClient.EscalateIncidentWithNote("CAD was unable to find the incident cluster in OCM. An alert for a non-existing cluster is unexpected. Please investigate manually.")
123-
}
124-
return fmt.Errorf("could not retrieve cluster info for %s: %w", clusterID, err)
125-
}
126-
127-
// From this point on, we normalize to internal ID, as this ID always exists.
128-
// For installing clusters, externalID can be empty.
129-
internalClusterID := cluster.ID()
130-
131-
// re-initialize logger for the internal-cluster-id context
132-
logging.RawLogger = logging.InitLogger(logLevelFlag, pipelineNameEnv, internalClusterID)
117+
builder := &investigation.ResourceBuilderT{}
118+
// Prime the builder with information required for all investigations.
119+
builder.WithName(alertInvestigation.Name()).WithCluster(clusterID).WithPagerDutyClient(pdClient).WithOcmClient(ocmClient).WithLogger(logLevelFlag, pipelineNameEnv)
133120

134-
requiresInvestigation, err := clusterRequiresInvestigation(cluster, pdClient, ocmClient)
135-
if err != nil || !requiresInvestigation {
121+
precheck := precheck.Investigation{}
122+
result, err := precheck.Run(builder)
123+
if err != nil || result.StopInvestigations {
136124
return err
137125
}
138126

139-
clusterDeployment, err := ocmClient.GetClusterDeployment(internalClusterID)
140-
if err != nil {
141-
return fmt.Errorf("could not retrieve Cluster Deployment for %s: %w", internalClusterID, err)
142-
}
143-
144-
customerAwsClient, err := managedcloud.CreateCustomerAWSClient(cluster, ocmClient)
127+
inv := ccam.Investigation{}
128+
result, err = inv.Run(builder)
129+
updateMetrics(alertInvestigation.Name(), &result)
145130
if err != nil {
146-
ccamResources := &investigation.Resources{Name: "ccam", Cluster: cluster, ClusterDeployment: clusterDeployment, AwsClient: customerAwsClient, OcmClient: ocmClient, PdClient: pdClient, Notes: nil, AdditionalResources: map[string]interface{}{"error": err}}
147-
inv := ccam.Investigation{}
148-
result, err := inv.Run(ccamResources)
149-
updateMetrics(alertInvestigation.Name(), &result)
150131
return err
151132
}
152133

153-
investigationResources = &investigation.Resources{Name: alertInvestigation.Name(), Cluster: cluster, ClusterDeployment: clusterDeployment, AwsClient: customerAwsClient, OcmClient: ocmClient, PdClient: pdClient, Notes: nil}
154-
155134
logging.Infof("Starting investigation for %s", alertInvestigation.Name())
156-
result, err := alertInvestigation.Run(investigationResources)
135+
result, err = alertInvestigation.Run(builder)
157136
updateMetrics(alertInvestigation.Name(), &result)
158137
if err != nil {
159138
return err
@@ -185,32 +164,6 @@ func handleCADFailure(err error, resources *investigation.Resources, pdClient *p
185164
}
186165
}
187166

188-
// Checks pre-requisites for a cluster investigation:
189-
// - the cluster's state is supported by CAD for an investigation (= not uninstalling)
190-
// - the cloud provider is supported by CAD (cluster is AWS)
191-
// Performs according pagerduty actions and returns whether CAD needs to investigate the cluster
192-
func clusterRequiresInvestigation(cluster *cmv1.Cluster, pdClient *pagerduty.SdkClient, ocmClient *ocm.SdkClient) (bool, error) {
193-
if cluster.State() == cmv1.ClusterStateUninstalling {
194-
logging.Info("Cluster is uninstalling and requires no investigation. Silencing alert.")
195-
return false, pdClient.SilenceIncidentWithNote("CAD: Cluster is already uninstalling, silencing alert.")
196-
}
197-
198-
if cluster.AWS() == nil {
199-
logging.Info("Cloud provider unsupported, forwarding to primary.")
200-
return false, pdClient.EscalateIncidentWithNote("CAD could not run an automated investigation on this cluster: unsupported cloud provider.")
201-
}
202-
203-
isAccessProtected, err := ocmClient.IsAccessProtected(cluster)
204-
if err != nil {
205-
logging.Warnf("failed to get access protection status for cluster. %w. Continuing...")
206-
}
207-
if isAccessProtected {
208-
logging.Info("Cluster is access protected. Escalating alert.")
209-
return false, pdClient.EscalateIncidentWithNote("CAD is unable to run against access protected clusters. Please investigate.")
210-
}
211-
return true, nil
212-
}
213-
214167
func updateMetrics(investigationName string, result *investigation.InvestigationResult) {
215168
if result.ServiceLogSent.Performed {
216169
metrics.Inc(metrics.ServicelogSent, append([]string{investigationName}, result.ServiceLogSent.Labels...)...)

0 commit comments

Comments
 (0)