diff --git a/.gitignore b/.gitignore index 1e60bd2f..5aa9cf8a 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,4 @@ e2e-suite.test payload test/testinfra/*.log test/testinfra/*.pem +.aider* diff --git a/README.md b/README.md index 7597a2f6..ad5093aa 100644 --- a/README.md +++ b/README.md @@ -68,7 +68,20 @@ As PagerDuty itself does not provide finer granularity for webhooks than service To add a new alert investigation: - run `make bootstrap-investigation` to generate boilerplate code in `pkg/investigations` (This creates the corresponding folder & .go file, and also appends the investigation to the `availableInvestigations` interface in `registry.go`.). -- investigation.Resources contain initialized clients for the clusters aws environment, ocm and more. See [Integrations](#integrations) +- The `Run` method of your investigation receives a `ResourceBuilder`. Use its `With...` methods to request the resources your investigation needs, then call `Build()` to get a `Resources` struct containing them. The builder automatically handles dependencies between resources (e.g., requesting an AWS client will also initialize the cluster object). For example: + ```go + func (c *Investigation) Run(rb investigation.ResourceBuilder) (investigation.InvestigationResult, error) { + result := investigation.InvestigationResult{} + // Request an AWS client. This will also ensure the Cluster resource is built. + r, err := rb.WithAwsClient().Build() + if err != nil { + return result, err + } + // Now you can use r.AwsClient, r.Cluster, r.PdClient, etc. + // ... + } + ``` +- The returned `Resources` struct contains initialized clients and cluster objects. See [Integrations](#integrations) for a full list of available resources. - Add test objects or scripts used to recreate the alert symptoms to the `pkg/investigations/$INVESTIGATION_NAME/testing/` directory for future use. Be sure to clearly document the testing procedure under the `Testing` section of the investigation-specific README.md file ### Graduating an investigation diff --git a/cadctl/cmd/investigate/investigate.go b/cadctl/cmd/investigate/investigate.go index a77445e3..840a5bd2 100644 --- a/cadctl/cmd/investigate/investigate.go +++ b/cadctl/cmd/investigate/investigate.go @@ -28,7 +28,6 @@ import ( "github.com/openshift/configuration-anomaly-detection/pkg/investigations/precheck" "github.com/openshift/configuration-anomaly-detection/pkg/logging" "github.com/openshift/configuration-anomaly-detection/pkg/metrics" - ocm "github.com/openshift/configuration-anomaly-detection/pkg/ocm" "github.com/openshift/configuration-anomaly-detection/pkg/pagerduty" "github.com/spf13/cobra" @@ -101,26 +100,28 @@ func run(cmd *cobra.Command, _ []string) error { return err } - ocmClient, err := ocm.New() + builder, err := investigation.NewResourceBuilder(pdClient, clusterID, alertInvestigation.Name(), logLevelFlag, pipelineNameEnv) if err != nil { - return fmt.Errorf("could not initialize ocm client: %w", err) + return fmt.Errorf("failed to create resource builder: %w", err) } - builder := &investigation.ResourceBuilderT{} defer func() { if err != nil { handleCADFailure(err, builder, pdClient) } }() - // Prime the builder with information required for all investigations. - builder.WithName(alertInvestigation.Name()).WithCluster(clusterID).WithPagerDutyClient(pdClient).WithOcmClient(ocmClient).WithLogger(logLevelFlag, pipelineNameEnv) - precheck := precheck.ClusterStatePrecheck{} result, err := precheck.Run(builder) - if err != nil && strings.Contains(err.Error(), "no cluster found") { - logging.Warnf("No cluster found with ID '%s'. Escalating and exiting.", clusterID) - return pdClient.EscalateIncidentWithNote("CAD was unable to find the incident cluster in OCM. An alert for a non-existing cluster is unexpected. Please investigate manually.") + if err != nil { + if strings.Contains(err.Error(), "no cluster found") { + logging.Warnf("No cluster found with ID '%s'. Escalating and exiting.", clusterID) + return pdClient.EscalateIncidentWithNote("CAD was unable to find the incident cluster in OCM. An alert for a non-existing cluster is unexpected. Please investigate manually.") + } + return err + } + if result.StopInvestigations != nil { + return result.StopInvestigations } ccamInvestigation := ccam.Investigation{} @@ -140,7 +141,7 @@ func run(cmd *cobra.Command, _ []string) error { return updateIncidentTitle(pdClient) } -func handleCADFailure(err error, rb *investigation.ResourceBuilderT, pdClient *pagerduty.SdkClient) { +func handleCADFailure(err error, rb investigation.ResourceBuilder, pdClient *pagerduty.SdkClient) { logging.Errorf("CAD investigation failed: %v", err) var notes string diff --git a/hack/bootstrap-investigation.sh b/hack/bootstrap-investigation.sh index 1be1d8cf..4c7218cf 100755 --- a/hack/bootstrap-investigation.sh +++ b/hack/bootstrap-investigation.sh @@ -91,12 +91,10 @@ import ( type Investigation struct{} -func (c *Investigation) Run(r *investigation.Resources) (investigation.InvestigationResult, error) { +func (c *Investigation) Run(rb *investigation.ResourceBuilder) (investigation.InvestigationResult, error) { result := investigation.InvestigationResult{} - - // Initialize PagerDuty note writer - notes := notewriter.New(r.Name, logging.RawLogger) - defer func() { r.Notes = notes }() + // TODO: Add additional builder configuration depending on your required resources using With...() + r, err := rb.WithNotes().Build() // TODO: Implement investigation logic here diff --git a/pkg/investigations/cpd/cpd.go b/pkg/investigations/cpd/cpd.go index 71a480b1..48730b21 100644 --- a/pkg/investigations/cpd/cpd.go +++ b/pkg/investigations/cpd/cpd.go @@ -29,7 +29,7 @@ var byovpcRoutingSL = &ocm.ServiceLog{Severity: "Major", Summary: "Installation // In the future, we want to automate service logs based on the network verifier output. func (c *Investigation) Run(rb investigation.ResourceBuilder) (investigation.InvestigationResult, error) { result := investigation.InvestigationResult{} - r, err := rb.WithClusterDeployment().Build() + r, err := rb.WithClusterDeployment().WithAwsClient().Build() if err != nil { return result, err } diff --git a/pkg/investigations/investigation/investigation.go b/pkg/investigations/investigation/investigation.go index 0e34475a..69903a5e 100644 --- a/pkg/investigations/investigation/investigation.go +++ b/pkg/investigations/investigation/investigation.go @@ -1,7 +1,6 @@ package investigation import ( - "errors" "fmt" cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1" @@ -25,8 +24,30 @@ type InvestigationResult struct { ServiceLogSent InvestigationStep // If multiple investigations might be run this can indicate a fatal error that makes running additional investigations useless. - // Uses a false-default so it must be set expclicitly by an investigation. - StopInvestigations bool + // If nil, investigations should continue. If not nil, should contain a meaningful error message explaining why investigations must stop. + StopInvestigations error +} + +func NewResourceBuilder(pdClient pagerduty.Client, clusterId string, name string, logLevel string, pipelineName string) (ResourceBuilder, error) { + ocmClient, err := ocm.New() + if err != nil { + return nil, fmt.Errorf("could not create ocm client: %w", err) + } + + rb := &ResourceBuilderT{ + buildLogger: true, + clusterId: clusterId, + name: name, + logLevel: logLevel, + pipelineName: pipelineName, + ocmClient: ocmClient, + builtResources: &Resources{ + PdClient: pdClient, + OcmClient: ocmClient, + }, + } + + return rb, nil } type Investigation interface { @@ -51,14 +72,10 @@ type Resources struct { } type ResourceBuilder interface { - WithCluster(clusterId string) ResourceBuilder + WithCluster() ResourceBuilder WithClusterDeployment() ResourceBuilder WithAwsClient() ResourceBuilder - WithOcmClient(*ocm.SdkClient) ResourceBuilder - WithPagerDutyClient(client *pagerduty.SdkClient) ResourceBuilder WithNotes() ResourceBuilder - WithName(name string) ResourceBuilder - WithLogger(logLevel string, pipelineName string) ResourceBuilder Build() (*Resources, error) } @@ -69,87 +86,50 @@ type ResourceBuilderT struct { buildNotes bool buildLogger bool - // These clients are required for all investigations and all clusters, so they are pre-filled. - pdClient *pagerduty.SdkClient - ocmClient *ocm.SdkClient - clusterId string name string logLevel string pipelineName string + ocmClient *ocm.SdkClient + // cache builtResources *Resources buildErr error } -func (r *ResourceBuilderT) WithCluster(clusterId string) ResourceBuilder { +func (r *ResourceBuilderT) WithCluster() ResourceBuilder { r.buildCluster = true - r.clusterId = clusterId return r } func (r *ResourceBuilderT) WithClusterDeployment() ResourceBuilder { + r.WithCluster() r.buildClusterDeployment = true return r } func (r *ResourceBuilderT) WithAwsClient() ResourceBuilder { + r.WithCluster() r.buildAwsClient = true return r } -func (r *ResourceBuilderT) WithOcmClient(client *ocm.SdkClient) ResourceBuilder { - r.ocmClient = client - return r -} - -func (r *ResourceBuilderT) WithPagerDutyClient(client *pagerduty.SdkClient) ResourceBuilder { - r.pdClient = client - return r -} - func (r *ResourceBuilderT) WithNotes() ResourceBuilder { r.buildNotes = true return r } -func (r *ResourceBuilderT) WithName(name string) ResourceBuilder { - r.name = name - return r -} - -func (r *ResourceBuilderT) WithLogger(logLevel string, pipelineName string) ResourceBuilder { - r.buildLogger = true - r.logLevel = logLevel - r.pipelineName = pipelineName - return r -} - func (r *ResourceBuilderT) Build() (*Resources, error) { if r.buildErr != nil { return nil, r.buildErr } - if r.builtResources == nil { - r.builtResources = &Resources{ - Name: r.name, - OcmClient: r.ocmClient, - PdClient: r.pdClient, - } - } + // The Name is now set during construction. + r.builtResources.Name = r.name var err error - if r.buildClusterDeployment && !r.buildCluster { - r.buildErr = errors.New("cannot build ClusterDeployment without Cluster") - return nil, r.buildErr - } - if r.buildAwsClient && !r.buildCluster { - r.buildErr = errors.New("cannot build AwsClient without Cluster") - return nil, r.buildErr - } - if r.buildCluster && r.builtResources.Cluster == nil { r.builtResources.Cluster, err = r.ocmClient.GetClusterInfo(r.clusterId) if err != nil { @@ -183,7 +163,7 @@ func (r *ResourceBuilderT) Build() (*Resources, error) { if r.buildLogger { // Re-initialize the logger with the cluster ID. - logging.RawLogger = logging.InitLogger(r.logLevel, "", internalClusterId) + logging.RawLogger = logging.InitLogger(r.logLevel, r.pipelineName, internalClusterId) } } @@ -200,7 +180,7 @@ type ResourceBuilderMock struct { BuildError error } -func (r *ResourceBuilderMock) WithCluster(clusterId string) ResourceBuilder { +func (r *ResourceBuilderMock) WithCluster() ResourceBuilder { return r } @@ -212,26 +192,10 @@ func (r *ResourceBuilderMock) WithAwsClient() ResourceBuilder { return r } -func (r *ResourceBuilderMock) WithOcmClient(client *ocm.SdkClient) ResourceBuilder { - return r -} - -func (r *ResourceBuilderMock) WithPagerDutyClient(client *pagerduty.SdkClient) ResourceBuilder { - return r -} - func (r *ResourceBuilderMock) WithNotes() ResourceBuilder { return r } -func (r *ResourceBuilderMock) WithName(name string) ResourceBuilder { - return r -} - -func (r *ResourceBuilderMock) WithLogger(logLevel string, pipelineName string) ResourceBuilder { - return r -} - func (r *ResourceBuilderMock) Build() (*Resources, error) { if r.BuildError != nil { return nil, r.BuildError diff --git a/pkg/investigations/machinehealthcheckunterminatedshortcircuitsre/machinehealthcheckunterminatedshortcircuitsre.go b/pkg/investigations/machinehealthcheckunterminatedshortcircuitsre/machinehealthcheckunterminatedshortcircuitsre.go index 76e480b5..8b7b680e 100644 --- a/pkg/investigations/machinehealthcheckunterminatedshortcircuitsre/machinehealthcheckunterminatedshortcircuitsre.go +++ b/pkg/investigations/machinehealthcheckunterminatedshortcircuitsre/machinehealthcheckunterminatedshortcircuitsre.go @@ -63,7 +63,7 @@ func (i *Investigation) teardown() error { func (i *Investigation) Run(rb investigation.ResourceBuilder) (investigation.InvestigationResult, error) { ctx := context.Background() result := investigation.InvestigationResult{} - r, err := rb.Build() + r, err := rb.WithCluster().Build() if err != nil { return result, err } diff --git a/pkg/investigations/precheck/precheck.go b/pkg/investigations/precheck/precheck.go index 672658ba..5c5fc127 100644 --- a/pkg/investigations/precheck/precheck.go +++ b/pkg/investigations/precheck/precheck.go @@ -1,6 +1,8 @@ package precheck import ( + "errors" + cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1" investigation "github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation" "github.com/openshift/configuration-anomaly-detection/pkg/logging" @@ -14,7 +16,7 @@ type ClusterStatePrecheck struct{} // Performs according pagerduty actions and returns whether CAD needs to investigate the cluster func (c *ClusterStatePrecheck) Run(rb investigation.ResourceBuilder) (investigation.InvestigationResult, error) { result := investigation.InvestigationResult{} - r, err := rb.Build() + r, err := rb.WithCluster().Build() if err != nil { return result, err } @@ -23,13 +25,13 @@ func (c *ClusterStatePrecheck) Run(rb investigation.ResourceBuilder) (investigat ocmClient := r.OcmClient if cluster.State() == cmv1.ClusterStateUninstalling { logging.Info("Cluster is uninstalling and requires no investigation. Silencing alert.") - result.StopInvestigations = true + result.StopInvestigations = errors.New("cluster is already uninstalling") return result, pdClient.SilenceIncidentWithNote("CAD: Cluster is already uninstalling, silencing alert.") } if cluster.AWS() == nil { logging.Info("Cloud provider unsupported, forwarding to primary.") - result.StopInvestigations = true + result.StopInvestigations = errors.New("unsupported cloud provider (non-AWS)") return result, pdClient.EscalateIncidentWithNote("CAD could not run an automated investigation on this cluster: unsupported cloud provider.") } @@ -39,7 +41,7 @@ func (c *ClusterStatePrecheck) Run(rb investigation.ResourceBuilder) (investigat } if isAccessProtected { logging.Info("Cluster is access protected. Escalating alert.") - result.StopInvestigations = true + result.StopInvestigations = errors.New("cluster is access protected") return result, pdClient.EscalateIncidentWithNote("CAD is unable to run against access protected clusters. Please investigate.") } return result, nil diff --git a/pkg/investigations/precheck/precheck_test.go b/pkg/investigations/precheck/precheck_test.go index a033bf4c..ec669fac 100644 --- a/pkg/investigations/precheck/precheck_test.go +++ b/pkg/investigations/precheck/precheck_test.go @@ -1,6 +1,7 @@ package precheck import ( + "errors" "reflect" "testing" @@ -35,7 +36,7 @@ func TestInvestigation_Run(t *testing.T) { PdClient: pdClient, }, }}, - investigation.InvestigationResult{StopInvestigations: true}, + investigation.InvestigationResult{StopInvestigations: errors.New("unsupported cloud provider (non-AWS)")}, false, }, }