From eb84e8b006c4ad02c7196a9603ceef236417b610 Mon Sep 17 00:00:00 2001 From: Florian Bergmann Date: Thu, 31 Jul 2025 12:58:44 +0200 Subject: [PATCH 1/3] refactor(investigation): Rework ResourceBuilder for improved usability This commit refactors the \`ResourceBuilder\` to be more user-friendly and robust. The previous implementation required callers to manage dependencies between resources manually and was prone to runtime errors if resources were not requested in the correct order. The key changes include: - A new \`NewResourceBuilder\` constructor now handles the initial setup of required components like OCM and PagerDuty clients, simplifying the builder's API. - The \`Build()\` method now lazily initializes resources only when they are explicitly requested via \`With...\` methods. This includes deferring the \`GetClusterInfo\` call until necessary. - Dependency management is now handled internally by the builder. For example, requesting an AWS client via \`WithAwsClient()\` will automatically ensure the cluster object is also built. - Investigation \`Run\` methods have been updated to explicitly declare their resource dependencies using the new builder methods. This refactoring resolves several build and runtime errors, improves the developer experience by providing a clearer and safer API, and makes investigations easier to write and maintain. Co-authored-by: Aider --- .gitignore | 1 + cadctl/cmd/investigate/investigate.go | 23 ++-- pkg/investigations/cpd/cpd.go | 2 +- .../investigation/investigation.go | 100 ++++++------------ ...ehealthcheckunterminatedshortcircuitsre.go | 2 +- pkg/investigations/precheck/precheck.go | 2 +- 6 files changed, 48 insertions(+), 82 deletions(-) diff --git a/.gitignore b/.gitignore index 1e60bd2f..5aa9cf8a 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,4 @@ e2e-suite.test payload test/testinfra/*.log test/testinfra/*.pem +.aider* diff --git a/cadctl/cmd/investigate/investigate.go b/cadctl/cmd/investigate/investigate.go index a77445e3..3d37f954 100644 --- a/cadctl/cmd/investigate/investigate.go +++ b/cadctl/cmd/investigate/investigate.go @@ -28,7 +28,6 @@ import ( "github.com/openshift/configuration-anomaly-detection/pkg/investigations/precheck" "github.com/openshift/configuration-anomaly-detection/pkg/logging" "github.com/openshift/configuration-anomaly-detection/pkg/metrics" - ocm "github.com/openshift/configuration-anomaly-detection/pkg/ocm" "github.com/openshift/configuration-anomaly-detection/pkg/pagerduty" "github.com/spf13/cobra" @@ -101,26 +100,28 @@ func run(cmd *cobra.Command, _ []string) error { return err } - ocmClient, err := ocm.New() + builder, err := investigation.NewResourceBuilder(pdClient, clusterID, alertInvestigation.Name(), logLevelFlag, pipelineNameEnv) if err != nil { - return fmt.Errorf("could not initialize ocm client: %w", err) + return fmt.Errorf("failed to create resource builder: %w", err) } - builder := &investigation.ResourceBuilderT{} defer func() { if err != nil { handleCADFailure(err, builder, pdClient) } }() - // Prime the builder with information required for all investigations. - builder.WithName(alertInvestigation.Name()).WithCluster(clusterID).WithPagerDutyClient(pdClient).WithOcmClient(ocmClient).WithLogger(logLevelFlag, pipelineNameEnv) - precheck := precheck.ClusterStatePrecheck{} result, err := precheck.Run(builder) - if err != nil && strings.Contains(err.Error(), "no cluster found") { - logging.Warnf("No cluster found with ID '%s'. Escalating and exiting.", clusterID) - return pdClient.EscalateIncidentWithNote("CAD was unable to find the incident cluster in OCM. An alert for a non-existing cluster is unexpected. Please investigate manually.") + if err != nil { + if strings.Contains(err.Error(), "no cluster found") { + logging.Warnf("No cluster found with ID '%s'. Escalating and exiting.", clusterID) + return pdClient.EscalateIncidentWithNote("CAD was unable to find the incident cluster in OCM. An alert for a non-existing cluster is unexpected. Please investigate manually.") + } + return err + } + if result.StopInvestigations { + return nil } ccamInvestigation := ccam.Investigation{} @@ -140,7 +141,7 @@ func run(cmd *cobra.Command, _ []string) error { return updateIncidentTitle(pdClient) } -func handleCADFailure(err error, rb *investigation.ResourceBuilderT, pdClient *pagerduty.SdkClient) { +func handleCADFailure(err error, rb investigation.ResourceBuilder, pdClient *pagerduty.SdkClient) { logging.Errorf("CAD investigation failed: %v", err) var notes string diff --git a/pkg/investigations/cpd/cpd.go b/pkg/investigations/cpd/cpd.go index 71a480b1..48730b21 100644 --- a/pkg/investigations/cpd/cpd.go +++ b/pkg/investigations/cpd/cpd.go @@ -29,7 +29,7 @@ var byovpcRoutingSL = &ocm.ServiceLog{Severity: "Major", Summary: "Installation // In the future, we want to automate service logs based on the network verifier output. func (c *Investigation) Run(rb investigation.ResourceBuilder) (investigation.InvestigationResult, error) { result := investigation.InvestigationResult{} - r, err := rb.WithClusterDeployment().Build() + r, err := rb.WithClusterDeployment().WithAwsClient().Build() if err != nil { return result, err } diff --git a/pkg/investigations/investigation/investigation.go b/pkg/investigations/investigation/investigation.go index 0e34475a..42c8b30c 100644 --- a/pkg/investigations/investigation/investigation.go +++ b/pkg/investigations/investigation/investigation.go @@ -1,7 +1,6 @@ package investigation import ( - "errors" "fmt" cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1" @@ -29,6 +28,28 @@ type InvestigationResult struct { StopInvestigations bool } +func NewResourceBuilder(pdClient pagerduty.Client, clusterId string, name string, logLevel string, pipelineName string) (ResourceBuilder, error) { + ocmClient, err := ocm.New() + if err != nil { + return nil, fmt.Errorf("could not create ocm client: %w", err) + } + + rb := &ResourceBuilderT{ + buildLogger: true, + clusterId: clusterId, + name: name, + logLevel: logLevel, + pipelineName: pipelineName, + ocmClient: ocmClient, + builtResources: &Resources{ + PdClient: pdClient, + OcmClient: ocmClient, + }, + } + + return rb, nil +} + type Investigation interface { Run(builder ResourceBuilder) (InvestigationResult, error) // Please note that when adding an investigation the name and the directory currently need to be the same, @@ -51,14 +72,10 @@ type Resources struct { } type ResourceBuilder interface { - WithCluster(clusterId string) ResourceBuilder + WithCluster() ResourceBuilder WithClusterDeployment() ResourceBuilder WithAwsClient() ResourceBuilder - WithOcmClient(*ocm.SdkClient) ResourceBuilder - WithPagerDutyClient(client *pagerduty.SdkClient) ResourceBuilder WithNotes() ResourceBuilder - WithName(name string) ResourceBuilder - WithLogger(logLevel string, pipelineName string) ResourceBuilder Build() (*Resources, error) } @@ -69,87 +86,50 @@ type ResourceBuilderT struct { buildNotes bool buildLogger bool - // These clients are required for all investigations and all clusters, so they are pre-filled. - pdClient *pagerduty.SdkClient - ocmClient *ocm.SdkClient - clusterId string name string logLevel string pipelineName string + ocmClient *ocm.SdkClient + // cache builtResources *Resources buildErr error } -func (r *ResourceBuilderT) WithCluster(clusterId string) ResourceBuilder { +func (r *ResourceBuilderT) WithCluster() ResourceBuilder { r.buildCluster = true - r.clusterId = clusterId return r } func (r *ResourceBuilderT) WithClusterDeployment() ResourceBuilder { + r.WithCluster() r.buildClusterDeployment = true return r } func (r *ResourceBuilderT) WithAwsClient() ResourceBuilder { + r.WithCluster() r.buildAwsClient = true return r } -func (r *ResourceBuilderT) WithOcmClient(client *ocm.SdkClient) ResourceBuilder { - r.ocmClient = client - return r -} - -func (r *ResourceBuilderT) WithPagerDutyClient(client *pagerduty.SdkClient) ResourceBuilder { - r.pdClient = client - return r -} - func (r *ResourceBuilderT) WithNotes() ResourceBuilder { r.buildNotes = true return r } -func (r *ResourceBuilderT) WithName(name string) ResourceBuilder { - r.name = name - return r -} - -func (r *ResourceBuilderT) WithLogger(logLevel string, pipelineName string) ResourceBuilder { - r.buildLogger = true - r.logLevel = logLevel - r.pipelineName = pipelineName - return r -} - func (r *ResourceBuilderT) Build() (*Resources, error) { if r.buildErr != nil { return nil, r.buildErr } - if r.builtResources == nil { - r.builtResources = &Resources{ - Name: r.name, - OcmClient: r.ocmClient, - PdClient: r.pdClient, - } - } + // The Name is now set during construction. + r.builtResources.Name = r.name var err error - if r.buildClusterDeployment && !r.buildCluster { - r.buildErr = errors.New("cannot build ClusterDeployment without Cluster") - return nil, r.buildErr - } - if r.buildAwsClient && !r.buildCluster { - r.buildErr = errors.New("cannot build AwsClient without Cluster") - return nil, r.buildErr - } - if r.buildCluster && r.builtResources.Cluster == nil { r.builtResources.Cluster, err = r.ocmClient.GetClusterInfo(r.clusterId) if err != nil { @@ -183,7 +163,7 @@ func (r *ResourceBuilderT) Build() (*Resources, error) { if r.buildLogger { // Re-initialize the logger with the cluster ID. - logging.RawLogger = logging.InitLogger(r.logLevel, "", internalClusterId) + logging.RawLogger = logging.InitLogger(r.logLevel, r.pipelineName, internalClusterId) } } @@ -200,7 +180,7 @@ type ResourceBuilderMock struct { BuildError error } -func (r *ResourceBuilderMock) WithCluster(clusterId string) ResourceBuilder { +func (r *ResourceBuilderMock) WithCluster() ResourceBuilder { return r } @@ -212,26 +192,10 @@ func (r *ResourceBuilderMock) WithAwsClient() ResourceBuilder { return r } -func (r *ResourceBuilderMock) WithOcmClient(client *ocm.SdkClient) ResourceBuilder { - return r -} - -func (r *ResourceBuilderMock) WithPagerDutyClient(client *pagerduty.SdkClient) ResourceBuilder { - return r -} - func (r *ResourceBuilderMock) WithNotes() ResourceBuilder { return r } -func (r *ResourceBuilderMock) WithName(name string) ResourceBuilder { - return r -} - -func (r *ResourceBuilderMock) WithLogger(logLevel string, pipelineName string) ResourceBuilder { - return r -} - func (r *ResourceBuilderMock) Build() (*Resources, error) { if r.BuildError != nil { return nil, r.BuildError diff --git a/pkg/investigations/machinehealthcheckunterminatedshortcircuitsre/machinehealthcheckunterminatedshortcircuitsre.go b/pkg/investigations/machinehealthcheckunterminatedshortcircuitsre/machinehealthcheckunterminatedshortcircuitsre.go index 76e480b5..8b7b680e 100644 --- a/pkg/investigations/machinehealthcheckunterminatedshortcircuitsre/machinehealthcheckunterminatedshortcircuitsre.go +++ b/pkg/investigations/machinehealthcheckunterminatedshortcircuitsre/machinehealthcheckunterminatedshortcircuitsre.go @@ -63,7 +63,7 @@ func (i *Investigation) teardown() error { func (i *Investigation) Run(rb investigation.ResourceBuilder) (investigation.InvestigationResult, error) { ctx := context.Background() result := investigation.InvestigationResult{} - r, err := rb.Build() + r, err := rb.WithCluster().Build() if err != nil { return result, err } diff --git a/pkg/investigations/precheck/precheck.go b/pkg/investigations/precheck/precheck.go index 672658ba..1b96142b 100644 --- a/pkg/investigations/precheck/precheck.go +++ b/pkg/investigations/precheck/precheck.go @@ -14,7 +14,7 @@ type ClusterStatePrecheck struct{} // Performs according pagerduty actions and returns whether CAD needs to investigate the cluster func (c *ClusterStatePrecheck) Run(rb investigation.ResourceBuilder) (investigation.InvestigationResult, error) { result := investigation.InvestigationResult{} - r, err := rb.Build() + r, err := rb.WithCluster().Build() if err != nil { return result, err } From dcb1c9d33f9bcb37b9f47584be8d9e43624c30e8 Mon Sep 17 00:00:00 2001 From: Florian Bergmann Date: Thu, 31 Jul 2025 14:04:42 +0200 Subject: [PATCH 2/3] docs(readme): Update instructions for using ResourceBuilder Explain how to use the ResourceBuilder in a new investigation, including requesting resources and the automatic dependency handling. Co-authored-by: Aider --- README.md | 15 ++++++++++++++- hack/bootstrap-investigation.sh | 8 +++----- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 7597a2f6..ad5093aa 100644 --- a/README.md +++ b/README.md @@ -68,7 +68,20 @@ As PagerDuty itself does not provide finer granularity for webhooks than service To add a new alert investigation: - run `make bootstrap-investigation` to generate boilerplate code in `pkg/investigations` (This creates the corresponding folder & .go file, and also appends the investigation to the `availableInvestigations` interface in `registry.go`.). -- investigation.Resources contain initialized clients for the clusters aws environment, ocm and more. See [Integrations](#integrations) +- The `Run` method of your investigation receives a `ResourceBuilder`. Use its `With...` methods to request the resources your investigation needs, then call `Build()` to get a `Resources` struct containing them. The builder automatically handles dependencies between resources (e.g., requesting an AWS client will also initialize the cluster object). For example: + ```go + func (c *Investigation) Run(rb investigation.ResourceBuilder) (investigation.InvestigationResult, error) { + result := investigation.InvestigationResult{} + // Request an AWS client. This will also ensure the Cluster resource is built. + r, err := rb.WithAwsClient().Build() + if err != nil { + return result, err + } + // Now you can use r.AwsClient, r.Cluster, r.PdClient, etc. + // ... + } + ``` +- The returned `Resources` struct contains initialized clients and cluster objects. See [Integrations](#integrations) for a full list of available resources. - Add test objects or scripts used to recreate the alert symptoms to the `pkg/investigations/$INVESTIGATION_NAME/testing/` directory for future use. Be sure to clearly document the testing procedure under the `Testing` section of the investigation-specific README.md file ### Graduating an investigation diff --git a/hack/bootstrap-investigation.sh b/hack/bootstrap-investigation.sh index 1be1d8cf..4c7218cf 100755 --- a/hack/bootstrap-investigation.sh +++ b/hack/bootstrap-investigation.sh @@ -91,12 +91,10 @@ import ( type Investigation struct{} -func (c *Investigation) Run(r *investigation.Resources) (investigation.InvestigationResult, error) { +func (c *Investigation) Run(rb *investigation.ResourceBuilder) (investigation.InvestigationResult, error) { result := investigation.InvestigationResult{} - - // Initialize PagerDuty note writer - notes := notewriter.New(r.Name, logging.RawLogger) - defer func() { r.Notes = notes }() + // TODO: Add additional builder configuration depending on your required resources using With...() + r, err := rb.WithNotes().Build() // TODO: Implement investigation logic here From e7b448cebeec6e3626aea73f4ce1a726466423a1 Mon Sep 17 00:00:00 2001 From: Florian Bergmann Date: Mon, 4 Aug 2025 16:13:27 +0200 Subject: [PATCH 3/3] Use an error to indicate stopping investigations. The error can convey more information than a simple bool. If it's nil = all good, otherwise the error can be returned why the investigation was stopped. --- cadctl/cmd/investigate/investigate.go | 4 ++-- pkg/investigations/investigation/investigation.go | 4 ++-- pkg/investigations/precheck/precheck.go | 8 +++++--- pkg/investigations/precheck/precheck_test.go | 3 ++- 4 files changed, 11 insertions(+), 8 deletions(-) diff --git a/cadctl/cmd/investigate/investigate.go b/cadctl/cmd/investigate/investigate.go index 3d37f954..840a5bd2 100644 --- a/cadctl/cmd/investigate/investigate.go +++ b/cadctl/cmd/investigate/investigate.go @@ -120,8 +120,8 @@ func run(cmd *cobra.Command, _ []string) error { } return err } - if result.StopInvestigations { - return nil + if result.StopInvestigations != nil { + return result.StopInvestigations } ccamInvestigation := ccam.Investigation{} diff --git a/pkg/investigations/investigation/investigation.go b/pkg/investigations/investigation/investigation.go index 42c8b30c..69903a5e 100644 --- a/pkg/investigations/investigation/investigation.go +++ b/pkg/investigations/investigation/investigation.go @@ -24,8 +24,8 @@ type InvestigationResult struct { ServiceLogSent InvestigationStep // If multiple investigations might be run this can indicate a fatal error that makes running additional investigations useless. - // Uses a false-default so it must be set expclicitly by an investigation. - StopInvestigations bool + // If nil, investigations should continue. If not nil, should contain a meaningful error message explaining why investigations must stop. + StopInvestigations error } func NewResourceBuilder(pdClient pagerduty.Client, clusterId string, name string, logLevel string, pipelineName string) (ResourceBuilder, error) { diff --git a/pkg/investigations/precheck/precheck.go b/pkg/investigations/precheck/precheck.go index 1b96142b..5c5fc127 100644 --- a/pkg/investigations/precheck/precheck.go +++ b/pkg/investigations/precheck/precheck.go @@ -1,6 +1,8 @@ package precheck import ( + "errors" + cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1" investigation "github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation" "github.com/openshift/configuration-anomaly-detection/pkg/logging" @@ -23,13 +25,13 @@ func (c *ClusterStatePrecheck) Run(rb investigation.ResourceBuilder) (investigat ocmClient := r.OcmClient if cluster.State() == cmv1.ClusterStateUninstalling { logging.Info("Cluster is uninstalling and requires no investigation. Silencing alert.") - result.StopInvestigations = true + result.StopInvestigations = errors.New("cluster is already uninstalling") return result, pdClient.SilenceIncidentWithNote("CAD: Cluster is already uninstalling, silencing alert.") } if cluster.AWS() == nil { logging.Info("Cloud provider unsupported, forwarding to primary.") - result.StopInvestigations = true + result.StopInvestigations = errors.New("unsupported cloud provider (non-AWS)") return result, pdClient.EscalateIncidentWithNote("CAD could not run an automated investigation on this cluster: unsupported cloud provider.") } @@ -39,7 +41,7 @@ func (c *ClusterStatePrecheck) Run(rb investigation.ResourceBuilder) (investigat } if isAccessProtected { logging.Info("Cluster is access protected. Escalating alert.") - result.StopInvestigations = true + result.StopInvestigations = errors.New("cluster is access protected") return result, pdClient.EscalateIncidentWithNote("CAD is unable to run against access protected clusters. Please investigate.") } return result, nil diff --git a/pkg/investigations/precheck/precheck_test.go b/pkg/investigations/precheck/precheck_test.go index a033bf4c..ec669fac 100644 --- a/pkg/investigations/precheck/precheck_test.go +++ b/pkg/investigations/precheck/precheck_test.go @@ -1,6 +1,7 @@ package precheck import ( + "errors" "reflect" "testing" @@ -35,7 +36,7 @@ func TestInvestigation_Run(t *testing.T) { PdClient: pdClient, }, }}, - investigation.InvestigationResult{StopInvestigations: true}, + investigation.InvestigationResult{StopInvestigations: errors.New("unsupported cloud provider (non-AWS)")}, false, }, }