Skip to content

Commit d1a1274

Browse files
Merge pull request #505 from bergmannf/srep-600-resource-init
[SREP-600] Use a builder pattern to initialize resources used by investigations.
2 parents 726fa09 + d283d19 commit d1a1274

File tree

14 files changed

+611
-327
lines changed

14 files changed

+611
-327
lines changed

cadctl/cmd/investigate/investigate.go

Lines changed: 22 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,11 @@ import (
2222
"strconv"
2323
"strings"
2424

25-
cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1"
2625
investigations "github.com/openshift/configuration-anomaly-detection/pkg/investigations"
2726
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/ccam"
2827
investigation "github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation"
28+
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/precheck"
2929
"github.com/openshift/configuration-anomaly-detection/pkg/logging"
30-
"github.com/openshift/configuration-anomaly-detection/pkg/managedcloud"
3130
"github.com/openshift/configuration-anomaly-detection/pkg/metrics"
3231
ocm "github.com/openshift/configuration-anomaly-detection/pkg/ocm"
3332
"github.com/openshift/configuration-anomaly-detection/pkg/pagerduty"
@@ -79,14 +78,6 @@ func run(cmd *cobra.Command, _ []string) error {
7978

8079
logging.Infof("Incident link: %s", pdClient.GetIncidentRef())
8180

82-
var investigationResources *investigation.Resources
83-
84-
defer func() {
85-
if err != nil {
86-
handleCADFailure(err, investigationResources, pdClient)
87-
}
88-
}()
89-
9081
experimentalEnabledVar := os.Getenv("CAD_EXPERIMENTAL_ENABLED")
9182
cadExperimentalEnabled, _ := strconv.ParseBool(experimentalEnabledVar)
9283
alertInvestigation := investigations.GetInvestigation(pdClient.GetTitle(), cadExperimentalEnabled)
@@ -115,57 +106,47 @@ func run(cmd *cobra.Command, _ []string) error {
115106
return fmt.Errorf("could not initialize ocm client: %w", err)
116107
}
117108

118-
cluster, err := ocmClient.GetClusterInfo(clusterID)
119-
if err != nil {
120-
if strings.Contains(err.Error(), "no cluster found") {
121-
logging.Warnf("No cluster found with ID '%s'. Exiting.", clusterID)
122-
return pdClient.EscalateIncidentWithNote("CAD was unable to find the incident cluster in OCM. An alert for a non-existing cluster is unexpected. Please investigate manually.")
109+
builder := &investigation.ResourceBuilderT{}
110+
defer func() {
111+
if err != nil {
112+
handleCADFailure(err, builder, pdClient)
123113
}
124-
return fmt.Errorf("could not retrieve cluster info for %s: %w", clusterID, err)
125-
}
126-
127-
// From this point on, we normalize to internal ID, as this ID always exists.
128-
// For installing clusters, externalID can be empty.
129-
internalClusterID := cluster.ID()
114+
}()
130115

131-
// re-initialize logger for the internal-cluster-id context
132-
logging.RawLogger = logging.InitLogger(logLevelFlag, pipelineNameEnv, internalClusterID)
116+
// Prime the builder with information required for all investigations.
117+
builder.WithName(alertInvestigation.Name()).WithCluster(clusterID).WithPagerDutyClient(pdClient).WithOcmClient(ocmClient).WithLogger(logLevelFlag, pipelineNameEnv)
133118

134-
requiresInvestigation, err := clusterRequiresInvestigation(cluster, pdClient, ocmClient)
135-
if err != nil || !requiresInvestigation {
136-
return err
119+
precheck := precheck.ClusterStatePrecheck{}
120+
result, err := precheck.Run(builder)
121+
if err != nil && strings.Contains(err.Error(), "no cluster found") {
122+
logging.Warnf("No cluster found with ID '%s'. Escalating and exiting.", clusterID)
123+
return pdClient.EscalateIncidentWithNote("CAD was unable to find the incident cluster in OCM. An alert for a non-existing cluster is unexpected. Please investigate manually.")
137124
}
138125

139-
clusterDeployment, err := ocmClient.GetClusterDeployment(internalClusterID)
126+
ccamInvestigation := ccam.Investigation{}
127+
result, err = ccamInvestigation.Run(builder)
140128
if err != nil {
141-
return fmt.Errorf("could not retrieve Cluster Deployment for %s: %w", internalClusterID, err)
142-
}
143-
144-
customerAwsClient, err := managedcloud.CreateCustomerAWSClient(cluster, ocmClient)
145-
if err != nil {
146-
ccamResources := &investigation.Resources{Name: "ccam", Cluster: cluster, ClusterDeployment: clusterDeployment, AwsClient: customerAwsClient, OcmClient: ocmClient, PdClient: pdClient, Notes: nil, AdditionalResources: map[string]interface{}{"error": err}}
147-
inv := ccam.Investigation{}
148-
result, err := inv.Run(ccamResources)
149-
updateMetrics(alertInvestigation.Name(), &result)
150129
return err
151130
}
152-
153-
investigationResources = &investigation.Resources{Name: alertInvestigation.Name(), Cluster: cluster, ClusterDeployment: clusterDeployment, AwsClient: customerAwsClient, OcmClient: ocmClient, PdClient: pdClient, Notes: nil}
131+
updateMetrics(alertInvestigation.Name(), &result)
154132

155133
logging.Infof("Starting investigation for %s", alertInvestigation.Name())
156-
result, err := alertInvestigation.Run(investigationResources)
157-
updateMetrics(alertInvestigation.Name(), &result)
134+
result, err = alertInvestigation.Run(builder)
158135
if err != nil {
159136
return err
160137
}
138+
updateMetrics(alertInvestigation.Name(), &result)
161139

162140
return updateIncidentTitle(pdClient)
163141
}
164142

165-
func handleCADFailure(err error, resources *investigation.Resources, pdClient *pagerduty.SdkClient) {
143+
func handleCADFailure(err error, rb *investigation.ResourceBuilderT, pdClient *pagerduty.SdkClient) {
166144
logging.Errorf("CAD investigation failed: %v", err)
167145

168146
var notes string
147+
// The builder caches resources, so we can access them here even if a later step failed.
148+
// We ignore the error here because we just want to get any notes that were created.
149+
resources, _ := rb.Build()
169150
if resources != nil && resources.Notes != nil {
170151
resources.Notes.AppendWarning("🚨 CAD investigation failed, CAD team has been notified. Please investigate manually. 🚨")
171152
notes = resources.Notes.String()
@@ -185,32 +166,6 @@ func handleCADFailure(err error, resources *investigation.Resources, pdClient *p
185166
}
186167
}
187168

188-
// Checks pre-requisites for a cluster investigation:
189-
// - the cluster's state is supported by CAD for an investigation (= not uninstalling)
190-
// - the cloud provider is supported by CAD (cluster is AWS)
191-
// Performs according pagerduty actions and returns whether CAD needs to investigate the cluster
192-
func clusterRequiresInvestigation(cluster *cmv1.Cluster, pdClient *pagerduty.SdkClient, ocmClient *ocm.SdkClient) (bool, error) {
193-
if cluster.State() == cmv1.ClusterStateUninstalling {
194-
logging.Info("Cluster is uninstalling and requires no investigation. Silencing alert.")
195-
return false, pdClient.SilenceIncidentWithNote("CAD: Cluster is already uninstalling, silencing alert.")
196-
}
197-
198-
if cluster.AWS() == nil {
199-
logging.Info("Cloud provider unsupported, forwarding to primary.")
200-
return false, pdClient.EscalateIncidentWithNote("CAD could not run an automated investigation on this cluster: unsupported cloud provider.")
201-
}
202-
203-
isAccessProtected, err := ocmClient.IsAccessProtected(cluster)
204-
if err != nil {
205-
logging.Warnf("failed to get access protection status for cluster. %w. Continuing...")
206-
}
207-
if isAccessProtected {
208-
logging.Info("Cluster is access protected. Escalating alert.")
209-
return false, pdClient.EscalateIncidentWithNote("CAD is unable to run against access protected clusters. Please investigate.")
210-
}
211-
return true, nil
212-
}
213-
214169
func updateMetrics(investigationName string, result *investigation.InvestigationResult) {
215170
if result.ServiceLogSent.Performed {
216171
metrics.Inc(metrics.ServicelogSent, append([]string{investigationName}, result.ServiceLogSent.Labels...)...)

pkg/investigations/cannotretrieveupdatessre/cannotretrieveupdatessre.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,12 @@ import (
1818
type Investigation struct{}
1919

2020
// Run executes the investigation for the CannotRetrieveUpdatesSRE alert
21-
func (c *Investigation) Run(r *investigation.Resources) (investigation.InvestigationResult, error) {
21+
func (c *Investigation) Run(rb investigation.ResourceBuilder) (investigation.InvestigationResult, error) {
2222
result := investigation.InvestigationResult{}
23+
r, err := rb.WithAwsClient().Build()
24+
if err != nil {
25+
return result, err
26+
}
2327
notes := notewriter.New("CannotRetrieveUpdatesSRE", logging.RawLogger)
2428
k8scli, err := k8sclient.New(r.Cluster.ID(), r.OcmClient, r.Name)
2529
if err != nil {

pkg/investigations/ccam/ccam.go

Lines changed: 37 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -21,47 +21,47 @@ var ccamLimitedSupport = &ocm.LimitedSupportReason{
2121

2222
// Run estimates whether the awsError is a "cluster credentials are missing" error. If it determines that it is,
2323
// the cluster is placed into limited support (if the cluster state allows it), otherwise an error is returned.
24-
func (c *Investigation) Run(r *investigation.Resources) (investigation.InvestigationResult, error) {
24+
func (c *Investigation) Run(r investigation.ResourceBuilder) (investigation.InvestigationResult, error) {
2525
result := investigation.InvestigationResult{}
26-
cluster := r.Cluster
27-
ocmClient := r.OcmClient
28-
pdClient := r.PdClient
29-
bpError, ok := r.AdditionalResources["error"].(error)
30-
if !ok {
31-
return result, fmt.Errorf("missing required Investigation field 'error'")
32-
}
26+
// Apart from the defaults this investigation requires an AWS client which can fail to build
27+
resources, err := r.WithAwsClient().Build()
3328
logging.Info("Investigating possible missing cloud credentials...")
34-
35-
if customerRemovedPermissions := customerRemovedPermissions(bpError.Error()); !customerRemovedPermissions {
36-
// We aren't able to jumpRole because of an error that is different than
37-
// a removed support role/policy or removed installer role/policy
38-
// This would normally be a backplane failure.
39-
return result, fmt.Errorf("credentials are there, error is different: %w", bpError)
40-
}
41-
42-
// The jumprole failed because of a missing support role/policy:
43-
// we need to figure out if we cluster state allows us to set limited support
44-
// (the cluster is in a ready state, not uninstalling, installing, etc.)
45-
46-
switch cluster.State() {
47-
case cmv1.ClusterStateReady:
48-
// Cluster is in functional sate but we can't jumprole to it: post limited support
49-
result.LimitedSupportSet.Performed = true
50-
result.LimitedSupportSet.Labels = []string{ccamLimitedSupport.Summary}
51-
err := ocmClient.PostLimitedSupportReason(ccamLimitedSupport, cluster.ID())
52-
if err != nil {
53-
return result, fmt.Errorf("could not post limited support reason for %s: %w", cluster.Name(), err)
29+
if err != nil {
30+
if customerRemovedPermissions := customerRemovedPermissions(err.Error()); !customerRemovedPermissions {
31+
// We aren't able to jumpRole because of an error that is different than
32+
// a removed support role/policy or removed installer role/policy
33+
// This would normally be a backplane failure.
34+
return result, err
35+
}
36+
cluster := resources.Cluster
37+
ocmClient := resources.OcmClient
38+
pdClient := resources.PdClient
39+
40+
// The jumprole failed because of a missing support role/policy:
41+
// we need to figure out if the cluster state allows us to set limited support
42+
// (the cluster is in a ready state, not uninstalling, installing, etc.)
43+
44+
switch cluster.State() {
45+
case cmv1.ClusterStateReady:
46+
// Cluster is in functional state but we can't jumprole to it: post limited support
47+
result.LimitedSupportSet.Performed = true
48+
result.LimitedSupportSet.Labels = []string{ccamLimitedSupport.Summary}
49+
err := ocmClient.PostLimitedSupportReason(ccamLimitedSupport, cluster.ID())
50+
if err != nil {
51+
return result, fmt.Errorf("could not post limited support reason for %s: %w", cluster.Name(), err)
52+
}
53+
54+
return result, pdClient.SilenceIncidentWithNote(fmt.Sprintf("Added the following Limited Support reason to cluster: %#v. Silencing alert.\n", ccamLimitedSupport))
55+
case cmv1.ClusterStateUninstalling:
56+
// A cluster in uninstalling state should not alert primary - we just skip this
57+
return result, pdClient.SilenceIncidentWithNote(fmt.Sprintf("Skipped adding limited support reason '%s': cluster is already uninstalling.", ccamLimitedSupport.Summary))
58+
default:
59+
// Anything else is an unknown state to us and/or requires investigation.
60+
// E.g. we land here if we run into a CPD alert where credentials were removed (installing state) and don't want to put it in LS yet.
61+
return result, pdClient.EscalateIncidentWithNote(fmt.Sprintf("Cluster has invalid cloud credentials (support role/policy is missing) and the cluster is in state '%s'. Please investigate.", cluster.State()))
5462
}
55-
56-
return result, pdClient.SilenceIncidentWithNote(fmt.Sprintf("Added the following Limited Support reason to cluster: %#v. Silencing alert.\n", ccamLimitedSupport))
57-
case cmv1.ClusterStateUninstalling:
58-
// A cluster in uninstalling state should not alert primary - we just skip this
59-
return result, pdClient.SilenceIncidentWithNote(fmt.Sprintf("Skipped adding limited support reason '%s': cluster is already uninstalling.", ccamLimitedSupport.Summary))
60-
default:
61-
// Anything else is an unknown state to us and/or requires investigation.
62-
// E.g. we land here if we run into a CPD alert where credentials were removed (installing state) and don't want to put it in LS yet.
63-
return result, pdClient.EscalateIncidentWithNote(fmt.Sprintf("Cluster has invalid cloud credentials (support role/policy is missing) and the cluster is in state '%s'. Please investigate.", cluster.State()))
6463
}
64+
return result, nil
6565
}
6666

6767
func (c *Investigation) Name() string {

pkg/investigations/ccam/ccam_test.go

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,15 +9,15 @@ import (
99

1010
func TestEvaluateRandomError(t *testing.T) {
1111
timeoutError := errors.New("credentials are there, error is different: timeout")
12-
input := investigation.Resources{
13-
Cluster: nil,
14-
ClusterDeployment: nil,
15-
AwsClient: nil,
16-
OcmClient: nil,
17-
PdClient: nil,
18-
AdditionalResources: map[string]interface{}{
19-
"error": errors.New("timeout"),
12+
input := investigation.ResourceBuilderMock{
13+
Resources: &investigation.Resources{
14+
Cluster: nil,
15+
ClusterDeployment: nil,
16+
AwsClient: nil,
17+
OcmClient: nil,
18+
PdClient: nil,
2019
},
20+
BuildError: timeoutError,
2121
}
2222

2323
inv := Investigation{}

pkg/investigations/chgm/chgm.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,12 @@ var (
3636
type Investiation struct{}
3737

3838
// Run runs the investigation for a triggered chgm pagerduty event
39-
func (c *Investiation) Run(r *investigation.Resources) (investigation.InvestigationResult, error) {
39+
func (c *Investiation) Run(rb investigation.ResourceBuilder) (investigation.InvestigationResult, error) {
4040
result := investigation.InvestigationResult{}
41+
r, err := rb.WithClusterDeployment().Build()
42+
if err != nil {
43+
return result, err
44+
}
4145
notes := notewriter.New("CHGM", logging.RawLogger)
4246
defer func() { r.Notes = notes }()
4347

0 commit comments

Comments
 (0)