Skip to content

Commit 8fb1c44

Browse files
committed
Move CCAM to use the investigate abstraction as well.
It's ugly because it now uses a map[string]interface{} to pass arbitrary data, but it keeps CCAM in line with all the other checks performed by CAD.
1 parent 5ddff32 commit 8fb1c44

File tree

5 files changed

+50
-20
lines changed

5 files changed

+50
-20
lines changed

cadctl/cmd/investigate/investigate.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,10 @@ func run(_ *cobra.Command, _ []string) error {
120120

121121
customerAwsClient, err := managedcloud.CreateCustomerAWSClient(cluster, ocmClient)
122122
if err != nil {
123-
return ccam.Evaluate(cluster, err, ocmClient, pdClient, alertInvestigation.Name)
123+
ccamResources := &investigation.Resources{Cluster: cluster, ClusterDeployment: clusterDeployment, AwsClient: customerAwsClient, OcmClient: ocmClient, PdClient: pdClient, AdditionalResources: map[string]interface{}{"alertType": alertInvestigation.Name, "error": err}}
124+
result, err := ccam.Investigate(ccamResources)
125+
updateMetrics(&result)
126+
return err
124127
}
125128

126129
investigationResources := &investigation.Resources{Cluster: cluster, ClusterDeployment: clusterDeployment, AwsClient: customerAwsClient, OcmClient: ocmClient, PdClient: pdClient}
@@ -183,4 +186,4 @@ func updateMetrics(result *investigation.InvestigationResult) {
183186
if result.LimitedSupportSet.Performed {
184187
metrics.Inc(metrics.LimitedSupportSet, append([]string{result.InvestigationName}, result.LimitedSupportSet.Labels...)...)
185188
}
186-
}
189+
}

pkg/investigations/ccam/ccam.go

Lines changed: 23 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,9 @@ import (
77
"regexp"
88

99
cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1"
10+
investigation "github.com/openshift/configuration-anomaly-detection/pkg/investigations"
1011
"github.com/openshift/configuration-anomaly-detection/pkg/logging"
11-
"github.com/openshift/configuration-anomaly-detection/pkg/metrics"
1212
"github.com/openshift/configuration-anomaly-detection/pkg/ocm"
13-
"github.com/openshift/configuration-anomaly-detection/pkg/pagerduty"
1413
)
1514

1615
var ccamLimitedSupport = &ocm.LimitedSupportReason{
@@ -20,14 +19,27 @@ var ccamLimitedSupport = &ocm.LimitedSupportReason{
2019

2120
// Evaluate estimates whether the awsError is a "cluster credentials are missing" error. If it determines that it is,
2221
// the cluster is placed into limited support (if the cluster state allows it), otherwise an error is returned.
23-
func Evaluate(cluster *cmv1.Cluster, bpError error, ocmClient ocm.Client, pdClient pagerduty.Client, alertType string) error {
22+
func Investigate(r *investigation.Resources) (investigation.InvestigationResult, error) {
23+
result := investigation.InvestigationResult{}
24+
cluster := r.Cluster
25+
ocmClient := r.OcmClient
26+
pdClient := r.PdClient
27+
alertType, ok := r.AdditionalResources["alertType"].(string)
28+
if !ok {
29+
return result, fmt.Errorf("Missing required CCAM field 'alertType'")
30+
}
31+
result.InvestigationName = alertType
32+
bpError, ok := r.AdditionalResources["error"].(error)
33+
if !ok {
34+
return result, fmt.Errorf("Missing required CCAM field 'error'")
35+
}
2436
logging.Info("Investigating possible missing cloud credentials...")
2537

2638
if customerRemovedPermissions := customerRemovedPermissions(bpError.Error()); !customerRemovedPermissions {
2739
// We aren't able to jumpRole because of an error that is different than
2840
// a removed support role/policy or removed installer role/policy
2941
// This would normally be a backplane failure.
30-
return fmt.Errorf("credentials are there, error is different: %w", bpError)
42+
return result, fmt.Errorf("credentials are there, error is different: %w", bpError)
3143
}
3244

3345
// The jumprole failed because of a missing support role/policy:
@@ -37,20 +49,21 @@ func Evaluate(cluster *cmv1.Cluster, bpError error, ocmClient ocm.Client, pdClie
3749
switch cluster.State() {
3850
case cmv1.ClusterStateReady:
3951
// Cluster is in functional state but we can't jumprole to it: post limited support
40-
metrics.Inc(metrics.LimitedSupportSet, alertType, ccamLimitedSupport.Summary)
52+
result.LimitedSupportSet.Performed = true
53+
result.LimitedSupportSet.Labels = []string{ccamLimitedSupport.Summary}
4154
err := ocmClient.PostLimitedSupportReason(ccamLimitedSupport, cluster.ID())
4255
if err != nil {
43-
return fmt.Errorf("could not post limited support reason for %s: %w", cluster.Name(), err)
56+
return result, fmt.Errorf("could not post limited support reason for %s: %w", cluster.Name(), err)
4457
}
4558

46-
return pdClient.SilenceIncidentWithNote(fmt.Sprintf("Added the following Limited Support reason to cluster: %#v. Silencing alert.\n", ccamLimitedSupport))
59+
return result, pdClient.SilenceIncidentWithNote(fmt.Sprintf("Added the following Limited Support reason to cluster: %#v. Silencing alert.\n", ccamLimitedSupport))
4760
case cmv1.ClusterStateUninstalling:
4861
// A cluster in uninstalling state should not alert primary - we just skip this
49-
return pdClient.SilenceIncidentWithNote(fmt.Sprintf("Skipped adding limited support reason '%s': cluster is already uninstalling.", ccamLimitedSupport.Summary))
62+
return result, pdClient.SilenceIncidentWithNote(fmt.Sprintf("Skipped adding limited support reason '%s': cluster is already uninstalling.", ccamLimitedSupport.Summary))
5063
default:
5164
// Anything else is an unknown state to us and/or requires investigation.
5265
// E.g. we land here if we run into a CPD alert where credentials were removed (installing state) and don't want to put it in LS yet.
53-
return pdClient.EscalateIncidentWithNote(fmt.Sprintf("Cluster has invalid cloud credentials (support role/policy is missing) and the cluster is in state '%s'. Please investigate.", cluster.State()))
66+
return result, pdClient.EscalateIncidentWithNote(fmt.Sprintf("Cluster has invalid cloud credentials (support role/policy is missing) and the cluster is in state '%s'. Please investigate.", cluster.State()))
5467
}
5568
}
5669

@@ -94,4 +107,4 @@ func customerRemovedPermissions(backplaneError string) bool {
94107
}
95108

96109
return false
97-
}
110+
}

pkg/investigations/ccam/ccam_test.go

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,24 @@ package ccam
33
import (
44
"errors"
55
"testing"
6+
7+
investigation "github.com/openshift/configuration-anomaly-detection/pkg/investigations"
68
)
79

810
func TestEvaluateRandomError(t *testing.T) {
911
timeoutError := errors.New("credentials are there, error is different: timeout")
10-
err := Evaluate(nil, errors.New("timeout"), nil, nil, "")
12+
input := investigation.Resources{
13+
Cluster: nil,
14+
ClusterDeployment: nil,
15+
AwsClient: nil,
16+
OcmClient: nil,
17+
PdClient: nil,
18+
AdditionalResources: map[string]interface{}{
19+
"alertType": "",
20+
"error": errors.New("timeout"),
21+
},
22+
}
23+
_, err := Investigate(&input)
1124
if err.Error() != timeoutError.Error() {
1225
t.Fatalf("Expected error %v, but got %v", timeoutError, err)
1326
}
@@ -59,4 +72,4 @@ func TestCustomerRemovedPermissions(t *testing.T) {
5972
}
6073
})
6174
}
62-
}
75+
}

pkg/investigations/chgm/chgm.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -443,4 +443,4 @@ func postChgmSLAndSilence(clusterID string, ocmCli ocm.Client, pdCli pagerduty.C
443443
}
444444

445445
return pdCli.SilenceIncidentWithNote("Customer stopped instances. Sent SL and silencing alert.")
446-
}
446+
}

pkg/investigations/investigation.go

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,10 @@ func NewInvestigation(investigationFn func(resources *Resources) (InvestigationR
3535

3636
// Resources holds all resources/tools required for alert investigations
3737
type Resources struct {
38-
Cluster *cmv1.Cluster
39-
ClusterDeployment *hivev1.ClusterDeployment
40-
AwsClient aws.Client
41-
OcmClient ocm.Client
42-
PdClient pagerduty.Client
38+
Cluster *cmv1.Cluster
39+
ClusterDeployment *hivev1.ClusterDeployment
40+
AwsClient aws.Client
41+
OcmClient ocm.Client
42+
PdClient pagerduty.Client
43+
AdditionalResources map[string]interface{}
4344
}

0 commit comments

Comments
 (0)