Skip to content

Commit c08e9f1

Browse files
Merge pull request #347 from bergmannf/OSD-27978-metrics-return-type
[OSD-27978] Change investigation return type
2 parents 24bba06 + 417d3f4 commit c08e9f1

File tree

9 files changed

+220
-84
lines changed

9 files changed

+220
-84
lines changed

.editorconfig

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@ root = true
44
end_of_line = lf
55
charset = utf-8
66
trim_trailing_whitespace = true
7-
insert_final_newline = false
7+
insert_final_newline = true
88

99
[*.go]
1010
indent_style = tab
11-
tab_width = 4
11+
tab_width = 4

cadctl/cmd/investigate/investigate.go

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -120,13 +120,18 @@ func run(_ *cobra.Command, _ []string) error {
120120

121121
customerAwsClient, err := managedcloud.CreateCustomerAWSClient(cluster, ocmClient)
122122
if err != nil {
123-
return ccam.Evaluate(cluster, err, ocmClient, pdClient, alertInvestigation.Name)
123+
ccamResources := &investigation.Resources{Cluster: cluster, ClusterDeployment: clusterDeployment, AwsClient: customerAwsClient, OcmClient: ocmClient, PdClient: pdClient, AdditionalResources: map[string]interface{}{"error": err}}
124+
result, err := ccam.Investigate(ccamResources)
125+
updateMetrics(alertInvestigation.Name, &result)
126+
return err
124127
}
125128

126129
investigationResources := &investigation.Resources{Cluster: cluster, ClusterDeployment: clusterDeployment, AwsClient: customerAwsClient, OcmClient: ocmClient, PdClient: pdClient}
127130

128131
logging.Infof("Starting investigation for %s", alertInvestigation.Name)
129-
return alertInvestigation.Run(investigationResources)
132+
result, err := alertInvestigation.Run(investigationResources)
133+
updateMetrics(alertInvestigation.Name, &result)
134+
return err
130135
}
131136

132137
// GetOCMClient will retrieve the OcmClient from the 'ocm' package
@@ -170,3 +175,15 @@ func clusterRequiresInvestigation(cluster *cmv1.Cluster, pdClient *pagerduty.Sdk
170175
}
171176
return true, nil
172177
}
178+
179+
func updateMetrics(investigationName string, result *investigation.InvestigationResult) {
180+
if result.ServiceLogSent.Performed {
181+
metrics.Inc(metrics.ServicelogSent, append([]string{investigationName}, result.ServiceLogSent.Labels...)...)
182+
}
183+
if result.ServiceLogPrepared.Performed {
184+
metrics.Inc(metrics.ServicelogPrepared, append([]string{investigationName}, result.ServiceLogPrepared.Labels...)...)
185+
}
186+
if result.LimitedSupportSet.Performed {
187+
metrics.Inc(metrics.LimitedSupportSet, append([]string{investigationName}, result.LimitedSupportSet.Labels...)...)
188+
}
189+
}

pkg/investigations/ccam/ccam.go

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,9 @@ import (
77
"regexp"
88

99
cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1"
10+
investigation "github.com/openshift/configuration-anomaly-detection/pkg/investigations"
1011
"github.com/openshift/configuration-anomaly-detection/pkg/logging"
11-
"github.com/openshift/configuration-anomaly-detection/pkg/metrics"
1212
"github.com/openshift/configuration-anomaly-detection/pkg/ocm"
13-
"github.com/openshift/configuration-anomaly-detection/pkg/pagerduty"
1413
)
1514

1615
var ccamLimitedSupport = &ocm.LimitedSupportReason{
@@ -20,14 +19,22 @@ var ccamLimitedSupport = &ocm.LimitedSupportReason{
2019

2120
// Evaluate estimates if the awsError is a cluster credentials are missing error. If it determines that it is,
2221
// the cluster is placed into limited support (if the cluster state allows it), otherwise an error is returned.
23-
func Evaluate(cluster *cmv1.Cluster, bpError error, ocmClient ocm.Client, pdClient pagerduty.Client, alertType string) error {
22+
func Investigate(r *investigation.Resources) (investigation.InvestigationResult, error) {
23+
result := investigation.InvestigationResult{}
24+
cluster := r.Cluster
25+
ocmClient := r.OcmClient
26+
pdClient := r.PdClient
27+
bpError, ok := r.AdditionalResources["error"].(error)
28+
if !ok {
29+
return result, fmt.Errorf("Missing required CCAM field 'error'")
30+
}
2431
logging.Info("Investigating possible missing cloud credentials...")
2532

2633
if customerRemovedPermissions := customerRemovedPermissions(bpError.Error()); !customerRemovedPermissions {
2734
// We aren't able to jumpRole because of an error that is different than
2835
// a removed support role/policy or removed installer role/policy
2936
// This would normally be a backplane failure.
30-
return fmt.Errorf("credentials are there, error is different: %w", bpError)
37+
return result, fmt.Errorf("credentials are there, error is different: %w", bpError)
3138
}
3239

3340
// The jumprole failed because of a missing support role/policy:
@@ -37,20 +44,21 @@ func Evaluate(cluster *cmv1.Cluster, bpError error, ocmClient ocm.Client, pdClie
3744
switch cluster.State() {
3845
case cmv1.ClusterStateReady:
3946
// Cluster is in functional sate but we can't jumprole to it: post limited support
40-
metrics.Inc(metrics.LimitedSupportSet, alertType, ccamLimitedSupport.Summary)
47+
result.LimitedSupportSet.Performed = true
48+
result.LimitedSupportSet.Labels = []string{ccamLimitedSupport.Summary}
4149
err := ocmClient.PostLimitedSupportReason(ccamLimitedSupport, cluster.ID())
4250
if err != nil {
43-
return fmt.Errorf("could not post limited support reason for %s: %w", cluster.Name(), err)
51+
return result, fmt.Errorf("could not post limited support reason for %s: %w", cluster.Name(), err)
4452
}
4553

46-
return pdClient.SilenceIncidentWithNote(fmt.Sprintf("Added the following Limited Support reason to cluster: %#v. Silencing alert.\n", ccamLimitedSupport))
54+
return result, pdClient.SilenceIncidentWithNote(fmt.Sprintf("Added the following Limited Support reason to cluster: %#v. Silencing alert.\n", ccamLimitedSupport))
4755
case cmv1.ClusterStateUninstalling:
4856
// A cluster in uninstalling state should not alert primary - we just skip this
49-
return pdClient.SilenceIncidentWithNote(fmt.Sprintf("Skipped adding limited support reason '%s': cluster is already uninstalling.", ccamLimitedSupport.Summary))
57+
return result, pdClient.SilenceIncidentWithNote(fmt.Sprintf("Skipped adding limited support reason '%s': cluster is already uninstalling.", ccamLimitedSupport.Summary))
5058
default:
5159
// Anything else is an unknown state to us and/or requires investigation.
5260
// E.g. we land here if we run into a CPD alert where credentials were removed (installing state) and don't want to put it in LS yet.
53-
return pdClient.EscalateIncidentWithNote(fmt.Sprintf("Cluster has invalid cloud credentials (support role/policy is missing) and the cluster is in state '%s'. Please investigate.", cluster.State()))
61+
return result, pdClient.EscalateIncidentWithNote(fmt.Sprintf("Cluster has invalid cloud credentials (support role/policy is missing) and the cluster is in state '%s'. Please investigate.", cluster.State()))
5462
}
5563
}
5664

pkg/investigations/ccam/ccam_test.go

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,23 @@ package ccam
33
import (
44
"errors"
55
"testing"
6+
7+
investigation "github.com/openshift/configuration-anomaly-detection/pkg/investigations"
68
)
79

810
func TestEvaluateRandomError(t *testing.T) {
911
timeoutError := errors.New("credentials are there, error is different: timeout")
10-
err := Evaluate(nil, errors.New("timeout"), nil, nil, "")
12+
input := investigation.Resources{
13+
Cluster: nil,
14+
ClusterDeployment: nil,
15+
AwsClient: nil,
16+
OcmClient: nil,
17+
PdClient: nil,
18+
AdditionalResources: map[string]interface{}{
19+
"error": errors.New("timeout"),
20+
},
21+
}
22+
_, err := Investigate(&input)
1123
if err.Error() != timeoutError.Error() {
1224
t.Fatalf("Expected error %v, but got %v", timeoutError, err)
1325
}

pkg/investigations/chgm/chgm.go

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@ import (
1010
"github.com/openshift/configuration-anomaly-detection/pkg/aws"
1111
investigation "github.com/openshift/configuration-anomaly-detection/pkg/investigations"
1212
"github.com/openshift/configuration-anomaly-detection/pkg/logging"
13-
"github.com/openshift/configuration-anomaly-detection/pkg/metrics"
1413
"github.com/openshift/configuration-anomaly-detection/pkg/networkverifier"
1514
"github.com/openshift/configuration-anomaly-detection/pkg/notewriter"
1615
"github.com/openshift/configuration-anomaly-detection/pkg/ocm"
@@ -23,8 +22,6 @@ import (
2322
)
2423

2524
var (
26-
investigationName = "ClusterHasGoneMissing"
27-
2825
chgmSL = ocm.ServiceLog{
2926
Severity: "Critical",
3027
Summary: "Action required: cluster not checking in",
@@ -40,21 +37,23 @@ var (
4037
)
4138

4239
// Investigate runs the investigation for a triggered chgm pagerduty event
43-
func Investigate(r *investigation.Resources) error {
40+
func Investigate(r *investigation.Resources) (investigation.InvestigationResult, error) {
41+
result := investigation.InvestigationResult{}
4442
notes := notewriter.New("CHGM", logging.RawLogger)
4543

4644
// 1. Check if the user stopped instances
4745
res, err := investigateStoppedInstances(r.Cluster, r.ClusterDeployment, r.AwsClient, r.OcmClient)
4846
if err != nil {
49-
return r.PdClient.EscalateIncidentWithNote(fmt.Sprintf("InvestigateInstances failed: %s\n", err.Error()))
47+
return result, r.PdClient.EscalateIncidentWithNote(fmt.Sprintf("InvestigateInstances failed: %s\n", err.Error()))
5048
}
5149
logging.Debugf("the investigation returned: [infras running: %d] - [masters running: %d]", res.RunningInstances.Infra, res.RunningInstances.Master)
5250

5351
if !res.UserAuthorized {
5452
logging.Infof("Instances were stopped by unauthorized user: %s / arn: %s", res.User.UserName, res.User.IssuerUserName)
55-
return utils.WithRetries(func() error {
53+
return result, utils.WithRetries(func() error {
5654
err := postChgmSLAndSilence(r.Cluster.ID(), r.OcmClient, r.PdClient)
57-
metrics.Inc(metrics.ServicelogSent, investigationName)
55+
// XXX: metrics.Inc(metrics.ServicelogSent, investigationName)
56+
result.ServiceLogSent = investigation.InvestigationStep{Performed: true, Labels: nil}
5857

5958
return err
6059
})
@@ -89,21 +88,23 @@ func Investigate(r *investigation.Resources) error {
8988
if strings.Contains(failureReason, "nosnch.in") {
9089
err := r.OcmClient.PostLimitedSupportReason(&egressLS, r.Cluster.ID())
9190
if err != nil {
92-
return err
91+
return result, err
9392
}
9493

95-
metrics.Inc(metrics.LimitedSupportSet, investigationName, "EgressBlocked")
94+
// XXX: metrics.Inc(metrics.LimitedSupportSet, investigationName, "EgressBlocked")
95+
result.LimitedSupportSet = investigation.InvestigationStep{Performed: true, Labels: []string{"EgressBlocked"}}
9696

9797
notes.AppendAutomation("Egress `nosnch.in` blocked, sent limited support.")
98-
return r.PdClient.SilenceIncidentWithNote(notes.String())
98+
return result, r.PdClient.SilenceIncidentWithNote(notes.String())
9999
}
100100

101101
err := r.OcmClient.PostServiceLog(r.Cluster.ID(), createEgressSL(failureReason))
102102
if err != nil {
103-
return err
103+
return result, err
104104
}
105105

106-
metrics.Inc(metrics.ServicelogSent, investigationName)
106+
// XXX: metrics.Inc(metrics.ServicelogSent, investigationName)
107+
result.ServiceLogSent = investigation.InvestigationStep{Performed: true, Labels: nil}
107108

108109
notes.AppendWarning("NetworkVerifier found unreachable targets and sent the SL, but deadmanssnitch is not blocked! \n⚠️ Please investigate this cluster.\nUnreachable: \n%s", failureReason)
109110
case networkverifier.Success:
@@ -112,7 +113,7 @@ func Investigate(r *investigation.Resources) error {
112113
}
113114

114115
// Found no issues that CAD can handle by itself - forward notes to SRE.
115-
return r.PdClient.EscalateIncidentWithNote(notes.String())
116+
return result, r.PdClient.EscalateIncidentWithNote(notes.String())
116117
}
117118

118119
// investigateHibernation checks if the cluster was recently woken up from

0 commit comments

Comments
 (0)