Skip to content

Commit 106e6fa

Browse files
Merge pull request #354 from petrkotas/add-interface
Add investigation interface
2 parents b1e891f + e020c6a commit 106e6fa

File tree

12 files changed

+169
-176
lines changed

12 files changed

+169
-176
lines changed

cadctl/cmd/investigate/investigate.go

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,9 @@ import (
2222
"path/filepath"
2323

2424
cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1"
25-
investigation "github.com/openshift/configuration-anomaly-detection/pkg/investigations"
25+
investigations "github.com/openshift/configuration-anomaly-detection/pkg/investigations"
2626
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/ccam"
27-
investigation_mapping "github.com/openshift/configuration-anomaly-detection/pkg/investigations/mapping"
27+
investigation "github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation"
2828
"github.com/openshift/configuration-anomaly-detection/pkg/logging"
2929
"github.com/openshift/configuration-anomaly-detection/pkg/managedcloud"
3030
"github.com/openshift/configuration-anomaly-detection/pkg/metrics"
@@ -71,7 +71,8 @@ func run(_ *cobra.Command, _ []string) error {
7171

7272
logging.Infof("Incident link: %s", pdClient.GetIncidentRef())
7373

74-
alertInvestigation := investigation_mapping.GetInvestigation(pdClient.GetTitle())
74+
_, cadExperimentalEnabled := os.LookupEnv("CAD_EXPERIMENTAL_ENABLED")
75+
alertInvestigation := investigations.GetInvestigation(pdClient.GetTitle(), cadExperimentalEnabled)
7576

7677
// Escalate all unsupported alerts
7778
if alertInvestigation == nil {
@@ -82,7 +83,7 @@ func run(_ *cobra.Command, _ []string) error {
8283
return nil
8384
}
8485

85-
metrics.Inc(metrics.Alerts, alertInvestigation.Name)
86+
metrics.Inc(metrics.Alerts, alertInvestigation.Name())
8687

8788
// clusterID can end up being either the internal or the external ID.
8889
// We don't really care, as we only use this to initialize the cluster object,
@@ -121,16 +122,17 @@ func run(_ *cobra.Command, _ []string) error {
121122
customerAwsClient, err := managedcloud.CreateCustomerAWSClient(cluster, ocmClient)
122123
if err != nil {
123124
ccamResources := &investigation.Resources{Name: "ccam", Cluster: cluster, ClusterDeployment: clusterDeployment, AwsClient: customerAwsClient, OcmClient: ocmClient, PdClient: pdClient, AdditionalResources: map[string]interface{}{"error": err}}
124-
result, err := ccam.Investigate(ccamResources)
125-
updateMetrics(alertInvestigation.Name, &result)
125+
inv := ccam.CCAM{}
126+
result, err := inv.Run(ccamResources)
127+
updateMetrics(alertInvestigation.Name(), &result)
126128
return err
127129
}
128130

129-
investigationResources := &investigation.Resources{Name: alertInvestigation.Name, Cluster: cluster, ClusterDeployment: clusterDeployment, AwsClient: customerAwsClient, OcmClient: ocmClient, PdClient: pdClient}
131+
investigationResources := &investigation.Resources{Name: alertInvestigation.Name(), Cluster: cluster, ClusterDeployment: clusterDeployment, AwsClient: customerAwsClient, OcmClient: ocmClient, PdClient: pdClient}
130132

131133
// Name is a method on the investigation; call it so the log shows the name rather than a func value.
logging.Infof("Starting investigation for %s", alertInvestigation.Name())
132134
result, err := alertInvestigation.Run(investigationResources)
133-
updateMetrics(alertInvestigation.Name, &result)
135+
updateMetrics(alertInvestigation.Name(), &result)
134136
return err
135137
}
136138

interceptor/pkg/interceptor/pdinterceptor.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,10 @@ import (
88
"fmt"
99
"io"
1010
"net/http"
11+
"os"
1112
"time"
1213

13-
investigation_mapping "github.com/openshift/configuration-anomaly-detection/pkg/investigations/mapping"
14+
investigations "github.com/openshift/configuration-anomaly-detection/pkg/investigations"
1415
"github.com/openshift/configuration-anomaly-detection/pkg/pagerduty"
1516
triggersv1 "github.com/tektoncd/triggers/pkg/apis/triggers/v1beta1"
1617
"github.com/tektoncd/triggers/pkg/interceptors"
@@ -106,7 +107,8 @@ func (pdi *PagerDutyInterceptor) Process(ctx context.Context, r *triggersv1.Inte
106107
return interceptors.Failf(codes.InvalidArgument, "could not initialize pagerduty client: %v", err)
107108
}
108109

109-
investigation := investigation_mapping.GetInvestigation(pdClient.GetTitle())
110+
_, cadExperimentalEnabled := os.LookupEnv("CAD_EXPERIMENTAL_ENABLED")
111+
investigation := investigations.GetInvestigation(pdClient.GetTitle(), cadExperimentalEnabled)
110112
// If the alert is not in the whitelist, return `Continue: false` as interceptor response
111113
// and escalate the alert to SRE
112114
if investigation == nil {

pkg/investigations/ccam/ccam.go

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,19 +7,21 @@ import (
77
"regexp"
88

99
cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1"
10-
investigation "github.com/openshift/configuration-anomaly-detection/pkg/investigations"
10+
investigation "github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation"
1111
"github.com/openshift/configuration-anomaly-detection/pkg/logging"
1212
"github.com/openshift/configuration-anomaly-detection/pkg/ocm"
1313
)
1414

15+
type CCAM struct{}
16+
1517
var ccamLimitedSupport = &ocm.LimitedSupportReason{
1618
Summary: "Restore missing cloud credentials",
1719
Details: "Your cluster requires you to take action because Red Hat is not able to access the infrastructure with the provided credentials. Please restore the credentials and permissions provided during install",
1820
}
1921

2022
// Evaluate estimates whether the awsError is a "cluster credentials are missing" error. If it determines that it is,
2123
// the cluster is placed into limited support (if the cluster state allows it), otherwise an error is returned.
22-
func Investigate(r *investigation.Resources) (investigation.InvestigationResult, error) {
24+
func (c *CCAM) Run(r *investigation.Resources) (investigation.InvestigationResult, error) {
2325
result := investigation.InvestigationResult{}
2426
cluster := r.Cluster
2527
ocmClient := r.OcmClient
@@ -62,6 +64,22 @@ func Investigate(r *investigation.Resources) (investigation.InvestigationResult,
6264
}
6365
}
6466

67+
func (c *CCAM) Name() string {
68+
return "Cluster Credentials Are Missing (CCAM)"
69+
}
70+
71+
func (c *CCAM) Description() string {
72+
return "Detects missing cluster credentials"
73+
}
74+
75+
func (c *CCAM) ShouldInvestigateAlert(alert string) bool {
76+
return false
77+
}
78+
79+
func (c *CCAM) IsExperimental() bool {
80+
return false
81+
}
82+
6583
// userCausedErrors contains the list of backplane returned error strings that we map to
6684
// customer modifications/role deletions.
6785
var userCausedErrors = []string{

pkg/investigations/ccam/ccam_test.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ import (
44
"errors"
55
"testing"
66

7-
investigation "github.com/openshift/configuration-anomaly-detection/pkg/investigations"
7+
investigation "github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation"
88
)
99

1010
func TestEvaluateRandomError(t *testing.T) {
@@ -19,7 +19,10 @@ func TestEvaluateRandomError(t *testing.T) {
1919
"error": errors.New("timeout"),
2020
},
2121
}
22-
_, err := Investigate(&input)
22+
23+
inv := CCAM{}
24+
25+
_, err := inv.Run(&input)
2326
if err.Error() != timeoutError.Error() {
2427
t.Fatalf("Expected error %v, but got %v", timeoutError, err)
2528
}

pkg/investigations/chgm/chgm.go

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ import (
88
"time"
99

1010
"github.com/openshift/configuration-anomaly-detection/pkg/aws"
11-
investigation "github.com/openshift/configuration-anomaly-detection/pkg/investigations"
11+
investigation "github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation"
1212
"github.com/openshift/configuration-anomaly-detection/pkg/logging"
1313
"github.com/openshift/configuration-anomaly-detection/pkg/networkverifier"
1414
"github.com/openshift/configuration-anomaly-detection/pkg/notewriter"
@@ -36,8 +36,10 @@ var (
3636
}
3737
)
3838

39-
// Investigate runs the investigation for a triggered chgm pagerduty event
40-
func Investigate(r *investigation.Resources) (investigation.InvestigationResult, error) {
39+
type CHGM struct{}
40+
41+
// Run runs the investigation for a triggered chgm pagerduty event
42+
func (c *CHGM) Run(r *investigation.Resources) (investigation.InvestigationResult, error) {
4143
result := investigation.InvestigationResult{}
4244
notes := notewriter.New("CHGM", logging.RawLogger)
4345

@@ -116,6 +118,22 @@ func Investigate(r *investigation.Resources) (investigation.InvestigationResult,
116118
return result, r.PdClient.EscalateIncidentWithNote(notes.String())
117119
}
118120

121+
func (c *CHGM) Name() string {
122+
return "Cluster Has Gone Missing (CHGM)"
123+
}
124+
125+
func (c *CHGM) Description() string {
126+
return "Detects reason for clusters that have gone missing"
127+
}
128+
129+
func (c *CHGM) ShouldInvestigateAlert(alert string) bool {
130+
return strings.Contains(alert, "has gone missing")
131+
}
132+
133+
func (c *CHGM) IsExperimental() bool {
134+
return false
135+
}
136+
119137
// investigateHibernation checks if the cluster was recently woken up from
120138
// hibernation. If clusters are hibernated for more than 30 days, the internal
121139
// certificates of the kubelets can expire and CSRs need to be approved

0 commit comments

Comments
 (0)