Skip to content

Commit d2329ef

Browse files
authored
Merge branch 'main' into OSD-28718-loglevel-config
2 parents 90cdbc9 + 7b85acb commit d2329ef

File tree

12 files changed

+198
-39
lines changed

12 files changed

+198
-39
lines changed

Makefile

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,11 @@ test-interceptor-e2e: check-go121-install check-jq-install check-vault-install b
8383
@echo "Running e2e tests for interceptor..."
8484
cd interceptor && ./test/e2e.sh
8585

86+
##@ Boilerplate:
87+
# Declare the real target name as phony so make always runs the recipe, even if
# a file or directory called "bootstrap-investigation" exists in the repo root.
.PHONY: bootstrap-investigation
88+
bootstrap-investigation: ## Bootstrap a new boilerplate investigation
89+
@cd hack && ./bootstrap-investigation.sh
90+
8691
##@ Template-updater:
8792
.PHONY: template-updater
8893
template-updater: build-template-updater lint-template-updater ## Run all targets for template-updater

README.md

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,8 +66,26 @@ The required investigation is identified by CAD based on the incident and its pa
6666
As PagerDuty itself does not provide finer granularity for webhooks than service-based, CAD filters out the alerts it should investigate. For more information, please refer to https://support.pagerduty.com/docs/webhooks.
6767

6868
To add a new alert investigation:
69-
- create a mapping for the alert to the `GetInvestigation` function in `mapping.go` and write a corresponding CAD investigation (e.g. `Investigate()` in `chgm.go`).
70-
- if the alert is not yet routed to CAD, add a webhook to the service your alert fires on. For production, the service should also have an escalation policy that escalates to SRE on CAD automation timeout.
69+
70+
- run `make bootstrap-investigation` to generate boilerplate code in `pkg/investigations`. This creates the corresponding folder with a `.go` file, `metadata.yaml`, and `README.md`, and also appends the investigation to the `availableInvestigations` slice in `registry.go`.
71+
- `investigation.Resources` contains initialized clients for the cluster's AWS environment, OCM, and more. See [Integrations](#integrations).
72+
73+
### Integrations
74+
75+
> **Note:** When writing an investigation, you can use these integrations right away.
76+
They are initialized for you and passed to the investigation via investigation.Resources.
77+
78+
79+
* [AWS](https://github.com/aws/aws-sdk-go) -- Logging into the cluster, retrieving instance info and AWS CloudTrail events.
80+
- See `pkg/aws`
81+
* [PagerDuty](https://github.com/PagerDuty/go-pagerduty) -- Retrieving alert info, escalating or silencing incidents, and adding notes.
82+
- See `pkg/pagerduty`
83+
* [OCM](https://github.com/openshift-online/ocm-sdk-go) -- Retrieving cluster info, sending service logs, and managing (post, delete) limited support reasons.
84+
- See `pkg/ocm`
85+
- In case of missing permissions to query an ocm resource, add it to the Configuration-Anomaly-Detection role in uhc-account-manager
86+
* [osd-network-verifier](https://github.com/openshift/osd-network-verifier) -- Tool to verify the pre-configured networking components for ROSA and OSD CCS clusters.
87+
* [k8sclient](https://pkg.go.dev/sigs.k8s.io/controller-runtime/pkg/client) -- Interact with the cluster's kube-api
88+
- Requires RBAC definitions for your investigation to be added to `metadata.yaml`
7189

7290
## Testing locally
7391

cadctl/cmd/investigate/investigate.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ func run(cmd *cobra.Command, _ []string) error {
132132
customerAwsClient, err := managedcloud.CreateCustomerAWSClient(cluster, ocmClient)
133133
if err != nil {
134134
ccamResources := &investigation.Resources{Name: "ccam", Cluster: cluster, ClusterDeployment: clusterDeployment, AwsClient: customerAwsClient, OcmClient: ocmClient, PdClient: pdClient, AdditionalResources: map[string]interface{}{"error": err}}
135-
inv := ccam.CCAM{}
135+
inv := ccam.Investigation{}
136136
result, err := inv.Run(ccamResources)
137137
updateMetrics(alertInvestigation.Name(), &result)
138138
return err

hack/bootstrap-investigation.sh

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
#!/usr/bin/env bash
#
# bootstrap-investigation.sh -- interactively scaffold a new CAD investigation.
#
# Prompts for a package name, a description, and an optional alert substring,
# then:
#   1. creates pkg/investigations/<name>/ containing <name>.go, metadata.yaml
#      and README.md
#   2. registers the new investigation in pkg/investigations/registry.go
#
# Usually invoked through `make bootstrap-investigation`.

set -euo pipefail

# Every path below is relative to hack/. Anchor the working directory to the
# script's own location so it behaves the same no matter where it is run from.
cd "$(dirname "${BASH_SOURCE[0]}")"

REGISTRY_FILE="../pkg/investigations/registry.go"

# Escapes backslashes and double quotes in $1 so the value can be embedded in a
# double-quoted Go string literal without breaking the generated source.
go_string_escape() {
    printf '%s' "$1" | sed -e 's/\\/\\\\/g' -e 's/"/\\"/g'
}

read -r -p "Enter the new investigation (package) name: " INVESTIGATION_NAME
if [[ -z "${INVESTIGATION_NAME}" ]]; then
    echo "Investigation name cannot be empty."
    exit 1
elif [[ ! "${INVESTIGATION_NAME}" =~ ^[a-zA-Z_][a-zA-Z0-9_]*$ ]]; then
    # The name becomes a Go package name, so it has to be a valid identifier.
    echo "Investigation name may only contain letters, digits and underscores, and must not start with a digit."
    exit 1
fi

read -r -p "Enter new investigation description: " INVESTIGATION_DESCRIPTION
if [[ -z "${INVESTIGATION_DESCRIPTION}" ]]; then
    INVESTIGATION_DESCRIPTION="TODO"
fi
INVESTIGATION_DESCRIPTION_GO="$(go_string_escape "${INVESTIGATION_DESCRIPTION}")"

read -r -p "Should Investigate Alert (y/n): " INVESTIGATE_ALERT_BOOL
case "${INVESTIGATE_ALERT_BOOL}" in
    y)
        read -r -p "Investigation alert string: " INVESTIGATION_ALERT_STRING
        INVESTIGATION_ALERT_STRING_GO="$(go_string_escape "${INVESTIGATION_ALERT_STRING}")"
        INVESTIGATION_ALERT="strings.Contains(alert, \"${INVESTIGATION_ALERT_STRING_GO}\")"
        # Only this branch references the strings package. Go rejects unused
        # imports, so the import is emitted conditionally.
        GO_STDLIB_IMPORTS=$'\t"strings"\n\n'
        ;;
    n)
        INVESTIGATION_ALERT="false"
        GO_STDLIB_IMPORTS=""
        ;;
    *)
        echo "Invalid input. Please enter 'y' or 'n'."
        exit 1
        ;;
esac

# Go package names are lower case by convention.
INVESTIGATION_NAME=$(echo "${INVESTIGATION_NAME}" | tr '[:upper:]' '[:lower:]')

INVESTIGATION_DIR="../pkg/investigations/${INVESTIGATION_NAME}"

# Refuse to overwrite an existing investigation.
if [ -d "${INVESTIGATION_DIR}" ]; then
    echo "Investigation of name ${INVESTIGATION_NAME} already exists."
    exit 1
fi

mkdir -p "${INVESTIGATION_DIR}"

# Create README.md file
cat <<EOF > "${INVESTIGATION_DIR}/README.md"
# ${INVESTIGATION_NAME} Investigation

${INVESTIGATION_DESCRIPTION}

EOF

# Create metadata.yaml file
cat <<EOF > "${INVESTIGATION_DIR}/metadata.yaml"
name: ${INVESTIGATION_NAME}
rbac:
  roles: []
  clusterRoleRules: []
customerDataAccess: false

EOF

# Create boilerplate investigation file
cat <<EOF > "${INVESTIGATION_DIR}/${INVESTIGATION_NAME}.go"
// Package ${INVESTIGATION_NAME} contains...TODO
package ${INVESTIGATION_NAME}

import (
${GO_STDLIB_IMPORTS}	"github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation"
	"github.com/openshift/configuration-anomaly-detection/pkg/logging"
	"github.com/openshift/configuration-anomaly-detection/pkg/notewriter"
)

type Investigation struct{}

func (c *Investigation) Run(r *investigation.Resources) (investigation.InvestigationResult, error) {
	result := investigation.InvestigationResult{}

	// Initialize PagerDuty note writer
	notes := notewriter.New(r.Name, logging.RawLogger)

	// TODO: Implement investigation logic here

	return result, r.PdClient.EscalateIncidentWithNote(notes.String())
}

func (c *Investigation) Name() string {
	return "${INVESTIGATION_NAME}"
}

func (c *Investigation) Description() string {
	return "${INVESTIGATION_DESCRIPTION_GO}"
}

func (c *Investigation) ShouldInvestigateAlert(alert string) bool {
	return ${INVESTIGATION_ALERT}
}

func (c *Investigation) IsExperimental() bool {
	// TODO: Update to false when graduating to production.
	return true
}
EOF

echo "${INVESTIGATION_NAME} created in ${INVESTIGATION_DIR}"
echo "metadata.yaml file created in ${INVESTIGATION_DIR}"

# Update registry.go to contain new investigation.
# Match on the quoted import path rather than the bare name, so that a name
# which is a substring of an existing investigation is not mistaken for one
# that is already registered.
# NOTE: `sed -i` without a backup suffix and the one-line `a` form are GNU sed
# syntax; BSD/macOS sed needs `-i ''`.
if ! grep -q "pkg/investigations/${INVESTIGATION_NAME}\"" "${REGISTRY_FILE}"; then
    sed -i "/import (/a \\\t\"github.com/openshift/configuration-anomaly-detection/pkg/investigations/${INVESTIGATION_NAME}\"" "${REGISTRY_FILE}"
    sed -i "/var availableInvestigations = \[/a \\\t&${INVESTIGATION_NAME}.Investigation{}," "${REGISTRY_FILE}"
    echo "${INVESTIGATION_NAME} added to registry.go"
else
    echo "${INVESTIGATION_NAME} already exists in registry.go"
fi

interceptor/README.md

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,32 @@
1-
# CAD Tekton Interceptor
1+
# CAD Tekton Interceptor
22

3-
The tekton interceptor is a component plugged between the event listener and the task runs. The interceptor makes sure we don't start a pipeline for every alert we receive. Instead, alerts are filtered based on whether or not they are handled by CAD. Unhandled alerts are directly escalated and no pipeline is started.
3+
The tekton interceptor is a component plugged between the event listener and the task runs. The interceptor makes sure we don't start a pipeline for every alert we receive. Instead, alerts are filtered based on whether or not they are handled by CAD. Unhandled alerts are directly escalated and no pipeline is started.
44

55
## Testing
66

77
### E2E
88

99
The interceptor has E2E tests starting the HTTP service and checking the HTTP responses. The tests are based on pre-existing PagerDuty alerts.
10-
```
10+
11+
``` bash
12+
1113
make test-interceptor-e2e
1214

1315
# To also print the output of the interceptor service:
1416
CAD_E2E_VERBOSE=true make test-interceptor
15-
```
17+
```
18+
19+
## Development
20+
21+
It is possible to run the interceptor locally in a "minimal" state, where E2E is not used, and only the
22+
crucial-to-run env variables (seen below) are set as placeholders. This is useful for *local* development/debugging.
23+
24+
``` bash
25+
$ make build-interceptor
26+
27+
$ export CAD_SILENT_POLICY=test
28+
$ export CAD_PD_TOKEN=test
29+
$ export PD_SIGNATURE=test
30+
31+
$ ./bin/interceptor
32+
```

pkg/investigations/ccam/ccam.go

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ import (
1212
"github.com/openshift/configuration-anomaly-detection/pkg/ocm"
1313
)
1414

15-
type CCAM struct{}
15+
type Investigation struct{}
1616

1717
var ccamLimitedSupport = &ocm.LimitedSupportReason{
1818
Summary: "Restore missing cloud credentials",
@@ -21,14 +21,14 @@ var ccamLimitedSupport = &ocm.LimitedSupportReason{
2121

2222
// Evaluate estimates if the awsError is a cluster credentials are missing error. If it determines that it is,
2323
// the cluster is placed into limited support (if the cluster state allows it), otherwise an error is returned.
24-
func (c *CCAM) Run(r *investigation.Resources) (investigation.InvestigationResult, error) {
24+
func (c *Investigation) Run(r *investigation.Resources) (investigation.InvestigationResult, error) {
2525
result := investigation.InvestigationResult{}
2626
cluster := r.Cluster
2727
ocmClient := r.OcmClient
2828
pdClient := r.PdClient
2929
bpError, ok := r.AdditionalResources["error"].(error)
3030
if !ok {
31-
return result, fmt.Errorf("Missing required CCAM field 'error'")
31+
return result, fmt.Errorf("Missing required Investigation field 'error'")
3232
}
3333
logging.Info("Investigating possible missing cloud credentials...")
3434

@@ -64,19 +64,19 @@ func (c *CCAM) Run(r *investigation.Resources) (investigation.InvestigationResul
6464
}
6565
}
6666

67-
func (c *CCAM) Name() string {
67+
func (c *Investigation) Name() string {
6868
return "Cluster Credentials Are Missing (CCAM)"
6969
}
7070

71-
func (c *CCAM) Description() string {
71+
func (c *Investigation) Description() string {
7272
return "Detects missing cluster credentials"
7373
}
7474

75-
func (c *CCAM) ShouldInvestigateAlert(alert string) bool {
75+
func (c *Investigation) ShouldInvestigateAlert(alert string) bool {
7676
return false
7777
}
7878

79-
func (c *CCAM) IsExperimental() bool {
79+
func (c *Investigation) IsExperimental() bool {
8080
return false
8181
}
8282

pkg/investigations/ccam/ccam_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ func TestEvaluateRandomError(t *testing.T) {
2020
},
2121
}
2222

23-
inv := CCAM{}
23+
inv := Investigation{}
2424

2525
_, err := inv.Run(&input)
2626
if err.Error() != timeoutError.Error() {

pkg/investigations/chgm/chgm.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -36,10 +36,10 @@ var (
3636
}
3737
)
3838

39-
type CHGM struct{}
39+
type Investiation struct{}
4040

4141
// Run runs the investigation for a triggered chgm pagerduty event
42-
func (c *CHGM) Run(r *investigation.Resources) (investigation.InvestigationResult, error) {
42+
func (c *Investiation) Run(r *investigation.Resources) (investigation.InvestigationResult, error) {
4343
result := investigation.InvestigationResult{}
4444
notes := notewriter.New("CHGM", logging.RawLogger)
4545

@@ -118,19 +118,19 @@ func (c *CHGM) Run(r *investigation.Resources) (investigation.InvestigationResul
118118
return result, r.PdClient.EscalateIncidentWithNote(notes.String())
119119
}
120120

121-
func (c *CHGM) Name() string {
121+
func (c *Investiation) Name() string {
122122
return "Cluster Has Gone Missing (CHGM)"
123123
}
124124

125-
func (c *CHGM) Description() string {
125+
func (c *Investiation) Description() string {
126126
return "Detects reason for clusters that have gone missing"
127127
}
128128

129-
func (c *CHGM) ShouldInvestigateAlert(alert string) bool {
129+
func (c *Investiation) ShouldInvestigateAlert(alert string) bool {
130130
return strings.Contains(alert, "has gone missing")
131131
}
132132

133-
func (c *CHGM) IsExperimental() bool {
133+
func (c *Investiation) IsExperimental() bool {
134134
return false
135135
}
136136

pkg/investigations/chgm/chgm_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ var _ = Describe("chgm", func() {
9292
mockCtrl.Finish()
9393
})
9494

95-
inv := CHGM{}
95+
inv := Investiation{}
9696

9797
Describe("Triggered", func() {
9898
When("Triggered finds instances stopped by the customer", func() {

pkg/investigations/clustermonitoringerrorbudgetburn/clustermonitoringerrorbudgetburn.go

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,9 @@ var uwmMisconfiguredSL = ocm.ServiceLog{
2525
InternalOnly: false,
2626
}
2727

28-
type CMEBB struct{}
28+
type Investigation struct{}
2929

30-
func (c *CMEBB) Run(r *investigation.Resources) (investigation.InvestigationResult, error) {
30+
func (c *Investigation) Run(r *investigation.Resources) (investigation.InvestigationResult, error) {
3131
// Initialize k8s client
3232
// This would be better suited to be passend in with the investigation resources
3333
// In turn we would need to split out ccam and k8sclient, as those are tied to a cluster
@@ -84,20 +84,20 @@ func (c *CMEBB) Run(r *investigation.Resources) (investigation.InvestigationResu
8484
return result, r.PdClient.EscalateIncidentWithNote(notes.String())
8585
}
8686

87-
func (c *CMEBB) Name() string {
87+
func (c *Investigation) Name() string {
8888
return "clustermonitoringerrorbudgetburn"
8989
}
9090

91-
func (c *CMEBB) Description() string {
91+
func (c *Investigation) Description() string {
9292
return "Investigate the cluster monitoring error budget burn alert"
9393
}
9494

95-
func (c *CMEBB) ShouldInvestigateAlert(alert string) bool {
95+
func (c *Investigation) ShouldInvestigateAlert(alert string) bool {
9696
return strings.Contains(alert, "ClusterMonitoringErrorBudgetBurnSRE")
9797
}
9898

99-
func (c *CMEBB) IsExperimental() bool {
100-
return true
99+
func (c *Investigation) IsExperimental() bool {
100+
return false
101101
}
102102

103103
// Check if the `Available` status condition reports a broken UWM config

0 commit comments

Comments
 (0)