Skip to content

Commit 9375d40

Browse files
authored
Merge branch 'main' into main
2 parents fd00d08 + 241ed3b commit 9375d40

File tree

13 files changed

+130
-49
lines changed

13 files changed

+130
-49
lines changed

Makefile

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,9 @@ build: build-interceptor build-cadctl build-template-updater ## Build all subpro
2626
.PHONY: lint
2727
lint: lint-cadctl lint-interceptor lint-template-updater ## Lint all subprojects
2828

29+
.PHONY: test
30+
test: test-cadctl test-interceptor
31+
2932
##@ cadctl:
3033
.PHONY: cadctl
3134
cadctl: generate-cadctl build-cadctl test-cadctl lint-cadctl generate-template-file ## Run all targets for cadctl (generate, build, test, lint, generation)
@@ -54,7 +57,7 @@ test-cadctl: check-go121-install ## Run automated tests for cadctl
5457

5558
##@ Interceptor:
5659
.PHONY: interceptor
57-
interceptor: build-interceptor test-interceptor lint-interceptor ## Run all targets for interceptor (build, test, lint)
60+
interceptor: build-interceptor test-interceptor test-interceptor-e2e lint-interceptor ## Run all targets for interceptor (build, test, lint)
5861

5962
.PHONY: build-interceptor
6063
build-interceptor: check-go121-install ## Build the interceptor binary
@@ -69,10 +72,13 @@ lint-interceptor: install-linter ## Lint interceptor subproject
6972
cd interceptor && GOLANGCI_LINT_CACHE=$$(mktemp -d) $(GOPATH)/bin/golangci-lint run -c ../.golangci.yml
7073

7174
.PHONY: test-interceptor
72-
test-interceptor: check-go121-install check-jq-install check-vault-install build-interceptor ## Run automated tests for interceptor
75+
test-interceptor: check-go121-install check-jq-install build-interceptor ## Run unit tests for interceptor
7376
@echo
7477
@echo "Running unit tests for interceptor..."
7578
cd interceptor && go test -race -mod=readonly ./...
79+
80+
.PHONY: test-interceptor-e2e
81+
test-interceptor-e2e: check-go121-install check-jq-install check-vault-install build-interceptor ## Run e2e tests for interceptor
7682
@echo
7783
@echo "Running e2e tests for interceptor..."
7884
cd interceptor && ./test/e2e.sh

README.md

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -66,9 +66,27 @@ The required investigation is identified by CAD based on the incident and its pa
6666
As PagerDuty itself does not provide finer granularity for webhooks than service-based, CAD filters out the alerts it should investigate. For more information, please refer to https://support.pagerduty.com/docs/webhooks.
6767

6868
To add a new alert investigation:
69+
6970
- run `make bootstrap-investigation` to generate boilerplate code in `pkg/investigations` (This creates the corresponding folder & .go file, and also appends the investigation to the `availableInvestigations` interface in `registry.go`.).
7071
- if the alert is not yet routed to CAD, add a webhook to the service your alert fires on. For production, the service should also have an escalation policy that escalates to SRE on CAD automation timeout.
7172

73+
### Integrations
74+
75+
> **Note:** When writing an investigation, you can use the integrations listed below right away.
76+
They are initialized for you and passed to the investigation via investigation.Resources.
77+
78+
79+
* [AWS](https://github.com/aws/aws-sdk-go) -- Logging into the cluster, retrieving instance info and AWS CloudTrail events.
80+
- See `pkg/aws`
81+
* [PagerDuty](https://github.com/PagerDuty/go-pagerduty) -- Retrieving alert info, escalating or silencing incidents, and adding notes.
82+
- See `pkg/pagerduty`
83+
* [OCM](https://github.com/openshift-online/ocm-sdk-go) -- Retrieving cluster info, sending service logs, and managing (post, delete) limited support reasons.
84+
- See `pkg/ocm`
85+
- In case of missing permissions to query an ocm resource, add it to the Configuration-Anomaly-Detection role in uhc-account-manager
86+
* [osd-network-verifier](https://github.com/openshift/osd-network-verifier) -- Tool to verify the pre-configured networking components for ROSA and OSD CCS clusters.
87+
* [k8sclient](https://pkg.go.dev/sigs.k8s.io/controller-runtime/pkg/client) -- Interact with the cluster's kube-api
88+
- Requires RBAC definitions for your investigation to be added to `metadata.yaml`
89+
7290
## Testing locally
7391

7492
### Pre-requirements
@@ -98,13 +116,6 @@ Every alert managed by CAD corresponds to an investigation, representing the exe
98116

99117
Investigation specific documentation can be found in the according investigation folder, e.g. for [ClusterHasGoneMissing](./pkg/investigations/chgm/README.md).
100118

101-
### Integrations
102-
103-
* [AWS](https://github.com/aws/aws-sdk-go) -- Logging into the cluster, retreiving instance info and AWS CloudTrail events.
104-
* [PagerDuty](https://github.com/PagerDuty/go-pagerduty) -- Retrieving alert info, esclating or silencing incidents, and adding notes.
105-
* [OCM](https://github.com/openshift-online/ocm-sdk-go) -- Retrieving cluster info, sending service logs, and managing (post, delete) limited support reasons.
106-
* [osd-network-verifier](https://github.com/openshift/osd-network-verifier) -- Tool to verify the pre-configured networking components for ROSA and OSD CCS clusters.
107-
108119
### Templates
109120

110121
* [Update-Template](./hack/update-template/README.md) -- Updating configuration-anomaly-detection-template.Template.yaml.

cadctl/cmd/investigate/investigate.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ func run(_ *cobra.Command, _ []string) error {
122122
customerAwsClient, err := managedcloud.CreateCustomerAWSClient(cluster, ocmClient)
123123
if err != nil {
124124
ccamResources := &investigation.Resources{Name: "ccam", Cluster: cluster, ClusterDeployment: clusterDeployment, AwsClient: customerAwsClient, OcmClient: ocmClient, PdClient: pdClient, AdditionalResources: map[string]interface{}{"error": err}}
125-
inv := ccam.CCAM{}
125+
inv := ccam.Investigation{}
126126
result, err := inv.Run(ccamResources)
127127
updateMetrics(alertInvestigation.Name(), &result)
128128
return err

interceptor/README.md

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,32 @@
1-
# CAD Tekton Interceptor
1+
# CAD Tekton Interceptor
22

3-
The tekton interceptor is a component plugged between the event listener and the task runs. The interceptor makes sure we don't start a pipeline for every alert we receive. Instead, alerts are filtered based on whether or not they are handled by CAD. Unhandled alerts are directly escalated and no pipeline is started.
3+
The tekton interceptor is a component plugged between the event listener and the task runs. The interceptor makes sure we don't start a pipeline for every alert we receive. Instead, alerts are filtered based on whether or not they are handled by CAD. Unhandled alerts are directly escalated and no pipeline is started.
44

55
## Testing
66

77
### E2E
88

99
The interceptor has E2E tests starting the HTTP service and checking the HTTP responses. The tests are based on pre-existing PagerDuty alerts.
10-
```
10+
11+
``` bash
12+
1113
make test-interceptor-e2e
1214

1315
# To also print the output of the interceptor service:
1416
CAD_E2E_VERBOSE=true make test-interceptor-e2e
15-
```
17+
```
18+
19+
## Development
20+
21+
It is possible to run the interceptor locally in a "minimal" state, where E2E is not used, and only the
22+
crucial-to-run env variables (seen below) are set as placeholders. This is useful for *local* development/debugging.
23+
24+
``` bash
25+
$ make build-interceptor
26+
27+
$ export CAD_SILENT_POLICY=test
28+
$ export CAD_PD_TOKEN=test
29+
$ export PD_SIGNATURE=test
30+
31+
$ ./bin/interceptor
32+
```

interceptor/pkg/interceptor/pdinterceptor.go

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111
"os"
1212
"time"
1313

14+
"github.com/PagerDuty/go-pagerduty/webhookv3"
1415
investigations "github.com/openshift/configuration-anomaly-detection/pkg/investigations"
1516
"github.com/openshift/configuration-anomaly-detection/pkg/pagerduty"
1617
triggersv1 "github.com/tektoncd/triggers/pkg/apis/triggers/v1beta1"
@@ -39,6 +40,7 @@ func (pdi PagerDutyInterceptor) ServeHTTP(w http.ResponseWriter, r *http.Request
3940
http.Error(w, http.StatusText(http.StatusInternalServerError), http.StatusInternalServerError)
4041
}
4142
}
43+
4244
w.Header().Add("Content-Type", "application/json")
4345
if _, err := w.Write(b); err != nil {
4446
pdi.Logger.Errorf("failed to write response: %s", err)
@@ -86,7 +88,44 @@ func (pdi *PagerDutyInterceptor) executeInterceptor(r *http.Request) ([]byte, er
8688
if _, err := io.Copy(&body, r.Body); err != nil {
8789
return nil, internal(fmt.Errorf("failed to read body: %w", err))
8890
}
91+
r.Body = io.NopCloser(bytes.NewReader(body.Bytes()))
92+
93+
// originalReq is the original request that was sent to the interceptor,
94+
// due to be unwrapped into a new header and body for signature verification.
95+
var originalReq struct {
96+
Body string `json:"body"`
97+
Header map[string][]string `json:"header"`
98+
}
99+
if err := json.Unmarshal(body.Bytes(), &originalReq); err != nil {
100+
return nil, badRequest(fmt.Errorf("failed to parse request body: %w", err))
101+
}
102+
103+
extractedRequest, err := http.NewRequestWithContext(ctx, r.Method, r.URL.String(), bytes.NewReader([]byte(originalReq.Body)))
104+
if err != nil {
105+
return nil, internal(fmt.Errorf("malformed body/header in unwrapped request: %w", err))
106+
}
107+
108+
for k, v := range originalReq.Header {
109+
for _, v := range v {
110+
extractedRequest.Header.Add(k, v)
111+
}
112+
}
113+
89114
var ireq triggersv1.InterceptorRequest
115+
116+
// logging request
117+
pdi.Logger.Info("Wrapped Request header: %v", r.Header)
118+
pdi.Logger.Info("Wrapped Request body: ", body.String())
119+
pdi.Logger.Info("Unwrapped Request header: %v", extractedRequest.Header)
120+
pdi.Logger.Info("Unwrapped Request body: ", originalReq.Body)
121+
122+
token, _ := os.LookupEnv("PD_SIGNATURE")
123+
124+
err = webhookv3.VerifySignature(extractedRequest, token)
125+
if err != nil {
126+
return nil, badRequest(fmt.Errorf("failed to verify signature: %w", err))
127+
}
128+
90129
if err := json.Unmarshal(body.Bytes(), &ireq); err != nil {
91130
return nil, badRequest(fmt.Errorf("failed to parse body as InterceptorRequest: %w", err))
92131
}

interceptor/test/e2e.sh

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,11 @@ temp_log_file=$(mktemp)
1919
function test_interceptor {
2020
# Run the interceptor and print logs to temporary log file
2121
CAD_PD_TOKEN=$(echo $pd_test_token) CAD_SILENT_POLICY=$(echo $pd_test_silence_policy) ./../bin/interceptor > $temp_log_file 2>&1 &
22-
22+
PD_SIGNATURE="test"
23+
PAYLOAD="{\"body\":\"{\\\"__pd_metadata\\\":{\\\"incident\\\":{\\\"id\\\":\\\"$incident_id\\\"}}}\",\"header\":{\"Content-Type\":[\"application/json\"]},\"extensions\":{},\"interceptor_params\":{},\"context\":null}"
24+
SIGN=$(echo -n "$PAYLOAD" | sha256hmac -K $PD_SIGNATURE | tr -d "[:space:]-")
25+
echo "Sign: $SIGN"
26+
2327
# Store the PID of the interceptor process
2428
INTERCEPTOR_PID=$!
2529

@@ -32,8 +36,8 @@ function test_interceptor {
3236
# Send an interceptor request to localhost:8080
3337
# See https://pkg.go.dev/github.com/tektoncd/triggers/pkg/apis/triggers/v1alpha1#InterceptorRequest
3438
CURL_EXITCODE=0
35-
CURL_OUTPUT=$(curl -s -X POST -H "Content-Type: application/json" \
36-
-d "{\"body\":\"{\\\"__pd_metadata\\\":{\\\"incident\\\":{\\\"id\\\":\\\"$incident_id\\\"}}}\",\"header\":{\"Content-Type\":[\"application/json\"]},\"extensions\":{},\"interceptor_params\":{},\"context\":null}" \
39+
CURL_OUTPUT=$(curl -s -X POST -H "X-PagerDuty-Signature:v1=${SIGN}" -H "Content-Type: application/json" \
40+
-d "$PAYLOAD" \
3741
http://localhost:8080) || CURL_EXITCODE=$?
3842

3943
# Check if the curl output matches the expected response
@@ -69,5 +73,9 @@ echo "Test 1: alert with existing handling returns a 'continue: true' response"
6973
test_interceptor "Q12WO44XJLR3H3" "$EXPECTED_RESPONSE_CONTINUE"
7074

7175
# Test for an alert we don't handle (alert called unhandled)
72-
echo "Test 1: unhandled alerts returns a 'continue: false' response"
76+
echo "Test 2: unhandled alerts returns a 'continue: false' response"
7377
test_interceptor "Q3722KGCG12ZWD" "$EXPECTED_RESPONSE_STOP"
78+
79+
echo "Test 3: expected failure due to invalid signature"
80+
PD_SIGNATURE="invalid-signature"
81+
test_interceptor "Q12WO44XJLR3H3" "$EXPECTED_RESPONSE_STOP"

pkg/investigations/ccam/ccam.go

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ import (
1212
"github.com/openshift/configuration-anomaly-detection/pkg/ocm"
1313
)
1414

15-
type CCAM struct{}
15+
type Investigation struct{}
1616

1717
var ccamLimitedSupport = &ocm.LimitedSupportReason{
1818
Summary: "Restore missing cloud credentials",
@@ -21,14 +21,14 @@ var ccamLimitedSupport = &ocm.LimitedSupportReason{
2121

2222
// Evaluate estimates if the awsError is a cluster credentials are missing error. If it determines that it is,
2323
// the cluster is placed into limited support (if the cluster state allows it), otherwise an error is returned.
24-
func (c *CCAM) Run(r *investigation.Resources) (investigation.InvestigationResult, error) {
24+
func (c *Investigation) Run(r *investigation.Resources) (investigation.InvestigationResult, error) {
2525
result := investigation.InvestigationResult{}
2626
cluster := r.Cluster
2727
ocmClient := r.OcmClient
2828
pdClient := r.PdClient
2929
bpError, ok := r.AdditionalResources["error"].(error)
3030
if !ok {
31-
return result, fmt.Errorf("Missing required CCAM field 'error'")
31+
return result, fmt.Errorf("Missing required Investigation field 'error'")
3232
}
3333
logging.Info("Investigating possible missing cloud credentials...")
3434

@@ -64,19 +64,19 @@ func (c *CCAM) Run(r *investigation.Resources) (investigation.InvestigationResul
6464
}
6565
}
6666

67-
func (c *CCAM) Name() string {
67+
func (c *Investigation) Name() string {
6868
return "Cluster Credentials Are Missing (CCAM)"
6969
}
7070

71-
func (c *CCAM) Description() string {
71+
func (c *Investigation) Description() string {
7272
return "Detects missing cluster credentials"
7373
}
7474

75-
func (c *CCAM) ShouldInvestigateAlert(alert string) bool {
75+
func (c *Investigation) ShouldInvestigateAlert(alert string) bool {
7676
return false
7777
}
7878

79-
func (c *CCAM) IsExperimental() bool {
79+
func (c *Investigation) IsExperimental() bool {
8080
return false
8181
}
8282

pkg/investigations/ccam/ccam_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ func TestEvaluateRandomError(t *testing.T) {
2020
},
2121
}
2222

23-
inv := CCAM{}
23+
inv := Investigation{}
2424

2525
_, err := inv.Run(&input)
2626
if err.Error() != timeoutError.Error() {

pkg/investigations/chgm/chgm.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -36,10 +36,10 @@ var (
3636
}
3737
)
3838

39-
type CHGM struct{}
39+
type Investiation struct{}
4040

4141
// Run runs the investigation for a triggered chgm pagerduty event
42-
func (c *CHGM) Run(r *investigation.Resources) (investigation.InvestigationResult, error) {
42+
func (c *Investiation) Run(r *investigation.Resources) (investigation.InvestigationResult, error) {
4343
result := investigation.InvestigationResult{}
4444
notes := notewriter.New("CHGM", logging.RawLogger)
4545

@@ -118,19 +118,19 @@ func (c *CHGM) Run(r *investigation.Resources) (investigation.InvestigationResul
118118
return result, r.PdClient.EscalateIncidentWithNote(notes.String())
119119
}
120120

121-
func (c *CHGM) Name() string {
121+
func (c *Investiation) Name() string {
122122
return "Cluster Has Gone Missing (CHGM)"
123123
}
124124

125-
func (c *CHGM) Description() string {
125+
func (c *Investiation) Description() string {
126126
return "Detects reason for clusters that have gone missing"
127127
}
128128

129-
func (c *CHGM) ShouldInvestigateAlert(alert string) bool {
129+
func (c *Investiation) ShouldInvestigateAlert(alert string) bool {
130130
return strings.Contains(alert, "has gone missing")
131131
}
132132

133-
func (c *CHGM) IsExperimental() bool {
133+
func (c *Investiation) IsExperimental() bool {
134134
return false
135135
}
136136

pkg/investigations/chgm/chgm_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ var _ = Describe("chgm", func() {
9292
mockCtrl.Finish()
9393
})
9494

95-
inv := CHGM{}
95+
inv := Investiation{}
9696

9797
Describe("Triggered", func() {
9898
When("Triggered finds instances stopped by the customer", func() {

0 commit comments

Comments
 (0)