Skip to content

Commit f5b8467

Browse files
Merge pull request #454 from Nikokolas3270/SREP-207
SREP-207: The interceptor is now producing metrics which can will be used to alert is something is wrong
2 parents cef76e8 + 6016652 commit f5b8467

File tree

5 files changed

+144
-61
lines changed

5 files changed

+144
-61
lines changed

interceptor/go.mod

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,11 @@ toolchain go1.23.9
77
require (
88
github.com/PagerDuty/go-pagerduty v1.8.0
99
github.com/openshift/configuration-anomaly-detection v0.0.0-00010101000000-000000000000
10+
github.com/prometheus/client_golang v1.22.0
1011
github.com/tektoncd/triggers v0.27.0
1112
google.golang.org/grpc v1.70.0
1213
knative.dev/pkg v0.0.0-20241026180704-25f6002b00f3
14+
sigs.k8s.io/controller-runtime v0.19.3
1315
)
1416

1517
require (
@@ -226,7 +228,6 @@ require (
226228
github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c // indirect
227229
github.com/pkg/errors v0.9.1 // indirect
228230
github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 // indirect
229-
github.com/prometheus/client_golang v1.22.0 // indirect
230231
github.com/prometheus/client_model v0.6.1 // indirect
231232
github.com/prometheus/common v0.63.0 // indirect
232233
github.com/prometheus/procfs v0.15.1 // indirect
@@ -306,7 +307,6 @@ require (
306307
k8s.io/kubectl v0.32.2 // indirect
307308
k8s.io/utils v0.0.0-20250502105355-0f33e8f1c979 // indirect
308309
oras.land/oras-go v1.2.5 // indirect
309-
sigs.k8s.io/controller-runtime v0.19.3 // indirect
310310
sigs.k8s.io/gateway-api v1.2.1 // indirect
311311
sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 // indirect
312312
sigs.k8s.io/kustomize/api v0.18.0 // indirect

interceptor/main.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,9 @@ import (
1212

1313
"github.com/openshift/configuration-anomaly-detection/interceptor/pkg/interceptor"
1414
"github.com/openshift/configuration-anomaly-detection/pkg/logging"
15+
"github.com/prometheus/client_golang/prometheus/promhttp"
1516
"knative.dev/pkg/signals"
17+
"sigs.k8s.io/controller-runtime/pkg/metrics"
1618
)
1719

1820
const (
@@ -28,10 +30,12 @@ func main() {
2830
// set up signals so we handle the first shutdown signal gracefully
2931
ctx := signals.NewContext()
3032

31-
service := interceptor.PagerDutyInterceptor{}
33+
stats := interceptor.CreateInterceptorStats()
3234
mux := http.NewServeMux()
33-
mux.Handle("/", service)
35+
mux.Handle("/", interceptor.CreateInterceptorHandler(stats))
3436
mux.HandleFunc("/ready", readinessHandler)
37+
interceptor.CreateAndRegisterMetricsCollector(stats)
38+
mux.Handle("/metrics", promhttp.HandlerFor(metrics.Registry, promhttp.HandlerOpts{Registry: metrics.Registry}))
3539

3640
srv := &http.Server{
3741
Addr: fmt.Sprintf(":%d", HTTPPort),
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
package interceptor
2+
3+
import (
4+
"strconv"
5+
6+
"sigs.k8s.io/controller-runtime/pkg/metrics"
7+
8+
"github.com/prometheus/client_golang/prometheus"
9+
)
10+
11+
const (
12+
requestsCountMetricName = "cad_interceptor_requests_total"
13+
requestsCountMetricHelp = "Number of times CAD interceptor has been called (through a PagerDuty webhook, normally)"
14+
15+
errorsCountMetricName = "cad_interceptor_errors_total"
16+
errorsCountMetricHelp = "Number of times CAD interceptor has been failed to process a request"
17+
)
18+
19+
var (
20+
requestsCountMetricDesc = prometheus.NewDesc(
21+
requestsCountMetricName,
22+
requestsCountMetricHelp,
23+
nil, nil)
24+
25+
errorsCountMetricDesc = prometheus.NewDesc(
26+
errorsCountMetricName,
27+
errorsCountMetricHelp,
28+
[]string{"error_code", "reason"}, nil)
29+
)
30+
31+
type interceptorMetricsCollector struct {
32+
stats *InterceptorStats
33+
}
34+
35+
func CreateAndRegisterMetricsCollector(stats *InterceptorStats) {
36+
metrics.Registry.MustRegister(&interceptorMetricsCollector{stats})
37+
}
38+
39+
func (c *interceptorMetricsCollector) Describe(ch chan<- *prometheus.Desc) {
40+
prometheus.DescribeByCollect(c, ch)
41+
}
42+
43+
func (c *interceptorMetricsCollector) Collect(ch chan<- prometheus.Metric) {
44+
ch <- prometheus.MustNewConstMetric(
45+
requestsCountMetricDesc,
46+
prometheus.CounterValue,
47+
float64(c.stats.RequestsCount),
48+
)
49+
50+
for codeWithReason, errorsCount := range c.stats.CodeWithReasonToErrorsCount {
51+
ch <- prometheus.MustNewConstMetric(
52+
errorsCountMetricDesc,
53+
prometheus.CounterValue,
54+
float64(errorsCount),
55+
strconv.Itoa(codeWithReason.ErrorCode),
56+
codeWithReason.Reason,
57+
)
58+
}
59+
}

interceptor/pkg/interceptor/pdinterceptor.go

Lines changed: 48 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -24,19 +24,35 @@ import (
2424
// ErrInvalidContentType is returned when the content-type is not a JSON body.
2525
var ErrInvalidContentType = errors.New("form parameter encoding not supported, please change the hook to send JSON payloads")
2626

27-
type PagerDutyInterceptor struct{}
27+
type ErrorCodeWithReason struct {
28+
ErrorCode int
29+
Reason string
30+
}
2831

29-
func (pdi PagerDutyInterceptor) ServeHTTP(w http.ResponseWriter, r *http.Request) {
30-
b, err := pdi.executeInterceptor(r)
31-
if err != nil {
32-
var e Error
33-
if errors.As(err, &e) {
34-
logging.Infof("HTTP %d - %s", e.Status(), e)
35-
http.Error(w, e.Error(), e.Status())
36-
} else {
37-
logging.Errorf("Non Status Error: %s", err)
38-
http.Error(w, http.StatusText(http.StatusInternalServerError), http.StatusInternalServerError)
39-
}
32+
type InterceptorStats struct {
33+
RequestsCount uint64
34+
CodeWithReasonToErrorsCount map[ErrorCodeWithReason]int
35+
}
36+
37+
func CreateInterceptorStats() *InterceptorStats {
38+
return &InterceptorStats{CodeWithReasonToErrorsCount: make(map[ErrorCodeWithReason]int)}
39+
}
40+
41+
type interceptorHandler struct {
42+
stats *InterceptorStats
43+
}
44+
45+
func CreateInterceptorHandler(stats *InterceptorStats) http.Handler {
46+
return &interceptorHandler{stats}
47+
}
48+
49+
func (pdi interceptorHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
50+
pdi.stats.RequestsCount++
51+
52+
b, httpErr := pdi.executeInterceptor(r)
53+
if httpErr != nil {
54+
logging.Infof("HTTP %d - %s", httpErr.code, httpErr.err)
55+
http.Error(w, httpErr.err.Error(), httpErr.code)
4056
}
4157

4258
w.Header().Add("Content-Type", "application/json")
@@ -45,46 +61,35 @@ func (pdi PagerDutyInterceptor) ServeHTTP(w http.ResponseWriter, r *http.Request
4561
}
4662
}
4763

48-
// Error represents a handler error. It provides methods for a HTTP status
49-
// code and embeds the built-in error interface.
50-
type Error interface {
51-
error
52-
Status() int
64+
// httpError represents an error with an associated HTTP status code.
65+
type httpError struct {
66+
code int
67+
err error
5368
}
5469

55-
// HTTPError represents an error with an associated HTTP status code.
56-
type HTTPError struct {
57-
Code int
58-
Err error
59-
}
60-
61-
// Allows HTTPError to satisfy the error interface.
62-
func (se HTTPError) Error() string {
63-
return se.Err.Error()
64-
}
70+
func (pdi *interceptorHandler) httpError(errorCode int, reason string, err error) *httpError {
71+
pdi.stats.CodeWithReasonToErrorsCount[ErrorCodeWithReason{errorCode, reason}]++
6572

66-
// Returns our HTTP status code.
67-
func (se HTTPError) Status() int {
68-
return se.Code
73+
return &httpError{code: errorCode, err: fmt.Errorf("%s: %w", reason, err)}
6974
}
7075

71-
func badRequest(err error) HTTPError {
72-
return HTTPError{Code: http.StatusBadRequest, Err: err}
76+
func (pdi *interceptorHandler) badRequest(reason string, err error) *httpError {
77+
return pdi.httpError(http.StatusBadRequest, reason, err)
7378
}
7479

75-
func internal(err error) HTTPError {
76-
return HTTPError{Code: http.StatusInternalServerError, Err: err}
80+
func (pdi *interceptorHandler) internal(reason string, err error) *httpError {
81+
return pdi.httpError(http.StatusInternalServerError, reason, err)
7782
}
7883

79-
func (pdi *PagerDutyInterceptor) executeInterceptor(r *http.Request) ([]byte, error) {
84+
func (pdi *interceptorHandler) executeInterceptor(r *http.Request) ([]byte, *httpError) {
8085
// Create a context
8186
ctx, cancel := context.WithTimeout(r.Context(), 3*time.Second)
8287
defer cancel()
8388

8489
var body bytes.Buffer
8590
defer r.Body.Close() //nolint:errcheck
8691
if _, err := io.Copy(&body, r.Body); err != nil {
87-
return nil, internal(fmt.Errorf("failed to read body: %w", err))
92+
return nil, pdi.internal("failed to read body", err)
8893
}
8994
r.Body = io.NopCloser(bytes.NewReader(body.Bytes()))
9095

@@ -95,12 +100,12 @@ func (pdi *PagerDutyInterceptor) executeInterceptor(r *http.Request) ([]byte, er
95100
Header map[string][]string `json:"header"`
96101
}
97102
if err := json.Unmarshal(body.Bytes(), &originalReq); err != nil {
98-
return nil, badRequest(fmt.Errorf("failed to parse request body: %w", err))
103+
return nil, pdi.badRequest("failed to parse body", err)
99104
}
100105

101106
extractedRequest, err := http.NewRequestWithContext(ctx, r.Method, r.URL.String(), bytes.NewReader([]byte(originalReq.Body)))
102107
if err != nil {
103-
return nil, internal(fmt.Errorf("malformed body/header in unwrapped request: %w", err))
108+
return nil, pdi.internal("malformed body/header in unwrapped request", err)
104109
}
105110

106111
for k, v := range originalReq.Header {
@@ -117,26 +122,26 @@ func (pdi *PagerDutyInterceptor) executeInterceptor(r *http.Request) ([]byte, er
117122

118123
err = webhookv3.VerifySignature(extractedRequest, token)
119124
if err != nil {
120-
return nil, badRequest(fmt.Errorf("failed to verify signature: %w", err))
125+
return nil, pdi.badRequest("failed to verify signature", err)
121126
}
122127

123128
logging.Info("Signature verified successfully")
124129

125130
if err := json.Unmarshal(body.Bytes(), &ireq); err != nil {
126-
return nil, badRequest(fmt.Errorf("failed to parse body as InterceptorRequest: %w", err))
131+
return nil, pdi.badRequest("failed to parse body as InterceptorRequest", err)
127132
}
128133
logging.Debugf("Interceptor request body is: %s", ireq.Body)
129134

130-
iresp := pdi.Process(ctx, &ireq)
135+
iresp := pdi.process(ctx, &ireq)
131136
logging.Debugf("Interceptor response is: %+v", iresp)
132137
respBytes, err := json.Marshal(iresp)
133138
if err != nil {
134-
return nil, internal(err)
139+
return nil, pdi.internal("failed to encode response", err)
135140
}
136141
return respBytes, nil
137142
}
138143

139-
func (pdi *PagerDutyInterceptor) Process(ctx context.Context, r *triggersv1.InterceptorRequest) *triggersv1.InterceptorResponse {
144+
func (pdi *interceptorHandler) process(ctx context.Context, r *triggersv1.InterceptorRequest) *triggersv1.InterceptorResponse {
140145
pdClient, err := pagerduty.GetPDClient([]byte(r.Body))
141146
if err != nil {
142147
return interceptors.Failf(codes.InvalidArgument, "could not initialize pagerduty client: %v", err)

interceptor/test/e2e.sh

Lines changed: 29 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@ function test_interceptor {
2020

2121
local incident_id=$1
2222
local expected_response=$2
23-
local override_signature=$3
23+
local expected_metrics=$3
24+
local override_signature=$4
2425

2526
# Run the interceptor and print logs to temporary log file
2627
export PD_SIGNATURE="test"
@@ -52,26 +53,40 @@ function test_interceptor {
5253
-d "$WRAPPED_PAYLOAD" \
5354
http://localhost:8080) || CURL_EXITCODE=$?
5455

55-
# Check if the curl output matches the expected response
56-
if [[ "$CURL_OUTPUT" == "$expected_response" ]] && [[ "$CURL_EXITCODE" == "0" ]]; then
57-
echo -e "${GREEN}Test passed for incident ID $incident_id: Response is as expected.${NC}"
56+
local return_code=0
5857

59-
# Shut down the interceptor
60-
kill $INTERCEPTOR_PID
61-
else
58+
# Check if the curl output differs from the expected response
59+
if [[ "$CURL_OUTPUT" != "$expected_response" ]] || [[ "$CURL_EXITCODE" != "0" ]]; then
6260
echo -e "${RED}Test failed for incident ID $incident_id: Unexpected response.${NC}"
6361
echo -e "${RED}Expected: $expected_response${NC}"
6462
echo -e "${RED}Got: $CURL_OUTPUT${NC}"
6563
echo -e "${RED}Exit code: $CURL_EXITCODE${NC}"
6664
echo -e ""
6765
echo -e "Interceptor logs"
6866
cat $temp_log_file
67+
return_code=1
68+
else
69+
curl_metrics_exitcode=0
70+
curl_metrics_output=$(curl -s http://localhost:8080/metrics | grep '^cad_interceptor_') || curl_metrics_exitcode=$?
71+
72+
if [[ "$curl_metrics_output" != "$expected_metrics" ]] || [[ "$curl_metrics_exitcode" != "0" ]]; then
73+
echo -e "${RED}Test failed for incident ID $incident_id: Unexpected metrics.${NC}"
74+
echo -e "${RED}Expected: $expected_metrics${NC}"
75+
echo -e "${RED}Got: $curl_metrics_output${NC}"
76+
echo -e "${RED}Exit code: $curl_metrics_exitcode${NC}"
77+
echo -e ""
78+
echo -e "Interceptor logs"
79+
cat $temp_log_file
80+
return_code=1
81+
else
82+
echo -e "${GREEN}Test passed for incident ID $incident_id: Response and metrics are as expected.${NC}"
83+
fi
84+
fi
6985

70-
# Shut down the interceptor
71-
kill $INTERCEPTOR_PID
86+
# Shut down the interceptor
87+
kill $INTERCEPTOR_PID
7288

73-
return 1
74-
fi
89+
return $return_code
7590
}
7691

7792
# Expected outputs
@@ -83,13 +98,13 @@ EXPECTED_RESPONSE_SIGNATURE_ERROR='failed to verify signature: invalid webhook s
8398
echo "========= TESTS ============="
8499
# Test for a pre-existing alert we handle (ClusterProvisioningDelay)
85100
echo "Test 1: alert with existing handling returns a 'continue: true' response"
86-
test_interceptor "Q12WO44XJLR3H3" "$EXPECTED_RESPONSE_CONTINUE"
101+
test_interceptor "Q12WO44XJLR3H3" "$EXPECTED_RESPONSE_CONTINUE" "cad_interceptor_requests_total 1"
87102

88103
# Test for an alert we don't handle (alert called unhandled)
89104
echo "Test 2: unhandled alerts returns a 'continue: false' response"
90-
test_interceptor "Q3722KGCG12ZWD" "$EXPECTED_RESPONSE_STOP"
105+
test_interceptor "Q3722KGCG12ZWD" "$EXPECTED_RESPONSE_STOP" "cad_interceptor_requests_total 1"
91106

92107
# Test for an alert with invalid signature
93108
echo "Test 3: expected failure due to invalid signature"
94109
PD_SIGNATURE="invalid-signature"
95-
test_interceptor "Q12WO44XJLR3H3" "$EXPECTED_RESPONSE_SIGNATURE_ERROR" "invalid-signature"
110+
test_interceptor "Q12WO44XJLR3H3" "$EXPECTED_RESPONSE_SIGNATURE_ERROR" 'cad_interceptor_errors_total{error_code="400",reason="failed to verify signature"} 1'$'\n''cad_interceptor_requests_total 1' "invalid-signature"

0 commit comments

Comments
 (0)