Skip to content

Commit 5051cf6

Browse files
committed
API Call Alerts
Fails e2e run when alerts are firing. Adds prometheus alerts for excessive API calls from operator-controller or catalogd, as well as summary graphs to match. Signed-off-by: Daniel Franz <[email protected]>
1 parent 5970a0d commit 5051cf6

File tree

5 files changed

+53
-33
lines changed

5 files changed

+53
-33
lines changed

.github/workflows/e2e.yaml

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -35,21 +35,6 @@ jobs:
3535
- name: Run e2e tests
3636
run: ARTIFACT_PATH=/tmp/artifacts make test-e2e
3737

38-
- name: alerts-check
39-
# Grab all current alerts, filtering out pending, and print the GH actions warning string
40-
# containing the alert name and description.
41-
#
42-
# NOTE: Leaving this as annotating-only instead of failing the run until we have some more
43-
# finely-tuned alerts.
44-
run: |
45-
if [[ -s /tmp/artifacts/alerts.out ]]; then \
46-
jq -r 'if .state=="firing" then
47-
"::error title=Prometheus Alert Firing::\(.labels.alertname): \(.annotations.description)"
48-
elif .state=="pending" then
49-
"::warning title=Prometheus Alert Pending::\(.labels.alertname): \(.annotations.description)"
50-
end' /tmp/artifacts/alerts.out
51-
fi
52-
5338
- uses: actions/upload-artifact@v4
5439
if: failure()
5540
with:

config/overlays/prometheus/prometheus_rule.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,3 +57,15 @@ spec:
5757
keep_firing_for: 1d
5858
annotations:
5959
description: "catalogd using high cpu resources for 5 minutes: {{ $value | printf \"%.2f\" }}%"
60+
- alert: operator-controller-api-call-rate
61+
expr: sum(rate(rest_client_requests_total{job=~"operator-controller-service"}[5m])) > 10
62+
for: 5m
63+
keep_firing_for: 1d
64+
annotations:
65+
description: "operator-controller making excessive API calls for 5 minutes: {{ $value | printf \"%.2f\" }}/sec"
66+
- alert: catalogd-api-call-rate
67+
expr: sum(rate(rest_client_requests_total{job=~"catalogd-service"}[5m])) > 5
68+
for: 5m
69+
keep_firing_for: 1d
70+
annotations:
71+
description: "catalogd making excessive API calls for 5 minutes: {{ $value | printf \"%.2f\" }}/sec"

test/e2e/e2e_suite_test.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package e2e
22

33
import (
44
"context"
5+
"fmt"
56
"os"
67
"testing"
78

@@ -40,7 +41,11 @@ func TestMain(m *testing.M) {
4041

4142
res := m.Run()
4243
err = utils.PrintSummary(testSummaryOutputEnvVar)
43-
utilruntime.Must(err)
44+
if err != nil {
45+
// Fail the run if the summary could not be generated or if any alerts are firing
46+
fmt.Printf("%s", err)
47+
os.Exit(1)
48+
}
4449
os.Exit(res)
4550
}
4651

test/utils/summary.go

Lines changed: 28 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -42,25 +42,31 @@ type xychart struct {
4242
}
4343

4444
type githubSummary struct {
45-
client api.Client
46-
Pods []string
45+
client api.Client
46+
firingAlerts bool
47+
Pods []string
4748
}
4849

4950
func NewSummary(c api.Client, pods ...string) githubSummary {
5051
return githubSummary{
51-
client: c,
52-
Pods: pods,
52+
client: c,
53+
Pods: pods,
54+
firingAlerts: false,
5355
}
5456
}
5557

58+
func (s *githubSummary) FiringAlerts() bool {
59+
return s.firingAlerts
60+
}
61+
5662
// PerformanceQuery queries the prometheus server and generates a mermaid xychart with the data.
5763
// title - Display name of the xychart
5864
// pod - Pod name with which to filter results from prometheus
5965
// query - Prometheus query
6066
// yLabel - Label of the Y axis i.e. "KB/s", "MB", etc.
6167
// scaler - Constant by which to scale the results. For instance, cpu usage is more human-readable
6268
// as "mCPU" vs "CPU", so we scale the results by a factor of 1,000.
63-
func (s githubSummary) PerformanceQuery(title, pod, query string, yLabel string, scaler float64) (string, error) {
69+
func (s *githubSummary) PerformanceQuery(title, pod, query, yLabel string, scaler float64) (string, error) {
6470
v1api := v1.NewAPI(s.client)
6571
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
6672
defer cancel()
@@ -115,7 +121,7 @@ func (s githubSummary) PerformanceQuery(title, pod, query string, yLabel string,
115121

116122
// Alerts queries the prometheus server for alerts and generates markdown output for anything found.
117123
// If no alerts are found, the alerts section will contain only "None." in the final output.
118-
func (s githubSummary) Alerts() (string, error) {
124+
func (s *githubSummary) Alerts() (string, error) {
119125
v1api := v1.NewAPI(s.client)
120126
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
121127
defer cancel()
@@ -136,6 +142,7 @@ func (s githubSummary) Alerts() (string, error) {
136142
switch a.State {
137143
case v1.AlertStateFiring:
138144
firingAlerts = append(firingAlerts, aConv)
145+
s.firingAlerts = true
139146
case v1.AlertStatePending:
140147
pendingAlerts = append(pendingAlerts, aConv)
141148
// Ignore AlertStateInactive; the alerts endpoint doesn't return them
@@ -173,27 +180,31 @@ func executeTemplate(templateFile string, obj any) (string, error) {
173180
// template. This allows us to add or change queries (hopefully) without needing to
174181
// touch code. The summary will be output to a file supplied by the env target.
175182
func PrintSummary(envTarget string) error {
183+
path := os.Getenv(envTarget)
184+
if path == "" {
185+
fmt.Printf("No summary output specified; skipping")
186+
return nil
187+
}
188+
176189
client, err := api.NewClient(api.Config{
177190
Address: defaultPromUrl,
178191
})
179192
if err != nil {
180-
fmt.Printf("Error creating prometheus client: %v\n", err)
181-
os.Exit(1)
193+
return err
182194
}
183195

184196
summary := NewSummary(client, "operator-controller", "catalogd")
185-
summaryMarkdown, err := executeTemplate(summaryTemplate, summary)
197+
summaryMarkdown, err := executeTemplate(summaryTemplate, &summary)
186198
if err != nil {
187199
return err
188200
}
189-
if path := os.Getenv(envTarget); path != "" {
190-
err = os.WriteFile(path, []byte(summaryMarkdown), 0o600)
191-
if err != nil {
192-
return err
193-
}
194-
fmt.Printf("Test summary output to %s successful\n", envTarget)
195-
} else {
196-
fmt.Printf("No summary output specified; skipping")
201+
err = os.WriteFile(path, []byte(summaryMarkdown), 0o600)
202+
if err != nil {
203+
return err
204+
}
205+
fmt.Printf("Test summary output to %s successful\n", envTarget)
206+
if summary.FiringAlerts() {
207+
return fmt.Errorf("Alert(s) encountered during test run; see summary for details")
197208
}
198209
return nil
199210
}

test/utils/templates/summary.md.tmpl

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,13 @@
1111

1212
#### CPU Usage
1313
{{$.PerformanceQuery "CPU Usage" $pod `rate(container_cpu_usage_seconds_total{pod=~"%s.*",container="manager"}[5m])[5m:]` "mCPU" 1000}}
14+
15+
#### API Queries Total
16+
{{$.PerformanceQuery "API Queries Total" $pod `sum(rest_client_requests_total{job=~"%s.*"})[5m:]` "# queries" 1}}
17+
18+
#### API Query Rate
19+
{{$.PerformanceQuery "API Queries/sec" $pod `sum(rate(rest_client_requests_total{job=~"%s.*"}[5m]))[5m:]` "per sec" 1}}
20+
1421
{{end}}
1522
{{- end}}
1623

0 commit comments

Comments
 (0)