Skip to content

Commit 6c168e2

Browse files
committed
API Call Alerts
Do not fail the e2e run when issues with summary generation are encountered. Add Prometheus alerts for excessive API calls from operator-controller or catalogd, as well as summary graphs to match. Signed-off-by: Daniel Franz <[email protected]>
1 parent a62ff79 commit 6c168e2

File tree

5 files changed

+54
-35
lines changed

5 files changed

+54
-35
lines changed

.github/workflows/e2e.yaml

Lines changed: 2 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -33,22 +33,7 @@ jobs:
3333
go-version-file: go.mod
3434

3535
- name: Run e2e tests
36-
run: ARTIFACT_PATH=/tmp/artifacts make test-e2e
37-
38-
- name: alerts-check
39-
# Grab all current alerts, filtering out pending, and print the GH actions warning string
40-
# containing the alert name and description.
41-
#
42-
# NOTE: Leaving this as annotating-only instead of failing the run until we have some more
43-
# finely-tuned alerts.
44-
run: |
45-
if [[ -s /tmp/artifacts/alerts.out ]]; then \
46-
jq -r 'if .state=="firing" then
47-
"::error title=Prometheus Alert Firing::\(.labels.alertname): \(.annotations.description)"
48-
elif .state=="pending" then
49-
"::warning title=Prometheus Alert Pending::\(.labels.alertname): \(.annotations.description)"
50-
end' /tmp/artifacts/alerts.out
51-
fi
36+
run: ARTIFACT_PATH=/tmp/artifacts E2E_SUMMARY_OUTPUT=$GITHUB_STEP_SUMMARY make test-e2e
5237

5338
- uses: actions/upload-artifact@v4
5439
if: failure()
@@ -75,7 +60,7 @@ jobs:
7560
go-version-file: go.mod
7661

7762
- name: Run e2e tests
78-
run: ARTIFACT_PATH=/tmp/artifacts make test-experimental-e2e
63+
run: ARTIFACT_PATH=/tmp/artifacts E2E_SUMMARY_OUTPUT=$GITHUB_STEP_SUMMARY make test-experimental-e2e
7964

8065
- uses: actions/upload-artifact@v4
8166
if: failure()

config/overlays/prometheus/prometheus_rule.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,3 +57,15 @@ spec:
5757
keep_firing_for: 1d
5858
annotations:
5959
description: "catalogd using high cpu resources for 5 minutes: {{ $value | printf \"%.2f\" }}%"
60+
- alert: operator-controller-api-call-rate
61+
expr: sum(rate(rest_client_requests_total{job=~"operator-controller-service"}[5m])) > 10
62+
for: 5m
63+
keep_firing_for: 1d
64+
annotations:
65+
description: "operator-controller making excessive API calls for 5 minutes: {{ $value | printf \"%.2f\" }}/sec"
66+
- alert: catalogd-api-call-rate
67+
expr: sum(rate(rest_client_requests_total{job=~"catalogd-service"}[5m])) > 5
68+
for: 5m
69+
keep_firing_for: 1d
70+
annotations:
71+
description: "catalogd making excessive API calls for 5 minutes: {{ $value | printf \"%.2f\" }}/sec"

test/e2e/e2e_suite_test.go

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package e2e
22

33
import (
44
"context"
5+
"fmt"
56
"os"
67
"testing"
78

@@ -24,7 +25,7 @@ var (
2425
)
2526

2627
const (
27-
testSummaryOutputEnvVar = "GITHUB_STEP_SUMMARY"
28+
testSummaryOutputEnvVar = "E2E_SUMMARY_OUTPUT"
2829
testCatalogRefEnvVar = "CATALOG_IMG"
2930
testCatalogName = "test-catalog"
3031
latestImageTag = "latest"
@@ -39,8 +40,18 @@ func TestMain(m *testing.M) {
3940
utilruntime.Must(err)
4041

4142
res := m.Run()
42-
err = utils.PrintSummary(testSummaryOutputEnvVar)
43-
utilruntime.Must(err)
43+
44+
path := os.Getenv(testSummaryOutputEnvVar)
45+
if path == "" {
46+
fmt.Printf("Note: E2E_SUMMARY_OUTPUT is unset; skipping summary generation")
47+
} else {
48+
err = utils.PrintSummary(path)
49+
if err != nil {
50+
// Fail the run if alerts are found
51+
fmt.Printf("%s", err)
52+
os.Exit(1)
53+
}
54+
}
4455
os.Exit(res)
4556
}
4657

test/utils/summary.go

Lines changed: 19 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ func NewSummary(c api.Client, pods ...string) githubSummary {
6060
// yLabel - Label of the Y axis i.e. "KB/s", "MB", etc.
6161
// scaler - Constant by which to scale the results. For instance, cpu usage is more human-readable
6262
// as "mCPU" vs "CPU", so we scale the results by a factor of 1,000.
63-
func (s githubSummary) PerformanceQuery(title, pod, query string, yLabel string, scaler float64) (string, error) {
63+
func (s githubSummary) PerformanceQuery(title, pod, query, yLabel string, scaler float64) (string, error) {
6464
v1api := v1.NewAPI(s.client)
6565
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
6666
defer cancel()
@@ -90,8 +90,9 @@ func (s githubSummary) PerformanceQuery(title, pod, query string, yLabel string,
9090
formattedData := make([]string, 0)
9191
// matrix does not allow [] access, so we just do one iteration for the single result
9292
for _, metric := range matrix {
93-
if len(metric.Values) < 1 {
94-
return "", fmt.Errorf("expected at least one data point; got: %d", len(metric.Values))
93+
if len(metric.Values) < 2 {
94+
// A graph with only one data point means something went wrong with the collection
95+
return "", fmt.Errorf("expected at least two data points; got: %d", len(metric.Values))
9596
}
9697
for _, sample := range metric.Values {
9798
floatSample := float64(sample.Value) * scaler
@@ -172,28 +173,31 @@ func executeTemplate(templateFile string, obj any) (string, error) {
172173
// The markdown is template-driven; the summary methods are called from within the
173174
// template. This allows us to add or change queries (hopefully) without needing to
174175
// touch code. The summary will be output to a file supplied by the env target.
175-
func PrintSummary(envTarget string) error {
176+
func PrintSummary(path string) error {
177+
if path == "" {
178+
fmt.Printf("No summary output path specified; skipping")
179+
return nil
180+
}
181+
176182
client, err := api.NewClient(api.Config{
177183
Address: defaultPromUrl,
178184
})
179185
if err != nil {
180-
fmt.Printf("Error creating prometheus client: %v\n", err)
181-
os.Exit(1)
186+
fmt.Printf("warning: failed to initialize promQL client: %s", err)
187+
return nil
182188
}
183189

184190
summary := NewSummary(client, "operator-controller", "catalogd")
185191
summaryMarkdown, err := executeTemplate(summaryTemplate, summary)
186192
if err != nil {
187-
return err
193+
fmt.Printf("warning: failed to generate e2e summary: %s", err)
194+
return nil
188195
}
189-
if path := os.Getenv(envTarget); path != "" {
190-
err = os.WriteFile(path, []byte(summaryMarkdown), 0o600)
191-
if err != nil {
192-
return err
193-
}
194-
fmt.Printf("Test summary output to %s successful\n", envTarget)
195-
} else {
196-
fmt.Printf("No summary output specified; skipping")
196+
err = os.WriteFile(path, []byte(summaryMarkdown), 0o600)
197+
if err != nil {
198+
fmt.Printf("warning: failed to write summary output to %s: %s", path, err)
199+
return nil
197200
}
201+
fmt.Printf("Test summary output to %s successful\n", path)
198202
return nil
199203
}

test/utils/templates/summary.md.tmpl

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,13 @@
1111

1212
#### CPU Usage
1313
{{$.PerformanceQuery "CPU Usage" $pod `rate(container_cpu_usage_seconds_total{pod=~"%s.*",container="manager"}[5m])[5m:]` "mCPU" 1000}}
14+
15+
#### API Queries Total
16+
{{$.PerformanceQuery "API Queries Total" $pod `sum(rest_client_requests_total{job=~"%s.*"})[5m:]` "# queries" 1}}
17+
18+
#### API Query Rate
19+
{{$.PerformanceQuery "API Queries/sec" $pod `sum(rate(rest_client_requests_total{job=~"%s.*"}[5m]))[5m:]` "per sec" 1}}
20+
1421
{{end}}
1522
{{- end}}
1623

0 commit comments

Comments
 (0)