Skip to content

Commit 441eca5

Browse files
committed
API Call Alerts
Do not fail e2e run when issues with summary generation are encountered. Add prometheus alerts for excessive API calls from operator-controller or catalogd, as well as summary graphs to match. Signed-off-by: Daniel Franz <[email protected]>
1 parent a62ff79 commit 441eca5

File tree

5 files changed

+69
-39
lines changed

5 files changed

+69
-39
lines changed

.github/workflows/e2e.yaml

Lines changed: 2 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -33,22 +33,7 @@ jobs:
3333
go-version-file: go.mod
3434

3535
- name: Run e2e tests
36-
run: ARTIFACT_PATH=/tmp/artifacts make test-e2e
37-
38-
- name: alerts-check
39-
# Grab all current alerts, filtering out pending, and print the GH actions warning string
40-
# containing the alert name and description.
41-
#
42-
# NOTE: Leaving this as annotating-only instead of failing the run until we have some more
43-
# finely-tuned alerts.
44-
run: |
45-
if [[ -s /tmp/artifacts/alerts.out ]]; then \
46-
jq -r 'if .state=="firing" then
47-
"::error title=Prometheus Alert Firing::\(.labels.alertname): \(.annotations.description)"
48-
elif .state=="pending" then
49-
"::warning title=Prometheus Alert Pending::\(.labels.alertname): \(.annotations.description)"
50-
end' /tmp/artifacts/alerts.out
51-
fi
36+
run: ARTIFACT_PATH=/tmp/artifacts E2E_SUMMARY_OUTPUT=$GITHUB_STEP_SUMMARY make test-e2e
5237

5338
- uses: actions/upload-artifact@v4
5439
if: failure()
@@ -75,7 +60,7 @@ jobs:
7560
go-version-file: go.mod
7661

7762
- name: Run e2e tests
78-
run: ARTIFACT_PATH=/tmp/artifacts make test-experimental-e2e
63+
run: ARTIFACT_PATH=/tmp/artifacts E2E_SUMMARY_OUTPUT=$GITHUB_STEP_SUMMARY make test-experimental-e2e
7964

8065
- uses: actions/upload-artifact@v4
8166
if: failure()

config/overlays/prometheus/prometheus_rule.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,3 +57,15 @@ spec:
5757
keep_firing_for: 1d
5858
annotations:
5959
description: "catalogd using high cpu resources for 5 minutes: {{ $value | printf \"%.2f\" }}%"
60+
- alert: operator-controller-api-call-rate
61+
expr: sum(rate(rest_client_requests_total{job=~"operator-controller-service"}[5m])) > 10
62+
for: 5m
63+
keep_firing_for: 1d
64+
annotations:
65+
description: "operator-controller making excessive API calls for 5 minutes: {{ $value | printf \"%.2f\" }}/sec"
66+
- alert: catalogd-api-call-rate
67+
expr: sum(rate(rest_client_requests_total{job=~"catalogd-service"}[5m])) > 5
68+
for: 5m
69+
keep_firing_for: 1d
70+
annotations:
71+
description: "catalogd making excessive API calls for 5 minutes: {{ $value | printf \"%.2f\" }}/sec"

test/e2e/e2e_suite_test.go

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package e2e
22

33
import (
44
"context"
5+
"fmt"
56
"os"
67
"testing"
78

@@ -24,7 +25,7 @@ var (
2425
)
2526

2627
const (
27-
testSummaryOutputEnvVar = "GITHUB_STEP_SUMMARY"
28+
testSummaryOutputEnvVar = "E2E_SUMMARY_OUTPUT"
2829
testCatalogRefEnvVar = "CATALOG_IMG"
2930
testCatalogName = "test-catalog"
3031
latestImageTag = "latest"
@@ -39,8 +40,18 @@ func TestMain(m *testing.M) {
3940
utilruntime.Must(err)
4041

4142
res := m.Run()
42-
err = utils.PrintSummary(testSummaryOutputEnvVar)
43-
utilruntime.Must(err)
43+
44+
path := os.Getenv(testSummaryOutputEnvVar)
45+
if path == "" {
46+
fmt.Printf("Note: E2E_SUMMARY_OUTPUT is unset; skipping summary generation")
47+
} else {
48+
err = utils.PrintSummary(path)
49+
if err != nil {
50+
// Fail the run if alerts are found
51+
fmt.Printf("%s", err)
52+
os.Exit(1)
53+
}
54+
}
4455
os.Exit(res)
4556
}
4657

test/utils/summary.go

Lines changed: 34 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -42,25 +42,31 @@ type xychart struct {
4242
}
4343

4444
type githubSummary struct {
45-
client api.Client
46-
Pods []string
45+
client api.Client
46+
firingAlerts bool
47+
Pods []string
4748
}
4849

4950
func NewSummary(c api.Client, pods ...string) githubSummary {
5051
return githubSummary{
51-
client: c,
52-
Pods: pods,
52+
client: c,
53+
Pods: pods,
54+
firingAlerts: false,
5355
}
5456
}
5557

58+
func (s *githubSummary) FiringAlerts() bool {
59+
return s.firingAlerts
60+
}
61+
5662
// PerformanceQuery queries the prometheus server and generates a mermaid xychart with the data.
5763
// title - Display name of the xychart
5864
// pod - Pod name with which to filter results from prometheus
5965
// query - Prometheus query
6066
// yLabel - Label of the Y axis i.e. "KB/s", "MB", etc.
6167
// scaler - Constant by which to scale the results. For instance, cpu usage is more human-readable
6268
// as "mCPU" vs "CPU", so we scale the results by a factor of 1,000.
63-
func (s githubSummary) PerformanceQuery(title, pod, query string, yLabel string, scaler float64) (string, error) {
69+
func (s githubSummary) PerformanceQuery(title, pod, query, yLabel string, scaler float64) (string, error) {
6470
v1api := v1.NewAPI(s.client)
6571
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
6672
defer cancel()
@@ -90,8 +96,9 @@ func (s githubSummary) PerformanceQuery(title, pod, query string, yLabel string,
9096
formattedData := make([]string, 0)
9197
// matrix does not allow [] access, so we just do one iteration for the single result
9298
for _, metric := range matrix {
93-
if len(metric.Values) < 1 {
94-
return "", fmt.Errorf("expected at least one data point; got: %d", len(metric.Values))
99+
if len(metric.Values) < 2 {
100+
// A graph with only one data point means something went wrong with the metrics collection
101+
return "", fmt.Errorf("expected at least two data points; got: %d", len(metric.Values))
95102
}
96103
for _, sample := range metric.Values {
97104
floatSample := float64(sample.Value) * scaler
@@ -136,6 +143,7 @@ func (s githubSummary) Alerts() (string, error) {
136143
switch a.State {
137144
case v1.AlertStateFiring:
138145
firingAlerts = append(firingAlerts, aConv)
146+
s.firingAlerts = true
139147
case v1.AlertStatePending:
140148
pendingAlerts = append(pendingAlerts, aConv)
141149
// Ignore AlertStateInactive; the alerts endpoint doesn't return them
@@ -172,28 +180,35 @@ func executeTemplate(templateFile string, obj any) (string, error) {
172180
// The markdown is template-driven; the summary methods are called from within the
173181
// template. This allows us to add or change queries (hopefully) without needing to
174182
// touch code. The summary will be written to the file at the supplied path.
175-
func PrintSummary(envTarget string) error {
183+
func PrintSummary(path string) error {
184+
if path == "" {
185+
fmt.Printf("No summary output path specified; skipping")
186+
return nil
187+
}
188+
176189
client, err := api.NewClient(api.Config{
177190
Address: defaultPromUrl,
178191
})
179192
if err != nil {
180-
fmt.Printf("Error creating prometheus client: %v\n", err)
181-
os.Exit(1)
193+
fmt.Printf("warning: failed to initialize promQL client: %s", err)
194+
return nil
182195
}
183196

184197
summary := NewSummary(client, "operator-controller", "catalogd")
185198
summaryMarkdown, err := executeTemplate(summaryTemplate, summary)
186199
if err != nil {
187-
return err
200+
fmt.Printf("warning: failed to generate e2e summary: %s", err)
201+
return nil
188202
}
189-
if path := os.Getenv(envTarget); path != "" {
190-
err = os.WriteFile(path, []byte(summaryMarkdown), 0o600)
191-
if err != nil {
192-
return err
193-
}
194-
fmt.Printf("Test summary output to %s successful\n", envTarget)
195-
} else {
196-
fmt.Printf("No summary output specified; skipping")
203+
err = os.WriteFile(path, []byte(summaryMarkdown), 0o600)
204+
if err != nil {
205+
fmt.Printf("warning: failed to write summary output to %s: %s", path, err)
206+
return nil
197207
}
208+
fmt.Printf("Test summary output to %s successful\n", path)
209+
// TODO: uncomment when the metrics collection is proven to be stable
210+
// if summary.FiringAlerts() {
211+
// return fmt.Errorf("Alert(s) encountered during test run; see summary for details")
212+
//}
198213
return nil
199214
}

test/utils/templates/summary.md.tmpl

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,13 @@
1111

1212
#### CPU Usage
1313
{{$.PerformanceQuery "CPU Usage" $pod `rate(container_cpu_usage_seconds_total{pod=~"%s.*",container="manager"}[5m])[5m:]` "mCPU" 1000}}
14+
15+
#### API Queries Total
16+
{{$.PerformanceQuery "API Queries Total" $pod `sum(rest_client_requests_total{job=~"%s.*"})[5m:]` "# queries" 1}}
17+
18+
#### API Query Rate
19+
{{$.PerformanceQuery "API Queries/sec" $pod `sum(rate(rest_client_requests_total{job=~"%s.*"}[5m]))[5m:]` "per sec" 1}}
20+
1421
{{end}}
1522
{{- end}}
1623

0 commit comments

Comments
 (0)