API Call Alerts

dtfranz · dtfranz · commit 441eca5663f9 · 2025-08-09T12:18:46.000+09:00
Do not fail the e2e run when summary generation encounters issues.

Add Prometheus alerts for excessive API calls from operator-controller and catalogd, along with matching summary graphs.

Signed-off-by: Daniel Franz <dfranz@redhat.com>
diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml
@@ -33,22 +33,7 @@ jobs:
           go-version-file: go.mod
 
       - name: Run e2e tests
-        run: ARTIFACT_PATH=/tmp/artifacts make test-e2e
-
-      - name: alerts-check
-        # Grab all current alerts, filtering out pending, and print the GH actions warning string
-        # containing the alert name and description.
-        #
-        # NOTE: Leaving this as annotating-only instead of failing the run until we have some more
-        # finely-tuned alerts.
-        run: |
-          if [[ -s /tmp/artifacts/alerts.out ]]; then \
-            jq -r 'if .state=="firing" then
-              "::error title=Prometheus Alert Firing::\(.labels.alertname): \(.annotations.description)"
-            elif .state=="pending" then
-              "::warning title=Prometheus Alert Pending::\(.labels.alertname): \(.annotations.description)"
-            end' /tmp/artifacts/alerts.out
-          fi
+        run: ARTIFACT_PATH=/tmp/artifacts E2E_SUMMARY_OUTPUT=$GITHUB_STEP_SUMMARY make test-e2e
 
       - uses: actions/upload-artifact@v4
         if: failure()
@@ -75,7 +60,7 @@ jobs:
           go-version-file: go.mod
 
       - name: Run e2e tests
-        run: ARTIFACT_PATH=/tmp/artifacts make test-experimental-e2e
+        run: ARTIFACT_PATH=/tmp/artifacts E2E_SUMMARY_OUTPUT=$GITHUB_STEP_SUMMARY make test-experimental-e2e
 
       - uses: actions/upload-artifact@v4
         if: failure()
diff --git a/config/overlays/prometheus/prometheus_rule.yaml b/config/overlays/prometheus/prometheus_rule.yaml
@@ -57,3 +57,15 @@ spec:
       keep_firing_for: 1d
       annotations:
         description: "catalogd using high cpu resources for 5 minutes: {{ $value | printf \"%.2f\" }}%"
+    - alert: operator-controller-api-call-rate
+      expr: sum(rate(rest_client_requests_total{job=~"operator-controller-service"}[5m])) > 10
+      for: 5m
+      keep_firing_for: 1d
+      annotations:
+        description: "operator-controller making excessive API calls for 5 minutes: {{ $value | printf \"%.2f\" }}/sec"
+    - alert: catalogd-api-call-rate
+      expr: sum(rate(rest_client_requests_total{job=~"catalogd-service"}[5m])) > 5
+      for: 5m
+      keep_firing_for: 1d
+      annotations:
+        description: "catalogd making excessive API calls for 5 minutes: {{ $value | printf \"%.2f\" }}/sec"
diff --git a/test/e2e/e2e_suite_test.go b/test/e2e/e2e_suite_test.go
@@ -2,6 +2,7 @@ package e2e
 
 import (
 	"context"
+	"fmt"
 	"os"
 	"testing"
 
@@ -24,7 +25,7 @@ var (
 )
 
 const (
-	testSummaryOutputEnvVar = "GITHUB_STEP_SUMMARY"
+	testSummaryOutputEnvVar = "E2E_SUMMARY_OUTPUT"
 	testCatalogRefEnvVar    = "CATALOG_IMG"
 	testCatalogName         = "test-catalog"
 	latestImageTag          = "latest"
@@ -39,8 +40,18 @@ func TestMain(m *testing.M) {
 	utilruntime.Must(err)
 
 	res := m.Run()
-	err = utils.PrintSummary(testSummaryOutputEnvVar)
-	utilruntime.Must(err)
+
+	path := os.Getenv(testSummaryOutputEnvVar)
+	if path == "" {
+		fmt.Printf("Note: E2E_SUMMARY_OUTPUT is unset; skipping summary generation")
+	} else {
+		err = utils.PrintSummary(path)
+		if err != nil {
+			// Fail the run if alerts are found
+			fmt.Printf("%s", err)
+			os.Exit(1)
+		}
+	}
 	os.Exit(res)
 }
 
diff --git a/test/utils/summary.go b/test/utils/summary.go
@@ -42,25 +42,31 @@ type xychart struct {
 }
 
 type githubSummary struct {
-	client api.Client
-	Pods   []string
+	client       api.Client
+	firingAlerts bool
+	Pods         []string
 }
 
 func NewSummary(c api.Client, pods ...string) githubSummary {
 	return githubSummary{
-		client: c,
-		Pods:   pods,
+		client:       c,
+		Pods:         pods,
+		firingAlerts: false,
 	}
 }
 
+func (s *githubSummary) FiringAlerts() bool {
+	return s.firingAlerts
+}
+
 // PerformanceQuery queries the prometheus server and generates a mermaid xychart with the data.
 // title  - Display name of the xychart
 // pod    - Pod name with which to filter results from prometheus
 // query  - Prometheus query
 // yLabel - Label of the Y axis i.e. "KB/s", "MB", etc.
 // scaler - Constant by which to scale the results. For instance, cpu usage is more human-readable
 // as "mCPU" vs "CPU", so we scale the results by a factor of 1,000.
-func (s githubSummary) PerformanceQuery(title, pod, query string, yLabel string, scaler float64) (string, error) {
+func (s githubSummary) PerformanceQuery(title, pod, query, yLabel string, scaler float64) (string, error) {
 	v1api := v1.NewAPI(s.client)
 	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
 	defer cancel()
@@ -90,8 +96,9 @@ func (s githubSummary) PerformanceQuery(title, pod, query string, yLabel string,
 	formattedData := make([]string, 0)
 	// matrix does not allow [] access, so we just do one iteration for the single result
 	for _, metric := range matrix {
-		if len(metric.Values) < 1 {
-			return "", fmt.Errorf("expected at least one data point; got: %d", len(metric.Values))
+		if len(metric.Values) < 2 {
+			// A graph with one data point means something with the collection was wrong
+			return "", fmt.Errorf("expected at least two data points; got: %d", len(metric.Values))
 		}
 		for _, sample := range metric.Values {
 			floatSample := float64(sample.Value) * scaler
@@ -136,6 +143,7 @@ func (s githubSummary) Alerts() (string, error) {
 			switch a.State {
 			case v1.AlertStateFiring:
 				firingAlerts = append(firingAlerts, aConv)
+				s.firingAlerts = true
 			case v1.AlertStatePending:
 				pendingAlerts = append(pendingAlerts, aConv)
 				// Ignore AlertStateInactive; the alerts endpoint doesn't return them
@@ -172,28 +180,35 @@ func executeTemplate(templateFile string, obj any) (string, error) {
 // The markdown is template-driven; the summary methods are called from within the
 // template. This allows us to add or change queries (hopefully) without needing to
 // touch code. The summary will be output to a file supplied by the env target.
-func PrintSummary(envTarget string) error {
+func PrintSummary(path string) error {
+	if path == "" {
+		fmt.Printf("No summary output path specified; skipping")
+		return nil
+	}
+
 	client, err := api.NewClient(api.Config{
 		Address: defaultPromUrl,
 	})
 	if err != nil {
-		fmt.Printf("Error creating prometheus client: %v\n", err)
-		os.Exit(1)
+		fmt.Printf("warning: failed to initialize promQL client: %s", err)
+		return nil
 	}
 
 	summary := NewSummary(client, "operator-controller", "catalogd")
 	summaryMarkdown, err := executeTemplate(summaryTemplate, summary)
 	if err != nil {
-		return err
+		fmt.Printf("warning: failed to generate e2e summary: %s", err)
+		return nil
 	}
-	if path := os.Getenv(envTarget); path != "" {
-		err = os.WriteFile(path, []byte(summaryMarkdown), 0o600)
-		if err != nil {
-			return err
-		}
-		fmt.Printf("Test summary output to %s successful\n", envTarget)
-	} else {
-		fmt.Printf("No summary output specified; skipping")
+	err = os.WriteFile(path, []byte(summaryMarkdown), 0o600)
+	if err != nil {
+		fmt.Printf("warning: failed to write summary output to %s: %s", path, err)
+		return nil
 	}
+	fmt.Printf("Test summary output to %s successful\n", path)
+	// TODO: uncomment when the metrics collection is proven to be stable
+	// if summary.FiringAlerts() {
+	//	return fmt.Errorf("Alert(s) encountered during test run; see summary for details")
+	//}
 	return nil
 }
diff --git a/test/utils/templates/summary.md.tmpl b/test/utils/templates/summary.md.tmpl
@@ -11,6 +11,13 @@
 
 #### CPU Usage
 {{$.PerformanceQuery "CPU Usage" $pod `rate(container_cpu_usage_seconds_total{pod=~"%s.*",container="manager"}[5m])[5m:]` "mCPU" 1000}}
+
+#### API Queries Total
+{{$.PerformanceQuery "API Queries Total" $pod `sum(rest_client_requests_total{job=~"%s.*"})[5m:]` "# queries" 1}}
+
+#### API Query Rate
+{{$.PerformanceQuery "API Queries/sec" $pod `sum(rate(rest_client_requests_total{job=~"%s.*"}[5m]))[5m:]` "per sec" 1}}
+
 {{end}}
 {{- end}}