diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml index 97e0a2181..8e7d8d511 100644 --- a/.github/workflows/e2e.yaml +++ b/.github/workflows/e2e.yaml @@ -33,22 +33,7 @@ jobs: go-version-file: go.mod - name: Run e2e tests - run: ARTIFACT_PATH=/tmp/artifacts make test-e2e - - - name: alerts-check - # Grab all current alerts, filtering out pending, and print the GH actions warning string - # containing the alert name and description. - # - # NOTE: Leaving this as annotating-only instead of failing the run until we have some more - # finely-tuned alerts. - run: | - if [[ -s /tmp/artifacts/alerts.out ]]; then \ - jq -r 'if .state=="firing" then - "::error title=Prometheus Alert Firing::\(.labels.alertname): \(.annotations.description)" - elif .state=="pending" then - "::warning title=Prometheus Alert Pending::\(.labels.alertname): \(.annotations.description)" - end' /tmp/artifacts/alerts.out - fi + run: ARTIFACT_PATH=/tmp/artifacts E2E_SUMMARY_OUTPUT=$GITHUB_STEP_SUMMARY make test-e2e - uses: actions/upload-artifact@v4 if: failure() @@ -75,7 +60,7 @@ jobs: go-version-file: go.mod - name: Run e2e tests - run: ARTIFACT_PATH=/tmp/artifacts make test-experimental-e2e + run: ARTIFACT_PATH=/tmp/artifacts E2E_SUMMARY_OUTPUT=$GITHUB_STEP_SUMMARY make test-experimental-e2e - uses: actions/upload-artifact@v4 if: failure() diff --git a/config/overlays/prometheus/prometheus_rule.yaml b/config/overlays/prometheus/prometheus_rule.yaml index 5bd7e120b..b7e3fcdaf 100644 --- a/config/overlays/prometheus/prometheus_rule.yaml +++ b/config/overlays/prometheus/prometheus_rule.yaml @@ -57,3 +57,15 @@ spec: keep_firing_for: 1d annotations: description: "catalogd using high cpu resources for 5 minutes: {{ $value | printf \"%.2f\" }}%" + - alert: operator-controller-api-call-rate + expr: sum(rate(rest_client_requests_total{job=~"operator-controller-service"}[5m])) > 10 + for: 5m + keep_firing_for: 1d + annotations: + description: "operator-controller making excessive API calls for 5 minutes: {{ $value | printf \"%.2f\" }}/sec" + - alert: catalogd-api-call-rate + expr: sum(rate(rest_client_requests_total{job=~"catalogd-service"}[5m])) > 5 + for: 5m + keep_firing_for: 1d + annotations: + description: "catalogd making excessive API calls for 5 minutes: {{ $value | printf \"%.2f\" }}/sec" diff --git a/test/e2e/e2e_suite_test.go b/test/e2e/e2e_suite_test.go index dabfb48ca..7b0a369c3 100644 --- a/test/e2e/e2e_suite_test.go +++ b/test/e2e/e2e_suite_test.go @@ -2,6 +2,7 @@ package e2e import ( "context" + "fmt" "os" "testing" @@ -24,7 +25,7 @@ var ( ) const ( - testSummaryOutputEnvVar = "GITHUB_STEP_SUMMARY" + testSummaryOutputEnvVar = "E2E_SUMMARY_OUTPUT" testCatalogRefEnvVar = "CATALOG_IMG" testCatalogName = "test-catalog" latestImageTag = "latest" @@ -39,8 +40,18 @@ func TestMain(m *testing.M) { utilruntime.Must(err) res := m.Run() - err = utils.PrintSummary(testSummaryOutputEnvVar) - utilruntime.Must(err) + + path := os.Getenv(testSummaryOutputEnvVar) + if path == "" { + fmt.Printf("Note: E2E_SUMMARY_OUTPUT is unset; skipping summary generation") + } else { + err = utils.PrintSummary(path) + if err != nil { + // Fail the run if alerts are found + fmt.Printf("%s", err) + os.Exit(1) + } + } os.Exit(res) } diff --git a/test/utils/summary.go b/test/utils/summary.go index d91ae3239..276546592 100644 --- a/test/utils/summary.go +++ b/test/utils/summary.go @@ -60,7 +60,7 @@ func NewSummary(c api.Client, pods ...string) githubSummary { // yLabel - Label of the Y axis i.e. "KB/s", "MB", etc. // scaler - Constant by which to scale the results. For instance, cpu usage is more human-readable // as "mCPU" vs "CPU", so we scale the results by a factor of 1,000. -func (s githubSummary) PerformanceQuery(title, pod, query string, yLabel string, scaler float64) (string, error) { +func (s githubSummary) PerformanceQuery(title, pod, query, yLabel string, scaler float64) (string, error) { v1api := v1.NewAPI(s.client) ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() @@ -90,8 +90,9 @@ func (s githubSummary) PerformanceQuery(title, pod, query string, yLabel string, formattedData := make([]string, 0) // matrix does not allow [] access, so we just do one iteration for the single result for _, metric := range matrix { - if len(metric.Values) < 1 { - return "", fmt.Errorf("expected at least one data point; got: %d", len(metric.Values)) + if len(metric.Values) < 2 { + // A graph with one data point means something with the collection was wrong + return "", fmt.Errorf("expected at least two data points; got: %d", len(metric.Values)) } for _, sample := range metric.Values { floatSample := float64(sample.Value) * scaler @@ -172,28 +173,31 @@ func executeTemplate(templateFile string, obj any) (string, error) { // The markdown is template-driven; the summary methods are called from within the // template. This allows us to add or change queries (hopefully) without needing to // touch code. The summary will be output to a file supplied by the env target. -func PrintSummary(envTarget string) error { +func PrintSummary(path string) error { + if path == "" { + fmt.Printf("No summary output path specified; skipping") + return nil + } + client, err := api.NewClient(api.Config{ Address: defaultPromUrl, }) if err != nil { - fmt.Printf("Error creating prometheus client: %v\n", err) - os.Exit(1) + fmt.Printf("warning: failed to initialize promQL client: %s", err) + return nil } summary := NewSummary(client, "operator-controller", "catalogd") summaryMarkdown, err := executeTemplate(summaryTemplate, summary) if err != nil { - return err + fmt.Printf("warning: failed to generate e2e summary: %s", err) + return nil } - if path := os.Getenv(envTarget); path != "" { - err = os.WriteFile(path, []byte(summaryMarkdown), 0o600) - if err != nil { - return err - } - fmt.Printf("Test summary output to %s successful\n", envTarget) - } else { - fmt.Printf("No summary output specified; skipping") + err = os.WriteFile(path, []byte(summaryMarkdown), 0o600) + if err != nil { + fmt.Printf("warning: failed to write summary output to %s: %s", path, err) + return nil } + fmt.Printf("Test summary output to %s successful\n", path) return nil } diff --git a/test/utils/templates/summary.md.tmpl b/test/utils/templates/summary.md.tmpl index c094d49f3..b1372b874 100644 --- a/test/utils/templates/summary.md.tmpl +++ b/test/utils/templates/summary.md.tmpl @@ -11,6 +11,13 @@ #### CPU Usage {{$.PerformanceQuery "CPU Usage" $pod `rate(container_cpu_usage_seconds_total{pod=~"%s.*",container="manager"}[5m])[5m:]` "mCPU" 1000}} + +#### API Queries Total +{{$.PerformanceQuery "API Queries Total" $pod `sum(rest_client_requests_total{job=~"%s.*"})[5m:]` "# queries" 1}} + +#### API Query Rate +{{$.PerformanceQuery "API Queries/sec" $pod `sum(rate(rest_client_requests_total{job=~"%s.*"}[5m]))[5m:]` "per sec" 1}} + {{end}} {{- end}}