Merge pull request #19 from bleu/jefferson/cow-598-13-alerting-rules

lgahdl · web-flow · commit 5184210f4388 · 2026-02-25T17:04:59.000-03:00
Alerting rules
diff --git a/configs/dashboards/performance.json b/configs/dashboards/performance.json
@@ -1,6 +1,31 @@
 {
   "annotations": {
-    "list": []
+    "list": [
+      {
+        "builtIn": 1,
+        "datasource": {
+          "type": "grafana",
+          "uid": "-- Grafana --"
+        },
+        "enable": true,
+        "hide": true,
+        "iconColor": "rgba(0, 211, 255, 1)",
+        "name": "Annotations & Alerts",
+        "type": "dashboard"
+      },
+      {
+        "datasource": {
+          "type": "prometheus",
+          "uid": "prometheus"
+        },
+        "enable": true,
+        "expr": "ALERTS{alertstate=\"firing\", component=\"cow-performance-testing\"}",
+        "iconColor": "red",
+        "name": "Firing Alerts",
+        "tagKeys": "alertname,severity",
+        "titleFormat": "{{ alertname }}"
+      }
+    ]
   },
   "editable": true,
   "fiscalYearStartMonth": 0,
diff --git a/configs/prometheus.yml b/configs/prometheus.yml
@@ -70,11 +70,12 @@ scrape_configs:
     # Fail gracefully if exporter not running
     scrape_timeout: 5s
 
-# Optional: Add alerting rules
-# rule_files:
-#   - "/etc/prometheus/alerts/*.yml"
+# Alert rule files
+rule_files:
+  - "/etc/prometheus/alerts/*.yml"
 
-# Optional: Configure Alertmanager
+# Note: Alertmanager not configured - alerts visible in Prometheus UI and Grafana only
+# To enable Alertmanager notifications, uncomment below and add alertmanager service:
 # alerting:
 #   alertmanagers:
 #     - static_configs:
diff --git a/configs/prometheus/alerts/performance-testing.yml b/configs/prometheus/alerts/performance-testing.yml
@@ -0,0 +1,184 @@
+# =============================================================================
+# CoW Performance Testing Suite - Prometheus Alert Rules
+# =============================================================================
+#
+# This file defines alerting rules for the CoW Performance Testing Suite.
+# Alerts are evaluated by Prometheus and can be viewed in the Prometheus UI
+# or visualized in Grafana dashboards.
+#
+# =============================================================================
+# ALERT PARAMETERS - Reference summary only; to change a value, edit the matching rule's expr/for below
+# =============================================================================
+#
+# TODO(COW-617): Move these thresholds to configurable TOML/env variables
+#
+# LATENCY THRESHOLDS (seconds):
+#   submission_latency_warning_threshold: 5      # P95 > 5s triggers warning
+#   submission_latency_critical_threshold: 10    # P95 > 10s triggers critical
+#
+# ERROR RATE THRESHOLDS (decimal, where 0.05 = 5%):
+#   error_rate_critical_threshold: 0.05          # > 5% error rate
+#
+# THROUGHPUT THRESHOLDS (ratio, where 0.8 = 80%):
+#   throughput_low_threshold: 0.8                # < 80% of target rate
+#
+# RESOURCE THRESHOLDS (percentage):
+#   cpu_warning_threshold: 80                    # CPU > 80%
+#   memory_critical_threshold: 95                # Memory > 95%
+#
+# ALERT DURATIONS (prevents flapping):
+#   latency_warning_for: 2m
+#   latency_critical_for: 1m
+#   error_rate_for: 1m
+#   throughput_for: 2m
+#   cpu_for: 5m
+#   memory_for: 2m
+#   test_stalled_for: 1m
+#
+# =============================================================================
+
+groups:
+  - name: cow_performance_testing
+    # Evaluation interval inherited from global config (5s)
+    rules:
+      # =========================================================================
+      # LATENCY ALERTS
+      # =========================================================================
+
+      # High Submission Latency (Warning)
+      # Fires when per-scenario P95 submission latency (1m rate window) stays above 5s for 2m
+      - alert: HighSubmissionLatency
+        expr: |
+          histogram_quantile(0.95,
+            sum(rate(cow_perf_submission_latency_seconds_bucket[1m])) by (le, scenario)
+          ) > 5
+        for: 2m
+        labels:
+          severity: warning
+          component: cow-performance-testing
+          category: latency
+        annotations:
+          summary: "High submission latency detected"
+          description: "P95 submission latency is {{ $value | printf \"%.2f\" }}s (threshold: 5s) for scenario {{ $labels.scenario }}"
+          runbook: "Check API logs, verify network connectivity, review recent code changes"
+
+      # Critical Submission Latency (Critical)
+      # Fires when per-scenario P95 submission latency (1m rate window) stays above 10s for 1m
+      - alert: CriticalSubmissionLatency
+        expr: |
+          histogram_quantile(0.95,
+            sum(rate(cow_perf_submission_latency_seconds_bucket[1m])) by (le, scenario)
+          ) > 10
+        for: 1m
+        labels:
+          severity: critical
+          component: cow-performance-testing
+          category: latency
+        annotations:
+          summary: "Critical submission latency - immediate attention required"
+          description: "P95 submission latency is {{ $value | printf \"%.2f\" }}s (threshold: 10s) for scenario {{ $labels.scenario }}"
+          runbook: "Immediate action: Check API health, container resources, database connections"
+
+      # =========================================================================
+      # ERROR RATE ALERTS
+      # =========================================================================
+
+      # High Error Rate (Critical)
+      # Fires when the per-scenario failed/submitted order rate (5m windows) stays above 0.05 for 1m
+      - alert: HighErrorRate
+        expr: |
+          (
+            sum(rate(cow_perf_orders_failed_total[5m])) by (scenario)
+            /
+            sum(rate(cow_perf_orders_submitted_total[5m])) by (scenario)
+          ) > 0.05
+        for: 1m
+        labels:
+          severity: critical
+          component: cow-performance-testing
+          category: errors
+        annotations:
+          summary: "High error rate detected"
+          description: "Error rate is {{ $value | humanizePercentage }} (threshold: 5%) for scenario {{ $labels.scenario }}"
+          runbook: "Check order validation errors, API error responses, contract state"
+
+      # =========================================================================
+      # THROUGHPUT ALERTS
+      # =========================================================================
+
+      # Low Throughput (Warning)
+      # Fires when actual/target rate stays below 0.8 for 2m; the `and` clause skips series whose target is 0
+      - alert: LowThroughput
+        expr: |
+          (
+            cow_perf_actual_rate
+            /
+            cow_perf_target_rate
+          ) < 0.8
+          and cow_perf_target_rate > 0
+        for: 2m
+        labels:
+          severity: warning
+          component: cow-performance-testing
+          category: throughput
+        annotations:
+          summary: "Low throughput - not meeting target rate"
+          description: "Actual throughput is {{ $value | humanizePercentage }} of target for scenario {{ $labels.scenario }}"
+          runbook: "Check for bottlenecks: API rate limits, network latency, resource constraints"
+
+      # =========================================================================
+      # TEST EXECUTION ALERTS
+      # =========================================================================
+
+      # Test Stalled (Critical) - no orders submitted for 1m while test progress is in (0, 100)
+      # Progress is the left operand of `and` so $value is the progress %, not the zero rate
+      - alert: TestStalled
+        expr: |
+          cow_perf_test_progress_percent > 0
+          and
+          cow_perf_test_progress_percent < 100
+          and
+          rate(cow_perf_orders_submitted_total[1m]) == 0
+        for: 1m
+        labels:
+          severity: critical
+          component: cow-performance-testing
+          category: test-execution
+        annotations:
+          summary: "Performance test appears to be stalled"
+          description: "No orders submitted in the last minute for scenario {{ $labels.scenario }} (progress: {{ $value | printf \"%.1f\" }}%)"
+          runbook: "Check test process, verify API connectivity, review error logs"
+
+      # =========================================================================
+      # RESOURCE ALERTS
+      # =========================================================================
+
+      # High CPU Usage (Warning)
+      # Fires per container when cow_perf_container_cpu_percent stays above 80 for 5m
+      - alert: HighCPUUsage
+        expr: |
+          cow_perf_container_cpu_percent > 80
+        for: 5m
+        labels:
+          severity: warning
+          component: cow-performance-testing
+          category: resources
+        annotations:
+          summary: "High CPU usage on {{ $labels.container }}"
+          description: "CPU usage is {{ $value | printf \"%.1f\" }}% (threshold: 80%) on container {{ $labels.container }}"
+          runbook: "Consider scaling resources, check for inefficient operations, review container limits"
+
+      # Critical Memory Usage (Critical)
+      # Fires per container when cow_perf_container_memory_percent (percent of limit) stays above 95 for 2m
+      - alert: CriticalMemoryUsage
+        expr: |
+          cow_perf_container_memory_percent > 95
+        for: 2m
+        labels:
+          severity: critical
+          component: cow-performance-testing
+          category: resources
+        annotations:
+          summary: "Critical memory usage on {{ $labels.container }}"
+          description: "Memory usage is {{ $value | printf \"%.1f\" }}% (threshold: 95%) on container {{ $labels.container }}"
+          runbook: "Immediate action: Check for memory leaks, increase container memory limit, restart if necessary"
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -245,6 +245,7 @@ services:
       - "9090:9090"
     volumes:
       - ./configs/prometheus.yml:/etc/prometheus/prometheus.yml:ro
+      - ./configs/prometheus/alerts:/etc/prometheus/alerts:ro
       - prometheus_data:/prometheus
     profiles:
       - monitoring
diff --git a/src/cow_performance/prometheus/exporter.py b/src/cow_performance/prometheus/exporter.py
@@ -272,6 +272,9 @@ def _update_resource_metrics(self, metric: object) -> None:
                 self._metrics.container_memory_bytes.labels(container=container_name).set(
                     sample.memory_bytes
                 )
+                self._metrics.container_memory_percent.labels(container=container_name).set(
+                    sample.memory_percent
+                )
                 self._metrics.container_network_rx_bytes.labels(container=container_name).set(
                     sample.network_rx_bytes
                 )
@@ -406,10 +409,13 @@ def update_container_resources(
         memory_bytes: int,
         network_rx_bytes: int = 0,
         network_tx_bytes: int = 0,
+        memory_percent: float | None = None,
     ) -> None:
         """Update resource metrics for a container."""
         self._metrics.container_cpu_percent.labels(container=container).set(cpu_percent)
         self._metrics.container_memory_bytes.labels(container=container).set(memory_bytes)
+        if memory_percent is not None:
+            self._metrics.container_memory_percent.labels(container=container).set(memory_percent)
         self._metrics.container_network_rx_bytes.labels(container=container).set(network_rx_bytes)
         self._metrics.container_network_tx_bytes.labels(container=container).set(network_tx_bytes)
 
diff --git a/src/cow_performance/prometheus/metrics.py b/src/cow_performance/prometheus/metrics.py
@@ -211,6 +211,12 @@ def _init_resource_metrics(self) -> None:
             ["container"],
             registry=self.registry,
         )
+        self.container_memory_percent = Gauge(
+            "cow_perf_container_memory_percent",
+            "Container memory usage as percentage of limit (0-100)",
+            ["container"],
+            registry=self.registry,
+        )
         self.container_network_rx_bytes = Gauge(
             "cow_perf_container_network_rx_bytes",
             "Container network bytes received",
diff --git a/thoughts/INDEX.md b/thoughts/INDEX.md
@@ -122,6 +122,7 @@ Detailed implementation approaches for tickets. Read these before implementing t
 | [2026-02-02-cow-588-baseline-snapshot-system.md](plans/2026-02-02-cow-588-baseline-snapshot-system.md) | COW-588 | ✅ Complete | BaselineManager, git-info, UUID-index, serialization |
 | [2026-02-03-cow-589-comparison-engine.md](plans/2026-02-03-cow-589-comparison-engine.md) | COW-589 | ✅ Complete | ComparisonEngine, regression, statistics, p-value, Cohen's-d |
 | [2026-02-03-cow-590-automated-reporting.md](plans/2026-02-03-cow-590-automated-reporting.md) | COW-590 | ✅ Complete | ReportGenerator, formatters, CSV, recommendations, CLI |
+| [2026-02-13-cow-598-alerting-rules.md](plans/2026-02-13-cow-598-alerting-rules.md) | COW-598 | 🔲 Ready | Prometheus alerts, alerting rules, thresholds, Grafana annotations |
 
 ---
 
@@ -227,6 +228,7 @@ tickets/COW-593-grafana-dashboards.md
 ### Alerting Rules (COW-598) — M3
 ```
 tickets/COW-598-alerting-rules.md
+└── plans/2026-02-13-cow-598-alerting-rules.md  (execution plan)
 ```
 
 ---
diff --git a/thoughts/plans/2026-02-13-cow-598-alerting-rules.md b/thoughts/plans/2026-02-13-cow-598-alerting-rules.md
diff --git a/thoughts/tickets/COW-598-alerting-rules.md b/thoughts/tickets/COW-598-alerting-rules.md