|
# PrometheusRule defining the cert-manager alert group (cert-manager.rules).
# The namespace is filled in from the certmanager_release_namespace template
# variable when this file is rendered.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: cert-manager-alerting-rules
  namespace: "{{ certmanager_release_namespace }}"
  labels:
    # Presumably the label the kube-prometheus-stack Prometheus uses to select
    # rules; confirm against that release's ruleSelector.
    release: kube-prometheus-stack
{% raw %}
# From here to the end of the file the content sits inside a Jinja raw block,
# so the {{ $value }} / {{ $labels }} expressions below are passed through
# unrendered and are expanded later by Prometheus alert templating.
spec:
  groups:
    - name: cert-manager.rules
      rules:
        # No `up` series with job="cert-manager" has existed for 10 minutes.
        - alert: CertManagerAbsent
          expr: absent(up{job="cert-manager"})
          for: 10m
          labels:
            severity: critical
          annotations:
            summary: Cert Manager has disappeared from Prometheus service discovery.
            description: >-
              New certificates will not be able to be minted, and existing ones can't
              be renewed until cert-manager is back.
            runbook_url: https://github.com/imusmanmalik/cert-manager-mixin/blob/main/RUNBOOK.md#certmanagerabsent

        # Time left until expiry, averaged per (exported_namespace, namespace,
        # name), has been below 30 days (30 * 24 * 3600 seconds) for one hour.
        # NOTE(review): cert-manager's default renewal window may coincide with
        # this 30-day threshold for 90-day certificates; confirm this does not
        # produce pending/firing alerts on every normal renewal.
        - alert: CertManagerCertExpirySoon
          expr: |
            avg by (exported_namespace, namespace, name) (
              certmanager_certificate_expiration_timestamp_seconds - time()
            ) < (30 * 24 * 3600)
          for: 1h
          labels:
            severity: warning
          annotations:
            summary: The cert `{{ $labels.name }}` is {{ $value | humanizeDuration }} from expiry.
            description: >-
              The domain that this cert covers will be unavailable after {{ $value | humanizeDuration }}.
              Clients using endpoints that this cert protects will start to fail in {{ $value | humanizeDuration }}.
            runbook_url: https://github.com/imusmanmalik/cert-manager-mixin/blob/main/RUNBOOK.md#certmanagercertexpirysoon

        # The per-host rate of ACME client requests with status="429" over a
        # 5-minute window has stayed above zero for five minutes.
        - alert: CertManagerHittingRateLimits
          expr: |
            sum by (host) (
              rate(certmanager_http_acme_client_request_count{status="429"}[5m])
            ) > 0
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: Cert manager hitting LetsEncrypt rate limits.
            description: >-
              Depending on the rate limit, cert-manager may be unable to generate certificates for up to a week.
            runbook_url: https://github.com/imusmanmalik/cert-manager-mixin/blob/main/RUNBOOK.md#certmanagerhittingratelimits
{% endraw %}
0 commit comments