Skip to content

Commit fa758bc

Browse files
author
Matt Pryor
authored
Fix or replace dashboards using Angular components (#674)
1 parent 9a35f14 commit fa758bc

File tree

10 files changed

+9609
-8525
lines changed

10 files changed

+9609
-8525
lines changed

roles/certmanager/files/grafana_dashboard.json

Lines changed: 650 additions & 1227 deletions
Large diffs are not rendered by default.

roles/certmanager/tasks/main.yml

Lines changed: 22 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -36,22 +36,28 @@
36 36
wait: yes
37 37
wait_timeout: "{{ certmanager_wait_timeout }}"
38 38

39-
- name: Install Grafana dashboard for cert-manager metrics
40-
command: kubectl apply -f -
41-
args:
42-
stdin: "{{ certmanager_dashboard_definition | to_nice_yaml }}"
43-
vars:
44-
certmanager_dashboard_definition:
45-
apiVersion: v1
46-
kind: ConfigMap
47-
metadata:
48-
name: cert-manager-grafana-dashboard
49-
namespace: "{{ certmanager_release_namespace }}"
50-
labels:
51-
grafana_dashboard: "1"
52-
data:
53-
certmanager_dashboard.json: |-
54-
{{ lookup("file", "grafana_dashboard.json") | from_json | to_nice_json }}
39+
- block:
40+
- name: Install Grafana dashboard for cert-manager metrics
41+
command: kubectl apply -f -
42+
args:
43+
stdin: "{{ certmanager_dashboard_definition | to_nice_yaml }}"
44+
vars:
45+
certmanager_dashboard_definition:
46+
apiVersion: v1
47+
kind: ConfigMap
48+
metadata:
49+
name: cert-manager-grafana-dashboard
50+
namespace: "{{ certmanager_release_namespace }}"
51+
labels:
52+
grafana_dashboard: "1"
53+
data:
54+
certmanager_dashboard.json: |-
55+
{{ lookup("file", "grafana_dashboard.json") | from_json | to_nice_json }}
56+
57+
- name: Configure custom alerting rules for cert-manager
58+
command: kubectl apply -f -
59+
args:
60+
stdin: "{{ lookup('template', 'prometheusrule.yaml.j2') }}"
55 61
when: certmanager_monitoring_enabled
56 62

57 63
- block:
Lines changed: 53 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,53 @@
1+
apiVersion: monitoring.coreos.com/v1
2+
kind: PrometheusRule
3+
metadata:
4+
name: cert-manager-alerting-rules
5+
namespace: "{{ certmanager_release_namespace }}"
6+
labels:
7+
release: kube-prometheus-stack
8+
{% raw %}
9+
spec:
10+
groups:
11+
- name: cert-manager.rules
12+
rules:
13+
- alert: CertManagerAbsent
14+
annotations:
15+
description: >-
16+
New certificates will not be able to be minted, and existing ones can't
17+
be renewed until cert-manager is back.
18+
runbook_url: https://github.com/imusmanmalik/cert-manager-mixin/blob/main/RUNBOOK.md#certmanagerabsent
19+
summary: Cert Manager has disappeared from Prometheus service discovery.
20+
expr: absent(up{job="cert-manager"})
21+
for: 10m
22+
labels:
23+
severity: critical
24+
25+
- alert: CertManagerCertExpirySoon
26+
annotations:
27+
description: >-
28+
The domain that this cert covers will be unavailable after {{ $value | humanizeDuration }}.
29+
Clients using endpoints that this cert protects will start to fail in {{ $value | humanizeDuration }}.
30+
runbook_url: https://github.com/imusmanmalik/cert-manager-mixin/blob/main/RUNBOOK.md#certmanagercertexpirysoon
31+
summary: The cert `{{ $labels.name }}` is {{ $value | humanizeDuration }} from expiry.
32+
expr: |
33+
avg by (exported_namespace, namespace, name) (
34+
certmanager_certificate_expiration_timestamp_seconds - time()
35+
) < (30 * 24 * 3600)
36+
for: 1h
37+
labels:
38+
severity: warning
39+
40+
- alert: CertManagerHittingRateLimits
41+
annotations:
42+
description: >-
43+
Depending on the rate limit, cert-manager may be unable to generate certificates for up to a week.
44+
runbook_url: https://github.com/imusmanmalik/cert-manager-mixin/blob/main/RUNBOOK.md#certmanagerhittingratelimits
45+
summary: Cert manager hitting LetsEncrypt rate limits.
46+
expr: |
47+
sum by (host) (
48+
rate(certmanager_http_acme_client_request_count{status="429"}[5m])
49+
) > 0
50+
for: 5m
51+
labels:
52+
severity: critical
53+
{% endraw %}

0 commit comments

Comments
 (0)