@@ -9,19 +9,6 @@ defaultRules:
99
1010grafana:
1111 adminPassword: ${grafana_admin_password}
12- ingress:
13- enabled: true
14- hosts:
15- - monitoring. polinetwork . org
16- annotations:
17- cert- manager. io / cluster- issuer: letsencrypt- prod- issuer
18- kubernetes. io / ingress. class : nginx
19- kubernetes. io / tls- acme: " true"
20- path : /
21- tls:
22- - hosts:
23- - monitoring. polinetwork . org
24- secretName: grafana- ingress- secret
2512 persistence:
2613 enabled: true
2714 type: pvc
@@ -118,47 +105,3 @@ additionalPrometheusRulesMap:
118105 annotations:
119106 summary: High deployment failure rate
120107 description: More than 90 % of total replicas for Deployment {{$labels.namespace}}/ {{$labels.deployment}} are down
121- - name: cert- manager
122- rules:
123- - alert: CertManagerAbsent
124- annotations:
125- description: New certificates will not be able to be minted, and existing ones can't be renewed until cert- manager is back.
126- runbook_url: https: // gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagerabsent
127- summary: Cert Manager has dissapeared from Prometheus service discovery.
128- expr: absent(up{job= " cert-manager" })
129- for: 10m
130- labels:
131- severity: critical
132- - name: certificates
133- rules:
134- - alert: CertManagerCertExpirySoon
135- annotations:
136- dashboard_url: https: // grafana.example.com/d/TvuRo2iMk/cert-manager
137- description: The domain that this cert covers will be unavailable after {{$value | humanizeDuration }}. Clients using endpoints that this cert protects will start to fail in {{ $value | humanizeDuration}}.
138- runbook_url: https: // gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagercertexpirysoon
139- summary: The cert `{{ $labels.name }}` is {{ $value | humanizeDuration }} from expiry, it should have renewed over a week ago.
140- expr: |
141- avg by (exported_namespace, namespace, name) (certmanager_certificate_expiration_timestamp_seconds - time()) < (21 * 24 * 3600 ) # 21 days in seconds
142- for: 1h
143- labels:
144- severity: warning
145- - alert: CertManagerCertNotReady
146- annotations:
147- dashboard_url: https: // grafana.example.com/d/TvuRo2iMk/cert-manager
148- description: This certificate has not been ready to serve traffic for at least 10m. If the cert is being renewed or there is another valid cert, the ingress controller _may_ be able to serve that instead.
149- runbook_url: https: // gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagercertnotready
150- summary: The cert `{{ $labels.name }}` is not ready to serve traffic.
151- expr: max by (name, exported_namespace, namespace, condition) (certmanager_certificate_ready_status{condition!= " True" } == 1 )
152- for: 10m
153- labels:
154- severity: critical
155- - alert: CertManagerHittingRateLimits
156- annotations:
157- dashboard_url: https: // grafana.example.com/d/TvuRo2iMk/cert-manager
158- description: Depending on the rate limit, cert- manager may be unable to generate certificates for up to a week.
159- runbook_url: https: // gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagerhittingratelimits
160- summary: Cert manager hitting LetsEncrypt rate limits.
161- expr: sum by (host) (rate(certmanager_http_acme_client_request_count{status= " 429" }[5m])) > 0
162- for: 5m
163- labels:
164- severity: critical
0 commit comments