Skip to content

Commit dbfd6c0

Browse files
committed
OCPBUGS-15430: move alerting rules from CMO over
Signed-off-by: Thomas Jungblut <[email protected]>
1 parent aabe03a commit dbfd6c0

File tree

3 files changed: 52 additions and 0 deletions.
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
apiVersion: monitoring.coreos.com/v1
2+
kind: PrometheusRule
3+
metadata:
4+
name: kubernetes-system-apiserver
5+
namespace: openshift-kube-apiserver
6+
spec:
7+
groups:
8+
- name: kubernetes-system-apiserver
9+
rules:
10+
- alert: KubeAggregatedAPIErrors
11+
annotations:
12+
description: Kubernetes aggregated API {{ $labels.instance }}/{{ $labels.name }} has reported {{ $labels.reason }} errors.
13+
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-apiserver-operator/KubeAggregatedAPIErrors.md
14+
summary: Kubernetes aggregated API has reported errors.
15+
expr: |
16+
sum by(cluster, instance, name, reason)(increase(aggregator_unavailable_apiservice_total{job="apiserver"}[1m])) > 0
17+
for: 10m
18+
labels:
19+
severity: warning
20+
- alert: KubeAggregatedAPIDown
21+
annotations:
22+
description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.
23+
summary: Kubernetes aggregated API is down.
24+
expr: |
25+
(1 - max by(name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice{job="apiserver"}[10m]))) * 100 < 85
26+
for: 15m
27+
labels:
28+
severity: warning
29+
- alert: KubeAPIDown
30+
annotations:
31+
description: KubeAPI has disappeared from Prometheus target discovery.
32+
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-apiserver-operator/KubeAPIDown.md
33+
summary: Target disappeared from Prometheus target discovery.
34+
expr: |
35+
absent(up{job="apiserver"} == 1)
36+
for: 15m
37+
labels:
38+
severity: critical
39+
- alert: KubeAPITerminatedRequests
40+
annotations:
41+
description: The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.
42+
summary: The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.
43+
expr: |
44+
sum by(cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum by(cluster) (rate(apiserver_request_total{job="apiserver"}[10m])) + sum by(cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20
45+
for: 5m
46+
labels:
47+
severity: warning

pkg/operator/starter.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,7 @@ func RunOperator(ctx context.Context, controllerContext *controllercmd.Controlle
215215
"assets/kube-apiserver/storage-version-migration-prioritylevelconfiguration-v1beta3.yaml",
216216
"assets/alerts/api-usage.yaml",
217217
"assets/alerts/audit-errors.yaml",
218+
"assets/alerts/kube-apiserver-down.yaml",
218219
"assets/alerts/kube-apiserver-requests.yaml",
219220
"assets/alerts/kube-apiserver-slos-basic.yaml",
220221
"assets/alerts/podsecurity-violations.yaml",

pkg/test/assets_test.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,10 @@ func readAllYaml(path string, t *testing.T) {
3636
return false, nil
3737
}
3838
// there is an alert message containing $labels strings that cause the reader to fail.
39+
if strings.HasSuffix(info.Name(), "kube-apiserver-down.yaml") {
40+
return false, nil
41+
}
42+
// there is an alert message containing $labels strings that cause the reader to fail.
3943
if strings.HasSuffix(info.Name(), "api-usage.yaml") {
4044
return false, nil
4145
}

0 commit comments

Comments
 (0)