Skip to content

Commit af5e898

Browse files
authored
feat: de-dupe KubeletTooManyPods, add cluster to descriptions (#1011)
1 parent: 35aebca · commit: af5e898

File tree

5 files changed: 127 additions and 41 deletions.

alerts/apps_alerts.libsonnet

Lines changed: 50 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,9 @@ local utils = import '../lib/utils.libsonnet';
2323
severity: 'warning',
2424
},
2525
annotations: {
26-
description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is in waiting state (reason: "CrashLoopBackOff").',
26+
description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is in waiting state (reason: "CrashLoopBackOff")%s.' % [
27+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
28+
],
2729
summary: 'Pod is crash looping.',
2830
},
2931
'for': '15m',
@@ -47,7 +49,9 @@ local utils = import '../lib/utils.libsonnet';
4749
severity: 'warning',
4850
},
4951
annotations: {
50-
description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes.',
52+
description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes%s.' % [
53+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
54+
],
5155
summary: 'Pod has been in a non-ready state for more than 15 minutes.',
5256
},
5357
'for': '15m',
@@ -63,7 +67,9 @@ local utils = import '../lib/utils.libsonnet';
6367
severity: 'warning',
6468
},
6569
annotations: {
66-
description: 'Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back.',
70+
description: 'Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back%s.' % [
71+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
72+
],
6773
summary: 'Deployment generation mismatch due to possible roll-back',
6874
},
6975
'for': '15m',
@@ -85,7 +91,9 @@ local utils = import '../lib/utils.libsonnet';
8591
severity: 'warning',
8692
},
8793
annotations: {
88-
description: 'Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes.',
94+
description: 'Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes%s.' % [
95+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
96+
],
8997
summary: 'Deployment has not matched the expected number of replicas.',
9098
},
9199
'for': '15m',
@@ -100,7 +108,9 @@ local utils = import '../lib/utils.libsonnet';
100108
severity: 'warning',
101109
},
102110
annotations: {
103-
description: 'Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment }} is not progressing for longer than 15 minutes.',
111+
description: 'Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment }} is not progressing for longer than 15 minutes%s.' % [
112+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
113+
],
104114
summary: 'Deployment rollout is not progressing.',
105115
},
106116
'for': '15m',
@@ -122,7 +132,9 @@ local utils = import '../lib/utils.libsonnet';
122132
severity: 'warning',
123133
},
124134
annotations: {
125-
description: 'StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes.',
135+
description: 'StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes%s.' % [
136+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
137+
],
126138
summary: 'StatefulSet has not matched the expected number of replicas.',
127139
},
128140
'for': '15m',
@@ -138,7 +150,9 @@ local utils = import '../lib/utils.libsonnet';
138150
severity: 'warning',
139151
},
140152
annotations: {
141-
description: 'StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back.',
153+
description: 'StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back%s.' % [
154+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
155+
],
142156
summary: 'StatefulSet generation mismatch due to possible roll-back',
143157
},
144158
'for': '15m',
@@ -168,7 +182,9 @@ local utils = import '../lib/utils.libsonnet';
168182
severity: 'warning',
169183
},
170184
annotations: {
171-
description: 'StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.',
185+
description: 'StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out%s.' % [
186+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
187+
],
172188
summary: 'StatefulSet update has not been rolled out.',
173189
},
174190
'for': '15m',
@@ -205,7 +221,10 @@ local utils = import '../lib/utils.libsonnet';
205221
severity: 'warning',
206222
},
207223
annotations: {
208-
description: 'DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least %(kubeDaemonSetRolloutStuckFor)s.' % $._config,
224+
description: 'DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least %s%s.' % [
225+
$._config.kubeDaemonSetRolloutStuckFor,
226+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
227+
],
209228
summary: 'DaemonSet rollout is stuck.',
210229
},
211230
'for': $._config.kubeDaemonSetRolloutStuckFor,
@@ -218,7 +237,9 @@ local utils = import '../lib/utils.libsonnet';
218237
severity: 'warning',
219238
},
220239
annotations: {
221-
description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour. (reason: "{{ $labels.reason }}").',
240+
description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour. (reason: "{{ $labels.reason }}")%s.' % [
241+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
242+
],
222243
summary: 'Pod container waiting longer than 1 hour',
223244
},
224245
'for': '1h',
@@ -235,7 +256,9 @@ local utils = import '../lib/utils.libsonnet';
235256
severity: 'warning',
236257
},
237258
annotations: {
238-
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.',
259+
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled%s.' % [
260+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
261+
],
239262
summary: 'DaemonSet pods are not scheduled.',
240263
},
241264
'for': '10m',
@@ -249,7 +272,9 @@ local utils = import '../lib/utils.libsonnet';
249272
severity: 'warning',
250273
},
251274
annotations: {
252-
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.',
275+
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run%s.' % [
276+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
277+
],
253278
summary: 'DaemonSet pods are misscheduled.',
254279
},
255280
'for': '15m',
@@ -265,7 +290,10 @@ local utils = import '../lib/utils.libsonnet';
265290
severity: 'warning',
266291
},
267292
annotations: {
268-
description: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than {{ "%(kubeJobTimeoutDuration)s" | humanizeDuration }} to complete.' % $._config,
293+
description: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than {{ "%s" | humanizeDuration }} to complete%s.' % [
294+
$._config.kubeJobTimeoutDuration,
295+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
296+
],
269297
summary: 'Job did not complete in time',
270298
},
271299
},
@@ -279,7 +307,9 @@ local utils = import '../lib/utils.libsonnet';
279307
severity: 'warning',
280308
},
281309
annotations: {
282-
description: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert.',
310+
description: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert%s.' % [
311+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
312+
],
283313
summary: 'Job failed to complete.',
284314
},
285315
},
@@ -303,7 +333,9 @@ local utils = import '../lib/utils.libsonnet';
303333
severity: 'warning',
304334
},
305335
annotations: {
306-
description: 'HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has not matched the desired number of replicas for longer than 15 minutes.',
336+
description: 'HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has not matched the desired number of replicas for longer than 15 minutes%s.' % [
337+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
338+
],
307339
summary: 'HPA has not matched desired number of replicas.',
308340
},
309341
'for': '15m',
@@ -319,7 +351,9 @@ local utils = import '../lib/utils.libsonnet';
319351
severity: 'warning',
320352
},
321353
annotations: {
322-
description: 'HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has been running at max replicas for longer than 15 minutes.',
354+
description: 'HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has been running at max replicas for longer than 15 minutes%s.' % [
355+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
356+
],
323357
summary: 'HPA is running at max replicas',
324358
},
325359
'for': '15m',

alerts/kube_apiserver.libsonnet

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,9 @@ local utils = import '../lib/utils.libsonnet';
3636
long: '%(long)s' % w,
3737
},
3838
annotations: {
39-
description: 'The API server is burning too much error budget.',
39+
description: 'The API server is burning too much error budget%s.' % [
40+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
41+
],
4042
summary: 'The API server is burning too much error budget.',
4143
},
4244
'for': '%(for)s' % w,
@@ -111,7 +113,9 @@ local utils = import '../lib/utils.libsonnet';
111113
severity: 'warning',
112114
},
113115
annotations: {
114-
description: 'Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.',
116+
description: 'Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}%% available over the last 10m%s.' % [
117+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
118+
],
115119
summary: 'Kubernetes aggregated API is down.',
116120
},
117121
},
@@ -128,7 +132,9 @@ local utils = import '../lib/utils.libsonnet';
128132
severity: 'warning',
129133
},
130134
annotations: {
131-
description: 'The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.',
135+
description: 'The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests%s.' % [
136+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
137+
],
132138
summary: 'The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.',
133139
},
134140
'for': '5m',

0 commit comments

Comments
 (0)