Skip to content

Commit eb631d8

Browse files
Refactor overcommit rules (#1087)
* chore: reduce duplication for Kube*Overcommit rules
  Signed-off-by: Simon Pasquier <[email protected]>
* chore: remove unnecessary test in expression
  Signed-off-by: Simon Pasquier <[email protected]>
* chore: simplify description for Kube*Overcommit rules
---------
Signed-off-by: Simon Pasquier <[email protected]>
1 parent 5f7bb7a commit eb631d8

File tree

2 files changed

+66
-73
lines changed

2 files changed

+66
-73
lines changed

alerts/resource_alerts.libsonnet

Lines changed: 63 additions & 70 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,61 @@
11
local utils = import '../lib/utils.libsonnet';
22

33
{
4+
local kubeOvercommitExpression(resource) = if $._config.showMultiCluster then
5+
|||
6+
# Non-HA clusters.
7+
(
8+
(
9+
sum by(%(clusterLabel)s) (namespace_%(resource)s:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s})
10+
-
11+
sum by(%(clusterLabel)s) (kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="%(resource)s"}) > 0
12+
)
13+
and
14+
count by (%(clusterLabel)s) (max by (%(clusterLabel)s, node) (kube_node_role{%(kubeStateMetricsSelector)s, role="control-plane"})) < 3
15+
)
16+
or
17+
# HA clusters.
18+
(
19+
sum by(%(clusterLabel)s) (namespace_%(resource)s:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s})
20+
-
21+
(
22+
# Skip clusters with only one allocatable node.
23+
(
24+
sum by (%(clusterLabel)s) (kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="%(resource)s"})
25+
-
26+
max by (%(clusterLabel)s) (kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="%(resource)s"})
27+
) > 0
28+
) > 0
29+
)
30+
||| % $._config { resource: resource }
31+
else
32+
|||
33+
# Non-HA clusters.
34+
(
35+
(
36+
sum(namespace_%(resource)s:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s})
37+
-
38+
sum(kube_node_status_allocatable{resource="%(resource)s", %(kubeStateMetricsSelector)s}) > 0
39+
)
40+
and
41+
count(max by (node) (kube_node_role{%(kubeStateMetricsSelector)s, role="control-plane"})) < 3
42+
)
43+
or
44+
# HA clusters.
45+
(
46+
sum(namespace_%(resource)s:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s})
47+
-
48+
(
49+
# Skip clusters with only one allocatable node.
50+
(
51+
sum(kube_node_status_allocatable{resource="%(resource)s", %(kubeStateMetricsSelector)s})
52+
-
53+
max(kube_node_status_allocatable{resource="%(resource)s", %(kubeStateMetricsSelector)s})
54+
) > 0
55+
) > 0
56+
)
57+
||| % $._config { resource: resource },
58+
459
_config+:: {
560
kubeStateMetricsSelector: error 'must provide selector for kube-state-metrics',
661
nodeExporterSelector: error 'must provide selector for node-exporter',
@@ -31,43 +86,12 @@ local utils = import '../lib/utils.libsonnet';
3186
},
3287
annotations: {
3388
summary: 'Cluster has overcommitted CPU resource requests.',
89+
description: 'Cluster%s has overcommitted CPU resource requests for Pods by {{ printf "%%.2f" $value }} CPU shares and cannot tolerate node failure.' % [
90+
utils.ifShowMultiCluster($._config, ' {{ $labels.%(clusterLabel)s }}' % $._config),
91+
],
3492
},
3593
'for': '10m',
36-
} +
37-
if $._config.showMultiCluster then {
38-
expr: |||
39-
(sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) -
40-
sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s) > 0
41-
and
42-
count by (%(clusterLabel)s) (max by (%(clusterLabel)s, node) (kube_node_role{%(kubeStateMetricsSelector)s, role="control-plane"})) < 3)
43-
or
44-
(sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) -
45-
(sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s) -
46-
max(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s)) > 0
47-
and
48-
(sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s) -
49-
max(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s)) > 0)
50-
||| % $._config,
51-
annotations+: {
52-
description: 'Cluster {{ $labels.%(clusterLabel)s }} has overcommitted CPU resource requests for Pods by {{ printf "%%.2f" $value }} CPU shares and cannot tolerate node failure.' % $._config,
53-
},
54-
} else {
55-
expr: |||
56-
(sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) -
57-
sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) > 0
58-
and
59-
count(max by (node) (kube_node_role{%(kubeStateMetricsSelector)s, role="control-plane"})) < 3)
60-
or
61-
(sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) -
62-
(sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) -
63-
max(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s})) > 0
64-
and
65-
(sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) -
66-
max(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s})) > 0)
67-
||| % $._config,
68-
annotations+: {
69-
description: 'Cluster has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.' % $._config,
70-
},
94+
expr: kubeOvercommitExpression('cpu'),
7195
},
7296
{
7397
alert: 'KubeMemoryOvercommit',
@@ -76,43 +100,12 @@ local utils = import '../lib/utils.libsonnet';
76100
},
77101
annotations: {
78102
summary: 'Cluster has overcommitted memory resource requests.',
103+
description: 'Cluster%s has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.' % [
104+
utils.ifShowMultiCluster($._config, ' {{ $labels.%(clusterLabel)s }}' % $._config),
105+
],
79106
},
80107
'for': '10m',
81-
} +
82-
if $._config.showMultiCluster then {
83-
expr: |||
84-
(sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) -
85-
sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) > 0
86-
and
87-
count by (%(clusterLabel)s) (max by (%(clusterLabel)s, node) (kube_node_role{%(kubeStateMetricsSelector)s, role="control-plane"})) < 3)
88-
or
89-
(sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) -
90-
(sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) -
91-
max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s)) > 0
92-
and
93-
(sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) -
94-
max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s)) > 0)
95-
||| % $._config,
96-
annotations+: {
97-
description: 'Cluster {{ $labels.%(clusterLabel)s }} has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.' % $._config,
98-
},
99-
} else {
100-
expr: |||
101-
(sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) -
102-
sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) > 0
103-
and
104-
count(max by (node) (kube_node_role{%(kubeStateMetricsSelector)s, role="control-plane"})) < 3)
105-
or
106-
(sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) -
107-
(sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) -
108-
max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s})) > 0
109-
and
110-
(sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) -
111-
max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s})) > 0)
112-
||| % $._config,
113-
annotations+: {
114-
description: 'Cluster has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.',
115-
},
108+
expr: kubeOvercommitExpression('memory'),
116109
},
117110
{
118111
alert: 'KubeCPUQuotaOvercommit',

tests/tests.yaml

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1444,7 +1444,7 @@ tests:
14441444
- exp_labels:
14451445
severity: warning
14461446
exp_annotations:
1447-
description: Cluster has overcommitted CPU resource requests for Pods by 0.10000000000000009 CPU shares and cannot tolerate node failure.
1447+
description: Cluster has overcommitted CPU resource requests for Pods by 0.10 CPU shares and cannot tolerate node failure.
14481448
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
14491449
summary: Cluster has overcommitted CPU resource requests.
14501450

@@ -1472,7 +1472,7 @@ tests:
14721472
- exp_labels:
14731473
severity: warning
14741474
exp_annotations:
1475-
description: Cluster has overcommitted CPU resource requests for Pods by 0.20000000000000018 CPU shares and cannot tolerate node failure.
1475+
description: Cluster has overcommitted CPU resource requests for Pods by 0.20 CPU shares and cannot tolerate node failure.
14761476
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
14771477
summary: Cluster has overcommitted CPU resource requests.
14781478

@@ -1504,7 +1504,7 @@ tests:
15041504
- exp_labels:
15051505
severity: warning
15061506
exp_annotations:
1507-
description: Cluster has overcommitted CPU resource requests for Pods by 0.20000000000000062 CPU shares and cannot tolerate node failure.
1507+
description: Cluster has overcommitted CPU resource requests for Pods by 0.20 CPU shares and cannot tolerate node failure.
15081508
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
15091509
summary: Cluster has overcommitted CPU resource requests.
15101510

0 commit comments

Comments
 (0)