
Commit 4917f2d

bugfix: refactor alerts to accommodate single-node clusters
For the sake of brevity, let:

    Q  := kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}  (allocatable)
    QQ := namespace_cpu:kube_pod_container_resource_requests:sum{}               (requested)

Both quota alerts relevant here have the form:

    sum(QQ) by (cluster) - (sum(Q) by (cluster) - max(Q) by (cluster)) > 0
    and
    (sum(Q) by (cluster) - max(Q) by (cluster)) > 0

In a single-node cluster, sum(Q) by (cluster) = max(Q) by (cluster), so the
expression reduces to:

    sum(QQ) by (cluster) > 0

i.e., the alert fires if *any* resource requests exist at all. To address
this, drop the max(Q) by (cluster) buffer assumed for non-single-node
clusters from the single-node (SNO) case, reducing the expression to:

    sum(QQ) by (cluster) - sum(Q) by (cluster) > 0

i.e., the alert now triggers only when total requested exceeds total
allocatable. Since there is only a single node, a spare-node buffer of that
sort does not make sense.

Signed-off-by: Pranshu Srivastava <[email protected]>
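To make the reduction concrete, here is a small worked example in PromQL
using the Q/QQ shorthand from above. The numbers (one node with 4
allocatable CPUs and 2.5 CPUs requested in total) are hypothetical, chosen
only to illustrate the behaviour:

    # Single-node cluster, hypothetical values:
    #   sum(Q) by (cluster) = max(Q) by (cluster) = 4 CPUs (one node)
    #   sum(QQ) by (cluster) = 2.5 CPUs requested in total

    # Old expression: the node-failure buffer cancels itself out,
    # 2.5 - (4 - 4) = 2.5 > 0, so the alert fires even though only
    # 2.5 of 4 allocatable CPUs are requested.
    sum(QQ) by (cluster) - (sum(Q) by (cluster) - max(Q) by (cluster)) > 0

    # New single-node expression: 2.5 - 4 = -1.5, which is not > 0,
    # so the alert stays silent until requests exceed allocatable.
    sum(QQ) by (cluster) - sum(Q) by (cluster) > 0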
1 parent: 96d585c

File tree: 1 file changed, +47 −16 lines
alerts/resource_alerts.libsonnet

Lines changed: 47 additions & 16 deletions
@@ -36,18 +36,34 @@ local utils = import '../lib/utils.libsonnet';
   } +
   if $._config.showMultiCluster then {
     expr: |||
-      sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) - (sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s)) > 0
+      (count(kube_node_info) == 1
       and
-      (sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s)) > 0
+      sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) -
+      sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s) > 0)
+      or
+      (sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) -
+      (sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s) -
+      max(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s)) > 0
+      and
+      (sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s) -
+      max(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s)) > 0)
     ||| % $._config,
     annotations+: {
       description: 'Cluster {{ $labels.%(clusterLabel)s }} has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.' % $._config,
     },
   } else {
     expr: |||
-      sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) - (sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) - max(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s})) > 0
+      (count(kube_node_info) == 1
+      and
+      sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) -
+      sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) > 0)
+      or
+      (sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) -
+      (sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) -
+      max(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s})) > 0
       and
-      (sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) - max(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s})) > 0
+      (sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) -
+      max(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s})) > 0)
     ||| % $._config,
     annotations+: {
       description: 'Cluster has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.' % $._config,
@@ -65,24 +81,39 @@ local utils = import '../lib/utils.libsonnet';
   } +
   if $._config.showMultiCluster then {
     expr: |||
-      sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) - (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s)) > 0
+      (count(kube_node_info) == 1
       and
-      (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s)) > 0
+      sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) -
+      sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) > 0)
+      or
+      (sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) -
+      (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) -
+      max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s)) > 0
+      and
+      (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) -
+      max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s)) > 0)
     ||| % $._config,
     annotations+: {
       description: 'Cluster {{ $labels.%(clusterLabel)s }} has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.' % $._config,
     },
-  } else
-  {
-    expr: |||
-      sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) - (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) - max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s})) > 0
-      and
-      (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) - max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s})) > 0
-    ||| % $._config,
-    annotations+: {
-      description: 'Cluster has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.',
-    },
+  } else {
+    expr: |||
+      (count(kube_node_info) == 1
+      and
+      sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) -
+      sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) > 0)
+      or
+      (sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) -
+      (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) -
+      max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s})) > 0
+      and
+      (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) -
+      max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s})) > 0)
+    ||| % $._config,
+    annotations+: {
+      description: 'Cluster has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.',
     },
+  },
   {
     alert: 'KubeCPUQuotaOvercommit',
     labels: {
