1
1
local utils = import '../lib/utils.libsonnet' ;
2
2
3
3
{
4
// kubeOvercommitExpression(resource) builds the PromQL expression string shared by
// the overcommit alerts in this file; it is called below with 'cpu' and 'memory'.
//
// `resource` is substituted into the template both as part of the recording-rule
// name (namespace_<resource>:kube_pod_container_resource_requests:sum) and as the
// value of the resource="..." matcher on kube_node_status_allocatable. All other
// placeholders are filled from $._config, which is extended with the `resource`
// field before formatting.
//
// When $._config.showMultiCluster is true, every aggregation is grouped by the
// configured cluster label; otherwise the same expression is produced without
// grouping.
//
// The expression is the `or` of two cases:
//   * Non-HA: summed requests minus summed allocatable is > 0, and the count of
//     control-plane nodes is < 3.
//   * HA: summed requests minus (summed allocatable minus the largest single
//     allocatable value) is > 0, where that inner difference is itself required
//     to be > 0 (the template's own comment: skip clusters with only one
//     allocatable node).
+ local kubeOvercommitExpression(resource) = if $._config.showMultiCluster then
5
+ |||
6
+ # Non-HA clusters.
7
+ (
8
+ (
9
+ sum by(%(clusterLabel)s) (namespace_%(resource)s:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s})
10
+ -
11
+ sum by(%(clusterLabel)s) (kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="%(resource)s"}) > 0
12
+ )
13
+ and
14
+ count by (%(clusterLabel)s) (max by (%(clusterLabel)s, node) (kube_node_role{%(kubeStateMetricsSelector)s, role="control-plane"})) < 3
15
+ )
16
+ or
17
+ # HA clusters.
18
+ (
19
+ sum by(%(clusterLabel)s) (namespace_%(resource)s:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s})
20
+ -
21
+ (
22
+ # Skip clusters with only one allocatable node.
23
+ (
24
+ sum by (%(clusterLabel)s) (kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="%(resource)s"})
25
+ -
26
+ max by (%(clusterLabel)s) (kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="%(resource)s"})
27
+ ) > 0
28
+ ) > 0
29
+ )
30
+ ||| % $._config { resource: resource }
31
+ else
32
+ |||
33
+ # Non-HA clusters.
34
+ (
35
+ (
36
+ sum(namespace_%(resource)s:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s})
37
+ -
38
+ sum(kube_node_status_allocatable{resource="%(resource)s", %(kubeStateMetricsSelector)s}) > 0
39
+ )
40
+ and
41
+ count(max by (node) (kube_node_role{%(kubeStateMetricsSelector)s, role="control-plane"})) < 3
42
+ )
43
+ or
44
+ # HA clusters.
45
+ (
46
+ sum(namespace_%(resource)s:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s})
47
+ -
48
+ (
49
+ # Skip clusters with only one allocatable node.
50
+ (
51
+ sum(kube_node_status_allocatable{resource="%(resource)s", %(kubeStateMetricsSelector)s})
52
+ -
53
+ max(kube_node_status_allocatable{resource="%(resource)s", %(kubeStateMetricsSelector)s})
54
+ ) > 0
55
+ ) > 0
56
+ )
57
+ ||| % $._config { resource: resource },
58
+
4
59
_config+:: {
5
60
kubeStateMetricsSelector: error 'must provide selector for kube-state-metrics' ,
6
61
nodeExporterSelector: error 'must provide selector for node-exporter' ,
@@ -31,43 +86,12 @@ local utils = import '../lib/utils.libsonnet';
31
86
},
32
87
annotations: {
33
88
summary: 'Cluster has overcommitted CPU resource requests.' ,
89
+ description: 'Cluster%s has overcommitted CPU resource requests for Pods by {{ printf "%%.2f" $value }} CPU shares and cannot tolerate node failure.' % [
90
+ utils.ifShowMultiCluster($._config, ' {{ $labels.%(clusterLabel)s }}' % $._config),
91
+ ],
34
92
},
35
93
'for' : '10m' ,
36
- } +
37
- if $._config.showMultiCluster then {
38
- expr: |||
39
- (sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) -
40
- sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s) > 0
41
- and
42
- count by (%(clusterLabel)s) (max by (%(clusterLabel)s, node) (kube_node_role{%(kubeStateMetricsSelector)s, role="control-plane"})) < 3)
43
- or
44
- (sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) -
45
- (sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s) -
46
- max(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s)) > 0
47
- and
48
- (sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s) -
49
- max(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s)) > 0)
50
- ||| % $._config,
51
- annotations+: {
52
- description: 'Cluster {{ $labels.%(clusterLabel)s }} has overcommitted CPU resource requests for Pods by {{ printf "%%.2f" $value }} CPU shares and cannot tolerate node failure.' % $._config,
53
- },
54
- } else {
55
- expr: |||
56
- (sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) -
57
- sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) > 0
58
- and
59
- count(max by (node) (kube_node_role{%(kubeStateMetricsSelector)s, role="control-plane"})) < 3)
60
- or
61
- (sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) -
62
- (sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) -
63
- max(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s})) > 0
64
- and
65
- (sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) -
66
- max(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s})) > 0)
67
- ||| % $._config,
68
- annotations+: {
69
- description: 'Cluster has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.' % $._config,
70
- },
94
+ expr: kubeOvercommitExpression('cpu' ),
71
95
},
72
96
{
73
97
alert: 'KubeMemoryOvercommit' ,
@@ -76,43 +100,12 @@ local utils = import '../lib/utils.libsonnet';
76
100
},
77
101
annotations: {
78
102
summary: 'Cluster has overcommitted memory resource requests.' ,
103
+ description: 'Cluster%s has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.' % [
104
+ utils.ifShowMultiCluster($._config, ' {{ $labels.%(clusterLabel)s }}' % $._config),
105
+ ],
79
106
},
80
107
'for' : '10m' ,
81
- } +
82
- if $._config.showMultiCluster then {
83
- expr: |||
84
- (sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) -
85
- sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) > 0
86
- and
87
- count by (%(clusterLabel)s) (max by (%(clusterLabel)s, node) (kube_node_role{%(kubeStateMetricsSelector)s, role="control-plane"})) < 3)
88
- or
89
- (sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) -
90
- (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) -
91
- max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s)) > 0
92
- and
93
- (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) -
94
- max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s)) > 0)
95
- ||| % $._config,
96
- annotations+: {
97
- description: 'Cluster {{ $labels.%(clusterLabel)s }} has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.' % $._config,
98
- },
99
- } else {
100
- expr: |||
101
- (sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) -
102
- sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) > 0
103
- and
104
- count(max by (node) (kube_node_role{%(kubeStateMetricsSelector)s, role="control-plane"})) < 3)
105
- or
106
- (sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) -
107
- (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) -
108
- max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s})) > 0
109
- and
110
- (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) -
111
- max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s})) > 0)
112
- ||| % $._config,
113
- annotations+: {
114
- description: 'Cluster has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.' ,
115
- },
108
+ expr: kubeOvercommitExpression('memory' ),
116
109
},
117
110
{
118
111
alert: 'KubeCPUQuotaOvercommit' ,
0 commit comments