Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 47 additions & 16 deletions alerts/resource_alerts.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -36,18 +36,34 @@ local utils = import '../lib/utils.libsonnet';
} +
if $._config.showMultiCluster then {
expr: |||
sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) - (sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s)) > 0
(sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) -
sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s) > 0
and
(sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s)) > 0
count by (%(clusterLabel)s) (max by (%(clusterLabel)s, node) (kube_node_role{%(kubeStateMetricsSelector)s, role="control-plane"})) < 3)
or
(sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) -
(sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s) -
max(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s)) > 0
and
(sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s) -
max(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s)) > 0)
||| % $._config,
annotations+: {
description: 'Cluster {{ $labels.%(clusterLabel)s }} has overcommitted CPU resource requests for Pods by {{ printf "%%.2f" $value }} CPU shares and cannot tolerate node failure.' % $._config,
},
} else {
expr: |||
sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) - (sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) - max(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s})) > 0
(sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) -
sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) > 0
and
count(max by (node) (kube_node_role{%(kubeStateMetricsSelector)s, role="control-plane"})) < 3)
or
(sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) -
(sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) -
max(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s})) > 0
and
(sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) - max(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s})) > 0
(sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) -
max(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s})) > 0)
||| % $._config,
annotations+: {
description: 'Cluster has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.' % $._config,
Expand All @@ -65,24 +81,39 @@ local utils = import '../lib/utils.libsonnet';
} +
if $._config.showMultiCluster then {
expr: |||
sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) - (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s)) > 0
(sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) -
sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) > 0
and
(sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s)) > 0
count by (%(clusterLabel)s) (max by (%(clusterLabel)s, node) (kube_node_role{%(kubeStateMetricsSelector)s, role="control-plane"})) < 3)
or
(sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) -
(sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) -
max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s)) > 0
and
(sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) -
max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s)) > 0)
||| % $._config,
annotations+: {
description: 'Cluster {{ $labels.%(clusterLabel)s }} has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.' % $._config,
},
} else
{
expr: |||
sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) - (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) - max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s})) > 0
and
(sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) - max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s})) > 0
||| % $._config,
annotations+: {
description: 'Cluster has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.',
},
} else {
expr: |||
(sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) -
sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) > 0
and
count(max by (node) (kube_node_role{%(kubeStateMetricsSelector)s, role="control-plane"})) < 3)
or
(sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) -
(sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) -
max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s})) > 0
and
(sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) -
max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s})) > 0)
||| % $._config,
annotations+: {
description: 'Cluster has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.',
},
},
{
alert: 'KubeCPUQuotaOvercommit',
labels: {
Expand Down
167 changes: 167 additions & 0 deletions tests/tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1424,3 +1424,170 @@ tests:
runbook_url: "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch"
summary: "StatefulSet has not matched the expected number of replicas."

# Single control-plane node: pods request 2 CPU in total (1 + 1) while the only
# node offers 1.9 allocatable. Only one control-plane node is reported (< 3), so
# the rule compares requests with the full allocatable capacity and the expected
# overcommit is 2 - 1.9 (about 0.1 CPU, printed with its floating-point tail).
- name: KubeCPUOvercommit alert (single-node)
  interval: 1m
  input_series:
  # 'VxN' keeps the value V constant for the initial sample plus N more.
  - series: 'namespace_cpu:kube_pod_container_resource_requests:sum{cluster="kubernetes", namespace="default"}'
    values: '1x10'
  - series: 'namespace_cpu:kube_pod_container_resource_requests:sum{cluster="kubernetes", namespace="kube-system"}'
    values: '1x10'
  - series: 'kube_node_status_allocatable{cluster="kubernetes", node="n1", resource="cpu", job="kube-state-metrics"}'
    values: '1.9x10'  # This value was seen on a 2x vCPU node
  - series: 'kube_node_role{cluster="kubernetes", node="n1", role="control-plane", job="kube-state-metrics"}'
    values: '1x10'
  alert_rule_test:
  # No exp_alerts listed, so nothing may be firing at 9m (presumably the rule's
  # `for` window has not elapsed yet - TODO confirm against the rule definition).
  - eval_time: 9m
    alertname: KubeCPUOvercommit
  - eval_time: 10m
    alertname: KubeCPUOvercommit
    exp_alerts:
    - exp_labels:
        severity: warning
      exp_annotations:
        description: Cluster has overcommitted CPU resource requests for Pods by 0.10000000000000009 CPU shares and cannot tolerate node failure.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
        summary: Cluster has overcommitted CPU resource requests.

# Two control-plane nodes (still < 3, i.e. non-HA): pods request 4 CPU in total
# (2 + 2) while the nodes offer 3.8 (2 x 1.9). The rule compares requests with
# the full allocatable capacity, so the expected overcommit is 4 - 3.8
# (about 0.2 CPU, printed with its floating-point tail).
- name: KubeCPUOvercommit alert (multi-node; non-HA)
  interval: 1m
  input_series:
  # 'VxN' keeps the value V constant for the initial sample plus N more.
  - series: 'namespace_cpu:kube_pod_container_resource_requests:sum{cluster="kubernetes", namespace="default"}'
    values: '2x10'
  - series: 'namespace_cpu:kube_pod_container_resource_requests:sum{cluster="kubernetes", namespace="kube-system"}'
    values: '2x10'
  - series: 'kube_node_status_allocatable{cluster="kubernetes", node="n1", resource="cpu", job="kube-state-metrics"}'
    values: '1.9x10'  # This value was seen on a 2x vCPU node
  - series: 'kube_node_status_allocatable{cluster="kubernetes", node="n2", resource="cpu", job="kube-state-metrics"}'
    values: '1.9x10'
  - series: 'kube_node_role{cluster="kubernetes", node="n1", role="control-plane", job="kube-state-metrics"}'
    values: '1x10'
  - series: 'kube_node_role{cluster="kubernetes", node="n2", role="control-plane", job="kube-state-metrics"}'
    values: '1x10'
  alert_rule_test:
  # No exp_alerts listed, so nothing may be firing at 9m (presumably the rule's
  # `for` window has not elapsed yet - TODO confirm against the rule definition).
  - eval_time: 9m
    alertname: KubeCPUOvercommit
  - eval_time: 10m
    alertname: KubeCPUOvercommit
    exp_alerts:
    - exp_labels:
        severity: warning
      exp_annotations:
        description: Cluster has overcommitted CPU resource requests for Pods by 0.20000000000000018 CPU shares and cannot tolerate node failure.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
        summary: Cluster has overcommitted CPU resource requests.

# Three control-plane nodes (HA): the "< 3 control-plane nodes" branch does not
# apply, so the rule checks whether requests still fit after losing the largest
# node. Pods request 4 CPU (2 + 2); allocatable is 5.7 (3 x 1.9) minus the
# largest node (1.9) = 3.8, giving an expected overcommit of about 0.2 CPU
# (printed with its floating-point tail).
- name: KubeCPUOvercommit alert (multi-node; HA)
  interval: 1m
  input_series:
  # 'VxN' keeps the value V constant for the initial sample plus N more.
  - series: 'namespace_cpu:kube_pod_container_resource_requests:sum{cluster="kubernetes", namespace="default"}'
    values: '2x10'
  - series: 'namespace_cpu:kube_pod_container_resource_requests:sum{cluster="kubernetes", namespace="kube-system"}'
    values: '2x10'
  - series: 'kube_node_status_allocatable{cluster="kubernetes", node="n1", resource="cpu", job="kube-state-metrics"}'
    values: '1.9x10'  # This value was seen on a 2x vCPU node
  - series: 'kube_node_status_allocatable{cluster="kubernetes", node="n2", resource="cpu", job="kube-state-metrics"}'
    values: '1.9x10'
  - series: 'kube_node_status_allocatable{cluster="kubernetes", node="n3", resource="cpu", job="kube-state-metrics"}'
    values: '1.9x10'
  - series: 'kube_node_role{cluster="kubernetes", node="n1", role="control-plane", job="kube-state-metrics"}'
    values: '1x10'
  - series: 'kube_node_role{cluster="kubernetes", node="n2", role="control-plane", job="kube-state-metrics"}'
    values: '1x10'
  - series: 'kube_node_role{cluster="kubernetes", node="n3", role="control-plane", job="kube-state-metrics"}'
    values: '1x10'
  alert_rule_test:
  # No exp_alerts listed, so nothing may be firing at 9m (presumably the rule's
  # `for` window has not elapsed yet - TODO confirm against the rule definition).
  - eval_time: 9m
    alertname: KubeCPUOvercommit
  - eval_time: 10m
    alertname: KubeCPUOvercommit
    exp_alerts:
    - exp_labels:
        severity: warning
      exp_annotations:
        description: Cluster has overcommitted CPU resource requests for Pods by 0.20000000000000062 CPU shares and cannot tolerate node failure.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
        summary: Cluster has overcommitted CPU resource requests.

# Single control-plane node: pods request 2 GB in total (1 GB + 1 GB) while the
# only node offers 1 GB allocatable. Only one control-plane node is reported
# (< 3), so the rule compares requests with the full allocatable capacity and
# the expected overcommit is 2 GB - 1 GB = 1G (humanized).
- name: KubeMemoryOvercommit alert (single-node)
  interval: 1m
  input_series:
  # 'VxN' keeps the value V constant for the initial sample plus N more.
  - series: 'namespace_memory:kube_pod_container_resource_requests:sum{cluster="kubernetes", namespace="default"}'
    values: '1000000000x10'  # 1 GB
  - series: 'namespace_memory:kube_pod_container_resource_requests:sum{cluster="kubernetes", namespace="kube-system"}'
    values: '1000000000x10'
  - series: 'kube_node_status_allocatable{cluster="kubernetes", node="n1", resource="memory", job="kube-state-metrics"}'
    values: '1000000000x10'
  - series: 'kube_node_role{cluster="kubernetes", node="n1", role="control-plane", job="kube-state-metrics"}'
    values: '1x10'
  alert_rule_test:
  # No exp_alerts listed, so nothing may be firing at 9m (presumably the rule's
  # `for` window has not elapsed yet - TODO confirm against the rule definition).
  - eval_time: 9m
    alertname: KubeMemoryOvercommit
  - eval_time: 10m
    alertname: KubeMemoryOvercommit
    exp_alerts:
    - exp_labels:
        severity: warning
      exp_annotations:
        description: Cluster has overcommitted memory resource requests for Pods by 1G bytes and cannot tolerate node failure.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryovercommit
        summary: Cluster has overcommitted memory resource requests.

# Two control-plane nodes (still < 3, i.e. non-HA): pods request 4 GB in total
# (2 GB + 2 GB) while the nodes offer 2 GB (2 x 1 GB). The rule compares
# requests with the full allocatable capacity, so the expected overcommit is
# 4 GB - 2 GB = 2G (humanized).
- name: KubeMemoryOvercommit alert (multi-node; non-HA)
  interval: 1m
  input_series:
  # 'VxN' keeps the value V constant for the initial sample plus N more.
  - series: 'namespace_memory:kube_pod_container_resource_requests:sum{cluster="kubernetes", namespace="default"}'
    values: '2000000000x10'  # 2 GB
  - series: 'namespace_memory:kube_pod_container_resource_requests:sum{cluster="kubernetes", namespace="kube-system"}'
    values: '2000000000x10'
  - series: 'kube_node_status_allocatable{cluster="kubernetes", node="n1", resource="memory", job="kube-state-metrics"}'
    values: '1000000000x10'
  - series: 'kube_node_status_allocatable{cluster="kubernetes", node="n2", resource="memory", job="kube-state-metrics"}'
    values: '1000000000x10'
  - series: 'kube_node_role{cluster="kubernetes", node="n1", role="control-plane", job="kube-state-metrics"}'
    values: '1x10'
  - series: 'kube_node_role{cluster="kubernetes", node="n2", role="control-plane", job="kube-state-metrics"}'
    values: '1x10'
  alert_rule_test:
  # No exp_alerts listed, so nothing may be firing at 9m (presumably the rule's
  # `for` window has not elapsed yet - TODO confirm against the rule definition).
  - eval_time: 9m
    alertname: KubeMemoryOvercommit
  - eval_time: 10m
    alertname: KubeMemoryOvercommit
    exp_alerts:
    - exp_labels:
        severity: warning
      exp_annotations:
        description: Cluster has overcommitted memory resource requests for Pods by 2G bytes and cannot tolerate node failure.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryovercommit
        summary: Cluster has overcommitted memory resource requests.

# Three control-plane nodes (HA): requests (4 GB) exceed total allocatable
# (3 GB), but the "< 3 control-plane nodes" branch does not apply. The rule
# instead checks whether requests fit after losing the largest node:
# 4 GB - (3 GB - 1 GB) = 2G (humanized), which is the value expected below.
- name: KubeMemoryOvercommit alert (multi-node; HA)
  interval: 1m
  input_series:
  # 'VxN' keeps the value V constant for the initial sample plus N more.
  - series: 'namespace_memory:kube_pod_container_resource_requests:sum{cluster="kubernetes", namespace="default"}'
    values: '2000000000x10'  # 2 GB
  - series: 'namespace_memory:kube_pod_container_resource_requests:sum{cluster="kubernetes", namespace="kube-system"}'
    values: '2000000000x10'
  - series: 'kube_node_status_allocatable{cluster="kubernetes", node="n1", resource="memory", job="kube-state-metrics"}'
    values: '1000000000x10'
  - series: 'kube_node_status_allocatable{cluster="kubernetes", node="n2", resource="memory", job="kube-state-metrics"}'
    values: '1000000000x10'
  - series: 'kube_node_status_allocatable{cluster="kubernetes", node="n3", resource="memory", job="kube-state-metrics"}'
    values: '1000000000x10'
  - series: 'kube_node_role{cluster="kubernetes", node="n1", role="control-plane", job="kube-state-metrics"}'
    values: '1x10'
  - series: 'kube_node_role{cluster="kubernetes", node="n2", role="control-plane", job="kube-state-metrics"}'
    values: '1x10'
  - series: 'kube_node_role{cluster="kubernetes", node="n3", role="control-plane", job="kube-state-metrics"}'
    values: '1x10'
  alert_rule_test:
  # No exp_alerts listed, so nothing may be firing at 9m (presumably the rule's
  # `for` window has not elapsed yet - TODO confirm against the rule definition).
  - eval_time: 9m
    alertname: KubeMemoryOvercommit
  - eval_time: 10m
    alertname: KubeMemoryOvercommit
    exp_alerts:
    - exp_labels:
        severity: warning
      exp_annotations:
        description: Cluster has overcommitted memory resource requests for Pods by 2G bytes and cannot tolerate node failure.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryovercommit
        summary: Cluster has overcommitted memory resource requests.