Skip to content

Commit bdbf7f4

Browse files
authored
Update CPUThrottlingHigh and KubeContainerWaiting alerts (#942)
* CPUThrottlingHigh: change the aggregation from `by` to `without` so that external labels are kept. Add the `cadvisorSelector` selector.
* KubeContainerWaiting: remove the `sum by` aggregation so that external labels are kept. Add the waiting reason to the description.
1 parent 3830dfd commit bdbf7f4

File tree

2 files changed

+4
-4
lines changed

2 files changed

+4
-4
lines changed

alerts/apps_alerts.libsonnet

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -211,13 +211,13 @@ local utils = import '../lib/utils.libsonnet';
211211
},
212212
{
213213
expr: |||
214-
sum by (namespace, pod, container, %(clusterLabel)s) (kube_pod_container_status_waiting_reason{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}) > 0
214+
kube_pod_container_status_waiting_reason{reason!="CrashLoopBackOff", %(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s} > 0
215215
||| % $._config,
216216
labels: {
217217
severity: 'warning',
218218
},
219219
annotations: {
220-
description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour.',
220+
description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour. (reason: "{{ $labels.reason }}").',
221221
summary: 'Pod container waiting longer than 1 hour',
222222
},
223223
'for': '1h',

alerts/resource_alerts.libsonnet

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -199,9 +199,9 @@
199199
{
200200
alert: 'CPUThrottlingHigh',
201201
expr: |||
202-
sum(increase(container_cpu_cfs_throttled_periods_total{container!="", %(cpuThrottlingSelector)s}[5m])) by (%(clusterLabel)s, container, pod, namespace)
202+
sum(increase(container_cpu_cfs_throttled_periods_total{container!="", %(cadvisorSelector)s, %(cpuThrottlingSelector)s}[5m])) without (id, metrics_path, name, image, endpoint, job, node)
203203
/
204-
sum(increase(container_cpu_cfs_periods_total{%(cpuThrottlingSelector)s}[5m])) by (%(clusterLabel)s, container, pod, namespace)
204+
sum(increase(container_cpu_cfs_periods_total{%(cadvisorSelector)s, %(cpuThrottlingSelector)s}[5m])) without (id, metrics_path, name, image, endpoint, job, node)
205205
> ( %(cpuThrottlingPercent)s / 100 )
206206
||| % $._config,
207207
'for': '15m',

0 commit comments

Comments
 (0)