Skip to content

Commit 5f7bb7a

Browse files
authored
Protect KubeletPodStartUpLatencyHigh against duplicate series (#1086)
1 parent 1da53ae commit 5f7bb7a

File tree

2 files changed

+48
-1
lines changed

2 files changed

+48
-1
lines changed

alerts/kubelet.libsonnet

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,18 @@ local utils = import '../lib/utils.libsonnet';
164 164
{
165 165
alert: 'KubeletPodStartUpLatencyHigh',
166 166
expr: |||
167-
histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{%(kubeletSelector)s}[5m])) by (%(clusterLabel)s, instance, le)) * on(%(clusterLabel)s, instance) group_left(node) kubelet_node_name{%(kubeletSelector)s} > 60
167+
histogram_quantile(0.99,
168+
sum by (%(clusterLabel)s, instance, le) (
169+
topk by (%(clusterLabel)s, instance, le, operation_type) (1,
170+
rate(kubelet_pod_worker_duration_seconds_bucket{%(kubeletSelector)s}[5m])
171+
)
172+
)
173+
)
174+
* on(%(clusterLabel)s, instance) group_left(node)
175+
topk by (%(clusterLabel)s, instance, node) (1,
176+
kubelet_node_name{%(kubeletSelector)s}
177+
)
178+
> 60
168 179
||| % $._config,
169 180
'for': '15m',
170 181
labels: {

tests/kubelet-test.yaml

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
1 1
rule_files:
2 2
- ../prometheus_rules.yaml
3+
- ../prometheus_alerts.yaml
3 4

4 5
evaluation_interval: 1m
5 6

@@ -26,3 +27,38 @@ tests:
26 27
labels: 'node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{cluster="cluster",instance="ip-172-0-0-1", node="ip-172-0-0-1", quantile="0.9"}'
27 28
- value: 0.99
28 29
labels: 'node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{cluster="cluster",instance="ip-172-0-0-1", node="ip-172-0-0-1", quantile="0.99"}'
30+
31+
- name: "KubeletPodStartUpLatencyHigh alert test, including duplicate series check"
32+
interval: 1m
33+
input_series:
34+
- series: 'kubelet_pod_worker_duration_seconds_bucket{cluster="cluster",instance="ip-172-0-0-1",job="kubelet",le="+Inf"}'
35+
values: '0+1x16'
36+
- series: 'kubelet_pod_worker_duration_seconds_bucket{cluster="cluster",instance="ip-172-0-0-1",job="kubelet",le="+Inf",dupe="dupe"}'
37+
values: '0+1x16'
38+
- series: 'kubelet_pod_worker_duration_seconds_bucket{cluster="cluster",instance="ip-172-0-0-1",job="kubelet",le="99"}'
39+
values: '0+1x16'
40+
- series: 'kubelet_pod_worker_duration_seconds_bucket{cluster="cluster",instance="ip-172-0-0-1",job="kubelet",le="99",dupe="dupe"}'
41+
values: '0+1x16'
42+
- series: 'kubelet_pod_worker_duration_seconds_count{cluster="cluster",instance="ip-172-0-0-1",job="kubelet"}'
43+
values: '0+1x16'
44+
- series: 'kubelet_pod_worker_duration_seconds_count{cluster="cluster",instance="ip-172-0-0-1",job="kubelet",dupe="dupe"}'
45+
values: '0+1x16'
46+
- series: 'kubelet_node_name{cluster="cluster",node="ip-172-0-0-1",instance="ip-172-0-0-1",job="kubelet"}'
47+
values: '1x16'
48+
- series: 'kubelet_node_name{cluster="cluster",node="ip-172-0-0-1",instance="ip-172-0-0-1",job="kubelet",dupe="dupe"}'
49+
values: '1x16'
50+
alert_rule_test:
51+
- eval_time: 15m
52+
alertname: KubeletPodStartUpLatencyHigh
53+
- eval_time: 16m
54+
alertname: KubeletPodStartUpLatencyHigh
55+
exp_alerts:
56+
- exp_labels:
57+
severity: warning
58+
cluster: cluster
59+
instance: ip-172-0-0-1
60+
node: ip-172-0-0-1
61+
exp_annotations:
62+
summary: "Kubelet Pod startup latency is too high."
63+
description: "Kubelet Pod startup 99th percentile latency is 98.01 seconds on node ip-172-0-0-1."
64+
runbook_url: "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletpodstartuplatencyhigh"

0 commit comments

Comments
 (0)