Skip to content

Commit 3b95dd1

Browse files
authored
Merge pull request #2058 from jkroepke/multi-cluster
Implement multi-cluster alerts
2 parents 5f31736 + 7c7a9ce commit 3b95dd1

File tree

3 files changed

+15
-14
lines changed

3 files changed

+15
-14
lines changed

examples/prometheus-alerting-rules/alerts.yaml

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -6,9 +6,9 @@ groups:
66
description: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
77
summary: kube-state-metrics is experiencing errors in list operations.
88
expr: |
9-
(sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m]))
9+
(sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) by (cluster)
1010
/
11-
sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])))
11+
sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])) by (cluster))
1212
> 0.01
1313
for: 15m
1414
labels:
@@ -18,9 +18,9 @@ groups:
1818
description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
1919
summary: kube-state-metrics is experiencing errors in watch operations.
2020
expr: |
21-
(sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m]))
21+
(sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) by (cluster)
2222
/
23-
sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])))
23+
sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])) by (cluster))
2424
> 0.01
2525
for: 15m
2626
labels:
@@ -30,7 +30,7 @@ groups:
3030
description: kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all.
3131
summary: kube-state-metrics sharding is misconfigured.
3232
expr: |
33-
stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) != 0
33+
stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) by (cluster) != 0
3434
for: 15m
3535
labels:
3636
severity: critical
@@ -39,9 +39,9 @@ groups:
3939
description: kube-state-metrics shards are missing, some Kubernetes objects are not being exposed.
4040
summary: kube-state-metrics shards are missing.
4141
expr: |
42-
2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) - 1
42+
2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) by (cluster) - 1
4343
-
44-
sum( 2 ^ max by (shard_ordinal) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) )
44+
sum( 2 ^ max by (cluster, shard_ordinal) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) ) by (cluster)
4545
!= 0
4646
for: 15m
4747
labels:

jsonnet/kube-state-metrics-mixin/alerts.libsonnet

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -7,9 +7,9 @@
77
{
88
alert: 'KubeStateMetricsListErrors',
99
expr: |||
10-
(sum(rate(kube_state_metrics_list_total{%(kubeStateMetricsSelector)s,result="error"}[5m]))
10+
(sum(rate(kube_state_metrics_list_total{%(kubeStateMetricsSelector)s,result="error"}[5m])) by (%(clusterLabel)s)
1111
/
12-
sum(rate(kube_state_metrics_list_total{%(kubeStateMetricsSelector)s}[5m])))
12+
sum(rate(kube_state_metrics_list_total{%(kubeStateMetricsSelector)s}[5m])) by (%(clusterLabel)s))
1313
> 0.01
1414
||| % $._config,
1515
'for': '15m',
@@ -24,9 +24,9 @@
2424
{
2525
alert: 'KubeStateMetricsWatchErrors',
2626
expr: |||
27-
(sum(rate(kube_state_metrics_watch_total{%(kubeStateMetricsSelector)s,result="error"}[5m]))
27+
(sum(rate(kube_state_metrics_watch_total{%(kubeStateMetricsSelector)s,result="error"}[5m])) by (%(clusterLabel)s)
2828
/
29-
sum(rate(kube_state_metrics_watch_total{%(kubeStateMetricsSelector)s}[5m])))
29+
sum(rate(kube_state_metrics_watch_total{%(kubeStateMetricsSelector)s}[5m])) by (%(clusterLabel)s))
3030
> 0.01
3131
||| % $._config,
3232
'for': '15m',
@@ -42,7 +42,7 @@
4242
alert: 'KubeStateMetricsShardingMismatch',
4343
//
4444
expr: |||
45-
stdvar (kube_state_metrics_total_shards{%(kubeStateMetricsSelector)s}) != 0
45+
stdvar (kube_state_metrics_total_shards{%(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) != 0
4646
||| % $._config,
4747
'for': '15m',
4848
labels: {
@@ -61,9 +61,9 @@
6161
// A handy side effect of this computation is the result indicates what ordinals are missing.
6262
// Eg. a result of "5" decimal, which translates to binary "101", means shards #0 and #2 are not available.
6363
expr: |||
64-
2^max(kube_state_metrics_total_shards{%(kubeStateMetricsSelector)s}) - 1
64+
2^max(kube_state_metrics_total_shards{%(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) - 1
6565
-
66-
sum( 2 ^ max by (shard_ordinal) (kube_state_metrics_shard_ordinal{%(kubeStateMetricsSelector)s}) )
66+
sum( 2 ^ max by (%(clusterLabel)s, shard_ordinal) (kube_state_metrics_shard_ordinal{%(kubeStateMetricsSelector)s}) ) by (%(clusterLabel)s)
6767
!= 0
6868
||| % $._config,
6969
'for': '15m',

jsonnet/kube-state-metrics-mixin/config.libsonnet

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2,5 +2,6 @@
22
_config+:: {
33
// Select the metrics coming from the kube state metrics.
44
kubeStateMetricsSelector: 'job="kube-state-metrics"',
5+
clusterLabel: 'cluster',
56
},
67
}

0 commit comments

Comments
 (0)