Skip to content

Commit 3422a51

Browse files
authored
Merge pull request #766 from grafana/alerts_by_cluster
Add cluster label to aggregations in alert queries
2 parents 62ad10f + 69f4291 commit 3422a51

File tree

3 files changed

+10
-10
lines changed

3 files changed

+10
-10
lines changed

alerts/apps_alerts.libsonnet

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -27,15 +27,15 @@
2727
},
2828
{
2929
// We wrap kube_pod_owner with the topk() aggregator to ensure that
30-
// every (namespace, pod) tuple is unique even if the "owner_kind"
30+
// every (namespace, pod, %(clusterLabel)s) tuple is unique even if the "owner_kind"
3131
// label exists for 2 values. This avoids "many-to-many matching
3232
// not allowed" errors when joining with kube_pod_status_phase.
3333
expr: |||
34-
sum by (namespace, pod) (
35-
max by(namespace, pod) (
34+
sum by (namespace, pod, %(clusterLabel)s) (
35+
max by(namespace, pod, %(clusterLabel)s) (
3636
kube_pod_status_phase{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, phase=~"Pending|Unknown"}
37-
) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (
38-
1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})
37+
) * on(namespace, pod, %(clusterLabel)s) group_left(owner_kind) topk by(namespace, pod, %(clusterLabel)s) (
38+
1, max by(namespace, pod, owner_kind, %(clusterLabel)s) (kube_pod_owner{owner_kind!="Job"})
3939
)
4040
) > 0
4141
||| % $._config,
@@ -193,7 +193,7 @@
193193
},
194194
{
195195
expr: |||
196-
sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}) > 0
196+
sum by (namespace, pod, container, %(clusterLabel)s) (kube_pod_container_status_waiting_reason{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}) > 0
197197
||| % $._config,
198198
labels: {
199199
severity: 'warning',
@@ -238,7 +238,7 @@
238238
{
239239
alert: 'KubeJobNotCompleted',
240240
expr: |||
241-
time() - max by(namespace, job_name) (kube_job_status_start_time{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
241+
time() - max by(namespace, job_name, %(clusterLabel)s) (kube_job_status_start_time{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
242242
and
243243
kube_job_status_active{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s} > 0) > %(kubeJobTimeoutDuration)s
244244
||| % $._config,

alerts/kube_apiserver.libsonnet

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ local utils = import '../lib/utils.libsonnet';
7575
{
7676
alert: 'KubeAggregatedAPIErrors',
7777
expr: |||
78-
sum by(name, namespace)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
78+
sum by(name, namespace, %(clusterLabel)s)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
7979
||| % $._config,
8080
labels: {
8181
severity: 'warning',
@@ -88,7 +88,7 @@ local utils = import '../lib/utils.libsonnet';
8888
{
8989
alert: 'KubeAggregatedAPIDown',
9090
expr: |||
91-
(1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
91+
(1 - max by(name, namespace, %(clusterLabel)s)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
9292
||| % $._config,
9393
'for': '5m',
9494
labels: {

alerts/system_alerts.libsonnet

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
{
1212
alert: 'KubeVersionMismatch',
1313
expr: |||
14-
count(count by (git_version) (label_replace(kubernetes_build_info{%(notKubeDnsCoreDnsSelector)s},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
14+
count by (%(clusterLabel)s) (count by (git_version, %(clusterLabel)s) (label_replace(kubernetes_build_info{%(notKubeDnsCoreDnsSelector)s},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
1515
||| % $._config,
1616
'for': '15m',
1717
labels: {

0 commit comments

Comments
 (0)