|
27 | 27 | }, |
28 | 28 | { |
29 | 29 | // We wrap kube_pod_owner with the topk() aggregator to ensure that |
30 | | - // every (namespace, pod) tuple is unique even if the "owner_kind" |
| 30 | + // every (namespace, pod, $._config.clusterLabel) tuple is unique even if the "owner_kind" |
31 | 31 | // label exists for 2 values. This avoids "many-to-many matching |
32 | 32 | // not allowed" errors when joining with kube_pod_status_phase. |
33 | 33 | expr: ||| |
34 | | - sum by (namespace, pod) ( |
35 | | - max by(namespace, pod) ( |
| 34 | + sum by (namespace, pod, %(clusterLabel)s) ( |
| 35 | + max by(namespace, pod, %(clusterLabel)s) ( |
36 | 36 | kube_pod_status_phase{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, phase=~"Pending|Unknown"} |
37 | | - ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) ( |
38 | | - 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"}) |
| 37 | + ) * on(namespace, pod, %(clusterLabel)s) group_left(owner_kind) topk by(namespace, pod, %(clusterLabel)s) ( |
| 38 | + 1, max by(namespace, pod, owner_kind, %(clusterLabel)s) (kube_pod_owner{owner_kind!="Job"}) |
39 | 39 | ) |
40 | 40 | ) > 0 |
41 | 41 | ||| % $._config, |
|
193 | 193 | }, |
194 | 194 | { |
195 | 195 | expr: ||| |
196 | | - sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}) > 0 |
| 196 | + sum by (namespace, pod, container, %(clusterLabel)s) (kube_pod_container_status_waiting_reason{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}) > 0 |
197 | 197 | ||| % $._config, |
198 | 198 | labels: { |
199 | 199 | severity: 'warning', |
|
238 | 238 | { |
239 | 239 | alert: 'KubeJobNotCompleted', |
240 | 240 | expr: ||| |
241 | | - time() - max by(namespace, job_name) (kube_job_status_start_time{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s} |
| 241 | + time() - max by(namespace, job_name, %(clusterLabel)s) (kube_job_status_start_time{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s} |
242 | 242 | and |
243 | 243 | kube_job_status_active{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s} > 0) > %(kubeJobTimeoutDuration)s |
244 | 244 | ||| % $._config, |
|
0 commit comments