
Commit 0821ada

Merge pull request #1425 from raptorsun/bugfix/rule_test
2 parents 10c11d5 + 1e632d2 commit 0821ada

17 files changed: +10070 and -3236 lines

manifests/alertmanager-prometheusRule.yaml

Lines changed: 31 additions & 13 deletions
@@ -16,7 +16,8 @@ spec:
     rules:
     - alert: AlertmanagerFailedReload
       annotations:
-        description: Configuration has failed to load for {{ $labels.namespace }}/{{ $labels.pod}}.
+        description: Configuration has failed to load for {{ $labels.namespace }}/{{
+          $labels.pod}}.
         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedreload
         summary: Reloading an Alertmanager configuration has failed.
       expr: |
@@ -28,9 +29,11 @@ spec:
         severity: critical
     - alert: AlertmanagerMembersInconsistent
       annotations:
-        description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} has only found {{ $value }} members of the {{$labels.job}} cluster.
+        description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} has only
+          found {{ $value }} members of the {{$labels.job}} cluster.
         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagermembersinconsistent
-        summary: A member of an Alertmanager cluster has not found all other cluster members.
+        summary: A member of an Alertmanager cluster has not found all other cluster
+          members.
       expr: |
         # Without max_over_time, failed scrapes could create false negatives, see
         # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
@@ -42,7 +45,9 @@ spec:
         severity: critical
     - alert: AlertmanagerFailedToSendAlerts
       annotations:
-        description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} failed to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration }}.
+        description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} failed
+          to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration
+          }}.
         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedtosendalerts
         summary: An Alertmanager instance failed to send notifications.
       expr: |
@@ -57,9 +62,12 @@ spec:
         severity: warning
     - alert: AlertmanagerClusterFailedToSendAlerts
       annotations:
-        description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}.
+        description: The minimum notification failure rate to {{ $labels.integration
+          }} sent from any instance in the {{$labels.job}} cluster is {{ $value |
+          humanizePercentage }}.
         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts
-        summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration.
+        summary: All Alertmanager instances in a cluster failed to send notifications
+          to a critical integration.
       expr: |
         min by (namespace,service, integration) (
           rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring", integration=~`.*`}[5m])
@@ -72,9 +80,12 @@ spec:
         severity: critical
     - alert: AlertmanagerClusterFailedToSendAlerts
       annotations:
-        description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}.
+        description: The minimum notification failure rate to {{ $labels.integration
+          }} sent from any instance in the {{$labels.job}} cluster is {{ $value |
+          humanizePercentage }}.
         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts
-        summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration.
+        summary: All Alertmanager instances in a cluster failed to send notifications
+          to a non-critical integration.
       expr: |
         min by (namespace,service, integration) (
           rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring", integration!~`.*`}[5m])
@@ -87,7 +98,8 @@ spec:
         severity: warning
     - alert: AlertmanagerConfigInconsistent
       annotations:
-        description: Alertmanager instances within the {{$labels.job}} cluster have different configurations.
+        description: Alertmanager instances within the {{$labels.job}} cluster have
+          different configurations.
         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerconfiginconsistent
         summary: Alertmanager instances within the same cluster have different configurations.
       expr: |
@@ -100,9 +112,12 @@ spec:
         severity: critical
     - alert: AlertmanagerClusterDown
      annotations:
-        description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have been up for less than half of the last 5m.'
+        description: '{{ $value | humanizePercentage }} of Alertmanager instances
+          within the {{$labels.job}} cluster have been up for less than half of the
+          last 5m.'
         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterdown
-        summary: Half or more of the Alertmanager instances within the same cluster are down.
+        summary: Half or more of the Alertmanager instances within the same cluster
+          are down.
       expr: |
         (
           count by (namespace,service) (
@@ -119,9 +134,12 @@ spec:
         severity: critical
     - alert: AlertmanagerClusterCrashlooping
       annotations:
-        description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have restarted at least 5 times in the last 10m.'
+        description: '{{ $value | humanizePercentage }} of Alertmanager instances
+          within the {{$labels.job}} cluster have restarted at least 5 times in the
+          last 10m.'
         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclustercrashlooping
-        summary: Half or more of the Alertmanager instances within the same cluster are crashlooping.
+        summary: Half or more of the Alertmanager instances within the same cluster
+          are crashlooping.
       expr: |
         (
           count by (namespace,service) (
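
The description and summary rewraps above are purely cosmetic: YAML folds the inserted line break in a wrapped plain scalar back into a single space when the manifest is parsed, so the rendered annotation text is unchanged. A minimal sketch of that equivalence, assuming PyYAML is installed (the snippet is illustrative and not part of this repository):

# Sketch: show that the re-wrapped plain scalar parses to the same string.
# Assumes PyYAML (pip install pyyaml).
import yaml

old = yaml.safe_load(
    "description: Configuration has failed to load for "
    "{{ $labels.namespace }}/{{ $labels.pod}}."
)

new = yaml.safe_load(
    "description: Configuration has failed to load for {{ $labels.namespace }}/{{\n"
    "  $labels.pod}}.\n"
)

# YAML line folding turns the wrapped line break into a single space,
# so both forms yield the identical annotation text.
assert old == new
print(new["description"])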

manifests/kube-prometheus-prometheusRule.yaml

Lines changed: 15 additions & 7 deletions
@@ -15,10 +15,12 @@ spec:
     rules:
     - alert: TargetDown
       annotations:
-        description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.'
+        description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service
+          }} targets in {{ $labels.namespace }} namespace are down.'
         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown
         summary: One or more targets are unreachable.
-      expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10
+      expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job,
+        namespace, service)) > 10
       for: 10m
       labels:
         severity: warning
@@ -31,15 +33,17 @@ spec:
           mechanisms that send a notification when this alert is not firing. For example the
           "DeadMansSnitch" integration in PagerDuty.
         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog
-        summary: An alert that should always be firing to certify that Alertmanager is working properly.
+        summary: An alert that should always be firing to certify that Alertmanager
+          is working properly.
       expr: vector(1)
       labels:
         severity: none
   - name: node-network
     rules:
     - alert: NodeNetworkInterfaceFlapping
       annotations:
-        description: Network interface "{{ $labels.device }}" changing its up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}
+        description: Network interface "{{ $labels.device }}" changing its up status
+          often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}
         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/nodenetworkinterfaceflapping
         summary: Network interface is often changing its status
       expr: |
@@ -49,17 +53,21 @@ spec:
         severity: warning
   - name: kube-prometheus-node-recording.rules
     rules:
-    - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m])) BY (instance)
+    - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m]))
+        BY (instance)
       record: instance:node_cpu:rate:sum
     - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
       record: instance:node_network_receive_bytes:rate:sum
     - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
       record: instance:node_network_transmit_bytes:rate:sum
-    - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance)
+    - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))
+        WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total)
+        BY (instance, cpu)) BY (instance)
       record: instance:node_cpu:ratio
     - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))
       record: cluster:node_cpu:sum_rate5m
-    - expr: cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu))
+    - expr: cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total)
+        BY (instance, cpu))
       record: cluster:node_cpu:ratio
   - name: kube-prometheus-general.rules
     rules:
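
Because this change only rewraps long scalars, the quickest way to review the recording rules is to load the manifest and collapse each expression back onto a single line. A rough sketch, assuming PyYAML and a local checkout of the repository (the script and its output format are illustrative only, not part of this commit):

# Sketch: flatten the (possibly wrapped) expressions in a PrometheusRule
# manifest back into single-line PromQL for easier review.
# Assumes PyYAML and a checkout containing the manifest below.
import yaml

with open("manifests/kube-prometheus-prometheusRule.yaml") as f:
    manifest = yaml.safe_load(f)

for group in manifest["spec"]["groups"]:
    for rule in group["rules"]:
        name = rule.get("record") or rule.get("alert")
        # Block-scalar exprs keep their newlines; collapse all whitespace runs.
        expr = " ".join(rule["expr"].split())
        print(f"{group['name']}: {name}\n  {expr}\n")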

manifests/kube-state-metrics-prometheusRule.yaml

Lines changed: 11 additions & 4 deletions
@@ -16,7 +16,9 @@ spec:
     rules:
     - alert: KubeStateMetricsListErrors
       annotations:
-        description: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
+        description: kube-state-metrics is experiencing errors at an elevated rate
+          in list operations. This is likely causing it to not be able to expose metrics
+          about Kubernetes objects correctly or at all.
         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricslisterrors
         summary: kube-state-metrics is experiencing errors in list operations.
       expr: |
@@ -29,7 +31,9 @@ spec:
         severity: critical
     - alert: KubeStateMetricsWatchErrors
       annotations:
-        description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
+        description: kube-state-metrics is experiencing errors at an elevated rate
+          in watch operations. This is likely causing it to not be able to expose
+          metrics about Kubernetes objects correctly or at all.
         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricswatcherrors
         summary: kube-state-metrics is experiencing errors in watch operations.
       expr: |
@@ -42,7 +46,9 @@ spec:
         severity: critical
     - alert: KubeStateMetricsShardingMismatch
       annotations:
-        description: kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all.
+        description: kube-state-metrics pods are running with different --total-shards
+          configuration, some Kubernetes objects may be exposed multiple times or
+          not exposed at all.
         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardingmismatch
         summary: kube-state-metrics sharding is misconfigured.
       expr: |
@@ -52,7 +58,8 @@ spec:
         severity: critical
     - alert: KubeStateMetricsShardsMissing
       annotations:
-        description: kube-state-metrics shards are missing, some Kubernetes objects are not being exposed.
+        description: kube-state-metrics shards are missing, some Kubernetes objects
+          are not being exposed.
         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardsmissing
         summary: kube-state-metrics shards are missing.
       expr: |
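
A rewrapped manifest should still pass Prometheus' own rule validation. One way to check that locally is sketched below, assuming promtool is on PATH and PyYAML is available; this is only an illustration, not the rule test referenced by the source branch name:

# Sketch: extract spec.groups from a PrometheusRule manifest and run
# `promtool check rules` against it. Assumes PyYAML and promtool;
# this is illustrative and not the repository's test harness.
import subprocess
import tempfile

import yaml

MANIFEST = "manifests/kube-state-metrics-prometheusRule.yaml"

with open(MANIFEST) as f:
    manifest = yaml.safe_load(f)

# promtool expects a plain Prometheus rule file: a top-level `groups:` list.
with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as tmp:
    yaml.safe_dump({"groups": manifest["spec"]["groups"]}, tmp)
    path = tmp.name

subprocess.run(["promtool", "check", "rules", path], check=True)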
