Skip to content

Commit 49b5313

Browse files
Merge pull request #1496 from simonpasquier/add-runbook-urls
NO-JIRA: add runbook urls for alerting rules
2 parents afe57ac + b039259 commit 49b5313

File tree

3 files changed: 24 additions and 4 deletions.

jsonnet/jsonnetfile.lock.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
"subdir": "contrib/mixin"
99
}
1010
},
11-
"version": "c218423621d0f574b709a2c5920970669e00f21c",
11+
"version": "e4d6a05f8f1ec972384e24a83c420f707a6644f2",
1212
"sum": "XmXkOCriQIZmXwlIIFhqlJMa0e6qGWdxZD+ZDYaN0Po="
1313
},
1414
{
@@ -18,7 +18,7 @@
1818
"subdir": "gen/grafonnet-v10.0.0"
1919
}
2020
},
21-
"version": "5a8f3d6aa89b7e7513528371d2d1265aedc844bc",
21+
"version": "42d098fae987f25f08480e203ca6ddc548c6efbf",
2222
"sum": "xdcrJPJlpkq4+5LpGwN4tPAuheNNLXZjE6tDcyvFjr0="
2323
},
2424
{

jsonnet/main.jsonnet

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,25 @@ local excludeRules = std.map(
1515
}, alertingRules + promRules
1616
);
1717

18-
// modifiedRules injects runbook_url to all critical alerts on all rules.
18+
// Collect alert names for runbook_url annotations.
19+
// By definition, an alerting rule with a critical severity label must have a
20+
// runbook URL.
21+
local alertingRulesWithRunbooks = std.flattenArrays(std.map(
22+
function(group)
23+
std.map(
24+
function(rule)
25+
rule.alert,
26+
std.filter(
27+
function(rule)
28+
std.objectHas(rule, 'alert') && rule.labels.severity == 'critical',
29+
group.rules
30+
)
31+
),
32+
excludeRules + openshiftRules.prometheusRules.groups
33+
));
34+
1935
local modifiedRules = std.map(function(group) group {
20-
rules: std.map(function(rule) if 'alert' in rule && !('runbook_url' in rule.annotations) && (rule.labels.severity == 'critical') then rule {
36+
rules: std.map(function(rule) if 'alert' in rule && !('runbook_url' in rule.annotations) && std.member(alertingRulesWithRunbooks, rule.alert) then rule {
2137
annotations+: {
2238
runbook_url: 'https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/%s.md' % rule.alert,
2339
},

manifests/0000_90_etcd-operator_03_prometheusrule.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ spec:
4242
- alert: etcdHighFsyncDurations
4343
annotations:
4444
description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
45+
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdHighFsyncDurations.md
4546
summary: etcd cluster 99th percentile fsync durations are too high.
4647
expr: |
4748
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
@@ -84,6 +85,7 @@ spec:
8485
- alert: etcdDatabaseQuotaLowSpace
8586
annotations:
8687
description: 'etcd cluster "{{ $labels.job }}": database size is 65% of the defined quota on etcd instance {{ $labels.instance }}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.'
88+
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdDatabaseQuotaLowSpace.md
8789
summary: etcd cluster database is using >= 65% of the defined quota.
8890
expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) / last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100 > 65
8991
for: 10m
@@ -92,6 +94,7 @@ spec:
9294
- alert: etcdDatabaseQuotaLowSpace
9395
annotations:
9496
description: 'etcd cluster "{{ $labels.job }}": database size is 75% of the defined quota on etcd instance {{ $labels.instance }}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.'
97+
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdDatabaseQuotaLowSpace.md
9598
summary: etcd cluster database is using >= 75% of the defined quota.
9699
expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) / last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100 > 75
97100
for: 10m
@@ -132,6 +135,7 @@ spec:
132135
- alert: etcdHighNumberOfFailedGRPCRequests
133136
annotations:
134137
description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
138+
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdHighNumberOfFailedGRPCRequests.md
135139
summary: etcd cluster has high number of failed grpc requests.
136140
expr: |
137141
(sum(rate(grpc_server_handled_total{job="etcd", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)

0 commit comments

Comments
 (0)