Add runbook URLs to alerting rules

simonpasquier · simonpasquier · commit b039259aec94 · 2025-10-10T09:04:09.000+02:00
This commit adds the `runbook_url` annotation to info/warning alerting
rules for which an alerting rule with critical severity exists.

Signed-off-by: Simon Pasquier <spasquie@redhat.com>
diff --git a/jsonnet/jsonnetfile.lock.json b/jsonnet/jsonnetfile.lock.json
@@ -8,7 +8,7 @@
           "subdir": "contrib/mixin"
         }
       },
-      "version": "c218423621d0f574b709a2c5920970669e00f21c",
+      "version": "e4d6a05f8f1ec972384e24a83c420f707a6644f2",
       "sum": "XmXkOCriQIZmXwlIIFhqlJMa0e6qGWdxZD+ZDYaN0Po="
     },
     {
@@ -18,7 +18,7 @@
           "subdir": "gen/grafonnet-v10.0.0"
         }
       },
-      "version": "5a8f3d6aa89b7e7513528371d2d1265aedc844bc",
+      "version": "42d098fae987f25f08480e203ca6ddc548c6efbf",
       "sum": "xdcrJPJlpkq4+5LpGwN4tPAuheNNLXZjE6tDcyvFjr0="
     },
     {
diff --git a/jsonnet/main.jsonnet b/jsonnet/main.jsonnet
@@ -15,9 +15,25 @@ local excludeRules = std.map(
   }, alertingRules + promRules
 );
 
-// modifiedRules injects runbook_url to all critical alerts on all rules.
+// Collect alert names for runbook_url annotations.
+// By definition, an alerting rule with a critical severity label must have a
+// runbook URL.
+local alertingRulesWithRunbooks = std.flattenArrays(std.map(
+  function(group)
+    std.map(
+      function(rule)
+        rule.alert,
+      std.filter(
+        function(rule)
+          std.objectHas(rule, 'alert') && rule.labels.severity == 'critical',
+        group.rules
+      )
+    ),
+  excludeRules + openshiftRules.prometheusRules.groups
+));
+
 local modifiedRules = std.map(function(group) group {
-  rules: std.map(function(rule) if 'alert' in rule && !('runbook_url' in rule.annotations) && (rule.labels.severity == 'critical') then rule {
+  rules: std.map(function(rule) if 'alert' in rule && !('runbook_url' in rule.annotations) && std.member(alertingRulesWithRunbooks, rule.alert) then rule {
                    annotations+: {
                      runbook_url: 'https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/%s.md' % rule.alert,
                    },
diff --git a/manifests/0000_90_etcd-operator_03_prometheusrule.yaml b/manifests/0000_90_etcd-operator_03_prometheusrule.yaml
@@ -42,6 +42,7 @@ spec:
     - alert: etcdHighFsyncDurations
       annotations:
         description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
+        runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdHighFsyncDurations.md
         summary: etcd cluster 99th percentile fsync durations are too high.
       expr: |
         histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
@@ -84,6 +85,7 @@ spec:
     - alert: etcdDatabaseQuotaLowSpace
       annotations:
         description: 'etcd cluster "{{ $labels.job }}": database size is 65% of the defined quota on etcd instance {{ $labels.instance }}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.'
+        runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdDatabaseQuotaLowSpace.md
         summary: etcd cluster database is using >= 65% of the defined quota.
       expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) / last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100 > 65
       for: 10m
@@ -92,6 +94,7 @@ spec:
     - alert: etcdDatabaseQuotaLowSpace
       annotations:
         description: 'etcd cluster "{{ $labels.job }}": database size is 75% of the defined quota on etcd instance {{ $labels.instance }}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.'
+        runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdDatabaseQuotaLowSpace.md
         summary: etcd cluster database is using >= 75% of the defined quota.
       expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) / last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100 > 75
       for: 10m
@@ -132,6 +135,7 @@ spec:
     - alert: etcdHighNumberOfFailedGRPCRequests
       annotations:
         description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
+        runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdHighNumberOfFailedGRPCRequests.md
         summary: etcd cluster has high number of failed grpc requests.
       expr: |
         (sum(rate(grpc_server_handled_total{job="etcd", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)

Original file line number	Diff line number	Diff line change
`@@ -8,7 +8,7 @@`
`8`	`8`	`"subdir": "contrib/mixin"`
`9`	`9`	`}`
`10`	`10`	`},`
`11`		`- "version": "c218423621d0f574b709a2c5920970669e00f21c",`
	`11`	`+ "version": "e4d6a05f8f1ec972384e24a83c420f707a6644f2",`
`12`	`12`	`"sum": "XmXkOCriQIZmXwlIIFhqlJMa0e6qGWdxZD+ZDYaN0Po="`
`13`	`13`	`},`
`14`	`14`	`{`
`@@ -18,7 +18,7 @@`
`18`	`18`	`"subdir": "gen/grafonnet-v10.0.0"`
`19`	`19`	`}`
`20`	`20`	`},`
`21`		`- "version": "5a8f3d6aa89b7e7513528371d2d1265aedc844bc",`
	`21`	`+ "version": "42d098fae987f25f08480e203ca6ddc548c6efbf",`
`22`	`22`	`"sum": "xdcrJPJlpkq4+5LpGwN4tPAuheNNLXZjE6tDcyvFjr0="`
`23`	`23`	`},`
`24`	`24`	`{`