From f2f70372d27eba933ff0a288a030d33a240414d0 Mon Sep 17 00:00:00 2001 From: Thomas Jungblut Date: Thu, 9 Oct 2025 14:36:43 +0200 Subject: [PATCH] NO-JIRA: add recommended etcd threshold for alerts --- jsonnet/custom.libsonnet | 15 ++++++++- jsonnet/jsonnetfile.lock.json | 4 +-- jsonnet/main.jsonnet | 2 +- ...00_90_etcd-operator_03_prometheusrule.yaml | 32 ++++++------------- 4 files changed, 27 insertions(+), 26 deletions(-) diff --git a/jsonnet/custom.libsonnet b/jsonnet/custom.libsonnet index 5cd4cb1d7..2527dbfe9 100644 --- a/jsonnet/custom.libsonnet +++ b/jsonnet/custom.libsonnet @@ -61,7 +61,7 @@ }, { alert: 'etcdHighCommitDurations', - expr: 'histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.5', + expr: 'histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.025', 'for': '10m', labels: { severity: 'warning', @@ -71,6 +71,19 @@ summary: 'etcd cluster 99th percentile commit durations are too high.', }, }, + { + alert: 'etcdHighFsyncDurations', + expr: 'histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.010', + 'for': '10m', + labels: { + severity: 'critical', + }, + annotations: { + description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.', + summary: 'etcd cluster 99th percentile fsync durations are too high.', + runbook_url: 'https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdHighFsyncDurations.md' + }, + }, { alert: 'etcdHighNumberOfFailedGRPCRequests', expr: ||| diff --git a/jsonnet/jsonnetfile.lock.json b/jsonnet/jsonnetfile.lock.json index 403af4e40..2fa756c32 100644 --- a/jsonnet/jsonnetfile.lock.json +++ b/jsonnet/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/mixin" } }, - "version": "c218423621d0f574b709a2c5920970669e00f21c", + "version": "2261dfb44bd8512226fc653a44173e5f57251c59", "sum": "XmXkOCriQIZmXwlIIFhqlJMa0e6qGWdxZD+ZDYaN0Po=" }, { @@ -18,7 +18,7 @@ "subdir": "gen/grafonnet-v10.0.0" } }, - "version": "5a8f3d6aa89b7e7513528371d2d1265aedc844bc", + "version": "42d098fae987f25f08480e203ca6ddc548c6efbf", "sum": "xdcrJPJlpkq4+5LpGwN4tPAuheNNLXZjE6tDcyvFjr0=" }, { diff --git a/jsonnet/main.jsonnet b/jsonnet/main.jsonnet index fc5fcc91e..f316a8c12 100644 --- a/jsonnet/main.jsonnet +++ b/jsonnet/main.jsonnet @@ -6,7 +6,7 @@ local promRules = if std.objectHasAll(etcdMixin, 'prometheusRules') then etcdMix // Exclude rules that are either OpenShift specific or do not work for OpenShift. // List should be ordered! -local excludedAlerts = ['etcdDatabaseQuotaLowSpace', 'etcdGRPCRequestsSlow', 'etcdHighCommitDurations', 'etcdHighNumberOfFailedGRPCRequests', 'etcdHighNumberOfLeaderChanges', 'etcdInsufficientMembers', 'etcdMembersDown']; +local excludedAlerts = ['etcdDatabaseQuotaLowSpace', 'etcdGRPCRequestsSlow', 'etcdHighCommitDurations', 'etcdHighFsyncDurations', 'etcdHighNumberOfFailedGRPCRequests', 'etcdHighNumberOfLeaderChanges', 'etcdInsufficientMembers', 'etcdMembersDown']; local excludeRules = std.map( function(group) group { rules: std.filter( diff --git a/manifests/0000_90_etcd-operator_03_prometheusrule.yaml b/manifests/0000_90_etcd-operator_03_prometheusrule.yaml index 58b398a85..ab9564b63 100644 --- a/manifests/0000_90_etcd-operator_03_prometheusrule.yaml +++ b/manifests/0000_90_etcd-operator_03_prometheusrule.yaml @@ -39,27 +39,6 @@ spec: for: 15m labels: severity: warning - - alert: etcdHighFsyncDurations - annotations: - description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.' - summary: etcd cluster 99th percentile fsync durations are too high. - expr: | - histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) - > 0.5 - for: 10m - labels: - severity: warning - - alert: etcdHighFsyncDurations - annotations: - description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.' - runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdHighFsyncDurations.md - summary: etcd cluster 99th percentile fsync durations are too high. - expr: | - histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) - > 1 - for: 10m - labels: - severity: critical - alert: etcdExcessiveDatabaseGrowth annotations: description: 'etcd cluster "{{ $labels.job }}": Predicting running out of disk space in the next four hours, based on write observations within the past four hours on etcd instance {{ $labels.instance }}, please check as it might be disruptive.' @@ -125,10 +104,19 @@ spec: annotations: description: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.' summary: etcd cluster 99th percentile commit durations are too high. - expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.5 + expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.025 for: 10m labels: severity: warning + - alert: etcdHighFsyncDurations + annotations: + description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.' + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdHighFsyncDurations.md + summary: etcd cluster 99th percentile fsync durations are too high. + expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.010 + for: 10m + labels: + severity: critical - alert: etcdHighNumberOfFailedGRPCRequests annotations: description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'