Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion jsonnet/custom.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@
},
{
alert: 'etcdHighCommitDurations',
expr: 'histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.5',
expr: 'histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.025',
'for': '10m',
labels: {
severity: 'warning',
Expand All @@ -71,6 +71,19 @@
summary: 'etcd cluster 99th percentile commit durations are too high.',
},
},
{
alert: 'etcdHighFsyncDurations',
expr: 'histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.010',
'for': '10m',
labels: {
severity: 'critical',
},
annotations: {
description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.',
summary: 'etcd cluster 99th percentile fsync durations are too high.',
runbook_url: 'https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdHighFsyncDurations.md'
},
},
{
alert: 'etcdHighNumberOfFailedGRPCRequests',
expr: |||
Expand Down
4 changes: 2 additions & 2 deletions jsonnet/jsonnetfile.lock.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
"subdir": "contrib/mixin"
}
},
"version": "c218423621d0f574b709a2c5920970669e00f21c",
"version": "2261dfb44bd8512226fc653a44173e5f57251c59",
"sum": "XmXkOCriQIZmXwlIIFhqlJMa0e6qGWdxZD+ZDYaN0Po="
},
{
Expand All @@ -18,7 +18,7 @@
"subdir": "gen/grafonnet-v10.0.0"
}
},
"version": "5a8f3d6aa89b7e7513528371d2d1265aedc844bc",
"version": "42d098fae987f25f08480e203ca6ddc548c6efbf",
"sum": "xdcrJPJlpkq4+5LpGwN4tPAuheNNLXZjE6tDcyvFjr0="
},
{
Expand Down
2 changes: 1 addition & 1 deletion jsonnet/main.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ local promRules = if std.objectHasAll(etcdMixin, 'prometheusRules') then etcdMix

// Exclude rules that are either OpenShift specific or do not work for OpenShift.
// List should be ordered!
local excludedAlerts = ['etcdDatabaseQuotaLowSpace', 'etcdGRPCRequestsSlow', 'etcdHighCommitDurations', 'etcdHighNumberOfFailedGRPCRequests', 'etcdHighNumberOfLeaderChanges', 'etcdInsufficientMembers', 'etcdMembersDown'];
local excludedAlerts = ['etcdDatabaseQuotaLowSpace', 'etcdGRPCRequestsSlow', 'etcdHighCommitDurations', 'etcdHighFsyncDurations', 'etcdHighNumberOfFailedGRPCRequests', 'etcdHighNumberOfLeaderChanges', 'etcdInsufficientMembers', 'etcdMembersDown'];
local excludeRules = std.map(
function(group) group {
rules: std.filter(
Expand Down
32 changes: 10 additions & 22 deletions manifests/0000_90_etcd-operator_03_prometheusrule.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,27 +39,6 @@ spec:
for: 15m
labels:
severity: warning
- alert: etcdHighFsyncDurations
annotations:
description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
summary: etcd cluster 99th percentile fsync durations are too high.
expr: |
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.5
for: 10m
labels:
severity: warning
- alert: etcdHighFsyncDurations
annotations:
description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdHighFsyncDurations.md
summary: etcd cluster 99th percentile fsync durations are too high.
expr: |
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 1
for: 10m
labels:
severity: critical
- alert: etcdExcessiveDatabaseGrowth
annotations:
description: 'etcd cluster "{{ $labels.job }}": Predicting running out of disk space in the next four hours, based on write observations within the past four hours on etcd instance {{ $labels.instance }}, please check as it might be disruptive.'
Expand Down Expand Up @@ -125,10 +104,19 @@ spec:
annotations:
description: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.'
summary: etcd cluster 99th percentile commit durations are too high.
expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.5
expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.025
for: 10m
labels:
severity: warning
- alert: etcdHighFsyncDurations
annotations:
description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdHighFsyncDurations.md
summary: etcd cluster 99th percentile fsync durations are too high.
expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.010
for: 10m
labels:
severity: critical
- alert: etcdHighNumberOfFailedGRPCRequests
annotations:
description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
Expand Down