Skip to content

Commit 9d9a14e

Browse files
authored
[DPE-3160] Add awesome alert rules (#323)
* Add awesome alert rules. * Fix namings of awesome alert rules. * Restore PatroniPostgresqlDown alert rule.
1 parent a08142a commit 9d9a14e

File tree

3 files changed

+291
-21
lines changed

3 files changed

+291
-21
lines changed
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# This file is based on https://samber.github.io/awesome-prometheus-alerts/rules#patroni-1
2+
3+
groups:
4+
5+
- name: PatroniExporter
6+
7+
rules:
8+
9+
# Critical alert raised immediately (for: 0m) when patroni_postgres_running
# is 0 for a series.
- alert: PatroniPostgresqlDown
  expr: "patroni_postgres_running == 0"
  for: 0m
  labels:
    severity: critical
  annotations:
    # Fixed typo in the rendered summary: "Posrgresql" -> "Postgresql".
    summary: Patroni Postgresql Down (instance {{ $labels.instance }})
    description: "Patroni Postgresql instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
17+
18+
# 2.4.1
19+
# Critical alert raised immediately (for: 0m) when, for a given `scope`,
# neither patroni_master nor patroni_standby_leader reaches 1 on any series.
# Both operands aggregate `by (scope)`, so `scope` is the only label carried by
# the resulting alert; the summary names the scope because `instance` would
# always render as an empty string here.
- alert: PatroniHasNoLeader
  expr: '(max by (scope) (patroni_master) < 1) and (max by (scope) (patroni_standby_leader) < 1)'
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Patroni has no Leader (cluster {{ $labels.scope }})
    description: "A leader node (neither primary nor standby) cannot be found inside the cluster {{ $labels.scope }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# This file is based on https://samber.github.io/awesome-prometheus-alerts/rules#pgbouncer-1
2+
3+
groups:
4+
5+
- name: PgbouncerExporter
6+
7+
rules:
8+
9+
# 2.5.1
10+
# Warning raised once pgbouncer_pools_server_active_connections has stayed
# above 200 for two minutes.
- alert: PgbouncerActiveConnections
  expr: >-
    pgbouncer_pools_server_active_connections > 200
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: 'PGBouncer active connections (instance {{ $labels.instance }})'
    # Continuation lines keep one leading space, exactly as in the rendered text.
    description: |-
      PGBouncer pools are filling up
       VALUE = {{ $value }}
       LABELS = {{ $labels }}
18+
19+
# 2.5.2
20+
# Threshold lowered from the upstream value: 10 -> 3
21+
# Warning raised immediately (for: 0m) when pgbouncer_errors_count grows by
# more than 3 within one minute, ignoring the "server conn crashed?" message.
- alert: PgbouncerErrors
  expr: 'increase(pgbouncer_errors_count{errmsg!="server conn crashed?"}[1m]) > 3'
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: PGBouncer errors (instance {{ $labels.instance }})
    # Fixed duplicated word in the rendered text: "due to a a server" -> "due to a server".
    description: "PGBouncer is logging errors. This may be due to a server restart or an admin typing commands at the pgbouncer console.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
29+
30+
# 2.5.3
31+
# Critical alert raised immediately (for: 0m) when the error counter for
# "no more connections allowed (max_client_conn)" increases.
# NOTE(review): increase() needs at least two samples inside the 30s window;
# confirm the scrape interval for this target is short enough for that.
- alert: PgbouncerMaxConnections
  expr: >-
    increase(pgbouncer_errors_count{errmsg="no more connections allowed (max_client_conn)"}[30s]) > 0
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: 'PGBouncer max connections (instance {{ $labels.instance }})'
    # Continuation lines keep one leading space, exactly as in the rendered text.
    description: |-
      The number of PGBouncer client connections has reached max_client_conn.
       VALUE = {{ $value }}
       LABELS = {{ $labels }}
Lines changed: 227 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,228 @@
1+
# This file is based on https://samber.github.io/awesome-prometheus-alerts/rules#postgresql-1
2+
13
groups:
2-
- name: PostgresqlExporterK8s
3-
4-
rules:
5-
# Based on https://samber.github.io/awesome-prometheus-alerts/rules#rule-postgresql-1-1
6-
- alert: PostgresqlDown
7-
expr: pg_up == 0
8-
for: 0m
9-
labels:
10-
severity: critical
11-
annotations:
12-
summary: Postgresql down (instance {{ $labels.instance }})
13-
description: "Postgresql instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
14-
# Based on https://samber.github.io/awesome-prometheus-alerts/rules#rule-postgresql-1-2
15-
- alert: PostgresqlRestarted
16-
expr: time() - pg_postmaster_start_time_seconds < 60
17-
for: 0m
18-
labels:
19-
severity: critical
20-
annotations:
21-
summary: Postgresql restarted (instance {{ $labels.instance }})
22-
description: "Postgresql restarted\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
4+
5+
- name: PostgresqlExporter
6+
7+
rules:
8+
9+
# 2.2.1
10+
# Critical alert raised immediately (for: 0m) whenever pg_up is 0.
- alert: PostgresqlDown
  expr: >-
    pg_up == 0
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: 'Postgresql down (instance {{ $labels.instance }})'
    # Continuation lines keep one leading space, exactly as in the rendered text.
    description: |-
      Postgresql instance is down
       VALUE = {{ $value }}
       LABELS = {{ $labels }}
18+
19+
# 2.2.2
20+
# critical -> info
21+
# Informational alert raised immediately (for: 0m) while the current time is
# less than 60 seconds past pg_postmaster_start_time_seconds.
- alert: PostgresqlRestarted
  expr: >-
    time() - pg_postmaster_start_time_seconds < 60
  for: 0m
  labels:
    severity: info
  annotations:
    summary: 'Postgresql restarted (instance {{ $labels.instance }})'
    # Continuation lines keep one leading space, exactly as in the rendered text.
    description: |-
      Postgresql restarted
       VALUE = {{ $value }}
       LABELS = {{ $labels }}
29+
30+
# 2.2.3
31+
# Critical alert raised immediately (for: 0m) when
# pg_exporter_last_scrape_error is greater than 0.
- alert: PostgresqlExporterError
  expr: >-
    pg_exporter_last_scrape_error > 0
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: 'Postgresql exporter error (instance {{ $labels.instance }})'
    # Continuation lines keep one leading space, exactly as in the rendered text.
    description: |-
      Postgresql exporter is showing errors. A query may be buggy in query.yaml
       VALUE = {{ $value }}
       LABELS = {{ $labels }}
39+
40+
# 2.2.4
41+
# 10 days -> 7 days
42+
# Warning raised immediately (for: 0m) for series whose last-autovacuum
# timestamp is non-zero and lies more than 7 days (60 * 60 * 24 * 7 seconds)
# in the past.
- alert: PostgresqlTableNotAutoVacuumed
  expr: >-
    (pg_stat_user_tables_last_autovacuum > 0) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 7
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: 'Postgresql table not auto vacuumed (instance {{ $labels.instance }})'
    # Continuation lines keep one leading space, exactly as in the rendered text.
    description: |-
      Table {{ $labels.relname }} has not been auto vacuumed for 7 days
       VALUE = {{ $value }}
       LABELS = {{ $labels }}
50+
51+
# 2.2.5
52+
# 10 days -> 7 days
53+
# Warning raised immediately (for: 0m) for series whose last-autoanalyze
# timestamp is non-zero and lies more than 7 days (24 * 60 * 60 * 7 seconds)
# in the past.
- alert: PostgresqlTableNotAutoAnalyzed
  expr: >-
    (pg_stat_user_tables_last_autoanalyze > 0) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 7
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: 'Postgresql table not auto analyzed (instance {{ $labels.instance }})'
    # Continuation lines keep one leading space, exactly as in the rendered text.
    description: |-
      Table {{ $labels.relname }} has not been auto analyzed for 7 days
       VALUE = {{ $value }}
       LABELS = {{ $labels }}
61+
62+
# 2.2.6
63+
# Warning raised after two minutes when the summed pg_stat_activity_count per
# (instance, job, server) exceeds 80% of pg_settings_max_connections for the
# same grouping.
- alert: PostgresqlTooManyConnections
  expr: >-
    sum by (instance, job, server) (pg_stat_activity_count) > min by (instance, job, server) (pg_settings_max_connections * 0.8)
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: 'Postgresql too many connections (instance {{ $labels.instance }})'
    # Continuation lines keep one leading space, exactly as in the rendered text.
    description: |-
      PostgreSQL instance has too many connections (> 80%).
       VALUE = {{ $value }}
       LABELS = {{ $labels }}
71+
72+
# 2.2.7
73+
# warning -> info
74+
# Informational alert raised after two minutes when the summed
# pg_stat_activity_count for a database (excluding template* and postgres)
# is below 5.
# The expression aggregates `by (datname)`, so `datname` is the only label on
# the alert; the summary names the database because `instance` would always
# render as an empty string here.
- alert: PostgresqlNotEnoughConnections
  expr: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5'
  for: 2m
  labels:
    severity: info
  annotations:
    summary: Postgresql not enough connections (database {{ $labels.datname }})
    description: "PostgreSQL instance should have more connections (> 5)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
82+
83+
# 2.2.8
84+
# Warning raised immediately (for: 0m) when pg_stat_database_deadlocks grows
# by more than 5 within one minute (template* and postgres databases excluded).
- alert: PostgresqlDeadLocks
  expr: >-
    increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: 'Postgresql dead locks (instance {{ $labels.instance }})'
    # Continuation lines keep one leading space, exactly as in the rendered text.
    description: |-
      PostgreSQL has dead-locks
       VALUE = {{ $value }}
       LABELS = {{ $labels }}
92+
93+
# 2.2.9
94+
# Warning raised immediately (for: 0m) when the 3-minute rollback rate divided
# by the combined rollback + commit rate, summed per (namespace, datname),
# exceeds 0.02.
# The aggregation keeps only `namespace` and `datname`; the summary names the
# database because `instance` would always render as an empty string here.
- alert: PostgresqlHighRollbackRate
  expr: 'sum by (namespace,datname) ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) / ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m])))) > 0.02'
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: Postgresql high rollback rate (database {{ $labels.datname }})
    description: "Ratio of transactions being aborted compared to committed is > 2 %\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
102+
103+
# 2.2.10
104+
# critical -> info
105+
# Informational alert raised after two minutes when the 1-minute rate of
# pg_stat_database_xact_commit is below 10.
- alert: PostgresqlCommitRateLow
  expr: >-
    rate(pg_stat_database_xact_commit[1m]) < 10
  for: 2m
  labels:
    severity: info
  annotations:
    summary: 'Postgresql commit rate low (instance {{ $labels.instance }})'
    # Continuation lines keep one leading space, exactly as in the rendered text.
    description: |-
      Postgresql seems to be processing very few transactions
       VALUE = {{ $value }}
       LABELS = {{ $labels }}
113+
114+
# 2.2.11
115+
# warning -> info
116+
# Informational alert raised after two minutes when the 1-minute rate of
# pg_txid_current is below 5.
- alert: PostgresqlLowXidConsumption
  expr: >-
    rate(pg_txid_current[1m]) < 5
  for: 2m
  labels:
    severity: info
  annotations:
    summary: 'Postgresql low XID consumption (instance {{ $labels.instance }})'
    # Continuation lines keep one leading space, exactly as in the rendered text.
    description: |-
      Postgresql seems to be consuming transaction IDs very slowly
       VALUE = {{ $value }}
       LABELS = {{ $labels }}
124+
125+
# 2.2.12
126+
# Critical alert raised immediately (for: 0m) when the 1-minute rate of
# postgresql_errors_total with type="statement_timeout" exceeds 3.
- alert: PostgresqlHighRateStatementTimeout
  expr: >-
    rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: 'Postgresql high rate statement timeout (instance {{ $labels.instance }})'
    # Continuation lines keep one leading space, exactly as in the rendered text.
    description: |-
      Postgres transactions showing high rate of statement timeouts
       VALUE = {{ $value }}
       LABELS = {{ $labels }}
134+
135+
# 2.2.13
136+
# critical -> warning
137+
# Warning raised immediately (for: 0m) when postgresql_errors_total with
# type="deadlock_detected" grows by more than 1 within one minute.
- alert: PostgresqlHighRateDeadlock
  expr: >-
    increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: 'Postgresql high rate deadlock (instance {{ $labels.instance }})'
    # Continuation lines keep one leading space, exactly as in the rendered text.
    description: |-
      Postgres detected deadlocks
       VALUE = {{ $value }}
       LABELS = {{ $labels }}
145+
146+
# 2.2.14
147+
# warning -> info
148+
# Informational alert raised after one minute when
# pg_replication_slots_active is 0 for a series.
- alert: PostgresqlUnusedReplicationSlot
  expr: >-
    pg_replication_slots_active == 0
  for: 1m
  labels:
    severity: info
  annotations:
    summary: 'Postgresql unused replication slot (instance {{ $labels.instance }})'
    # Continuation lines keep one leading space, exactly as in the rendered text.
    description: |-
      Unused Replication Slots
       VALUE = {{ $value }}
       LABELS = {{ $labels }}
156+
157+
# 2.2.15
158+
# Warning raised after two minutes for series with more than 10000 dead tuples
# whose dead / (live + dead) ratio is at least 0.1.
- alert: PostgresqlTooManyDeadTuples
  expr: >-
    ((pg_stat_user_tables_n_dead_tup > 10000) / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)) >= 0.1
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: 'Postgresql too many dead tuples (instance {{ $labels.instance }})'
    # Continuation lines keep one leading space, exactly as in the rendered text.
    description: |-
      PostgreSQL dead tuples is too large
       VALUE = {{ $value }}
       LABELS = {{ $labels }}
166+
167+
# 2.2.16
168+
# Informational alert raised immediately (for: 0m) when a pg_settings_* series
# differs from the same-named series 5 minutes earlier. The long alternation
# on the right-hand side excludes names beginning with
# pg_settings_transaction_read_only.
# NOTE(review): vector matching is `ON(__name__)` only; if several series share
# one metric name (e.g. multiple instances) this may be rejected as
# many-to-many matching -- confirm against the real label sets.
- alert: PostgresqlConfigurationChanged
  expr: '{__name__=~"pg_settings_.*"} != ON(__name__) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m'
  for: 0m
  labels:
    severity: info
  annotations:
    summary: 'Postgresql configuration changed (instance {{ $labels.instance }})'
    # Continuation lines keep one leading space, exactly as in the rendered text.
    description: |-
      Postgres Database configuration change has occurred
       VALUE = {{ $value }}
       LABELS = {{ $labels }}
176+
177+
# 2.2.17
178+
# critical -> warning
179+
# Warning raised immediately (for: 0m) when the summed pg_stat_ssl_compression
# for an instance is greater than 0.
# The sum is taken `by (instance)` so the `instance` label survives the
# aggregation and the summary template can render it; a bare sum() would strip
# every label and leave `{{ $labels.instance }}` empty.
- alert: PostgresqlSslCompressionActive
  expr: 'sum by (instance) (pg_stat_ssl_compression) > 0'
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: Postgresql SSL compression active (instance {{ $labels.instance }})
    description: "Database connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
187+
188+
# 2.2.18
189+
# critical -> warning
190+
# Warning raised after two minutes when the number of held locks exceeds 20% of
# max_locks_per_transaction * max_connections.
# pg_locks_count is summed per (instance, job, server) and the division matches
# on the same labels (the grouping already used by the
# PostgresqlTooManyConnections rule in this file). A label-less sum() divided by
# a labelled vector has no matching pairs, so the expression would always be
# empty and the alert could never fire.
# NOTE(review): assumes the pg_settings_* series are unique per
# (instance, job, server) -- confirm against the exporter's label set.
- alert: PostgresqlTooManyLocksAcquired
  expr: '((sum by (instance, job, server) (pg_locks_count)) / on (instance, job, server) (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Postgresql too many locks acquired (instance {{ $labels.instance }})
    description: "Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
198+
199+
# 2.2.19
200+
# Warning raised after one hour when pg_bloat_btree_bloat_pct is above 80 for
# an index whose pg_bloat_btree_real_size is above 100000000 (matched on
# `idxname`).
- alert: 'PostgresqlBloatIndexHigh(>80%)'
  expr: >-
    pg_bloat_btree_bloat_pct > 80 and on (idxname) (pg_bloat_btree_real_size > 100000000)
  for: 1h
  labels:
    severity: warning
  annotations:
    summary: 'Postgresql bloat index high (> 80%) (instance {{ $labels.instance }})'
    # Continuation lines keep one leading space, exactly as in the rendered text.
    description: |-
      The index {{ $labels.idxname }} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`
       VALUE = {{ $value }}
       LABELS = {{ $labels }}
208+
209+
# 2.2.20
210+
# Warning raised after one hour when pg_bloat_table_bloat_pct is above 80 for
# a table whose pg_bloat_table_real_size is above 200000000 (matched on
# `relname`).
- alert: 'PostgresqlBloatTableHigh(>80%)'
  expr: >-
    pg_bloat_table_bloat_pct > 80 and on (relname) (pg_bloat_table_real_size > 200000000)
  for: 1h
  labels:
    severity: warning
  annotations:
    summary: 'Postgresql bloat table high (> 80%) (instance {{ $labels.instance }})'
    # Continuation lines keep one leading space, exactly as in the rendered text.
    description: |-
      The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`
       VALUE = {{ $value }}
       LABELS = {{ $labels }}
218+
219+
# 2.2.21
220+
# warning -> critical
221+
# Critical alert raised after six hours while any series exists for an index
# whose name matches ".*ccnew.*".
# NOTE(review): the metric name is spelled "pg_genaral_..." here; confirm that
# this matches the name actually exported, otherwise the selector is empty and
# the alert never fires.
- alert: PostgresqlInvalidIndex
  expr: >-
    pg_genaral_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}
  for: 6h
  labels:
    severity: critical
  annotations:
    summary: 'Postgresql invalid index (instance {{ $labels.instance }})'
    # Continuation lines keep one leading space, exactly as in the rendered text.
    description: |-
      The table {{ $labels.relname }} has an invalid index: {{ $labels.indexrelname }}. You should execute `DROP INDEX {{ $labels.indexrelname }};`
       VALUE = {{ $value }}
       LABELS = {{ $labels }}

0 commit comments

Comments
 (0)