Skip to content

Commit d34dc68

Browse files
alerts: update replication lag condition
The replication lag alert example uses the deprecated tnt_replication_*_lag metric. This patch replaces it with the up-to-date tnt_replication_lag{id=*} metric. Follows up #133.
1 parent f51340c commit d34dc68

File tree

2 files changed

+10
-9
lines changed

2 files changed

+10
-9
lines changed

example_cluster/prometheus/alerts.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -109,14 +109,14 @@ groups:
109109

110110
# Alert for Tarantool replication high lag (both for masters and replicas).
111111
- alert: HighReplicationLag
112-
expr: '{__name__=~"tnt_replication_[[:digit:]]{1,2}_lag"} > 1'
112+
expr: tnt_replication_lag > 1
113113
for: 1m
114114
labels:
115115
severity: warning
116116
annotations:
117-
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') have high replication lag"
118-
description: "Instance '{{ $labels.alias }}' of job '{{ $labels.job }}' have high replication lag,
119-
check up your network and cluster state."
117+
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') have high replication lag (id {{ $labels.id }})"
118+
description: "Instance '{{ $labels.alias }}' of job '{{ $labels.job }}' have high replication lag
119+
(id {{ $labels.id }}), check up your network and cluster state."
120120

121121
# Alert for Tarantool low vinyl engine regulator rate limit.
122122
- alert: LowVinylRegulatorRateLimit

example_cluster/prometheus/test_alerts.yml

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -258,9 +258,9 @@ tests:
258258

259259
- interval: 15s
260260
input_series:
261-
- series: tnt_replication_1_lag{job="tarantool", instance="app:8081", alias="tnt_storage_master"}
261+
- series: tnt_replication_lag{job="tarantool", instance="app:8081", alias="tnt_storage_master", id="1"}
262262
values: '0+0x10'
263-
- series: tnt_replication_2_lag{job="tarantool", instance="app:8082", alias="tnt_storage_replica"}
263+
- series: tnt_replication_lag{job="tarantool", instance="app:8082", alias="tnt_storage_replica", id="2"}
264264
values: '1+15x10'
265265
alert_rule_test:
266266
- eval_time: 2m
@@ -271,10 +271,11 @@ tests:
271271
instance: app:8082
272272
alias: tnt_storage_replica
273273
job: tarantool
274+
id: "2"
274275
exp_annotations:
275-
summary: "Instance 'tnt_storage_replica' ('tarantool') have high replication lag"
276-
description: "Instance 'tnt_storage_replica' of job 'tarantool' have high replication lag,
277-
check up your network and cluster state."
276+
summary: "Instance 'tnt_storage_replica' ('tarantool') have high replication lag (id 2)"
277+
description: "Instance 'tnt_storage_replica' of job 'tarantool' have high replication lag
278+
(id 2), check up your network and cluster state."
278279

279280

280281
- interval: 15s

0 commit comments

Comments
 (0)