Commit bc622fb

Merge pull request #79 from tarantool/57-replication-lag
Add replication lag panel
2 parents 2abee3b + f638933 commit bc622fb

7 files changed: +503 −207 lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions

@@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Network activity row and panels
 - Non-CRUD operations panels
 - CPU time getrusage panels
+- Replication lag panel
 
 ## Changed
 - Update metrics version to 0.9.0

example/prometheus/alerts.yml

Lines changed: 11 additions & 0 deletions

@@ -105,6 +105,17 @@ groups:
       description: "Possible reasons: replication process critical fail,
         running out of available memory."
 
+
+  - alert: HighReplicationLag
+    expr: '{__name__=~"tnt_replication_[[:digit:]]{1,2}_lag"} > 1'
+    for: 1m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Instance {{ $labels.alias }} has high replication lag"
+      description: "Instance {{ $labels.alias }} of job {{ $labels.job }} has high replication lag,
+        check your network and cluster state."
+
 - name: tarantool-business
   rules:
   # Warning for any endpoint of an instance in tarantool_app job that responds too long.
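A note on the new rule's selector: the replication lag gauges carry the replica id in the metric name itself (tnt_replication_1_lag, tnt_replication_2_lag, and so on, as seen in the test file below), so the expression matches on __name__ with a regular expression rather than naming a single metric. Each matching series alerts independently once its value has stayed above 1 second for the full for: 1m window.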

example/prometheus/test_alerts.yml

Lines changed: 21 additions & 0 deletions

@@ -253,6 +253,27 @@ tests:
                running out of available memory."
 
 
+  - interval: 15s
+    input_series:
+      - series: tnt_replication_1_lag{job="tarantool_app", instance="app:8081", alias="tnt_storage_master"}
+        values: '0+0x10'
+      - series: tnt_replication_2_lag{job="tarantool_app", instance="app:8082", alias="tnt_storage_replica"}
+        values: '1+15x10'
+    alert_rule_test:
+      - eval_time: 2m
+        alertname: HighReplicationLag
+        exp_alerts:
+          - exp_labels:
+              severity: warning
+              instance: app:8082
+              alias: tnt_storage_replica
+              job: tarantool_app
+            exp_annotations:
+              summary: "Instance tnt_storage_replica has high replication lag"
+              description: "Instance tnt_storage_replica of job tarantool_app has high replication lag,
+                check your network and cluster state."
+
+
   - interval: 15s
     input_series:
       - series: http_server_request_latency_count{job="tarantool_app",instance="app:8081",path="/hello",method="GET",status="200",alias="tnt_router"}
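Reading the added test case: the master series 0+0x10 stays at zero for the whole run, while the replica series 1+15x10 starts at 1 and grows by 15 every 15-second step, so by eval_time: 2m it has been above the 1-second threshold for well over the rule's for: 1m window; only tnt_storage_replica is therefore listed in exp_alerts. Tests in this format are normally run with promtool (for example, promtool test rules example/prometheus/test_alerts.yml); how the repository wires that into CI is not shown in this diff.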

tarantool/cluster.libsonnet

Lines changed: 39 additions & 0 deletions

@@ -370,4 +370,43 @@ local prometheus = grafana.prometheus;
       level='critical',
     ),
 
+  replication_lag(
+    title='Tarantool replication lag',
+    description=|||
+      Replication lag value for a Tarantool instance.
+    |||,
+    datasource=null,
+    policy=null,
+    measurement=null,
+    job=null,
+  ):: graph.new(
+    title=title,
+    description=description,
+    datasource=datasource,
+
+    format='s',
+    fill=0,
+    min=0,
+    sort='decreasing',
+    legend_alignAsTable=true,
+    legend_current=true,
+    legend_max=true,
+    legend_values=true,
+    legend_sort='current',
+    legend_sortDesc=true,
+  ).addTarget(
+    if datasource == '${DS_PROMETHEUS}' then
+      prometheus.target(
+        expr=std.format('{__name__=~"tnt_replication_[[:digit:]]{1,2}_lag", job=~"%s"}', [job]),
+        legendFormat='{{alias}}',
+      )
+    else if datasource == '${DS_INFLUXDB}' then
+      influxdb.target(
+        policy=policy,
+        measurement=measurement,
+        group_tags=['label_pairs_alias'],
+        alias='$tag_label_pairs_alias',
+      ).where('metric_name', '=~', '/tnt_replication_\\d{1,2}_lag/')
+      .selectField('value').addConverter('mean')
+  ),
 }
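
To give an idea of how the new panel function is meant to be consumed, here is a minimal sketch of a grafonnet-style dashboard that adds it; the import paths, dashboard title, and gridPos values are illustrative assumptions, not code from this repository:

// Illustrative sketch only: import paths, dashboard title and layout are assumed.
local grafana = import 'grafonnet/grafana.libsonnet';
local cluster = import 'tarantool/cluster.libsonnet';

// Build a dashboard and place the replication lag graph on it.
grafana.dashboard.new('Tarantool cluster overview')
.addPanel(
  cluster.replication_lag(
    datasource='${DS_PROMETHEUS}',
    job='tarantool_app',
  ),
  gridPos={ w: 12, h: 8, x: 0, y: 0 },
)

With the Prometheus datasource selected, the panel runs the same __name__ regular-expression query as the alert rule, restricted to the chosen job; with InfluxDB it filters metric_name against /tnt_replication_\d{1,2}_lag/ instead.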
