Commit bc622fb

Merge pull request #79 from tarantool/57-replication-lag
Add replication lag panel
2 parents 2abee3b + f638933 commit bc622fb

7 files changed: +503 −207 lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions

@@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Network activity row and panels
 - Non-CRUD operations panels
 - CPU time getrusage panels
+- Replication lag panel
 
 ## Changed
 - Update metrics version to 0.9.0

example/prometheus/alerts.yml

Lines changed: 11 additions & 0 deletions

@@ -105,6 +105,17 @@ groups:
       description: "Possible reasons: replication process critical fail,
         running out of available memory."
 
+
+  - alert: HighReplicationLag
+    expr: '{__name__=~"tnt_replication_[[:digit:]]{1,2}_lag"} > 1'
+    for: 1m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Instance {{ $labels.alias }} has high replication lag"
+      description: "Instance {{ $labels.alias }} of job {{ $labels.job }} has high replication lag,
+        check your network and cluster state."
+
 - name: tarantool-business
   rules:
   # Warning for any endpoint of an instance in tarantool_app job that responds too long.
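A note on the new rule's selector: the replication lag gauges carry the replica id in the metric name itself (tnt_replication_1_lag, tnt_replication_2_lag, and so on, as seen in the test file below), so the expression matches on __name__ with a regular expression rather than naming a single metric. Each matching series alerts independently once its value has stayed above 1 second for the full for: 1m window.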

example/prometheus/test_alerts.yml

Lines changed: 21 additions & 0 deletions

@@ -253,6 +253,27 @@ tests:
                running out of available memory."
 
 
+  - interval: 15s
+    input_series:
+      - series: tnt_replication_1_lag{job="tarantool_app", instance="app:8081", alias="tnt_storage_master"}
+        values: '0+0x10'
+      - series: tnt_replication_2_lag{job="tarantool_app", instance="app:8082", alias="tnt_storage_replica"}
+        values: '1+15x10'
+    alert_rule_test:
+      - eval_time: 2m
+        alertname: HighReplicationLag
+        exp_alerts:
+          - exp_labels:
+              severity: warning
+              instance: app:8082
+              alias: tnt_storage_replica
+              job: tarantool_app
+            exp_annotations:
+              summary: "Instance tnt_storage_replica has high replication lag"
+              description: "Instance tnt_storage_replica of job tarantool_app has high replication lag,
+                check your network and cluster state."
+
+
   - interval: 15s
     input_series:
       - series: http_server_request_latency_count{job="tarantool_app",instance="app:8081",path="/hello",method="GET",status="200",alias="tnt_router"}
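Reading the added test case: the master series 0+0x10 stays at zero for the whole run, while the replica series 1+15x10 starts at 1 and grows by 15 every 15-second step, so by eval_time: 2m it has been above the 1-second threshold for well over the rule's for: 1m window; only tnt_storage_replica is therefore listed in exp_alerts. Tests in this format are normally run with promtool (for example, promtool test rules example/prometheus/test_alerts.yml); how the repository wires that into CI is not shown in this diff.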

tarantool/cluster.libsonnet

Lines changed: 39 additions & 0 deletions

@@ -370,4 +370,43 @@ local prometheus = grafana.prometheus;
       level='critical',
     ),
 
+  replication_lag(
+    title='Tarantool replication lag',
+    description=|||
+      Replication lag value for a Tarantool instance.
+    |||,
+    datasource=null,
+    policy=null,
+    measurement=null,
+    job=null,
+  ):: graph.new(
+    title=title,
+    description=description,
+    datasource=datasource,
+
+    format='s',
+    fill=0,
+    min=0,
+    sort='decreasing',
+    legend_alignAsTable=true,
+    legend_current=true,
+    legend_max=true,
+    legend_values=true,
+    legend_sort='current',
+    legend_sortDesc=true,
+  ).addTarget(
+    if datasource == '${DS_PROMETHEUS}' then
+      prometheus.target(
+        expr=std.format('{__name__=~"tnt_replication_[[:digit:]]{1,2}_lag", job=~"%s"}', [job]),
+        legendFormat='{{alias}}',
+      )
+    else if datasource == '${DS_INFLUXDB}' then
+      influxdb.target(
+        policy=policy,
+        measurement=measurement,
+        group_tags=['label_pairs_alias'],
+        alias='$tag_label_pairs_alias',
+      ).where('metric_name', '=~', '/tnt_replication_\\d{1,2}_lag/')
+      .selectField('value').addConverter('mean')
+  ),
 }
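
To give an idea of how the new panel function is meant to be consumed, here is a minimal sketch of a grafonnet-style dashboard that adds it; the import paths, dashboard title, and gridPos values are illustrative assumptions, not code from this repository:

// Illustrative sketch only: import paths, dashboard title and layout are assumed.
local grafana = import 'grafonnet/grafana.libsonnet';
local cluster = import 'tarantool/cluster.libsonnet';

// Build a dashboard and place the replication lag graph on it.
grafana.dashboard.new('Tarantool cluster overview')
.addPanel(
  cluster.replication_lag(
    datasource='${DS_PROMETHEUS}',
    job='tarantool_app',
  ),
  gridPos={ w: 12, h: 8, x: 0, y: 0 },
)

With the Prometheus datasource selected, the panel runs the same __name__ regular-expression query as the alert rule, restricted to the chosen job; with InfluxDB it filters metric_name against /tnt_replication_\d{1,2}_lag/ instead.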
