Skip to content

Commit 8fa8bab

Browse files
Add Cartridge issues panels and Prometheus alerts
Closes #55
1 parent 9742d95 commit 8fa8bab

File tree

9 files changed

+851
-156
lines changed

9 files changed

+851
-156
lines changed

CHANGELOG.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77
## Unreleased
88

99
## Added
10-
- Prometheus example alert rules (instance state, memory usage, HTTP load and latency rule examples)
10+
- Prometheus example alert rules (instance state, memory usage, HTTP load and latency rule examples, etc.)
1111
- Test Prometheus example alert rules with promtool
1212
- Cartridge issues metrics labels to Telegraf configuration
13+
- Cartridge issues panels and "Cluster overview" row
1314

1415
## Changed
1516
- Update metrics version to 0.9.0
1617
- Separate app cluster and load generator in example docker stand
1718
- Use cartridge-cli to run and setup example app cluster instead of luatest
19+
- Group Prometheus cluster overview panels into "Cluster overview" row
1820

1921
## Fixed
2022
- Add missing space and replication metrics labels to Telegraf configuration

example/prometheus/alerts.yml

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -79,8 +79,31 @@ groups:
7979
annotations:
8080
summary: "Instance {{ $labels.alias }} low items memory remaining"
8181
description: "Low items memory (tuples) remaining for {{ $labels.alias }} instance of job {{ $labels.job }}.
82-
You are likely to hit limit soon.
83-
It is strongly recommended to increase memtx_memory or number of storages in case of sharded data."
82+
You are likely to hit limit soon.
83+
It is strongly recommended to increase memtx_memory or number of storages in case of sharded data."
84+
85+
# Warning for Cartridge warning issues.
86+
- alert: CartridgeWarningIssues
87+
expr: tnt_cartridge_issues{level="warning"} > 0
88+
for: 1m
89+
labels:
90+
severity: warning
91+
annotations:
92+
summary: "Instance {{ $labels.alias }} have 'warning'-level Cartridge issues"
93+
description: "Possible reasons: high replication lag, replication long idle,
94+
failover or switchover issues, clock issues, memory fragmentation,
95+
configuration issues, alien members."
96+
97+
# Alert for Cartridge critical issues.
98+
- alert: CartridgeCriticalIssues
99+
expr: tnt_cartridge_issues{level="critical"} > 0
100+
for: 1m
101+
labels:
102+
severity: page
103+
annotations:
104+
summary: "Instance {{ $labels.alias }} have 'critical'-level Cartridge issues"
105+
description: "Possible reasons: replication process critical fail,
106+
running out of available memory."
84107

85108
- name: tarantool-business
86109
rules:

example/prometheus/test_alerts.yml

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,68 @@ tests:
191191
exp_alerts: # no alert firing
192192

193193

194+
- interval: 15s
195+
input_series:
196+
- series: 'tnt_cartridge_issues{job="tarantool_app", instance="app:8081", alias="tnt_router", level="warning"}'
197+
values: '0+0x2 1+0x8'
198+
- series: 'tnt_cartridge_issues{job="tarantool_app", instance="app:8081", alias="tnt_router", level="critical"}'
199+
values: '0+0x10'
200+
alert_rule_test:
201+
- eval_time: 2m
202+
alertname: CartridgeWarningIssues
203+
exp_alerts:
204+
- exp_labels:
205+
severity: warning
206+
level: warning
207+
instance: app:8081
208+
alias: tnt_router
209+
job: tarantool_app
210+
exp_annotations:
211+
summary: "Instance tnt_router have 'warning'-level Cartridge issues"
212+
description: "Possible reasons: high replication lag, replication long idle,
213+
failover or switchover issues, clock issues, memory fragmentation,
214+
configuration issues, alien members."
215+
- eval_time: 2m
216+
alertname: CartridgeCriticalIssues
217+
exp_alerts: # no alert firing
218+
219+
220+
- interval: 15s
221+
input_series:
222+
- series: 'tnt_cartridge_issues{job="tarantool_app", instance="app:8081", alias="tnt_router", level="warning"}'
223+
values: '0+0x2 2+0x8'
224+
- series: 'tnt_cartridge_issues{job="tarantool_app", instance="app:8081", alias="tnt_router", level="critical"}'
225+
values: '1+0x10'
226+
alert_rule_test:
227+
- eval_time: 2m
228+
alertname: CartridgeWarningIssues
229+
exp_alerts:
230+
- exp_labels:
231+
severity: warning
232+
level: warning
233+
instance: app:8081
234+
alias: tnt_router
235+
job: tarantool_app
236+
exp_annotations:
237+
summary: "Instance tnt_router have 'warning'-level Cartridge issues"
238+
description: "Possible reasons: high replication lag, replication long idle,
239+
failover or switchover issues, clock issues, memory fragmentation,
240+
configuration issues, alien members."
241+
- eval_time: 2m
242+
alertname: CartridgeCriticalIssues
243+
exp_alerts:
244+
- exp_labels:
245+
severity: page
246+
level: critical
247+
instance: app:8081
248+
alias: tnt_router
249+
job: tarantool_app
250+
exp_annotations:
251+
summary: "Instance tnt_router have 'critical'-level Cartridge issues"
252+
description: "Possible reasons: replication process critical fail,
253+
running out of available memory."
254+
255+
194256
- interval: 15s
195257
input_series:
196258
- series: http_server_request_latency_count{job="tarantool_app",instance="app:8081",path="/hello",method="GET",status="200",alias="tnt_router"}

tarantool/cluster.libsonnet

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
local grafana = import 'grafonnet/grafana.libsonnet';
22

3+
local graph = grafana.graphPanel;
34
local statPanel = grafana.statPanel;
45
local tablePanel = grafana.tablePanel;
6+
local influxdb = grafana.influxdb;
57
local prometheus = grafana.prometheus;
68

79
{
@@ -286,4 +288,86 @@ local prometheus = grafana.prometheus;
286288
unit='reqps',
287289
expr=std.format('sum(rate(http_server_request_latency_count{job=~"%s"}[%s]))', [job, rate_time_range]),
288290
),
291+
292+
local cartridge_issues(
293+
title,
294+
description,
295+
datasource,
296+
policy,
297+
measurement,
298+
job,
299+
level,
300+
) = graph.new(
301+
title=title,
302+
description=description,
303+
datasource=datasource,
304+
305+
format='none',
306+
fill=0,
307+
decimals=0,
308+
sort='decreasing',
309+
legend_alignAsTable=true,
310+
legend_current=true,
311+
legend_values=true,
312+
legend_sort='current',
313+
legend_sortDesc=true,
314+
).addTarget(
315+
if datasource == '${DS_PROMETHEUS}' then
316+
prometheus.target(
317+
expr=std.format('tnt_cartridge_issues{job=~"%s",level="%s"}', [job, level]),
318+
legendFormat='{{alias}}',
319+
)
320+
else if datasource == '${DS_INFLUXDB}' then
321+
influxdb.target(
322+
policy=policy,
323+
measurement=measurement,
324+
group_tags=['label_pairs_alias'],
325+
alias='$tag_label_pairs_alias',
326+
).where('metric_name', '=', 'tnt_cartridge_issues').where('label_pairs_level', '=', level)
327+
.selectField('value').addConverter('last')
328+
),
329+
330+
cartridge_warning_issues(
331+
title='Cartridge warning issues',
332+
description=|||
333+
Number of "warning" issues on each cluster instance.
334+
"warning" issues includes high replication lag, replication long idle,
335+
failover and switchover issues, clock issues, memory fragmentation,
336+
configuration issues and alien members warnings.
337+
|||,
338+
datasource=null,
339+
policy=null,
340+
measurement=null,
341+
job=null,
342+
):: cartridge_issues(
343+
title=title,
344+
description=description,
345+
datasource=datasource,
346+
policy=policy,
347+
measurement=measurement,
348+
job=job,
349+
level='warning',
350+
),
351+
352+
cartridge_critical_issues(
353+
title='Cartridge critical issues',
354+
description=|||
355+
Number of "critical" issues on each cluster instance.
356+
"critical" issues includes replication process critical fails and
357+
running out of available memory.
358+
|||,
359+
datasource=null,
360+
policy=null,
361+
measurement=null,
362+
job=null,
363+
):: cartridge_issues(
364+
title=title,
365+
description=description,
366+
datasource=datasource,
367+
policy=policy,
368+
measurement=measurement,
369+
job=job,
370+
level='critical',
371+
),
372+
289373
}

0 commit comments

Comments
 (0)