Skip to content

Commit de37642

Browse files
alerts: update names
Update some alert names to be more readable and consistent. Fix alert descriptions. Change the severity level of some alerts. Part of #64
1 parent d34dc68 commit de37642

File tree

2 files changed

+48
-48
lines changed

2 files changed

+48
-48
lines changed

example_cluster/prometheus/alerts.yml

Lines changed: 22 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,8 @@ groups:
1414

1515
- name: tarantool-common
1616
rules:
17-
# Warning for any instance that uses too Lua runtime memory.
18-
- alert: LuaRuntimeWarning
17+
# Warning for any instance that uses too much Lua runtime memory.
18+
- alert: HighLuaMemoryWarning
1919
expr: tnt_info_memory_lua >= (512 * 1024 * 1024)
2020
for: 1m
2121
labels:
@@ -25,8 +25,8 @@ groups:
2525
description: "'{{ $labels.alias }}' instance of job '{{ $labels.job }}' uses too much Lua memory
2626
and may hit threshold soon."
2727

28-
# Alert for any instance that uses too Lua runtime memory.
29-
- alert: LuaRuntimeAlert
28+
# Alert for any instance that uses too much Lua runtime memory.
29+
- alert: HighLuaMemory
3030
expr: tnt_info_memory_lua >= (1024 * 1024 * 1024)
3131
for: 1m
3232
labels:
@@ -37,7 +37,7 @@ groups:
3737
and likely to hit threshold soon."
3838

3939
# Warning for any instance that have low remaining arena memory.
40-
- alert: MemtxArenaWarning
40+
- alert: LowMemtxArenaRemainingWarning
4141
expr: (tnt_slab_quota_used_ratio >= 80) and (tnt_slab_arena_used_ratio >= 80)
4242
for: 1m
4343
labels:
@@ -48,7 +48,7 @@ groups:
4848
Consider increasing memtx_memory or number of storages in case of sharded data."
4949

5050
# Alert for any instance that have low remaining arena memory.
51-
- alert: MemtxArenaAlert
51+
- alert: LowMemtxArenaRemaining
5252
expr: (tnt_slab_quota_used_ratio >= 90) and (tnt_slab_arena_used_ratio >= 90)
5353
for: 1m
5454
labels:
@@ -60,7 +60,7 @@ groups:
6060
It is strongly recommended to increase memtx_memory or number of storages in case of sharded data."
6161

6262
# Warning for any instance that have low remaining items memory.
63-
- alert: MemtxItemsWarning
63+
- alert: LowMemtxItemsRemainingWarning
6464
expr: (tnt_slab_quota_used_ratio >= 80) and (tnt_slab_items_used_ratio >= 80)
6565
for: 1m
6666
labels:
@@ -71,7 +71,7 @@ groups:
7171
Consider increasing memtx_memory or number of storages in case of sharded data."
7272

7373
# Alert for any instance that have low remaining items memory.
74-
- alert: MemtxItemsAlert
74+
- alert: LowMemtxItemsRemaining
7575
expr: (tnt_slab_quota_used_ratio >= 90) and (tnt_slab_items_used_ratio >= 90)
7676
for: 1m
7777
labels:
@@ -178,7 +178,7 @@ groups:
178178
- name: tarantool-crud
179179
rules:
180180
# Alert for CRUD module request errors.
181-
- alert: CRUDHighErrorRate
181+
- alert: HighCRUDErrorRate
182182
expr: rate(tnt_crud_stats_count{ job="tarantool", status="error" }[5m]) > 0.1
183183
for: 1m
184184
labels:
@@ -189,7 +189,7 @@ groups:
189189
'{{ $labels.alias }}' instance of job '{{ $labels.job }}' get module error responses."
190190

191191
# Warning for CRUD module requests too long responses.
192-
- alert: CRUDHighLatency
192+
- alert: HighCRUDLatency
193193
expr: tnt_crud_stats{ job="tarantool", quantile="0.99" } > 0.1
194194
for: 1m
195195
labels:
@@ -200,7 +200,7 @@ groups:
200200
'{{ $labels.alias }}' instance of job '{{ $labels.job }}' are processed too long."
201201

202202
# Warning for too many map reduce CRUD module requests.
203-
- alert: CRUDHighMapReduceRate
203+
- alert: HighCRUDMapReduceRate
204204
expr: rate(tnt_crud_map_reduces{ job="tarantool" }[5m]) > 0.1
205205
for: 1m
206206
labels:
@@ -218,7 +218,7 @@ groups:
218218
# Beware that metric name depends on name of the collector you use in HTTP metrics middleware
219219
# and request depends on type of this collector.
220220
# This example based on summary collector with default name.
221-
- alert: HTTPHighLatency
221+
- alert: HighHTTPLatency
222222
expr: http_server_request_latency{ job="tarantool", quantile="0.99" } > 0.1
223223
for: 5m
224224
labels:
@@ -228,43 +228,43 @@ groups:
228228
description: "Some {{ $labels.method }} requests to {{ $labels.path }} path with {{ $labels.status }} response status
229229
on '{{ $labels.alias }}' instance of job '{{ $labels.job }}' are processed too long."
230230

231-
# Warning for any endpoint of an instance in tarantool job that sends too much 4xx responses.
231+
# Alert for any endpoint of an instance in tarantool job that sends too many 4xx responses.
232232
# Beware that metric name depends on name of the collector you use in HTTP metrics middleware
233233
# and request depends on type of this collector.
234234
# This example based on summary collector with default name.
235-
- alert: HTTPHighClientErrorRateInstance
235+
- alert: HighInstanceHTTPClientErrorRate
236236
expr: sum by (job, instance, method, path, alias) (rate(http_server_request_latency_count{ job="tarantool", status=~"^4\\d{2}$" }[5m])) > 10
237237
for: 1m
238238
labels:
239-
severity: warning
239+
severity: page
240240
annotations:
241241
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') high rate of client error responses"
242242
description: "Too many {{ $labels.method }} requests to {{ $labels.path }} path
243243
on '{{ $labels.alias }}' instance of job '{{ $labels.job }}' get client error (4xx) responses."
244244

245-
# Warning for any endpoint in tarantool job that sends too much 4xx responses (cluster overall).
245+
# Alert for any endpoint in tarantool job that sends too many 4xx responses (cluster overall).
246246
# Beware that metric name depends on name of the collector you use in HTTP metrics middleware
247247
# and request depends on type of this collector.
248248
# This example based on summary collector with default name.
249-
- alert: HTTPHighClientErrorRate
249+
- alert: HighHTTPClientErrorRate
250250
expr: sum by (job, method, path) (rate(http_server_request_latency_count{ job="tarantool", status=~"^4\\d{2}$" }[5m])) > 20
251251
for: 1m
252252
labels:
253-
severity: warning
253+
severity: page
254254
annotations:
255255
summary: "Job '{{ $labels.job }}' high rate of client error responses"
256256
description: "Too many {{ $labels.method }} requests to {{ $labels.path }} path
257257
on instances of job '{{ $labels.job }}' get client error (4xx) responses."
258258

259-
# Warning for any endpoint of an instance in tarantool job that sends 5xx responses.
259+
# Alert for any endpoint of an instance in tarantool job that sends 5xx responses.
260260
# Beware that metric name depends on name of the collector you use in HTTP metrics middleware
261261
# and request depends on type of this collector.
262262
# This example based on summary collector with default name.
263-
- alert: HTTPServerErrors
263+
- alert: HighHTTPServerErrorRate
264264
expr: sum by (job, instance, method, path, alias) (rate(http_server_request_latency_count{ job="tarantool", status=~"^5\\d{2}$" }[5m])) > 0
265265
for: 1m
266266
labels:
267-
severity: warning
267+
severity: page
268268
annotations:
269269
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') server error responses"
270270
description: "Some {{ $labels.method }} requests to {{ $labels.path }} path
@@ -274,7 +274,7 @@ groups:
274274
# Beware that metric name depends on name of the collector you use in HTTP metrics middleware
275275
# and request depends on type of this collector.
276276
# This example based on summary collector with default name.
277-
- alert: HTTPLowRequestRateRouter
277+
- alert: LowRouterHTTPRequestRate
278278
expr: sum by (job, instance, alias) (rate(http_server_request_latency_count{ job="tarantool", alias=~"^.*router.*$" }[5m])) < 10
279279
for: 5m
280280
labels:

example_cluster/prometheus/test_alerts.yml

Lines changed: 26 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ tests:
3232
values: '209715200+104857600x8' # 200 Mb + 100 Mb each interval
3333
alert_rule_test:
3434
- eval_time: 2m
35-
alertname: LuaRuntimeWarning
35+
alertname: HighLuaMemoryWarning
3636
exp_alerts:
3737
- exp_labels:
3838
severity: warning
@@ -44,7 +44,7 @@ tests:
4444
description: "'tnt_router' instance of job 'tarantool' uses too much Lua memory
4545
and may hit threshold soon."
4646
- eval_time: 2m
47-
alertname: LuaRuntimeAlert
47+
alertname: HighLuaMemory
4848
exp_alerts: # no alert firing
4949

5050

@@ -54,7 +54,7 @@ tests:
5454
values: '419430400+209715200x8' # 400 Mb + 200 Mb each interval
5555
alert_rule_test:
5656
- eval_time: 2m
57-
alertname: LuaRuntimeWarning
57+
alertname: HighLuaMemoryWarning
5858
exp_alerts:
5959
- exp_labels:
6060
severity: warning
@@ -66,7 +66,7 @@ tests:
6666
description: "'tnt_router' instance of job 'tarantool' uses too much Lua memory
6767
and may hit threshold soon."
6868
- eval_time: 2m
69-
alertname: LuaRuntimeAlert
69+
alertname: HighLuaMemory
7070
exp_alerts:
7171
- exp_labels:
7272
severity: page
@@ -87,10 +87,10 @@ tests:
8787
values: '92+0x2 76+0x8'
8888
alert_rule_test:
8989
- eval_time: 2m
90-
alertname: MemtxArenaWarning
90+
alertname: LowMemtxArenaRemainingWarning
9191
exp_alerts: # no alert firing
9292
- eval_time: 2m
93-
alertname: MemtxArenaAlert
93+
alertname: LowMemtxArenaRemaining
9494
exp_alerts: # no alert firing
9595

9696

@@ -102,7 +102,7 @@ tests:
102102
values: '92+0x2 82+0x8'
103103
alert_rule_test:
104104
- eval_time: 2m
105-
alertname: MemtxArenaWarning
105+
alertname: LowMemtxArenaRemainingWarning
106106
exp_alerts:
107107
- exp_labels:
108108
severity: warning
@@ -114,7 +114,7 @@ tests:
114114
description: "Low arena memory (tuples and indexes) remaining for 'tnt_router' instance of job 'tarantool'.
115115
Consider increasing memtx_memory or number of storages in case of sharded data."
116116
- eval_time: 2m
117-
alertname: MemtxArenaAlert
117+
alertname: LowMemtxArenaRemaining
118118
exp_alerts: # no alert firing
119119

120120

@@ -126,7 +126,7 @@ tests:
126126
values: '92+0x2 91+0x8'
127127
alert_rule_test:
128128
- eval_time: 2m
129-
alertname: MemtxArenaWarning
129+
alertname: LowMemtxArenaRemainingWarning
130130
exp_alerts:
131131
- exp_labels:
132132
severity: warning
@@ -138,7 +138,7 @@ tests:
138138
description: "Low arena memory (tuples and indexes) remaining for 'tnt_router' instance of job 'tarantool'.
139139
Consider increasing memtx_memory or number of storages in case of sharded data."
140140
- eval_time: 2m
141-
alertname: MemtxArenaAlert
141+
alertname: LowMemtxArenaRemaining
142142
exp_alerts:
143143
- exp_labels:
144144
severity: page
@@ -160,10 +160,10 @@ tests:
160160
values: '95+0x2 79+0x8'
161161
alert_rule_test:
162162
- eval_time: 2m
163-
alertname: MemtxItemsWarning
163+
alertname: LowMemtxItemsRemainingWarning
164164
exp_alerts: # no alert firing
165165
- eval_time: 2m
166-
alertname: MemtxItemsAlert
166+
alertname: LowMemtxItemsRemaining
167167
exp_alerts: # no alert firing
168168

169169

@@ -175,7 +175,7 @@ tests:
175175
values: '92+0x2 82+0x8'
176176
alert_rule_test:
177177
- eval_time: 2m
178-
alertname: MemtxItemsWarning
178+
alertname: LowMemtxItemsRemainingWarning
179179
exp_alerts:
180180
- exp_labels:
181181
severity: warning
@@ -187,7 +187,7 @@ tests:
187187
description: "Low items memory (tuples) remaining for 'tnt_router' instance of job 'tarantool'.
188188
Consider increasing memtx_memory or number of storages in case of sharded data."
189189
- eval_time: 2m
190-
alertname: MemtxItemsAlert
190+
alertname: LowMemtxItemsRemaining
191191
exp_alerts: # no alert firing
192192

193193

@@ -396,7 +396,7 @@ tests:
396396
values: '0+100x100'
397397
alert_rule_test:
398398
- eval_time: 5m
399-
alertname: CRUDHighErrorRate
399+
alertname: HighCRUDErrorRate
400400
exp_alerts:
401401
- exp_labels:
402402
severity: critical
@@ -418,7 +418,7 @@ tests:
418418
values: '0.11+0x0'
419419
alert_rule_test:
420420
- eval_time: 2m
421-
alertname: CRUDHighLatency
421+
alertname: HighCRUDLatency
422422
exp_alerts:
423423
- exp_labels:
424424
severity: warning
@@ -441,7 +441,7 @@ tests:
441441
values: '0+100x100'
442442
alert_rule_test:
443443
- eval_time: 5m
444-
alertname: CRUDHighMapReduceRate
444+
alertname: HighCRUDMapReduceRate
445445
exp_alerts:
446446
- exp_labels:
447447
severity: warning
@@ -471,7 +471,7 @@ tests:
471471
values: '0.11+0x60'
472472
alert_rule_test:
473473
- eval_time: 10m
474-
alertname: HTTPHighLatency
474+
alertname: HighHTTPLatency
475475
exp_alerts:
476476
- exp_labels:
477477
severity: warning
@@ -502,10 +502,10 @@ tests:
502502
values: '0.02+0x100'
503503
alert_rule_test:
504504
- eval_time: 5m
505-
alertname: HTTPHighClientErrorRateInstance
505+
alertname: HighInstanceHTTPClientErrorRate
506506
exp_alerts:
507507
- exp_labels:
508-
severity: warning
508+
severity: page
509509
instance: app:8081
510510
alias: tnt_router
511511
job: tarantool
@@ -551,13 +551,13 @@ tests:
551551
values: '0.02+0x100'
552552
alert_rule_test:
553553
- eval_time: 5m
554-
alertname: HTTPHighClientErrorRateInstance
554+
alertname: HighInstanceHTTPClientErrorRate
555555
exp_alerts: # no alert firing
556556
- eval_time: 5m
557-
alertname: HTTPHighClientErrorRate
557+
alertname: HighHTTPClientErrorRate
558558
exp_alerts:
559559
- exp_labels:
560-
severity: warning
560+
severity: page
561561
job: tarantool
562562
path: /hell0
563563
method: GET
@@ -580,10 +580,10 @@ tests:
580580
values: '0+0x10 0.01+0x100'
581581
alert_rule_test:
582582
- eval_time: 5m
583-
alertname: HTTPServerErrors
583+
alertname: HighHTTPServerErrorRate
584584
exp_alerts:
585585
- exp_labels:
586-
severity: warning
586+
severity: page
587587
instance: app:8081
588588
alias: tnt_router
589589
job: tarantool
@@ -608,7 +608,7 @@ tests:
608608
values: '0+0x10 0.01+0x100'
609609
alert_rule_test:
610610
- eval_time: 15m
611-
alertname: HTTPLowRequestRateRouter
611+
alertname: LowRouterHTTPRequestRate
612612
exp_alerts:
613613
- exp_labels:
614614
severity: warning

0 commit comments

Comments
 (0)