Skip to content

Commit 6e19ad6

Browse files
Merge pull request #80 from tarantool/minor-reworks
Introduce minor reworks
2 parents bc622fb + 98adf4a commit 6e19ad6

File tree

12 files changed

+498
-440
lines changed

12 files changed

+498
-440
lines changed

example/prometheus/alerts.yml

Lines changed: 32 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@ groups:
88
labels:
99
severity: page
1010
annotations:
11-
summary: "Instance {{ $labels.instance }} down"
12-
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than a minute."
11+
summary: "Instance '{{ $labels.instance }}' ('{{ $labels.job }}') down"
12+
description: "'{{ $labels.instance }}' of job '{{ $labels.job }}' has been down for more than a minute."
1313

1414

1515
- name: tarantool-common
@@ -21,8 +21,8 @@ groups:
2121
labels:
2222
severity: warning
2323
annotations:
24-
summary: "Instance {{ $labels.alias }} Lua runtime warning"
25-
description: "{{ $labels.alias }} instance of job {{ $labels.job }} uses too much Lua memory
24+
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') Lua runtime warning"
25+
description: "'{{ $labels.alias }}' instance of job '{{ $labels.job }}' uses too much Lua memory
2626
and may hit threshold soon."
2727

2828
# Alert for any instance that uses too much Lua runtime memory.
@@ -32,8 +32,8 @@ groups:
3232
labels:
3333
severity: page
3434
annotations:
35-
summary: "Instance {{ $labels.alias }} Lua runtime alert"
36-
description: "{{ $labels.alias }} instance of job {{ $labels.job }} uses too much Lua memory
35+
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') Lua runtime alert"
36+
description: "'{{ $labels.alias }}' instance of job '{{ $labels.job }}' uses too much Lua memory
3737
and likely to hit threshold soon."
3838

3939
# Warning for any instance that has low remaining arena memory.
@@ -43,8 +43,8 @@ groups:
4343
labels:
4444
severity: warning
4545
annotations:
46-
summary: "Instance {{ $labels.alias }} low arena memory remaining"
47-
description: "Low arena memory (tuples and indexes) remaining for {{ $labels.alias }} instance of job {{ $labels.job }}.
46+
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') low arena memory remaining"
47+
description: "Low arena memory (tuples and indexes) remaining for '{{ $labels.alias }}' instance of job '{{ $labels.job }}'.
4848
Consider increasing memtx_memory or number of storages in case of sharded data."
4949

5050
# Alert for any instance that has low remaining arena memory.
@@ -54,8 +54,8 @@ groups:
5454
labels:
5555
severity: page
5656
annotations:
57-
summary: "Instance {{ $labels.alias }} low arena memory remaining"
58-
description: "Low arena memory (tuples and indexes) remaining for {{ $labels.alias }} instance of job {{ $labels.job }}.
57+
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') low arena memory remaining"
58+
description: "Low arena memory (tuples and indexes) remaining for '{{ $labels.alias }}' instance of job '{{ $labels.job }}'.
5959
You are likely to hit limit soon.
6060
It is strongly recommended to increase memtx_memory or number of storages in case of sharded data."
6161

@@ -66,8 +66,8 @@ groups:
6666
labels:
6767
severity: warning
6868
annotations:
69-
summary: "Instance {{ $labels.alias }} low items memory remaining"
70-
description: "Low items memory (tuples) remaining for {{ $labels.alias }} instance of job {{ $labels.job }}.
69+
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') low items memory remaining"
70+
description: "Low items memory (tuples) remaining for '{{ $labels.alias }}' instance of job '{{ $labels.job }}'.
7171
Consider increasing memtx_memory or number of storages in case of sharded data."
7272

7373
# Alert for any instance that has low remaining items memory.
@@ -77,8 +77,8 @@ groups:
7777
labels:
7878
severity: page
7979
annotations:
80-
summary: "Instance {{ $labels.alias }} low items memory remaining"
81-
description: "Low items memory (tuples) remaining for {{ $labels.alias }} instance of job {{ $labels.job }}.
80+
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') low items memory remaining"
81+
description: "Low items memory (tuples) remaining for '{{ $labels.alias }}' instance of job '{{ $labels.job }}'.
8282
You are likely to hit limit soon.
8383
It is strongly recommended to increase memtx_memory or number of storages in case of sharded data."
8484

@@ -89,8 +89,9 @@ groups:
8989
labels:
9090
severity: warning
9191
annotations:
92-
summary: "Instance {{ $labels.alias }} have 'warning'-level Cartridge issues"
93-
description: "Possible reasons: high replication lag, replication long idle,
92+
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') has 'warning'-level Cartridge issues"
93+
description: "Instance '{{ $labels.alias }}' of job '{{ $labels.job }}' has 'warning'-level Cartridge issues.
94+
Possible reasons: high replication lag, replication long idle,
9495
failover or switchover issues, clock issues, memory fragmentation,
9596
configuration issues, alien members."
9697

@@ -101,8 +102,9 @@ groups:
101102
labels:
102103
severity: page
103104
annotations:
104-
summary: "Instance {{ $labels.alias }} have 'critical'-level Cartridge issues"
105-
description: "Possible reasons: replication process critical fail,
105+
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') has 'critical'-level Cartridge issues"
106+
description: "Instance '{{ $labels.alias }}' of job '{{ $labels.job }}' has 'critical'-level Cartridge issues.
107+
Possible reasons: replication process critical fail,
106108
running out of available memory."
107109

108110

@@ -112,8 +114,8 @@ groups:
112114
labels:
113115
severity: warning
114116
annotations:
115-
summary: "Instance {{ $labels.alias }} have high replication lag"
116-
description: "Instance {{ $labels.alias }} of job {{ $labels.job }} have high replication lag,
117+
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') have high replication lag"
118+
description: "Instance '{{ $labels.alias }}' of job '{{ $labels.job }}' have high replication lag,
117119
check up your network and cluster state."
118120

119121
- name: tarantool-business
@@ -128,9 +130,9 @@ groups:
128130
labels:
129131
severity: warning
130132
annotations:
131-
summary: "Instance {{ $labels.alias }} high HTTP latency"
133+
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') high HTTP latency"
132134
description: "Some {{ $labels.method }} requests to {{ $labels.path }} path with {{ $labels.status }} response status
133-
on {{ $labels.alias }} instance of job {{ $labels.job }} are processed too long."
135+
on '{{ $labels.alias }}' instance of job '{{ $labels.job }}' are processed too long."
134136

135137
# Warning for any endpoint of an instance in tarantool_app job that sends too many 4xx responses.
136138
# Beware that metric name depends on name of the collector you use in HTTP metrics middleware
@@ -142,9 +144,9 @@ groups:
142144
labels:
143145
severity: warning
144146
annotations:
145-
summary: "Instance {{ $labels.alias }} high rate of client error responses"
147+
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') high rate of client error responses"
146148
description: "Too many {{ $labels.method }} requests to {{ $labels.path }} path
147-
on {{ $labels.alias }} instance of job {{ $labels.job }} get client error (4xx) responses."
149+
on '{{ $labels.alias }}' instance of job '{{ $labels.job }}' get client error (4xx) responses."
148150

149151
# Warning for any endpoint in tarantool_app job that sends too many 4xx responses (cluster overall).
150152
# Beware that metric name depends on name of the collector you use in HTTP metrics middleware
@@ -156,9 +158,9 @@ groups:
156158
labels:
157159
severity: warning
158160
annotations:
159-
summary: "Job {{ $labels.job }} high rate of client error responses"
161+
summary: "Job '{{ $labels.job }}' high rate of client error responses"
160162
description: "Too many {{ $labels.method }} requests to {{ $labels.path }} path
161-
on instances of job {{ $labels.job }} get client error (4xx) responses."
163+
on instances of job '{{ $labels.job }}' get client error (4xx) responses."
162164

163165
# Warning for any endpoint of an instance in tarantool_app job that sends 5xx responses.
164166
# Beware that metric name depends on name of the collector you use in HTTP metrics middleware
@@ -170,9 +172,9 @@ groups:
170172
labels:
171173
severity: warning
172174
annotations:
173-
summary: "Instance {{ $labels.alias }} server error responses"
175+
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') server error responses"
174176
description: "Some {{ $labels.method }} requests to {{ $labels.path }} path
175-
on {{ $labels.alias }} instance of job {{ $labels.job }} get server error (5xx) responses."
177+
on '{{ $labels.alias }}' instance of job '{{ $labels.job }}' get server error (5xx) responses."
176178

177179
# Warning for any endpoint of a router instance (with "router" in alias) in tarantool_app job that gets too few requests.
178180
# Beware that metric name depends on name of the collector you use in HTTP metrics middleware
@@ -184,6 +186,6 @@ groups:
184186
labels:
185187
severity: warning
186188
annotations:
187-
summary: "Router {{ $labels.alias }} low activity"
188-
description: Router {{ $labels.alias }} instance of job {{ $labels.job }} gets too little requests.
189+
summary: "Router '{{ $labels.alias }}' ('{{ $labels.job }}') low activity"
190+
description: Router '{{ $labels.alias }}' instance of job '{{ $labels.job }}' gets too little requests.
189191
Please, check up your balancer middleware."

example/prometheus/test_alerts.yml

Lines changed: 37 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,8 @@ tests:
2222
instance: app:8082
2323
job: tarantool_app
2424
exp_annotations:
25-
summary: "Instance app:8082 down"
26-
description: "app:8082 of job tarantool_app has been down for more than a minute."
25+
summary: "Instance 'app:8082' ('tarantool_app') down"
26+
description: "'app:8082' of job 'tarantool_app' has been down for more than a minute."
2727

2828

2929
- interval: 15s
@@ -40,8 +40,8 @@ tests:
4040
alias: tnt_router
4141
job: tarantool_app
4242
exp_annotations:
43-
summary: "Instance tnt_router Lua runtime warning"
44-
description: "tnt_router instance of job tarantool_app uses too much Lua memory
43+
summary: "Instance 'tnt_router' ('tarantool_app') Lua runtime warning"
44+
description: "'tnt_router' instance of job 'tarantool_app' uses too much Lua memory
4545
and may hit threshold soon."
4646
- eval_time: 2m
4747
alertname: LuaRuntimeAlert
@@ -62,8 +62,8 @@ tests:
6262
alias: tnt_router
6363
job: tarantool_app
6464
exp_annotations:
65-
summary: "Instance tnt_router Lua runtime warning"
66-
description: "tnt_router instance of job tarantool_app uses too much Lua memory
65+
summary: "Instance 'tnt_router' ('tarantool_app') Lua runtime warning"
66+
description: "'tnt_router' instance of job 'tarantool_app' uses too much Lua memory
6767
and may hit threshold soon."
6868
- eval_time: 2m
6969
alertname: LuaRuntimeAlert
@@ -74,8 +74,8 @@ tests:
7474
alias: tnt_router
7575
job: tarantool_app
7676
exp_annotations:
77-
summary: "Instance tnt_router Lua runtime alert"
78-
description: "tnt_router instance of job tarantool_app uses too much Lua memory
77+
summary: "Instance 'tnt_router' ('tarantool_app') Lua runtime alert"
78+
description: "'tnt_router' instance of job 'tarantool_app' uses too much Lua memory
7979
and likely to hit threshold soon."
8080

8181

@@ -110,8 +110,8 @@ tests:
110110
alias: tnt_router
111111
job: tarantool_app
112112
exp_annotations:
113-
summary: "Instance tnt_router low arena memory remaining"
114-
description: "Low arena memory (tuples and indexes) remaining for tnt_router instance of job tarantool_app.
113+
summary: "Instance 'tnt_router' ('tarantool_app') low arena memory remaining"
114+
description: "Low arena memory (tuples and indexes) remaining for 'tnt_router' instance of job 'tarantool_app'.
115115
Consider increasing memtx_memory or number of storages in case of sharded data."
116116
- eval_time: 2m
117117
alertname: MemtxArenaAlert
@@ -134,8 +134,8 @@ tests:
134134
alias: tnt_router
135135
job: tarantool_app
136136
exp_annotations:
137-
summary: "Instance tnt_router low arena memory remaining"
138-
description: "Low arena memory (tuples and indexes) remaining for tnt_router instance of job tarantool_app.
137+
summary: "Instance 'tnt_router' ('tarantool_app') low arena memory remaining"
138+
description: "Low arena memory (tuples and indexes) remaining for 'tnt_router' instance of job 'tarantool_app'.
139139
Consider increasing memtx_memory or number of storages in case of sharded data."
140140
- eval_time: 2m
141141
alertname: MemtxArenaAlert
@@ -146,8 +146,8 @@ tests:
146146
alias: tnt_router
147147
job: tarantool_app
148148
exp_annotations:
149-
summary: "Instance tnt_router low arena memory remaining"
150-
description: "Low arena memory (tuples and indexes) remaining for tnt_router instance of job tarantool_app.
149+
summary: "Instance 'tnt_router' ('tarantool_app') low arena memory remaining"
150+
description: "Low arena memory (tuples and indexes) remaining for 'tnt_router' instance of job 'tarantool_app'.
151151
You are likely to hit limit soon.
152152
It is strongly recommended to increase memtx_memory or number of storages in case of sharded data."
153153

@@ -183,8 +183,8 @@ tests:
183183
alias: tnt_router
184184
job: tarantool_app
185185
exp_annotations:
186-
summary: "Instance tnt_router low items memory remaining"
187-
description: "Low items memory (tuples) remaining for tnt_router instance of job tarantool_app.
186+
summary: "Instance 'tnt_router' ('tarantool_app') low items memory remaining"
187+
description: "Low items memory (tuples) remaining for 'tnt_router' instance of job 'tarantool_app'.
188188
Consider increasing memtx_memory or number of storages in case of sharded data."
189189
- eval_time: 2m
190190
alertname: MemtxItemsAlert
@@ -208,8 +208,9 @@ tests:
208208
alias: tnt_router
209209
job: tarantool_app
210210
exp_annotations:
211-
summary: "Instance tnt_router have 'warning'-level Cartridge issues"
212-
description: "Possible reasons: high replication lag, replication long idle,
211+
summary: "Instance 'tnt_router' ('tarantool_app') has 'warning'-level Cartridge issues"
212+
description: "Instance 'tnt_router' of job 'tarantool_app' has 'warning'-level Cartridge issues.
213+
Possible reasons: high replication lag, replication long idle,
213214
failover or switchover issues, clock issues, memory fragmentation,
214215
configuration issues, alien members."
215216
- eval_time: 2m
@@ -234,8 +235,9 @@ tests:
234235
alias: tnt_router
235236
job: tarantool_app
236237
exp_annotations:
237-
summary: "Instance tnt_router have 'warning'-level Cartridge issues"
238-
description: "Possible reasons: high replication lag, replication long idle,
238+
summary: "Instance 'tnt_router' ('tarantool_app') has 'warning'-level Cartridge issues"
239+
description: "Instance 'tnt_router' of job 'tarantool_app' has 'warning'-level Cartridge issues.
240+
Possible reasons: high replication lag, replication long idle,
239241
failover or switchover issues, clock issues, memory fragmentation,
240242
configuration issues, alien members."
241243
- eval_time: 2m
@@ -248,8 +250,9 @@ tests:
248250
alias: tnt_router
249251
job: tarantool_app
250252
exp_annotations:
251-
summary: "Instance tnt_router have 'critical'-level Cartridge issues"
252-
description: "Possible reasons: replication process critical fail,
253+
summary: "Instance 'tnt_router' ('tarantool_app') has 'critical'-level Cartridge issues"
254+
description: "Instance 'tnt_router' of job 'tarantool_app' has 'critical'-level Cartridge issues.
255+
Possible reasons: replication process critical fail,
253256
running out of available memory."
254257

255258

@@ -269,8 +272,8 @@ tests:
269272
alias: tnt_storage_replica
270273
job: tarantool_app
271274
exp_annotations:
272-
summary: "Instance tnt_storage_replica have high replication lag"
273-
description: "Instance tnt_storage_replica of job tarantool_app have high replication lag,
275+
summary: "Instance 'tnt_storage_replica' ('tarantool_app') have high replication lag"
276+
description: "Instance 'tnt_storage_replica' of job 'tarantool_app' have high replication lag,
274277
check up your network and cluster state."
275278

276279

@@ -300,9 +303,9 @@ tests:
300303
status: '200'
301304
quantile: '0.99'
302305
exp_annotations:
303-
summary: "Instance tnt_router high HTTP latency"
306+
summary: "Instance 'tnt_router' ('tarantool_app') high HTTP latency"
304307
description: "Some GET requests to /hello path with 200 response status
305-
on tnt_router instance of job tarantool_app are processed too long."
308+
on 'tnt_router' instance of job 'tarantool_app' are processed too long."
306309

307310

308311
- interval: 15s
@@ -329,9 +332,9 @@ tests:
329332
path: /hell0
330333
method: GET
331334
exp_annotations:
332-
summary: "Instance tnt_router high rate of client error responses"
335+
summary: "Instance 'tnt_router' ('tarantool_app') high rate of client error responses"
333336
description: "Too many GET requests to /hell0 path
334-
on tnt_router instance of job tarantool_app get client error (4xx) responses."
337+
on 'tnt_router' instance of job 'tarantool_app' get client error (4xx) responses."
335338

336339
# Total rate of 4xx is high, but distributed between different routers
337340
- interval: 15s
@@ -379,9 +382,9 @@ tests:
379382
path: /hell0
380383
method: GET
381384
exp_annotations:
382-
summary: "Job tarantool_app high rate of client error responses"
385+
summary: "Job 'tarantool_app' high rate of client error responses"
383386
description: "Too many GET requests to /hell0 path
384-
on instances of job tarantool_app get client error (4xx) responses."
387+
on instances of job 'tarantool_app' get client error (4xx) responses."
385388

386389
- interval: 15s
387390
input_series:
@@ -407,9 +410,9 @@ tests:
407410
path: /goodbye
408411
method: POST
409412
exp_annotations:
410-
summary: "Instance tnt_router server error responses"
413+
summary: "Instance 'tnt_router' ('tarantool_app') server error responses"
411414
description: "Some POST requests to /goodbye path
412-
on tnt_router instance of job tarantool_app get server error (5xx) responses."
415+
on 'tnt_router' instance of job 'tarantool_app' get server error (5xx) responses."
413416

414417
- interval: 15s
415418
input_series:
@@ -433,6 +436,6 @@ tests:
433436
alias: tnt_router
434437
job: tarantool_app
435438
exp_annotations:
436-
summary: "Router tnt_router low activity"
437-
description: Router tnt_router instance of job tarantool_app gets too little requests.
439+
summary: "Router 'tnt_router' ('tarantool_app') low activity"
440+
description: Router 'tnt_router' instance of job 'tarantool_app' gets too little requests.
438441
Please, check up your balancer middleware."

0 commit comments

Comments
 (0)