|
1 | | -- title: "${MACHINE_FQDN}: Interactive Study Data too large for AWS S3 and hanging" |
2 | | - description: "${MACHINE_FQDN}: Study Hanging" |
3 | | - priority: 2 |
4 | | - config: |
5 | | - query: > |
6 | | - "EntityTooLarge" AND NOT container_name:/.*graylog_graylog.*/ |
7 | | - query_parameters: [] |
8 | | - search_within_ms: 600000 |
9 | | - execute_every_ms: 600000 |
10 | | - group_by: [] |
11 | | - series: [] |
12 | | - conditions: {} |
13 | | - type: aggregation-v1 |
14 | | - field_spec: |
15 | | - source: |
16 | | - data_type: string |
17 | | - providers: |
18 | | - - type: template-v1 |
19 | | - template: "${source.source}" |
20 | | - require_values: false |
21 | | - container_name: |
22 | | - data_type: string |
23 | | - providers: |
24 | | - - type: template-v1 |
25 | | - template: "${source.container_name}" |
26 | | - require_values: false |
27 | | - full_message: |
28 | | - data_type: string |
29 | | - providers: |
30 | | - - type: template-v1 |
31 | | - template: "${source.full_message}" |
32 | | - key_spec: |
33 | | - - source |
34 | | - - container_name |
35 | | - - full_message |
36 | | - notification_settings: |
37 | | - grace_period_ms: 0 |
38 | | - backlog_size: 99 |
39 | | - alert: true |
40 | | -- title: "${MACHINE_FQDN}: Writer Is None Error in Webserver" |
41 | | - description: "${MACHINE_FQDN}: Alert if \"writer is None\" pops up. Communication with rabbitMQ is disrupted and this will make simcore go crazy" |
42 | | - priority: 2 |
43 | | - config: |
44 | | - query: > |
45 | | - "writer is None" AND NOT container_name:/.*graylog_graylog.*/ |
46 | | - query_parameters: [] |
47 | | - search_within_ms: 600000 |
48 | | - execute_every_ms: 600000 |
49 | | - group_by: [] |
50 | | - series: [] |
51 | | - conditions: {} |
52 | | - type: aggregation-v1 |
53 | | - field_spec: |
54 | | - source: |
55 | | - data_type: string |
56 | | - providers: |
57 | | - - type: template-v1 |
58 | | - template: "${source.source}" |
59 | | - require_values: false |
60 | | - container_name: |
61 | | - data_type: string |
62 | | - providers: |
63 | | - - type: template-v1 |
64 | | - template: "${source.container_name}" |
65 | | - require_values: false |
66 | | - full_message: |
67 | | - data_type: string |
68 | | - providers: |
69 | | - - type: template-v1 |
70 | | - template: "${source.full_message}" |
71 | | - key_spec: |
72 | | - - source |
73 | | - - container_name |
74 | | - - full_message |
75 | | - notification_settings: |
76 | | - grace_period_ms: 0 |
77 | | - backlog_size: 99 |
78 | | - alert: true |
79 | | -- title: "${MACHINE_FQDN}: Dynamic Sidecar failed to save with S3TransferError" |
80 | | - description: "${MACHINE_FQDN}: Alert if Dynamic Sidecar failed to save with S3TransferError" |
81 | | - priority: 2 |
82 | | - config: |
83 | | - query: > |
84 | | - "simcore_sdk.node_ports_common.exceptions.S3TransferError: Could not upload file" AND NOT container_name:/.*graylog_graylog.*/ |
85 | | - query_parameters: [] |
86 | | - search_within_ms: 600000 |
87 | | - execute_every_ms: 600000 |
88 | | - group_by: [] |
89 | | - series: [] |
90 | | - conditions: {} |
91 | | - type: aggregation-v1 |
92 | | - field_spec: |
93 | | - source: |
94 | | - data_type: string |
95 | | - providers: |
96 | | - - type: template-v1 |
97 | | - template: "${source.source}" |
98 | | - require_values: false |
99 | | - container_name: |
100 | | - data_type: string |
101 | | - providers: |
102 | | - - type: template-v1 |
103 | | - template: "${source.container_name}" |
104 | | - require_values: false |
105 | | - full_message: |
106 | | - data_type: string |
107 | | - providers: |
108 | | - - type: template-v1 |
109 | | - template: "${source.full_message}" |
110 | | - key_spec: |
111 | | - - source |
112 | | - - container_name |
113 | | - - full_message |
114 | | - notification_settings: |
115 | | - grace_period_ms: 0 |
116 | | - backlog_size: 99 |
117 | | - alert: true |
118 | | -- title: "${MACHINE_FQDN}: Dynamic Sidecar failed to save - 2" |
119 | | - description: "${MACHINE_FQDN}: Alert if Dynamic Sidecar failed to save - 2" |
120 | | - priority: 2 |
121 | | - config: |
122 | | - query: > |
123 | | - "Could not contact dynamic-sidecar to save service" AND NOT container_name:/.*graylog_graylog.*/ |
124 | | - query_parameters: [] |
125 | | - search_within_ms: 60000 |
126 | | - execute_every_ms: 60000 |
127 | | - group_by: [] |
128 | | - series: [] |
129 | | - conditions: {} |
130 | | - type: aggregation-v1 |
131 | | - field_spec: |
132 | | - source: |
133 | | - data_type: string |
134 | | - providers: |
135 | | - - type: template-v1 |
136 | | - template: "${source.source}" |
137 | | - require_values: false |
138 | | - container_name: |
139 | | - data_type: string |
140 | | - providers: |
141 | | - - type: template-v1 |
142 | | - template: "${source.container_name}" |
143 | | - require_values: false |
144 | | - full_message: |
145 | | - data_type: string |
146 | | - providers: |
147 | | - - type: template-v1 |
148 | | - template: "${source.full_message}" |
149 | | - key_spec: |
150 | | - - source |
151 | | - - container_name |
152 | | - - full_message |
153 | | - notification_settings: |
154 | | - grace_period_ms: 0 |
155 | | - backlog_size: 99 |
156 | | - alert: true |
157 | | -- title: "${MACHINE_FQDN}: simcore-agent failed pushing docker volume data to backup S3 bucket" |
158 | | - description: "${MACHINE_FQDN}: simcore-agent failed pushing docker volume data to backup S3 bucket" |
159 | | - priority: 2 |
160 | | - config: |
161 | | - query: > |
162 | | - container_name: /.*agent.*/ AND "Shell subprocesses yielded nonzero error code" AND NOT container_name:/.*graylog_graylog.*/ |
163 | | - query_parameters: [] |
164 | | - search_within_ms: 600000 |
165 | | - execute_every_ms: 600000 |
166 | | - group_by: [] |
167 | | - series: [] |
168 | | - conditions: {} |
169 | | - type: aggregation-v1 |
170 | | - field_spec: |
171 | | - source: |
172 | | - data_type: string |
173 | | - providers: |
174 | | - - type: template-v1 |
175 | | - template: "${source.source}" |
176 | | - require_values: false |
177 | | - container_name: |
178 | | - data_type: string |
179 | | - providers: |
180 | | - - type: template-v1 |
181 | | - template: "${source.container_name}" |
182 | | - require_values: false |
183 | | - full_message: |
184 | | - data_type: string |
185 | | - providers: |
186 | | - - type: template-v1 |
187 | | - template: "${source.full_message}" |
188 | | - key_spec: |
189 | | - - source |
190 | | - - container_name |
191 | | - - full_message |
192 | | - notification_settings: |
193 | | - grace_period_ms: 0 |
194 | | - backlog_size: 99 |
195 | | - alert: true |
196 | | -- title: "${MACHINE_FQDN}: faulty env-var setup" |
197 | | - description: "${MACHINE_FQDN}: Look e.g. here https://git.speag.com/oSparc/osparc-ops-environments/-/issues/564" |
198 | | - priority: 2 |
199 | | - config: |
200 | | - query: > |
201 | | - "unresolved, defaulting to None" AND NOT container_name:/.*graylog_graylog.*/ |
202 | | - query_parameters: [] |
203 | | - search_within_ms: 600000 |
204 | | - execute_every_ms: 600000 |
205 | | - group_by: [] |
206 | | - series: [] |
207 | | - conditions: {} |
208 | | - type: aggregation-v1 |
209 | | - field_spec: |
210 | | - source: |
211 | | - data_type: string |
212 | | - providers: |
213 | | - - type: template-v1 |
214 | | - template: "${source.source}" |
215 | | - require_values: false |
216 | | - container_name: |
217 | | - data_type: string |
218 | | - providers: |
219 | | - - type: template-v1 |
220 | | - template: "${source.container_name}" |
221 | | - require_values: false |
222 | | - full_message: |
223 | | - data_type: string |
224 | | - providers: |
225 | | - - type: template-v1 |
226 | | - template: "${source.full_message}" |
227 | | - key_spec: |
228 | | - - source |
229 | | - - container_name |
230 | | - - full_message |
231 | | - notification_settings: |
232 | | - grace_period_ms: 0 |
233 | | - backlog_size: 99 |
234 | | - alert: true |
235 | 1 | - title: "${MACHINE_FQDN}: DOCKER IP POOL EXHAUSTED, no service can start" |
236 | 2 | description: "${MACHINE_FQDN}: DOCKER IP POOL EXHAUSTED, no service can start. See: https://github.com/moby/moby/issues/30820" |
237 | 3 | priority: 3 |
|
240 | 6 | container_name: /.*director-v2.*/ AND "could not find an available, non-overlapping IPv4 address pool among the defaults to assign to the network" AND NOT container_name:/.*graylog_graylog.*/ |
241 | 7 | query_parameters: [] |
242 | 8 | search_within_ms: 600000 |
| 9 | + event_limit: 1 |
243 | 10 | execute_every_ms: 600000 |
244 | 11 | group_by: [] |
245 | 12 | series: [] |
|
271 | 38 | grace_period_ms: 0 |
272 | 39 | backlog_size: 99 |
273 | 40 | alert: true |
274 | | -- title: "${MACHINE_FQDN}: Potential hanging dy-sidecar service detected." |
275 | | - description: "${MACHINE_FQDN}: Potential hanging dy-sidecar service detected. Human intervention required. Please investigate." |
276 | | - priority: 3 |
277 | | - config: |
278 | | - query: > |
279 | | - "waiting for manual intervention" AND container_name:/.*director-v2.*/ |
280 | | - query_parameters: [] |
281 | | - search_within_ms: 3600000 |
282 | | - execute_every_ms: 3600000 |
283 | | - group_by: [] |
284 | | - series: [] |
285 | | - conditions: {} |
286 | | - type: aggregation-v1 |
287 | | - field_spec: |
288 | | - source: |
289 | | - data_type: string |
290 | | - providers: |
291 | | - - type: template-v1 |
292 | | - template: "${source.source}" |
293 | | - require_values: false |
294 | | - container_name: |
295 | | - data_type: string |
296 | | - providers: |
297 | | - - type: template-v1 |
298 | | - template: "${source.container_name}" |
299 | | - require_values: false |
300 | | - full_message: |
301 | | - data_type: string |
302 | | - providers: |
303 | | - - type: template-v1 |
304 | | - template: "${source.full_message}" |
305 | | - key_spec: |
306 | | - - source |
307 | | - - container_name |
308 | | - notification_settings: |
309 | | - grace_period_ms: 0 |
310 | | - backlog_size: 99 |
311 | | - alert: true |
312 | | -- title: "${MACHINE_FQDN}: Syslogs indicate OOM-Event" |
313 | | - description: "${MACHINE_FQDN}: Likely the oom-killer has reaped a container Please investigate and adjust service limitations." |
314 | | - priority: 2 |
315 | | - config: |
316 | | - query: > |
317 | | - "Memory cgroup out of memory:" |
318 | | - query_parameters: [] |
319 | | - search_within_ms: 3600000 |
320 | | - execute_every_ms: 3600000 |
321 | | - group_by: [] |
322 | | - series: [] |
323 | | - conditions: {} |
324 | | - type: aggregation-v1 |
325 | | - field_spec: |
326 | | - source: |
327 | | - data_type: string |
328 | | - providers: |
329 | | - - type: template-v1 |
330 | | - template: "${source.source}" |
331 | | - require_values: false |
332 | | - container_name: |
333 | | - data_type: string |
334 | | - providers: |
335 | | - - type: template-v1 |
336 | | - template: "${source.container_name}" |
337 | | - require_values: false |
338 | | - full_message: |
339 | | - data_type: string |
340 | | - providers: |
341 | | - - type: template-v1 |
342 | | - template: "${source.full_message}" |
343 | | - key_spec: |
344 | | - - source |
345 | | - - container_name |
346 | | - notification_settings: |
347 | | - grace_period_ms: 0 |
348 | | - backlog_size: 99 |
349 | | - alert: true |
350 | 41 | - title: "${MACHINE_FQDN}: Unexpected error with redis lock detected" |
351 | 42 | description: "${MACHINE_FQDN}: This error should only occur in unit tests due to very low timings, maybe something happened here" |
352 | 43 | priority: 2 |
|
355 | 46 | "lock is no longer owned. This is unexpected and requires investigation" AND NOT container_name:/.*graylog_graylog.*/ |
356 | 47 | query_parameters: [] |
357 | 48 | search_within_ms: 3600000 |
| 49 | + event_limit: 1 |
358 | 50 | execute_every_ms: 3600000 |
359 | 51 | group_by: [] |
360 | 52 | series: [] |
|
393 | 85 | "LockNotOwnedError" AND NOT container_name:/.*graylog_graylog.*/ |
394 | 86 | query_parameters: [] |
395 | 87 | search_within_ms: 3600000 |
| 88 | + event_limit: 1 |
396 | 89 | execute_every_ms: 3600000 |
397 | 90 | group_by: [] |
398 | 91 | series: [] |
|
432 | 125 | query_parameters: [] |
433 | 126 | search_within_ms: 86400000 |
434 | 127 | execute_every_ms: 86400000 |
| 128 | + event_limit: 1 |
435 | 129 | group_by: [] |
436 | 130 | series: [] |
437 | 131 | conditions: {} |
|
463 | 157 | query: log_service:/.+payments/ AND (log_level:ERROR OR log_level:WARNING) |
464 | 158 | query_parameters: [] |
465 | 159 | search_within_ms: 600000 |
| 160 | + event_limit: 1 |
466 | 161 | execute_every_ms: 600000 |
467 | 162 | group_by: [] |
468 | 163 | series: [] |
|
0 commit comments