|
71 | 71 | |||,
|
72 | 72 | },
|
73 | 73 | },
|
74 |
| - { |
75 |
| - // We're syncing every 10mins, and this means with a 5min rate, we will have a NaN when syncs fail |
76 |
| - // and we will never trigger the alert. |
77 |
| - // We also have a 3h grace-period for creation of tables, which means we can fail for 3h before it's an outage. |
78 |
| - alert: 'CortexTableSyncFailure', |
79 |
| - expr: ||| |
80 |
| - 100 * rate(cortex_table_manager_sync_duration_seconds_count{status_code!~"2.."}[15m]) |
81 |
| - / |
82 |
| - rate(cortex_table_manager_sync_duration_seconds_count[15m]) |
83 |
| - > 10 |
84 |
| - |||, |
85 |
| - 'for': '30m', |
86 |
| - labels: { |
87 |
| - severity: 'critical', |
88 |
| - }, |
89 |
| - annotations: { |
90 |
| - message: ||| |
91 |
| - {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% errors syncing tables. |
92 |
| - |||, |
93 |
| - }, |
94 |
| - }, |
95 | 74 | {
|
96 | 75 | alert: 'CortexQueriesIncorrect',
|
97 | 76 | expr: |||
|
|
206 | 185 | |||,
|
207 | 186 | },
|
208 | 187 | },
|
209 |
| - { |
210 |
| - alert: 'CortexTransferFailed', |
211 |
| - expr: ||| |
212 |
| - max_over_time(cortex_shutdown_duration_seconds_count{op="transfer",status!="success"}[15m]) |
213 |
| - |||, |
214 |
| - 'for': '5m', |
215 |
| - labels: { |
216 |
| - severity: 'critical', |
217 |
| - }, |
218 |
| - annotations: { |
219 |
| - message: ||| |
220 |
| - {{ $labels.job }}/{{ $labels.instance }} transfer failed. |
221 |
| - |||, |
222 |
| - }, |
223 |
| - }, |
224 |
| - { |
225 |
| - alert: 'CortexOldChunkInMemory', |
226 |
| - // Even though we should flush chunks after 6h, we see that the 99th-percentile age of flushed chunks is closer |
227 |
| - // to 10 hours. |
228 |
| - // Ignore cortex_oldest_unflushed_chunk_timestamp_seconds that are zero (eg. distributors). |
229 |
| - expr: ||| |
230 |
| - (time() - cortex_oldest_unflushed_chunk_timestamp_seconds > 36000) |
231 |
| - and |
232 |
| - (cortex_oldest_unflushed_chunk_timestamp_seconds > 0) |
233 |
| - |||, |
234 |
| - 'for': '5m', |
235 |
| - labels: { |
236 |
| - severity: 'warning', |
237 |
| - }, |
238 |
| - annotations: { |
239 |
| - message: ||| |
240 |
| - {{ $labels.job }}/{{ $labels.instance }} has very old unflushed chunk in memory. |
241 |
| - |||, |
242 |
| - }, |
243 |
| - }, |
244 | 188 | {
|
245 | 189 | alert: 'CortexKVStoreFailure',
|
246 | 190 | expr: |||
|
|
379 | 323 | },
|
380 | 324 | ],
|
381 | 325 | },
|
382 |
| - { |
383 |
| - name: 'cortex_wal_alerts', |
384 |
| - rules: [ |
385 |
| - { |
386 |
| - // Alert immediately if WAL is corrupt. |
387 |
| - alert: 'CortexWALCorruption', |
388 |
| - expr: ||| |
389 |
| - increase(cortex_ingester_wal_corruptions_total[5m]) > 0 |
390 |
| - |||, |
391 |
| - labels: { |
392 |
| - severity: 'critical', |
393 |
| - }, |
394 |
| - annotations: { |
395 |
| - message: ||| |
396 |
| - {{ $labels.job }}/{{ $labels.instance }} has a corrupted WAL or checkpoint. |
397 |
| - |||, |
398 |
| - }, |
399 |
| - }, |
400 |
| - { |
401 |
| - // At least one failed checkpoint creation is a warning. |
402 |
| - alert: 'CortexCheckpointCreationFailed', |
403 |
| - expr: ||| |
404 |
| - increase(cortex_ingester_checkpoint_creations_failed_total[10m]) > 0 |
405 |
| - |||, |
406 |
| - labels: { |
407 |
| - severity: 'warning', |
408 |
| - }, |
409 |
| - annotations: { |
410 |
| - message: ||| |
411 |
| - {{ $labels.job }}/{{ $labels.instance }} failed to create checkpoint. |
412 |
| - |||, |
413 |
| - }, |
414 |
| - }, |
415 |
| - { |
416 |
| - // Two or more failed checkpoint creations in 1h means something is wrong. |
417 |
| - alert: 'CortexCheckpointCreationFailed', |
418 |
| - expr: ||| |
419 |
| - increase(cortex_ingester_checkpoint_creations_failed_total[1h]) > 1 |
420 |
| - |||, |
421 |
| - labels: { |
422 |
| - severity: 'critical', |
423 |
| - }, |
424 |
| - annotations: { |
425 |
| - message: ||| |
426 |
| - {{ $labels.job }}/{{ $labels.instance }} is failing to create checkpoint. |
427 |
| - |||, |
428 |
| - }, |
429 |
| - }, |
430 |
| - { |
431 |
| - // At least one failed checkpoint deletion is a warning. |
432 |
| - alert: 'CortexCheckpointDeletionFailed', |
433 |
| - expr: ||| |
434 |
| - increase(cortex_ingester_checkpoint_deletions_failed_total[10m]) > 0 |
435 |
| - |||, |
436 |
| - labels: { |
437 |
| - severity: 'warning', |
438 |
| - }, |
439 |
| - annotations: { |
440 |
| - message: ||| |
441 |
| - {{ $labels.job }}/{{ $labels.instance }} failed to delete checkpoint. |
442 |
| - |||, |
443 |
| - }, |
444 |
| - }, |
445 |
| - { |
446 |
| - // Two or more failed checkpoint deletions in 2h means something is wrong. |
447 |
| - // We give this more buffer than creation as this is a less critical operation. |
448 |
| - alert: 'CortexCheckpointDeletionFailed', |
449 |
| - expr: ||| |
450 |
| - increase(cortex_ingester_checkpoint_deletions_failed_total[2h]) > 1 |
451 |
| - |||, |
452 |
| - labels: { |
453 |
| - severity: 'critical', |
454 |
| - }, |
455 |
| - annotations: { |
456 |
| - message: ||| |
457 |
| - {{ $labels.instance }} is failing to delete checkpoint. |
458 |
| - |||, |
459 |
| - }, |
460 |
| - }, |
461 |
| - ], |
462 |
| - }, |
463 | 326 | {
|
464 | 327 | name: 'cortex-rollout-alerts',
|
465 | 328 | rules: [
|
|
524 | 387 | {
|
525 | 388 | name: 'cortex-provisioning',
|
526 | 389 | rules: [
|
527 |
| - { |
528 |
| - alert: 'CortexProvisioningMemcachedTooSmall', |
529 |
| - // 4 x in-memory series size = 24hrs of data. |
530 |
| - expr: ||| |
531 |
| - ( |
532 |
| - 4 * |
533 |
| - sum by (%s) (cortex_ingester_memory_series * cortex_ingester_chunk_size_bytes_sum / cortex_ingester_chunk_size_bytes_count) |
534 |
| - / 1e9 |
535 |
| - ) |
536 |
| - > |
537 |
| - ( |
538 |
| - sum by (%s) (memcached_limit_bytes{job=~".+/memcached"}) / 1e9 |
539 |
| - ) |
540 |
| - ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], |
541 |
| - 'for': '15m', |
542 |
| - labels: { |
543 |
| - severity: 'warning', |
544 |
| - }, |
545 |
| - annotations: { |
546 |
| - message: ||| |
547 |
| - Chunk memcached cluster in %(alert_aggregation_variables)s is too small, should be at least {{ printf "%%.2f" $value }}GB. |
548 |
| - ||| % $._config, |
549 |
| - }, |
550 |
| - }, |
551 | 390 | {
|
552 | 391 | alert: 'CortexProvisioningTooManyActiveSeries',
|
553 | 392 | // We target each ingester to 1.5M in-memory series. This alert fires if the average
|
|
0 commit comments