
Commit 5f6295e

Fix MongoDbDegraded alert
- Rely on kube_statefulset_replicas instead of a parameter, to better adapt to different deployment configurations.
- Use the same alert name for warning & critical, so they get deduplicated automatically.

Issue: ZENKO-5097
1 parent dd914bb commit 5f6295e
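In essence, the alert thresholds are now read from the cluster (kube_statefulset_replicas, exported by kube-state-metrics) instead of being rendered in as a fixed replicas input. A minimal sketch of the idea for a single StatefulSet, using the label values that appear in the tests below; the actual rules in monitoring/mongodb/alerts.yaml are templated and join per statefulset via label_replace:

    # Healthy mongos instances vs. the StatefulSet's desired replica count.
    sum(up{namespace="zenko", job="zenko/data-db-mongodb-sharded-mongos"})
      <
    scalar(kube_statefulset_replicas{namespace="zenko", statefulset="data-db-mongodb-sharded-mongos"})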

File tree: 3 files changed, +73 −40 lines

- .github/workflows/alerts.yaml
- monitoring/mongodb/alerts.test.yaml
- monitoring/mongodb/alerts.yaml

.github/workflows/alerts.yaml

Lines changed: 0 additions & 1 deletion
@@ -33,7 +33,6 @@ jobs:
           namespace=zenko
           service=data-db-mongodb-sharded
           pvc=datadir-mongodb
-          replicas=3
         github_token: ${{ steps.app-token.outputs.token }}

     - name: Render and test kafka alerts

monitoring/mongodb/alerts.test.yaml

Lines changed: 49 additions & 22 deletions
@@ -2,7 +2,7 @@
 # Use github.com/scality/action-prom-render-test@python-renderer python module
 #
 # Render the alerts file with
-# alertgen alerts.yaml -i 'namespace=zenko,service=data-db-mongodb-sharded,pvc=datadir-mongodb,replicas=3'
+# alertgen alerts.yaml -i 'namespace=zenko,service=data-db-mongodb-sharded,pvc=datadir-mongodb'

 evaluation_interval: 30s

@@ -19,33 +19,35 @@ tests:
         values: 1 1 1 1 0 0 0 1 1
       - series: up{namespace="zenko", job="zenko/data-db-mongodb-sharded-mongos", pod="data-db-mongodb-sharded-mongos-2"}
         values: 1 1 1 1 1 0 0 0 1
+      - series: kube_statefulset_replicas{namespace="zenko", statefulset="data-db-mongodb-sharded-mongos", job="kube-state-metrics"}
+        values: 3x9
+      - series: kube_statefulset_replicas{namespace="zenko", statefulset="data-db-mongodb-sharded-shard0-data", job="kube-state-metrics"}
+        values: 3x9
     alert_rule_test:
       - { alertname: MongoDbDegraded, eval_time: 30s, exp_alerts: [] }
-      - { alertname: MongoDbCritical, eval_time: 30s, exp_alerts: [] }

       - { alertname: MongoDbDegraded, eval_time: 60s, exp_alerts: [] }
-      - { alertname: MongoDbCritical, eval_time: 60s, exp_alerts: [] }

-      - { alertname: MongoDbCritical, eval_time: 90s, exp_alerts: [] }
       - alertname: MongoDbDegraded
         eval_time: 90s
         exp_alerts:
           - exp_labels:
               severity: warning
               job: mongos
+              statefulset: data-db-mongodb-sharded-mongos
             exp_annotations:
-              description: "Less than 100% of MongoDb mongos instances are up and healthy: 2/3."
+              description: "Less than 100% of MongoDb mongos instances are up and healthy: 2."
               summary: MongoDb mongos service degraded

-      - { alertname: MongoDbCritical, eval_time: 120s, exp_alerts: [] }
       - alertname: MongoDbDegraded
         eval_time: 120s
         exp_alerts:
           - exp_labels:
               severity: warning
               job: mongos
+              statefulset: data-db-mongodb-sharded-mongos
             exp_annotations:
-              description: "Less than 100% of MongoDb mongos instances are up and healthy: 1/3."
+              description: "Less than 100% of MongoDb mongos instances are up and healthy: 1."
               summary: MongoDb mongos service degraded

       - alertname: MongoDbDegraded
@@ -54,17 +56,16 @@ tests:
           - exp_labels:
               severity: warning
               job: mongos
+              statefulset: data-db-mongodb-sharded-mongos
             exp_annotations:
-              description: "Less than 100% of MongoDb mongos instances are up and healthy: 0/3."
+              description: "Less than 100% of MongoDb mongos instances are up and healthy: 0."
               summary: MongoDb mongos service degraded
-      - alertname: MongoDbCritical
-        eval_time: 150s
-        exp_alerts:
           - exp_labels:
               severity: critical
               job: mongos
+              statefulset: data-db-mongodb-sharded-mongos
             exp_annotations:
-              description: "Less than 50% of MongoDb mongos instances are up and healthy: 0/3."
+              description: "Less than 50% of MongoDb mongos instances are up and healthy: 0."
               summary: MongoDb mongos service critical

       - alertname: MongoDbDegraded
@@ -73,17 +74,16 @@ tests:
           - exp_labels:
               severity: warning
               job: mongos
+              statefulset: data-db-mongodb-sharded-mongos
             exp_annotations:
-              description: "Less than 100% of MongoDb mongos instances are up and healthy: 1/3."
+              description: "Less than 100% of MongoDb mongos instances are up and healthy: 1."
               summary: MongoDb mongos service degraded
-      - alertname: MongoDbCritical
-        eval_time: 180s
-        exp_alerts:
           - exp_labels:
               severity: critical
               job: mongos
+              statefulset: data-db-mongodb-sharded-mongos
             exp_annotations:
-              description: "Less than 50% of MongoDb mongos instances are up and healthy: 1/3."
+              description: "Less than 50% of MongoDb mongos instances are up and healthy: 1."
               summary: MongoDb mongos service critical

   - name: NoPrimary
@@ -457,7 +457,8 @@ tests:
         values: 2x8 stale
       - series: mongodb_rs_members_state{namespace="zenko", pod="data-db-mongodb-sharded-shard0-data-2", member_state="SECONDARY", rs_nm="data-db-mongodb-sharded-shard-0", member_idx="shard0-data-2"}
         values: 2x8 stale
-
+      - series: kube_statefulset_replicas{namespace="zenko", statefulset="data-db-mongodb-sharded-shard0-data", job="kube-state-metrics"}
+        values: 3x20
     alert_rule_test:
       - alertname: MongoDbRSNotSynced
         eval_time: 5m
@@ -477,8 +478,12 @@ tests:
           - exp_labels:
               severity: warning
               rs_nm: data-db-mongodb-sharded-shard-0
+              statefulset: data-db-mongodb-sharded-shard0-data
             exp_annotations:
-              description: "MongoDB replica set `data-db-mongodb-sharded-shard-0` is not in the expected state. It does not have the expected number of SECONDARY members. Please ensure that all instances are running properly."
+              description: >-
+                MongoDB replica set `data-db-mongodb-sharded-shard-0` is not in the expected state.
+                It does not have the expected number of SECONDARY members. Please ensure that all
+                instances are running properly.
               summary: MongoDB replica set out of sync

   - name: MongoDbRSNotSynced_MultiShardOneFailsOneHealthy
@@ -535,6 +540,11 @@ tests:
         values: 2x20
       - series: mongodb_rs_members_state{namespace="zenko", pod="data-db-mongodb-sharded-shard1-data-2", member_state="SECONDARY", rs_nm="data-db-mongodb-sharded-shard-1", member_idx="shard1-data-2"}
         values: 2x20
+
+      - series: kube_statefulset_replicas{namespace="zenko", statefulset="data-db-mongodb-sharded-shard0-data", job="kube-state-metrics"}
+        values: 3x20
+      - series: kube_statefulset_replicas{namespace="zenko", statefulset="data-db-mongodb-sharded-shard1-data", job="kube-state-metrics"}
+        values: 3x20
     alert_rule_test:
       - alertname: MongoDbRSNotSynced
         eval_time: 5m
@@ -551,8 +561,12 @@ tests:
           - exp_labels:
               severity: warning
               rs_nm: data-db-mongodb-sharded-shard-0
+              statefulset: data-db-mongodb-sharded-shard0-data
             exp_annotations:
-              description: "MongoDB replica set `data-db-mongodb-sharded-shard-0` is not in the expected state. It does not have the expected number of SECONDARY members. Please ensure that all instances are running properly."
+              description: >-
+                MongoDB replica set `data-db-mongodb-sharded-shard-0` is not in the expected state.
+                It does not have the expected number of SECONDARY members. Please ensure that all
+                instances are running properly.
               summary: MongoDB replica set out of sync

   - name: MongoDbRSNotSynced_TwoShardsFailSimultaneously
@@ -613,6 +627,11 @@ tests:
         values: 2x8 stale
       - series: mongodb_rs_members_state{namespace="zenko", pod="data-db-mongodb-sharded-shard1-data-2", member_state="SECONDARY", rs_nm="data-db-mongodb-sharded-shard-1", member_idx="shard1-data-2"}
         values: 2x8 stale
+
+      - series: kube_statefulset_replicas{namespace="zenko", statefulset="data-db-mongodb-sharded-shard0-data", job="kube-state-metrics"}
+        values: 3x20
+      - series: kube_statefulset_replicas{namespace="zenko", statefulset="data-db-mongodb-sharded-shard1-data", job="kube-state-metrics"}
+        values: 3x20
     alert_rule_test:
       - alertname: MongoDbRSNotSynced
         eval_time: 5m
@@ -629,12 +648,20 @@ tests:
           - exp_labels: # Expected alert for shard-0
               severity: warning
               rs_nm: data-db-mongodb-sharded-shard-0
+              statefulset: data-db-mongodb-sharded-shard0-data
             exp_annotations:
-              description: "MongoDB replica set `data-db-mongodb-sharded-shard-0` is not in the expected state. It does not have the expected number of SECONDARY members. Please ensure that all instances are running properly."
+              description: >-
+                MongoDB replica set `data-db-mongodb-sharded-shard-0` is not in the expected state.
+                It does not have the expected number of SECONDARY members. Please ensure that all
+                instances are running properly.
               summary: MongoDB replica set out of sync
           - exp_labels: # Expected alert for shard-1
               severity: warning
               rs_nm: data-db-mongodb-sharded-shard-1
+              statefulset: data-db-mongodb-sharded-shard1-data
             exp_annotations:
-              description: "MongoDB replica set `data-db-mongodb-sharded-shard-1` is not in the expected state. It does not have the expected number of SECONDARY members. Please ensure that all instances are running properly."
+              description: >-
+                MongoDB replica set `data-db-mongodb-sharded-shard-1` is not in the expected state.
+                It does not have the expected number of SECONDARY members. Please ensure that all
+                instances are running properly.
               summary: MongoDB replica set out of sync
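A note on the kube_statefulset_replicas input series added throughout these tests (not part of the diff itself): the values use promtool's expanding notation, where, as far as I recall the unit-test format, 3x9 stands for the value 3 repeated over 9 further sample steps. Written out, the first added series would be roughly equivalent to:

      - series: kube_statefulset_replicas{namespace="zenko", statefulset="data-db-mongodb-sharded-mongos", job="kube-state-metrics"}
        values: 3 3 3 3 3 3 3 3 3 3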

monitoring/mongodb/alerts.yaml

Lines changed: 24 additions & 17 deletions
@@ -18,9 +18,6 @@ x-inputs:
   - name: remainingDiskSpaceWarningThreshold
     type: config
     value: 0.25
-  - name: replicas
-    type: config
-    value: 3
   - name: replicationLagWarningThreshold
     type: config
     value: 70
@@ -38,29 +35,35 @@ groups:
     - alert: MongoDbDegraded
       expr: |
         label_replace(
-          sum(up{namespace="${namespace}",pod=~"${service}.*"}) by(job) < ${replicas}
-        , "job", "$1", "job", "(?:${namespace}/)?${service}-?(.*)")
+          label_replace(
+            sum(up{namespace="${namespace}",pod=~"${service}.*"}) by(job)
+          , "statefulset", "$1", "job", "(?:${namespace}/)?(${service}-?.*)")
+          < on(statefulset)
+          kube_statefulset_replicas{statefulset=~"${service}.*", namespace="${namespace}"}
+        , "job", "$1", "statefulset", "${service}-?(.*)")
       for: 30s
       labels:
         severity: warning
       annotations:
         description: >-
-          Less than 100% of MongoDb {{ $labels.job }} instances are up and healthy:
-          {{ $value }}/${replicas}.
+          Less than 100% of MongoDb {{ $labels.job }} instances are up and healthy: {{ $value }}.
         summary: MongoDb {{ $labels.job }} service degraded

-    - alert: MongoDbCritical
+    - alert: MongoDbDegraded
       expr: |
         label_replace(
-          sum(up{namespace="${namespace}",pod=~"${service}.*"}) by(job) < ${replicas} / 2
-        , "job", "$1", "job", "(?:${namespace}/)?${service}-?(.*)")
+          label_replace(
+            sum(up{namespace="${namespace}",pod=~"${service}.*"}) by(job)
+          , "statefulset", "$1", "job", "(?:${namespace}/)?(${service}-?.*)")
+          < on(statefulset)
+          kube_statefulset_replicas{statefulset=~"${service}.*", namespace="${namespace}"} / 2
+        , "job", "$1", "statefulset", "${service}-?(.*)")
       for: 30s
       labels:
         severity: critical
       annotations:
         description: >-
-          Less than 50% of MongoDb {{ $labels.job }} instances are up and healthy:
-          {{ $value }}/${replicas}.
+          Less than 50% of MongoDb {{ $labels.job }} instances are up and healthy: {{ $value }}.
         summary: MongoDb {{ $labels.job }} service critical

     - alert: NoPrimary
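As a reading aid (not part of the commit), here is how the inner label_replace produces the join key, with ${namespace} and ${service} substituted with the values used in the tests; the sample value is invented:

    label_replace(
      sum(up{namespace="zenko", pod=~"data-db-mongodb-sharded.*"}) by (job)
    , "statefulset", "$1", "job", "(?:zenko/)?(data-db-mongodb-sharded-?.*)")

    # For the mongos scrape job this yields, e.g.:
    #   {job="zenko/data-db-mongodb-sharded-mongos", statefulset="data-db-mongodb-sharded-mongos"}  2
    # which "< on(statefulset)" then compares against
    #   kube_statefulset_replicas{statefulset="data-db-mongodb-sharded-mongos"}  3
    # The outer label_replace rewrites job to the short suffix ("mongos") used in the
    # alert's description and summary.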
@@ -265,11 +268,15 @@ groups:

     - alert: MongoDbRSNotSynced
       expr: |
-        count by (rs_nm) (
-          group by (rs_nm, member_idx) (
-            mongodb_rs_members_state{member_state="SECONDARY", namespace="${namespace}", pod=~"${service}.*"}
-          )
-        ) != (${replicas} - 1)
+        count by (rs_nm, statefulset) (
+          group by (rs_nm, statefulset, member_idx) (
+            label_replace(
+              mongodb_rs_members_state{member_state="SECONDARY", namespace="${namespace}", pod=~"${service}.*"}
+            , "statefulset", "$1", "pod", "(${service}-?.*)-\\d+")
+          )
+        )
+        != on(statefulset) group_left
+        kube_statefulset_replicas{namespace="${namespace}", statefulset=~"${service}.*"} - 1
       for: 10m
       labels:
         rs_nm: "{{ $labels.rs_nm }}"
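And a reading aid for the reworked MongoDbRSNotSynced condition (again with the test values substituted in, not the templated form): the right-hand side is the number of SECONDARY members each replica set is expected to have, derived from the StatefulSet rather than from a rendered-in constant:

    # Expected SECONDARY members per replica set: desired replicas minus the PRIMARY.
    # With 3 replicas this evaluates to 2 for every shard.
    kube_statefulset_replicas{namespace="zenko", statefulset=~"data-db-mongodb-sharded.*"} - 1

The left-hand side counts the distinct member_idx values currently reported as SECONDARY per (rs_nm, statefulset); a mismatch that persists for 10m raises the alert for that replica set only, whatever replica count the cluster is actually deployed with.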
