Skip to content

Commit 919c2a6

Browse files
authored
Merge pull request ceph#60777 from VallariAg/wip-nvmeof-prometheus-rbd-image-reused
monitoring: Add prometheus alert NVMeoFMultipleNamespacesOfRBDImage

Reviewed-by: Afreen Misbah <[email protected]>
2 parents 0a515df + 61b3289 commit 919c2a6

File tree

3 files changed, with 67 additions and 0 deletions.

monitoring/ceph-mixin/prometheus_alerts.libsonnet

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -855,6 +855,16 @@
855855
description: 'Subsystems have a max namespace limit defined at creation time. This alert means that no more namespaces can be added to {{ $labels.nqn }}',
856856
},
857857
},
858+
{
859+
alert: 'NVMeoFMultipleNamespacesOfRBDImage',
860+
'for': '1m',
861+
expr: 'count by(pool_name, rbd_name) (count by(bdev_name, pool_name, rbd_name) (ceph_nvmeof_bdev_metadata and on (bdev_name) ceph_nvmeof_subsystem_namespace_metadata)) > 1',
862+
labels: { severity: 'warning', type: 'ceph_default' },
863+
annotations: {
864+
summary: 'RBD image {{ $labels.pool_name }}/{{ $labels.rbd_name }} cannot be reused for multiple NVMeoF namespace ',
865+
description: 'Each NVMeoF namespace must have a unique RBD pool and image, across all different gateway groups.',
866+
},
867+
},
858868
{
859869
alert: 'NVMeoFTooManyGateways',
860870
'for': '1m',

monitoring/ceph-mixin/prometheus_alerts.yml

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -765,6 +765,15 @@ groups:
765765
labels:
766766
severity: "warning"
767767
type: "ceph_default"
768+
- alert: "NVMeoFMultipleNamespacesOfRBDImage"
769+
annotations:
770+
description: "Each NVMeoF namespace must have a unique RBD pool and image, across all different gateway groups."
771+
summary: "RBD image {{ $labels.pool_name }}/{{ $labels.rbd_name }} cannot be reused for multiple NVMeoF namespace "
772+
expr: "count by(pool_name, rbd_name) (count by(bdev_name, pool_name, rbd_name) (ceph_nvmeof_bdev_metadata and on (bdev_name) ceph_nvmeof_subsystem_namespace_metadata)) > 1"
773+
for: "1m"
774+
labels:
775+
severity: "warning"
776+
type: "ceph_default"
768777
- alert: "NVMeoFTooManyGateways"
769778
annotations:
770779
description: "You may create many gateways, but 4 is the tested limit"

monitoring/ceph-mixin/tests_alerts/test_alerts.yml

Lines changed: 48 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2270,6 +2270,54 @@ tests:
22702270
summary: "wah subsystem has reached its maximum number of namespaces on cluster mycluster"
22712271
description: "Subsystems have a max namespace limit defined at creation time. This alert means that no more namespaces can be added to wah"
22722272

2273+
# NVMeoFMultipleNamespacesOfRBDImage
2274+
- interval: 1m
2275+
input_series:
2276+
- series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev1", instance="ceph-nvme-vm1", pool_name="mypool", rbd_name="myimage1"}'
2277+
values: '1x10'
2278+
- series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev1", instance="ceph-nvme-vm2", pool_name="mypool", rbd_name="myimage1"}'
2279+
values: '1x10'
2280+
- series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev2", instance="ceph-nvme-vm1", pool_name="mypool", rbd_name="myimage2"}'
2281+
values: '1x10'
2282+
- series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev2", instance="ceph-nvme-vm2", pool_name="mypool", rbd_name="myimage2"}'
2283+
values: '1x10'
2284+
- series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev3", instance="ceph-nvme-vm1", pool_name="mypool", rbd_name="myimage1"}'
2285+
values: '1x10'
2286+
- series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev3", instance="ceph-nvme-vm2", pool_name="mypool", rbd_name="myimage1"}'
2287+
values: '1x10'
2288+
- series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev4", instance="ceph-nvme-vm1", pool_name="mypool", rbd_name="myimage1"}' # bdev with no ns
2289+
values: '1x10'
2290+
- series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="nqn1", nsid="1", bdev_name="bdev1", instance="ceph-nvme-vm1", cluster="mycluster"}'
2291+
values: '1x10'
2292+
- series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="nqn1", nsid="1", bdev_name="bdev1", instance="ceph-nvme-vm2", cluster="mycluster"}'
2293+
values: '1x10'
2294+
- series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="nqn1", nsid="2", bdev_name="bdev2", instance="ceph-nvme-vm1", cluster="mycluster"}'
2295+
values: '1x10'
2296+
- series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="nqn1", nsid="2", bdev_name="bdev2", instance="ceph-nvme-vm2", cluster="mycluster"}'
2297+
values: '1x10'
2298+
- series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="nqn2", nsid="1", bdev_name="bdev3", instance="ceph-nvme-vm1", cluster="mycluster"}'
2299+
values: '1x10'
2300+
- series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="nqn2", nsid="1", bdev_name="bdev3", instance="ceph-nvme-vm2", cluster="mycluster"}'
2301+
values: '1x10'
2302+
promql_expr_test:
2303+
- expr: count by(pool_name, rbd_name) (count by(bdev_name, pool_name, rbd_name) (ceph_nvmeof_bdev_metadata and on (bdev_name) ceph_nvmeof_subsystem_namespace_metadata)) > 1
2304+
eval_time: 1m
2305+
exp_samples:
2306+
- labels: '{pool_name="mypool", rbd_name="myimage1"}'
2307+
value: 2
2308+
alert_rule_test:
2309+
- eval_time: 5m
2310+
alertname: NVMeoFMultipleNamespacesOfRBDImage
2311+
exp_alerts:
2312+
- exp_labels:
2313+
pool_name: mypool
2314+
rbd_name: myimage1
2315+
severity: warning
2316+
type: ceph_default
2317+
exp_annotations:
2318+
summary: "RBD image mypool/myimage1 cannot be reused for multiple NVMeoF namespace "
2319+
description: "Each NVMeoF namespace must have a unique RBD pool and image, across all different gateway groups."
2320+
22732321
# NVMeoFTooManyGateways
22742322
- interval: 1m
22752323
input_series:

0 commit comments

Comments
 (0)