Skip to content

Commit 7976f7d

Browse files
authored
Merge pull request ceph#61634 from VallariAg/wip-vallari-nvme-maxgroup-alert
monitoring: add NVMeoFMaxGatewayGroups alert
2 parents 6546da1 + e5cb5db commit 7976f7d

File tree

4 files changed: +52 -0 lines changed

monitoring/ceph-mixin/config.libsonnet

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,7 @@
10 | 10 |   CephRBDMirrorImageTransferBandwidthThreshold: 0.8,
11 | 11 |   CephRBDMirrorImagesPerDaemonThreshold: 100,
12 | 12 |   NVMeoFMaxGatewaysPerGroup: 8,
   | 13 | + NVMeoFMaxGatewayGroups: 4,
13 | 14 |   NVMeoFMaxGatewaysPerCluster: 32,
14 | 15 |   NVMeoFHighGatewayCPU: 80,
15 | 16 |   NVMeoFMaxSubsystemsPerGateway: 128,

monitoring/ceph-mixin/prometheus_alerts.libsonnet

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -885,6 +885,16 @@
885 | 885 |   description: 'You may create many gateways in a gateway group, but %(NVMeoFMaxGatewaysPerGroup)d is the tested limit' % $._config,
886 | 886 |   },
887 | 887 |   },
    | 888 | + {
    | 889 | + alert: 'NVMeoFMaxGatewayGroups',
    | 890 | + 'for': '1m',
    | 891 | + expr: 'count(count by (group, cluster) (ceph_nvmeof_gateway_info)) by (cluster) > %.2f' % [$._config.NVMeoFMaxGatewayGroups],
    | 892 | + labels: { severity: 'warning', type: 'ceph_default' },
    | 893 | + annotations: {
    | 894 | + summary: 'Max gateway groups exceeded%(cluster)s' % $.MultiClusterSummary(),
    | 895 | + description: 'You may create many gateway groups, but %(NVMeoFMaxGatewayGroups)d is the tested limit' % $._config,
    | 896 | + },
    | 897 | + },
888 | 898 |   {
889 | 899 |   alert: 'NVMeoFSingleGatewayGroup',
890 | 900 |   'for': '5m',

monitoring/ceph-mixin/prometheus_alerts.yml

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -792,6 +792,15 @@ groups:
792 | 792 |   labels:
793 | 793 |   severity: "warning"
794 | 794 |   type: "ceph_default"
    | 795 | + - alert: "NVMeoFMaxGatewayGroups"
    | 796 | + annotations:
    | 797 | + description: "You may create many gateway groups, but 4 is the tested limit"
    | 798 | + summary: "Max gateway groups exceeded on cluster {{ $labels.cluster }}"
    | 799 | + expr: "count(count by (group, cluster) (ceph_nvmeof_gateway_info)) by (cluster) > 4.00"
    | 800 | + for: "1m"
    | 801 | + labels:
    | 802 | + severity: "warning"
    | 803 | + type: "ceph_default"
795 | 804 |   - alert: "NVMeoFSingleGatewayGroup"
796 | 805 |   annotations:
797 | 806 |   description: "Although a single member gateway group is valid, it should only be used for test purposes"

monitoring/ceph-mixin/tests_alerts/test_alerts.yml

Lines changed: 32 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2450,6 +2450,38 @@ tests:
2450 | 2450 |   summary: "Max gateways within a gateway group (group-1) exceeded on cluster mycluster"
2451 | 2451 |   description: "You may create many gateways in a gateway group, but 8 is the tested limit"
2452 | 2452 |
     | 2453 | +
     | 2454 | + # NVMeoFMaxGatewayGroups
     | 2455 | + - interval: 1m
     | 2456 | + input_series:
     | 2457 | + - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.1",cluster="mycluster"}'
     | 2458 | + values: '1+0x20'
     | 2459 | + - series: 'ceph_nvmeof_gateway_info{group="group-2",addr="1.1.1.2",cluster="mycluster"}'
     | 2460 | + values: '1+0x20'
     | 2461 | + - series: 'ceph_nvmeof_gateway_info{group="group-3",addr="1.1.1.3",cluster="mycluster"}'
     | 2462 | + values: '1+0x20'
     | 2463 | + - series: 'ceph_nvmeof_gateway_info{group="group-4",addr="1.1.1.9",cluster="mycluster"}'
     | 2464 | + values: '1+0x20'
     | 2465 | + - series: 'ceph_nvmeof_gateway_info{group="group-5",addr="1.1.1.12",cluster="mycluster"}'
     | 2466 | + values: '1+0x20'
     | 2467 | + promql_expr_test:
     | 2468 | + - expr: count(count by (group, cluster) (ceph_nvmeof_gateway_info)) by (cluster) > 4.00
     | 2469 | + eval_time: 1m
     | 2470 | + exp_samples:
     | 2471 | + - labels: '{cluster="mycluster"}'
     | 2472 | + value: 5
     | 2473 | + alert_rule_test:
     | 2474 | + - eval_time: 5m
     | 2475 | + alertname: NVMeoFMaxGatewayGroups
     | 2476 | + exp_alerts:
     | 2477 | + - exp_labels:
     | 2478 | + severity: warning
     | 2479 | + cluster: mycluster
     | 2480 | + type: ceph_default
     | 2481 | + exp_annotations:
     | 2482 | + summary: "Max gateway groups exceeded on cluster mycluster"
     | 2483 | + description: "You may create many gateway groups, but 4 is the tested limit"
     | 2484 | +
2453 | 2485 |   # NVMeoFSingleGatewayGroup
2454 | 2486 |   - interval: 1m
2455 | 2487 |   input_series:

0 commit comments

Comments
 (0)