Skip to content

Commit 1c08663

Browse files
authored
Merge pull request ceph#61278 from VallariAg/wip-vallari-nvmeof-alerts-update
monitoring: Update nvmeof alert limits in config
2 parents a142350 + f3c1881 commit 1c08663

File tree

3 files changed: +91 additions, -24 deletions.

monitoring/ceph-mixin/config.libsonnet

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,12 @@
99
CephNodeNetworkPacketDropsPerSec: 10,
1010
CephRBDMirrorImageTransferBandwidthThreshold: 0.8,
1111
CephRBDMirrorImagesPerDaemonThreshold: 100,
12-
NVMeoFMaxGatewaysPerGroup: 4,
13-
NVMeoFMaxGatewaysPerCluster: 4,
12+
NVMeoFMaxGatewaysPerGroup: 8,
13+
NVMeoFMaxGatewaysPerCluster: 32,
1414
NVMeoFHighGatewayCPU: 80,
1515
NVMeoFMaxSubsystemsPerGateway: 128,
16-
NVMeoFMaxNamespaces: 1024,
17-
NVMeoFHighClientCount: 32,
16+
NVMeoFMaxNamespaces: 2048,
17+
NVMeoFHighClientCount: 128,
1818
NVMeoFHighHostCPU: 80,
1919
//
2020
// Read/Write latency is defined in ms

monitoring/ceph-mixin/prometheus_alerts.yml

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -776,18 +776,18 @@ groups:
776776
type: "ceph_default"
777777
- alert: "NVMeoFTooManyGateways"
778778
annotations:
779-
description: "You may create many gateways, but 4 is the tested limit"
779+
description: "You may create many gateways, but 32 is the tested limit"
780780
summary: "Max supported gateways exceeded on cluster {{ $labels.cluster }}"
781-
expr: "count(ceph_nvmeof_gateway_info) by (cluster) > 4.00"
781+
expr: "count(ceph_nvmeof_gateway_info) by (cluster) > 32.00"
782782
for: "1m"
783783
labels:
784784
severity: "warning"
785785
type: "ceph_default"
786786
- alert: "NVMeoFMaxGatewayGroupSize"
787787
annotations:
788-
description: "You may create many gateways in a gateway group, but 4 is the tested limit"
788+
description: "You may create many gateways in a gateway group, but 8 is the tested limit"
789789
summary: "Max gateways within a gateway group ({{ $labels.group }}) exceeded on cluster {{ $labels.cluster }}"
790-
expr: "count(ceph_nvmeof_gateway_info) by (cluster,group) > 4.00"
790+
expr: "count(ceph_nvmeof_gateway_info) by (cluster,group) > 8.00"
791791
for: "1m"
792792
labels:
793793
severity: "warning"
@@ -832,7 +832,7 @@ groups:
832832
annotations:
833833
description: "Although you may continue to create namespaces in {{ $labels.gateway_host }}, the configuration may not be supported"
834834
summary: "The number of namespaces defined to the gateway exceeds supported values on cluster {{ $labels.cluster }}"
835-
expr: "sum by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_namespace_count,\"gateway_host\",\"$1\",\"instance\",\"(.*?)(?::.*)?\")) > 1024.00"
835+
expr: "sum by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_namespace_count,\"gateway_host\",\"$1\",\"instance\",\"(.*?)(?::.*)?\")) > 2048.00"
836836
for: "1m"
837837
labels:
838838
severity: "warning"
@@ -848,9 +848,9 @@ groups:
848848
type: "ceph_default"
849849
- alert: "NVMeoFHighClientCount"
850850
annotations:
851-
description: "The supported limit for clients connecting to a subsystem is 32"
851+
description: "The supported limit for clients connecting to a subsystem is 128"
852852
summary: "The number of clients connected to {{ $labels.nqn }} is too high on cluster {{ $labels.cluster }}"
853-
expr: "ceph_nvmeof_subsystem_host_count > 32.00"
853+
expr: "ceph_nvmeof_subsystem_host_count > 128.00"
854854
for: "1m"
855855
labels:
856856
severity: "warning"

monitoring/ceph-mixin/tests_alerts/test_alerts.yml

Lines changed: 80 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2331,12 +2331,69 @@ tests:
23312331
values: '1+0x20'
23322332
- series: 'ceph_nvmeof_gateway_info{addr="1.1.1.5",cluster="mycluster"}'
23332333
values: '1+0x20'
2334+
- series: 'ceph_nvmeof_gateway_info{addr="1.1.1.6",cluster="mycluster"}'
2335+
values: '1+0x20'
2336+
- series: 'ceph_nvmeof_gateway_info{addr="1.1.1.7",cluster="mycluster"}'
2337+
values: '1+0x20'
2338+
- series: 'ceph_nvmeof_gateway_info{addr="1.1.1.8",cluster="mycluster"}'
2339+
values: '1+0x20'
2340+
- series: 'ceph_nvmeof_gateway_info{addr="1.1.1.9",cluster="mycluster"}'
2341+
values: '1+0x20'
2342+
- series: 'ceph_nvmeof_gateway_info{addr="1.1.1.10",cluster="mycluster"}'
2343+
values: '1+0x20'
2344+
- series: 'ceph_nvmeof_gateway_info{addr="1.1.1.11",cluster="mycluster"}'
2345+
values: '1+0x20'
2346+
- series: 'ceph_nvmeof_gateway_info{addr="1.1.1.12",cluster="mycluster"}'
2347+
values: '1+0x20'
2348+
- series: 'ceph_nvmeof_gateway_info{addr="1.1.1.13",cluster="mycluster"}'
2349+
values: '1+0x20'
2350+
- series: 'ceph_nvmeof_gateway_info{addr="1.1.1.14",cluster="mycluster"}'
2351+
values: '1+0x20'
2352+
- series: 'ceph_nvmeof_gateway_info{addr="1.1.1.15",cluster="mycluster"}'
2353+
values: '1+0x20'
2354+
- series: 'ceph_nvmeof_gateway_info{addr="1.1.1.16",cluster="mycluster"}'
2355+
values: '1+0x20'
2356+
- series: 'ceph_nvmeof_gateway_info{addr="1.1.1.17",cluster="mycluster"}'
2357+
values: '1+0x20'
2358+
- series: 'ceph_nvmeof_gateway_info{addr="1.1.1.18",cluster="mycluster"}'
2359+
values: '1+0x20'
2360+
- series: 'ceph_nvmeof_gateway_info{addr="1.1.1.19",cluster="mycluster"}'
2361+
values: '1+0x20'
2362+
- series: 'ceph_nvmeof_gateway_info{addr="1.1.1.20",cluster="mycluster"}'
2363+
values: '1+0x20'
2364+
- series: 'ceph_nvmeof_gateway_info{addr="1.1.1.21",cluster="mycluster"}'
2365+
values: '1+0x20'
2366+
- series: 'ceph_nvmeof_gateway_info{addr="1.1.1.22",cluster="mycluster"}'
2367+
values: '1+0x20'
2368+
- series: 'ceph_nvmeof_gateway_info{addr="1.1.1.23",cluster="mycluster"}'
2369+
values: '1+0x20'
2370+
- series: 'ceph_nvmeof_gateway_info{addr="1.1.1.24",cluster="mycluster"}'
2371+
values: '1+0x20'
2372+
- series: 'ceph_nvmeof_gateway_info{addr="1.1.1.25",cluster="mycluster"}'
2373+
values: '1+0x20'
2374+
- series: 'ceph_nvmeof_gateway_info{addr="1.1.1.26",cluster="mycluster"}'
2375+
values: '1+0x20'
2376+
- series: 'ceph_nvmeof_gateway_info{addr="1.1.1.27",cluster="mycluster"}'
2377+
values: '1+0x20'
2378+
- series: 'ceph_nvmeof_gateway_info{addr="1.1.1.28",cluster="mycluster"}'
2379+
values: '1+0x20'
2380+
- series: 'ceph_nvmeof_gateway_info{addr="1.1.1.29",cluster="mycluster"}'
2381+
values: '1+0x20'
2382+
- series: 'ceph_nvmeof_gateway_info{addr="1.1.1.30",cluster="mycluster"}'
2383+
values: '1+0x20'
2384+
- series: 'ceph_nvmeof_gateway_info{addr="1.1.1.31",cluster="mycluster"}'
2385+
values: '1+0x20'
2386+
- series: 'ceph_nvmeof_gateway_info{addr="1.1.1.32",cluster="mycluster"}'
2387+
values: '1+0x20'
2388+
- series: 'ceph_nvmeof_gateway_info{addr="1.1.1.33",cluster="mycluster"}'
2389+
values: '1+0x20'
2390+
23342391
promql_expr_test:
2335-
- expr: count(ceph_nvmeof_gateway_info) by (cluster) > 4.00
2392+
- expr: count(ceph_nvmeof_gateway_info) by (cluster) > 32.00
23362393
eval_time: 1m
23372394
exp_samples:
23382395
- labels: '{cluster="mycluster"}'
2339-
value: 5
2396+
value: 33
23402397
alert_rule_test:
23412398
- eval_time: 5m
23422399
alertname: NVMeoFTooManyGateways
@@ -2347,7 +2404,7 @@ tests:
23472404
type: ceph_default
23482405
exp_annotations:
23492406
summary: "Max supported gateways exceeded on cluster mycluster"
2350-
description: "You may create many gateways, but 4 is the tested limit"
2407+
description: "You may create many gateways, but 32 is the tested limit"
23512408

23522409
# NVMeoFMaxGatewayGroupSize
23532410
- interval: 1m
@@ -2362,16 +2419,24 @@ tests:
23622419
values: '1+0x20'
23632420
- series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.12",cluster="mycluster"}'
23642421
values: '1+0x20'
2422+
- series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.10",cluster="mycluster"}'
2423+
values: '1+0x20'
2424+
- series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.14",cluster="mycluster"}'
2425+
values: '1+0x20'
2426+
- series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.11",cluster="mycluster"}'
2427+
values: '1+0x20'
2428+
- series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.13",cluster="mycluster"}'
2429+
values: '1+0x20'
23652430
- series: 'ceph_nvmeof_gateway_info{group="group-2",addr="1.1.1.4",cluster="mycluster"}'
23662431
values: '1+0x20'
23672432
- series: 'ceph_nvmeof_gateway_info{group="group-2",addr="1.1.1.5",cluster="mycluster"}'
23682433
values: '1+0x20'
23692434
promql_expr_test:
2370-
- expr: count(ceph_nvmeof_gateway_info) by (cluster, group) > 4.00
2435+
- expr: count(ceph_nvmeof_gateway_info) by (cluster, group) > 8.00
23712436
eval_time: 1m
23722437
exp_samples:
23732438
- labels: '{cluster="mycluster",group="group-1"}'
2374-
value: 5
2439+
value: 9
23752440
alert_rule_test:
23762441
- eval_time: 5m
23772442
alertname: NVMeoFMaxGatewayGroupSize
@@ -2383,7 +2448,7 @@ tests:
23832448
type: ceph_default
23842449
exp_annotations:
23852450
summary: "Max gateways within a gateway group (group-1) exceeded on cluster mycluster"
2386-
description: "You may create many gateways in a gateway group, but 4 is the tested limit"
2451+
description: "You may create many gateways in a gateway group, but 8 is the tested limit"
23872452

23882453
# NVMeoFSingleGatewayGroup
23892454
- interval: 1m
@@ -2767,12 +2832,14 @@ tests:
27672832
values: '200+0x10'
27682833
- series: 'ceph_nvmeof_subsystem_namespace_count{instance="node-1:10008",nqn="nqn10",cluster="mycluster"}'
27692834
values: '200+0x10'
2835+
- series: 'ceph_nvmeof_subsystem_namespace_count{instance="node-1:10008",nqn="nqn11",cluster="mycluster"}'
2836+
values: '200+0x10'
27702837
promql_expr_test:
2771-
- expr: sum by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_namespace_count,"gateway_host","$1","instance","(.*):.*")) > 1024
2838+
- expr: sum by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_namespace_count,"gateway_host","$1","instance","(.*):.*")) > 2048
27722839
eval_time: 1m
27732840
exp_samples:
27742841
- labels: '{gateway_host="node-1", cluster="mycluster"}'
2775-
value: 2000
2842+
value: 2200
27762843
alert_rule_test:
27772844
- eval_time: 5m
27782845
alertname: NVMeoFTooManyNamespaces
@@ -2815,15 +2882,15 @@ tests:
28152882
- interval: 1m
28162883
input_series:
28172884
- series: 'ceph_nvmeof_subsystem_host_count{nqn="nqn1",cluster="mycluster"}'
2818-
values: '2 2 2 4 4 8 8 8 10 10 20 20 32 34 34 38 38 40 44 44'
2885+
values: '2 4 8 10 20 30 40 50 62 74 80 95 100 110 130 130 130 130 130 130'
28192886
- series: 'ceph_nvmeof_subsystem_host_count{nqn="nqn2",cluster="mycluster"}'
2820-
values: '2 2 2 8 8 8 16 16 16 16 16 16 16 16 16 16 16 16 16 16'
2887+
values: '2 8 16 16 16 16 16 16 16 16 20 20 32 34 34 36 37 37 37 37'
28212888
promql_expr_test:
2822-
- expr: ceph_nvmeof_subsystem_host_count > 32.00
2889+
- expr: ceph_nvmeof_subsystem_host_count > 128.00
28232890
eval_time: 15m
28242891
exp_samples:
28252892
- labels: '{__name__="ceph_nvmeof_subsystem_host_count",nqn="nqn1",cluster="mycluster"}'
2826-
value: 38
2893+
value: 130
28272894
alert_rule_test:
28282895
- eval_time: 20m
28292896
alertname: NVMeoFHighClientCount
@@ -2835,7 +2902,7 @@ tests:
28352902
type: ceph_default
28362903
exp_annotations:
28372904
summary: "The number of clients connected to nqn1 is too high on cluster mycluster"
2838-
description: "The supported limit for clients connecting to a subsystem is 32"
2905+
description: "The supported limit for clients connecting to a subsystem is 128"
28392906

28402907
# NVMeoFMissingListener
28412908
- interval: 1m

0 commit comments

Comments
 (0)