@@ -756,3 +756,132 @@ groups:
756756 oid : " 1.3.6.1.4.1.50495.1.2.1.10.5"
757757 severity : " warning"
758758 type : " ceph_default"
759+ - name : " nvmeof"
760+ rules :
# Warning raised when, per nqn, the number of
# ceph_nvmeof_subsystem_namespace_metadata series is greater than or equal to
# the value of ceph_nvmeof_subsystem_namespace_limit, sustained for 1m.
# NOTE(review): after `count by(nqn)` the left operand carries only the `nqn`
# label, while the right operand is a raw selector. With PromQL's default
# one-to-one matching the two sides pair up only if the limit series has no
# labels other than `nqn` -- confirm against real scrape output; an `on(nqn)`
# modifier may be required.
761+ - alert : " NVMeoFSubsystemNamespaceLimit"
762+ annotations :
763+ description : " Subsystems have a max namespace limit defined at creation time. This alert means that no more namespaces can be added to {{ $labels.nqn }}"
764+ summary : " {{ $labels.nqn }} subsystem has reached its maximum number of namespaces "
765+ expr : " (count by(nqn) (ceph_nvmeof_subsystem_namespace_metadata)) >= ceph_nvmeof_subsystem_namespace_limit"
766+ for : " 1m"
767+ labels :
768+ severity : " warning"
769+ type : " ceph_default"
# Warning raised when the total number of ceph_nvmeof_gateway_info series is
# greater than 4 for 1m. The description gives 4 as the tested limit.
# Presumably each gateway exports exactly one gateway_info series -- confirm.
770+ - alert : " NVMeoFTooManyGateways"
771+ annotations :
772+ description : " You may create many gateways, but 4 is the tested limit"
773+ summary : " Max supported gateways exceeded "
774+ expr : " count(ceph_nvmeof_gateway_info) > 4.00"
775+ for : " 1m"
776+ labels :
777+ severity : " warning"
778+ type : " ceph_default"
# Warning raised per `group` label value when more than 2
# ceph_nvmeof_gateway_info series share that group for 1m. Because the
# aggregation is `count by(group)`, `group` is the only label available to the
# summary template.
779+ - alert : " NVMeoFMaxGatewayGroupSize"
780+ annotations :
781+ description : " You may create many gateways in a gateway group, but 2 is the tested limit"
782+ summary : " Max gateways within a gateway group ({{ $labels.group }}) exceeded "
783+ expr : " count by(group) (ceph_nvmeof_gateway_info) > 2.00"
784+ for : " 1m"
785+ labels :
786+ severity : " warning"
787+ type : " ceph_default"
# Warning raised per `group` label value when exactly one
# ceph_nvmeof_gateway_info series exists for that group, sustained for 5m.
# A group with no gateway_info series at all produces no result and therefore
# no alert.
788+ - alert : " NVMeoFSingleGatewayGroup"
789+ annotations :
790+ description : " Although a single member gateway group is valid, it should only be used for test purposes"
791+ summary : " The gateway group {{ $labels.group }} consists of a single gateway - HA is not possible "
792+ expr : " count by(group) (ceph_nvmeof_gateway_info) == 1"
793+ for : " 5m"
794+ labels :
795+ severity : " warning"
796+ type : " ceph_default"
# Warning raised when the average reactor busy share of a gateway instance
# stays above 80% for 10m.
# rate() over a *_seconds_total counter yields busy-seconds per second, i.e. a
# fraction between 0 and 1 per series (assuming the counter is in seconds, as
# its name states). The average is therefore multiplied by 100 before being
# compared with 80.00; without the scaling the threshold could never be
# crossed. This matches the 100x scaling used by the NVMeoFHighHostCPU rule.
# label_replace rewrites `instance` to the regex capture, i.e. the text before
# the final ':' (presumably dropping the port -- confirm).
797+ - alert : " NVMeoFHighGatewayCPU"
798+ annotations :
799+ description : " Typically, high CPU may indicate degraded performance. Consider increasing the number of reactor cores"
800+ summary : " CPU used by {{ $labels.instance }} NVMe-oF Gateway is high "
801+ expr : " label_replace(avg by(instance) (rate(ceph_nvmeof_reactor_seconds_total{mode=\" busy\" }[1m])) * 100,\" instance\" ,\" $1\" ,\" instance\" ,\" (.*):.*\" ) > 80.00"
802+ for : " 10m"
803+ labels :
804+ severity : " warning"
805+ type : " ceph_default"
# Warning raised for every ceph_nvmeof_subsystem_metadata series selected by
# the allow_any_host matcher, once it has been present for 5m. The expression
# has no comparison, so any matching series fires regardless of its sample
# value.
806+ - alert : " NVMeoFGatewayOpenSecurity"
807+ annotations :
808+ description : " It is good practice to ensure subsystems use host security to reduce the risk of unexpected data loss"
809+ summary : " Subsystem {{ $labels.nqn }} has been defined without host level security "
810+ expr : " ceph_nvmeof_subsystem_metadata{allow_any_host=\" yes\" }"
811+ for : " 5m"
812+ labels :
813+ severity : " warning"
814+ type : " ceph_default"
# Warning raised per gateway host when more than 16
# ceph_nvmeof_subsystem_metadata series are counted for it, sustained for 1m.
# label_replace adds a `gateway_host` label holding the regex capture from
# `instance`, i.e. the text before the final ':' (presumably the host without
# its port -- confirm); the count is then grouped by that new label.
815+ - alert : " NVMeoFTooManySubsystems"
816+ annotations :
817+ description : " Although you may continue to create subsystems in {{ $labels.gateway_host }}, the configuration may not be supported"
818+ summary : " The number of subsystems defined to the gateway exceeds supported values "
819+ expr : " count by(gateway_host) (label_replace(ceph_nvmeof_subsystem_metadata,\" gateway_host\" ,\" $1\" ,\" instance\" ,\" (.*):.*\" )) > 16.00"
820+ for : " 1m"
821+ labels :
822+ severity : " warning"
823+ type : " ceph_default"
# Warning raised when ceph_nvmeof_gateway_info series carry more than one
# distinct `version` label value for 1h. The inner `count by(version)` yields
# one series per version; the outer `count` counts those series. The result
# has no labels, so the annotations use no label templates.
824+ - alert : " NVMeoFVersionMismatch"
825+ annotations :
826+ description : " This may indicate an issue with deployment. Check cephadm logs"
827+ summary : " The cluster has different NVMe-oF gateway releases active "
828+ expr : " count(count by(version) (ceph_nvmeof_gateway_info)) > 1"
829+ for : " 1h"
830+ labels :
831+ severity : " warning"
832+ type : " ceph_default"
# Warning raised for each ceph_nvmeof_subsystem_host_count series whose value
# is above 32 for 1m. The description gives 32 as the supported client limit
# per subsystem; the summary reads the `nqn` label from the firing series.
833+ - alert : " NVMeoFHighClientCount"
834+ annotations :
835+ description : " The supported limit for clients connecting to a subsystem is 32"
836+ summary : " The number of clients connected to {{ $labels.nqn }} is too high "
837+ expr : " ceph_nvmeof_subsystem_host_count > 32.00"
838+ for : " 1m"
839+ labels :
840+ severity : " warning"
841+ type : " ceph_default"
# Warning raised when the CPU usage computed for a gateway host is at or above
# 80 for 10m. Usage is computed as 100 - 100 * (average idle rate per host over
# a 5m window). Both operands get a `host` label derived from `instance` (the
# text before the final ':'), and the idle average is multiplied with
# ceph_nvmeof_gateway_info on that label, so only hosts that have a
# gateway_info series produce a result.
# NOTE(review): the multiplication leaves the percentage intact only if
# gateway_info samples have the value 1 -- presumably true for an info metric;
# confirm.
842+ - alert : " NVMeoFHighHostCPU"
843+ annotations :
844+ description : " High CPU on a gateway host can lead to CPU contention and performance degradation"
845+ summary : " The CPU is high ({{ $value }}%) on NVMeoF Gateway host ({{ $labels.host }}) "
846+ expr : " 100-((100*(avg by(host) (label_replace(rate(node_cpu_seconds_total{mode=\" idle\" }[5m]),\" host\" ,\" $1\" ,\" instance\" ,\" (.*):.*\" )) * on(host) group_right label_replace(ceph_nvmeof_gateway_info,\" host\" ,\" $1\" ,\" instance\" ,\" (.*):.*\" )))) >= 80.00"
847+ for : " 10m"
848+ labels :
849+ severity : " warning"
850+ type : " ceph_default"
# Warning raised for every ceph_nvmeof_subsystem_listener_iface_info series
# selected by the operstate matcher, once present for 30s. There is no value
# comparison, so any matching series fires. Of the nvmeof rules visible in
# this section, this is the only one that also carries an `oid` label.
851+ - alert : " NVMeoFInterfaceDown"
852+ annotations :
853+ description : " A NIC used by one or more subsystems is in a down state"
854+ summary : " Network interface {{ $labels.device }} is down "
855+ expr : " ceph_nvmeof_subsystem_listener_iface_info{operstate=\" down\" }"
856+ for : " 30s"
857+ labels :
858+ oid : " 1.3.6.1.4.1.50495.1.2.1.14.1"
859+ severity : " warning"
860+ type : " ceph_default"
# Warning raised for every ceph_nvmeof_subsystem_listener_iface_info series
# whose `duplex` label differs from the matcher value, once present for 30s.
# NOTE(review): a negative matcher also selects series that have no `duplex`
# label at all, and nothing here excludes interfaces that are down -- confirm
# that this overlap with NVMeoFInterfaceDown is intended.
861+ - alert : " NVMeoFInterfaceDuplex"
862+ annotations :
863+ description : " Until this is resolved, performance from the gateway will be degraded"
864+ summary : " Network interface {{ $labels.device }} is not running in full duplex mode "
865+ expr : " ceph_nvmeof_subsystem_listener_iface_info{duplex!=\" full\" }"
866+ for : " 30s"
867+ labels :
868+ severity : " warning"
869+ type : " ceph_default"
# Warning raised when the mean read latency of a gateway instance is above
# 0.01 (the 10 ms named in the summary, assuming the counter is in seconds)
# for 5m. Latency is the rate of accumulated read seconds divided by the rate
# of completed reads, averaged per `instance`.
# Both rates use a 5m window so that the expression matches the summary text
# ("over the last 5 mins") and the window used by NVMeoFHighWriteLatency.
# label_replace adds a `gateway` label holding the text of `instance` before
# the final ':' (presumably the host without its port -- confirm).
870+ - alert : " NVMeoFHighReadLatency"
871+ annotations :
872+ description : " High latencies may indicate a constraint within the cluster e.g. CPU, network. Please investigate"
873+ summary : " The average read latency over the last 5 mins has reached 10 ms or more on {{ $labels.gateway }}"
874+ expr : " label_replace((avg by(instance) ((rate(ceph_nvmeof_bdev_read_seconds_total[5m]) / rate(ceph_nvmeof_bdev_reads_completed_total[5m])))),\" gateway\" ,\" $1\" ,\" instance\" ,\" (.*):.*\" ) > 0.01"
875+ for : " 5m"
876+ labels :
877+ severity : " warning"
878+ type : " ceph_default"
# Warning raised when the mean write latency of a gateway instance is above
# 0.02 (the 20 ms named in the summary, assuming the counter is in seconds)
# for 5m. Latency is the 5m rate of accumulated write seconds divided by the
# 5m rate of completed writes, averaged per `instance`.
# label_replace adds a `gateway` label holding the text of `instance` before
# the final ':' (presumably the host without its port -- confirm).
879+ - alert : " NVMeoFHighWriteLatency"
880+ annotations :
881+ description : " High latencies may indicate a constraint within the cluster e.g. CPU, network. Please investigate"
882+ summary : " The average write latency over the last 5 mins has reached 20 ms or more on {{ $labels.gateway }}"
883+ expr : " label_replace((avg by(instance) ((rate(ceph_nvmeof_bdev_write_seconds_total[5m]) / rate(ceph_nvmeof_bdev_writes_completed_total[5m])))),\" gateway\" ,\" $1\" ,\" instance\" ,\" (.*):.*\" ) > 0.02"
884+ for : " 5m"
885+ labels :
886+ severity : " warning"
887+ type : " ceph_default"
0 commit comments