Skip to content

Commit e7d2548

Browse files
committed
ceph-mixins: nvmeof alerts added
Signed-off-by: Paul Cuzner <[email protected]>
1 parent f1573b7 commit e7d2548

File tree

1 file changed

+129
-0
lines changed

1 file changed

+129
-0
lines changed

monitoring/ceph-mixin/prometheus_alerts.yml

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -756,3 +756,132 @@ groups:
756756
oid: "1.3.6.1.4.1.50495.1.2.1.10.5"
757757
severity: "warning"
758758
type: "ceph_default"
759+
# Rule group named "nvmeof"; the alert rules that follow belong to its rules list.
- name: "nvmeof"
760+
rules:
761+
# Warns when the number of namespace metadata series for a subsystem (nqn)
# reaches that subsystem's ceph_nvmeof_subsystem_namespace_limit for 1m.
# The count side carries only the nqn label, so the comparison is matched
# explicitly on(nqn): default one-to-one matching needs identical label sets
# on both sides and would otherwise never pair with a limit series that has
# any additional label. group_right allows several limit series per nqn.
# NOTE(review): the count spans every series sharing an nqn - confirm that
# namespaces are not reported once per gateway, which would inflate it.
- alert: "NVMeoFSubsystemNamespaceLimit"
762+
annotations:
763+
description: "Subsystems have a max namespace limit defined at creation time. This alert means that no more namespaces can be added to {{ $labels.nqn }}"
764+
summary: "{{ $labels.nqn }} subsystem has reached its maximum number of namespaces "
765+
expr: "(count by(nqn) (ceph_nvmeof_subsystem_namespace_metadata)) >= on(nqn) group_right ceph_nvmeof_subsystem_namespace_limit"
766+
for: "1m"
767+
labels:
768+
severity: "warning"
769+
type: "ceph_default"
770+
# Warns when the total number of ceph_nvmeof_gateway_info series stays above 4
# for 1m; the description states 4 is the tested limit.
- alert: "NVMeoFTooManyGateways"
771+
annotations:
772+
description: "You may create many gateways, but 4 is the tested limit"
773+
summary: "Max supported gateways exceeded "
774+
expr: "count(ceph_nvmeof_gateway_info) > 4.00"
775+
for: "1m"
776+
labels:
777+
severity: "warning"
778+
type: "ceph_default"
779+
# Warns when the number of ceph_nvmeof_gateway_info series sharing a "group"
# label value stays above 2 for 1m; one alert per offending group.
- alert: "NVMeoFMaxGatewayGroupSize"
780+
annotations:
781+
description: "You may create many gateways in a gateway group, but 2 is the tested limit"
782+
summary: "Max gateways within a gateway group ({{ $labels.group }}) exceeded "
783+
expr: "count by(group) (ceph_nvmeof_gateway_info) > 2.00"
784+
for: "1m"
785+
labels:
786+
severity: "warning"
787+
type: "ceph_default"
788+
# Warns when a "group" label value has exactly one ceph_nvmeof_gateway_info
# series for 5m, i.e. the gateway group has a single member.
- alert: "NVMeoFSingleGatewayGroup"
789+
annotations:
790+
description: "Although a single member gateway group is valid, it should only be used for test purposes"
791+
summary: "The gateway group {{ $labels.group }} consists of a single gateway - HA is not possible "
792+
expr: "count by(group) (ceph_nvmeof_gateway_info) == 1"
793+
for: "5m"
794+
labels:
795+
severity: "warning"
796+
type: "ceph_default"
797+
# Warns when the per-instance average busy rate of the gateway reactors stays
# above 80% for 10m. label_replace rewrites the instance label to the part
# before the last ":" so the summary shows the host without the port.
# rate() over a *_seconds_total counter yields seconds per second, so the
# result is scaled by 100 before being compared with the percentage threshold.
# NOTE(review): assumes ceph_nvmeof_reactor_seconds_total counts seconds, as
# its name suggests - confirm against the exporter.
- alert: "NVMeoFHighGatewayCPU"
798+
annotations:
799+
description: "Typically, high CPU may indicate degraded performance. Consider increasing the number of reactor cores"
800+
summary: "CPU used by {{ $labels.instance }} NVMe-oF Gateway is high "
801+
expr: "label_replace(avg by(instance) (rate(ceph_nvmeof_reactor_seconds_total{mode=\"busy\"}[1m])),\"instance\",\"$1\",\"instance\",\"(.*):.*\") * 100 > 80.00"
802+
for: "10m"
803+
labels:
804+
severity: "warning"
805+
type: "ceph_default"
806+
# Warns for every ceph_nvmeof_subsystem_metadata series whose allow_any_host
# label is "yes" once it has been present for 5m.
- alert: "NVMeoFGatewayOpenSecurity"
807+
annotations:
808+
description: "It is good practice to ensure subsystems use host security to reduce the risk of unexpected data loss"
809+
summary: "Subsystem {{ $labels.nqn }} has been defined without host level security "
810+
expr: "ceph_nvmeof_subsystem_metadata{allow_any_host=\"yes\"}"
811+
for: "5m"
812+
labels:
813+
severity: "warning"
814+
type: "ceph_default"
815+
# Warns when more than 16 ceph_nvmeof_subsystem_metadata series share a
# gateway_host for 1m. gateway_host is derived from the instance label by
# keeping the part before the last ":".
- alert: "NVMeoFTooManySubsystems"
816+
annotations:
817+
description: "Although you may continue to create subsystems in {{ $labels.gateway_host }}, the configuration may not be supported"
818+
summary: "The number of subsystems defined to the gateway exceeds supported values "
819+
expr: "count by(gateway_host) (label_replace(ceph_nvmeof_subsystem_metadata,\"gateway_host\",\"$1\",\"instance\",\"(.*):.*\")) > 16.00"
820+
for: "1m"
821+
labels:
822+
severity: "warning"
823+
type: "ceph_default"
824+
# Warns when ceph_nvmeof_gateway_info reports more than one distinct "version"
# label value for 1h: the inner count groups series by version, the outer
# count tallies how many versions exist.
- alert: "NVMeoFVersionMismatch"
825+
annotations:
826+
description: "This may indicate an issue with deployment. Check cephadm logs"
827+
summary: "The cluster has different NVMe-oF gateway releases active "
828+
expr: "count(count by(version) (ceph_nvmeof_gateway_info)) > 1"
829+
for: "1h"
830+
labels:
831+
severity: "warning"
832+
type: "ceph_default"
833+
# Warns when ceph_nvmeof_subsystem_host_count stays above 32 for 1m; the
# description states 32 is the supported client limit per subsystem.
- alert: "NVMeoFHighClientCount"
834+
annotations:
835+
description: "The supported limit for clients connecting to a subsystem is 32"
836+
summary: "The number of clients connected to {{ $labels.nqn }} is too high "
837+
expr: "ceph_nvmeof_subsystem_host_count > 32.00"
838+
for: "1m"
839+
labels:
840+
severity: "warning"
841+
type: "ceph_default"
842+
# Warns when host CPU usage on a gateway host stays at or above 80 for 10m.
# Usage is computed as 100 minus 100x the per-host average 5m idle rate of
# node_cpu_seconds_total. Both sides derive a "host" label from instance by
# keeping the part before the last ":", and the idle figure is multiplied
# on(host) with ceph_nvmeof_gateway_info so only gateway hosts produce results.
# NOTE(review): the product includes the gateway_info sample value, presumably
# 1 - confirm against the exporter.
- alert: "NVMeoFHighHostCPU"
843+
annotations:
844+
description: "High CPU on a gateway host can lead to CPU contention and performance degradation"
845+
summary: "The CPU is high ({{ $value }}%) on NVMeoF Gateway host ({{ $labels.host }}) "
846+
expr: "100-((100*(avg by(host) (label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]),\"host\",\"$1\",\"instance\",\"(.*):.*\")) * on(host) group_right label_replace(ceph_nvmeof_gateway_info,\"host\",\"$1\",\"instance\",\"(.*):.*\")))) >= 80.00"
847+
for: "10m"
848+
labels:
849+
severity: "warning"
850+
type: "ceph_default"
851+
# Warns for every ceph_nvmeof_subsystem_listener_iface_info series whose
# operstate label is "down" once it has been present for 30s. This rule also
# sets an oid label alongside severity and type.
- alert: "NVMeoFInterfaceDown"
852+
annotations:
853+
description: "A NIC used by one or more subsystems is in a down state"
854+
summary: "Network interface {{ $labels.device }} is down "
855+
expr: "ceph_nvmeof_subsystem_listener_iface_info{operstate=\"down\"}"
856+
for: "30s"
857+
labels:
858+
oid: "1.3.6.1.4.1.50495.1.2.1.14.1"
859+
severity: "warning"
860+
type: "ceph_default"
861+
# Warns for every ceph_nvmeof_subsystem_listener_iface_info series whose
# duplex label is anything other than "full" once it has been present for 30s.
- alert: "NVMeoFInterfaceDuplex"
862+
annotations:
863+
description: "Until this is resolved, performance from the gateway will be degraded"
864+
summary: "Network interface {{ $labels.device }} is not running in full duplex mode "
865+
expr: "ceph_nvmeof_subsystem_listener_iface_info{duplex!=\"full\"}"
866+
for: "30s"
867+
labels:
868+
severity: "warning"
869+
type: "ceph_default"
870+
# Warns when the per-instance average read latency stays above 0.01 for 5m.
# Latency is the rate of read seconds divided by the rate of completed reads;
# label_replace copies the instance value before the last ":" into a
# "gateway" label, which the summary references.
# Both rates use a 5m window so the expression agrees with the "last 5 mins"
# wording of the summary and with the NVMeoFHighWriteLatency rule.
- alert: "NVMeoFHighReadLatency"
871+
annotations:
872+
description: "High latencies may indicate a constraint within the cluster e.g. CPU, network. Please investigate"
873+
summary: "The average read latency over the last 5 mins has reached 10 ms or more on {{ $labels.gateway }}"
874+
expr: "label_replace((avg by(instance) ((rate(ceph_nvmeof_bdev_read_seconds_total[5m]) / rate(ceph_nvmeof_bdev_reads_completed_total[5m])))),\"gateway\",\"$1\",\"instance\",\"(.*):.*\") > 0.01"
875+
for: "5m"
876+
labels:
877+
severity: "warning"
878+
type: "ceph_default"
879+
# Warns when the per-instance average write latency stays above 0.02 for 5m.
# Latency is the 5m rate of write seconds divided by the 5m rate of completed
# writes; label_replace copies the instance value before the last ":" into a
# "gateway" label, which the summary references.
- alert: "NVMeoFHighWriteLatency"
880+
annotations:
881+
description: "High latencies may indicate a constraint within the cluster e.g. CPU, network. Please investigate"
882+
summary: "The average write latency over the last 5 mins has reached 20 ms or more on {{ $labels.gateway }}"
883+
expr: "label_replace((avg by(instance) ((rate(ceph_nvmeof_bdev_write_seconds_total[5m]) / rate(ceph_nvmeof_bdev_writes_completed_total[5m])))),\"gateway\",\"$1\",\"instance\",\"(.*):.*\") > 0.02"
884+
for: "5m"
885+
labels:
886+
severity: "warning"
887+
type: "ceph_default"

0 commit comments

Comments
 (0)