|
846 | 846 | }, |
847 | 847 | ], |
848 | 848 | }, |
| 849 | + { |
| 850 | + name: 'nvmeof', |
| 851 | + rules: [ |
| 852 | + { // Warns when, for 1m, the per-nqn count of ceph_nvmeof_subsystem_namespace_metadata series is >= ceph_nvmeof_subsystem_namespace_limit. |
| 853 | + alert: 'NVMeoFSubsystemNamespaceLimit', |
| 854 | + 'for': '1m', |
| 855 | + expr: '(count by(nqn) (ceph_nvmeof_subsystem_namespace_metadata)) >= ceph_nvmeof_subsystem_namespace_limit', // NOTE(review): the left side keeps only the nqn label and no on()/ignoring() modifier is used; confirm the limit series carries no other labels, otherwise the comparison matches nothing. |
| 856 | + labels: { severity: 'warning', type: 'ceph_default' }, |
| 857 | + annotations: { |
| 858 | + summary: '{{ $labels.nqn }} subsystem has reached its maximum number of namespaces %(cluster)s' % $.MultiClusterSummary(), |
| 859 | + description: 'Subsystems have a max namespace limit defined at creation time. This alert means that no more namespaces can be added to {{ $labels.nqn }}', |
| 860 | + }, |
| 861 | + }, |
| 862 | + { // Warns when, for 1m, the total number of ceph_nvmeof_gateway_info series exceeds $._config.NVMeoFMaxGatewaysPerCluster. |
| 863 | + alert: 'NVMeoFTooManyGateways', |
| 864 | + 'for': '1m', |
| 865 | + expr: 'count(ceph_nvmeof_gateway_info) > %.2f' % [$._config.NVMeoFMaxGatewaysPerCluster], // the threshold is rendered into the query with two decimals |
| 866 | + labels: { severity: 'warning', type: 'ceph_default' }, |
| 867 | + annotations: { |
| 868 | + summary: 'Max supported gateways exceeded %(cluster)s' % $.MultiClusterSummary(), |
| 869 | + description: 'You may create many gateways, but %(NVMeoFMaxGatewaysPerCluster)d is the tested limit' % $._config, |
| 870 | + }, |
| 871 | + }, |
| 872 | + { // Warns when, for 1m, the number of ceph_nvmeof_gateway_info series sharing a group label exceeds $._config.NVMeoFMaxGatewaysPerGroup. |
| 873 | + alert: 'NVMeoFMaxGatewayGroupSize', |
| 874 | + 'for': '1m', |
| 875 | + expr: 'count by(group) (ceph_nvmeof_gateway_info) > %.2f' % [$._config.NVMeoFMaxGatewaysPerGroup], // only the group label survives the aggregation, which is what the summary template reads |
| 876 | + labels: { severity: 'warning', type: 'ceph_default' }, |
| 877 | + annotations: { |
| 878 | + summary: 'Max gateways within a gateway group ({{ $labels.group }}) exceeded %(cluster)s' % $.MultiClusterSummary(), |
| 879 | + description: 'You may create many gateways in a gateway group, but %(NVMeoFMaxGatewaysPerGroup)d is the tested limit' % $._config, |
| 880 | + }, |
| 881 | + }, |
| 882 | + { // Warns when, for 5m, a group label value is carried by exactly one ceph_nvmeof_gateway_info series. |
| 883 | + alert: 'NVMeoFSingleGatewayGroup', |
| 884 | + 'for': '5m', |
| 885 | + expr: 'count by(group) (ceph_nvmeof_gateway_info) == 1', // comparison without bool acts as a filter: only groups whose count is 1 are returned |
| 886 | + labels: { severity: 'warning', type: 'ceph_default' }, |
| 887 | + annotations: { |
| 888 | + summary: 'The gateway group {{ $labels.group }} consists of a single gateway - HA is not possible %(cluster)s' % $.MultiClusterSummary(), |
| 889 | + description: 'Although a single member gateway group is valid, it should only be used for test purposes', |
| 890 | + }, |
| 891 | + }, |
| 892 | + { // Warns when, for 10m, the per-instance average 1m rate of ceph_nvmeof_reactor_seconds_total{mode="busy"} exceeds $._config.NVMeoFHighGatewayCPU. |
| 893 | + alert: 'NVMeoFHighGatewayCPU', |
| 894 | + 'for': '10m', |
| 895 | + expr: 'label_replace(avg by(instance) (rate(ceph_nvmeof_reactor_seconds_total{mode="busy"}[1m])),"instance","$1","instance","(.*):.*") > %.2f' % [$._config.NVMeoFHighGatewayCPU], // label_replace rewrites instance to the text before its last ':' (presumably dropping the port - confirm) |
| 896 | + labels: { severity: 'warning', type: 'ceph_default' }, |
| 897 | + annotations: { |
| 898 | + summary: 'CPU used by {{ $labels.instance }} NVMe-oF Gateway is high %(cluster)s' % $.MultiClusterSummary(), |
| 899 | + description: 'Typically, high CPU may indicate degraded performance. Consider increasing the number of reactor cores', |
| 900 | + }, |
| 901 | + }, |
| 902 | + { // Warns when a ceph_nvmeof_subsystem_metadata series with allow_any_host="yes" has been present for 5m. |
| 903 | + alert: 'NVMeoFGatewayOpenSecurity', |
| 904 | + 'for': '5m', |
| 905 | + expr: 'ceph_nvmeof_subsystem_metadata{allow_any_host="yes"}', // a bare selector: every matching series yields its own alert instance |
| 906 | + labels: { severity: 'warning', type: 'ceph_default' }, |
| 907 | + annotations: { |
| 908 | + summary: 'Subsystem {{ $labels.nqn }} has been defined without host level security %(cluster)s' % $.MultiClusterSummary(), |
| 909 | + description: 'It is good practice to ensure subsystems use host security to reduce the risk of unexpected data loss', |
| 910 | + }, |
| 911 | + }, |
| 912 | + { // Warns when, for 1m, the number of ceph_nvmeof_subsystem_metadata series per gateway_host exceeds $._config.NVMeoFMaxSubsystemsPerGateway. |
| 913 | + alert: 'NVMeoFTooManySubsystems', |
| 914 | + 'for': '1m', |
| 915 | + expr: 'count by(gateway_host) (label_replace(ceph_nvmeof_subsystem_metadata,"gateway_host","$1","instance","(.*):.*")) > %.2f' % [$._config.NVMeoFMaxSubsystemsPerGateway], // gateway_host is derived from instance by keeping the text before its last ':' |
| 916 | + labels: { severity: 'warning', type: 'ceph_default' }, |
| 917 | + annotations: { |
| 918 | + summary: 'The number of subsystems defined to the gateway exceeds supported values %(cluster)s' % $.MultiClusterSummary(), |
| 919 | + description: 'Although you may continue to create subsystems in {{ $labels.gateway_host }}, the configuration may not be supported', |
| 920 | + }, |
| 921 | + }, |
| 922 | + { // Warns when more than one distinct version label value has been present on ceph_nvmeof_gateway_info for 1h. |
| 923 | + alert: 'NVMeoFVersionMismatch', |
| 924 | + 'for': '1h', |
| 925 | + expr: 'count(count by(version) (ceph_nvmeof_gateway_info)) > 1', // inner count yields one series per version; outer count is the number of distinct versions |
| 926 | + labels: { severity: 'warning', type: 'ceph_default' }, |
| 927 | + annotations: { |
| 928 | + summary: 'The cluster has different NVMe-oF gateway releases active %(cluster)s' % $.MultiClusterSummary(), |
| 929 | + description: 'This may indicate an issue with deployment. Check cephadm logs', |
| 930 | + }, |
| 931 | + }, |
| 932 | + { // Warns when ceph_nvmeof_subsystem_host_count has been above $._config.NVMeoFHighClientCount for 1m. |
| 933 | + alert: 'NVMeoFHighClientCount', |
| 934 | + 'for': '1m', |
| 935 | + expr: 'ceph_nvmeof_subsystem_host_count > %.2f' % [$._config.NVMeoFHighClientCount], // evaluated per series, so the nqn label used by the summary comes from the metric itself |
| 936 | + labels: { severity: 'warning', type: 'ceph_default' }, |
| 937 | + annotations: { |
| 938 | + summary: 'The number of clients connected to {{ $labels.nqn }} is too high %(cluster)s' % $.MultiClusterSummary(), |
| 939 | + description: 'The supported limit for clients connecting to a subsystem is %(NVMeoFHighClientCount)d' % $._config, |
| 940 | + }, |
| 941 | + }, |
| 942 | + { // Warns when, for 10m, 100 minus 100x the per-host average 5m idle-CPU rate is >= $._config.NVMeoFHighHostCPU on hosts that also expose ceph_nvmeof_gateway_info. |
| 943 | + alert: 'NVMeoFHighHostCPU', |
| 944 | + 'for': '10m', |
| 945 | + expr: '100-((100*(avg by(host) (label_replace(rate(node_cpu_seconds_total{mode="idle"}[5m]),"host","$1","instance","(.*):.*")) * on(host) group_right label_replace(ceph_nvmeof_gateway_info,"host","$1","instance","(.*):.*")))) >= %.2f' % [$._config.NVMeoFHighHostCPU], // both sides derive host from instance (text before the last ':') and are joined on it; the idle average is multiplied by the gateway_info sample value (presumably 1 - confirm) |
| 946 | + labels: { severity: 'warning', type: 'ceph_default' }, |
| 947 | + annotations: { |
| 948 | + summary: 'The CPU is high ({{ $value }}%%) on NVMeoF Gateway host ({{ $labels.host }}) %(cluster)s' % $.MultiClusterSummary(), // %% is the formatter escape for a literal percent sign |
| 949 | + description: 'High CPU on a gateway host can lead to CPU contention and performance degradation', |
| 950 | + }, |
| 951 | + }, |
| 952 | + { // Warns when a ceph_nvmeof_subsystem_listener_iface_info series with operstate="down" has been present for 30s. |
| 953 | + alert: 'NVMeoFInterfaceDown', |
| 954 | + 'for': '30s', |
| 955 | + expr: 'ceph_nvmeof_subsystem_listener_iface_info{operstate="down"}', |
| 956 | + labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.14.1' }, // the only rule visible in this group that carries an oid label |
| 957 | + annotations: { |
| 958 | + summary: 'Network interface {{ $labels.device }} is down %(cluster)s' % $.MultiClusterSummary(), |
| 959 | + description: 'A NIC used by one or more subsystems is in a down state', |
| 960 | + }, |
| 961 | + }, |
| 962 | + { // Warns when a ceph_nvmeof_subsystem_listener_iface_info series whose duplex label is not "full" has been present for 30s. |
| 963 | + alert: 'NVMeoFInterfaceDuplex', |
| 964 | + 'for': '30s', |
| 965 | + expr: 'ceph_nvmeof_subsystem_listener_iface_info{duplex!="full"}', // NOTE(review): a != matcher also selects series that have no duplex label at all; confirm that is intended |
| 966 | + labels: { severity: 'warning', type: 'ceph_default' }, |
| 967 | + annotations: { |
| 968 | + summary: 'Network interface {{ $labels.device }} is not running in full duplex mode %(cluster)s' % $.MultiClusterSummary(), |
| 969 | + description: 'Until this is resolved, performance from the gateway will be degraded', |
| 970 | + }, |
| 971 | + }, |
| 972 | + { // Warns when, for 5m, the per-instance average read latency (read seconds / reads completed, both as 5m rates) exceeds $._config.NVMeoFHighClientReadLatency. |
| 973 | + alert: 'NVMeoFHighReadLatency', |
| 974 | + 'for': '5m', |
| 975 | + expr: 'label_replace((avg by(instance) ((rate(ceph_nvmeof_bdev_read_seconds_total[5m]) / rate(ceph_nvmeof_bdev_reads_completed_total[5m])))),"gateway","$1","instance","(.*):.*") > %.2f' % [$._config.NVMeoFHighClientReadLatency / 1000], // 5m windows match the summary text and the write-latency rule; the threshold is divided by 1000 because the summary reports it in ms while the metric is in seconds |
| 976 | + labels: { severity: 'warning', type: 'ceph_default' }, |
| 977 | + annotations: { |
| 978 | + summary: 'The average read latency over the last 5 mins has reached %(NVMeoFHighClientReadLatency)d ms or more on {{ $labels.gateway }}' % $._config, |
| 979 | + description: 'High latencies may indicate a constraint within the cluster e.g. CPU, network. Please investigate', |
| 980 | + }, |
| 981 | + }, |
| 982 | + { // Warns when, for 5m, the per-instance average write latency (write seconds / writes completed, both as 5m rates) exceeds $._config.NVMeoFHighClientWriteLatency. |
| 983 | + alert: 'NVMeoFHighWriteLatency', |
| 984 | + 'for': '5m', |
| 985 | + expr: 'label_replace((avg by(instance) ((rate(ceph_nvmeof_bdev_write_seconds_total[5m]) / rate(ceph_nvmeof_bdev_writes_completed_total[5m])))),"gateway","$1","instance","(.*):.*") > %.2f' % [$._config.NVMeoFHighClientWriteLatency / 1000], // the threshold is divided by 1000 because the summary reports it in ms while the metric is in seconds; gateway is instance up to its last ':' |
| 986 | + labels: { severity: 'warning', type: 'ceph_default' }, |
| 987 | + annotations: { |
| 988 | + summary: 'The average write latency over the last 5 mins has reached %(NVMeoFHighClientWriteLatency)d ms or more on {{ $labels.gateway }}' % $._config, |
| 989 | + description: 'High latencies may indicate a constraint within the cluster e.g. CPU, network. Please investigate', |
| 990 | + }, |
| 991 | + }, |
| 992 | + ], |
| 993 | + }, |
849 | 994 | ], |
850 | 995 | } |
0 commit comments