Skip to content

Commit d9c92e5

Browse files
authored
Merge pull request ceph#55510 from pcuzner/add-nvmeof-alerts
ceph-mixin: Update mixin to include alerts for the nvmeof gateway(s) Reviewed-by: Aashish Sharma <[email protected]>
2 parents 8774325 + 19ce7ab commit d9c92e5

File tree

5 files changed

+715
-1
lines changed

5 files changed

+715
-1
lines changed

monitoring/ceph-mixin/config.libsonnet

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,5 +9,15 @@
99
CephNodeNetworkPacketDropsPerSec: 10,
1010
CephRBDMirrorImageTransferBandwidthThreshold: 0.8,
1111
CephRBDMirrorImagesPerDaemonThreshold: 100,
12+
NVMeoFMaxGatewaysPerGroup: 2,
13+
NVMeoFMaxGatewaysPerCluster: 4,
14+
NVMeoFHighGatewayCPU: 80,
15+
NVMeoFMaxSubsystemsPerGateway: 16,
16+
NVMeoFHighClientCount: 32,
17+
NVMeoFHighHostCPU: 80,
18+
//
19+
// Read/Write latency is defined in ms
20+
NVMeoFHighClientReadLatency: 10,
21+
NVMeoFHighClientWriteLatency: 20,
1222
},
1323
}

monitoring/ceph-mixin/prometheus_alerts.libsonnet

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -846,5 +846,150 @@
846846
},
847847
],
848848
},
849+
{
850+
name: 'nvmeof',
851+
rules: [
852+
{
853+
alert: 'NVMeoFSubsystemNamespaceLimit',
854+
'for': '1m',
855+
expr: '(count by(nqn) (ceph_nvmeof_subsystem_namespace_metadata)) >= ceph_nvmeof_subsystem_namespace_limit',
856+
labels: { severity: 'warning', type: 'ceph_default' },
857+
annotations: {
858+
summary: '{{ $labels.nqn }} subsystem has reached its maximum number of namespaces %(cluster)s' % $.MultiClusterSummary(),
859+
description: 'Subsystems have a max namespace limit defined at creation time. This alert means that no more namespaces can be added to {{ $labels.nqn }}',
860+
},
861+
},
862+
{
863+
alert: 'NVMeoFTooManyGateways',
864+
'for': '1m',
865+
expr: 'count(ceph_nvmeof_gateway_info) > %.2f' % [$._config.NVMeoFMaxGatewaysPerCluster],
866+
labels: { severity: 'warning', type: 'ceph_default' },
867+
annotations: {
868+
summary: 'Max supported gateways exceeded %(cluster)s' % $.MultiClusterSummary(),
869+
description: 'You may create many gateways, but %(NVMeoFMaxGatewaysPerCluster)d is the tested limit' % $._config,
870+
},
871+
},
872+
{
873+
alert: 'NVMeoFMaxGatewayGroupSize',
874+
'for': '1m',
875+
expr: 'count by(group) (ceph_nvmeof_gateway_info) > %.2f' % [$._config.NVMeoFMaxGatewaysPerGroup],
876+
labels: { severity: 'warning', type: 'ceph_default' },
877+
annotations: {
878+
summary: 'Max gateways within a gateway group ({{ $labels.group }}) exceeded %(cluster)s' % $.MultiClusterSummary(),
879+
description: 'You may create many gateways in a gateway group, but %(NVMeoFMaxGatewaysPerGroup)d is the tested limit' % $._config,
880+
},
881+
},
882+
{
883+
alert: 'NVMeoFSingleGatewayGroup',
884+
'for': '5m',
885+
expr: 'count by(group) (ceph_nvmeof_gateway_info) == 1',
886+
labels: { severity: 'warning', type: 'ceph_default' },
887+
annotations: {
888+
summary: 'The gateway group {{ $labels.group }} consists of a single gateway - HA is not possible %(cluster)s' % $.MultiClusterSummary(),
889+
description: 'Although a single member gateway group is valid, it should only be used for test purposes',
890+
},
891+
},
892+
{
893+
alert: 'NVMeoFHighGatewayCPU',
894+
'for': '10m',
895+
// rate() over a *_seconds_total counter yields busy-seconds per second (a 0..1 fraction),
// so scale it to a percentage before comparing with the NVMeoFHighGatewayCPU threshold (80).
// NOTE(review): assumes the metric is reported in seconds, as its name suggests - confirm.
expr: '100 * label_replace(avg by(instance) (rate(ceph_nvmeof_reactor_seconds_total{mode="busy"}[1m])),"instance","$1","instance","(.*):.*") > %.2f' % [$._config.NVMeoFHighGatewayCPU],
896+
labels: { severity: 'warning', type: 'ceph_default' },
897+
annotations: {
898+
summary: 'CPU used by {{ $labels.instance }} NVMe-oF Gateway is high %(cluster)s' % $.MultiClusterSummary(),
899+
description: 'Typically, high CPU may indicate degraded performance. Consider increasing the number of reactor cores',
900+
},
901+
},
902+
{
903+
alert: 'NVMeoFGatewayOpenSecurity',
904+
'for': '5m',
905+
expr: 'ceph_nvmeof_subsystem_metadata{allow_any_host="yes"}',
906+
labels: { severity: 'warning', type: 'ceph_default' },
907+
annotations: {
908+
summary: 'Subsystem {{ $labels.nqn }} has been defined without host level security %(cluster)s' % $.MultiClusterSummary(),
909+
description: 'It is good practice to ensure subsystems use host security to reduce the risk of unexpected data loss',
910+
},
911+
},
912+
{
913+
alert: 'NVMeoFTooManySubsystems',
914+
'for': '1m',
915+
expr: 'count by(gateway_host) (label_replace(ceph_nvmeof_subsystem_metadata,"gateway_host","$1","instance","(.*):.*")) > %.2f' % [$._config.NVMeoFMaxSubsystemsPerGateway],
916+
labels: { severity: 'warning', type: 'ceph_default' },
917+
annotations: {
918+
summary: 'The number of subsystems defined to the gateway exceeds supported values %(cluster)s' % $.MultiClusterSummary(),
919+
description: 'Although you may continue to create subsystems in {{ $labels.gateway_host }}, the configuration may not be supported',
920+
},
921+
},
922+
{
923+
alert: 'NVMeoFVersionMismatch',
924+
'for': '1h',
925+
expr: 'count(count by(version) (ceph_nvmeof_gateway_info)) > 1',
926+
labels: { severity: 'warning', type: 'ceph_default' },
927+
annotations: {
928+
summary: 'The cluster has different NVMe-oF gateway releases active %(cluster)s' % $.MultiClusterSummary(),
929+
description: 'This may indicate an issue with deployment. Check cephadm logs',
930+
},
931+
},
932+
{
933+
alert: 'NVMeoFHighClientCount',
934+
'for': '1m',
935+
expr: 'ceph_nvmeof_subsystem_host_count > %.2f' % [$._config.NVMeoFHighClientCount],
936+
labels: { severity: 'warning', type: 'ceph_default' },
937+
annotations: {
938+
summary: 'The number of clients connected to {{ $labels.nqn }} is too high %(cluster)s' % $.MultiClusterSummary(),
939+
description: 'The supported limit for clients connecting to a subsystem is %(NVMeoFHighClientCount)d' % $._config,
940+
},
941+
},
942+
{
943+
alert: 'NVMeoFHighHostCPU',
944+
'for': '10m',
945+
expr: '100-((100*(avg by(host) (label_replace(rate(node_cpu_seconds_total{mode="idle"}[5m]),"host","$1","instance","(.*):.*")) * on(host) group_right label_replace(ceph_nvmeof_gateway_info,"host","$1","instance","(.*):.*")))) >= %.2f' % [$._config.NVMeoFHighHostCPU],
946+
labels: { severity: 'warning', type: 'ceph_default' },
947+
annotations: {
948+
summary: 'The CPU is high ({{ $value }}%%) on NVMeoF Gateway host ({{ $labels.host }}) %(cluster)s' % $.MultiClusterSummary(),
949+
description: 'High CPU on a gateway host can lead to CPU contention and performance degradation',
950+
},
951+
},
952+
{
953+
alert: 'NVMeoFInterfaceDown',
954+
'for': '30s',
955+
expr: 'ceph_nvmeof_subsystem_listener_iface_info{operstate="down"}',
956+
labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.14.1' },
957+
annotations: {
958+
summary: 'Network interface {{ $labels.device }} is down %(cluster)s' % $.MultiClusterSummary(),
959+
description: 'A NIC used by one or more subsystems is in a down state',
960+
},
961+
},
962+
{
963+
alert: 'NVMeoFInterfaceDuplex',
964+
'for': '30s',
965+
expr: 'ceph_nvmeof_subsystem_listener_iface_info{duplex!="full"}',
966+
labels: { severity: 'warning', type: 'ceph_default' },
967+
annotations: {
968+
summary: 'Network interface {{ $labels.device }} is not running in full duplex mode %(cluster)s' % $.MultiClusterSummary(),
969+
description: 'Until this is resolved, performance from the gateway will be degraded',
970+
},
971+
},
972+
{
973+
alert: 'NVMeoFHighReadLatency',
974+
'for': '5m',
975+
// Use a 5m rate window so the expression matches the "last 5 mins" wording of the summary
// and the window used by the sibling NVMeoFHighWriteLatency rule.
// The threshold is configured in ms and converted to seconds here.
expr: 'label_replace((avg by(instance) ((rate(ceph_nvmeof_bdev_read_seconds_total[5m]) / rate(ceph_nvmeof_bdev_reads_completed_total[5m])))),"gateway","$1","instance","(.*):.*") > %.2f' % [$._config.NVMeoFHighClientReadLatency / 1000],
976+
labels: { severity: 'warning', type: 'ceph_default' },
977+
annotations: {
978+
summary: 'The average read latency over the last 5 mins has reached %(NVMeoFHighClientReadLatency)d ms or more on {{ $labels.gateway }}' % $._config,
979+
description: 'High latencies may indicate a constraint within the cluster e.g. CPU, network. Please investigate',
980+
},
981+
},
982+
{
983+
alert: 'NVMeoFHighWriteLatency',
984+
'for': '5m',
985+
expr: 'label_replace((avg by(instance) ((rate(ceph_nvmeof_bdev_write_seconds_total[5m]) / rate(ceph_nvmeof_bdev_writes_completed_total[5m])))),"gateway","$1","instance","(.*):.*") > %.2f' % [$._config.NVMeoFHighClientWriteLatency / 1000],
986+
labels: { severity: 'warning', type: 'ceph_default' },
987+
annotations: {
988+
summary: 'The average write latency over the last 5 mins has reached %(NVMeoFHighClientWriteLatency)d ms or more on {{ $labels.gateway }}' % $._config,
989+
description: 'High latencies may indicate a constraint within the cluster e.g. CPU, network. Please investigate',
990+
},
991+
},
992+
],
993+
},
849994
],
850995
}

monitoring/ceph-mixin/prometheus_alerts.yml

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -756,3 +756,132 @@ groups:
756756
oid: "1.3.6.1.4.1.50495.1.2.1.10.5"
757757
severity: "warning"
758758
type: "ceph_default"
759+
- name: "nvmeof"
760+
rules:
761+
- alert: "NVMeoFSubsystemNamespaceLimit"
762+
annotations:
763+
description: "Subsystems have a max namespace limit defined at creation time. This alert means that no more namespaces can be added to {{ $labels.nqn }}"
764+
summary: "{{ $labels.nqn }} subsystem has reached its maximum number of namespaces "
765+
expr: "(count by(nqn) (ceph_nvmeof_subsystem_namespace_metadata)) >= ceph_nvmeof_subsystem_namespace_limit"
766+
for: "1m"
767+
labels:
768+
severity: "warning"
769+
type: "ceph_default"
770+
- alert: "NVMeoFTooManyGateways"
771+
annotations:
772+
description: "You may create many gateways, but 4 is the tested limit"
773+
summary: "Max supported gateways exceeded "
774+
expr: "count(ceph_nvmeof_gateway_info) > 4.00"
775+
for: "1m"
776+
labels:
777+
severity: "warning"
778+
type: "ceph_default"
779+
- alert: "NVMeoFMaxGatewayGroupSize"
780+
annotations:
781+
description: "You may create many gateways in a gateway group, but 2 is the tested limit"
782+
summary: "Max gateways within a gateway group ({{ $labels.group }}) exceeded "
783+
expr: "count by(group) (ceph_nvmeof_gateway_info) > 2.00"
784+
for: "1m"
785+
labels:
786+
severity: "warning"
787+
type: "ceph_default"
788+
- alert: "NVMeoFSingleGatewayGroup"
789+
annotations:
790+
description: "Although a single member gateway group is valid, it should only be used for test purposes"
791+
summary: "The gateway group {{ $labels.group }} consists of a single gateway - HA is not possible "
792+
expr: "count by(group) (ceph_nvmeof_gateway_info) == 1"
793+
for: "5m"
794+
labels:
795+
severity: "warning"
796+
type: "ceph_default"
797+
- alert: "NVMeoFHighGatewayCPU"
798+
annotations:
799+
description: "Typically, high CPU may indicate degraded performance. Consider increasing the number of reactor cores"
800+
summary: "CPU used by {{ $labels.instance }} NVMe-oF Gateway is high "
801+
expr: "100 * label_replace(avg by(instance) (rate(ceph_nvmeof_reactor_seconds_total{mode=\"busy\"}[1m])),\"instance\",\"$1\",\"instance\",\"(.*):.*\") > 80.00"
802+
for: "10m"
803+
labels:
804+
severity: "warning"
805+
type: "ceph_default"
806+
- alert: "NVMeoFGatewayOpenSecurity"
807+
annotations:
808+
description: "It is good practice to ensure subsystems use host security to reduce the risk of unexpected data loss"
809+
summary: "Subsystem {{ $labels.nqn }} has been defined without host level security "
810+
expr: "ceph_nvmeof_subsystem_metadata{allow_any_host=\"yes\"}"
811+
for: "5m"
812+
labels:
813+
severity: "warning"
814+
type: "ceph_default"
815+
- alert: "NVMeoFTooManySubsystems"
816+
annotations:
817+
description: "Although you may continue to create subsystems in {{ $labels.gateway_host }}, the configuration may not be supported"
818+
summary: "The number of subsystems defined to the gateway exceeds supported values "
819+
expr: "count by(gateway_host) (label_replace(ceph_nvmeof_subsystem_metadata,\"gateway_host\",\"$1\",\"instance\",\"(.*):.*\")) > 16.00"
820+
for: "1m"
821+
labels:
822+
severity: "warning"
823+
type: "ceph_default"
824+
- alert: "NVMeoFVersionMismatch"
825+
annotations:
826+
description: "This may indicate an issue with deployment. Check cephadm logs"
827+
summary: "The cluster has different NVMe-oF gateway releases active "
828+
expr: "count(count by(version) (ceph_nvmeof_gateway_info)) > 1"
829+
for: "1h"
830+
labels:
831+
severity: "warning"
832+
type: "ceph_default"
833+
- alert: "NVMeoFHighClientCount"
834+
annotations:
835+
description: "The supported limit for clients connecting to a subsystem is 32"
836+
summary: "The number of clients connected to {{ $labels.nqn }} is too high "
837+
expr: "ceph_nvmeof_subsystem_host_count > 32.00"
838+
for: "1m"
839+
labels:
840+
severity: "warning"
841+
type: "ceph_default"
842+
- alert: "NVMeoFHighHostCPU"
843+
annotations:
844+
description: "High CPU on a gateway host can lead to CPU contention and performance degradation"
845+
summary: "The CPU is high ({{ $value }}%) on NVMeoF Gateway host ({{ $labels.host }}) "
846+
expr: "100-((100*(avg by(host) (label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]),\"host\",\"$1\",\"instance\",\"(.*):.*\")) * on(host) group_right label_replace(ceph_nvmeof_gateway_info,\"host\",\"$1\",\"instance\",\"(.*):.*\")))) >= 80.00"
847+
for: "10m"
848+
labels:
849+
severity: "warning"
850+
type: "ceph_default"
851+
- alert: "NVMeoFInterfaceDown"
852+
annotations:
853+
description: "A NIC used by one or more subsystems is in a down state"
854+
summary: "Network interface {{ $labels.device }} is down "
855+
expr: "ceph_nvmeof_subsystem_listener_iface_info{operstate=\"down\"}"
856+
for: "30s"
857+
labels:
858+
oid: "1.3.6.1.4.1.50495.1.2.1.14.1"
859+
severity: "warning"
860+
type: "ceph_default"
861+
- alert: "NVMeoFInterfaceDuplex"
862+
annotations:
863+
description: "Until this is resolved, performance from the gateway will be degraded"
864+
summary: "Network interface {{ $labels.device }} is not running in full duplex mode "
865+
expr: "ceph_nvmeof_subsystem_listener_iface_info{duplex!=\"full\"}"
866+
for: "30s"
867+
labels:
868+
severity: "warning"
869+
type: "ceph_default"
870+
- alert: "NVMeoFHighReadLatency"
871+
annotations:
872+
description: "High latencies may indicate a constraint within the cluster e.g. CPU, network. Please investigate"
873+
summary: "The average read latency over the last 5 mins has reached 10 ms or more on {{ $labels.gateway }}"
874+
expr: "label_replace((avg by(instance) ((rate(ceph_nvmeof_bdev_read_seconds_total[5m]) / rate(ceph_nvmeof_bdev_reads_completed_total[5m])))),\"gateway\",\"$1\",\"instance\",\"(.*):.*\") > 0.01"
875+
for: "5m"
876+
labels:
877+
severity: "warning"
878+
type: "ceph_default"
879+
- alert: "NVMeoFHighWriteLatency"
880+
annotations:
881+
description: "High latencies may indicate a constraint within the cluster e.g. CPU, network. Please investigate"
882+
summary: "The average write latency over the last 5 mins has reached 20 ms or more on {{ $labels.gateway }}"
883+
expr: "label_replace((avg by(instance) ((rate(ceph_nvmeof_bdev_write_seconds_total[5m]) / rate(ceph_nvmeof_bdev_writes_completed_total[5m])))),\"gateway\",\"$1\",\"instance\",\"(.*):.*\") > 0.02"
884+
for: "5m"
885+
labels:
886+
severity: "warning"
887+
type: "ceph_default"

0 commit comments

Comments
 (0)