Skip to content

Commit 5f7eda5

Browse files
authored
Merge pull request ceph#64067 from VallariAg/wip-nvmeof-keepalive-alert
monitoring: Add alert NVMeoFHostKeepAliveTimeout
2 parents 98d8c60 + 9977e5c commit 5f7eda5

File tree

4 files changed

+72
-0
lines changed

4 files changed

+72
-0
lines changed

monitoring/ceph-mixin/config.libsonnet

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
NVMeoFMaxSubsystemsPerGateway: 128,
1717
NVMeoFMaxNamespaces: 2048,
1818
NVMeoFHighClientCount: 128,
19+
// Look-back window, in hours, substituted into the NVMeoFHostKeepAliveTimeout
// alert's range selector and into its summary text.
NVMeoFHostKeepAliveTimeoutTrackDurationHours: 24,
1920
NVMeoFHighHostCPU: 80,
2021
//
2122
// Read/Write latency is defined in ms

monitoring/ceph-mixin/prometheus_alerts.libsonnet

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1035,6 +1035,16 @@
10351035
description: 'High latencies may indicate a constraint within the cluster e.g. CPU, network. Please investigate',
10361036
},
10371037
},
1038+
// Warning-level alert raised when ceph_nvmeof_host_keepalive_timeout changed
// value inside the configured look-back window. The number of changes is
// halved and rounded up, and that result is what the summary reports as the
// disconnect count ({{ $value }}).
// NOTE(review): presumably one keep-alive timeout shows up as a 0 -> 1 -> 0
// toggle (two changes), hence the "/ 2" — confirm against the exporter.
{
1039+
alert: 'NVMeoFHostKeepAliveTimeout',
1040+
'for': '1m',
1041+
// The window length is injected from $._config as "<hours>h:" (a subquery range).
expr: 'ceil(changes(ceph_nvmeof_host_keepalive_timeout[%(NVMeoFHostKeepAliveTimeoutTrackDurationHours)dh:]) / 2) > 0' % $._config,
1042+
labels: { severity: 'warning', type: 'ceph_default' },
1043+
annotations: {
1044+
// Uses the host_nqn and nqn labels of the firing series plus the same configured hour count as the expr.
summary: 'Host ({{ $labels.host_nqn }}) was disconnected {{ $value }} times from subsystem ({{ $labels.nqn }}) in last %(NVMeoFHostKeepAliveTimeoutTrackDurationHours)d hours' % $._config,
1045+
description: 'Host was disconnected due to host keep alive timeout',
1046+
},
1047+
},
10381048
],
10391049
},
10401050
],

monitoring/ceph-mixin/prometheus_alerts.yml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -928,3 +928,12 @@ groups:
928928
labels:
929929
severity: "warning"
930930
type: "ceph_default"
931+
# Warning-level rule: the expression halves (rounding up) the number of value
# changes of ceph_nvmeof_host_keepalive_timeout seen in the trailing 24h window
# and matches when that count is above zero. The count is rendered into the
# summary as {{ $value }}.
- alert: "NVMeoFHostKeepAliveTimeout"
932+
annotations:
933+
description: "Host was disconnected due to host keep alive timeout"
934+
summary: "Host ({{ $labels.host_nqn }}) was disconnected {{ $value }} times from subsystem ({{ $labels.nqn }}) in last 24 hours"
935+
expr: "ceil(changes(ceph_nvmeof_host_keepalive_timeout[24h:]) / 2) > 0"
936+
# The expression must keep matching for 1m before the alert fires.
for: "1m"
937+
labels:
938+
severity: "warning"
939+
type: "ceph_default"

monitoring/ceph-mixin/tests_alerts/test_alerts.yml

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3152,3 +3152,55 @@ tests:
31523152
exp_annotations:
31533153
summary: "The average write latency over the last 5 mins has reached 20 ms or more on node-1"
31543154
description: "High latencies may indicate a constraint within the cluster e.g. CPU, network. Please investigate"
3155+
3156+
# NVMeoFHostKeepAliveTimeout
# Two hosts (nqn.1 and nqn.2) on the same gateway and subsystem, one sample per
# hour. The expression under test halves the number of value changes seen in
# the trailing 24h window and rounds up, so a full 0 -> 1 -> 0 toggle counts
# once and a lone 0 -> 1 also counts once.
3157+
- interval: 1h
3158+
input_series:
3159+
- series: 'ceph_nvmeof_host_keepalive_timeout{gw_name="client.nvmeof.a", host_nqn="nqn.1", instance="node-1:10008", nqn="nqn.2016-06.io.spdk:cnode1.mygroup"}'
3160+
# nqn.1 flips repeatedly between 4h and 10h, stays at 0, then toggles again
# around 25h-27h. '0xN' is promtool's expanding notation for a run of zeros.
values: '0 0 0 0 1 0 1 0 1 1 0x14 1 1 0x3'
3161+
- series: 'ceph_nvmeof_host_keepalive_timeout{gw_name="client.nvmeof.a", host_nqn="nqn.2", instance="node-1:10008", nqn="nqn.2016-06.io.spdk:cnode1.mygroup"}'
3162+
# nqn.2 toggles once (0 -> 1 at 1h, back to 0 at 3h) and is flat afterwards.
values: '0 1 1 0 0 0 0 0 0 0 0x19'
3163+
promql_expr_test:
3164+
# At 2h only nqn.2 has changed (0 -> 1): ceil(1 / 2) = 1. nqn.1 is still flat at 0.
- expr: ceil(changes(ceph_nvmeof_host_keepalive_timeout[24h:]) / 2) > 0
3165+
eval_time: 2h
3166+
exp_samples:
3167+
- labels: '{gw_name="client.nvmeof.a", host_nqn="nqn.2", instance="node-1:10008", nqn="nqn.2016-06.io.spdk:cnode1.mygroup"}'
3168+
value: 1
3169+
# At 8h nqn.1 has changed 5 times: ceil(5 / 2) = 3. nqn.2 has changed twice: ceil(2 / 2) = 1.
- expr: ceil(changes(ceph_nvmeof_host_keepalive_timeout[24h:]) / 2) > 0
3170+
eval_time: 8h
3171+
exp_samples:
3172+
- labels: '{gw_name="client.nvmeof.a", host_nqn="nqn.1", instance="node-1:10008", nqn="nqn.2016-06.io.spdk:cnode1.mygroup"}'
3173+
value: 3
3174+
- labels: '{gw_name="client.nvmeof.a", host_nqn="nqn.2", instance="node-1:10008", nqn="nqn.2016-06.io.spdk:cnode1.mygroup"}'
3175+
value: 1
3176+
# At 29h the 24h window no longer covers nqn.2's early toggle, so only nqn.1
# is returned; its changes inside the window still round up to 3.
- expr: ceil(changes(ceph_nvmeof_host_keepalive_timeout[24h:]) / 2) > 0
3177+
eval_time: 29h
3178+
exp_samples:
3179+
- labels: '{gw_name="client.nvmeof.a", host_nqn="nqn.1", instance="node-1:10008", nqn="nqn.2016-06.io.spdk:cnode1.mygroup"}'
3180+
value: 3
3181+
alert_rule_test:
3182+
# No exp_alerts are listed for 1h, i.e. no firing alert is expected yet.
# NOTE(review): presumably because nqn.2's first change lands exactly at 1h
# and the rule's 1m 'for' has not elapsed — confirm.
- eval_time: 1h
3183+
alertname: NVMeoFHostKeepAliveTimeout
3184+
# At 12h both hosts are expected to fire, with counts 3 (nqn.1) and 1 (nqn.2)
# rendered into the summary annotation.
- eval_time: 12h
3185+
alertname: NVMeoFHostKeepAliveTimeout
3186+
exp_alerts:
3187+
- exp_labels:
3188+
gw_name: client.nvmeof.a
3189+
host_nqn: nqn.1
3190+
instance: node-1:10008
3191+
nqn: nqn.2016-06.io.spdk:cnode1.mygroup
3192+
severity: warning
3193+
type: ceph_default
3194+
exp_annotations:
3195+
summary: "Host (nqn.1) was disconnected 3 times from subsystem (nqn.2016-06.io.spdk:cnode1.mygroup) in last 24 hours"
3196+
description: "Host was disconnected due to host keep alive timeout"
3197+
- exp_labels:
3198+
gw_name: client.nvmeof.a
3199+
host_nqn: nqn.2
3200+
instance: node-1:10008
3201+
nqn: nqn.2016-06.io.spdk:cnode1.mygroup
3202+
severity: warning
3203+
type: ceph_default
3204+
exp_annotations:
3205+
summary: "Host (nqn.2) was disconnected 1 times from subsystem (nqn.2016-06.io.spdk:cnode1.mygroup) in last 24 hours"
3206+
description: "Host was disconnected due to host keep alive timeout"

0 commit comments

Comments
 (0)