Skip to content

Commit 3110774

Browse files
authored
Merge pull request ceph#64467 from ceph/fix-mtu-alert
monitoring: fix MTU Mismatch alert rule and expr Reviewed-by: Afreen Misbah <[email protected]>
2 parents ea497fc + bee24de commit 3110774

File tree

7 files changed

+112
-55
lines changed

7 files changed

+112
-55
lines changed

monitoring/ceph-mixin/prometheus_alerts.yml

Lines changed: 31 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -511,14 +511,38 @@ groups:
511511
oid: "1.3.6.1.4.1.50495.1.2.1.8.4"
512512
severity: "warning"
513513
type: "ceph_default"
514-
- alert: "CephNodeInconsistentMTU"
515-
annotations:
516-
description: "Node {{ $labels.instance }} has a different MTU size ({{ $value }}) than the median of devices named {{ $labels.device }}."
517-
summary: "MTU settings across Ceph hosts are inconsistent on cluster {{ $labels.cluster }}"
518-
expr: "node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0) == scalar( max by (cluster,device) (node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) != quantile by (cluster,device) (.5, node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) )or node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0) == scalar( min by (cluster,device) (node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) != quantile by (cluster,device) (.5, node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) )"
514+
- alert: CephNodeInconsistentMTU
515+
expr: |
516+
node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)
517+
!= on (cluster, device) group_left
518+
quantile by (cluster, device) (
519+
0.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)
520+
)
519521
labels:
520-
severity: "warning"
521-
type: "ceph_default"
522+
severity: warning
523+
type: ceph_default
524+
annotations:
525+
summary: "Node {{ $labels.instance }} has inconsistent MTU settings in cluster {{ $labels.cluster }}"
526+
description: "Network interface {{ $labels.device }} on node {{ $labels.instance }} has MTU {{ $value }} which differs from the cluster median."
527+
impact: |
528+
- May cause packet fragmentation or packet drops
529+
- Risk of degraded cluster communication and performance
530+
- Potential instability in services relying on consistent networking (e.g., Ceph, Kubernetes)
531+
fix: |
532+
- Check the MTU of interface `{{ $labels.device }}` on node `{{ $labels.instance }}`:
533+
ip link show {{ $labels.device }}
534+
535+
- Find the median MTU value across the cluster by running this PromQL query in Prometheus:
536+
quantile by (cluster, device) (0.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))
537+
538+
- Standardize MTU across all nodes to match the median (commonly 1500 or 9000):
539+
ip link set dev {{ $labels.device }} mtu <median-value>
540+
541+
- Make MTU setting persistent:
542+
- RHEL/CentOS: edit `/etc/sysconfig/network-scripts/ifcfg-<device>`
543+
- Debian/Ubuntu: edit `/etc/netplan/*.yaml` and apply with `netplan apply`
544+
545+
- Restart the affected interface or node if required.
522546
- name: "pools"
523547
rules:
524548
- alert: "CephPoolGrowthWarning"

monitoring/ceph-mixin/tests_alerts/test_alerts.yml

Lines changed: 51 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -505,80 +505,83 @@ tests:
505505
summary: Host filesystem free space is getting low on cluster mycluster
506506
description: "Mountpoint /rootfs on node-1.unittests.com will be full in less than 5 days based on the 48 hour trailing fill rate."
507507

508-
# MTU Mismatch
508+
# MTU Mismatch
509509
- interval: 1m
510510
input_series:
511-
- series: 'node_network_mtu_bytes{device="eth0",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
511+
- series: 'node_network_mtu_bytes{device="eth0",instance="node1",job="node-exporter",cluster="mycluster"}'
512512
values: '1500 1500 1500 1500 1500'
513-
- series: 'node_network_mtu_bytes{device="eth1",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
513+
- series: 'node_network_mtu_bytes{device="eth1",instance="node1",job="node-exporter",cluster="mycluster"}'
514514
values: '1500 1500 1500 1500 1500'
515-
- series: 'node_network_mtu_bytes{device="eth2",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
515+
- series: 'node_network_mtu_bytes{device="eth2",instance="node1",job="node-exporter",cluster="mycluster"}'
516516
values: '1500 1500 1500 1500 1500'
517-
- series: 'node_network_mtu_bytes{device="eth3",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
517+
- series: 'node_network_mtu_bytes{device="eth3",instance="node1",job="node-exporter",cluster="mycluster"}'
518518
values: '1500 1500 1500 1500 1500'
519-
- series: 'node_network_mtu_bytes{device="eth4",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
519+
520+
- series: 'node_network_mtu_bytes{device="eth4",instance="node1",job="node-exporter",cluster="mycluster"}'
521+
values: '1500 1500 1500 1500 1500'
522+
- series: 'node_network_mtu_bytes{device="eth4",instance="node2",job="node-exporter",cluster="mycluster"}'
520523
values: '9000 9000 9000 9000 9000'
521-
- series: 'node_network_mtu_bytes{device="eth4",instance="hostname1",job="node-exporter",cluster="mycluster"}'
522-
values: '2200 2200 2200 2200 2200'
523-
- series: 'node_network_mtu_bytes{device="eth4",instance="hostname2",job="node-exporter",cluster="mycluster"}'
524-
values: '2400 2400 2400 2400 2400'
525-
- series: 'node_network_up{device="eth0",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
526-
values: '0 0 0 0 0'
527-
- series: 'node_network_up{device="eth1",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
528-
values: '0 0 0 0 0'
529-
- series: 'node_network_up{device="eth2",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
524+
- series: 'node_network_mtu_bytes{device="eth4",instance="node3",job="node-exporter",cluster="mycluster"}'
525+
values: '9000 9000 9000 9000 9000'
526+
527+
- series: 'node_network_up{device="eth0",instance="node1",job="node-exporter",cluster="mycluster"}'
530528
values: '1 1 1 1 1'
531-
- series: 'node_network_up{device="eth3",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
529+
- series: 'node_network_up{device="eth1",instance="node1",job="node-exporter",cluster="mycluster"}'
532530
values: '1 1 1 1 1'
533-
- series: 'node_network_up{device="eth4",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
531+
- series: 'node_network_up{device="eth2",instance="node1",job="node-exporter",cluster="mycluster"}'
534532
values: '1 1 1 1 1'
535-
- series: 'node_network_up{device="eth4",instance="hostname1",job="node-exporter",cluster="mycluster"}'
533+
- series: 'node_network_up{device="eth3",instance="node1",job="node-exporter",cluster="mycluster"}'
534+
values: '1 1 1 1 1'
535+
- series: 'node_network_up{device="eth4",instance="node1",job="node-exporter",cluster="mycluster"}'
536+
values: '1 1 1 1 1'
537+
- series: 'node_network_up{device="eth4",instance="node2",job="node-exporter",cluster="mycluster"}'
538+
values: '1 1 1 1 1'
539+
- series: 'node_network_up{device="eth4",instance="node3",job="node-exporter",cluster="mycluster"}'
536540
values: '1 1 1 1 1'
537-
- series: 'node_network_up{device="eth4",instance="hostname2",job="node-exporter",cluster="mycluster"}'
538-
values: '0 0 0 0 0'
539541
promql_expr_test:
540542
- expr: |
541-
node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) ==
542-
scalar(
543-
max by (cluster,device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) !=
544-
quantile by (cluster,device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))
545-
)
546-
or
547-
node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) ==
548-
scalar(
549-
min by (cluster,device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) !=
550-
quantile by (cluster,device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))
551-
)
543+
node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)
544+
!= on (cluster, device) group_left
545+
quantile by (cluster, device) (
546+
0.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)
547+
)
552548
eval_time: 1m
553549
exp_samples:
554-
- labels: '{device="eth4", instance="node-exporter", job="node-exporter", cluster="mycluster"}'
555-
value: 9000
556-
- labels: '{device="eth4", instance="hostname1", job="node-exporter", cluster="mycluster"}'
557-
value: 2200
550+
- labels: '{device="eth4",instance="node1",job="node-exporter",cluster="mycluster"}'
551+
value: 1500
558552
alert_rule_test:
559553
- eval_time: 1m
560554
alertname: CephNodeInconsistentMTU
561555
exp_alerts:
562556
- exp_labels:
563557
device: eth4
564-
instance: hostname1
558+
instance: node1
565559
job: node-exporter
566560
severity: warning
567561
type: ceph_default
568562
cluster: "mycluster"
569563
exp_annotations:
570-
summary: MTU settings across Ceph hosts are inconsistent on cluster mycluster
571-
description: "Node hostname1 has a different MTU size (2200) than the median of devices named eth4."
572-
- exp_labels:
573-
device: eth4
574-
instance: node-exporter
575-
job: node-exporter
576-
severity: warning
577-
type: ceph_default
578-
cluster: "mycluster"
579-
exp_annotations:
580-
summary: MTU settings across Ceph hosts are inconsistent on cluster mycluster
581-
description: "Node node-exporter has a different MTU size (9000) than the median of devices named eth4."
564+
summary: "Node node1 has inconsistent MTU settings in cluster mycluster"
565+
description: "Network interface eth4 on node node1 has MTU 1500 which differs from the cluster median."
566+
impact: |
567+
- May cause packet fragmentation or packet drops
568+
- Risk of degraded cluster communication and performance
569+
- Potential instability in services relying on consistent networking (e.g., Ceph, Kubernetes)
570+
fix: |
571+
- Check the MTU of interface `eth4` on node `node1`:
572+
ip link show eth4
573+
574+
- Find the median MTU value across the cluster by running this PromQL query in Prometheus:
575+
quantile by (cluster, device) (0.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))
576+
577+
- Standardize MTU across all nodes to match the median (commonly 1500 or 9000):
578+
ip link set dev eth4 mtu <median-value>
579+
580+
- Make MTU setting persistent:
581+
- RHEL/CentOS: edit `/etc/sysconfig/network-scripts/ifcfg-<device>`
582+
- Debian/Ubuntu: edit `/etc/netplan/*.yaml` and apply with `netplan apply`
583+
584+
- Restart the affected interface or node if required.
582585
583586
# pool full, data series has 6 but using topk(5) so to ensure the
584587
# results are working as expected

src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/prometheus/active-alert-list/active-alert-list.component.html

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@
3636
[data]="expandedRow"
3737
[customCss]="customCss"
3838
[autoReload]="false"
39+
[showMultiLineText]="true"
40+
[multilineTextKeys]="multilineTextKeys"
3941
>
4042
</cd-table-key-value>
4143
} @else if (expandedRow?.alert_count > 1) {
@@ -60,6 +62,8 @@
6062
[data]="expandedInnerRow"
6163
[customCss]="customCss"
6264
[autoReload]="false"
65+
[showMultiLineText]="true"
66+
[multilineTextKeys]="multilineTextKeys"
6367
>
6468
</cd-table-key-value> }
6569
</cd-table>

src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/prometheus/active-alert-list/active-alert-list.component.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ export class ActiveAlertListComponent extends PrometheusListHelper implements On
3030
selection = new CdTableSelection();
3131
icons = Icons;
3232
expandedInnerRow: any;
33+
multilineTextKeys = ['description', 'impact', 'fix'];
3334

3435
constructor(
3536
// NotificationsComponent will refresh all alerts every 5s (No need to do it here as well)

src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table-key-value/table-key-value.component.html

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,3 +12,15 @@
1212
[limit]="0">
1313
</cd-table>
1414
</div>
15+
16+
<!-- Value cell template: rows whose key is listed in multilineTextKeys keep their line breaks (.pre-wrap); all other rows render as plain text. Array.includes() is used because `in` tests array indices, not element values. -->
<ng-template #valueCellTpl
17+
let-row="data.row"
18+
let-value="data.value">
19+
<span
20+
class="pre-wrap"
21+
*ngIf="multilineTextKeys?.includes(row.key); else normalText"
22+
>{{ value }}</span>
23+
<ng-template #normalText>
24+
<span>{{ value }}</span>
25+
</ng-template>
26+
</ng-template>
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
.pre-wrap {
2+
white-space: pre-wrap;
3+
}

src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table-key-value/table-key-value.component.ts

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import {
55
OnChanges,
66
OnInit,
77
Output,
8+
TemplateRef,
89
ViewChild
910
} from '@angular/core';
1011

@@ -36,6 +37,8 @@ interface KeyValueItem {
3637
export class TableKeyValueComponent implements OnInit, OnChanges {
3738
@ViewChild(TableComponent, { static: true })
3839
table: TableComponent;
40+
@ViewChild('valueCellTpl', { static: true })
41+
valueCellTpl: TemplateRef<any>;
3942

4043
@Input()
4144
data: any;
@@ -50,6 +53,10 @@ export class TableKeyValueComponent implements OnInit, OnChanges {
5053
hideEmpty = false;
5154
@Input()
5255
hideKeys: string[] = []; // Keys of pairs not to be displayed
56+
@Input()
57+
showMultiLineText = false; // If true, the value field will use a template that supports multi line text
58+
@Input()
59+
multilineTextKeys: string[] = []; // Keys whose values are rendered with the multi-line template; only consulted when showMultiLineText is true. Defaults to empty so an unbound input never reaches the template as undefined.
5360

5461
// If set, the classAddingTpl is used to enable different css for different values
5562
@Input()
@@ -81,6 +88,9 @@ export class TableKeyValueComponent implements OnInit, OnChanges {
8188
if (this.customCss) {
8289
this.columns[1].cellTransformation = CellTemplate.classAdding;
8390
}
91+
if (this.showMultiLineText) {
92+
this.columns[1].cellTemplate = this.valueCellTpl;
93+
}
8494
// We need to subscribe the 'fetchData' event here and not in the
8595
// HTML template, otherwise the data table will display the loading
8696
// indicator infinitely if data is only bound via '[data]="xyz"'.

0 commit comments

Comments
 (0)