@@ -505,80 +505,83 @@ tests:
505505 summary : Host filesystem free space is getting low on cluster mycluster
506506 description : " Mountpoint /rootfs on node-1.unittests.com will be full in less than 5 days based on the 48 hour trailing fill rate."
507507
508- # MTU Mismatch
508+ # MTU Mismatch
509509 - interval : 1m
510510 input_series :
511- - series : ' node_network_mtu_bytes{device="eth0",instance="node-exporter ",job="node-exporter",cluster="mycluster"}'
511+ - series : ' node_network_mtu_bytes{device="eth0",instance="node1 ",job="node-exporter",cluster="mycluster"}'
512512 values : ' 1500 1500 1500 1500 1500'
513- - series : ' node_network_mtu_bytes{device="eth1",instance="node-exporter ",job="node-exporter",cluster="mycluster"}'
513+ - series : ' node_network_mtu_bytes{device="eth1",instance="node1 ",job="node-exporter",cluster="mycluster"}'
514514 values : ' 1500 1500 1500 1500 1500'
515- - series : ' node_network_mtu_bytes{device="eth2",instance="node-exporter ",job="node-exporter",cluster="mycluster"}'
515+ - series : ' node_network_mtu_bytes{device="eth2",instance="node1 ",job="node-exporter",cluster="mycluster"}'
516516 values : ' 1500 1500 1500 1500 1500'
517- - series : ' node_network_mtu_bytes{device="eth3",instance="node-exporter ",job="node-exporter",cluster="mycluster"}'
517+ - series : ' node_network_mtu_bytes{device="eth3",instance="node1 ",job="node-exporter",cluster="mycluster"}'
518518 values : ' 1500 1500 1500 1500 1500'
519- - series : ' node_network_mtu_bytes{device="eth4",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
519+
520+ - series : ' node_network_mtu_bytes{device="eth4",instance="node1",job="node-exporter",cluster="mycluster"}'
521+ values : ' 1500 1500 1500 1500 1500'
522+ - series : ' node_network_mtu_bytes{device="eth4",instance="node2",job="node-exporter",cluster="mycluster"}'
520523 values : ' 9000 9000 9000 9000 9000'
521- - series : ' node_network_mtu_bytes{device="eth4",instance="hostname1",job="node-exporter",cluster="mycluster"}'
522- values : ' 2200 2200 2200 2200 2200'
523- - series : ' node_network_mtu_bytes{device="eth4",instance="hostname2",job="node-exporter",cluster="mycluster"}'
524- values : ' 2400 2400 2400 2400 2400'
525- - series : ' node_network_up{device="eth0",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
526- values : ' 0 0 0 0 0'
527- - series : ' node_network_up{device="eth1",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
528- values : ' 0 0 0 0 0'
529- - series : ' node_network_up{device="eth2",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
524+ - series : ' node_network_mtu_bytes{device="eth4",instance="node3",job="node-exporter",cluster="mycluster"}'
525+ values : ' 9000 9000 9000 9000 9000'
526+
527+ - series : ' node_network_up{device="eth0",instance="node1",job="node-exporter",cluster="mycluster"}'
530528 values : ' 1 1 1 1 1'
531- - series : ' node_network_up{device="eth3 ",instance="node-exporter ",job="node-exporter",cluster="mycluster"}'
529+ - series : ' node_network_up{device="eth1 ",instance="node1 ",job="node-exporter",cluster="mycluster"}'
532530 values : ' 1 1 1 1 1'
533- - series : ' node_network_up{device="eth4 ",instance="node-exporter ",job="node-exporter",cluster="mycluster"}'
531+ - series : ' node_network_up{device="eth2 ",instance="node1 ",job="node-exporter",cluster="mycluster"}'
534532 values : ' 1 1 1 1 1'
535- - series : ' node_network_up{device="eth4",instance="hostname1",job="node-exporter",cluster="mycluster"}'
533+ - series : ' node_network_up{device="eth3",instance="node1",job="node-exporter",cluster="mycluster"}'
534+ values : ' 1 1 1 1 1'
535+ - series : ' node_network_up{device="eth4",instance="node1",job="node-exporter",cluster="mycluster"}'
536+ values : ' 1 1 1 1 1'
537+ - series : ' node_network_up{device="eth4",instance="node2",job="node-exporter",cluster="mycluster"}'
538+ values : ' 1 1 1 1 1'
539+ - series : ' node_network_up{device="eth4",instance="node3",job="node-exporter",cluster="mycluster"}'
536540 values : ' 1 1 1 1 1'
537- - series : ' node_network_up{device="eth4",instance="hostname2",job="node-exporter",cluster="mycluster"}'
538- values : ' 0 0 0 0 0'
539541 promql_expr_test :
540542 - expr : |
541- node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) ==
542- scalar(
543- max by (cluster,device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) !=
544- quantile by (cluster,device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))
545- )
546- or
547- node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) ==
548- scalar(
549- min by (cluster,device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) !=
550- quantile by (cluster,device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))
551- )
543+ node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)
544+ != on (cluster, device) group_left
545+ quantile by (cluster, device) (
546+ 0.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)
547+ )
552548 eval_time: 1m
553549 exp_samples:
554- - labels: '{device="eth4", instance="node-exporter", job="node-exporter", cluster="mycluster"}'
555- value: 9000
556- - labels: '{device="eth4", instance="hostname1", job="node-exporter", cluster="mycluster"}'
557- value: 2200
550+ - labels: '{device="eth4",instance="node1",job="node-exporter",cluster="mycluster"}'
551+ value: 1500
558552 alert_rule_test :
559553 - eval_time : 1m
560554 alertname : CephNodeInconsistentMTU
561555 exp_alerts :
562556 - exp_labels :
563557 device : eth4
564- instance : hostname1
558+ instance : node1
565559 job : node-exporter
566560 severity : warning
567561 type : ceph_default
568562 cluster : " mycluster"
569563 exp_annotations :
570- summary : MTU settings across Ceph hosts are inconsistent on cluster mycluster
571- description : " Node hostname1 has a different MTU size (2200) than the median of devices named eth4."
572- - exp_labels :
573- device : eth4
574- instance : node-exporter
575- job : node-exporter
576- severity : warning
577- type : ceph_default
578- cluster : " mycluster"
579- exp_annotations :
580- summary : MTU settings across Ceph hosts are inconsistent on cluster mycluster
581- description : " Node node-exporter has a different MTU size (9000) than the median of devices named eth4."
564+ summary : " Node node1 has inconsistent MTU settings in cluster mycluster"
565+ description : " Network interface eth4 on node node1 has MTU 1500 which differs from the cluster median."
566+ impact : |
567+ - May cause packet fragmentation or packet drops
568+ - Risk of degraded cluster communication and performance
569+ - Potential instability in services relying on consistent networking (e.g., Ceph, Kubernetes)
570+ fix : |
571+ - Check the MTU of interface `eth4` on node `node1`:
572+ ip link show eth4
573+
574+ - Find the median MTU value across the cluster by running this PromQL query in Prometheus:
575+ quantile by (cluster, device) (0.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))
576+
577+ - Standardize MTU across all nodes to match the median (commonly 1500 or 9000):
578+ ip link set dev eth4 mtu <median-value>
579+
580+ - Make MTU setting persistent:
581+ - RHEL/CentOS: edit `/etc/sysconfig/network-scripts/ifcfg-<device>`
582+ - Debian/Ubuntu: edit `/etc/netplan/*.yaml` and apply with `netplan apply`
583+
584+ - Restart the affected interface or node if required.
582585
583586 # pool full: the data series has 6 entries but topk(5) is used, so ensure the
584587 # results are as expected
0 commit comments