@@ -62,6 +62,113 @@ let get_master ~rpc ~session_id =
6262 let pool = get_pool ~rpc ~session_id in
6363 Client.Pool. get_master ~rpc ~session_id ~self: pool
6464
(* MTU diagnostics during pool join - CA-384228
 *
 * This provides visibility into MTU issues but does NOT block pool join because:
 * 1. ICMP may be blocked by firewalls, causing false negatives
 * 2. TCP PMTUD (net.ipv4.tcp_mtu_probing=1) is now enabled by default and handles
 *    MTU mismatches automatically at the TCP layer
 * 3. TCP PMTUD works even when ICMP is blocked (detects via packet loss + retries)
 *
 * The check is strictly best-effort: every failure (including a failure to
 * query the master's MTU over RPC) is logged and swallowed, so this function
 * always returns unit and never raises.
 *
 * Parameters:
 * - __context: unused; kept so the call site's labelled arguments still match
 * - rpc, session_id: used for the Client.* queries against the master
 * - master_address: address handed to ping as the probe target
 * - master_host: host reference whose management interface's network MTU is
 *   read
 *)
let check_mtu_connectivity ~__context ~rpc ~session_id ~master_address
    ~master_host =
  (* Bytes taken out of the MTU by the IPv4 header (20) and the ICMP echo
     header (8).
     NOTE(review): this assumes an IPv4 master_address; an IPv6 header is 40
     bytes - confirm whether IPv6 management networks need handling here. *)
  let icmp_overhead = 28 in
  let standard_mtu = 1500L in
  let icmp_payload_of_mtu mtu = Int64.to_int mtu - icmp_overhead in

  (* Test MTU connectivity using ping - ICMP-based, informational only.
     "-M do" sets the Don't Fragment bit so an oversized packet is rejected
     rather than silently fragmented. Returns true iff the ping command
     completed without raising; any exception (non-zero exit, timeout, spawn
     failure) is logged at debug level and reported as false. *)
  let test_ping size desc =
    try
      (* 3 seconds, expressed directly in nanoseconds *)
      let timeout = Mtime.Span.of_uint64_ns 3_000_000_000L in
      let _stdout, _stderr =
        Forkhelpers.execute_command_get_output ~timeout "/usr/bin/ping"
          [
            "-c"
          ; "3"
          ; "-M"
          ; "do"
          ; "-s"
          ; string_of_int size
          ; "-W"
          ; "1"
          ; master_address
          ]
      in
      debug "MTU diagnostics: %s test PASSED (size=%d)" desc size ;
      true
    with e ->
      debug "MTU diagnostics: %s test FAILED (size=%d): %s" desc size
        (ExnHelper.string_of_exn e) ;
      false
  in

  let run_diagnostics () =
    (* Query the master's management PIF to get the actual configured MTU *)
    let master_mgmt_pif =
      Client.Host.get_management_interface ~rpc ~session_id ~host:master_host
    in
    let master_network =
      Client.PIF.get_network ~rpc ~session_id ~self:master_mgmt_pif
    in
    let configured_mtu =
      Client.Network.get_MTU ~rpc ~session_id ~self:master_network
    in
    let is_jumbo = configured_mtu > standard_mtu in

    debug
      "MTU diagnostics: configured MTU=%Ld on master's management network to \
       master %s. TCP PMTUD enabled via sysctl - will auto-adjust if path MTU \
       is smaller"
      configured_mtu master_address ;

    (* 1472 = 1500 (standard MTU) - 20 (IP header) - 8 (ICMP header) *)
    let standard_ok =
      test_ping (icmp_payload_of_mtu standard_mtu) "standard MTU"
    in

    (* Check MTU connectivity and report results *)
    if is_jumbo then (
      (* Probe with the MTU that is actually configured (8972 bytes for the
         common 9000 case) instead of a fixed size, so that e.g. MTU 4000 is
         not reported as broken just because 9000-byte frames do not fit. *)
      let jumbo_ok =
        test_ping (icmp_payload_of_mtu configured_mtu) "jumbo frame"
      in
      match (standard_ok, jumbo_ok) with
      | true, false ->
          (* CA-384228 scenario: standard works but jumbo fails *)
          warn
            "MTU diagnostics: MTU CONFIGURATION ISSUE DETECTED (CA-384228): \
             Jumbo frames (MTU %Ld) configured but network path does not \
             support them! Standard MTU works, but jumbo frames fail. This \
             can cause TCP connection hangs during pool operations with large \
             requests. TCP PMTUD (net.ipv4.tcp_mtu_probing=1) is enabled and \
             should handle this automatically, but if you experience hangs, \
             consider reducing MTU to 1500 or fixing network infrastructure. \
             Reference: \
             https://blog.cloudflare.com/path-mtu-discovery-in-practice/"
            configured_mtu
      | false, false ->
          (* Both tests failed - ICMP may be blocked *)
          warn
            "MTU diagnostics: Both standard and jumbo MTU tests failed (ICMP \
             may be blocked). If ICMP is blocked, ignore this - TCP PMTUD \
             will handle it. If ICMP is NOT blocked, check network \
             connectivity to master %s"
            master_address
      | false, true ->
          (* Unusual: standard failed but jumbo passed *)
          warn
            "MTU diagnostics: Unusual result - standard MTU failed but jumbo \
             frames passed (likely ICMP issue). TCP PMTUD will handle this - \
             monitor for issues"
      | true, true ->
          (* Both tests passed - ideal case *)
          debug
            "MTU diagnostics: Both standard and jumbo frame tests passed - \
             network path fully supports configured MTU"
    ) else if not standard_ok then
      warn
        "MTU diagnostics: Standard MTU test failed (ICMP may be blocked or \
         connectivity issue to %s). TCP PMTUD will handle this - monitor for \
         issues"
        master_address
    else
      debug
        "MTU diagnostics: Standard MTU test passed, no jumbo frames configured"
  in

  (* The diagnostics are informational only, so a failure to gather them
     (for instance one of the RPC lookups raising) must not abort the join. *)
  try run_diagnostics ()
  with e ->
    warn
      "MTU diagnostics: could not complete MTU checks against master %s: %s. \
       Ignoring - diagnostics are informational only"
      master_address
      (ExnHelper.string_of_exn e)
171+
65172(* Pre-join asserts *)
66173let pre_join_checks ~__context ~rpc ~session_id ~force =
67174 (* I cannot join a Pool unless my management interface exists in the db, otherwise
@@ -1631,6 +1738,7 @@ let join_common ~__context ~master_address ~master_username ~master_password
16311738 side. If we're trying to join a host that does not support pooling
16321739 then an error will be thrown at this stage *)
16331740 pre_join_checks ~__context ~rpc: unverified_rpc ~session_id ~force ;
1741+
16341742 (* get hold of cluster secret - this is critical; if this fails whole pool join fails *)
16351743 new_pool_secret :=
16361744 Client.Pool. initial_auth ~rpc: unverified_rpc ~session_id ;
@@ -1665,6 +1773,10 @@ let join_common ~__context ~master_address ~master_username ~master_password
16651773 in
16661774
16671775 let remote_coordinator = get_master ~rpc ~session_id in
1776+
1777+ check_mtu_connectivity ~__context ~rpc ~session_id ~master_address
1778+ ~master_host: remote_coordinator ;
1779+
16681780 (* If management is on a VLAN, then get the Pool master
16691781 management network bridge before we logout the session *)
16701782 let pool_master_bridge, mgmt_pif =
0 commit comments