@@ -62,6 +62,155 @@ let get_master ~rpc ~session_id =
   let pool = get_pool ~rpc ~session_id in
   Client.Pool.get_master ~rpc ~session_id ~self:pool
 
+(* MTU diagnostics during pool join - CA-384228
+ *
+ * This provides visibility into MTU issues but does NOT block pool join because:
+ * 1. ICMP may be blocked by firewalls, causing false negatives
+ * 2. TCP PMTUD (net.ipv4.tcp_mtu_probing=1) is now enabled by default and handles
+ *    MTU mismatches automatically at the TCP layer
+ * 3. TCP PMTUD works even when ICMP is blocked (detects via packet loss + retries)
+ *)
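+(* Note: the effective setting can be inspected on the host at
+   /proc/sys/net/ipv4/tcp_mtu_probing (0 = disabled, 1 = enabled when an ICMP
+   black hole is detected, 2 = always enabled). *)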
+let check_mtu_connectivity ~__context ~rpc ~session_id ~master_address
+    ~master_host =
+  (* Query the master's management PIF to get the actual configured MTU *)
+  let master_mgmt_pif =
+    Client.Host.get_management_interface ~rpc ~session_id ~host:master_host
+  in
+  let master_network =
+    Client.PIF.get_network ~rpc ~session_id ~self:master_mgmt_pif
+  in
+  let configured_mtu =
+    Client.Network.get_MTU ~rpc ~session_id ~self:master_network
+  in
+  (* Check if management interface is on a VLAN *)
+  let vlan_tag = Client.PIF.get_VLAN ~rpc ~session_id ~self:master_mgmt_pif in
+  (* VLAN adds 4 bytes *)
+  let vlan_overhead = if vlan_tag >= 0L then 4 else 0 in
+
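+  (* 1500 bytes is the standard Ethernet MTU; anything larger (jumbo frames)
+     only works if every hop between this host and the master supports it *)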
+  let has_higher_mtu = configured_mtu > 1500L in
+
+  debug
+    "MTU diagnostics: configured MTU=%Ld on master's management network to \
+     master %s%s. TCP PMTUD enabled via sysctl - will auto-adjust if path MTU \
+     is smaller"
+    configured_mtu master_address
+    (if vlan_overhead > 0 then " (VLAN detected)" else "") ;
+
+  (* Calculate ICMP payload sizes dynamically:
+     ICMP payload = MTU - IP header (20) - ICMP header (8) - VLAN tag (4 if present)
+     Always test standard 1500 MTU, and test configured MTU if different *)
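+  (* Worked example: a standard 1500-byte MTU on a VLAN gives
+     1500 - 20 - 8 - 4 = 1468 bytes of ICMP payload, while a 9000-byte jumbo
+     MTU would give 9000 - 20 - 8 - 4 = 8968 bytes *)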
+  let ip_overhead = 20 in
+  let icmp_overhead = 8 in
+  let standard_mtu_icmp_payload =
+    1500 - ip_overhead - icmp_overhead - vlan_overhead
+  in
+  let configured_mtu_icmp_payload =
+    Int64.to_int configured_mtu - ip_overhead - icmp_overhead - vlan_overhead
+  in
+
+  (* Test MTU connectivity using ping - ICMP-based, informational only *)
+  let test_ping size desc =
+    try
+      let timeout = Mtime.Span.(3 * s) in
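+      (* ping flags: -c 3 sends three probes, -M do sets the DF bit so packets
+         must not be fragmented, -s gives the ICMP payload size in bytes, and
+         -W 1 waits at most one second for each reply *)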
+      let _stdout, _stderr =
+        Forkhelpers.execute_command_get_output ~timeout "/usr/bin/ping"
+          [
+            "-c"
+          ; "3"
+          ; "-M"
+          ; "do"
+          ; "-s"
+          ; string_of_int size
+          ; "-W"
+          ; "1"
+          ; master_address
+          ]
+      in
+      debug "MTU diagnostics: %s test PASSED (ICMP payload %d bytes)" desc size ;
+      true
+    with e ->
+      debug "MTU diagnostics: %s test FAILED (ICMP payload %d bytes): %s" desc
+        size
+        (ExnHelper.string_of_exn e) ;
+      false
+  in
+
+  let standard_ok = test_ping standard_mtu_icmp_payload "standard MTU (1500)" in
+
+  (* Check MTU connectivity and report results *)
+  if has_higher_mtu then
+    let configured_ok =
+      test_ping configured_mtu_icmp_payload
+        (Printf.sprintf "configured MTU (%Ld)" configured_mtu)
+    in
+    match (standard_ok, configured_ok) with
+    | true, false -> (
+        (* CA-384228 scenario: standard works but configured MTU fails *)
+        let msg_body =
+          Printf.sprintf
+            "Higher MTU (%Ld) configured but network path does not support it! \
+             Standard MTU (1500) works, but configured MTU fails. This can \
+             cause TCP connection hangs during pool operations with large \
+             requests. TCP PMTUD (net.ipv4.tcp_mtu_probing=1) is enabled and \
+             should handle this automatically, but if you experience hangs, \
+             consider reducing MTU to 1500 or fixing network infrastructure."
+            configured_mtu
+        in
+        warn "MTU diagnostics: MTU CONFIGURATION ISSUE DETECTED (CA-384228): %s"
+          msg_body ;
+        (* Create pool-level alert on master's pool for customer visibility.
+           Use try-catch to ensure alert creation failure doesn't break pool join. *)
+        try
+          let master_pool =
+            match Client.Pool.get_all ~rpc ~session_id with
+            | [] ->
+                failwith "No pool found on master"
+            | pool :: _ ->
+                pool
+          in
+          let master_pool_uuid =
+            Client.Pool.get_uuid ~rpc ~session_id ~self:master_pool
+          in
+          let name, priority = Api_messages.pool_mtu_mismatch_detected in
+          Client.Message.create ~rpc ~session_id ~name ~priority ~cls:`Pool
+            ~obj_uuid:master_pool_uuid ~body:msg_body
+          |> ignore
+        with e ->
+          warn "MTU diagnostics: Failed to create alert on master pool: %s"
+            (ExnHelper.string_of_exn e)
+      )
+    | false, false ->
+        (* Both tests failed - ICMP may be blocked *)
+        warn
+          "MTU diagnostics: Both standard MTU (1500) and configured MTU (%Ld) \
+           tests failed (ICMP may be blocked). If ICMP is blocked, ignore this \
+           - TCP PMTUD will handle it. If ICMP is NOT blocked, check network \
+           connectivity to master %s"
+          configured_mtu master_address
+    | false, true ->
+        (* Unusual: standard failed but configured MTU passed *)
+        warn
+          "MTU diagnostics: Unusual result - standard MTU (1500) failed but \
+           configured MTU (%Ld) passed (likely ICMP issue). TCP PMTUD will \
+           handle this - monitor for issues"
+          configured_mtu
+    | true, true ->
+        (* Both tests passed - ideal case *)
+        debug
+          "MTU diagnostics: Both standard MTU (1500) and configured MTU (%Ld) \
+           tests passed - network path fully supports configured MTU"
+          configured_mtu
+  else if not standard_ok then
+    warn
+      "MTU diagnostics: Standard MTU (1500) test failed (ICMP may be blocked \
+       or connectivity issue to %s). TCP PMTUD will handle this - monitor for \
+       issues"
+      master_address
+  else
+    debug
+      "MTU diagnostics: Standard MTU (1500) test passed, no higher MTU \
+       configured"
+
 (* Pre-join asserts *)
 let pre_join_checks ~__context ~rpc ~session_id ~force =
   (* I cannot join a Pool unless my management interface exists in the db, otherwise
@@ -1631,6 +1780,7 @@ let join_common ~__context ~master_address ~master_username ~master_password
          side. If we're trying to join a host that does not support pooling
          then an error will be thrown at this stage *)
       pre_join_checks ~__context ~rpc:unverified_rpc ~session_id ~force ;
+
       (* get hold of cluster secret - this is critical; if this fails whole pool join fails *)
       new_pool_secret :=
         Client.Pool.initial_auth ~rpc:unverified_rpc ~session_id ;
@@ -1665,6 +1815,10 @@ let join_common ~__context ~master_address ~master_username ~master_password
       in
 
       let remote_coordinator = get_master ~rpc ~session_id in
+
+      check_mtu_connectivity ~__context ~rpc ~session_id ~master_address
+        ~master_host:remote_coordinator ;
+
       (* If management is on a VLAN, then get the Pool master
          management network bridge before we logout the session *)
       let pool_master_bridge, mgmt_pif =