Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions ocaml/xapi-consts/api_messages.ml
Original file line number Diff line number Diff line change
Expand Up @@ -376,3 +376,5 @@ let all_running_vms_in_anti_affinity_grp_on_single_host =
addMessage "ALL_RUNNING_VMS_IN_ANTI_AFFINITY_GRP_ON_SINGLE_HOST" 3L

let sm_gc_no_space = addMessage "SM_GC_NO_SPACE" 3L

let pool_mtu_mismatch_detected = addMessage "POOL_MTU_MISMATCH_DETECTED" 3L
154 changes: 154 additions & 0 deletions ocaml/xapi/xapi_pool.ml
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,155 @@ let get_master ~rpc ~session_id =
let pool = get_pool ~rpc ~session_id in
Client.Pool.get_master ~rpc ~session_id ~self:pool

(* MTU diagnostics during pool join - CA-384228
*
* This provides visibility into MTU issues but does NOT block pool join because:
* 1. ICMP may be blocked by firewalls, causing false negatives
* 2. TCP PMTUD (net.ipv4.tcp_mtu_probing=1) is now enabled by default and handles
* MTU mismatches automatically at the TCP layer
* 3. TCP PMTUD works even when ICMP is blocked (detects via packet loss + retries)
*)
let check_mtu_connectivity ~__context ~rpc ~session_id ~master_address
~master_host =
(* Query the master's management PIF to get the actual configured MTU *)
let master_mgmt_pif =
Client.Host.get_management_interface ~rpc ~session_id ~host:master_host
in
let master_network =
Client.PIF.get_network ~rpc ~session_id ~self:master_mgmt_pif
in
let configured_mtu =
Client.Network.get_MTU ~rpc ~session_id ~self:master_network
in
(* Check if management interface is on a VLAN *)
let vlan_tag = Client.PIF.get_VLAN ~rpc ~session_id ~self:master_mgmt_pif in
(* VLAN adds 4 bytes *)
let vlan_overhead = if vlan_tag >= 0L then 4 else 0 in

let has_higher_mtu = configured_mtu > 1500L in

debug
"MTU diagnostics: configured MTU=%Ld on master's management network to \
master %s%s. TCP PMTUD enabled via sysctl - will auto-adjust if path MTU \
is smaller"
configured_mtu master_address
(if vlan_overhead > 0 then " (VLAN detected)" else "") ;

(* Calculate ICMP payload sizes dynamically:
ICMP payload = MTU - IP header (20) - ICMP header (8) - VLAN tag (4 if present)
Always test standard 1500 MTU, and test configured MTU if different *)
let ip_overhead = 20 in
let icmp_overhead = 8 in
let standard_mtu_icmp_payload =
1500 - ip_overhead - icmp_overhead - vlan_overhead
in
let configured_mtu_icmp_payload =
Int64.to_int configured_mtu - ip_overhead - icmp_overhead - vlan_overhead
in

(* Test MTU connectivity using ping - ICMP-based, informational only *)
let test_ping size desc =
try
let timeout = Mtime.Span.(3 * s) in
let _stdout, _stderr =
Forkhelpers.execute_command_get_output ~timeout "/usr/bin/ping"
[
"-c"
; "3"
; "-M"
; "do"
; "-s"
; string_of_int size
; "-W"
; "1"
; master_address
]
in
debug "MTU diagnostics: %s test PASSED (ICMP payload %d bytes)" desc size ;
true
with e ->
debug "MTU diagnostics: %s test FAILED (ICMP payload %d bytes): %s" desc
size
(ExnHelper.string_of_exn e) ;
false
in

let standard_ok = test_ping standard_mtu_icmp_payload "standard MTU (1500)" in

(* Check MTU connectivity and report results *)
if has_higher_mtu then
let configured_ok =
test_ping configured_mtu_icmp_payload
(Printf.sprintf "configured MTU (%Ld)" configured_mtu)
in
match (standard_ok, configured_ok) with
| true, false -> (
(* CA-384228 scenario: standard works but configured MTU fails *)
let msg_body =
Printf.sprintf
"Higher MTU (%Ld) configured but network path does not support it! \
Standard MTU (1500) works, but configured MTU fails. This can \
cause TCP connection hangs during pool operations with large \
requests. TCP PMTUD (net.ipv4.tcp_mtu_probing=1) is enabled and \
should handle this automatically, but if you experience hangs, \
consider reducing MTU to 1500 or fixing network infrastructure."
configured_mtu
in
warn "MTU diagnostics: MTU CONFIGURATION ISSUE DETECTED (CA-384228): %s"
msg_body ;
(* Create pool-level alert on master's pool for customer visibility.
Use try-catch to ensure alert creation failure doesn't break pool join. *)
try
let master_pool =
match Client.Pool.get_all ~rpc ~session_id with
| [] ->
failwith "No pool found on master"
| pool :: _ ->
pool
in
let master_pool_uuid =
Client.Pool.get_uuid ~rpc ~session_id ~self:master_pool
in
let name, priority = Api_messages.pool_mtu_mismatch_detected in
Client.Message.create ~rpc ~session_id ~name ~priority ~cls:`Pool
~obj_uuid:master_pool_uuid ~body:msg_body
|> ignore
with e ->
warn "MTU diagnostics: Failed to create alert on master pool: %s"
(ExnHelper.string_of_exn e)
)
| false, false ->
(* Both tests failed - ICMP may be blocked *)
warn
"MTU diagnostics: Both standard MTU (1500) and configured MTU (%Ld) \
tests failed (ICMP may be blocked). If ICMP is blocked, ignore this \
- TCP PMTUD will handle it. If ICMP is NOT blocked, check network \
connectivity to master %s"
configured_mtu master_address
| false, true ->
(* Unusual: standard failed but configured MTU passed *)
warn
"MTU diagnostics: Unusual result - standard MTU (1500) failed but \
configured MTU (%Ld) passed (likely ICMP issue). TCP PMTUD will \
handle this - monitor for issues"
configured_mtu
| true, true ->
(* Both tests passed - ideal case *)
debug
"MTU diagnostics: Both standard MTU (1500) and configured MTU (%Ld) \
tests passed - network path fully supports configured MTU"
configured_mtu
else if not standard_ok then
warn
"MTU diagnostics: Standard MTU (1500) test failed (ICMP may be blocked \
or connectivity issue to %s). TCP PMTUD will handle this - monitor for \
issues"
master_address
else
debug
"MTU diagnostics: Standard MTU (1500) test passed, no higher MTU \
configured"

(* Pre-join asserts *)
let pre_join_checks ~__context ~rpc ~session_id ~force =
(* I cannot join a Pool unless my management interface exists in the db, otherwise
Expand Down Expand Up @@ -1631,6 +1780,7 @@ let join_common ~__context ~master_address ~master_username ~master_password
side. If we're trying to join a host that does not support pooling
then an error will be thrown at this stage *)
pre_join_checks ~__context ~rpc:unverified_rpc ~session_id ~force ;

(* get hold of cluster secret - this is critical; if this fails whole pool join fails *)
new_pool_secret :=
Client.Pool.initial_auth ~rpc:unverified_rpc ~session_id ;
Expand Down Expand Up @@ -1665,6 +1815,10 @@ let join_common ~__context ~master_address ~master_username ~master_password
in

let remote_coordinator = get_master ~rpc ~session_id in

check_mtu_connectivity ~__context ~rpc ~session_id ~master_address
~master_host:remote_coordinator ;

(* If management is on a VLAN, then get the Pool master
management network bridge before we logout the session *)
let pool_master_bridge, mgmt_pif =
Expand Down
21 changes: 21 additions & 0 deletions scripts/92-xapi-tcp-mtu.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# TCP Path MTU Discovery settings
#
# These settings enable automatic TCP MTU detection to prevent issues when
# path MTU is smaller than configured interface MTU (e.g., jumbo frames
# configured but not supported by network path).
#
# References:
# - https://blog.cloudflare.com/path-mtu-discovery-in-practice/
# - CA-384228: Slave hang during pool join due to MTU mismatch
#
# net.ipv4.tcp_mtu_probing:
# 0 = disabled (default)
# 1 = enabled when ICMP blackhole detected (recommended)
# 2 = always enabled
#
# net.ipv4.tcp_base_mss:
# Base MSS to use for MTU probing (default 1024)
# This is the starting point for MTU probing when enabled

net.ipv4.tcp_mtu_probing = 1
net.ipv4.tcp_base_mss = 1024
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

configuration files has been a salient point of issues regarding user configuration in xcp-ng. I'm asking the platform teams whether this change follows their recommendations

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks @psafont , I understand your point.
Would you please share more about the good practice or recommendations? Thank you.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm expecting somebody from xcp-ng's platform team to share them here

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not from platform team, but indeed, system configuration files should likely be part of packages like, on xcp-ng side, xcp-ng-release.

  • Do we actually need a config file to enforce that?
  • Could it be a setting that xapi applies istead? Or would that not be possible to be achieved soon enough for slaves to be able to reach master?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What happens on xapi updates when this file has been changed manually? Would the next xapi update overwrite any change? We might want to add a line instructing not to change this file.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you manually edit a file in /etc/sysctl.d/ that is owned by a package, whether an update overwrites it is controlled by how the package marked that file:

%config(noreplace) → your edited file is kept, and the new packaged version is written as .rpmnew.
%config (without noreplace) → your edited file may be replaced, and your previous version is saved as .rpmsave.

Runtime precedence: When sysctl --system is run, settings from /etc/sysctl.d override vendor defaults from /usr/lib/sysctl.d. The last assignment wins if the same key is set multiple times.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This means, we should write this to /usr/lib/sysctl.d such that a user would put an overwrite into /etc/sysctl.d/?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The current implementation installs via xapi's Makefile to /etc/sysctl.d/ for these reasons:

  1. Tight coupling: This sysctl directly fixes xapi's CA-384228 hang issue. The diagnostic
    code added in commit 2 assumes TCP PMTUD is enabled.
  2. Immediate availability: Installed with xapi, no dependency on separate package coordination.
  3. Standard location: /etc/sysctl.d/ with priority 92 (after system defaults like 91-,
    before admin overrides like 99-
    ). Admins can override with higher-priority files if needed.
  4. Precedent in xapi: xapi's Makefile already installs various system configuration files.
    This follows the same pattern.

However, I understand the concerns about package ownership and platform-wide impact.

Alternative: We could move to /usr/lib/sysctl.d/92-xapi-tcp-mtu.conf instead, which is:

  • Package-owned location (not user config space)
  • Still loads at the right priority
  • Admin overrides in /etc/sysctl.d/ would automatically take precedence
  • More aligned with package management best practices

I'm happy to hear the preference from the platform team or other recommendations. Thanks!

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I read points 1, 2, 3 and 4, and pardon me if I'm imagining things, but it looks like they are justifications made by a LLM.

I'm not going to comment on them one by one but will instead try to think about the big picture.

So the main questions are:

  1. Whether to make the configuration file belong to the XAPI package or something else. It is required by XAPI for some functionality, and doesn't modify an existing system file, so in my opinion it can make sense to package it along with XAPI or xcp-networkd.
  2. How to manage potential user modifications to the file. Here I think we want the XAPI-provided file to not be altered at all, so that we can provide future updates to it. As @lindig mentioned, there's a directory for vendor defaults, so it seems that using /usr/lib/sysctl.d/ would be appropriate, if we can ensure that the files are loaded in the right order. If for some reason a user would had to override some settings, then they could drop a file in /etc.

@ydirson Does this look good to to you?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, 100%

2 changes: 2 additions & 0 deletions scripts/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@ install:
mkdir -p $(DESTDIR)/usr/lib/yum-plugins
mkdir -p $(DESTDIR)$(OPTDIR)/packages/post-install-scripts
mkdir -p $(DESTDIR)/etc/systemd/system/[email protected]/
mkdir -p $(DESTDIR)/etc/sysctl.d
$(IPROG) base-path $(DESTDIR)/etc/xapi.d
$(IDATA) 92-xapi-tcp-mtu.conf $(DESTDIR)/etc/sysctl.d/92-xapi-tcp-mtu.conf
$(IPROG) sm_diagnostics $(DESTDIR)$(LIBEXECDIR)
$(IPROG) xn_diagnostics $(DESTDIR)$(LIBEXECDIR)
$(IPROG) thread_diagnostics $(DESTDIR)$(LIBEXECDIR)
Expand Down
Loading