Skip to content

Commit d1c1c62

Browse files
committed
Register NVLink-domain AG groups before IB-domain HSDP group in UBR
Signed-off-by: jeffnvidia <jmahou@nvidia.com>
1 parent 3df8ef5 commit d1c1c62

File tree

1 file changed

+3
-3
lines changed

1 file changed

+3
-3
lines changed

megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1691,9 +1691,6 @@ def __init__(
1691 1691
if self.dist_index.get_fsdp_group(is_expert_parallel=True) is not None:
1692 1692
# Expert-DP group when using EP
1693 1693
self.ubr_groups.append(self.dist_index.get_fsdp_group(is_expert_parallel=True))
1694-
if self.dist_index.get_outer_fsdp_group() is not None:
1695-
# Outer/Inter-FSDP group when using hybrid FSDP
1696-
self.ubr_groups.append(self.dist_index.get_outer_fsdp_group())
1697 1694
if (
1698 1695
self.dist_index.get_fsdp_group(
1699 1696
is_expert_parallel=False, independent_all_gather=True
@@ -1716,6 +1713,9 @@ def __init__(
1716 1713
is_expert_parallel=True, independent_all_gather=True
1717 1714
)
1718 1715
)
1716+
if self.dist_index.get_outer_fsdp_group() is not None:
1717+
# Outer/Inter-FSDP group when using hybrid FSDP (IB domain, registered last).
1718+
self.ubr_groups.append(self.dist_index.get_outer_fsdp_group())
1719 1719

1720 1720
log_single_rank(
1721 1721
logger,

0 commit comments

Comments
 (0)