Enable AG/RS overlap with explicit process group passing

jeffnvidia · jeffnvidia · commit cfdfe8b993e7 · 2026-02-04T16:42:06.000+02:00
diff --git a/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py b/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py
@@ -242,6 +242,12 @@ def _init_dist_index(self, pg_collection):
             single_rank_group = dist.new_group(ranks=[dist.get_rank()])
             expt_tp_group = single_rank_group
 
+        # Extract AG groups from pg_collection for explicit passing
+        dp_cp_ag = getattr(pg_collection, 'dp_cp_ag', None) if pg_collection is not None else None
+        expt_dp_ag = (
+            getattr(pg_collection, 'expt_dp_ag', None) if pg_collection is not None else None
+        )
+
         if enable_hsdp:
             mesh = _get_hsdp_tp_mesh(outer_fsdp_group, dp_cp_group, tp_group)
             dist_index = FSDPDistributedIndex(
@@ -256,6 +262,8 @@ def _init_dist_index(self, pg_collection):
                 dp_shard_dim="dp_cp",
                 tp_dim="tp",
                 hybrid_fsdp_group=hybrid_fsdp_group,
+                fsdp_group_ag=dp_cp_ag,
+                expt_fsdp_group_ag=expt_dp_ag,
             )
         else:
             if ep_group is not None:
@@ -280,6 +288,8 @@ def _init_dist_index(self, pg_collection):
                 dp_shard_dim="dp_cp",
                 tp_dim="tp",
                 expt_device_mesh=expt_device_mesh,
+                fsdp_group_ag=dp_cp_ag,
+                expt_fsdp_group_ag=expt_dp_ag,
             )
 
         self.tp_group = tp_group
diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py
@@ -78,6 +78,8 @@ def fully_shard_model(
     tp_dim: Optional[str] = None,
     hybrid_fsdp_group: Optional[torch.distributed.ProcessGroup] = None,
     expt_device_mesh: Optional[DeviceMesh] = None,
+    fsdp_group_ag: Optional[torch.distributed.ProcessGroup] = None,
+    expt_fsdp_group_ag: Optional[torch.distributed.ProcessGroup] = None,
     fsdp_unit_modules: Optional[Sequence[Type[torch.nn.Module]] | Sequence[str]] = None,
     zero_dp_strategy: str | int = 3,
     outer_dp_sharding_strategy: str | int = 0,
@@ -139,6 +141,17 @@ class that schedules the sharding lifecycle of the model parameters and gradient
             Expert parallel device mesh object defining the topology for MoE distributed training.
             Utilizes the mesh dimension names specified by the *_dim arguments.
 
+        fsdp_group_ag (Optional[torch.distributed.ProcessGroup]):
+            Independent all-gather process group for overlapping all-gather and reduce-scatter
+            operations. When provided, enables AG/RS overlap optimization for regular (non-expert)
+            parameters. Users should create this group with the same ranks as the dp-cp group.
+            Defaults to None.
+
+        expt_fsdp_group_ag (Optional[torch.distributed.ProcessGroup]):
+            Independent all-gather process group for expert parameters in MoE models. When provided,
+            enables AG/RS overlap optimization for expert parameters. Users should create this group
+            with the same ranks as the expert data parallel group. Defaults to None.
+
         fsdp_unit_modules (Optional[Sequence[Type[torch.nn.Module]] | Sequence[str]]):
             List of (sub-)module classes or (sub-)module class import paths that are "units",
             which are torch.nn.Module(s) that are sharded and scheduled by Megatron-FSDP.
@@ -356,6 +369,9 @@ class that schedules the sharding lifecycle of the model parameters and gradient
         hsdp_outer_dp_shard=_outer_fsdp_sharding,
         # Only required for Megatron-FSDP + EP.
         expt_device_mesh=expt_device_mesh,
+        # AG groups for AG/RS overlap optimization.
+        fsdp_group_ag=fsdp_group_ag,
+        expt_fsdp_group_ag=expt_fsdp_group_ag,
     )
 
     # Wrap model in Megatron FSDP.
@@ -522,6 +538,8 @@ def fully_shard(
     tp_dim: Optional[str] = None,
     hybrid_fsdp_group: Optional[torch.distributed.ProcessGroup] = None,
     expt_device_mesh: Optional[DeviceMesh] = None,
+    fsdp_group_ag: Optional[torch.distributed.ProcessGroup] = None,
+    expt_fsdp_group_ag: Optional[torch.distributed.ProcessGroup] = None,
     fsdp_unit_modules: Optional[Sequence[Type[torch.nn.Module]] | Sequence[str]] = None,
     zero_dp_strategy: str | int = 3,
     outer_dp_sharding_strategy: str | int = 0,
@@ -569,6 +587,8 @@ def fully_shard(
         tp_dim=tp_dim,
         hybrid_fsdp_group=hybrid_fsdp_group,
         expt_device_mesh=expt_device_mesh,
+        fsdp_group_ag=fsdp_group_ag,
+        expt_fsdp_group_ag=expt_fsdp_group_ag,
         fsdp_unit_modules=fsdp_unit_modules,
         zero_dp_strategy=zero_dp_strategy,
         outer_dp_sharding_strategy=outer_dp_sharding_strategy,
diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py
@@ -1631,6 +1631,16 @@ def __init__(
                         is_expert_parallel=False, independent_all_gather=True
                     )
                 )
+            if (
+                self.dist_index.get_fsdp_group(is_expert_parallel=True, independent_all_gather=True)
+                is not None
+            ):
+                # Expert all-gather group used when overlapping all-gather and gradient reduction.
+                self.ubr_groups.append(
+                    self.dist_index.get_fsdp_group(
+                        is_expert_parallel=True, independent_all_gather=True
+                    )
+                )
 
             log_single_rank(
                 logger,
@@ -1920,14 +1930,14 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params):
                     is_expert_parallel=group.is_expert_param
                 )
 
-            # When --create-all-gather-group is enabled, use a separate process group for
-            # all-gather operations (model_weight_buffer) to enable overlap with gradient reduction
-            # operations (main_grad_buffer). This avoids head-of-line blocking between forward
-            # all-gather and backward reduce-scatter on the same communicator.
+            # Use separate process group for all-gather operations (model_weight_buffer)
+            # to enable overlap with gradient reduction operations (main_grad_buffer).
+            # This avoids head-of-line blocking between forward all-gather and backward
+            # reduce-scatter on the same communicator.
             model_wbuf_dp_group = main_buf_dp_group
-            if not group.is_expert_param and not should_create_hfsdp_wbuf_and_gbuf:
+            if not should_create_hfsdp_wbuf_and_gbuf:
                 ag_group = self.dist_index.get_fsdp_group(
-                    is_expert_parallel=False, independent_all_gather=True
+                    is_expert_parallel=group.is_expert_param, independent_all_gather=True
                 )
                 if ag_group is not None:
                     model_wbuf_dp_group = ag_group
diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py
@@ -21,13 +21,6 @@
 from importlib.metadata import version
 from typing import Callable, Optional, Sequence, Union
 
-try:
-    import megatron.core.parallel_state as parallel_state
-
-    HAVE_MEGATRON_CORE = True
-except (ImportError, ModuleNotFoundError):
-    HAVE_MEGATRON_CORE = False
-
 try:
     import einops
 
@@ -452,6 +445,8 @@ def __init__(
         hybrid_fsdp_group: Optional[torch.distributed.ProcessGroup] = None,
         hsdp_outer_dp_shard: bool = False,
         expt_device_mesh: Optional[DeviceMesh] = None,
+        fsdp_group_ag: Optional[torch.distributed.ProcessGroup] = None,
+        expt_fsdp_group_ag: Optional[torch.distributed.ProcessGroup] = None,
     ):
         """
         Args:
@@ -470,6 +465,13 @@ def __init__(
                 just sharding across dp_shard ranks and replicating across dp_outer ranks.
             expt_device_mesh (Optional[DeviceMesh]): The expert parallel device mesh
                 to use for the DistributedIndex.
+            fsdp_group_ag (Optional[torch.distributed.ProcessGroup]): Independent all-gather
+                process group for overlapping all-gather and reduce-scatter operations.
+                When provided, enables AG/RS overlap optimization for regular (non-expert)
+                parameters.
+            expt_fsdp_group_ag (Optional[torch.distributed.ProcessGroup]): Independent all-gather
+                process group for expert parameters in MoE models. When provided, enables AG/RS
+                overlap optimization for expert parameters.
         """
         # Device mesh arguments.
         self.device_mesh = device_mesh
@@ -493,13 +495,10 @@ def __init__(
             if contains_submesh(self.device_mesh, self.dp_shard_dim)
             else None
         )
-        # AG group comes from parallel_state, not the mesh
-        # the purpose of this independent group is to overlap all-gather and gradient reduction.
-        self.fsdp_group_ag = None
-        if HAVE_MEGATRON_CORE and parallel_state.has_separate_all_gather_group():
-            self.fsdp_group_ag = parallel_state.get_data_parallel_group(
-                with_context_parallel=True, independent_all_gather=True
-            )
+        # Independent all-gather (AG) groups are supplied explicitly by the caller.
+        # Their purpose is to let all-gather overlap with reduce-scatter.
+        self.fsdp_group_ag = fsdp_group_ag
+        self.expt_fsdp_group_ag = expt_fsdp_group_ag
         # Retrieve the outer-FSDP process group from the DeviceMesh.
         self.outer_fsdp_group = (
             self.device_mesh[self.dp_outer_dim].get_group()
@@ -639,6 +638,8 @@ def get_fsdp_group(
     ) -> ProcessGroup:
         """Get the FSDP process group."""
         if is_expert_parallel:
+            if independent_all_gather:
+                return self.expt_fsdp_group_ag
             return self.expt_fsdp_group
         if independent_all_gather:
             return self.fsdp_group_ag
diff --git a/megatron/core/parallel_state.py b/megatron/core/parallel_state.py
@@ -120,7 +120,6 @@
 
 # Data parallel group information with context parallel combined.
 _DATA_PARALLEL_GROUP_WITH_CP = None
-_DATA_PARALLEL_GROUP_WITH_CP_AG = None
 _DATA_PARALLEL_GROUP_WITH_CP_GLOO = None
 _DATA_PARALLEL_GLOBAL_RANKS_WITH_CP = None
 
@@ -567,7 +566,6 @@ def initialize_model_parallel(
     create_gloo_process_groups: bool = True,
     high_priority_stream_groups: Optional[List[str]] = None,
     sharp_enabled_group: Optional[str] = None,
-    create_all_gather_group: Optional[bool] = False,
 ) -> None:
     """Initialize model data parallel groups.
 
@@ -682,13 +680,6 @@ def initialize_model_parallel(
             By default (None), it is enabled from dp group.
             Available options (choose one): [dp, dp_replica]
 
-        create_all_gather_group (bool, default = False):
-            Create a separate process group for all-gather operations to avoid
-            head-of-line blocking with reduce-scatter operations. When enabled,
-            creates an additional NCCL communicator with identical ranks as the
-            dp-cp group but with independent progress engines for better communication
-            overlap.
-
     Let's say we have a total of 16 GPUs denoted by g0 ... g15 and we
     use 2 GPUs to parallelize the model tensor, and 4 GPUs to parallelize
     the model pipeline. The present function will
@@ -825,7 +816,6 @@ def initialize_model_parallel(
     global _DATA_PARALLEL_GROUP_GLOO
     global _DATA_PARALLEL_GLOBAL_RANKS
     global _DATA_PARALLEL_GROUP_WITH_CP
-    global _DATA_PARALLEL_GROUP_WITH_CP_AG
     global _DATA_PARALLEL_GROUP_WITH_CP_GLOO
     global _DATA_PARALLEL_GLOBAL_RANKS_WITH_CP
     global _INTRA_PARTIAL_DATA_PARALLEL_GROUP_WITH_CP
@@ -857,15 +847,6 @@ def initialize_model_parallel(
             pg_options=get_nccl_options("dp_cp", nccl_comm_cfgs),
             group_desc="DATA_PARALLEL_GROUP_WITH_CP",
         )
-        if create_all_gather_group:
-            group_with_cp_ag = create_group(
-                ranks_with_cp,
-                timeout=timeout,
-                pg_options=get_nccl_options("dp_cp", nccl_comm_cfgs),
-                group_desc="DATA_PARALLEL_GROUP_WITH_CP_AG",
-            )
-        else:
-            group_with_cp_ag = None
         if create_gloo_process_groups:
             group_with_cp_gloo = create_group(
                 ranks_with_cp,
@@ -877,7 +858,6 @@ def initialize_model_parallel(
             group_with_cp_gloo = None
         if rank in ranks_with_cp:
             _DATA_PARALLEL_GROUP_WITH_CP = group_with_cp
-            _DATA_PARALLEL_GROUP_WITH_CP_AG = group_with_cp_ag
             _DATA_PARALLEL_GROUP_WITH_CP_GLOO = group_with_cp_gloo
             _DATA_PARALLEL_GLOBAL_RANKS_WITH_CP = ranks_with_cp
 
@@ -1407,21 +1387,14 @@ def get_pipeline_model_parallel_group(check_initialized=True):
     return _PIPELINE_MODEL_PARALLEL_GROUP
 
 
-def get_data_parallel_group(
-    with_context_parallel=False, partial_data_parallel=False, independent_all_gather=False
-):
+def get_data_parallel_group(with_context_parallel=False, partial_data_parallel=False):
     """Get the data-parallel group the caller rank belongs to."""
     if with_context_parallel:
         if partial_data_parallel:
             assert (
                 _INTRA_PARTIAL_DATA_PARALLEL_GROUP_WITH_CP is not None
             ), "Intra partial data parallel group is not initialized"
             return _INTRA_PARTIAL_DATA_PARALLEL_GROUP_WITH_CP
-        if independent_all_gather:
-            assert (
-                _DATA_PARALLEL_GROUP_WITH_CP_AG is not None
-            ), "data parallel group with context parallel AG is not initialized"
-            return _DATA_PARALLEL_GROUP_WITH_CP_AG
         assert (
             _DATA_PARALLEL_GROUP_WITH_CP is not None
         ), "data parallel group with context parallel combined is not initialized"
@@ -1432,15 +1405,6 @@ def get_data_parallel_group(
         return _DATA_PARALLEL_GROUP
 
 
-def has_separate_all_gather_group() -> bool:
-    """Check if a separate all-gather process group has been created.
-
-    Returns True if a dedicated all-gather process group exists for improved
-    communication overlap, False otherwise.
-    """
-    return _DATA_PARALLEL_GROUP_WITH_CP_AG is not None
-
-
 def get_data_parallel_group_gloo(with_context_parallel=False, partial_data_parallel=False):
     """Get the Gloo data-parallel group the caller rank belongs to."""
     if with_context_parallel:
@@ -2101,9 +2065,6 @@ def destroy_model_parallel():
     global _DATA_PARALLEL_GROUP_WITH_CP
     _DATA_PARALLEL_GROUP_WITH_CP = None
 
-    global _DATA_PARALLEL_GROUP_WITH_CP_AG
-    _DATA_PARALLEL_GROUP_WITH_CP_AG = None
-
     global _CONTEXT_PARALLEL_GROUP
     _CONTEXT_PARALLEL_GROUP = None
 
diff --git a/megatron/core/process_groups_config.py b/megatron/core/process_groups_config.py
@@ -114,13 +114,19 @@ class ProcessGroupCollection:
     # _DATA_PARALLEL_GROUP_WITH_CP
     dp_cp: torch.distributed.ProcessGroup = field(init=False)
 
+    # Independent all-gather counterpart of dp_cp; must be created in user code.
+    dp_cp_ag: torch.distributed.ProcessGroup = field(init=False)
+
     # MoE layers need expt_dp group for sharded state dict
     # we need this workaround until distributed checkpoint is refactored
     # to have sharded_state_dict can take the PG and pass it down
     # TODO (Hepteract): remove this once distributed checkpoint is refactored
     # _EXPERT_DATA_PARALLEL_GROUP
     expt_dp: torch.distributed.ProcessGroup = field(init=False)
 
+    # Independent all-gather counterpart of expt_dp; must be created in user code.
+    expt_dp_ag: torch.distributed.ProcessGroup = field(init=False)
+
     # _INTRA_PARTIAL_DATA_PARALLEL_GROUP_WITH_CP
     intra_dp_cp: torch.distributed.ProcessGroup = field(init=False)
 
@@ -210,6 +216,7 @@ def use_mpu_process_groups(cls, required_pgs: Optional[List[str]] = None):
             ),
             'dp': parallel_state.get_data_parallel_group,
             'dp_cp': partial(parallel_state.get_data_parallel_group, with_context_parallel=True),
+            'dp_cp_ag': lambda: None,  # AG groups should be created in user code
             'intra_dp_cp': partial(
                 parallel_state.get_data_parallel_group,
                 with_context_parallel=True,
@@ -232,6 +239,7 @@ def use_mpu_process_groups(cls, required_pgs: Optional[List[str]] = None):
             'expt_dp': partial(
                 parallel_state.get_expert_data_parallel_group, check_initialized=False
             ),
+            'expt_dp_ag': lambda: None,  # Expert AG groups should be created in user code
             'tp_dp_cp': partial(
                 parallel_state.get_tensor_and_data_parallel_group,
                 check_initialized=False,
diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py
@@ -2301,9 +2301,6 @@ def _add_distributed_args(parser):
                        help='IB SHARP can be enabled from only one communication group. '
                        'By default, it is enabled from dp group. '
                        'Available options: [dp, dp_replica]')
-    group.add_argument('--create-all-gather-group', action='store_true',
-                   help='Create a separate process group for all-gather operations '
-                   'to overlap reduce-scatter and all-gather operations.')
     group.add_argument('--use-megatron-fsdp', action='store_true',
                        help='Use the Megatron FSDP code path in DDP.')
     group.add_argument('--data-parallel-sharding-strategy', type=str, default='no_shard',
diff --git a/megatron/training/initialize.py b/megatron/training/initialize.py
@@ -381,7 +381,6 @@ def _initialize_distributed(get_embedding_ranks, get_position_embedding_ranks, s
                 create_gloo_process_groups=args.enable_gloo_process_groups,
                 high_priority_stream_groups=args.high_priority_stream_groups,
                 sharp_enabled_group=args.sharp_enabled_group,
-                create_all_gather_group=args.create_all_gather_group,
             )
             print_rank_0(
                 f"> initialized tensor model parallel with size "
diff --git a/tests/unit_tests/test_parallel_state.py b/tests/unit_tests/test_parallel_state.py

Original file line number	Diff line number	Diff line change
`@@ -381,7 +381,6 @@ def _initialize_distributed(get_embedding_ranks, get_position_embedding_ranks, s`
`381`	`381`	`create_gloo_process_groups=args.enable_gloo_process_groups,`
`382`	`382`	`high_priority_stream_groups=args.high_priority_stream_groups,`
`383`	`383`	`sharp_enabled_group=args.sharp_enabled_group,`
`384`		`- create_all_gather_group=args.create_all_gather_group,`
`385`	`384`	`)`
`386`	`385`	`print_rank_0(`
`387`	`386`	`f"> initialized tensor model parallel with size "`