Skip to content

Commit c5b9670

Browse files
committed
add expert all-gather process-group for overlapping
1 parent 259ce47 commit c5b9670

File tree

4 files changed: +112 additions, −22 deletions

megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1602,9 +1602,28 @@ def __init__(
16021602
if self.dist_index.get_outer_fsdp_group() is not None:
16031603
# Outer/Inter-FSDP group when using hybrid FSDP
16041604
self.ubr_groups.append(self.dist_index.get_outer_fsdp_group())
1605-
if self.dist_index.get_fsdp_group(is_expert_parallel=False, independent_all_gather=True) is not None:
1605+
if (
1606+
self.dist_index.get_fsdp_group(
1607+
is_expert_parallel=False, independent_all_gather=True
1608+
)
1609+
is not None
1610+
):
16061611
# All-gather group used when overlapping all-gather and gradient reduction.
1607-
self.ubr_groups.append(self.dist_index.get_fsdp_group(is_expert_parallel=False, independent_all_gather=True))
1612+
self.ubr_groups.append(
1613+
self.dist_index.get_fsdp_group(
1614+
is_expert_parallel=False, independent_all_gather=True
1615+
)
1616+
)
1617+
if (
1618+
self.dist_index.get_fsdp_group(is_expert_parallel=True, independent_all_gather=True)
1619+
is not None
1620+
):
1621+
# Expert all-gather group used when overlapping all-gather and gradient reduction.
1622+
self.ubr_groups.append(
1623+
self.dist_index.get_fsdp_group(
1624+
is_expert_parallel=True, independent_all_gather=True
1625+
)
1626+
)
16081627

16091628
if torch.distributed.get_rank() == 0:
16101629
logging.info(
@@ -1896,9 +1915,9 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params):
18961915
# operations (main_grad_buffer). This avoids head-of-line blocking between forward
18971916
# all-gather and backward reduce-scatter on the same communicator.
18981917
model_wbuf_dp_group = main_buf_dp_group
1899-
if not group.is_expert_param and not should_create_hfsdp_wbuf_and_gbuf:
1918+
if not should_create_hfsdp_wbuf_and_gbuf:
19001919
ag_group = self.dist_index.get_fsdp_group(
1901-
is_expert_parallel=False, independent_all_gather=True
1920+
is_expert_parallel=group.is_expert_param, independent_all_gather=True
19021921
)
19031922
if ag_group is not None:
19041923
model_wbuf_dp_group = ag_group

megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -498,8 +498,7 @@ def __init__(
498498
self.fsdp_group_ag = None
499499
if HAVE_MEGATRON_CORE and parallel_state.has_separate_all_gather_group():
500500
self.fsdp_group_ag = parallel_state.get_data_parallel_group(
501-
with_context_parallel=True,
502-
independent_all_gather=True
501+
with_context_parallel=True, independent_all_gather=True
503502
)
504503
# Retrieve the outer-FSDP process group from the DeviceMesh.
505504
self.outer_fsdp_group = (
@@ -518,6 +517,12 @@ def __init__(
518517
and contains_submesh(self.expt_device_mesh, self.dp_shard_dim)
519518
else None
520519
)
520+
# Expert AG group for overlap
521+
self.expt_fsdp_group_ag = None
522+
if HAVE_MEGATRON_CORE and parallel_state.has_separate_expert_all_gather_group():
523+
self.expt_fsdp_group_ag = parallel_state.get_expert_data_parallel_group(
524+
independent_all_gather=True
525+
)
521526

522527
"""
523528
Megatron-FSDP is responsible for storing all required DeviceMesh
@@ -635,9 +640,13 @@ def get_dp_group(self, is_expert_parallel: bool = False) -> ProcessGroup:
635640
return self.hybrid_fsdp_group
636641
return self.fsdp_group
637642

638-
def get_fsdp_group(self, is_expert_parallel: bool = False, independent_all_gather: bool = False) -> ProcessGroup:
643+
def get_fsdp_group(
644+
self, is_expert_parallel: bool = False, independent_all_gather: bool = False
645+
) -> ProcessGroup:
639646
"""Get the FSDP process group."""
640647
if is_expert_parallel:
648+
if independent_all_gather:
649+
return self.expt_fsdp_group_ag
641650
return self.expt_fsdp_group
642651
if independent_all_gather:
643652
return self.fsdp_group_ag

megatron/core/parallel_state.py

Lines changed: 41 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@
6161
# Expert data parallel group
6262
_EXPERT_DATA_PARALLEL_GROUP = None
6363
_EXPERT_DATA_PARALLEL_GROUP_GLOO = None
64+
_EXPERT_DATA_PARALLEL_GROUP_AG = None
6465
_INTRA_PARTIAL_EXPERT_DATA_PARALLEL_GROUP = None
6566
_INTRA_PARTIAL_EXPERT_DATA_PARALLEL_GROUP_GLOO = None
6667
_INTER_PARTIAL_EXPERT_DATA_PARALLEL_GROUP = None
@@ -1249,6 +1250,10 @@ def initialize_model_parallel(
12491250
assert _EXPERT_DATA_PARALLEL_GROUP is None, "Expert data group is already initialized"
12501251
global _EXPERT_DATA_PARALLEL_GROUP_GLOO
12511252
assert _EXPERT_DATA_PARALLEL_GROUP_GLOO is None, "Expert data group-gloo is already initialized"
1253+
global _EXPERT_DATA_PARALLEL_GROUP_AG
1254+
assert (
1255+
_EXPERT_DATA_PARALLEL_GROUP_AG is None
1256+
), "Expert data parallel group with AG is already initialized"
12521257
global _INTRA_PARTIAL_EXPERT_DATA_PARALLEL_GROUP
12531258
assert (
12541259
_INTRA_PARTIAL_EXPERT_DATA_PARALLEL_GROUP is None
@@ -1282,10 +1287,20 @@ def initialize_model_parallel(
12821287
)
12831288
else:
12841289
group_gloo = None
1290+
# Create separate all-gather group for expert data parallelism to enable overlap
1291+
if create_all_gather_group:
1292+
group_ag = create_group(
1293+
ranks,
1294+
timeout=timeout,
1295+
pg_options=get_nccl_options("ep_dp", nccl_comm_cfgs),
1296+
group_desc="EXPERT_DATA_PARALLEL_GROUP_AG",
1297+
)
1298+
else:
1299+
group_ag = None
12851300
if rank in ranks:
12861301
_EXPERT_DATA_PARALLEL_GROUP = group
12871302
_EXPERT_DATA_PARALLEL_GROUP_GLOO = group_gloo
1288-
1303+
_EXPERT_DATA_PARALLEL_GROUP_AG = group_ag
12891304
if num_distributed_optimizer_instances > 1:
12901305
# Create groups for Partial DistOpt, one for intra-partial DP domain
12911306
# Another for inter-partial DP domain
@@ -1407,7 +1422,9 @@ def get_pipeline_model_parallel_group(check_initialized=True):
14071422
return _PIPELINE_MODEL_PARALLEL_GROUP
14081423

14091424

1410-
def get_data_parallel_group(with_context_parallel=False, partial_data_parallel=False, independent_all_gather=False):
1425+
def get_data_parallel_group(
1426+
with_context_parallel=False, partial_data_parallel=False, independent_all_gather=False
1427+
):
14111428
"""Get the data-parallel group the caller rank belongs to."""
14121429
if with_context_parallel:
14131430
if partial_data_parallel:
@@ -1432,13 +1449,22 @@ def get_data_parallel_group(with_context_parallel=False, partial_data_parallel=F
14321449

14331450
def has_separate_all_gather_group() -> bool:
14341451
"""Check if a separate all-gather process group has been created.
1435-
1452+
14361453
Returns True if a dedicated all-gather process group exists for improved
14371454
communication overlap, False otherwise.
14381455
"""
14391456
return _DATA_PARALLEL_GROUP_WITH_CP_AG is not None
14401457

14411458

1459+
def has_separate_expert_all_gather_group() -> bool:
1460+
"""Check if a separate all-gather process group for experts has been created.
1461+
1462+
Returns True if a dedicated all-gather process group for expert parallelism exists
1463+
for improved communication overlap, False otherwise.
1464+
"""
1465+
return _EXPERT_DATA_PARALLEL_GROUP_AG is not None
1466+
1467+
14421468
def get_data_parallel_group_gloo(with_context_parallel=False, partial_data_parallel=False):
14431469
"""Get the Gloo data-parallel group the caller rank belongs to."""
14441470
if with_context_parallel:
@@ -1940,8 +1966,16 @@ def get_expert_tensor_model_pipeline_parallel_group(check_initialized=True):
19401966
return _EXPERT_TENSOR_MODEL_PIPELINE_PARALLEL_GROUP
19411967

19421968

1943-
def get_expert_data_parallel_group(check_initialized=True, partial_expert_data_parallel=False):
1969+
def get_expert_data_parallel_group(
1970+
check_initialized=True, partial_expert_data_parallel=False, independent_all_gather=False
1971+
):
19441972
"""Get expert data parallel group."""
1973+
if independent_all_gather:
1974+
if check_initialized:
1975+
assert (
1976+
_EXPERT_DATA_PARALLEL_GROUP_AG is not None
1977+
), "Expert data parallel group with AG is not initialized"
1978+
return _EXPERT_DATA_PARALLEL_GROUP_AG
19451979
if partial_expert_data_parallel:
19461980
if check_initialized:
19471981
assert (
@@ -2209,6 +2243,9 @@ def destroy_model_parallel():
22092243
torch.distributed.destroy_process_group(_EXPERT_DATA_PARALLEL_GROUP_GLOO)
22102244
_EXPERT_DATA_PARALLEL_GROUP_GLOO = None
22112245

2246+
global _EXPERT_DATA_PARALLEL_GROUP_AG
2247+
_EXPERT_DATA_PARALLEL_GROUP_AG = None
2248+
22122249
global _INTRA_PARTIAL_EXPERT_DATA_PARALLEL_GROUP
22132250
_INTRA_PARTIAL_EXPERT_DATA_PARALLEL_GROUP = None
22142251

tests/unit_tests/test_parallel_state.py

Lines changed: 36 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -533,26 +533,18 @@ def test_hybrid_dp_cp_groups(world_size, tp_size, cp_size, dp_size):
533533
def test_separate_all_gather_group():
534534
"""Test separate all-gather group for improved communication overlap."""
535535
# Test without creating AG group (default)
536-
Utils.initialize_model_parallel(
537-
context_parallel_size=world_size,
538-
create_all_gather_group=False,
539-
)
536+
Utils.initialize_model_parallel(context_parallel_size=world_size, create_all_gather_group=False)
540537
assert not ps.has_separate_all_gather_group()
541538
assert ps._DATA_PARALLEL_GROUP_WITH_CP_AG is None
542539
Utils.destroy_model_parallel()
543540

544541
# Test with creating AG group
545-
Utils.initialize_model_parallel(
546-
context_parallel_size=world_size,
547-
create_all_gather_group=True,
548-
)
542+
Utils.initialize_model_parallel(context_parallel_size=world_size, create_all_gather_group=True)
549543
assert ps.has_separate_all_gather_group()
550544
assert ps._DATA_PARALLEL_GROUP_WITH_CP_AG is not None
551545

552546
# Verify it returns the correct group
553-
ag_group = ps.get_data_parallel_group(
554-
with_context_parallel=True, independent_all_gather=True
555-
)
547+
ag_group = ps.get_data_parallel_group(with_context_parallel=True, independent_all_gather=True)
556548
regular_group = ps.get_data_parallel_group(
557549
with_context_parallel=True, independent_all_gather=False
558550
)
@@ -564,3 +556,36 @@ def test_separate_all_gather_group():
564556
assert ag_ranks == regular_ranks
565557

566558
Utils.destroy_model_parallel()
559+
560+
561+
@pytest.mark.parametrize('order', test_parallel_order)
562+
@pytest.mark.flaky
563+
@pytest.mark.flaky_in_dev
564+
def test_separate_expert_all_gather_group(order):
565+
"""Test separate all-gather group for expert parallelism to enable communication overlap."""
566+
# Test without creating expert AG group (default)
567+
Utils.initialize_model_parallel(
568+
expert_model_parallel_size=world_size, create_all_gather_group=False, order=order
569+
)
570+
assert not ps.has_separate_expert_all_gather_group()
571+
assert ps._EXPERT_DATA_PARALLEL_GROUP_AG is None
572+
Utils.destroy_model_parallel()
573+
574+
# Test with creating expert AG group
575+
Utils.initialize_model_parallel(
576+
expert_model_parallel_size=world_size, create_all_gather_group=True, order=order
577+
)
578+
assert ps.has_separate_expert_all_gather_group()
579+
assert ps._EXPERT_DATA_PARALLEL_GROUP_AG is not None
580+
581+
# Verify it returns the correct group
582+
ag_group = ps.get_expert_data_parallel_group(independent_all_gather=True)
583+
regular_group = ps.get_expert_data_parallel_group(independent_all_gather=False)
584+
assert ag_group is not None
585+
assert regular_group is not None
586+
# They should have the same ranks but different communicators
587+
ag_ranks = torch.distributed.get_process_group_ranks(ag_group)
588+
regular_ranks = torch.distributed.get_process_group_ranks(regular_group)
589+
assert ag_ranks == regular_ranks
590+
591+
Utils.destroy_model_parallel()

0 commit comments

Comments (0)