Skip to content

Commit 33ede31

Browse files
committed
add expert all-gather process-group for overlapping all-gather with gradient reduction
1 parent a25421e commit 33ede31

File tree

4 files changed

+83
-4
lines changed

4 files changed

+83
-4
lines changed

megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1605,6 +1605,9 @@ def __init__(
16051605
if self.dist_index.get_fsdp_group(is_expert_parallel=False, independent_all_gather=True) is not None:
16061606
# All-gather group used when overlapping all-gather and gradient reduction.
16071607
self.ubr_groups.append(self.dist_index.get_fsdp_group(is_expert_parallel=False, independent_all_gather=True))
1608+
if self.dist_index.get_fsdp_group(is_expert_parallel=True, independent_all_gather=True) is not None:
1609+
# Expert all-gather group used when overlapping all-gather and gradient reduction.
1610+
self.ubr_groups.append(self.dist_index.get_fsdp_group(is_expert_parallel=True, independent_all_gather=True))
16081611

16091612
if torch.distributed.get_rank() == 0:
16101613
logging.info(
@@ -1896,9 +1899,9 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params):
18961899
# operations (main_grad_buffer). This avoids head-of-line blocking between forward
18971900
# all-gather and backward reduce-scatter on the same communicator.
18981901
model_wbuf_dp_group = main_buf_dp_group
1899-
if not group.is_expert_param and not should_create_hfsdp_wbuf_and_gbuf:
1902+
if not should_create_hfsdp_wbuf_and_gbuf:
19001903
ag_group = self.dist_index.get_fsdp_group(
1901-
is_expert_parallel=False, independent_all_gather=True
1904+
is_expert_parallel=group.is_expert_param, independent_all_gather=True
19021905
)
19031906
if ag_group is not None:
19041907
model_wbuf_dp_group = ag_group

megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -518,6 +518,12 @@ def __init__(
518518
and contains_submesh(self.expt_device_mesh, self.dp_shard_dim)
519519
else None
520520
)
521+
# Expert AG group for overlap
522+
self.expt_fsdp_group_ag = None
523+
if HAVE_MEGATRON_CORE and parallel_state.has_separate_expert_all_gather_group():
524+
self.expt_fsdp_group_ag = parallel_state.get_expert_data_parallel_group(
525+
independent_all_gather=True
526+
)
521527

522528
"""
523529
Megatron-FSDP is responsible for storing all required DeviceMesh
@@ -638,6 +644,8 @@ def get_dp_group(self, is_expert_parallel: bool = False) -> ProcessGroup:
638644
def get_fsdp_group(self, is_expert_parallel: bool = False, independent_all_gather: bool = False) -> ProcessGroup:
639645
"""Get the FSDP process group."""
640646
if is_expert_parallel:
647+
if independent_all_gather:
648+
return self.expt_fsdp_group_ag
641649
return self.expt_fsdp_group
642650
if independent_all_gather:
643651
return self.fsdp_group_ag

megatron/core/parallel_state.py

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@
6060
# Expert data parallel group
6161
_EXPERT_DATA_PARALLEL_GROUP = None
6262
_EXPERT_DATA_PARALLEL_GROUP_GLOO = None
63+
_EXPERT_DATA_PARALLEL_GROUP_AG = None
6364
_INTRA_PARTIAL_EXPERT_DATA_PARALLEL_GROUP = None
6465
_INTRA_PARTIAL_EXPERT_DATA_PARALLEL_GROUP_GLOO = None
6566
_INTER_PARTIAL_EXPERT_DATA_PARALLEL_GROUP = None
@@ -1207,6 +1208,8 @@ def initialize_model_parallel(
12071208
assert _EXPERT_DATA_PARALLEL_GROUP is None, "Expert data group is already initialized"
12081209
global _EXPERT_DATA_PARALLEL_GROUP_GLOO
12091210
assert _EXPERT_DATA_PARALLEL_GROUP_GLOO is None, "Expert data group-gloo is already initialized"
1211+
global _EXPERT_DATA_PARALLEL_GROUP_AG
1212+
assert _EXPERT_DATA_PARALLEL_GROUP_AG is None, "Expert data parallel group with AG is already initialized"
12101213
global _INTRA_PARTIAL_EXPERT_DATA_PARALLEL_GROUP
12111214
assert (
12121215
_INTRA_PARTIAL_EXPERT_DATA_PARALLEL_GROUP is None
@@ -1240,10 +1243,20 @@ def initialize_model_parallel(
12401243
)
12411244
else:
12421245
group_gloo = None
1246+
# Create separate all-gather group for expert data parallelism to enable overlap
1247+
if create_all_gather_group:
1248+
group_ag = create_group(
1249+
ranks,
1250+
timeout=timeout,
1251+
pg_options=get_nccl_options("ep_dp", nccl_comm_cfgs),
1252+
group_desc="EXPERT_DATA_PARALLEL_GROUP_AG",
1253+
)
1254+
else:
1255+
group_ag = None
12431256
if rank in ranks:
12441257
_EXPERT_DATA_PARALLEL_GROUP = group
12451258
_EXPERT_DATA_PARALLEL_GROUP_GLOO = group_gloo
1246-
1259+
_EXPERT_DATA_PARALLEL_GROUP_AG = group_ag
12471260
if num_distributed_optimizer_instances > 1:
12481261
# Create groups for Partial DistOpt, one for intra-partial DP domain
12491262
# Another for inter-partial DP domain
@@ -1397,6 +1410,15 @@ def has_separate_all_gather_group() -> bool:
13971410
return _DATA_PARALLEL_GROUP_WITH_CP_AG is not None
13981411

13991412

1413+
def has_separate_expert_all_gather_group() -> bool:
    """Return whether a dedicated expert all-gather process group exists.

    A separate communicator for expert data-parallel all-gathers lets the
    all-gather overlap with gradient reduction on the regular group.
    True once ``_EXPERT_DATA_PARALLEL_GROUP_AG`` has been created.
    """
    return _EXPERT_DATA_PARALLEL_GROUP_AG is not None
1420+
1421+
14001422
def get_data_parallel_group_gloo(with_context_parallel=False, partial_data_parallel=False):
14011423
"""Get the Gloo data-parallel group the caller rank belongs to."""
14021424
if with_context_parallel:
@@ -1886,8 +1908,14 @@ def get_expert_tensor_model_pipeline_parallel_group(check_initialized=True):
18861908
return _EXPERT_TENSOR_MODEL_PIPELINE_PARALLEL_GROUP
18871909

18881910

1889-
def get_expert_data_parallel_group(check_initialized=True, partial_expert_data_parallel=False):
1911+
def get_expert_data_parallel_group(check_initialized=True, partial_expert_data_parallel=False, independent_all_gather=False):
18901912
"""Get expert data parallel group."""
1913+
if independent_all_gather:
1914+
if check_initialized:
1915+
assert (
1916+
_EXPERT_DATA_PARALLEL_GROUP_AG is not None
1917+
), "Expert data parallel group with AG is not initialized"
1918+
return _EXPERT_DATA_PARALLEL_GROUP_AG
18911919
if partial_expert_data_parallel:
18921920
if check_initialized:
18931921
assert (
@@ -2155,6 +2183,9 @@ def destroy_model_parallel():
21552183
torch.distributed.destroy_process_group(_EXPERT_DATA_PARALLEL_GROUP_GLOO)
21562184
_EXPERT_DATA_PARALLEL_GROUP_GLOO = None
21572185

2186+
global _EXPERT_DATA_PARALLEL_GROUP_AG
2187+
_EXPERT_DATA_PARALLEL_GROUP_AG = None
2188+
21582189
global _INTRA_PARTIAL_EXPERT_DATA_PARALLEL_GROUP
21592190
_INTRA_PARTIAL_EXPERT_DATA_PARALLEL_GROUP = None
21602191

tests/unit_tests/test_parallel_state.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -535,3 +535,40 @@ def test_separate_all_gather_group():
535535
assert ag_ranks == regular_ranks
536536

537537
Utils.destroy_model_parallel()
538+
539+
540+
@pytest.mark.parametrize('order', test_parallel_order)
@pytest.mark.flaky
@pytest.mark.flaky_in_dev
def test_separate_expert_all_gather_group(order):
    """Exercise the dedicated expert all-gather group used for communication overlap."""
    # Default path: no dedicated expert AG communicator is created.
    Utils.initialize_model_parallel(
        expert_model_parallel_size=world_size,
        create_all_gather_group=False,
        order=order,
    )
    assert ps._EXPERT_DATA_PARALLEL_GROUP_AG is None
    assert not ps.has_separate_expert_all_gather_group()
    Utils.destroy_model_parallel()

    # Opt-in path: the expert AG communicator exists and is reported as present.
    Utils.initialize_model_parallel(
        expert_model_parallel_size=world_size,
        create_all_gather_group=True,
        order=order,
    )
    assert ps.has_separate_expert_all_gather_group()
    assert ps._EXPERT_DATA_PARALLEL_GROUP_AG is not None

    # The AG group mirrors the regular expert-DP ranks on a separate communicator.
    ag_group = ps.get_expert_data_parallel_group(independent_all_gather=True)
    regular_group = ps.get_expert_data_parallel_group(independent_all_gather=False)
    assert ag_group is not None and regular_group is not None
    ag_rank_list = torch.distributed.get_process_group_ranks(ag_group)
    base_rank_list = torch.distributed.get_process_group_ranks(regular_group)
    assert ag_rank_list == base_rank_list

    Utils.destroy_model_parallel()

0 commit comments

Comments
 (0)