@@ -104,6 +104,7 @@ def prepare_scattermoe(
     rank: int = None,
     world_size: int = None,
     ep_degree: int = 1,
+    disable_distributed: bool = False,
     key_rep: str = KEY_REPLICATE,
     key_ep: str = KEY_EXPERT_PARALLEL,
     device_type: str = "cuda",
@@ -116,13 +117,10 @@ def prepare_scattermoe(
     # pylint: disable=import-outside-toplevel
     from .scattermoe import ScatterMoE
 
-    ep_disabled = False
-    if ep_degree == 0:
-        ep_disabled = True
-        # flow of code when EP not enabled is mostly same as
-        # with ep_degree set to 1. Therefore, we explicitly set
-        # ep_degree to 1 however handle it along with ep_disabled var
-        ep_degree = 1
+    if disable_distributed and ep_degree > 1:
+        raise ValueError(
+            "expert sharding can not be deferred to top level sharding protocol (e.g. FSDP) when ep_degree > 1"
+        )
 
     assert world_size % ep_degree == 0, (
         f"world size ({world_size}) " f"not divisible by ep_size ({ep_degree})."
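The new flag makes the two sharding paths mutually exclusive: `disable_distributed=True` hands the converted experts over to the top-level sharding protocol (e.g. FSDP), so it is only valid when expert parallelism is off (`ep_degree == 1`). Below is a minimal standalone sketch of that constraint; `check_moe_parallel_config` is a made-up helper for illustration, not part of the library.

```python
# Standalone sketch of the constraint introduced above (illustrative only).
def check_moe_parallel_config(
    world_size: int, ep_degree: int = 1, disable_distributed: bool = False
) -> None:
    if disable_distributed and ep_degree > 1:
        # EP shards the experts itself, so sharding cannot also be deferred
        # to a top-level protocol such as FSDP.
        raise ValueError(
            "expert sharding can not be deferred to top level sharding "
            "protocol (e.g. FSDP) when ep_degree > 1"
        )
    assert world_size % ep_degree == 0, (
        f"world size ({world_size}) not divisible by ep_size ({ep_degree})."
    )

check_moe_parallel_config(world_size=8, ep_degree=1, disable_distributed=True)   # ok: defer to FSDP
check_moe_parallel_config(world_size=8, ep_degree=4, disable_distributed=False)  # ok: plain EP
# check_moe_parallel_config(world_size=8, ep_degree=4, disable_distributed=True) # -> ValueError
```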
@@ -137,11 +135,7 @@ def prepare_scattermoe(
     # current rank of the device
     device = torch.device(f"{device_type}:{rank}")
 
-    if ep_disabled:
-        # Larger models result in OOM especially when loading
-        # all experts to the same GPU device (when EP disabled).
-        # For cases like FSDP + EP disabled, its memory efficient to
-        # load the model to CPU and hand it over to the FSDP.
+    if ep_degree == 1 and disable_distributed and is_fsdp_enabled() and rank == 0:
         device = torch.device("cpu")
 
     # get the scattermoe conversion spec
@@ -158,7 +152,7 @@ def prepare_scattermoe(
 
     rep_size = world_size // ep_degree
 
-    if ep_degree == 1 and (rep_size == 1 or ep_disabled):
+    if ep_degree == 1:
        # in this case no need for sharding
        device_mesh = None
    elif rep_size == 1:
@@ -281,10 +275,10 @@ def prepare_scattermoe(
             )
 
         if device_mesh is None:
-            if is_fsdp_enabled() and rank > 0:
-                _init_scattermoe_context = init_empty_weights
-            else:
+            if not is_fsdp_enabled() or is_local_dist_rank_0():
                 _init_scattermoe_context = nullcontext
+            else:
+                _init_scattermoe_context = init_empty_weights
         else:
             # in this case we need to distribute parameters, so just initialize
             # the scattermoe module swap with empty weights,
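With sharding deferred, only one process per node needs real tensors at module-swap time: local rank 0 builds the ScatterMoE swap normally (`nullcontext`), while every other rank builds it on the meta device via `init_empty_weights` so no memory is allocated. Here is a self-contained sketch of that pattern, assuming torchrun-style environment variables; the `is_fsdp_enabled()` / `is_local_dist_rank_0()` helpers in the diff are assumed to behave roughly like the checks below.

```python
# Self-contained sketch of the init-context pattern used above.
import os
from contextlib import nullcontext

import torch
from accelerate import init_empty_weights  # builds modules on the meta device

def _is_local_rank_0() -> bool:
    return int(os.environ.get("LOCAL_RANK", "0")) == 0

def _fsdp_enabled() -> bool:
    # assumption: mirrors the env-var check behind is_fsdp_enabled()
    return os.environ.get("ACCELERATE_USE_FSDP", "false").lower() == "true"

# Only one process per node materializes real weights; the rest allocate
# nothing (meta tensors) and receive parameters later, e.g. through FSDP's
# sync_module_states=True broadcast from rank 0.
init_ctx = nullcontext if (not _fsdp_enabled() or _is_local_rank_0()) else init_empty_weights

with init_ctx():
    layer = torch.nn.Linear(1024, 1024)  # stand-in for the ScatterMoE swap

print(layer.weight.device)  # "meta" on non-zero local ranks under FSDP, real otherwise
```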
@@ -337,7 +331,7 @@ def prepare_scattermoe(
         if device_mesh is None:
             # - if not on meta, just load the state dict
             # - and then put on the device
-            if rank == 0 or not is_fsdp_enabled():
+            if not is_fsdp_enabled() or is_local_dist_rank_0():
                 moe.load_state_dict(sd)
                 moe = moe.to(device)
             else:
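On the deferred path, local rank 0 loads the real state dict (onto CPU on rank 0, per the earlier hunk) while the other ranks keep their meta-initialized copies. A common way those copies get materialized afterwards is FSDP with `sync_module_states=True`, which allocates empty storage on each rank and then broadcasts rank 0's weights; the sketch below shows that consuming side as the generic PyTorch pattern, not necessarily the exact wrapping used by this library's callers.

```python
# Sketch of the FSDP side that fills in meta-initialized parameters; this is
# the generic PyTorch pattern, not code from this repository.
import torch
import torch.distributed as dist
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

def wrap_with_fsdp(module: torch.nn.Module) -> FSDP:
    local_rank = dist.get_rank() % torch.cuda.device_count()
    device = torch.device("cuda", local_rank)
    return FSDP(
        module,
        device_id=device,
        # broadcast rank 0's real (CPU-loaded) weights to all other ranks
        sync_module_states=True,
        # non-zero ranks hold meta tensors: allocate storage (without data)
        # so the broadcast above has somewhere to land
        param_init_fn=lambda m: m.to_empty(device=device),
    )
```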