
Commit 2dbff48

feat: Allow for MoE kernels (Scatter MoE) irrespective of use of EP (#136)
Squashed commit messages (each signed off by Mehant Kammakomati <[email protected]>):

* feat: no ep yes kernels
* feat: support low_cpu_mem
* fix: review comments
* fix: review comments
* fix: address review comments
* fix: lint error for peft
* fix: review comment from Fabian
1 parent ee7d713 commit 2dbff48

File tree

4 files changed (+59, -23 lines):

plugins/accelerated-moe/src/fms_acceleration_moe/framework_plugin_scattermoe.py
plugins/accelerated-moe/src/fms_acceleration_moe/utils/scattermoe_prepare.py
plugins/accelerated-peft/requirements.txt
plugins/framework/pyproject.toml

plugins/accelerated-moe/src/fms_acceleration_moe/framework_plugin_scattermoe.py

Lines changed: 38 additions & 18 deletions
@@ -42,12 +42,30 @@ def __init__(self, configurations: Dict[str, Dict]):
         super().__init__(configurations)
 
         # ep_degree determines the expert parallel sharding
-        # - default of 1 means experts are not sharded and operate in pure replication.
+        # If disable_distributed==False, the MoE plugin handles sharding / replication;
+        # otherwise the user will need to handle this manually (e.g., using FSDP).
+        #
+        # ep_degree=1 (default):
+        # - disable_distributed=False (default) means
+        #   experts are replicated while using ScatterMoE kernels.
+        # - disable_distributed=True means no replication (please use your
+        #   own training framework).
+        #
+        # ep_degree > 1:
+        # - disable_distributed=False (default) means expert sharding with
+        #   ScatterMoE kernels.
+        # - disable_distributed=True cannot be set in this case; errors out.
+
         self._ep_degree = self._check_config_and_maybe_check_values(
             key="training.moe.scattermoe.ep_degree",
             default=1,
         )
 
+        self._disable_distributed = self._check_config_and_maybe_check_values(
+            key="training.moe.scattermoe.disable_distributed",
+            default=False,
+        )
+
     @property
     def requires_augmentation(self):
         return True
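The comment block above boils down to a small decision matrix. The helper below is purely illustrative (it does not exist in the plugin); it only mirrors how ep_degree and disable_distributed combine, including the invalid combination that prepare_scattermoe rejects further down.

# Hypothetical helper, only to illustrate the ep_degree x disable_distributed matrix
# described in the comment above; it is not part of the plugin.
def describe_scattermoe_mode(ep_degree: int = 1, disable_distributed: bool = False) -> str:
    if ep_degree > 1 and disable_distributed:
        # mirrors the ValueError added to prepare_scattermoe below
        raise ValueError("expert sharding cannot be deferred to a top-level protocol such as FSDP")
    if ep_degree == 1 and not disable_distributed:
        return "ScatterMoE kernels; experts replicated by the plugin"
    if ep_degree == 1 and disable_distributed:
        return "ScatterMoE kernels only; replication/sharding left to the outer training framework"
    return "ScatterMoE kernels with experts sharded across the expert-parallel group"

# e.g. describe_scattermoe_mode(1, True) corresponds to the new mode this commit enables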
@@ -77,6 +95,7 @@ def augmentation(
             rank=rank,
             world_size=world_size,
             ep_degree=self._ep_degree,
+            disable_distributed=self._disable_distributed,
             mixed_precision=False,  # Currently this is hardcoded to OFF
         )
         return model, modifiable_args
@@ -91,23 +110,24 @@ def get_callbacks_and_ready_for_train(
             and getattr(accelerator.state, "fsdp_plugin", None) is not None
         ):
 
-            # - use an internal function call to get the no split
-            # module names, which are typically layers
-            _layers = model._get_no_split_modules("")
-            accelerator.state.fsdp_plugin.ignored_modules = [
-                getattr(layer, name)
-                for name in self._moe_component_module_names
-                for layer in model.modules()
-                if layer.__class__.__name__ in _layers
-            ]
-
-            # call this to patch the HF save and load functions to be able
-            # to save DTensors properly
-            patch_huggingface_save_and_load_for_dtensors()
-
-            # call this to patch torch optim to not use
-            # foreach for dtensors
-            patch_torch_optim_foreach_to_not_apply_to_dtensors()
+            if not self._disable_distributed:
+                # - use an internal function call to get the no split
+                # module names, which are typically layers
+                _layers = model._get_no_split_modules("")
+                accelerator.state.fsdp_plugin.ignored_modules = [
+                    getattr(layer, name)
+                    for name in self._moe_component_module_names
+                    for layer in model.modules()
+                    if layer.__class__.__name__ in _layers
+                ]
+
+                # call this to patch the HF save and load functions to be able
+                # to save DTensors properly
+                patch_huggingface_save_and_load_for_dtensors()
+
+                # call this to patch torch optim to not use
+                # foreach for dtensors
+                patch_torch_optim_foreach_to_not_apply_to_dtensors()
 
         return callbacks
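As a rough sketch of the branch above, with toy module names standing in for the plugin's real classes: when the plugin keeps managing the MoE distribution itself (disable_distributed=False), the swapped-in ScatterMoE submodules are collected and handed to accelerate's FSDP plugin as ignored_modules, so FSDP does not wrap them a second time.

# Toy sketch only: ToyBlock / "moe" stand in for the model's no-split layers and the
# plugin's self._moe_component_module_names; they are not actual fms-acceleration names.
import torch.nn as nn

class ToyBlock(nn.Module):
    def __init__(self):
        super().__init__()
        self.moe = nn.Linear(8, 8)   # stand-in for the swapped-in ScatterMoE module
        self.attn = nn.Linear(8, 8)  # left for FSDP to wrap as usual

model = nn.Sequential(ToyBlock(), ToyBlock())

no_split_classes = {"ToyBlock"}      # what model._get_no_split_modules("") would list
moe_component_names = ["moe"]        # the plugin's tracked MoE attribute names

ignored = [
    getattr(layer, name)
    for name in moe_component_names
    for layer in model.modules()
    if layer.__class__.__name__ in no_split_classes
]
assert len(ignored) == 2  # one ScatterMoE stand-in per block
# accelerator.state.fsdp_plugin.ignored_modules = ignored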

plugins/accelerated-moe/src/fms_acceleration_moe/utils/scattermoe_prepare.py

Lines changed: 19 additions & 4 deletions
@@ -104,6 +104,7 @@ def prepare_scattermoe(
     rank: int = None,
     world_size: int = None,
     ep_degree: int = 1,
+    disable_distributed: bool = False,
     key_rep: str = KEY_REPLICATE,
     key_ep: str = KEY_EXPERT_PARALLEL,
     device_type: str = "cuda",
@@ -116,6 +117,12 @@ def prepare_scattermoe(
     # pylint: disable=import-outside-toplevel
     from .scattermoe import ScatterMoE
 
+    if disable_distributed and ep_degree > 1:
+        raise ValueError(
+            "expert sharding cannot be deferred to the top-level sharding "
+            "protocol (e.g. FSDP) when ep_degree > 1"
+        )
+
     assert world_size % ep_degree == 0, (
         f"world size ({world_size}) " f"not divisible by ep_size ({ep_degree})."
     )
@@ -129,6 +136,9 @@ def prepare_scattermoe(
     # current rank of the device
     device = torch.device(f"{device_type}:{rank}")
 
+    if ep_degree == 1 and disable_distributed and is_fsdp_enabled() and rank == 0:
+        device = torch.device("cpu")
+
     # get the scattermoe conversion spec
     (
         moe_cls,
@@ -142,7 +152,8 @@ def prepare_scattermoe(
         expert_name = expert_name.split("|")
 
     rep_size = world_size // ep_degree
-    if ep_degree == 1 and rep_size == 1:
+
+    if ep_degree == 1:
         # in this case no need for sharding
         device_mesh = None
     elif rep_size == 1:
@@ -265,7 +276,10 @@ def prepare_scattermoe(
     )
 
     if device_mesh is None:
-        _init_scattermoe_context = nullcontext
+        if not is_fsdp_enabled() or is_local_dist_rank_0():
+            _init_scattermoe_context = nullcontext
+        else:
+            _init_scattermoe_context = init_empty_weights
     else:
         # in this case we need to distribute parameters, so just initialize
         # the scattermoe module swap with empty weights,
@@ -318,8 +332,9 @@ def prepare_scattermoe(
         if device_mesh is None:
             # - if not on meta, just load the state dict
             # - and then put on the device
-            moe.load_state_dict(sd)
-            moe = moe.to(device)
+            if not is_fsdp_enabled() or is_local_dist_rank_0():
+                moe.load_state_dict(sd)
+                moe = moe.to(device)
         else:
             # - otherwise, we need to distribute and will
             # replace the parameters
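Taken together, the hunks above implement the low-CPU-memory path: when there is no device mesh and FSDP is active, only local rank 0 materialises and loads the real weights (kept on CPU), while the other ranks build the module with empty (meta) weights and rely on FSDP to synchronise parameters later. Below is a condensed sketch of that pattern, assuming is_fsdp_enabled / is_local_dist_rank_0 are the transformers helpers of the same names and with a plain nn.Linear standing in for the ScatterMoE swap.

# Condensed sketch of the low-CPU-memory path added above; a plain nn.Linear
# stands in for the swapped-in ScatterMoE module so the snippet is self-contained.
from contextlib import nullcontext

import torch
import torch.nn as nn
from accelerate import init_empty_weights
from transformers.modeling_utils import is_fsdp_enabled, is_local_dist_rank_0  # assumed import source

def build_moe_stub(sd: dict, rank: int, device_type: str = "cuda") -> nn.Module:
    # non-zero local ranks never allocate real storage for the weights
    if is_fsdp_enabled() and not is_local_dist_rank_0():
        init_ctx = init_empty_weights()  # parameters are created on the meta device
    else:
        init_ctx = nullcontext()

    with init_ctx:
        module = nn.Linear(8, 8)

    # only local rank 0 (or a non-FSDP run) loads the checkpoint weights
    if not is_fsdp_enabled() or is_local_dist_rank_0():
        module.load_state_dict(sd)
        if is_fsdp_enabled() and rank == 0:
            device = torch.device("cpu")  # keep the full copy off the GPU; FSDP shards it later
        else:
            device = torch.device(f"{device_type}:{rank}")
        module = module.to(device)
    return module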

plugins/accelerated-peft/requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -16,3 +16,4 @@ bitsandbytes >=0.41,<=0.43.3
 threadpoolctl >= 3.5.0
 
 datasets >= 2.20.0
+

plugins/framework/pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -24,7 +24,7 @@ classifiers=[
 dependencies = [
     "numpy<2.0",  # numpy needs to be bounded due to incompatibility with current torch<2.3
     "torch>2.2",
-    "peft",
+    "peft<=0.14.0",  # QuantLinear is not available for peft version > 0.14.0
     "accelerate",
     "pandas",
 ]
