Commit f06964d
feat: FSDP2 with MoE kernels and expert parallel (#157)
* fsdp2 patches
  Signed-off-by: Mehant Kammakomati <[email protected]>
* fsdp2 patches
  Signed-off-by: Mehant Kammakomati <[email protected]>
* fsdp2 patches
  Signed-off-by: Mehant Kammakomati <[email protected]>
* fsdp2 patches
  Signed-off-by: Mehant Kammakomati <[email protected]>

---------

Signed-off-by: Mehant Kammakomati <[email protected]>
1 parent: 4993069

File tree: 3 files changed, +349 -3 lines

plugins/accelerated-moe/src/fms_acceleration_moe/framework_plugin_scattermoe.py

Lines changed: 19 additions & 3 deletions
@@ -25,6 +25,8 @@
 
 # Local
 from .utils import (
+    patch_huggingface_clip_grad_norm_fsdp2,
+    patch_huggingface_fsdp2_load_full_state_dict,
     patch_huggingface_save_and_load_for_dtensors,
     patch_torch_optim_foreach_to_not_apply_to_dtensors,
     prepare_scattermoe,
@@ -144,9 +146,23 @@ def get_callbacks_and_ready_for_train(
         # to save DTensors properly
         patch_huggingface_save_and_load_for_dtensors()
 
-        # call this to patch torch optim to not use
-        # foreach for dtensors
-        patch_torch_optim_foreach_to_not_apply_to_dtensors()
+        if (
+            not hasattr(accelerator.state.fsdp_plugin, "fsdp_version")
+            or accelerator.state.fsdp_plugin.fsdp_version == 1
+        ):
+            # call this to patch torch optim to not use foreach
+            # for dtensors, only when FSDP v1 is used;
+            # FSDP v2 with transformers does implicit replication to convert all to DTensors
+            # before the grad-norm and optimizer.step() operations
+            patch_torch_optim_foreach_to_not_apply_to_dtensors()
+
+        if (
+            hasattr(accelerator.state.fsdp_plugin, "fsdp_version")
+            and accelerator.state.fsdp_plugin.fsdp_version == 2
+        ):
+            # when EP and FSDP v2 are used
+            patch_huggingface_clip_grad_norm_fsdp2(accelerator)
+            patch_huggingface_fsdp2_load_full_state_dict()
 
         return callbacks
 
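The gating above keys off accelerate's fsdp_plugin state: a missing fsdp_version attribute (older accelerate releases) is treated the same as version 1. Below is a minimal sketch of that dispatch, collapsing the two hasattr checks into a single getattr with a default; the wrapper name apply_dtensor_patches is hypothetical, while the patch helpers are the ones imported above.

# Sketch only: `apply_dtensor_patches` is a hypothetical wrapper, not a
# function from the plugin; the patch helpers come from .utils above.
from .utils import (
    patch_huggingface_clip_grad_norm_fsdp2,
    patch_huggingface_fsdp2_load_full_state_dict,
    patch_torch_optim_foreach_to_not_apply_to_dtensors,
)

def apply_dtensor_patches(accelerator):
    # A missing attribute means an older accelerate, i.e. FSDP v1.
    version = getattr(accelerator.state.fsdp_plugin, "fsdp_version", 1)
    if version == 1:
        # FSDP v1: params may mix DTensors and plain tensors, so keep
        # torch.optim's foreach fast path away from DTensors.
        patch_torch_optim_foreach_to_not_apply_to_dtensors()
    elif version == 2:
        # FSDP v2 with expert parallel: transformers already converts
        # everything to DTensors before grad-norm and optimizer.step(),
        # so patch grad clipping and full-state-dict loading instead.
        patch_huggingface_clip_grad_norm_fsdp2(accelerator)
        patch_huggingface_fsdp2_load_full_state_dict()

Writing the check as getattr(..., "fsdp_version", 1) makes the v1 fallback explicit without repeating the hasattr test.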

plugins/accelerated-moe/src/fms_acceleration_moe/utils/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -14,6 +14,8 @@
 
 # Local
 from .checkpoint_utils import (
+    patch_huggingface_clip_grad_norm_fsdp2,
+    patch_huggingface_fsdp2_load_full_state_dict,
     patch_huggingface_save_and_load_for_dtensors,
     recover_safetensors_from_dcp,
 )
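With this re-export in place, downstream code can import the new FSDP v2 helpers from the package's utils namespace rather than reaching into checkpoint_utils directly; a usage sketch, assuming the installed package name fms_acceleration_moe as in the paths above:

# Usage sketch: import the new patch helpers via the re-export.
from fms_acceleration_moe.utils import (
    patch_huggingface_clip_grad_norm_fsdp2,
    patch_huggingface_fsdp2_load_full_state_dict,
)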
