
Commit 1b6d7a7 (1 parent: feaeaa5)

handle requires_grad in shard_moe

Signed-off-by: Yu Chin Fabian Lim <[email protected]>

File tree

1 file changed: +4 −1 lines

plugins/accelerated-moe/src/fms_acceleration_moe/megablocks_utils/shard_moe_utils.py

Lines changed: 4 additions & 1 deletion
@@ -215,6 +215,8 @@ def load_sharded_experts_onto_device(
         else:
             mod_dtype = getattr(mod, name).dtype

+        requires_grad = getattr(mod, name).requires_grad
+
         # the megablocks dmoe experts the expert features to be on DIM_EXPERT.
         # - concat on dim 0 and distribute
         # - cast to the correct dtype for the module
@@ -227,7 +229,8 @@ def load_sharded_experts_onto_device(
             _placements = [Replicate() for _ in range(len(placements))]

         param = torch.nn.Parameter(
-            distribute_tensor(param, device_mesh, _placements)
+            distribute_tensor(param, device_mesh, _placements),
+            requires_grad=requires_grad
         )

         # register the sharded parameter onto the megablocks.dmoe
