Currently supports *mixed precision*; when turned on, the router and the sharded experts will be upcast.
- However, this is hard-coded to off at the moment.
- The FSDP mixed precision works independently of the MoE one (see the sketch below).
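
Below is a minimal sketch of how FSDP's own mixed precision could be configured alongside the MoE sharding; the model and dtype choices are placeholders, and it assumes the process group has already been initialized (e.g. via `torchrun`).

```python
import torch
import torch.nn as nn
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, MixedPrecision

# FSDP's mixed precision is configured on the wrapper itself and applies to
# whatever parameters FSDP manages, independently of any upcasting done for
# the MoE router / sharded experts.
mp_policy = MixedPrecision(
    param_dtype=torch.bfloat16,   # dtype used for forward/backward compute
    reduce_dtype=torch.float32,   # dtype used for gradient reduction
    buffer_dtype=torch.bfloat16,  # dtype used for buffers
)

model = nn.Linear(16, 16)  # stand-in for the MoE model after the experts are sharded

sharded_model = FSDP(
    model,
    mixed_precision=mp_policy,
    use_orig_params=True,  # commonly required when DTensor parameters are present
)
```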
Not all of the features of `megablocks` are incorporated; some restrictions of the current integration are listed below:
- the data-parallel `dp_mesh` is currently not passed to the `FSDP` constructor, so `FSDP` will always shard over the default process group (i.e., over the full world size); see the device-mesh sketch after this list.
- currently only supports loading *sharded* `safetensors` (non-GGUF) MoE checkpoints. This is a reasonable assumption, since MoE checkpoints typically exceed the size limit for a single checkpoint file; see the checkpoint-loading sketch after this list.
- only supports the *dropless sparse* MLPs in the `megablocks` package; other variants, such as non-dropless and grouped compute, are not currently integrated.
- `shard_moe` may not scale well to larger models, because the current implementation uses `torch.concat` to join all the expert weights together before passing them to `torch.distributed` to be sharded. This is done redundantly on every device, so it is inefficient; see the sharding sketch after this list.
- currently only supports `StateDictType.SHARDED_STATE_DICT`, because the implementation uses `DTensor`s, which have limited support for full state dicts. This is not a real limitation, since sharded state dicts are also the most efficient option; see the checkpoint-saving sketch after this list.
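
The bullets above reference a few sketches. First, a minimal device-mesh sketch of how a data-parallel mesh could be passed to `FSDP` so that it shards only over the `dp` dimension instead of the default process group. The mesh shape, dimension names, and model are assumptions, and the `device_mesh=` argument requires a recent PyTorch; the commented-out `process_group=` form is the older equivalent.

```python
import torch.nn as nn
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

# Assume torch.distributed is initialized (e.g. via torchrun) with world_size = 8,
# split into 4 data-parallel replicas x 2 expert-parallel shards.
mesh = init_device_mesh("cuda", (4, 2), mesh_dim_names=("dp", "ep"))
dp_mesh = mesh["dp"]

model = nn.Linear(16, 16)  # stand-in for the MoE model

# Sharding is restricted to the data-parallel sub-mesh instead of the
# default process group that spans the full world size.
sharded_model = FSDP(model, device_mesh=dp_mesh, use_orig_params=True)
# Older PyTorch equivalent:
# sharded_model = FSDP(model, process_group=dp_mesh.get_group(), use_orig_params=True)
```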
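
Next, a checkpoint-loading sketch of what reading a *sharded* `safetensors` checkpoint looks like. The checkpoint directory and the `"experts"` key pattern are hypothetical, but the `model.safetensors.index.json` / `weight_map` layout is the standard Hugging Face sharded format.

```python
import json
import os
from collections import defaultdict

from safetensors.torch import load_file

checkpoint_dir = "path/to/moe-checkpoint"  # hypothetical checkpoint directory

# The index file maps every weight name to the shard file that contains it.
with open(os.path.join(checkpoint_dir, "model.safetensors.index.json")) as f:
    weight_map = json.load(f)["weight_map"]

# Group the expert weights by shard so each shard file is read only once.
names_per_shard = defaultdict(list)
for name, shard_file in weight_map.items():
    if "experts" in name:  # hypothetical naming convention for expert weights
        names_per_shard[shard_file].append(name)

expert_state = {}
for shard_file, names in names_per_shard.items():
    shard = load_file(os.path.join(checkpoint_dir, shard_file))
    for name in names:
        expert_state[name] = shard[name]
```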
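
The sharding sketch below illustrates the scaling concern with `shard_moe` (shapes and dimension choices are illustrative only): every rank first materializes the full concatenated expert weight, and `distribute_tensor` then keeps only the local shard, so most of the allocated memory is thrown away.

```python
import torch
import torch.distributed as dist
from torch.distributed._tensor import Shard, distribute_tensor
from torch.distributed.device_mesh import init_device_mesh

# Assume torch.distributed is initialized (e.g. via torchrun).
device_mesh = init_device_mesh("cuda", (dist.get_world_size(),))

# Illustrative stand-in for the per-expert weights of a single MoE layer.
num_experts, hidden, ffn = 8, 16, 32
expert_weights = [torch.randn(hidden, ffn, device="cuda") for _ in range(num_experts)]

# Every rank builds the full concatenated tensor...
full_weight = torch.concat(expert_weights, dim=0)  # (num_experts * hidden, ffn)

# ...but only the local shard along dim 0 is kept after distribution, so the
# full allocation is redundant on every device.
sharded_weight = distribute_tensor(full_weight, device_mesh, placements=[Shard(0)])
```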
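
Finally, a checkpoint-saving sketch for `StateDictType.SHARDED_STATE_DICT`. It continues from the FSDP-wrapping sketches above (`sharded_model` is the FSDP-wrapped model), the output path is hypothetical, and `dcp.save` assumes a recent `torch.distributed.checkpoint`.

```python
import torch.distributed.checkpoint as dcp
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, StateDictType

# With DTensor-backed expert weights only the sharded form of the state dict
# is supported; each rank holds and saves just its own shards.
with FSDP.state_dict_type(sharded_model, StateDictType.SHARDED_STATE_DICT):
    state_dict = {"model": sharded_model.state_dict()}

# torch.distributed.checkpoint writes one file per rank rather than gathering
# the full state dict onto a single rank, which is also the most efficient path.
dcp.save(state_dict, checkpoint_id="path/to/save-dir")  # hypothetical output directory
```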