
Commit b45fdda: more cleanup

Signed-off-by: Yu Chin Fabian Lim <[email protected]>

1 parent: 6d1e646

File tree: 4 files changed (+76 / -43 lines)

plugins/accelerated-moe/README.md

Lines changed: 12 additions & 1 deletion
@@ -20,7 +20,7 @@ Run the below in the top-level directory of this repo:
   tox -e run-benches \
     -x testenv:run-benches.deps+="-r plugins/accelerated-moe/requirements-khd.txt" \
     -- \
-    "1 2 4 8" 128 benchmark_outputs scenarios-granite.yaml accelerated-moe-scatter
+    "1 2 4 8" 128 benchmark_outputs scenarios-moe.yaml accelerated-moe-scatter
 ```
 or run the larger `Mixtral-8x7B` bench:
 ```
@@ -43,10 +43,21 @@ bash scripts/run_benchmarks.sh \
 ....
 ```
 
+
 ### Triton Kernel Dependencies
 
 Currently we do not copy the `scattermoe` kernels into this repository, so this is an additional manual install:
 
 ```
 # this will install the kernel-hyperdrive fork with the scattermoe triton kernels
 pip install -r requirements-khd.txt
+
+### Known Issues
+
+These are some known issues that are not yet resolved:
+- The design currently swaps the mixture-of-experts module with [ScatterMoE](./src/fms_acceleration_moe/utils/scattermoe.py). This affects the `state_dict` of the model, so any saved checkpoint may need to be converted back to the original format.
+- The dependency on the external `kernel-hyperdrive` repository should eventually be removed.
+- Currently only *sharded* `safetensor` (non-GGUF) MoE checkpoints can be loaded. This is a reasonable assumption, since MoE checkpoints are typically above the size limit that prevents them from being saved into a single checkpoint file.
+- Only `StateDictType.SHARDED_STATE_DICT` is supported, because the implementation uses `DTensors`, which have limited support for full state dicts; sharded state dicts are also the most efficient option.
+
+
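Note on the `SHARDED_STATE_DICT` known issue above: the training setup has to ask FSDP for sharded (per-rank) checkpoints. The snippet below is a minimal illustrative sketch using the plain PyTorch FSDP API; it is not code from this repo, and the wrapping details are assumptions.

```python
# Illustrative sketch only (not repo code): configure FSDP so that
# model.state_dict() returns sharded per-rank shards, matching the
# plugin's SHARDED_STATE_DICT-only support.
from torch.distributed.fsdp import (
    FullyShardedDataParallel as FSDP,
    ShardedStateDictConfig,
    StateDictType,
)

def use_sharded_state_dict(model: FSDP) -> dict:
    FSDP.set_state_dict_type(
        model,
        StateDictType.SHARDED_STATE_DICT,
        state_dict_config=ShardedStateDictConfig(offload_to_cpu=True),
    )
    # each rank now returns only its own shards instead of a full state dict
    return model.state_dict()
```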

plugins/accelerated-moe/src/fms_acceleration_moe/framework_plugin_scattermoe.py

Lines changed: 14 additions & 41 deletions
@@ -19,16 +19,19 @@
 # Third Party
 from fms_acceleration import AccelerationPlugin
 from transformers import AutoConfig, AutoModelForCausalLM
-import torch
+import torch
 
+from .utils import (
+    prepare_scattemoe, patch_huggingface_save_and_load_for_dtensors, patch_torch_optim_foreach_to_not_apply_to_dtensors
+)
 
 # pylint: disable=too-many-instance-attributes
 class ScatterMoEAccelerationPlugin(AccelerationPlugin):
 
     # NOTE: its not packaged properly so, "importlib.util.find_spec('khd')"
     # returns but "importlib.metadata.version('kernel-hyperdrive') is needed"
     # require_packages = {"khd"}
-
+    # NOTE: will address this later if we remove the dependency on kernel-hyperdrive
     restricted_model_archs = [
         'GraniteMoeForCausalLM', 'MixtralForCausalLM'
     ]
@@ -49,11 +52,6 @@ def requires_custom_loading(self):
 
     def model_loader(self, model_name: str, **kwargs):
 
-        # guarded
-        # Local
-        # pylint: disable=import-outside-toplevel
-        from .utils import prepare_scattemoe
-
         # load the model
         model = AutoModelForCausalLM.from_pretrained(model_name, **kwargs)
 
@@ -62,19 +60,14 @@ def model_loader(self, model_name: str, **kwargs):
         world_size = torch.distributed.get_world_size()
         rank = torch.distributed.get_rank()
 
-        # shard the MOE, and store products required for
-        # FSDP configuration
-        # pylint: disable=unused-variable
+        # shard the MOE, and store the component names, eventually needed
+        # to configure the FSDP
         self._moe_component_module_names = prepare_scattemoe(
             model,
-            # self._moe_component_cls,
             checkpoint_name_or_path=model_name,
             rank=rank,
             world_size=world_size,
             ep_degree=self._ep_degree,
-            # shared_mesh_dim=self._shard_along_dp,
-            # router_name=self._gate_module_name,
-            # expert_name=self._experts_module_name,
             mixed_precision=False, # Currently this is hardcoded to OFF
         )
 
@@ -93,17 +86,6 @@ def get_callbacks_and_ready_for_train(
             accelerator is not None
             and getattr(accelerator.state, "fsdp_plugin", None) is not None
         ):
-            # TODO: refactor
-            # for newer torch that enables foreach for Dtensors we need to remove it
-            from torch.optim.optimizer import _foreach_supported_types
-
-            i = 0
-            while i < len(_foreach_supported_types):
-                x = _foreach_supported_types[i]
-                if x.__name__ == 'DTensor':
-                    _foreach_supported_types.pop(i)
-                else:
-                    i += 1
 
             # - use an internal function call to get the no split
             #   module names, which are typically layers
@@ -115,22 +97,13 @@
                 if layer.__class__.__name__ in _layers
             ]
 
-            # Third Party
-            # TODO: REFACTOR
-            from fms_acceleration.model_patcher import patch_target_module
-
-            # Local
-            from .utils.checkpoint_utils import (
-                load_fsdp_model,
-                load_fsdp_optimizer,
-                save_fsdp_model,
-                save_fsdp_optimizer,
-            )
-
-            patch_target_module("transformers.trainer.save_fsdp_model", save_fsdp_model)
-            patch_target_module("transformers.trainer.save_fsdp_optimizer", save_fsdp_optimizer)
-            patch_target_module("transformers.trainer.load_fsdp_model", load_fsdp_model)
-            patch_target_module("transformers.trainer.load_fsdp_optimizer", load_fsdp_optimizer)
+            # call this to patch the HF save and load functions to be able
+            # to save DTensors properly
+            patch_huggingface_save_and_load_for_dtensors()
+
+            # call this to patch torch optim to not use
+            # foreach for dtensors
+            patch_torch_optim_foreach_to_not_apply_to_dtensors()
 
         return callbacks
 
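For context, the `prepare_scattemoe` call in `model_loader` above can be exercised on its own roughly as follows. This is a minimal sketch assuming a `torch.distributed` process group is already initialized (for example via `torchrun`); the checkpoint name and `ep_degree` value are illustrative, and only the keyword arguments visible in the diff are used.

```python
# Minimal sketch of the prepare_scattemoe call shown in model_loader above.
# Assumes torch.distributed is initialized (e.g. launched with torchrun);
# the checkpoint and ep_degree below are illustrative values.
import torch
from transformers import AutoModelForCausalLM
from fms_acceleration_moe.utils import prepare_scattemoe

model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"  # example MoE checkpoint
model = AutoModelForCausalLM.from_pretrained(model_name)

moe_component_module_names = prepare_scattemoe(
    model,
    checkpoint_name_or_path=model_name,   # sharded safetensor checkpoint
    rank=torch.distributed.get_rank(),
    world_size=torch.distributed.get_world_size(),
    ep_degree=2,                          # expert-parallel degree (illustrative)
    mixed_precision=False,                # hardcoded to OFF in the plugin
)
```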

plugins/accelerated-moe/src/fms_acceleration_moe/utils/__init__.py

Lines changed: 38 additions & 1 deletion
@@ -1 +1,38 @@
-from .scattermoe_prepare import prepare_scattemoe
+# Copyright The FMS HF Tuning Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .scattermoe_prepare import prepare_scattemoe
+from .checkpoint_utils import patch_huggingface_save_and_load_for_dtensors
+
+# this is a special patch function to disable foreach for
+# dtensors, which has been introduced since torch 2.4,
+# because it causes problems in the optimizer
+# lerp.
+
+def patch_torch_optim_foreach_to_not_apply_to_dtensors():
+    # guarded import.
+    # _foreach_supported_types is the list of supported types; we remove
+    # DTensor from it so the optimizer falls back to
+    # per-parameter updates
+    from torch.optim.optimizer import _foreach_supported_types
+
+    i = 0  # list index
+    while i < len(_foreach_supported_types):
+        x = _foreach_supported_types[i]
+        if x.__name__ == 'DTensor':
+            # pop from list
+            _foreach_supported_types.pop(i)
+        else:
+            i += 1
+
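As an aside, the same effect can be sketched more concisely with an in-place filter. This is only an illustrative equivalent (assuming torch >= 2.4, where `DTensor` appears in `_foreach_supported_types`), not code from the repo:

```python
# Illustrative equivalent of the patch above: filter DTensor out of the
# foreach-supported types in place, so any other module holding a reference
# to the same list object also sees the change.
from torch.optim.optimizer import _foreach_supported_types

_foreach_supported_types[:] = [
    t for t in _foreach_supported_types if t.__name__ != "DTensor"
]
```

The in-place slice assignment (rather than rebinding the name) matters for the same reason the original loop uses `pop`: the list object itself is shared with `torch.optim`.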

plugins/accelerated-moe/src/fms_acceleration_moe/utils/checkpoint_utils.py

Lines changed: 12 additions & 0 deletions
@@ -150,3 +150,15 @@ def load_fsdp_optimizer(
         group["initial_lr"] = 0.0
         group["eps"] = 1e-8
         group["weight_decay"] = 0.0
+
+# function to replace various trainer functions in HF with the ones
+# above
+def patch_huggingface_save_and_load_for_dtensors():
+    # Third Party
+    # NOTE: this is really a global replacement, which we use the patcher
+    # to do
+    from fms_acceleration.model_patcher import patch_target_module
+    patch_target_module("transformers.trainer.save_fsdp_model", save_fsdp_model)
+    patch_target_module("transformers.trainer.save_fsdp_optimizer", save_fsdp_optimizer)
+    patch_target_module("transformers.trainer.load_fsdp_model", load_fsdp_model)
+    patch_target_module("transformers.trainer.load_fsdp_optimizer", load_fsdp_optimizer)
