@@ -36,108 +36,31 @@ class ScatterMoEAccelerationPlugin(AccelerationPlugin):
     def __init__(self, configurations: Dict[str, Dict]):
         super().__init__(configurations)

-        # arguments for configuring the mixture-of-experts model with defaults
-        # shown below for Mixtral 7x8b
-        # - 1. component class
-        # self._moe_component_cls = self._check_config_and_maybe_check_values(
-        #     key="training.moe.scattermoe.moe_component_class",
-        #     # default="MixtralSparseMoeBlock",
-        #     default="GraniteMoeMoE",
-        # )
-        # - 2. gate_module_name
-        # self._gate_module_name = self._check_config_and_maybe_check_values(
-        #     key="training.moe.scattermoe.moe_gate_module_name", default="gate"
-        # )
-        # # - 3. experts_module_name
-        # self._experts_module_name = self._check_config_and_maybe_check_values(
-        #     key="training.moe.scattermoe.moe_experts_module_name", default="experts"
-        # )
-        # # - 4. mlp version
-        # self._mlp_version = self._check_config_and_maybe_check_values(
-        #     key="training.moe.scattermoe.moe_mlp_impl",
-        #     values=["v1", "v2"],
-        #     default="v2",
-        # )
-
-        # for controlling the type of sharding
-        # self._shard_along_dp = self._check_config_and_maybe_check_values(
-        #     key="training.moe.scattermoe.shard_along_dp",
-        #     values=[True, False],
-        #     default=True,
-        # )
-
-        # ep_size determines the expert parallel sharding
-        # - ep_size is ignored if _shard_along_dp=True
+        # ep_degree determines the expert parallel sharding
+        # - default of 1 means experts are not sharded and operate in pure replication.
         self._ep_degree = self._check_config_and_maybe_check_values(
             key="training.moe.scattermoe.ep_degree",
             default=1,
         )

-        # for the moe_implementation, currently we only use the megablocks
-        # dropless sparse implementation
-        # self._moe_implementation = self._check_config_and_maybe_check_values(
-        #     key="training.moe.scattermoe.moe_implementation",
-        #     values=["dropless_sparse"],
-        #     default="dropless_sparse",
-        # )
-        # self._moe_implementation = self._moe_implementation.split("_")[1]
-
-        # self._load_balancing_loss = self._check_config_and_maybe_check_values(
-        #     key="training.moe.scattermoe.load_balancing_loss",
-        #     values=[True, False],
-        #     default=False,
-        # )
-
     @property
     def requires_custom_loading(self):
         return True

     def model_loader(self, model_name: str, **kwargs):
+
         # guarded
         # Local
         # pylint: disable=import-outside-toplevel
-        # from .megablocks_utils.config_utils import update_mlp_registry
-        # from .megablocks_utils.shard_moe_utils import get_moe_kwargs, shard_moe
-
-        # # - check the config
-        # if self._load_balancing_loss and not hasattr(
-        #     AutoConfig.from_pretrained(model_name), "output_router_logits"
-        # ):
-        #     warnings.warn(
-        #         "load_balancing_loss=True but "
-        #         "the model '{model_name}' config not have 'output_router_logits' "
-        #         "in its config, hence it might not support load balancing and "
-        #         "fallback to load_balancing_loss=False."
-        #     )
-        #     self._load_balancing_loss = False
-
-        # this one does a forward patching on MLP, but needs to be fixed
-        # properly as the load balancing loss is currently not properly
-        # handled
-        # update_mlp_registry(
-        #     self._moe_implementation, self._mlp_version, self._load_balancing_loss
-        # )
         from .utils import prepare_scattemoe

-        # get additional parameters
-        # torch_dtype = kwargs.get("torch_dtype", torch.float32)
-
         # load the model
         model = AutoModelForCausalLM.from_pretrained(model_name, **kwargs)

-        # set this in the config, which will be picked up by the forward
-        # function to go into the load_balancing loss
-        # model.config.output_router_logits = self._load_balancing_loss
-
         rank, world_size = 0, 1
         if torch.distributed.is_initialized():
             world_size = torch.distributed.get_world_size()
             rank = torch.distributed.get_rank()
-        # else:
-        #     # NOTE: or should we do a silent fallback
-        #     raise AssertionError(
-        #         "Megablocks expert parallel only works for distributed training."
-        #     )

         # shard the MOE, and store products required for
         # FSDP configuration
@@ -159,11 +82,6 @@ def model_loader(self, model_name: str, **kwargs):
         # flag from train_args. It will be better to handle this if
         # when we move the sharding to augmentation.

-        # NOTE: Currently, it is a bit troublesome to pass the device_mesh to
-        # the FSDP constructor, so we do not do that.
-        # - therefore FSDP will always shard on world_size over the default process
-        # group
-
         return model

     def get_callbacks_and_ready_for_train(
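For context on what the retained training.moe.scattermoe.ep_degree knob implies at runtime, below is a minimal, hypothetical sketch, not code from this commit: it assumes the world size is divisible by ep_degree and uses torch.distributed.device_mesh.init_device_mesh (available in recent PyTorch releases) to split ranks into a data-parallel dimension and an expert-parallel dimension. The helper name build_expert_parallel_mesh and the mesh dimension names are invented for illustration.

# Hypothetical sketch only -- not part of ScatterMoEAccelerationPlugin.
# It illustrates one common way an ep_degree setting can map onto a 2-D
# process layout: one dimension for data parallelism (replication / FSDP
# sharding), one dimension for expert-parallel sharding.
import torch
from torch.distributed.device_mesh import init_device_mesh


def build_expert_parallel_mesh(ep_degree: int = 1):
    """Split world_size into (dp_degree, ep_degree) mesh dimensions."""
    if not torch.distributed.is_initialized():
        # mirrors the single-process fallback `rank, world_size = 0, 1` above
        return None

    world_size = torch.distributed.get_world_size()
    if world_size % ep_degree != 0:
        raise ValueError(
            f"world_size={world_size} is not divisible by ep_degree={ep_degree}"
        )

    dp_degree = world_size // ep_degree
    # ep_degree=1 (the plugin default) degenerates into a pure
    # data-parallel layout, i.e. every rank holds all experts.
    return init_device_mesh(
        "cuda", (dp_degree, ep_degree), mesh_dim_names=("dp", "ep")
    )

In a layout like this, expert weights would be partitioned along the "ep" dimension while the remaining parameters are replicated or FSDP-sharded along "dp"; with the default ep_degree=1 the expert dimension collapses, matching the "pure replication" comment added in this commit.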