- `--fast_moe`: trains MoE models in parallel with [Scatter MoE kernels](https://github.com/foundation-model-stack/fms-acceleration/tree/main/plugins/accelerated-moe#fms-acceleration-for-mixture-of-experts), increasing throughput and decreasing memory usage.

Notes:
* `quantized_lora_config` requires that it be used along with the LoRA tuning technique. See the [LoRA tuning section](https://github.com/foundation-model-stack/fms-hf-tuning/tree/main?tab=readme-ov-file#lora-tuning-example) for the LoRA parameters to pass (a hedged sketch combining the two appears after these notes).
* Notes on Multipack
  - works only for *multi-gpu*.
  - currently only includes the version of *multipack* optimized for linear attention implementations like *flash-attn*.
* Notes on Fast MoE
  - `--fast_moe` takes either an integer or a boolean value (see the example launch command after these notes).
    - When an integer `n` is passed, expert parallel sharding is enabled with an expert parallel degree of `n`, along with the Scatter MoE kernels.
    - When a boolean is passed, the expert parallel degree defaults to 1 and the behaviour is as follows:
      - if `True`, the Scatter MoE kernels are used with experts sharded according to the top-level sharding protocol (e.g. FSDP).
      - if `False`, the Scatter MoE kernels are used with complete replication of experts across ranks.
  - `world_size` must be divisible by the `ep_degree`.
  - The `number of experts` in the MoE module must be divisible by the `ep_degree`.
  - Running fast moe modifies the state dict of the model, so checkpoints must be post-processed. This happens automatically, and the converted checkpoint can be found in the `hf_converted_checkpoint` folder within every saved checkpoint directory. Alternatively, the same conversion can be performed manually through the [checkpoint utils](https://github.com/foundation-model-stack/fms-acceleration/blob/main/plugins/accelerated-moe/src/fms_acceleration_moe/utils/checkpoint_utils.py) script (a hedged sketch follows below).
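
To make the constraints above concrete, here is a minimal launch sketch, assuming the `accelerate launch` entry point into `tuning/sft_trainer.py` used elsewhere in this README; the model name, data path, output directory, and accelerate config file are placeholders, and any other flags your setup requires are omitted.

```bash
# Illustrative sketch only: 4 GPUs (world_size = 4) with expert parallel degree 2.
# This satisfies the constraints above: world_size (4) is divisible by ep_degree (2),
# and a Mixtral-style model with 8 experts is also divisible by ep_degree (2).
accelerate launch \
    --num_processes=4 \
    --config_file fixtures/accelerate_fsdp_defaults.yaml \
    tuning/sft_trainer.py \
    --model_name_or_path mistralai/Mixtral-8x7B-v0.1 \
    --training_data_path $TRAIN_DATA_PATH \
    --output_dir $OUTPUT_DIR \
    --torch_dtype bfloat16 \
    --fast_moe 2

# Passing a boolean instead keeps ep_degree = 1:
#   --fast_moe True    # Scatter MoE kernels, experts sharded by the top-level protocol (e.g. FSDP)
#   --fast_moe False   # Scatter MoE kernels, experts fully replicated across ranks
```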
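For the checkpoint note above, a hedged sketch of locating the automatically converted weights and of invoking the manual conversion; the module-style invocation and its positional arguments (checkpoint directory, output directory, original model) are assumptions, so verify them against the checkpoint utils script itself.

```bash
# The automatically post-processed (Hugging Face compatible) weights, written
# alongside every saved checkpoint ("checkpoint-100" is a placeholder step):
ls $OUTPUT_DIR/checkpoint-100/hf_converted_checkpoint

# Hypothetical manual conversion via the checkpoint utils script; the argument
# order (checkpoint dir, output dir, original model) is assumed, not confirmed.
python -m fms_acceleration_moe.utils.checkpoint_utils \
    $OUTPUT_DIR/checkpoint-100 \
    $OUTPUT_DIR/checkpoint-100/hf_converted_checkpoint \
    mistralai/Mixtral-8x7B-v0.1
```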
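For the `quantized_lora_config` note, a minimal sketch combining LoRA parameters with a GPTQ-quantized base model; the `--auto_gptq triton_v2` value and the accompanying dtype flags are assumptions based on the acceleration section of this README, so confirm the exact flags against the LoRA tuning section linked above.

```bash
# Illustrative sketch: quantized base model + LoRA adapters on a single GPU.
# The base model must already be GPTQ-quantized; paths and values are placeholders.
python tuning/sft_trainer.py \
    --model_name_or_path $GPTQ_QUANTIZED_MODEL_PATH \
    --training_data_path $TRAIN_DATA_PATH \
    --output_dir $OUTPUT_DIR \
    --peft_method lora \
    --r 8 \
    --lora_alpha 16 \
    --lora_dropout 0.05 \
    --target_modules q_proj v_proj \
    --torch_dtype float16 \
    --fp16 \
    --auto_gptq triton_v2   # assumed quantized_lora_config flag; verify before use
```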