@@ -40,7 +40,11 @@ class ScatterMoEAccelerationPlugin(AccelerationPlugin):
# if we decide to extract the kernels, then we do not need to anymore,
# https://github.com/foundation-model-stack/fms-acceleration/issues/105

restricted_model_archs = ["GraniteMoeForCausalLM", "MixtralForCausalLM"]
restricted_model_archs = [
"GraniteMoeForCausalLM",
"MixtralForCausalLM",
"GraniteMoeSharedForCausalLM",
]

def __init__(self, configurations: Dict[str, Dict]):
super().__init__(configurations)
@@ -76,6 +76,13 @@
SCATTERMOE_SPEC_HAS_GATE,
False,
),
"GraniteMoeSharedForCausalLM": (
"GraniteMoeSharedMoE",
"router",
"input_linear|output_linear|input_linear",
SCATTERMOE_SPEC_HAS_GATE,
False,
),
}


@@ -44,6 +44,7 @@ def register_foak_model_patch_rules(
gpt_bigcode,
granite,
granitemoe,
granitemoeshared,
llama,
mistral,
mixtral,
@@ -54,6 +55,7 @@
*gpt_bigcode.get_mp_rules(base_type),
*granite.get_mp_rules(base_type, config),
*granitemoe.get_mp_rules(base_type),
*granitemoeshared.get_mp_rules(base_type),
*llama.get_mp_rules(base_type, config),
*mistral.get_mp_rules(base_type, config),
*mixtral.get_mp_rules(base_type),
@@ -91,6 +93,7 @@ class FastKernelsAccelerationPlugin(AccelerationPlugin):
"MixtralForCausalLM",
"LlamaForCausalLM",
"MistralForCausalLM",
"GraniteMoeSharedForCausalLM",
]

def __init__(self, configurations: Dict[str, Dict]):
@@ -0,0 +1,117 @@
# Copyright The FMS HF Tuning Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Standard
from functools import partial

# Third Party
from fms_acceleration.model_patcher import (
ModelPatcherRule,
ModelPatcherTrigger,
combine_functions,
combine_triggers,
)

# Local
from ..kernels.unsloth.cross_entropy_loss import FastCrossEntropyLoss
from ..kernels.unsloth.rms_layernorm import fast_rms_layernorm
from ..kernels.unsloth.rope_embedding import fast_rope_embedding
from .utils import KEY_O, KEY_QKV, build_lora_fused_ops, trigger_fused_ops


def get_mp_rules(base_type: str):
"""
Function to access all patch rules in this module.
If it is a forward_builder rule with `base_type` in
its forward builder argument, wrap the forward_builder
function as a partial function with the base_type argument
"""
try:
# Third Party
from transformers.models.granitemoeshared.modeling_granitemoeshared import ( # pylint: disable=import-outside-toplevel
GraniteMoeSharedAttention,
GraniteMoeSharedRMSNorm,
)
except ImportError:
return []

return [
# TODO: have a generic version of this rule
# - do regex on RMSNorm class name
# - check on the tensors required for fast_rms_layernorm
ModelPatcherRule(
rule_id="granitemoeshared-rms",
trigger=ModelPatcherTrigger(check=GraniteMoeSharedRMSNorm),
forward=fast_rms_layernorm,
),
# TODO: have a generic version of this rule
# - do regex on Attention class name
# - have a set of qkv / o module names and check on that
ModelPatcherRule(
rule_id="granitemoeshared-qkvo",
trigger=combine_triggers(
ModelPatcherTrigger(
check=partial(
trigger_fused_ops,
attn_cls=GraniteMoeSharedAttention,
submodule_names=["q_proj", "k_proj", "v_proj"],
)
),
ModelPatcherTrigger(
check=partial(
trigger_fused_ops,
attn_cls=GraniteMoeSharedAttention,
submodule_names=["o_proj"],
)
),
logic="OR",
),
forward_builder=combine_functions(
partial(
build_lora_fused_ops,
submodule_names=["q_proj", "k_proj", "v_proj"],
fused_op=KEY_QKV,
base_type=base_type,
),
partial(
build_lora_fused_ops,
submodule_names=["o_proj"],
fused_op=KEY_O,
base_type=base_type,
),
logic="APPEND",
),
),
ModelPatcherRule(
Contributor: refer to how granite does it
rule_id="granitemoeshared-cross-ent",
import_and_maybe_reload=(
"torch.nn.CrossEntropyLoss",
FastCrossEntropyLoss,
"transformers.models.granitemoeshared.modeling_granitemoeshared",
),
),
# TODO: have a generic version of this rule
# - get the module name
# - check if "apply_rotary_pos_emb" exists
# - patch
ModelPatcherRule(
rule_id="granitemoeshared-rope",
import_and_maybe_reload=(
"transformers.models.granitemoeshared.\
modeling_granitemoeshared.apply_rotary_pos_emb",
fast_rope_embedding,
None,
),
),
]
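
A quick way to sanity-check that the new rules are picked up is to list their IDs; a minimal sketch, assuming the module lives under fms_acceleration_foak.models like its siblings and that "bitsandbytes" is an accepted base_type (neither is confirmed by this diff):

# Hypothetical local check, not part of this PR. The import path and base_type
# value are assumptions inferred from register_foak_model_patch_rules above.
from fms_acceleration_foak.models import granitemoeshared

rules = granitemoeshared.get_mp_rules(base_type="bitsandbytes")
# get_mp_rules returns [] when the installed transformers has no granitemoeshared model
print([rule.rule_id for rule in rules])
# e.g. ['granitemoeshared-rms', 'granitemoeshared-qkvo',
#       'granitemoeshared-cross-ent', 'granitemoeshared-rope']
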
3 changes: 2 additions & 1 deletion scripts/benchmarks/benchmark.py
@@ -319,10 +319,11 @@ def build_args_from_products(products: List[Dict], defaults: Dict):
]
)
elif grad_accum is None and pdtbs is not None:
gas = effective_batch_size // num_gpus // pdtbs
argument_list.extend(
[
"--gradient_accumulation_steps",
str(effective_batch_size // num_gpus // pdtbs),
str(1 if gas == 0 else gas),
Contributor: ok this works, but I don't understand why you need it, because your benches use the same parameters as the existing ones and we don't run into this issue.

Collaborator Author: One thing I noticed is that the experiments continue silently even when some of them fail, and the benchmark report gets generated regardless.

Contributor: Each job is independent. The bench will run all jobs, and the jobs that fail will have empty reports.

]
)
else:
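
The clamp above matters because the integer division can round down to zero when per_device_train_batch_size is large relative to the effective batch size; a worked example with hypothetical numbers:

# Hypothetical values chosen to show the edge case the clamp guards against.
effective_batch_size = 128
num_gpus = 8
pdtbs = 32  # per_device_train_batch_size

gas = effective_batch_size // num_gpus // pdtbs  # 128 // 8 == 16; 16 // 32 == 0
print(gas)                     # 0, not a usable gradient_accumulation_steps value
print(1 if gas == 0 else gas)  # 1, the value the script now passes
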
43 changes: 42 additions & 1 deletion scripts/benchmarks/scenarios-moe.yaml
@@ -36,7 +36,7 @@


scenarios:
- name: accelerated-moe-full
- name: accelerated-moe-full-granite-moe
framework_config:
- # without acceleration
- moe-scattermoe-granite-ep1
@@ -59,6 +59,47 @@ scenarios:
model_name_or_path:
- 'ibm-granite/granite-3.0-3b-a800m-instruct'

- name: accelerated-moe-full-granite-moe-shared
framework_config:
- # without acceleration
- moe-scattermoe-granite-ep1
- moe-scattermoe-granite-ep2
- moe-scattermoe-granite-ep4
- moe-scattermoe-granite-ep1-padding-free
- moe-scattermoe-granite-ep1-padding-free-foak
- moe-scattermoe-granite-ep2-padding-free
- moe-scattermoe-granite-ep2-padding-free-foak
- moe-scattermoe-granite-ep4-padding-free
- moe-scattermoe-granite-ep4-padding-free-foak
arguments:
learning_rate: 5e-5
torch_dtype: bfloat16
gradient_accumulation_steps: null
per_device_train_batch_size: 8
logging_steps: 1
packing: False
adam_epsilon: 1e-8
model_name_or_path:
- 'ibm-research/moe-7b-1b-active-shared-experts'
Contributor: You shouldn't copy and paste this; just add to model_name_or_path in the existing scenarios:

model_name_or_path:
  - 'ibm-granite/granite-3.0-3b-a800m-instruct'
  - 'ibm-research/moe-7b-1b-active-shared-experts'

- name: accelerated-moe-full-granite-moe-shared-small
framework_config:
- # without acceleration
- moe-scattermoe-granite-ep4
- moe-scattermoe-granite-ep4-padding-free
- moe-scattermoe-granite-ep4-padding-free-foak
arguments:
learning_rate: 5e-5
torch_dtype: bfloat16
Contributor: Same here, don't copy and paste; if it's all the same arguments there is no need. I don't understand why you need this different bench.

Collaborator Author: I wanted to run only the MoE shared model, so I had to copy and paste. Is there a way to subselect a model along with a scenario using the scenario filter? Apologies if I have missed that.

Contributor: Sorry, you can't. If you want to do ad hoc testing, just comment out the other models you don't want to test. For the official bench we need to update all models, because we only version one set of requirements for reproducibility, and we can't have partial benches running; otherwise there will be inconsistency.

gradient_accumulation_steps: null
per_device_train_batch_size: 8
logging_steps: 1
packing: False
adam_epsilon: 1e-8
model_name_or_path:
- 'ibm-research/moe-7b-1b-active-shared-experts'
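
Following the review suggestion, the two duplicated scenarios could collapse into the existing one by listing both checkpoints under model_name_or_path; a sketch that only reuses names and arguments already present in this file:

# Sketch of the consolidation suggested in the review; same framework_config and
# arguments as accelerated-moe-full-granite-moe above.
- name: accelerated-moe-full-granite-moe
  framework_config:
    - # without acceleration
    - moe-scattermoe-granite-ep1
    - moe-scattermoe-granite-ep2
    - moe-scattermoe-granite-ep4
    - moe-scattermoe-granite-ep1-padding-free
    - moe-scattermoe-granite-ep1-padding-free-foak
    - moe-scattermoe-granite-ep2-padding-free
    - moe-scattermoe-granite-ep2-padding-free-foak
    - moe-scattermoe-granite-ep4-padding-free
    - moe-scattermoe-granite-ep4-padding-free-foak
  arguments:
    learning_rate: 5e-5
    torch_dtype: bfloat16
    gradient_accumulation_steps: null
    per_device_train_batch_size: 8
    logging_steps: 1
    packing: False
    adam_epsilon: 1e-8
    model_name_or_path:
      - 'ibm-granite/granite-3.0-3b-a800m-instruct'
      - 'ibm-research/moe-7b-1b-active-shared-experts'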


- name: accelerated-moe-full-mixtral
framework_config:
- # without acceleration