@@ -40,7 +40,11 @@ class ScatterMoEAccelerationPlugin(AccelerationPlugin):
# if we decide to extract the kernels, then we do not need to anymore,
# https://github.com/foundation-model-stack/fms-acceleration/issues/105

restricted_model_archs = ["GraniteMoeForCausalLM", "MixtralForCausalLM"]
restricted_model_archs = [
"GraniteMoeForCausalLM",
"MixtralForCausalLM",
"GraniteMoeSharedForCausalLM",
]

def __init__(self, configurations: Dict[str, Dict]):
super().__init__(configurations)
@@ -76,6 +76,13 @@
SCATTERMOE_SPEC_HAS_GATE,
False,
),
"GraniteMoeSharedForCausalLM": (
"GraniteMoeSharedMoE",
"router",
"input_linear|output_linear|input_linear",
SCATTERMOE_SPEC_HAS_GATE,
False,
),
}


@@ -44,6 +44,7 @@ def register_foak_model_patch_rules(
gpt_bigcode,
granite,
granitemoe,
granitemoeshared,
llama,
mistral,
mixtral,
@@ -54,6 +55,7 @@
*gpt_bigcode.get_mp_rules(base_type),
*granite.get_mp_rules(base_type, config),
*granitemoe.get_mp_rules(base_type),
*granitemoeshared.get_mp_rules(base_type),
*llama.get_mp_rules(base_type, config),
*mistral.get_mp_rules(base_type, config),
*mixtral.get_mp_rules(base_type),
@@ -91,6 +93,7 @@ class FastKernelsAccelerationPlugin(AccelerationPlugin):
"MixtralForCausalLM",
"LlamaForCausalLM",
"MistralForCausalLM",
"GraniteMoeSharedForCausalLM",
]

def __init__(self, configurations: Dict[str, Dict]):
@@ -0,0 +1,117 @@
# Copyright The FMS HF Tuning Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Standard
from functools import partial

# Third Party
from fms_acceleration.model_patcher import (
ModelPatcherRule,
ModelPatcherTrigger,
combine_functions,
combine_triggers,
)

# Local
from ..kernels.unsloth.cross_entropy_loss import FastCrossEntropyLoss
from ..kernels.unsloth.rms_layernorm import fast_rms_layernorm
from ..kernels.unsloth.rope_embedding import fast_rope_embedding
from .utils import KEY_O, KEY_QKV, build_lora_fused_ops, trigger_fused_ops


def get_mp_rules(base_type: str):
"""
Function to access all patch rules in this module.
If it is a forward_builder rule with `base_type` in
its forward builder argument, wrap the forward_builder
function as a partial function with the base_type argument
"""
try:
# Third Party
from transformers.models.granitemoeshared.modeling_granitemoeshared import ( # pylint: disable=import-outside-toplevel
GraniteMoeSharedAttention,
GraniteMoeSharedRMSNorm,
)
except ImportError:
return []

return [
# TODO: have a generic version of this rule
# - do regex on RMSNorm class name
# - check on the tensors required for fast_rms_layernorm
ModelPatcherRule(
rule_id="granitemoeshared-rms",
trigger=ModelPatcherTrigger(check=GraniteMoeSharedRMSNorm),
forward=fast_rms_layernorm,
),
# TODO: have a generic version of this rule
# - do regex on Attention class name
# - have a set of qkv / o module names and check on that
ModelPatcherRule(
rule_id="granitemoeshared-qkvo",
trigger=combine_triggers(
ModelPatcherTrigger(
check=partial(
trigger_fused_ops,
attn_cls=GraniteMoeSharedAttention,
submodule_names=["q_proj", "k_proj", "v_proj"],
)
),
ModelPatcherTrigger(
check=partial(
trigger_fused_ops,
attn_cls=GraniteMoeSharedAttention,
submodule_names=["o_proj"],
)
),
logic="OR",
),
forward_builder=combine_functions(
partial(
build_lora_fused_ops,
submodule_names=["q_proj", "k_proj", "v_proj"],
fused_op=KEY_QKV,
base_type=base_type,
),
partial(
build_lora_fused_ops,
submodule_names=["o_proj"],
fused_op=KEY_O,
base_type=base_type,
),
logic="APPEND",
),
),
ModelPatcherRule(
Contributor: refer to how granite does it
rule_id="granitemoeshared-cross-ent",
import_and_maybe_reload=(
"torch.nn.CrossEntropyLoss",
FastCrossEntropyLoss,
"transformers.models.granitemoeshared.modeling_granitemoeshared",
),
),
# TODO: have a generic version of this rule
# - get the module name
# - check if "apply_rotary_pos_emb" exists
# - patch
ModelPatcherRule(
rule_id="granitemoeshared-rope",
import_and_maybe_reload=(
"transformers.models.granitemoeshared.\
modeling_granitemoeshared.apply_rotary_pos_emb",
fast_rope_embedding,
None,
),
),
]
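
A quick way to sanity-check that the new rules are picked up is to list their IDs; a minimal sketch, assuming the module lives under fms_acceleration_foak.models like its siblings and that "bitsandbytes" is an accepted base_type (neither is confirmed by this diff):

# Hypothetical local check, not part of this PR. The import path and base_type
# value are assumptions inferred from register_foak_model_patch_rules above.
from fms_acceleration_foak.models import granitemoeshared

rules = granitemoeshared.get_mp_rules(base_type="bitsandbytes")
# get_mp_rules returns [] when the installed transformers has no granitemoeshared model
print([rule.rule_id for rule in rules])
# e.g. ['granitemoeshared-rms', 'granitemoeshared-qkvo',
#       'granitemoeshared-cross-ent', 'granitemoeshared-rope']
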
3 changes: 2 additions & 1 deletion scripts/benchmarks/benchmark.py
@@ -319,10 +319,11 @@ def build_args_from_products(products: List[Dict], defaults: Dict):
]
)
elif grad_accum is None and pdtbs is not None:
gas = effective_batch_size // num_gpus // pdtbs
argument_list.extend(
[
"--gradient_accumulation_steps",
str(effective_batch_size // num_gpus // pdtbs),
str(1 if gas == 0 else gas),
Contributor: ok this works, but I don't understand why you need it, because your benches use the same parameters as the existing ones and we don't run into this issue.

Collaborator Author: One thing I noticed is that the experiments continue silently even when some of them fail, and the benchmark report gets generated regardless.

Contributor: Each job is independent. The bench will run all jobs, and the jobs that fail will have empty reports.

]
)
else:
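
The clamp above matters because the integer division can round down to zero when per_device_train_batch_size is large relative to the effective batch size; a worked example with hypothetical numbers:

# Hypothetical values chosen to show the edge case the clamp guards against.
effective_batch_size = 128
num_gpus = 8
pdtbs = 32  # per_device_train_batch_size

gas = effective_batch_size // num_gpus // pdtbs  # 128 // 8 == 16; 16 // 32 == 0
print(gas)                     # 0, not a usable gradient_accumulation_steps value
print(1 if gas == 0 else gas)  # 1, the value the script now passes
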
43 changes: 42 additions & 1 deletion scripts/benchmarks/scenarios-moe.yaml
@@ -36,7 +36,7 @@


scenarios:
- name: accelerated-moe-full
- name: accelerated-moe-full-granite-moe
framework_config:
- # without acceleration
- moe-scattermoe-granite-ep1
@@ -59,6 +59,47 @@ scenarios:
model_name_or_path:
- 'ibm-granite/granite-3.0-3b-a800m-instruct'

- name: accelerated-moe-full-granite-moe-shared
framework_config:
- # without acceleration
- moe-scattermoe-granite-ep1
- moe-scattermoe-granite-ep2
- moe-scattermoe-granite-ep4
- moe-scattermoe-granite-ep1-padding-free
- moe-scattermoe-granite-ep1-padding-free-foak
- moe-scattermoe-granite-ep2-padding-free
- moe-scattermoe-granite-ep2-padding-free-foak
- moe-scattermoe-granite-ep4-padding-free
- moe-scattermoe-granite-ep4-padding-free-foak
arguments:
learning_rate: 5e-5
torch_dtype: bfloat16
gradient_accumulation_steps: null
per_device_train_batch_size: 8
logging_steps: 1
packing: False
adam_epsilon: 1e-8
model_name_or_path:
- 'ibm-research/moe-7b-1b-active-shared-experts'
Contributor: You shouldn't copy and paste this; just add to model_name_or_path in the existing scenarios:

model_name_or_path:
  - 'ibm-granite/granite-3.0-3b-a800m-instruct'
  - 'ibm-research/moe-7b-1b-active-shared-experts'

- name: accelerated-moe-full-granite-moe-shared-small
framework_config:
- # without acceleration
- moe-scattermoe-granite-ep4
- moe-scattermoe-granite-ep4-padding-free
- moe-scattermoe-granite-ep4-padding-free-foak
arguments:
learning_rate: 5e-5
torch_dtype: bfloat16
Contributor: Same here, don't copy and paste; if it's all the same arguments there is no need. I don't understand why you need this different bench.

Collaborator Author: I wanted to run only the MoE shared model, so I had to copy and paste. Is there a way to subselect a model along with a scenario using the scenario filter? Apologies if I have missed that.

Contributor: Sorry, you can't. If you want to do ad hoc testing, just comment out the other models you don't want to test. For the official bench we need to update all models, because we only version one set of requirements for reproducibility, and we can't have partial benches running; otherwise there will be inconsistency.

gradient_accumulation_steps: null
per_device_train_batch_size: 8
logging_steps: 1
packing: False
adam_epsilon: 1e-8
model_name_or_path:
- 'ibm-research/moe-7b-1b-active-shared-experts'
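
Following the review suggestion, the two duplicated scenarios could collapse into the existing one by listing both checkpoints under model_name_or_path; a sketch that only reuses names and arguments already present in this file:

# Sketch of the consolidation suggested in the review; same framework_config and
# arguments as accelerated-moe-full-granite-moe above.
- name: accelerated-moe-full-granite-moe
  framework_config:
    - # without acceleration
    - moe-scattermoe-granite-ep1
    - moe-scattermoe-granite-ep2
    - moe-scattermoe-granite-ep4
    - moe-scattermoe-granite-ep1-padding-free
    - moe-scattermoe-granite-ep1-padding-free-foak
    - moe-scattermoe-granite-ep2-padding-free
    - moe-scattermoe-granite-ep2-padding-free-foak
    - moe-scattermoe-granite-ep4-padding-free
    - moe-scattermoe-granite-ep4-padding-free-foak
  arguments:
    learning_rate: 5e-5
    torch_dtype: bfloat16
    gradient_accumulation_steps: null
    per_device_train_batch_size: 8
    logging_steps: 1
    packing: False
    adam_epsilon: 1e-8
    model_name_or_path:
      - 'ibm-granite/granite-3.0-3b-a800m-instruct'
      - 'ibm-research/moe-7b-1b-active-shared-experts'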


- name: accelerated-moe-full-mixtral
framework_config:
- # without acceleration