diff --git a/plugins/accelerated-moe/README.md b/plugins/accelerated-moe/README.md
index ff1c31b2..45ae209d 100644
--- a/plugins/accelerated-moe/README.md
+++ b/plugins/accelerated-moe/README.md
@@ -51,12 +51,12 @@ tox -e run-benches \
     -x testenv:run-benches.deps+="-r plugins/accelerated-moe/requirements-khd.txt" \
     -x testenv:run-benches.setenv+="MEMORY_LOGGING=nvidia" \
     -- \
-    "1 2 4" 128 benchmark_outputs scenarios-moe.yaml accelerated-moe-scatter
+    "1 2 4" 128 benchmark_outputs scenarios-moe.yaml accelerated-moe-full
 ```
 
 or run the larger `Mixtral-8x7B` bench:
 ```
 tox ... \
-    8 128 benchmark_outputs scenarios-moe.yaml accelerated-moe-scatter-mixtral
+    8 128 benchmark_outputs scenarios-moe.yaml accelerated-moe-full-mixtral
 ```
 
 NOTE: if `FileNotFoundError` is observed on the *triton cache*, similar to issues like these:
diff --git a/plugins/accelerated-moe/src/fms_acceleration_moe/framework_plugin_scattermoe.py b/plugins/accelerated-moe/src/fms_acceleration_moe/framework_plugin_scattermoe.py
index 528693ea..2ad26ed2 100644
--- a/plugins/accelerated-moe/src/fms_acceleration_moe/framework_plugin_scattermoe.py
+++ b/plugins/accelerated-moe/src/fms_acceleration_moe/framework_plugin_scattermoe.py
@@ -40,7 +40,11 @@ class ScatterMoEAccelerationPlugin(AccelerationPlugin):
 
     # if we decide to extract the kernels, then we do not need to anymore,
     # https://github.com/foundation-model-stack/fms-acceleration/issues/105
-    restricted_model_archs = ["GraniteMoeForCausalLM", "MixtralForCausalLM"]
+    restricted_model_archs = [
+        "GraniteMoeForCausalLM",
+        "MixtralForCausalLM",
+        "GraniteMoeSharedForCausalLM",
+    ]
 
     def __init__(self, configurations: Dict[str, Dict]):
         super().__init__(configurations)
diff --git a/plugins/accelerated-moe/src/fms_acceleration_moe/utils/scattermoe_constants.py b/plugins/accelerated-moe/src/fms_acceleration_moe/utils/scattermoe_constants.py
index be3057b5..2a6847be 100644
--- a/plugins/accelerated-moe/src/fms_acceleration_moe/utils/scattermoe_constants.py
+++ b/plugins/accelerated-moe/src/fms_acceleration_moe/utils/scattermoe_constants.py
@@ -76,6 +76,13 @@
         SCATTERMOE_SPEC_HAS_GATE,
         False,
     ),
+    "GraniteMoeSharedForCausalLM": (
+        "GraniteMoeSharedMoE",
+        "router",
+        "input_linear|output_linear|input_linear",
+        SCATTERMOE_SPEC_HAS_GATE,
+        False,
+    ),
 }
 
 
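For context on the `scattermoe_constants.py` entry added above: each spec maps a model architecture to a five-field tuple. Judging from the existing `GraniteMoeForCausalLM` and `MixtralForCausalLM` entries (e.g. `"w1|w2|w3"` for Mixtral), the fields appear to name the MoE block class to convert, the router/gate submodule, the `|`-separated expert linears, a has-gate spec constant, and a trailing boolean flag. A minimal sketch with illustrative variable names, not the plugin's actual loader:

```python
# Sketch only: unpack a five-field ScatterMoE spec tuple like the one
# added above. Field meanings are inferred from the existing entries;
# `moe_cls`, `router_name`, etc. are illustrative names.

SPEC = {
    "GraniteMoeSharedForCausalLM": (
        "GraniteMoeSharedMoE",                      # MoE block class to swap out
        "router",                                   # router / gate submodule name
        "input_linear|output_linear|input_linear",  # expert linears; input_linear
                                                    # repeats since it appears to hold
                                                    # the fused up+gate weights
        True,   # stands in for SCATTERMOE_SPEC_HAS_GATE
        False,  # trailing flag, kept consistent with the other entries
    ),
}

moe_cls, router_name, expert_names, has_gate, flag = SPEC[
    "GraniteMoeSharedForCausalLM"
]
print(moe_cls, router_name, expert_names.split("|"))
```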
diff --git a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/framework_plugin_fast_kernels.py b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/framework_plugin_fast_kernels.py
index b1a4b2b3..818a7e51 100644
--- a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/framework_plugin_fast_kernels.py
+++ b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/framework_plugin_fast_kernels.py
@@ -44,6 +44,7 @@ def register_foak_model_patch_rules(
         gpt_bigcode,
         granite,
         granitemoe,
+        granitemoeshared,
         llama,
         mistral,
         mixtral,
@@ -54,6 +55,7 @@
         *gpt_bigcode.get_mp_rules(base_type),
         *granite.get_mp_rules(base_type, config),
         *granitemoe.get_mp_rules(base_type),
+        *granitemoeshared.get_mp_rules(base_type),
         *llama.get_mp_rules(base_type, config),
         *mistral.get_mp_rules(base_type, config),
         *mixtral.get_mp_rules(base_type),
@@ -91,6 +93,7 @@ class FastKernelsAccelerationPlugin(AccelerationPlugin):
         "MixtralForCausalLM",
         "LlamaForCausalLM",
         "MistralForCausalLM",
+        "GraniteMoeSharedForCausalLM",
     ]
 
     def __init__(self, configurations: Dict[str, Dict]):
diff --git a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/granitemoeshared.py b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/granitemoeshared.py
new file mode 100644
index 00000000..dd28e7ac
--- /dev/null
+++ b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/granitemoeshared.py
@@ -0,0 +1,141 @@
+# Copyright The FMS HF Tuning Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Standard
+from functools import partial
+
+# Third Party
+from fms_acceleration.model_patcher import (
+    ModelPatcherRule,
+    ModelPatcherTrigger,
+    combine_functions,
+    combine_triggers,
+)
+
+# Local
+from ..kernels.unsloth.cross_entropy_loss import (
+    FastCrossEntropyLoss,
+    replace_custom_loss_when_triggered,
+)
+from ..kernels.unsloth.rms_layernorm import fast_rms_layernorm
+from ..kernels.unsloth.rope_embedding import fast_rope_embedding
+from .utils import (
+    KEY_O,
+    KEY_QKV,
+    build_lora_fused_ops,
+    get_transformers_version,
+    trigger_fused_ops,
+)
+
+
+def get_mp_rules(base_type: str):
+    """
+    Function to access all patch rules in this module.
+    If it is a forward_builder rule with `base_type` in
+    its forward builder argument, wrap the forward_builder
+    function as a partial function with the base_type argument
+    """
+    try:
+        # Third Party
+        from transformers.models.granitemoeshared.modeling_granitemoeshared import (  # pylint: disable=import-outside-toplevel
+            GraniteMoeSharedAttention,
+            GraniteMoeSharedForCausalLM,
+            GraniteMoeSharedRMSNorm,
+        )
+    except ImportError:
+        return []
+
+    return [
+        # TODO: have a generic version of this rule
+        # - do regex on RMSNorm class name
+        # - check on the tensors required for fast_rms_layernorm
+        ModelPatcherRule(
+            rule_id="granitemoeshared-rms",
+            trigger=ModelPatcherTrigger(check=GraniteMoeSharedRMSNorm),
+            forward=fast_rms_layernorm,
+        ),
+        # TODO: have a generic version of this rule
+        # - do regex on Attention class name
+        # - have a set of qkv / o module names and check on that
+        ModelPatcherRule(
+            rule_id="granitemoeshared-qkvo",
+            trigger=combine_triggers(
+                ModelPatcherTrigger(
+                    check=partial(
+                        trigger_fused_ops,
+                        attn_cls=GraniteMoeSharedAttention,
+                        submodule_names=["q_proj", "k_proj", "v_proj"],
+                    )
+                ),
+                ModelPatcherTrigger(
+                    check=partial(
+                        trigger_fused_ops,
+                        attn_cls=GraniteMoeSharedAttention,
+                        submodule_names=["o_proj"],
+                    )
+                ),
+                logic="OR",
+            ),
+            forward_builder=combine_functions(
+                partial(
+                    build_lora_fused_ops,
+                    submodule_names=["q_proj", "k_proj", "v_proj"],
+                    fused_op=KEY_QKV,
+                    base_type=base_type,
+                ),
+                partial(
+                    build_lora_fused_ops,
+                    submodule_names=["o_proj"],
+                    fused_op=KEY_O,
+                    base_type=base_type,
+                ),
+                logic="APPEND",
+            ),
+        ),
+        *[
+            (
+                ModelPatcherRule(
+                    rule_id="granitemoeshared-custom-loss",
+                    trigger=ModelPatcherTrigger(
+                        check=replace_custom_loss_when_triggered(
+                            GraniteMoeSharedForCausalLM,
+                            custom_loss_type="granite-custom-loss",
+                        )
+                    ),
+                )
+                if get_transformers_version() >= "4.46"
+                else ModelPatcherRule(
+                    rule_id="granitemoeshared-cross-ent",
+                    import_and_maybe_reload=(
+                        "torch.nn.CrossEntropyLoss",
+                        FastCrossEntropyLoss,
+                        "transformers.models.granitemoeshared.modeling_granitemoeshared",
+                    ),
+                )
+            )
+        ],
+        # TODO: have a generic version of this rule
+        # - get the module name
+        # - check if "apply_rotary_pos_emb" exists
+        # - patch
+        ModelPatcherRule(
+            rule_id="granitemoeshared-rope",
+            import_and_maybe_reload=(
+                "transformers.models.granitemoeshared.\
+modeling_granitemoeshared.apply_rotary_pos_emb",
+                fast_rope_embedding,
+                None,
+            ),
+        ),
+    ]
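The `combine_triggers(..., logic="OR")` pattern in the `granitemoeshared-qkvo` rule above fires when either submodule check matches. A standalone sketch of the idea, using a hypothetical `has_all_submodules` stand-in for the real `trigger_fused_ops` helper and a toy attention module:

```python
# Illustrative only: mimic the OR-combined trigger checks from the rule above.
from functools import partial

import torch


def has_all_submodules(module, attn_cls, submodule_names):
    # fire only on the attention class, and only if every named
    # projection is present on it (assumed semantics of trigger_fused_ops)
    return isinstance(module, attn_cls) and all(
        hasattr(module, name) for name in submodule_names
    )


class FakeAttention(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.q_proj = torch.nn.Linear(8, 8)
        self.k_proj = torch.nn.Linear(8, 8)
        self.v_proj = torch.nn.Linear(8, 8)
        self.o_proj = torch.nn.Linear(8, 8)


checks = [
    partial(has_all_submodules, attn_cls=FakeAttention,
            submodule_names=["q_proj", "k_proj", "v_proj"]),
    partial(has_all_submodules, attn_cls=FakeAttention,
            submodule_names=["o_proj"]),
]
# logic="OR": the rule matches if either check fires
print(any(check(FakeAttention()) for check in checks))  # True
```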
"transformers.models.granitemoeshared.modeling_granitemoeshared", + ), + ) + ) + ], + # TODO: have a generic version of this rule + # - get the module name + # - check if "apply_rotary_pos_emb" exists + # - patch + ModelPatcherRule( + rule_id="granitemoeshared-rope", + import_and_maybe_reload=( + "transformers.models.granitemoeshared.\ + modeling_granitemoeshared.apply_rotary_pos_emb", + fast_rope_embedding, + None, + ), + ), + ] diff --git a/scripts/benchmarks/benchmark.py b/scripts/benchmarks/benchmark.py index 72e0e23c..9a53ceb0 100644 --- a/scripts/benchmarks/benchmark.py +++ b/scripts/benchmarks/benchmark.py @@ -319,10 +319,11 @@ def build_args_from_products(products: List[Dict], defaults: Dict): ] ) elif grad_accum is None and pdtbs is not None: + grad_accum_steps = effective_batch_size // num_gpus // pdtbs argument_list.extend( [ "--gradient_accumulation_steps", - str(effective_batch_size // num_gpus // pdtbs), + str(1 if grad_accum_steps == 0 else grad_accum_steps), ] ) else: diff --git a/scripts/benchmarks/refs/a100_80gb_moe.csv b/scripts/benchmarks/refs/a100_80gb_moe.csv index 4936cd6f..4bdbb8d9 100644 --- a/scripts/benchmarks/refs/a100_80gb_moe.csv +++ b/scripts/benchmarks/refs/a100_80gb_moe.csv @@ -1,25 +1,42 @@ -epoch,framework_config,gradient_accumulation_steps,mem_nvidia_mem_reserved,model_name_or_path,num_gpus,per_device_train_batch_size,torch_dtype,train_loss,train_runtime,train_samples_per_second,train_steps_per_second,train_tokens_per_second -0.25,none,16.0,71199.0,ibm-granite/granite-3.0-3b-a800m-instruct,1,8,bfloat16,0.9438143467903136,2371.9316,5.396,0.042,1505.608 -0.25,none,8.0,46829.0,ibm-granite/granite-3.0-3b-a800m-instruct,2,8,bfloat16,0.9437569552659988,1355.7096,9.442,0.074,1317.096 -0.25,none,4.0,37996.0,ibm-granite/granite-3.0-3b-a800m-instruct,4,8,bfloat16,0.9437739425897598,708.3914,18.069,0.141,1260.32 -0.25,moe-scattermoe-granite-ep1,16.0,71187.0,ibm-granite/granite-3.0-3b-a800m-instruct,1,8,bfloat16,0.9439476370811464,742.739,17.234,0.135,4808.149 -0.25,moe-scattermoe-granite-ep1,8.0,52503.0,ibm-granite/granite-3.0-3b-a800m-instruct,2,8,bfloat16,0.9506204092502594,485.5103,26.364,0.206,3677.78 -0.25,moe-scattermoe-granite-ep1,4.0,51145.0,ibm-granite/granite-3.0-3b-a800m-instruct,4,8,bfloat16,0.9572784686088562,262.9566,48.677,0.38,3395.238 -0.25,moe-scattermoe-granite-ep2,8.0,40193.0,ibm-granite/granite-3.0-3b-a800m-instruct,2,8,bfloat16,0.9437192791700364,577.2164,22.175,0.173,3093.467 -0.25,moe-scattermoe-granite-ep2,4.0,40878.5,ibm-granite/granite-3.0-3b-a800m-instruct,4,8,bfloat16,0.9509018939733506,300.285,42.626,0.333,2973.176 -0.25,moe-scattermoe-granite-ep4,4.0,31777.5,ibm-granite/granite-3.0-3b-a800m-instruct,4,8,bfloat16,0.9434539985656738,307.1264,41.677,0.326,2906.946 -0.25,moe-scattermoe-granite-ep1-padding-free,16.0,48401.0,ibm-granite/granite-3.0-3b-a800m-instruct,1,8,bfloat16,0.9437484860420228,631.9756,20.254,0.158,3924.202 -0.25,moe-scattermoe-granite-ep1-padding-free,8.0,42452.0,ibm-granite/granite-3.0-3b-a800m-instruct,2,8,bfloat16,0.9506663566827774,454.3444,28.172,0.22,2729.207 -0.25,moe-scattermoe-granite-ep1-padding-free,4.0,38560.0,ibm-granite/granite-3.0-3b-a800m-instruct,4,8,bfloat16,0.957276314496994,241.2967,53.047,0.414,2569.451 -0.25,moe-scattermoe-granite-ep2-padding-free,8.0,31012.0,ibm-granite/granite-3.0-3b-a800m-instruct,2,8,bfloat16,0.943688799738884,546.507,23.421,0.183,2268.955 
diff --git a/scripts/benchmarks/refs/a100_80gb_moe.csv b/scripts/benchmarks/refs/a100_80gb_moe.csv
index 4936cd6f..4bdbb8d9 100644
--- a/scripts/benchmarks/refs/a100_80gb_moe.csv
+++ b/scripts/benchmarks/refs/a100_80gb_moe.csv
@@ -1,25 +1,42 @@
-epoch,framework_config,gradient_accumulation_steps,mem_nvidia_mem_reserved,model_name_or_path,num_gpus,per_device_train_batch_size,torch_dtype,train_loss,train_runtime,train_samples_per_second,train_steps_per_second,train_tokens_per_second
-0.25,none,16.0,71199.0,ibm-granite/granite-3.0-3b-a800m-instruct,1,8,bfloat16,0.9438143467903136,2371.9316,5.396,0.042,1505.608
-0.25,none,8.0,46829.0,ibm-granite/granite-3.0-3b-a800m-instruct,2,8,bfloat16,0.9437569552659988,1355.7096,9.442,0.074,1317.096
-0.25,none,4.0,37996.0,ibm-granite/granite-3.0-3b-a800m-instruct,4,8,bfloat16,0.9437739425897598,708.3914,18.069,0.141,1260.32
-0.25,moe-scattermoe-granite-ep1,16.0,71187.0,ibm-granite/granite-3.0-3b-a800m-instruct,1,8,bfloat16,0.9439476370811464,742.739,17.234,0.135,4808.149
-0.25,moe-scattermoe-granite-ep1,8.0,52503.0,ibm-granite/granite-3.0-3b-a800m-instruct,2,8,bfloat16,0.9506204092502594,485.5103,26.364,0.206,3677.78
-0.25,moe-scattermoe-granite-ep1,4.0,51145.0,ibm-granite/granite-3.0-3b-a800m-instruct,4,8,bfloat16,0.9572784686088562,262.9566,48.677,0.38,3395.238
-0.25,moe-scattermoe-granite-ep2,8.0,40193.0,ibm-granite/granite-3.0-3b-a800m-instruct,2,8,bfloat16,0.9437192791700364,577.2164,22.175,0.173,3093.467
-0.25,moe-scattermoe-granite-ep2,4.0,40878.5,ibm-granite/granite-3.0-3b-a800m-instruct,4,8,bfloat16,0.9509018939733506,300.285,42.626,0.333,2973.176
-0.25,moe-scattermoe-granite-ep4,4.0,31777.5,ibm-granite/granite-3.0-3b-a800m-instruct,4,8,bfloat16,0.9434539985656738,307.1264,41.677,0.326,2906.946
-0.25,moe-scattermoe-granite-ep1-padding-free,16.0,48401.0,ibm-granite/granite-3.0-3b-a800m-instruct,1,8,bfloat16,0.9437484860420228,631.9756,20.254,0.158,3924.202
-0.25,moe-scattermoe-granite-ep1-padding-free,8.0,42452.0,ibm-granite/granite-3.0-3b-a800m-instruct,2,8,bfloat16,0.9506663566827774,454.3444,28.172,0.22,2729.207
-0.25,moe-scattermoe-granite-ep1-padding-free,4.0,38560.0,ibm-granite/granite-3.0-3b-a800m-instruct,4,8,bfloat16,0.957276314496994,241.2967,53.047,0.414,2569.451
-0.25,moe-scattermoe-granite-ep2-padding-free,8.0,31012.0,ibm-granite/granite-3.0-3b-a800m-instruct,2,8,bfloat16,0.943688799738884,546.507,23.421,0.183,2268.955
-0.25,moe-scattermoe-granite-ep2-padding-free,4.0,28133.0,ibm-granite/granite-3.0-3b-a800m-instruct,4,8,bfloat16,0.9505942213535308,283.5444,45.143,0.353,2186.607
-0.25,moe-scattermoe-granite-ep4-padding-free,4.0,21585.5,ibm-granite/granite-3.0-3b-a800m-instruct,4,8,bfloat16,0.9441865116357804,284.6079,44.974,0.351,2178.436
-0.25,moe-scattermoe-granite-ep1-padding-free-foak,16.0,42651.0,ibm-granite/granite-3.0-3b-a800m-instruct,1,8,bfloat16,0.9437448275089264,615.4528,20.798,0.162,4029.554
-0.25,moe-scattermoe-granite-ep1-padding-free-foak,8.0,37743.0,ibm-granite/granite-3.0-3b-a800m-instruct,2,8,bfloat16,0.950773031115532,433.4811,29.528,0.231,2860.563
-0.25,moe-scattermoe-granite-ep1-padding-free-foak,4.0,35153.0,ibm-granite/granite-3.0-3b-a800m-instruct,4,8,bfloat16,0.9572476959228516,232.0428,55.162,0.431,2671.921
-0.25,moe-scattermoe-granite-ep2-padding-free-foak,8.0,26075.0,ibm-granite/granite-3.0-3b-a800m-instruct,2,8,bfloat16,0.9437651455402374,524.7751,24.391,0.191,2362.917
-0.25,moe-scattermoe-granite-ep2-padding-free-foak,4.0,24665.5,ibm-granite/granite-3.0-3b-a800m-instruct,4,8,bfloat16,0.9507779973745346,274.126,46.694,0.365,2261.733
-0.25,moe-scattermoe-granite-ep4-padding-free-foak,4.0,18368.0,ibm-granite/granite-3.0-3b-a800m-instruct,4,8,bfloat16,0.943427557349205,278.1245,46.023,0.36,2229.217
-,none,,65607.25,mistralai/Mixtral-8x7B-Instruct-v0.1,8,1,bfloat16,0.8599078696966171,4180.9544,3.062,0.024,80.364
-,moe-scattermoe-granite-ep8,,52004.75,mistralai/Mixtral-8x7B-Instruct-v0.1,8,1,bfloat16,0.8588122856616974,1071.1967,11.949,0.093,313.668
-,moe-scattermoe-granite-ep8-foak,,51961.25,mistralai/Mixtral-8x7B-Instruct-v0.1,8,1,bfloat16,0.8599798053503036,1043.6675,12.264,0.096,321.942
+epoch,framework_config,gradient_accumulation_steps,mem_nvidia_mem_reserved,model_name_or_path,num_gpus,per_device_train_batch_size,torch_dtype,train_loss,train_runtime,train_samples_per_second,train_steps_per_second,train_tokens_per_second
+0.25,none,16,72072,ibm-granite/granite-3.0-3b-a800m-instruct,1,8,bfloat16,0.938093501,1986.7714,6.443,0.05,1797.489
+0.25,none,8,49689,ibm-granite/granite-3.0-3b-a800m-instruct,2,8,bfloat16,0.937983845,1082.5484,11.824,0.092,1649.441
+0.25,none,4,41754.5,ibm-granite/granite-3.0-3b-a800m-instruct,4,8,bfloat16,0.93852025,569.5617,22.473,0.176,1567.521
+0.25,moe-scattermoe-granite-ep1,16,72068,ibm-granite/granite-3.0-3b-a800m-instruct,1,8,bfloat16,0.938054211,660.687,19.374,0.151,5405.283
+0.25,moe-scattermoe-granite-ep1,8,53917,ibm-granite/granite-3.0-3b-a800m-instruct,2,8,bfloat16,0.944801819,362.751,35.286,0.276,4922.385
+0.25,moe-scattermoe-granite-ep1,4,53070,ibm-granite/granite-3.0-3b-a800m-instruct,4,8,bfloat16,0.95192752,202.3782,63.248,0.494,4411.543
+0.25,moe-scattermoe-granite-ep2,8,41880,ibm-granite/granite-3.0-3b-a800m-instruct,2,8,bfloat16,0.938050581,441.5269,28.99,0.226,4044.147
+0.25,moe-scattermoe-granite-ep2,4,43092,ibm-granite/granite-3.0-3b-a800m-instruct,4,8,bfloat16,0.945302382,235.4383,54.367,0.425,3792.076
+0.25,moe-scattermoe-granite-ep4,4,33673.5,ibm-granite/granite-3.0-3b-a800m-instruct,4,8,bfloat16,0.938171822,259.2932,49.365,0.386,3443.207
+0.25,moe-scattermoe-granite-ep1-padding-free,16,49580,ibm-granite/granite-3.0-3b-a800m-instruct,1,8,bfloat16,0.937993399,505.6847,25.312,0.198,4904.241
+0.25,moe-scattermoe-granite-ep1-padding-free,8,43821,ibm-granite/granite-3.0-3b-a800m-instruct,2,8,bfloat16,0.944808855,311.785,41.054,0.321,3977.099
+0.25,moe-scattermoe-granite-ep1-padding-free,4,40070.5,ibm-granite/granite-3.0-3b-a800m-instruct,4,8,bfloat16,0.951866873,169.9554,75.314,0.588,3648.016
+0.25,moe-scattermoe-granite-ep1-padding-free-foak,16,49114,ibm-granite/granite-3.0-3b-a800m-instruct,1,8,bfloat16,0.938123143,476.8099,26.845,0.21,5201.235
+0.25,moe-scattermoe-granite-ep1-padding-free-foak,8,43865,ibm-granite/granite-3.0-3b-a800m-instruct,2,8,bfloat16,0.944894351,296.5204,43.167,0.337,4181.837
+0.25,moe-scattermoe-granite-ep1-padding-free-foak,4,40070.5,ibm-granite/granite-3.0-3b-a800m-instruct,4,8,bfloat16,0.951975068,163.756,78.165,0.611,3786.12
+0.25,moe-scattermoe-granite-ep2-padding-free,8,32276,ibm-granite/granite-3.0-3b-a800m-instruct,2,8,bfloat16,0.937930156,356.1296,35.942,0.281,3481.878
+0.25,moe-scattermoe-granite-ep2-padding-free,4,29787,ibm-granite/granite-3.0-3b-a800m-instruct,4,8,bfloat16,0.945339936,192.7168,66.419,0.519,3217.156
+0.25,moe-scattermoe-granite-ep2-padding-free-foak,8,32376,ibm-granite/granite-3.0-3b-a800m-instruct,2,8,bfloat16,0.938017525,342.9327,37.325,0.292,3615.87
+0.25,moe-scattermoe-granite-ep2-padding-free-foak,4,29734.5,ibm-granite/granite-3.0-3b-a800m-instruct,4,8,bfloat16,0.945357794,184.554,69.356,0.542,3359.451
+0.25,moe-scattermoe-granite-ep4-padding-free,4,23386.5,ibm-granite/granite-3.0-3b-a800m-instruct,4,8,bfloat16,0.938359724,191.205,66.944,0.523,3242.593
+0.25,moe-scattermoe-granite-ep4-padding-free-foak,4,23359.5,ibm-granite/granite-3.0-3b-a800m-instruct,4,8,bfloat16,0.938333818,183.9191,69.596,0.544,3371.048
+0.25,none,16,81018,ibm-research/moe-7b-1b-active-shared-experts,1,8,bfloat16,0.878051637,4223.9158,3.03,0.024,839.411
+0.25,none,8,74462,ibm-research/moe-7b-1b-active-shared-experts,2,8,bfloat16,0.877874975,2247.4716,5.695,0.044,788.798
+0.25,none,4,63033,ibm-research/moe-7b-1b-active-shared-experts,4,8,bfloat16,0.878253661,1155.5903,11.077,0.087,767.054
+0.25,moe-scattermoe-granite-ep1,16,81018,ibm-research/moe-7b-1b-active-shared-experts,1,8,bfloat16,0.878006854,907.8407,14.099,0.11,3905.531
+0.25,moe-scattermoe-granite-ep1,8,73870,ibm-research/moe-7b-1b-active-shared-experts,2,8,bfloat16,0.879557709,492.5063,25.99,0.203,3599.548
+0.25,moe-scattermoe-granite-ep1,4,74108.5,ibm-research/moe-7b-1b-active-shared-experts,4,8,bfloat16,0.881521969,277.8191,46.073,0.36,3190.565
+0.25,moe-scattermoe-granite-ep2,8,54168,ibm-research/moe-7b-1b-active-shared-experts,2,8,bfloat16,0.877982622,563.0434,22.734,0.178,3148.603
+0.25,moe-scattermoe-granite-ep2,4,54582,ibm-research/moe-7b-1b-active-shared-experts,4,8,bfloat16,0.880103117,299.2522,42.773,0.334,2962.05
+0.25,moe-scattermoe-granite-ep1-padding-free,16,77632,ibm-research/moe-7b-1b-active-shared-experts,1,8,bfloat16,0.878018975,726.1255,17.628,0.138,3410.98
+0.25,moe-scattermoe-granite-ep1-padding-free,8,68019,ibm-research/moe-7b-1b-active-shared-experts,2,8,bfloat16,0.879643369,429.5618,29.798,0.233,2882.938
+0.25,moe-scattermoe-granite-ep1-padding-free,4,63879,ibm-research/moe-7b-1b-active-shared-experts,4,8,bfloat16,0.88148216,239.3677,53.474,0.418,2586.815
+0.25,moe-scattermoe-granite-ep1-padding-free-foak,16,72666,ibm-research/moe-7b-1b-active-shared-experts,1,8,bfloat16,0.878073001,688.38,18.594,0.145,3598.013
+0.25,moe-scattermoe-granite-ep1-padding-free-foak,8,63074,ibm-research/moe-7b-1b-active-shared-experts,2,8,bfloat16,0.879622684,419.7876,30.492,0.238,2950.063
+0.25,moe-scattermoe-granite-ep1-padding-free-foak,4,60126.5,ibm-research/moe-7b-1b-active-shared-experts,4,8,bfloat16,0.881447418,231.7976,55.221,0.431,2671.296
+0.25,moe-scattermoe-granite-ep2-padding-free,8,45093,ibm-research/moe-7b-1b-active-shared-experts,2,8,bfloat16,0.8779908,471.1344,27.168,0.212,2628.549
+0.25,moe-scattermoe-granite-ep2-padding-free,4,42590,ibm-research/moe-7b-1b-active-shared-experts,4,8,bfloat16,0.879999972,250.48,51.102,0.399,2472.054
+0.25,moe-scattermoe-granite-ep2-padding-free-foak,8,40281,ibm-research/moe-7b-1b-active-shared-experts,2,8,bfloat16,0.878110015,461.6668,27.726,0.217,2682.454
+0.25,moe-scattermoe-granite-ep2-padding-free-foak,4,38934.5,ibm-research/moe-7b-1b-active-shared-experts,4,8,bfloat16,0.880085612,250.2941,51.14,0.4,2473.889
+0.25,moe-scattermoe-granite-ep8,16,56845,mistralai/Mixtral-8x7B-Instruct-v0.1,8,1,bfloat16,0.86557077,779.9315,16.412,0.128,430.807
+0.25,moe-scattermoe-granite-ep8-foak,16,56769.25,mistralai/Mixtral-8x7B-Instruct-v0.1,8,1,bfloat16,0.86551428,734.0756,17.437,0.136,457.719
\ No newline at end of file
diff --git a/scripts/benchmarks/refs/requirements_moe.txt b/scripts/benchmarks/refs/requirements_moe.txt
index 63700ed0..9f5fc5f5 100644
--- a/scripts/benchmarks/refs/requirements_moe.txt
+++ b/scripts/benchmarks/refs/requirements_moe.txt
@@ -1,44 +1,42 @@
 accelerate==1.0.1
-aiohappyeyeballs==2.4.3
-aiohttp==3.10.10
-aiosignal==1.3.1
-async-timeout==4.0.3
-attrs==24.2.0
+aiohappyeyeballs==2.4.6
+aiohttp==3.11.12
+aiosignal==1.3.2
+attrs==25.1.0
 bitsandbytes==0.43.3
-certifi==2024.8.30
-charset-normalizer==3.4.0
-contourpy==1.3.0
+certifi==2025.1.31
+charset-normalizer==3.4.1
+contourpy==1.3.1
 cycler==0.12.1
 datasets==2.21.0
 dill==0.3.8
-docstring_parser==0.16
-einops==0.8.0
-filelock==3.16.1
-flash-attn==2.6.3
+einops==0.8.1
+filelock==3.17.0
+flash_attn==2.7.4.post1
 -e git+https://github.com/foundation-model-stack/fms-acceleration.git@21af5fb9f2989b3dbf443c016e4c0470b536a593#egg=fms_acceleration&subdirectory=plugins/framework
 -e git+https://github.com/foundation-model-stack/fms-acceleration.git@21af5fb9f2989b3dbf443c016e4c0470b536a593#egg=fms_acceleration_aadp&subdirectory=plugins/attention-and-distributed-packing
 -e git+https://github.com/foundation-model-stack/fms-acceleration.git@21af5fb9f2989b3dbf443c016e4c0470b536a593#egg=fms_acceleration_foak&subdirectory=plugins/fused-ops-and-kernels
 -e git+https://github.com/foundation-model-stack/fms-acceleration.git@21af5fb9f2989b3dbf443c016e4c0470b536a593#egg=fms_acceleration_moe&subdirectory=plugins/accelerated-moe
 -e git+https://github.com/foundation-model-stack/fms-acceleration.git@21af5fb9f2989b3dbf443c016e4c0470b536a593#egg=fms_acceleration_peft&subdirectory=plugins/accelerated-peft
-fms-hf-tuning @ git+https://github.com/foundation-model-stack/fms-hf-tuning.git@398c2a8fe26d734344240555585d95e05299faa8
-fonttools==4.54.1
+fms-hf-tuning @ git+https://github.com/foundation-model-stack/fms-hf-tuning.git@fdc7527510692ada03e4303df1549cebc5139b31
+fonttools==4.56.0
 frozenlist==1.5.0
 fsspec==2024.6.1
-huggingface-hub==0.26.2
+huggingface-hub==0.29.0
 idna==3.10
-Jinja2==3.1.4
+Jinja2==3.1.5
 kernel-hyperdrive @ git+https://github.com/fabianlim/kernel-hyperdrive.git@45036497e12444ca98a6f0072204538aee4543ba
-kiwisolver==1.4.7
-llvmlite==0.43.0
+kiwisolver==1.4.8
+llvmlite==0.44.0
 markdown-it-py==3.0.0
 MarkupSafe==3.0.2
-matplotlib==3.9.2
+matplotlib==3.10.0
 mdurl==0.1.2
 mpmath==1.3.0
 multidict==6.1.0
 multiprocess==0.70.16
 networkx==3.4.2
-numba==0.60.0
+numba==0.61.0
 numpy==1.26.4
 nvidia-cublas-cu12==12.1.3.1
 nvidia-cuda-cupti-cu12==12.1.105
@@ -49,41 +47,41 @@
 nvidia-cufft-cu12==11.0.2.54
 nvidia-curand-cu12==10.3.2.106
 nvidia-cusolver-cu12==11.4.5.107
 nvidia-cusparse-cu12==12.1.0.106
+nvidia-cusparselt-cu12==0.6.2
 nvidia-nccl-cu12==2.20.5
 nvidia-nvjitlink-cu12==12.4.127
 nvidia-nvtx-cu12==12.1.105
 packaging==24.2
 pandas==2.2.3
 peft==0.13.2
-pillow==11.0.0
-propcache==0.2.0
-protobuf==5.28.3
-psutil==6.1.0
-pyarrow==18.0.0
-Pygments==2.18.0
-pyparsing==3.2.0
+pillow==11.1.0
+propcache==0.2.1
+protobuf==5.29.3
+psutil==7.0.0
+pyarrow==19.0.1
+Pygments==2.19.1
+pyparsing==3.2.1
 python-dateutil==2.9.0.post0
-pytz==2024.2
+pytz==2025.1
 PyYAML==6.0.2
 regex==2024.11.6
 requests==2.32.3
 rich==13.9.4
-safetensors==0.4.5
+safetensors==0.5.2
 sentencepiece==0.2.0
-shtab==1.7.1
+setuptools==75.8.0
 simpleeval==0.9.13
-six==1.16.0
+six==1.17.0
 sympy==1.13.1
 threadpoolctl==3.5.0
-tokenizers==0.20.3
+tokenizers==0.21.0
 torch==2.4.1
-tqdm==4.67.0
-transformers==4.45.2
+tqdm==4.67.1
+transformers==4.49.0
 triton==3.0.0
-trl==0.11.4
+trl==0.14.0
 typing_extensions==4.12.2
-tyro==0.8.14
-tzdata==2024.2
-urllib3==2.2.3
+tzdata==2025.1
+urllib3==2.3.0
 xxhash==3.5.0
-yarl==1.17.1
+yarl==1.18.3
diff --git a/scripts/benchmarks/scenarios-moe.yaml b/scripts/benchmarks/scenarios-moe.yaml
index efa2725e..a5171e11 100644
--- a/scripts/benchmarks/scenarios-moe.yaml
+++ b/scripts/benchmarks/scenarios-moe.yaml
@@ -58,6 +58,8 @@ scenarios:
       adam_epsilon: 1e-8
     model_name_or_path:
       - 'ibm-granite/granite-3.0-3b-a800m-instruct'
+      - 'ibm-research/moe-7b-1b-active-shared-experts'
+
 
   - name: accelerated-moe-full-mixtral
     framework_config: