diff --git a/plugins/accelerated-moe/src/fms_acceleration_moe/framework_plugin_scattermoe.py b/plugins/accelerated-moe/src/fms_acceleration_moe/framework_plugin_scattermoe.py index 52040e0e..123f99cb 100644 --- a/plugins/accelerated-moe/src/fms_acceleration_moe/framework_plugin_scattermoe.py +++ b/plugins/accelerated-moe/src/fms_acceleration_moe/framework_plugin_scattermoe.py @@ -36,6 +36,7 @@ class ScatterMoEAccelerationPlugin(AccelerationPlugin): "GraniteMoeForCausalLM", "MixtralForCausalLM", "GraniteMoeSharedForCausalLM", + "GraniteMoeHybridForCausalLM", ] def __init__(self, configurations: Dict[str, Dict]): diff --git a/plugins/accelerated-moe/src/fms_acceleration_moe/utils/scattermoe_constants.py b/plugins/accelerated-moe/src/fms_acceleration_moe/utils/scattermoe_constants.py index 2a6847be..aa9b4e87 100644 --- a/plugins/accelerated-moe/src/fms_acceleration_moe/utils/scattermoe_constants.py +++ b/plugins/accelerated-moe/src/fms_acceleration_moe/utils/scattermoe_constants.py @@ -83,6 +83,13 @@ SCATTERMOE_SPEC_HAS_GATE, False, ), + "GraniteMoeHybridForCausalLM": ( + "GraniteMoeHybridMoE", + "router", + "input_linear|output_linear|input_linear", + SCATTERMOE_SPEC_HAS_GATE, + False, + ), } diff --git a/plugins/accelerated-peft/requirements.txt b/plugins/accelerated-peft/requirements.txt index e752f06b..f7670c7c 100644 --- a/plugins/accelerated-peft/requirements.txt +++ b/plugins/accelerated-peft/requirements.txt @@ -5,10 +5,9 @@ accelerate >= 0.29 # bitsandbytes for the BNB plugin -# - lower bound is because bnb is missing quant_state -# - upper bound is because of segmentation faults -# see https://github.com/foundation-model-stack/fms-acceleration/issues/17 -bitsandbytes >=0.41,<=0.43.3 +# version 0.45.1 and above is required to support torch 2.6 + +bitsandbytes >= 0.45.1 # Used to manage the thread limit in functions for converting old # GPTQ models to new GPTQ model format that support symmetrical=False diff --git
a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/framework_plugin_fast_kernels.py b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/framework_plugin_fast_kernels.py index 818a7e51..f3a31855 100644 --- a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/framework_plugin_fast_kernels.py +++ b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/framework_plugin_fast_kernels.py @@ -45,6 +45,7 @@ def register_foak_model_patch_rules( granite, granitemoe, granitemoeshared, + granitemoehybrid, llama, mistral, mixtral, @@ -56,6 +57,7 @@ def register_foak_model_patch_rules( *granite.get_mp_rules(base_type, config), *granitemoe.get_mp_rules(base_type), *granitemoeshared.get_mp_rules(base_type), + *granitemoehybrid.get_mp_rules(base_type), *llama.get_mp_rules(base_type, config), *mistral.get_mp_rules(base_type, config), *mixtral.get_mp_rules(base_type), @@ -94,6 +96,7 @@ class FastKernelsAccelerationPlugin(AccelerationPlugin): "LlamaForCausalLM", "MistralForCausalLM", "GraniteMoeSharedForCausalLM", + "GraniteMoeHybridForCausalLM", ] def __init__(self, configurations: Dict[str, Dict]): diff --git a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/granitemoehybrid.py b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/granitemoehybrid.py new file mode 100644 index 00000000..16935150 --- /dev/null +++ b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/granitemoehybrid.py @@ -0,0 +1,141 @@ +# Copyright The FMS HF Tuning Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# Standard +from functools import partial + +# Third Party +from fms_acceleration.model_patcher import ( + ModelPatcherRule, + ModelPatcherTrigger, + combine_functions, + combine_triggers, +) + +# Local +from ..kernels.unsloth.cross_entropy_loss import ( + FastCrossEntropyLoss, + replace_custom_loss_when_triggered, +) +from ..kernels.unsloth.rms_layernorm import fast_rms_layernorm +from ..kernels.unsloth.rope_embedding import fast_rope_embedding +from .utils import ( + KEY_O, + KEY_QKV, + build_lora_fused_ops, + get_transformers_version, + trigger_fused_ops, +) + + +def get_mp_rules(base_type: str): + """ + Function to access all patch rules in this module. + If it is a forward_builder rule with `base_type` in + its forward builder argument, wrap the forward_builder + function as a partial function with the base_type argument + """ + try: + # Third Party + from transformers.models.granitemoehybrid.modeling_granitemoehybrid import ( # pylint: disable=import-outside-toplevel + GraniteMoeHybridAttention, + GraniteMoeHybridForCausalLM, + GraniteMoeHybridRMSNorm, + ) + except ImportError: + return [] + + return [ + # TODO: have a generic version of this rule + # - do regex on RMSNorm class name + # - check on the tensors required for fast_rms_layernorm + ModelPatcherRule( + rule_id="granitemoehybrid-rms", + trigger=ModelPatcherTrigger(check=GraniteMoeHybridRMSNorm), + forward=fast_rms_layernorm, + ), + # TODO: have a generic version of this rule + # - do regex on Attention class name + # - have a set of qkv / o module names and check on that + ModelPatcherRule( + rule_id="granitemoehybrid-qkvo", + trigger=combine_triggers( + ModelPatcherTrigger( + check=partial( + trigger_fused_ops, + attn_cls=GraniteMoeHybridAttention, + submodule_names=["q_proj", "k_proj", "v_proj"], + ) + ), + ModelPatcherTrigger( + check=partial( + trigger_fused_ops, + 
attn_cls=GraniteMoeHybridAttention, + submodule_names=["o_proj"], + ) + ), + logic="OR", + ), + forward_builder=combine_functions( + partial( + build_lora_fused_ops, + submodule_names=["q_proj", "k_proj", "v_proj"], + fused_op=KEY_QKV, + base_type=base_type, + ), + partial( + build_lora_fused_ops, + submodule_names=["o_proj"], + fused_op=KEY_O, + base_type=base_type, + ), + logic="APPEND", + ), + ), + *[ + ( + ModelPatcherRule( + rule_id="granitemoehybrid-custom-loss", + trigger=ModelPatcherTrigger( + check=replace_custom_loss_when_triggered( + GraniteMoeHybridForCausalLM, + custom_loss_type="granite-custom-loss", + ) + ), + ) + if get_transformers_version() >= "4.46" + else ModelPatcherRule( + rule_id="granitemoehybrid-cross-ent", + import_and_maybe_reload=( + "torch.nn.CrossEntropyLoss", + FastCrossEntropyLoss, + "transformers.models.granitemoehybrid.modeling_granitemoehybrid", + ), + ) + ) + ], + # TODO: have a generic version of this rule + # - get the module name + # - check if "apply_rotary_pos_emb" exists + # - patch + ModelPatcherRule( + rule_id="granitemoehybrid-rope", + import_and_maybe_reload=( + "transformers.models.granitemoehybrid.\ + modeling_granitemoehybrid.apply_rotary_pos_emb", + fast_rope_embedding, + None, + ), + ), + ] diff --git a/scripts/benchmarks/README.md b/scripts/benchmarks/README.md index 4795efee..e172bc4e 100644 --- a/scripts/benchmarks/README.md +++ b/scripts/benchmarks/README.md @@ -56,6 +56,11 @@ The best way is via `tox` which manages the dependencies, including installing t pip install -r setup_requirements.txt ``` +- install mamba kernels to evaluate mamba based models: + ``` + tox -e run-benches -x testenv:run-benches.setenv+="INSTALL_MAMBA=true" ... 
+ ``` + - run a *small* representative set of benches: ``` tox -e run-benches diff --git a/scripts/benchmarks/refs/a100_80gb_moe.csv b/scripts/benchmarks/refs/a100_80gb_moe.csv index 4bdbb8d9..a93f1b11 100644 --- a/scripts/benchmarks/refs/a100_80gb_moe.csv +++ b/scripts/benchmarks/refs/a100_80gb_moe.csv @@ -1,42 +1,42 @@ epoch,framework_config,gradient_accumulation_steps,mem_nvidia_mem_reserved,model_name_or_path,num_gpus,per_device_train_batch_size,torch_dtype,train_loss,train_runtime,train_samples_per_second,train_steps_per_second,train_tokens_per_second -0.25,none,16,72072,ibm-granite/granite-3.0-3b-a800m-instruct,1,8,bfloat16,0.938093501,1986.7714,6.443,0.05,1797.489 -0.25,none,8,49689,ibm-granite/granite-3.0-3b-a800m-instruct,2,8,bfloat16,0.937983845,1082.5484,11.824,0.092,1649.441 -0.25,none,4,41754.5,ibm-granite/granite-3.0-3b-a800m-instruct,4,8,bfloat16,0.93852025,569.5617,22.473,0.176,1567.521 -0.25,moe-scattermoe-granite-ep1,16,72068,ibm-granite/granite-3.0-3b-a800m-instruct,1,8,bfloat16,0.938054211,660.687,19.374,0.151,5405.283 -0.25,moe-scattermoe-granite-ep1,8,53917,ibm-granite/granite-3.0-3b-a800m-instruct,2,8,bfloat16,0.944801819,362.751,35.286,0.276,4922.385 -0.25,moe-scattermoe-granite-ep1,4,53070,ibm-granite/granite-3.0-3b-a800m-instruct,4,8,bfloat16,0.95192752,202.3782,63.248,0.494,4411.543 -0.25,moe-scattermoe-granite-ep2,8,41880,ibm-granite/granite-3.0-3b-a800m-instruct,2,8,bfloat16,0.938050581,441.5269,28.99,0.226,4044.147 -0.25,moe-scattermoe-granite-ep2,4,43092,ibm-granite/granite-3.0-3b-a800m-instruct,4,8,bfloat16,0.945302382,235.4383,54.367,0.425,3792.076 -0.25,moe-scattermoe-granite-ep4,4,33673.5,ibm-granite/granite-3.0-3b-a800m-instruct,4,8,bfloat16,0.938171822,259.2932,49.365,0.386,3443.207 -0.25,moe-scattermoe-granite-ep1-padding-free,16,49580,ibm-granite/granite-3.0-3b-a800m-instruct,1,8,bfloat16,0.937993399,505.6847,25.312,0.198,4904.241 
-0.25,moe-scattermoe-granite-ep1-padding-free,8,43821,ibm-granite/granite-3.0-3b-a800m-instruct,2,8,bfloat16,0.944808855,311.785,41.054,0.321,3977.099 -0.25,moe-scattermoe-granite-ep1-padding-free,4,40070.5,ibm-granite/granite-3.0-3b-a800m-instruct,4,8,bfloat16,0.951866873,169.9554,75.314,0.588,3648.016 -0.25,moe-scattermoe-granite-ep1-padding-free-foak,16,49114,ibm-granite/granite-3.0-3b-a800m-instruct,1,8,bfloat16,0.938123143,476.8099,26.845,0.21,5201.235 -0.25,moe-scattermoe-granite-ep1-padding-free-foak,8,43865,ibm-granite/granite-3.0-3b-a800m-instruct,2,8,bfloat16,0.944894351,296.5204,43.167,0.337,4181.837 -0.25,moe-scattermoe-granite-ep1-padding-free-foak,4,40070.5,ibm-granite/granite-3.0-3b-a800m-instruct,4,8,bfloat16,0.951975068,163.756,78.165,0.611,3786.12 -0.25,moe-scattermoe-granite-ep2-padding-free,8,32276,ibm-granite/granite-3.0-3b-a800m-instruct,2,8,bfloat16,0.937930156,356.1296,35.942,0.281,3481.878 -0.25,moe-scattermoe-granite-ep2-padding-free,4,29787,ibm-granite/granite-3.0-3b-a800m-instruct,4,8,bfloat16,0.945339936,192.7168,66.419,0.519,3217.156 -0.25,moe-scattermoe-granite-ep2-padding-free-foak,8,32376,ibm-granite/granite-3.0-3b-a800m-instruct,2,8,bfloat16,0.938017525,342.9327,37.325,0.292,3615.87 -0.25,moe-scattermoe-granite-ep2-padding-free-foak,4,29734.5,ibm-granite/granite-3.0-3b-a800m-instruct,4,8,bfloat16,0.945357794,184.554,69.356,0.542,3359.451 -0.25,moe-scattermoe-granite-ep4-padding-free,4,23386.5,ibm-granite/granite-3.0-3b-a800m-instruct,4,8,bfloat16,0.938359724,191.205,66.944,0.523,3242.593 -0.25,moe-scattermoe-granite-ep4-padding-free-foak,4,23359.5,ibm-granite/granite-3.0-3b-a800m-instruct,4,8,bfloat16,0.938333818,183.9191,69.596,0.544,3371.048 -0.25,none,16,81018,ibm-research/moe-7b-1b-active-shared-experts,1,8,bfloat16,0.878051637,4223.9158,3.03,0.024,839.411 -0.25,none,8,74462,ibm-research/moe-7b-1b-active-shared-experts,2,8,bfloat16,0.877874975,2247.4716,5.695,0.044,788.798 
-0.25,none,4,63033,ibm-research/moe-7b-1b-active-shared-experts,4,8,bfloat16,0.878253661,1155.5903,11.077,0.087,767.054 -0.25,moe-scattermoe-granite-ep1,16,81018,ibm-research/moe-7b-1b-active-shared-experts,1,8,bfloat16,0.878006854,907.8407,14.099,0.11,3905.531 -0.25,moe-scattermoe-granite-ep1,8,73870,ibm-research/moe-7b-1b-active-shared-experts,2,8,bfloat16,0.879557709,492.5063,25.99,0.203,3599.548 -0.25,moe-scattermoe-granite-ep1,4,74108.5,ibm-research/moe-7b-1b-active-shared-experts,4,8,bfloat16,0.881521969,277.8191,46.073,0.36,3190.565 -0.25,moe-scattermoe-granite-ep2,8,54168,ibm-research/moe-7b-1b-active-shared-experts,2,8,bfloat16,0.877982622,563.0434,22.734,0.178,3148.603 -0.25,moe-scattermoe-granite-ep2,4,54582,ibm-research/moe-7b-1b-active-shared-experts,4,8,bfloat16,0.880103117,299.2522,42.773,0.334,2962.05 -0.25,moe-scattermoe-granite-ep1-padding-free,16,77632,ibm-research/moe-7b-1b-active-shared-experts,1,8,bfloat16,0.878018975,726.1255,17.628,0.138,3410.98 -0.25,moe-scattermoe-granite-ep1-padding-free,8,68019,ibm-research/moe-7b-1b-active-shared-experts,2,8,bfloat16,0.879643369,429.5618,29.798,0.233,2882.938 -0.25,moe-scattermoe-granite-ep1-padding-free,4,63879,ibm-research/moe-7b-1b-active-shared-experts,4,8,bfloat16,0.88148216,239.3677,53.474,0.418,2586.815 -0.25,moe-scattermoe-granite-ep1-padding-free-foak,16,72666,ibm-research/moe-7b-1b-active-shared-experts,1,8,bfloat16,0.878073001,688.38,18.594,0.145,3598.013 -0.25,moe-scattermoe-granite-ep1-padding-free-foak,8,63074,ibm-research/moe-7b-1b-active-shared-experts,2,8,bfloat16,0.879622684,419.7876,30.492,0.238,2950.063 -0.25,moe-scattermoe-granite-ep1-padding-free-foak,4,60126.5,ibm-research/moe-7b-1b-active-shared-experts,4,8,bfloat16,0.881447418,231.7976,55.221,0.431,2671.296 -0.25,moe-scattermoe-granite-ep2-padding-free,8,45093,ibm-research/moe-7b-1b-active-shared-experts,2,8,bfloat16,0.8779908,471.1344,27.168,0.212,2628.549 
-0.25,moe-scattermoe-granite-ep2-padding-free,4,42590,ibm-research/moe-7b-1b-active-shared-experts,4,8,bfloat16,0.879999972,250.48,51.102,0.399,2472.054 -0.25,moe-scattermoe-granite-ep2-padding-free-foak,8,40281,ibm-research/moe-7b-1b-active-shared-experts,2,8,bfloat16,0.878110015,461.6668,27.726,0.217,2682.454 -0.25,moe-scattermoe-granite-ep2-padding-free-foak,4,38934.5,ibm-research/moe-7b-1b-active-shared-experts,4,8,bfloat16,0.880085612,250.2941,51.14,0.4,2473.889 -0.25,moe-scattermoe-granite-ep8,16,56845,mistralai/Mixtral-8x7B-Instruct-v0.1,8,1,bfloat16,0.86557077,779.9315,16.412,0.128,430.807 -0.25,moe-scattermoe-granite-ep8-foak,16,56769.25,mistralai/Mixtral-8x7B-Instruct-v0.1,8,1,bfloat16,0.86551428,734.0756,17.437,0.136,457.719 \ No newline at end of file +0.25,none,16,77748,ibm-granite/granite-3.0-3b-a800m-instruct,1,8,bfloat16,0.93802941,1830.0024,6.995,0.055,1951.473 +0.25,none,8,56837,ibm-granite/granite-3.0-3b-a800m-instruct,2,8,bfloat16,0.937978864,970.492,13.189,0.103,1839.891 +0.25,none,4,47395,ibm-granite/granite-3.0-3b-a800m-instruct,4,8,bfloat16,0.938433378,508.0143,25.196,0.197,1757.431 +0.25,moe-scattermoe-granite-ep1,16,78376,ibm-granite/granite-3.0-3b-a800m-instruct,1,8,bfloat16,0.938084066,656.3588,19.502,0.152,5440.927 +0.25,moe-scattermoe-granite-ep2,8,45422,ibm-granite/granite-3.0-3b-a800m-instruct,2,8,bfloat16,0.938047224,439.6446,29.114,0.227,4061.462 +0.25,moe-scattermoe-granite-ep2,4,46506,ibm-granite/granite-3.0-3b-a800m-instruct,4,8,bfloat16,0.945220579,234.7146,54.534,0.426,3803.769 +0.25,moe-scattermoe-granite-ep4,4,37025.5,ibm-granite/granite-3.0-3b-a800m-instruct,4,8,bfloat16,0.938337043,255.5461,50.089,0.391,3493.694 +0.25,moe-scattermoe-granite-ep1-padding-free,16,49462,ibm-granite/granite-3.0-3b-a800m-instruct,1,8,bfloat16,1.196784412,431.9774,29.631,0.231,5741.041 +0.25,moe-scattermoe-granite-ep1-padding-free-foak,16,49060,ibm-granite/granite-3.0-3b-a800m-instruct,1,8,bfloat16,1.200383433,398.976,32.082,0.251,6215.913 
+0.25,moe-scattermoe-granite-ep2-padding-free,8,32265,ibm-granite/granite-3.0-3b-a800m-instruct,2,8,bfloat16,1.198062455,335.7106,38.128,0.298,3693.657 +0.25,moe-scattermoe-granite-ep2-padding-free,4,29720,ibm-granite/granite-3.0-3b-a800m-instruct,4,8,bfloat16,1.210442821,180.924,70.748,0.553,3426.854 +0.25,moe-scattermoe-granite-ep2-padding-free-foak,8,32285,ibm-granite/granite-3.0-3b-a800m-instruct,2,8,bfloat16,1.199450992,320.9043,39.887,0.312,3864.08 +0.25,moe-scattermoe-granite-ep2-padding-free-foak,4,29771,ibm-granite/granite-3.0-3b-a800m-instruct,4,8,bfloat16,1.21032447,175.5856,72.899,0.57,3531.042 +0.25,moe-scattermoe-granite-ep4-padding-free,4,23248,ibm-granite/granite-3.0-3b-a800m-instruct,4,8,bfloat16,1.200576434,175.5905,72.897,0.57,3530.942 +0.25,moe-scattermoe-granite-ep4-padding-free-foak,4,23422,ibm-granite/granite-3.0-3b-a800m-instruct,4,8,bfloat16,1.199994416,173.1652,73.918,0.577,3580.397 +0.25,none,16,78704,ibm-research/moe-7b-1b-active-shared-experts,1,8,bfloat16,0.878016037,3924.9586,3.261,0.025,903.347 +0.25,none,8,79299,ibm-research/moe-7b-1b-active-shared-experts,2,8,bfloat16,0.877915607,2059.5193,6.215,0.049,860.783 +0.25,none,4,67966.5,ibm-research/moe-7b-1b-active-shared-experts,4,8,bfloat16,0.878266089,1054.1087,12.143,0.095,840.9 +0.25,moe-scattermoe-granite-ep1,16,80638,ibm-research/moe-7b-1b-active-shared-experts,1,8,bfloat16,0.878047609,899.6248,14.228,0.111,3941.198 +0.25,moe-scattermoe-granite-ep2,8,58769,ibm-research/moe-7b-1b-active-shared-experts,2,8,bfloat16,0.877957979,550.4483,23.254,0.182,3220.647 +0.25,moe-scattermoe-granite-ep2,4,58932,ibm-research/moe-7b-1b-active-shared-experts,4,8,bfloat16,0.880045412,300.1744,42.642,0.333,2952.95 +0.25,moe-scattermoe-granite-ep1-padding-free,16,77512,ibm-research/moe-7b-1b-active-shared-experts,1,8,bfloat16,1.256636418,630.1126,20.314,0.159,3930.726 
+0.25,moe-scattermoe-granite-ep1-padding-free-foak,16,72604,ibm-research/moe-7b-1b-active-shared-experts,1,8,bfloat16,1.261311768,598.0884,21.402,0.167,4141.194 +0.25,moe-scattermoe-granite-ep2-padding-free,8,45237,ibm-research/moe-7b-1b-active-shared-experts,2,8,bfloat16,1.259768015,436.0593,29.354,0.229,2839.981 +0.25,moe-scattermoe-granite-ep2-padding-free,4,42449,ibm-research/moe-7b-1b-active-shared-experts,4,8,bfloat16,1.267803932,236.4495,54.134,0.423,2618.741 +0.25,moe-scattermoe-granite-ep2-padding-free-foak,8,40279,ibm-research/moe-7b-1b-active-shared-experts,2,8,bfloat16,1.262602715,434.2257,29.478,0.23,2851.973 +0.25,moe-scattermoe-granite-ep2-padding-free-foak,4,38827,ibm-research/moe-7b-1b-active-shared-experts,4,8,bfloat16,1.268255376,231.5911,55.27,0.432,2673.678 +0.25,none,16,78670,ibm-granite/granite-4.0-tiny-preview,1,8,bfloat16,0.855069562,4106.7072,3.117,0.024,863.368 +0.25,none,8,77928,ibm-granite/granite-4.0-tiny-preview,2,8,bfloat16,0.854871556,2107.9397,6.072,0.047,841.011 +0.25,none,4,70000,ibm-granite/granite-4.0-tiny-preview,4,8,bfloat16,0.855348825,1117.3656,11.456,0.089,793.295 +0.25,moe-scattermoe-granite-ep1,16,78634,ibm-granite/granite-4.0-tiny-preview,1,8,bfloat16,0.855006663,968.4797,13.217,0.103,3660.996 +0.25,moe-scattermoe-granite-ep2,8,61692,ibm-granite/granite-4.0-tiny-preview,2,8,bfloat16,0.854951358,611.0101,20.949,0.164,2901.425 +0.25,moe-scattermoe-granite-ep2,4,61213,ibm-granite/granite-4.0-tiny-preview,4,8,bfloat16,0.856631212,337.9265,37.878,0.296,2623.055 +0.25,moe-scattermoe-granite-ep1-padding-free,16,79842,ibm-granite/granite-4.0-tiny-preview,1,8,bfloat16,0.852907363,823.4639,15.544,0.121,3007.782 +0.25,moe-scattermoe-granite-ep1-padding-free-foak,16,76916,ibm-granite/granite-4.0-tiny-preview,1,8,bfloat16,0.852861792,734.3252,17.431,0.136,3372.893 +0.25,moe-scattermoe-granite-ep2-padding-free,8,48068,ibm-granite/granite-4.0-tiny-preview,2,8,bfloat16,0.852783817,554.2306,23.095,0.18,2234.449 
+0.25,moe-scattermoe-granite-ep2-padding-free,4,44790,ibm-granite/granite-4.0-tiny-preview,4,8,bfloat16,0.854414411,308.2351,41.527,0.324,2008.856 +0.25,moe-scattermoe-granite-ep2-padding-free-foak,8,43180,ibm-granite/granite-4.0-tiny-preview,2,8,bfloat16,0.85276741,541.444,23.64,0.185,2287.217 +0.25,moe-scattermoe-granite-ep2-padding-free-foak,4,41128,ibm-granite/granite-4.0-tiny-preview,4,8,bfloat16,0.854435267,308.1642,41.536,0.325,2009.318 +0.25,moe-scattermoe-granite-ep8,16,56687.5,mistralai/Mixtral-8x7B-Instruct-v0.1,8,1,bfloat16,0.8654898,810.9653,15.784,0.123,414.321 +0.25,moe-scattermoe-granite-ep8-foak,16,56710.25,mistralai/Mixtral-8x7B-Instruct-v0.1,8,1,bfloat16,0.86548216,775.5419,16.505,0.129,433.245 \ No newline at end of file diff --git a/scripts/benchmarks/refs/requirements_moe.txt b/scripts/benchmarks/refs/requirements_moe.txt index 9f5fc5f5..38b75c62 100644 --- a/scripts/benchmarks/refs/requirements_moe.txt +++ b/scripts/benchmarks/refs/requirements_moe.txt @@ -1,87 +1,91 @@ -accelerate==1.0.1 -aiohappyeyeballs==2.4.6 -aiohttp==3.11.12 +accelerate==1.6.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.12.12 aiosignal==1.3.2 -attrs==25.1.0 -bitsandbytes==0.43.3 -certifi==2025.1.31 -charset-normalizer==3.4.1 -contourpy==1.3.1 +attrs==25.3.0 +bitsandbytes==0.45.1 +causal-conv1d==1.5.0.post8 +certifi==2025.4.26 +charset-normalizer==3.4.2 +contourpy==1.3.2 cycler==0.12.1 -datasets==2.21.0 +datasets==3.6.0 dill==0.3.8 einops==0.8.1 -filelock==3.17.0 +filelock==3.18.0 flash_attn==2.7.4.post1 --e git+https://github.com/foundation-model-stack/fms-acceleration.git@21af5fb9f2989b3dbf443c016e4c0470b536a593#egg=fms_acceleration&subdirectory=plugins/framework --e git+https://github.com/foundation-model-stack/fms-acceleration.git@21af5fb9f2989b3dbf443c016e4c0470b536a593#egg=fms_acceleration_aadp&subdirectory=plugins/attention-and-distributed-packing --e 
git+https://github.com/foundation-model-stack/fms-acceleration.git@21af5fb9f2989b3dbf443c016e4c0470b536a593#egg=fms_acceleration_foak&subdirectory=plugins/fused-ops-and-kernels --e git+https://github.com/foundation-model-stack/fms-acceleration.git@21af5fb9f2989b3dbf443c016e4c0470b536a593#egg=fms_acceleration_moe&subdirectory=plugins/accelerated-moe --e git+https://github.com/foundation-model-stack/fms-acceleration.git@21af5fb9f2989b3dbf443c016e4c0470b536a593#egg=fms_acceleration_peft&subdirectory=plugins/accelerated-peft -fms-hf-tuning @ git+https://github.com/foundation-model-stack/fms-hf-tuning.git@fdc7527510692ada03e4303df1549cebc5139b31 -fonttools==4.56.0 -frozenlist==1.5.0 -fsspec==2024.6.1 -huggingface-hub==0.29.0 +-e git+https://github.com/kmehant/fms-acceleration.git@bb3c88ecf960403295c34a49b9a4acdeff782d1d#egg=fms_acceleration&subdirectory=plugins/framework +-e git+https://github.com/kmehant/fms-acceleration.git@bb3c88ecf960403295c34a49b9a4acdeff782d1d#egg=fms_acceleration_aadp&subdirectory=plugins/attention-and-distributed-packing +-e git+https://github.com/kmehant/fms-acceleration.git@bb3c88ecf960403295c34a49b9a4acdeff782d1d#egg=fms_acceleration_foak&subdirectory=plugins/fused-ops-and-kernels +-e git+https://github.com/kmehant/fms-acceleration.git@bb3c88ecf960403295c34a49b9a4acdeff782d1d#egg=fms_acceleration_moe&subdirectory=plugins/accelerated-moe +-e git+https://github.com/kmehant/fms-acceleration.git@bb3c88ecf960403295c34a49b9a4acdeff782d1d#egg=fms_acceleration_peft&subdirectory=plugins/accelerated-peft +fms-hf-tuning @ git+https://github.com/foundation-model-stack/fms-hf-tuning.git@9ca5739a4f7c1f0c9446b28d1bb80939d4199a75 +fonttools==4.58.2 +frozenlist==1.7.0 +fsspec==2025.3.0 +hf-xet==1.1.3 +huggingface-hub==0.33.0 idna==3.10 -Jinja2==3.1.5 -kernel-hyperdrive @ git+https://github.com/fabianlim/kernel-hyperdrive.git@45036497e12444ca98a6f0072204538aee4543ba +Jinja2==3.1.6 kiwisolver==1.4.8 llvmlite==0.44.0 +mamba-ssm==2.2.4 markdown-it-py==3.0.0 
MarkupSafe==3.0.2 -matplotlib==3.10.0 +matplotlib==3.10.3 mdurl==0.1.2 mpmath==1.3.0 -multidict==6.1.0 +multidict==6.4.4 multiprocess==0.70.16 -networkx==3.4.2 -numba==0.61.0 +networkx==3.5 +ninja==1.11.1.4 +numba==0.61.2 numpy==1.26.4 -nvidia-cublas-cu12==12.1.3.1 -nvidia-cuda-cupti-cu12==12.1.105 -nvidia-cuda-nvrtc-cu12==12.1.105 -nvidia-cuda-runtime-cu12==12.1.105 +nvidia-cublas-cu12==12.4.5.8 +nvidia-cuda-cupti-cu12==12.4.127 +nvidia-cuda-nvrtc-cu12==12.4.127 +nvidia-cuda-runtime-cu12==12.4.127 nvidia-cudnn-cu12==9.1.0.70 -nvidia-cufft-cu12==11.0.2.54 -nvidia-curand-cu12==10.3.2.106 -nvidia-cusolver-cu12==11.4.5.107 -nvidia-cusparse-cu12==12.1.0.106 +nvidia-cufft-cu12==11.2.1.3 +nvidia-cufile-cu12==1.11.1.6 +nvidia-curand-cu12==10.3.5.147 +nvidia-cusolver-cu12==11.6.1.9 +nvidia-cusparse-cu12==12.3.1.170 nvidia-cusparselt-cu12==0.6.2 -nvidia-nccl-cu12==2.20.5 +nvidia-nccl-cu12==2.21.5 nvidia-nvjitlink-cu12==12.4.127 -nvidia-nvtx-cu12==12.1.105 -packaging==24.2 -pandas==2.2.3 -peft==0.13.2 -pillow==11.1.0 -propcache==0.2.1 -protobuf==5.29.3 +nvidia-nvtx-cu12==12.4.127 +packaging==25.0 +pandas==2.3.0 +peft==0.14.0 +pillow==11.2.1 +propcache==0.3.2 +protobuf==5.29.5 psutil==7.0.0 -pyarrow==19.0.1 +pyarrow==20.0.0 Pygments==2.19.1 -pyparsing==3.2.1 +pyparsing==3.2.3 python-dateutil==2.9.0.post0 -pytz==2025.1 +pytz==2025.2 PyYAML==6.0.2 regex==2024.11.6 -requests==2.32.3 -rich==13.9.4 -safetensors==0.5.2 +requests==2.32.4 +rich==14.0.0 +safetensors==0.5.3 sentencepiece==0.2.0 -setuptools==75.8.0 -simpleeval==0.9.13 +setuptools==80.9.0 +simpleeval==1.0.3 six==1.17.0 sympy==1.13.1 -threadpoolctl==3.5.0 -tokenizers==0.21.0 -torch==2.4.1 +threadpoolctl==3.6.0 +tokenizers==0.21.1 +torch==2.6.0 tqdm==4.67.1 -transformers==4.49.0 -triton==3.0.0 -trl==0.14.0 -typing_extensions==4.12.2 -tzdata==2025.1 -urllib3==2.3.0 +transformers==4.52.4 +triton==3.2.0 +trl==0.17.0 +typing_extensions==4.14.0 +tzdata==2025.2 +urllib3==2.4.0 xxhash==3.5.0 -yarl==1.18.3 +yarl==1.20.1 diff --git 
a/scripts/benchmarks/scenarios-moe.yaml b/scripts/benchmarks/scenarios-moe.yaml index a5171e11..2d763199 100644 --- a/scripts/benchmarks/scenarios-moe.yaml +++ b/scripts/benchmarks/scenarios-moe.yaml @@ -59,6 +59,7 @@ scenarios: model_name_or_path: - 'ibm-granite/granite-3.0-3b-a800m-instruct' - 'ibm-research/moe-7b-1b-active-shared-experts' + - 'ibm-granite/granite-4.0-tiny-preview' - name: accelerated-moe-full-mixtral @@ -77,4 +78,4 @@ scenarios: packing: False adam_epsilon: 1e-8 model_name_or_path: - - 'mistralai/Mixtral-8x7B-Instruct-v0.1' \ No newline at end of file + - 'mistralai/Mixtral-8x7B-Instruct-v0.1' diff --git a/tox.ini b/tox.ini index a62ae961..2787267c 100644 --- a/tox.ini +++ b/tox.ini @@ -34,6 +34,9 @@ commands = # some models need this for tokenizers pip install protobuf + # install mamba kernels to benchmark mamba based models + bash -c '[[ "{env:INSTALL_MAMBA:false}" == "true" ]] && pip install --no-build-isolation "mamba_ssm[causal-conv1d]>=2.0.0" || true' + # install the plugins for test # NOTE: when there are more plugins install here python -m fms_acceleration.cli install -e {toxinidir}/plugins/accelerated-peft