diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py index 85c17ee8..07a7f772 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py @@ -33,11 +33,14 @@ PreTrainedModel, ) from transformers.modeling_utils import ( + dtype_byte_size, is_local_dist_rank_0, no_init_weights, - shard_checkpoint, ) +from transformers.pytorch_utils import id_tensor_storage +from transformers.utils import WEIGHTS_NAME from transformers.utils.generic import ContextManagers +from transformers.utils.hub import convert_file_size_to_int import accelerate import torch import torch.nn as nn @@ -688,7 +691,7 @@ def save_quantized( torch.save(model.state_dict(), join(save_dir, model_save_name)) else: # Shard checkpoint - shards, index = shard_checkpoint( + shards, index = self.shard_checkpoint( state_dict, max_shard_size=max_shard_size, weights_name=model_save_name ) @@ -766,6 +769,106 @@ def save_quantized( quantize_config.model_file_base_name = model_base_name quantize_config.save_pretrained(save_dir) + # added by anh.uong@ibm.com + # adapted from transformers.modeling_utils.shard_checkpoint + # from transformers v4.46, removed in later versions + # TODO: split_torch_state_dict_into_shards from huggingface_hub library + def shard_checkpoint( + self, + state_dict: Dict[str, torch.Tensor], + max_shard_size: Union[int, str] = "10GB", + weights_name: str = WEIGHTS_NAME, + ): + """ + Splits a model state dictionary in sub-checkpoints so that the final size of each sub-checkpoint does not exceed a + given size. + + The sub-checkpoints are determined by iterating through the `state_dict` in the order of its keys, so there is no + optimization made to make each sub-checkpoint as close as possible to the maximum size passed. For example, if the + limit is 10GB and we have weights of sizes [6GB, 6GB, 2GB, 6GB, 2GB, 2GB] they will get sharded as [6GB], [6+2GB], + [6+2+2GB] and not [6+2+2GB], [6+2GB], [6GB]. + + + + If one of the model's weight is bigger than `max_shard_size`, it will end up in its own sub-checkpoint which will + have a size greater than `max_shard_size`. + + + + Args: + state_dict (`Dict[str, torch.Tensor]`): The state dictionary of a model to save. + max_shard_size (`int` or `str`, *optional*, defaults to `"10GB"`): + The maximum size of each sub-checkpoint. If expressed as a string, needs to be digits followed by a unit + (like `"5MB"`). + weights_name (`str`, *optional*, defaults to `"pytorch_model.bin"`): + The name of the model save file. + """ + logger.warning( + "Note that `shard_checkpoint` is deprecated and will be removed in v4.44. 
We recommend you using " + "split_torch_state_dict_into_shards from huggingface_hub library" + ) + max_shard_size = convert_file_size_to_int(max_shard_size) + + sharded_state_dicts = [{}] + last_block_size = 0 + total_size = 0 + storage_id_to_block = {} + + for key, weight in state_dict.items(): + # when bnb serialization is used the weights in the state dict can be strings + # check: https://github.com/huggingface/transformers/pull/24416 for more details + if isinstance(weight, str): + continue + else: + storage_id = id_tensor_storage(weight) + + # If a `weight` shares the same underlying storage as another tensor, we put `weight` in the same `block` + if storage_id in storage_id_to_block and weight.device != torch.device( + "meta" + ): + block_id = storage_id_to_block[storage_id] + sharded_state_dicts[block_id][key] = weight + continue + + weight_size = weight.numel() * dtype_byte_size(weight.dtype) + # If this weight is going to tip up over the maximal size, we split, but only if we have put at least one + # weight in the current shard. + if ( + last_block_size + weight_size > max_shard_size + and len(sharded_state_dicts[-1]) > 0 + ): + sharded_state_dicts.append({}) + last_block_size = 0 + + sharded_state_dicts[-1][key] = weight + last_block_size += weight_size + total_size += weight_size + storage_id_to_block[storage_id] = len(sharded_state_dicts) - 1 + + # If we only have one shard, we return it + if len(sharded_state_dicts) == 1: + return {weights_name: sharded_state_dicts[0]}, None + + # Otherwise, let's build the index + weight_map = {} + shards = {} + for idx, shard in enumerate(sharded_state_dicts): + shard_file = weights_name.replace( + ".bin", f"-{idx+1:05d}-of-{len(sharded_state_dicts):05d}.bin" + ) + shard_file = shard_file.replace( + ".safetensors", + f"-{idx + 1:05d}-of-{len(sharded_state_dicts):05d}.safetensors", + ) + shards[shard_file] = shard + for key in shard.keys(): + weight_map[key] = shard_file + + # Add the metadata + metadata = {"total_size": total_size} + index = {"metadata": metadata, "weight_map": weight_map} + return shards, index + def save_pretrained( self, save_dir: str, diff --git a/plugins/attention-and-distributed-packing/src/fms_acceleration_aadp/multipack_sampler.py b/plugins/attention-and-distributed-packing/src/fms_acceleration_aadp/multipack_sampler.py index 481557e4..c10ff1a6 100644 --- a/plugins/attention-and-distributed-packing/src/fms_acceleration_aadp/multipack_sampler.py +++ b/plugins/attention-and-distributed-packing/src/fms_acceleration_aadp/multipack_sampler.py @@ -20,7 +20,7 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -taken from https://github.com/imoneoi/multipack_sampler with some modifications +taken from https://github.com/imoneoi/multipack_sampler with some modifications taken from https://github.com/instructlab/training/blob/main/src/instructlab/training/multipack_sampler.py """ diff --git a/plugins/framework/src/fms_acceleration/model_patcher.py b/plugins/framework/src/fms_acceleration/model_patcher.py index 4db02d1d..36137d98 100644 --- a/plugins/framework/src/fms_acceleration/model_patcher.py +++ b/plugins/framework/src/fms_acceleration/model_patcher.py @@ -184,11 +184,11 @@ def __post_init__(self): self.import_and_maybe_reload is not None, ] ) - != 1 + > 1 ): raise ValueError( - f"Rule '{self.rule_id}' must only have only one of forward, " - "foward builder, or import_and_maybe_reload, specified." 
+ f"Rule '{self.rule_id}' must only have at most one of forward, " + "forward builder, or import_and_maybe_reload, specified." ) if self.import_and_maybe_reload is not None and self.trigger is not None: @@ -425,7 +425,7 @@ def _patch_forwards( # otherwise triggered if rule.forward is not None: forward = rule.forward - else: + elif rule.forward_builder is not None: fba = {} if rule.forward_builder_args is not None: fba = { @@ -434,6 +434,9 @@ def _patch_forwards( if rule.forward_builder_args } forward = rule.forward_builder(mod, **fba) + else: + # trigger-only case + forward = None if isinstance(forward, list): # this will be list of tuples case @@ -468,7 +471,8 @@ def _patch_forwards( continue # otherwise - mod.forward = MethodType(forward, mod) + if forward is not None: + mod.forward = MethodType(forward, mod) ModelPatcher.history.append( ModelPatcherHistory( instance=mod_id, diff --git a/plugins/framework/tests/test_model_patcher_helpers.py b/plugins/framework/tests/test_model_patcher_helpers.py index 0660f577..52d56b3e 100644 --- a/plugins/framework/tests/test_model_patcher_helpers.py +++ b/plugins/framework/tests/test_model_patcher_helpers.py @@ -254,12 +254,12 @@ def test_combine_mp_triggers_produces_correct_output( def test_mp_rule_raises_error_when_arguments_incorrectly_configured(): - "Ensure MP rule is throws appropriate error when wrong argument combinations are passed" + "Ensure MP rule throws appropriate error when wrong argument combinations are passed" # Test mp rule construction raises with multiple arguments with pytest.raises( ValueError, - match="must only have only one of forward, " - "foward builder, or import_and_maybe_reload, specified.", + match="must only have at most one of forward, " + "forward builder, or import_and_maybe_reload, specified.", ): ModelPatcherRule( rule_id=DUMMY_RULE_ID, diff --git a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/framework_plugin_fast_kernels.py b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/framework_plugin_fast_kernels.py index 0d7ce802..b1a4b2b3 100644 --- a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/framework_plugin_fast_kernels.py +++ b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/framework_plugin_fast_kernels.py @@ -73,7 +73,7 @@ def register_foak_model_patch_rules( FILTER_MAP = { "fused_lora": {"qkvo", "mlp"}, "fast_loss": { - True: "cross-ent", + True: {"cross-ent", "custom-loss"}, "fused_ce_liger": "fused-lce", }, "fast_rms_layernorm": "rms", diff --git a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/kernels/unsloth/cross_entropy_loss.py b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/kernels/unsloth/cross_entropy_loss.py index cade9c85..3da07310 100644 --- a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/kernels/unsloth/cross_entropy_loss.py +++ b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/kernels/unsloth/cross_entropy_loss.py @@ -16,6 +16,7 @@ import triton.language as tl import torch from .utils import calculate_settings, MAX_FUSED_SIZE +from typing import Type @triton.jit @@ -290,3 +291,55 @@ def forward(self, input, target): ) n_items = torch.count_nonzero(target != -100) return loss.sum() / n_items + + +# added by flim@sg.ibm.com + +# adapted from transformers.loss.loss_utils.ForCausalLMLoss +def FastForCausalLMLoss( + logits, labels, vocab_size: int, num_items_in_batch: int = None, ignore_index: int = -100, **kwargs +): + # Upcast to float if we need to compute the loss to avoid potential precision issues + logits = logits.float() 
+ labels = labels.to(logits.device) + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + + # Flatten the tokens + shift_logits = shift_logits.view(-1, vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + + reduction = "sum" if num_items_in_batch is not None else "mean" + assert ignore_index == -100, "FastForCausalLMLoss currently supports only hardcoded ignore index -100." + loss = Fast_CrossEntropyLoss.apply( + shift_logits, shift_labels + ) + if reduction == "sum": + n_items = num_items_in_batch + else: + n_items = torch.count_nonzero(shift_labels != -100) + return loss.sum() / n_items + + +def replace_custom_loss_when_triggered( + module_cls: Type, + custom_loss_type: str, +): + + # this is a special trigger that will perform the replacement + def _trigger(mod): + if isinstance (mod, module_cls) and hasattr(mod, "loss_function"): + # guarded + from transformers.loss.loss_utils import LOSS_MAPPING + LOSS_MAPPING[custom_loss_type] = FastForCausalLMLoss + mod.loss_type = custom_loss_type + return True + + return False + + return _trigger + + diff --git a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/granite.py b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/granite.py index ee2f4206..6da7f056 100644 --- a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/granite.py +++ b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/granite.py @@ -27,7 +27,10 @@ # Local from ..fused_ops.liger_ce.fused_linear_cross_entropy_loss import lce_forward -from ..kernels.unsloth.cross_entropy_loss import FastCrossEntropyLoss +from ..kernels.unsloth.cross_entropy_loss import ( + FastCrossEntropyLoss, + replace_custom_loss_when_triggered, +) from ..kernels.unsloth.rms_layernorm import fast_rms_layernorm from ..kernels.unsloth.rope_embedding import fast_rope_embedding from ..utils import filter_mp_rules @@ -37,6 +40,7 @@ KEY_QKV, build_lora_fused_ops, get_hidden_activation_fn_key, + get_transformers_version, trigger_fused_ops, ) @@ -122,16 +126,27 @@ def get_mp_rules(base_type: str, config: PretrainedConfig = None): base_type=base_type, ), ), - # TODO: have a generic version of this rule - # - get the module_name and reload on that - ModelPatcherRule( - rule_id="granite-cross-ent", - import_and_maybe_reload=( - "torch.nn.CrossEntropyLoss", - FastCrossEntropyLoss, - "transformers.models.granite.modeling_granite", - ), - ), + *[ + ( + ModelPatcherRule( + rule_id="granite-custom-loss", + trigger=ModelPatcherTrigger( + check=replace_custom_loss_when_triggered( + GraniteForCausalLM, custom_loss_type="granite-custom-loss" + ) + ), + ) + if get_transformers_version() >= "4.46" + else ModelPatcherRule( + rule_id="granite-cross-ent", + import_and_maybe_reload=( + "torch.nn.CrossEntropyLoss", + FastCrossEntropyLoss, + "transformers.models.granite.modeling_granite", + ), + ) + ) + ], ModelPatcherRule( rule_id="granite-fused-lce", trigger=ModelPatcherTrigger(check=GraniteForCausalLM), diff --git a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/llama.py b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/llama.py index ad2df4a7..760aeeb4 100644 --- a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/llama.py +++ b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/llama.py @@ -33,7 +33,10 @@ # Local from 
..fused_ops.liger_ce.fused_linear_cross_entropy_loss import lce_forward -from ..kernels.unsloth.cross_entropy_loss import FastCrossEntropyLoss +from ..kernels.unsloth.cross_entropy_loss import ( + FastCrossEntropyLoss, + replace_custom_loss_when_triggered, +) from ..kernels.unsloth.rms_layernorm import fast_rms_layernorm from ..kernels.unsloth.rope_embedding import fast_rope_embedding from ..utils import filter_mp_rules @@ -43,6 +46,7 @@ KEY_QKV, build_lora_fused_ops, get_hidden_activation_fn_key, + get_transformers_version, trigger_fused_ops, ) @@ -122,14 +126,27 @@ def get_mp_rules(base_type: str, config: PretrainedConfig = None): trigger=ModelPatcherTrigger(check=LlamaForCausalLM), forward=lce_forward, ), - ModelPatcherRule( - rule_id="llama-cross-ent", - import_and_maybe_reload=( - "torch.nn.CrossEntropyLoss", - FastCrossEntropyLoss, - "transformers.models.llama.modeling_llama", - ), - ), + *[ + ( + ModelPatcherRule( + rule_id="llama-custom-loss", + trigger=ModelPatcherTrigger( + check=replace_custom_loss_when_triggered( + LlamaForCausalLM, custom_loss_type="llama-custom-loss" + ) + ), + ) + if get_transformers_version() >= "4.46" + else ModelPatcherRule( + rule_id="llama-cross-ent", + import_and_maybe_reload=( + "torch.nn.CrossEntropyLoss", + FastCrossEntropyLoss, + "transformers.models.llama.modeling_llama", + ), + ) + ) + ], # TODO: have a generic version of this rule # - get the module name # - check if "apply_rotary_pos_emb" exists diff --git a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/mistral.py b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/mistral.py index 56dc48f0..515ae8f3 100644 --- a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/mistral.py +++ b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/mistral.py @@ -33,7 +33,10 @@ # Local from ..fused_ops.liger_ce.fused_linear_cross_entropy_loss import lce_forward -from ..kernels.unsloth.cross_entropy_loss import FastCrossEntropyLoss +from ..kernels.unsloth.cross_entropy_loss import ( + FastCrossEntropyLoss, + replace_custom_loss_when_triggered, +) from ..kernels.unsloth.rms_layernorm import fast_rms_layernorm from ..kernels.unsloth.rope_embedding import fast_rope_embedding from ..utils import filter_mp_rules @@ -43,6 +46,7 @@ KEY_QKV, build_lora_fused_ops, get_hidden_activation_fn_key, + get_transformers_version, trigger_fused_ops, ) @@ -114,14 +118,27 @@ def get_mp_rules(base_type: str, config: PretrainedConfig = None): base_type=base_type, ), ), - ModelPatcherRule( - rule_id="mistral-cross-ent", - import_and_maybe_reload=( - "torch.nn.CrossEntropyLoss", - FastCrossEntropyLoss, - "transformers.models.mistral.modeling_mistral", - ), - ), + *[ + ( + ModelPatcherRule( + rule_id="mistral-custom-loss", + trigger=ModelPatcherTrigger( + check=replace_custom_loss_when_triggered( + MistralForCausalLM, custom_loss_type="mistral-custom-loss" + ) + ), + ) + if get_transformers_version() >= "4.46" + else ModelPatcherRule( + rule_id="mistral-cross-ent", + import_and_maybe_reload=( + "torch.nn.CrossEntropyLoss", + FastCrossEntropyLoss, + "transformers.models.mistral.modeling_mistral", + ), + ) + ) + ], ModelPatcherRule( rule_id="mistral-fused-lce", trigger=ModelPatcherTrigger(check=MistralForCausalLM), diff --git a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/mixtral.py b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/mixtral.py index 67eada1c..432bdf34 100644 --- a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/mixtral.py +++ 
b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/mixtral.py @@ -24,14 +24,24 @@ ) from transformers.models.mixtral.modeling_mixtral import ( MixtralAttention, + MixtralForCausalLM, MixtralRMSNorm, ) # Local -from ..kernels.unsloth.cross_entropy_loss import FastCrossEntropyLoss +from ..kernels.unsloth.cross_entropy_loss import ( + FastCrossEntropyLoss, + replace_custom_loss_when_triggered, +) from ..kernels.unsloth.rms_layernorm import fast_rms_layernorm from ..kernels.unsloth.rope_embedding import fast_rope_embedding -from .utils import KEY_O, KEY_QKV, build_lora_fused_ops, trigger_fused_ops +from .utils import ( + KEY_O, + KEY_QKV, + build_lora_fused_ops, + get_transformers_version, + trigger_fused_ops, +) def get_mp_rules(base_type): @@ -85,14 +95,27 @@ def get_mp_rules(base_type): logic="APPEND", ), ), - ModelPatcherRule( - rule_id="mixtral-cross-ent", - import_and_maybe_reload=( - "torch.nn.CrossEntropyLoss", - FastCrossEntropyLoss, - "transformers.models.mixtral.modeling_mixtral", - ), - ), + *[ + ( + ModelPatcherRule( + rule_id="mixtral-custom-loss", + trigger=ModelPatcherTrigger( + check=replace_custom_loss_when_triggered( + MixtralForCausalLM, custom_loss_type="mixtral-custom-loss" + ) + ), + ) + if get_transformers_version() >= "4.46" + else ModelPatcherRule( + rule_id="mixtral-cross-ent", + import_and_maybe_reload=( + "torch.nn.CrossEntropyLoss", + FastCrossEntropyLoss, + "transformers.models.mixtral.modeling_mixtral", + ), + ) + ) + ], ModelPatcherRule( rule_id="mixtral-rope", import_and_maybe_reload=( diff --git a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/utils.py b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/utils.py index 375a2a6e..be7acbc3 100644 --- a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/utils.py +++ b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/utils.py @@ -6,6 +6,7 @@ # Third Party from fms_acceleration.model_patcher import ModelPatcherTrigger from transformers import PretrainedConfig +from transformers.utils.import_utils import _is_package_available import torch # Local @@ -214,3 +215,10 @@ def get_hidden_activation_fn_key(config: PretrainedConfig): "Unable to determine activation function key for " f"architecture {config.architectures}." 
) + + +def get_transformers_version(): + _, _transformers_version = _is_package_available( + "transformers", return_version=True + ) + return _transformers_version diff --git a/scripts/benchmarks/benchmark.py b/scripts/benchmarks/benchmark.py index df314868..72e0e23c 100644 --- a/scripts/benchmarks/benchmark.py +++ b/scripts/benchmarks/benchmark.py @@ -171,9 +171,7 @@ def __init__( ) -> None: self.dataset_split = datasets.load_dataset( - dataset_name, - split=dataset_split, - **additional_dataset_kwargs + dataset_name, split=dataset_split, **additional_dataset_kwargs ) self.kwargs = { @@ -206,9 +204,8 @@ def prepare_dataset( ) response_template = self.response_template - if ( - self.kwargs['tokenize'] - or (not self.kwargs['tokenize'] and self.kwargs['chat_template']) + if self.kwargs["tokenize"] or ( + not self.kwargs["tokenize"] and self.kwargs["chat_template"] ): tokenizer = AutoTokenizer.from_pretrained(model_name) # for now, if pad_token_id is None, will just do a replacement diff --git a/scripts/benchmarks/refs/a100_80gb.csv b/scripts/benchmarks/refs/a100_80gb.csv index c2ac8bbe..b15f5ead 100644 --- a/scripts/benchmarks/refs/a100_80gb.csv +++ b/scripts/benchmarks/refs/a100_80gb.csv @@ -1,125 +1,125 @@ -bf16,epoch,fp16,framework_config,learning_rate,lora_alpha,lora_dropout,mem_nvidia_mem_reserved,mem_peak_torch_mem_alloc_in_bytes,mem_torch_mem_alloc_in_bytes,model_name_or_path,num_gpus,peft_method,per_device_train_batch_size,r,target_modules,torch_dtype,train_loss,train_runtime,train_samples_per_second,train_steps_per_second,train_tokens_per_second -,0.07,,none,2e-5,,,13953.0,11267745280.0,6770300416.0,bigcode/gpt_bigcode-santacoder,1,,4,,,bfloat16,2.34609375,48.2373,8.292,2.073,16982.695 -,0.07,,none,2e-5,,,9182.0,5712779264.0,4521985024.0,bigcode/gpt_bigcode-santacoder,2,,2,,,bfloat16,2.345390625,31.7465,12.6,3.15,12902.203 -,0.14,,none,2e-5,,,16939.0,15193274880.0,6769448448.0,bigcode/gpt_bigcode-santacoder,1,,8,,,bfloat16,2.33109375,89.1425,8.974,1.122,18379.565 -,0.14,,none,2e-5,,,11246.0,7819071488.0,4522589184.0,bigcode/gpt_bigcode-santacoder,2,,4,,,bfloat16,2.331171875,51.2448,15.611,1.951,15986.02 -,0.07,,foak-fast-kernels,2e-5,,,13957.0,11267745280.0,6770300416.0,bigcode/gpt_bigcode-santacoder,1,,4,,,bfloat16,2.3324218368530274,48.934,8.174,2.044,16740.92 -,0.07,,foak-fast-kernels,2e-5,,,9286.0,5657082880.0,4521985024.0,bigcode/gpt_bigcode-santacoder,2,,2,,,bfloat16,2.3320528411865236,31.9784,12.508,3.127,12808.646 -,0.14,,foak-fast-kernels,2e-5,,,16943.0,13582956544.0,6769448448.0,bigcode/gpt_bigcode-santacoder,1,,8,,,bfloat16,2.3164530181884766,89.056,8.983,1.123,18397.422 -,0.14,,foak-fast-kernels,2e-5,,,11250.0,7013912064.0,4522589184.0,bigcode/gpt_bigcode-santacoder,2,,4,,,bfloat16,2.317208099365234,51.8339,15.434,1.929,15804.335 -,0.15,,none,2e-5,,,76045.0,72434853376.0,43467892224.0,mistralai/Mistral-7B-v0.1,1,,4,,,bfloat16,0.8365114259719849,543.6512,0.736,0.184,3013.697 -,0.15,,none,2e-5,,,43220.0,36225955840.0,28984215552.0,mistralai/Mistral-7B-v0.1,2,,2,,,bfloat16,0.8365651750564576,296.8573,1.347,0.337,2759.576 -,0.29,,none,2e-5,,,72725.0,72435246592.0,43468285440.0,mistralai/Mistral-7B-v0.1,1,,8,,,bfloat16,0.8329308223724365,1060.1522,0.755,0.094,3090.877 -,0.29,,none,2e-5,,,52426.0,36226152448.0,28984412160.0,mistralai/Mistral-7B-v0.1,2,,4,,,bfloat16,0.8329250192642212,551.1644,1.451,0.181,2972.616 -,0.15,,foak-fast-kernels,2e-5,,,76077.0,72432723456.0,43466827264.0,mistralai/Mistral-7B-v0.1,1,,4,,,bfloat16,0.8363362979888916,481.8087,0.83,0.208,3400.52 
-,0.15,,foak-fast-kernels,2e-5,,,42925.0,36225955840.0,28984215552.0,mistralai/Mistral-7B-v0.1,2,,2,,,bfloat16,0.8364450407028198,268.3677,1.49,0.373,3052.528 -,0.29,,foak-fast-kernels,2e-5,,,70041.0,72433116672.0,43467220480.0,mistralai/Mistral-7B-v0.1,1,,8,,,bfloat16,0.8330383920669555,937.1945,0.854,0.107,3496.393 -,0.29,,foak-fast-kernels,2e-5,,,51889.0,36226152448.0,28984412160.0,mistralai/Mistral-7B-v0.1,2,,4,,,bfloat16,0.833041114807129,493.3262,1.622,0.203,3321.129 -,,,none,2e-5,,,80975.0,,,mistralai/Mixtral-8x7B-Instruct-v0.1,1,,4,,,bfloat16,,,,, -,,,none,2e-5,,,80308.0,,,mistralai/Mixtral-8x7B-Instruct-v0.1,2,,2,,,bfloat16,,,,, -,,,none,2e-5,,,80975.0,,,mistralai/Mixtral-8x7B-Instruct-v0.1,1,,8,,,bfloat16,,,,, -,,,none,2e-5,,,80944.0,,,mistralai/Mixtral-8x7B-Instruct-v0.1,2,,4,,,bfloat16,,,,, -,,,foak-fast-kernels,2e-5,,,80975.0,,,mistralai/Mixtral-8x7B-Instruct-v0.1,1,,4,,,bfloat16,,,,, -,,,foak-fast-kernels,2e-5,,,78733.0,,,mistralai/Mixtral-8x7B-Instruct-v0.1,2,,2,,,bfloat16,,,,, -,,,foak-fast-kernels,2e-5,,,80975.0,,,mistralai/Mixtral-8x7B-Instruct-v0.1,1,,8,,,bfloat16,,,,, -,,,foak-fast-kernels,2e-5,,,80813.0,,,mistralai/Mixtral-8x7B-Instruct-v0.1,2,,4,,,bfloat16,,,,, -,,,none,2e-5,,,3.0,,,NousResearch/Llama-2-70b-hf,1,,4,,,bfloat16,,,,, -,,,none,2e-5,,,80765.0,,,NousResearch/Llama-2-70b-hf,2,,2,,,bfloat16,,,,, -,,,none,2e-5,,,81039.0,,,NousResearch/Llama-2-70b-hf,1,,8,,,bfloat16,,,,, -,,,none,2e-5,,,80881.0,,,NousResearch/Llama-2-70b-hf,2,,4,,,bfloat16,,,,, -,,,foak-fast-kernels,2e-5,,,81039.0,,,NousResearch/Llama-2-70b-hf,1,,4,,,bfloat16,,,,, -,,,foak-fast-kernels,2e-5,,,80822.0,,,NousResearch/Llama-2-70b-hf,2,,2,,,bfloat16,,,,, -,,,foak-fast-kernels,2e-5,,,81039.0,,,NousResearch/Llama-2-70b-hf,1,,8,,,bfloat16,,,,, -,,,foak-fast-kernels,2e-5,,,80865.0,,,NousResearch/Llama-2-70b-hf,2,,4,,,bfloat16,,,,, -,0.15,,none,2e-4,16,0.1,28071.0,25653881344.0,14664508928.0,mistralai/Mistral-7B-v0.1,1,lora,4,16,q_proj k_proj v_proj o_proj,bfloat16,0.8588938808441162,493.6093,0.81,0.203,3319.225 -,0.15,,none,2e-4,16,0.1,17747.0,15245549568.0,7368046592.0,mistralai/Mistral-7B-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj,bfloat16,0.8592093753814697,285.023,1.403,0.351,2874.155 -,0.29,,none,2e-4,16,0.1,41407.0,36643613184.0,14664902144.0,mistralai/Mistral-7B-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj,bfloat16,0.8575004673004151,977.4317,0.818,0.102,3352.459 -,0.29,,none,2e-4,16,0.1,25349.0,22161170432.0,7368243200.0,mistralai/Mistral-7B-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj,bfloat16,0.8573768520355225,517.9485,1.545,0.193,3163.249 -,0.15,,foak-fast-kernels,2e-4,16,0.1,27901.0,24068304384.0,14664508928.0,mistralai/Mistral-7B-v0.1,1,lora,4,16,q_proj k_proj v_proj o_proj,bfloat16,0.8589437294006348,434.7949,0.92,0.23,3768.214 -,0.15,,foak-fast-kernels,2e-4,16,0.1,17048.0,15044222976.0,7368046592.0,mistralai/Mistral-7B-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj,bfloat16,0.8586760711669922,264.1168,1.514,0.379,3101.658 -,0.29,,foak-fast-kernels,2e-4,16,0.1,41045.0,33470361088.0,14664902144.0,mistralai/Mistral-7B-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj,bfloat16,0.8573414993286133,860.2678,0.93,0.116,3809.046 -,0.29,,foak-fast-kernels,2e-4,16,0.1,24167.0,21758517248.0,7368243200.0,mistralai/Mistral-7B-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj,bfloat16,0.8574250793457031,462.6096,1.729,0.216,3541.647 -,,,none,2e-4,16,0.1,81021.0,,,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,4,16,q_proj k_proj v_proj o_proj,bfloat16,,,,, 
-,0.15,,none,2e-4,16,0.1,61598.0,58196957184.0,47365978112.0,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj,bfloat16,0.8971081733703613,534.4072,0.748,0.187,1532.913 -,,,none,2e-4,16,0.1,81021.0,,,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj,bfloat16,,,,, -,0.29,,none,2e-4,16,0.1,69370.0,65592615936.0,47366174720.0,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj,bfloat16,0.8931161880493164,902.3954,0.887,0.111,1815.612 -,,,foak-fast-kernels,2e-4,16,0.1,81021.0,,,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,4,16,q_proj k_proj v_proj o_proj,bfloat16,,,,, -,0.15,,foak-fast-kernels,2e-4,16,0.1,61430.0,57971103744.0,47365978112.0,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj,bfloat16,0.8946351528167724,505.7831,0.791,0.198,1619.667 -,,,foak-fast-kernels,2e-4,16,0.1,81021.0,,,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj,bfloat16,,,,, -,0.29,,foak-fast-kernels,2e-4,16,0.1,68815.0,65137640448.0,47366174720.0,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj,bfloat16,0.8956324863433838,845.1003,0.947,0.118,1938.705 -,,,none,2e-4,16,0.1,80601.0,,,NousResearch/Llama-2-70b-hf,1,lora,4,16,q_proj k_proj v_proj o_proj,bfloat16,,,,, -,,,none,2e-4,16,0.1,80892.0,,,NousResearch/Llama-2-70b-hf,2,lora,2,16,q_proj k_proj v_proj o_proj,bfloat16,,,,, -,,,none,2e-4,16,0.1,80601.0,,,NousResearch/Llama-2-70b-hf,1,lora,8,16,q_proj k_proj v_proj o_proj,bfloat16,,,,, -,,,none,2e-4,16,0.1,80879.0,,,NousResearch/Llama-2-70b-hf,2,lora,4,16,q_proj k_proj v_proj o_proj,bfloat16,,,,, -,,,foak-fast-kernels,2e-4,16,0.1,80601.0,,,NousResearch/Llama-2-70b-hf,1,lora,4,16,q_proj k_proj v_proj o_proj,bfloat16,,,,, -,,,foak-fast-kernels,2e-4,16,0.1,80821.0,,,NousResearch/Llama-2-70b-hf,2,lora,2,16,q_proj k_proj v_proj o_proj,bfloat16,,,,, -,,,foak-fast-kernels,2e-4,16,0.1,80601.0,,,NousResearch/Llama-2-70b-hf,1,lora,8,16,q_proj k_proj v_proj o_proj,bfloat16,,,,, -,,,foak-fast-kernels,2e-4,16,0.1,80335.0,,,NousResearch/Llama-2-70b-hf,2,lora,4,16,q_proj k_proj v_proj o_proj,bfloat16,,,,, -True,0.15,,baseline-peft-bnb,2e-4,16,0.1,24733.0,20556796416.0,4307044864.0,mistralai/Mistral-7B-v0.1,1,lora,4,16,q_proj k_proj v_proj o_proj,bfloat16,0.8750703716278077,576.5728,0.694,0.173,2841.619 -True,0.15,,baseline-peft-bnb,2e-4,16,0.1,12210.0,9525273600.0,2244541440.0,mistralai/Mistral-7B-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj,bfloat16,0.8754423999786377,299.3352,1.336,0.334,2736.732 -True,0.29,,baseline-peft-bnb,2e-4,16,0.1,44727.0,36801860096.0,4307438080.0,mistralai/Mistral-7B-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj,bfloat16,0.8729377365112305,1118.4307,0.715,0.089,2929.82 -True,0.29,,baseline-peft-bnb,2e-4,16,0.1,20190.0,16171410432.0,2244738048.0,mistralai/Mistral-7B-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj,bfloat16,0.8730156898498536,501.6657,1.595,0.199,3265.92 -True,0.15,,baseline-peft-bnb,2e-4,16,0.1,44299.0,43550249472.0,25201920512.0,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,4,16,q_proj k_proj v_proj o_proj,bfloat16,0.9007136249542236,1088.2613,0.368,0.092,1505.521 -True,0.15,,baseline-peft-bnb,2e-4,16,0.1,30916.0,21768876032.0,13273627648.0,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj,bfloat16,0.9021518707275391,547.2421,0.731,0.183,1496.961 -True,0.29,,baseline-peft-bnb,2e-4,16,0.1,63459.0,61492521984.0,25202313728.0,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,8,16,q_proj k_proj v_proj 
o_proj,bfloat16,0.8979566383361817,1969.7975,0.406,0.051,1663.521 -True,0.29,,baseline-peft-bnb,2e-4,16,0.1,36289.0,28883986432.0,13273824256.0,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj,bfloat16,0.8986497783660888,893.7177,0.895,0.112,1833.241 -True,,,baseline-peft-bnb,2e-4,16,0.1,79743.0,,,NousResearch/Llama-2-70b-hf,1,lora,4,16,q_proj k_proj v_proj o_proj,bfloat16,,,,, -True,0.14,,baseline-peft-bnb,2e-4,16,0.1,50954.0,46685328896.0,19266784768.0,NousResearch/Llama-2-70b-hf,2,lora,2,16,q_proj k_proj v_proj o_proj,bfloat16,0.9998255729675293,1924.178,0.208,0.052,425.74 -True,,,baseline-peft-bnb,2e-4,16,0.1,79939.0,,,NousResearch/Llama-2-70b-hf,1,lora,8,16,q_proj k_proj v_proj o_proj,bfloat16,,,,, -True,,,baseline-peft-bnb,2e-4,16,0.1,79293.0,,,NousResearch/Llama-2-70b-hf,2,lora,4,16,q_proj k_proj v_proj o_proj,bfloat16,,,,, -True,0.07,,accelerated-peft-bnb,2e-4,16,0.1,11435.0,9148997120.0,810277376.0,bigcode/gpt_bigcode-santacoder,1,lora,4,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,2.453810634613037,54.6477,7.32,1.83,14990.581 -True,0.07,,accelerated-peft-bnb,2e-4,16,0.1,7408.0,4788195328.0,411216896.0,bigcode/gpt_bigcode-santacoder,2,lora,2,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,2.4479636192321776,59.8799,6.68,1.67,6840.355 -True,0.14,,accelerated-peft-bnb,2e-4,16,0.1,21927.0,17486716416.0,810473984.0,bigcode/gpt_bigcode-santacoder,1,lora,8,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,2.455012626647949,102.1406,7.832,0.979,16040.633 -True,0.14,,accelerated-peft-bnb,2e-4,16,0.1,12452.0,8957644800.0,411315200.0,bigcode/gpt_bigcode-santacoder,2,lora,4,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,2.4509134101867676,61.1132,13.09,1.636,13404.641 -True,0.07,,accelerated-peft-bnb-foak,2e-4,16,0.1,9131.0,7538417152.0,810277376.0,bigcode/gpt_bigcode-santacoder,1,lora,4,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,2.440969181060791,54.5466,7.333,1.833,15018.361 -True,0.07,,accelerated-peft-bnb-foak,2e-4,16,0.1,6176.0,3989590016.0,411216896.0,bigcode/gpt_bigcode-santacoder,2,lora,2,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,2.4551522445678713,39.5904,10.103,2.526,10345.932 -True,0.14,,accelerated-peft-bnb-foak,2e-4,16,0.1,17319.0,14264901120.0,808638976.0,bigcode/gpt_bigcode-santacoder,1,lora,8,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,2.4382666397094725,101.4046,7.889,0.986,16157.066 -True,0.14,,accelerated-peft-bnb-foak,2e-4,16,0.1,10197.0,7353749504.0,411315200.0,bigcode/gpt_bigcode-santacoder,2,lora,4,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,2.4603317260742186,56.4512,14.172,1.771,14511.636 -True,0.15,,accelerated-peft-bnb,2e-4,16,0.1,18269.0,15323147776.0,4306512384.0,mistralai/Mistral-7B-v0.1,1,lora,4,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,0.8744064712524414,462.2962,0.865,0.216,3544.048 -True,0.15,,accelerated-peft-bnb,2e-4,16,0.1,12155.0,9525273600.0,2244541440.0,mistralai/Mistral-7B-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,0.874450798034668,299.1359,1.337,0.334,2738.555 -True,0.29,,accelerated-peft-bnb,2e-4,16,0.1,32693.0,26312879616.0,4306905600.0,mistralai/Mistral-7B-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,0.8731369590759277,906.2349,0.883,0.11,3615.84 -True,0.29,,accelerated-peft-bnb,2e-4,16,0.1,20457.0,16171410432.0,2244738048.0,mistralai/Mistral-7B-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,0.8723946666717529,501.7688,1.594,0.199,3265.249 
-True,0.15,,accelerated-peft-bnb-foak,2e-4,16,0.1,18815.0,13064809472.0,4306512384.0,mistralai/Mistral-7B-v0.1,1,lora,4,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,0.8750362300872803,404.9234,0.988,0.247,4046.197 -True,0.15,,accelerated-peft-bnb-foak,2e-4,16,0.1,12145.0,9309332480.0,2244541440.0,mistralai/Mistral-7B-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,0.8801666259765625,223.4344,1.79,0.448,3666.4 -True,0.29,,accelerated-peft-bnb-foak,2e-4,16,0.1,31959.0,21823466496.0,4306905600.0,mistralai/Mistral-7B-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,0.8722082996368408,791.2713,1.011,0.126,4141.184 -True,0.29,,accelerated-peft-bnb-foak,2e-4,16,0.1,19846.0,15685985280.0,2244738048.0,mistralai/Mistral-7B-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,0.8774080085754394,415.5235,1.925,0.241,3942.978 -True,0.15,,accelerated-peft-bnb,2e-4,16,0.1,37401.0,36218023424.0,25201388032.0,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,4,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,0.8989591693878174,845.0543,0.473,0.118,1938.81 -True,0.15,,accelerated-peft-bnb,2e-4,16,0.1,30772.0,21766241792.0,13273627648.0,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,0.8999812889099121,549.525,0.728,0.182,1490.742 -True,0.29,,accelerated-peft-bnb,2e-4,16,0.1,49955.0,47207755264.0,25201781248.0,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,0.8977096939086914,1575.2553,0.508,0.063,2080.171 -True,0.29,,accelerated-peft-bnb,2e-4,16,0.1,36198.0,28882755584.0,13273824256.0,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,0.8979197406768799,893.2317,0.896,0.112,1834.239 -True,0.15,,accelerated-peft-bnb-foak,2e-4,16,0.1,37547.0,34868725760.0,25201388032.0,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,4,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,0.8993332195281982,785.6706,0.509,0.127,2085.352 -True,0.15,,accelerated-peft-bnb-foak,2e-4,16,0.1,34149.0,21482356224.0,13273627648.0,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,0.9065353488922119,471.7526,0.848,0.212,1736.504 -True,0.29,,accelerated-peft-bnb-foak,2e-4,16,0.1,50369.0,44399629312.0,25201781248.0,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,0.8978278636932373,1455.8545,0.55,0.069,2250.774 -True,0.29,,accelerated-peft-bnb-foak,2e-4,16,0.1,39368.0,28266069504.0,13273824256.0,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,0.9034731101989746,811.3348,0.986,0.123,2019.388 -True,0.14,,accelerated-peft-bnb,2e-4,16,0.1,71647.0,68126422016.0,37179042816.0,NousResearch/Llama-2-70b-hf,1,lora,4,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,0.9993585586547852,3515.3767,0.114,0.028,466.067 -True,0.14,,accelerated-peft-bnb,2e-4,16,0.1,50850.0,46685328896.0,19266784768.0,NousResearch/Llama-2-70b-hf,2,lora,2,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,0.9996768379211426,1924.6899,0.208,0.052,425.627 -True,,,accelerated-peft-bnb,2e-4,16,0.1,81019.0,,,NousResearch/Llama-2-70b-hf,1,lora,8,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,,,,, -True,,,accelerated-peft-bnb,2e-4,16,0.1,79383.0,,,NousResearch/Llama-2-70b-hf,2,lora,4,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,,,,, -True,0.14,,accelerated-peft-bnb-foak,2e-4,16,0.1,71073.0,67048944640.0,37179042816.0,NousResearch/Llama-2-70b-hf,1,lora,4,16,q_proj k_proj v_proj o_proj 
c_attn,bfloat16,0.9997596263885498,3246.4834,0.123,0.031,504.669 -True,0.14,,accelerated-peft-bnb-foak,2e-4,16,0.1,53731.0,46407652864.0,19266784768.0,NousResearch/Llama-2-70b-hf,2,lora,2,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,1.0037711620330811,1712.1747,0.234,0.058,478.456 -True,,,accelerated-peft-bnb-foak,2e-4,16,0.1,80405.0,,,NousResearch/Llama-2-70b-hf,1,lora,8,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,,,,, -True,,,accelerated-peft-bnb-foak,2e-4,16,0.1,78390.0,,,NousResearch/Llama-2-70b-hf,2,lora,4,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,,,,, -,0.15,True,accelerated-peft-autogptq,2e-4,16,0.1,18791.0,15353458176.0,4336822784.0,TheBloke/Mistral-7B-v0.1-GPTQ,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.9889778804779052,480.3997,0.833,0.208,3410.493 -,0.15,True,accelerated-peft-autogptq,2e-4,16,0.1,12723.0,9542804992.0,2261220352.0,TheBloke/Mistral-7B-v0.1-GPTQ,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.9913083648681641,302.5678,1.322,0.331,2707.493 -,0.29,True,accelerated-peft-autogptq,2e-4,16,0.1,32585.0,26343190016.0,4337216000.0,TheBloke/Mistral-7B-v0.1-GPTQ,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,0.9652094364166259,948.9509,0.843,0.105,3453.076 -,0.29,True,accelerated-peft-autogptq,2e-4,16,0.1,20980.0,16188941824.0,2261416960.0,TheBloke/Mistral-7B-v0.1-GPTQ,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.9700868701934815,516.12,1.55,0.194,3174.455 -,0.15,True,accelerated-peft-autogptq-foak,2e-4,16,0.1,18559.0,13095119872.0,4336822784.0,TheBloke/Mistral-7B-v0.1-GPTQ,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.9892044544219971,417.8225,0.957,0.239,3921.283 -,0.15,True,accelerated-peft-autogptq-foak,2e-4,16,0.1,12503.0,9326863872.0,2261220352.0,TheBloke/Mistral-7B-v0.1-GPTQ,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,1.0182433414459229,230.7691,1.733,0.433,3549.869 -,0.29,True,accelerated-peft-autogptq-foak,2e-4,16,0.1,32343.0,21853776896.0,4337216000.0,TheBloke/Mistral-7B-v0.1-GPTQ,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,0.964646291732788,822.5069,0.973,0.122,3983.918 -,0.29,True,accelerated-peft-autogptq-foak,2e-4,16,0.1,20594.0,15703516672.0,2261416960.0,TheBloke/Mistral-7B-v0.1-GPTQ,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.9815717029571533,429.6704,1.862,0.233,3813.155 -,0.15,True,accelerated-peft-autogptq,2e-4,16,0.1,36449.0,35528093184.0,24511457792.0,TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.9061469841003418,833.9906,0.48,0.12,1964.531 -,0.15,True,accelerated-peft-autogptq,2e-4,16,0.1,25222.0,21069974016.0,12581256192.0,TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.9071274089813233,527.2802,0.759,0.19,1553.633 -,0.29,True,accelerated-peft-autogptq,2e-4,16,0.1,48983.0,46517825024.0,24511851008.0,TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,0.903361701965332,1589.9549,0.503,0.063,2060.939 -,0.29,True,accelerated-peft-autogptq,2e-4,16,0.1,31156.0,28182287872.0,12581452800.0,TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.9034396743774414,882.8256,0.906,0.113,1855.859 -,0.15,True,accelerated-peft-autogptq-foak,2e-4,16,0.1,37013.0,34186799616.0,24511457792.0,TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.9072639083862305,771.2908,0.519,0.13,2124.231 
-,0.15,True,accelerated-peft-autogptq-foak,2e-4,16,0.1,26218.0,20786931712.0,12581256192.0,TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.9141145133972168,448.8141,0.891,0.223,1825.255 -,0.29,True,accelerated-peft-autogptq-foak,2e-4,16,0.1,49925.0,43789790720.0,24511851008.0,TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,0.9036093616485595,1464.3528,0.546,0.068,2237.712 -,0.29,True,accelerated-peft-autogptq-foak,2e-4,16,0.1,31489.0,27565384704.0,12581452800.0,TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.9108166313171386,800.2057,1.0,0.125,2047.474 -,0.14,True,accelerated-peft-autogptq,2e-4,16,0.1,70531.0,67069752832.0,36122373120.0,TheBloke/Llama-2-70B-GPTQ,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.990300931930542,3587.1091,0.112,0.028,456.747 -,0.14,True,accelerated-peft-autogptq,2e-4,16,0.1,50574.0,45638032384.0,18219970048.0,TheBloke/Llama-2-70B-GPTQ,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.9898345947265625,1941.7552,0.206,0.051,421.886 -,,True,accelerated-peft-autogptq,2e-4,16,0.1,79901.0,,,TheBloke/Llama-2-70B-GPTQ,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,,,,, -,,True,accelerated-peft-autogptq,2e-4,16,0.1,79999.0,,,TheBloke/Llama-2-70B-GPTQ,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,,,,, -,0.14,True,accelerated-peft-autogptq-foak,2e-4,16,0.1,70449.0,65992275456.0,36122373120.0,TheBloke/Llama-2-70B-GPTQ,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.9903269004821778,3298.9449,0.121,0.03,496.644 -,0.14,True,accelerated-peft-autogptq-foak,2e-4,16,0.1,50728.0,45360356352.0,18219970048.0,TheBloke/Llama-2-70B-GPTQ,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.9937553882598877,1713.6174,0.233,0.058,478.053 -,,True,accelerated-peft-autogptq-foak,2e-4,16,0.1,80167.0,,,TheBloke/Llama-2-70B-GPTQ,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,,,,, -,,True,accelerated-peft-autogptq-foak,2e-4,16,0.1,79141.0,,,TheBloke/Llama-2-70B-GPTQ,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,,,,, +bf16,epoch,fp16,framework_config,learning_rate,lora_alpha,lora_dropout,mem_nvidia_mem_reserved,mem_peak_torch_mem_alloc_in_bytes,mem_torch_mem_alloc_in_bytes,model_name_or_path,num_gpus,peft_method,per_device_train_batch_size,r,target_modules,torch_dtype,train_loss,train_runtime,train_samples_per_second,train_steps_per_second,train_tokens_per_second +,0.07,,none,2.00E-05,,,15116,11267745280,6770300416,bigcode/gpt_bigcode-santacoder,1,,4,,,bfloat16,2.33703125,47.6604,8.393,2.098,17188.262 +,0.07,,none,2.00E-05,,,10345,5716304896,4525510656,bigcode/gpt_bigcode-santacoder,2,,2,,,bfloat16,2.338085938,31.1888,12.825,3.206,13132.913 +,0.14,,none,2.00E-05,,,18102,15193274880,6769448448,bigcode/gpt_bigcode-santacoder,1,,8,,,bfloat16,2.30890625,88.991,8.99,1.124,18410.851 +,0.14,,none,2.00E-05,,,12788,7819433984,4522951680,bigcode/gpt_bigcode-santacoder,2,,4,,,bfloat16,2.307929688,51.3528,15.579,1.947,15952.404 +,0.07,,foak-fast-kernels,2.00E-05,,,15122,11267745280,6770300416,bigcode/gpt_bigcode-santacoder,1,,4,,,bfloat16,2.336848526,48.2564,8.289,2.072,16976.001 +,0.07,,foak-fast-kernels,2.00E-05,,,10344,5658123264,4522503168,bigcode/gpt_bigcode-santacoder,2,,2,,,bfloat16,2.337660217,32.564,12.283,3.071,12578.301 +,0.14,,foak-fast-kernels,2.00E-05,,,18108,13582956544,6769448448,bigcode/gpt_bigcode-santacoder,1,,8,,,bfloat16,2.308277645,88.6688,9.022,1.128,18477.746 
+,0.14,,foak-fast-kernels,2.00E-05,,,12429,7019943936,4528621056,bigcode/gpt_bigcode-santacoder,2,,4,,,bfloat16,2.308311367,51.5096,15.531,1.941,15903.839 +,0.15,,none,2.00E-05,,,77256,72434837504,43467876352,mistralai/Mistral-7B-v0.1,1,,4,,,bfloat16,0.840260801,534.9538,0.748,0.187,3062.694 +,0.15,,none,2.00E-05,,,44382,36225939968,28984199680,mistralai/Mistral-7B-v0.1,2,,2,,,bfloat16,0.840439606,294.9755,1.356,0.339,2777.18 +,0.29,,none,2.00E-05,,,72988,72435230720,43468269568,mistralai/Mistral-7B-v0.1,1,,8,,,bfloat16,0.833927689,1042.5316,0.767,0.096,3143.118 +,0.29,,none,2.00E-05,,,53587,36226136576,28984396288,mistralai/Mistral-7B-v0.1,2,,4,,,bfloat16,0.833756981,544.5118,1.469,0.184,3008.934 +,0.15,,foak-fast-kernels,2.00E-05,,,77278,72432707584,43466811392,mistralai/Mistral-7B-v0.1,1,,4,,,bfloat16,0.840406713,475.3033,0.842,0.21,3447.062 +,0.15,,foak-fast-kernels,2.00E-05,,,44088,36225939968,28984199680,mistralai/Mistral-7B-v0.1,2,,2,,,bfloat16,0.840214605,264.701,1.511,0.378,3094.813 +,0.29,,foak-fast-kernels,2.00E-05,,,71198,72433100800,43467204608,mistralai/Mistral-7B-v0.1,1,,8,,,bfloat16,0.833851738,923.9493,0.866,0.108,3546.515 +,0.29,,foak-fast-kernels,2.00E-05,,,53304,36226136576,28984396288,mistralai/Mistral-7B-v0.1,2,,4,,,bfloat16,0.833796377,485.7084,1.647,0.206,3373.217 +,,,none,2.00E-05,,,81166,,,mistralai/Mixtral-8x7B-Instruct-v0.1,1,,4,,,bfloat16,,,,, +,,,none,2.00E-05,,,80284,,,mistralai/Mixtral-8x7B-Instruct-v0.1,2,,2,,,bfloat16,,,,, +,,,none,2.00E-05,,,81166,,,mistralai/Mixtral-8x7B-Instruct-v0.1,1,,8,,,bfloat16,,,,, +,,,none,2.00E-05,,,80750,,,mistralai/Mixtral-8x7B-Instruct-v0.1,2,,4,,,bfloat16,,,,, +,,,foak-fast-kernels,2.00E-05,,,81168,,,mistralai/Mixtral-8x7B-Instruct-v0.1,1,,4,,,bfloat16,,,,, +,,,foak-fast-kernels,2.00E-05,,,79960,,,mistralai/Mixtral-8x7B-Instruct-v0.1,2,,2,,,bfloat16,,,,, +,,,foak-fast-kernels,2.00E-05,,,81168,,,mistralai/Mixtral-8x7B-Instruct-v0.1,1,,8,,,bfloat16,,,,, +,,,foak-fast-kernels,2.00E-05,,,80464,,,mistralai/Mixtral-8x7B-Instruct-v0.1,2,,4,,,bfloat16,,,,, +,,,none,2.00E-05,,,80882,,,NousResearch/Llama-2-70b-hf,1,,4,,,bfloat16,,,,, +,,,none,2.00E-05,,,80808,,,NousResearch/Llama-2-70b-hf,2,,2,,,bfloat16,,,,, +,,,none,2.00E-05,,,80882,,,NousResearch/Llama-2-70b-hf,1,,8,,,bfloat16,,,,, +,,,none,2.00E-05,,,80603,,,NousResearch/Llama-2-70b-hf,2,,4,,,bfloat16,,,,, +,,,foak-fast-kernels,2.00E-05,,,80884,,,NousResearch/Llama-2-70b-hf,1,,4,,,bfloat16,,,,, +,,,foak-fast-kernels,2.00E-05,,,80938,,,NousResearch/Llama-2-70b-hf,2,,2,,,bfloat16,,,,, +,,,foak-fast-kernels,2.00E-05,,,80884,,,NousResearch/Llama-2-70b-hf,1,,8,,,bfloat16,,,,, +,,,foak-fast-kernels,2.00E-05,,,80430,,,NousResearch/Llama-2-70b-hf,2,,4,,,bfloat16,,,,, +,0.15,,none,2.00E-04,16,0.1,29576,26704571904,14664493056,mistralai/Mistral-7B-v0.1,1,lora,4,16,q_proj k_proj v_proj o_proj,bfloat16,0.862205324,492.8771,0.812,0.203,3324.155 +,0.15,,none,2.00E-04,16,0.1,18678,15144870400,7368030720,mistralai/Mistral-7B-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj,bfloat16,0.861689148,278.0431,1.439,0.36,2946.306 +,0.29,,none,2.00E-04,16,0.1,43232,38742912512,14664886272,mistralai/Mistral-7B-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj,bfloat16,0.858190823,977.0384,0.819,0.102,3353.809 +,0.29,,none,2.00E-04,16,0.1,26328,21959827968,7368227328,mistralai/Mistral-7B-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj,bfloat16,0.858473749,517.0716,1.547,0.193,3168.613 +,0.15,,foak-fast-kernels,2.00E-04,16,0.1,29066,24473332736,14664493056,mistralai/Mistral-7B-v0.1,1,lora,4,16,q_proj k_proj v_proj 
o_proj,bfloat16,0.862244663,433.3721,0.923,0.231,3780.585 +,0.15,,foak-fast-kernels,2.00E-04,16,0.1,18352,14943543808,7368030720,mistralai/Mistral-7B-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj,bfloat16,0.861774035,249.4827,1.603,0.401,3283.594 +,0.29,,foak-fast-kernels,2.00E-04,16,0.1,41698,34280434688,14664886272,mistralai/Mistral-7B-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj,bfloat16,0.858119917,860.622,0.93,0.116,3807.479 +,0.29,,foak-fast-kernels,2.00E-04,16,0.1,25266,21557174784,7368227328,mistralai/Mistral-7B-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj,bfloat16,0.857962122,459.7072,1.74,0.218,3564.008 +,,,none,2.00E-04,16,0.1,81214,,,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,4,16,q_proj k_proj v_proj o_proj,bfloat16,,,,, +,0.15,,none,2.00E-04,16,0.1,63398,57554708992,46829091328,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj,bfloat16,0.889798498,521.3771,0.767,0.192,1571.223 +,,,none,2.00E-04,16,0.1,81214,,,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj,bfloat16,,,,, +,0.29,,none,2.00E-04,16,0.1,70747,64851605504,46829287936,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj,bfloat16,0.892950344,893.1647,0.896,0.112,1834.376 +,,,foak-fast-kernels,2.00E-04,16,0.1,81214,,,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,4,16,q_proj k_proj v_proj o_proj,bfloat16,,,,, +,0.15,,foak-fast-kernels,2.00E-04,16,0.1,61398,57334586368,46829091328,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj,bfloat16,0.890918617,495.2643,0.808,0.202,1654.066 +,,,foak-fast-kernels,2.00E-04,16,0.1,81214,,,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj,bfloat16,,,,, +,0.29,,foak-fast-kernels,2.00E-04,16,0.1,69387,64400873984,46829287936,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj,bfloat16,0.890685196,838.4909,0.954,0.119,1953.987 +,,,none,2.00E-04,16,0.1,81038,,,NousResearch/Llama-2-70b-hf,1,lora,4,16,q_proj k_proj v_proj o_proj,bfloat16,,,,, +,,,none,2.00E-04,16,0.1,80937,,,NousResearch/Llama-2-70b-hf,2,lora,2,16,q_proj k_proj v_proj o_proj,bfloat16,,,,, +,,,none,2.00E-04,16,0.1,81038,,,NousResearch/Llama-2-70b-hf,1,lora,8,16,q_proj k_proj v_proj o_proj,bfloat16,,,,, +,,,none,2.00E-04,16,0.1,80981,,,NousResearch/Llama-2-70b-hf,2,lora,4,16,q_proj k_proj v_proj o_proj,bfloat16,,,,, +,,,foak-fast-kernels,2.00E-04,16,0.1,81038,,,NousResearch/Llama-2-70b-hf,1,lora,4,16,q_proj k_proj v_proj o_proj,bfloat16,,,,, +,,,foak-fast-kernels,2.00E-04,16,0.1,80958,,,NousResearch/Llama-2-70b-hf,2,lora,2,16,q_proj k_proj v_proj o_proj,bfloat16,,,,, +,,,foak-fast-kernels,2.00E-04,16,0.1,81038,,,NousResearch/Llama-2-70b-hf,1,lora,8,16,q_proj k_proj v_proj o_proj,bfloat16,,,,, +,,,foak-fast-kernels,2.00E-04,16,0.1,81045,,,NousResearch/Llama-2-70b-hf,2,lora,4,16,q_proj k_proj v_proj o_proj,bfloat16,,,,, +TRUE,0.15,,baseline-peft-bnb,2.00E-04,16,0.1,26056,20669338112,4307028992,mistralai/Mistral-7B-v0.1,1,lora,4,16,q_proj k_proj v_proj o_proj,bfloat16,0.876798363,564.2301,0.709,0.177,2903.78 +TRUE,0.15,,baseline-peft-bnb,2.00E-04,16,0.1,13164,9424594432,2244525568,mistralai/Mistral-7B-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj,bfloat16,0.876854649,277.7698,1.44,0.36,2949.205 +TRUE,0.29,,baseline-peft-bnb,2.00E-04,16,0.1,46050,37002646016,4307422208,mistralai/Mistral-7B-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj,bfloat16,0.87258399,1096.8531,0.729,0.091,2987.456 
+TRUE,0.29,,baseline-peft-bnb,2.00E-04,16,0.1,21182,15970067968,2244722176,mistralai/Mistral-7B-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj,bfloat16,0.872498674,482.0717,1.66,0.207,3398.665 +TRUE,0.15,,baseline-peft-bnb,2.00E-04,16,0.1,44924,42794268160,24665033728,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,4,16,q_proj k_proj v_proj o_proj,bfloat16,0.898578081,1072.2023,0.373,0.093,1528.07 +TRUE,0.15,,baseline-peft-bnb,2.00E-04,16,0.1,29958,21129981952,12736740864,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj,bfloat16,0.897914915,518.5422,0.771,0.193,1579.814 +TRUE,0.29,,baseline-peft-bnb,2.00E-04,16,0.1,63932,60545920512,24665426944,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj,bfloat16,0.894624872,1938.6247,0.413,0.052,1690.27 +TRUE,0.29,,baseline-peft-bnb,2.00E-04,16,0.1,35417,28146350592,12736937472,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj,bfloat16,0.896192532,871.1804,0.918,0.115,1880.667 +TRUE,,,baseline-peft-bnb,2.00E-04,16,0.1,79762,,,NousResearch/Llama-2-70b-hf,1,lora,4,16,q_proj k_proj v_proj o_proj,bfloat16,,,,, +TRUE,0.14,,baseline-peft-bnb,2.00E-04,16,0.1,51988,46685287936,19266743808,NousResearch/Llama-2-70b-hf,2,lora,2,16,q_proj k_proj v_proj o_proj,bfloat16,1.005721827,1889.605,0.212,0.053,433.53 +TRUE,,,baseline-peft-bnb,2.00E-04,16,0.1,80220,,,NousResearch/Llama-2-70b-hf,1,lora,8,16,q_proj k_proj v_proj o_proj,bfloat16,,,,, +TRUE,,,baseline-peft-bnb,2.00E-04,16,0.1,79517,,,NousResearch/Llama-2-70b-hf,2,lora,4,16,q_proj k_proj v_proj o_proj,bfloat16,,,,, +TRUE,0.07,,accelerated-peft-bnb,2.00E-04,16,0.1,12816,9147162112,808442368,bigcode/gpt_bigcode-santacoder,1,lora,4,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,2.448344727,54.8476,7.293,1.823,14935.936 +TRUE,0.07,,accelerated-peft-bnb,2.00E-04,16,0.1,8468,4788195328,411216896,bigcode/gpt_bigcode-santacoder,2,lora,2,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,2.44904707,47.9125,8.349,2.087,8548.922 +TRUE,0.14,,accelerated-peft-bnb,2.00E-04,16,0.1,23092,17486716416,810473984,bigcode/gpt_bigcode-santacoder,1,lora,8,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,2.431303463,102.3763,7.814,0.977,16003.704 +TRUE,0.14,,accelerated-peft-bnb,2.00E-04,16,0.1,13636,8957644800,411315200,bigcode/gpt_bigcode-santacoder,2,lora,4,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,2.431530228,56.9078,14.058,1.757,14395.21 +TRUE,0.07,,accelerated-peft-bnb-foak,2.00E-04,16,0.1,10296,7538417152,810277376,bigcode/gpt_bigcode-santacoder,1,lora,4,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,2.448136768,54.5851,7.328,1.832,15007.746 +TRUE,0.07,,accelerated-peft-bnb-foak,2.00E-04,16,0.1,7383,3989590016,411216896,bigcode/gpt_bigcode-santacoder,2,lora,2,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,2.45736763,34.1503,11.713,2.928,11994.044 +TRUE,0.14,,accelerated-peft-bnb-foak,2.00E-04,16,0.1,18484,14264901120,808638976,bigcode/gpt_bigcode-santacoder,1,lora,8,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,2.431345234,101.2865,7.898,0.987,16175.899 +TRUE,0.14,,accelerated-peft-bnb-foak,2.00E-04,16,0.1,11398,7353749504,411315200,bigcode/gpt_bigcode-santacoder,2,lora,4,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,2.439263115,56.7414,14.099,1.762,14437.437 +TRUE,0.15,,accelerated-peft-bnb,2.00E-04,16,0.1,19912,16373838336,4306496512,mistralai/Mistral-7B-v0.1,1,lora,4,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,0.877768393,455.6536,0.878,0.219,3595.714 
+TRUE,0.15,,accelerated-peft-bnb,2.00E-04,16,0.1,13284,9424594432,2244525568,mistralai/Mistral-7B-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,0.877003698,277.8786,1.439,0.36,2948.05
+TRUE,0.29,,accelerated-peft-bnb,2.00E-04,16,0.1,34016,28412178944,4306889728,mistralai/Mistral-7B-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,0.873239794,894.6554,0.894,0.112,3662.639
+TRUE,0.29,,accelerated-peft-bnb,2.00E-04,16,0.1,21299,15970067968,2244722176,mistralai/Mistral-7B-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,0.873187227,482.8865,1.657,0.207,3392.93
+TRUE,0.15,,accelerated-peft-bnb-foak,2.00E-04,16,0.1,19946,14115336192,4306496512,mistralai/Mistral-7B-v0.1,1,lora,4,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,0.877510233,396.8986,1.008,0.252,4128.007
+TRUE,0.15,,accelerated-peft-bnb-foak,2.00E-04,16,0.1,13134,9208653312,2244525568,mistralai/Mistral-7B-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,0.882692728,218.9949,1.827,0.457,3740.727
+TRUE,0.29,,accelerated-peft-bnb-foak,2.00E-04,16,0.1,33666,23922438144,4306889728,mistralai/Mistral-7B-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,0.873534184,777.8243,1.029,0.129,4212.777
+TRUE,0.29,,accelerated-peft-bnb-foak,2.00E-04,16,0.1,21244,15484642816,2244722176,mistralai/Mistral-7B-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,0.878321848,407.9172,1.961,0.245,4016.501
+TRUE,0.15,,accelerated-peft-bnb,2.00E-04,16,0.1,38064,36731843072,24664501248,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,4,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,0.895266762,833.7459,0.48,0.12,1965.107
+TRUE,0.15,,accelerated-peft-bnb,2.00E-04,16,0.1,29896,21126185472,12736740864,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,0.894929171,515.4823,0.776,0.194,1589.191
+TRUE,0.29,,accelerated-peft-bnb,2.00E-04,16,0.1,50630,48770183680,24664894464,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,0.896698408,1556.2219,0.514,0.064,2105.612
+TRUE,0.29,,accelerated-peft-bnb,2.00E-04,16,0.1,35448,28154534912,12736937472,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,0.895372143,872.4582,0.917,0.115,1877.912
+TRUE,0.15,,accelerated-peft-bnb-foak,2.00E-04,16,0.1,38202,34473340928,24664501248,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,4,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,0.894482374,777.0698,0.515,0.129,2108.433
+TRUE,0.15,,accelerated-peft-bnb-foak,2.00E-04,16,0.1,33156,20843741184,12736740864,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,0.902506018,463.2409,0.863,0.216,1768.41
+TRUE,0.29,,accelerated-peft-bnb-foak,2.00E-04,16,0.1,50628,44280442880,24664894464,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,0.896149569,1436.2648,0.557,0.07,2281.473
+TRUE,0.29,,accelerated-peft-bnb-foak,2.00E-04,16,0.1,38249,27532073984,12736937472,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,0.902021217,798.4166,1.002,0.125,2052.062
+TRUE,0.14,,accelerated-peft-bnb,2.00E-04,16,0.1,72950,68126381056,37179001856,NousResearch/Llama-2-70b-hf,1,lora,4,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,1.004711065,3528.2836,0.113,0.028,464.362
+TRUE,0.14,,accelerated-peft-bnb,2.00E-04,16,0.1,51989,46685287936,19266743808,NousResearch/Llama-2-70b-hf,2,lora,2,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,1.004340935,1894.392,0.211,0.053,432.434
+TRUE,,,accelerated-peft-bnb,2.00E-04,16,0.1,80526,,,NousResearch/Llama-2-70b-hf,1,lora,8,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,,,,,
+TRUE,,,accelerated-peft-bnb,2.00E-04,16,0.1,79313,,,NousResearch/Llama-2-70b-hf,2,lora,4,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,,,,,
+TRUE,0.14,,accelerated-peft-bnb-foak,2.00E-04,16,0.1,72376,67048903680,37179001856,NousResearch/Llama-2-70b-hf,1,lora,4,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,1.004401016,3246.8566,0.123,0.031,504.611
+TRUE,0.14,,accelerated-peft-bnb-foak,2.00E-04,16,0.1,55050,46407611904,19266743808,NousResearch/Llama-2-70b-hf,2,lora,2,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,1.00918128,1705.9273,0.234,0.059,480.208
+TRUE,,,accelerated-peft-bnb-foak,2.00E-04,16,0.1,80302,,,NousResearch/Llama-2-70b-hf,1,lora,8,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,,,,,
+TRUE,,,accelerated-peft-bnb-foak,2.00E-04,16,0.1,77734,,,NousResearch/Llama-2-70b-hf,2,lora,4,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,,,,,
+,0.15,TRUE,accelerated-peft-autogptq,2.00E-04,16,0.1,20010,16404148736,4336806912,TheBloke/Mistral-7B-v0.1-GPTQ,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,1.002504158,474.1556,0.844,0.211,3455.406
+,0.15,TRUE,accelerated-peft-autogptq,2.00E-04,16,0.1,14887,9442125824,2261204480,TheBloke/Mistral-7B-v0.1-GPTQ,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,1.003275042,282.8589,1.414,0.354,2896.143
+,0.29,TRUE,accelerated-peft-autogptq,2.00E-04,16,0.1,33858,28442489344,4337200128,TheBloke/Mistral-7B-v0.1-GPTQ,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,0.971892595,933.6392,0.857,0.107,3509.707
+,0.29,TRUE,accelerated-peft-autogptq,2.00E-04,16,0.1,23023,15987599360,2261401088,TheBloke/Mistral-7B-v0.1-GPTQ,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.992845125,497.8183,1.607,0.201,3291.161
+,0.15,TRUE,accelerated-peft-autogptq-foak,2.00E-04,16,0.1,20132,14145646592,4336806912,TheBloke/Mistral-7B-v0.1-GPTQ,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.993614368,412.1468,0.971,0.243,3975.282
+,0.15,TRUE,accelerated-peft-autogptq-foak,2.00E-04,16,0.1,14914,9226184704,2261204480,TheBloke/Mistral-7B-v0.1-GPTQ,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,1.019235764,224.7179,1.78,0.445,3645.46
+,0.29,TRUE,accelerated-peft-autogptq-foak,2.00E-04,16,0.1,33990,23952748544,4337200128,TheBloke/Mistral-7B-v0.1-GPTQ,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,0.968857126,808.6617,0.989,0.124,4052.127
+,0.29,TRUE,accelerated-peft-autogptq-foak,2.00E-04,16,0.1,23029,15502174208,2261401088,TheBloke/Mistral-7B-v0.1-GPTQ,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,1.038523531,422.0483,1.896,0.237,3882.021
+,0.15,TRUE,accelerated-peft-autogptq,2.00E-04,16,0.1,37596,36041912832,23974571008,TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.899851208,823.2021,0.486,0.121,1990.277
+,0.15,TRUE,accelerated-peft-autogptq,2.00E-04,16,0.1,27105,20432607232,12044369408,TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.898910484,495.6679,0.807,0.202,1652.719
+,0.29,TRUE,accelerated-peft-autogptq,2.00E-04,16,0.1,50208,48080253440,23974964224,TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,0.899603968,1570.5701,0.509,0.064,2086.376
+,0.29,TRUE,accelerated-peft-autogptq,2.00E-04,16,0.1,32530,27446670336,12044566016,TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.900239239,865.2804,0.925,0.116,1893.49
+,0.15,TRUE,accelerated-peft-autogptq-foak,2.00E-04,16,0.1,37346,33783410688,23974571008,TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.900091076,762.3748,0.525,0.131,2149.074
+,0.15,TRUE,accelerated-peft-autogptq-foak,2.00E-04,16,0.1,27669,20151232000,12044369408,TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.907595758,439.741,0.91,0.227,1862.915
+,0.29,TRUE,accelerated-peft-autogptq-foak,2.00E-04,16,0.1,49362,43590512640,23974964224,TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,0.901174002,1445.4732,0.553,0.069,2266.939
+,0.29,TRUE,accelerated-peft-autogptq-foak,2.00E-04,16,0.1,32944,26828732928,12044566016,TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.907834473,791.771,1.01,0.126,2069.285
+,0.14,TRUE,accelerated-peft-autogptq,2.00E-04,16,0.1,71926,67069711872,36122332160,TheBloke/Llama-2-70B-GPTQ,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,1.001004047,3591.4618,0.111,0.028,456.193
+,0.14,TRUE,accelerated-peft-autogptq,2.00E-04,16,0.1,52716,45637991424,18219929088,TheBloke/Llama-2-70B-GPTQ,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,1.000431108,1919.7248,0.208,0.052,426.728
+,,TRUE,accelerated-peft-autogptq,2.00E-04,16,0.1,80778,,,TheBloke/Llama-2-70B-GPTQ,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,,,,,
+,,TRUE,accelerated-peft-autogptq,2.00E-04,16,0.1,80935,,,TheBloke/Llama-2-70B-GPTQ,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,,,,,
+,0.14,TRUE,accelerated-peft-autogptq-foak,2.00E-04,16,0.1,71728,65992234496,36122332160,TheBloke/Llama-2-70B-GPTQ,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,1.00079361,3304.4395,0.121,0.03,495.818
+,0.14,TRUE,accelerated-peft-autogptq-foak,2.00E-04,16,0.1,53145,45360315392,18219929088,TheBloke/Llama-2-70B-GPTQ,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,1.004220943,1707.7126,0.234,0.059,479.706
+,,TRUE,accelerated-peft-autogptq-foak,2.00E-04,16,0.1,81200,,,TheBloke/Llama-2-70B-GPTQ,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,,,,,
+,,TRUE,accelerated-peft-autogptq-foak,2.00E-04,16,0.1,79448,,,TheBloke/Llama-2-70B-GPTQ,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,,,,,
\ No newline at end of file
diff --git a/scripts/benchmarks/refs/requirements.txt b/scripts/benchmarks/refs/requirements.txt
index 49f1142a..1a5eb305 100644
--- a/scripts/benchmarks/refs/requirements.txt
+++ b/scripts/benchmarks/refs/requirements.txt
@@ -1,42 +1,40 @@
 accelerate==1.0.1
-aiohappyeyeballs==2.4.3
-aiohttp==3.10.10
-aiosignal==1.3.1
-async-timeout==4.0.3
-attrs==24.2.0
+aiohappyeyeballs==2.4.4
+aiohttp==3.11.11
+aiosignal==1.3.2
+attrs==25.1.0
 bitsandbytes==0.43.3
-certifi==2024.8.30
-charset-normalizer==3.4.0
-contourpy==1.3.0
-cycler==0.12.1
+cachetools==5.5.1
+certifi==2024.12.14
+chardet==5.2.0
+charset-normalizer==3.4.1
+colorama==0.4.6
 datasets==2.21.0
 dill==0.3.8
+distlib==0.3.9
 docstring_parser==0.16
 einops==0.8.0
-filelock==3.16.1
-flash-attn==2.6.3
--e git+https://github.com/foundation-model-stack/fms-acceleration.git@9cf8f6572575897ea5f1cbad5c15b3019169be87#egg=fms_acceleration&subdirectory=plugins/framework
--e git+https://github.com/foundation-model-stack/fms-acceleration.git@9cf8f6572575897ea5f1cbad5c15b3019169be87#egg=fms_acceleration_aadp&subdirectory=plugins/attention-and-distributed-packing
--e git+https://github.com/foundation-model-stack/fms-acceleration.git@9cf8f6572575897ea5f1cbad5c15b3019169be87#egg=fms_acceleration_foak&subdirectory=plugins/fused-ops-and-kernels
--e git+https://github.com/foundation-model-stack/fms-acceleration.git@9cf8f6572575897ea5f1cbad5c15b3019169be87#egg=fms_acceleration_peft&subdirectory=plugins/accelerated-peft
-fms-hf-tuning @ git+https://github.com/foundation-model-stack/fms-hf-tuning.git@398c2a8fe26d734344240555585d95e05299faa8
-fonttools==4.54.1
+filelock==3.17.0
+flash-attn==2.7.3
+fms-acceleration @ file:///app/fms-acceleration/plugins/framework
+fms-acceleration-aadp==0.2.0
+fms-acceleration-foak @ file:///app/fms-acceleration/plugins/fused-ops-and-kernels
+fms-acceleration-peft @ file:///app/fms-acceleration/plugins/accelerated-peft
+fms-hf-tuning==2.5.0
 frozenlist==1.5.0
 fsspec==2024.6.1
-huggingface-hub==0.26.2
+huggingface-hub==0.27.1
 idna==3.10
-Jinja2==3.1.4
-kiwisolver==1.4.7
-llvmlite==0.43.0
+Jinja2==3.1.5
+llvmlite==0.44.0
 markdown-it-py==3.0.0
 MarkupSafe==3.0.2
-matplotlib==3.9.2
 mdurl==0.1.2
 mpmath==1.3.0
 multidict==6.1.0
 multiprocess==0.70.16
 networkx==3.4.2
-numba==0.60.0
+numba==0.61.0
 numpy==1.26.4
 nvidia-cublas-cu12==12.1.3.1
 nvidia-cuda-cupti-cu12==12.1.105
@@ -48,40 +46,46 @@ nvidia-curand-cu12==10.3.2.106
 nvidia-cusolver-cu12==11.4.5.107
 nvidia-cusparse-cu12==12.1.0.106
 nvidia-nccl-cu12==2.20.5
-nvidia-nvjitlink-cu12==12.4.127
+nvidia-nvjitlink-cu12==12.8.61
 nvidia-nvtx-cu12==12.1.105
 packaging==24.2
 pandas==2.2.3
 peft==0.13.2
-pillow==11.0.0
-propcache==0.2.0
-protobuf==5.28.3
-psutil==6.1.0
-pyarrow==18.0.0
-Pygments==2.18.0
-pyparsing==3.2.0
+platformdirs==4.3.6
+pluggy==1.5.0
+propcache==0.2.1
+protobuf==5.29.3
+psutil==6.1.1
+pyarrow==19.0.0
+Pygments==2.19.1
+pyproject-api==1.9.0
+pyproject_hooks==1.2.0
 python-dateutil==2.9.0.post0
 pytz==2024.2
 PyYAML==6.0.2
 regex==2024.11.6
 requests==2.32.3
 rich==13.9.4
-safetensors==0.4.5
+safetensors==0.5.2
 sentencepiece==0.2.0
+setuptools==75.8.0
 shtab==1.7.1
 simpleeval==0.9.13
-six==1.16.0
-sympy==1.13.1
+six==1.17.0
+sympy==1.13.3
 threadpoolctl==3.5.0
-tokenizers==0.20.3
+tokenizers==0.21.0
 torch==2.4.1
-tqdm==4.67.0
-transformers==4.45.2
+tox==4.24.1
+tqdm==4.67.1
+transformers==4.48.1
 triton==3.0.0
-trl==0.11.4
+trl==0.13.0
+typeguard==4.4.1
 typing_extensions==4.12.2
-tyro==0.8.14
-tzdata==2024.2
-urllib3==2.2.3
+tyro==0.9.13
+tzdata==2025.1
+urllib3==2.3.0
+virtualenv==20.29.1
 xxhash==3.5.0
-yarl==1.17.1
+yarl==1.18.3