diff --git a/examples/training/llama/finetune_llama.py b/examples/training/llama/finetune_llama.py index 8daaa5a59..fe4095d0e 100755 --- a/examples/training/llama/finetune_llama.py +++ b/examples/training/llama/finetune_llama.py @@ -80,7 +80,7 @@ def train(model_id, tokenizer, dataset, training_args): args = training_args.to_dict() sft_config = NeuronSFTConfig( - max_seq_length=2048, + max_length=2048, packing=True, **args, ) @@ -91,7 +91,7 @@ def train(model_id, tokenizer, dataset, training_args): args=sft_config, model=model, peft_config=lora_config, - tokenizer=tokenizer, + processing_class=tokenizer, train_dataset=dataset, formatting_func=lambda example: format_dolly(example, tokenizer), ) diff --git a/examples/training/qwen3/finetune_qwen3.py b/examples/training/qwen3/finetune_qwen3.py index f7a27bbb6..8e3b25712 100644 --- a/examples/training/qwen3/finetune_qwen3.py +++ b/examples/training/qwen3/finetune_qwen3.py @@ -84,7 +84,7 @@ def train(model_id, tokenizer, dataset, training_args): args = training_args.to_dict() sft_config = NeuronSFTConfig( - max_seq_length=4096, + max_length=4096, packing=True, **args, ) @@ -98,7 +98,7 @@ def formatting_function(examples): args=sft_config, model=model, peft_config=lora_config, - tokenizer=tokenizer, + processing_class=tokenizer, train_dataset=dataset, formatting_func=formatting_function, ) diff --git a/examples/training/qwen3/finetune_qwen3.sh b/examples/training/qwen3/finetune_qwen3.sh index d64a6572d..b2d7568e3 100755 --- a/examples/training/qwen3/finetune_qwen3.sh +++ b/examples/training/qwen3/finetune_qwen3.sh @@ -13,7 +13,8 @@ TP_DEGREE=8 BS=1 GRADIENT_ACCUMULATION_STEPS=8 LOGGING_STEPS=2 -MODEL_NAME="Qwen/Qwen3-8B" # Change this to the desired model name +# MODEL_NAME="Qwen/Qwen3-8B" # Change this to the desired model name +MODEL_NAME="Qwen/Qwen3-0.6B" # Change this to the desired model name OUTPUT_DIR="$(echo $MODEL_NAME | cut -d'/' -f2)-finetuned" DISTRIBUTED_ARGS="--nproc_per_node $PROCESSES_PER_NODE" SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) diff --git a/optimum/neuron/trainers/sft_config.py b/optimum/neuron/trainers/sft_config.py index 8c50d033f..45cb07b3e 100644 --- a/optimum/neuron/trainers/sft_config.py +++ b/optimum/neuron/trainers/sft_config.py @@ -10,7 +10,7 @@ # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# Seg the License for the specific language governing permissions and +# See the License for the specific language governing permissions and # limitations under the License. from dataclasses import dataclass @@ -32,4 +32,24 @@ def __init__(self, *args, **kwargs): @dataclass class NeuronSFTConfig(NeuronTrainingArguments, SFTConfig): - pass + """ + Configuration class for Neuron-optimized SFT training. + + Inherits from both NeuronTrainingArguments (for Trainium-specific settings) and + trl's SFTConfig (for SFT-specific settings). 
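+    Illustrative usage sketch (values are placeholders mirroring the bundled
+    Llama example, not required defaults):
+
+        args = training_args.to_dict()
+        sft_config = NeuronSFTConfig(
+            max_length=2048,
+            packing=True,
+            **args,
+        )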
+ + Key Neuron-specific behavior: + - padding_free is always set to False to avoid recompilation on Trainium devices + - All other SFT parameters from trl 0.24.0+ are supported + """ + + def __post_init__(self): + # Handle max_seq_length -> max_length migration for backward compatibility + if hasattr(self, "max_seq_length") and self.max_seq_length is not None: + self.max_length = self.max_seq_length + + # Force padding_free to False for Neuron - critical for avoiding recompilation + # Neuron devices require fixed input shapes; padding_free flattening breaks this requirement + self.padding_free = False + + super().__post_init__() diff --git a/optimum/neuron/trainers/sft_trainer.py b/optimum/neuron/trainers/sft_trainer.py index c9a481bb4..29b9b6b2a 100644 --- a/optimum/neuron/trainers/sft_trainer.py +++ b/optimum/neuron/trainers/sft_trainer.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os from typing import Any, Callable import datasets @@ -21,7 +22,6 @@ from torch.utils.data import Dataset, IterableDataset from transformers import ( AutoModelForCausalLM, - AutoTokenizer, DataCollator, DataCollatorForLanguageModeling, PreTrainedModel, @@ -75,13 +75,18 @@ class PeftConfig: class NeuronSFTTrainer(_SFTTrainer): """ - `SFTTrainer` adapted for Neuron. - - It differs from the original `SFTTrainer` by: - - Using `_TrainerForNeuron.__init__()` instead of `Trainer.__init__()` - - Using the `_TrainerForNeuron.train()` instead of `Trainer.train()` - - Adapts the `_prepare_non_packed_dataloader` to pad to max length. In the original `SFTTrainer` examples are - not padded, which is an issue here because it triggers compilation every time. + `SFTTrainer` adapted for Neuron (Trainium) devices. 
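+
+    Illustrative usage sketch (mirrors the bundled examples; `sft_config`,
+    `lora_config`, `tokenizer`, `dataset` and `format_dolly` are placeholders
+    defined by the calling script):
+
+        trainer = NeuronSFTTrainer(
+            args=sft_config,
+            model=model,
+            peft_config=lora_config,
+            processing_class=tokenizer,
+            train_dataset=dataset,
+            formatting_func=lambda example: format_dolly(example, tokenizer),
+        )
+        trainer.train()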
+ + Overrides key methods for Neuron compatibility: + - Uses NeuronTrainer.__init__() instead of transformers.Trainer.__init__() + - Uses NeuronTrainer.train() for Neuron-optimized training + - Enforces padding_free=False for fixed input shapes (required for Trainium) + - Simplifies _prepare_dataset to delegate to parent with Neuron constraints + + Neuron-specific constraints: + - padding_free is always False to avoid recompilation + - VLM training is not yet supported + - NeFTune training is not supported """ def __init__( @@ -91,33 +96,37 @@ def __init__( data_collator: DataCollator | None = None, # type: ignore train_dataset: "Dataset | IterableDataset | datasets.Dataset | None" = None, eval_dataset: "Dataset | dict[str, Dataset] | datasets.Dataset | None" = None, - processsing_class: PreTrainedTokenizerBase | ProcessorMixin | None = None, + processing_class: PreTrainedTokenizerBase | ProcessorMixin | None = None, + compute_loss_func: Callable | None = None, + compute_metrics: Callable | None = None, callbacks: list[TrainerCallback] | None = None, - optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), + optimizers: tuple[torch.optim.Optimizer | None, torch.optim.lr_scheduler.LambdaLR | None] = (None, None), optimizer_cls_and_kwargs: tuple[type[torch.optim.Optimizer], dict[str, Any]] | None = None, - tokenizer: PreTrainedTokenizerBase | None = None, # deprecated + preprocess_logits_for_metrics: Callable | None = None, peft_config: PeftConfig | None = None, formatting_func: Callable | None = None, + # Deprecated parameters for backward compatibility + tokenizer: PreTrainedTokenizerBase | None = None, # Use processing_class instead ): if not is_trl_available(required_version=TRL_VERSION): raise RuntimeError(f"Using NeuronSFTTrainer requires trl=={TRL_VERSION}.") from trl.extras.dataset_formatting import get_formatting_func_from_dataset - - # This will be changed to : from trl.trainer.callbacks import RichProgressCallback - from trl.trainer.utils import ( - DataCollatorForCompletionOnlyLM, - peft_module_casting_to_bf16, - ) + from trl.trainer.utils import peft_module_casting_to_bf16 if is_peft_available(): from peft import PeftConfig + # Handle backward compatibility for tokenizer parameter + if tokenizer is not None and processing_class is None: + processing_class = tokenizer + args_is_none = args is None if args is None: - output_dir = "tmp_trainer" - args = NeuronSFTConfig(output_dir=output_dir) + model_name = model if isinstance(model, str) else model.config._name_or_path + model_name = model_name.split("/")[-1] + args = NeuronSFTConfig(f"{model_name}-SFT") elif args is not None and args.__class__.__name__ == "NeuronTrainingArguments": args_as_dict = args.to_dict() # Manually copy token values as TrainingArguments.to_dict() redacts them @@ -132,7 +141,8 @@ def __init__( if args_is_none: logging.warning(f"No `SFTConfig` passed, using `output_dir={args.output_dir}`.") - if getattr(args, "model_init_kwargs", None) is None: + # Model handling - use model_init_kwargs from args + if args.model_init_kwargs is None: model_init_kwargs = {} elif not isinstance(model, str): raise ValueError("You passed model_init_kwargs to the SFTConfig, but your model is already instantiated.") @@ -150,16 +160,51 @@ def __init__( model_init_kwargs["dtype"] = torch_dtype if isinstance(model, str): - logging.warning( - "You passed a model_id to the SFTTrainer. 
This will automatically create an " - "`AutoModelForCausalLM` or a `PeftModel` (if you passed a `peft_config`) for you." - ) - model = AutoModelForCausalLM.from_pretrained(model, **model_init_kwargs) + model_id = model + dtype = model_init_kwargs.get("dtype") + if isinstance(dtype, torch.dtype) or dtype == "auto" or dtype is None: + pass # dtype is already a torch.dtype or "auto" or None + elif isinstance(dtype, str) and dtype in ["bfloat16", "float16", "float32"]: + dtype = getattr(torch, dtype) + model_init_kwargs["dtype"] = dtype + else: + raise ValueError( + "Invalid `dtype` passed to `SFTConfig`. Expected either 'auto' or a string representing " + f"a valid `torch.dtype` (e.g., 'float32'), but got {dtype}." + ) + model = AutoModelForCausalLM.from_pretrained(model_id, **model_init_kwargs) + else: + model_id = model.config._name_or_path + if args.model_init_kwargs is not None: + logger.warning( + "You passed `model_init_kwargs` to the `SFTConfig`, but your model is already instantiated. " + "The `model_init_kwargs` will be ignored." + ) - if args.packing and data_collator is not None and isinstance(data_collator, DataCollatorForCompletionOnlyLM): - raise ValueError( - "You passed a `DataCollatorForCompletionOnlyLM` to the NeuronSFTTrainer. This is not compatible with the `packing` argument." - ) + # Chat template handling (trl 0.24.0+) + # This allows users to provide a custom chat template via path or directory + if hasattr(args, 'chat_template_path') and args.chat_template_path is not None: + from trl.models import clone_chat_template + + if os.path.isfile(args.chat_template_path) and args.chat_template_path.endswith((".jinja", ".j2")): + # Load Jinja template directly + with open(args.chat_template_path, encoding="utf-8") as chat_template_file: + processing_class.chat_template = chat_template_file.read() + added_tokens = [] + else: + # Clone template from another model + try: + model, processing_class, added_tokens = clone_chat_template( + model, processing_class, args.chat_template_path + ) + except Exception as e: + logger.warning( + f"Failed to clone chat template from {args.chat_template_path}: {e}. " + "Continuing without custom chat template." 
+ ) + added_tokens = [] + else: + added_tokens = [] if is_peft_available() and peft_config is not None: if not isinstance(peft_config, PeftConfig): @@ -188,24 +233,31 @@ def make_inputs_require_grad(module, input, output): if args is not None and args.bf16: peft_module_casting_to_bf16(model) - if tokenizer is None: - tokenizer = AutoTokenizer.from_pretrained(model.config._name_or_path) - if getattr(tokenizer, "pad_token", None) is None: - tokenizer.pad_token = tokenizer.eos_token + # Processing class (tokenizer) handling + if processing_class is None: + from transformers import AutoProcessor + + processing_class = AutoProcessor.from_pretrained(model_id) - if args.max_seq_length is None: - # to overcome some issues with broken tokenizers - args.max_seq_length = min(tokenizer.model_max_length, 1024) + # Ensure we have a pad token + if hasattr(processing_class, "pad_token") and getattr(processing_class, "pad_token", None) is None: + processing_class.pad_token = processing_class.eos_token + + if args.max_length is None: + # To overcome some issues with broken tokenizers + args.max_length = min(processing_class.model_max_length, 1024) logger.warning( - f"You didn't pass a `max_seq_length` argument to the SFTTrainer, this will default to {args.max_seq_length}" + f"You didn't pass a `max_length` argument to the SFTTrainer, this will default to {args.max_length}" ) self.dataset_num_proc = args.dataset_num_proc - self.dataset_batch_size = args.dataset_batch_size + # We do not support NeFTune with NeuronSFTTrainer for now. + self._trainer_supports_neftune = False - self._trainer_supports_neftune = hasattr(args, "neftune_noise_alpha") + # Vision Language Model (VLM) support - not yet supported in Neuron + self._is_vlm = False if args.dataset_kwargs is None: args.dataset_kwargs = {} @@ -230,50 +282,61 @@ def make_inputs_require_grad(module, input, output): "You passed `packing=False` to the SFTTrainer/SFTConfig, but you didn't pass a `dataset_text_field` or `formatting_func` argument." ) + # Data collator creation with Neuron-specific constraints if data_collator is None: - data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) + # Determine if this is a VLM (vision language model) + is_vlm = isinstance(processing_class, ProcessorMixin) and hasattr(processing_class, 'image_processor') + + if is_vlm: + # VLM support is not yet implemented in Neuron + logger.warning( + "Vision Language Model (VLM) detected. VLM training is not yet fully supported in Neuron. " + "Attempting to use standard language modeling collator." + ) + # For now, use standard collator - user can override if needed + data_collator = DataCollatorForLanguageModeling( + tokenizer=processing_class.tokenizer if hasattr(processing_class, 'tokenizer') else processing_class, + mlm=False, + ) + else: + # Standard language modeling collator + data_collator = DataCollatorForLanguageModeling(tokenizer=processing_class, mlm=False) + + # Ensure padding_free is False - critical Neuron requirement + # (this is already done in NeuronSFTConfig.__post_init__, but double-check) + if hasattr(data_collator, 'padding_free'): + data_collator.padding_free = False # Pre-process the datasets only once per node. The remaining processes will use the cache. 
with NeuronPartialState().local_main_process_first(): if train_dataset is not None: train_dataset = self._prepare_dataset( - train_dataset, - tokenizer, - args.packing, - args.dataset_text_field, - args.max_seq_length, - formatting_func, - args.num_of_sequences, - args.chars_per_token, - remove_unused_columns=args.remove_unused_columns if args is not None else True, - **args.dataset_kwargs, + train_dataset, processing_class, args, args.packing, formatting_func, "train" ) if eval_dataset is not None: _multiple = isinstance(eval_dataset, dict) _eval_datasets = eval_dataset if _multiple else {"singleton": eval_dataset} - eval_packing = args.packing if args.eval_packing is None else args.eval_packing - for _eval_dataset_name, _eval_dataset in _eval_datasets.items(): _eval_datasets[_eval_dataset_name] = self._prepare_dataset( _eval_dataset, - tokenizer, - eval_packing, - args.dataset_text_field, - args.max_seq_length, + processing_class, + args, + args.eval_packing if args.eval_packing is not None else args.packing, formatting_func, - args.num_of_sequences, - args.chars_per_token, - remove_unused_columns=args.remove_unused_columns if args is not None else True, - **args.dataset_kwargs, + _eval_dataset_name, ) if not _multiple: eval_dataset = _eval_datasets["singleton"] - if tokenizer.padding_side is not None and tokenizer.padding_side != "right": + if ( + hasattr(processing_class, "padding_side") + and processing_class.padding_side is not None + and processing_class.padding_side != "right" + ): logger.warning( - "You passed a tokenizer with `padding_side` not equal to `right` to the SFTTrainer. This might lead to some unexpected behaviour due to " - "overflow issues when training a model in half-precision. You might consider adding `tokenizer.padding_side = 'right'` to your code." + "You passed a processing_class with `padding_side` not equal to `right` to the SFTTrainer. This might lead to some unexpected behaviour due to " + 'overflow issues when training a model in half-precision. You might consider adding `processing_class.padding_side = "right"` to your code.' ) NeuronTrainer.__init__( @@ -283,7 +346,7 @@ def make_inputs_require_grad(module, input, output): data_collator=data_collator, train_dataset=train_dataset, eval_dataset=eval_dataset, - processing_class=tokenizer, + processing_class=processing_class, callbacks=callbacks, optimizers=optimizers, optimizer_cls_and_kwargs=optimizer_cls_and_kwargs, @@ -313,62 +376,60 @@ def train( ): return NeuronTrainer.train(self, resume_from_checkpoint=resume_from_checkpoint) - def _prepare_non_packed_dataloader( - self, - tokenizer, - dataset, - dataset_text_field, - max_seq_length, - formatting_func=None, - add_special_tokens=True, - remove_unused_columns=True, - ): - use_formatting_func = formatting_func is not None and dataset_text_field is None - self._dataset_sanity_checked = False - - # Inspired from: https://huggingface.co/learn/nlp-course/chapter7/6?fw=pt - def tokenize(element): - outputs = tokenizer( - element[dataset_text_field] if not use_formatting_func else formatting_func(element), - add_special_tokens=add_special_tokens, - truncation=True, - # For Neuron we need to pad because otherwise it will trigger compilation for each new sequence length. - padding="max_length", - max_length=max_seq_length, - return_overflowing_tokens=False, - return_length=False, - ) + def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None): + """ + Compute training loss for Neuron-optimized training. 
- if use_formatting_func and not self._dataset_sanity_checked: - if not isinstance(formatting_func(element), list): - raise ValueError( - "The `formatting_func` should return a list of processed strings since it can lead to silent bugs." - ) - else: - self._dataset_sanity_checked = True + Overrides TRL SFTTrainer's compute_loss to set use_cache=False for gradient + checkpointing compatibility and delegate to NeuronTrainer's compute_loss. + """ + # Set use_cache to False to avoid warnings with gradient checkpointing + inputs["use_cache"] = False - return {"input_ids": outputs["input_ids"], "attention_mask": outputs["attention_mask"]} + # Call the parent NeuronTrainer's compute_loss method (not TRL's) + return NeuronTrainer.compute_loss(self, model, inputs, return_outputs, num_items_in_batch) - signature_columns = ["input_ids", "labels", "attention_mask"] + def training_step( + self, model: torch.nn.Module, inputs: dict[str, Any], num_items_in_batch: int | None = None + ) -> torch.Tensor: + """ + Perform a training step for Neuron-optimized training. - if dataset.column_names is not None: # None for IterableDataset - extra_columns = list(set(dataset.column_names) - set(signature_columns)) - else: - extra_columns = [] + Overrides SFTTrainer.training_step to delegate to NeuronTrainer's implementation, + which is compatible with Neuron's distributed training setup. + """ + return NeuronTrainer.training_step(self, model, inputs, num_items_in_batch=num_items_in_batch) - if not remove_unused_columns and len(extra_columns) > 0: - logger.warning( - "You passed `remove_unused_columns=False` on a non-packed dataset. This might create some issues with the default collator and yield to errors. If you want to " - f"inspect dataset other columns (in this case {extra_columns}), you can subclass `DataCollatorForLanguageModeling` in case you used the default collator and create your own data collator in order to inspect the unused dataset columns." + def _prepare_dataset( + self, + dataset, + processing_class, + args, + packing, + formatting_func=None, + dataset_name="train", + ): + """ + Prepare dataset for Neuron training. + + Delegates to parent SFTTrainer._prepare_dataset, which handles: + - Dataset type detection (language modeling, prompt-completion, conversational) + - Chat template application + - Tokenization + - Packing (if enabled) + + Neuron-specific behavior: + - Ensures padding_free=False to avoid recompilation + - Enforces padding to max_length for fixed input shapes + """ + # Ensure padding_free is disabled for Neuron - this is critical for Trainium devices + if args.padding_free: + raise ValueError( + "padding_free must be False for Neuron training. " + "Neuron devices require fixed input shapes to avoid recompilation." 
) - map_kwargs = { - "batched": True, - "remove_columns": dataset.column_names if remove_unused_columns else None, - "batch_size": self.dataset_batch_size, - } - if isinstance(dataset, datasets.Dataset): - map_kwargs["num_proc"] = self.dataset_num_proc # this arg is not available for IterableDataset - tokenized_dataset = dataset.map(tokenize, **map_kwargs) - - return tokenized_dataset + # Call parent implementation from SFTTrainer + return super()._prepare_dataset( + dataset, processing_class, args, packing, formatting_func, dataset_name + ) diff --git a/optimum/neuron/trainers/transformers.py b/optimum/neuron/trainers/transformers.py index 8e9ddcbf8..cfd009887 100644 --- a/optimum/neuron/trainers/transformers.py +++ b/optimum/neuron/trainers/transformers.py @@ -936,26 +936,29 @@ def get_batch_samples( return batch_samples, num_items_in_batch - def train_step( - self, model: nn.Module, inputs: dict[str, Any], num_items_in_batch: int | torch.Tensor | None = None - ) -> torch.Tensor: - manager = self.autocast_smart_context_manager() - + def compute_loss( + self, + model: nn.Module, + inputs: dict[str, torch.Tensor | Any], + return_outputs: bool = False, + num_items_in_batch: torch.Tensor | None = None, + ): if isinstance(model, NxDPPModel): - with manager: - loss = model.run_train(**inputs) + loss = model.run_train(**inputs) # When using pipeline parallelism, the loss is only computed on the last stage. # So we set the loss to zero on other stages. if self.pp_rank != self.pp_size - 1: dtype = torch.bfloat16 if self.args.bf16 else torch.float32 loss = torch.tensor(0, dtype=dtype).to(xm.xla_device()) + + # PP does not return any outputs except the loss + outputs = {"loss": loss} else: if num_items_in_batch is not None: inputs = dict(**inputs, reduction="sum") - with manager: - outputs = model(**inputs) + outputs = model(**inputs) if isinstance(outputs, dict) and "loss" not in outputs: raise ValueError( @@ -970,8 +973,17 @@ def train_step( else: loss = loss / self.args.gradient_accumulation_steps - # Backward pass - self.accelerator.backward(loss) + return (loss, outputs) if return_outputs else loss + + def training_step( + self, model: nn.Module, inputs: dict[str, Any], num_items_in_batch: int | torch.Tensor | None = None + ) -> torch.Tensor: + manager = self.autocast_smart_context_manager() + with manager: + loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch) + + # Backward pass + self.accelerator.backward(loss) return loss @@ -1102,7 +1114,7 @@ def train( if step % args.gradient_accumulation_steps == 0: self.control = self.callback_handler.on_step_begin(args, self.state, self.control) - loss_step = self.train_step(self.model, inputs, num_items_in_batch=num_items_in_batch) + loss_step = self.training_step(self.model, inputs, num_items_in_batch=num_items_in_batch) self.running_loss += loss_step.detach() if do_sync_step: diff --git a/optimum/neuron/trainers/trl_utils.py b/optimum/neuron/trainers/trl_utils.py index dac42fb54..4046dd93b 100644 --- a/optimum/neuron/trainers/trl_utils.py +++ b/optimum/neuron/trainers/trl_utils.py @@ -13,4 +13,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-TRL_VERSION = "0.11.4" +TRL_VERSION = "0.24.0" diff --git a/optimum/neuron/utils/__init__.py b/optimum/neuron/utils/__init__.py index 56de4eb08..04e367429 100644 --- a/optimum/neuron/utils/__init__.py +++ b/optimum/neuron/utils/__init__.py @@ -85,7 +85,6 @@ "patch_within_function", "replace_class_in_inheritance_hierarchy", ], - "trl_utils": ["NeuronSFTConfig", "NeuronORPOConfig"], } if TYPE_CHECKING: @@ -155,7 +154,6 @@ patch_within_function, replace_class_in_inheritance_hierarchy, ) - from .trl_utils import NeuronORPOConfig, NeuronSFTConfig else: import sys diff --git a/pyproject.toml b/pyproject.toml index 9c1836bba..facb0eb40 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -75,7 +75,7 @@ quality = [ "isort", ] training = [ - "trl == 0.11.4", + "trl == 0.23.1", "peft == 0.17.0", "evaluate == 0.4.3", ] diff --git a/tests/training/test_neuron_sft_trainer.py b/tests/training/test_neuron_sft_trainer.py index dc4f9d15e..4be2a1a80 100644 --- a/tests/training/test_neuron_sft_trainer.py +++ b/tests/training/test_neuron_sft_trainer.py @@ -77,7 +77,7 @@ def format_dolly(sample): args = args.to_dict() sft_config = NeuronSFTConfig( # Using a small sequence-length since we are not validating the outputs. - max_seq_length=128, + max_length=128, packing=packing, dataset_num_proc=1, **args, @@ -86,7 +86,7 @@ def format_dolly(sample): # Create Trainer instance trainer = NeuronSFTTrainer( model=model, - tokenizer=tokenizer, + processing_class=tokenizer, train_dataset=dataset, formatting_func=format_dolly, args=sft_config, @@ -172,7 +172,7 @@ def format_dolly(sample): args = args.to_dict() sft_config = NeuronSFTConfig( - max_seq_length=128, + max_length=128, packing=False, # No packing for PEFT test simplicity dataset_num_proc=1, **args, @@ -181,7 +181,7 @@ def format_dolly(sample): # Create SFT Trainer instance with PEFT model trainer = NeuronSFTTrainer( model=base_model, - tokenizer=tokenizer, + processing_class=tokenizer, train_dataset=dataset, formatting_func=format_dolly, args=sft_config,