diff --git a/pyproject.toml b/pyproject.toml
index eb1da2993..43742f899 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -29,12 +29,12 @@ classifiers=[
 dependencies = [
 "numpy>=1.26.4,<2.0",
 "accelerate>=0.20.3,!=0.34,<1.1",
-"transformers>=4.45,<4.46",
+"transformers>=4.46,<4.48.2",
 "torch>=2.2.0,<2.5",
 "sentencepiece>=0.1.99,<0.3",
 "tokenizers>=0.13.3,<1.0",
 "tqdm>=4.66.2,<5.0",
-"trl>=0.9.3,<0.12",
+"trl>=0.13,<0.15",
 "peft>=0.8.0,<0.14",
 "protobuf>=5.28.0,<6.0.0",
 "datasets>=2.15.0,<3.0",
diff --git a/tests/build/test_launch_script.py b/tests/build/test_launch_script.py
index c699e16da..2f81fd78f 100644
--- a/tests/build/test_launch_script.py
+++ b/tests/build/test_launch_script.py
@@ -46,7 +46,7 @@
     "num_train_epochs": 5,
     "per_device_train_batch_size": 4,
     "per_device_eval_batch_size": 4,
-    "gradient_accumulation_steps": 4,
+    "gradient_accumulation_steps": 1,
     "learning_rate": 0.00001,
     "weight_decay": 0,
     "warmup_ratio": 0.03,
diff --git a/tests/data/test_data_preprocessing.py b/tests/data/test_data_preprocessing.py
index 7bf07e58a..9e65b1302 100644
--- a/tests/data/test_data_preprocessing.py
+++ b/tests/data/test_data_preprocessing.py
@@ -667,13 +667,6 @@ def test_get_data_collator(
             ),
             False,
         ),
-        # Pretokenized data with packing to True
-        (
-            configs.DataArguments(
-                training_data_path=TWITTER_COMPLAINTS_TOKENIZED_JSONL,
-            ),
-            True,
-        ),
     ],
 )
 def test_process_data_args_throws_error_where_needed(data_args, packing):
diff --git a/tests/test_sft_trainer.py b/tests/test_sft_trainer.py
index b718a97cf..fb707327c 100644
--- a/tests/test_sft_trainer.py
+++ b/tests/test_sft_trainer.py
@@ -22,6 +22,7 @@
 import copy
 import json
 import os
+import re
 import tempfile
 
 # Third Party
@@ -88,7 +89,7 @@
     num_train_epochs=5,
     per_device_train_batch_size=4,
     per_device_eval_batch_size=4,
-    gradient_accumulation_steps=4,
+    gradient_accumulation_steps=1,
     learning_rate=0.00001,
     weight_decay=0,
     warmup_ratio=0.03,
@@ -1147,7 +1148,13 @@ def _validate_hf_resource_scanner_file(tempdir):
 
 
 def _get_checkpoint_path(dir_path):
-    return os.path.join(dir_path, "checkpoint-5")
+    checkpoint_dirs = [
+        d
+        for d in os.listdir(dir_path)
+        if os.path.isdir(os.path.join(dir_path, d)) and re.match(r"^checkpoint-\d+$", d)
+    ]
+    checkpoint_dirs.sort(key=lambda name: int(name.split("-")[-1]))
+    return os.path.join(dir_path, checkpoint_dirs[-1])
 
 
 def _get_adapter_config(dir_path):
diff --git a/tuning/data/setup_dataprocessor.py b/tuning/data/setup_dataprocessor.py
index 433ddbece..730bc318c 100644
--- a/tuning/data/setup_dataprocessor.py
+++ b/tuning/data/setup_dataprocessor.py
@@ -74,7 +74,7 @@ def _process_dataconfig_file(
 
 
 # Data Format 1: Pretokenized Data
-def _get_pretokenized_dataset_handlers(data_args, packing, is_eval_tokenized):
+def _get_pretokenized_dataset_handlers(data_args, is_eval_tokenized):
 
     # if the provided train dataset is pretokenized
     # however user provides formatting flags, error out
@@ -96,12 +96,6 @@ def _get_pretokenized_dataset_handlers(data_args, packing, is_eval_tokenized):
             along with pretokenized train data"
         )
 
-    # Support for packing pretokenized datasets has been merged in trl library
-    # see: https://github.com/huggingface/trl/pull/2011
-    # but we wait till a new transformers version is released to remove this check.
-    if packing:
-        raise ValueError("packing will not be used when datasets are pretokenized")
-
     # We do not need a handler here as this is tokenized dataset
     return [], None
 
@@ -264,7 +258,7 @@ def _process_raw_data_args(
     if is_traindata_tokenized:
         # Data Format 1: Pretokenized Data
         handlers, dataset_text_field = _get_pretokenized_dataset_handlers(
-            data_args, packing, (is_eval_dataset_present and not is_evaldata_tokenized)
+            data_args, (is_eval_dataset_present and not is_evaldata_tokenized)
         )
     elif data_args.instruction_template and data_args.response_template:
         # Data Format 2: Chat dataset with instruction and response template