
Commit 2f033c7

willmj, dushyantbehl, Abhishek-TAMU, and anhuong authored

chore(deps): upgrade trl and transformers (#448)

* chore(deps): revert trl restriction

  Signed-off-by: Will Johnson <[email protected]>

* fix: remove check

  Signed-off-by: Will Johnson <[email protected]>

* fix: remove packing from func def

  Signed-off-by: Will Johnson <[email protected]>

* chore(deps): upgrade transformers + trl

  Signed-off-by: Will Johnson <[email protected]>

* enable packing for pretokenized datasets

  Signed-off-by: Dushyant Behl <[email protected]>

* tests: gradient accum steps = 1, get checkpoint path

  Co-authored-by: Abhishek <[email protected]>
  Signed-off-by: Will Johnson <[email protected]>

* add upper limit to trl of below 0.15

  Signed-off-by: Anh Uong <[email protected]>

---------

Signed-off-by: Will Johnson <[email protected]>
Signed-off-by: Dushyant Behl <[email protected]>
Signed-off-by: Anh Uong <[email protected]>
Co-authored-by: Dushyant Behl <[email protected]>
Co-authored-by: Abhishek <[email protected]>
Co-authored-by: Anh Uong <[email protected]>

1 parent f1fd130 · commit 2f033c7

File tree: 5 files changed (+14 −20 lines)

pyproject.toml

Lines changed: 2 additions & 2 deletions

@@ -29,12 +29,12 @@ classifiers=[
 dependencies = [
     "numpy>=1.26.4,<2.0",
     "accelerate>=0.20.3,!=0.34,<1.1",
-    "transformers>=4.45,<4.46",
+    "transformers>=4.46,<4.48.2",
     "torch>=2.2.0,<2.5",
     "sentencepiece>=0.1.99,<0.3",
     "tokenizers>=0.13.3,<1.0",
     "tqdm>=4.66.2,<5.0",
-    "trl>=0.9.3,<0.12",
+    "trl>=0.13,<0.15",
     "peft>=0.8.0,<0.14",
     "protobuf>=5.28.0,<6.0.0",
     "datasets>=2.15.0,<3.0",

tests/build/test_launch_script.py

Lines changed: 1 addition & 1 deletion

@@ -46,7 +46,7 @@
     "num_train_epochs": 5,
     "per_device_train_batch_size": 4,
     "per_device_eval_batch_size": 4,
-    "gradient_accumulation_steps": 4,
+    "gradient_accumulation_steps": 1,
     "learning_rate": 0.00001,
     "weight_decay": 0,
     "warmup_ratio": 0.03,

tests/data/test_data_preprocessing.py

Lines changed: 0 additions & 7 deletions

@@ -667,13 +667,6 @@ def test_get_data_collator(
         ),
         False,
     ),
-    # Pretokenized data with packing to True
-    (
-        configs.DataArguments(
-            training_data_path=TWITTER_COMPLAINTS_TOKENIZED_JSONL,
-        ),
-        True,
-    ),
     ],
 )
 def test_process_data_args_throws_error_where_needed(data_args, packing):

tests/test_sft_trainer.py

Lines changed: 9 additions & 2 deletions

@@ -22,6 +22,7 @@
 import copy
 import json
 import os
+import re
 import tempfile

 # Third Party

@@ -88,7 +89,7 @@
     num_train_epochs=5,
     per_device_train_batch_size=4,
     per_device_eval_batch_size=4,
-    gradient_accumulation_steps=4,
+    gradient_accumulation_steps=1,
     learning_rate=0.00001,
     weight_decay=0,
     warmup_ratio=0.03,

@@ -1147,7 +1148,13 @@ def _validate_hf_resource_scanner_file(tempdir):


 def _get_checkpoint_path(dir_path):
-    return os.path.join(dir_path, "checkpoint-5")
+    checkpoint_dirs = [
+        d
+        for d in os.listdir(dir_path)
+        if os.path.isdir(os.path.join(dir_path, d)) and re.match(r"^checkpoint-\d+$", d)
+    ]
+    checkpoint_dirs.sort(key=lambda name: int(name.split("-")[-1]))
+    return os.path.join(dir_path, checkpoint_dirs[-1])


 def _get_adapter_config(dir_path):
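For illustration (not in the commit): the rewritten helper sorts checkpoint directories numerically, so checkpoint-25 wins over checkpoint-5 even though a plain string sort would rank it lower. A self-contained sketch of that behavior, with the directory layout invented for the demo:

```python
# Standalone demo of the new _get_checkpoint_path logic, copied from the
# diff above; the temporary directory contents are illustrative only.
import os
import re
import tempfile

def _get_checkpoint_path(dir_path):
    checkpoint_dirs = [
        d
        for d in os.listdir(dir_path)
        if os.path.isdir(os.path.join(dir_path, d)) and re.match(r"^checkpoint-\d+$", d)
    ]
    # Numeric sort: a lexicographic sort would rank "checkpoint-5" above
    # "checkpoint-25" and return the wrong directory.
    checkpoint_dirs.sort(key=lambda name: int(name.split("-")[-1]))
    return os.path.join(dir_path, checkpoint_dirs[-1])

with tempfile.TemporaryDirectory() as tempdir:
    for step in (5, 10, 25):
        os.makedirs(os.path.join(tempdir, f"checkpoint-{step}"))
    assert _get_checkpoint_path(tempdir).endswith("checkpoint-25")
```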

tuning/data/setup_dataprocessor.py

Lines changed: 2 additions & 8 deletions

@@ -74,7 +74,7 @@ def _process_dataconfig_file(


 # Data Format 1: Pretokenized Data
-def _get_pretokenized_dataset_handlers(data_args, packing, is_eval_tokenized):
+def _get_pretokenized_dataset_handlers(data_args, is_eval_tokenized):

     # if the provided train dataset is pretokenized
     # however user provides formatting flags, error out

@@ -96,12 +96,6 @@ def _get_pretokenized_dataset_handlers(data_args, packing, is_eval_tokenized):
             along with pretokenized train data"
         )

-    # Support for packing pretokenized datasets has been merged in trl library
-    # see: https://github.com/huggingface/trl/pull/2011
-    # but we wait till a new transformers version is released to remove this check.
-    if packing:
-        raise ValueError("packing will not be used when datasets are pretokenized")
-
     # We do not need a handler here as this is tokenized dataset
     return [], None

@@ -264,7 +258,7 @@ def _process_raw_data_args(
     if is_traindata_tokenized:
         # Data Format 1: Pretokenized Data
         handlers, dataset_text_field = _get_pretokenized_dataset_handlers(
-            data_args, packing, (is_eval_dataset_present and not is_evaldata_tokenized)
+            data_args, (is_eval_dataset_present and not is_evaldata_tokenized)
         )
     elif data_args.instruction_template and data_args.response_template:
         # Data Format 2: Chat dataset with instruction and response template
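With the guard gone, packing of pretokenized data is delegated to trl, where support landed in huggingface/trl#2011 (the PR cited in the removed comment). A hedged sketch of what this now permits, not code from this repo: the model name, output path, and toy dataset are illustrative, and exact SFTConfig fields vary between trl releases within the pinned >=0.13,<0.15 range.

```python
# Hedged sketch: pack a dataset that is already tokenized ("input_ids" only,
# no text column). Previously setup_dataprocessor.py raised ValueError here.
from datasets import Dataset
from trl import SFTConfig, SFTTrainer

# Toy pretokenized dataset; real data would come from a tokenized JSONL file.
train_data = Dataset.from_list(
    [{"input_ids": list(range(8))}, {"input_ids": list(range(5))}]
)

trainer = SFTTrainer(
    model="sshleifer/tiny-gpt2",  # illustrative tiny causal LM
    args=SFTConfig(
        output_dir="/tmp/packing-demo",  # illustrative path
        packing=True,  # no longer rejected for pretokenized data
        max_seq_length=16,
    ),
    train_dataset=train_data,
)
```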
