
Commit 2f033c7

willmj, dushyantbehl, Abhishek-TAMU, and anhuong authored

chore(deps): upgrade trl and transformers (#448)

* chore(deps): revert trl restriction

  Signed-off-by: Will Johnson <[email protected]>

* fix: remove check

  Signed-off-by: Will Johnson <[email protected]>

* fix: remove packing from func def

  Signed-off-by: Will Johnson <[email protected]>

* chore(deps): upgrade transformers + trl

  Signed-off-by: Will Johnson <[email protected]>

* enable packing for pretokenized datasets

  Signed-off-by: Dushyant Behl <[email protected]>

* tests: gradient accum steps = 1, get checkpoint path

  Co-authored-by: Abhishek <[email protected]>
  Signed-off-by: Will Johnson <[email protected]>

* add upper limit to trl of below 0.15

  Signed-off-by: Anh Uong <[email protected]>

---------

Signed-off-by: Will Johnson <[email protected]>
Signed-off-by: Dushyant Behl <[email protected]>
Signed-off-by: Anh Uong <[email protected]>
Co-authored-by: Dushyant Behl <[email protected]>
Co-authored-by: Abhishek <[email protected]>
Co-authored-by: Anh Uong <[email protected]>

1 parent f1fd130 · commit 2f033c7

File tree: 5 files changed (+14 −20 lines)

pyproject.toml

Lines changed: 2 additions & 2 deletions

@@ -29,12 +29,12 @@ classifiers=[
 dependencies = [
     "numpy>=1.26.4,<2.0",
     "accelerate>=0.20.3,!=0.34,<1.1",
-    "transformers>=4.45,<4.46",
+    "transformers>=4.46,<4.48.2",
     "torch>=2.2.0,<2.5",
     "sentencepiece>=0.1.99,<0.3",
     "tokenizers>=0.13.3,<1.0",
     "tqdm>=4.66.2,<5.0",
-    "trl>=0.9.3,<0.12",
+    "trl>=0.13,<0.15",
     "peft>=0.8.0,<0.14",
     "protobuf>=5.28.0,<6.0.0",
     "datasets>=2.15.0,<3.0",

tests/build/test_launch_script.py

Lines changed: 1 addition & 1 deletion

@@ -46,7 +46,7 @@
     "num_train_epochs": 5,
     "per_device_train_batch_size": 4,
     "per_device_eval_batch_size": 4,
-    "gradient_accumulation_steps": 4,
+    "gradient_accumulation_steps": 1,
     "learning_rate": 0.00001,
     "weight_decay": 0,
     "warmup_ratio": 0.03,

tests/data/test_data_preprocessing.py

Lines changed: 0 additions & 7 deletions

@@ -667,13 +667,6 @@ def test_get_data_collator(
         ),
         False,
     ),
-    # Pretokenized data with packing to True
-    (
-        configs.DataArguments(
-            training_data_path=TWITTER_COMPLAINTS_TOKENIZED_JSONL,
-        ),
-        True,
-    ),
     ],
 )
 def test_process_data_args_throws_error_where_needed(data_args, packing):

tests/test_sft_trainer.py

Lines changed: 9 additions & 2 deletions

@@ -22,6 +22,7 @@
 import copy
 import json
 import os
+import re
 import tempfile

 # Third Party

@@ -88,7 +89,7 @@
     num_train_epochs=5,
     per_device_train_batch_size=4,
     per_device_eval_batch_size=4,
-    gradient_accumulation_steps=4,
+    gradient_accumulation_steps=1,
     learning_rate=0.00001,
     weight_decay=0,
     warmup_ratio=0.03,

@@ -1147,7 +1148,13 @@ def _validate_hf_resource_scanner_file(tempdir):


 def _get_checkpoint_path(dir_path):
-    return os.path.join(dir_path, "checkpoint-5")
+    checkpoint_dirs = [
+        d
+        for d in os.listdir(dir_path)
+        if os.path.isdir(os.path.join(dir_path, d)) and re.match(r"^checkpoint-\d+$", d)
+    ]
+    checkpoint_dirs.sort(key=lambda name: int(name.split("-")[-1]))
+    return os.path.join(dir_path, checkpoint_dirs[-1])


 def _get_adapter_config(dir_path):
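For illustration (not in the commit): the rewritten helper sorts checkpoint directories numerically, so checkpoint-25 wins over checkpoint-5 even though a plain string sort would rank it lower. A self-contained sketch of that behavior, with the directory layout invented for the demo:

```python
# Standalone demo of the new _get_checkpoint_path logic, copied from the
# diff above; the temporary directory contents are illustrative only.
import os
import re
import tempfile

def _get_checkpoint_path(dir_path):
    checkpoint_dirs = [
        d
        for d in os.listdir(dir_path)
        if os.path.isdir(os.path.join(dir_path, d)) and re.match(r"^checkpoint-\d+$", d)
    ]
    # Numeric sort: a lexicographic sort would rank "checkpoint-5" above
    # "checkpoint-25" and return the wrong directory.
    checkpoint_dirs.sort(key=lambda name: int(name.split("-")[-1]))
    return os.path.join(dir_path, checkpoint_dirs[-1])

with tempfile.TemporaryDirectory() as tempdir:
    for step in (5, 10, 25):
        os.makedirs(os.path.join(tempdir, f"checkpoint-{step}"))
    assert _get_checkpoint_path(tempdir).endswith("checkpoint-25")
```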

tuning/data/setup_dataprocessor.py

Lines changed: 2 additions & 8 deletions

@@ -74,7 +74,7 @@ def _process_dataconfig_file(


 # Data Format 1: Pretokenized Data
-def _get_pretokenized_dataset_handlers(data_args, packing, is_eval_tokenized):
+def _get_pretokenized_dataset_handlers(data_args, is_eval_tokenized):

     # if the provided train dataset is pretokenized
     # however user provides formatting flags, error out

@@ -96,12 +96,6 @@ def _get_pretokenized_dataset_handlers(data_args, packing, is_eval_tokenized):
             along with pretokenized train data"
         )

-    # Support for packing pretokenized datasets has been merged in trl library
-    # see: https://github.com/huggingface/trl/pull/2011
-    # but we wait till a new transformers version is released to remove this check.
-    if packing:
-        raise ValueError("packing will not be used when datasets are pretokenized")
-
     # We do not need a handler here as this is tokenized dataset
     return [], None

@@ -264,7 +258,7 @@ def _process_raw_data_args(
     if is_traindata_tokenized:
         # Data Format 1: Pretokenized Data
         handlers, dataset_text_field = _get_pretokenized_dataset_handlers(
-            data_args, packing, (is_eval_dataset_present and not is_evaldata_tokenized)
+            data_args, (is_eval_dataset_present and not is_evaldata_tokenized)
         )
     elif data_args.instruction_template and data_args.response_template:
         # Data Format 2: Chat dataset with instruction and response template
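With the guard gone, packing of pretokenized data is delegated to trl, where support landed in huggingface/trl#2011 (the PR cited in the removed comment). A hedged sketch of what this now permits, not code from this repo: the model name, output path, and toy dataset are illustrative, and exact SFTConfig fields vary between trl releases within the pinned >=0.13,<0.15 range.

```python
# Hedged sketch: pack a dataset that is already tokenized ("input_ids" only,
# no text column). Previously setup_dataprocessor.py raised ValueError here.
from datasets import Dataset
from trl import SFTConfig, SFTTrainer

# Toy pretokenized dataset; real data would come from a tokenized JSONL file.
train_data = Dataset.from_list(
    [{"input_ids": list(range(8))}, {"input_ids": list(range(5))}]
)

trainer = SFTTrainer(
    model="sshleifer/tiny-gpt2",  # illustrative tiny causal LM
    args=SFTConfig(
        output_dir="/tmp/packing-demo",  # illustrative path
        packing=True,  # no longer rejected for pretokenized data
        max_seq_length=16,
    ),
    train_dataset=train_data,
)
```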
