Skip to content

Commit 73b81c1

Browse files
committed
Add seq2seq collator for packing pretokenized data
Signed-off-by: Dushyant Behl <dushyantbehl@in.ibm.com>
1 parent 87d987a commit 73b81c1

File tree

2 files changed

+14
-6
lines changed

2 files changed

+14
-6
lines changed

tests/test_sft_trainer.py

Lines changed: 9 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -756,17 +756,18 @@ def test_run_causallm_ft_save_with_save_model_dir_save_strategy_no():
756756

757757

758758
@pytest.mark.parametrize(
759-
"dataset_path",
759+
"dataset_path, packing",
760760
[
761-
TWITTER_COMPLAINTS_TOKENIZED_JSONL,
762-
TWITTER_COMPLAINTS_TOKENIZED_JSON,
763-
TWITTER_COMPLAINTS_TOKENIZED_PARQUET,
764-
TWITTER_COMPLAINTS_TOKENIZED_ARROW,
761+
(TWITTER_COMPLAINTS_TOKENIZED_JSON, False),
762+
(TWITTER_COMPLAINTS_TOKENIZED_JSONL, True),
763+
(TWITTER_COMPLAINTS_TOKENIZED_PARQUET, True),
764+
(TWITTER_COMPLAINTS_TOKENIZED_ARROW, False),
765765
],
766766
)
767-
def test_run_causallm_ft_pretokenized(dataset_path):
767+
def test_run_causallm_ft_pretokenized(dataset_path, packing):
768768
"""Check if we can bootstrap and finetune causallm models using pretokenized data"""
769769
with tempfile.TemporaryDirectory() as tempdir:
770+
770771
data_formatting_args = copy.deepcopy(DATA_ARGS)
771772

772773
# below args not needed for pretokenized data
@@ -779,6 +780,8 @@ def test_run_causallm_ft_pretokenized(dataset_path):
779780

780781
train_args = copy.deepcopy(TRAIN_ARGS)
781782
train_args.output_dir = tempdir
783+
train_args.packing = packing
784+
train_args.max_seq_length = 256
782785

783786
sft_trainer.train(MODEL_ARGS, data_formatting_args, train_args)
784787

tuning/data/data_preprocessing_utils.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -96,3 +96,8 @@ def get_data_collator(
9696
raise ValueError(
9797
"Could not pick a data collator. Please refer to supported data formats"
9898
)
99+
100+
if is_traindata_tokenized:
101+
return DataCollatorForSeq2Seq(
102+
tokenizer=tokenizer, padding=False, max_length=max_seq_length
103+
)

0 commit comments

Comments (0)