
Commit a89a4a3

fix tokenize_and_apply_input_masking kwargs (#465)
Signed-off-by: Abhishek <[email protected]>
1 parent 381fdd5 · commit a89a4a3

2 files changed: +8 −7 lines changed


tests/data/test_data_preprocessing.py

Lines changed: 3 additions & 1 deletion
@@ -1173,9 +1173,10 @@ def test_process_dataconfig_multiple_datasets_datafiles_sampling(
 def test_process_dataargs(data_args, is_padding_free):
     """Ensure that the train/eval data are properly formatted based on the data args / text field"""
     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+    max_seq_length = 5
     TRAIN_ARGS = configs.TrainingArguments(
         packing=False,
-        max_seq_length=1024,
+        max_seq_length=max_seq_length,
         output_dir="tmp",  # Not needed but positional
     )
     (train_set, eval_set, dataset_text_field, _, _, _) = process_dataargs(
@@ -1187,6 +1188,7 @@ def test_process_dataargs(data_args, is_padding_free):
         column_names = set(["input_ids", "attention_mask", "labels"])
         assert set(eval_set.column_names) == column_names
         assert set(train_set.column_names) == column_names
+        assert len(train_set[0]["input_ids"]) == max_seq_length
     else:
         assert dataset_text_field in train_set.column_names
         assert dataset_text_field in eval_set.column_names
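
The new assertion only holds if the tokenizer kwargs actually reach the tokenizer, so that input_ids are truncated to max_seq_length. A minimal standalone sketch of that tokenizer behavior, assuming an arbitrary HF tokenizer (gpt2 here as a stand-in for MODEL_NAME):

# Sketch: with truncation enabled, a HF tokenizer caps input_ids at
# max_length, which is the behavior the new assertion verifies end to end.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # stand-in for MODEL_NAME
ids = tokenizer(
    "a long instruction that clearly exceeds five tokens",
    truncation=True,
    max_length=5,
).input_ids
assert len(ids) == 5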

tuning/data/data_handlers.py

Lines changed: 5 additions & 6 deletions
@@ -58,7 +58,7 @@ def tokenize_and_apply_input_masking(
     column_names: List[str],
     input_field_name: str,
     output_field_name: str,
-    **tokenizer_kwargs,
+    **kwargs,
 ):
     """Function (data handler) to tokenize and apply instruction masking on dataset
     Expects to be run as a HF Map API function.
@@ -68,7 +68,7 @@ def tokenize_and_apply_input_masking(
         column_names: Name of all the columns in the dataset.
         input_field_name: Name of the input (instruction) field in dataset
         output_field_name: Name of the output field in dataset
-        **tokenizer_kwargs: Any additional kwargs to be passed to tokenizer
+        **kwargs: Any additional args passed to the handler
     Returns:
         Formatted Dataset element with input_ids, labels and attention_mask columns
     """
@@ -85,11 +85,10 @@ def tokenize_and_apply_input_masking(
 
     combined = combine_sequence(input_text, output_text, eos_token=tokenizer.eos_token)
 
-    fn_kwargs = tokenizer_kwargs.get("fn_kwargs", {})
-    tokenizer_inner_kwargs = fn_kwargs.get("tokenizer_kwargs", {})
+    tokenizer_kwargs = kwargs.get("tokenizer_kwargs", {})
 
-    tokenized_comb_seqs = tokenizer(combined, **tokenizer_inner_kwargs)
-    tokenized_input = tokenizer(input_text, **tokenizer_inner_kwargs)
+    tokenized_comb_seqs = tokenizer(combined, **tokenizer_kwargs)
+    tokenized_input = tokenizer(input_text, **tokenizer_kwargs)
 
     masked_labels = [-100] * len(
         tokenized_input.input_ids
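
The underlying bug: the handler runs under the HF Map API, and Dataset.map forwards fn_kwargs to the mapped function as top-level keyword arguments, so there is no nested "fn_kwargs" key to look up inside the handler. A minimal sketch of that forwarding, using a hypothetical handler in place of tokenize_and_apply_input_masking:

# Sketch: Dataset.map passes fn_kwargs as top-level keyword arguments,
# so "tokenizer_kwargs" is found directly in **kwargs, as the fix assumes.
from datasets import Dataset

def handler(example, **kwargs):  # hypothetical stand-in for the data handler
    tokenizer_kwargs = kwargs.get("tokenizer_kwargs", {})
    example["max_length"] = tokenizer_kwargs.get("max_length")
    return example

ds = Dataset.from_dict({"text": ["hello"]})
ds = ds.map(handler, fn_kwargs={"tokenizer_kwargs": {"max_length": 5}})
assert ds[0]["max_length"] == 5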
