Commit 381fdd5

Authored by Dushyant Behl
feat: Rename data handlers and add a new one for EPT scenarios (#460)
* Rename data handlers and add a data handler for the EPT use case.
* Fix minor bug in formatting where input_ids was missing post duplication.
* Add docstring.
* Change name of dataset in data config yaml.

Signed-off-by: Dushyant Behl <[email protected]>
1 parent d48d483

11 files changed (+254 −26 lines)

docs/advanced-data-preprocessing.md

Lines changed: 5 additions & 3 deletions
@@ -206,14 +206,16 @@ Users can also pass any number of `kwargs` arguments required for each data handler
 This library currently supports the following [preexisting data handlers](https://github.com/foundation-model-stack/fms-hf-tuning/blob/main/tuning/data/data_handlers.py#L156):
 - `tokenize_and_apply_input_masking`:
     Tokenizes input text and applies masking to the labels for causal language modeling tasks, good for input/output datasets.
-- `apply_dataset_formatting`:
-    Formats a dataset by appending an EOS token to a specified field.
+- `add_tokenizer_eos_token`:
+    Appends the tokenizer's EOS token to a specified dataset field.
 - `apply_custom_data_formatting_template`:
     Applies a custom template (e.g., Alpaca style) to format dataset elements.
-- `apply_custom_data_formatting_jinja_template`:
+- `apply_custom_jinja_template`:
     Applies a custom Jinja template (e.g., Alpaca style) to format dataset elements.
 - `apply_tokenizer_chat_template`:
     Uses a tokenizer's chat template to preprocess dataset elements, good for single/multi turn chat templates.
+- `duplicate_columns`:
+    Duplicates one column of the dataset to another column.

 These handlers can be requested by name, and the arguments each one accepts can be looked up [here](https://github.com/foundation-model-stack/fms-hf-tuning/blob/main/tuning/data/data_handlers.py).
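For orientation, here is a minimal sketch of what a handler like `duplicate_columns` plausibly looks like, inferred from the tests added in this commit; the shipped implementation lives in `tuning/data/data_handlers.py` and may differ in its validation details and error messages.

```python
# Sketch only: behavior inferred from tests/data/test_data_handlers.py below;
# not the repository's exact implementation.
import copy


def duplicate_columns(element, old_column=None, new_column=None, **kwargs):
    # The tests expect a ValueError when either column name is missing
    # or when old_column is absent from the dataset element.
    if not old_column or not new_column:
        raise ValueError("duplicate_columns requires both old_column and new_column")
    if old_column not in element:
        raise ValueError(f"column {old_column} not found in dataset element")
    # Keep the original column and add a copy under the new name, so both
    # input_ids and labels survive the map (the formatting bug fixed here).
    element[new_column] = copy.deepcopy(element[old_column])
    return element
```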

tests/artifacts/predefined_data_configs/__init__.py

Lines changed: 3 additions & 0 deletions
@@ -34,3 +34,6 @@
 DATA_CONFIG_MULTIPLE_DATASETS_SAMPLING_YAML = os.path.join(
     PREDEFINED_DATA_CONFIGS, "multiple_datasets_with_sampling.yaml"
 )
+DATA_CONFIG_DUPLICATE_COLUMNS = os.path.join(
+    PREDEFINED_DATA_CONFIGS, "duplicate_columns.yaml"
+)

tests/artifacts/predefined_data_configs/apply_custom_jinja_template.yaml

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@ datasets:
     data_paths:
       - "FILE_PATH"
     data_handlers:
-      - name: apply_custom_data_formatting_jinja_template
+      - name: apply_custom_jinja_template
         arguments:
           remove_columns: all
           batched: false
tests/artifacts/predefined_data_configs/duplicate_columns.yaml

Lines changed: 14 additions & 0 deletions

@@ -0,0 +1,14 @@
+dataprocessor:
+  type: default
+datasets:
+  - name: pre_tokenized_with_only_input_ids
+    data_paths:
+      - "FILE_PATH"
+    data_handlers:
+      - name: duplicate_columns
+        arguments:
+          remove_columns: all
+          batched: false
+          fn_kwargs:
+            old_column: "input_ids"
+            new_column: "labels"
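At preprocessing time, a config entry like the one above effectively asks the framework to run the named handler through `datasets.Dataset.map`. Roughly the following sketch; the library does this wiring internally, and `"FILE_PATH"` is a placeholder that the tests substitute with a real path.

```python
# Approximate equivalent of the duplicate_columns.yaml config above;
# the data preprocessor performs this wiring itself.
from datasets import load_dataset

from tuning.data.data_handlers import duplicate_columns

ds = load_dataset("json", data_files="FILE_PATH")["train"]
ds = ds.map(
    duplicate_columns,
    batched=False,
    fn_kwargs={"old_column": "input_ids", "new_column": "labels"},
)
```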

tests/artifacts/testdata/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -53,6 +53,10 @@
 TWITTER_COMPLAINTS_TOKENIZED_JSON = os.path.join(
     JSON_DATA_DIR, "twitter_complaints_tokenized_with_maykeye_tinyllama_v0.json"
 )
+TWITTER_COMPLAINTS_TOKENIZED_ONLY_INPUT_IDS_JSON = os.path.join(
+    JSON_DATA_DIR,
+    "twitter_complaints_tokenized_with_maykeye_tinyllama_v0_only_input_ids.json",
+)
 TWITTER_COMPLAINTS_TOKENIZED_JSONL = os.path.join(
     JSONL_DATA_DIR, "twitter_complaints_tokenized_with_maykeye_tinyllama_v0.jsonl"
 )
twitter_complaints_tokenized_with_maykeye_tinyllama_v0_only_input_ids.json (new test-data file under JSON_DATA_DIR)

Lines changed: 32 additions & 0 deletions

@@ -0,0 +1,32 @@
+[
+  {
+    "input_ids": [1, 16121, 9211, 31871, 1662, 31866, 31856, 7416, 17632, 369, 1398, 433, 322, 629, 712, 1784, 13, 13, 8458, 31922, 21597, 31871, 697, 9566]
+  },
+  {
+    "input_ids": [1, 16121, 9211, 31871, 1662, 31892, 1260, 31825, 11273, 503, 31857, 632, 5284, 365, 329, 553, 1280, 31905, 960, 365, 6194, 289, 11025, 31844, 365, 473, 987, 12207, 4218, 389, 31822, 31853, 31854, 31886, 31852, 31852, 31854, 11300, 31847, 3873, 1507, 31843, 13, 13, 8458, 31922, 21597, 31871, 697, 9566]
+  },
+  {
+    "input_ids": [1, 16121, 9211, 31871, 960, 312, 473, 31876, 31824, 685, 629, 31822, 31878, 4449, 5861, 287, 1662, 1299, 1574, 1590, 31833, 263, 1360, 1299, 1574, 289, 623, 31822, 31824, 16346, 312, 31876, 31836, 994, 277, 3560, 567, 31843, 672, 322, 260, 29458, 288, 629, 14881, 31843, 2628, 1423, 1662, 31858, 601, 1662, 31858, 601, 8378, 13, 13, 8458, 31922, 21597, 31871, 9566]
+  },
+  {
+    "input_ids": [1, 16121, 9211, 31871, 1662, 7766, 1078, 8123, 17561, 308, 3456, 1833, 975, 10849, 291, 4372, 15379, 504, 10011, 2368, 1512, 31822, 31855, 31852, 31852, 1243, 31843, 3007, 322, 433, 31843, 13, 13, 8458, 31922, 21597, 31871, 9566]
+  },
+  {
+    "input_ids": [1, 16121, 9211, 31871, 12371, 2208, 26657, 31844, 560, 14138, 31843, 21994, 1257, 24870, 496, 31829, 8198, 19057, 13, 13, 8458, 31922, 21597, 31871, 697, 9566]
+  },
+  {
+    "input_ids": [1, 16121, 9211, 31871, 1662, 31836, 651, 307, 395, 13094, 672, 1467, 701, 333, 515, 31844, 504, 1097, 2266, 282, 305, 781, 31902, 21626, 31822, 31824, 5540, 397, 560, 5253, 662, 365, 31876, 263, 4985, 31854, 8903, 16801, 291, 612, 31925, 2011, 1129, 31824, 31843, 1358, 31873, 19919, 31824, 31865, 31829, 469, 2131, 31874, 13, 13, 8458, 31922, 21597, 31871, 697, 9566]
+  },
+  {
+    "input_ids": [1, 16121, 9211, 31871, 1662, 31900, 307, 31837, 473, 382, 685, 266, 3195, 17532, 329, 260, 1173, 9363, 352, 1671, 1881, 646, 619, 31822, 31882, 5556, 504, 2091, 31822, 31882, 31843, 31855, 31861, 405, 499, 382, 863, 260, 31822, 31878, 4449, 2540, 2042, 31902, 13, 13, 8458, 31922, 21597, 31871, 697, 9566]
+  },
+  {
+    "input_ids": [1, 16121, 9211, 31871, 1662, 14390, 16373, 337, 312, 435, 697, 1579, 291, 266, 3925, 322, 1434, 291, 3877, 31843, 1456, 365, 499, 1419, 562, 433, 31902, 13, 13, 8458, 31922, 21597, 31871, 9566]
+  },
+  {
+    "input_ids": [1, 16121, 9211, 31871, 7265, 7550, 389, 1662, 31856, 2226, 11596, 27771, 898, 31843, 3259, 647, 312, 498, 288, 635, 31844, 518, 3822, 397, 2168, 28910, 31873, 13627, 4107, 1708, 31843, 312, 31876, 608, 1090, 629, 10279, 289, 1662, 29966, 31831, 5605, 13, 13, 8458, 31922, 21597, 31871, 9566]
+  },
+  {
+    "input_ids": [1, 16121, 9211, 31871, 1662, 31884, 1450, 7064, 31847, 6538, 30894, 4472, 289, 362, 828, 31843, 864, 685, 541, 9932, 843, 584, 18694, 31986, 13, 13, 8458, 31922, 21597, 31871, 697, 9566]
+  }
+]
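Concretely, running `duplicate_columns` over this file with `old_column="input_ids"` and `new_column="labels"` should leave each record with an identical `labels` list alongside `input_ids`. A worked illustration against a truncated copy of the first record, using the sketch above (not actual test output):

```python
# Illustration only; the record is truncated for brevity.
record = {"input_ids": [1, 16121, 9211, 31871, 1662, 31866, 31856, 7416]}
out = duplicate_columns(dict(record), old_column="input_ids", new_column="labels")
assert out["labels"] == out["input_ids"]  # both columns present and equal
```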

tests/data/test_data_handlers.py

Lines changed: 57 additions & 4 deletions
@@ -21,13 +21,19 @@
 import pytest

 # First Party
-from tests.artifacts.testdata import MODEL_NAME, TWITTER_COMPLAINTS_DATA_JSONL
+from tests.artifacts.testdata import (
+    MODEL_NAME,
+    TWITTER_COMPLAINTS_DATA_JSONL,
+    TWITTER_COMPLAINTS_TOKENIZED_JSON,
+    TWITTER_COMPLAINTS_TOKENIZED_ONLY_INPUT_IDS_JSON,
+)

 # Local
 from tuning.data.data_handlers import (
-    apply_custom_data_formatting_jinja_template,
     apply_custom_data_formatting_template,
+    apply_custom_jinja_template,
     combine_sequence,
+    duplicate_columns,
 )

@@ -66,7 +72,7 @@ def test_apply_custom_formatting_jinja_template():
     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
     formatted_dataset_field = "formatted_data_field"
     formatted_dataset = json_dataset.map(
-        apply_custom_data_formatting_jinja_template,
+        apply_custom_jinja_template,
         fn_kwargs={
             "tokenizer": tokenizer,
             "dataset_text_field": formatted_dataset_field,

@@ -121,7 +127,7 @@ def test_apply_custom_formatting_jinja_template_gives_error_with_wrong_keys(temp
     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
     with pytest.raises((KeyError, ValueError)):
         json_dataset.map(
-            apply_custom_data_formatting_jinja_template,
+            apply_custom_jinja_template,
             fn_kwargs={
                 "tokenizer": tokenizer,
                 "dataset_text_field": formatted_dataset_field,

@@ -162,3 +168,50 @@ def test_combine_sequence_adds_eos(input_element, output_element, expected_res):
     expected_res += tokenizer.eos_token
     assert isinstance(comb_seq, str)
     assert comb_seq == expected_res
+
+
+@pytest.mark.parametrize(
+    "dataset, old, new",
+    [
+        (TWITTER_COMPLAINTS_DATA_JSONL, "input_ids", "labels"),
+        (TWITTER_COMPLAINTS_TOKENIZED_JSON, "input_ids", "labels"),
+        (TWITTER_COMPLAINTS_DATA_JSONL, None, None),
+        (TWITTER_COMPLAINTS_DATA_JSONL, "input_ids", None),
+    ],
+)
+def test_duplicate_columns_throws_error_on_wrong_args(dataset, old, new):
+    """Ensure that duplicate_columns data handler throws error if column names are wrong."""
+    d = datasets.load_dataset("json", data_files=dataset)
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+    with pytest.raises(ValueError):
+        d.map(
+            duplicate_columns,
+            fn_kwargs={
+                "tokenizer": tokenizer,
+                "old_column": old,
+                "new_column": new,
+            },
+        )
+
+
+def test_duplicate_columns_copies_columns():
+    """Ensure that duplicate_columns data handler copies and maintains both columns."""
+    old = "input_ids"
+    new = "labels"
+    d = datasets.load_dataset(
+        "json", data_files=TWITTER_COMPLAINTS_TOKENIZED_ONLY_INPUT_IDS_JSON
+    )
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+    updated_dataset = d.map(
+        duplicate_columns,
+        fn_kwargs={
+            "tokenizer": tokenizer,
+            "old_column": old,
+            "new_column": new,
+        },
+    )
+
+    first_element = updated_dataset["train"][0]
+    assert new in first_element
+    assert old in first_element
+    assert first_element[new] == first_element[old]
File renamed without changes.

tests/test_sft_trainer.py

Lines changed: 50 additions & 2 deletions
@@ -36,6 +36,7 @@
 from build.utils import serialize_args
 from scripts.run_inference import TunedCausalLM
 from tests.artifacts.predefined_data_configs import (
+    DATA_CONFIG_DUPLICATE_COLUMNS,
     DATA_CONFIG_MULTIPLE_DATASETS_SAMPLING_YAML,
     DATA_CONFIG_TOKENIZE_AND_APPLY_INPUT_MASKING_YAML,
 )

@@ -58,6 +59,7 @@
     TWITTER_COMPLAINTS_TOKENIZED_ARROW,
     TWITTER_COMPLAINTS_TOKENIZED_JSON,
     TWITTER_COMPLAINTS_TOKENIZED_JSONL,
+    TWITTER_COMPLAINTS_TOKENIZED_ONLY_INPUT_IDS_JSON,
     TWITTER_COMPLAINTS_TOKENIZED_PARQUET,
 )

@@ -71,7 +73,7 @@
     DataPreProcessorConfig,
     DataSetConfig,
 )
-from tuning.data.data_handlers import apply_dataset_formatting
+from tuning.data.data_handlers import add_tokenizer_eos_token

 MODEL_ARGS = configs.ModelArguments(
     model_name_or_path=MODEL_NAME, use_flash_attn=False, torch_dtype="float32"

@@ -880,6 +882,52 @@ def test_run_causallm_ft_and_inference_with_multiple_dataset(
     assert "### Text: @NortonSupport Thanks much.\n\n### Label:" in output_inference


+def test_run_training_with_pretokenised_dataset_containing_input_ids():
+    """Ensure that we can train on a pretokenised dataset containing just input_ids by
+    choosing the duplicate_columns data handler via the data config."""
+    with tempfile.TemporaryDirectory() as tempdir:
+
+        data_args = copy.deepcopy(DATA_ARGS)
+
+        # set training_data_path and response_template to none
+        data_args.response_template = None
+        data_args.training_data_path = None
+
+        dataconfigfile = DATA_CONFIG_DUPLICATE_COLUMNS
+        datapath = TWITTER_COMPLAINTS_TOKENIZED_ONLY_INPUT_IDS_JSON
+
+        # add data_paths in data_config file
+        with tempfile.NamedTemporaryFile(
+            "w", delete=False, suffix=".yaml"
+        ) as temp_yaml_file:
+            with open(dataconfigfile, "r", encoding="utf-8") as f:
+                data = yaml.safe_load(f)
+            datasets = data["datasets"]
+            for _, d in enumerate(datasets):
+                d["data_paths"] = [datapath]
+            yaml.dump(data, temp_yaml_file)
+            data_args.data_config_path = temp_yaml_file.name
+
+        train_args = copy.deepcopy(TRAIN_ARGS)
+        train_args.output_dir = tempdir
+
+        sft_trainer.train(MODEL_ARGS, data_args, train_args)
+
+        # validate full ft configs
+        _validate_training(tempdir)
+        checkpoint_path = _get_checkpoint_path(tempdir)
+
+        # Load the model
+        loaded_model = TunedCausalLM.load(checkpoint_path, MODEL_NAME)
+
+        # Run inference on the text
+        output_inference = loaded_model.run(
+            "### Text: @NortonSupport Thanks much.\n\n### Label:", max_new_tokens=50
+        )
+        assert len(output_inference) > 0
+        assert "### Text: @NortonSupport Thanks much.\n\n### Label:" in output_inference
+
+
 @pytest.mark.parametrize(
     "dataset_path",
     [CHAT_DATA_SINGLE_TURN, CHAT_DATA_MULTI_TURN],

@@ -1469,7 +1517,7 @@ def test_run_by_passing_additional_data_handlers():
     TEST_HANDLER = "my_test_handler"

     def test_handler(element, tokenizer, **kwargs):
-        return apply_dataset_formatting(element, tokenizer, "custom_formatted_field")
+        return add_tokenizer_eos_token(element, tokenizer, "custom_formatted_field")

     # This data config calls for data handler to be applied to dataset
     preprocessor_config = DataPreProcessorConfig()
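The renamed handler used in the final hunk is small. Here is a plausible sketch, consistent with its call site `add_tokenizer_eos_token(element, tokenizer, "custom_formatted_field")` and the docs entry above ("appends the tokenizer's EOS token to a specified dataset field"); the actual function in `tuning/data/data_handlers.py` may differ.

```python
# Sketch only: signature inferred from the call site in test_sft_trainer.py.
def add_tokenizer_eos_token(element, tokenizer, dataset_text_field, **kwargs):
    # Append the tokenizer's EOS token to the requested text field.
    return {dataset_text_field: element[dataset_text_field] + tokenizer.eos_token}
```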
