
Commit 6757c0b

Merge remote-tracking branch 'upstream/main'
2 parents 6108364 + da8ae5d commit 6757c0b

9 files changed, +173 -0 lines changed


README.md

Lines changed: 13 additions & 0 deletions

@@ -187,6 +187,19 @@ Here are some scenarios addressed in the flow chart:
 3. There might be special tokens used in chat template which the tokenizer might be unaware of, for example `<|start_of_role|>` which can cause issues during tokenization as it might not be treated as a single token


+#### Add Special Tokens
+Working with multi-turn chat data might require the tokenizer to use a few new control tokens (e.g. `<|assistant|>`, `[SYS]`) as described in the guidelines above. These special tokens might not be present in the tokenizer's vocabulary if the user is using a base model.
+
+Users can pass the `--add_special_tokens` argument, which adds the required tokens to the tokenizer's vocabulary.
+For example, the special tokens used in `--instruction_template`/`--response_template` can be passed as follows:
+
+```
+python -m tuning.sft_trainer \
+...
+--add_special_tokens "<|start_of_role|>" "<|end_of_role|>" \
+--instruction_template "<|start_of_role|>user<|end_of_role|>" \
+--response_template "<|start_of_role|>assistant<|end_of_role|>"
+```

 ### 4. Pre tokenized datasets.
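
To make the README addition above concrete, here is a minimal sketch, not this repository's implementation, of what adding control tokens amounts to with the standard Hugging Face `transformers` API; the `gpt2` checkpoint is only a stand-in for whichever base model is being tuned, and in this repo the equivalent work is routed through `tokenizer_and_embedding_resize` (see the `tuning/sft_trainer.py` diff further below).

```python
# Minimal sketch (assumption: standard transformers API, placeholder checkpoint).
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "gpt2"  # stand-in for the base model actually being fine-tuned
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

new_tokens = ["<|start_of_role|>", "<|end_of_role|>"]
# Register the control tokens as additional special tokens.
num_added = tokenizer.add_special_tokens({"additional_special_tokens": new_tokens})

# If any tokens were actually new, the embedding matrix must grow to match.
if num_added > 0:
    model.resize_token_embeddings(len(tokenizer))

# Each control token now maps to a single id instead of being split into pieces.
assert len(tokenizer.encode("<|start_of_role|>", add_special_tokens=False)) == 1
```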

docs/advanced-data-preprocessing.md

Lines changed: 3 additions & 0 deletions

@@ -47,6 +47,8 @@ definitions:
       type: string
     seed:
       type: integer
+    chat_template:
+      type: string
   required:
   - type
   title: Dataprocessor
@@ -118,6 +120,7 @@ Users can create a data config file in any of YAML or JSON format they choose (w
 - `streaming` (optional, bool): Stream datasets using [IterableDatasets](https://huggingface.co/docs/datasets/v3.2.0/en/package_reference/main_classes#datasets.IterableDataset).
 - `sampling_stopping_strategy` (optional, str): Dataset interleave stopping strategy in case of choosing to mix multiple datasets by weight, supported values are [`all_exhausted` or `first_exhausted`](https://huggingface.co/docs/datasets/v3.2.0/en/package_reference/main_classes#datasets.interleave_datasets.stopping_strategy), defaults to `all_exhausted`.
 - `sampling_seed` (optional, int): [Sampling seed](https://huggingface.co/docs/datasets/v3.2.0/en/package_reference/main_classes#datasets.interleave_datasets.seed) to use for interleaving datasets, for reproducibility choose same value, defaults to 42.
+- `chat_template` (optional, str): Pass a `chat_template` via the data config for multi-turn data; it replaces the tokenizer's existing default chat template.

 `datasets` (list):
 - `name` (optional, str): A unique identifier for the dataset.
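
As a rough illustration of the `chat_template` option documented above, the following Python sketch writes out a data config of the documented shape; the dataset name, data path, and template string are placeholder assumptions, not values taken from this commit.

```python
# Illustrative only: build a data config whose dataprocessor block carries a
# chat_template, then dump it to YAML. All names/paths below are placeholders.
import yaml

data_config = {
    "dataprocessor": {
        "type": "default",
        # Any Jinja chat template string; this one is just a placeholder.
        "chat_template": "{% for m in messages %}{{ m['role'] }}: {{ m['content'] }}\n{% endfor %}",
    },
    "datasets": [
        {
            "name": "chat_dataset",            # hypothetical dataset name
            "data_paths": ["chat_data.jsonl"],  # hypothetical path
        }
    ],
}

with open("my_data_config.yaml", "w", encoding="utf-8") as f:
    yaml.dump(data_config, f)
```

The predefined test config added in this commit (shown further below) follows the same structure.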

tests/artifacts/predefined_data_configs/__init__.py

Lines changed: 3 additions & 0 deletions

@@ -34,6 +34,9 @@
 DATA_CONFIG_MULTIPLE_DATASETS_SAMPLING_YAML = os.path.join(
     PREDEFINED_DATA_CONFIGS, "multiple_datasets_with_sampling.yaml"
 )
+DATA_CONFIG_MULTITURN_DATA_YAML = os.path.join(
+    PREDEFINED_DATA_CONFIGS, "multi_turn_data_with_chat_template.yaml"
+)
 DATA_CONFIG_YAML_STREAMING_INPUT_OUTPUT = os.path.join(
     PREDEFINED_DATA_CONFIGS, "tokenize_and_apply_input_masking_streaming.yaml"
 )
tests/artifacts/predefined_data_configs/multi_turn_data_with_chat_template.yaml

Lines changed: 20 additions & 0 deletions

@@ -0,0 +1,20 @@
+dataprocessor:
+  type: default
+  chat_template: |
+    {% for message in messages['messages'] %}
+    {% if message['role'] == 'user' %}{{ '<|user|>\n' + message['content'] + eos_token }}
+    {% elif message['role'] == 'system' %}{{ '<|system|>\n' + message['content'] + eos_token }}
+    {% elif message['role'] == 'assistant' %}{{ '<|assistant|>\n' + message['content'] + eos_token }}
+    {% endif %}
+    {% if loop.last and add_generation_prompt %}{{ '<|assistant|>' }}
+    {% endif %}
+    {% endfor %}
+datasets:
+  - name: dataset_1
+    data_paths:
+      - "FILE_PATH"
+    data_handlers:
+      - name: apply_tokenizer_chat_template
+        arguments:
+          fn_kwargs:
+            dataset_text_field: formatted_chat_data
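
To show what this committed template is expected to produce, here is an approximate stand-alone render using plain `jinja2` (assumed to be installed); the sample record is made up, and the template is collapsed onto single lines, whereas in the library the template is set on the tokenizer and applied through the `apply_tokenizer_chat_template` handler.

```python
# Approximate rendering of the committed chat template with plain jinja2.
# This is only meant to show the expected shape of the formatted text.
from jinja2 import Template

chat_template = (
    "{% for message in messages['messages'] %}"
    "{% if message['role'] == 'user' %}{{ '<|user|>\\n' + message['content'] + eos_token }}"
    "{% elif message['role'] == 'system' %}{{ '<|system|>\\n' + message['content'] + eos_token }}"
    "{% elif message['role'] == 'assistant' %}{{ '<|assistant|>\\n' + message['content'] + eos_token }}"
    "{% endif %}"
    "{% if loop.last and add_generation_prompt %}{{ '<|assistant|>' }}{% endif %}"
    "{% endfor %}"
)

sample = {  # made-up record in the expected "messages" format
    "messages": [
        {"role": "user", "content": 'Provide two rhyming words for the word "love"'},
        {"role": "assistant", "content": "dove, glove"},
    ]
}

print(
    Template(chat_template).render(
        messages=sample, eos_token="</s>", add_generation_prompt=False
    )
)
# <|user|>
# Provide two rhyming words for the word "love"</s><|assistant|>
# dove, glove</s>
```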

tests/test_sft_trainer.py

Lines changed: 100 additions & 0 deletions

@@ -39,6 +39,7 @@
 from tests.artifacts.predefined_data_configs import (
     DATA_CONFIG_DUPLICATE_COLUMNS,
     DATA_CONFIG_MULTIPLE_DATASETS_SAMPLING_YAML,
+    DATA_CONFIG_MULTITURN_DATA_YAML,
     DATA_CONFIG_RENAME_RETAIN_COLUMNS,
     DATA_CONFIG_TOKENIZE_AND_APPLY_INPUT_MASKING_YAML,
     DATA_CONFIG_YAML_STREAMING_INPUT_OUTPUT,
@@ -1041,6 +1042,35 @@ def test_run_chat_style_ft(dataset_path):
     assert 'Provide two rhyming words for the word "love"' in output_inference


+def test_run_chat_style_add_special_tokens_ft():
+    """Test to check an e2e multi turn chat training by adding special tokens via command line."""
+    with tempfile.TemporaryDirectory() as tempdir:
+
+        # sample hugging face dataset id
+        data_args = configs.DataArguments(
+            training_data_path="lhoestq/demo1",
+            data_formatter_template="### Text:{{review}} \n\n### Stars: {{star}}",
+            response_template="\n### Stars:",
+            add_special_tokens=["<|assistant|>", "<|user|>"],
+        )
+
+        train_args = copy.deepcopy(TRAIN_ARGS)
+        train_args.output_dir = tempdir
+
+        sft_trainer.train(MODEL_ARGS, data_args, train_args)
+
+        # validate the configs
+        _validate_training(tempdir)
+        checkpoint_path = _get_checkpoint_path(tempdir)
+
+        # Load the tokenizer
+        tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint_path)
+
+        # Check if all special tokens passed are in tokenizer
+        for tok in data_args.add_special_tokens:
+            assert tok in tokenizer.vocab
+
+
 @pytest.mark.parametrize(
     "datafiles, dataconfigfile",
     [
@@ -1117,6 +1147,76 @@ def test_run_chat_style_ft_using_dataconfig(datafiles, dataconfigfile):
     assert 'Provide two rhyming words for the word "love"' in output_inference


+@pytest.mark.parametrize(
+    "datafiles, dataconfigfile",
+    [
+        (
+            [CHAT_DATA_SINGLE_TURN, CHAT_DATA_MULTI_TURN, CHAT_DATA_SINGLE_TURN],
+            DATA_CONFIG_MULTITURN_DATA_YAML,
+        )
+    ],
+)
+def test_run_chat_style_ft_using_dataconfig_for_chat_template(
+    datafiles, dataconfigfile
+):
+    """Check if we can perform an e2e run with chat template
+    and multi turn chat training using data config."""
+    with tempfile.TemporaryDirectory() as tempdir:
+
+        data_args = copy.deepcopy(DATA_ARGS)
+        data_args.response_template = "<|assistant|>"
+        data_args.instruction_template = "<|user|>"
+        data_args.dataset_text_field = "new_formatted_field"
+
+        handler_kwargs = {"dataset_text_field": data_args.dataset_text_field}
+        kwargs = {
+            "fn_kwargs": handler_kwargs,
+            "batched": False,
+            "remove_columns": "all",
+        }
+
+        handler_config = DataHandlerConfig(
+            name="apply_tokenizer_chat_template", arguments=kwargs
+        )
+
+        model_args = copy.deepcopy(MODEL_ARGS)
+        model_args.tokenizer_name_or_path = CUSTOM_TOKENIZER_TINYLLAMA
+
+        train_args = copy.deepcopy(TRAIN_ARGS)
+        train_args.output_dir = tempdir
+
+        with tempfile.NamedTemporaryFile(
+            "w", delete=False, suffix=".yaml"
+        ) as temp_yaml_file:
+            with open(dataconfigfile, "r", encoding="utf-8") as f:
+                data = yaml.safe_load(f)
+                datasets = data["datasets"]
+                for i, d in enumerate(datasets):
+                    d["data_paths"] = [datafiles[i]]
+                    # Basic chat datasets don't need data handling
+                    d["data_handlers"] = [asdict(handler_config)]
+                yaml.dump(data, temp_yaml_file)
+            data_args.data_config_path = temp_yaml_file.name
+
+        sft_trainer.train(model_args, data_args, train_args)
+
+        # validate the configs
+        _validate_training(tempdir)
+        checkpoint_path = _get_checkpoint_path(tempdir)
+
+        # Load the model
+        loaded_model = TunedCausalLM.load(checkpoint_path, MODEL_NAME)
+
+        # Run inference on the text
+        output_inference = loaded_model.run(
+            '<|user|>\nProvide two rhyming words for the word "love"\n\
+<nopace></s><|assistant|>',
+            max_new_tokens=50,
+        )
+        assert len(output_inference) > 0
+        assert 'Provide two rhyming words for the word "love"' in output_inference
+
+
 @pytest.mark.parametrize(
     "data_args",
     [

tuning/config/configs.py

Lines changed: 8 additions & 0 deletions

@@ -122,6 +122,14 @@ class DataArguments:
             Passed in conjunction with response_template"
         },
     )
+    add_special_tokens: List[str] = field(
+        default=None,
+        metadata={
+            "help": "List of special tokens to be added to the tokenizer's vocabulary. \
+                The tokens are added as new tokens, \
+                which increases the vocabulary and the model embedding size."
+        },
+    )


 @dataclass
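
As a hedged sketch of how this new field might be filled from the command line, assuming the remaining `DataArguments` fields keep their defaults and using a placeholder data path, `HfArgumentParser` turns the space-separated values after `--add_special_tokens` into the `List[str]` field:

```python
# Sketch only: parse the new flag into DataArguments via HfArgumentParser.
from transformers import HfArgumentParser

from tuning.config import configs

parser = HfArgumentParser(configs.DataArguments)
(data_args,) = parser.parse_args_into_dataclasses(
    args=[
        "--training_data_path", "chat_data.jsonl",       # placeholder path
        "--add_special_tokens", "<|user|>", "<|assistant|>",
    ]
)
print(data_args.add_special_tokens)  # ['<|user|>', '<|assistant|>']
```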

tuning/data/data_config.py

Lines changed: 5 additions & 0 deletions

@@ -48,6 +48,7 @@ class DataPreProcessorConfig:
     # Default seed is not none to ensure reproducability
     sampling_seed: Optional[float] = 42
     streaming: Optional[bool] = False
+    chat_template: Optional[str] = None


 @dataclass
@@ -147,6 +148,10 @@ def _validate_dataprocessor_config(dataprocessor_config) -> DataPreProcessorConfig:
         streaming = kwargs["streaming"]
         assert isinstance(streaming, bool), f"streaming: {streaming} should be a bool"
         c.streaming = streaming
+    if "chat_template" in kwargs:
+        chat_template = kwargs["chat_template"]
+        assert isinstance(chat_template, str), "chat_template should be a string"
+        c.chat_template = chat_template
     return c
tuning/data/setup_dataprocessor.py

Lines changed: 10 additions & 0 deletions

@@ -75,6 +75,16 @@ def _process_dataconfig_file(
         tokenizer=tokenizer,
         additional_data_handlers=additional_data_handlers,
     )
+
+    if processor.processor_config.chat_template is not None:
+        if tokenizer.chat_template:
+            logger.warning(
+                "replacing existing chat_template %s with data config's chat_template %s",
+                tokenizer.chat_template,
+                processor.processor_config.chat_template,
+            )
+        tokenizer.chat_template = processor.processor_config.chat_template
+
     if processor.processor_config.streaming:
         if train_args.max_steps < 1:
             logging.error(
tuning/sft_trainer.py

Lines changed: 11 additions & 0 deletions

@@ -250,6 +250,10 @@ def train(
         )

     if data_args.chat_template:
+        # TODO: passing "\n" through the CLI causes parsing issues,
+        # hence providing a temporary fix
+        data_args.chat_template = data_args.chat_template.replace(r"\n", "\n")
+
         logger.info("adding chat_template to the tokenizer")
         if tokenizer.chat_template:
             logger.warning(
@@ -297,6 +301,13 @@ def train(
         tokenizer.eos_token = configs.DEFAULT_EOS_TOKEN
         special_tokens_dict["eos_token"] = configs.DEFAULT_EOS_TOKEN

+    # adds user specified special tokens to vocab
+    if data_args.add_special_tokens:
+        logger.info(
+            "Adding user-defined special tokens: %s ", data_args.add_special_tokens
+        )
+        special_tokens_dict["additional_special_tokens"] = data_args.add_special_tokens
+
     # TODO: lower priority but understand if resizing impacts inference quality and why its needed.
     # It makes sense if we manipulate tokenizer that we also save it and provide it to inference.
     added_tokens_dict = tokenizer_and_embedding_resize(
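
A tiny stand-alone illustration, using made-up values, of why the `replace(r"\n", "\n")` fix above is needed when a chat template is passed through the shell:

```python
# A template typed on the shell usually reaches argparse as the two characters
# backslash + "n", not as a real newline; the replace() in train() converts it.
cli_value = r"<|user|>\n{{ content }}"        # as received from the command line
fixed_value = cli_value.replace(r"\n", "\n")  # after the fix in train()

assert "\n" not in cli_value   # no real newline yet
assert "\n" in fixed_value     # real newline after the replacement
```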
