Commit f44b370

Add multi turn chat support.
Signed-off-by: Dushyant Behl <dushyantbehl@in.ibm.com>
1 parent 4168c87 commit f44b370

File tree

6 files changed: +67 -19 lines changed

.pylintrc
tests/data/test_data_preprocessing_utils.py
tuning/config/configs.py
tuning/data/data_preprocessing_utils.py
tuning/data/setup_dataprocessor.py
tuning/sft_trainer.py

.pylintrc

Lines changed: 2 additions & 2 deletions
@@ -280,8 +280,8 @@ ignored-parents=
 # Maximum number of arguments for function / method.
 max-args=5
 
-# Maximum number of attributes for a class (see R0902).
-max-attributes=7
+# Maximum number of attributes for a class (custom).
+max-attributes=10
 
 # Maximum number of boolean expressions in an if statement (see R0916).
 max-bool-expr=5

tests/data/test_data_preprocessing_utils.py

Lines changed: 3 additions & 2 deletions
@@ -320,10 +320,11 @@ def test_get_data_collator(
     """Ensure that the correct collator type is fetched based on the data args"""
     collator = get_data_collator(
         packing,
-        response_template,
         AutoTokenizer.from_pretrained(MODEL_NAME),
-        is_pretokenized_dataset(formatted_train_dataset),
         max_seq_length,
+        response_template,
+        None,
+        is_pretokenized_dataset(formatted_train_dataset),
     )
     assert isinstance(collator, expected_collator)
 

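For reference, the updated positional order above is (packing, tokenizer, max_seq_length, response_template, instruction_template, is_traindata_tokenized). A hedged sketch of an extra check one could add for the new chat path, not part of this commit (MODEL_NAME is the constant already defined in this test module, and the template strings are illustrative placeholders):

    from transformers import AutoTokenizer
    from trl import DataCollatorForCompletionOnlyLM

    from tuning.data.data_preprocessing_utils import get_data_collator

    def test_get_data_collator_for_chat():
        collator = get_data_collator(
            False,                                   # packing
            AutoTokenizer.from_pretrained(MODEL_NAME),
            1024,                                    # max_seq_length
            "### Assistant:",                        # response_template
            "### Human:",                            # instruction_template (new in this commit)
            False,                                   # is_traindata_tokenized
        )
        assert isinstance(collator, DataCollatorForCompletionOnlyLM)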
tuning/config/configs.py

Lines changed: 15 additions & 0 deletions
@@ -102,6 +102,21 @@ class DataArguments:
             Supports both JSON and YAML based config files."
         },
     )
+    chat_template: str = field(
+        default=None,
+        metadata={
+            "help": "chat template to use for tokenization. \
+                No need to pass this if the tokenizer already has a chat_template \
+                if passed, it will overwrite tokenizer.chat_template if it exists"
+        },
+    )
+    instruction_template: str = field(
+        default=None,
+        metadata={
+            "help": "Should be provided for chat training. \
+                Piece of text that determines the start of human response"
+        },
+    )
 
 
 @dataclass

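Both new fields default to None, so existing configurations are unaffected; they only matter for chat-style training. A minimal sketch of how they might be populated, assuming the usual transformers HfArgumentParser wiring (an assumption here; the flag values and template strings are illustrative, not taken from this commit):

    from transformers import HfArgumentParser

    from tuning.config.configs import DataArguments

    parser = HfArgumentParser(DataArguments)
    (data_args,) = parser.parse_args_into_dataclasses(
        args=[
            "--chat_template", "{% for m in messages %}{{ m['content'] }}{% endfor %}",
            "--instruction_template", "### Human:",
            "--response_template", "### Assistant:",
        ]
    )
    # With both templates present, the chat branches added below
    # (data_preprocessing_utils.py and setup_dataprocessor.py) are taken.
    assert data_args.instruction_template and data_args.response_template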
tuning/data/data_preprocessing_utils.py

Lines changed: 17 additions & 2 deletions
@@ -24,10 +24,11 @@
 
 def get_data_collator(
     packing: bool,
-    response_template: Optional[str],
     tokenizer: AutoTokenizer,
-    is_traindata_tokenized: bool,
     max_seq_length: int,
+    response_template: Optional[str],
+    instruction_template: Optional[str],
+    is_traindata_tokenized: bool,
 ) -> Callable:
     """Create and return the the appropriate collator type based on the configuration for packing,
     response_template, and dataset_text_field.
@@ -49,6 +50,20 @@ def get_data_collator(
         Callable collator to be leveraged by the trainer.
     """
 
+    if response_template and instruction_template:
+        # response_template_ids = tokenizer.encode(
+        #     response_template, add_special_tokens=False
+        # )[2:]
+        # intruction_template_ids = tokenizer.encode(
+        #     instruction_template, add_special_tokens=False
+        # )[2:]
+        return DataCollatorForCompletionOnlyLM(
+            response_template=response_template,
+            instruction_template=instruction_template,
+            tokenizer=tokenizer,
+            ignore_index=configs.IGNORE_INDEX,
+        )
+
     if not packing:
         # TODO: near term - how response template ids are parsed out needs to be cleaned.
         # The [2:] here applies if response template has \n prefix, it is needed to strip \n,

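When both templates are supplied, get_data_collator now delegates to TRL's DataCollatorForCompletionOnlyLM, which keeps labels only for the assistant turns and sets everything else to the ignore index, so a multi-turn conversation trains only on the model's responses. A minimal sketch of that behaviour, using the gpt2 tokenizer and the "### Human:" / "### Assistant:" markers purely as placeholders:

    from transformers import AutoTokenizer
    from trl import DataCollatorForCompletionOnlyLM

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    tokenizer.pad_token = tokenizer.eos_token

    collator = DataCollatorForCompletionOnlyLM(
        response_template="### Assistant:",
        instruction_template="### Human:",
        tokenizer=tokenizer,
    )  # ignore_index defaults to -100; the commit passes configs.IGNORE_INDEX here

    chat = (
        "### Human: What is the capital of France?\n"
        "### Assistant: Paris.\n"
        "### Human: And of Italy?\n"
        "### Assistant: Rome."
    )
    batch = collator([tokenizer(chat)])
    # Every label outside the two assistant turns is set to the ignore index,
    # so the loss covers only the responses in each turn.
    print(batch["labels"])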
tuning/data/setup_dataprocessor.py

Lines changed: 20 additions & 13 deletions
@@ -34,8 +34,8 @@
 from tuning.data.data_processors import get_datapreprocessor
 
 # In future we may make the fields configurable
-DEFAULT_JSON_INPUT_KEY = "input"
-DEFAULT_JSON_OUTPUT_KEY = "output"
+DEFAULT_INPUT_COLUMN = "input"
+DEFAULT_OUTPUT_COLUMN = "output"
 
 # check if the provided dataset is pretokenized or not
 # the check is taken from trl
@@ -145,12 +145,12 @@ def _get_dataset_formatting_handlers(data_args, packing):
     return [handler], dataset_text_field
 
 
-### Data format 3
-def _get_default_json_dataset_handlers(data_args, tokenizer_kwargs):
+### Default Data format
+def _get_default_dataset_handlers(data_args, tokenizer_kwargs):
 
     fn_kwargs = {}
-    fn_kwargs["input_field_name"] = DEFAULT_JSON_INPUT_KEY
-    fn_kwargs["output_field_name"] = DEFAULT_JSON_OUTPUT_KEY
+    fn_kwargs["input_field_name"] = DEFAULT_INPUT_COLUMN
+    fn_kwargs["output_field_name"] = DEFAULT_OUTPUT_COLUMN
     fn_kwargs["tokenizer_kwargs"] = tokenizer_kwargs
 
     kwargs = {
@@ -171,7 +171,9 @@ def _get_default_json_dataset_handlers(data_args, tokenizer_kwargs):
 # If a text field is specified, append the tokenizer's EOS token to it.
 # If a formatter template is provided, apply it and save the result.
 # Data remains un-tokenized.
-# Data Format 3: JSON Dataset with Input/Output Fields
+# Data Format 3: Chat datasets
+# User provides response_template and instruction_template.
+# Default Data Format: Dataset with Input/Output Fields
 # Combine input and output fields, tokenize the data, and apply input attention masking.
 # Requires both input and output fields; throws an error if missing.
 def _process_raw_data_args(
@@ -231,9 +233,13 @@ def _process_raw_data_args(
         handlers, dataset_text_field = _get_dataset_formatting_handlers(
             data_args, packing
         )
+    elif data_args.instruction_template and data_args.response_template:
+        # Data Format 3: Chat dataset with instruction and response template
+        # We don't do processing for chat dataset
+        handlers, dataset_text_field = [], None
     else:
-        # Data Format 3: JSON Dataset with Input/Output Fields
-        handlers, dataset_text_field = _get_default_json_dataset_handlers(
+        # Default Data Format: Dataset with Input/Output Fields
+        handlers, dataset_text_field = _get_default_dataset_handlers(
             data_args, tokenizer_kwargs
         )
 
@@ -299,13 +305,14 @@ def process_dataargs(
 
     data_collator = get_data_collator(
         train_args.packing,
-        data_args.response_template,
-        tokenizer,
+        tokenizer=tokenizer,
+        max_seq_length=max_seq_length,
+        response_template=data_args.response_template,
+        instruction_template=data_args.instruction_template,
         # Note: This check should not be removed.
         # Its important to recompute this post handling to
        # check if we already tokenized the dataset or not.
-        is_pretokenized_dataset(train_dataset),
-        max_seq_length,
+        is_traindata_tokenized=is_pretokenized_dataset(train_dataset),
    )
 
     dataset_kwargs = {}

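The practical consequence of the new elif branch is that chat data is expected to arrive already rendered as text containing the two markers; no handler reshapes it, and the label masking happens entirely in the collator. A hedged sketch of what such a record might look like (the "text" column name and the markers are assumptions for illustration, not mandated by this commit):

    from datasets import Dataset

    # Hypothetical pre-rendered chat records; with instruction_template and
    # response_template set, _process_raw_data_args applies no handlers to them.
    train_dataset = Dataset.from_list(
        [
            {
                "text": "### Human: Hello, who are you?\n"
                        "### Assistant: I am a helpful assistant.\n"
                        "### Human: What can you do?\n"
                        "### Assistant: I can answer questions."
            }
        ]
    )
    # Downstream, DataCollatorForCompletionOnlyLM (see data_preprocessing_utils.py above)
    # masks all tokens outside the "### Assistant:" turns.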
tuning/sft_trainer.py

Lines changed: 10 additions & 0 deletions
@@ -285,6 +285,16 @@ def train(
         multiple_of=model_args.embedding_size_multiple_of,
     )
 
+    if data_args.chat_template:
+        logger.info("adding chat_template to the tokenizer")
+        if tokenizer.chat_template:
+            logger.warning(
+                "replacing existing chat_template %s with the given chat_template %s",
+                tokenizer.chat_template,
+                data_args.chat_template,
+            )
+        tokenizer.chat_template = data_args.chat_template
+
     # Configure the collator and validate args related to packing prior to formatting the dataset
     data_collator = None
     logger.info("Packing is set to %s ", train_args.packing)

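Overriding tokenizer.chat_template changes how a list of chat messages is rendered into training text, for example through tokenizer.apply_chat_template. A minimal sketch of the effect; the toy Jinja template and the gpt2 tokenizer are placeholders, and a real run would pass the model's proper template via data_args.chat_template:

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")

    # Toy template standing in for whatever data_args.chat_template would supply.
    tokenizer.chat_template = (
        "{% for message in messages %}"
        "{{ '### ' + message['role'] + ': ' + message['content'] + '\n' }}"
        "{% endfor %}"
    )

    messages = [
        {"role": "user", "content": "Summarize this commit."},
        {"role": "assistant", "content": "It adds multi turn chat support."},
        {"role": "user", "content": "Which files changed?"},
    ]
    print(tokenizer.apply_chat_template(messages, tokenize=False))
    # ### user: Summarize this commit.
    # ### assistant: It adds multi turn chat support.
    # ### user: Which files changed?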