Commit 5246ed2: Merge remote-tracking branch 'upstream/main'

2 parents: b2cd930 + 8d4ba0b
12 files changed (+486 / -112 lines)


docs/advanced-data-preprocessing.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -9,7 +9,7 @@ These things are supported via what we call a [`data_config`](#data-config) whic
 
 ## Data Config
 
-Data config is a configuration file which `sft_trainer.py` supports as an argument via `--data_config` flag. In this
+Data config is a configuration file which `sft_trainer.py` supports as an argument via `--data_config_path` flag. In this
 configuration users can describe multiple datasets, configurations on how to load the datasets and configuration on how to
 process the datasets. Users can currently pass both YAML or JSON based configuration files as data_configs.
 
```
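
Since the flag rename touches every example below, here is a minimal, hypothetical sketch of building such a config programmatically. The schema (`datasets`, `data_paths`, `data_handlers`) follows the predefined config added later in this commit; the file names and handler arguments are illustrative placeholders, not values from the repository.

```python
# Hypothetical sketch: assemble a minimal data_config and write it to YAML.
# "my_dataset.jsonl" and "my_data_config.yaml" are illustrative placeholders.
import yaml

data_config = {
    "datasets": [
        {
            "name": "dataset_1",
            "data_paths": ["my_dataset.jsonl"],
            "data_handlers": [
                {
                    "name": "tokenize_and_apply_chat_template_with_masking",
                    "arguments": {
                        "remove_columns": "all",
                        "fn_kwargs": {
                            "max_seq_length": 1024,
                            "conversation_column": "messages",
                        },
                    },
                }
            ],
        }
    ]
}

with open("my_data_config.yaml", "w", encoding="utf-8") as f:
    yaml.safe_dump(data_config, f, sort_keys=False)

# The trainer would then be pointed at the file via the renamed flag, e.g.:
#   python -m tuning.sft_trainer --data_config_path my_data_config.yaml ...
```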

docs/ept.md

Lines changed: 3 additions & 3 deletions
````diff
@@ -43,7 +43,7 @@ datasets:
 And the commandline passed to the library should include following.
 
 ```
---data_config <path to the data config> --packing=True --max_seq_len 8192
+--data_config_path <path to the data config> --packing=True --max_seq_len 8192
 ```
 
 Please note that for non tokenized dataset our code adds `EOS_TOKEN` to the lines, for e.g. `Tweet` column before passing that as a dataset.
@@ -102,7 +102,7 @@ NOTE: More in-depth documentation of `sampling_stopping_strategy` and how to spe
 Here also the command line arguments would be
 
 ```
---data_config <path to the data config> --packing=True --max_seq_len 8192
+--data_config_path <path to the data config> --packing=True --max_seq_len 8192
 ```
 
 The code again would add `EOS_TOKEN` to the non tokenized data before using it and also note that the `dataset_text_field` is assumed to be same across all datasets for now.
@@ -131,7 +131,7 @@ datasets:
 The command-line arguments passed to the library should include the following:
 
 ```
---data_config <path to the data config> --packing=True --max_seq_len 8192 --max_steps <num training steps>
+--data_config_path <path to the data config> --packing=True --max_seq_len 8192 --max_steps <num training steps>
 ```
 
 Please note when using streaming, user must pass `max_steps` instead of `num_train_epochs`. See advanced data preprocessing [document](./advanced-data-preprocessing.md#data-streaming) for more info.
````
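
The EOS-handling note in these docs can be made concrete with a short sketch. This is only an illustration of the documented behavior, not the library's implementation; the model name is an assumption and the `Tweet` column mirrors the example in the doc.

```python
# Illustrative sketch: append the tokenizer's EOS token to each row of a text
# column, mirroring what the docs say happens for non-tokenized datasets.
from datasets import Dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("ibm-granite/granite-3.1-8b-base")
ds = Dataset.from_dict({"Tweet": ["my flight was delayed", "great service today"]})

ds = ds.map(lambda row: {"Tweet": row["Tweet"] + tokenizer.eos_token})
print(ds[0]["Tweet"])  # original text followed by the EOS token
```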

scripts/offline_data_processing.py

Lines changed: 5 additions & 37 deletions
```diff
@@ -5,20 +5,15 @@
 import traceback
 
 # Third Party
-from transformers import (
-    AutoTokenizer,
-    GPT2Tokenizer,
-    GPTNeoXTokenizerFast,
-    LlamaTokenizer,
-    LlamaTokenizerFast,
-)
+from transformers import AutoTokenizer
 
 # Local
 from tuning.config import configs
 from tuning.data.setup_dataprocessor import process_dataargs
 from tuning.sft_trainer import get_parser
 from tuning.utils.error_logging import USER_ERROR_EXIT_CODE, write_termination_log
 from tuning.utils.logging import set_log_level
+from tuning.utils.tokenizer_data_utils import get_special_tokens_dict
 
 
 def save_dataset_shards(
@@ -92,36 +87,9 @@ def get_processed_dataset(
         tokenizer.chat_template = data_args.chat_template
 
     # Prepare special tokens dictionary
-    special_tokens_dict = {}
-    if not model_args.tokenizer_name_or_path:
-        if isinstance(tokenizer, (LlamaTokenizer, LlamaTokenizerFast)):
-            special_tokens_dict["bos_token"] = "<s>"
-            special_tokens_dict["eos_token"] = "</s>"
-            special_tokens_dict["unk_token"] = "<unk>"
-            special_tokens_dict["pad_token"] = "<pad>"
-        elif isinstance(tokenizer, (GPT2Tokenizer, GPTNeoXTokenizerFast)):
-            special_tokens_dict["pad_token"] = "<pad>"
-
-        if tokenizer.pad_token is None:
-            logger.warning(
-                "PAD token not found in tokenizer; setting PAD token to default."
-            )
-            special_tokens_dict["pad_token"] = configs.DEFAULT_PAD_TOKEN
-        if tokenizer.eos_token is None:
-            logger.warning(
-                "EOS token not found in tokenizer; setting EOS token to default."
-            )
-            special_tokens_dict["eos_token"] = configs.DEFAULT_EOS_TOKEN
-        if tokenizer.pad_token == tokenizer.eos_token:
-            logger.warning(
-                "PAD token and EOS token are the same. Overriding accordingly."
-            )
-            if tokenizer.eos_token != configs.DEFAULT_PAD_TOKEN:
-                tokenizer.pad_token = configs.DEFAULT_PAD_TOKEN
-                special_tokens_dict["pad_token"] = configs.DEFAULT_PAD_TOKEN
-            else:
-                tokenizer.eos_token = configs.DEFAULT_EOS_TOKEN
-                special_tokens_dict["eos_token"] = configs.DEFAULT_EOS_TOKEN
+    special_tokens_dict = get_special_tokens_dict(
+        tokenizer_name_or_path=model_args.tokenizer_name_or_path, tokenizer=tokenizer
+    )
 
     # adds user specified special tokens to vocab
     if data_args.add_special_tokens:
```
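
The deleted block above fully spells out the logic that moved into `get_special_tokens_dict`, so the extracted helper can be sketched with some confidence. Still, this is a reconstruction from the removed inline code, not a copy of `tuning/utils/tokenizer_data_utils.py`, and details such as the warning logs may differ.

```python
# Reconstructed sketch of the extracted helper, based on the inline logic
# deleted above; the real implementation lives in tuning/utils/tokenizer_data_utils.py.
from transformers import (
    GPT2Tokenizer,
    GPTNeoXTokenizerFast,
    LlamaTokenizer,
    LlamaTokenizerFast,
)

from tuning.config import configs


def get_special_tokens_dict(tokenizer_name_or_path, tokenizer):
    """Return the special tokens to add when no explicit tokenizer path is given."""
    special_tokens_dict = {}
    if not tokenizer_name_or_path:
        # Known tokenizer families get their conventional special tokens.
        if isinstance(tokenizer, (LlamaTokenizer, LlamaTokenizerFast)):
            special_tokens_dict["bos_token"] = "<s>"
            special_tokens_dict["eos_token"] = "</s>"
            special_tokens_dict["unk_token"] = "<unk>"
            special_tokens_dict["pad_token"] = "<pad>"
        elif isinstance(tokenizer, (GPT2Tokenizer, GPTNeoXTokenizerFast)):
            special_tokens_dict["pad_token"] = "<pad>"

        # Fall back to the library defaults for missing PAD / EOS tokens.
        if tokenizer.pad_token is None:
            special_tokens_dict["pad_token"] = configs.DEFAULT_PAD_TOKEN
        if tokenizer.eos_token is None:
            special_tokens_dict["eos_token"] = configs.DEFAULT_EOS_TOKEN

        # PAD and EOS must stay distinct so padding is never trained as EOS.
        if tokenizer.pad_token == tokenizer.eos_token:
            if tokenizer.eos_token != configs.DEFAULT_PAD_TOKEN:
                tokenizer.pad_token = configs.DEFAULT_PAD_TOKEN
                special_tokens_dict["pad_token"] = configs.DEFAULT_PAD_TOKEN
            else:
                tokenizer.eos_token = configs.DEFAULT_EOS_TOKEN
                special_tokens_dict["eos_token"] = configs.DEFAULT_EOS_TOKEN
    return special_tokens_dict
```

Extracting the block also lets the unit tests added later in this commit exercise the token logic directly, without going through the full offline processing script.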

tests/artifacts/predefined_data_configs/__init__.py

Lines changed: 3 additions & 0 deletions
```diff
@@ -40,6 +40,9 @@
 DATA_CONFIG_MULTITURN_GRANITE_3_1B_DATA_YAML = os.path.join(
     PREDEFINED_DATA_CONFIGS, "multi_turn_data_with_chat_template_granite_3_1B.yaml"
 )
+DATA_CONFIG_MULTITURN_CHAT_TOKENIZE_AND_MASKING_DATA_HANDLER = os.path.join(
+    PREDEFINED_DATA_CONFIGS, "mt_data_granite_3_1B_tokenize_and_mask_handler.yaml"
+)
 DATA_CONFIG_YAML_STREAMING_INPUT_OUTPUT = os.path.join(
     PREDEFINED_DATA_CONFIGS, "tokenize_and_apply_input_masking_streaming.yaml"
 )
```
tests/artifacts/predefined_data_configs/mt_data_granite_3_1B_tokenize_and_mask_handler.yaml

Lines changed: 83 additions & 0 deletions (new file)

```yaml
dataprocessor:
  type: default
  chat_template: |
    {%- if messages[0]['role'] == 'system' %}
    {%- set system_message = messages[0]['content'] %}
    {%- set loop_messages = messages[1:] %}
    {%- else %}
    {%- set system_message = "Knowledge Cutoff Date: April 2024.\nToday's Date: " + strftime_now('%B %d, %Y') + ".\nYou are Granite, developed by IBM." %}
    {%- if tools and documents %}
    {%- set system_message = system_message + " You are a helpful AI assistant with access to the following tools. When a tool is required to answer the user's query, respond with <|tool_call|> followed by a JSON list of tools used. If a tool does not exist in the provided list of tools, notify the user that you do not have the ability to fulfill the request.\n\nWrite the response to the user's input by strictly aligning with the facts in the provided documents. If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data." %}
    {%- elif tools %}
    {%- set system_message = system_message + " You are a helpful AI assistant with access to the following tools. When a tool is required to answer the user's query, respond with <|tool_call|> followed by a JSON list of tools used. If a tool does not exist in the provided list of tools, notify the user that you do not have the ability to fulfill the request." %}
    {%- elif documents %}
    {%- set system_message = system_message + " Write the response to the user's input by strictly aligning with the facts in the provided documents. If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data." %}
    {%- else %}
    {%- set system_message = system_message + " You are a helpful AI assistant." %}
    {%- endif %}
    {%- if 'citations' in controls and documents %}
    {%- set system_message = system_message + '\n\nIn your response, use the symbols <co> and </co> to indicate when a fact comes from a document in the search result, e.g <co>0</co> for a fact from document 0. Afterwards, list all the citations with their corresponding documents in an ordered list.' %}
    {%- endif %}
    {%- if 'hallucinations' in controls and documents %}
    {%- set system_message = system_message + '\n\nFinally, after the response is written, include a numbered list of sentences from the response that are potentially hallucinated and not based in the documents.' %}
    {%- endif %}
    {%- set loop_messages = messages %}
    {%- endif %}
    {{- '<|start_of_role|>system<|end_of_role|>' + system_message + '<|end_of_text|>\n' }}
    {%- if tools %}
    {{- '<|start_of_role|>tools<|end_of_role|>' }}
    {{- tools | tojson(indent=4) }}
    {{- '<|end_of_text|>\n' }}
    {%- endif %}
    {%- if documents %}
    {{- '<|start_of_role|>documents<|end_of_role|>' }}
    {%- for document in documents %}
    {{- 'Document ' + loop.index0 | string + '\n' }}
    {{- document['text'] }}
    {%- if not loop.last %}
    {{- '\n\n'}}
    {%- endif%}
    {%- endfor %}
    {{- '<|end_of_text|>\n' }}
    {%- endif %}
    {%- for message in loop_messages %}
    {{- '<|start_of_role|>' + message['role'] + '<|end_of_role|>' + message['content'] + '<|end_of_text|>\n' }}
    {%- if loop.last and add_generation_prompt %}
    {{- '<|start_of_role|>assistant' }}
    {%- if controls %}
    {{- ' ' + controls | tojson()}}
    {%- endif %}
    {{- '<|end_of_role|>' }}
    {%- endif %}
    {%- endfor %}
datasets:
  - name: dataset_1
    data_paths:
      - "FILE_PATH"
    data_handlers:
      - name: tokenize_and_apply_chat_template_with_masking
        arguments:
          remove_columns: all
          fn_kwargs:
            max_seq_length: 1024
            conversation_column: "messages"
  - name: dataset_2
    data_paths:
      - "FILE_PATH"
    data_handlers:
      - name: tokenize_and_apply_chat_template_with_masking
        arguments:
          remove_columns: all
          fn_kwargs:
            max_seq_length: 1024
            conversation_column: "messages"
  - name: dataset_3
    data_paths:
      - "FILE_PATH"
    data_handlers:
      - name: tokenize_and_apply_chat_template_with_masking
        arguments:
          remove_columns: all
          fn_kwargs:
            max_seq_length: 1024
            conversation_column: "messages"
```
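
A quick sanity check before handing such a config to the trainer is to load it and swap real paths in for the `FILE_PATH` placeholders. Below is a minimal sketch assuming only PyYAML; "my_chats.jsonl" is a stand-in, not a file in the repository.

```python
# Sketch: load the predefined config and replace the FILE_PATH placeholders.
import yaml

with open("mt_data_granite_3_1B_tokenize_and_mask_handler.yaml", encoding="utf-8") as f:
    config = yaml.safe_load(f)

for dataset in config["datasets"]:
    dataset["data_paths"] = [
        "my_chats.jsonl" if path == "FILE_PATH" else path
        for path in dataset["data_paths"]
    ]

# Every dataset in this config uses the same handler and kwargs.
handler = config["datasets"][0]["data_handlers"][0]
assert handler["name"] == "tokenize_and_apply_chat_template_with_masking"
assert handler["arguments"]["fn_kwargs"]["max_seq_length"] == 1024
```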

tests/test_sft_trainer.py

Lines changed: 10 additions & 1 deletion
```diff
@@ -39,6 +39,7 @@
 from tests.artifacts.predefined_data_configs import (
     DATA_CONFIG_DUPLICATE_COLUMNS,
     DATA_CONFIG_MULTIPLE_DATASETS_SAMPLING_YAML,
+    DATA_CONFIG_MULTITURN_CHAT_TOKENIZE_AND_MASKING_DATA_HANDLER,
     DATA_CONFIG_MULTITURN_DATA_YAML,
     DATA_CONFIG_MULTITURN_GRANITE_3_1B_DATA_YAML,
     DATA_CONFIG_RENAME_RETAIN_COLUMNS,
@@ -1258,6 +1259,14 @@ def test_run_chat_style_ft_using_dataconfig(datafiles, dataconfigfile):
             ],
             DATA_CONFIG_MULTITURN_GRANITE_3_1B_DATA_YAML,
         ),
+        (
+            [
+                CHAT_DATA_MULTI_TURN_GRANITE_3_1B,
+                CHAT_DATA_MULTI_TURN_GRANITE_3_1B,
+                CHAT_DATA_MULTI_TURN_GRANITE_3_1B,
+            ],
+            DATA_CONFIG_MULTITURN_CHAT_TOKENIZE_AND_MASKING_DATA_HANDLER,
+        ),
     ],
 )
 def test_run_chat_style_ft_using_dataconfig_for_chat_template(
@@ -1768,7 +1777,7 @@ def test_pretokenized_dataset_bad_args(dataset_text_field, response_template):
     data_args = copy.deepcopy(DATA_ARGS)
     data_args.dataset_text_field = dataset_text_field
     data_args.response_template = response_template
-    data_args.training_data_path = TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_JSONL
+    data_args.training_data_path = TWITTER_COMPLAINTS_TOKENIZED_JSON
     # We should raise an error since we should not have a dataset text
     # field or a response template if we have pretokenized data
     with pytest.raises(ValueError):
```
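
The comment in the last hunk states the invariant this test exercises: pretokenized data may carry neither a `dataset_text_field` nor a `response_template` (the fix points the test at an actually tokenized dataset). A hypothetical sketch of such a check follows; the real validation sits in the library's data-argument processing and is likely structured differently.

```python
# Hypothetical sketch of the invariant exercised by the fixed test above.
def validate_pretokenized_args(is_pretokenized, dataset_text_field, response_template):
    if is_pretokenized and (dataset_text_field or response_template):
        raise ValueError(
            "A dataset_text_field or response_template must not be provided "
            "together with a pretokenized dataset."
        )
```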
tests/utils/test_tokenizer_data_utils.py

Lines changed: 150 additions & 6 deletions

```diff
@@ -1,20 +1,164 @@
-# Third party
 # Third Party
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 # First Party
 from tests.artifacts.testdata import MODEL_NAME
 
 # Local
-# First party
-from tuning.utils.tokenizer_data_utils import tokenizer_and_embedding_resize
+from tuning.config import configs
+from tuning.utils.tokenizer_data_utils import (
+    get_special_tokens_dict,
+    tokenizer_and_embedding_resize,
+)
 
 
-def test_tokenizer_and_embedding_resize_return_values():
-    """Test to ensure number of added tokens are returned correctly"""
+def test_setting_special_tokens_with_LlamaTokenizerFast():
+    """
+    Unit test using a LlamaTokenizerFast tokenizer. This tokenizer is only missing a
+    PAD token; however, because it is a Llama tokenizer, the function automatically
+    adds the BOS, EOS, UNK and PAD tokens to the special tokens dict. The <pad> token
+    is then replaced with a <PAD> token, because the Llama tokenizer does not have a
+    pad token specified.
+    """
+    tokenizer = AutoTokenizer.from_pretrained("Maykeye/TinyLLama-v0", legacy=True)
+    model_args = configs.ModelArguments()
+    special_tokens_dict = get_special_tokens_dict(
+        tokenizer_name_or_path=model_args.tokenizer_name_or_path, tokenizer=tokenizer
+    )
+    assert special_tokens_dict == {
+        "bos_token": "<s>",
+        "eos_token": "</s>",
+        "unk_token": "<unk>",
+        "pad_token": "<PAD>",
+    }
+
+
+def test_setting_special_tokens_with_GPT2TokenizerFast():
+    """
+    Unit test using a GPT2TokenizerFast tokenizer. This tokenizer is the case where
+    the EOS token equals the PAD token; both are <|endoftext|>. The pad token in the
+    tokenizer is therefore set to <PAD>, and "pad_token": "<PAD>" is also added to
+    the special tokens dict.
+    """
+    tokenizer = AutoTokenizer.from_pretrained("ibm-granite/granite-3.1-8b-base")
+    model_args = configs.ModelArguments()
+    special_tokens_dict = get_special_tokens_dict(
+        tokenizer_name_or_path=model_args.tokenizer_name_or_path, tokenizer=tokenizer
+    )
+    assert special_tokens_dict == {
+        "pad_token": "<PAD>",
+    }
+
+
+def test_setting_special_tokens_with_GPTNeoXTokenizerFast():
+    """
+    Unit test using a GPTNeoXTokenizerFast tokenizer. This tokenizer is another one
+    that is hardcoded into the function to automatically add just a pad token to the
+    special tokens dict. However, the tokenizer itself is also missing a pad token,
+    so the function then replaces the <pad> token with the default <PAD> token.
+    """
+    tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
+    model_args = configs.ModelArguments()
+    special_tokens_dict = get_special_tokens_dict(
+        tokenizer_name_or_path=model_args.tokenizer_name_or_path, tokenizer=tokenizer
+    )
+    assert special_tokens_dict == {
+        "pad_token": "<PAD>",
+    }
+
+
+def test_setting_special_tokens_when_missing_all_special_tokens():
+    """
+    Unit test using the GPT2TokenizerFast tokenizer. All the special tokens have been
+    removed from the tokenizer, so we expect all of them to appear in the special
+    tokens dict.
+    """
+    tokenizer = AutoTokenizer.from_pretrained("ibm-granite/granite-3.1-8b-base")
+
+    # Set all special tokens to None
+    tokenizer.bos_token = None
+    tokenizer.eos_token = None
+    tokenizer.unk_token = None
+    tokenizer.pad_token = None
+
+    model_args = configs.ModelArguments()
+    special_tokens_dict = get_special_tokens_dict(
+        tokenizer_name_or_path=model_args.tokenizer_name_or_path, tokenizer=tokenizer
+    )
+    assert special_tokens_dict == {
+        "pad_token": "<PAD>",
+        "eos_token": "</s>",
+        "bos_token": "<s>",
+        "unk_token": "<unk>",
+    }
+
+
+def test_setting_special_tokens_when_path_is_not_none():
+    """
+    A simple unit test that sets the `tokenizer_name_or_path` argument in
+    `model_args` to a non-None value. Since the argument is not None, almost
+    the entire `get_special_tokens_dict` function is skipped and the
+    special tokens dict is expected to be empty.
+    """
+    tokenizer = AutoTokenizer.from_pretrained("Maykeye/TinyLLama-v0", legacy=True)
+    model_args = configs.ModelArguments(tokenizer_name_or_path="test_path")
+    special_tokens_dict = get_special_tokens_dict(
+        tokenizer_name_or_path=model_args.tokenizer_name_or_path, tokenizer=tokenizer
+    )
+    # Assert special_tokens_dict is empty
+    assert not special_tokens_dict
+
+
+def test_tokenizer_and_embedding_resize_return_values_missing_one_token():
+    """
+    Tests the resizing function when the special tokens dict contains a PAD token,
+    which means the tokenizer is missing one special token.
+
+    `multiple_of` is left at its default of 1.
+    """
     special_tokens_dict = {"pad_token": "<pad>"}
     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
     model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
     metadata = tokenizer_and_embedding_resize(special_tokens_dict, tokenizer, model)
     assert metadata["num_new_tokens"] == 1
-    assert "new_embedding_size" in metadata
+    assert metadata["new_embedding_size"] == len(tokenizer)
+
+
+def test_tokenizer_and_embedding_resize_return_values_missing_four_tokens():
+    """
+    Tests the resizing when the special tokens dict contains a PAD, EOS, BOS and UNK
+    token, which means the tokenizer is missing four special tokens.
+
+    `multiple_of` is left at its default of 1.
+    """
+    special_tokens_dict = {
+        "pad_token": "<PAD>",
+        "eos_token": "</s>",
+        "bos_token": "<s>",
+        "unk_token": "<unk>",
+    }
+    tokenizer = AutoTokenizer.from_pretrained("Maykeye/TinyLLama-v0", legacy=True)
+    model = AutoModelForCausalLM.from_pretrained("Maykeye/TinyLLama-v0")
+    metadata = tokenizer_and_embedding_resize(special_tokens_dict, tokenizer, model)
+    assert metadata["num_new_tokens"] == 4
+    assert metadata["new_embedding_size"] == len(tokenizer)
+
+
+def test_tokenizer_and_embedding_resize_return_values_multiple_of_two():
+    """
+    Tests the resizing when the special tokens dict contains a PAD, EOS, BOS and UNK
+    token, which means the tokenizer is missing four special tokens.
+
+    `multiple_of` is set to 2; this adds one to the count of num_new_tokens and one
+    to the count of new_embedding_size.
+    """
+    special_tokens_dict = {
+        "pad_token": "<PAD>",
+        "eos_token": "</s>",
+        "bos_token": "<s>",
+        "unk_token": "<unk>",
+    }
+    tokenizer = AutoTokenizer.from_pretrained("Maykeye/TinyLLama-v0", legacy=True)
+    model = AutoModelForCausalLM.from_pretrained("Maykeye/TinyLLama-v0")
+    metadata = tokenizer_and_embedding_resize(
+        special_tokens_dict, tokenizer, model, multiple_of=2
+    )
+    assert metadata["num_new_tokens"] == 5
+    assert metadata["new_embedding_size"] == len(tokenizer) + 1
```
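
The last two assertions pin down the `multiple_of` arithmetic: the embedding table is rounded up to the next multiple, and any padding rows are counted in `num_new_tokens`. Below is a sketch of that arithmetic as the tests imply it; the concrete vocabulary size is hypothetical, chosen so the rounding kicks in, and this is not the library's implementation.

```python
# Sketch of the rounding arithmetic implied by the tests; not the library's code.
import math


def resize_counts(vocab_size_before, num_added, multiple_of=1):
    """Compute the counts tokenizer_and_embedding_resize appears to report."""
    resized = vocab_size_before + num_added
    target = math.ceil(resized / multiple_of) * multiple_of
    padding_rows = target - resized  # extra rows added purely for alignment
    return {
        "num_new_tokens": num_added + padding_rows,
        "new_embedding_size": target,
    }


# With an odd post-add vocabulary and multiple_of=2, one padding row appears:
# num_new_tokens == 5 and new_embedding_size == len(tokenizer) + 1, as asserted.
print(resize_counts(vocab_size_before=32001, num_added=4, multiple_of=2))
```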
