## Data Support
Users can pass training data as either a single file or a Hugging Face dataset ID using the `--training_data_path` argument, along with other arguments required for the various [use cases](#use-cases-supported-with-training_data_path-argument) (see details below). If you choose to pass a file, it can be in any of the [supported formats](#supported-data-formats). Alternatively, you can use our powerful [data preprocessing backend](./docs/advanced-data-preprocessing.md) to preprocess datasets on the fly.

Below is the list of data use cases supported via the `--training_data_path` argument. For details of our advanced data preprocessing, see [Advanced Data Preprocessing](./docs/advanced-data-preprocessing.md).

## Supported Data Formats

An EOS token is appended to the end of each data point (e.g., a sentence or paragraph within the dataset) for all data formats listed below, except for the pretokenized data format at this time. For more info, see [pretokenized](#4-pre-tokenized-datasets).

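
Conceptually, this EOS handling amounts to appending the EOS token to each text example. A minimal sketch, assuming string-level data (the `</s>` token value is a placeholder; real models define their own EOS token):

```python
# Illustrative sketch only: append an EOS token to each text example,
# skipping examples that already end with it. The "</s>" value is a
# stand-in for whatever EOS token the model's tokenizer defines.
def append_eos(examples, eos_token="</s>"):
    return [
        ex if ex.endswith(eos_token) else ex + eos_token
        for ex in examples
    ]

data = ["First training sentence.", "Second training sentence.</s>"]
print(append_eos(data))
```
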
## Supported Data File Formats

We support the following file formats via the `--training_data_path` argument:

Data Format | Tested Support
------------|---------------
ARROW       | ✅

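
For example, a minimal JSONL training file could be produced like this (the `input`/`output` field names are illustrative; the required columns depend on your chosen use case below):

```python
import json
import os
import tempfile

# Write a minimal JSONL training file. The "input"/"output" field names
# are hypothetical -- the actual columns depend on the use case.
records = [
    {"input": "What is 2+2?", "output": "4"},
    {"input": "Capital of France?", "output": "Paris"},
]
path = os.path.join(tempfile.mkdtemp(), "train.jsonl")
with open(path, "w") as f:
    for rec in records:
        f.write(json.dumps(rec) + "\n")
# The resulting path can then be passed as --training_data_path.
```
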
As noted above, we also support passing an HF dataset ID directly via the `--training_data_path` argument.

**NOTE**: Due to the variety of supported data formats and file types, `--training_data_path` is handled as follows:

- If `--training_data_path` ends in a valid file extension (e.g., `.json`, `.csv`), it is treated as a file.
- If `--training_data_path` points to a valid folder, it is treated as a folder.
- If neither of these is true, the data preprocessor tries to load `--training_data_path` as a Hugging Face (HF) dataset ID.

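
That resolution order can be sketched as follows (a simplified illustration, not the preprocessor's actual code; the set of recognized extensions is an assumption):

```python
import os

# Simplified sketch of how --training_data_path could be resolved:
# known file extension -> file, existing folder -> folder,
# otherwise fall back to treating it as an HF dataset ID.
KNOWN_EXTENSIONS = {".json", ".jsonl", ".csv", ".parquet", ".arrow"}  # assumed list

def resolve_training_data_path(path):
    ext = os.path.splitext(path)[1].lower()
    if ext in KNOWN_EXTENSIONS:
        return "file"
    if os.path.isdir(path):
        return "folder"
    return "hf_dataset_id"

print(resolve_training_data_path("data/train.jsonl"))      # file
print(resolve_training_data_path("some-org/some-dataset")) # hf_dataset_id
```
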
## Use cases supported with `training_data_path` argument
### 1. Data formats with a single sequence and a specified response_template to use for masking on completion.
The code internally uses [`DataCollatorForCompletionOnlyLM`](https://github.com/huggingface/trl/blob/main/trl/trainer/utils.py#L93) to mask the text, ensuring the model learns only from the `assistant` responses in both single- and multi-turn chat.
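
The completion-only masking this collator performs can be illustrated at the token level: label positions up to and including the response template are set to `-100` so the loss ignores them. A simplified pure-Python sketch (made-up token IDs; the real collator operates on tokenizer output and handles edge cases such as a missing template differently):

```python
IGNORE_INDEX = -100  # positions with this label are excluded from the loss

def mask_prompt_tokens(input_ids, response_template_ids):
    """Copy input_ids to labels, masking everything up to and including
    the first occurrence of the response template token sequence."""
    labels = list(input_ids)
    n = len(response_template_ids)
    for i in range(len(input_ids) - n + 1):
        if input_ids[i:i + n] == response_template_ids:
            for j in range(i + n):
                labels[j] = IGNORE_INDEX
            break
    return labels

# Made-up IDs: [prompt..., template (90, 91), assistant answer...]
print(mask_prompt_tokens([5, 6, 90, 91, 7, 8], [90, 91]))
# -> [-100, -100, -100, -100, 7, 8]
```
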

### 3. Chat templates

Depending on the scenario, users may need to decide how to use a chat template with their data, or which chat template to use for their use case.

Our guidelines are summarized in the following flow chart:

![guidelines_chat_template](docs/images/guidelines_chat_template.jpg)

Here are some scenarios addressed in the flow chart:

1. Depending on the model, its tokenizer may or may not include a chat template.

2. If a template is available, the dataset's `json object schema` might not match the chat template's `string format`.

3. The chat template may use special tokens the tokenizer is unaware of, for example `<|start_of_role|>`, which might not be treated as single tokens and can therefore cause issues during tokenization.
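
As an illustration of scenario 2, converting a flat record schema into the `messages` list shape that most chat templates consume might look like this (the `input`/`output` field names are hypothetical):

```python
def to_messages(record):
    # Convert a flat {"input": ..., "output": ...} record (illustrative
    # field names) into the messages schema most chat templates expect.
    return {
        "messages": [
            {"role": "user", "content": record["input"]},
            {"role": "assistant", "content": record["output"]},
        ]
    }

flat = {"input": "Hello!", "output": "Hi, how can I help?"}
print(to_messages(flat)["messages"][1]["role"])  # assistant
```
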
### 4. Pre tokenized datasets.
Users can also pass a pretokenized dataset (containing `input_ids` and `labels` columns) via the `--training_data_path` argument, e.g.
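
For illustration, a pretokenized JSONL record could look like the following (the token IDs are arbitrary placeholders, not output of any real tokenizer; `-100` marks label positions excluded from the loss):

```python
import json

# A hypothetical pretokenized example: IDs are placeholders, and -100
# in labels marks prompt positions ignored by the loss.
record = {
    "input_ids": [101, 2054, 2003, 1037, 2944, 102],
    "labels":    [-100, -100, -100, 1037, 2944, 102],
}
line = json.dumps(record)
print(line)
```
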

At this time, the data preprocessor does not add EOS tokens to pretokenized datasets; users must ensure EOS tokens are included in their pretokenized data if needed.
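
If needed, appending EOS to pretokenized examples yourself might look like this (the EOS ID below is a placeholder; use your tokenizer's actual `eos_token_id`):

```python
def append_eos_ids(example, eos_id):
    # Append the EOS id to both input_ids and labels unless already present.
    # eos_id is a placeholder here; real values come from the tokenizer.
    if example["input_ids"][-1] != eos_id:
        example["input_ids"].append(eos_id)
        example["labels"].append(eos_id)
    return example

ex = {"input_ids": [5, 6, 7], "labels": [-100, 6, 7]}
print(append_eos_ids(ex, eos_id=2))
```
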

For advanced data preprocessing support, including mixing and custom preprocessing of datasets, please see [this document](./docs/advanced-data-preprocessing.md).