README.md: 14 additions & 1 deletion
@@ -187,6 +187,19 @@ Here are some scenarios addressed in the flow chart:
3. There might be special tokens used in the chat template which the tokenizer is unaware of, for example `<|start_of_role|>`, which can cause issues during tokenization as it might not be treated as a single token.
#### Add Special Tokens
Working with multi-turn chat data might require the tokenizer to use a few new control tokens (e.g. `<|assistant|>`, `[SYS]`) as described in the guidelines above. These special tokens might not be present in the tokenizer's vocabulary if the user is using a base model.
Users can pass the `--add_special_tokens` argument, which adds the required tokens to the tokenizer's vocabulary.
For example, the special tokens used in `--instruction_template`/`--response_template` can be passed as follows:
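The concrete example is truncated in this diff excerpt; a plausible invocation, assuming `<|user|>`/`<|assistant|>` are the template markers in use and that `--add_special_tokens` accepts the tokens as a space-separated list, might look like:

```
--instruction_template "<|user|>" --response_template "<|assistant|>" --add_special_tokens "<|user|>" "<|assistant|>"
```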

@@ -793,3 +806,3 @@

- `--fast_moe` is an integer value that configures the amount of expert-parallel sharding (`ep_degree`); see the example after this list.
- `world_size` must be divisible by the `ep_degree`.
- Running fast moe modifies the state dict of the model, which must be post-processed. This happens automatically, and the converted checkpoint can be found in the `hf_converted_checkpoint` folder within every saved checkpoint directory. Alternatively, the same conversion can be performed manually through the [checkpoint utils](https://github.com/foundation-model-stack/fms-acceleration/blob/main/plugins/accelerated-moe/src/fms_acceleration_moe/utils/checkpoint_utils.py) script.
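A quick illustration of the divisibility constraint (the values here are assumed for illustration, not taken from the original README): on a node with 8 GPUs, `world_size` is 8, so valid `ep_degree` values are 1, 2, 4, and 8, and expert parallelism of degree 2 would be requested as:

```
--fast_moe 2
```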
docs/advanced-data-preprocessing.md: 64 additions
@@ -47,6 +47,8 @@ definitions:
```
        type: string
      seed:
        type: integer
      chat_template:
        type: string
    required:
      - type
    title: Dataprocessor
```
@@ -115,8 +117,10 @@ Users can create a data config file in any of YAML or JSON format they choose (w
`datapreprocessor`:
- `type` (optional, str): Type of data preprocessor; `default` is currently the only supported type.
- `streaming` (optional, bool): Stream datasets using [IterableDatasets](https://huggingface.co/docs/datasets/v3.2.0/en/package_reference/main_classes#datasets.IterableDataset).
- `sampling_stopping_strategy` (optional, str): Dataset interleave stopping strategy in case of choosing to mix multiple datasets by weight, supported values are [`all_exhausted` or `first_exhausted`](https://huggingface.co/docs/datasets/v3.2.0/en/package_reference/main_classes#datasets.interleave_datasets.stopping_strategy), defaults to `all_exhausted`.
- `sampling_seed` (optional, int): [Sampling seed](https://huggingface.co/docs/datasets/v3.2.0/en/package_reference/main_classes#datasets.interleave_datasets.seed) to use for interleaving datasets; for reproducibility choose the same value, defaults to 42.
- `chat_template` (optional, str): A chat template passed via the data config for multi-turn data; it replaces the tokenizer's existing default chat template (see the sketch after this list).
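A minimal sketch of supplying a custom template through the data config; the template string here is purely illustrative, not a working template:

```
dataprocessor:
  type: default
  chat_template: "{% for message in messages %}...{% endfor %}"
```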
`datasets` (list):
122
126
- `name` (optional, str): A unique identifier for the dataset.
@@ -229,6 +233,8 @@ This library currently supports the following [preexisting data handlers](https:
Uses a tokenizer's chat template to preprocess dataset elements; good for single- and multi-turn chat templates.
- `duplicate_columns`:
Duplicates one column of the dataset to another column.
- `tokenize`:
Tokenizes one column of the dataset, passed as the input `dataset_text_field` (see the sketch below).
These handlers can be requested by name, and users can look up the function arguments [here](https://github.com/foundation-model-stack/fms-hf-tuning/blob/main/tuning/data/data_handlers.py).
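For instance, the `tokenize` handler might be requested in a dataset's `data_handlers` list as below. This is a sketch modeled on the handler invocation format shown in the EPT example later; `"text"` is an illustrative column name, and the exact argument names should be checked against the handler source linked above:

```
data_handlers:
  - name: tokenize
    arguments:
      batched: false
      fn_kwargs:
        dataset_text_field: "text"
```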
@@ -251,6 +257,64 @@ We also allow users to pass a [`seed`](https://huggingface.co/docs/datasets/v3.2
`Note: If a user specifies data sampling, they can expect the datasets to be mixed and individual samples in the dataset to not be broken, unless the max_seq_len argument is smaller than the length of individual samples in the dataset`
### Data Streaming
Dataset streaming allows users to utilize the functionality of iterable datasets to pass in data piece by piece, avoiding memory constraints with large datasets for use cases like extended pre-training.
Users can use streaming by setting `streaming` to `true` in the `datapreprocessor` config. This top-level variable must be set for all datasets in the config and cannot differ from dataset to dataset. When `streaming` is `true`, the dataset is loaded as an `IterableDataset` ([docs](https://huggingface.co/docs/datasets/v3.2.0/en/package_reference/main_classes#datasets.IterableDataset)) instead of a regular `Dataset`; this means the dataset is loaded chunk by chunk rather than all at once and is processed lazily. For more details on the differences, see the [HF Blog](https://huggingface.co/docs/datasets/en/about_mapstyle_vs_iterable).
264
+
265
+
In a data config this looks like (see [ept document](./ept.md#large-non-tokenized-dataset) for a more in-depth example):
```
dataprocessor:
  type: default
  streaming: true
```
When using streaming, `split_batches` in the `TrainingArguments` will automatically be set to `True`; the main process will then fetch a full batch and slice it into `num_processes` smaller batches, one for each process. This means that the batch size must be divisible by `num_processes`, and the configured batch size is treated as the global batch size rather than a per-device one. For example, with a global batch size of 32 and 8 processes, each process receives 4 samples per step.
**When using streaming, the user must set `max_steps` in the `TrainingArguments` instead of `num_train_epochs`.** Since iterable datasets are loaded chunk by chunk, data cannot run through epochs in a typical fashion, as the **Trainer** cannot know the length of the dataset as it is being passed through. If both `max_steps` and `num_train_epochs` are given in a training config, `max_steps` will overwrite `num_train_epochs`, since `max_steps` directly specifies the total number of optimization steps, which is needed when the dataset length cannot be known.
275
+
276
+
If the dataset size is known to the user, `max_steps` can be calculated as the total number of samples divided by the batch size.
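For example (an illustrative calculation): a dataset of 1,000,000 samples trained with a global batch size of 64 gives `max_steps = 1000000 / 64 = 15625` for a single pass over the data; multiply by the desired number of passes to emulate multiple epochs.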
### Example data configs.
We provide some example data configs [here](../tests/artifacts/predefined_data_configs/)
## Offline Data preprocessing
[This script](../scripts/offline_data_processing.py) provides the capability for users to perform standalone data preprocessing, decoupled from the tuning/training part. It processes raw datasets, performs data preprocessing, and saves the train and validation datasets (in shards, if `--num_dataset_shards` is passed) in parquet format inside the specified `output_dir`.
A data config YAML file can be used to pass configuration to this script. Example command to run this script:
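The example command itself is truncated in this diff. A plausible invocation, assuming the script accepts the same `--data_config` flag used elsewhere in this document along with the `--output_dir` and `--num_dataset_shards` options mentioned above, might look like:

```
python scripts/offline_data_processing.py \
    --data_config <path to the data config> \
    --output_dir <output dir> \
    --num_dataset_shards 4
```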
docs/ept.md: 29 additions
@@ -107,6 +107,35 @@ Here also the command line arguments would be
The code again would add `EOS_TOKEN` to the non-tokenized data before using it; note also that the `dataset_text_field` is assumed to be the same across all datasets for now.
### Large Non-Tokenized Dataset
Let's say you have a large JSONL data file that cannot fit into memory at once and you want to perform EPT on it. You can use the streaming feature to efficiently load and process the data in chunks; to enable streaming, you can define a data config as follows.
Sample data config for the above use case:
```
dataprocessor:
  type: default
  streaming: true
datasets:
  - name: non_tokenized_text_dataset
    data_paths:
      - "<path-to-the-jsonl-dataset>"
    data_handlers:
      - name: add_tokenizer_eos_token
        arguments:
          remove_columns: all
          batched: false
          fn_kwargs:
            dataset_text_field: "dataset_text_field"
```
The command-line arguments passed to the library should include the following:
```
--data_config <path to the data config> --packing=True --max_seq_len 8192 --max_steps <num training steps>
```
Please note that when using streaming, the user must pass `max_steps` instead of `num_train_epochs`. See the advanced data preprocessing [document](./advanced-data-preprocessing.md#data-streaming) for more info.
### Additional Information
This feature is supported post [v2.3.1](https://github.com/foundation-model-stack/fms-hf-tuning/releases/tag/v2.3.1) of this library.