Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
048a0e8
Update offline-data-preprocessing.md (#589)
dushyantbehl Jul 24, 2025
7258505
use default dataprocessor if one is not provided (#590)
dushyantbehl Jul 28, 2025
5d49c76
fix data processing job exiting with failure even though successful (…
dushyantbehl Aug 4, 2025
ef11bd1
fix: Disable caching in transformers via `use_cache` flag to avoid un…
romitjain Aug 4, 2025
5951c9a
fix: Rank is set to zero by default (#594)
seshapad Aug 4, 2025
9c7858e
upgrade trl (#601)
dushyantbehl Aug 29, 2025
607984c
Add changes to support granite 4 models (#599)
YashasviChaurasia Sep 1, 2025
7e261d2
feat: Support gpt-oss class of models with flash attention 3 support …
dushyantbehl Sep 3, 2025
bc39f95
feat: Restructure README (#598)
dushyantbehl Sep 4, 2025
f41eb2c
fix: add default optim arg in training arg (#607)
YashasviChaurasia Sep 15, 2025
8232e63
fix typo in dockerfile
dushyantbehl Sep 16, 2025
47e80ce
fix typo which ignored qlora config
dushyantbehl Sep 16, 2025
b6aa877
fix typos in data handler names
dushyantbehl Sep 16, 2025
2949a3a
Merge pull request #610 from dushyantbehl/main
dushyantbehl Sep 16, 2025
abeb12c
fix: subclass Lora config from upstream peft.LoraConfig (#609)
romitjain Sep 26, 2025
384d424
fix: update fms-accel to main (#608)
YashasviChaurasia Sep 26, 2025
d6dc4c9
Update advanced-data-preprocessing.md (#613)
dushyantbehl Sep 27, 2025
d9ee35f
feat: add ckpt conversion script fp32-bf16 (#614)
YashasviChaurasia Sep 30, 2025
f337875
feat: Allow chat template to be specified via a path in data config. …
YashasviChaurasia Oct 6, 2025
4ec1340
feat: add online data mixing plugin (#612)
kmehant Oct 8, 2025
452c13b
feat: Adopt resumption feature of online data mixing (#617)
kmehant Oct 9, 2025
5187516
feat: Alora migration to PEFT upstream. (#618)
YashasviChaurasia Oct 10, 2025
ebf5743
feat: ensure fms-acceleration callbacks run before TrainerController …
YashasviChaurasia Oct 27, 2025
6f2134d
fix: add on_init_end before adding tc callback (#621)
YashasviChaurasia Oct 28, 2025
7ff1ce9
fix: add hf compatible path update (#622)
YashasviChaurasia Oct 28, 2025
3344193
fix: avoid updating path kwarg (#624)
YashasviChaurasia Oct 28, 2025
1e844d3
fix: directly save final ckpt in save_model_dir (#626)
YashasviChaurasia Nov 5, 2025
faea400
Add free up disk space to gh runners (#628)
dushyantbehl Nov 7, 2025
2706020
fix: image build failure due to flash attn (#629)
dushyantbehl Nov 7, 2025
8c7566f
Merge tag 'v3.1.0-rc3' into v3.1.0-rc3
dushyantbehl Nov 10, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/build-and-publish.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ jobs:

steps:
- uses: actions/checkout@v4
- name: "Free up disk space"
uses: ./.github/actions/free-up-disk-space
- name: Set up Python ${{ matrix.python-version.setup }}
uses: actions/setup-python@v4
with:
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/coverage.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: "Free up disk space"
uses: ./.github/actions/free-up-disk-space
- name: Set up Python 3.12
uses: actions/setup-python@v4
with:
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/format.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: "Free up disk space"
uses: ./.github/actions/free-up-disk-space
- name: Set up Python 3.12
uses: actions/setup-python@v4
with:
Expand Down
1,108 changes: 50 additions & 1,058 deletions README.md

Large diffs are not rendered by default.

16 changes: 8 additions & 8 deletions build/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ ARG PYTHON_VERSION=3.12
ARG WHEEL_VERSION=""
## Enable Aimstack or MLflow if requested via ENABLE_AIM/MLFLOW set to "true"
ARG ENABLE_AIM=false
ARG ENABLE_ALORA=false
ARG ENABLE_MLFLOW=false
ARG ENABLE_FMS_ACCELERATION=true
ARG ENABLE_SCANNER=false
Expand Down Expand Up @@ -127,7 +126,6 @@ ARG USER_UID
ARG ENABLE_FMS_ACCELERATION
ARG ENABLE_AIM
ARG ENABLE_MLFLOW
ARG ENABLE_ALORA
ARG ENABLE_SCANNER
ARG ENABLE_CLEARML

Expand All @@ -151,33 +149,35 @@ RUN if [[ -z "${WHEEL_VERSION}" ]]; \
fi && \
ls /tmp/*.whl >/tmp/bdist_name

# Ensures to always build mamba_ssm from source
ENV PIP_NO_BINARY=mamba-ssm,mamba_ssm

# Install from the wheel
RUN --mount=type=cache,target=/home/${USER}/.cache/pip,uid=${USER_UID} \
python -m pip install --user wheel && \
python -m pip install --user "$(head bdist_name)" && \
python -m pip install --user "$(head bdist_name)[flash-attn]" && \
python -m pip install --user --no-build-isolation "$(head bdist_name)[mamba]"

RUN python -m pip install --user --no-build-isolation "$(head bdist_name)[flash-attn]"

# fms_acceleration_peft = PEFT-training, e.g., 4bit QLoRA
# fms_acceleration_foak = Fused LoRA and triton kernels
# fms_acceleration_aadp = Padding-Free Flash Attention Computation
# fms_acceleration_moe = Parallelized Mixture of Experts
# fms_acceleration_odm = Online Data Mixing
RUN if [[ "${ENABLE_FMS_ACCELERATION}" == "true" ]]; then \
python -m pip install --user "$(head bdist_name)[fms-accel]"; \
python -m fms_acceleration.cli install fms_acceleration_peft; \
python -m fms_acceleration.cli install fms_acceleration_foak; \
python -m fms_acceleration.cli install fms_acceleration_aadp; \
python -m fms_acceleration.cli install fms_acceleration_moe; \
python -m fms_acceleration.cli install fms_acceleration_odm; \
fi

RUN if [[ "${ENABLE_AIM}" == "true" ]]; then \
python -m pip install --user "$(head bdist_name)[aim]"; \
fi

RUN if [[ "${ENABLE_ALORA}" == "true" ]]; then \
python -m pip install --user "$(head bdist_name)[activated-lora]"; \
fi

RUN if [[ "${ENABLE_MLFLOW}" == "true" ]]; then \
python -m pip install --user "$(head bdist_name)[mlflow]"; \
fi
Expand Down Expand Up @@ -234,4 +234,4 @@ USER ${USER}
COPY --from=python-installations /home/${USER}/.local /home/${USER}/.local
ENV PYTHONPATH="/home/${USER}/.local/lib/python${PYTHON_VERSION}/site-packages"

CMD [ "python", "/app/accelerate_launch.py" ]
CMD [ "python", "/app/accelerate_launch.py" ]
93 changes: 93 additions & 0 deletions build/nvcr.Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
# Copyright The FMS HF Tuning Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## Global Args #################################################################
## If the nvcr container is updated, ensure to check the torch and python
## installation version inside the dockerfile before pushing changes.
ARG NVCR_IMAGE_VERSION=25.02-py3

# This is based on what is inside the NVCR image already
ARG PYTHON_VERSION=3.12

## Base Layer ##################################################################
FROM nvcr.io/nvidia/pytorch:${NVCR_IMAGE_VERSION} AS dev

# The `if [[ ... ]]` conditionals below are bash syntax; the default RUN shell
# is /bin/sh (dash on Ubuntu-based NGC images), so use bash explicitly.
SHELL ["/bin/bash", "-c"]

ARG USER=root
ARG USER_UID=0
ARG WORKDIR=/app
ARG SOURCE_DIR=${WORKDIR}/fms-hf-tuning

# Feature toggles: each optional extra below is installed only when its
# corresponding flag is set to the string "true".
ARG ENABLE_FMS_ACCELERATION=true
ARG ENABLE_AIM=true
ARG ENABLE_MLFLOW=true
ARG ENABLE_SCANNER=true
ARG ENABLE_CLEARML=true
ARG ENABLE_TRITON_KERNELS=true
ARG ENABLE_MAMBA_SUPPORT=true

# Ensures to always build mamba_ssm from source
ENV PIP_NO_BINARY=mamba-ssm,mamba_ssm

RUN python -m pip install --upgrade pip

# upgrade torch as the base layer contains only torch 2.7
RUN pip install --upgrade --force-reinstall torch torchaudio torchvision --index-url https://download.pytorch.org/whl/cu128

# Install main package + flash attention.
# Use WORKDIR, not `RUN cd`: a bare `cd` in its own RUN does not persist
# into later instructions (hadolint DL3003).
COPY . ${SOURCE_DIR}
WORKDIR ${SOURCE_DIR}
RUN pip install --no-cache-dir ${SOURCE_DIR} && \
    pip install --no-cache-dir ${SOURCE_DIR}[flash-attn]

# Optional extras (see the ENABLE_* build args above)
RUN if [[ "${ENABLE_FMS_ACCELERATION}" == "true" ]]; then \
        pip install --no-cache-dir ${SOURCE_DIR}[fms-accel] && \
        python -m fms_acceleration.cli install fms_acceleration_peft && \
        python -m fms_acceleration.cli install fms_acceleration_foak && \
        python -m fms_acceleration.cli install fms_acceleration_aadp && \
        python -m fms_acceleration.cli install fms_acceleration_moe && \
        python -m fms_acceleration.cli install fms_acceleration_odm; \
    fi

RUN if [[ "${ENABLE_AIM}" == "true" ]]; then \
        pip install --no-cache-dir ${SOURCE_DIR}[aim]; \
    fi
RUN if [[ "${ENABLE_MLFLOW}" == "true" ]]; then \
        pip install --no-cache-dir ${SOURCE_DIR}[mlflow]; \
    fi
RUN if [[ "${ENABLE_SCANNER}" == "true" ]]; then \
        pip install --no-cache-dir ${SOURCE_DIR}[scanner-dev]; \
    fi
RUN if [[ "${ENABLE_CLEARML}" == "true" ]]; then \
        pip install --no-cache-dir ${SOURCE_DIR}[clearml]; \
    fi
RUN if [[ "${ENABLE_MAMBA_SUPPORT}" == "true" ]]; then \
        pip install --no-cache-dir ${SOURCE_DIR}[mamba]; \
    fi
RUN if [[ "${ENABLE_TRITON_KERNELS}" == "true" ]]; then \
        pip install --no-cache-dir "git+https://github.com/triton-lang/triton.git@main#subdirectory=python/triton_kernels"; \
    fi

# Group-writable app dir and world-writable caches so the image can run
# under an arbitrary (e.g. OpenShift-assigned) UID.
RUN chmod -R g+rwX $WORKDIR /tmp
RUN mkdir -p /.cache && chmod -R 777 /.cache

# Set Triton environment variables for qLoRA
ENV TRITON_HOME="/tmp/triton_home"
ENV TRITON_DUMP_DIR="/tmp/triton_dump_dir"
ENV TRITON_CACHE_DIR="/tmp/triton_cache_dir"
ENV TRITON_OVERRIDE_DIR="/tmp/triton_override_dir"

WORKDIR $WORKDIR

# SOURCE_DIR is a build ARG and exec-form CMD performs no variable expansion,
# so persist the path as ENV and expand it through a shell; `exec` replaces
# the shell so python runs as PID 1 and receives SIGTERM on `docker stop`.
# (The original `CMD ["${SOURCE_DIR}/..."]` could never resolve at runtime
# and also invoked the .py file without an interpreter.)
ENV SOURCE_DIR=${SOURCE_DIR}
CMD ["/bin/bash", "-c", "exec python ${SOURCE_DIR}/build/accelerate_launch.py"]
Loading