
Commit bc9cbba

Merge branch 'main' into save-model-dir-moe-checkpoint

Signed-off-by: Will Johnson <mwjohnson728@gmail.com>
2 parents: 4afea6d + 0a42075

3 files changed (+17, −4 lines)

build/Dockerfile

Lines changed: 13 additions & 2 deletions
@@ -88,7 +88,8 @@ ENV NV_CUDA_CUDART_DEV_VERSION=12.1.55-1 \
     NV_NVML_DEV_VERSION=12.1.55-1 \
     NV_LIBCUBLAS_DEV_VERSION=12.1.0.26-1 \
     NV_LIBNPP_DEV_VERSION=12.0.2.50-1 \
-    NV_LIBNCCL_DEV_PACKAGE_VERSION=2.18.3-1+cuda12.1
+    NV_LIBNCCL_DEV_PACKAGE_VERSION=2.18.3-1+cuda12.1 \
+    NV_CUDNN9_CUDA_VERSION=9.6.0.74-1
 
 RUN dnf config-manager \
     --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo \
@@ -103,6 +104,15 @@ RUN dnf config-manager \
     libnccl-devel-${NV_LIBNCCL_DEV_PACKAGE_VERSION} \
     && dnf clean all
 
+# opening connection for too long in one go was resulting in timeouts
+RUN dnf config-manager \
+    --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo \
+    && dnf clean packages \
+    && dnf install -y \
+    libcusparselt0 libcusparselt-devel \
+    cudnn9-cuda-12-6-${NV_CUDNN9_CUDA_VERSION} \
+    && dnf clean all
+
 ENV LIBRARY_PATH="$CUDA_HOME/lib64/stubs"
 
 FROM cuda-devel AS python-installations
@@ -138,7 +148,8 @@ RUN if [[ -z "${WHEEL_VERSION}" ]]; \
 RUN --mount=type=cache,target=/home/${USER}/.cache/pip,uid=${USER_UID} \
     python -m pip install --user wheel && \
     python -m pip install --user "$(head bdist_name)" && \
-    python -m pip install --user "$(head bdist_name)[flash-attn]"
+    python -m pip install --user "$(head bdist_name)[flash-attn]" && \
+    python -m pip install --user "$(head bdist_name)[mamba]"
 
 # fms_acceleration_peft = PEFT-training, e.g., 4bit QLoRA
 # fms_acceleration_foak = Fused LoRA and triton kernels
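
The new second RUN layer re-adds the CUDA repo and installs cuSPARSELt and cuDNN 9 in a separate transaction because, per the in-file comment, one long dnf transaction held the connection open long enough to time out; the final hunk then installs the new [mamba] extra alongside [flash-attn]. A minimal smoke test for the built image, adapted from the state-spaces/mamba README (the batch/length/dim values are illustrative assumptions, not settings used by this repo):

# Verify that the [mamba] extra and its compiled CUDA kernels import and run.
import torch
from mamba_ssm import Mamba

batch, length, dim = 2, 64, 16
x = torch.randn(batch, length, dim).to("cuda")

# Mamba block arguments per the upstream README: d_model must match the
# input's channel dim; d_state/d_conv/expand are the README defaults.
model = Mamba(d_model=dim, d_state=16, d_conv=4, expand=2).to("cuda")
y = model(x)
assert y.shape == x.shape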

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -48,6 +48,7 @@ aim = ["aim>=3.19.0,<4.0"]
 mlflow = ["mlflow"]
 fms-accel = ["fms-acceleration>=0.6"]
 gptq-dev = ["auto_gptq>0.4.2", "optimum>=1.15.0"]
+mamba = ["mamba_ssm[causal-conv1d] @ git+https://github.com/state-spaces/mamba.git"]
 scanner-dev = ["HFResourceScanner>=0.1.0"]
 
 
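
With the extra declared, the Mamba stack installs through the same optional-dependency pattern as flash-attn, e.g. python -m pip install ".[mamba]" from a checkout, which is what the Dockerfile hunk above does with "$(head bdist_name)[mamba]". Note the PEP 508 direct reference: mamba_ssm is built from the tip of its git default branch with the causal-conv1d extra enabled, so this dependency is unpinned and will not resolve from PyPI alone.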

tuning/data/data_processors.py

Lines changed: 3 additions & 2 deletions
@@ -434,9 +434,10 @@ def process_dataset_configs(
         # https://github.com/huggingface/trl/blob/e3244d/trl/trainer/sft_trainer.py#L367
         state = PartialState()
 
-        # The local_main_process_first context ensures that the main process runs first per node
+        # The main_process_first context ensures that the main process runs first
         # as we want to reuse HF cache and not redo computation on all nodes
-        with state.local_main_process_first():
+        # For rationale see https://github.com/huggingface/trl/pull/3106
+        with state.main_process_first():
             train_dataset = self._process_dataset_configs(dataset_configs, **kwargs)
 
         return train_dataset
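
The substance of this change: local_main_process_first serializes per node, so the rank-0 process of every node re-runs the processing while only its local peers wait; main_process_first serializes globally, so a single process does the work and every other rank reuses the HF cache it wrote, which is the right behavior when that cache sits on storage shared across nodes (see the TRL PR linked in the diff). A minimal sketch of the pattern; PartialState and main_process_first are real accelerate APIs, while prepare_dataset() is a hypothetical stand-in for _process_dataset_configs:

from accelerate import PartialState

state = PartialState()

# The global main process enters the block first, processes and
# tokenizes the data, and populates the HF datasets cache; every other
# rank blocks at the context manager, then re-enters and reads that
# cache instead of recomputing.
with state.main_process_first():
    train_dataset = prepare_dataset()  # hypothetical stand-in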
