
Commit bc9cbba

Merge branch 'main' into save-model-dir-moe-checkpoint

Signed-off-by: Will Johnson <mwjohnson728@gmail.com>
2 parents: 4afea6d + 0a42075

3 files changed (+17, −4 lines)

build/Dockerfile

Lines changed: 13 additions & 2 deletions
@@ -88,7 +88,8 @@ ENV NV_CUDA_CUDART_DEV_VERSION=12.1.55-1 \
     NV_NVML_DEV_VERSION=12.1.55-1 \
     NV_LIBCUBLAS_DEV_VERSION=12.1.0.26-1 \
     NV_LIBNPP_DEV_VERSION=12.0.2.50-1 \
-    NV_LIBNCCL_DEV_PACKAGE_VERSION=2.18.3-1+cuda12.1
+    NV_LIBNCCL_DEV_PACKAGE_VERSION=2.18.3-1+cuda12.1 \
+    NV_CUDNN9_CUDA_VERSION=9.6.0.74-1
 
 RUN dnf config-manager \
     --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo \
@@ -103,6 +104,15 @@ RUN dnf config-manager \
     libnccl-devel-${NV_LIBNCCL_DEV_PACKAGE_VERSION} \
     && dnf clean all
 
+# opening connection for too long in one go was resulting in timeouts
+RUN dnf config-manager \
+    --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo \
+    && dnf clean packages \
+    && dnf install -y \
+    libcusparselt0 libcusparselt-devel \
+    cudnn9-cuda-12-6-${NV_CUDNN9_CUDA_VERSION} \
+    && dnf clean all
+
 ENV LIBRARY_PATH="$CUDA_HOME/lib64/stubs"
 
 FROM cuda-devel AS python-installations
@@ -138,7 +148,8 @@ RUN if [[ -z "${WHEEL_VERSION}" ]]; \
 RUN --mount=type=cache,target=/home/${USER}/.cache/pip,uid=${USER_UID} \
     python -m pip install --user wheel && \
     python -m pip install --user "$(head bdist_name)" && \
-    python -m pip install --user "$(head bdist_name)[flash-attn]"
+    python -m pip install --user "$(head bdist_name)[flash-attn]" && \
+    python -m pip install --user "$(head bdist_name)[mamba]"
 
 # fms_acceleration_peft = PEFT-training, e.g., 4bit QLoRA
 # fms_acceleration_foak = Fused LoRA and triton kernels
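
The new second RUN layer re-adds the CUDA repo and installs cuSPARSELt and cuDNN 9 in a separate transaction because, per the in-file comment, one long dnf transaction held the connection open long enough to time out; the final hunk then installs the new [mamba] extra alongside [flash-attn]. A minimal smoke test for the built image, adapted from the state-spaces/mamba README (the batch/length/dim values are illustrative assumptions, not settings used by this repo):

# Verify that the [mamba] extra and its compiled CUDA kernels import and run.
import torch
from mamba_ssm import Mamba

batch, length, dim = 2, 64, 16
x = torch.randn(batch, length, dim).to("cuda")

# Mamba block arguments per the upstream README: d_model must match the
# input's channel dim; d_state/d_conv/expand are the README defaults.
model = Mamba(d_model=dim, d_state=16, d_conv=4, expand=2).to("cuda")
y = model(x)
assert y.shape == x.shape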

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -48,6 +48,7 @@ aim = ["aim>=3.19.0,<4.0"]
 mlflow = ["mlflow"]
 fms-accel = ["fms-acceleration>=0.6"]
 gptq-dev = ["auto_gptq>0.4.2", "optimum>=1.15.0"]
+mamba = ["mamba_ssm[causal-conv1d] @ git+https://github.com/state-spaces/mamba.git"]
 scanner-dev = ["HFResourceScanner>=0.1.0"]
 
 
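
With the extra declared, the Mamba stack installs through the same optional-dependency pattern as flash-attn, e.g. python -m pip install ".[mamba]" from a checkout, which is what the Dockerfile hunk above does with "$(head bdist_name)[mamba]". Note the PEP 508 direct reference: mamba_ssm is built from the tip of its git default branch with the causal-conv1d extra enabled, so this dependency is unpinned and will not resolve from PyPI alone.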

tuning/data/data_processors.py

Lines changed: 3 additions & 2 deletions
@@ -434,9 +434,10 @@ def process_dataset_configs(
         # https://github.com/huggingface/trl/blob/e3244d/trl/trainer/sft_trainer.py#L367
         state = PartialState()
 
-        # The local_main_process_first context ensures that the main process runs first per node
+        # The main_process_first context ensures that the main process runs first
         # as we want to reuse HF cache and not redo computation on all nodes
-        with state.local_main_process_first():
+        # For rationale see https://github.com/huggingface/trl/pull/3106
+        with state.main_process_first():
             train_dataset = self._process_dataset_configs(dataset_configs, **kwargs)
 
         return train_dataset
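
The substance of this change: local_main_process_first serializes per node, so the rank-0 process of every node re-runs the processing while only its local peers wait; main_process_first serializes globally, so a single process does the work and every other rank reuses the HF cache it wrote, which is the right behavior when that cache sits on storage shared across nodes (see the TRL PR linked in the diff). A minimal sketch of the pattern; PartialState and main_process_first are real accelerate APIs, while prepare_dataset() is a hypothetical stand-in for _process_dataset_configs:

from accelerate import PartialState

state = PartialState()

# The global main process enters the block first, processes and
# tokenizes the data, and populates the HF datasets cache; every other
# rank blocks at the context manager, then re-enters and reads that
# cache instead of recomputing.
with state.main_process_first():
    train_dataset = prepare_dataset()  # hypothetical stand-in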
