Skip to content

Commit f348a1b

Browse files
committed
Merge remote-tracking branch 'upstream/main'
2 parents 3f96837 + ccb22f0 commit f348a1b

File tree

2 files changed

+16
-39
lines changed

2 files changed

+16
-39
lines changed

images/runtime/training/cuda/Dockerfile

Lines changed: 15 additions & 38 deletions
Original file line number | Diff line number | Diff line change
@@ -32,57 +32,34 @@ ENV CUDA_VERSION=12.1.0 \
3232
NVIDIA_VISIBLE_DEVICES=all \
3333
NVIDIA_DRIVER_CAPABILITIES=compute,utility \
3434
NV_CUDA_CUDART_VERSION=12.1.55-1 \
35-
NV_CUDA_COMPAT_VERSION=530.30.02-1
35+
NV_CUDA_COMPAT_VERSION=530.30.02-1 \
36+
NV_CUDA_NVCC_VERSION=12.1.66-1
3637

38+
# Ref: https://gitlab.com/nvidia/container-images/cuda/-/blob/master/dist/12.1.1/ubi9/base/Dockerfile
39+
# nvcc is required for Flash Attention
3740
RUN dnf config-manager \
3841
--add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo \
3942
&& dnf install -y \
4043
cuda-cudart-12-1-${NV_CUDA_CUDART_VERSION} \
4144
cuda-compat-12-1-${NV_CUDA_COMPAT_VERSION} \
45+
cuda-nvcc-12-1-${NV_CUDA_NVCC_VERSION} \
4246
&& echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf \
4347
&& echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf \
4448
&& dnf clean all
4549

46-
RUN dnf -y install --allowerasing cudnn9-cuda-12
47-
4850
ENV CUDA_HOME="/usr/local/cuda" \
4951
PATH="/usr/local/nvidia/bin:${CUDA_HOME}/bin:${PATH}" \
50-
LD_LIBRARY_PATH="/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$CUDA_HOME/lib64:$CUDA_HOME/extras/CUPTI/lib64:$LD_LIBRARY_PATH" \
51-
LD_LIBRARY_PATH="/usr/local/cuda-9.0/lib64:$LD_LIBRARY_PATH"
52-
53-
# Ref: https://developer.nvidia.com/nccl/nccl-legacy-downloads
54-
ENV NV_CUDA_CUDART_DEV_VERSION=12.1.55-1 \
55-
NV_NVML_DEV_VERSION=12.1.55-1 \
56-
NV_LIBCUBLAS_DEV_VERSION=12.1.0.26-1 \
57-
NV_LIBNPP_DEV_VERSION=12.0.2.50-1 \
58-
NV_LIBNCCL_DEV_PACKAGE_VERSION=2.18.3-1+cuda12.1
52+
LD_LIBRARY_PATH="/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$CUDA_HOME/lib64:$CUDA_HOME/extras/CUPTI/lib64:$LD_LIBRARY_PATH"
5953

54+
# Install InfiniBand and RDMA packages
6055
RUN dnf config-manager \
61-
--add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo \
56+
--add-repo https://linux.mellanox.com/public/repo/mlnx_ofed/latest/rhel9.5/mellanox_mlnx_ofed.repo \
6257
&& dnf install -y \
63-
cuda-command-line-tools-12-1-${NV_CUDA_LIB_VERSION} \
64-
cuda-libraries-devel-12-1-${NV_CUDA_LIB_VERSION} \
65-
cuda-minimal-build-12-1-${NV_CUDA_LIB_VERSION} \
66-
cuda-cudart-devel-12-1-${NV_CUDA_CUDART_DEV_VERSION} \
67-
cuda-nvml-devel-12-1-${NV_NVML_DEV_VERSION} \
68-
libcublas-devel-12-1-${NV_LIBCUBLAS_DEV_VERSION} \
69-
libnpp-devel-12-1-${NV_LIBNPP_DEV_VERSION} \
70-
libnccl-devel-${NV_LIBNCCL_DEV_PACKAGE_VERSION} \
71-
&& dnf clean all
72-
73-
ENV LIBRARY_PATH="$CUDA_HOME/lib64/stubs"
74-
75-
# Install CUDA devel cudnn8 from:
76-
# https://gitlab.com/nvidia/container-images/cuda/-/blob/master/dist/12.1.1/ubi9/devel/cudnn8/Dockerfile
77-
ENV NV_CUDNN_VERSION=8.9.0.131-1
78-
ENV NV_CUDNN_PACKAGE=libcudnn8-${NV_CUDNN_VERSION}.cuda12.1
79-
ENV NV_CUDNN_PACKAGE_DEV=libcudnn8-devel-${NV_CUDNN_VERSION}.cuda12.1
80-
81-
LABEL com.nvidia.cudnn.version="${NV_CUDNN_VERSION}"
82-
83-
RUN dnf install -y \
84-
${NV_CUDNN_PACKAGE} \
85-
${NV_CUDNN_PACKAGE_DEV} \
58+
libibverbs-utils \
59+
infiniband-diags \
60+
libibumad \
61+
librdmacm \
62+
rdma-core \
8663
&& dnf clean all \
8764
&& rm -rf /var/cache/dnf/*
8865

@@ -94,15 +71,15 @@ RUN pip install --no-cache-dir -U "micropipenv[toml]"
9471
# Install Python dependencies from Pipfile.lock file
9572
COPY Pipfile.lock ./
9673

97-
RUN micropipenv install && \
74+
RUN micropipenv install -- --no-cache-dir && \
9875
rm -f ./Pipfile.lock && \
9976
# Fix permissions to support pip in OpenShift environments \
10077
chmod -R g+w /opt/app-root/lib/python3.11/site-packages && \
10178
fix-permissions /opt/app-root -P
10279

10380
# Install Flash Attention
10481
RUN pip install wheel
105-
RUN pip install flash-attn==2.7.4.post1 --no-build-isolation
82+
RUN pip install --no-cache-dir flash-attn==2.7.4.post1 --no-build-isolation
10683

10784
# Restore user workspace
10885
USER 1001

tests/kfto/kfto_training_test.go

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -193,7 +193,7 @@ func createKFTOPyTorchJob(test Test, namespace string, config corev1.ConfigMap,
193193
--model_dir /tmp/model/bloom-560m \
194194
--dataset_file /tmp/all_datasets/alpaca_data_tenth.json \
195195
--transformer_type AutoModelForCausalLM \
196-
--training_parameters '{"output_dir": "/mnt/output", "per_device_train_batch_size": 8, "num_train_epochs": 3, "logging_dir": "/logs", "eval_strategy": "epoch", "save_strategy": "no"}' \
196+
--training_parameters '{"output_dir": "/mnt/output", "per_device_train_batch_size": 8, "num_train_epochs": 3, "logging_dir": "/tmp/logs", "eval_strategy": "epoch", "save_strategy": "no"}' \
197197
--lora_config '{"r": 4, "lora_alpha": 16, "lora_dropout": 0.1, "bias": "none"}'`,
198198
},
199199
Env: []corev1.EnvVar{

0 commit comments

Comments
 (0)