Skip to content

Commit 025ee15

Browse files
Fiona-Waterssutaakar
authored andcommitted
Adding CUDA training image with updated and added dependencies
Co-authored-by: Karel Suta <[email protected]>
1 parent 4af5bf0 commit 025ee15

File tree

5 files changed

+2955
-0
lines changed

5 files changed

+2955
-0
lines changed
Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
## Global Args ######################################################
2+
# Tag of the base image; substituted into the FROM reference below.
ARG IMAGE_TAG=9.6-1755735361
3+
# Version suffix of the base image repository name (ubi9/python-<PYTHON_VERSION>).
ARG PYTHON_VERSION=312
4+
5+
# use UBI9
6+
# The image reference is assembled from the PYTHON_VERSION and IMAGE_TAG build args.
FROM registry.access.redhat.com/ubi9/python-${PYTHON_VERSION}:${IMAGE_TAG}
7+
8+
# Image metadata labels, listed alphabetically by key.
LABEL authoritative-source-url="https://github.com/opendatahub-io/distributed-workloads" \
      description="CUDA 12.8 Python 3.12 PyTorch 2.8.0 image based on UBI9 for Training" \
      io.k8s.description="CUDA 12.8 Python 3.12 PyTorch 2.8.0 image based on UBI9 for Training" \
      io.k8s.display-name="CUDA 12.8 Python 3.12 PyTorch 2.8.0 base image for Training" \
      name="training:py312-cuda128-torch280" \
      summary="CUDA 12.8 Python 3.12 PyTorch 2.8.0 image based on UBI9 for Training"
14+
15+
# Copy license
16+
# Places the build context's LICENSE.md at /licenses/cuda-license.md inside the image.
COPY LICENSE.md /licenses/cuda-license.md
17+
18+
# Switch to root for the installation steps below and set the working directory in the container
19+
USER 0
20+
WORKDIR /app
21+
22+
# upgrade requests package
23+
# Pin requests to 2.32.3 without leaving a pip download cache in the layer.
RUN pip install --upgrade --no-cache-dir "requests==2.32.3"
24+
25+
# Install CUDA
26+
# Working directory for the following build steps; relative paths such as
# ./Pipfile.lock resolve against it.
WORKDIR /opt/app-root/bin
27+
28+
# Ref: https://docs.nvidia.com/cuda/archive/12.8.0/cuda-toolkit-release-notes/
29+
# CUDA release and package version pins first, NVIDIA_* runtime settings after.
# The NV_CUDA_{CUDART,COMPAT,NVCC}_VERSION values are consumed by the dnf
# install of the cuda-* packages.
ENV CUDA_VERSION=12.8.0 \
    NV_CUDA_LIB_VERSION=12.8.0-1 \
    NV_CUDA_CUDART_VERSION=12.8.57-1 \
    NV_CUDA_COMPAT_VERSION=3:570.172.08-1.el9 \
    NV_CUDA_NVCC_VERSION=12.8.61-1 \
    NVIDIA_VISIBLE_DEVICES=all \
    NVIDIA_DRIVER_CAPABILITIES=compute,utility \
    NVIDIA_REQUIRE_CUDA="cuda>=12.8 brand=tesla,driver>=470,driver<471 brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471 brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471 brand=tesla,driver>=525,driver<526 brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526 brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526 brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526 brand=tesla,driver>=570,driver<571 brand=unknown,driver>=570,driver<571 brand=nvidia,driver>=570,driver<571 brand=nvidiartx,driver>=570,driver<571 brand=geforce,driver>=570,driver<571 brand=geforcertx,driver>=570,driver<571 brand=quadro,driver>=570,driver<571 brand=quadrortx,driver>=570,driver<571 brand=titan,driver>=570,driver<571 brand=titanrtx,driver>=570,driver<571"
37+
38+
# Ref: https://gitlab.com/nvidia/container-images/cuda/-/blob/master/dist/12.8.0/ubi9/base/Dockerfile
39+
# nvcc is required for Flash Attention
40+
# Register NVIDIA's RHEL 9 CUDA repository and install the version-pinned
# cuda-cudart, cuda-compat and cuda-nvcc packages with the two RHEL
# subscription repos disabled for the transaction. Afterwards append the
# NVIDIA library directories to the dynamic linker configuration and drop the
# dnf metadata in the same layer.
RUN dnf config-manager --add-repo \
        https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo && \
    dnf install -y \
        --disablerepo=rhel-9-for-x86_64-baseos-rpms \
        --disablerepo=rhel-9-for-x86_64-appstream-rpms \
        "cuda-cudart-12-8-${NV_CUDA_CUDART_VERSION}" \
        "cuda-compat-12-8-${NV_CUDA_COMPAT_VERSION}" \
        "cuda-nvcc-12-8-${NV_CUDA_NVCC_VERSION}" && \
    printf '%s\n' /usr/local/nvidia/lib /usr/local/nvidia/lib64 >> /etc/ld.so.conf.d/nvidia.conf && \
    dnf clean all
51+
52+
# CUDA_HOME is defined in its own ENV instruction on purpose: Docker expands
# variables using the values that existed *before* the current instruction, so
# referencing ${CUDA_HOME} inside the instruction that defines it expands to an
# empty string (PATH would get "/bin" and LD_LIBRARY_PATH "/lib64" and
# "/extras/CUPTI/lib64" instead of the CUDA directories).
ENV CUDA_HOME="/usr/local/cuda"
ENV PATH="/usr/local/nvidia/bin:${CUDA_HOME}/bin:${PATH}" \
    LD_LIBRARY_PATH="/usr/local/nvidia/lib:/usr/local/nvidia/lib64:${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64:${LD_LIBRARY_PATH}"
55+
56+
# Install InfiniBand and RDMA packages
57+
# The repo id enabled below embeds the MLNX_OFED version, so the .repo file is
# fetched from the matching versioned directory instead of the moving "latest"
# one; a single build arg keeps the URL and the repo id in sync.
# NOTE(review): confirm the versioned directory is published for rhel9.5.
ARG MLNX_OFED_VERSION=24.10-1.1.4.0
RUN dnf config-manager \
        --add-repo "https://linux.mellanox.com/public/repo/mlnx_ofed/${MLNX_OFED_VERSION}/rhel9.5/mellanox_mlnx_ofed.repo"

# Only the CUDA, Mellanox OFED and UBI repos are enabled for this transaction;
# the dnf caches are removed in the same layer.
RUN dnf install -y --disablerepo="*" --enablerepo="cuda-rhel9-x86_64,mlnx_ofed_${MLNX_OFED_VERSION}_base,ubi-9-appstream-rpms,ubi-9-baseos-rpms" \
        infiniband-diags \
        libibumad3 \
        libibverbs-utils \
        librdmacm \
        librdmacm-utils \
        mlnx-tools \
        rdma-core \
    && dnf clean all \
    && rm -rf /var/cache/dnf/*
70+
71+
# Install Python packages
72+
73+
# Install micropipenv to deploy packages from Pipfile.lock
74+
# micropipenv with its TOML extra; the pip download cache is not kept in the layer.
RUN pip install --upgrade --no-cache-dir "micropipenv[toml]"
75+
76+
# Install Python dependencies from Pipfile.lock file
77+
# The lock file lands in the current WORKDIR and is deleted again once the
# dependencies have been installed.
COPY Pipfile.lock ./
78+
79+
# Deploy the locked dependencies, discard the lock file, then fix permissions
# (group-writable site-packages plus fix-permissions on /opt/app-root) to
# support pip in OpenShift environments.
RUN micropipenv install -- --no-cache-dir \
    && rm -f ./Pipfile.lock \
    && chmod -R g+w /opt/app-root/lib/python3.12/site-packages \
    && fix-permissions /opt/app-root -P
84+
85+
# Install Flash Attention
86+
# flash-attn is installed with --no-build-isolation, so pip does not provision
# build requirements itself; wheel is therefore installed first. Both steps
# share one layer and pass --no-cache-dir so no pip cache is stored in the
# image.
RUN pip install --no-cache-dir wheel \
    && pip install --no-cache-dir --no-build-isolation flash-attn==2.8.3
88+
89+
# Upgrade NCCL to a more recent version and add Training Hub NVIDIA dependencies
90+
# Pinned NCCL and NVIDIA CUDA 12 runtime wheels. --no-cache-dir keeps pip's
# download cache for these wheels out of the image layer; fix-permissions is
# re-run afterwards because the packages are installed as root.
RUN pip install --no-cache-dir \
        nvidia-nccl-cu12==2.27.3 \
        nvidia-cublas-cu12==12.8.4.1 \
        nvidia-cuda-cupti-cu12==12.8.90 \
        nvidia-cuda-nvrtc-cu12==12.8.93 \
        nvidia-cuda-runtime-cu12==12.8.90 \
        nvidia-cudnn-cu12==9.10.2.21 \
        nvidia-cufft-cu12==11.3.3.83 \
        nvidia-cufile-cu12==1.13.1.3 \
        nvidia-curand-cu12==10.3.9.90 \
        nvidia-cusolver-cu12==11.7.3.90 \
        nvidia-cusparse-cu12==12.5.8.93 \
        nvidia-cusparselt-cu12==0.7.1 \
        nvidia-nvjitlink-cu12==12.8.93 \
        nvidia-nvtx-cu12==12.8.90 \
    && fix-permissions /opt/app-root -P
106+
107+
# Restore user workspace
108+
# Drop from root (UID 0, used for the install steps) to the unprivileged UID 1001.
USER 1001
109+
110+
# Final working directory of the image.
WORKDIR /opt/app-root/src

0 commit comments

Comments
 (0)