|
| 1 | +## Global Args (declared before FROM; consumed by the FROM line below) ## |
| 2 | +ARG IMAGE_TAG=9.6-1755735361 |
| 3 | +ARG PYTHON_VERSION=312 |
| 4 | + |
| 5 | +# Base image: UBI9 Python image; PYTHON_VERSION selects the repository and IMAGE_TAG pins the exact tag |
| 6 | +FROM registry.access.redhat.com/ubi9/python-${PYTHON_VERSION}:${IMAGE_TAG} |
| 7 | + |
| 8 | +LABEL name="training:py312-cuda128-torch280" \ |
| 9 | + summary="CUDA 12.8 Python 3.12 PyTorch 2.8.0 image based on UBI9 for Training" \ |
| 10 | + description="CUDA 12.8 Python 3.12 PyTorch 2.8.0 image based on UBI9 for Training" \ |
| 11 | + io.k8s.display-name="CUDA 12.8 Python 3.12 PyTorch 2.8.0 base image for Training" \ |
| 12 | + io.k8s.description="CUDA 12.8 Python 3.12 PyTorch 2.8.0 image based on UBI9 for Training" \ |
| 13 | + authoritative-source-url="https://github.com/opendatahub-io/distributed-workloads" |
| 14 | + |
| 15 | +# Copy the license file into the image as /licenses/cuda-license.md |
| 16 | +COPY LICENSE.md /licenses/cuda-license.md |
| 17 | + |
| 18 | +# Switch to root (UID 0) for the installation steps below and set the working directory in the container |
| 19 | +USER 0 |
| 20 | +WORKDIR /app |
| 21 | + |
| 22 | +# Upgrade the requests package to the pinned version 2.32.3 |
| 23 | +RUN pip install --no-cache-dir --upgrade requests==2.32.3 |
| 24 | + |
| 25 | +# Install CUDA; the following steps run from /opt/app-root/bin |
| 26 | +WORKDIR /opt/app-root/bin |
| 27 | + |
| 28 | +# CUDA version pins (the NV_CUDA_* values select the RPM versions installed below) and NVIDIA_* runtime variables. Ref: https://docs.nvidia.com/cuda/archive/12.8.0/cuda-toolkit-release-notes/ |
| 29 | +ENV CUDA_VERSION=12.8.0 \ |
| 30 | + NVIDIA_REQUIRE_CUDA="cuda>=12.8 brand=tesla,driver>=470,driver<471 brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471 brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471 brand=tesla,driver>=525,driver<526 brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526 brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526 brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526 brand=tesla,driver>=570,driver<571 brand=unknown,driver>=570,driver<571 brand=nvidia,driver>=570,driver<571 brand=nvidiartx,driver>=570,driver<571 brand=geforce,driver>=570,driver<571 brand=geforcertx,driver>=570,driver<571 brand=quadro,driver>=570,driver<571 brand=quadrortx,driver>=570,driver<571 brand=titan,driver>=570,driver<571 brand=titanrtx,driver>=570,driver<571" \ |
| 31 | + NV_CUDA_LIB_VERSION=12.8.0-1 \ |
| 32 | + NVIDIA_VISIBLE_DEVICES=all \ |
| 33 | + NVIDIA_DRIVER_CAPABILITIES=compute,utility \ |
| 34 | + NV_CUDA_CUDART_VERSION=12.8.57-1 \ |
| 35 | + NV_CUDA_COMPAT_VERSION=3:570.172.08-1.el9 \ |
| 36 | + NV_CUDA_NVCC_VERSION=12.8.61-1 |
| 37 | + |
| 38 | +# Ref: https://gitlab.com/nvidia/container-images/cuda/-/blob/master/dist/12.8.0/ubi9/base/Dockerfile |
| 39 | +# Adds the NVIDIA CUDA repo, installs cudart, cuda-compat and nvcc at the versions pinned in the ENV above, and lists the NVIDIA library directories in /etc/ld.so.conf.d/nvidia.conf; nvcc is required for Flash Attention |
| 40 | +RUN dnf config-manager \ |
| 41 | + --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo \ |
| 42 | + && dnf install -y \ |
| 43 | + --disablerepo=rhel-9-for-x86_64-baseos-rpms \ |
| 44 | + --disablerepo=rhel-9-for-x86_64-appstream-rpms \ |
| 45 | + cuda-cudart-12-8-${NV_CUDA_CUDART_VERSION} \ |
| 46 | + cuda-compat-12-8-${NV_CUDA_COMPAT_VERSION} \ |
| 47 | + cuda-nvcc-12-8-${NV_CUDA_NVCC_VERSION} \ |
| 48 | + && echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf \ |
| 49 | + && echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf \ |
| 50 | + && dnf clean all |
| 51 | +# CUDA_HOME is set in its own ENV: ${VAR} references inside an ENV instruction are resolved from values defined before that instruction, so PATH and LD_LIBRARY_PATH must come after it |
| 52 | +ENV CUDA_HOME="/usr/local/cuda" |
| 53 | +ENV PATH="/usr/local/nvidia/bin:${CUDA_HOME}/bin:${PATH}" \ |
| 54 | + LD_LIBRARY_PATH="/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$CUDA_HOME/lib64:$CUDA_HOME/extras/CUPTI/lib64:$LD_LIBRARY_PATH" |
| 55 | + |
| 56 | +# Install InfiniBand and RDMA packages; the repo URL is pinned to the MLNX_OFED release whose repo id (mlnx_ofed_24.10-1.1.4.0_base) is enabled below, instead of tracking "latest" |
| 57 | +RUN dnf config-manager \ |
| 58 | + --add-repo https://linux.mellanox.com/public/repo/mlnx_ofed/24.10-1.1.4.0/rhel9.5/mellanox_mlnx_ofed.repo |
| 59 | + |
| 60 | +RUN dnf install -y --disablerepo="*" --enablerepo="cuda-rhel9-x86_64,mlnx_ofed_24.10-1.1.4.0_base,ubi-9-appstream-rpms,ubi-9-baseos-rpms" \ |
| 61 | + libibverbs-utils \ |
| 62 | + infiniband-diags \ |
| 63 | + libibumad3 \ |
| 64 | + librdmacm \ |
| 65 | + librdmacm-utils \ |
| 66 | + rdma-core \ |
| 67 | + mlnx-tools \ |
| 68 | + && dnf clean all \ |
| 69 | + && rm -rf /var/cache/dnf/* |
| 70 | + |
| 71 | +# Install Python packages |
| 72 | + |
| 73 | +# Install micropipenv (with its toml extra) to deploy packages from Pipfile.lock |
| 74 | +RUN pip install --no-cache-dir -U "micropipenv[toml]" |
| 75 | + |
| 76 | +# Copy Pipfile.lock into the current working directory; it is removed again once the install below finishes |
| 77 | +COPY Pipfile.lock ./ |
| 78 | + |
| 79 | +RUN micropipenv install -- --no-cache-dir && \ |
| 80 | + rm -f ./Pipfile.lock && \ |
| 81 | + # Make site-packages group-writable and run fix-permissions to support pip in OpenShift environments \ |
| 82 | + chmod -R g+w /opt/app-root/lib/python3.12/site-packages && \ |
| 83 | + fix-permissions /opt/app-root -P |
| 84 | + |
| 85 | +# Install Flash Attention; --no-build-isolation builds against the packages already installed, so wheel is installed first (both without keeping the pip cache) |
| 86 | +RUN pip install --no-cache-dir wheel \ |
| 87 | + && pip install --no-cache-dir flash-attn==2.8.3 --no-build-isolation |
| 88 | + |
| 89 | +# Upgrade NCCL to a more recent version and add Training Hub NVIDIA dependencies (pinned; --no-cache-dir keeps the pip download cache out of the layer) |
| 90 | +RUN pip install --no-cache-dir \ |
| 91 | + nvidia-nccl-cu12==2.27.3 \ |
| 92 | + nvidia-cublas-cu12==12.8.4.1 \ |
| 93 | + nvidia-cuda-cupti-cu12==12.8.90 \ |
| 94 | + nvidia-cuda-nvrtc-cu12==12.8.93 \ |
| 95 | + nvidia-cuda-runtime-cu12==12.8.90 \ |
| 96 | + nvidia-cudnn-cu12==9.10.2.21 \ |
| 97 | + nvidia-cufft-cu12==11.3.3.83 \ |
| 98 | + nvidia-cufile-cu12==1.13.1.3 \ |
| 99 | + nvidia-curand-cu12==10.3.9.90 \ |
| 100 | + nvidia-cusolver-cu12==11.7.3.90 \ |
| 101 | + nvidia-cusparse-cu12==12.5.8.93 \ |
| 102 | + nvidia-cusparselt-cu12==0.7.1 \ |
| 103 | + nvidia-nvjitlink-cu12==12.8.93 \ |
| 104 | + nvidia-nvtx-cu12==12.8.90 \ |
| 105 | + && fix-permissions /opt/app-root -P |
| 106 | + |
| 107 | +# Restore user workspace: leave root, run as UID 1001 and start in /opt/app-root/src |
| 108 | +USER 1001 |
| 109 | + |
| 110 | +WORKDIR /opt/app-root/src |
0 commit comments