Patch summary (free text before the first "diff --git" header; not part of any hunk):
  * publish_devel_image.yml: add "12.8" to the cuda build matrix.
  * Dockerfile.devel: move the CUDA/cuDNN/NCCL install step after the rust install and drop the jemalloc source build.
  * install_cuda.sh: bump the default CUDNN_VERSION to 9.5.1.17, pin 9.1.0.70 inside install_118 and install_124, move install_126 to CUDA 12.6.3 with cuSparseLt 0.6.3, add install_cusparselt_063 and install_128 (cuDNN 9.7.0.66), and dispatch the new 12.8 argument.
Review fix: the per-function CUDNN_VERSION overrides are declared `local`. A plain assignment would overwrite the script-level default, so an installer that relies on the default (install_126) would pick up the wrong cuDNN when the argument loop handles more than one CUDA version in a single invocation.
NOTE(review): the 12.8 case calls install_128 with no prune step, unlike the 12.6 case (install_126; prune_126) - confirm this is intended.

diff --git a/.github/workflows/publish_devel_image.yml b/.github/workflows/publish_devel_image.yml index 6306893e..5edb61f9 100644 --- a/.github/workflows/publish_devel_image.yml +++ b/.github/workflows/publish_devel_image.yml @@ -21,7 +21,7 @@ jobs: strategy: fail-fast: false matrix: - cuda: ["12.1", "12.4", "12.6"] + cuda: ["12.1", "12.4", "12.6", "12.8"] gcc: ["12"] include: # build cuda 11.8 with gcc 11 - cuda: "11.8" diff --git a/docker/Dockerfile.devel b/docker/Dockerfile.devel index b9505cbb..97c2c121 100644 --- a/docker/Dockerfile.devel +++ b/docker/Dockerfile.devel @@ -40,14 +40,6 @@ RUN if [ -n "${CCACHE_VERSION}" ]; then bash ./install_ccache.sh; fi RUN rm install_ccache.sh RUN ccache --version -# Install cuda, cudnn and nccl -ARG CUDA_VERSION=12.1 -COPY ./common/install_cuda.sh install_cuda.sh -RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh -ENV DESIRED_CUDA=${CUDA_VERSION} -ENV PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH -RUN nvcc --version - # Install rust ENV RUSTUP_HOME=/usr/local/rustup ENV CARGO_HOME=/usr/local/cargo @@ -57,13 +49,12 @@ RUN curl https://sh.rustup.rs -sSf | sh -s -- -y RUN chmod -R a+w ${RUSTUP_HOME} ${CARGO_HOME} RUN rustup --version; cargo --version; rustc --version -# Install jemalloc (optional) -RUN cd /tmp && \ - wget -nc --no-check-certificate https://github.com/jemalloc/jemalloc/releases/download/5.3.0/jemalloc-5.3.0.tar.bz2 && \ - tar -xvf jemalloc-5.3.0.tar.bz2 && \ - (cd jemalloc-5.3.0 && \ - ./configure --enable-prof --disable-initial-exec-tls && \ - make -j$(nproc) && make install && \ - ldconfig) +# Install cuda, cudnn and nccl +ARG CUDA_VERSION=12.1 +COPY ./common/install_cuda.sh install_cuda.sh +RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh +ENV DESIRED_CUDA=${CUDA_VERSION} +ENV PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH +RUN nvcc --version CMD ["bash"] \ No newline at end of file diff --git a/docker/common/install_cuda.sh b/docker/common/install_cuda.sh index 
bea67665..d86951bd 100755 --- a/docker/common/install_cuda.sh +++ b/docker/common/install_cuda.sh @@ -5,7 +5,7 @@ set -ex NCCL_VERSION=v2.21.5-1 -CUDNN_VERSION=9.1.0.70 +CUDNN_VERSION=9.5.1.17 function install_cusparselt_040 { # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html @@ -40,7 +40,19 @@ function install_cusparselt_062 { rm -rf tmp_cusparselt } +function install_cusparselt_063 { + # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html + mkdir tmp_cusparselt && pushd tmp_cusparselt + wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/libcusparse_lt-linux-x86_64-0.6.3.2-archive.tar.xz + tar xf libcusparse_lt-linux-x86_64-0.6.3.2-archive.tar.xz + cp -a libcusparse_lt-linux-x86_64-0.6.3.2-archive/include/* /usr/local/cuda/include/ + cp -a libcusparse_lt-linux-x86_64-0.6.3.2-archive/lib/* /usr/local/cuda/lib64/ + popd + rm -rf tmp_cusparselt +} + function install_118 { + local CUDNN_VERSION=9.1.0.70 echo "Installing CUDA 11.8 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.4.0" rm -rf /usr/local/cuda-11.8 /usr/local/cuda # install CUDA 11.8.0 in the same container @@ -107,6 +119,7 @@ function install_121 { } function install_124 { + local CUDNN_VERSION=9.1.0.70 echo "Installing CUDA 12.4.1 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.2" rm -rf /usr/local/cuda-12.4 /usr/local/cuda # install CUDA 12.4.1 in the same container @@ -140,13 +153,13 @@ function install_124 { } function install_126 { - echo "Installing CUDA 12.6.2 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.2" + echo "Installing CUDA 12.6.3 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.3" rm -rf /usr/local/cuda-12.6 /usr/local/cuda - # install CUDA 12.6.2 in the same container - wget -q https://developer.download.nvidia.com/compute/cuda/12.6.2/local_installers/cuda_12.6.2_560.35.03_linux.run - chmod +x 
cuda_12.6.2_560.35.03_linux.run - ./cuda_12.6.2_560.35.03_linux.run --toolkit --silent - rm -f cuda_12.6.2_560.35.03_linux.run + # install CUDA 12.6.3 in the same container + wget -q https://developer.download.nvidia.com/compute/cuda/12.6.3/local_installers/cuda_12.6.3_560.35.05_linux.run + chmod +x cuda_12.6.3_560.35.05_linux.run + ./cuda_12.6.3_560.35.05_linux.run --toolkit --silent + rm -f cuda_12.6.3_560.35.05_linux.run rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.6 /usr/local/cuda # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement @@ -167,7 +180,7 @@ function install_126 { cd .. rm -rf nccl - install_cusparselt_062 + install_cusparselt_063 ldconfig } @@ -302,6 +315,40 @@ function prune_126 { rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.3.2 $CUDA_BASE/nsight-systems-2024.5.1/ } +function install_128 { + local CUDNN_VERSION=9.7.0.66 + echo "Installing CUDA 12.8.0 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.3" + rm -rf /usr/local/cuda-12.8 /usr/local/cuda + # install CUDA 12.8.0 in the same container + wget -q https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_570.86.10_linux.run + chmod +x cuda_12.8.0_570.86.10_linux.run + ./cuda_12.8.0_570.86.10_linux.run --toolkit --silent + rm -f cuda_12.8.0_570.86.10_linux.run + rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.8 /usr/local/cuda + + # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement + mkdir tmp_cudnn && cd tmp_cudnn + wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz + tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz + cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/ + cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/ + cd .. 
+ rm -rf tmp_cudnn + + # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses + # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build + git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git + cd nccl && make -j src.build + cp -a build/include/* /usr/local/cuda/include/ + cp -a build/lib/* /usr/local/cuda/lib64/ + cd .. + rm -rf nccl + + install_cusparselt_063 + + ldconfig +} + # idiomatic parameter and option handling in sh while test $# -gt 0 do @@ -314,6 +361,8 @@ do ;; 12.6) install_126; prune_126 ;; + 12.8) install_128; + ;; *) echo "bad argument $1"; exit 1 ;; esac