Skip to content

Commit b102ddb

Browse files
[UX] Pre-build a EFA version of the default Docker image #2793
1 parent a4e6d29 commit b102ddb

File tree

5 files changed

+174
-1
lines changed

5 files changed

+174
-1
lines changed

.github/workflows/docker.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ jobs:
7575
--build-arg PYTHON=${{ matrix.python }} \
7676
--provenance=false \
7777
--push \
78-
-f default/${{ matrix.version }}/Dockerfile .
78+
-f base/${{ matrix.version }}/Dockerfile .
7979
8080
build-aws-images:
8181
needs: build-docker

docker/base/Dockerfile.common

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
FROM nvidia/cuda:12.1.1-base-ubuntu20.04
2+
3+
ARG PYTHON
4+
ARG _UV_HOME="/opt/uv"
5+
6+
ENV UV_PYTHON="${PYTHON}"
7+
ENV UV_INSTALL_DIR="${_UV_HOME}/bin"
8+
ENV UV_PYTHON_INSTALL_DIR="${_UV_HOME}/python"
9+
ENV UV_PYTHON_BIN_DIR="${UV_PYTHON_INSTALL_DIR}/bin"
10+
ENV UV_MANAGED_PYTHON=1
11+
ENV LANG=C.UTF-8 LC_ALL=C.UTF-8
12+
13+
ENV PATH="${UV_INSTALL_DIR}:${UV_PYTHON_BIN_DIR}:${PATH}"
14+
15+
RUN export DEBIAN_FRONTEND=noninteractive && \
16+
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \
17+
apt-get update --fix-missing && \
18+
apt-get upgrade -y && \
19+
ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime && \
20+
apt-get install -y tzdata && \
21+
dpkg-reconfigure --frontend noninteractive tzdata && \
22+
apt-get install -y bzip2 ca-certificates curl build-essential git libglib2.0-0 libsm6 libxext6 libxrender1 mercurial openssh-server subversion wget \
23+
libibverbs1 ibverbs-providers ibverbs-utils libibverbs-dev infiniband-diags && \
24+
sed -i "s/.*PasswordAuthentication.*/PasswordAuthentication no/g" /etc/ssh/sshd_config && mkdir /run/sshd && \
25+
mkdir ~/.ssh && chmod 700 ~/.ssh && touch ~/.ssh/authorized_keys && chmod 600 ~/.ssh/authorized_keys && rm /etc/ssh/ssh_host_*
26+
27+
RUN curl -LsSf https://astral.sh/uv/install.sh | INSTALLER_NO_MODIFY_PATH=1 sh && \
28+
uv python install --preview --default

docker/base/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Image for `dstack` runner instances.

docker/base/efa/Dockerfile

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
# syntax = edrevo/dockerfile-plus
2+
3+
INCLUDE+ base/Dockerfile.common
4+
5+
ENV NCCL_HOME=/usr/local
6+
ENV CUDA_HOME=/usr/local/cuda
7+
ENV LIBFABRIC_PATH=/opt/amazon/efa
8+
ENV MPI_HOME=/opt/amazon/openmpi
9+
ENV NCCL_TESTS_HOME=/opt/nccl-tests
10+
ENV PATH="${LIBFABRIC_PATH}/bin:${MPI_HOME}/bin:${NCCL_TESTS_HOME}/build:${PATH}"
11+
ENV LD_LIBRARY_PATH="${MPI_HOME}/lib:${NCCL_HOME}/lib:${LD_LIBRARY_PATH}"
12+
13+
# Prerequisites
14+
15+
RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \
16+
&& apt-get install -y --no-install-recommends \
17+
cuda-libraries-dev-${cuda_version} \
18+
cuda-nvcc-${cuda_version} \
19+
libhwloc-dev \
20+
autoconf \
21+
automake \
22+
libtool
23+
24+
# EFA
25+
26+
ARG EFA_VERSION=1.38.1
27+
28+
RUN cd $HOME \
29+
&& curl -O https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-${EFA_VERSION}.tar.gz \
30+
&& tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz \
31+
&& cd aws-efa-installer \
32+
&& ./efa_installer.sh -y --skip-kmod -g
33+
34+
# NCCL
35+
36+
ARG NCCL_VERSION=2.26.2-1
37+
38+
RUN cd $HOME \
39+
&& git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \
40+
&& cd nccl \
41+
&& make -j$(nproc) src.build BUILDDIR=${NCCL_HOME}
42+
43+
# AWS OFI NCCL
44+
45+
ARG OFI_VERSION=1.14.0
46+
47+
RUN cd $HOME \
48+
&& git clone https://github.com/aws/aws-ofi-nccl.git -b v${OFI_VERSION} \
49+
&& cd aws-ofi-nccl \
50+
&& ./autogen.sh \
51+
&& ./configure \
52+
--with-cuda=${CUDA_HOME} \
53+
--with-libfabric=${LIBFABRIC_PATH} \
54+
--with-mpi=${MPI_HOME} \
55+
--with-nccl=${NCCL_HOME} \
56+
--disable-tests \
57+
--prefix=${NCCL_HOME} \
58+
&& make -j$(nproc) \
59+
&& make install
60+
61+
# NCCL Tests
62+
63+
RUN git clone https://github.com/NVIDIA/nccl-tests ${NCCL_TESTS_HOME} \
64+
&& cd ${NCCL_TESTS_HOME} \
65+
&& make -j$(nproc) \
66+
MPI=1 \
67+
MPI_HOME=${MPI_HOME} \
68+
CUDA_HOME=${CUDA_HOME} \
69+
NCCL_HOME=${NCCL_HOME} \
70+
&& echo "${NCCL_HOME}/lib" >> /etc/ld.so.conf.d/nccl.conf \
71+
&& ldconfig
72+
73+
ARG FLAVOR
74+
ENV FLAVOR=${FLAVOR}
75+
76+
# If FLAVOR is base, uninstall development packages to reduce image size
77+
RUN if [ "$FLAVOR" = "base" ]; then \
78+
cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \
79+
&& apt-get remove -y \
80+
cuda-nvcc-${cuda_version} \
81+
libhwloc-dev \
82+
autoconf \
83+
automake \
84+
libtool \
85+
&& apt-get autoremove -y \
86+
&& apt-get clean \
87+
&& rm -rf /var/lib/apt/lists/*; \
88+
fi

docker/base/nvidia/Dockerfile

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
# syntax = edrevo/dockerfile-plus
2+
3+
INCLUDE+ base/Dockerfile.common
4+
5+
# NCCL & NCCL tests
6+
7+
ARG NCCL_VERSION=2.26.2-1
8+
9+
ENV NCCL_HOME=/usr/local
10+
ENV CUDA_HOME=/usr/local/cuda
11+
ENV OPEN_MPI_PATH=/usr/lib/x86_64-linux-gnu/openmpi
12+
ENV NCCL_TESTS_HOME=/opt/nccl-tests
13+
ENV PATH="${NCCL_TESTS_HOME}/build:${PATH}"
14+
ENV LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${NCCL_HOME}/lib:${LD_LIBRARY_PATH}"
15+
16+
RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \
17+
&& apt-get install -y --no-install-recommends \
18+
cuda-libraries-dev-${cuda_version} \
19+
cuda-nvcc-${cuda_version} \
20+
libhwloc-dev \
21+
autoconf \
22+
automake \
23+
libtool \
24+
libopenmpi-dev
25+
26+
RUN cd $HOME \
27+
&& git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \
28+
&& cd nccl \
29+
&& make -j$(nproc) src.build BUILDDIR=${NCCL_HOME} \
30+
&& git clone https://github.com/NVIDIA/nccl-tests ${NCCL_TESTS_HOME} \
31+
&& cd ${NCCL_TESTS_HOME} \
32+
&& make -j$(nproc) \
33+
MPI=1 \
34+
MPI_HOME=${OPEN_MPI_PATH} \
35+
CUDA_HOME=${CUDA_HOME} \
36+
NCCL_HOME=${NCCL_HOME} \
37+
&& echo "${NCCL_HOME}/lib" >> /etc/ld.so.conf.d/nccl.conf \
38+
&& ldconfig
39+
40+
ARG FLAVOR
41+
ENV FLAVOR=${FLAVOR}
42+
43+
# If FLAVOR is base, uninstall development packages to reduce image size
44+
RUN if [ "$FLAVOR" = "base" ]; then \
45+
cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \
46+
&& apt-get remove -y \
47+
cuda-nvcc-${cuda_version} \
48+
libhwloc-dev \
49+
autoconf \
50+
automake \
51+
libtool \
52+
libopenmpi-dev \
53+
&& apt-get autoremove -y \
54+
&& apt-get clean \
55+
&& rm -rf /var/lib/apt/lists/*; \
56+
fi

0 commit comments

Comments
 (0)