Skip to content

Commit a4e6d29

Browse files
[UX] Pre-build an EFA version of the default Docker image #2793
1 parent 5b0539f commit a4e6d29

File tree

5 files changed

+57
-33
lines changed

5 files changed

+57
-33
lines changed

.github/workflows/docker.yml

Lines changed: 5 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,8 @@ jobs:
5252
strategy:
5353
matrix:
5454
python: ["3.9", "3.10", "3.11", "3.12", "3.13"]
55-
flavor: ["base", "devel", "efa"]
55+
version: ["nvidia", "efa"]
56+
flavor: ["base", "devel"]
5657
steps:
5758
- name: Checkout repository
5859
uses: actions/checkout@v4
@@ -67,24 +68,14 @@ jobs:
6768
uses: docker/setup-qemu-action@v3
6869
- name: Build and upload to DockerHub
6970
run: |
70-
if [ "${{ matrix.flavor }}" = "base" ]; then
71-
TAG_SUFFIX=""
72-
FILE="base/Dockerfile"
73-
elif [ "${{ matrix.flavor }}" = "efa" ]; then
74-
TAG_SUFFIX="-efa"
75-
FILE="base/efa/Dockerfile"
76-
else
77-
TAG_SUFFIX="-devel"
78-
FILE="base/devel/Dockerfile"
79-
fi
8071
docker buildx build \
8172
--platform linux/amd64 \
82-
--tag dstackai/${{ env.BUILD_DOCKER_REPO }}:py${{ matrix.python }}-${{ inputs.image_version }}-cuda-12.1${TAG_SUFFIX} \
83-
--build-arg FLAVOR=$FLAVOR \
73+
--tag dstackai/${{ env.BUILD_DOCKER_REPO }}:py${{ matrix.python }}-${{ inputs.image_version }}-${{ matrix.version }}-${{ matrix.flavor }} \
74+
--build-arg FLAVOR=${{ matrix.flavor }} \
8475
--build-arg PYTHON=${{ matrix.python }} \
8576
--provenance=false \
8677
--push \
87-
-f $FILE .
78+
-f default/${{ matrix.version }}/Dockerfile .
8879
8980
build-aws-images:
9081
needs: build-docker
File renamed without changes.
Lines changed: 27 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,12 @@
33
INCLUDE+ base/Dockerfile
44

55
ENV NCCL_HOME=/usr/local
6-
ENV CUDA_PATH=/usr/local/cuda
6+
ENV CUDA_HOME=/usr/local/cuda
77
ENV LIBFABRIC_PATH=/opt/amazon/efa
8-
ENV OPEN_MPI_PATH=/opt/amazon/openmpi
9-
ENV NCCL_TESTS_PATH=/opt/nccl-tests
10-
ENV PATH="${LIBFABRIC_PATH}/bin:${OPEN_MPI_PATH}/bin:${NCCL_TESTS_PATH}:${PATH}"
11-
ENV LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${NCCL_HOME}/lib:${LD_LIBRARY_PATH}"
8+
ENV MPI_HOME=/opt/amazon/openmpi
9+
ENV NCCL_TESTS_HOME=/opt/nccl-tests
10+
ENV PATH="${LIBFABRIC_PATH}/bin:${MPI_HOME}/bin:${NCCL_TESTS_HOME}/build:${PATH}"
11+
ENV LD_LIBRARY_PATH="${MPI_HOME}/lib:${NCCL_HOME}/lib:${LD_LIBRARY_PATH}"
1212

1313
# Prerequisites
1414

@@ -49,10 +49,9 @@ RUN cd $HOME \
4949
&& cd aws-ofi-nccl \
5050
&& ./autogen.sh \
5151
&& ./configure \
52-
--with-cuda=${CUDA_PATH} \
52+
--with-cuda=${CUDA_HOME} \
5353
--with-libfabric=${LIBFABRIC_PATH} \
54-
--with-mpi=${OPEN_MPI_PATH} \
55-
--with-cuda=${CUDA_PATH} \
54+
--with-mpi=${MPI_HOME} \
5655
--with-nccl=${NCCL_HOME} \
5756
--disable-tests \
5857
--prefix=${NCCL_HOME} \
@@ -61,13 +60,29 @@ RUN cd $HOME \
6160

6261
# NCCL Tests
6362

64-
RUN git clone https://github.com/NVIDIA/nccl-tests ${NCCL_TESTS_PATH} \
65-
&& cd ${NCCL_TESTS_PATH} \
63+
RUN git clone https://github.com/NVIDIA/nccl-tests ${NCCL_TESTS_HOME} \
64+
&& cd ${NCCL_TESTS_HOME} \
6665
&& make -j$(nproc) \
6766
MPI=1 \
68-
MPI_HOME=${OPEN_MPI_PATH} \
69-
CUDA_HOME=${CUDA_PATH} \
67+
MPI_HOME=${MPI_HOME} \
68+
CUDA_HOME=${CUDA_HOME} \
7069
NCCL_HOME=${NCCL_HOME} \
7170
&& echo "${NCCL_HOME}/lib" >> /etc/ld.so.conf.d/nccl.conf \
7271
&& ldconfig
7372

73+
ARG FLAVOR
74+
ENV FLAVOR=${FLAVOR}
75+
76+
# If FLAVOR is base, uninstall development packages to reduce image size
77+
RUN if [ "$FLAVOR" = "base" ]; then \
78+
cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \
79+
&& apt-get remove -y \
80+
cuda-nvcc-${cuda_version} \
81+
libhwloc-dev \
82+
autoconf \
83+
automake \
84+
libtool \
85+
&& apt-get autoremove -y \
86+
&& apt-get clean \
87+
&& rm -rf /var/lib/apt/lists/*; \
88+
fi
Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,16 @@
11
# syntax = edrevo/dockerfile-plus
22

3-
INCLUDE+ base/Dockerfile
3+
INCLUDE+ default/Dockerfile.common
44

55
# NCCL & NCCL tests
66

77
ARG NCCL_VERSION=2.26.2-1
88

99
ENV NCCL_HOME=/usr/local
10-
ENV CUDA_PATH=/usr/local/cuda
10+
ENV CUDA_HOME=/usr/local/cuda
1111
ENV OPEN_MPI_PATH=/usr/lib/x86_64-linux-gnu/openmpi
12-
ENV NCCL_TESTS_PATH=/opt/nccl-tests
13-
ENV PATH="${LIBFABRIC_PATH}/bin:${OPEN_MPI_PATH}/bin:${NCCL_TESTS_PATH}:${PATH}"
12+
ENV NCCL_TESTS_HOME=/opt/nccl-tests
13+
ENV PATH="${NCCL_TESTS_HOME}/build:${PATH}"
1414
ENV LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${NCCL_HOME}/lib:${LD_LIBRARY_PATH}"
1515

1616
RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \
@@ -27,12 +27,30 @@ RUN cd $HOME \
2727
&& git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \
2828
&& cd nccl \
2929
&& make -j$(nproc) src.build BUILDDIR=${NCCL_HOME} \
30-
&& git clone https://github.com/NVIDIA/nccl-tests ${NCCL_TESTS_PATH} \
31-
&& cd ${NCCL_TESTS_PATH} \
30+
&& git clone https://github.com/NVIDIA/nccl-tests ${NCCL_TESTS_HOME} \
31+
&& cd ${NCCL_TESTS_HOME} \
3232
&& make -j$(nproc) \
3333
MPI=1 \
3434
MPI_HOME=${OPEN_MPI_PATH} \
35-
CUDA_HOME=${CUDA_PATH} \
35+
CUDA_HOME=${CUDA_HOME} \
3636
NCCL_HOME=${NCCL_HOME} \
3737
&& echo "${NCCL_HOME}/lib" >> /etc/ld.so.conf.d/nccl.conf \
3838
&& ldconfig
39+
40+
ARG FLAVOR
41+
ENV FLAVOR=${FLAVOR}
42+
43+
# If FLAVOR is base, uninstall development packages to reduce image size
44+
RUN if [ "$FLAVOR" = "base" ]; then \
45+
cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \
46+
&& apt-get remove -y \
47+
cuda-nvcc-${cuda_version} \
48+
libhwloc-dev \
49+
autoconf \
50+
automake \
51+
libtool \
52+
libopenmpi-dev \
53+
&& apt-get autoremove -y \
54+
&& apt-get clean \
55+
&& rm -rf /var/lib/apt/lists/*; \
56+
fi

0 commit comments

Comments
 (0)