| 
1 |  | -## base docker image  | 
 | 1 | +## Base docker image  | 
2 | 2 | ARG ROCM_IMAGE_NAME=rocm/dev-ubuntu-22.04  | 
3 |  | -ARG ROCM_IMAGE_TAG=6.3.2  | 
 | 3 | +ARG ROCM_IMAGE_TAG=7.0.2  | 
4 | 4 | FROM "${ROCM_IMAGE_NAME}:${ROCM_IMAGE_TAG}"  | 
5 | 5 | 
 
  | 
6 |  | -## rccl repo  | 
7 |  | -ARG RCCL_REPO=https://github.com/ROCm/rccl  | 
8 |  | -ARG RCCL_BRANCH=develop  | 
 | 6 | +## Re-declare to use in build stage (inherits value from above)  | 
 | 7 | +ARG ROCM_IMAGE_TAG  | 
9 | 8 | 
 
  | 
10 |  | -## rccl-tests repo  | 
 | 9 | +## RCCL repo  | 
 | 10 | +ARG RCCL_VERSION=rocm-${ROCM_IMAGE_TAG}  | 
 | 11 | + | 
 | 12 | +## RCCL tests repo  | 
11 | 13 | ARG RCCL_TESTS_REPO=https://github.com/ROCm/rccl-tests  | 
12 | 14 | ARG RCCL_TESTS_BRANCH=develop  | 
13 | 15 | 
 
  | 
14 |  | -## Mellanox OFED version  | 
15 |  | -ARG MELLANOX_OFED_VERSION  | 
 | 16 | +## AMD GPU Targets  | 
 | 17 | +ARG GPU_TARGETS=gfx942  | 
16 | 18 | 
 
  | 
17 |  | -## creating scratch space  | 
18 |  | -RUN mkdir -p /workspace  | 
19 |  | -WORKDIR /workspace  | 
 | 19 | +ENV WORKDIR=/workspace  | 
 | 20 | +RUN mkdir -p ${WORKDIR}  | 
 | 21 | +WORKDIR ${WORKDIR}  | 
20 | 22 | 
 
  | 
21 |  | -## install dependencies  | 
22 | 23 | RUN apt-get update \  | 
23 | 24 |     && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \  | 
24 | 25 |     ca-certificates \  | 
25 | 26 |     git \  | 
26 |  | -    openssh-server \  | 
27 |  | -    iputils-ping \  | 
28 |  | -    net-tools \  | 
29 | 27 |     make \  | 
30 | 28 |     rocm-cmake \  | 
31 | 29 |     ninja-build \  | 
@@ -60,65 +58,76 @@ RUN apt-get update \  | 
60 | 58 |     python3-dev \  | 
61 | 59 |     python3-tk \  | 
62 | 60 |     python3-yaml \  | 
63 |  | -    wget \  | 
 | 61 | +    vim \  | 
 | 62 | +    less \  | 
 | 63 | +    openssh-client \  | 
 | 64 | +    openssh-server \  | 
64 | 65 |     && \  | 
65 | 66 |     apt-get clean && \  | 
66 | 67 |     rm -rf /var/lib/apt/lists/*  | 
67 | 68 | 
 
  | 
68 |  | -# Mellanox OFED  | 
69 |  | -RUN wget -qO - https://www.mellanox.com/downloads/ofed/RPM-GPG-KEY-Mellanox | apt-key add -  | 
70 |  | -RUN cd /etc/apt/sources.list.d/ && wget https://linux.mellanox.com/public/repo/mlnx_ofed/${MELLANOX_OFED_VERSION}/ubuntu22.04/mellanox_mlnx_ofed.list  | 
71 |  | - | 
72 |  | -RUN apt-get -qq update \  | 
73 |  | -    && apt-get -qq install -y --no-install-recommends \  | 
74 |  | -    ibverbs-utils libibverbs-dev libibumad3 libibumad-dev librdmacm-dev rdmacm-utils infiniband-diags ibverbs-utils \  | 
75 |  | -    && rm -rf /var/lib/apt/lists/*  | 
76 |  | - | 
77 | 69 | RUN wget https://github.com/Kitware/CMake/releases/download/v3.28.0/cmake-3.28.0-linux-x86_64.sh \  | 
78 | 70 |     && chmod +x cmake-3.28.0-linux-x86_64.sh \  | 
79 | 71 |     && bash ./cmake-3.28.0-linux-x86_64.sh --prefix=/usr --exclude-subdir --skip-license \  | 
80 | 72 |     && rm cmake-3.28.0-linux-x86_64.sh  | 
81 | 73 | 
 
  | 
 | 74 | +# Mellanox OFED  | 
 | 75 | +RUN wget -qO - https://www.mellanox.com/downloads/ofed/RPM-GPG-KEY-Mellanox | apt-key add -  | 
 | 76 | +    RUN cd /etc/apt/sources.list.d/ && wget https://linux.mellanox.com/public/repo/mlnx_ofed/${MELLANOX_OFED_VERSION:-latest}/ubuntu22.04/mellanox_mlnx_ofed.list  | 
 | 77 | +    RUN apt-get -qq update \  | 
 | 78 | +        && apt-get -qq install -y --no-install-recommends \  | 
 | 79 | +        ibverbs-utils libibverbs-dev libibumad3 libibumad-dev librdmacm-dev rdmacm-utils infiniband-diags ibverbs-utils \  | 
 | 80 | +        && rm -rf /var/lib/apt/lists/*  | 
 | 81 | +      | 
 | 82 | +## Set ROCm path  | 
 | 83 | +ENV ROCM_PATH=/opt/rocm  | 
 | 84 | + | 
82 | 85 | ## Install UCX  | 
83 | 86 | ENV UCX_INSTALL_PREFIX=/opt/ucx  | 
84 |  | -RUN wget https://github.com/openucx/ucx/releases/download/v1.16.0/ucx-1.16.0.tar.gz \  | 
 | 87 | +RUN wget https://github.com/openucx/ucx/releases/download/v1.19.0/ucx-1.19.0.tar.gz \  | 
85 | 88 |     && mkdir -p ucx \  | 
86 |  | -    && tar -zxf ucx-1.16.0.tar.gz -C ucx --strip-components=1 \  | 
 | 89 | +    && tar -zxf ucx-1.19.0.tar.gz -C ucx --strip-components=1 \  | 
87 | 90 |     && cd ucx \  | 
88 | 91 |     && mkdir build \  | 
89 | 92 |     && cd build \  | 
90 |  | -    && ../configure --prefix=${UCX_INSTALL_PREFIX} --with-rocm=/opt/rocm \  | 
 | 93 | +    && ../configure --prefix=${UCX_INSTALL_PREFIX} --with-rocm=${ROCM_PATH} \  | 
91 | 94 |     && make -j$(nproc) install \  | 
92 | 95 |     && cd ../.. \  | 
93 |  | -    && rm -rf ucx ucx-1.16.0.tar.gz  | 
 | 96 | +    && rm -rf ucx ucx-1.19.0.tar.gz  | 
94 | 97 | 
 
  | 
95 | 98 | ## Install OpenMPI  | 
96 | 99 | ENV MPI_INSTALL_PREFIX=/opt/ompi  | 
97 |  | -RUN wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.6.tar.gz \  | 
 | 100 | +RUN wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.8.tar.gz \  | 
98 | 101 |     && mkdir -p ompi4 \  | 
99 |  | -    && tar -zxf openmpi-4.1.6.tar.gz -C ompi4 --strip-components=1 \  | 
 | 102 | +    && tar -zxf openmpi-4.1.8.tar.gz -C ompi4 --strip-components=1 \  | 
100 | 103 |     && cd ompi4 \  | 
101 | 104 |     && mkdir build \  | 
102 | 105 |     && cd build \  | 
103 | 106 |     && ../configure --prefix=${MPI_INSTALL_PREFIX} --with-ucx=${UCX_INSTALL_PREFIX} --disable-oshmem --disable-mpi-fortran --enable-orterun-prefix-by-default \  | 
104 | 107 |     && make -j$(nproc) install \  | 
105 | 108 |     && cd ../.. \  | 
106 |  | -    && rm -rf ompi4 openmpi-4.1.6.tar.gz  | 
 | 109 | +    && rm -rf ompi4 openmpi-4.1.8.tar.gz  | 
107 | 110 | 
 
  | 
108 |  | -## building RCCL  | 
109 |  | -ENV RCCL_INSTALL_PREFIX=/opt/rocm  | 
110 |  | -RUN git clone --recurse-submodules -b "${RCCL_BRANCH}" "${RCCL_REPO}" ./rccl \  | 
 | 111 | +## Build RCCL  | 
 | 112 | +ENV RCCL_INSTALL_PREFIX=${WORKDIR}/rccl/install  | 
 | 113 | +RUN git clone --recurse-submodules -b ${RCCL_VERSION} --depth 1 https://github.com/ROCm/rccl \  | 
111 | 114 |     && cd ./rccl \  | 
112 |  | -    && ./install.sh -t -j$(nproc) --amdgpu_targets="gfx942" --prefix=${RCCL_INSTALL_PREFIX}  | 
 | 115 | +    && ./install.sh --amdgpu_targets=${GPU_TARGETS} --prefix=${RCCL_INSTALL_PREFIX}  | 
113 | 116 | 
 
  | 
114 |  | -## building RCCL-Tests  | 
 | 117 | +## Build RCCL-Tests  | 
115 | 118 | RUN git clone -b "${RCCL_TESTS_BRANCH}" "${RCCL_TESTS_REPO}" ./rccl-tests \  | 
116 | 119 |     && cd ./rccl-tests \  | 
117 |  | -    && make MPI=1 MPI_HOME=${MPI_INSTALL_PREFIX} NCCL_HOME=${RCCL_INSTALL_PREFIX} -j$(nproc)  | 
 | 120 | +    && mkdir build \  | 
 | 121 | +    && cd build \  | 
 | 122 | +    && cmake -DCMAKE_BUILD_TYPE=Release -DUSE_MPI=ON -DCMAKE_PREFIX_PATH="${RCCL_INSTALL_PREFIX};${MPI_INSTALL_PREFIX}" -DGPU_TARGETS=${GPU_TARGETS} .. \  | 
 | 123 | +    && make -j$(nproc)  | 
118 | 124 | 
 
  | 
119 |  | -## set environment variables  | 
120 |  | -ENV PATH="${RCCL_INSTALL_PREFIX}/bin:${MPI_INSTALL_PREFIX}/bin:${PATH}"  | 
121 |  | -ENV LD_LIBRARY_PATH="${RCCL_INSTALL_PREFIX}/lib:${MPI_INSTALL_PREFIX}/lib:${LD_LIBRARY_PATH}"  | 
 | 125 | +## Set environment variables  | 
 | 126 | +ENV PATH="${MPI_INSTALL_PREFIX}/bin:${ROCM_PATH}/bin:${PATH}"  | 
 | 127 | +ENV LD_LIBRARY_PATH="${RCCL_INSTALL_PREFIX}:${MPI_INSTALL_PREFIX}/lib:${ROCM_PATH}/lib"  | 
 | 128 | +ENV UCX_WARN_UNUSED_ENV_VARS=n  | 
 | 129 | +ENV OMPI_ALLOW_RUN_AS_ROOT=1  | 
 | 130 | +ENV OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1  | 
122 | 131 | 
 
  | 
123 | 132 | # Configure SSH  | 
124 | 133 | RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \  | 
 | 
0 commit comments