|
1 | | -## base docker image |
| 1 | +## Base docker image |
2 | 2 | ARG ROCM_IMAGE_NAME=rocm/dev-ubuntu-22.04 |
3 | | -ARG ROCM_IMAGE_TAG=6.3.2 |
| 3 | +ARG ROCM_IMAGE_TAG=7.0.2 |
4 | 4 | FROM "${ROCM_IMAGE_NAME}:${ROCM_IMAGE_TAG}" |
5 | 5 |
|
6 | | -## rccl repo |
7 | | -ARG RCCL_REPO=https://github.com/ROCm/rccl |
8 | | -ARG RCCL_BRANCH=develop |
| 6 | +## Re-declare to use in build stage (inherits value from above) |
| 7 | +ARG ROCM_IMAGE_TAG |
9 | 8 |
|
10 | | -## rccl-tests repo |
| 9 | +## RCCL repo |
| 10 | +ARG RCCL_VERSION=rocm-${ROCM_IMAGE_TAG} |
| 11 | + |
| 12 | +## RCCL tests repo |
11 | 13 | ARG RCCL_TESTS_REPO=https://github.com/ROCm/rccl-tests |
12 | 14 | ARG RCCL_TESTS_BRANCH=develop |
13 | 15 |
|
14 | | -## Mellanox OFED version |
15 | | -ARG MELLANOX_OFED_VERSION |
| 16 | +## AMD GPU Targets |
| 17 | +ARG GPU_TARGETS=gfx942 |
16 | 18 |
|
17 | | -## creating scratch space |
18 | | -RUN mkdir -p /workspace |
19 | | -WORKDIR /workspace |
| 19 | +ENV WORKDIR=/workspace |
| 20 | +RUN mkdir -p ${WORKDIR} |
| 21 | +WORKDIR ${WORKDIR} |
20 | 22 |
|
21 | | -## install dependencies |
22 | 23 | RUN apt-get update \ |
23 | 24 | && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ |
24 | 25 | ca-certificates \ |
25 | 26 | git \ |
26 | | - openssh-server \ |
27 | | - iputils-ping \ |
28 | | - net-tools \ |
29 | 27 | make \ |
30 | 28 | rocm-cmake \ |
31 | 29 | ninja-build \ |
@@ -60,65 +58,76 @@ RUN apt-get update \ |
60 | 58 | python3-dev \ |
61 | 59 | python3-tk \ |
62 | 60 | python3-yaml \ |
63 | | - wget \ |
| 61 | + vim \ |
| 62 | + less \ |
| 63 | + openssh-client \ |
| 64 | + openssh-server \ |
64 | 65 | && \ |
65 | 66 | apt-get clean && \ |
66 | 67 | rm -rf /var/lib/apt/lists/* |
67 | 68 |
|
68 | | -# Mellanox OFED |
69 | | -RUN wget -qO - https://www.mellanox.com/downloads/ofed/RPM-GPG-KEY-Mellanox | apt-key add - |
70 | | -RUN cd /etc/apt/sources.list.d/ && wget https://linux.mellanox.com/public/repo/mlnx_ofed/${MELLANOX_OFED_VERSION}/ubuntu22.04/mellanox_mlnx_ofed.list |
71 | | - |
72 | | -RUN apt-get -qq update \ |
73 | | - && apt-get -qq install -y --no-install-recommends \ |
74 | | - ibverbs-utils libibverbs-dev libibumad3 libibumad-dev librdmacm-dev rdmacm-utils infiniband-diags ibverbs-utils \ |
75 | | - && rm -rf /var/lib/apt/lists/* |
76 | | - |
77 | 69 | RUN wget https://github.com/Kitware/CMake/releases/download/v3.28.0/cmake-3.28.0-linux-x86_64.sh \ |
78 | 70 | && chmod +x cmake-3.28.0-linux-x86_64.sh \ |
79 | 71 | && bash ./cmake-3.28.0-linux-x86_64.sh --prefix=/usr --exclude-subdir --skip-license \ |
80 | 72 | && rm cmake-3.28.0-linux-x86_64.sh |
81 | 73 |
|
| 74 | +# Mellanox OFED: MELLANOX_OFED_VERSION (settable via --build-arg) selects the repo release; unset or empty falls back to "latest" |
| 75 | +ARG MELLANOX_OFED_VERSION=latest |
| 76 | +RUN wget -qO - https://www.mellanox.com/downloads/ofed/RPM-GPG-KEY-Mellanox | apt-key add - |
| 77 | +RUN wget -P /etc/apt/sources.list.d/ https://linux.mellanox.com/public/repo/mlnx_ofed/${MELLANOX_OFED_VERSION:-latest}/ubuntu22.04/mellanox_mlnx_ofed.list |
| 78 | +RUN apt-get -qq update \ |
| 79 | + && apt-get -qq install -y --no-install-recommends ibverbs-utils infiniband-diags libibumad-dev libibumad3 libibverbs-dev librdmacm-dev rdmacm-utils \ |
| 80 | + && rm -rf /var/lib/apt/lists/* |
| 81 | + |
| 82 | +## Set ROCm path |
| 83 | +ENV ROCM_PATH=/opt/rocm |
| 84 | + |
82 | 85 | ## Install UCX |
83 | 86 | ENV UCX_INSTALL_PREFIX=/opt/ucx |
84 | | -RUN wget https://github.com/openucx/ucx/releases/download/v1.16.0/ucx-1.16.0.tar.gz \ |
| 87 | +RUN wget https://github.com/openucx/ucx/releases/download/v1.19.0/ucx-1.19.0.tar.gz \ |
85 | 88 | && mkdir -p ucx \ |
86 | | - && tar -zxf ucx-1.16.0.tar.gz -C ucx --strip-components=1 \ |
| 89 | + && tar -zxf ucx-1.19.0.tar.gz -C ucx --strip-components=1 \ |
87 | 90 | && cd ucx \ |
88 | 91 | && mkdir build \ |
89 | 92 | && cd build \ |
90 | | - && ../configure --prefix=${UCX_INSTALL_PREFIX} --with-rocm=/opt/rocm \ |
| 93 | + && ../configure --prefix=${UCX_INSTALL_PREFIX} --with-rocm=${ROCM_PATH} \ |
91 | 94 | && make -j$(nproc) install \ |
92 | 95 | && cd ../.. \ |
93 | | - && rm -rf ucx ucx-1.16.0.tar.gz |
| 96 | + && rm -rf ucx ucx-1.19.0.tar.gz |
94 | 97 |
|
95 | 98 | ## Install OpenMPI |
96 | 99 | ENV MPI_INSTALL_PREFIX=/opt/ompi |
97 | | -RUN wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.6.tar.gz \ |
| 100 | +RUN wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.8.tar.gz \ |
98 | 101 | && mkdir -p ompi4 \ |
99 | | - && tar -zxf openmpi-4.1.6.tar.gz -C ompi4 --strip-components=1 \ |
| 102 | + && tar -zxf openmpi-4.1.8.tar.gz -C ompi4 --strip-components=1 \ |
100 | 103 | && cd ompi4 \ |
101 | 104 | && mkdir build \ |
102 | 105 | && cd build \ |
103 | 106 | && ../configure --prefix=${MPI_INSTALL_PREFIX} --with-ucx=${UCX_INSTALL_PREFIX} --disable-oshmem --disable-mpi-fortran --enable-orterun-prefix-by-default \ |
104 | 107 | && make -j$(nproc) install \ |
105 | 108 | && cd ../.. \ |
106 | | - && rm -rf ompi4 openmpi-4.1.6.tar.gz |
| 109 | + && rm -rf ompi4 openmpi-4.1.8.tar.gz |
107 | 110 |
|
108 | | -## building RCCL |
109 | | -ENV RCCL_INSTALL_PREFIX=/opt/rocm |
110 | | -RUN git clone --recurse-submodules -b "${RCCL_BRANCH}" "${RCCL_REPO}" ./rccl \ |
| 111 | +## Build RCCL |
| 112 | +ENV RCCL_INSTALL_PREFIX=${WORKDIR}/rccl/install |
| 113 | +RUN git clone --recurse-submodules -b ${RCCL_VERSION} --depth 1 https://github.com/ROCm/rccl \ |
111 | 114 | && cd ./rccl \ |
112 | | - && ./install.sh -t -j$(nproc) --amdgpu_targets="gfx942" --prefix=${RCCL_INSTALL_PREFIX} |
| 115 | + && ./install.sh --amdgpu_targets=${GPU_TARGETS} --prefix=${RCCL_INSTALL_PREFIX} |
113 | 116 |
|
114 | | -## building RCCL-Tests |
| 117 | +## Build RCCL-Tests |
115 | 118 | RUN git clone -b "${RCCL_TESTS_BRANCH}" "${RCCL_TESTS_REPO}" ./rccl-tests \ |
116 | 119 | && cd ./rccl-tests \ |
117 | | - && make MPI=1 MPI_HOME=${MPI_INSTALL_PREFIX} NCCL_HOME=${RCCL_INSTALL_PREFIX} -j$(nproc) |
| 120 | + && mkdir build \ |
| 121 | + && cd build \ |
| 122 | + && cmake -DCMAKE_BUILD_TYPE=Release -DUSE_MPI=ON -DCMAKE_PREFIX_PATH="${RCCL_INSTALL_PREFIX};${MPI_INSTALL_PREFIX}" -DGPU_TARGETS=${GPU_TARGETS} .. \ |
| 123 | + && make -j$(nproc) |
118 | 124 |
|
119 | | -## set environment variables |
120 | | -ENV PATH="${RCCL_INSTALL_PREFIX}/bin:${MPI_INSTALL_PREFIX}/bin:${PATH}" |
121 | | -ENV LD_LIBRARY_PATH="${RCCL_INSTALL_PREFIX}/lib:${MPI_INSTALL_PREFIX}/lib:${LD_LIBRARY_PATH}" |
| 125 | +## Set environment variables: Open MPI and ROCm bin directories are prepended to PATH; the loader path covers RCCL, Open MPI and ROCm. NOTE(review): LD_LIBRARY_PATH lists ${RCCL_INSTALL_PREFIX} itself, not a lib/ subdirectory -- confirm where the RCCL build places its shared library |
| 126 | +ENV PATH="${MPI_INSTALL_PREFIX}/bin:${ROCM_PATH}/bin:${PATH}" |
| 127 | +ENV LD_LIBRARY_PATH="${RCCL_INSTALL_PREFIX}:${MPI_INSTALL_PREFIX}/lib:${ROCM_PATH}/lib" |
| 128 | +ENV UCX_WARN_UNUSED_ENV_VARS=n |
| 129 | +ENV OMPI_ALLOW_RUN_AS_ROOT=1 |
| 130 | +ENV OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 |
122 | 131 |
|
123 | 132 | # Configure SSH |
124 | 133 | RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ |
|
0 commit comments