FROM ubuntu:22.04

# Build and publish this image with:
#   docker build -t ghcr.io/converged-computing/flux-tutorials:azurehpc-2204 .
#   docker push ghcr.io/converged-computing/flux-tutorials:azurehpc-2204

WORKDIR /opt

# Base tooling used by the download and install steps that follow.
# DEBIAN_FRONTEND is passed inline here because the image-wide ENV is only
# declared further down the file; without it a package that asks a debconf
# question could stall the build at this layer.
RUN apt-get update \
 && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        curl \
        debian-archive-keyring \
        git \
        gpg \
        munge \
        pkg-config \
        systemctl \
        ubuntu-keyring \
        unzip \
        vim \
        wget \
 && apt-get clean
# Install the ORAS CLI into /usr/local/bin (it is used later in this file to
# pull the OFED tarball). The release tarball is unpacked into a scratch
# directory that is removed again in the same layer.
RUN ORAS_VERSION="1.2.2" \
 && ORAS_TARBALL="oras_${ORAS_VERSION}_linux_amd64.tar.gz" \
 && curl -LO "https://github.com/oras-project/oras/releases/download/v${ORAS_VERSION}/${ORAS_TARBALL}" \
 && mkdir -p oras-install/ \
 && tar -zxf "${ORAS_TARBALL}" -C oras-install/ \
 && mv oras-install/oras /usr/local/bin/ \
 && rm -rf "${ORAS_TARBALL}" oras-install/

# Keep apt/debconf non-interactive for this layer and every later one.
# Declared before the install below so that install cannot stop at a prompt.
ENV DEBIAN_FRONTEND=noninteractive

# Azure hpc-images deps added here - not clear if all of these are needed.
# libvulkan1 is for the NVIDIA GPU driver.
RUN apt-get update \
 && apt-get install -y \
        bison \
        byacc \
        dkms \
        hwloc \
        libkeyutils-dev \
        libkeyutils1 \
        libmount-dev \
        libmpc-dev \
        libmpfr-dev \
        libnl-3-200 \
        libnl-3-dev \
        libnl-genl-3-200 \
        libnl-genl-3-dev \
        libnl-route-3-200 \
        libnl-route-3-dev \
        libnuma-dev \
        libreadline-dev \
        libsecret-1-0 \
        libvulkan1 \
        libxml2-dev \
        libyaml-dev \
        m4 \
        nfs-common \
        numactl \
        nvme-cli \
        pssh \
        rpm \
        selinux-policy-dev \
 && apt-get clean

# Pull the MLNX OFED tarball with oras, unpack it under /opt and install a
# hand-picked set of its .deb packages. The dpkg order is kept exactly as it
# was: dpkg -i does not resolve dependencies, so the order may matter.
# OSU Benchmarks in hpcx-v2.19-gcc-mlnx_ofed-ubuntu22.04-cuda12-x86_64/ompi/tests/
RUN ofed="MLNX_OFED_LINUX-24.04-0.7.0.0-ubuntu22.04-x86_64" \
 && oras pull ghcr.io/converged-computing/rdma-infiniband:ubuntu-22.04-tgz --output /opt \
 && cd /opt \
 && tar -xzvf "${ofed}.tgz" \
 && cd "${ofed}/DEBS/" \
 && dpkg -i mpitests_3.2.23-45a045b.2404066_amd64.deb \
 && dpkg -i libibverbs1* \
 && dpkg -i ibverbs-providers* \
 && dpkg -i libibverbs* \
 && dpkg -i librdmacm* \
 && dpkg -i ucx_1.17.0-1.2404066_amd64.deb \
 && dpkg -i libibumad3* \
 && dpkg -i sharp_3.7.0.MLNX20240421.48444036-1.2404066_amd64.deb \
 && dpkg -i hcoll_4.8.3227-1.2404066_amd64.deb

# The single `./install.sh` call was split into the separate steps below to
# avoid one large layer and to make each step debuggable on its own.
# RUN ./install.sh
ENV GPU=NVIDIA

# Scripts are copied in only as they are needed, so that changing a single
# file does not force a complete rebuild.
WORKDIR /opt/azhpc-images/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc
COPY ./azhpc-images/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_prerequisites.sh ./

# Install pre-requisites.
RUN ./install_prerequisites.sh
# Version manifest for the azhpc-images scripts.
COPY ./azhpc-images/versions.json /opt/azhpc-images/versions.json

# Scripts for the current WORKDIR (the ubuntu-22.04-hpc directory).
COPY ./azhpc-images/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/set_properties.sh ./
COPY ./azhpc-images/ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_utils.sh ./

# Shared Ubuntu scripts, copied one by one to keep layer caching fine-grained.
COPY ./azhpc-images/ubuntu/common/remove_unused_packages.sh /opt/azhpc-images/ubuntu/common/remove_unused_packages.sh
COPY ./azhpc-images/ubuntu/common/install_utils.sh /opt/azhpc-images/ubuntu/common/install_utils.sh
COPY ./azhpc-images/ubuntu/common/install_pmix.sh /opt/azhpc-images/ubuntu/common/install_pmix.sh
# The source used to be install_pmix.sh, which placed the PMIx installer under
# the install_mpis.sh name; copy the matching script instead.
# NOTE(review): assumes install_mpis.sh exists in ubuntu/common of the build context - confirm.
COPY ./azhpc-images/ubuntu/common/install_mpis.sh /opt/azhpc-images/ubuntu/common/install_mpis.sh

# Distribution-independent scripts and tools.
COPY ./azhpc-images/common/ /opt/azhpc-images/common/
COPY ./azhpc-images/tools/ /opt/azhpc-images/tools/

# Remove packages requiring Ubuntu Pro for security updates, then run the
# local install_utils.sh. set_properties.sh is sourced first because it is
# what provides $UBUNTU_COMMON_DIR to this shell.
RUN . ./set_properties.sh \
 && /bin/bash $UBUNTU_COMMON_DIR/remove_unused_packages.sh \
 && ./install_utils.sh

# Copy the Docker installer, then every remaining file directly under
# ubuntu/common (the wildcard also covers the scripts copied individually).
COPY ./azhpc-images/ubuntu/common/install_docker.sh /opt/azhpc-images/ubuntu/common/install_docker.sh
COPY ./azhpc-images/ubuntu/common/* /opt/azhpc-images/ubuntu/common/
RUN . ./set_properties.sh \
 && /bin/bash $UBUNTU_COMMON_DIR/install_docker.sh

# Install diagnostic script and persistent RDMA naming.
RUN . ./set_properties.sh \
 && /bin/bash $COMMON_DIR/install_hpcdiag.sh \
 && /bin/bash $COMMON_DIR/install_azure_persistent_rdma_naming.sh

# Optimizations (kept as its own layer).
RUN . ./set_properties.sh \
 && /bin/bash $UBUNTU_COMMON_DIR/hpc-tuning.sh

# Test, customization and topology assets, copied in before the scripts that run next.
COPY ./azhpc-images/tests/ /opt/azhpc-images/tests
COPY ./azhpc-images/customizations/ /opt/azhpc-images/customizations
COPY ./azhpc-images/topology/ /opt/azhpc-images/topology

# Copy the test file, then install monitoring tools and AMD libraries.
RUN . ./set_properties.sh \
 && /bin/bash $COMMON_DIR/copy_test_file.sh \
 && /bin/bash $COMMON_DIR/install_monitoring_tools.sh \
 && /bin/bash $COMMON_DIR/install_amd_libs.sh

# SKU customizations (kept as its own layer).
RUN . ./set_properties.sh \
 && /bin/bash $COMMON_DIR/setup_sku_customizations.sh

# PMIx first, then the MPI stacks, each in its own layer.
RUN . ./set_properties.sh \
 && /bin/bash $UBUNTU_COMMON_DIR/install_pmix.sh

# For some reason this command, when moved higher up, was flaky.
# Watch it and make sure it doesn't skip (if it does, the build will fail later).
RUN . ./set_properties.sh \
 && /bin/bash $UBUNTU_COMMON_DIR/install_mpis.sh

# This would match the VM exactly (you'd need to change the source script, etc.).
# RUN mv /opt/hpcx-v2.19-gcc-mlnx_ofed-ubuntu22.04-cuda12-x86_64 /opt/hpcx-v2.15-gcc-MLNX_OFED_LINUX-5-ubuntu22.04-cuda12-gdrcopy2-nccl2.17-x86_64/

# Cleanup downloaded tarballs - clear some space.
# Relative globs resolve against the current WORKDIR; the final `*/` removes
# every subdirectory there. Because this runs as its own layer it only hides
# the files from the final filesystem; earlier layers keep their size.
RUN rm -rf *.tgz *.bz2 *.tbz *.tar.gz *.run *.deb *_offline.sh \
           /tmp/MLNX_OFED_LINUX* /tmp/*conf* \
           /var/intel/ /var/cache/* \
 && rm -Rf -- */

# Recorded for reference (appears to be output from the HPC-X rebuild step):
#   INFO: Building OMPI with HCOLL
#   Ready to rebuild
#   HPCX_ROOT:      /opt/hpcx-v2.19-gcc-mlnx_ofed-ubuntu22.04-cuda12-x86_64
#   OMPI PREFIX:    /opt/hpcx-v2.19-gcc-mlnx_ofed-ubuntu22.04-cuda12-x86_64/hpcx-rebuild
#   UCX location:   /opt/hpcx-v2.19-gcc-mlnx_ofed-ubuntu22.04-cuda12-x86_64/ucx
#   UCC location:   /opt/hpcx-v2.19-gcc-mlnx_ofed-ubuntu22.04-cuda12-x86_64/ucc
#   HCOLL location: /opt/hpcx-v2.19-gcc-mlnx_ofed-ubuntu22.04-cuda12-x86_64/hcoll

# Default locale for the image is C.UTF-8; en_US.UTF-8 is generated as well.
ENV LANG=C.UTF-8
RUN apt-get update \
 && apt-get install -y locales \
 && locale-gen en_US.UTF-8 \
 && apt-get clean

# Add an unprivileged login user with passwordless sudo.
#   USER - login and primary group name (default: azureuser)
#   UID  - numeric user id (default: 1000)
#   GID  - numeric id of the user's primary group (default: 1000)
ARG USER=azureuser
ARG UID=1000
ARG GID=1000
# The group is created with $GID (it was created with $UID before, so the GID
# build argument was ignored whenever it differed from UID).
# printf takes the user name as an argument rather than as part of the format
# string, so characters such as '%' in the name cannot be misinterpreted.
# NOTE(review): this appends to /etc/sudoers before the apt-get step further
# down this file that installs the `sudo` package; presumably an earlier
# install script already provides sudo - confirm.
RUN set -x \
 && groupadd -g "$GID" "$USER" \
 && useradd -g "$USER" -u "$UID" -d "/home/$USER" -m "$USER" \
 && printf '%s ALL= NOPASSWD: ALL\n' "$USER" >> /etc/sudoers

# Flux builds happen under /opt/flux.
WORKDIR /opt/flux

# Build and runtime dependencies for flux-security, flux-core and flux-sched.
# The package lines are grouped as: general tools, compiler/autotools
# toolchain, Lua/Python bindings, development libraries.
RUN apt-get update \
 && apt-get install -y \
        apt-utils ccache dbus-user-session flex lcov luarocks man munge ssh sudo uidmap valgrind vim \
        autoconf automake autotools-dev build-essential clang clang-tidy g++ gcc libtool make pkg-config \
        liblua5.4-dev lua-posix lua5.4 python3-cffi \
        libarchive-dev libboost-filesystem-dev libboost-graph-dev libboost-regex-dev libboost-system-dev \
        libczmq-dev libedit-dev libevent-dev libffi-dev libhwloc-dev libjansson-dev liblz4-dev \
        libmunge-dev libncursesw5-dev libpam-dev libs3-dev libsodium-dev libsqlite3-dev \
        libyaml-cpp-dev libzmq3-dev uuid-dev \
 && apt-get clean

# Flux security: download the release tarball, then configure, build and
# install it under /usr. /source-hpcx.sh is sourced and hpcx_load is called
# before the build, as in the other Flux build steps.
COPY ./azhpc-images/source-hpcx.sh /source-hpcx.sh
RUN . /source-hpcx.sh \
 && hpcx_load \
 && security_version="0.13.0" \
 && wget "https://github.com/flux-framework/flux-security/releases/download/v${security_version}/flux-security-${security_version}.tar.gz" \
 && tar -xzvf "flux-security-${security_version}.tar.gz" \
 && mv "flux-security-${security_version}" /opt/flux/flux-security \
 && cd /opt/flux/flux-security \
 && ./configure --prefix=/usr --sysconfdir=/etc \
 && make -j \
 && make install

# The VMs will share the same munge key: 1024 random bytes are generated once
# at build time and stored in the image, owned by munge and readable only by
# its owner.
RUN mkdir -p /var/run/munge \
 && dd if=/dev/urandom of=munge.key bs=1 count=1024 \
 && mv munge.key /etc/munge/munge.key \
 && chown -R munge /etc/munge/munge.key /var/run/munge \
 && chmod 600 /etc/munge/munge.key

# Make the flux run directory.
# mkdir runs as root, so run/ and run/flux would otherwise stay root-owned;
# the chown is recursive so azureuser actually owns (and can write to) them.
RUN mkdir -p /home/azureuser/run/flux \
 && chown -R azureuser /home/azureuser

# Upgrade jsonschema; --no-cache-dir keeps pip's download cache out of the layer.
# NOTE(review): no apt-get step visible in this file installs python3-pip;
# presumably an earlier install script provides it - confirm.
RUN python3 -m pip install --no-cache-dir --upgrade jsonschema

# Flux core: download the release tarball, then configure it with
# flux-security support, build and install it under /usr.
RUN . /source-hpcx.sh \
 && hpcx_load \
 && core_version="0.68.0" \
 && wget "https://github.com/flux-framework/flux-core/releases/download/v${core_version}/flux-core-${core_version}.tar.gz" \
 && tar -xzvf "flux-core-${core_version}.tar.gz" \
 && mv "flux-core-${core_version}" /opt/flux/flux-core \
 && cd /opt/flux/flux-core \
 && ./configure --prefix=/usr --sysconfdir=/etc --with-flux-security \
 && make clean \
 && make -j \
 && make install

# Flux sched (later than this requires newer gcc and clang).
# Out-of-tree CMake build in ./build, followed by ldconfig after the install.
RUN . /source-hpcx.sh \
 && hpcx_load \
 && sched_version="0.37.0" \
 && wget "https://github.com/flux-framework/flux-sched/releases/download/v${sched_version}/flux-sched-${sched_version}.tar.gz" \
 && tar -xzvf "flux-sched-${sched_version}.tar.gz" \
 && mv "flux-sched-${sched_version}" /opt/flux/flux-sched \
 && cd /opt/flux/flux-sched \
 && mkdir build \
 && cd build \
 && cmake ../ \
 && make -j \
 && make install \
 && ldconfig \
 && echo "DONE flux build"

# Flux curve.cert: ensure we have a shared curve certificate.
# Steps, in order:
#   1. generate the certificate and install it as /etc/flux/system/curve.cert,
#      owned by azureuser with group/other read permission removed;
#   2. permissions for imp: flux-imp ends up setuid with mode 4755;
#   3. /var/lib/flux needs to be owned by the instance owner;
#   4. clean up (and make space) by removing the /opt/flux build tree.
RUN flux keygen /tmp/curve.cert \
 && mkdir -p /etc/flux/system \
 && cp /tmp/curve.cert /etc/flux/system/curve.cert \
 && chown azureuser /etc/flux/system/curve.cert \
 && chmod go-r /etc/flux/system/curve.cert \
 && chmod 4755 /usr/libexec/flux/flux-imp \
 && mkdir -p /var/lib/flux \
 && chown -R azureuser /var/lib/flux \
 && cd /opt \
 && rm -rf /opt/flux

# Ensure we source the environment: both root's and azureuser's .bashrc source
# the HPC-X init script and call hpcx_load, and pam_env.conf is replaced by a
# file holding the single FLUX_URI default.
# NOTE(review): the init script path names hpcx-v2.15..., while the other
# comments in this file place HPC-X under /opt/hpcx-v2.19... and the rename to
# the v2.15 name is commented out - confirm this path exists at runtime.
RUN hpcx_init="/opt/hpcx-v2.15-gcc-MLNX_OFED_LINUX-5-ubuntu22.04-cuda12-gdrcopy2-nccl2.17-x86_64/hpcx-mt-init.sh" \
 && for rc in /root/.bashrc /home/azureuser/.bashrc; do \
        echo ". ${hpcx_init}" >> "${rc}" && echo "hpcx_load" >> "${rc}" || exit 1; \
    done \
 && echo "FLUX_URI DEFAULT=local:///opt/run/flux/local" >> ./environment \
 && mv ./environment /etc/security/pam_env.conf
WORKDIR /opt