@@ -4,28 +4,63 @@ set -euo pipefail
44
55# ###############################################################
66#
7- # Flux, Singularity, and Infiniband dependenciess
7+ # Flux, Singularity, and Infiniband dependencies
8+ # Starting on ubuntu 24.04
89#
910
10- /usr/bin/cloud-init status --wait
11+ # In practice I haven't seen a need for this
12+ # /usr/bin/cloud-init status --wait
1113
1214export DEBIAN_FRONTEND=noninteractive
1315sudo apt-get update && \
14- sudo apt-get install -y apt-transport-https ca-certificates curl jq apt-utils wget \
15- libelf-dev libpcap-dev libbfd-dev binutils-dev build-essential make \
16- linux-tools-common linux-tools-$( uname -r) \
17- python3-pip git net-tools
18-
19- # Microsoft packages for 24.04
20- # curl https://packages.microsoft.com/config/ubuntu/24.04/prod.list > ./microsoft-prod.list
21- # sudo cp ./microsoft-prod.list /etc/apt/sources.list.d/
22-
23- # Install the Microsoft GPG public key
24- # Note I'm commenting this out - I don't see any microsoft packages I want / need
25- # curl https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor > microsoft.gpg
26- # sudo cp ./microsoft.gpg /etc/apt/trusted.gpg.d/
27- # sudo cp ./microsoft.gpg /usr/share/keyrings/microsoft-prod.gpg
28- # gpg --refresh-keys
16+ sudo apt-get install -y apt-transport-https ca-certificates curl jq apt-utils wget curl jq \
17+ build-essential make linux-tools-common linux-tools-$( uname -r)
18+
19+ # Install ORAS client
20+ VERSION="1.2.2"
21+ curl -LO "https://github.com/oras-project/oras/releases/download/v${VERSION}/oras_${VERSION}_linux_amd64.tar.gz"
22+ mkdir -p oras-install/
23+ tar -zxf oras_${VERSION}_*.tar.gz -C oras-install/
24+ sudo mv oras-install/oras /usr/local/bin/
25+ rm -rf oras_${VERSION}_*.tar.gz oras-install/
26+
27+ # Infiniband
28+ # make sure secure boot is disabled
29+ # mokutil --sb-state
30+ sudo chown -R azureuser /opt
31+
32+ # https://docs.nvidia.com/networking/display/mlnxofedv24101140lts/installing+the+driver#src-3411296587_InstallingtheDriver-InstallationScript
33+ # check we have devices
34+ # lspci -v | grep Mellanox
35+ cd /opt
36+ oras pull ghcr.io/converged-computing/rdma-infiniband:ubuntu-24.04-tgz
37+ tar -xzvf MLNX_OFED_LINUX-24.10-1.1.4.0-ubuntu24.04-x86_64.tgz
38+ touch MLNX_OFED_LINUX-24.10-1.1.4.0-ubuntu24.04-x86_64.txt
39+ mv MLNX_OFED_LINUX-24.10-1.1.4.0-ubuntu24.04-x86_64 mlnx
40+ rm MLNX_OFED_LINUX-24.10-1.1.4.0-ubuntu24.04-x86_64.tgz
41+ cd mlnx
42+ sudo ./mlnxofedinstall --force
43+ sudo /etc/init.d/openibd restart
44+
45+ # Rename device to ib0
46+ cd /opt
47+ wget https://raw.githubusercontent.com/converged-computing/aks-infiniband-install/main/ubuntu22.04/parse-links.py
48+ sudo python3 parse-links.py
49+ ip link
50+
51+ cd /opt
52+ wget https://github.com/openucx/ucx/releases/download/v1.17.0/ucx-1.17.0.tar.gz && \
53+ tar -xzvf ucx-1.17.0.tar.gz && \
54+ cd ucx-1.17.0 && \
55+ ./configure --disable-logging --disable-debug --disable-assertions --disable-params-check --enable-mt --prefix=/usr --enable-examples --without-java --without-go --without-xpmem --without-cuda --with-rc --with-ud --with-dc \
56+ --with-mlx5-dv --with-verbs --with-ib-hw-tm --with-dm --with-devx && \
57+ make -j && sudo make install && sudo ldconfig
58+
59+ wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.2.tar.gz && \
60+ tar -xzvf openmpi-4.1.2.tar.gz && \
61+ cd openmpi-4.1.2 && \
62+ ./configure --with-ucx=/usr && \
63+ make -j && sudo make install && sudo ldconfig
2964
3065# cmake is needed for flux-sched, and make sure to choose arm or x86
3166export CMAKE=3.23.1
@@ -45,55 +80,18 @@ curl -s -L https://github.com/Kitware/CMake/releases/download/v$CMAKE/cmake-$CMA
4580 libboost-graph-dev libboost-system-dev libboost-filesystem-dev \
4681 libboost-regex-dev libyaml-cpp-dev libedit-dev uidmap dbus-user-session python3-cffi
4782
48- # Azure hpc-images deps added here - not clear if all of these are needed
49- # sudo apt-get install -y numactl rpm libnuma-dev libmpc-dev libmpfr-dev libxml2-dev m4 byacc \
50- # libnl-3-dev libnl-route-3-dev libnl-3-200 libnl-genl-3-dev libnl-genl-3-200 libnl-route-3-200 bison \
51- # libsecret-1-0 dkms libyaml-dev libreadline-dev libkeyutils1 libkeyutils-dev libmount-dev nfs-common pssh \
52- # libvulkan1 hwloc selinux-policy-dev nvme-cli # vulkan is for nvidia gpu driver
53-
54- sudo ldconfig
55- # Other Microsoft helpers
56- # Not installing:
57- # azcopy
58- # kvp client
59- # torset-tool
60- # lustre
61-
62- # First manual test build 12/26/2024
63- # $ lspci
64- # 0101:00:00.0 Infiniband controller: Mellanox Technologies MT28908 Family [ConnectX-6 Virtual Function]
65- # 3532:00:00.0 Non-Volatile memory controller: Microsoft Corporation Device b111
66- # 78a8:00:00.0 Non-Volatile memory controller: Microsoft Corporation Device b111
67- # cc4c:00:02.0 Ethernet controller: Mellanox Technologies MT27710 Family [ConnectX-4 Lx Virtual Function] (rev 80)
68-
69- # https://docs.nvidia.com/doca/archive/2-8-0/nvidia+doca+installation+guide+for+linux/index.html
70- # wget https://developer.nvidia.com/downloads/networking/secure/doca-sdk/DOCA_2.8/doca-host_2.8.0-204000-24.07-ubuntu2404_amd64.deb
71- # sudo dpkg -i doca-host_2.8.0-204000-24.07-ubuntu2404_amd64.deb
72- # rm doca-host_2.8.0-204000-24.07-ubuntu2404_amd64.deb
73-
74- # ofed_info -s
75- # MLNX_OFED_LINUX-23.04-1.1.3.0
76-
77- # Install ORAS client
78- VERSION="1.2.2"
79- curl -LO "https://github.com/oras-project/oras/releases/download/v${VERSION}/oras_${VERSION}_linux_amd64.tar.gz"
80- mkdir -p oras-install/
81- tar -zxf oras_${VERSION}_*.tar.gz -C oras-install/
82- sudo mv oras-install/oras /usr/local/bin/
83- rm -rf oras_${VERSION}_*.tar.gz oras-install/
84-
8583# /etc/init.d/openibd status
86- # HCA driver loaded
84+ # HCA driver loaded
8785
8886# Configured IPoIB devices:
8987# ib0
9088
9189# Currently active IPoIB devices:
9290# Configured Mellanox EN devices:
93- # eth1
91+ # enP54485s1
9492
9593# Currently active Mellanox devices:
96- # eth1
94+ # enP54485s1
9795# ib0
9896
9997# The following OFED modules are loaded:
@@ -108,41 +106,33 @@ rm -rf oras_${VERSION}_*.tar.gz oras-install/
108106# ib_cm
109107# ib_core
110108# mlxfw
111- sudo locale-gen en_US.UTF-8
112109
113- # HPC-x (MPI)
114- # Located at
115- . /opt/hpcx-v2.15-gcc-MLNX_OFED_LINUX-5-ubuntu22.04-cuda12-gdrcopy2-nccl2.17-x86_64/hpcx-mt-init.sh
116- hpcx_load
117- env | grep HPCX
110+ sudo locale-gen en_US.UTF-8
118111
119112# ###############################################################
120113# # Install Flux and dependencies
121114
122- # QUESTION: Do we still need this given packages above?
123- # sudo chown -R $(whoami) /opt && \
124- # mkdir -p /opt/prrte && \
125- # cd /opt/prrte && \
126- # git clone https://github.com/openpmix/openpmix.git && \
127- # git clone https://github.com/openpmix/prrte.git && \
128- # cd openpmix && \
129- # git checkout fefaed568f33bf86f28afb6e45237f1ec5e4de93 && \
130- # ./autogen.pl && \
131- # ./configure --prefix=/usr --disable-static && sudo make install && \
132- # sudo ldconfig
133-
134- # cd /opt/prrte/prrte && \
135- # git checkout 477894f4720d822b15cab56eee7665107832921c && \
136- # ./autogen.pl && \
137- # ./configure --prefix=/usr && sudo make -j install
115+ mkdir -p /opt/prrte && \
116+ cd /opt/prrte && \
117+ git clone https://github.com/openpmix/openpmix.git && \
118+ git clone https://github.com/openpmix/prrte.git && \
119+ cd openpmix && \
120+ git checkout fefaed568f33bf86f28afb6e45237f1ec5e4de93 && \
121+ ./autogen.pl && \
122+ ./configure --prefix=/usr --disable-static && sudo make install && \
123+ sudo ldconfig
124+
125+ cd /opt/prrte/prrte && \
126+ git checkout 477894f4720d822b15cab56eee7665107832921c && \
127+ ./autogen.pl && \
128+ ./configure --prefix=/usr && sudo make -j install
138129
139130# flux security
140- sudo mkdir -p /opt/flux
141- sudo chown -R $( whoami) /opt/flux
131+ cd /opt
142132wget https://github.com/flux-framework/flux-security/releases/download/v0.13.0/flux-security-0.13.0.tar.gz && \
143133 tar -xzvf flux-security-0.13.0.tar.gz && \
144- mv flux-security-0.13.0 /opt/flux/flux-security && \
145- cd /opt/flux/flux-security && \
134+ mv flux-security-0.13.0 /opt/flux-security && \
135+ cd /opt/flux-security && \
146136 ./configure --prefix=/usr --sysconfdir=/etc && \
147137 make -j && sudo make install
148138
@@ -154,34 +144,35 @@ sudo mkdir -p /var/run/munge && \
154144 sudo chmod 600 /etc/munge/munge.key
155145
156146# Make the flux run directory
157- ls /home
158- whoami
159- sudo mkdir -p /home/azureuser/run/flux
160- sudo chown azureuser /home/azureuser
147+ mkdir -p /home/azureuser/run/flux
161148
162149# Flux core
150+ sudo apt-get install -y python3-pip
151+ cd /opt
163152wget https://github.com/flux-framework/flux-core/releases/download/v0.68.0/flux-core-0.68.0.tar.gz && \
164153 tar -xzvf flux-core-0.68.0.tar.gz && \
165- mv flux-core-0.68.0 /opt/flux/flux-core && \
166- cd /opt/flux/flux-core && \
154+ mv flux-core-0.68.0 /opt/flux-core && \
155+ cd /opt/flux-core && \
167156 ./configure --prefix=/usr --sysconfdir=/etc --with-flux-security && \
168157 make clean && \
169158 make -j && sudo make install
170159
171160# Flux pmix (must be installed after flux core)
172- # wget https://github.com/flux-framework/flux-pmix/releases/download/v0.5.0/flux-pmix-0.5.0.tar.gz && \
173- # tar -xzvf flux-pmix-0.5.0.tar.gz && \
174- # mv flux-pmix-0.5.0 /opt/flux/flux-pmix && \
175- # cd /opt/flux/flux-pmix && \
176- # ./configure --prefix=/usr && \
177- # make -j && \
178- # sudo make install
179-
180- # Flux sched (later than this requires newer gcc and clang)
181- wget https://github.com/flux-framework/flux-sched/releases/download/v0.37.0/flux-sched-0.37.0.tar.gz && \
182- tar -xzvf flux-sched-0.37.0.tar.gz && \
183- mv flux-sched-0.37.0 /opt/flux/flux-sched && \
184- cd /opt/flux/flux-sched && \
161+ cd /opt
162+ wget https://github.com/flux-framework/flux-pmix/releases/download/v0.5.0/flux-pmix-0.5.0.tar.gz && \
163+ tar -xzvf flux-pmix-0.5.0.tar.gz && \
164+ mv flux-pmix-0.5.0 /opt/flux-pmix && \
165+ cd /opt/flux-pmix && \
166+ ./configure --prefix=/usr && \
167+ make -j && \
168+ sudo make install
169+
170+ # Flux sched
171+ cd /opt
172+ wget https://github.com/flux-framework/flux-sched/releases/download/v0.40.0/flux-sched-0.40.0.tar.gz && \
173+ tar -xzvf flux-sched-0.40.0.tar.gz && \
174+ mv flux-sched-0.40.0 /opt/flux-sched && \
175+ cd /opt/flux-sched && \
185176 mkdir build && \
186177 cd build && \
187178 cmake ../ && make -j && sudo make install && sudo ldconfig && \
@@ -201,9 +192,7 @@ flux keygen /tmp/curve.cert && \
201192 # /var/lib/flux needs to be owned by the instance owner
202193 sudo mkdir -p /var/lib/flux && \
203194 sudo chown azureuser -R /var/lib/flux && \
204- # clean up (and make space)
205195 cd /opt
206- sudo rm -rf /opt/flux
207196
208197# Install Singularity
209198# flux start mpirun -n 6 singularity exec singularity-mpi_mpich.sif /opt/mpitest
@@ -249,23 +238,23 @@ export VERSION=4.0.1 && \
249238 make -C builddir && \
250239 sudo make -C builddir install
251240
252- /usr/sbin/waagent -force -deprovision+user && export HISTSIZE=0 && sync
253-
254241# Ensure the flux uri is exported for all users
255242# The build should be done as azureuser, but don't assume it.
256243export FLUX_URI=local:///opt/run/flux/local
257244echo "export FLUX_URI=local:///opt/run/flux/local" >> /home/$(whoami)/.bashrc
258245echo "export FLUX_URI=local:///opt/run/flux/local" >> /home/azureuser/.bashrc
259246
260- # Ensure we source the environment.
261- echo ". /opt/hpcx-v2.15-gcc-MLNX_OFED_LINUX-5-ubuntu22.04-cuda12-gdrcopy2-nccl2.17-x86_64/hpcx-mt-init.sh" >> /home/$(whoami)/.bashrc
262- echo ". /opt/hpcx-v2.15-gcc-MLNX_OFED_LINUX-5-ubuntu22.04-cuda12-gdrcopy2-nccl2.17-x86_64/hpcx-mt-init.sh" >> /home/azureuser/.bashrc
263- echo "hpcx_load" >> /home/$(whoami)/.bashrc
264- echo "hpcx_load" >> /home/azureuser/.bashrc
265-
266247# The flux uri needs to be set for all users that log in
267248echo "FLUX_URI DEFAULT=local:///opt/run/flux/local" >> ./environment
268249sudo mv ./environment /etc/security/pam_env.conf
269250
251+ # https://ubuntu.com/blog/ubuntu-23-10-restricted-unprivileged-user-namespaces
252+ sudo sysctl -w kernel.apparmor_restrict_unprivileged_unconfined=0
253+ sudo sysctl -w kernel.apparmor_restrict_unprivileged_userns=0
254+
255+ sudo sysctl -p
256+ sudo systemctl daemon-reload
270257#
271258# At this point we have what we need!
259+
260+ /usr/sbin/waagent -force -deprovision+user && export HISTSIZE=0 && sync
0 commit comments