Skip to content

Commit 20c86c2

Browse files
committed
Docs: improve mpi batch code sample
Signed-off-by: Francesco De Martino <[email protected]>
1 parent b3ae572 commit 20c86c2

File tree

1 file changed

+24
-10
lines changed

1 file changed

+24
-10
lines changed

docs/tutorials/code_samples/batch_mpi/submit_mpi.sh

Lines changed: 24 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,20 +2,34 @@
22
echo "ip container: $(/sbin/ip -o -4 addr list eth0 | awk '{print $4}' | cut -d/ -f1)"
33
echo "ip host: $(curl -s "http://169.254.169.254/latest/meta-data/local-ipv4")"
44

5+
# get shared dir
6+
IFS=',' read -r -a _shared_dirs <<< "${PCLUSTER_SHARED_DIRS}"  # IFS applies to this read only; the global IFS stays untouched and no glob expansion occurs
7+
_shared_dir=${_shared_dirs[0]}
8+
_job_dir="${_shared_dir}/${AWS_BATCH_JOB_ID%#*}-${AWS_BATCH_JOB_ATTEMPT}"
9+
_exit_code_file="${_job_dir}/batch-exit-code"
10+
511
if [[ "${AWS_BATCH_JOB_NODE_INDEX}" -eq "${AWS_BATCH_JOB_MAIN_NODE_INDEX}" ]]; then
6-
# get shared dir
7-
IFS=',' _shared_dirs=(${PCLUSTER_SHARED_DIRS})
8-
_shared_dir=${_shared_dirs[0]}
12+
echo "Hello I'm the main node $(hostname)! I run the mpi job!"
13+
14+
mkdir -p "${_job_dir}"
915

1016
echo "Compiling..."
11-
/usr/lib64/openmpi/bin/mpicc -o "${_shared_dir}/mpi_hello_world" "${_shared_dir}/mpi_hello_world.c"
17+
/usr/lib64/openmpi/bin/mpicc -o "${_job_dir}/mpi_hello_world" "${_shared_dir}/mpi_hello_world.c"
18+
19+
echo "Running..."
20+
/usr/lib64/openmpi/bin/mpirun --mca btl_tcp_if_include eth0 --allow-run-as-root --machinefile "${HOME}/hostfile" "${_job_dir}/mpi_hello_world"
1221

13-
echo "Hello I'm the main node! I run the mpi job!"
14-
/usr/lib64/openmpi/bin/mpirun --mca btl_tcp_if_include eth0 --allow-run-as-root --machinefile "${HOME}/hostfile" "${_shared_dir}/mpi_hello_world"
22+
# Write exit status code
23+
echo "0" > "${_exit_code_file}"
24+
# Waiting for compute nodes to terminate
25+
sleep 30
1526
else
16-
echo "Hello I'm a compute note! I let the main node orchestrate the mpi execution!"
27+
echo "Hello I'm the compute node $(hostname)! I let the main node orchestrate the mpi execution!"
1728
# Since mpi orchestration happens on the main node, we need to make sure the containers representing the compute
18-
# nodes are not terminated. A simple trick is to run an infinite sleep.
19-
# All compute nodes will be terminated by Batch once the main node exits.
20-
sleep infinity
29+
# nodes are not terminated. A simple trick is to wait for a file containing the status code to be created.
30+
# All compute nodes are terminated by Batch if the main node exits abruptly.
31+
while [ ! -f "${_exit_code_file}" ]; do
32+
sleep 2
33+
done
34+
exit $(cat "${_exit_code_file}")
2135
fi

0 commit comments

Comments
 (0)