Docs: improve mpi batch code sample

demartinofra · demartinofra · commit 20c86c23f270 · 2019-01-07T16:49:58.000+01:00
Signed-off-by: Francesco De Martino <fdm@amazon.com>
diff --git a/docs/tutorials/code_samples/batch_mpi/submit_mpi.sh b/docs/tutorials/code_samples/batch_mpi/submit_mpi.sh
@@ -2,20 +2,34 @@
 echo "ip container: $(/sbin/ip -o -4 addr list eth0 | awk '{print $4}' | cut -d/ -f1)"
 echo "ip host: $(curl -s "http://169.254.169.254/latest/meta-data/local-ipv4")"
 
+# get shared dir
+IFS=',' _shared_dirs=(${PCLUSTER_SHARED_DIRS})
+_shared_dir=${_shared_dirs[0]}
+_job_dir="${_shared_dir}/${AWS_BATCH_JOB_ID%#*}-${AWS_BATCH_JOB_ATTEMPT}"
+_exit_code_file="${_job_dir}/batch-exit-code"
+
 if [[ "${AWS_BATCH_JOB_NODE_INDEX}" -eq  "${AWS_BATCH_JOB_MAIN_NODE_INDEX}" ]]; then
-    # get shared dir
-    IFS=',' _shared_dirs=(${PCLUSTER_SHARED_DIRS})
-    _shared_dir=${_shared_dirs[0]}
+    echo "Hello I'm the main node $(hostname)! I run the mpi job!"
+
+    mkdir -p "${_job_dir}"
 
     echo "Compiling..."
-    /usr/lib64/openmpi/bin/mpicc -o "${_shared_dir}/mpi_hello_world" "${_shared_dir}/mpi_hello_world.c"
+    /usr/lib64/openmpi/bin/mpicc -o "${_job_dir}/mpi_hello_world" "${_shared_dir}/mpi_hello_world.c"
+
+    echo "Running..."
+    /usr/lib64/openmpi/bin/mpirun --mca btl_tcp_if_include eth0 --allow-run-as-root --machinefile "${HOME}/hostfile" "${_job_dir}/mpi_hello_world"
 
-    echo "Hello I'm the main node! I run the mpi job!"
-    /usr/lib64/openmpi/bin/mpirun --mca btl_tcp_if_include eth0 --allow-run-as-root --machinefile "${HOME}/hostfile" "${_shared_dir}/mpi_hello_world"
+    # Write exit status code
+    echo "0" > "${_exit_code_file}"
+    # Waiting for compute nodes to terminate
+    sleep 30
 else
-    echo "Hello I'm a compute note! I let the main node orchestrate the mpi execution!"
+    echo "Hello I'm the compute node $(hostname)! I let the main node orchestrate the mpi execution!"
     # Since mpi orchestration happens on the main node, we need to make sure the containers representing the compute
-    # nodes are not terminated. A simple trick is to run an infinite sleep.
-    # All compute nodes will be terminated by Batch once the main node exits.
-    sleep infinity
+    # nodes are not terminated. A simple trick is to wait for a file containing the status code to be created.
+    # All compute nodes are terminated by Batch if the main node exits abruptly.
+    while [ ! -f "${_exit_code_file}" ]; do
+        sleep 2
+    done
+    exit $(cat "${_exit_code_file}")
 fi