|
2 | 2 | echo "ip container: $(/sbin/ip -o -4 addr list eth0 | awk '{print $4}' | cut -d/ -f1)" |
3 | 3 | echo "ip host: $(curl -s "http://169.254.169.254/latest/meta-data/local-ipv4")" |
4 | 4 |
|
| 5 | +# Get the first shared dir; the per-job dir and the exit code file path are derived from it |
| 6 | +IFS=',' _shared_dirs=(${PCLUSTER_SHARED_DIRS}) |
| 7 | +_shared_dir=${_shared_dirs[0]} |
| 8 | +_job_dir="${_shared_dir}/${AWS_BATCH_JOB_ID%#*}-${AWS_BATCH_JOB_ATTEMPT}" |
| 9 | +_exit_code_file="${_job_dir}/batch-exit-code" |
| 10 | + |
5 | 11 | if [[ "${AWS_BATCH_JOB_NODE_INDEX}" -eq "${AWS_BATCH_JOB_MAIN_NODE_INDEX}" ]]; then |
6 | | - # get shared dir |
7 | | - IFS=',' _shared_dirs=(${PCLUSTER_SHARED_DIRS}) |
8 | | - _shared_dir=${_shared_dirs[0]} |
| 12 | + echo "Hello I'm the main node $(hostname)! I run the mpi job!" |
| 13 | + |
| 14 | + mkdir -p "${_job_dir}" |
9 | 15 |
|
10 | 16 | echo "Compiling..." |
11 | | - /usr/lib64/openmpi/bin/mpicc -o "${_shared_dir}/mpi_hello_world" "${_shared_dir}/mpi_hello_world.c" |
| 17 | + /usr/lib64/openmpi/bin/mpicc -o "${_job_dir}/mpi_hello_world" "${_shared_dir}/mpi_hello_world.c" |
| 18 | + |
| 19 | + echo "Running..." |
| 20 | + /usr/lib64/openmpi/bin/mpirun --mca btl_tcp_if_include eth0 --allow-run-as-root --machinefile "${HOME}/hostfile" "${_job_dir}/mpi_hello_world" |
12 | 21 |
|
13 | | - echo "Hello I'm the main node! I run the mpi job!" |
14 | | - /usr/lib64/openmpi/bin/mpirun --mca btl_tcp_if_include eth0 --allow-run-as-root --machinefile "${HOME}/hostfile" "${_shared_dir}/mpi_hello_world" |
| 22 | + # Write the exit status code file that the compute nodes wait for (the value written here is a literal "0") |
| 23 | + echo "0" > "${_exit_code_file}" |
| 24 | + # Give the compute nodes, which poll for the exit code file, time to see it and terminate |
| 25 | + sleep 30 |
15 | 26 | else |
16 | | - echo "Hello I'm a compute note! I let the main node orchestrate the mpi execution!" |
| 27 | + echo "Hello I'm the compute node $(hostname)! I let the main node orchestrate the mpi execution!" |
17 | 28 | # Since mpi orchestration happens on the main node, we need to make sure the containers representing the compute |
18 | | - # nodes are not terminated. A simple trick is to run an infinite sleep. |
19 | | - # All compute nodes will be terminated by Batch once the main node exits. |
20 | | - sleep infinity |
| 29 | + # nodes are not terminated. A simple trick is to wait for a file containing the status code to be created. |
| 30 | + # All compute nodes are terminated by Batch if the main node exits abruptly. |
| 31 | + while [ ! -f "${_exit_code_file}" ]; do |
| 32 | + sleep 2 |
| 33 | + done |
| 34 | + exit $(cat "${_exit_code_file}") |
21 | 35 | fi |
0 commit comments