Skip to content

Commit f38119b

Browse files
author
acadev
authored
Running module for TITAN
added new run_model.sh file
1 parent f9bbfe1 commit f38119b

File tree

1 file changed

+79
-0
lines changed

1 file changed

+79
-0
lines changed
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
#!/bin/bash

# Abort on the first failing command (-e) and on any use of an unset
# variable (-u).
set -eu

# Check for an optional timeout threshold in seconds. If the duration of the
# model run, as executed below, takes longer than this threshold
# then the run will be aborted. Note that the "timeout" command
# must be supported by the executing OS.

# The timeout argument is optional. This script consumes 6 positional
# arguments (read below as $1..$6; presumably sent by the "run_model" swift
# app function), and by default no timeout value is set. If exactly
# TIMEOUT_ARG_INDEX (i.e. 7) arguments are passed, the last one is used as
# the timeout value.

# !!! IF YOU CHANGE THE NUMBER OF ARGUMENTS PASSED TO THIS SCRIPT, YOU MUST
# CHANGE THE TIMEOUT_ARG_INDEX !!!
TIMEOUT_ARG_INDEX=7
TIMEOUT=""
if (( $# == TIMEOUT_ARG_INDEX )); then
  # Indirect expansion: the value of the positional parameter whose number
  # is stored in TIMEOUT_ARG_INDEX.
  TIMEOUT=${!TIMEOUT_ARG_INDEX}
fi

# Command prefix placed in front of the model invocation further down.
# It stays empty when no timeout was supplied, so the model runs unwrapped.
TIMEOUT_CMD=""
if [[ -n "$TIMEOUT" ]]; then
  TIMEOUT_CMD="timeout $TIMEOUT"
fi
# First positional argument: the parameter string that is handed on to the
# model runner unchanged.
parameter_string=$1

# Set emews_root to the root directory of the project (i.e. the directory
# that contains the scripts, swift, etc. directories and files)
emews_root=$2

# Each model run, runs in its own "instance" directory
# Set instance_directory to that and cd into it.
instance_directory=$3
# Quoted so that a path containing spaces or glob characters is passed to cd
# as a single argument; "--" stops a leading "-" being read as an option.
cd -- "$instance_directory"

# Remaining positional arguments, forwarded to the model runner further down.
framework=$4
exp_id=$5
run_id=$6
# Theta / Tensorflow env vars
export KMP_BLOCKTIME=30
export KMP_SETTINGS=1
export KMP_AFFINITY=granularity=fine,verbose,compact,1,0
export OMP_NUM_THREADS=144

# Python installation used for the model run; its bin directory is put at
# the front of PATH so that a bare "python" resolves there first.
export PYTHONHOME="/sw/xk6/deeplearning/1.0/sles11.3_gnu4.9.3"
# NOTE(review): PYTHON is not referenced by the code visible in this script;
# the model is launched as plain "python" found via PATH. Confirm before
# removing.
PYTHON="$PYTHONHOME/bin/python"
# This replaces (does not extend) any LD_LIBRARY_PATH inherited from the
# caller.
export LD_LIBRARY_PATH="$PYTHONHOME/lib:/sw/xk6/deeplearning/1.0/sles11.3_gnu4.9.3/lib:/sw/xk6/deeplearning/1.0/sles11.3_gnu4.9.3/cuda/lib64:/opt/gcc/4.9.3/snos/lib64:/sw/xk6/r/3.3.2/sles11.3_gnu4.9.3x/lib64/R/lib"
export PATH="$PYTHONHOME/bin:$PATH"

# Module search path: benchmark sources and shared python helpers, located
# relative to emews_root.
BENCHMARK_DIR="$emews_root/../../../Benchmarks/common:$emews_root/../../../Benchmarks/Pilot3/P3B1"
COMMON_DIR="$emews_root/../common/python"
# NOTE(review): this mixes a python3.6 library directory with a python2.7
# site-packages directory -- confirm this is intended for this installation.
PYTHONPATH="$PYTHONHOME/lib/python3.6:"
PYTHONPATH+="$BENCHMARK_DIR:$COMMON_DIR:"
PYTHONPATH+="$PYTHONHOME/lib/python2.7/site-packages"
export PYTHONPATH
# Arguments for the python interpreter: the runner script followed by the
# values taken from this script's positional parameters. Kept as an array so
# that each value reaches python as exactly one argument.
arg_array=("$emews_root/python/p3b1_runner.py" "$parameter_string" "$instance_directory" "$framework" "$exp_id" "$run_id")
# Human-readable form of the command, used only in the log messages below.
# [*] joins the elements into a single string, which is what a scalar
# assignment needs.
MODEL_CMD="python ${arg_array[*]}"

# Turn bash error checking off. This is
# required to properly handle the model execution return value
# and the optional timeout.
set +e
# Quoted so the logged command is printed verbatim (no word splitting or
# glob expansion of the parameter string).
printf '%s\n' "$MODEL_CMD"
# TIMEOUT_CMD is intentionally unquoted: it is either empty (and must vanish)
# or "timeout <seconds>" (and must split into two words).
# shellcheck disable=SC2086
$TIMEOUT_CMD python "${arg_array[@]}"
# $? is the exit status of the most recently executed command (i.e the
# line above)
RES=$?
if (( RES != 0 )); then
  # 124 is the status the "timeout" command reports when the time limit was
  # hit.
  if (( RES == 124 )); then
    echo "---> Timeout error in $MODEL_CMD"
  else
    echo "---> Error in $MODEL_CMD"
  fi
fi
# NOTE(review): as visible here, the failure is only reported; RES is not
# propagated as this script's exit status.

0 commit comments

Comments
 (0)