@@ -2157,6 +2157,9 @@ set RESTART_BY_OSERVER = NO
21572157
21582158/bin/rm -f $HOMDIR /SETENV.commands
21592159
2160+ # NAS has a "several_tries" script, but it is only set for MPT below;
2161+ # start from an empty default so @SEVERAL_TRIES is always defined
2162+ set SEVERAL_TRIES = ' '
21602163
21612164if( $MPI_STACK == openmpi ) then
21622165
@@ -2206,6 +2209,10 @@ EOF
22062209
22072210else if( $MPI_STACK == mpt ) then
22082211
2212+ # NAS recommends several_tries as a workaround for MPT startup failures:
2213+ # https://www.nas.nasa.gov/hecc/support/kb/mpt-startup-failures-workarounds_526.html
2214+ set SEVERAL_TRIES = ' /u/scicon/tools/bin/several_tries'
2215+
22092216cat > $HOMDIR /SETENV.commands << EOF
22102217
22112218setenv MPI_COLL_REPRODUCIBLE
@@ -2268,6 +2275,9 @@ setenv I_MPI_ADJUST_GATHERV 3
22682275
22692276setenv I_MPI_FABRICS shm:ofi
22702277setenv I_MPI_OFI_PROVIDER psm3
2278+
2279+ # This has been found to help with congestion
2280+ setenv FI_PSM3_CONN_TIMEOUT 120
22712281EOF
22722282
22732283endif # if NCCS
@@ -2433,6 +2443,7 @@ s/@USE_IOSERVER/$USE_IOSERVER/g
24332443s/@NUM_OSERVER_NODES/$NUM_OSERVER_NODES /g
24342444s/@NUM_BACKEND_PES/$NUM_BACKEND_PES /g
24352445s/@RESTART_BY_OSERVER/$RESTART_BY_OSERVER /g
2446+ s#@SEVERAL_TRIES#$SEVERAL_TRIES #g
24362447s/@NCPUS_PER_NODE/$NCPUS_PER_NODE /g
24372448s/@NUM_READERS/$NUM_READERS /g
24382449s/@NUM_WRITERS/$NUM_WRITERS /g
0 commit comments