diff --git a/gcm_forecast.tmpl b/gcm_forecast.tmpl index 6acf75ed..10d55ce2 100755 --- a/gcm_forecast.tmpl +++ b/gcm_forecast.tmpl @@ -766,7 +766,7 @@ else endif @SINGULARITY_BUILD @OCEAN_PRELOAD $RUN_CMD $TOTAL_PES $SINGULARITY_RUN $GEOSEXE $IOSERVER_OPTIONS $IOSERVER_EXTRA --logging_config 'logging.yaml' -@NATIVE_BUILD @OCEAN_PRELOAD $RUN_CMD $TOTAL_PES $GEOSEXE $IOSERVER_OPTIONS $IOSERVER_EXTRA --logging_config 'logging.yaml' +@NATIVE_BUILD @OCEAN_PRELOAD @SEVERAL_TRIES $RUN_CMD $TOTAL_PES $GEOSEXE $IOSERVER_OPTIONS $IOSERVER_EXTRA --logging_config 'logging.yaml' if( $USE_SHMEM == 1 ) $GEOSBIN/RmShmKeys_sshmpi.csh >& /dev/null diff --git a/gcm_regress.j b/gcm_regress.j index ee8a6c4f..26cec61f 100755 --- a/gcm_regress.j +++ b/gcm_regress.j @@ -451,7 +451,7 @@ if( $RUN_STARTSTOP == TRUE ) then echo "=== Running test of duration ${test_duration_step1} with NX = $NX and NY = $NY starting at $nymd0 $nhms0 ===" - @OCEAN_PRELOAD $RUN_CMD $NPES ./GEOSgcm.x --logging_config 'logging.yaml' + @OCEAN_PRELOAD @SEVERAL_TRIES $RUN_CMD $NPES ./GEOSgcm.x --logging_config 'logging.yaml' set date = `cat cap_restart` set nymde1 = $date[1] @@ -517,7 +517,7 @@ set NY = `grep "^ *NY": AGCM.rc | cut -d':' -f2` echo "=== Running test of duration ${test_duration_step2} with NX = $NX and NY = $NY starting at $nymd0 $nhms0 ===" -@OCEAN_PRELOAD $RUN_CMD $NPES ./GEOSgcm.x --logging_config 'logging.yaml' +@OCEAN_PRELOAD @SEVERAL_TRIES $RUN_CMD $NPES ./GEOSgcm.x --logging_config 'logging.yaml' set date = `cat cap_restart` set nymde2 = $date[1] @@ -623,7 +623,7 @@ if ($RUN_STARTSTOP == TRUE) then echo "=== Running test of duration ${test_duration_step3} with NX = $NX and NY = $NY starting at $nymdb $nhmsb ===" - @OCEAN_PRELOAD $RUN_CMD $NPES ./GEOSgcm.x --logging_config 'logging.yaml' + @OCEAN_PRELOAD @SEVERAL_TRIES $RUN_CMD $NPES ./GEOSgcm.x --logging_config 'logging.yaml' set date = `cat cap_restart` set nymde3 = $date[1] @@ -737,7 +737,7 @@ if ( $RUN_LAYOUT == TRUE) then echo "=== Running test of duration ${test_duration_step4} with NX = $test_NX and NY = $test_NY starting at $nymd0 $nhms0 ===" - @OCEAN_PRELOAD $RUN_CMD $NPES ./GEOSgcm.x --logging_config 'logging.yaml' + @OCEAN_PRELOAD @SEVERAL_TRIES $RUN_CMD $NPES ./GEOSgcm.x --logging_config 'logging.yaml' set date = `cat cap_restart` set nymde4 = $date[1] diff --git a/gcm_run.j b/gcm_run.j index 3c91ad40..85bfdb0c 100755 --- a/gcm_run.j +++ b/gcm_run.j @@ -1059,7 +1059,7 @@ else endif @SINGULARITY_BUILD @OCEAN_PRELOAD $RUN_CMD $TOTAL_PES $SINGULARITY_RUN $GEOSEXE $IOSERVER_OPTIONS $IOSERVER_EXTRA --logging_config 'logging.yaml' -@NATIVE_BUILD @OCEAN_PRELOAD $RUN_CMD $TOTAL_PES $GEOSEXE $IOSERVER_OPTIONS $IOSERVER_EXTRA --logging_config 'logging.yaml' +@NATIVE_BUILD @OCEAN_PRELOAD @SEVERAL_TRIES $RUN_CMD $TOTAL_PES $GEOSEXE $IOSERVER_OPTIONS $IOSERVER_EXTRA --logging_config 'logging.yaml' if( $USE_SHMEM == 1 ) $GEOSBIN/RmShmKeys_sshmpi.csh >& /dev/null diff --git a/gcm_setup b/gcm_setup index a763d79f..573ab63f 100755 --- a/gcm_setup +++ b/gcm_setup @@ -2157,6 +2157,9 @@ set RESTART_BY_OSERVER = NO /bin/rm -f $HOMDIR/SETENV.commands +# NAS has a "several_tries" script but we need an empty +# default +set SEVERAL_TRIES = '' if( $MPI_STACK == openmpi ) then @@ -2206,6 +2209,10 @@ EOF else if( $MPI_STACK == mpt ) then +# NAS recommends several_tries for MPT job issues +# https://www.nas.nasa.gov/hecc/support/kb/mpt-startup-failures-workarounds_526.html +set SEVERAL_TRIES = '/u/scicon/tools/bin/several_tries' + cat > $HOMDIR/SETENV.commands << EOF setenv MPI_COLL_REPRODUCIBLE @@ -2268,6 +2275,9 @@ setenv I_MPI_ADJUST_GATHERV 3 setenv I_MPI_FABRICS shm:ofi setenv I_MPI_OFI_PROVIDER psm3 + +# This has been found to help with congestion +setenv FI_PSM3_CONN_TIMEOUT 120 EOF endif # if NCCS @@ -2433,6 +2443,7 @@ s/@USE_IOSERVER/$USE_IOSERVER/g s/@NUM_OSERVER_NODES/$NUM_OSERVER_NODES/g s/@NUM_BACKEND_PES/$NUM_BACKEND_PES/g s/@RESTART_BY_OSERVER/$RESTART_BY_OSERVER/g +s#@SEVERAL_TRIES#$SEVERAL_TRIES#g s/@NCPUS_PER_NODE/$NCPUS_PER_NODE/g s/@NUM_READERS/$NUM_READERS/g s/@NUM_WRITERS/$NUM_WRITERS/g diff --git a/geoschemchem_setup b/geoschemchem_setup index 28ce89a3..c152f06a 100755 --- a/geoschemchem_setup +++ b/geoschemchem_setup @@ -2187,6 +2187,9 @@ set RESTART_BY_OSERVER = NO /bin/rm -f $HOMDIR/SETENV.commands +# NAS has a "several_tries" script but we need an empty +# default +set SEVERAL_TRIES = '' if( $MPI_STACK == openmpi ) then @@ -2236,6 +2239,10 @@ EOF else if( $MPI_STACK == mpt ) then +# NAS recommends several_tries for MPT job issues +# https://www.nas.nasa.gov/hecc/support/kb/mpt-startup-failures-workarounds_526.html +set SEVERAL_TRIES = '/u/scicon/tools/bin/several_tries' + cat > $HOMDIR/SETENV.commands << EOF setenv MPI_COLL_REPRODUCIBLE @@ -2298,6 +2305,9 @@ setenv I_MPI_ADJUST_GATHERV 3 setenv I_MPI_FABRICS shm:ofi setenv I_MPI_OFI_PROVIDER psm3 + +# This has been found to help with congestion +setenv FI_PSM3_CONN_TIMEOUT 120 EOF endif # if NCCS @@ -2463,6 +2473,7 @@ s/@USE_IOSERVER/$USE_IOSERVER/g s/@NUM_OSERVER_NODES/$NUM_OSERVER_NODES/g s/@NUM_BACKEND_PES/$NUM_BACKEND_PES/g s/@RESTART_BY_OSERVER/$RESTART_BY_OSERVER/g +s#@SEVERAL_TRIES#$SEVERAL_TRIES#g s/@NCPUS_PER_NODE/$NCPUS_PER_NODE/g s/@NUM_READERS/$NUM_READERS/g s/@NUM_WRITERS/$NUM_WRITERS/g diff --git a/gmichem_setup b/gmichem_setup index 6ae62f39..8e65bd42 100755 --- a/gmichem_setup +++ b/gmichem_setup @@ -2359,6 +2359,9 @@ set RESTART_BY_OSERVER = NO /bin/rm -f $HOMDIR/SETENV.commands +# NAS has a "several_tries" script but we need an empty +# default +set SEVERAL_TRIES = '' if( $MPI_STACK == openmpi ) then @@ -2408,6 +2411,10 @@ EOF else if( $MPI_STACK == mpt ) then +# NAS recommends several_tries for MPT job issues +# https://www.nas.nasa.gov/hecc/support/kb/mpt-startup-failures-workarounds_526.html +set SEVERAL_TRIES = '/u/scicon/tools/bin/several_tries' + cat > $HOMDIR/SETENV.commands << EOF setenv MPI_COLL_REPRODUCIBLE @@ -2470,6 +2477,9 @@ setenv I_MPI_ADJUST_GATHERV 3 setenv I_MPI_FABRICS shm:ofi setenv I_MPI_OFI_PROVIDER psm3 + +# This has been found to help with congestion +setenv FI_PSM3_CONN_TIMEOUT 120 EOF endif # if NCCS @@ -2636,6 +2646,7 @@ s/@USE_IOSERVER/$USE_IOSERVER/g s/@NUM_OSERVER_NODES/$NUM_OSERVER_NODES/g s/@NUM_BACKEND_PES/$NUM_BACKEND_PES/g s/@RESTART_BY_OSERVER/$RESTART_BY_OSERVER/g +s#@SEVERAL_TRIES#$SEVERAL_TRIES#g s/@NCPUS_PER_NODE/$NCPUS_PER_NODE/g s/@NUM_READERS/$NUM_READERS/g s/@NUM_WRITERS/$NUM_WRITERS/g diff --git a/stratchem_setup b/stratchem_setup index 41bd1eb5..e1e709d3 100755 --- a/stratchem_setup +++ b/stratchem_setup @@ -2172,6 +2172,9 @@ set RESTART_BY_OSERVER = NO /bin/rm -f $HOMDIR/SETENV.commands +# NAS has a "several_tries" script but we need an empty +# default +set SEVERAL_TRIES = '' if( $MPI_STACK == openmpi ) then @@ -2221,6 +2224,10 @@ EOF else if( $MPI_STACK == mpt ) then +# NAS recommends several_tries for MPT job issues +# https://www.nas.nasa.gov/hecc/support/kb/mpt-startup-failures-workarounds_526.html +set SEVERAL_TRIES = '/u/scicon/tools/bin/several_tries' + cat > $HOMDIR/SETENV.commands << EOF setenv MPI_COLL_REPRODUCIBLE @@ -2283,6 +2290,9 @@ setenv I_MPI_ADJUST_GATHERV 3 setenv I_MPI_FABRICS shm:ofi setenv I_MPI_OFI_PROVIDER psm3 + +# This has been found to help with congestion +setenv FI_PSM3_CONN_TIMEOUT 120 EOF endif # if NCCS @@ -2449,6 +2459,7 @@ s/@USE_IOSERVER/$USE_IOSERVER/g s/@NUM_OSERVER_NODES/$NUM_OSERVER_NODES/g s/@NUM_BACKEND_PES/$NUM_BACKEND_PES/g s/@RESTART_BY_OSERVER/$RESTART_BY_OSERVER/g +s#@SEVERAL_TRIES#$SEVERAL_TRIES#g s/@NCPUS_PER_NODE/$NCPUS_PER_NODE/g s/@NUM_READERS/$NUM_READERS/g s/@NUM_WRITERS/$NUM_WRITERS/g