
Commit 68c4a5c

Update MPI settings to match gcm_setup
1 parent 608e47d commit 68c4a5c

File tree

1 file changed: +88 −41 lines

scripts/fv3_setup

Lines changed: 88 additions & 41 deletions
@@ -292,7 +292,6 @@ if ( $SITE == 'NCCS' ) then
    set BUILT_ON_SLES15 = @BUILT_ON_SLES15@
 
    if ("$BUILT_ON_SLES15" == "TRUE") then
-      set DEFAULT_MODEL = 'mil'
       echo "Enter the ${C1}Processor Type${CN} you wish to run on:"
       echo " ${C2}mil (Milan)${CN} (default)"
       echo " "
@@ -311,7 +310,7 @@ if ( $SITE == 'NCCS' ) then
    else
       echo "Enter the ${C1}Processor Type${CN} you wish to run on:"
       echo " ${C2}sky (Skylake)${CN}"
-      echo " ${C2}cas (Cascade Lake)${CN} (default)"
+      echo " ${C2}cas (Cascade Lake) (default)${CN}"
       echo " "
       set MODEL = `echo $<`
       set MODEL = `echo $MODEL | tr "[:upper:]" "[:lower:]"`
@@ -340,20 +339,17 @@ else if ( $SITE == 'NAS' ) then
    echo "Enter the ${C1}Processor Type${CN} you wish to run on:"
    echo " ${C2}has (Haswell)${CN}"
    echo " ${C2}bro (Broadwell)${CN}"
-   echo " ${C2}sky (Skylake)${CN}"
-   echo " ${C2}cas (Cascade Lake)${CN} (default)"
+   echo " ${C2}sky (Skylake)${CN} (default)"
+   echo " ${C2}cas (Cascade Lake)${CN}"
    echo " ${C2}rom (AMD Rome)${CN}"
    echo " "
-   echo " NOTE 1: Due to how FV3 is compiled by default, Sandy Bridge"
-   echo "         and Ivy Bridge are not supported by current GEOS"
-   echo " "
-   echo " NOTE 2: GEOS is non-zero-diff when running on AMD Rome"
-   echo "         compared to the other Intel nodes."
+   echo " NOTE Due to how FV3 is compiled by default, Sandy Bridge"
+   echo "      and Ivy Bridge are not supported by current GEOS"
    echo " "
    set MODEL = `echo $<`
    set MODEL = `echo $MODEL | tr "[:upper:]" "[:lower:]"`
    if ( .$MODEL == .) then
-      set MODEL = 'cas'
+      set MODEL = 'sky'
    endif
 
    if( $MODEL != 'has' & \
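Note on the hunk above: the .$MODEL == . test is the usual csh guard for an empty reply, since prefixing both operands with a dot keeps the comparison legal when $MODEL expands to nothing. A minimal sketch of the idiom, with a hypothetical ANSWER variable standing in for the prompt result:

   # csh sketch: fall back to a default when the user just presses Enter
   set ANSWER = `echo $<`
   if ( .$ANSWER == . ) then
      set ANSWER = 'sky'
   endif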
@@ -697,56 +693,87 @@ echo $GROUP > $HOME/.GROUProot
 # Set Recommended MPI Stack Settings
 #######################################################################
 
+# By default do not write restarts by oserver
+set RESTART_BY_OSERVER = NO
+
 /bin/rm -f $EXPDIR/SETENV.commands
 
 if( $MPI_STACK == openmpi ) then
 
-   # This turns off an annoying warning when running
-   # Open MPI on a system where TMPDIRs are on a networked
-   # file system
+   # Open MPI and GEOS has issues with restart writing. Having the
+   # oserver write them can be orders of magnitude faster
+
+   set RESTART_BY_OSERVER = YES
+
+   # Testing by Bill Putman determined some useful
+   # Open MPI parameters. Testing shows these work
+   # on both OSs at NCCS and on macOS
 
    cat > $EXPDIR/SETENV.commands << EOF
-setenv OMPI_MCA_shmem_mmap_enable_nfs_warning 0
+# Turn off warning about TMPDIR on NFS
+setenv OMPI_MCA_shmem_mmap_enable_nfs_warning 0
+# pre-connect MPI procs on mpi_init
+setenv OMPI_MCA_mpi_preconnect_all 1
+setenv OMPI_MCA_coll_tuned_bcast_algorithm 7
+setenv OMPI_MCA_coll_tuned_scatter_algorithm 2
+setenv OMPI_MCA_coll_tuned_reduce_scatter_algorithm 3
+setenv OMPI_MCA_coll_tuned_allreduce_algorithm 3
+setenv OMPI_MCA_coll_tuned_allgather_algorithm 4
+setenv OMPI_MCA_coll_tuned_allgatherv_algorithm 3
+setenv OMPI_MCA_coll_tuned_gather_algorithm 1
+setenv OMPI_MCA_coll_tuned_barrier_algorithm 0
+# required for a tuned flag to be effective
+setenv OMPI_MCA_coll_tuned_use_dynamic_rules 1
+# disable file locks
+setenv OMPI_MCA_sharedfp "^lockedfile,individual"
 EOF
 
 # The below settings seem to be recommended for hybrid
-# systems using MVAPICH2 but could change
+# systems using MVAPICH but could change
 
 else if( $MPI_STACK == mvapich ) then
 
+   # MVAPICH and GEOS has issues with restart writing. Having the
+   # oserver write them seems to...work
+   set RESTART_BY_OSERVER = YES
+
    cat > $EXPDIR/SETENV.commands << EOF
-setenv MV2_ENABLE_AFFINITY 0
-setenv SLURM_DISTRIBUTION block
-setenv MV2_MPIRUN_TIMEOUT 100
-setenv MV2_GATHERV_SSEND_THRESHOLD 256
+setenv MV2_ENABLE_AFFINITY 0
+setenv SLURM_DISTRIBUTION block
+setenv MV2_MPIRUN_TIMEOUT 100
+setenv MV2_GATHERV_SSEND_THRESHOLD 256
 EOF
 
 else if( $MPI_STACK == mpt ) then
 
    cat > $EXPDIR/SETENV.commands << EOF
 
-setenv MPI_COLL_REPRODUCIBLE
-setenv SLURM_DISTRIBUTION block
+setenv MPI_COLL_REPRODUCIBLE
+setenv SLURM_DISTRIBUTION block
 
-#setenv MPI_DISPLAY_SETTINGS 1
-#setenv MPI_VERBOSE 1
+#setenv MPI_DISPLAY_SETTINGS 1
+#setenv MPI_VERBOSE 1
 
-unsetenv MPI_MEMMAP_OFF
-unsetenv MPI_NUM_MEMORY_REGIONS
-setenv MPI_XPMEM_ENABLED yes
-unsetenv SUPPRESS_XPMEM_TRIM_THRESH
+setenv MPI_MEMMAP_OFF
+unsetenv MPI_NUM_MEMORY_REGIONS
+setenv MPI_XPMEM_ENABLED yes
+unsetenv SUPPRESS_XPMEM_TRIM_THRESH
 
-setenv MPI_LAUNCH_TIMEOUT 40
+setenv MPI_LAUNCH_TIMEOUT 40
 
-# For some reason, PMI_RANK is randomly set and interferes
-# with binarytile.x and other executables.
-unsetenv PMI_RANK
+setenv MPI_COMM_MAX 1024
+setenv MPI_GROUP_MAX 1024
+setenv MPI_BUFS_PER_PROC 256
 
-# Often when debugging on MPT, the traceback from Intel Fortran
-# is "absorbed" and only MPT's errors are displayed. To allow the
-# compiler's traceback to be displayed, uncomment this environment
-# variable
-#setenv FOR_IGNORE_EXCEPTIONS false
+# For some reason, PMI_RANK is randomly set and interferes
+# with binarytile.x and other executables.
+unsetenv PMI_RANK
+
+# Often when debugging on MPT, the traceback from Intel Fortran
+# is "absorbed" and only MPT's errors are displayed. To allow the
+# compiler's traceback to be displayed, uncomment this environment
+# variable
+#setenv FOR_IGNORE_EXCEPTIONS false
 
 EOF
 
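Note on the tuned-collective settings above: the OMPI_MCA_coll_tuned_*_algorithm values each pick one of the numbered algorithms implemented by Open MPI's tuned module, and they are only honored because coll_tuned_use_dynamic_rules is set to 1. A quick sanity check (a sketch, assuming the Open MPI used by the experiment is on PATH) is to enumerate the valid algorithm numbers per collective with ompi_info:

   # List tuned-module MCA parameters, including the allowed algorithm
   # values for each collective (output details vary by Open MPI version)
   ompi_info --param coll tuned --level 9 | grep algorithm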
@@ -782,13 +809,32 @@ EOF
 
 endif # if NOT Singularity
 
-# Testing on SLES15 showed that the mlx provider did not seem
-# to work at scale. So we move to use the verbs provider. Note:
-# still seems to have issues at c720
+# Testing by Bill Putman found these to be
+# useful flags with Intel MPI on SLES15 on the
+# Milan nodes.
+# Note 1: Testing by NCCS shows the PSM3 provider
+# runs on the Infiniband fabric. Tests show it runs
+# up to C720.
+# Note 2: When the Cascade Lakes are moved to
+# SLES15, these will need to be Milan-only flags
+# as Intel MPI will probably work just fine with
+# Intel chips.
 if ("$BUILT_ON_SLES15" == "TRUE") then
    cat >> $EXPDIR/SETENV.commands << EOF
-setenv I_MPI_OFI_PROVIDER verbs
-setenv I_MPI_COLL_EXTERNAL 0
+setenv I_MPI_FALLBACK 0
+setenv I_MPI_FABRICS ofi
+setenv I_MPI_OFI_PROVIDER psm3
+setenv I_MPI_ADJUST_SCATTER 2
+setenv I_MPI_ADJUST_SCATTERV 2
+setenv I_MPI_ADJUST_GATHER 2
+setenv I_MPI_ADJUST_GATHERV 3
+setenv I_MPI_ADJUST_ALLGATHER 3
+setenv I_MPI_ADJUST_ALLGATHERV 3
+setenv I_MPI_ADJUST_ALLREDUCE 12
+setenv I_MPI_ADJUST_REDUCE 10
+setenv I_MPI_ADJUST_BCAST 11
+setenv I_MPI_ADJUST_REDUCE_SCATTER 4
+setenv I_MPI_ADJUST_BARRIER 9
 EOF
 
 endif # if SLES15
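Note on the I_MPI_ADJUST_* block above: each variable pins one collective operation to a specific numbered algorithm in Intel MPI. One hedged way to confirm the selections are actually picked up (assuming a standard Intel MPI runtime) is to raise the debug level for a single verification run, which makes the library report collective algorithm information at startup:

   # csh sketch: temporary, for one verification run only
   setenv I_MPI_DEBUG 6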
@@ -797,6 +843,7 @@ endif # if NCCS
 
 endif # if mpi
 
+
 #######################################################################
 # Create Local Scripts and Resource Files
 #######################################################################
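The heredocs in this diff only write plain setenv/unsetenv lines to $EXPDIR/SETENV.commands; nothing here executes them. A run script is expected to pull the file in before launching MPI, along the lines of this csh sketch (the actual hook in the GEOS run scripts is assumed here, not shown in this diff):

   # csh sketch: apply per-experiment MPI settings before mpirun
   if ( -e $EXPDIR/SETENV.commands ) then
      source $EXPDIR/SETENV.commands
   endif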
