
Commit 2358d29

Add scripts for santis/alps, example case, and captures for UVM comms via RDMA
1 parent a1d1576 commit 2358d29

8 files changed: +252 −3 lines changed

CMakeLists.txt

Lines changed: 3 additions & 3 deletions
@@ -486,17 +486,17 @@ function(MFC_SETUP_TARGET)
     endforeach()

     target_compile_options(${a_target}
-        PRIVATE -gpu=keep,ptxinfo,lineinfo
+        PRIVATE -gpu=keep,ptxinfo,lineinfo,fastmath
     )

     # GH-200 Unified Memory Support
     if (MFC_Unified)
         target_compile_options(${ARGS_TARGET}
-            PRIVATE -gpu=unified
+            PRIVATE -gpu=mem:unified -cuda
         )
         # "This option must appear in both the compile and link lines" -- NVHPC Docs
         target_link_options(${ARGS_TARGET}
-            PRIVATE -gpu=unified
+            PRIVATE -gpu=mem:unified -cuda
         )
     endif()
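Two independent changes land in this hunk: fastmath joins the GPU compile options, and the unified-memory build moves from the older -gpu=unified spelling to -gpu=mem:unified -cuda, the newer NVHPC form for GH200. Building with -gpu=mem:unified should also predefine the __NVCOMPILER_GPU_UNIFIED_MEM macro that the m_mpi_common.fpp hunk below keys on; per the NVHPC docs quote kept in the file, the flag must appear on both the compile and link lines.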

Lines changed: 101 additions & 0 deletions
@@ -0,0 +1,101 @@
#!/usr/bin/env python3
import math
import json

N = 799
Nx = N
Ny = 2*(N+1)-1
Nz = 2*(N+1)-1

Re = 1600
L = 1
P0 = 101325
rho0 = 1
C0 = math.sqrt(1.4 * P0)
V0 = 0.1 * C0
mu = V0 * L / Re

cfl = 0.5
dx = 2 * math.pi * L / (Ny + 1)

dt = cfl * dx / (C0)

tC = L / V0
tEnd = 20 * tC

Nt = int(tEnd / dt)
Nt = 10


# Configuring case dictionary
print(
    json.dumps(
        {
            "rdma_mpi": "T",
            # Logistics
            "run_time_info": "F",
            # Computational Domain Parameters
            "x_domain%beg": -math.pi * L,
            "x_domain%end": math.pi * L,
            "y_domain%beg": -math.pi * L,
            "y_domain%end": math.pi * L,
            "z_domain%beg": -math.pi * L,
            "z_domain%end": math.pi * L,
            "m": Nx,
            "n": Ny,
            "p": Nz,
            "cyl_coord": "F",
            "dt": dt,
            "t_step_start": 0,
            "t_step_stop": Nt,
            "t_step_save": int(Nt / 100),
            # Simulation Algorithm Parameters
            "num_patches": 1,
            "model_eqns": 2,
            "num_fluids": 1,
            "time_stepper": 3,
            "bc_x%beg": -1,
            "bc_x%end": -1,
            "bc_y%beg": -1,
            "bc_y%end": -1,
            "bc_z%beg": -1,
            "bc_z%end": -1,
            "igr": "T",
            "igr_order": 5,
            "igr_iter_solver": 1,
            "num_igr_iters": 3,
            "num_igr_warm_start_iters": 3,
            "alf_factor": 10,
            "viscous": "T",
            # Formatted Database Files Structure Parameters
            "format": 1,
            "precision": 2,
            "prim_vars_wrt": "T",
            "omega_wrt(1)": "T",
            "omega_wrt(2)": "T",
            "omega_wrt(3)": "T",
            "qm_wrt": "T",
            "fd_order": 4,
            "parallel_io": "T",
            # Patch 1: Background (AIR - 2)
            "patch_icpp(1)%geometry": 9,
            "patch_icpp(1)%x_centroid": 0,
            "patch_icpp(1)%y_centroid": 0,
            "patch_icpp(1)%z_centroid": 0,
            "patch_icpp(1)%length_x": 2 * math.pi * L,
            "patch_icpp(1)%length_y": 2 * math.pi * L,
            "patch_icpp(1)%length_z": 2 * math.pi * L,
            "patch_icpp(1)%vel(1)": 0.0,
            "patch_icpp(1)%vel(2)": 0.0,
            "patch_icpp(1)%vel(3)": 0,
            "patch_icpp(1)%pres": 0.0,
            "patch_icpp(1)%hcid": 380,
            "patch_icpp(1)%alpha_rho(1)": 1,
            "patch_icpp(1)%alpha(1)": 1,
            # Fluids Physical Parameters
            "fluid_pp(1)%gamma": 1.0e00 / (1.4 - 1),
            "fluid_pp(1)%pi_inf": 0,
            "fluid_pp(1)%Re(1)": 1 / mu,
        }
    )
)
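As a quick sanity check of the derived quantities above (a standalone sketch, not part of the commit; values are approximate):

import math

P0, rho0 = 101325, 1               # ambient pressure [Pa] and density
C0 = math.sqrt(1.4 * P0 / rho0)    # sound speed; rho0 = 1, so same as the file's sqrt(1.4 * P0)  -> ~376.6
V0 = 0.1 * C0                      # Mach-0.1 reference velocity                                 -> ~37.7
dx = 2 * math.pi * 1 / (1599 + 1)  # 2*pi*L over Ny + 1 = 2*(N+1) = 1600 cells                   -> ~3.93e-3
dt = 0.5 * dx / C0                 # acoustic CFL of 0.5                                         -> ~5.2e-6
print(C0, V0, dx, dt)

Note that Nt is computed from tEnd / dt and then immediately overridden to 10, so the committed case runs only ten steps (and t_step_save = int(Nt / 100) evaluates to 0), presumably a short smoke test for the UVM-over-RDMA path.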

misc/nvidia_uvm/bind.sh

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
#!/usr/bin/env bash

# -------------------------------- #
# Binding for a single Santis node #
# -------------------------------- #

# Local rank
export local_rank="${OMPI_COMM_WORLD_LOCAL_RANK:-$SLURM_LOCALID}"

# Bind to GPU
export CUDA_VISIBLE_DEVICES="$local_rank"

# Bind to NIC
export MPICH_OFI_NIC_POLICY=USER
export MPICH_OFI_NIC_MAPPING="0:0; 1:1; 2:2; 3:3"

# Bind to cores (first core per socket)
physcores=(0 72 144 216)

#echo hostname: $(hostname), rank: $local_rank, cores: ${physcores[$local_rank]}, GPU: $CUDA_VISIBLE_DEVICES, NIC mapping: $MPICH_OFI_NIC_POLICY

#set -x
numactl -l --all --physcpubind=${physcores[$local_rank]} "$@"
#set +x
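The script assumes a quad-GH200 Santis node: four Grace sockets of 72 cores each, with a matching GPU and NIC per socket. A minimal sketch of the mapping it establishes (illustrative Python, not part of the commit):

# Per-local-rank binding as set up by bind.sh on one quad-GH200 node.
PHYSCORES = (0, 72, 144, 216)  # first core of each 72-core Grace socket

def binding(local_rank: int) -> dict:
    return {
        "CUDA_VISIBLE_DEVICES": str(local_rank),  # one GPU per rank
        "nic": local_rank,                        # from MPICH_OFI_NIC_MAPPING "0:0; 1:1; 2:2; 3:3"
        "core": PHYSCORES[local_rank],            # passed to numactl --physcpubind
    }

for rank in range(4):
    print(rank, binding(rank))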

misc/nvidia_uvm/nsys.sh

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
#!/bin/bash

#set -x
set -euo pipefail

rank="${OMPI_COMM_WORLD_RANK:-$SLURM_PROCID}"

[[ -z "${NSYS_FILE+x}" ]] && NSYS_FILE=report.qdrep
[[ -z "${NSYS+x}" ]] && NSYS=0

if [[ "$NSYS" -ne 0 && "$rank" -eq 0 ]]; then
    exec nsys profile \
        --cpuctxsw=none -b none -s none \
        --event-sample=system-wide \
        --cpu-socket-events=61,71,265,273 \
        --cpu-socket-metrics=103,104 \
        --event-sampling-interval=10 \
        --trace=nvtx,openacc \
        --force-overwrite=true \
        -e NSYS_MPI_STORE_TEAMS_PER_RANK=1 \
        -o "$NSYS_FILE" "$@"
else
    exec "$@"
fi
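Only global rank 0 runs under the profiler; every other rank exec's the binary directly, so a job yields a single report. The same gating pattern, sketched in Python for clarity (not part of the commit):

import os, sys

# Gate profiling on NSYS=1 and global rank 0, as nsys.sh does.
rank = int(os.environ.get("OMPI_COMM_WORLD_RANK") or os.environ.get("SLURM_PROCID") or 0)
cmd = sys.argv[1:]
if os.environ.get("NSYS", "0") != "0" and rank == 0:
    report = os.environ.get("NSYS_FILE", "report.qdrep")
    cmd = ["nsys", "profile", "--force-overwrite=true", "-o", report] + cmd
os.execvp(cmd[0], cmd)  # replace this process, mirroring bash `exec`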

src/common/m_mpi_common.fpp

Lines changed: 8 additions & 0 deletions
@@ -38,7 +38,9 @@ module m_mpi_common
     !! average primitive variables, for a single computational domain boundary
     !! at the time, from the relevant neighboring processor.

+#ifndef __NVCOMPILER_GPU_UNIFIED_MEM
     $:GPU_DECLARE(create='[buff_send, buff_recv]')
+#endif

     integer :: halo_size
     $:GPU_DECLARE(create='[halo_size]')

@@ -78,7 +80,13 @@ contains

     $:GPU_UPDATE(device='[halo_size, v_size]')

+#ifndef __NVCOMPILER_GPU_UNIFIED_MEM
     @:ALLOCATE(buff_send(0:halo_size), buff_recv(0:halo_size))
+#else
+    ALLOCATE(buff_send(0:halo_size), buff_recv(0:halo_size))
+    !$acc enter data create(capture:buff_send)
+    !$acc enter data create(capture:buff_recv)
+#endif
 #endif

 end subroutine s_initialize_mpi_common_module
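With a unified-memory build (__NVCOMPILER_GPU_UNIFIED_MEM defined by -gpu=mem:unified), the halo exchange buffers are no longer declared or allocated through the GPU macros: they become plain Fortran allocations, and the capture modifier on the enter data directives appears intended to give them a distinct device-resident copy, matching the "captures for UVM comms via RDMA" in the commit message.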

toolchain/mfc/build.py

Lines changed: 3 additions & 0 deletions
@@ -64,6 +64,9 @@ def get_install_dirpath(self, case: Case ) -> str:
         # The install directory is located <root>/build/install/<slug>
         return os.sep.join([os.getcwd(), "build", "install", self.get_slug(case)])

+    def get_home_dirpath(self, case: Case) -> str:
+        return os.sep.join([os.getcwd()])
+
     def get_install_binpath(self, case: Case ) -> str:
         # <root>/install/<slug>/bin/<target>
         return os.sep.join([self.get_install_dirpath(case), "bin", self.name])
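The new get_home_dirpath simply returns the current working directory (the MFC root when invoked through mfc.sh); the santis.mako template below uses it to locate misc/nvidia_uvm/bind.sh and nsys.sh in the source tree rather than the install tree.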

toolchain/modules

Lines changed: 3 additions & 0 deletions
@@ -85,3 +85,6 @@ n-cpu penguin/openmpi/4.1.5/gcc-8.5.0
 n-gpu penguin/openmpi/4.1.5/nvhpc-22.3 nvidia/nvhpc/22.3 cuda/cuda-11.6
 n-gpu CC=nvc CXX=nvc++ FC=nvfortran

+san     CSCS Santis
+san-all cmake python
+san-gpu nvhpc cuda cray-mpich

toolchain/templates/santis.mako

Lines changed: 86 additions & 0 deletions
@@ -0,0 +1,86 @@
#!/usr/bin/env bash

<%namespace name="helpers" file="helpers.mako"/>

% if engine == 'batch':
#SBATCH --uenv=icon/25.2:v1
#SBATCH --nodes=${nodes}
#SBATCH --reservation=g183
#SBATCH --ntasks-per-node=${tasks_per_node}
#SBATCH --job-name="${name}"
#SBATCH --output="${name}.out"
#SBATCH --error="${name}.err"
#SBATCH --time=${walltime}
% if account:
#SBATCH --account=${account}
% endif
% if partition:
#SBATCH --partition=${partition}
% endif
% if quality_of_service:
#SBATCH --qos=${quality_of_service}
% endif
% if email:
#SBATCH --mail-user=${email}
#SBATCH --mail-type="BEGIN, END, FAIL"
% endif
% endif

# NVHPC and CUDA env vars
export NV_ACC_USE_MALLOC=0           # use cudaMallocManaged instead of malloc (compiled using -gpu=mem:unified)
export NVCOMPILER_ACC_NO_MEMHINTS=1  # disable implicit compiler hints
#export CUDA_BUFFER_PAGE_IN_THRESHOLD_MS=0.001 # workaround for copying to/from unpopulated buffers on GH

# Cray MPICH
export MPICH_GPU_SUPPORT_ENABLED=1
export FI_CXI_RX_MATCH_MODE=software
export FI_MR_CACHE_MONITOR=disabled
export MPICH_NO_BUFFER_ALIAS_CHECK=1

# Custom env vars for MFC
export NVIDIA_ALLOC_MODE=0         # do nothing
export NVIDIA_MANUAL_GPU_HINTS=1   # prefloc GPU on some
export NVIDIA_IGR_TEMPS_ON_GPU=3   # jac, jac_rhs, and jac_old on GPU
export NVIDIA_VARS_ON_GPU=7        # q_cons_ts(1)%vf%sf for j=1-7 on GPU

# NSYS
export NSYS=1 # enable nsys profiling
export NSYS_FILE=myreport.qdrep

${helpers.template_prologue()}

ok ":) Loading modules:\n"
cd "${MFC_ROOT_DIR}"
% if engine == 'batch':
. ./mfc.sh load -c san -m ${'g' if gpu else 'c'}
% endif
cd - > /dev/null
echo

% for target in targets:
${helpers.run_prologue(target)}

% if not mpi:
(set -x; ${profiler} "${target.get_install_binpath(case)}")
% else:
(set -x; srun --unbuffered \
    --ntasks=${nodes*tasks_per_node} \
    --cpus-per-task 1 \
    --cpu-bind=none \
% if gpu:
    --gpus-per-task 1 \
% endif
    --wait 200 --bcast=/tmp/${target.name} \
    "${target.get_home_dirpath(case)}/misc/nvidia_uvm/bind.sh" \
% if target.name == 'simulation':
    "${target.get_home_dirpath(case)}/misc/nvidia_uvm/nsys.sh" \
% endif
    "${target.get_install_binpath(case)}")
% endif

${helpers.run_epilogue(target)}

echo
% endfor

${helpers.template_epilogue()}
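For MPI runs, the rendered command chains srun, then bind.sh for per-rank GPU/NIC/core pinning, then (for the simulation target only) the nsys.sh profiler wrapper, and finally the installed binary, so every rank is pinned before the wrapper decides whether to trace it. Note that NSYS=1 is exported unconditionally here, so profiling is on by default for simulation runs under this template.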
