Skip to content

Commit 758ef54

Browse files
bdorschn (Benedikt Dorschner)
authored and committed
uvm changes from IGR_OPT_Alps
- Missing are the acc captures for the mpi buffers
1 parent c933e72 commit 758ef54

File tree

12 files changed

+564
-14
lines changed

12 files changed

+564
-14
lines changed

CMakeLists.txt

Lines changed: 19 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -135,17 +135,17 @@ if (CMAKE_Fortran_COMPILER_ID STREQUAL "GNU")
135135
if (CMAKE_BUILD_TYPE STREQUAL "Debug")
136136
add_compile_options(
137137
-Wall
138-
-Wextra
138+
-Wextra
139139
-fcheck=all,no-array-temps
140140
-fbacktrace
141141
-fimplicit-none
142142
-fsignaling-nans
143143
-finit-real=snan
144144
-finit-integer=-99999999
145-
-Wintrinsic-shadow
146-
-Wunderflow
147-
-Wrealloc-lhs
148-
-Wsurprising
145+
-Wintrinsic-shadow
146+
-Wunderflow
147+
-Wrealloc-lhs
148+
-Wsurprising
149149
)
150150
endif()
151151

@@ -163,7 +163,6 @@ elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
163163
"SHELL:-h acc_model=auto_async_none"
164164
"SHELL: -h acc_model=no_fast_addr"
165165
"SHELL: -h list=adm"
166-
"SHELL: -munsafe-fp-atomics" # Not unsafe for operations we do
167166
)
168167

169168
add_link_options("SHELL:-hkeepfiles")
@@ -173,6 +172,7 @@ elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
173172
"SHELL:-h acc_model=auto_async_none"
174173
"SHELL: -h acc_model=no_fast_addr"
175174
"SHELL: -K trap=fp" "SHELL: -G2"
175+
176176
)
177177
add_link_options("SHELL: -K trap=fp" "SHELL: -G2")
178178
endif()
@@ -486,23 +486,23 @@ function(MFC_SETUP_TARGET)
486486
endforeach()
487487

488488
target_compile_options(${a_target}
489-
PRIVATE -gpu=keep,ptxinfo,lineinfo
489+
PRIVATE -gpu=keep,ptxinfo,lineinfo,fastmath
490490
)
491491

492492
# GH-200 Unified Memory Support
493493
if (MFC_Unified)
494494
target_compile_options(${ARGS_TARGET}
495-
PRIVATE -gpu=unified
495+
PRIVATE -gpu=mem:unified -cuda
496496
)
497497
# "This option must appear in both the compile and link lines" -- NVHPC Docs
498498
target_link_options(${ARGS_TARGET}
499-
PRIVATE -gpu=unified
499+
PRIVATE -gpu=mem:unified -cuda
500500
)
501501
endif()
502502

503503
if (CMAKE_BUILD_TYPE STREQUAL "Debug")
504504
target_compile_options(${a_target}
505-
PRIVATE -gpu=autocompare,debug
505+
PRIVATE -gpu=autocompare,debug -cuda
506506
)
507507
endif()
508508
elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
@@ -513,9 +513,15 @@ function(MFC_SETUP_TARGET)
513513
target_compile_options(${a_target} PRIVATE "SHELL:-h noacc" "SHELL:-x acc")
514514
endif()
515515

516-
if (CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC" OR CMAKE_Fortran_COMPILER_ID STREQUAL "PGI")
517-
find_package(CUDAToolkit REQUIRED)
518-
target_link_libraries(${a_target} PRIVATE CUDA::nvToolsExt)
516+
if (CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC" OR
517+
CMAKE_Fortran_COMPILER_ID STREQUAL "PGI")
518+
519+
if (TARGET CUDA::nvToolsExt) # CUDA <= 12.8
520+
target_link_libraries(${a_target} PRIVATE CUDA::nvToolsExt)
521+
else() # CUDA >= 12.9
522+
target_link_libraries(${a_target} PRIVATE nvhpcwrapnvtx )
523+
endif()
524+
target_link_options(${a_target} PRIVATE "-cudalib=nvtx")
519525
endif()
520526
endforeach()
521527

misc/nvidia_uvm/README.md

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
## The Main Idea behind the implemented Out-of-Core Strategy for Grace-Hopper
2+
3+
To run MFC out-of-core on Grace-Hopper using Unified Memory we implement a zero-copy strategy.
4+
5+
We start by setting preferred location CPU for all buffers by hooking into the allocate macro and setting `NVIDIA_ALLOC_MODE=2`.
6+
This way we disable access counter based migrations and keep everything on the CPU memory, freeing up as much GPU memory as possible.
7+
8+
Then, for the "important" buffers that are frequently accessed from the GPU, we reset preferred location to GPU in order to place them (and directly populate them) in GPU memory.
9+
This is done by the `PREFER_GPU` macro that has been manually placed in the code right after the allocations of the "important" buffers.
10+
To activate these hints we export `NVIDIA_MANUAL_GPU_HINTS=1`.
11+
12+
To allow fine-grained control and be able to simulate larger sizes, we also use the following environment variables:
13+
- With `NVIDIA_IGR_TEMPS_ON_GPU` we control how many temporaries from the IGR module are to be placed in GPU memory.
14+
- With `NVIDIA_VARS_ON_GPU` we control how many of the `q_cons_ts(1)%vf(j)%sf` arrays we place in GPU memory.
15+
16+
It is important to note that we have rearranged the timestep updates in the 3rd order TVD Runge Kutta scheme in a way that allows us to pass only `q_cons_ts(1)` to the `compute_rhs` routines.
17+
This way, in order to keep the computation of `compute_rhs` (mostly) on GPU data, we only need to store `q_cons_ts(1)` (fully or even partially) in GPU memory.
18+
Thus, we choose to keep `q_cons_ts(2)` in CPU memory for the full lifetime of the simulation, freeing up space in GPU memory that allows for bumping up the size of the simulation, without sacrificing performance.
19+
In the timestep updates between the `compute_rhs` calls, we access both `q_cons_ts(1)` and `q_cons_ts(2)` directly from the physical location where they reside (zero-copy), simultaneously pulling data from GPU memory and CPU memory (through C2C), making good use of Grace-Hopper.
20+
21+
Note: This rearrangement most likely "breaks" the timestepper for different physics cases, but we can easily fix it in a later step.
22+
23+
## Example Workflow for Out-of-Core Strategy based on Unified Memory
24+
25+
```shell
26+
# Allocate a node
27+
salloc -A g183 --partition normal -t 02:00:00 -N 1 -n 4 --cpus-per-task=71
28+
29+
# Start uenv
30+
uenv start --view=modules icon/25.2:v1
31+
32+
# cd to root directory of MFC
33+
cd MFC-Wilfong
34+
35+
# Load modules
36+
. ./mfc.sh load -c san -m g
37+
38+
# Build
39+
export MFC_CUDA_CC=90
40+
./mfc.sh build --gpu -j 71 --single --unified --verbose
41+
42+
# Run pre_process and simulation binaries with case optimization (in an interactive job)
43+
./mfc.sh run examples/3D_IGR_perf_test/case.py --case-optimization -t pre_process simulation --gpu -N 1 -n 4 -j 71 -c santis
44+
45+
# Run pre_process and simulation binaries with case optimization (in a batch job)
46+
./mfc.sh run examples/3D_IGR_perf_test/case.py --case-optimization -t pre_process simulation --gpu -N 1 -n 4 -j 71 -c santis -e batch -p normal -a g183 -w 00:15:00
47+
```
48+
The environment variables `NVIDIA_ALLOC_MODE`, `NVIDIA_MANUAL_GPU_HINTS`, `NVIDIA_VARS_ON_GPU`, and `NVIDIA_IGR_TEMPS_ON_GPU`, can be set appropriately in `toolchain/templates/santis.mako`, to configure a run with ALL buffers either in GPU or in CPU memory, or a run with SOME buffers in GPU memory and the rest in CPU memory.

misc/nvidia_uvm/bind.sh

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
#!/usr/bin/env bash
2+
3+
# -------------------------------- #
4+
# Binding for a single Santis node #
5+
# -------------------------------- #
6+
7+
# Local rank
8+
export local_rank="${OMPI_COMM_WORLD_LOCAL_RANK:-$SLURM_LOCALID}"
9+
10+
# Bind to GPU
11+
export CUDA_VISIBLE_DEVICES="$local_rank"
12+
13+
# Binding to NIC
14+
export MPICH_OFI_NIC_POLICY=USER
15+
export MPICH_OFI_NIC_MAPPING="0:0; 1:1; 2:2; 3:3"
16+
17+
# Bind to cores ( second core per socket )
18+
physcores=(0 72 144 216)
19+
20+
#echo rank: $local_rank, cores: ${physcores[$local_rank]}, GPU: $CUDA_VISIBLE_DEVICES, NIC mapping: $MPICH_OFI_NIC_POLICY
21+
numactl -l --all --physcpubind=${physcores[$local_rank]} "$@"

misc/nvidia_uvm/nsys.sh

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
#!/bin/bash
2+
3+
#set -x
4+
set -euo pipefail
5+
6+
rank="${OMPI_COMM_WORLD_RANK:-$SLURM_PROCID}"
7+
8+
[[ -z "${NSYS_FILE+x}" ]] && NSYS_FILE=report.qdrep
9+
[[ -z "${NSYS+x}" ]] && NSYS=0
10+
11+
if [[ "$NSYS" -ne 0 && "$rank" -eq 0 ]]; then
12+
echo "Doing nsys"
13+
exec nsys profile \
14+
--cpuctxsw=none -b none -s none \
15+
--event-sample=system-wide \
16+
--cpu-socket-events=61,71,265,273 \
17+
--cpu-socket-metrics=103,104 \
18+
--event-sampling-interval=10 \
19+
--trace=nvtx,openacc \
20+
--force-overwrite=true \
21+
-e NSYS_MPI_STORE_TEAMS_PER_RANK=1 \
22+
-o "$NSYS_FILE" "$@"
23+
else
24+
exec "$@"
25+
fi

misc/nvidia_uvm/run.sh

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
#!/usr/bin/env bash
2+
3+
# TODO: Modify accordingly
4+
PATH_TO_BINARY=${SCRATCH}/projects/cfd/mfc/MFC-Wilfong/build/install/cdcd4e8762/bin/
5+
6+
# NVHPC and CUDA env vars
7+
export NV_ACC_USE_MALLOC=1 # use malloc instead of cudaMallocManaged ( compiled using -gpu=mem:unified )
8+
export NVCOMPILER_ACC_NO_MEMHINTS=1 # disable implicit compiler hints
9+
export CUDA_BUFFER_PAGE_IN_THRESHOLD_MS=0.001 # workaround for copying to/from unpopulated buffers on GH
10+
11+
# Cray MPICH
12+
export MPICH_GPU_SUPPORT_ENABLED=1 # MPICH with GPU support
13+
export FI_CXI_RX_MATCH_MODE=software
14+
export FI_MR_CACHE_MONITOR=disabled
15+
16+
# CUSTOM env vars to MFC
17+
export NVIDIA_ALLOC_MODE=2 # default alloc to prefloc CPU
18+
export NVIDIA_MANUAL_GPU_HINTS=1 # prefloc GPU on some
19+
export NVIDIA_IGR_TEMPS_ON_GPU=1 # jac on GPU and jac_rhs on CPU ( NOTE: good default, tune based on size )
20+
export NVIDIA_VARS_ON_GPU=7 # q_cons_ts(1)%vf%sf for j=1-7 on GPU ( NOTE: good default, tune based on size )
21+
22+
# NSYS
23+
export NSYS=1 # enable nsys profiling
24+
export NSYS_FILE=report_uvm_single_N-499_nGPUs-4_params-${NVIDIA_VARS_ON_GPU}-${NVIDIA_IGR_TEMPS_ON_GPU}.qdrep
25+
26+
# Run using --cpu-bind=none because we use our own binding script
27+
srun --ntasks 4 --cpu-bind=none ./bind.sh ./nsys.sh ${PATH_TO_BINARY}/simulation

src/common/include/macros.fpp

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,113 @@
1212
#endif
1313
#:enddef
1414

15+
#:def PREFER_GPU(*args)
16+
#ifdef MFC_SIMULATION
17+
#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
18+
block
19+
use cudafor
20+
intrinsic :: minval, maxval, sum
21+
integer :: istat
22+
integer :: prefer_gpu_mode
23+
character(len=10) :: prefer_gpu_mode_str
24+
25+
! environment variable
26+
call get_environment_variable("NVIDIA_MANUAL_GPU_HINTS", prefer_gpu_mode_str)
27+
if (trim(prefer_gpu_mode_str) == "0") then ! OFF
28+
prefer_gpu_mode = 0
29+
elseif (trim(prefer_gpu_mode_str) == "1") then ! ON
30+
prefer_gpu_mode = 1
31+
else ! default
32+
prefer_gpu_mode = 0
33+
endif
34+
35+
if (prefer_gpu_mode .eq. 1) then
36+
#:for arg in args
37+
!print*, "Moving ${arg}$ to GPU => ", SHAPE(${arg}$)
38+
! unset
39+
istat = cudaMemAdvise( c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseUnSetPreferredLocation, cudaCpuDeviceId )
40+
if (istat /= cudaSuccess) then
41+
write(*,"('Error code: ',I0, ': ')") istat
42+
write(*,*) cudaGetErrorString(istat)
43+
endif
44+
! set
45+
istat = cudaMemAdvise( c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseSetPreferredLocation, 0 )
46+
if (istat /= cudaSuccess) then
47+
write(*,"('Error code: ',I0, ': ')") istat
48+
write(*,*) cudaGetErrorString(istat)
49+
endif
50+
#:endfor
51+
end if
52+
end block
53+
#endif
54+
#endif
55+
#:enddef
56+
57+
58+
#:def PARSE(s)
59+
${s if s.rfind(')') == -1 else next((s[:i] for i in range(s.rfind(')'), -1, -1) if s[i] == '(' and s.count('(', i, s.rfind(')')+1) == s.count(')', i, s.rfind(')')+1)), s)}$
60+
#:enddef
61+
1562
#:def ALLOCATE(*args)
1663
@:LOG({'@:ALLOCATE(${re.sub(' +', ' ', ', '.join(args))}$)'})
1764
#:set allocated_variables = ', '.join(args)
1865
allocate (${allocated_variables}$)
1966
$:GPU_ENTER_DATA(create=('[' + allocated_variables + ']'))
67+
68+
69+
#ifdef MFC_SIMULATION
70+
#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
71+
block
72+
use cudafor
73+
intrinsic :: minval, maxval, sum
74+
integer :: istat, stream_id
75+
integer :: alloc_mode
76+
character(len=10) :: alloc_mode_str
77+
78+
! environment variable
79+
call get_environment_variable("NVIDIA_ALLOC_MODE", alloc_mode_str)
80+
if (trim(alloc_mode_str) == "0") then ! no CPU first touch, no preferred location CPU
81+
alloc_mode = 0
82+
elseif (trim(alloc_mode_str) == "1") then ! CPU first touch, no preferred location CPU
83+
alloc_mode = 1
84+
elseif (trim(alloc_mode_str) == "2") then ! no CPU first touch, preferred location CPU
85+
alloc_mode = 2
86+
elseif (trim(alloc_mode_str) == "3") then ! CPU first touch, preferred location CPU
87+
alloc_mode = 3
88+
else ! default
89+
alloc_mode = 0
90+
endif
91+
92+
stream_id = 0
93+
94+
! prefetch to CPU
95+
if ((alloc_mode .eq. 1) .or. (alloc_mode .eq. 3)) then
96+
#:for arg in args
97+
istat = cudaMemPrefetchAsync( c_devloc(@{PARSE(${arg}$)}@), SIZEOF(@{PARSE(${arg}$)}@), cudaCpuDeviceId, stream_id )
98+
!print*, "! @{PARSE(${arg}$)}@ with shape", SHAPE(@{PARSE(${arg}$)}@), "=> prefetch to CPU"
99+
if (istat /= cudaSuccess) then
100+
write(*,"('Error code: ',I0, ': ')") istat
101+
write(*,*) cudaGetErrorString(istat)
102+
endif
103+
#:endfor
104+
endif
105+
106+
! memadvise preferred location
107+
if ((alloc_mode .eq. 2) .or. (alloc_mode .eq. 3)) then
108+
#:for arg in args
109+
istat = cudaMemAdvise( c_devloc(@{PARSE(${arg}$)}@), SIZEOF(@{PARSE(${arg}$)}@), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId )
110+
!print*, "! @{PARSE(${arg}$)}@ with shape", SHAPE(@{PARSE(${arg}$)}@), "=> preferred location CPU"
111+
if (istat /= cudaSuccess) then
112+
write(*,"('Error code: ',I0, ': ')") istat
113+
write(*,*) cudaGetErrorString(istat)
114+
endif
115+
#:endfor
116+
endif
117+
118+
end block
119+
#endif
120+
#endif
121+
20122
#:enddef ALLOCATE
21123

22124
#:def DEALLOCATE(*args)

src/simulation/m_global_parameters.fpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1294,16 +1294,25 @@ contains
12941294
@:ALLOCATE(x_cb(-1 - buff_size:m + buff_size))
12951295
@:ALLOCATE(x_cc(-buff_size:m + buff_size))
12961296
@:ALLOCATE(dx(-buff_size:m + buff_size))
1297+
@:PREFER_GPU(x_cb)
1298+
@:PREFER_GPU(x_cc)
1299+
@:PREFER_GPU(dx)
12971300
12981301
if (n == 0) return;
12991302
@:ALLOCATE(y_cb(-1 - buff_size:n + buff_size))
13001303
@:ALLOCATE(y_cc(-buff_size:n + buff_size))
13011304
@:ALLOCATE(dy(-buff_size:n + buff_size))
1305+
@:PREFER_GPU(y_cb)
1306+
@:PREFER_GPU(y_cc)
1307+
@:PREFER_GPU(dy)
13021308
13031309
if (p == 0) return;
13041310
@:ALLOCATE(z_cb(-1 - buff_size:p + buff_size))
13051311
@:ALLOCATE(z_cc(-buff_size:p + buff_size))
13061312
@:ALLOCATE(dz(-buff_size:p + buff_size))
1313+
@:PREFER_GPU(z_cb)
1314+
@:PREFER_GPU(z_cc)
1315+
@:PREFER_GPU(dz)
13071316
13081317
end subroutine s_initialize_global_parameters_module
13091318

0 commit comments

Comments
 (0)