Skip to content

Commit 693c7f4

Browse files
committed
Allow keeping q_cons_ts(2) on CPU using pinned allocations
1 parent 37d393b commit 693c7f4

File tree

2 files changed

+64
-7
lines changed

2 files changed

+64
-7
lines changed

src/simulation/m_time_steppers.fpp

Lines changed: 63 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -75,8 +75,14 @@ module m_time_steppers
7575
integer, private :: num_ts !<
7676
!! Number of time stages in the time-stepping scheme
7777

78+
integer, private :: out_of_core
79+
7880
$:GPU_DECLARE(create='[q_cons_ts,q_prim_vf,q_T_sf,rhs_vf,q_prim_ts,rhs_mv,rhs_pb,max_dt]')
7981

82+
#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
83+
real(wp), allocatable, dimension(:, :, :, :), pinned, target :: q_cons_ts_pool_host
84+
#endif
85+
8086
contains
8187

8288
!> The computation of parameters, the allocation of memory,
@@ -86,6 +92,21 @@ contains
8692

8793
integer :: i, j !< Generic loop iterators
8894

95+
character(len=10) :: out_of_core_str
96+
out_of_core = 0
97+
98+
#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
99+
call get_environment_variable("MFC_OUT_OF_CORE", out_of_core_str)
100+
101+
if (trim(out_of_core_str) == "0") then
102+
out_of_core = 0
103+
elseif (trim(out_of_core_str) == "1") then
104+
out_of_core = 1
105+
else ! default
106+
out_of_core = 0
107+
endif
108+
#endif
109+
89110
! Setting number of time-stages for selected time-stepping scheme
90111
if (time_stepper == 1) then
91112
num_ts = 1
@@ -102,12 +123,33 @@ contains
102123
@:PREFER_GPU(q_cons_ts(i)%vf)
103124
end do
104125

126+
#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
127+
if ( out_of_core == 1 ) then
128+
allocate(q_cons_ts_pool_host(idwbuff(1)%beg:idwbuff(1)%end, &
129+
idwbuff(2)%beg:idwbuff(2)%end, &
130+
idwbuff(3)%beg:idwbuff(3)%end, &
131+
1:sys_size))
132+
end if
133+
#endif
134+
105135
do i = 1, num_ts
106136
do j = 1, sys_size
107-
@:ALLOCATE(q_cons_ts(i)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
108-
idwbuff(2)%beg:idwbuff(2)%end, &
109-
idwbuff(3)%beg:idwbuff(3)%end))
110-
@:PREFER_GPU(q_cons_ts(i)%vf(j)%sf)
137+
#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
138+
if ( i <= (num_ts - out_of_core) ) then
139+
!print*, "q_cons_ts", i, j, "on GPU"
140+
#endif
141+
@:ALLOCATE(q_cons_ts(i)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
142+
idwbuff(2)%beg:idwbuff(2)%end, &
143+
idwbuff(3)%beg:idwbuff(3)%end))
144+
@:PREFER_GPU(q_cons_ts(i)%vf(j)%sf)
145+
#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
146+
else
147+
!print*, "q_cons_ts", i, j, "on CPU"
148+
q_cons_ts(i)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
149+
idwbuff(2)%beg:idwbuff(2)%end, &
150+
idwbuff(3)%beg:idwbuff(3)%end) => q_cons_ts_pool_host(:,:,:,j)
151+
end if
152+
#endif
111153
end do
112154
@:ACC_SETUP_VFs(q_cons_ts(i))
113155
end do
@@ -1205,14 +1247,30 @@ contains
12051247
! Deallocating the cell-average conservative variables
12061248
do i = 1, num_ts
12071249
do j = 1, sys_size
1208-
@:DEALLOCATE(q_cons_ts(i)%vf(j)%sf)
1250+
#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
1251+
if ( i <= (num_ts - out_of_core) ) then
1252+
!print*, "q_cons_ts", i, j, "dealloc"
1253+
#endif
1254+
@:DEALLOCATE(q_cons_ts(i)%vf(j)%sf)
1255+
#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
1256+
else
1257+
!print*, "q_cons_ts", i, j, "nullify"
1258+
nullify(q_cons_ts(i)%vf(j)%sf)
1259+
end if
1260+
#endif
12091261
end do
12101262

12111263
@:DEALLOCATE(q_cons_ts(i)%vf)
12121264
end do
12131265

12141266
@:DEALLOCATE(q_cons_ts)
12151267

1268+
#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
1269+
if ( out_of_core == 1 ) then
1270+
deallocate(q_cons_ts_pool_host)
1271+
end if
1272+
#endif
1273+
12161274
! Deallocating the cell-average primitive ts variables
12171275
if (probe_wrt) then
12181276
do i = 0, 3

toolchain/templates/santis.mako

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,10 +38,9 @@ export FI_MR_CACHE_MONITOR=disabled
3838
export MPICH_NO_BUFFER_ALIAS_CHECK=1
3939

4040
# CUSTOM env vars to MFC
41-
export NVIDIA_ALLOC_MODE=0 # do nothing
41+
export MFC_OUT_OF_CORE=1 # out of core: keep the last q_cons_ts time stage in pinned host (CPU) memory instead of on the GPU
4242
export NVIDIA_MANUAL_GPU_HINTS=1 # set preferred location (prefloc) to GPU on some allocations
4343
export NVIDIA_IGR_TEMPS_ON_GPU=3 # jac, jac_rhs, and jac_old on GPU
44-
export NVIDIA_VARS_ON_GPU=7 # q_cons_ts(1)%vf%sf for j=1-7 on GPU
4544

4645
# NSYS
4746
export NSYS=1 # enable nsys profiling

0 commit comments

Comments
 (0)