Skip to content

Commit 37d393b

Browse files
committed
Add PREFER_GPU and rearrange update for out-of-core computation
1 parent 2358d29 commit 37d393b

File tree

4 files changed

+119
-2
lines changed

4 files changed

+119
-2
lines changed

src/common/include/macros.fpp

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,47 @@
1212
#endif
1313
#:enddef
1414

15+
#:def PREFER_GPU(*args)
16+
#ifdef MFC_SIMULATION
17+
#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
18+
block
19+
use cudafor, gpu_sum => sum, gpu_maxval => maxval, gpu_minval => minval
20+
integer :: istat
21+
integer :: prefer_gpu_mode
22+
character(len=10) :: prefer_gpu_mode_str
23+
24+
! environment variable
25+
call get_environment_variable("NVIDIA_MANUAL_GPU_HINTS", prefer_gpu_mode_str)
26+
if (trim(prefer_gpu_mode_str) == "0") then ! OFF
27+
prefer_gpu_mode = 0
28+
elseif (trim(prefer_gpu_mode_str) == "1") then ! ON
29+
prefer_gpu_mode = 1
30+
else ! default
31+
prefer_gpu_mode = 0
32+
endif
33+
34+
if (prefer_gpu_mode .eq. 1) then
35+
#:for arg in args
36+
!print*, "Moving ${arg}$ to GPU => ", SHAPE(${arg}$)
37+
! unset
38+
istat = cudaMemAdvise( c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseUnSetPreferredLocation, cudaCpuDeviceId )
39+
if (istat /= cudaSuccess) then
40+
write(*,"('Error code: ',I0, ': ')") istat
41+
write(*,*) cudaGetErrorString(istat)
42+
endif
43+
! set
44+
istat = cudaMemAdvise( c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseSetPreferredLocation, 0 )
45+
if (istat /= cudaSuccess) then
46+
write(*,"('Error code: ',I0, ': ')") istat
47+
write(*,*) cudaGetErrorString(istat)
48+
endif
49+
#:endfor
50+
end if
51+
end block
52+
#endif
53+
#endif
54+
#:enddef
55+
1556
#:def ALLOCATE(*args)
1657
@:LOG({'@:ALLOCATE(${re.sub(' +', ' ', ', '.join(args))}$)'})
1758
#:set allocated_variables = ', '.join(args)

src/simulation/m_global_parameters.fpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1308,16 +1308,25 @@ contains
13081308
@:ALLOCATE(x_cb(-1 - buff_size:m + buff_size))
13091309
@:ALLOCATE(x_cc(-buff_size:m + buff_size))
13101310
@:ALLOCATE(dx(-buff_size:m + buff_size))
1311+
@:PREFER_GPU(x_cb)
1312+
@:PREFER_GPU(x_cc)
1313+
@:PREFER_GPU(dx)
13111314
13121315
if (n == 0) return;
13131316
@:ALLOCATE(y_cb(-1 - buff_size:n + buff_size))
13141317
@:ALLOCATE(y_cc(-buff_size:n + buff_size))
13151318
@:ALLOCATE(dy(-buff_size:n + buff_size))
1319+
@:PREFER_GPU(y_cb)
1320+
@:PREFER_GPU(y_cc)
1321+
@:PREFER_GPU(dy)
13161322
13171323
if (p == 0) return;
13181324
@:ALLOCATE(z_cb(-1 - buff_size:p + buff_size))
13191325
@:ALLOCATE(z_cc(-buff_size:p + buff_size))
13201326
@:ALLOCATE(dz(-buff_size:p + buff_size))
1327+
@:PREFER_GPU(z_cb)
1328+
@:PREFER_GPU(z_cc)
1329+
@:PREFER_GPU(dz)
13211330
13221331
end subroutine s_initialize_global_parameters_module
13231332

src/simulation/m_igr.fpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,17 +91,23 @@ contains
9191
end do
9292
end do
9393
$:GPU_UPDATE(device='[Res, Re_idx, Re_size]')
94+
@:PREFER_GPU(Res)
95+
@:PREFER_GPU(Re_idx)
9496
end if
9597

9698
@:ALLOCATE(jac(idwbuff(1)%beg:idwbuff(1)%end, &
9799
idwbuff(2)%beg:idwbuff(2)%end, &
98100
idwbuff(3)%beg:idwbuff(3)%end))
101+
@:PREFER_GPU(jac)
102+
99103
@:ALLOCATE(jac_rhs(-1:m,-1:n,-1:p))
104+
@:PREFER_GPU(jac_rhs)
100105

101106
if (igr_iter_solver == 1) then ! Jacobi iteration
102107
@:ALLOCATE(jac_old(idwbuff(1)%beg:idwbuff(1)%end, &
103108
idwbuff(2)%beg:idwbuff(2)%end, &
104109
idwbuff(3)%beg:idwbuff(3)%end))
110+
@:PREFER_GPU(jac_old)
105111
end if
106112

107113
$:GPU_PARALLEL_LOOP(collapse=3)

src/simulation/m_time_steppers.fpp

Lines changed: 63 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -95,16 +95,19 @@ contains
9595

9696
! Allocating the cell-average conservative variables
9797
@:ALLOCATE(q_cons_ts(1:num_ts))
98+
@:PREFER_GPU(q_cons_ts)
9899

99100
do i = 1, num_ts
100101
@:ALLOCATE(q_cons_ts(i)%vf(1:sys_size))
102+
@:PREFER_GPU(q_cons_ts(i)%vf)
101103
end do
102104

103105
do i = 1, num_ts
104106
do j = 1, sys_size
105107
@:ALLOCATE(q_cons_ts(i)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
106108
idwbuff(2)%beg:idwbuff(2)%end, &
107109
idwbuff(3)%beg:idwbuff(3)%end))
110+
@:PREFER_GPU(q_cons_ts(i)%vf(j)%sf)
108111
end do
109112
@:ACC_SETUP_VFs(q_cons_ts(i))
110113
end do
@@ -304,11 +307,13 @@ contains
304307

305308
! Allocating the cell-average RHS variables
306309
@:ALLOCATE(rhs_vf(1:sys_size))
310+
@:PREFER_GPU(rhs_vf)
307311

308312
if (igr) then
309313
do i = 1, sys_size
310314
@:ALLOCATE(rhs_vf(i)%sf(-1:m+1,-1:n+1,-1:p+1))
311315
@:ACC_SETUP_SFs(rhs_vf(i))
316+
@:PREFER_GPU(rhs_vf(i)%sf)
312317
end do
313318
else
314319
do i = 1, sys_size
@@ -650,6 +655,7 @@ contains
650655
real(wp), intent(INOUT) :: time_avg
651656

652657
integer :: i, j, k, l, q !< Generic loop iterator
658+
integer :: dest
653659

654660
real(wp) :: start, finish
655661

@@ -682,6 +688,7 @@ contains
682688

683689
if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=1)
684690

691+
#if !defined(__NVCOMPILER_GPU_UNIFIED_MEM)
685692
$:GPU_PARALLEL_LOOP(collapse=4)
686693
do i = 1, sys_size
687694
do l = 0, p
@@ -694,6 +701,24 @@ contains
694701
end do
695702
end do
696703
end do
704+
dest = 2 ! result in q_cons_ts(2)%vf
705+
#else
706+
$:GPU_PARALLEL_LOOP(collapse=4)
707+
do i = 1, sys_size
708+
do l = 0, p
709+
do k = 0, n
710+
do j = 0, m
711+
q_cons_ts(2)%vf(i)%sf(j, k, l) = &
712+
q_cons_ts(1)%vf(i)%sf(j, k, l)
713+
q_cons_ts(1)%vf(i)%sf(j, k, l) = &
714+
q_cons_ts(1)%vf(i)%sf(j, k, l) &
715+
+ dt*rhs_vf(i)%sf(j, k, l)
716+
end do
717+
end do
718+
end do
719+
end do
720+
dest = 1 ! result in q_cons_ts(1)%vf
721+
#endif
697722

698723
!Evolve pb and mv for non-polytropic qbmm
699724
if (qbmm .and. (.not. polytropic)) then
@@ -750,10 +775,11 @@ contains
750775

751776
! Stage 2 of 3
752777

753-
call s_compute_rhs(q_cons_ts(2)%vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb_ts(2)%sf, rhs_pb, mv_ts(2)%sf, rhs_mv, t_step, time_avg, 2)
778+
call s_compute_rhs(q_cons_ts(dest)%vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb_ts(2)%sf, rhs_pb, mv_ts(2)%sf, rhs_mv, t_step, time_avg, 2)
754779

755780
if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=2)
756781

782+
#if !defined(__NVCOMPILER_GPU_UNIFIED_MEM)
757783
$:GPU_PARALLEL_LOOP(collapse=4)
758784
do i = 1, sys_size
759785
do l = 0, p
@@ -767,6 +793,23 @@ contains
767793
end do
768794
end do
769795
end do
796+
dest = 2 ! result in q_cons_ts(2)%vf
797+
#else
798+
$:GPU_PARALLEL_LOOP(collapse=4)
799+
do i = 1, sys_size
800+
do l = 0, p
801+
do k = 0, n
802+
do j = 0, m
803+
q_cons_ts(1)%vf(i)%sf(j, k, l) = &
804+
(3._wp*q_cons_ts(2)%vf(i)%sf(j, k, l) &
805+
+ q_cons_ts(1)%vf(i)%sf(j, k, l) &
806+
+ dt*rhs_vf(i)%sf(j, k, l))/4._wp
807+
end do
808+
end do
809+
end do
810+
end do
811+
dest = 1 ! result in q_cons_ts(1)%vf
812+
#endif
770813

771814
if (qbmm .and. (.not. polytropic)) then
772815
$:GPU_PARALLEL_LOOP(collapse=5)
@@ -823,10 +866,11 @@ contains
823866
end if
824867

825868
! Stage 3 of 3
826-
call s_compute_rhs(q_cons_ts(2)%vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb_ts(2)%sf, rhs_pb, mv_ts(2)%sf, rhs_mv, t_step, time_avg, 3)
869+
call s_compute_rhs(q_cons_ts(dest)%vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb_ts(2)%sf, rhs_pb, mv_ts(2)%sf, rhs_mv, t_step, time_avg, 3)
827870

828871
if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=3)
829872

873+
#if !defined(__NVCOMPILER_GPU_UNIFIED_MEM)
830874
$:GPU_PARALLEL_LOOP(collapse=4)
831875
do i = 1, sys_size
832876
do l = 0, p
@@ -840,6 +884,23 @@ contains
840884
end do
841885
end do
842886
end do
887+
dest = 1 ! result in q_cons_ts(1)%vf
888+
#else
889+
$:GPU_PARALLEL_LOOP(collapse=4)
890+
do i = 1, sys_size
891+
do l = 0, p
892+
do k = 0, n
893+
do j = 0, m
894+
q_cons_ts(1)%vf(i)%sf(j, k, l) = &
895+
(q_cons_ts(2)%vf(i)%sf(j, k, l) &
896+
+ 2._wp*q_cons_ts(1)%vf(i)%sf(j, k, l) &
897+
+ 2._wp*dt*rhs_vf(i)%sf(j, k, l))/3._wp
898+
end do
899+
end do
900+
end do
901+
end do
902+
dest = 1 ! result in q_cons_ts(1)%vf
903+
#endif
843904

844905
if (qbmm .and. (.not. polytropic)) then
845906
$:GPU_PARALLEL_LOOP(collapse=5)

0 commit comments

Comments
 (0)