Skip to content

Commit f976b66

Browse files
committed
Added GPU_PARALLEL and replaced rest of acc directives
1 parent ce2a250 commit f976b66

File tree

7 files changed

+145
-51
lines changed

7 files changed

+145
-51
lines changed

docs/documentation/gpuParallelization.md

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,68 @@ Uses FYPP eval directive using `$:`
139139

140140
</details>
141141

142+
<details>
143+
<summary><code>GPU_PARALLEL</code> -- <code>(Execute the following on the GPU in parallel)</code></summary>
144+
145+
**Macro Invocation**
146+
147+
Uses FYPP call directive using `#:call`
148+
149+
```C
150+
#:call GPU_PARALLEL(...)
151+
{code}
152+
#:endcall GPU_PARALLEL
153+
```
154+
155+
**Parameters**
156+
157+
| name | data type | Default Value | description |
158+
|------------------|---------------------|-------------------|-------------------------------------------------------------------------------------------|
159+
| `default` | string | 'present' | Implicit assumptions compiler should make |
160+
| `private` | string list | None | Variables that are private to each iteration/thread |
161+
| `firstprivate` | string list | None | Initialized variables that are private to each iteration/thread |
162+
| `reduction` | 2-level string list | None | Variables unique to each iteration and reduced at the end |
163+
| `reductionOp` | string list | None | Operator that each list of reduction will reduce with |
164+
| `copy` | string list | None | Allocates and copies data to GPU on entrance, then deallocates and copies to CPU on exit |
165+
| `copyin` | string list | None | Allocates and copies data to GPU on entrance and then deallocates on exit |
166+
| `copyinReadOnly` | string list | None | Allocates and copies readonly data to GPU and then deallocates on exit |
167+
| `copyout` | string list | None | Allocates data on GPU on entrance and then deallocates and copies to CPU on exit |
168+
| `create` | string list | None | Allocates data on GPU on entrance and then deallocates on exit |
169+
| `no_create` | string list | None | Use data in CPU memory unless data is already in GPU memory |
170+
| `present` | string list | None | Data that must be present in GPU memory. Increment counter on entrance, decrement on exit |
171+
| `deviceptr` | string list | None | Pointer variables that are already allocated on GPU memory |
172+
| `attach` | string list | None | Attaches device pointer to device targets on entrance, then detaches on exit |
173+
| `extraAccArgs` | string | None | String of any extra arguments added to the OpenACC directive |
174+
175+
**Parameter Restrictions**
176+
177+
| name | Restricted range |
178+
|---------------|---------------------------------------------------|
179+
| `default` | 'present' or 'none' |
180+
181+
**Additional information**
182+
183+
- default present means that any non-scalar data is assumed to be present on the GPU
184+
- default none means that the compiler should not implicitly determine the data attributes for any variable
185+
- reduction and reductionOp must match in length
186+
- With ``reduction='[[sum1, sum2], [largest]]'`` and ``reductionOp='[+, max]'``, `sum1` and `sum2` will be the sum of sum1/sum2 in each loop iteration, and `largest` will be the maximum value of `largest` across all the loop iterations
187+
- A reduction implies a copy, so a reduction variable does not also need to be listed in `copy`
188+
189+
**Example**
190+
191+
```C
192+
#:call GPU_PARALLEL()
193+
{code}
194+
...
195+
#:endcall GPU_PARALLEL
196+
#:call GPU_PARALLEL(create='[pixel_arr]', copyin='[initial_index]')
197+
{code}
198+
...
199+
#:endcall
200+
```
201+
202+
</details>
203+
142204
------------------------------------------------------------------------------------------
143205

144206
### Data Control Macros

src/common/include/parallel_macros.fpp

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,34 @@
199199
$:use_device_val
200200
#:enddef
201201

202+
#:def GPU_PARALLEL(code, private=None, default='present', firstprivate=None, reduction=None, reductionOp=None, &
203+
& copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, &
204+
& no_create=None, present=None, deviceptr=None, attach=None, extraAccArgs=None)
205+
#:set default_val = GEN_DEFAULT_STR(default)
206+
#:set private_val = GEN_PRIVATE_STR(private, False).strip('\n') + GEN_PRIVATE_STR(firstprivate, True).strip('\n')
207+
#:set reduction_val = GEN_REDUCTION_STR(reduction, reductionOp)
208+
#:set copy_val = GEN_COPY_STR(copy)
209+
#:set copyin_val = GEN_COPYIN_STR(copyin, False).strip('\n') + GEN_COPYIN_STR(copyinReadOnly, True).strip('\n')
210+
#:set copyout_val = GEN_COPYOUT_STR(copyout)
211+
#:set create_val = GEN_CREATE_STR(create)
212+
#:set no_create_val = GEN_NOCREATE_STR(no_create)
213+
#:set present_val = GEN_PRESENT_STR(present)
214+
#:set deviceptr_val = GEN_DEVICEPTR_STR(deviceptr)
215+
#:set attach_val = GEN_ATTACH_STR(attach)
216+
#:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs)
217+
#:set clause_val = default_val.strip('\n') + private_val.strip('\n') + reduction_val.strip('\n') + &
218+
& copy_val.strip('\n') + copyin_val.strip('\n') + &
219+
& copyout_val.strip('\n') + create_val.strip('\n') + &
220+
& no_create_val.strip('\n') + present_val.strip('\n') + &
221+
& deviceptr_val.strip('\n') + attach_val.strip('\n')
222+
#:set acc_directive = '!$acc parallel ' + &
223+
& clause_val + extraAccArgs_val.strip('\n')
224+
#:set end_acc_directive = '!$acc end parallel'
225+
$:acc_directive
226+
$:code
227+
$:end_acc_directive
228+
#:enddef
229+
202230

203231
#:def GPU_PARALLEL_LOOP(collapse=None, private=None, parallelism='[gang, vector]', &
204232
& default='present', firstprivate=None, reduction=None, reductionOp=None, &

src/common/m_mpi_common.fpp

Lines changed: 42 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ module m_mpi_common
2525
implicit none
2626
2727
integer, private :: ierr, v_size !<
28-
!$acc declare create(v_size)
28+
$:GPU_DECLARE(create='[v_size]')
2929
!! Generic flags used to identify and report MPI errors
3030
3131
real(wp), private, allocatable, dimension(:) :: buff_send !<
@@ -38,10 +38,10 @@ module m_mpi_common
3838
!! average primitive variables, for a single computational domain boundary
3939
!! at the time, from the relevant neighboring processor.
4040
41-
!$acc declare create(buff_send, buff_recv)
41+
$:GPU_DECLARE(create='[buff_send, buff_recv]')
4242
4343
integer :: halo_size
44-
!$acc declare create(halo_size)
44+
$:GPU_DECLARE(create='[halo_size]')
4545
4646
contains
4747
@@ -76,7 +76,7 @@ contains
7676
halo_size = -1 + buff_size*(v_size)
7777
end if
7878
79-
!$acc update device(halo_size, v_size)
79+
$:GPU_UPDATE(device='[halo_size, v_size]')
8080
8181
@:ALLOCATE(buff_send(0:halo_size), buff_recv(0:halo_size))
8282
#endif
@@ -631,7 +631,7 @@ contains
631631
/)
632632
end if
633633
634-
!$acc update device(v_size)
634+
$:GPU_UPDATE(device='[v_size]')
635635
636636
buffer_count = buffer_counts(mpi_dir)
637637
boundary_conditions = (/bc_x, bc_y, bc_z/)
@@ -667,7 +667,7 @@ contains
667667
#:for mpi_dir in [1, 2, 3]
668668
if (mpi_dir == ${mpi_dir}$) then
669669
#:if mpi_dir == 1
670-
!$acc parallel loop collapse(4) gang vector default(present) private(r)
670+
$:GPU_PARALLEL_LOOP(collapse=4,private='[r]')
671671
do l = 0, p
672672
do k = 0, n
673673
do j = 0, buff_size - 1
@@ -680,7 +680,7 @@ contains
680680
end do
681681
682682
if (qbmm_comm) then
683-
!$acc parallel loop collapse(4) gang vector default(present) private(r)
683+
$:GPU_PARALLEL_LOOP(collapse=4,private='[r]')
684684
do l = 0, p
685685
do k = 0, n
686686
do j = 0, buff_size - 1
@@ -695,7 +695,7 @@ contains
695695
end do
696696
end do
697697
698-
!$acc parallel loop collapse(5) gang vector default(present) private(r)
698+
$:GPU_PARALLEL_LOOP(collapse=5,private='[r]')
699699
do l = 0, p
700700
do k = 0, n
701701
do j = 0, buff_size - 1
@@ -711,7 +711,7 @@ contains
711711
end do
712712
end if
713713
#:elif mpi_dir == 2
714-
!$acc parallel loop collapse(4) gang vector default(present) private(r)
714+
$:GPU_PARALLEL_LOOP(collapse=4,private='[r]')
715715
do i = 1, nVar
716716
do l = 0, p
717717
do k = 0, buff_size - 1
@@ -726,7 +726,7 @@ contains
726726
end do
727727
728728
if (qbmm_comm) then
729-
!$acc parallel loop collapse(5) gang vector default(present) private(r)
729+
$:GPU_PARALLEL_LOOP(collapse=5,private='[r]')
730730
do i = nVar + 1, nVar + 4
731731
do l = 0, p
732732
do k = 0, buff_size - 1
@@ -742,7 +742,7 @@ contains
742742
end do
743743
end do
744744
745-
!$acc parallel loop collapse(5) gang vector default(present) private(r)
745+
$:GPU_PARALLEL_LOOP(collapse=5,private='[r]')
746746
do i = nVar + 1, nVar + 4
747747
do l = 0, p
748748
do k = 0, buff_size - 1
@@ -759,7 +759,7 @@ contains
759759
end do
760760
end if
761761
#:else
762-
!$acc parallel loop collapse(4) gang vector default(present) private(r)
762+
$:GPU_PARALLEL_LOOP(collapse=4,private='[r]')
763763
do i = 1, nVar
764764
do l = 0, buff_size - 1
765765
do k = -buff_size, n + buff_size
@@ -774,7 +774,7 @@ contains
774774
end do
775775
776776
if (qbmm_comm) then
777-
!$acc parallel loop collapse(5) gang vector default(present) private(r)
777+
$:GPU_PARALLEL_LOOP(collapse=5,private='[r]')
778778
do i = nVar + 1, nVar + 4
779779
do l = 0, buff_size - 1
780780
do k = -buff_size, n + buff_size
@@ -790,7 +790,7 @@ contains
790790
end do
791791
end do
792792
793-
!$acc parallel loop collapse(5) gang vector default(present) private(r)
793+
$:GPU_PARALLEL_LOOP(collapse=5,private='[r]')
794794
do i = nVar + 1, nVar + 4
795795
do l = 0, buff_size - 1
796796
do k = -buff_size, n + buff_size
@@ -816,28 +816,33 @@ contains
816816
#:for rdma_mpi in [False, True]
817817
if (rdma_mpi .eqv. ${'.true.' if rdma_mpi else '.false.'}$) then
818818
#:if rdma_mpi
819-
!$acc host_data use_device(buff_send, buff_recv)
820-
call nvtxStartRange("RHS-COMM-SENDRECV-RDMA")
819+
#:call GPU_HOST_DATA(use_device='[buff_send, buff_recv]')
820+
call nvtxStartRange("RHS-COMM-SENDRECV-RDMA")
821+
822+
call MPI_SENDRECV( &
823+
buff_send, buffer_count, mpi_p, dst_proc, send_tag, &
824+
buff_recv, buffer_count, mpi_p, src_proc, recv_tag, &
825+
MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
826+
827+
call nvtxEndRange ! RHS-MPI-SENDRECV-(NO)-RDMA
828+
829+
#:endcall GPU_HOST_DATA
830+
$:GPU_WAIT()
821831
#:else
822832
call nvtxStartRange("RHS-COMM-DEV2HOST")
823-
!$acc update host(buff_send)
833+
$:GPU_UPDATE(host='[buff_send]')
824834
call nvtxEndRange
825835
call nvtxStartRange("RHS-COMM-SENDRECV-NO-RMDA")
826-
#:endif
827836
828-
call MPI_SENDRECV( &
829-
buff_send, buffer_count, mpi_p, dst_proc, send_tag, &
830-
buff_recv, buffer_count, mpi_p, src_proc, recv_tag, &
831-
MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
837+
call MPI_SENDRECV( &
838+
buff_send, buffer_count, mpi_p, dst_proc, send_tag, &
839+
buff_recv, buffer_count, mpi_p, src_proc, recv_tag, &
840+
MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
832841
833-
call nvtxEndRange ! RHS-MPI-SENDRECV-(NO)-RDMA
842+
call nvtxEndRange ! RHS-MPI-SENDRECV-(NO)-RDMA
834843
835-
#:if rdma_mpi
836-
!$acc end host_data
837-
!$acc wait
838-
#:else
839844
call nvtxStartRange("RHS-COMM-HOST2DEV")
840-
!$acc update device(buff_recv)
845+
$:GPU_UPDATE(device='[buff_recv]')
841846
call nvtxEndRange
842847
#:endif
843848
end if
@@ -854,7 +859,7 @@ contains
854859
#:for mpi_dir in [1, 2, 3]
855860
if (mpi_dir == ${mpi_dir}$) then
856861
#:if mpi_dir == 1
857-
!$acc parallel loop collapse(4) gang vector default(present) private(r)
862+
$:GPU_PARALLEL_LOOP(collapse=4,private='[r]')
858863
do l = 0, p
859864
do k = 0, n
860865
do j = -buff_size, -1
@@ -874,7 +879,7 @@ contains
874879
end do
875880
876881
if (qbmm_comm) then
877-
!$acc parallel loop collapse(5) gang vector default(present) private(r)
882+
$:GPU_PARALLEL_LOOP(collapse=5,private='[r]')
878883
do l = 0, p
879884
do k = 0, n
880885
do j = -buff_size, -1
@@ -889,7 +894,7 @@ contains
889894
end do
890895
end do
891896
892-
!$acc parallel loop collapse(5) gang vector default(present) private(r)
897+
$:GPU_PARALLEL_LOOP(collapse=5,private='[r]')
893898
do l = 0, p
894899
do k = 0, n
895900
do j = -buff_size, -1
@@ -905,7 +910,7 @@ contains
905910
end do
906911
end if
907912
#:elif mpi_dir == 2
908-
!$acc parallel loop collapse(4) gang vector default(present) private(r)
913+
$:GPU_PARALLEL_LOOP(collapse=4,private='[r]')
909914
do i = 1, nVar
910915
do l = 0, p
911916
do k = -buff_size, -1
@@ -926,7 +931,7 @@ contains
926931
end do
927932
928933
if (qbmm_comm) then
929-
!$acc parallel loop collapse(5) gang vector default(present) private(r)
934+
$:GPU_PARALLEL_LOOP(collapse=5,private='[r]')
930935
do i = nVar + 1, nVar + 4
931936
do l = 0, p
932937
do k = -buff_size, -1
@@ -942,7 +947,7 @@ contains
942947
end do
943948
end do
944949
945-
!$acc parallel loop collapse(5) gang vector default(present) private(r)
950+
$:GPU_PARALLEL_LOOP(collapse=5,private='[r]')
946951
do i = nVar + 1, nVar + 4
947952
do l = 0, p
948953
do k = -buff_size, -1
@@ -960,7 +965,7 @@ contains
960965
end if
961966
#:else
962967
! Unpacking buffer from bc_z%beg
963-
!$acc parallel loop collapse(4) gang vector default(present) private(r)
968+
$:GPU_PARALLEL_LOOP(collapse=4,private='[r]')
964969
do i = 1, nVar
965970
do l = -buff_size, -1
966971
do k = -buff_size, n + buff_size
@@ -982,7 +987,7 @@ contains
982987
end do
983988
984989
if (qbmm_comm) then
985-
!$acc parallel loop collapse(5) gang vector default(present) private(r)
990+
$:GPU_PARALLEL_LOOP(collapse=5,private='[r]')
986991
do i = nVar + 1, nVar + 4
987992
do l = -buff_size, -1
988993
do k = -buff_size, n + buff_size
@@ -999,7 +1004,7 @@ contains
9991004
end do
10001005
end do
10011006
1002-
!$acc parallel loop collapse(5) gang vector default(present) private(r)
1007+
$:GPU_PARALLEL_LOOP(collapse=5,private='[r]')
10031008
do i = nVar + 1, nVar + 4
10041009
do l = -buff_size, -1
10051010
do k = -buff_size, n + buff_size

src/simulation/m_data_output.fpp

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -316,15 +316,14 @@ contains
316316
Rc_min_loc = minval(Rc_sf)
317317
end if
318318
#else
319-
!$acc kernels
319+
#:call GPU_PARALLEL()
320320
icfl_max_loc = maxval(icfl_sf)
321-
!$acc end kernels
322-
321+
#:endcall GPU_PARALLEL
323322
if (viscous) then
324-
!$acc kernels
323+
#:call GPU_PARALLEL()
325324
vcfl_max_loc = maxval(vcfl_sf)
326325
Rc_min_loc = minval(Rc_sf)
327-
!$acc end kernels
326+
#:endcall GPU_PARALLEL
328327
end if
329328
#endif
330329

0 commit comments

Comments
 (0)