Skip to content

Commit 18fe0bd

Browse files
authored
Merge pull request #29 from semi-h/feature
Add transposer kernels for reordering field data.
2 parents 26d4d4c + 932df13 commit 18fe0bd

File tree

9 files changed

+428
-120
lines changed

9 files changed

+428
-120
lines changed

src/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ set(CUDASRC
1717
cuda/allocator.f90
1818
cuda/exec_dist.f90
1919
cuda/kernels_dist.f90
20+
cuda/kernels_reorder.f90
2021
cuda/sendrecv.f90
2122
cuda/tdsops.f90
2223
)

src/backend.f90

Lines changed: 6 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -21,18 +21,15 @@ module m_base_backend
2121
!! architecture.
2222

2323
real(dp) :: nu
24+
integer :: nx_loc, ny_loc, nz_loc
2425
class(allocator_t), pointer :: allocator
2526
class(dirps_t), pointer :: xdirps, ydirps, zdirps
2627
contains
2728
procedure(transeq_ders), deferred :: transeq_x
2829
procedure(transeq_ders), deferred :: transeq_y
2930
procedure(transeq_ders), deferred :: transeq_z
3031
procedure(tds_solve), deferred :: tds_solve
31-
procedure(transposer), deferred :: trans_x2y
32-
procedure(transposer), deferred :: trans_x2z
33-
procedure(trans_d2d), deferred :: trans_y2z
34-
procedure(trans_d2d), deferred :: trans_z2y
35-
procedure(trans_d2d), deferred :: trans_y2x
32+
procedure(reorder), deferred :: reorder
3633
procedure(sum9into3), deferred :: sum_yzintox
3734
procedure(vecadd), deferred :: vecadd
3835
procedure(get_fields), deferred :: get_fields
@@ -81,22 +78,8 @@ end subroutine tds_solve
8178
end interface
8279

8380
abstract interface
84-
subroutine transposer(self, u_, v_, w_, u, v, w)
85-
!! transposer subroutines are straightforward, they rearrange
86-
!! data into our specialist data structure so that regardless
87-
!! of the direction tridiagonal systems are solved efficiently
88-
!! and fast.
89-
import :: base_backend_t
90-
import :: field_t
91-
implicit none
92-
93-
class(base_backend_t) :: self
94-
class(field_t), intent(inout) :: u_, v_, w_
95-
class(field_t), intent(in) :: u, v, w
96-
end subroutine transposer
97-
98-
subroutine trans_d2d(self, u_, u)
99-
!! transposer subroutines are straightforward, they rearrange
81+
subroutine reorder(self, u_, u, direction)
82+
!! reorder subroutines are straightforward, they rearrange
10083
!! data into our specialist data structure so that regardless
10184
!! of the direction tridiagonal systems are solved efficiently
10285
!! and fast.
@@ -107,7 +90,8 @@ subroutine trans_d2d(self, u_, u)
10790
class(base_backend_t) :: self
10891
class(field_t), intent(inout) :: u_
10992
class(field_t), intent(in) :: u
110-
end subroutine trans_d2d
93+
integer, intent(in) :: direction
94+
end subroutine reorder
11195
end interface
11296

11397
abstract interface

src/common.f90

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@ module m_common
44
integer, parameter :: dp=kind(0.0d0)
55
real(dp), parameter :: pi = 4*atan(1.0_dp)
66

7+
integer, parameter :: RDR_X2Y = 12, RDR_X2Z = 13, RDR_Y2X = 21, &
8+
RDR_Y2Z = 23, RDR_Z2Y = 32
9+
710
type :: globs_t
811
integer :: nx, ny, nz
912
integer :: nx_loc, ny_loc, nz_loc

src/cuda/backend.f90

Lines changed: 45 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
module m_cuda_backend
2+
use iso_fortran_env, only: stderr => error_unit
23
use cudafor
34

45
use m_allocator, only: allocator_t, field_t
56
use m_base_backend, only: base_backend_t
6-
use m_common, only: dp, globs_t
7+
use m_common, only: dp, globs_t, RDR_X2Y, RDR_X2Z, RDR_Y2X, RDR_Y2Z, RDR_Z2Y
78
use m_tdsops, only: dirps_t, tdsops_t
89

910
use m_cuda_allocator, only: cuda_allocator_t, cuda_field_t
@@ -12,6 +13,8 @@ module m_cuda_backend
1213
use m_cuda_sendrecv, only: sendrecv_fields, sendrecv_3fields
1314
use m_cuda_tdsops, only: cuda_tdsops_t
1415
use m_cuda_kernels_dist, only: transeq_3fused_dist, transeq_3fused_subs
16+
use m_cuda_kernels_reorder, only: reorder_x2y, reorder_x2z, reorder_y2x, &
17+
reorder_y2z, reorder_z2y
1518

1619
implicit none
1720

@@ -32,11 +35,7 @@ module m_cuda_backend
3235
procedure :: transeq_y => transeq_y_cuda
3336
procedure :: transeq_z => transeq_z_cuda
3437
procedure :: tds_solve => tds_solve_cuda
35-
procedure :: trans_x2y => trans_x2y_cuda
36-
procedure :: trans_x2z => trans_x2z_cuda
37-
procedure :: trans_y2z => trans_y2z_cuda
38-
procedure :: trans_z2y => trans_z2y_cuda
39-
procedure :: trans_y2x => trans_y2x_cuda
38+
procedure :: reorder => reorder_cuda
4039
procedure :: sum_yzintox => sum_yzintox_cuda
4140
procedure :: vecadd => vecadd_cuda
4241
procedure :: set_fields => set_fields_cuda
@@ -74,6 +73,10 @@ function init(globs, allocator) result(backend)
7473
backend%zthreads = dim3(SZ, 1, 1)
7574
backend%zblocks = dim3(globs%n_groups_z, 1, 1)
7675

76+
backend%nx_loc = globs%nx_loc
77+
backend%ny_loc = globs%ny_loc
78+
backend%nz_loc = globs%nz_loc
79+
7780
n_halo = 4
7881
n_block = globs%n_groups_x
7982

@@ -415,50 +418,48 @@ subroutine tds_solve_dist(self, du, u, dirps, tdsops, blocks, threads)
415418

416419
end subroutine tds_solve_dist
417420

418-
subroutine trans_x2y_cuda(self, u_y, v_y, w_y, u, v, w)
419-
implicit none
420-
421-
class(cuda_backend_t) :: self
422-
class(field_t), intent(inout) :: u_y, v_y, w_y
423-
class(field_t), intent(in) :: u, v, w
424-
425-
end subroutine trans_x2y_cuda
426-
427-
subroutine trans_x2z_cuda(self, u_z, v_z, w_z, u, v, w)
421+
subroutine reorder_cuda(self, u_o, u_i, direction)
428422
implicit none
429423

430424
class(cuda_backend_t) :: self
431-
class(field_t), intent(inout) :: u_z, v_z, w_z
432-
class(field_t), intent(in) :: u, v, w
433-
434-
end subroutine trans_x2z_cuda
425+
class(field_t), intent(inout) :: u_o
426+
class(field_t), intent(in) :: u_i
427+
integer, intent(in) :: direction
435428

436-
subroutine trans_y2z_cuda(self, u_z, u_y)
437-
implicit none
438-
439-
class(cuda_backend_t) :: self
440-
class(field_t), intent(inout) :: u_z
441-
class(field_t), intent(in) :: u_y
442-
443-
end subroutine trans_y2z_cuda
444-
445-
subroutine trans_z2y_cuda(self, u_y, u_z)
446-
implicit none
447-
448-
class(cuda_backend_t) :: self
449-
class(field_t), intent(inout) :: u_y
450-
class(field_t), intent(in) :: u_z
451-
452-
end subroutine trans_z2y_cuda
453-
454-
subroutine trans_y2x_cuda(self, u_x, u_y)
455-
implicit none
429+
real(dp), device, pointer, dimension(:, :, :) :: u_o_d, u_i_d
430+
type(dim3) :: blocks, threads
456431

457-
class(cuda_backend_t) :: self
458-
class(field_t), intent(inout) :: u_x
459-
class(field_t), intent(in) :: u_y
432+
select type(u_o); type is (cuda_field_t); u_o_d => u_o%data_d; end select
433+
select type(u_i); type is (cuda_field_t); u_i_d => u_i%data_d; end select
434+
435+
select case (direction)
436+
case (RDR_X2Y) ! x2y
437+
blocks = dim3(self%nx_loc/SZ, self%nz_loc, self%ny_loc/SZ)
438+
threads = dim3(SZ, SZ, 1)
439+
call reorder_x2y<<<blocks, threads>>>(u_o_d, u_i_d, self%nz_loc)
440+
case (RDR_X2Z) ! x2z
441+
blocks = dim3(self%nx_loc, self%ny_loc/SZ, 1)
442+
threads = dim3(SZ, 1, 1)
443+
call reorder_x2z<<<blocks, threads>>>(u_o_d, u_i_d, self%nz_loc)
444+
case (RDR_Y2X) ! y2x
445+
blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc)
446+
threads = dim3(SZ, SZ, 1)
447+
call reorder_y2x<<<blocks, threads>>>(u_o_d, u_i_d, self%nz_loc)
448+
case (RDR_Y2Z) ! y2z
449+
blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc)
450+
threads = dim3(SZ, SZ, 1)
451+
call reorder_y2z<<<blocks, threads>>>(u_o_d, u_i_d, &
452+
self%nx_loc, self%nz_loc)
453+
case (RDR_Z2Y) ! z2y
454+
blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc)
455+
threads = dim3(SZ, SZ, 1)
456+
call reorder_z2y<<<blocks, threads>>>(u_o_d, u_i_d, &
457+
self%nx_loc, self%nz_loc)
458+
case default
459+
error stop 'Reorder direction is undefined.'
460+
end select
460461

461-
end subroutine trans_y2x_cuda
462+
end subroutine reorder_cuda
462463

463464
subroutine sum_yzintox_cuda(self, du, dv, dw, &
464465
du_y, dv_y, dw_y, du_z, dv_z, dw_z)

src/cuda/kernels_reorder.f90

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
module m_cuda_kernels_reorder
2+
use cudafor
3+
4+
use m_common, only: dp
5+
use m_cuda_common, only: SZ
6+
7+
contains
8+
9+
attributes(global) subroutine reorder_x2y(u_y, u_x, nz)
10+
implicit none
11+
12+
real(dp), device, intent(out), dimension(:, :, :) :: u_y
13+
real(dp), device, intent(in), dimension(:, :, :) :: u_x
14+
integer, value, intent(in) :: nz
15+
16+
real(dp), shared :: tile(SZ, SZ)
17+
integer :: i, j, b_i, b_j, b_k
18+
19+
i = threadIdx%x; j = threadIdx%y
20+
b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z
21+
22+
! copy into shared
23+
tile(i, j) = u_x(i, j + (b_i - 1)*SZ, b_j + (b_k - 1)*nz)
24+
25+
call syncthreads()
26+
27+
! copy into output array from shared
28+
u_y(i, j + (b_k - 1)*SZ, b_j + (b_i - 1)*nz) = tile(j, i)
29+
30+
end subroutine reorder_x2y
31+
32+
attributes(global) subroutine reorder_x2z(u_z, u_x, nz)
33+
implicit none
34+
35+
real(dp), device, intent(out), dimension(:, :, :) :: u_z
36+
real(dp), device, intent(in), dimension(:, :, :) :: u_x
37+
integer, value, intent(in) :: nz
38+
39+
integer :: i, j, b_i, b_j, nx
40+
41+
i = threadIdx%x; b_i = blockIdx%x; b_j = blockIdx%y
42+
nx = gridDim%x
43+
44+
! Data access pattern for reordering between x and z is quite nice
45+
! thus we don't need to use shared memory for this operation.
46+
do j = 1, nz
47+
u_z(i, j, b_i + (b_j - 1)*nx) = u_x(i, b_i, j + (b_j - 1)*nz)
48+
end do
49+
50+
end subroutine reorder_x2z
51+
52+
attributes(global) subroutine reorder_y2x(u_x, u_y, nz)
53+
implicit none
54+
55+
real(dp), device, intent(out), dimension(:, :, :) :: u_x
56+
real(dp), device, intent(in), dimension(:, :, :) :: u_y
57+
integer, value, intent(in) :: nz
58+
59+
real(dp), shared :: tile(SZ, SZ)
60+
integer :: i, j, b_i, b_j, b_k
61+
62+
i = threadIdx%x; j = threadIdx%y
63+
b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z
64+
65+
! copy into shared
66+
tile(i, j) = u_y(i, (b_j - 1)*SZ + j, (b_i - 1)*nz + b_k)
67+
68+
call syncthreads()
69+
70+
! copy into output array from shared
71+
u_x(i, (b_i - 1)*SZ + j, (b_j - 1)*nz + b_k) = tile(j, i)
72+
73+
end subroutine reorder_y2x
74+
75+
attributes(global) subroutine reorder_y2z(u_z, u_y, nx, nz)
76+
implicit none
77+
78+
real(dp), device, intent(out), dimension(:, :, :) :: u_z
79+
real(dp), device, intent(in), dimension(:, :, :) :: u_y
80+
integer, value, intent(in) :: nx, nz
81+
82+
real(dp), shared :: tile(SZ, SZ)
83+
integer :: i, j, b_i, b_j, b_k
84+
85+
i = threadIdx%x; j = threadIdx%y
86+
b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z
87+
88+
! copy into shared
89+
tile(i, j) = u_y(i, (b_j - 1)*SZ + j, (b_i - 1)*nz + b_k)
90+
91+
call syncthreads()
92+
93+
! copy into output array from shared
94+
u_z(i, b_k, (b_i - 1)*SZ + j + (b_j - 1)*nx) = tile(j, i)
95+
96+
end subroutine reorder_y2z
97+
98+
attributes(global) subroutine reorder_z2y(u_y, u_z, nx, nz)
99+
implicit none
100+
101+
real(dp), device, intent(out), dimension(:, :, :) :: u_y
102+
real(dp), device, intent(in), dimension(:, :, :) :: u_z
103+
integer, value, intent(in) :: nx, nz
104+
105+
real(dp), shared :: tile(SZ, SZ)
106+
integer :: i, j, b_i, b_j, b_k
107+
108+
i = threadIdx%x; j = threadIdx%y
109+
b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z
110+
111+
! copy into shared
112+
tile(i, j) = u_z(i, b_k, (b_i - 1)*SZ + j + (b_j - 1)*nx)
113+
114+
call syncthreads()
115+
116+
! copy into output array from shared
117+
u_y(i, (b_j - 1)*SZ + j, (b_i - 1)*nz + b_k) = tile(j, i)
118+
119+
end subroutine reorder_z2y
120+
121+
end module m_cuda_kernels_reorder

src/omp/backend.f90

Lines changed: 4 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,7 @@ module m_omp_backend
2424
procedure :: transeq_y => transeq_y_omp
2525
procedure :: transeq_z => transeq_z_omp
2626
procedure :: tds_solve => tds_solve_omp
27-
procedure :: trans_x2y => trans_x2y_omp
28-
procedure :: trans_x2z => trans_x2z_omp
29-
procedure :: trans_y2z => trans_y2z_omp
30-
procedure :: trans_z2y => trans_z2y_omp
31-
procedure :: trans_y2x => trans_y2x_omp
27+
procedure :: reorder => reorder_omp
3228
procedure :: sum_yzintox => sum_yzintox_omp
3329
procedure :: vecadd => vecadd_omp
3430
procedure :: set_fields => set_fields_omp
@@ -164,50 +160,15 @@ subroutine tds_solve_omp(self, du, u, dirps, tdsops)
164160

165161
end subroutine tds_solve_omp
166162

167-
subroutine trans_x2y_omp(self, u_, v_, w_, u, v, w)
168-
implicit none
169-
170-
class(omp_backend_t) :: self
171-
class(field_t), intent(inout) :: u_, v_, w_
172-
class(field_t), intent(in) :: u, v, w
173-
174-
end subroutine trans_x2y_omp
175-
176-
subroutine trans_x2z_omp(self, u_, v_, w_, u, v, w)
177-
implicit none
178-
179-
class(omp_backend_t) :: self
180-
class(field_t), intent(inout) :: u_, v_, w_
181-
class(field_t), intent(in) :: u, v, w
182-
183-
end subroutine trans_x2z_omp
184-
185-
subroutine trans_y2z_omp(self, u_, u)
186-
implicit none
187-
188-
class(omp_backend_t) :: self
189-
class(field_t), intent(inout) :: u_
190-
class(field_t), intent(in) :: u
191-
192-
end subroutine trans_y2z_omp
193-
194-
subroutine trans_z2y_omp(self, u_, u)
195-
implicit none
196-
197-
class(omp_backend_t) :: self
198-
class(field_t), intent(inout) :: u_
199-
class(field_t), intent(in) :: u
200-
201-
end subroutine trans_z2y_omp
202-
203-
subroutine trans_y2x_omp(self, u_, u)
163+
subroutine reorder_omp(self, u_, u, direction)
204164
implicit none
205165

206166
class(omp_backend_t) :: self
207167
class(field_t), intent(inout) :: u_
208168
class(field_t), intent(in) :: u
169+
integer, intent(in) :: direction
209170

210-
end subroutine trans_y2x_omp
171+
end subroutine reorder_omp
211172

212173
subroutine sum_yzintox_omp(self, du, dv, dw, &
213174
du_y, dv_y, dw_y, du_z, dv_z, dw_z)

0 commit comments

Comments
 (0)