Skip to content

Commit 5614c06

Browse files
committed
feat(cuda): Add transpose kernels.
1 parent 294d6fb commit 5614c06

File tree

2 files changed

+120
-0
lines changed

2 files changed

+120
-0
lines changed

src/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ set(CUDASRC
1717
cuda/allocator.f90
1818
cuda/exec_dist.f90
1919
cuda/kernels_dist.f90
20+
cuda/kernels_trans.f90
2021
cuda/sendrecv.f90
2122
cuda/tdsops.f90
2223
)

src/cuda/kernels_trans.f90

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
module m_cuda_kernels_trans
2+
use cudafor
3+
4+
use m_common, only: dp
5+
use m_cuda_common, only: SZ
6+
7+
contains
8+
9+
attributes(global) subroutine trans_x2y_k(u_y, u_x, nz)
10+
implicit none
11+
12+
real(dp), device, intent(out), dimension(:, :, :) :: u_y
13+
real(dp), device, intent(in), dimension(:, :, :) :: u_x
14+
integer, value, intent(in) :: nz
15+
16+
real(dp), shared :: tile(SZ, SZ)
17+
integer :: i, j, b_i, b_j, b_k
18+
19+
i = threadIdx%x; j = threadIdx%y
20+
b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z
21+
22+
! copy into shared
23+
tile(i, j) = u_x(i, j + (b_i - 1)*SZ, b_j + (b_k - 1)*nz)
24+
25+
call syncthreads()
26+
27+
! copy into output array from shared
28+
u_y(i, j + (b_k - 1)*SZ, b_j + (b_i - 1)*nz) = tile(j, i)
29+
30+
end subroutine trans_x2y_k
31+
32+
attributes(global) subroutine trans_x2z_k(u_z, u_x, nz)
33+
implicit none
34+
35+
real(dp), device, intent(out), dimension(:, :, :) :: u_z
36+
real(dp), device, intent(in), dimension(:, :, :) :: u_x
37+
integer, value, intent(in) :: nz
38+
39+
integer :: i, j, b_i, b_j, nx!, nz
40+
41+
i = threadIdx%x; b_i = blockIdx%x; b_j = blockIdx%y
42+
nx = gridDim%x
43+
44+
do j = 1, nz
45+
u_z(i, j, b_i + (b_j - 1)*nx) = u_x(i, b_i, j + (b_j - 1)*nz)
46+
end do
47+
48+
end subroutine trans_x2z_k
49+
50+
attributes(global) subroutine trans_y2x_k(u_x, u_y, nz)
51+
implicit none
52+
53+
real(dp), device, intent(out), dimension(:, :, :) :: u_x
54+
real(dp), device, intent(in), dimension(:, :, :) :: u_y
55+
integer, value, intent(in) :: nz
56+
57+
real(dp), shared :: tile(SZ, SZ)
58+
integer :: i, j, b_i, b_j, b_k
59+
60+
i = threadIdx%x; j = threadIdx%y
61+
b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z
62+
63+
! copy into shared
64+
tile(i, j) = u_y(i, (b_j - 1)*SZ + j, (b_i - 1)*nz + b_k)
65+
66+
call syncthreads()
67+
68+
! copy into output array from shared
69+
u_x(i, (b_i - 1)*SZ + j, (b_j - 1)*nz + b_k) = tile(j, i)
70+
71+
end subroutine trans_y2x_k
72+
73+
attributes(global) subroutine trans_y2z_k(u_z, u_y, nx, nz)
74+
implicit none
75+
76+
real(dp), device, intent(out), dimension(:, :, :) :: u_z
77+
real(dp), device, intent(in), dimension(:, :, :) :: u_y
78+
integer, value, intent(in) :: nx, nz
79+
80+
real(dp), shared :: tile(SZ, SZ)
81+
integer :: i, j, b_i, b_j, b_k
82+
83+
i = threadIdx%x; j = threadIdx%y
84+
b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z
85+
86+
! copy into shared
87+
tile(i, j) = u_y(i, (b_j - 1)*SZ + j, (b_i - 1)*nz + b_k)
88+
89+
call syncthreads()
90+
91+
! copy into output array from shared
92+
u_z(i, b_k, (b_i - 1)*SZ + j + (b_j - 1)*nx) = tile(j, i)
93+
94+
end subroutine trans_y2z_k
95+
96+
attributes(global) subroutine trans_z2y_k(u_y, u_z, nx, nz)
97+
implicit none
98+
99+
real(dp), device, intent(out), dimension(:, :, :) :: u_y
100+
real(dp), device, intent(in), dimension(:, :, :) :: u_z
101+
integer, value, intent(in) :: nx, nz
102+
103+
real(dp), shared :: tile(SZ, SZ)
104+
integer :: i, j, b_i, b_j, b_k
105+
106+
i = threadIdx%x; j = threadIdx%y
107+
b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z
108+
109+
! copy into shared
110+
tile(i, j) = u_z(i, b_k, (b_i - 1)*SZ + j + (b_j - 1)*nx)
111+
112+
call syncthreads()
113+
114+
! copy into output array from shared
115+
u_y(i, (b_j - 1)*SZ + j, (b_i - 1)*nz + b_k) = tile(j, i)
116+
117+
end subroutine trans_z2y_k
118+
119+
end module m_cuda_kernels_trans

0 commit comments

Comments
 (0)