|
| 1 | +module m_cuda_kernels_trans |
| 2 | + use cudafor |
| 3 | + |
| 4 | + use m_common, only: dp |
| 5 | + use m_cuda_common, only: SZ |
| 6 | + |
| 7 | +contains |
| 8 | + |
| 9 | + attributes(global) subroutine trans_x2y_k(u_y, u_x, nz) |
| 10 | + implicit none |
| 11 | + |
| 12 | + real(dp), device, intent(out), dimension(:, :, :) :: u_y |
| 13 | + real(dp), device, intent(in), dimension(:, :, :) :: u_x |
| 14 | + integer, value, intent(in) :: nz |
| 15 | + |
| 16 | + real(dp), shared :: tile(SZ, SZ) |
| 17 | + integer :: i, j, b_i, b_j, b_k |
| 18 | + |
| 19 | + i = threadIdx%x; j = threadIdx%y |
| 20 | + b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z |
| 21 | + |
| 22 | + ! copy into shared |
| 23 | + tile(i, j) = u_x(i, j + (b_i - 1)*SZ, b_j + (b_k - 1)*nz) |
| 24 | + |
| 25 | + call syncthreads() |
| 26 | + |
| 27 | + ! copy into output array from shared |
| 28 | + u_y(i, j + (b_k - 1)*SZ, b_j + (b_i - 1)*nz) = tile(j, i) |
| 29 | + |
| 30 | + end subroutine trans_x2y_k |
| 31 | + |
| 32 | + attributes(global) subroutine trans_x2z_k(u_z, u_x, nz) |
| 33 | + implicit none |
| 34 | + |
| 35 | + real(dp), device, intent(out), dimension(:, :, :) :: u_z |
| 36 | + real(dp), device, intent(in), dimension(:, :, :) :: u_x |
| 37 | + integer, value, intent(in) :: nz |
| 38 | + |
| 39 | + integer :: i, j, b_i, b_j, nx!, nz |
| 40 | + |
| 41 | + i = threadIdx%x; b_i = blockIdx%x; b_j = blockIdx%y |
| 42 | + nx = gridDim%x |
| 43 | + |
| 44 | + do j = 1, nz |
| 45 | + u_z(i, j, b_i + (b_j - 1)*nx) = u_x(i, b_i, j + (b_j - 1)*nz) |
| 46 | + end do |
| 47 | + |
| 48 | + end subroutine trans_x2z_k |
| 49 | + |
| 50 | + attributes(global) subroutine trans_y2x_k(u_x, u_y, nz) |
| 51 | + implicit none |
| 52 | + |
| 53 | + real(dp), device, intent(out), dimension(:, :, :) :: u_x |
| 54 | + real(dp), device, intent(in), dimension(:, :, :) :: u_y |
| 55 | + integer, value, intent(in) :: nz |
| 56 | + |
| 57 | + real(dp), shared :: tile(SZ, SZ) |
| 58 | + integer :: i, j, b_i, b_j, b_k |
| 59 | + |
| 60 | + i = threadIdx%x; j = threadIdx%y |
| 61 | + b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z |
| 62 | + |
| 63 | + ! copy into shared |
| 64 | + tile(i, j) = u_y(i, (b_j - 1)*SZ + j, (b_i - 1)*nz + b_k) |
| 65 | + |
| 66 | + call syncthreads() |
| 67 | + |
| 68 | + ! copy into output array from shared |
| 69 | + u_x(i, (b_i - 1)*SZ + j, (b_j - 1)*nz + b_k) = tile(j, i) |
| 70 | + |
| 71 | + end subroutine trans_y2x_k |
| 72 | + |
| 73 | + attributes(global) subroutine trans_y2z_k(u_z, u_y, nx, nz) |
| 74 | + implicit none |
| 75 | + |
| 76 | + real(dp), device, intent(out), dimension(:, :, :) :: u_z |
| 77 | + real(dp), device, intent(in), dimension(:, :, :) :: u_y |
| 78 | + integer, value, intent(in) :: nx, nz |
| 79 | + |
| 80 | + real(dp), shared :: tile(SZ, SZ) |
| 81 | + integer :: i, j, b_i, b_j, b_k |
| 82 | + |
| 83 | + i = threadIdx%x; j = threadIdx%y |
| 84 | + b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z |
| 85 | + |
| 86 | + ! copy into shared |
| 87 | + tile(i, j) = u_y(i, (b_j - 1)*SZ + j, (b_i - 1)*nz + b_k) |
| 88 | + |
| 89 | + call syncthreads() |
| 90 | + |
| 91 | + ! copy into output array from shared |
| 92 | + u_z(i, b_k, (b_i - 1)*SZ + j + (b_j - 1)*nx) = tile(j, i) |
| 93 | + |
| 94 | + end subroutine trans_y2z_k |
| 95 | + |
| 96 | + attributes(global) subroutine trans_z2y_k(u_y, u_z, nx, nz) |
| 97 | + implicit none |
| 98 | + |
| 99 | + real(dp), device, intent(out), dimension(:, :, :) :: u_y |
| 100 | + real(dp), device, intent(in), dimension(:, :, :) :: u_z |
| 101 | + integer, value, intent(in) :: nx, nz |
| 102 | + |
| 103 | + real(dp), shared :: tile(SZ, SZ) |
| 104 | + integer :: i, j, b_i, b_j, b_k |
| 105 | + |
| 106 | + i = threadIdx%x; j = threadIdx%y |
| 107 | + b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z |
| 108 | + |
| 109 | + ! copy into shared |
| 110 | + tile(i, j) = u_z(i, b_k, (b_i - 1)*SZ + j + (b_j - 1)*nx) |
| 111 | + |
| 112 | + call syncthreads() |
| 113 | + |
| 114 | + ! copy into output array from shared |
| 115 | + u_y(i, (b_j - 1)*SZ + j, (b_i - 1)*nz + b_k) = tile(j, i) |
| 116 | + |
| 117 | + end subroutine trans_z2y_k |
| 118 | + |
| 119 | +end module m_cuda_kernels_trans |
0 commit comments