Merge pull request #29 from semi-h/feature

semi-h · web-flow · commit 18fe0bd23700 · 2024-02-06T14:16:30.000Z
Add transposer kernels for reordering field data.
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -17,6 +17,7 @@ set(CUDASRC
   cuda/allocator.f90
   cuda/exec_dist.f90
   cuda/kernels_dist.f90
+  cuda/kernels_reorder.f90
   cuda/sendrecv.f90
   cuda/tdsops.f90
 )
diff --git a/src/backend.f90 b/src/backend.f90
@@ -21,18 +21,15 @@ module m_base_backend
       !! architecture.
 
       real(dp) :: nu
+      integer :: nx_loc, ny_loc, nz_loc
       class(allocator_t), pointer :: allocator
       class(dirps_t), pointer :: xdirps, ydirps, zdirps
    contains
       procedure(transeq_ders), deferred :: transeq_x
       procedure(transeq_ders), deferred :: transeq_y
       procedure(transeq_ders), deferred :: transeq_z
       procedure(tds_solve), deferred :: tds_solve
-      procedure(transposer), deferred :: trans_x2y
-      procedure(transposer), deferred :: trans_x2z
-      procedure(trans_d2d), deferred :: trans_y2z
-      procedure(trans_d2d), deferred :: trans_z2y
-      procedure(trans_d2d), deferred :: trans_y2x
+      procedure(reorder), deferred :: reorder
       procedure(sum9into3), deferred :: sum_yzintox
       procedure(vecadd), deferred :: vecadd
       procedure(get_fields), deferred :: get_fields
@@ -81,22 +78,8 @@ end subroutine tds_solve
    end interface
 
    abstract interface
-      subroutine transposer(self, u_, v_, w_, u, v, w)
-         !! transposer subroutines are straightforward, they rearrange
-         !! data into our specialist data structure so that regardless
-         !! of the direction tridiagonal systems are solved efficiently
-         !! and fast.
-         import :: base_backend_t
-         import :: field_t
-         implicit none
-
-         class(base_backend_t) :: self
-         class(field_t), intent(inout) :: u_, v_, w_
-         class(field_t), intent(in) :: u, v, w
-      end subroutine transposer
-
-      subroutine trans_d2d(self, u_, u)
-         !! transposer subroutines are straightforward, they rearrange
+      subroutine reorder(self, u_, u, direction)
+         !! reorder subroutines are straightforward, they rearrange
          !! data into our specialist data structure so that regardless
          !! of the direction tridiagonal systems are solved efficiently
          !! and fast.
@@ -107,7 +90,8 @@ subroutine trans_d2d(self, u_, u)
          class(base_backend_t) :: self
          class(field_t), intent(inout) :: u_
          class(field_t), intent(in) :: u
-      end subroutine trans_d2d
+         integer, intent(in) :: direction
+      end subroutine reorder
    end interface
 
    abstract interface
diff --git a/src/common.f90 b/src/common.f90
@@ -4,6 +4,9 @@ module m_common
    integer, parameter :: dp=kind(0.0d0)
    real(dp), parameter :: pi = 4*atan(1.0_dp)
 
+   integer, parameter :: RDR_X2Y = 12, RDR_X2Z = 13, RDR_Y2X = 21, &
+                         RDR_Y2Z = 23, RDR_Z2Y = 32
+
    type :: globs_t
       integer :: nx, ny, nz
       integer :: nx_loc, ny_loc, nz_loc
diff --git a/src/cuda/backend.f90 b/src/cuda/backend.f90
@@ -1,9 +1,10 @@
 module m_cuda_backend
+   use iso_fortran_env, only: stderr => error_unit
    use cudafor
 
    use m_allocator, only: allocator_t, field_t
    use m_base_backend, only: base_backend_t
-   use m_common, only: dp, globs_t
+   use m_common, only: dp, globs_t, RDR_X2Y, RDR_X2Z, RDR_Y2X, RDR_Y2Z, RDR_Z2Y
    use m_tdsops, only: dirps_t, tdsops_t
 
    use m_cuda_allocator, only: cuda_allocator_t, cuda_field_t
@@ -12,6 +13,8 @@ module m_cuda_backend
    use m_cuda_sendrecv, only: sendrecv_fields, sendrecv_3fields
    use m_cuda_tdsops, only: cuda_tdsops_t
    use m_cuda_kernels_dist, only: transeq_3fused_dist, transeq_3fused_subs
+   use m_cuda_kernels_reorder, only: reorder_x2y, reorder_x2z, reorder_y2x, &
+                                     reorder_y2z, reorder_z2y
 
    implicit none
 
@@ -32,11 +35,7 @@ module m_cuda_backend
       procedure :: transeq_y => transeq_y_cuda
       procedure :: transeq_z => transeq_z_cuda
       procedure :: tds_solve => tds_solve_cuda
-      procedure :: trans_x2y => trans_x2y_cuda
-      procedure :: trans_x2z => trans_x2z_cuda
-      procedure :: trans_y2z => trans_y2z_cuda
-      procedure :: trans_z2y => trans_z2y_cuda
-      procedure :: trans_y2x => trans_y2x_cuda
+      procedure :: reorder => reorder_cuda
       procedure :: sum_yzintox => sum_yzintox_cuda
       procedure :: vecadd => vecadd_cuda
       procedure :: set_fields => set_fields_cuda
@@ -74,6 +73,10 @@ function init(globs, allocator) result(backend)
       backend%zthreads = dim3(SZ, 1, 1)
       backend%zblocks = dim3(globs%n_groups_z, 1, 1)
 
+      backend%nx_loc = globs%nx_loc
+      backend%ny_loc = globs%ny_loc
+      backend%nz_loc = globs%nz_loc
+
       n_halo = 4
       n_block = globs%n_groups_x
 
@@ -415,50 +418,48 @@ subroutine tds_solve_dist(self, du, u, dirps, tdsops, blocks, threads)
 
    end subroutine tds_solve_dist
 
-   subroutine trans_x2y_cuda(self, u_y, v_y, w_y, u, v, w)
-      implicit none
-
-      class(cuda_backend_t) :: self
-      class(field_t), intent(inout) :: u_y, v_y, w_y
-      class(field_t), intent(in) :: u, v, w
-
-   end subroutine trans_x2y_cuda
-
-   subroutine trans_x2z_cuda(self, u_z, v_z, w_z, u, v, w)
+   subroutine reorder_cuda(self, u_o, u_i, direction)
       implicit none
 
       class(cuda_backend_t) :: self
-      class(field_t), intent(inout) :: u_z, v_z, w_z
-      class(field_t), intent(in) :: u, v, w
-
-   end subroutine trans_x2z_cuda
+      class(field_t), intent(inout) :: u_o
+      class(field_t), intent(in) :: u_i
+      integer, intent(in) :: direction
 
-   subroutine trans_y2z_cuda(self, u_z, u_y)
-      implicit none
-
-      class(cuda_backend_t) :: self
-      class(field_t), intent(inout) :: u_z
-      class(field_t), intent(in) :: u_y
-
-   end subroutine trans_y2z_cuda
-
-   subroutine trans_z2y_cuda(self, u_y, u_z)
-      implicit none
-
-      class(cuda_backend_t) :: self
-      class(field_t), intent(inout) :: u_y
-      class(field_t), intent(in) :: u_z
-
-   end subroutine trans_z2y_cuda
-
-   subroutine trans_y2x_cuda(self, u_x, u_y)
-      implicit none
+      real(dp), device, pointer, dimension(:, :, :) :: u_o_d, u_i_d
+      type(dim3) :: blocks, threads
 
-      class(cuda_backend_t) :: self
-      class(field_t), intent(inout) :: u_x
-      class(field_t), intent(in) :: u_y
+      select type(u_o); type is (cuda_field_t); u_o_d => u_o%data_d; end select
+      select type(u_i); type is (cuda_field_t); u_i_d => u_i%data_d; end select
+
+      select case (direction)
+      case (RDR_X2Y) ! x2y
+         blocks = dim3(self%nx_loc/SZ, self%nz_loc, self%ny_loc/SZ)
+         threads = dim3(SZ, SZ, 1)
+         call reorder_x2y<<<blocks, threads>>>(u_o_d, u_i_d, self%nz_loc)
+      case (RDR_X2Z) ! x2z
+         blocks = dim3(self%nx_loc, self%ny_loc/SZ, 1)
+         threads = dim3(SZ, 1, 1)
+         call reorder_x2z<<<blocks, threads>>>(u_o_d, u_i_d, self%nz_loc)
+      case (RDR_Y2X) ! y2x
+         blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc)
+         threads = dim3(SZ, SZ, 1)
+         call reorder_y2x<<<blocks, threads>>>(u_o_d, u_i_d, self%nz_loc)
+      case (RDR_Y2Z) ! y2z
+         blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc)
+         threads = dim3(SZ, SZ, 1)
+         call reorder_y2z<<<blocks, threads>>>(u_o_d, u_i_d, &
+                                               self%nx_loc, self%nz_loc)
+      case (RDR_Z2Y) ! z2y
+         blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc)
+         threads = dim3(SZ, SZ, 1)
+         call reorder_z2y<<<blocks, threads>>>(u_o_d, u_i_d, &
+                                               self%nx_loc, self%nz_loc)
+      case default
+         error stop 'Reorder direction is undefined.'
+      end select
 
-   end subroutine trans_y2x_cuda
+   end subroutine reorder_cuda
 
    subroutine sum_yzintox_cuda(self, du, dv, dw, &
                                du_y, dv_y, dw_y, du_z, dv_z, dw_z)
diff --git a/src/cuda/kernels_reorder.f90 b/src/cuda/kernels_reorder.f90
@@ -0,0 +1,121 @@
+module m_cuda_kernels_reorder
+   use cudafor
+
+   use m_common, only: dp
+   use m_cuda_common, only: SZ
+
+contains
+
+   attributes(global) subroutine reorder_x2y(u_y, u_x, nz)
+      implicit none
+
+      real(dp), device, intent(out), dimension(:, :, :) :: u_y
+      real(dp), device, intent(in), dimension(:, :, :) :: u_x
+      integer, value, intent(in) :: nz
+
+      real(dp), shared :: tile(SZ, SZ)
+      integer :: i, j, b_i, b_j, b_k
+
+      i = threadIdx%x; j = threadIdx%y
+      b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z
+
+      ! copy into shared
+      tile(i, j) = u_x(i, j + (b_i - 1)*SZ, b_j + (b_k - 1)*nz)
+
+      call syncthreads()
+
+      ! copy into output array from shared
+      u_y(i, j + (b_k - 1)*SZ, b_j + (b_i - 1)*nz) = tile(j, i)
+
+   end subroutine reorder_x2y
+
+   attributes(global) subroutine reorder_x2z(u_z, u_x, nz)
+      implicit none
+
+      real(dp), device, intent(out), dimension(:, :, :) :: u_z
+      real(dp), device, intent(in), dimension(:, :, :) :: u_x
+      integer, value, intent(in) :: nz
+
+      integer :: i, j, b_i, b_j, nx
+
+      i = threadIdx%x; b_i = blockIdx%x; b_j = blockIdx%y
+      nx = gridDim%x
+
+      ! Data access pattern for reordering between x and z is quite nice
+      ! thus we don't need to use shared memory for this operation.
+      do j = 1, nz
+         u_z(i, j, b_i + (b_j - 1)*nx) = u_x(i, b_i, j + (b_j - 1)*nz)
+      end do
+
+   end subroutine reorder_x2z
+
+   attributes(global) subroutine reorder_y2x(u_x, u_y, nz)
+      implicit none
+
+      real(dp), device, intent(out), dimension(:, :, :) :: u_x
+      real(dp), device, intent(in), dimension(:, :, :) :: u_y
+      integer, value, intent(in) :: nz
+
+      real(dp), shared :: tile(SZ, SZ)
+      integer :: i, j, b_i, b_j, b_k
+
+      i = threadIdx%x; j = threadIdx%y
+      b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z
+
+      ! copy into shared
+      tile(i, j) = u_y(i, (b_j - 1)*SZ + j, (b_i - 1)*nz + b_k)
+
+      call syncthreads()
+
+      ! copy into output array from shared
+      u_x(i, (b_i - 1)*SZ + j, (b_j - 1)*nz + b_k) = tile(j, i)
+
+   end subroutine reorder_y2x
+
+   attributes(global) subroutine reorder_y2z(u_z, u_y, nx, nz)
+      implicit none
+
+      real(dp), device, intent(out), dimension(:, :, :) :: u_z
+      real(dp), device, intent(in), dimension(:, :, :) :: u_y
+      integer, value, intent(in) :: nx, nz
+
+      real(dp), shared :: tile(SZ, SZ)
+      integer :: i, j, b_i, b_j, b_k
+
+      i = threadIdx%x; j = threadIdx%y
+      b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z
+
+      ! copy into shared
+      tile(i, j) = u_y(i, (b_j - 1)*SZ + j, (b_i - 1)*nz + b_k)
+
+      call syncthreads()
+
+      ! copy into output array from shared
+      u_z(i, b_k, (b_i - 1)*SZ + j + (b_j - 1)*nx) = tile(j, i)
+
+   end subroutine reorder_y2z
+
+   attributes(global) subroutine reorder_z2y(u_y, u_z, nx, nz)
+      implicit none
+
+      real(dp), device, intent(out), dimension(:, :, :) :: u_y
+      real(dp), device, intent(in), dimension(:, :, :) :: u_z
+      integer, value, intent(in) :: nx, nz
+
+      real(dp), shared :: tile(SZ, SZ)
+      integer :: i, j, b_i, b_j, b_k
+
+      i = threadIdx%x; j = threadIdx%y
+      b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z
+
+      ! copy into shared
+      tile(i, j) = u_z(i, b_k, (b_i - 1)*SZ + j + (b_j - 1)*nx)
+
+      call syncthreads()
+
+      ! copy into output array from shared
+      u_y(i, (b_j - 1)*SZ + j, (b_i - 1)*nz + b_k) = tile(j, i)
+
+   end subroutine reorder_z2y
+
+end module m_cuda_kernels_reorder
diff --git a/src/omp/backend.f90 b/src/omp/backend.f90
@@ -24,11 +24,7 @@ module m_omp_backend
       procedure :: transeq_y => transeq_y_omp
       procedure :: transeq_z => transeq_z_omp
       procedure :: tds_solve => tds_solve_omp
-      procedure :: trans_x2y => trans_x2y_omp
-      procedure :: trans_x2z => trans_x2z_omp
-      procedure :: trans_y2z => trans_y2z_omp
-      procedure :: trans_z2y => trans_z2y_omp
-      procedure :: trans_y2x => trans_y2x_omp
+      procedure :: reorder => reorder_omp
       procedure :: sum_yzintox => sum_yzintox_omp
       procedure :: vecadd => vecadd_omp
       procedure :: set_fields => set_fields_omp
@@ -164,50 +160,15 @@ subroutine tds_solve_omp(self, du, u, dirps, tdsops)
 
    end subroutine tds_solve_omp
 
-   subroutine trans_x2y_omp(self, u_, v_, w_, u, v, w)
-      implicit none
-
-      class(omp_backend_t) :: self
-      class(field_t), intent(inout) :: u_, v_, w_
-      class(field_t), intent(in) :: u, v, w
-
-   end subroutine trans_x2y_omp
-
-   subroutine trans_x2z_omp(self, u_, v_, w_, u, v, w)
-      implicit none
-
-      class(omp_backend_t) :: self
-      class(field_t), intent(inout) :: u_, v_, w_
-      class(field_t), intent(in) :: u, v, w
-
-   end subroutine trans_x2z_omp
-
-   subroutine trans_y2z_omp(self, u_, u)
-      implicit none
-
-      class(omp_backend_t) :: self
-      class(field_t), intent(inout) :: u_
-      class(field_t), intent(in) :: u
-
-   end subroutine trans_y2z_omp
-
-   subroutine trans_z2y_omp(self, u_, u)
-      implicit none
-
-      class(omp_backend_t) :: self
-      class(field_t), intent(inout) :: u_
-      class(field_t), intent(in) :: u
-
-   end subroutine trans_z2y_omp
-
-   subroutine trans_y2x_omp(self, u_, u)
+   subroutine reorder_omp(self, u_, u, direction)
       implicit none
 
       class(omp_backend_t) :: self
       class(field_t), intent(inout) :: u_
       class(field_t), intent(in) :: u
+      integer, intent(in) :: direction
 
-   end subroutine trans_y2x_omp
+   end subroutine reorder_omp
 
    subroutine sum_yzintox_omp(self, du, dv, dw, &
                                du_y, dv_y, dw_y, du_z, dv_z, dw_z)
diff --git a/src/solver.f90 b/src/solver.f90
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
diff --git a/tests/cuda/test_cuda_reorder.f90 b/tests/cuda/test_cuda_reorder.f90

Original file line number	Diff line number	Diff line change
`@@ -17,6 +17,7 @@ set(CUDASRC`
`17`	`17`	`cuda/allocator.f90`
`18`	`18`	`cuda/exec_dist.f90`
`19`	`19`	`cuda/kernels_dist.f90`
	`20`	`+ cuda/kernels_reorder.f90`
`20`	`21`	`cuda/sendrecv.f90`
`21`	`22`	`cuda/tdsops.f90`
`22`	`23`	`)`