feat(cuda/tests): Add tests for all reordering kernels.

semi-h · semi-h · commit adf8edbcd2e4 · 2024-02-05T17:32:20.000Z
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
@@ -4,6 +4,7 @@ set(TESTSRC
 )
 set(CUDATESTSRC
   cuda/test_cuda_allocator.f90
+  cuda/test_cuda_reorder.f90
   cuda/test_cuda_tridiag.f90
   cuda/test_cuda_transeq.f90
 )
diff --git a/tests/cuda/test_cuda_reorder.f90 b/tests/cuda/test_cuda_reorder.f90
@@ -0,0 +1,264 @@
+program test_cuda_reorder
+   !! Round-trip and bandwidth tests for the CUDA reordering kernels.
+   !!
+   !! A random x-oriented field is pushed through chains of reorder kernels
+   !! and compared with the original field or with the result of another
+   !! chain. Each kernel is then launched n_iters times and the achieved
+   !! memory bandwidth is reported.
+   use iso_fortran_env, only: stderr => error_unit
+   use cudafor
+   use mpi
+
+   use m_common, only: dp
+   use m_cuda_common, only: SZ
+   use m_cuda_kernels_reorder, only: reorder_x2y, reorder_x2z, reorder_y2x, &
+                                     reorder_y2z, reorder_z2y
+
+   implicit none
+
+   real(dp), parameter :: tol = 1e-8_dp
+
+   logical :: allpass = .true.
+   real(dp), allocatable, dimension(:, :, :) :: u_i, u_o, u_temp
+   real(dp), device, allocatable, dimension(:, :, :) :: u_i_d, u_o_d, u_temp_d
+
+   integer :: n_block, i, n_iters
+   integer :: nx, ny, nz, ndof
+   integer :: nrank, nproc
+   integer :: ierr, ndevs, devnum
+
+   type(dim3) :: blocks, threads
+   real(dp) :: tstart, tend
+
+   call MPI_Init(ierr)
+   call MPI_Comm_rank(MPI_COMM_WORLD, nrank, ierr)
+   call MPI_Comm_size(MPI_COMM_WORLD, nproc, ierr)
+
+   if (nrank == 0) print*, 'Parallel run with', nproc, 'ranks'
+
+   ! mod() below needs ndevs >= 1, so bail out early without a device
+   ierr = cudaGetDeviceCount(ndevs)
+   if (ierr /= cudaSuccess .or. ndevs < 1) then
+      error stop 'No CUDA device available.'
+   end if
+   ierr = cudaSetDevice(mod(nrank, ndevs)) ! round-robin
+   ierr = cudaGetDevice(devnum)
+
+   nx = 512; ny = 512; nz = 512
+   n_block = ny*nz/SZ
+   ndof = nx*ny*nz
+   n_iters = 100
+
+   allocate (u_i(SZ, nx, n_block), u_o(SZ, nx, n_block))
+   allocate (u_temp(SZ, nx, n_block))
+   allocate (u_i_d(SZ, nx, n_block), u_o_d(SZ, nx, n_block))
+   allocate (u_temp_d(SZ, nx, n_block))
+
+   ! set a random field
+   call random_number(u_i)
+
+   ! move data to device
+   u_i_d = u_i
+
+   ! do a x to y reordering first and then a y to x
+   blocks = dim3(nx/SZ, nz, ny/SZ)
+   threads = dim3(SZ, SZ, 1)
+   call reorder_x2y<<<blocks, threads>>>(u_temp_d, u_i_d, nz)
+
+   blocks = dim3(nx/SZ, ny/SZ, nz)
+   threads = dim3(SZ, SZ, 1)
+   call reorder_y2x<<<blocks, threads>>>(u_o_d, u_temp_d, nz)
+
+   ! move the result back to host
+   u_o = u_o_d
+
+   ! and check whether it matches the initial random field
+   call check_norm('x2y and y2x', norm2(u_o - u_i))
+
+   ! we reuse u_o_d so zeroize in any case
+   u_o_d = 0
+
+   ! u_temp_d is in y orientation, use y2z to reorder it into z direction
+   blocks = dim3(nx/SZ, ny/SZ, nz)
+   threads = dim3(SZ, SZ, 1)
+   call reorder_y2z<<<blocks, threads>>>(u_o_d, u_temp_d, nx, nz)
+
+   ! store this in host
+   u_o = u_o_d
+
+   ! and zeroize u_o_d again in any case
+   u_o_d = 0
+
+   ! reorder initial random field into z orientation
+   blocks = dim3(nx, ny/SZ, 1)
+   threads = dim3(SZ, 1, 1)
+   call reorder_x2z<<<blocks, threads>>>(u_o_d, u_i_d, nz)
+   u_temp = u_o_d
+
+   ! compare two z oriented fields
+   call check_norm('x2z and y2z', norm2(u_o - u_temp))
+
+   ! z oriented field into y
+   blocks = dim3(nx/SZ, ny/SZ, nz)
+   threads = dim3(SZ, SZ, 1)
+   call reorder_z2y<<<blocks, threads>>>(u_temp_d, u_o_d, nx, nz)
+
+   ! zeroize just in case for reusing
+   u_o_d = 0
+
+   blocks = dim3(nx/SZ, ny/SZ, nz)
+   threads = dim3(SZ, SZ, 1)
+   call reorder_y2x<<<blocks, threads>>>(u_o_d, u_temp_d, nz)
+   u_o = u_o_d
+
+   ! and check whether it matches the initial random field
+   call check_norm('z2y and y2x', norm2(u_o - u_i))
+
+   ! Now the performance checks. Kernel launches are asynchronous with
+   ! respect to the host, so the device is synchronised before the timer
+   ! is read; otherwise only the launch overhead would be measured.
+   blocks = dim3(nx/SZ, nz, ny/SZ)
+   threads = dim3(SZ, SZ, 1)
+   tstart = MPI_Wtime()
+   do i = 1, n_iters
+      call reorder_x2y<<<blocks, threads>>>(u_o_d, u_i_d, nz)
+   end do
+   ierr = cudaDeviceSynchronize()
+   tend = MPI_Wtime()
+
+   call checkperf(tend - tstart, n_iters, ndof, 2._dp)
+
+   blocks = dim3(nx, ny/SZ, 1)
+   threads = dim3(SZ, 1, 1)
+   tstart = MPI_Wtime()
+   do i = 1, n_iters
+      call reorder_x2z<<<blocks, threads>>>(u_o_d, u_i_d, nz)
+   end do
+   ierr = cudaDeviceSynchronize()
+   tend = MPI_Wtime()
+
+   call checkperf(tend - tstart, n_iters, ndof, 2._dp)
+
+   blocks = dim3(nx/SZ, ny/SZ, nz)
+   threads = dim3(SZ, SZ, 1)
+   tstart = MPI_Wtime()
+   do i = 1, n_iters
+      call reorder_y2x<<<blocks, threads>>>(u_o_d, u_i_d, nz)
+   end do
+   ierr = cudaDeviceSynchronize()
+   tend = MPI_Wtime()
+
+   call checkperf(tend - tstart, n_iters, ndof, 2._dp)
+
+   blocks = dim3(nx/SZ, ny/SZ, nz)
+   threads = dim3(SZ, SZ, 1)
+   tstart = MPI_Wtime()
+   do i = 1, n_iters
+      call reorder_y2z<<<blocks, threads>>>(u_o_d, u_i_d, nx, nz)
+   end do
+   ierr = cudaDeviceSynchronize()
+   tend = MPI_Wtime()
+
+   call checkperf(tend - tstart, n_iters, ndof, 2._dp)
+
+   blocks = dim3(nx/SZ, ny/SZ, nz)
+   threads = dim3(SZ, SZ, 1)
+   tstart = MPI_Wtime()
+   do i = 1, n_iters
+      call reorder_z2y<<<blocks, threads>>>(u_o_d, u_i_d, nx, nz)
+   end do
+   ierr = cudaDeviceSynchronize()
+   tend = MPI_Wtime()
+
+   call checkperf(tend - tstart, n_iters, ndof, 2._dp)
+
+   if (allpass) then
+      if (nrank == 0) write(stderr, '(a)') 'ALL TESTS PASSED SUCCESSFULLY.'
+   else
+      error stop 'SOME TESTS FAILED.'
+   end if
+
+   call MPI_Finalize(ierr)
+
+contains
+
+   subroutine check_norm(label, err)
+      !! Record and report the outcome of one round-trip comparison.
+      !!
+      !! label: names the kernel pair in the message, e.g. 'x2y and y2x'
+      !! err: 2-norm of the difference between the two compared fields
+      !!
+      !! The verdict is combined over all ranks so that a failure on any
+      !! rank clears allpass everywhere; only rank 0 prints.
+      implicit none
+
+      character(len=*), intent(in) :: label
+      real(dp), intent(in) :: err
+
+      logical :: ok, ok_all
+      integer :: ierr
+
+      ! written as <= so that a NaN norm counts as a failure
+      ok = err <= tol
+      call MPI_Allreduce(ok, ok_all, 1, MPI_LOGICAL, MPI_LAND, &
+                         MPI_COMM_WORLD, ierr)
+      if (.not. ok_all) allpass = .false.
+
+      if (nrank == 0) then
+         if (ok_all) then
+            write(stderr, '(a)') 'Check reorder '//label//'... passed'
+         else
+            write(stderr, '(a)') 'Check reorder '//label//'... failed'
+         end if
+      end if
+   end subroutine check_norm
+
+   subroutine checkperf(t_tot, n_iters, ndof, consumed_bw)
+      !! Print the achieved and the theoretical device memory bandwidth.
+      !!
+      !! t_tot: elapsed time of the timed loop, in seconds
+      !! n_iters: number of kernel launches in the timed loop
+      !! ndof: number of field elements processed per launch
+      !! consumed_bw: multiplier on the bytes moved per element; callers
+      !!              pass 2, presumably one read plus one write
+      implicit none
+
+      real(dp), intent(in) :: t_tot, consumed_bw
+      integer, intent(in) :: n_iters, ndof
+
+      real(dp) :: achievedBW, devBW, achievedBWmax, achievedBWmin
+      integer :: ierr, memClockRt, memBusWidth, bytes_per_real
+
+      ! byte size of one real(dp); a kind number need not equal this
+      bytes_per_real = storage_size(1._dp)/8
+
+      ! BW utilisation and performance checks
+      achievedBW = consumed_bw*n_iters*ndof*bytes_per_real/t_tot
+      call MPI_Allreduce(achievedBW, achievedBWmax, 1, MPI_DOUBLE_PRECISION, &
+                         MPI_MAX, MPI_COMM_WORLD, ierr)
+      call MPI_Allreduce(achievedBW, achievedBWmin, 1, MPI_DOUBLE_PRECISION, &
+                         MPI_MIN, MPI_COMM_WORLD, ierr)
+
+      if (nrank == 0) then
+         print'(a, f8.3, a)', 'Achieved BW min: ', achievedBWmin/2**30, ' GiB/s'
+         print'(a, f8.3, a)', 'Achieved BW max: ', achievedBWmax/2**30, ' GiB/s'
+      end if
+
+      ! query the device this rank runs on rather than always device 0
+      ierr = cudaDeviceGetAttribute(memClockRt, cudaDevAttrMemoryClockRate, &
+                                    devnum)
+      ierr = cudaDeviceGetAttribute(memBusWidth, &
+                                    cudaDevAttrGlobalMemoryBusWidth, devnum)
+      ! bus width is in bits and clock rate in kHz per the CUDA runtime
+      ! documentation; the factor 2 presumably accounts for double data rate
+      devBW = 2*memBusWidth/8._dp*memClockRt*1000
+
+      if (nrank == 0) then
+         print'(a, f8.3, a)', 'Device BW:   ', devBW/2**30, ' GiB/s'
+         print'(a, f5.2)', 'Effective BW util min: %', achievedBWmin/devBW*100
+         print'(a, f5.2)', 'Effective BW util max: %', achievedBWmax/devBW*100
+      end if
+   end subroutine checkperf
+
+end program test_cuda_reorder
+

Original file line number	Diff line number	Diff line change
`@@ -4,6 +4,7 @@ set(TESTSRC`
`4`	`4`	`)`
`5`	`5`	`set(CUDATESTSRC`
`6`	`6`	`cuda/test_cuda_allocator.f90`
	`7`	`+ cuda/test_cuda_reorder.f90`
`7`	`8`	`cuda/test_cuda_tridiag.f90`
`8`	`9`	`cuda/test_cuda_transeq.f90`
`9`	`10`	`)`