refactor(omp): move omp exec_dist_tds_compact to its own file

Nanoseb · Nanoseb · commit 4527b70a4bfd · 2023-12-08T15:07:31.000Z
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -9,6 +9,7 @@ set(SRC
   omp/common.f90
   omp/kernels_dist.f90
   omp/sendrecv.f90
+  omp/exec_dist.f90
 )
 set(CUDASRC
   cuda/backend.f90
diff --git a/src/omp/exec_dist.f90 b/src/omp/exec_dist.f90
@@ -0,0 +1,85 @@
+module m_omp_exec_dist
+   ! Driver routines of the OpenMP backend: they combine the per-slab kernels
+   ! of m_omp_kernels_dist with the halo exchange of m_omp_sendrecv.
+   use mpi
+
+   use m_common, only: dp
+   use m_omp_common, only: SZ
+   use m_omp_kernels_dist, only: der_univ_dist, der_univ_subs
+   use m_tdsops, only: tdsops_t
+   use m_omp_sendrecv, only: sendrecv_fields
+
+   implicit none
+
+contains
+
+   subroutine exec_dist_tds_compact( &
+      du, u, u_recv_s, u_recv_e, du_send_s, du_send_e, du_recv_s, du_recv_e, &
+      tdsops, nproc, pprev, pnext, n_block &
+      )
+      ! Compute du from u with the coefficients stored in tdsops.
+      !
+      ! The work is done in three steps:
+      !   1. der_univ_dist is called for every k = 1, n_block with the k-th
+      !      slab of du, u, the u halos and the du send buffers.
+      !   2. sendrecv_fields is called once with the du send/receive buffers.
+      !   3. der_univ_subs is called for every k = 1, n_block with the k-th
+      !      slab of du and of the du receive buffers.
+      !
+      ! The halos of u (u_recv_s, u_recv_e) must already be filled on entry;
+      ! this routine only reads them.
+      implicit none
+
+      ! du = d(u)
+      real(dp), dimension(:, :, :), intent(out) :: du
+      ! Field to operate on and its halo data at the start (_s) and end (_e)
+      ! of the local domain (naming assumed from the _s/_e suffixes).
+      real(dp), dimension(:, :, :), intent(in) :: u, u_recv_s, u_recv_e
+
+      ! The ones below are intent(out) just so that we can write data in them,
+      ! not because we actually need the data they store later where this
+      ! subroutine is called. We absolutely don't care about the data they pass back
+      real(dp), dimension(:, :, :), intent(out) :: &
+         du_send_s, du_send_e, du_recv_s, du_recv_e
+
+      ! Supplies coeffs_s, coeffs_e, coeffs, n and the dist_* arrays
+      type(tdsops_t), intent(in) :: tdsops
+      ! Forwarded unchanged to sendrecv_fields
+      integer, intent(in) :: nproc, pprev, pnext
+      ! Number of slabs along the third dimension that are processed
+      integer, intent(in) :: n_block
+
+      integer :: n_data
+      integer :: k
+
+      ! Count passed to sendrecv_fields; presumably one value per (i, k)
+      ! pair of a (SZ, 1, n_block) buffer -- confirm against sendrecv_fields.
+      n_data = SZ*n_block
+
+      !$omp parallel do
+      do k = 1, n_block
+         call der_univ_dist( &
+            du(:, :, k), du_send_s(:, :, k), du_send_e(:, :, k), u(:, :, k), &
+            u_recv_s(:, :, k), u_recv_e(:, :, k), &
+            tdsops%coeffs_s, tdsops%coeffs_e, tdsops%coeffs, tdsops%n, &
+            tdsops%dist_fw, tdsops%dist_bw, tdsops%dist_af &
+            )
+      end do
+      !$omp end parallel do
+
+      ! halo exchange for 2x2 systems
+      call sendrecv_fields(du_recv_s, du_recv_e, du_send_s, du_send_e, &
+                           n_data, nproc, pprev, pnext)
+
+      !$omp parallel do
+      do k = 1, n_block
+         call der_univ_subs(du(:, :, k), &
+                            du_recv_s(:, :, k), du_recv_e(:, :, k), &
+                            tdsops%n, tdsops%dist_sa, tdsops%dist_sc)
+      end do
+      !$omp end parallel do
+
+   end subroutine exec_dist_tds_compact
+
+end module m_omp_exec_dist
+
diff --git a/src/omp/kernels_dist.f90 b/src/omp/kernels_dist.f90
@@ -8,7 +8,7 @@ module m_omp_kernels_dist
 
 contains
 
-   subroutine der_univ_dist_omp( &
+   subroutine der_univ_dist( &
       du, send_u_s, send_u_e, u, u_s, u_e, coeffs_s, coeffs_e, coeffs, n, &
       ffr, fbc, faf &
       )
@@ -134,9 +134,9 @@ subroutine der_univ_dist_omp( &
       end do
       !$omp end simd
 
-   end subroutine der_univ_dist_omp
+   end subroutine der_univ_dist
 
-   subroutine der_univ_subs_omp(du, recv_u_s, recv_u_e, n, dist_sa, dist_sc)
+   subroutine der_univ_subs(du, recv_u_s, recv_u_e, n, dist_sa, dist_sc)
       implicit none
 
       ! Arguments
@@ -193,6 +193,6 @@ subroutine der_univ_subs_omp(du, recv_u_s, recv_u_e, n, dist_sa, dist_sc)
       end do
       !$omp end simd
 
-   end subroutine der_univ_subs_omp
+   end subroutine der_univ_subs
 
 end module m_omp_kernels_dist
diff --git a/tests/omp/test_omp_tridiag.f90 b/tests/omp/test_omp_tridiag.f90
@@ -5,8 +5,8 @@ program test_omp_tridiag
 
    use m_common, only: dp, pi
    use m_omp_common, only: SZ
-   use m_omp_kernels_dist, only: der_univ_dist_omp, der_univ_subs_omp
    use m_omp_sendrecv, only: sendrecv_fields
+   use m_omp_exec_dist, only: exec_dist_tds_compact
 
    use m_tdsops, only: tdsops_t, tdsops_init
 
@@ -34,7 +34,7 @@ program test_omp_tridiag
 
    integer :: n, n_block, i, j, k, n_halo, n_iters, iters, n_loc
    integer :: n_glob
-   integer :: nrank, nproc, pprev, pnext, tag1=1234, tag2=1234
+   integer :: nrank, nproc, pprev, pnext, tag1 = 1234, tag2 = 1234
    integer :: ierr, ndevs, devnum, memClockRt, memBusWidth
 
    real(dp) :: dx, dx_per, norm_du, tol = 1d-8, tstart, tend
@@ -44,7 +44,7 @@ program test_omp_tridiag
    call MPI_Comm_rank(MPI_COMM_WORLD, nrank, ierr)
    call MPI_Comm_size(MPI_COMM_WORLD, nproc, ierr)
 
-   if (nrank == 0) print*, 'Parallel run with', nproc, 'ranks'
+   if (nrank == 0) print *, 'Parallel run with', nproc, 'ranks'
 
    pnext = modulo(nrank - nproc + 1, nproc)
    pprev = modulo(nrank - 1, nproc)
@@ -54,14 +54,14 @@ program test_omp_tridiag
    n_block = 512*512/SZ
    n_iters = 1
 
-   allocate(u(SZ, n, n_block), du(SZ, n, n_block))
+   allocate (u(SZ, n, n_block), du(SZ, n, n_block))
 
    dx_per = 2*pi/n_glob
    dx = 2*pi/(n_glob - 1)
 
-   allocate(sin_0_2pi_per(n), cos_0_2pi_per(n))
-   allocate(sin_0_2pi(n), cos_0_2pi(n))
-   allocate(sin_stag(n), cos_stag(n))
+   allocate (sin_0_2pi_per(n), cos_0_2pi_per(n))
+   allocate (sin_0_2pi(n), cos_0_2pi(n))
+   allocate (sin_stag(n), cos_stag(n))
    do j = 1, n
       sin_0_2pi_per(j) = sin(((j - 1) + nrank*n)*dx_per)
       cos_0_2pi_per(j) = cos(((j - 1) + nrank*n)*dx_per)
@@ -74,13 +74,13 @@ program test_omp_tridiag
    n_halo = 4
 
    ! arrays for exchanging data between ranks
-   allocate(u_send_s(SZ, n_halo, n_block))
-   allocate(u_send_e(SZ, n_halo, n_block))
-   allocate(u_recv_s(SZ, n_halo, n_block))
-   allocate(u_recv_e(SZ, n_halo, n_block))
+   allocate (u_send_s(SZ, n_halo, n_block))
+   allocate (u_send_e(SZ, n_halo, n_block))
+   allocate (u_recv_s(SZ, n_halo, n_block))
+   allocate (u_recv_e(SZ, n_halo, n_block))
 
-   allocate(send_s(SZ, 1, n_block), send_e(SZ, 1, n_block))
-   allocate(recv_s(SZ, 1, n_block), recv_e(SZ, 1, n_block))
+   allocate (send_s(SZ, 1, n_block), send_e(SZ, 1, n_block))
+   allocate (recv_s(SZ, 1, n_block), recv_e(SZ, 1, n_block))
 
    ! =========================================================================
    ! second derivative with periodic BC
@@ -97,17 +97,17 @@ program test_omp_tridiag
                    )
 
    tend = omp_get_wtime()
-   if (nrank == 0) print*, 'Total time', tend-tstart
+   if (nrank == 0) print *, 'Total time', tend - tstart
 
    call check_error_norm(du, sin_0_2pi_per, n, n_glob, n_block, 1, norm_du)
-   if (nrank == 0) print*, 'error norm second-deriv periodic', norm_du
+   if (nrank == 0) print *, 'error norm second-deriv periodic', norm_du
 
    if (nrank == 0) then
       if (norm_du > tol) then
          allpass = .false.
-         write(stderr, '(a)') 'Check 2nd derivatives, periodic BCs... failed'
+         write (stderr, '(a)') 'Check 2nd derivatives, periodic BCs... failed'
       else
-         write(stderr, '(a)') 'Check 2nd derivatives, periodic BCs... passed'
+         write (stderr, '(a)') 'Check 2nd derivatives, periodic BCs... passed'
       end if
    end if
 
@@ -124,14 +124,14 @@ program test_omp_tridiag
                    )
 
    call check_error_norm(du, cos_0_2pi_per, n, n_glob, n_block, -1, norm_du)
-   if (nrank == 0) print*, 'error norm first-deriv periodic', norm_du
+   if (nrank == 0) print *, 'error norm first-deriv periodic', norm_du
 
    if (nrank == 0) then
       if (norm_du > tol) then
          allpass = .false.
-         write(stderr, '(a)') 'Check 1st derivatives, periodic BCs... failed'
+         write (stderr, '(a)') 'Check 1st derivatives, periodic BCs... failed'
       else
-         write(stderr, '(a)') 'Check 1st derivatives, periodic BCs... passed'
+         write (stderr, '(a)') 'Check 1st derivatives, periodic BCs... passed'
       end if
    end if
 
@@ -161,14 +161,14 @@ program test_omp_tridiag
                    )
 
    call check_error_norm(du, cos_0_2pi, n, n_glob, n_block, -1, norm_du)
-   if (nrank == 0) print*, 'error norm first deriv dir-neu', norm_du
+   if (nrank == 0) print *, 'error norm first deriv dir-neu', norm_du
 
    if (nrank == 0) then
       if (norm_du > tol) then
          allpass = .false.
-         write(stderr, '(a)') 'Check 1st derivatives, dir-neu... failed'
+         write (stderr, '(a)') 'Check 1st derivatives, dir-neu... failed'
       else
-         write(stderr, '(a)') 'Check 1st derivatives, dir-neu... passed'
+         write (stderr, '(a)') 'Check 1st derivatives, dir-neu... passed'
       end if
    end if
 
@@ -189,14 +189,14 @@ program test_omp_tridiag
                    )
 
    call check_error_norm(du, cos_stag, n_loc, n_glob, n_block, -1, norm_du)
-   if (nrank == 0) print*, 'error norm interpolate', norm_du
+   if (nrank == 0) print *, 'error norm interpolate', norm_du
 
    if (nrank == 0) then
       if (norm_du > tol) then
          allpass = .false.
-         write(stderr, '(a)') 'Check interpolation... failed'
+         write (stderr, '(a)') 'Check interpolation... failed'
       else
-         write(stderr, '(a)') 'Check interpolation... passed'
+         write (stderr, '(a)') 'Check interpolation... passed'
       end if
    end if
 
@@ -217,21 +217,21 @@ program test_omp_tridiag
                    )
 
    call check_error_norm(du, sin_0_2pi, n, n_glob, n_block, 1, norm_du)
-   if (nrank == 0) print*, 'error norm hyperviscous', norm_du
+   if (nrank == 0) print *, 'error norm hyperviscous', norm_du
 
    if (nrank == 0) then
       if (norm_du > tol) then
          allpass = .false.
-         write(stderr, '(a)') 'Check 2nd ders, hyperviscous, dir-neu... failed'
+        write (stderr, '(a)') 'Check 2nd ders, hyperviscous, dir-neu... failed'
       else
-         write(stderr, '(a)') 'Check 2nd ders, hyperviscous, dir-neu... passed'
+        write (stderr, '(a)') 'Check 2nd ders, hyperviscous, dir-neu... passed'
       end if
    end if
 
    ! =========================================================================
    ! BW utilisation and performance checks
    ! 3 in the first phase, 2 in the second phase, so 5 in total
-   achievedBW = 5._dp*n_iters*n*n_block*SZ*dp/(tend-tstart)
+   achievedBW = 5._dp*n_iters*n*n_block*SZ*dp/(tend - tstart)
    call MPI_Allreduce(achievedBW, achievedBWmax, 1, MPI_DOUBLE_PRECISION, &
                       MPI_MAX, MPI_COMM_WORLD, ierr)
    call MPI_Allreduce(achievedBW, achievedBWmin, 1, MPI_DOUBLE_PRECISION, &
@@ -247,13 +247,13 @@ program test_omp_tridiag
 
    if (nrank == 0) then
       print'(a, f8.3, a)', 'Available BW:   ', deviceBW/2**30, &
-                           ' GiB/s (per NUMA zone on ARCHER2)'
+         ' GiB/s (per NUMA zone on ARCHER2)'
       print'(a, f5.2)', 'Effective BW util min: %', achievedBWmin/deviceBW*100
       print'(a, f5.2)', 'Effective BW util max: %', achievedBWmax/deviceBW*100
    end if
 
    if (allpass) then
-      if (nrank == 0) write(stderr, '(a)') 'ALL TESTS PASSED SUCCESSFULLY.'
+      if (nrank == 0) write (stderr, '(a)') 'ALL TESTS PASSED SUCCESSFULLY.'
    else
       error stop 'SOME TESTS FAILED.'
    end if
@@ -279,8 +279,7 @@ subroutine run_kernel(n_iters, n_block, u, du, tdsops, n, &
                                                      send_s, send_e
       integer, intent(in) :: nproc, pprev, pnext
 
-      integer :: iters, i, j, k, ierr, tag1=1234, tag2=1234
-      integer :: srerr(4), mpireq(4)
+      integer :: iters, i, j, k
 
       do iters = 1, n_iters
          ! first copy halo data into buffers
@@ -289,8 +288,8 @@ subroutine run_kernel(n_iters, n_block, u, du, tdsops, n, &
             do j = 1, 4
                !$omp simd
                do i = 1, SZ
-                  u_send_s(i,j,k) = u(i,j,k)
-                  u_send_e(i,j,k) = u(i,n-n_halo+j,k)
+                  u_send_s(i, j, k) = u(i, j, k)
+                  u_send_e(i, j, k) = u(i, n - n_halo + j, k)
                end do
                !$omp end simd
             end do
@@ -299,48 +298,11 @@ subroutine run_kernel(n_iters, n_block, u, du, tdsops, n, &
 
          ! halo exchange
          call sendrecv_fields(u_recv_s, u_recv_e, u_send_s, u_send_e, &
-                               SZ*n_halo*n_block, nproc, pprev, pnext)
-         
-         !$omp parallel do
-         do k = 1, n_block
-            call der_univ_dist_omp( &
-               du(:, :, k), send_s(:, :, k), send_e(:, :, k), u(:, :, k), &
-               u_recv_s(:, :, k), u_recv_e(:, :, k), &
-               tdsops%coeffs_s, tdsops%coeffs_e, tdsops%coeffs, n, &
-               tdsops%dist_fw, tdsops%dist_bw, tdsops%dist_af &
-            )
-         end do
-         !$omp end parallel do
+                              SZ*n_halo*n_block, nproc, pprev, pnext)
 
-         ! halo exchange for 2x2 systems
-         if (nproc == 1) then
-            recv_s = send_e
-            recv_e = send_s
-         else
-            ! MPI send/recv for multi-rank simulations
-            call MPI_Isend(send_s, SZ*n_block, &
-                           MPI_DOUBLE_PRECISION, pprev, tag1, MPI_COMM_WORLD, &
-                           mpireq(1), srerr(1))
-            call MPI_Irecv(recv_e, SZ*n_block, &
-                           MPI_DOUBLE_PRECISION, pnext, tag2, MPI_COMM_WORLD, &
-                           mpireq(2), srerr(2))
-            call MPI_Isend(send_e, SZ*n_block, &
-                           MPI_DOUBLE_PRECISION, pnext, tag2, MPI_COMM_WORLD, &
-                           mpireq(3), srerr(3))
-            call MPI_Irecv(recv_s, SZ*n_block, &
-                           MPI_DOUBLE_PRECISION, pprev, tag1, MPI_COMM_WORLD, &
-                           mpireq(4), srerr(4))
-
-            call MPI_Waitall(4, mpireq, MPI_STATUSES_IGNORE, ierr)
-         end if
+        call exec_dist_tds_compact(du, u, u_recv_s, u_recv_e, send_s, send_e, &
+                          recv_s, recv_e, tdsops, nproc, pprev, pnext, n_block)
 
-         !$omp parallel do
-         do k = 1, n_block
-            call der_univ_subs_omp(du(:, :, k), &
-                                   recv_s(:, :, k), recv_e(:, :, k), &
-                                   n, tdsops%dist_sa, tdsops%dist_sc)
-         end do
-         !$omp end parallel do
       end do
    end subroutine run_kernel
 
@@ -384,7 +346,7 @@ subroutine check_error_norm(du, line, n, n_glob, n_block, c, norm)
       norm = norm2(du(:, 1:n, :))
       norm = norm*norm/n_glob/n_block/SZ
       call MPI_Allreduce(MPI_IN_PLACE, norm, 1, MPI_DOUBLE_PRECISION, &
-                      MPI_SUM, MPI_COMM_WORLD, ierr)
+                         MPI_SUM, MPI_COMM_WORLD, ierr)
       norm = sqrt(norm)
 
    end subroutine check_error_norm

Original file line number	Diff line number	Diff line change
`@@ -9,6 +9,7 @@ set(SRC`
`9`	`9`	`omp/common.f90`
`10`	`10`	`omp/kernels_dist.f90`
`11`	`11`	`omp/sendrecv.f90`
	`12`	`+ omp/exec_dist.f90`
`12`	`13`	`)`
`13`	`14`	`set(CUDASRC`
`14`	`15`	`cuda/backend.f90`