@@ -25,7 +25,7 @@ module m_mpi_common
2525 implicit none
2626
2727 integer, private :: ierr, v_size !<
28- !$acc declare create( v_size)
28+ $:GPU_DECLARE(create= ' [ v_size] ' )
2929 !! Generic flags used to identify and report MPI errors
3030
3131 real(wp), private, allocatable, dimension(:) :: buff_send !<
@@ -38,10 +38,10 @@ module m_mpi_common
3838 !! average primitive variables, for a single computational domain boundary
3939 !! at a time, from the relevant neighboring processor.
4040
41- !$acc declare create( buff_send, buff_recv)
41+ $:GPU_DECLARE(create= ' [ buff_send, buff_recv] ' )
4242
4343 integer :: halo_size
44- !$acc declare create( halo_size)
44+ $:GPU_DECLARE(create= ' [ halo_size] ' )
4545
4646contains
4747
@@ -76,7 +76,7 @@ contains
7676 halo_size = -1 + buff_size*(v_size)
7777 end if
7878
79- !$acc update device( halo_size, v_size)
79+ $:GPU_UPDATE(device= ' [ halo_size, v_size] ' )
8080
8181 @:ALLOCATE(buff_send(0:halo_size), buff_recv(0:halo_size))
8282#endif
@@ -631,7 +631,7 @@ contains
631631 /)
632632 end if
633633
634- !$acc update device( v_size)
634+ $:GPU_UPDATE(device= ' [ v_size] ' )
635635
636636 buffer_count = buffer_counts(mpi_dir)
637637 boundary_conditions = (/bc_x, bc_y, bc_z/)
@@ -667,7 +667,7 @@ contains
667667 #:for mpi_dir in [1, 2, 3]
668668 if (mpi_dir == ${mpi_dir}$) then
669669 #:if mpi_dir == 1
670- !$acc parallel loop collapse(4) gang vector default(present) private(r )
670+ $:GPU_PARALLEL_LOOP(collapse=4, private= ' [r] ' )
671671 do l = 0, p
672672 do k = 0, n
673673 do j = 0, buff_size - 1
@@ -680,7 +680,7 @@ contains
680680 end do
681681
682682 if (qbmm_comm) then
683- !$acc parallel loop collapse(4) gang vector default(present) private(r )
683+ $:GPU_PARALLEL_LOOP(collapse=4, private= ' [r] ' )
684684 do l = 0, p
685685 do k = 0, n
686686 do j = 0, buff_size - 1
@@ -695,7 +695,7 @@ contains
695695 end do
696696 end do
697697
698- !$acc parallel loop collapse(5) gang vector default(present) private(r )
698+ $:GPU_PARALLEL_LOOP(collapse=5, private= ' [r] ' )
699699 do l = 0, p
700700 do k = 0, n
701701 do j = 0, buff_size - 1
@@ -711,7 +711,7 @@ contains
711711 end do
712712 end if
713713 #:elif mpi_dir == 2
714- !$acc parallel loop collapse(4) gang vector default(present) private(r )
714+ $:GPU_PARALLEL_LOOP(collapse=4, private= ' [r] ' )
715715 do i = 1, nVar
716716 do l = 0, p
717717 do k = 0, buff_size - 1
@@ -726,7 +726,7 @@ contains
726726 end do
727727
728728 if (qbmm_comm) then
729- !$acc parallel loop collapse(5) gang vector default(present) private(r )
729+ $:GPU_PARALLEL_LOOP(collapse=5, private= ' [r] ' )
730730 do i = nVar + 1, nVar + 4
731731 do l = 0, p
732732 do k = 0, buff_size - 1
@@ -742,7 +742,7 @@ contains
742742 end do
743743 end do
744744
745- !$acc parallel loop collapse(5) gang vector default(present) private(r )
745+ $:GPU_PARALLEL_LOOP(collapse=5, private= ' [r] ' )
746746 do i = nVar + 1, nVar + 4
747747 do l = 0, p
748748 do k = 0, buff_size - 1
@@ -759,7 +759,7 @@ contains
759759 end do
760760 end if
761761 #:else
762- !$acc parallel loop collapse(4) gang vector default(present) private(r )
762+ $:GPU_PARALLEL_LOOP(collapse=4, private= ' [r] ' )
763763 do i = 1, nVar
764764 do l = 0, buff_size - 1
765765 do k = -buff_size, n + buff_size
@@ -774,7 +774,7 @@ contains
774774 end do
775775
776776 if (qbmm_comm) then
777- !$acc parallel loop collapse(5) gang vector default(present) private(r )
777+ $:GPU_PARALLEL_LOOP(collapse=5, private= ' [r] ' )
778778 do i = nVar + 1, nVar + 4
779779 do l = 0, buff_size - 1
780780 do k = -buff_size, n + buff_size
@@ -790,7 +790,7 @@ contains
790790 end do
791791 end do
792792
793- !$acc parallel loop collapse(5) gang vector default(present) private(r )
793+ $:GPU_PARALLEL_LOOP(collapse=5, private= ' [r] ' )
794794 do i = nVar + 1, nVar + 4
795795 do l = 0, buff_size - 1
796796 do k = -buff_size, n + buff_size
@@ -816,28 +816,33 @@ contains
816816 #:for rdma_mpi in [False, True]
817817 if (rdma_mpi .eqv. ${' .true. ' if rdma_mpi else ' .false. ' }$) then
818818 #:if rdma_mpi
819- !$acc host_data use_device(buff_send, buff_recv)
820- call nvtxStartRange("RHS-COMM-SENDRECV-RDMA")
819+ #:call GPU_HOST_DATA(use_device=' [buff_send, buff_recv]' )
820+ call nvtxStartRange("RHS-COMM-SENDRECV-RDMA")
821+
822+ call MPI_SENDRECV( &
823+ buff_send, buffer_count, mpi_p, dst_proc, send_tag, &
824+ buff_recv, buffer_count, mpi_p, src_proc, recv_tag, &
825+ MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
826+
827+ call nvtxEndRange ! RHS-MPI-SENDRECV-(NO)-RDMA
828+
829+ #:endcall GPU_HOST_DATA
830+ $:GPU_WAIT()
821831 #:else
822832 call nvtxStartRange("RHS-COMM-DEV2HOST")
823- !$acc update host( buff_send)
833+ $:GPU_UPDATE(host= ' [ buff_send] ' )
824834 call nvtxEndRange
825835 call nvtxStartRange("RHS-COMM-SENDRECV-NO-RMDA")
826- #:endif
827836
828- call MPI_SENDRECV( &
829- buff_send, buffer_count, mpi_p, dst_proc, send_tag, &
830- buff_recv, buffer_count, mpi_p, src_proc, recv_tag, &
831- MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
837+ call MPI_SENDRECV( &
838+ buff_send, buffer_count, mpi_p, dst_proc, send_tag, &
839+ buff_recv, buffer_count, mpi_p, src_proc, recv_tag, &
840+ MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
832841
833- call nvtxEndRange ! RHS-MPI-SENDRECV-(NO)-RDMA
842+ call nvtxEndRange ! RHS-MPI-SENDRECV-(NO)-RDMA
834843
835- #:if rdma_mpi
836- !$acc end host_data
837- !$acc wait
838- #:else
839844 call nvtxStartRange("RHS-COMM-HOST2DEV")
840- !$acc update device( buff_recv)
845+ $:GPU_UPDATE(device= ' [ buff_recv] ' )
841846 call nvtxEndRange
842847 #:endif
843848 end if
@@ -854,7 +859,7 @@ contains
854859 #:for mpi_dir in [1, 2, 3]
855860 if (mpi_dir == ${mpi_dir}$) then
856861 #:if mpi_dir == 1
857- !$acc parallel loop collapse(4) gang vector default(present) private(r )
862+ $:GPU_PARALLEL_LOOP(collapse=4, private= ' [r] ' )
858863 do l = 0, p
859864 do k = 0, n
860865 do j = -buff_size, -1
@@ -874,7 +879,7 @@ contains
874879 end do
875880
876881 if (qbmm_comm) then
877- !$acc parallel loop collapse(5) gang vector default(present) private(r )
882+ $:GPU_PARALLEL_LOOP(collapse=5, private= ' [r] ' )
878883 do l = 0, p
879884 do k = 0, n
880885 do j = -buff_size, -1
@@ -889,7 +894,7 @@ contains
889894 end do
890895 end do
891896
892- !$acc parallel loop collapse(5) gang vector default(present) private(r )
897+ $:GPU_PARALLEL_LOOP(collapse=5, private= ' [r] ' )
893898 do l = 0, p
894899 do k = 0, n
895900 do j = -buff_size, -1
@@ -905,7 +910,7 @@ contains
905910 end do
906911 end if
907912 #:elif mpi_dir == 2
908- !$acc parallel loop collapse(4) gang vector default(present) private(r )
913+ $:GPU_PARALLEL_LOOP(collapse=4, private= ' [r] ' )
909914 do i = 1, nVar
910915 do l = 0, p
911916 do k = -buff_size, -1
@@ -926,7 +931,7 @@ contains
926931 end do
927932
928933 if (qbmm_comm) then
929- !$acc parallel loop collapse(5) gang vector default(present) private(r )
934+ $:GPU_PARALLEL_LOOP(collapse=5, private= ' [r] ' )
930935 do i = nVar + 1, nVar + 4
931936 do l = 0, p
932937 do k = -buff_size, -1
@@ -942,7 +947,7 @@ contains
942947 end do
943948 end do
944949
945- !$acc parallel loop collapse(5) gang vector default(present) private(r )
950+ $:GPU_PARALLEL_LOOP(collapse=5, private= ' [r] ' )
946951 do i = nVar + 1, nVar + 4
947952 do l = 0, p
948953 do k = -buff_size, -1
@@ -960,7 +965,7 @@ contains
960965 end if
961966 #:else
962967 ! Unpacking buffer from bc_z%beg
963- !$acc parallel loop collapse(4) gang vector default(present) private(r )
968+ $:GPU_PARALLEL_LOOP(collapse=4, private= ' [r] ' )
964969 do i = 1, nVar
965970 do l = -buff_size, -1
966971 do k = -buff_size, n + buff_size
@@ -982,7 +987,7 @@ contains
982987 end do
983988
984989 if (qbmm_comm) then
985- !$acc parallel loop collapse(5) gang vector default(present) private(r )
990+ $:GPU_PARALLEL_LOOP(collapse=5, private= ' [r] ' )
986991 do i = nVar + 1, nVar + 4
987992 do l = -buff_size, -1
988993 do k = -buff_size, n + buff_size
@@ -999,7 +1004,7 @@ contains
9991004 end do
10001005 end do
10011006
1002- !$acc parallel loop collapse(5) gang vector default(present) private(r )
1007+ $:GPU_PARALLEL_LOOP(collapse=5, private= ' [r] ' )
10031008 do i = nVar + 1, nVar + 4
10041009 do l = -buff_size, -1
10051010 do k = -buff_size, n + buff_size
0 commit comments