Resolving bug with multiple ranks using IBM (#990)

anandrdbz · Anand · sbryngelson · web-flow · commit fe87fc56234a · 2025-09-01T17:56:07.000-04:00
Co-authored-by: Anand <anand@lawn-128-61-19-55.lawn.gatech.edu>
Co-authored-by: Spencer Bryngelson <shb@gatech.edu>
Co-authored-by: Anand <anand@ipsec-10-2-68-64.vpn.gatech.edu>
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -53,7 +53,7 @@ jobs:
         run:  |
           brew update
           brew upgrade
-          brew install coreutils python cmake fftw hdf5 gcc@15 boost open-mpi lapack
+          brew install coreutils python fftw hdf5 gcc@15 boost open-mpi lapack
           echo "FC=gfortran-15" >> $GITHUB_ENV
           echo "BOOST_INCLUDE=/opt/homebrew/include/" >> $GITHUB_ENV
 
diff --git a/src/common/m_helper_basic.fpp b/src/common/m_helper_basic.fpp
@@ -112,14 +112,15 @@ contains
 
     pure subroutine s_configure_coordinate_bounds(recon_type, weno_polyn, muscl_polyn, &
                                                   igr_order, buff_size, idwint, idwbuff, &
-                                                  viscous, bubbles_lagrange, m, n, p, num_dims, igr)
+                                                  viscous, bubbles_lagrange, m, n, p, num_dims, igr, ib)
 
         integer, intent(in) :: recon_type, weno_polyn, muscl_polyn
         integer, intent(in) :: m, n, p, num_dims, igr_order
         integer, intent(inout) :: buff_size
         type(int_bounds_info), dimension(3), intent(inout) :: idwint, idwbuff
         logical, intent(in) :: viscous, bubbles_lagrange
         logical, intent(in) :: igr
+        logical, intent(in) :: ib
 
         ! Determining the number of cells that are needed in order to store
         ! sufficient boundary conditions data as to iterate the solution in
@@ -142,6 +143,10 @@ contains
             buff_size = max(buff_size, 6)
         end if
 
+        if (ib) then
+            buff_size = max(buff_size, 10)
+        end if
+
         ! Configuring Coordinate Direction Indexes
         idwint(1)%beg = 0; idwint(2)%beg = 0; idwint(3)%beg = 0
         idwint(1)%end = m; idwint(2)%end = n; idwint(3)%end = p
diff --git a/src/pre_process/m_global_parameters.fpp b/src/pre_process/m_global_parameters.fpp
@@ -884,7 +884,7 @@ contains
                                            igr_order, buff_size, &
                                            idwint, idwbuff, viscous, &
                                            bubbles_lagrange, m, n, p, &
-                                           num_dims, igr)
+                                           num_dims, igr, ib)
 
 #ifdef MFC_MPI
 
diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp
@@ -1254,7 +1254,7 @@ contains
                                            igr_order, buff_size, &
                                            idwint, idwbuff, viscous, &
                                            bubbles_lagrange, m, n, p, &
-                                           num_dims, igr)
+                                           num_dims, igr, ib)
         $:GPU_UPDATE(device='[idwint, idwbuff]')
 
         ! Configuring Coordinate Direction Indexes
diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp
@@ -53,19 +53,19 @@ contains
     impure subroutine s_initialize_ibm_module()
 
         if (p > 0) then
-            @:ALLOCATE(ib_markers%sf(-gp_layers:m+gp_layers, &
-                -gp_layers:n+gp_layers, -gp_layers:p+gp_layers))
-            @:ALLOCATE(levelset%sf(-gp_layers:m+gp_layers, &
-                -gp_layers:n+gp_layers, -gp_layers:p+gp_layers, 1:num_ibs))
-            @:ALLOCATE(levelset_norm%sf(-gp_layers:m+gp_layers, &
-                -gp_layers:n+gp_layers, -gp_layers:p+gp_layers, 1:num_ibs, 1:3))
+            @:ALLOCATE(ib_markers%sf(-buff_size:m+buff_size, &
+                -buff_size:n+buff_size, -buff_size:p+buff_size))
+            @:ALLOCATE(levelset%sf(-buff_size:m+buff_size, &
+                -buff_size:n+buff_size, -buff_size:p+buff_size, 1:num_ibs))
+            @:ALLOCATE(levelset_norm%sf(-buff_size:m+buff_size, &
+                -buff_size:n+buff_size, -buff_size:p+buff_size, 1:num_ibs, 1:3))
         else
-            @:ALLOCATE(ib_markers%sf(-gp_layers:m+gp_layers, &
-                -gp_layers:n+gp_layers, 0:0))
-            @:ALLOCATE(levelset%sf(-gp_layers:m+gp_layers, &
-                -gp_layers:n+gp_layers, 0:0, 1:num_ibs))
-            @:ALLOCATE(levelset_norm%sf(-gp_layers:m+gp_layers, &
-                -gp_layers:n+gp_layers, 0:0, 1:num_ibs, 1:3))
+            @:ALLOCATE(ib_markers%sf(-buff_size:m+buff_size, &
+                -buff_size:n+buff_size, 0:0))
+            @:ALLOCATE(levelset%sf(-buff_size:m+buff_size, &
+                -buff_size:n+buff_size, 0:0, 1:num_ibs))
+            @:ALLOCATE(levelset_norm%sf(-buff_size:m+buff_size, &
+                -buff_size:n+buff_size, 0:0, 1:num_ibs, 1:3))
         end if
 
         @:ACC_SETUP_SFs(ib_markers)
@@ -381,13 +381,13 @@ contains
                 ! s_cc points to the dim array we need
                 if (dim == 1) then
                     s_cc => x_cc
-                    bound = m
+                    bound = m + buff_size - 1
                 elseif (dim == 2) then
                     s_cc => y_cc
-                    bound = n
+                    bound = n + buff_size - 1
                 else
                     s_cc => z_cc
-                    bound = p
+                    bound = p + buff_size - 1
                 end if
 
                 if (f_approx_equal(norm(dim), 0._wp)) then
@@ -402,9 +402,12 @@ contains
                     index = ghost_points_in(q)%loc(dim)
                     temp_loc = ghost_points_in(q)%ip_loc(dim)
                     do while ((temp_loc < s_cc(index) &
-                               .or. temp_loc > s_cc(index + 1)) &
-                              .and. (index >= 0 .and. index <= bound))
+                               .or. temp_loc > s_cc(index + 1)))
                         index = index + dir
+                        if (index < -buff_size .or. index > bound) then
+                            print *, "Increase buff_size further in m_helper_basic (currently set to a minimum of 10)"
+                            error stop "Increase buff_size"
+                        end if
                     end do
                     ghost_points_in(q)%ip_grid(dim) = index
                     if (ghost_points_in(q)%DB(dim) == -1) then
diff --git a/src/simulation/m_mpi_proxy.fpp b/src/simulation/m_mpi_proxy.fpp
@@ -53,17 +53,17 @@ contains
         if (ib) then
             if (n > 0) then
                 if (p > 0) then
-                    i_halo_size = -1 + gp_layers* &
-                                            & (m + 2*gp_layers + 1)* &
-                                            & (n + 2*gp_layers + 1)* &
-                                            & (p + 2*gp_layers + 1)/ &
-                                            & (cells_bounds%mnp_min + 2*gp_layers + 1)
+                    i_halo_size = -1 + buff_size* &
+                                            & (m + 2*buff_size + 1)* &
+                                            & (n + 2*buff_size + 1)* &
+                                            & (p + 2*buff_size + 1)/ &
+                                            & (cells_bounds%mnp_min + 2*buff_size + 1)
                 else
-                    i_halo_size = -1 + gp_layers* &
-                                            & (cells_bounds%mn_max + 2*gp_layers + 1)
+                    i_halo_size = -1 + buff_size* &
+                                            & (cells_bounds%mn_max + 2*buff_size + 1)
                 end if
             else
-                i_halo_size = -1 + gp_layers
+                i_halo_size = -1 + buff_size
             end if
 
             $:GPU_UPDATE(device='[i_halo_size]')
@@ -270,9 +270,9 @@ contains
         call nvtxStartRange("IB-MARKER-COMM-PACKBUF")
 
         buffer_counts = (/ &
-                        gp_layers*(n + 1)*(p + 1), &
-                        gp_layers*(m + 2*gp_layers + 1)*(p + 1), &
-                        gp_layers*(m + 2*gp_layers + 1)*(n + 2*gp_layers + 1) &
+                        buff_size*(n + 1)*(p + 1), &
+                        buff_size*(m + 2*buff_size + 1)*(p + 1), &
+                        buff_size*(m + 2*buff_size + 1)*(n + 2*buff_size + 1) &
                         /)
 
         buffer_count = buffer_counts(mpi_dir)
@@ -297,12 +297,12 @@ contains
 
         pack_offset = 0
         if (f_xor(pbc_loc == 1, beg_end_geq_0)) then
-            pack_offset = grid_dims(mpi_dir) - gp_layers + 1
+            pack_offset = grid_dims(mpi_dir) - buff_size + 1
         end if
 
         unpack_offset = 0
         if (pbc_loc == 1) then
-            unpack_offset = grid_dims(mpi_dir) + gp_layers + 1
+            unpack_offset = grid_dims(mpi_dir) + buff_size + 1
         end if
 
         ! Pack Buffer to Send
@@ -312,30 +312,30 @@ contains
                     $:GPU_PARALLEL_LOOP(collapse=3,private='[r]')
                     do l = 0, p
                         do k = 0, n
-                            do j = 0, gp_layers - 1
-                                r = (j + gp_layers*(k + (n + 1)*l))
+                            do j = 0, buff_size - 1
+                                r = (j + buff_size*(k + (n + 1)*l))
                                 ib_buff_send(r) = ib_markers%sf(j + pack_offset, k, l)
                             end do
                         end do
                     end do
                 #:elif mpi_dir == 2
                     $:GPU_PARALLEL_LOOP(collapse=3,private='[r]')
                     do l = 0, p
-                        do k = 0, gp_layers - 1
-                            do j = -gp_layers, m + gp_layers
-                                r = ((j + gp_layers) + (m + 2*gp_layers + 1)* &
-                                     (k + gp_layers*l))
+                        do k = 0, buff_size - 1
+                            do j = -buff_size, m + buff_size
+                                r = ((j + buff_size) + (m + 2*buff_size + 1)* &
+                                     (k + buff_size*l))
                                 ib_buff_send(r) = ib_markers%sf(j, k + pack_offset, l)
                             end do
                         end do
                     end do
                 #:else
                     $:GPU_PARALLEL_LOOP(collapse=3,private='[r]')
-                    do l = 0, gp_layers - 1
-                        do k = -gp_layers, n + gp_layers
-                            do j = -gp_layers, m + gp_layers
-                                r = ((j + gp_layers) + (m + 2*gp_layers + 1)* &
-                                     ((k + gp_layers) + (n + 2*gp_layers + 1)*l))
+                    do l = 0, buff_size - 1
+                        do k = -buff_size, n + buff_size
+                            do j = -buff_size, m + buff_size
+                                r = ((j + buff_size) + (m + 2*buff_size + 1)* &
+                                     ((k + buff_size) + (n + 2*buff_size + 1)*l))
                                 ib_buff_send(r) = ib_markers%sf(j, k, l + pack_offset)
                             end do
                         end do
@@ -345,12 +345,38 @@ contains
         #:endfor
         call nvtxEndRange ! Packbuf
 
-        call nvtxStartRange("IB-MARKER-SENDRECV")
-        call MPI_SENDRECV( &
-            ib_buff_send, buffer_count, MPI_INTEGER, dst_proc, send_tag, &
-            ib_buff_recv, buffer_count, MPI_INTEGER, src_proc, recv_tag, &
-            MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-        call nvtxEndRange ! RHS-MPI-SENDRECV-(NO)-RDMA
+        #:for rdma_mpi in [False, True]
+            if (rdma_mpi .eqv. ${'.true.' if rdma_mpi else '.false.'}$) then
+                #:if rdma_mpi
+                    #:call GPU_HOST_DATA(use_device='[ib_buff_send, ib_buff_recv]')
+
+                        call nvtxStartRange("IB-MARKER-SENDRECV-RDMA")
+                        call MPI_SENDRECV( &
+                            ib_buff_send, buffer_count, MPI_INTEGER, dst_proc, send_tag, &
+                            ib_buff_recv, buffer_count, MPI_INTEGER, src_proc, recv_tag, &
+                            MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
+                        call nvtxEndRange
+
+                    #:endcall GPU_HOST_DATA
+                    $:GPU_WAIT()
+                #:else
+                    call nvtxStartRange("IB-MARKER-DEV2HOST")
+                    $:GPU_UPDATE(host='[ib_buff_send]')
+                    call nvtxEndRange
+
+                    call nvtxStartRange("IB-MARKER-SENDRECV-NO-RMDA")
+                    call MPI_SENDRECV( &
+                        ib_buff_send, buffer_count, MPI_INTEGER, dst_proc, send_tag, &
+                        ib_buff_recv, buffer_count, MPI_INTEGER, src_proc, recv_tag, &
+                        MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
+                    call nvtxEndRange
+
+                    call nvtxStartRange("IB-MARKER-HOST2DEV")
+                    $:GPU_UPDATE(device='[ib_buff_recv]')
+                    call nvtxEndRange
+                #:endif
+            end if
+        #:endfor
 
         ! Unpack Received Buffer
         call nvtxStartRange("IB-MARKER-COMM-UNPACKBUF")
@@ -360,32 +386,32 @@ contains
                     $:GPU_PARALLEL_LOOP(collapse=3,private='[r]')
                     do l = 0, p
                         do k = 0, n
-                            do j = -gp_layers, -1
-                                r = (j + gp_layers*((k + 1) + (n + 1)*l))
+                            do j = -buff_size, -1
+                                r = (j + buff_size*((k + 1) + (n + 1)*l))
                                 ib_markers%sf(j + unpack_offset, k, l) = ib_buff_recv(r)
                             end do
                         end do
                     end do
                 #:elif mpi_dir == 2
                     $:GPU_PARALLEL_LOOP(collapse=3,private='[r]')
                     do l = 0, p
-                        do k = -gp_layers, -1
-                            do j = -gp_layers, m + gp_layers
-                                r = ((j + gp_layers) + (m + 2*gp_layers + 1)* &
-                                     ((k + gp_layers) + gp_layers*l))
+                        do k = -buff_size, -1
+                            do j = -buff_size, m + buff_size
+                                r = ((j + buff_size) + (m + 2*buff_size + 1)* &
+                                     ((k + buff_size) + buff_size*l))
                                 ib_markers%sf(j, k + unpack_offset, l) = ib_buff_recv(r)
                             end do
                         end do
                     end do
                 #:else
                     ! Unpacking buffer from bc_z%beg
                     $:GPU_PARALLEL_LOOP(collapse=3,private='[r]')
-                    do l = -gp_layers, -1
-                        do k = -gp_layers, n + gp_layers
-                            do j = -gp_layers, m + gp_layers
-                                r = ((j + gp_layers) + (m + 2*gp_layers + 1)* &
-                                     ((k + gp_layers) + (n + 2*gp_layers + 1)* &
-                                      (l + gp_layers)))
+                    do l = -buff_size, -1
+                        do k = -buff_size, n + buff_size
+                            do j = -buff_size, m + buff_size
+                                r = ((j + buff_size) + (m + 2*buff_size + 1)* &
+                                     ((k + buff_size) + (n + 2*buff_size + 1)* &
+                                      (l + buff_size)))
                                 ib_markers%sf(j, k, l + unpack_offset) = ib_buff_recv(r)
                             end do
                         end do
diff --git a/src/simulation/m_start_up.fpp b/src/simulation/m_start_up.fpp
@@ -1366,14 +1366,14 @@ contains
             call s_read_data_files(q_cons_ts(1)%vf)
         end if
 
+        ! Populating the buffers of the grid variables using the boundary conditions
+        call s_populate_grid_variables_buffers()
+
         if (model_eqns == 3) call s_initialize_internal_energy_equations(q_cons_ts(1)%vf)
         if (ib) call s_ibm_setup()
         if (bodyForces) call s_initialize_body_forces_module()
         if (acoustic_source) call s_precalculate_acoustic_spatial_sources()
 
-        ! Populating the buffers of the grid variables using the boundary conditions
-        call s_populate_grid_variables_buffers()
-
         ! Initialize the Temperature cache.
         if (chemistry) call s_compute_q_T_sf(q_T_sf, q_cons_ts(1)%vf, idwint)