@@ -53,17 +53,17 @@ contains
5353 if (ib) then
5454 if (n > 0) then
5555 if (p > 0) then
56- i_halo_size = -1 + gp_layers * &
57- & (m + 2*gp_layers + 1)* &
58- & (n + 2*gp_layers + 1)* &
59- & (p + 2*gp_layers + 1)/ &
60- & (cells_bounds%mnp_min + 2*gp_layers + 1)
56+ i_halo_size = -1 + buff_size * &
57+ & (m + 2*buff_size + 1)* &
58+ & (n + 2*buff_size + 1)* &
59+ & (p + 2*buff_size + 1)/ &
60+ & (cells_bounds%mnp_min + 2*buff_size + 1)
6161 else
62- i_halo_size = -1 + gp_layers * &
63- & (cells_bounds%mn_max + 2*gp_layers + 1)
62+ i_halo_size = -1 + buff_size * &
63+ & (cells_bounds%mn_max + 2*buff_size + 1)
6464 end if
6565 else
66- i_halo_size = -1 + gp_layers
66+ i_halo_size = -1 + buff_size
6767 end if
6868
6969 $:GPU_UPDATE(device=' [i_halo_size]' )
@@ -270,9 +270,9 @@ contains
270270 call nvtxStartRange("IB-MARKER-COMM-PACKBUF")
271271
272272 buffer_counts = (/ &
273- gp_layers *(n + 1)*(p + 1), &
274- gp_layers *(m + 2*gp_layers + 1)*(p + 1), &
275- gp_layers *(m + 2*gp_layers + 1)*(n + 2*gp_layers + 1) &
273+ buff_size *(n + 1)*(p + 1), &
274+ buff_size *(m + 2*buff_size + 1)*(p + 1), &
275+ buff_size *(m + 2*buff_size + 1)*(n + 2*buff_size + 1) &
276276 /)
277277
278278 buffer_count = buffer_counts(mpi_dir)
@@ -297,12 +297,12 @@ contains
297297
298298 pack_offset = 0
299299 if (f_xor(pbc_loc == 1, beg_end_geq_0)) then
300- pack_offset = grid_dims(mpi_dir) - gp_layers + 1
300+ pack_offset = grid_dims(mpi_dir) - buff_size + 1
301301 end if
302302
303303 unpack_offset = 0
304304 if (pbc_loc == 1) then
305- unpack_offset = grid_dims(mpi_dir) + gp_layers + 1
305+ unpack_offset = grid_dims(mpi_dir) + buff_size + 1
306306 end if
307307
308308 ! Pack Buffer to Send
@@ -312,30 +312,30 @@ contains
312312 $:GPU_PARALLEL_LOOP(collapse=3,private=' [r]' )
313313 do l = 0, p
314314 do k = 0, n
315- do j = 0, gp_layers - 1
316- r = (j + gp_layers *(k + (n + 1)*l))
315+ do j = 0, buff_size - 1
316+ r = (j + buff_size *(k + (n + 1)*l))
317317 ib_buff_send(r) = ib_markers%sf(j + pack_offset, k, l)
318318 end do
319319 end do
320320 end do
321321 #:elif mpi_dir == 2
322322 $:GPU_PARALLEL_LOOP(collapse=3,private=' [r]' )
323323 do l = 0, p
324- do k = 0, gp_layers - 1
325- do j = -gp_layers , m + gp_layers
326- r = ((j + gp_layers ) + (m + 2*gp_layers + 1)* &
327- (k + gp_layers *l))
324+ do k = 0, buff_size - 1
325+ do j = -buff_size , m + buff_size
326+ r = ((j + buff_size ) + (m + 2*buff_size + 1)* &
327+ (k + buff_size *l))
328328 ib_buff_send(r) = ib_markers%sf(j, k + pack_offset, l)
329329 end do
330330 end do
331331 end do
332332 #:else
333333 $:GPU_PARALLEL_LOOP(collapse=3,private=' [r]' )
334- do l = 0, gp_layers - 1
335- do k = -gp_layers , n + gp_layers
336- do j = -gp_layers , m + gp_layers
337- r = ((j + gp_layers ) + (m + 2*gp_layers + 1)* &
338- ((k + gp_layers ) + (n + 2*gp_layers + 1)*l))
334+ do l = 0, buff_size - 1
335+ do k = -buff_size , n + buff_size
336+ do j = -buff_size , m + buff_size
337+ r = ((j + buff_size ) + (m + 2*buff_size + 1)* &
338+ ((k + buff_size ) + (n + 2*buff_size + 1)*l))
339339 ib_buff_send(r) = ib_markers%sf(j, k, l + pack_offset)
340340 end do
341341 end do
@@ -345,12 +345,38 @@ contains
345345 #:endfor
346346 call nvtxEndRange ! Packbuf
347347
348- call nvtxStartRange("IB-MARKER-SENDRECV")
349- call MPI_SENDRECV( &
350- ib_buff_send, buffer_count, MPI_INTEGER, dst_proc, send_tag, &
351- ib_buff_recv, buffer_count, MPI_INTEGER, src_proc, recv_tag, &
352- MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
353- call nvtxEndRange ! RHS-MPI-SENDRECV-(NO)-RDMA
348+ #:for rdma_mpi in [False, True]
349+ if (rdma_mpi .eqv. ${' .true. ' if rdma_mpi else ' .false. ' }$) then
350+ #:if rdma_mpi
351+ #:call GPU_HOST_DATA(use_device=' [ib_buff_send, ib_buff_recv]' )
352+
353+ call nvtxStartRange("IB-MARKER-SENDRECV-RDMA")
354+ call MPI_SENDRECV( &
355+ ib_buff_send, buffer_count, MPI_INTEGER, dst_proc, send_tag, &
356+ ib_buff_recv, buffer_count, MPI_INTEGER, src_proc, recv_tag, &
357+ MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
358+ call nvtxEndRange
359+
360+ #:endcall GPU_HOST_DATA
361+ $:GPU_WAIT()
362+ #:else
363+ call nvtxStartRange("IB-MARKER-DEV2HOST")
364+ $:GPU_UPDATE(host=' [ib_buff_send]' )
365+ call nvtxEndRange
366+
367+ call nvtxStartRange("IB-MARKER-SENDRECV-NO-RMDA")
368+ call MPI_SENDRECV( &
369+ ib_buff_send, buffer_count, MPI_INTEGER, dst_proc, send_tag, &
370+ ib_buff_recv, buffer_count, MPI_INTEGER, src_proc, recv_tag, &
371+ MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
372+ call nvtxEndRange
373+
374+ call nvtxStartRange("IB-MARKER-HOST2DEV")
375+ $:GPU_UPDATE(device=' [ib_buff_recv]' )
376+ call nvtxEndRange
377+ #:endif
378+ end if
379+ #:endfor
354380
355381 ! Unpack Received Buffer
356382 call nvtxStartRange("IB-MARKER-COMM-UNPACKBUF")
@@ -360,32 +386,32 @@ contains
360386 $:GPU_PARALLEL_LOOP(collapse=3,private=' [r]' )
361387 do l = 0, p
362388 do k = 0, n
363- do j = -gp_layers , -1
364- r = (j + gp_layers *((k + 1) + (n + 1)*l))
389+ do j = -buff_size , -1
390+ r = (j + buff_size *((k + 1) + (n + 1)*l))
365391 ib_markers%sf(j + unpack_offset, k, l) = ib_buff_recv(r)
366392 end do
367393 end do
368394 end do
369395 #:elif mpi_dir == 2
370396 $:GPU_PARALLEL_LOOP(collapse=3,private=' [r]' )
371397 do l = 0, p
372- do k = -gp_layers , -1
373- do j = -gp_layers , m + gp_layers
374- r = ((j + gp_layers ) + (m + 2*gp_layers + 1)* &
375- ((k + gp_layers ) + gp_layers *l))
398+ do k = -buff_size , -1
399+ do j = -buff_size , m + buff_size
400+ r = ((j + buff_size ) + (m + 2*buff_size + 1)* &
401+ ((k + buff_size ) + buff_size *l))
376402 ib_markers%sf(j, k + unpack_offset, l) = ib_buff_recv(r)
377403 end do
378404 end do
379405 end do
380406 #:else
381407 ! Unpack received buffer along the 3rd direction (side written is set by unpack_offset)
382408 $:GPU_PARALLEL_LOOP(collapse=3,private=' [r]' )
383- do l = -gp_layers , -1
384- do k = -gp_layers , n + gp_layers
385- do j = -gp_layers , m + gp_layers
386- r = ((j + gp_layers ) + (m + 2*gp_layers + 1)* &
387- ((k + gp_layers ) + (n + 2*gp_layers + 1)* &
388- (l + gp_layers )))
409+ do l = -buff_size , -1
410+ do k = -buff_size , n + buff_size
411+ do j = -buff_size , m + buff_size
412+ r = ((j + buff_size ) + (m + 2*buff_size + 1)* &
413+ ((k + buff_size ) + (n + 2*buff_size + 1)* &
414+ (l + buff_size )))
389415 ib_markers%sf(j, k, l + unpack_offset) = ib_buff_recv(r)
390416 end do
391417 end do
0 commit comments