@@ -63,17 +63,17 @@ contains
6363 if (ib) then
6464 if (n > 0) then
6565 if (p > 0) then
66- i_halo_size = -1 + gp_layers * &
67- & (m + 2*gp_layers + 1)* &
68- & (n + 2*gp_layers + 1)* &
69- & (p + 2*gp_layers + 1)/ &
70- & (cells_bounds%mnp_min + 2*gp_layers + 1)
66+ i_halo_size = -1 + buff_size * &
67+ & (m + 2*buff_size + 1)* &
68+ & (n + 2*buff_size + 1)* &
69+ & (p + 2*buff_size + 1)/ &
70+ & (cells_bounds%mnp_min + 2*buff_size + 1)
7171 else
72- i_halo_size = -1 + gp_layers * &
73- & (cells_bounds%mn_max + 2*gp_layers + 1)
72+ i_halo_size = -1 + buff_size * &
73+ & (cells_bounds%mn_max + 2*buff_size + 1)
7474 end if
7575 else
76- i_halo_size = -1 + gp_layers
76+ i_halo_size = -1 + buff_size
7777 end if
7878
7979 $:GPU_UPDATE(device=' [i_halo_size]' )
@@ -313,9 +313,9 @@ contains
313313 call nvtxStartRange("IB-MARKER-COMM-PACKBUF")
314314
315315 buffer_counts = (/ &
316- gp_layers *(n + 1)*(p + 1), &
317- gp_layers *(m + 2*gp_layers + 1)*(p + 1), &
318- gp_layers *(m + 2*gp_layers + 1)*(n + 2*gp_layers + 1) &
316+ buff_size *(n + 1)*(p + 1), &
317+ buff_size *(m + 2*buff_size + 1)*(p + 1), &
318+ buff_size *(m + 2*buff_size + 1)*(n + 2*buff_size + 1) &
319319 /)
320320
321321 buffer_count = buffer_counts(mpi_dir)
@@ -340,12 +340,12 @@ contains
340340
341341 pack_offset = 0
342342 if (f_xor(pbc_loc == 1, beg_end_geq_0)) then
343- pack_offset = grid_dims(mpi_dir) - gp_layers + 1
343+ pack_offset = grid_dims(mpi_dir) - buff_size + 1
344344 end if
345345
346346 unpack_offset = 0
347347 if (pbc_loc == 1) then
348- unpack_offset = grid_dims(mpi_dir) + gp_layers + 1
348+ unpack_offset = grid_dims(mpi_dir) + buff_size + 1
349349 end if
350350
351351 ! Pack Buffer to Send
@@ -355,30 +355,30 @@ contains
355355 $:GPU_PARALLEL_LOOP(collapse=3,private=' [r]' )
356356 do l = 0, p
357357 do k = 0, n
358- do j = 0, gp_layers - 1
359- r = (j + gp_layers *(k + (n + 1)*l))
358+ do j = 0, buff_size - 1
359+ r = (j + buff_size *(k + (n + 1)*l))
360360 ib_buff_send(r) = ib_markers%sf(j + pack_offset, k, l)
361361 end do
362362 end do
363363 end do
364364 #:elif mpi_dir == 2
365365 $:GPU_PARALLEL_LOOP(collapse=3,private=' [r]' )
366366 do l = 0, p
367- do k = 0, gp_layers - 1
368- do j = -gp_layers , m + gp_layers
369- r = ((j + gp_layers ) + (m + 2*gp_layers + 1)* &
370- (k + gp_layers *l))
367+ do k = 0, buff_size - 1
368+ do j = -buff_size , m + buff_size
369+ r = ((j + buff_size ) + (m + 2*buff_size + 1)* &
370+ (k + buff_size *l))
371371 ib_buff_send(r) = ib_markers%sf(j, k + pack_offset, l)
372372 end do
373373 end do
374374 end do
375375 #:else
376376 $:GPU_PARALLEL_LOOP(collapse=3,private=' [r]' )
377- do l = 0, gp_layers - 1
378- do k = -gp_layers , n + gp_layers
379- do j = -gp_layers , m + gp_layers
380- r = ((j + gp_layers ) + (m + 2*gp_layers + 1)* &
381- ((k + gp_layers ) + (n + 2*gp_layers + 1)*l))
377+ do l = 0, buff_size - 1
378+ do k = -buff_size , n + buff_size
379+ do j = -buff_size , m + buff_size
380+ r = ((j + buff_size ) + (m + 2*buff_size + 1)* &
381+ ((k + buff_size ) + (n + 2*buff_size + 1)*l))
382382 ib_buff_send(r) = ib_markers%sf(j, k, l + pack_offset)
383383 end do
384384 end do
@@ -388,12 +388,38 @@ contains
388388 #:endfor
389389 call nvtxEndRange ! Packbuf
390390
391- call nvtxStartRange("IB-MARKER-SENDRECV")
392- call MPI_SENDRECV( &
393- ib_buff_send, buffer_count, MPI_INTEGER, dst_proc, send_tag, &
394- ib_buff_recv, buffer_count, MPI_INTEGER, src_proc, recv_tag, &
395- MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
396- call nvtxEndRange ! RHS-MPI-SENDRECV-(NO)-RDMA
391+ #:for rdma_mpi in [False, True]
392+ if (rdma_mpi .eqv. ${' .true. ' if rdma_mpi else ' .false. ' }$) then
393+ #:if rdma_mpi
394+ #:call GPU_HOST_DATA(use_device=' [ib_buff_send, ib_buff_recv]' )
395+
396+ call nvtxStartRange("IB-MARKER-SENDRECV-RDMA")
397+ call MPI_SENDRECV( &
398+ ib_buff_send, buffer_count, MPI_INTEGER, dst_proc, send_tag, &
399+ ib_buff_recv, buffer_count, MPI_INTEGER, src_proc, recv_tag, &
400+ MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
401+ call nvtxEndRange
402+
403+ #:endcall GPU_HOST_DATA
404+ $:GPU_WAIT()
405+ #:else
406+ call nvtxStartRange("IB-MARKER-DEV2HOST")
407+ $:GPU_UPDATE(host=' [ib_buff_send]' )
408+ call nvtxEndRange
409+
410+ call nvtxStartRange("IB-MARKER-SENDRECV-NO-RDMA")
411+ call MPI_SENDRECV( &
412+ ib_buff_send, buffer_count, MPI_INTEGER, dst_proc, send_tag, &
413+ ib_buff_recv, buffer_count, MPI_INTEGER, src_proc, recv_tag, &
414+ MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
415+ call nvtxEndRange
416+
417+ call nvtxStartRange("IB-MARKER-HOST2DEV")
418+ $:GPU_UPDATE(device=' [ib_buff_recv]' )
419+ call nvtxEndRange
420+ #:endif
421+ end if
422+ #:endfor
397423
398424 ! Unpack Received Buffer
399425 call nvtxStartRange("IB-MARKER-COMM-UNPACKBUF")
@@ -403,32 +429,32 @@ contains
403429 $:GPU_PARALLEL_LOOP(collapse=3,private=' [r]' )
404430 do l = 0, p
405431 do k = 0, n
406- do j = -gp_layers , -1
407- r = (j + gp_layers *((k + 1) + (n + 1)*l))
432+ do j = -buff_size , -1
433+ r = (j + buff_size *((k + 1) + (n + 1)*l))
408434 ib_markers%sf(j + unpack_offset, k, l) = ib_buff_recv(r)
409435 end do
410436 end do
411437 end do
412438 #:elif mpi_dir == 2
413439 $:GPU_PARALLEL_LOOP(collapse=3,private=' [r]' )
414440 do l = 0, p
415- do k = -gp_layers , -1
416- do j = -gp_layers , m + gp_layers
417- r = ((j + gp_layers ) + (m + 2*gp_layers + 1)* &
418- ((k + gp_layers ) + gp_layers *l))
441+ do k = -buff_size , -1
442+ do j = -buff_size , m + buff_size
443+ r = ((j + buff_size ) + (m + 2*buff_size + 1)* &
444+ ((k + buff_size ) + buff_size *l))
419445 ib_markers%sf(j, k + unpack_offset, l) = ib_buff_recv(r)
420446 end do
421447 end do
422448 end do
423449 #:else
424450 ! Unpacking buffer from bc_z%beg
425451 $:GPU_PARALLEL_LOOP(collapse=3,private=' [r]' )
426- do l = -gp_layers , -1
427- do k = -gp_layers , n + gp_layers
428- do j = -gp_layers , m + gp_layers
429- r = ((j + gp_layers ) + (m + 2*gp_layers + 1)* &
430- ((k + gp_layers ) + (n + 2*gp_layers + 1)* &
431- (l + gp_layers )))
452+ do l = -buff_size , -1
453+ do k = -buff_size , n + buff_size
454+ do j = -buff_size , m + buff_size
455+ r = ((j + buff_size ) + (m + 2*buff_size + 1)* &
456+ ((k + buff_size ) + (n + 2*buff_size + 1)* &
457+ (l + buff_size )))
432458 ib_markers%sf(j, k, l + unpack_offset) = ib_buff_recv(r)
433459 end do
434460 end do
0 commit comments