Skip to content

Commit a7ad524

Browse files
committed
fix(cuda): Fix an issue with non-periodic BCs on multiple ranks.
1 parent bdf70ac commit a7ad524

File tree

1 file changed

+22
-7
lines changed

1 file changed

+22
-7
lines changed

src/cuda/kernels_dist.f90

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -158,23 +158,38 @@ attributes(global) subroutine der_univ_subs(du, recv_u_s, recv_u_e, &
158158

159159
! Local variables
160160
integer :: i, j, b
161-
real(dp) :: ur, bl, recp, du_1, du_n
161+
real(dp) :: ur, bl, recp, du_s, du_e
162162

163163
i = threadIdx%x
164164
b = blockIdx%x
165165

166+
! A small trick we do here is valid for symmetric Toeplitz matrices.
167+
! In our case our matrices satisfy this criteria in the (5:n-4) region
168+
! and as long as a rank has around at least 20 entries the assumptions
169+
! we make here are perfectly valid.
170+
171+
! bl is the bottom left entry in the 2x2 matrix
172+
! ur is the upper right entry in the 2x2 matrix
173+
174+
! Start
175+
! At the start we have the 'bl', and assume 'ur'
166176
bl = dist_sa(1)
167-
ur = dist_sc(n)
177+
ur = dist_sa(1)
168178
recp = 1._dp/(1._dp - ur*bl)
179+
du_s = recp*(du(i, 1, b) - bl*recv_u_s(i, 1, b))
169180

170-
du_1 = recp*(du(i, 1, b) - bl*recv_u_s(i, 1, b))
171-
du_n = recp*(du(i, n, b) - ur*recv_u_e(i, 1, b))
181+
! End
182+
! At the end we have the 'ur', and assume 'bl'
183+
bl = dist_sc(n)
184+
ur = dist_sc(n)
185+
recp = 1._dp/(1._dp - ur*bl)
186+
du_e = recp*(du(i, n, b) - ur*recv_u_e(i, 1, b))
172187

173-
du(i, 1, b) = du_1
188+
du(i, 1, b) = du_s
174189
do j = 2, n - 1
175-
du(i, j, b) = (du(i, j, b) - dist_sa(j)*du_1 - dist_sc(j)*du_n)
190+
du(i, j, b) = (du(i, j, b) - dist_sa(j)*du_s - dist_sc(j)*du_e)
176191
end do
177-
du(i, n, b) = du_n
192+
du(i, n, b) = du_e
178193

179194
end subroutine der_univ_subs
180195

0 commit comments

Comments
 (0)