Skip to content

Commit adf8edb

Browse files
committed
feat(cuda/tests): Add tests for all reordering kernels.
1 parent 04883e0 commit adf8edb

File tree

2 files changed

+229
-0
lines changed

2 files changed

+229
-0
lines changed

tests/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ set(TESTSRC
44
)
55
set(CUDATESTSRC
66
cuda/test_cuda_allocator.f90
7+
cuda/test_cuda_reorder.f90
78
cuda/test_cuda_tridiag.f90
89
cuda/test_cuda_transeq.f90
910
)

tests/cuda/test_cuda_reorder.f90

Lines changed: 228 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,228 @@
1+
program test_cuda_reorder
2+
use iso_fortran_env, only: stderr => error_unit
3+
use cudafor
4+
use mpi
5+
6+
use m_common, only: dp
7+
use m_cuda_common, only: SZ
8+
use m_cuda_kernels_reorder, only: reorder_x2y, reorder_x2z, reorder_y2x, &
9+
reorder_y2z, reorder_z2y
10+
11+
implicit none
12+
13+
logical :: allpass = .true.
14+
real(dp), allocatable, dimension(:, :, :) :: u_i, u_o, u_temp
15+
real(dp), device, allocatable, dimension(:, :, :) :: u_i_d, u_o_d, u_temp_d
16+
17+
integer :: n_block, i, n_iters
18+
integer :: nx, ny, nz, ndof
19+
integer :: nrank, nproc, pprev, pnext
20+
integer :: ierr, ndevs, devnum
21+
22+
type(dim3) :: blocks, threads
23+
real(dp) :: norm_u, tol = 1d-8, tstart, tend
24+
25+
call MPI_Init(ierr)
26+
call MPI_Comm_rank(MPI_COMM_WORLD, nrank, ierr)
27+
call MPI_Comm_size(MPI_COMM_WORLD, nproc, ierr)
28+
29+
if (nrank == 0) print*, 'Parallel run with', nproc, 'ranks'
30+
31+
ierr = cudaGetDeviceCount(ndevs)
32+
ierr = cudaSetDevice(mod(nrank, ndevs)) ! round-robin
33+
ierr = cudaGetDevice(devnum)
34+
35+
!print*, 'I am rank', nrank, 'I am running on device', devnum
36+
pnext = modulo(nrank - nproc + 1, nproc)
37+
pprev = modulo(nrank - 1, nproc)
38+
39+
nx = 512; ny = 512; nz = 512
40+
n_block = ny*nz/SZ
41+
ndof = nx*ny*nz
42+
n_iters = 100
43+
44+
allocate (u_i(SZ, nx, n_block), u_o(SZ, nx, n_block))
45+
allocate (u_temp(SZ, nx, n_block))
46+
allocate (u_i_d(SZ, nx, n_block), u_o_d(SZ, nx, n_block))
47+
allocate (u_temp_d(SZ, nx, n_block))
48+
49+
! set a random field
50+
call random_number(u_i)
51+
52+
! move data to device
53+
u_i_d = u_i
54+
55+
! do a x to y reordering first and then a y to x
56+
blocks = dim3(nx/SZ, nz, ny/SZ)
57+
threads = dim3(SZ, SZ, 1)
58+
call reorder_x2y<<<blocks, threads>>>(u_temp_d, u_i_d, nz)
59+
60+
blocks = dim3(nx/SZ, ny/SZ, nz)
61+
threads = dim3(SZ, SZ, 1)
62+
call reorder_y2x<<<blocks, threads>>>(u_o_d, u_temp_d, nz)
63+
64+
! move the result back to host
65+
u_o = u_o_d
66+
67+
! and check whether it matches the initial random field
68+
norm_u = norm2(u_o - u_i)
69+
if (nrank == 0) then
70+
if ( norm_u > tol ) then
71+
allpass = .false.
72+
write(stderr, '(a)') 'Check reorder x2y and y2x... failed'
73+
else
74+
write(stderr, '(a)') 'Check reorder x2y and y2x... passed'
75+
end if
76+
end if
77+
78+
! we reuse u_o_d so zeroize in any case
79+
u_o_d = 0
80+
81+
! u_temp_d is in y orientation, use y2z to reorder it into z direction
82+
blocks = dim3(nx/SZ, ny/SZ, nz)
83+
threads = dim3(SZ, SZ, 1)
84+
call reorder_y2z<<<blocks, threads>>>(u_o_d, u_temp_d, nx, nz)
85+
86+
! store this in host
87+
u_o = u_o_d
88+
89+
! and zeroize u_o_d again in any case
90+
u_o_d = 0
91+
92+
! reorder initial random field into z orientation
93+
blocks = dim3(nx, ny/SZ, 1)
94+
threads = dim3(SZ, 1, 1)
95+
call reorder_x2z<<<blocks, threads>>>(u_o_d, u_i_d, nz)
96+
u_temp = u_o_d
97+
98+
! compare two z oriented fields
99+
norm_u = norm2(u_o - u_temp)
100+
if (nrank == 0) then
101+
if ( norm_u > tol ) then
102+
allpass = .false.
103+
write(stderr, '(a)') 'Check reorder x2z and y2z... failed'
104+
else
105+
write(stderr, '(a)') 'Check reorder x2z and y2z... passed'
106+
end if
107+
end if
108+
109+
! z oriented field into y
110+
blocks = dim3(nx/SZ, ny/SZ, nz)
111+
threads = dim3(SZ, SZ, 1)
112+
call reorder_z2y<<<blocks, threads>>>(u_temp_d, u_o_d, nx, nz)
113+
114+
! zeroize just in case for reusing
115+
u_o_d = 0
116+
117+
blocks = dim3(nx/SZ, ny/SZ, nz)
118+
threads = dim3(SZ, SZ, 1)
119+
call reorder_y2x<<<blocks, threads>>>(u_o_d, u_temp_d, nz)
120+
u_o = u_o_d
121+
122+
! and check whether it matches the initial random field
123+
norm_u = norm2(u_o - u_i)
124+
if (nrank == 0) then
125+
if ( norm_u > tol ) then
126+
allpass = .false.
127+
write(stderr, '(a)') 'Check reorder z2y and y2x... failed'
128+
else
129+
write(stderr, '(a)') 'Check reorder z2y and y2x... passed'
130+
end if
131+
end if
132+
133+
! Now the performance checks
134+
call cpu_time(tstart)
135+
do i = 1, n_iters
136+
blocks = dim3(nx/SZ, nz, ny/SZ)
137+
threads = dim3(SZ, SZ, 1)
138+
call reorder_x2y<<<blocks, threads>>>(u_o_d, u_i_d, nz)
139+
end do
140+
call cpu_time(tend)
141+
142+
call checkperf(tend - tstart, n_iters, ndof, 2._dp)
143+
144+
call cpu_time(tstart)
145+
do i = 1, n_iters
146+
blocks = dim3(nx, ny/SZ, 1)
147+
threads = dim3(SZ, 1, 1)
148+
call reorder_x2z<<<blocks, threads>>>(u_o_d, u_i_d, nz)
149+
end do
150+
call cpu_time(tend)
151+
152+
call checkperf(tend - tstart, n_iters, ndof, 2._dp)
153+
154+
call cpu_time(tstart)
155+
do i = 1, n_iters
156+
blocks = dim3(nx/SZ, ny/SZ, nz)
157+
threads = dim3(SZ, SZ, 1)
158+
call reorder_y2x<<<blocks, threads>>>(u_o_d, u_i_d, nz)
159+
end do
160+
call cpu_time(tend)
161+
162+
call checkperf(tend - tstart, n_iters, ndof, 2._dp)
163+
164+
call cpu_time(tstart)
165+
do i = 1, n_iters
166+
blocks = dim3(nx/SZ, ny/SZ, nz)
167+
threads = dim3(SZ, SZ, 1)
168+
call reorder_y2z<<<blocks, threads>>>(u_o_d, u_i_d, nx, nz)
169+
end do
170+
call cpu_time(tend)
171+
172+
call checkperf(tend - tstart, n_iters, ndof, 2._dp)
173+
174+
call cpu_time(tstart)
175+
do i = 1, n_iters
176+
blocks = dim3(nx/SZ, ny/SZ, nz)
177+
threads = dim3(SZ, SZ, 1)
178+
call reorder_z2y<<<blocks, threads>>>(u_o_d, u_i_d, nx, nz)
179+
end do
180+
call cpu_time(tend)
181+
182+
call checkperf(tend - tstart, n_iters, ndof, 2._dp)
183+
184+
if (allpass) then
185+
if (nrank == 0) write(stderr, '(a)') 'ALL TESTS PASSED SUCCESSFULLY.'
186+
else
187+
error stop 'SOME TESTS FAILED.'
188+
end if
189+
190+
call MPI_Finalize(ierr)
191+
192+
contains
193+
194+
subroutine checkperf(t_tot, n_iters, ndof, consumed_bw)
195+
implicit none
196+
197+
real(dp), intent(in) :: t_tot, consumed_bw
198+
integer, intent(in) :: n_iters, ndof
199+
200+
real(dp) :: achievedBW, devBW, achievedBWmax, achievedBWmin
201+
integer :: ierr, memClockRt, memBusWidth
202+
203+
! BW utilisation and performance checks
204+
achievedBW = consumed_bw*n_iters*ndof*dp/t_tot
205+
call MPI_Allreduce(achievedBW, achievedBWmax, 1, MPI_DOUBLE_PRECISION, &
206+
MPI_MAX, MPI_COMM_WORLD, ierr)
207+
call MPI_Allreduce(achievedBW, achievedBWmin, 1, MPI_DOUBLE_PRECISION, &
208+
MPI_MIN, MPI_COMM_WORLD, ierr)
209+
210+
if (nrank == 0) then
211+
print'(a, f8.3, a)', 'Achieved BW min: ', achievedBWmin/2**30, ' GiB/s'
212+
print'(a, f8.3, a)', 'Achieved BW max: ', achievedBWmax/2**30, ' GiB/s'
213+
end if
214+
215+
ierr = cudaDeviceGetAttribute(memClockRt, cudaDevAttrMemoryClockRate, 0)
216+
ierr = cudaDeviceGetAttribute(memBusWidth, &
217+
cudaDevAttrGlobalMemoryBusWidth, 0)
218+
devBW = 2*memBusWidth/8._dp*memClockRt*1000
219+
220+
if (nrank == 0) then
221+
print'(a, f8.3, a)', 'Device BW: ', devBW/2**30, ' GiB/s'
222+
print'(a, f5.2)', 'Effective BW util min: %', achievedBWmin/devBW*100
223+
print'(a, f5.2)', 'Effective BW util max: %', achievedBWmax/devBW*100
224+
end if
225+
end subroutine checkperf
226+
227+
end program test_cuda_reorder
228+

0 commit comments

Comments
 (0)