Skip to content

Commit 90639b2

Browse files
refresh FFT code to master based on comment
1 parent 07d558b commit 90639b2

File tree

1 file changed

+106
-74
lines changed

1 file changed

+106
-74
lines changed

src/simulation/m_fftw.fpp

Lines changed: 106 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -137,24 +137,24 @@ contains
137137
if (bc_y%beg >= 0) return
138138
#if defined(MFC_GPU)
139139

140-
$:GPU_PARALLEL_LOOP(private='[j,k,l]', collapse=3)
141-
do k = 1, sys_size
142-
do j = 0, m
143-
do l = 1, cmplx_size
144-
data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = (0_dp, 0_dp)
140+
$:GPU_PARALLEL_LOOP(collapse=3)
141+
do k = 1, sys_size
142+
do j = 0, m
143+
do l = 1, cmplx_size
144+
data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = (0_dp, 0_dp)
145+
end do
145146
end do
146147
end do
147-
end do
148148
$:END_GPU_PARALLEL_LOOP()
149149

150-
$:GPU_PARALLEL_LOOP(private='[j,k,l]', collapse=3)
151-
do k = 1, sys_size
152-
do j = 0, m
153-
do l = 0, p
154-
data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = q_cons_vf(k)%sf(j, 0, l)
150+
$:GPU_PARALLEL_LOOP(collapse=3)
151+
do k = 1, sys_size
152+
do j = 0, m
153+
do l = 0, p
154+
data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = q_cons_vf(k)%sf(j, 0, l)
155+
end do
155156
end do
156157
end do
157-
end do
158158
$:END_GPU_PARALLEL_LOOP()
159159

160160
#:call GPU_HOST_DATA(use_device_addr='[data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu]')
@@ -168,14 +168,14 @@ contains
168168
Nfq = 3
169169
$:GPU_UPDATE(device='[Nfq]')
170170

171-
$:GPU_PARALLEL_LOOP(private='[j,k,l]', collapse=3)
172-
do k = 1, sys_size
173-
do j = 0, m
174-
do l = 1, Nfq
175-
data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = data_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size)
171+
$:GPU_PARALLEL_LOOP(collapse=3)
172+
do k = 1, sys_size
173+
do j = 0, m
174+
do l = 1, Nfq
175+
data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = data_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size)
176+
end do
176177
end do
177178
end do
178-
end do
179179
$:END_GPU_PARALLEL_LOOP()
180180

181181
#:call GPU_HOST_DATA(use_device_addr='[data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu]')
@@ -187,39 +187,71 @@ contains
187187
#endif
188188
#:endcall GPU_HOST_DATA
189189

190-
$:GPU_PARALLEL_LOOP(private='[j,k,l]', collapse=3)
191-
do k = 1, sys_size
192-
do j = 0, m
193-
do l = 0, p
194-
data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size)/real(real_size, dp)
195-
q_cons_vf(k)%sf(j, 0, l) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size)
190+
$:GPU_PARALLEL_LOOP(collapse=3)
191+
do k = 1, sys_size
192+
do j = 0, m
193+
do l = 0, p
194+
data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size)/real(real_size, dp)
195+
q_cons_vf(k)%sf(j, 0, l) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size)
196+
end do
196197
end do
197198
end do
198-
end do
199199
$:END_GPU_PARALLEL_LOOP()
200200

201201
do i = 1, fourier_rings
202202

203-
$:GPU_PARALLEL_LOOP(private='[j,k,l]', collapse=3)
204-
do k = 1, sys_size
205-
do j = 0, m
206-
do l = 1, cmplx_size
207-
data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = (0_dp, 0_dp)
203+
$:GPU_PARALLEL_LOOP(collapse=3)
204+
do k = 1, sys_size
205+
do j = 0, m
206+
do l = 1, cmplx_size
207+
data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = (0_dp, 0_dp)
208+
end do
208209
end do
210+
end do
211+
$:END_GPU_PARALLEL_LOOP()
209212

213+
$:GPU_PARALLEL_LOOP(collapse=3, firstprivate='[i]')
214+
do k = 1, sys_size
215+
do j = 0, m
216+
do l = 0, p
217+
data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = q_cons_vf(k)%sf(j, i, l)
218+
end do
219+
end do
220+
end do
221+
$:END_GPU_PARALLEL_LOOP()
222+
223+
#:call GPU_HOST_DATA(use_device_addr='[data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu]')
224+
#if defined(__PGI)
225+
ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu)
226+
#else
227+
ierr = hipfftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu)
228+
call hipCheck(hipDeviceSynchronize())
229+
#endif
230+
#:endcall GPU_HOST_DATA
231+
232+
Nfq = min(floor(2_dp*real(i, dp)*pi), cmplx_size)
233+
$:GPU_UPDATE(device='[Nfq]')
234+
235+
$:GPU_PARALLEL_LOOP(collapse=3)
236+
do k = 1, sys_size
237+
do j = 0, m
238+
do l = 1, Nfq
239+
data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = data_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size)
240+
end do
241+
end do
210242
end do
211-
$:END_GPU_PARALLEL_LOOP()
243+
$:END_GPU_PARALLEL_LOOP()
212244

213-
#:call GPU_HOST_DATA(use_device_addr='[data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu]')
245+
#:call GPU_HOST_DATA(use_device_addr='[data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu]')
214246
#if defined(__PGI)
215-
ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu)
247+
ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu)
216248
#else
217-
ierr = hipfftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu)
218-
call hipCheck(hipDeviceSynchronize())
249+
ierr = hipfftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu)
250+
call hipCheck(hipDeviceSynchronize())
219251
#endif
220-
#:endcall GPU_HOST_DATA
252+
#:endcall GPU_HOST_DATA
221253

222-
$:GPU_PARALLEL_LOOP(collapse=3, firstprivate='[i]')
254+
$:GPU_PARALLEL_LOOP(collapse=3, firstprivate='[i]')
223255
do k = 1, sys_size
224256
do j = 0, m
225257
do l = 0, p
@@ -228,66 +260,66 @@ contains
228260
end do
229261
end do
230262
end do
231-
$:END_GPU_PARALLEL_LOOP()
232-
end do
263+
$:END_GPU_PARALLEL_LOOP()
264+
end do
233265

234266
#else
235-
Nfq = 3
267+
Nfq = 3
268+
do j = 0, m
269+
do k = 1, sys_size
270+
data_fltr_cmplx(:) = (0_dp, 0_dp)
271+
data_real(1:p + 1) = q_cons_vf(k)%sf(j, 0, 0:p)
272+
call fftw_execute_dft_r2c(fwd_plan, data_real, data_cmplx)
273+
data_fltr_cmplx(1:Nfq) = data_cmplx(1:Nfq)
274+
call fftw_execute_dft_c2r(bwd_plan, data_fltr_cmplx, data_real)
275+
data_real(:) = data_real(:)/real(real_size, dp)
276+
q_cons_vf(k)%sf(j, 0, 0:p) = data_real(1:p + 1)
277+
end do
278+
end do
279+
280+
! Apply Fourier filter to additional rings
281+
do i = 1, fourier_rings
282+
Nfq = min(floor(2_dp*real(i, dp)*pi), cmplx_size)
236283
do j = 0, m
237284
do k = 1, sys_size
238285
data_fltr_cmplx(:) = (0_dp, 0_dp)
239-
data_real(1:p + 1) = q_cons_vf(k)%sf(j, 0, 0:p)
286+
data_real(1:p + 1) = q_cons_vf(k)%sf(j, i, 0:p)
240287
call fftw_execute_dft_r2c(fwd_plan, data_real, data_cmplx)
241288
data_fltr_cmplx(1:Nfq) = data_cmplx(1:Nfq)
242289
call fftw_execute_dft_c2r(bwd_plan, data_fltr_cmplx, data_real)
243290
data_real(:) = data_real(:)/real(real_size, dp)
244-
q_cons_vf(k)%sf(j, 0, 0:p) = data_real(1:p + 1)
245-
end do
246-
end do
247-
248-
! Apply Fourier filter to additional rings
249-
do i = 1, fourier_rings
250-
Nfq = min(floor(2_dp*real(i, dp)*pi), cmplx_size)
251-
do j = 0, m
252-
do k = 1, sys_size
253-
data_fltr_cmplx(:) = (0_dp, 0_dp)
254-
data_real(1:p + 1) = q_cons_vf(k)%sf(j, i, 0:p)
255-
call fftw_execute_dft_r2c(fwd_plan, data_real, data_cmplx)
256-
data_fltr_cmplx(1:Nfq) = data_cmplx(1:Nfq)
257-
call fftw_execute_dft_c2r(bwd_plan, data_fltr_cmplx, data_real)
258-
data_real(:) = data_real(:)/real(real_size, dp)
259-
q_cons_vf(k)%sf(j, i, 0:p) = data_real(1:p + 1)
260-
end do
291+
q_cons_vf(k)%sf(j, i, 0:p) = data_real(1:p + 1)
261292
end do
262293
end do
294+
end do
263295
#endif
264296

265-
end subroutine s_apply_fourier_filter
297+
end subroutine s_apply_fourier_filter
266298

267-
!> The purpose of this subroutine is to destroy the fftw plan
299+
!> The purpose of this subroutine is to destroy the fftw plan
268300
!! that will be used in the forward and backward DFTs when
269301
!! applying the Fourier filter in the azimuthal direction.
270-
impure subroutine s_finalize_fftw_module
302+
impure subroutine s_finalize_fftw_module
271303

272304
#if defined(MFC_GPU)
273-
integer :: ierr !< Generic flag used to identify and report GPU errors
274-
@:DEALLOCATE(data_real_gpu, data_fltr_cmplx_gpu, data_cmplx_gpu)
305+
integer :: ierr !< Generic flag used to identify and report GPU errors
306+
@:DEALLOCATE(data_real_gpu, data_fltr_cmplx_gpu, data_cmplx_gpu)
275307
#if defined(__PGI)
276308

277-
ierr = cufftDestroy(fwd_plan_gpu)
278-
ierr = cufftDestroy(bwd_plan_gpu)
309+
ierr = cufftDestroy(fwd_plan_gpu)
310+
ierr = cufftDestroy(bwd_plan_gpu)
279311
#else
280-
ierr = hipfftDestroy(fwd_plan_gpu)
281-
ierr = hipfftDestroy(bwd_plan_gpu)
312+
ierr = hipfftDestroy(fwd_plan_gpu)
313+
ierr = hipfftDestroy(bwd_plan_gpu)
282314
#endif
283315
#else
284-
call fftw_free(fftw_real_data)
285-
call fftw_free(fftw_cmplx_data)
286-
call fftw_free(fftw_fltr_cmplx_data)
316+
call fftw_free(fftw_real_data)
317+
call fftw_free(fftw_cmplx_data)
318+
call fftw_free(fftw_fltr_cmplx_data)
287319

288-
call fftw_destroy_plan(fwd_plan)
289-
call fftw_destroy_plan(bwd_plan)
320+
call fftw_destroy_plan(fwd_plan)
321+
call fftw_destroy_plan(bwd_plan)
290322
#endif
291323

292-
end subroutine s_finalize_fftw_module
293-
end module m_fftw
324+
end subroutine s_finalize_fftw_module
325+
end module m_fftw

0 commit comments

Comments
 (0)