@@ -137,24 +137,24 @@ contains
137137 if (bc_y%beg >= 0 ) return
138138#if defined(MFC_GPU)
139139
140- $:GPU_PARALLEL_LOOP(private= ' [j,k,l]' , collapse= 3 )
141- do k = 1 , sys_size
142- do j = 0 , m
143- do l = 1 , cmplx_size
144- data_fltr_cmplx_gpu(l + j* cmplx_size + (k - 1 )* cmplx_size* x_size) = (0_dp , 0_dp )
140+ $:GPU_PARALLEL_LOOP(collapse= 3 )
141+ do k = 1 , sys_size
142+ do j = 0 , m
143+ do l = 1 , cmplx_size
144+ data_fltr_cmplx_gpu(l + j* cmplx_size + (k - 1 )* cmplx_size* x_size) = (0_dp , 0_dp )
145+ end do
145146 end do
146147 end do
147- end do
148148 $:END_GPU_PARALLEL_LOOP()
149149
150- $:GPU_PARALLEL_LOOP(private= ' [j,k,l]' , collapse= 3 )
151- do k = 1 , sys_size
152- do j = 0 , m
153- do l = 0 , p
154- data_real_gpu(l + j* real_size + 1 + (k - 1 )* real_size* x_size) = q_cons_vf(k)%sf(j, 0 , l)
150+ $:GPU_PARALLEL_LOOP(collapse= 3 )
151+ do k = 1 , sys_size
152+ do j = 0 , m
153+ do l = 0 , p
154+ data_real_gpu(l + j* real_size + 1 + (k - 1 )* real_size* x_size) = q_cons_vf(k)%sf(j, 0 , l)
155+ end do
155156 end do
156157 end do
157- end do
158158 $:END_GPU_PARALLEL_LOOP()
159159
160160 #:call GPU_HOST_DATA(use_device_addr= ' [data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu]' )
@@ -168,14 +168,14 @@ contains
168168 Nfq = 3
169169 $:GPU_UPDATE(device= ' [Nfq]' )
170170
171- $:GPU_PARALLEL_LOOP(private= ' [j,k,l]' , collapse= 3 )
172- do k = 1 , sys_size
173- do j = 0 , m
174- do l = 1 , Nfq
175- data_fltr_cmplx_gpu(l + j* cmplx_size + (k - 1 )* cmplx_size* x_size) = data_cmplx_gpu(l + j* cmplx_size + (k - 1 )* cmplx_size* x_size)
171+ $:GPU_PARALLEL_LOOP(collapse= 3 )
172+ do k = 1 , sys_size
173+ do j = 0 , m
174+ do l = 1 , Nfq
175+ data_fltr_cmplx_gpu(l + j* cmplx_size + (k - 1 )* cmplx_size* x_size) = data_cmplx_gpu(l + j* cmplx_size + (k - 1 )* cmplx_size* x_size)
176+ end do
176177 end do
177178 end do
178- end do
179179 $:END_GPU_PARALLEL_LOOP()
180180
181181 #:call GPU_HOST_DATA(use_device_addr= ' [data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu]' )
@@ -187,39 +187,71 @@ contains
187187#endif
188188 #:endcall GPU_HOST_DATA
189189
190- $:GPU_PARALLEL_LOOP(private= ' [j,k,l]' , collapse= 3 )
191- do k = 1 , sys_size
192- do j = 0 , m
193- do l = 0 , p
194- data_real_gpu(l + j* real_size + 1 + (k - 1 )* real_size* x_size) = data_real_gpu(l + j* real_size + 1 + (k - 1 )* real_size* x_size)/ real (real_size, dp)
195- q_cons_vf(k)%sf(j, 0 , l) = data_real_gpu(l + j* real_size + 1 + (k - 1 )* real_size* x_size)
190+ $:GPU_PARALLEL_LOOP(collapse= 3 )
191+ do k = 1 , sys_size
192+ do j = 0 , m
193+ do l = 0 , p
194+ data_real_gpu(l + j* real_size + 1 + (k - 1 )* real_size* x_size) = data_real_gpu(l + j* real_size + 1 + (k - 1 )* real_size* x_size)/ real (real_size, dp)
195+ q_cons_vf(k)%sf(j, 0 , l) = data_real_gpu(l + j* real_size + 1 + (k - 1 )* real_size* x_size)
196+ end do
196197 end do
197198 end do
198- end do
199199 $:END_GPU_PARALLEL_LOOP()
200200
201201 do i = 1 , fourier_rings
202202
203- $:GPU_PARALLEL_LOOP(private= ' [j,k,l]' , collapse= 3 )
204- do k = 1 , sys_size
205- do j = 0 , m
206- do l = 1 , cmplx_size
207- data_fltr_cmplx_gpu(l + j* cmplx_size + (k - 1 )* cmplx_size* x_size) = (0_dp , 0_dp )
203+ $:GPU_PARALLEL_LOOP(collapse= 3 )
204+ do k = 1 , sys_size
205+ do j = 0 , m
206+ do l = 1 , cmplx_size
207+ data_fltr_cmplx_gpu(l + j* cmplx_size + (k - 1 )* cmplx_size* x_size) = (0_dp , 0_dp )
208+ end do
208209 end do
210+ end do
211+ $:END_GPU_PARALLEL_LOOP()
209212
213+ $:GPU_PARALLEL_LOOP(collapse= 3 , firstprivate= ' [i]' )
214+ do k = 1 , sys_size
215+ do j = 0 , m
216+ do l = 0 , p
217+ data_real_gpu(l + j* real_size + 1 + (k - 1 )* real_size* x_size) = q_cons_vf(k)%sf(j, i, l)
218+ end do
219+ end do
220+ end do
221+ $:END_GPU_PARALLEL_LOOP()
222+
223+ #:call GPU_HOST_DATA(use_device_addr= ' [data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu]' )
224+ #if defined(__PGI)
225+ ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu)
226+ #else
227+ ierr = hipfftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu)
228+ call hipCheck(hipDeviceSynchronize())
229+ #endif
230+ #:endcall GPU_HOST_DATA
231+
232+ Nfq = min (floor(2_dp * real (i, dp)* pi), cmplx_size)
233+ $:GPU_UPDATE(device= ' [Nfq]' )
234+
235+ $:GPU_PARALLEL_LOOP(collapse= 3 )
236+ do k = 1 , sys_size
237+ do j = 0 , m
238+ do l = 1 , Nfq
239+ data_fltr_cmplx_gpu(l + j* cmplx_size + (k - 1 )* cmplx_size* x_size) = data_cmplx_gpu(l + j* cmplx_size + (k - 1 )* cmplx_size* x_size)
240+ end do
241+ end do
210242 end do
211- $:END_GPU_PARALLEL_LOOP()
243+ $:END_GPU_PARALLEL_LOOP()
212244
213- #:call GPU_HOST_DATA(use_device_addr= ' [data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu]' )
245+ #:call GPU_HOST_DATA(use_device_addr= ' [data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu]' )
214246#if defined(__PGI)
215- ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu)
247+ ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu)
216248#else
217- ierr = hipfftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu)
218- call hipCheck(hipDeviceSynchronize())
249+ ierr = hipfftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu)
250+ call hipCheck(hipDeviceSynchronize())
219251#endif
220- #:endcall GPU_HOST_DATA
252+ #:endcall GPU_HOST_DATA
221253
222- $:GPU_PARALLEL_LOOP(collapse= 3 , firstprivate= ' [i]' )
254+ $:GPU_PARALLEL_LOOP(collapse= 3 , firstprivate= ' [i]' )
223255 do k = 1 , sys_size
224256 do j = 0 , m
225257 do l = 0 , p
@@ -228,66 +260,66 @@ contains
228260 end do
229261 end do
230262 end do
231- $:END_GPU_PARALLEL_LOOP()
232- end do
263+ $:END_GPU_PARALLEL_LOOP()
264+ end do
233265
234266#else
235- Nfq = 3
267+ Nfq = 3
268+ do j = 0 , m
269+ do k = 1 , sys_size
270+ data_fltr_cmplx(:) = (0_dp , 0_dp )
271+ data_real(1 :p + 1 ) = q_cons_vf(k)%sf(j, 0 , 0 :p)
272+ call fftw_execute_dft_r2c(fwd_plan, data_real, data_cmplx)
273+ data_fltr_cmplx(1 :Nfq) = data_cmplx(1 :Nfq)
274+ call fftw_execute_dft_c2r(bwd_plan, data_fltr_cmplx, data_real)
275+ data_real(:) = data_real(:)/ real (real_size, dp)
276+ q_cons_vf(k)%sf(j, 0 , 0 :p) = data_real(1 :p + 1 )
277+ end do
278+ end do
279+
280+ ! Apply Fourier filter to additional rings
281+ do i = 1 , fourier_rings
282+ Nfq = min (floor(2_dp * real (i, dp)* pi), cmplx_size)
236283 do j = 0 , m
237284 do k = 1 , sys_size
238285 data_fltr_cmplx(:) = (0_dp , 0_dp )
239- data_real(1 :p + 1 ) = q_cons_vf(k)%sf(j, 0 , 0 :p)
286+ data_real(1 :p + 1 ) = q_cons_vf(k)%sf(j, i , 0 :p)
240287 call fftw_execute_dft_r2c(fwd_plan, data_real, data_cmplx)
241288 data_fltr_cmplx(1 :Nfq) = data_cmplx(1 :Nfq)
242289 call fftw_execute_dft_c2r(bwd_plan, data_fltr_cmplx, data_real)
243290 data_real(:) = data_real(:)/ real (real_size, dp)
244- q_cons_vf(k)%sf(j, 0 , 0 :p) = data_real(1 :p + 1 )
245- end do
246- end do
247-
248- ! Apply Fourier filter to additional rings
249- do i = 1 , fourier_rings
250- Nfq = min (floor(2_dp * real (i, dp)* pi), cmplx_size)
251- do j = 0 , m
252- do k = 1 , sys_size
253- data_fltr_cmplx(:) = (0_dp , 0_dp )
254- data_real(1 :p + 1 ) = q_cons_vf(k)%sf(j, i, 0 :p)
255- call fftw_execute_dft_r2c(fwd_plan, data_real, data_cmplx)
256- data_fltr_cmplx(1 :Nfq) = data_cmplx(1 :Nfq)
257- call fftw_execute_dft_c2r(bwd_plan, data_fltr_cmplx, data_real)
258- data_real(:) = data_real(:)/ real (real_size, dp)
259- q_cons_vf(k)%sf(j, i, 0 :p) = data_real(1 :p + 1 )
260- end do
291+ q_cons_vf(k)%sf(j, i, 0 :p) = data_real(1 :p + 1 )
261292 end do
262293 end do
294+ end do
263295#endif
264296
265- end subroutine s_apply_fourier_filter
297+ end subroutine s_apply_fourier_filter
266298
267- !> The purpose of this subroutine is to destroy the fftw plan
299+ !> The purpose of this subroutine is to destroy the fftw plan
268300 !! that will be used in the forward and backward DFTs when
269301 !! applying the Fourier filter in the azimuthal direction.
270- impure subroutine s_finalize_fftw_module
302+ impure subroutine s_finalize_fftw_module
271303
272304#if defined(MFC_GPU)
273- integer :: ierr !< Generic flag used to identify and report GPU errors
274- @:DEALLOCATE(data_real_gpu, data_fltr_cmplx_gpu, data_cmplx_gpu)
305+ integer :: ierr !< Generic flag used to identify and report GPU errors
306+ @:DEALLOCATE(data_real_gpu, data_fltr_cmplx_gpu, data_cmplx_gpu)
275307#if defined(__PGI)
276308
277- ierr = cufftDestroy(fwd_plan_gpu)
278- ierr = cufftDestroy(bwd_plan_gpu)
309+ ierr = cufftDestroy(fwd_plan_gpu)
310+ ierr = cufftDestroy(bwd_plan_gpu)
279311#else
280- ierr = hipfftDestroy(fwd_plan_gpu)
281- ierr = hipfftDestroy(bwd_plan_gpu)
312+ ierr = hipfftDestroy(fwd_plan_gpu)
313+ ierr = hipfftDestroy(bwd_plan_gpu)
282314#endif
283315#else
284- call fftw_free(fftw_real_data)
285- call fftw_free(fftw_cmplx_data)
286- call fftw_free(fftw_fltr_cmplx_data)
316+ call fftw_free(fftw_real_data)
317+ call fftw_free(fftw_cmplx_data)
318+ call fftw_free(fftw_fltr_cmplx_data)
287319
288- call fftw_destroy_plan(fwd_plan)
289- call fftw_destroy_plan(bwd_plan)
320+ call fftw_destroy_plan(fwd_plan)
321+ call fftw_destroy_plan(bwd_plan)
290322#endif
291323
292- end subroutine s_finalize_fftw_module
293- end module m_fftw
324+ end subroutine s_finalize_fftw_module
325+ end module m_fftw
0 commit comments