@@ -157,6 +157,68 @@ contains
157157 end do
158158 #:endcall GPU_PARALLEL_LOOP
159159
160+ #:call GPU_HOST_DATA(use_device_addr= ' [data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu]' )
161+ #if defined(__PGI)
162+ ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu)
163+ #else
164+ ierr = hipfftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu)
165+ call hipCheck(hipDeviceSynchronize())
166+ #endif
167+ #:endcall GPU_HOST_DATA
168+ Nfq = 3
169+ $:GPU_UPDATE(device= ' [Nfq]' )
170+
171+ #:call GPU_PARALLEL_LOOP(collapse= 3 )
172+ do k = 1 , sys_size
173+ do j = 0 , m
174+ do l = 1 , Nfq
175+ data_fltr_cmplx_gpu(l + j* cmplx_size + (k - 1 )* cmplx_size* x_size) = data_cmplx_gpu(l + j* cmplx_size + (k - 1 )* cmplx_size* x_size)
176+ end do
177+ end do
178+ end do
179+ #:endcall GPU_PARALLEL_LOOP
180+
181+ #:call GPU_HOST_DATA(use_device_addr= ' [data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu]' )
182+ #if defined(__PGI)
183+ ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu)
184+ #else
185+ ierr = hipfftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu)
186+ call hipCheck(hipDeviceSynchronize())
187+ #endif
188+ #:endcall GPU_HOST_DATA
189+
190+ #:call GPU_PARALLEL_LOOP(collapse= 3 )
191+ do k = 1 , sys_size
192+ do j = 0 , m
193+ do l = 0 , p
194+ data_real_gpu(l + j* real_size + 1 + (k - 1 )* real_size* x_size) = data_real_gpu(l + j* real_size + 1 + (k - 1 )* real_size* x_size)/ real (real_size, dp)
195+ q_cons_vf(k)%sf(j, 0 , l) = data_real_gpu(l + j* real_size + 1 + (k - 1 )* real_size* x_size)
196+ end do
197+ end do
198+ end do
199+ #:endcall GPU_PARALLEL_LOOP
200+
201+ do i = 1 , fourier_rings
202+
203+ #:call GPU_PARALLEL_LOOP(collapse= 3 )
204+ do k = 1 , sys_size
205+ do j = 0 , m
206+ do l = 1 , cmplx_size
207+ data_fltr_cmplx_gpu(l + j* cmplx_size + (k - 1 )* cmplx_size* x_size) = (0_dp , 0_dp )
208+ end do
209+ end do
210+ end do
211+ #:endcall GPU_PARALLEL_LOOP
212+
213+ #:call GPU_PARALLEL_LOOP(collapse= 3 , firstprivate= ' [i]' )
214+ do k = 1 , sys_size
215+ do j = 0 , m
216+ do l = 0 , p
217+ data_real_gpu(l + j* real_size + 1 + (k - 1 )* real_size* x_size) = q_cons_vf(k)%sf(j, i, l)
218+ end do
219+ end do
220+ end do
221+ #:endcall GPU_PARALLEL_LOOP
160222
161223 #:call GPU_HOST_DATA(use_device_addr= ' [data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu]' )
162224#if defined(__PGI)
@@ -166,7 +228,8 @@ contains
166228 call hipCheck(hipDeviceSynchronize())
167229#endif
168230 #:endcall GPU_HOST_DATA
169- Nfq = 3
231+
232+ Nfq = min (floor(2_dp * real (i, dp)* pi), cmplx_size)
170233 $:GPU_UPDATE(device= ' [Nfq]' )
171234
172235 #:call GPU_PARALLEL_LOOP(collapse= 3 )
@@ -188,81 +251,17 @@ contains
188251#endif
189252 #:endcall GPU_HOST_DATA
190253
191- #:call GPU_PARALLEL_LOOP(collapse= 3 )
254+ #:call GPU_PARALLEL_LOOP(collapse= 3 , firstprivate = ' [i] ' )
192255 do k = 1 , sys_size
193256 do j = 0 , m
194257 do l = 0 , p
195258 data_real_gpu(l + j* real_size + 1 + (k - 1 )* real_size* x_size) = data_real_gpu(l + j* real_size + 1 + (k - 1 )* real_size* x_size)/ real (real_size, dp)
196- q_cons_vf(k)%sf(j, 0 , l) = data_real_gpu(l + j* real_size + 1 + (k - 1 )* real_size* x_size)
259+ q_cons_vf(k)%sf(j, i , l) = data_real_gpu(l + j* real_size + 1 + (k - 1 )* real_size* x_size)
197260 end do
198261 end do
199262 end do
200263 #:endcall GPU_PARALLEL_LOOP
201-
202- do i = 1 , fourier_rings
203-
204- #:call GPU_PARALLEL_LOOP(collapse= 3 )
205- do k = 1 , sys_size
206- do j = 0 , m
207- do l = 1 , cmplx_size
208- data_fltr_cmplx_gpu(l + j* cmplx_size + (k - 1 )* cmplx_size* x_size) = (0_dp , 0_dp )
209- end do
210- end do
211- end do
212- #:endcall GPU_PARALLEL_LOOP
213-
214- #:call GPU_PARALLEL_LOOP(collapse= 3 , firstprivate= ' [i]' )
215- do k = 1 , sys_size
216- do j = 0 , m
217- do l = 0 , p
218- data_real_gpu(l + j* real_size + 1 + (k - 1 )* real_size* x_size) = q_cons_vf(k)%sf(j, i, l)
219- end do
220- end do
221- end do
222- #:endcall GPU_PARALLEL_LOOP
223-
224- #:call GPU_HOST_DATA(use_device_addr= ' [data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu]' )
225- #if defined(__PGI)
226- ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu)
227- #else
228- ierr = hipfftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu)
229- call hipCheck(hipDeviceSynchronize())
230- #endif
231- #:endcall GPU_HOST_DATA
232-
233- Nfq = min (floor(2_dp * real (i, dp)* pi), cmplx_size)
234- $:GPU_UPDATE(device= ' [Nfq]' )
235-
236- #:call GPU_PARALLEL_LOOP(collapse= 3 )
237- do k = 1 , sys_size
238- do j = 0 , m
239- do l = 1 , Nfq
240- data_fltr_cmplx_gpu(l + j* cmplx_size + (k - 1 )* cmplx_size* x_size) = data_cmplx_gpu(l + j* cmplx_size + (k - 1 )* cmplx_size* x_size)
241- end do
242- end do
243- end do
244- #:endcall GPU_PARALLEL_LOOP
245-
246- #:call GPU_HOST_DATA(use_device_addr= ' [data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu]' )
247- #if defined(__PGI)
248- ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu)
249- #else
250- ierr = hipfftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu)
251- call hipCheck(hipDeviceSynchronize())
252- #endif
253- #:endcall GPU_HOST_DATA
254-
255- #:call GPU_PARALLEL_LOOP(collapse= 3 , firstprivate= ' [i]' )
256- do k = 1 , sys_size
257- do j = 0 , m
258- do l = 0 , p
259- data_real_gpu(l + j* real_size + 1 + (k - 1 )* real_size* x_size) = data_real_gpu(l + j* real_size + 1 + (k - 1 )* real_size* x_size)/ real (real_size, dp)
260- q_cons_vf(k)%sf(j, i, l) = data_real_gpu(l + j* real_size + 1 + (k - 1 )* real_size* x_size)
261- end do
262- end do
263- end do
264- #:endcall GPU_PARALLEL_LOOP
265- end do
264+ end do
266265
267266#else
268267 Nfq = 3
0 commit comments