@@ -161,119 +161,119 @@ contains
161161 end do
162162 #:endcall GPU_PARALLEL_LOOP
163163
164- #:if not USING_NVHPC
165- p_real = > data_real_gpu
166- p_cmplx = > data_cmplx_gpu
167- p_fltr_cmplx = > data_fltr_cmplx_gpu
168- #:endif
164+ #:if not USING_NVHPC
165+ p_real = > data_real_gpu
166+ p_cmplx = > data_cmplx_gpu
167+ p_fltr_cmplx = > data_fltr_cmplx_gpu
168+ #:endif
169169
170170 #:call GPU_DATA(attach= ' [p_real, p_cmplx, p_fltr_cmplx]' )
171171 #:call GPU_HOST_DATA(use_device_ptr= ' [p_real, p_cmplx, p_fltr_cmplx]' )
172172#if defined(__PGI)
173- ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu)
173+ ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu)
174174#else
175175 ierr = hipfftExecD2Z(fwd_plan_gpu, c_loc(p_real), c_loc(p_cmplx))
176176 call hipCheck(hipDeviceSynchronize())
177177#endif
178178 #:endcall GPU_HOST_DATA
179- Nfq = 3
180- $:GPU_UPDATE(device= ' [Nfq]' )
179+ Nfq = 3
180+ $:GPU_UPDATE(device= ' [Nfq]' )
181181
182- #:call GPU_PARALLEL_LOOP(collapse= 3 )
183- do k = 1 , sys_size
184- do j = 0 , m
185- do l = 1 , Nfq
186- data_fltr_cmplx_gpu(l + j* cmplx_size + (k - 1 )* cmplx_size* x_size) = data_cmplx_gpu(l + j* cmplx_size + (k - 1 )* cmplx_size* x_size)
182+ #:call GPU_PARALLEL_LOOP(collapse= 3 )
183+ do k = 1 , sys_size
184+ do j = 0 , m
185+ do l = 1 , Nfq
186+ data_fltr_cmplx_gpu(l + j* cmplx_size + (k - 1 )* cmplx_size* x_size) = data_cmplx_gpu(l + j* cmplx_size + (k - 1 )* cmplx_size* x_size)
187+ end do
187188 end do
188189 end do
189- end do
190- #:endcall GPU_PARALLEL_LOOP
190+ #:endcall GPU_PARALLEL_LOOP
191191
192192 #:call GPU_HOST_DATA(use_device_ptr= ' [p_real, p_fltr_cmplx]' )
193193#if defined(__PGI)
194- ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu)
194+ ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu)
195195#else
196196 ierr = hipfftExecZ2D(bwd_plan_gpu, c_loc(p_fltr_cmplx), c_loc(p_real))
197197 call hipCheck(hipDeviceSynchronize())
198198#endif
199199 #:endcall GPU_HOST_DATA
200200
201- #:call GPU_PARALLEL_LOOP(collapse= 3 )
202- do k = 1 , sys_size
203- do j = 0 , m
204- do l = 0 , p
205- data_real_gpu(l + j* real_size + 1 + (k - 1 )* real_size* x_size) = data_real_gpu(l + j* real_size + 1 + (k - 1 )* real_size* x_size)/ real (real_size, dp)
206- q_cons_vf(k)%sf(j, 0 , l) = data_real_gpu(l + j* real_size + 1 + (k - 1 )* real_size* x_size)
207- end do
208- end do
209- end do
210- #:endcall GPU_PARALLEL_LOOP
211-
212- do i = 1 , fourier_rings
213-
214201 #:call GPU_PARALLEL_LOOP(collapse= 3 )
215202 do k = 1 , sys_size
216203 do j = 0 , m
217- do l = 1 , cmplx_size
218- data_fltr_cmplx_gpu(l + j* cmplx_size + (k - 1 )* cmplx_size* x_size) = (0_dp , 0_dp )
204+ do l = 0 , p
205+ data_real_gpu(l + j* real_size + 1 + (k - 1 )* real_size* x_size) = data_real_gpu(l + j* real_size + 1 + (k - 1 )* real_size* x_size)/ real (real_size, dp)
206+ q_cons_vf(k)%sf(j, 0 , l) = data_real_gpu(l + j* real_size + 1 + (k - 1 )* real_size* x_size)
219207 end do
220208 end do
221209 end do
222210 #:endcall GPU_PARALLEL_LOOP
223211
224- #:call GPU_PARALLEL_LOOP(collapse= 3 , firstprivate= ' [i]' )
225- do k = 1 , sys_size
226- do j = 0 , m
227- do l = 0 , p
228- data_real_gpu(l + j* real_size + 1 + (k - 1 )* real_size* x_size) = q_cons_vf(k)%sf(j, i, l)
212+ do i = 1 , fourier_rings
213+
214+ #:call GPU_PARALLEL_LOOP(collapse= 3 )
215+ do k = 1 , sys_size
216+ do j = 0 , m
217+ do l = 1 , cmplx_size
218+ data_fltr_cmplx_gpu(l + j* cmplx_size + (k - 1 )* cmplx_size* x_size) = (0_dp , 0_dp )
219+ end do
229220 end do
230221 end do
231- end do
232- #:endcall GPU_PARALLEL_LOOP
222+ #:endcall GPU_PARALLEL_LOOP
223+
224+ #:call GPU_PARALLEL_LOOP(collapse= 3 , firstprivate= ' [i]' )
225+ do k = 1 , sys_size
226+ do j = 0 , m
227+ do l = 0 , p
228+ data_real_gpu(l + j* real_size + 1 + (k - 1 )* real_size* x_size) = q_cons_vf(k)%sf(j, i, l)
229+ end do
230+ end do
231+ end do
232+ #:endcall GPU_PARALLEL_LOOP
233233
234234 #:call GPU_HOST_DATA(use_device_ptr= ' [p_real, p_cmplx]' )
235235#if defined(__PGI)
236- ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu)
236+ ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu)
237237#else
238238 ierr = hipfftExecD2Z(fwd_plan_gpu, c_loc(p_real), c_loc(p_cmplx))
239239 call hipCheck(hipDeviceSynchronize())
240240#endif
241241 #:endcall GPU_HOST_DATA
242242
243- Nfq = min (floor(2_dp * real (i, dp)* pi), cmplx_size)
244- $:GPU_UPDATE(device= ' [Nfq]' )
243+ Nfq = min (floor(2_dp * real (i, dp)* pi), cmplx_size)
244+ $:GPU_UPDATE(device= ' [Nfq]' )
245245
246- #:call GPU_PARALLEL_LOOP(collapse= 3 )
247- do k = 1 , sys_size
248- do j = 0 , m
249- do l = 1 , Nfq
250- data_fltr_cmplx_gpu(l + j* cmplx_size + (k - 1 )* cmplx_size* x_size) = data_cmplx_gpu(l + j* cmplx_size + (k - 1 )* cmplx_size* x_size)
246+ #:call GPU_PARALLEL_LOOP(collapse= 3 )
247+ do k = 1 , sys_size
248+ do j = 0 , m
249+ do l = 1 , Nfq
250+ data_fltr_cmplx_gpu(l + j* cmplx_size + (k - 1 )* cmplx_size* x_size) = data_cmplx_gpu(l + j* cmplx_size + (k - 1 )* cmplx_size* x_size)
251+ end do
251252 end do
252253 end do
253- end do
254- #:endcall GPU_PARALLEL_LOOP
254+ #:endcall GPU_PARALLEL_LOOP
255255
256256 #:call GPU_HOST_DATA(use_device_ptr= ' [p_real, p_fltr_cmplx]' )
257257#if defined(__PGI)
258- ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu)
258+ ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu)
259259#else
260260 ierr = hipfftExecZ2D(bwd_plan_gpu, c_loc(p_fltr_cmplx), c_loc(p_real))
261261 call hipCheck(hipDeviceSynchronize())
262262#endif
263263 #:endcall GPU_HOST_DATA
264264
265- #:call GPU_PARALLEL_LOOP(collapse= 3 , firstprivate= ' [i]' )
266- do k = 1 , sys_size
267- do j = 0 , m
268- do l = 0 , p
269- data_real_gpu(l + j* real_size + 1 + (k - 1 )* real_size* x_size) = data_real_gpu(l + j* real_size + 1 + (k - 1 )* real_size* x_size)/ real (real_size, dp)
270- q_cons_vf(k)%sf(j, i, l) = data_real_gpu(l + j* real_size + 1 + (k - 1 )* real_size* x_size)
265+ #:call GPU_PARALLEL_LOOP(collapse= 3 , firstprivate= ' [i]' )
266+ do k = 1 , sys_size
267+ do j = 0 , m
268+ do l = 0 , p
269+ data_real_gpu(l + j* real_size + 1 + (k - 1 )* real_size* x_size) = data_real_gpu(l + j* real_size + 1 + (k - 1 )* real_size* x_size)/ real (real_size, dp)
270+ q_cons_vf(k)%sf(j, i, l) = data_real_gpu(l + j* real_size + 1 + (k - 1 )* real_size* x_size)
271+ end do
271272 end do
272273 end do
273- end do
274- #:endcall GPU_PARALLEL_LOOP
274+ #:endcall GPU_PARALLEL_LOOP
275275
276- end do
276+ end do
277277 #:endcall GPU_DATA
278278
279279#else
0 commit comments