@@ -161,117 +161,119 @@ contains
161161 end do
162162 #:endcall GPU_PARALLEL_LOOP
163163
164+ #:if not USING_NVHPC
164165 p_real = > data_real_gpu
165166 p_cmplx = > data_cmplx_gpu
166167 p_fltr_cmplx = > data_fltr_cmplx_gpu
168+ #:endif
167169
168170 #:call GPU_DATA(attach= ' [p_real, p_cmplx, p_fltr_cmplx]' )
169171 #:call GPU_HOST_DATA(use_device_ptr= ' [p_real, p_cmplx, p_fltr_cmplx]' )
170172#if defined(__PGI)
171- ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu)
173+ ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu)
172174#else
173175 ierr = hipfftExecD2Z(fwd_plan_gpu, c_loc(p_real), c_loc(p_cmplx))
174176 call hipCheck(hipDeviceSynchronize())
175177#endif
176178 #:endcall GPU_HOST_DATA
177- Nfq = 3
178- $:GPU_UPDATE(device= ' [Nfq]' )
179+ Nfq = 3
180+ $:GPU_UPDATE(device= ' [Nfq]' )
179181
180- #:call GPU_PARALLEL_LOOP(collapse= 3 )
181- do k = 1 , sys_size
182- do j = 0 , m
183- do l = 1 , Nfq
184- data_fltr_cmplx_gpu(l + j* cmplx_size + (k - 1 )* cmplx_size* x_size) = data_cmplx_gpu(l + j* cmplx_size + (k - 1 )* cmplx_size* x_size)
185- end do
182+ #:call GPU_PARALLEL_LOOP(collapse= 3 )
183+ do k = 1 , sys_size
184+ do j = 0 , m
185+ do l = 1 , Nfq
186+ data_fltr_cmplx_gpu(l + j* cmplx_size + (k - 1 )* cmplx_size* x_size) = data_cmplx_gpu(l + j* cmplx_size + (k - 1 )* cmplx_size* x_size)
186187 end do
187188 end do
188- #:endcall GPU_PARALLEL_LOOP
189+ end do
190+ #:endcall GPU_PARALLEL_LOOP
189191
190192 #:call GPU_HOST_DATA(use_device_ptr= ' [p_real, p_fltr_cmplx]' )
191193#if defined(__PGI)
192- ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu)
194+ ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu)
193195#else
194196 ierr = hipfftExecZ2D(bwd_plan_gpu, c_loc(p_fltr_cmplx), c_loc(p_real))
195197 call hipCheck(hipDeviceSynchronize())
196198#endif
197199 #:endcall GPU_HOST_DATA
198200
201+ #:call GPU_PARALLEL_LOOP(collapse= 3 )
202+ do k = 1 , sys_size
203+ do j = 0 , m
204+ do l = 0 , p
205+ data_real_gpu(l + j* real_size + 1 + (k - 1 )* real_size* x_size) = data_real_gpu(l + j* real_size + 1 + (k - 1 )* real_size* x_size)/ real (real_size, dp)
206+ q_cons_vf(k)%sf(j, 0 , l) = data_real_gpu(l + j* real_size + 1 + (k - 1 )* real_size* x_size)
207+ end do
208+ end do
209+ end do
210+ #:endcall GPU_PARALLEL_LOOP
211+
212+ do i = 1 , fourier_rings
213+
199214 #:call GPU_PARALLEL_LOOP(collapse= 3 )
200215 do k = 1 , sys_size
201216 do j = 0 , m
202- do l = 0 , p
203- data_real_gpu(l + j* real_size + 1 + (k - 1 )* real_size* x_size) = data_real_gpu(l + j* real_size + 1 + (k - 1 )* real_size* x_size)/ real (real_size, dp)
204- q_cons_vf(k)%sf(j, 0 , l) = data_real_gpu(l + j* real_size + 1 + (k - 1 )* real_size* x_size)
217+ do l = 1 , cmplx_size
218+ data_fltr_cmplx_gpu(l + j* cmplx_size + (k - 1 )* cmplx_size* x_size) = (0_dp , 0_dp )
205219 end do
206220 end do
207221 end do
208222 #:endcall GPU_PARALLEL_LOOP
209223
210- do i = 1 , fourier_rings
211-
212- #:call GPU_PARALLEL_LOOP(collapse= 3 )
213- do k = 1 , sys_size
214- do j = 0 , m
215- do l = 1 , cmplx_size
216- data_fltr_cmplx_gpu(l + j* cmplx_size + (k - 1 )* cmplx_size* x_size) = (0_dp , 0_dp )
217- end do
218- end do
219- end do
220- #:endcall GPU_PARALLEL_LOOP
221-
222- #:call GPU_PARALLEL_LOOP(collapse= 3 , firstprivate= ' [i]' )
223- do k = 1 , sys_size
224- do j = 0 , m
225- do l = 0 , p
226- data_real_gpu(l + j* real_size + 1 + (k - 1 )* real_size* x_size) = q_cons_vf(k)%sf(j, i, l)
227- end do
224+ #:call GPU_PARALLEL_LOOP(collapse= 3 , firstprivate= ' [i]' )
225+ do k = 1 , sys_size
226+ do j = 0 , m
227+ do l = 0 , p
228+ data_real_gpu(l + j* real_size + 1 + (k - 1 )* real_size* x_size) = q_cons_vf(k)%sf(j, i, l)
228229 end do
229230 end do
230- #:endcall GPU_PARALLEL_LOOP
231+ end do
232+ #:endcall GPU_PARALLEL_LOOP
231233
232234 #:call GPU_HOST_DATA(use_device_ptr= ' [p_real, p_cmplx]' )
233235#if defined(__PGI)
234- ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu)
236+ ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu)
235237#else
236238 ierr = hipfftExecD2Z(fwd_plan_gpu, c_loc(p_real), c_loc(p_cmplx))
237239 call hipCheck(hipDeviceSynchronize())
238240#endif
239241 #:endcall GPU_HOST_DATA
240242
241- Nfq = min (floor(2_dp * real (i, dp)* pi), cmplx_size)
242- $:GPU_UPDATE(device= ' [Nfq]' )
243+ Nfq = min (floor(2_dp * real (i, dp)* pi), cmplx_size)
244+ $:GPU_UPDATE(device= ' [Nfq]' )
243245
244- #:call GPU_PARALLEL_LOOP(collapse= 3 )
245- do k = 1 , sys_size
246- do j = 0 , m
247- do l = 1 , Nfq
248- data_fltr_cmplx_gpu(l + j* cmplx_size + (k - 1 )* cmplx_size* x_size) = data_cmplx_gpu(l + j* cmplx_size + (k - 1 )* cmplx_size* x_size)
249- end do
246+ #:call GPU_PARALLEL_LOOP(collapse= 3 )
247+ do k = 1 , sys_size
248+ do j = 0 , m
249+ do l = 1 , Nfq
250+ data_fltr_cmplx_gpu(l + j* cmplx_size + (k - 1 )* cmplx_size* x_size) = data_cmplx_gpu(l + j* cmplx_size + (k - 1 )* cmplx_size* x_size)
250251 end do
251252 end do
252- #:endcall GPU_PARALLEL_LOOP
253+ end do
254+ #:endcall GPU_PARALLEL_LOOP
253255
254256 #:call GPU_HOST_DATA(use_device_ptr= ' [p_real, p_fltr_cmplx]' )
255257#if defined(__PGI)
256- ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu)
258+ ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu)
257259#else
258260 ierr = hipfftExecZ2D(bwd_plan_gpu, c_loc(p_fltr_cmplx), c_loc(p_real))
259261 call hipCheck(hipDeviceSynchronize())
260262#endif
261263 #:endcall GPU_HOST_DATA
262264
263- #:call GPU_PARALLEL_LOOP(collapse= 3 , firstprivate= ' [i]' )
264- do k = 1 , sys_size
265- do j = 0 , m
266- do l = 0 , p
267- data_real_gpu(l + j* real_size + 1 + (k - 1 )* real_size* x_size) = data_real_gpu(l + j* real_size + 1 + (k - 1 )* real_size* x_size)/ real (real_size, dp)
268- q_cons_vf(k)%sf(j, i, l) = data_real_gpu(l + j* real_size + 1 + (k - 1 )* real_size* x_size)
269- end do
265+ #:call GPU_PARALLEL_LOOP(collapse= 3 , firstprivate= ' [i]' )
266+ do k = 1 , sys_size
267+ do j = 0 , m
268+ do l = 0 , p
269+ data_real_gpu(l + j* real_size + 1 + (k - 1 )* real_size* x_size) = data_real_gpu(l + j* real_size + 1 + (k - 1 )* real_size* x_size)/ real (real_size, dp)
270+ q_cons_vf(k)%sf(j, i, l) = data_real_gpu(l + j* real_size + 1 + (k - 1 )* real_size* x_size)
270271 end do
271272 end do
272- #:endcall GPU_PARALLEL_LOOP
273+ end do
274+ #:endcall GPU_PARALLEL_LOOP
273275
274- end do
276+ end do
275277 #:endcall GPU_DATA
276278
277279#else
0 commit comments