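Add rearrange op (CPU and GPU) for the npol == 2 veff rearrangement; veff_pw_op now takes the rearranged complex veff array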

A-006 · A-006 · commit 071a827268dd · 2025-08-13T15:40:05.000+08:00
diff --git a/source/source_basis/module_pw/pw_basis_sup.cpp b/source/source_basis/module_pw/pw_basis_sup.cpp
@@ -412,8 +412,7 @@ void PW_Basis_Sup::get_ig2isz_is2fftixy(
             {
                 int z = iz;
                 if (z < 0) {
-                    z += this->nz;
-}
+                    z += this->nz;}
                 if (!found[ixy * this->nz + z])
                 {
                     found[ixy * this->nz + z] = true;
@@ -422,7 +421,7 @@ void PW_Basis_Sup::get_ig2isz_is2fftixy(
                     pw_filled++;
                     if (xprime && ixy / fftny == 0) {
                         ng_xeq0++;
-}
+                    }
                 }
             }
         }
diff --git a/source/source_pw/module_pwdft/kernels/cuda/veff_op.cu b/source/source_pw/module_pwdft/kernels/cuda/veff_op.cu
@@ -26,20 +26,50 @@ __global__ void veff_pw(
     const int size,
     thrust::complex<FPTYPE>* out,
     thrust::complex<FPTYPE>* out1,
-    const FPTYPE* in)
+    thrust::complex<FPTYPE>* in)
 {
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
     if(idx >= size) {return;}
-    thrust::complex<FPTYPE> sup =
-        out[idx] * (in[0 * size + idx] + in[3 * size + idx])
-            + out1[idx] * (in[1 * size + idx] - thrust::complex<FPTYPE>(0.0, 1.0) * in[2 * size + idx]);
-    thrust::complex<FPTYPE> sdown =
-        out1[idx] * (in[0 * size + idx] - in[3 * size + idx])
-            + out[idx] * (in[1 * size + idx] + thrust::complex<FPTYPE>(0.0, 1.0) * in[2 * size + idx]);
+    const int base = idx * 4;
+    thrust::complex<FPTYPE> sup = out[idx] * in[base] + out1[idx] * in[base+1];
+    thrust::complex<FPTYPE> sdown = out1[idx] * in[base+2] + out[idx] * in[base+3];
     out[idx] = sup;
     out1[idx] = sdown;
 }
 
+template <typename FPTYPE>
+__global__ void rearrange_op(
+    const int size,
+    const FPTYPE* in,
+    thrust::complex<FPTYPE>* out)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if(idx >= size) {return;}
+    const int base = idx * 4;
+    const FPTYPE part_1 = in[idx];
+    const FPTYPE part_2 = in[idx + size];
+    const FPTYPE part_3 = in[idx + 2 * size];
+    const FPTYPE part_4 = in[idx + 3 * size];
+    out[base] = thrust::complex<FPTYPE>(part_1 + part_4, 0.0);
+    out[base + 1] = thrust::complex<FPTYPE>(part_2 , -part_3);
+    out[base + 2] = thrust::complex<FPTYPE>(part_1 - part_4, 0.0);
+    out[base + 3] = thrust::complex<FPTYPE>(part_2, part_3);
+
+}
+template <typename FPTYPE>
+void rearrange<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* device, 
+                    const int& size, 
+                    const FPTYPE* in, 
+                    std::complex<FPTYPE>* out) const
+{
+    const int block = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+    rearrange_op<FPTYPE><<<block, THREADS_PER_BLOCK>>>(
+        size, // control params
+        in, // array of data
+        reinterpret_cast<thrust::complex<FPTYPE>*>(out)); // array of data   
+    cudaCheckOnDebug();
+}
+
 template <typename FPTYPE>
 void veff_pw_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* dev,
                                                              const int& size,
@@ -60,18 +90,20 @@ void veff_pw_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::
                                                              const int& size,
                                                              std::complex<FPTYPE>* out,
                                                              std::complex<FPTYPE>* out1,
-                                                             const FPTYPE** in)
+                                                             std::complex<FPTYPE>* in)
 {
     const int block = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
     veff_pw<FPTYPE><<<block, THREADS_PER_BLOCK>>>(
         size, // control params
         reinterpret_cast<thrust::complex<FPTYPE>*>(out), // array of data
         reinterpret_cast<thrust::complex<FPTYPE>*>(out1), // array of data
-        in[0]); // array of data
+        reinterpret_cast<thrust::complex<FPTYPE>*>(in)); // array of data
 
     cudaCheckOnDebug();
 }
 
+template struct rearrange<float, base_device::DEVICE_GPU>;
+template struct rearrange<double, base_device::DEVICE_GPU>;
 template struct veff_pw_op<float, base_device::DEVICE_GPU>;
 template struct veff_pw_op<double, base_device::DEVICE_GPU>;
 
diff --git a/source/source_pw/module_pwdft/kernels/veff_op.cpp b/source/source_pw/module_pwdft/kernels/veff_op.cpp
@@ -20,7 +20,7 @@ struct veff_pw_op<FPTYPE, base_device::DEVICE_CPU>
                     const int& size,
                     std::complex<FPTYPE>* out,
                     std::complex<FPTYPE>* out1,
-                    const std::complex<FPTYPE>* in)
+                    std::complex<FPTYPE>* in)
     {
 
 #ifdef _OPENMP
@@ -37,6 +37,31 @@ struct veff_pw_op<FPTYPE, base_device::DEVICE_CPU>
     }
 };
 
+template<typename FPTYPE>
+struct rearrange<FPTYPE, base_device::DEVICE_CPU>
+{
+    void operator()(const base_device::DEVICE_CPU* dev,
+                    const int& size, 
+                    const FPTYPE* in, 
+                    std::complex<FPTYPE>* out) const
+    {
+        for (int ir=0; ir < size; ir++)
+        {
+            const int base = 4 *ir;
+            FPTYPE part_1 = in[ir];
+            FPTYPE part_2 = in[ir + size];
+            FPTYPE part_3 = in[ir + 2*size];
+            FPTYPE part_4 = in[ir + 3*size];
+            out[base ] = std::complex<FPTYPE>(part_1 + part_4, 0.0);
+            out[base + 1] = std::complex<FPTYPE>(part_2 , -part_3);
+            out[base + 2] = std::complex<FPTYPE>(part_1 - part_4, 0.0);
+            out[base + 3] = std::complex<FPTYPE>(part_2, part_3);
+        }
+    }
+};
+
+template struct rearrange<float, base_device::DEVICE_CPU>;
+template struct rearrange<double, base_device::DEVICE_CPU>;
 template struct veff_pw_op<float, base_device::DEVICE_CPU>;
 template struct veff_pw_op<double, base_device::DEVICE_CPU>;
 
diff --git a/source/source_pw/module_pwdft/kernels/veff_op.h b/source/source_pw/module_pwdft/kernels/veff_op.h
@@ -48,7 +48,7 @@ struct veff_pw_op {
         const int& size,
         std::complex<FPTYPE>* out,
         std::complex<FPTYPE>* out1,
-        const std::complex<FPTYPE>* in);
+        std::complex<FPTYPE>* in);
 };
 
 #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
@@ -62,7 +62,25 @@ struct veff_pw_op<FPTYPE, base_device::DEVICE_GPU>
                     const int& size,
                     std::complex<FPTYPE>* out,
                     std::complex<FPTYPE>* out1,
-                    const FPTYPE** in);
+                    std::complex<FPTYPE>* in);
+};
+
+#endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
+template<typename FPTYPE, typename Device>
+struct rearrange
+{
+    void operator()(const Device* device,const int& size, const FPTYPE* in, std::complex<FPTYPE>* out) const;
+};
+
+#if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
+
+template<typename FPTYPE>
+struct rearrange<FPTYPE, base_device::DEVICE_GPU>
+{
+    void operator()(const base_device::DEVICE_GPU* device, 
+                    const int& size, 
+                    const FPTYPE* in, 
+                    std::complex<FPTYPE>* out) const;
 };
 #endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
 } // namespace hamilt
diff --git a/source/source_pw/module_pwdft/operator_pw/veff_pw.cpp b/source/source_pw/module_pwdft/operator_pw/veff_pw.cpp
@@ -114,20 +114,7 @@ void Veff<OperatorPW<T, Device>>::act(
     }
     else if (npol == 2)
     {
-        const Real* current_veff={nullptr};
-        const std::complex<Real> imag=std::complex<Real>(0.0, 1.0);
-        for (int ir=0; ir < veff_col; ir++)
-        {
-            const int base = 4 *ir;
-            Real part_1 = this->veff[ir];
-            Real part_2 = this->veff[ir + veff_col];
-            Real part_3 = this->veff[ir + 2*veff_col];
-            Real part_4 = this->veff[ir + 3*veff_col];
-            nspin_4_veff[base ] = part_1 + part_4;
-            nspin_4_veff[base + 1] = part_2 - imag * part_3;
-            nspin_4_veff[base + 2] = part_1 - part_4;
-            nspin_4_veff[base + 3] = part_2 + imag * part_3;
-        }
+        rearrange<Real,Device>()(this->ctx, this->veff_col, this->veff, this->nspin_4_veff);
         for (int ib = 0; ib < nbands; ib += npol)
         {
             // FFT to real space and do things.

Original file line number	Diff line number	Diff line change
`@@ -412,8 +412,7 @@ void PW_Basis_Sup::get_ig2isz_is2fftixy(`
`412`	`412`	`{`
`413`	`413`	`int z = iz;`
`414`	`414`	`if (z < 0) {`
`415`		`- z += this->nz;`
`416`		`-}`
	`415`	`+ z += this->nz;}`
`417`	`416`	`if (!found[ixy * this->nz + z])`
`418`	`417`	`{`
`419`	`418`	`found[ixy * this->nz + z] = true;`
`@@ -422,7 +421,7 @@ void PW_Basis_Sup::get_ig2isz_is2fftixy(`
`422`	`421`	`pw_filled++;`
`423`	`422`	`if (xprime && ixy / fftny == 0) {`
`424`	`423`	`ng_xeq0++;`
`425`		`-}`
	`424`	`+ }`
`426`	`425`	`}`
`427`	`426`	`}`
`428`	`427`	`}`