add convolution for psi

A-006 · A-006 · commit 4b770fb42067 · 2025-08-04T16:55:30.000+08:00
diff --git a/source/source_basis/module_pw/pw_basis_k.h b/source/source_basis/module_pw/pw_basis_k.h
@@ -158,7 +158,6 @@ class PW_Basis_K : public PW_Basis
                        const int ik,
                        const bool add = false,
                        const FPTYPE factor = 1.0) const; // in:(nz, ns)  ; out(nplane,nx*ny)
-    
     #endif
 
      template <typename FPTYPE, typename Device>
@@ -176,7 +175,6 @@ class PW_Basis_K : public PW_Basis
                        const bool add = false,
                        const FPTYPE factor = 1.0) const; // in:(nz, ns)  ; out(nplane,nx*ny)
 
-
     template <typename TK,
               typename Device,
               typename std::enable_if<std::is_same<Device, base_device::DEVICE_CPU>::value, int>::type = 0>
@@ -245,6 +243,47 @@ class PW_Basis_K : public PW_Basis
     {
         this->recip2real_gpu(in, out, ik, add, factor);
     }
+    template <typename FPTYPE, typename Device, // overload that participates only when Device is base_device::DEVICE_GPU
+              typename std::enable_if<std::is_same<Device, base_device::DEVICE_GPU>::value, int>::type = 0>
+    void convolution(const int ik, // ik and every other argument are forwarded unchanged
+                    const int size,
+                    const FPTYPE* input, // convolution_gpu takes std::complex<...>*, so FPTYPE is expected to be a complex type
+                    const typename GetTypeReal<FPTYPE>::type* input1,
+                    FPTYPE*       output,
+                    const bool add = false,
+                    const typename GetTypeReal<FPTYPE>::type factor =1.0) const
+    {
+        this->convolution_gpu(ik, size, input, input1, output, add, factor); // all work happens in convolution_gpu
+    }
+    template <typename FPTYPE, typename Device, // overload that participates only when Device is base_device::DEVICE_CPU
+              typename std::enable_if<std::is_same<Device, base_device::DEVICE_CPU>::value, int>::type = 0>
+        void convolution(const int ik, // ik and every other argument are forwarded unchanged
+                    const int size,
+                    const FPTYPE* input, // convolution_cpu takes std::complex<...>*, so FPTYPE is expected to be a complex type
+                    const typename GetTypeReal<FPTYPE>::type* input1,
+                    FPTYPE*       output,
+                    const bool add = false,
+                    const typename GetTypeReal<FPTYPE>::type factor =1.0) const
+    {
+        this->convolution_cpu(ik, size, input, input1, output, add, factor); // all work happens in convolution_cpu
+    }
+    template <typename FPTYPE> // here FPTYPE is the real scalar type; input and output hold std::complex<FPTYPE>
+    void convolution_cpu(const int ik,
+                         const int size,
+                         const std::complex<FPTYPE>* input,
+                         const FPTYPE*               input1, // real-valued array of the same precision as input
+                         std::complex<FPTYPE>*       output,
+                         const bool add = false,
+                         const FPTYPE factor = 1.0) const; // declaration only; the body is in pw_transform_k.cpp
+    
+    template <typename FPTYPE> // here FPTYPE is the real scalar type; input and output hold std::complex<FPTYPE>
+    void convolution_gpu(const int ik,
+                         const int size,
+                         const std::complex<FPTYPE>* input,
+                         const FPTYPE*               input1, // real-valued array of the same precision as input
+                         std::complex<FPTYPE>*       output,
+                         const bool add = false,
+                         const FPTYPE factor = 1.0) const; // declaration only; the body is in pw_transform_k.cpp
 
   public:
     //operator:
diff --git a/source/source_basis/module_pw/pw_transform_k.cpp b/source/source_basis/module_pw/pw_transform_k.cpp
@@ -2,7 +2,7 @@
 #include "source_basis/module_pw/kernels/pw_op.h"
 #include "pw_basis_k.h"
 #include "pw_gatherscatter.h"
-
+#include "source_pw/module_pwdft/kernels/veff_op.h"
 #include <cassert>
 #include <complex>
 
@@ -337,6 +337,63 @@ void PW_Basis_K::recip_to_real(const base_device::DEVICE_CPU* /*dev*/,
         this->recip2real(in, out, ik, add, factor);
     #endif
 }
+// Multiply the plane-wave coefficients of k-point ik by a real-space array inside the FFT work buffers:
+// output[ig] = (add ? output[ig] : 0) + factor / nxyz * forwardFFT{ input1 * backwardFFT{input} }[ig], ig < npwk[ik].
+// `size` is the number of real-space points read from input1. Asserts gamma_only == false.
+template <typename FPTYPE>
+void PW_Basis_K::convolution_cpu(const int ik,
+                             const int size,
+                             const std::complex<FPTYPE>* input,
+                             const FPTYPE* input1,
+                             std::complex<FPTYPE>* output,
+                             const bool add,
+                             const FPTYPE factor) const
+{
+    ModuleBase::timer::tick(this->classname, "convolution");
+    assert(this->gamma_only == false);
+    auto* auxg = this->fft_bundle.get_auxg_data<FPTYPE>();
+    auto* auxr = this->fft_bundle.get_auxr_data<FPTYPE>();
+    // Zero nst * nz elements; the byte count follows the element type instead of a fixed 2 * 8 (double only).
+    memset(auxg, 0, sizeof(*auxg) * this->nst * this->nz);
+    const int startig = ik * this->npwk_max;
+    const int npwk = this->npwk[ik];
+    // scatter the coefficients of this k-point into the stick buffer through the igl2isz_k map
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static, 4096 / sizeof(FPTYPE))
+#endif
+    for (int igl = 0; igl < npwk; ++igl)
+    {
+        auxg[this->igl2isz_k[igl + startig]] = input[igl];
+    }
+
+    // backward transform: z direction on the sticks, redistribute to planes, then the xy planes
+    this->fft_bundle.fftzbac(auxg, auxg);
+    this->gathers_scatterp(auxg, auxr);
+    this->fft_bundle.fftxybac(auxr, auxr);
+
+    // pointwise product with the real array on the first `size` points of the plane buffer
+    for (int ir = 0; ir < size; ir++)
+    {
+        auxr[ir] *= input1[ir];
+    }
+
+    // forward transform: the same three steps in reverse order
+    this->fft_bundle.fftxyfor(auxr, auxr);
+    this->gatherp_scatters(auxr, auxg);
+    this->fft_bundle.fftzfor(auxg, auxg);
+
+    // gather the result, scaled by factor / nxyz; honour `add` (accumulate) versus overwrite
+    const FPTYPE tmpfac = factor / FPTYPE(this->nxyz);
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static, 4096 / sizeof(FPTYPE))
+#endif
+    for (int igl = 0; igl < npwk; ++igl)
+    {
+        const std::complex<FPTYPE> value = tmpfac * auxg[this->igl2isz_k[igl + startig]];
+        output[igl] = add ? output[igl] + value : value;
+    }
+    ModuleBase::timer::tick(this->classname, "convolution");
+}
 
 #if (defined(__CUDA) || defined(__ROCM))
 template <>
@@ -534,6 +591,50 @@ void PW_Basis_K::recip2real_gpu(const std::complex<FPTYPE>* in,
     ModuleBase::timer::tick(this->classname, "recip_to_real gpu");
 }
 
+// GPU counterpart of convolution_cpu: scatter input into the 3D FFT box, transform backward, run
+// hamilt::veff_pw_op with input1 over `size` points, transform forward, and pass the box to
+// set_real_to_recip_output_op together with nxyz, add and factor. Asserts gamma_only == false.
+template <typename FPTYPE>
+void PW_Basis_K::convolution_gpu(const int ik,
+                             const int size,
+                             const std::complex<FPTYPE>* input,
+                             const FPTYPE* input1,
+                             std::complex<FPTYPE>* output,
+                             const bool add,
+                             const FPTYPE factor) const
+{
+    ModuleBase::timer::tick(this->classname, "convolution");
+    assert(this->gamma_only == false);
+    // Only handed to veff_pw_op as its first argument; initialised so no indeterminate pointer is read.
+    const base_device::DEVICE_GPU* gpux = nullptr;
+    // zero all nxyz elements of the 3D work box before scattering into it
+    auto* auxr = this->fft_bundle.get_auxr_3d_data<FPTYPE>();
+    base_device::memory::set_memory_op<std::complex<FPTYPE>, base_device::DEVICE_GPU>()(auxr, 0, this->nxyz);
+    const int startig = ik * this->npwk_max;
+    const int npw_k = this->npwk[ik];
+
+    // scatter the coefficients of this k-point into the box through the ig2ixyz_k map
+    set_3d_fft_box_op<FPTYPE, base_device::DEVICE_GPU>()(npw_k, this->ig2ixyz_k + startig, input, auxr);
+
+    // backward 3D transform, in place
+    this->fft_bundle.fft3D_backward(auxr, auxr);
+
+    // combine the box with input1 over `size` points
+    hamilt::veff_pw_op<FPTYPE,base_device::DEVICE_GPU>()(gpux,size,auxr,input1);
+
+    // forward 3D transform, in place
+    this->fft_bundle.fft3D_forward(auxr, auxr);
+    // gather the box back into output; add and factor are handled by the op
+    set_real_to_recip_output_op<FPTYPE, base_device::DEVICE_GPU>()(npw_k,
+                                                                   this->nxyz,
+                                                                   add,
+                                                                   factor,
+                                                                   this->ig2ixyz_k + startig,
+                                                                   auxr,
+                                                                   output);
+    ModuleBase::timer::tick(this->classname, "convolution");
+}
+
 template void PW_Basis_K::real2recip_gpu<float>(const std::complex<float>*,
                                                 std::complex<float>*,
                                                 const int,
@@ -557,8 +658,35 @@ template void PW_Basis_K::recip2real_gpu<double>(const std::complex<double>*,
                                                  const int,
                                                  const bool,
                                                  const double) const;
-
+template void PW_Basis_K::convolution_gpu<float>(const int ik, // explicit instantiation: the template body lives in this .cpp
+                                                 const int size,
+                                                 const std::complex<float>* input,
+                                                 const float* input1,
+                                                 std::complex<float>* output,
+                                                 const bool add,
+                                                 const float factor) const;
+template void PW_Basis_K::convolution_gpu<double>(const int ik, // explicit instantiation for double precision
+                                                  const int size,
+                                                  const std::complex<double>* input,
+                                                  const double* input1,
+                                                  std::complex<double>* output,
+                                                  const bool add,
+                                                  const double factor) const;
 #endif
+template void PW_Basis_K::convolution_cpu<float>(const int ik, // explicit instantiation: the template body lives in this .cpp
+                                                 const int size,
+                                                 const std::complex<float>* input,
+                                                 const float* input1,
+                                                 std::complex<float>* output,
+                                                 const bool add,
+                                                 const float factor) const;
+template void PW_Basis_K::convolution_cpu<double>(const int ik, // explicit instantiation for double precision
+                                                  const int size,
+                                                  const std::complex<double>* input,
+                                                  const double* input1,
+                                                  std::complex<double>* output,
+                                                  const bool add,
+                                                  const double factor) const;
 
 template void PW_Basis_K::real2recip<float>(const float* in,
                                             std::complex<float>* out,
diff --git a/source/source_pw/module_pwdft/operator_pw/veff_pw.cpp b/source/source_pw/module_pwdft/operator_pw/veff_pw.cpp
@@ -96,12 +96,15 @@ void Veff<OperatorPW<T, Device>>::act(
     {
         for (int ib = 0; ib < nbands; ib += npol)
         {
-            wfcpw->recip_to_real<T, Device>(tmpsi_in, this->porter, this->ik);
+            wfcpw->convolution<T, Device>(this->ik,
+                               this->veff_col,
+                               tmpsi_in,
+                               this->veff + current_spin * this->veff_col,
+                               tmhpsi,
+                               true);
             // NOTICE: when MPI threads are larger than the number of Z grids
             // veff would contain nothing, and nothing should be done in real space
             // but the 3DFFT can not be skipped, it will cause hanging
-            veff_op()(this->ctx, this->veff_col, this->porter, this->veff + current_spin * this->veff_col);
-            wfcpw->real_to_recip<T, Device>(this->porter, tmhpsi, this->ik, true);
             tmhpsi   += psi_offset;
             tmpsi_in += psi_offset;
         }

Original file line number	Diff line number	Diff line change
`@@ -96,12 +96,15 @@ void Veff<OperatorPW<T, Device>>::act(`
`96`	`96`	`{`
`97`	`97`	`for (int ib = 0; ib < nbands; ib += npol)`
`98`	`98`	`{`
`99`		`- wfcpw->recip_to_real<T, Device>(tmpsi_in, this->porter, this->ik);`
	`99`	`+ wfcpw->convolution<T, Device>(this->ik,`
	`100`	`+ this->veff_col,`
	`101`	`+ tmpsi_in,`
	`102`	`+ this->veff + current_spin * this->veff_col,`
	`103`	`+ tmhpsi,`
	`104`	`+ true);`
`100`	`105`	`// NOTICE: when MPI threads are larger than the number of Z grids`
`101`	`106`	`// veff would contain nothing, and nothing should be done in real space`
`102`	`107`	`// but the 3DFFT can not be skipped, it will cause hanging`
`103`		`- veff_op()(this->ctx, this->veff_col, this->porter, this->veff + current_spin * this->veff_col);`
`104`		`- wfcpw->real_to_recip<T, Device>(this->porter, tmhpsi, this->ik, true);`
`105`	`108`	`tmhpsi += psi_offset;`
`106`	`109`	`tmpsi_in += psi_offset;`
`107`	`110`	`}`