diff --git a/source/source_basis/module_pw/pw_basis_k.h b/source/source_basis/module_pw/pw_basis_k.h
index b87da9ca0f..83b4d7722d 100644
--- a/source/source_basis/module_pw/pw_basis_k.h
+++ b/source/source_basis/module_pw/pw_basis_k.h
@@ -158,7 +158,6 @@ class PW_Basis_K : public PW_Basis
                        const int ik,
                        const bool add = false,
                        const FPTYPE factor = 1.0) const; // in:(nz, ns)  ; out(nplane,nx*ny)
-    
     #endif
 
      template <typename FPTYPE, typename Device>
@@ -176,7 +175,6 @@ class PW_Basis_K : public PW_Basis
                        const bool add = false,
                        const FPTYPE factor = 1.0) const; // in:(nz, ns)  ; out(nplane,nx*ny)
 
-
     template <typename TK,
               typename Device,
               typename std::enable_if<std::is_same<Device, base_device::DEVICE_CPU>::value, int>::type = 0>
@@ -245,6 +243,47 @@ class PW_Basis_K : public PW_Basis
     {
         this->recip2real_gpu(in, out, ik, add, factor);
     }
+    template <typename FPTYPE, typename Device,
+              typename std::enable_if<std::is_same<Device, base_device::DEVICE_GPU>::value, int>::type = 0>
+    void convolution(const int ik,
+                    const int size,
+                    const FPTYPE* input,
+                    const typename GetTypeReal<FPTYPE>::type* input1,
+                    FPTYPE*       output,
+                    const bool add = false,
+                    const typename GetTypeReal<FPTYPE>::type factor =1.0) const
+    {
+        this->convolution_gpu(ik, size, input, input1, output, add, factor);
+    }
+    template <typename FPTYPE, typename Device,
+              typename std::enable_if<std::is_same<Device, base_device::DEVICE_CPU>::value, int>::type = 0>
+        void convolution(const int ik,
+                    const int size,
+                    const FPTYPE* input,
+                    const typename GetTypeReal<FPTYPE>::type* input1,
+                    FPTYPE*       output,
+                    const bool add = false,
+                    const typename GetTypeReal<FPTYPE>::type factor =1.0) const
+    {
+        this->convolution_cpu(ik, size, input, input1, output, add, factor);
+    }
+    template <typename FPTYPE>
+    void convolution_cpu(const int ik,
+                         const int size,
+                         const std::complex<FPTYPE>* input,
+                         const FPTYPE*               input1,
+                         std::complex<FPTYPE>*       output,
+                         const bool add = false,
+                         const FPTYPE factor = 1.0) const;
+    
+    template <typename FPTYPE>
+    void convolution_gpu(const int ik,
+                         const int size,
+                         const std::complex<FPTYPE>* input,
+                         const FPTYPE*               input1,
+                         std::complex<FPTYPE>*       output,
+                         const bool add = false,
+                         const FPTYPE factor = 1.0) const;
 
   public:
     //operator:
diff --git a/source/source_basis/module_pw/pw_basis_sup.cpp b/source/source_basis/module_pw/pw_basis_sup.cpp
index d88345016a..17bdfa594a 100644
--- a/source/source_basis/module_pw/pw_basis_sup.cpp
+++ b/source/source_basis/module_pw/pw_basis_sup.cpp
@@ -412,8 +412,7 @@ void PW_Basis_Sup::get_ig2isz_is2fftixy(
             {
                 int z = iz;
                 if (z < 0) {
-                    z += this->nz;
-}
+                    z += this->nz;}
                 if (!found[ixy * this->nz + z])
                 {
                     found[ixy * this->nz + z] = true;
@@ -422,7 +421,7 @@ void PW_Basis_Sup::get_ig2isz_is2fftixy(
                     pw_filled++;
                     if (xprime && ixy / fftny == 0) {
                         ng_xeq0++;
-}
+                    }
                 }
             }
         }
diff --git a/source/source_basis/module_pw/pw_transform_k.cpp b/source/source_basis/module_pw/pw_transform_k.cpp
index 36290d091a..f5faf70fa9 100644
--- a/source/source_basis/module_pw/pw_transform_k.cpp
+++ b/source/source_basis/module_pw/pw_transform_k.cpp
@@ -2,7 +2,7 @@
 #include "source_basis/module_pw/kernels/pw_op.h"
 #include "pw_basis_k.h"
 #include "pw_gatherscatter.h"
-
+#include "source_pw/module_pwdft/kernels/veff_op.h"
 #include <cassert>
 #include <complex>
 
@@ -337,6 +337,63 @@ void PW_Basis_K::recip_to_real(const base_device::DEVICE_CPU* /*dev*/,
         this->recip2real(in, out, ik, add, factor);
     #endif
 }
+template <typename FPTYPE>
+void PW_Basis_K::convolution_cpu(const int ik,
+                             const int size,
+                             const std::complex<FPTYPE>* input,
+                             const FPTYPE* input1,
+                             std::complex<FPTYPE>* output,
+                             const bool add,
+                             const FPTYPE factor) const
+{
+    ModuleBase::timer::tick(this->classname, "convolution");
+    assert(this->gamma_only == false);
+    // ModuleBase::GlobalFunc::ZEROS(fft_bundle.get_auxg_data<double>(), this->nst * this->nz);
+    // memset the auxr of 0 in the auxr,here the len of the auxr is nxyz
+    auto* auxg = this->fft_bundle.get_auxg_data<FPTYPE>();
+    auto* auxr=this->fft_bundle.get_auxr_data<FPTYPE>();
+
+    memset(auxg, 0, this->nst * this->nz * 2 * 8);
+    const int startig = ik * this->npwk_max;
+    const int npwk = this->npwk[ik];
+
+    // copy the mapping form the type of stick to the 3dfft
+    #ifdef _OPENMP
+    #pragma omp parallel for schedule(static, 4096 / sizeof(FPTYPE))
+    #endif
+    for (int igl = 0; igl < npwk; ++igl)
+    {
+        auxg[this->igl2isz_k[igl + startig]] = input[igl];
+    }
+
+    // use 3d fft backward
+    this->fft_bundle.fftzbac(auxg, auxg);
+
+    this->gathers_scatterp(auxg, auxr);
+
+    this->fft_bundle.fftxybac(auxr, auxr);
+    for (int ir = 0; ir < size; ir++)
+    {
+        auxr[ir] *= input1[ir];
+    }
+
+    // 3d fft
+    this->fft_bundle.fftxyfor(auxr, auxr);
+
+    this->gatherp_scatters(auxr, auxg);
+
+    this->fft_bundle.fftzfor(auxg, auxg);
+    // copy the result from the auxr to the out ,while consider the add
+        FPTYPE tmpfac = factor / FPTYPE(this->nxyz);
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static, 4096 / sizeof(FPTYPE))
+#endif
+        for (int igl = 0; igl < npwk; ++igl)
+        {
+            output[igl] += tmpfac * auxg[this->igl2isz_k[igl + startig]];
+        }
+    ModuleBase::timer::tick(this->classname, "convolution");
+}
 
 #if (defined(__CUDA) || defined(__ROCM))
 template <>
@@ -534,6 +591,50 @@ void PW_Basis_K::recip2real_gpu(const std::complex<FPTYPE>* in,
     ModuleBase::timer::tick(this->classname, "recip_to_real gpu");
 }
 
+template <typename FPTYPE>
+void PW_Basis_K::convolution_gpu(const int ik,
+                             const int size,
+                             const std::complex<FPTYPE>* input,
+                             const FPTYPE* input1,
+                             std::complex<FPTYPE>* output,
+                             const bool add,
+                             const FPTYPE factor) const
+{       
+    ModuleBase::timer::tick(this->classname, "convolution");
+
+    assert(this->gamma_only == false);
+    const base_device::DEVICE_GPU* gpux;
+    // memset the auxr of 0 in the auxr,here the len of the auxr is nxyz
+    
+    base_device::memory::set_memory_op<std::complex<FPTYPE>, base_device::DEVICE_GPU>()(
+        this->fft_bundle.get_auxr_3d_data<FPTYPE>(),
+        0,
+        this->nxyz);
+    auto* auxr = this->fft_bundle.get_auxr_3d_data<FPTYPE>();
+    const int startig = ik * this->npwk_max;
+    const int npw_k = this->npwk[ik];
+
+    // copy the mapping form the type of stick to the 3dfft
+    set_3d_fft_box_op<FPTYPE, base_device::DEVICE_GPU>()(npw_k, this->ig2ixyz_k + startig, input, auxr);
+
+    // use 3d fft backward
+    this->fft_bundle.fft3D_backward(auxr, auxr);
+    
+    hamilt::veff_pw_op<FPTYPE,base_device::DEVICE_GPU>()(gpux,size,auxr,input1);
+
+    // 3d fft
+    this->fft_bundle.fft3D_forward(auxr, auxr);
+    // copy the result from the auxr to the out ,while consider the add
+    set_real_to_recip_output_op<FPTYPE, base_device::DEVICE_GPU>()(npw_k,
+                                                                   this->nxyz,
+                                                                   add,
+                                                                   factor,
+                                                                   this->ig2ixyz_k + startig,
+                                                                   auxr,
+                                                                   output);
+    ModuleBase::timer::tick(this->classname, "convolution");
+    }
+
 template void PW_Basis_K::real2recip_gpu<float>(const std::complex<float>*,
                                                 std::complex<float>*,
                                                 const int,
@@ -557,8 +658,35 @@ template void PW_Basis_K::recip2real_gpu<double>(const std::complex<double>*,
                                                  const int,
                                                  const bool,
                                                  const double) const;
-
+template void PW_Basis_K::convolution_gpu<float>(const int ik,
+                                                 const int size,
+                                                 const std::complex<float>* input,
+                                                 const float* input1,
+                                                 std::complex<float>* output,
+                                                 const bool add,
+                                                 const float factor) const;
+template void PW_Basis_K::convolution_gpu<double>(const int ik,
+                                                  const int size,
+                                                  const std::complex<double>* input,
+                                                  const double* input1,
+                                                  std::complex<double>* output,
+                                                  const bool add,
+                                                  const double factor) const;
 #endif
+template void PW_Basis_K::convolution_cpu<float>(const int ik,
+                                                 const int size,
+                                                 const std::complex<float>* input,
+                                                 const float* input1,
+                                                 std::complex<float>* output,
+                                                 const bool add,
+                                                 const float factor) const;
+template void PW_Basis_K::convolution_cpu<double>(const int ik,
+                                                  const int size,
+                                                  const std::complex<double>* input,
+                                                  const double* input1,
+                                                  std::complex<double>* output,
+                                                  const bool add,
+                                                  const double factor) const;
 
 template void PW_Basis_K::real2recip<float>(const float* in,
                                             std::complex<float>* out,
diff --git a/source/source_io/write_vxc_lip.hpp b/source/source_io/write_vxc_lip.hpp
index 3705993022..308dca2b23 100644
--- a/source/source_io/write_vxc_lip.hpp
+++ b/source/source_io/write_vxc_lip.hpp
@@ -160,7 +160,11 @@ namespace ModuleIO
         {
             // 2.1 local xc
             vxcs_op_pw = new hamilt::Veff<hamilt::OperatorPW<T>>(kv.isk.data(),
-                potxc->get_veff_smooth_data<FPTYPE>(), potxc->get_veff_smooth().nr, potxc->get_veff_smooth().nc, &wfc_basis);
+                                                                potxc->get_veff_smooth_data<FPTYPE>(), 
+                                                                PARAM.inp.nspin,
+                                                                potxc->get_veff_smooth().nr, 
+                                                                potxc->get_veff_smooth().nc, 
+                                                                &wfc_basis);
             vxcs_op_pw->init(ik);   // set k-point index
             psi_pw.fix_k(ik);
             hpsi_localxc.fix_k(ik);
diff --git a/source/source_pw/module_pwdft/hamilt_pw.cpp b/source/source_pw/module_pwdft/hamilt_pw.cpp
index 86782ed665..f095557637 100644
--- a/source/source_pw/module_pwdft/hamilt_pw.cpp
+++ b/source/source_pw/module_pwdft/hamilt_pw.cpp
@@ -80,6 +80,7 @@ HamiltPW<T, Device>::HamiltPW(elecstate::Potential* pot_in,
             pot_in->pot_register(pot_register_in);
             Operator<T, Device>* veff = new Veff<OperatorPW<T, Device>>(isk,
                                                                         pot_in->get_veff_smooth_data<Real>(),
+                                                                        PARAM.inp.nspin,
                                                                         pot_in->get_veff_smooth().nr,
                                                                         pot_in->get_veff_smooth().nc,
                                                                         wfc_basis);
diff --git a/source/source_pw/module_pwdft/kernels/cuda/veff_op.cu b/source/source_pw/module_pwdft/kernels/cuda/veff_op.cu
index f4eb37e5ac..51c1013698 100644
--- a/source/source_pw/module_pwdft/kernels/cuda/veff_op.cu
+++ b/source/source_pw/module_pwdft/kernels/cuda/veff_op.cu
@@ -26,20 +26,50 @@ __global__ void veff_pw(
     const int size,
     thrust::complex<FPTYPE>* out,
     thrust::complex<FPTYPE>* out1,
-    const FPTYPE* in)
+    thrust::complex<FPTYPE>* in)
 {
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
     if(idx >= size) {return;}
-    thrust::complex<FPTYPE> sup =
-        out[idx] * (in[0 * size + idx] + in[3 * size + idx])
-            + out1[idx] * (in[1 * size + idx] - thrust::complex<FPTYPE>(0.0, 1.0) * in[2 * size + idx]);
-    thrust::complex<FPTYPE> sdown =
-        out1[idx] * (in[0 * size + idx] - in[3 * size + idx])
-            + out[idx] * (in[1 * size + idx] + thrust::complex<FPTYPE>(0.0, 1.0) * in[2 * size + idx]);
+    const int base = idx * 4;
+    thrust::complex<FPTYPE> sup = out[idx] * in[base] + out1[idx] * in[base+1];
+    thrust::complex<FPTYPE> sdown = out1[idx] * in[base+2] + out[idx] * in[base+3];
     out[idx] = sup;
     out1[idx] = sdown;
 }
 
+template <typename FPTYPE>
+__global__ void rearrange_op(
+    const int size,
+    const FPTYPE* in,
+    thrust::complex<FPTYPE>* out)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if(idx >= size) {return;}
+    const int base = idx * 4;
+    const FPTYPE part_1 = in[idx];
+    const FPTYPE part_2 = in[idx + size];
+    const FPTYPE part_3 = in[idx + 2 * size];
+    const FPTYPE part_4 = in[idx + 3 * size];
+    out[base] = thrust::complex<FPTYPE>(part_1 + part_4, 0.0);
+    out[base + 1] = thrust::complex<FPTYPE>(part_2 , -part_3);
+    out[base + 2] = thrust::complex<FPTYPE>(part_1 - part_4, 0.0);
+    out[base + 3] = thrust::complex<FPTYPE>(part_2, part_3);
+
+}
+template <typename FPTYPE>
+void rearrange<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* device, 
+                    const int& size, 
+                    const FPTYPE* in, 
+                    std::complex<FPTYPE>* out) const
+{
+    const int block = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+    rearrange_op<FPTYPE><<<block, THREADS_PER_BLOCK>>>(
+        size, // control params
+        in, // array of data
+        reinterpret_cast<thrust::complex<FPTYPE>*>(out)); // array of data   
+    cudaCheckOnDebug();
+}
+
 template <typename FPTYPE>
 void veff_pw_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* dev,
                                                              const int& size,
@@ -60,18 +90,20 @@ void veff_pw_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::
                                                              const int& size,
                                                              std::complex<FPTYPE>* out,
                                                              std::complex<FPTYPE>* out1,
-                                                             const FPTYPE** in)
+                                                             std::complex<FPTYPE>* in)
 {
     const int block = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
     veff_pw<FPTYPE><<<block, THREADS_PER_BLOCK>>>(
         size, // control params
         reinterpret_cast<thrust::complex<FPTYPE>*>(out), // array of data
         reinterpret_cast<thrust::complex<FPTYPE>*>(out1), // array of data
-        in[0]); // array of data
+        reinterpret_cast<thrust::complex<FPTYPE>*>(in)); // array of data
 
     cudaCheckOnDebug();
 }
 
+template struct rearrange<float, base_device::DEVICE_GPU>;
+template struct rearrange<double, base_device::DEVICE_GPU>;
 template struct veff_pw_op<float, base_device::DEVICE_GPU>;
 template struct veff_pw_op<double, base_device::DEVICE_GPU>;
 
diff --git a/source/source_pw/module_pwdft/kernels/test/veff_op_test.cpp b/source/source_pw/module_pwdft/kernels/test/veff_op_test.cpp
index 9f00625925..f3badce661 100644
--- a/source/source_pw/module_pwdft/kernels/test/veff_op_test.cpp
+++ b/source/source_pw/module_pwdft/kernels/test/veff_op_test.cpp
@@ -32,6 +32,8 @@ class TestModuleHamiltVeff : public ::testing::Test
     const base_device::DEVICE_CPU* cpu_ctx = {};
     const base_device::DEVICE_GPU* gpu_ctx = {};
 
+    using rearrange_cpu = hamilt::rearrange<double, base_device::DEVICE_CPU>;
+    using rearrange_gpu = hamilt::rearrange<double, base_device::DEVICE_GPU>;
     using veff_cpu_op = hamilt::veff_pw_op<double, base_device::DEVICE_CPU>;
     using veff_gpu_op = hamilt::veff_pw_op<double, base_device::DEVICE_GPU>;
 
@@ -70,13 +72,10 @@ TEST_F(TestModuleHamiltVeff, veff_pw_spin_op_cpu)
     std::vector<std::complex<double>> expected_out1_spin(out_spin.size(), std::complex<double>(0, 0));
     std::vector<std::complex<double>> res = out_spin;
     std::vector<std::complex<double>> res1 = out1_spin;
+    std::vector<std::complex<double>>  in_(4 * in_spin.size(), std::complex<double>(0, 0));
+    rearrange_cpu()(cpu_ctx, this->size, in_spin.data(), in_.data());
 
-    const double * in_[4];
-    for (int ii = 0; ii < 4; ii++) {
-        in_[ii] = in_spin.data() + ii * this->size;
-    }
-
-    veff_cpu_op()(cpu_ctx, this->size, res.data(), res1.data(), in_);
+    veff_cpu_op()(cpu_ctx, this->size, res.data(), res1.data(), in_.data());
     for (int ii = 0; ii < res.size(); ii++) {
         EXPECT_LT(std::abs(res[ii] - expected_out_spin[ii]), 7.5e-5);
         EXPECT_LT(std::abs(res1[ii] - expected_out1_spin[ii]), 6e-5);
@@ -108,23 +107,22 @@ TEST_F(TestModuleHamiltVeff, veff_pw_spin_op_gpu)
 {
     std::vector<std::complex<double>> out1_spin(out_spin.size(), std::complex<double>(0, 0));
     std::vector<std::complex<double>> expected_out1_spin(out_spin.size(), std::complex<double>(0, 0));
+    std::vector<std::complex<double>> in_(4 * in_spin.size(), std::complex<double>(0, 0));
     std::vector<std::complex<double>> res = out_spin;
     std::vector<std::complex<double>> res1 = out1_spin;
     double* d_in = NULL;
+    std::complex<double>* d_in_ = NULL;
     std::complex<double>* d_res = NULL, * d_res1 = NULL;
     resize_memory_double_op()(d_in, in_spin.size());
     resize_memory_complex_op()(d_res, res.size());
     resize_memory_complex_op()(d_res1, res1.size());
+    resize_memory_complex_op()(d_in_, in_spin.size()*4);
     syncmem_double_h2d_op()(d_in, in_spin.data(), in_spin.size());
     syncmem_complex_h2d_op()(d_res, res.data(), res.size());
     syncmem_complex_h2d_op()(d_res1, res1.data(), res1.size());
-
-    const double * in_[4];
-    for (int ii = 0; ii < 4; ii++) {
-        in_[ii] = d_in + ii * this->size;
-    }
-
-    veff_gpu_op()(gpu_ctx, this->size, d_res, d_res1, in_);
+    
+    rearrange_gpu()(gpu_ctx, this->size, d_in, d_in_);
+    veff_gpu_op()(gpu_ctx, this->size, d_res, d_res1, d_in_);
 
     syncmem_complex_d2h_op()(res.data(), d_res, res.size());
     syncmem_complex_d2h_op()(res1.data(), d_res1, res1.size());
@@ -135,5 +133,6 @@ TEST_F(TestModuleHamiltVeff, veff_pw_spin_op_gpu)
     delete_memory_double_op()(d_in);
     delete_memory_complex_op()(d_res);
     delete_memory_complex_op()(d_res1);
+    delete_memory_complex_op()(d_in_);
 }
 #endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
\ No newline at end of file
diff --git a/source/source_pw/module_pwdft/kernels/veff_op.cpp b/source/source_pw/module_pwdft/kernels/veff_op.cpp
index 23646e9608..b6f458275a 100644
--- a/source/source_pw/module_pwdft/kernels/veff_op.cpp
+++ b/source/source_pw/module_pwdft/kernels/veff_op.cpp
@@ -20,26 +20,48 @@ struct veff_pw_op<FPTYPE, base_device::DEVICE_CPU>
                     const int& size,
                     std::complex<FPTYPE>* out,
                     std::complex<FPTYPE>* out1,
-                    const FPTYPE** in)
+                    std::complex<FPTYPE>* in)
     {
+
 #ifdef _OPENMP
 #pragma omp parallel for
 #endif
-        for (int ir = 0; ir < size; ir++) {
-            auto sup = out[ir] * (in[0][ir] + in[3][ir])
-                + out1[ir]
-                        * (in[1][ir]
-                        - std::complex<FPTYPE>(0.0, 1.0) * in[2][ir]);
-            auto sdown = out1[ir] * (in[0][ir] - in[3][ir])
-                    + out[ir]
-                        * (in[1][ir]
-                            + std::complex<FPTYPE>(0.0, 1.0) * in[2][ir]);
+        for (int ir = 0; ir < size; ir++) 
+        {
+            const int base = ir * 4;
+            auto sup = out[ir] * (in[base]) + out1[ir] * (in[base + 1]);
+            auto sdown = out1[ir] * (in[base + 2]) + out[ir] * (in[base + 3]);
             out[ir] = sup;
             out1[ir] = sdown;
         }
     }
 };
 
+template<typename FPTYPE>
+struct rearrange<FPTYPE, base_device::DEVICE_CPU>
+{
+    void operator()(const base_device::DEVICE_CPU* dev,
+                    const int& size, 
+                    const FPTYPE* in, 
+                    std::complex<FPTYPE>* out) const
+    {
+        for (int ir=0; ir < size; ir++)
+        {
+            const int base = 4 *ir;
+            FPTYPE part_1 = in[ir];
+            FPTYPE part_2 = in[ir + size];
+            FPTYPE part_3 = in[ir + 2*size];
+            FPTYPE part_4 = in[ir + 3*size];
+            out[base ] = std::complex<FPTYPE>(part_1 + part_4, 0.0);
+            out[base + 1] = std::complex<FPTYPE>(part_2 , -part_3);
+            out[base + 2] = std::complex<FPTYPE>(part_1 - part_4, 0.0);
+            out[base + 3] = std::complex<FPTYPE>(part_2, part_3);
+        }
+    }
+};
+
+template struct rearrange<float, base_device::DEVICE_CPU>;
+template struct rearrange<double, base_device::DEVICE_CPU>;
 template struct veff_pw_op<float, base_device::DEVICE_CPU>;
 template struct veff_pw_op<double, base_device::DEVICE_CPU>;
 
diff --git a/source/source_pw/module_pwdft/kernels/veff_op.h b/source/source_pw/module_pwdft/kernels/veff_op.h
index cda82d9a9a..0bfdd6bea0 100644
--- a/source/source_pw/module_pwdft/kernels/veff_op.h
+++ b/source/source_pw/module_pwdft/kernels/veff_op.h
@@ -48,7 +48,7 @@ struct veff_pw_op {
         const int& size,
         std::complex<FPTYPE>* out,
         std::complex<FPTYPE>* out1,
-        const FPTYPE** in);
+        std::complex<FPTYPE>* in);
 };
 
 #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
@@ -62,7 +62,25 @@ struct veff_pw_op<FPTYPE, base_device::DEVICE_GPU>
                     const int& size,
                     std::complex<FPTYPE>* out,
                     std::complex<FPTYPE>* out1,
-                    const FPTYPE** in);
+                    std::complex<FPTYPE>* in);
+};
+
+#endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
+template<typename FPTYPE, typename Device>
+struct rearrange
+{
+    void operator()(const Device* device,const int& size, const FPTYPE* in, std::complex<FPTYPE>* out) const;
+};
+
+#if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
+
+template<typename FPTYPE>
+struct rearrange<FPTYPE, base_device::DEVICE_GPU>
+{
+    void operator()(const base_device::DEVICE_GPU* device, 
+                    const int& size, 
+                    const FPTYPE* in, 
+                    std::complex<FPTYPE>* out) const;
 };
 #endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
 } // namespace hamilt
diff --git a/source/source_pw/module_pwdft/operator_pw/veff_pw.cpp b/source/source_pw/module_pwdft/operator_pw/veff_pw.cpp
index e2813b3a9a..25497b7897 100644
--- a/source/source_pw/module_pwdft/operator_pw/veff_pw.cpp
+++ b/source/source_pw/module_pwdft/operator_pw/veff_pw.cpp
@@ -7,10 +7,11 @@ namespace hamilt {
 
 template<typename T, typename Device>
 Veff<OperatorPW<T, Device>>::Veff(const int* isk_in,
-                                       const Real* veff_in,
-                                       const int veff_row,
-                                       const int veff_col,
-                                       const ModulePW::PW_Basis_K* wfcpw_in)
+                                const Real* veff_in,
+                                const int nspin_in,
+                                const int veff_row,
+                                const int veff_col,
+                                const ModulePW::PW_Basis_K* wfcpw_in)
 {
     if (isk_in == nullptr || wfcpw_in == nullptr) 
     {
@@ -21,20 +22,28 @@ Veff<OperatorPW<T, Device>>::Veff(const int* isk_in,
     this->cal_type = calculation_type::pw_veff;
     this->isk = isk_in;
     this->veff = veff_in;
+    this->nspin = nspin_in;
     //note: "veff = nullptr" means that this core does not treat potential but still treats wf. 
     this->veff_row = veff_row;
     this->veff_col = veff_col;
     this->wfcpw = wfcpw_in;
     resmem_complex_op()(this->porter, this->wfcpw->nmaxgr, "Veff<PW>::porter");
-    resmem_complex_op()(this->porter1, this->wfcpw->nmaxgr, "Veff<PW>::porter1");
-
+    if (nspin == 4)
+    {
+        resmem_complex_op()(nspin_4_veff, 4*veff_col, "Veff<PW>::porter");
+        resmem_complex_op()(this->porter1, this->wfcpw->nmaxgr, "Veff<PW>::porter1");
+    }
 }
 
 template<typename T, typename Device>
 Veff<OperatorPW<T, Device>>::~Veff()
 {
     delmem_complex_op()(this->porter);
-    delmem_complex_op()(this->porter1);
+    if (nspin == 4)
+    {
+        delmem_complex_op()(this->porter1);
+        delmem_complex_op()(this->nspin_4_veff);
+    }
 }
 
 template<typename T, typename Device>
@@ -96,29 +105,28 @@ void Veff<OperatorPW<T, Device>>::act(
     {
         for (int ib = 0; ib < nbands; ib += npol)
         {
-            wfcpw->recip_to_real<T, Device>(tmpsi_in, this->porter, this->ik);
+            wfcpw->convolution<T, Device>(this->ik,
+                               this->veff_col,
+                               tmpsi_in,
+                               this->veff + current_spin * this->veff_col,
+                               tmhpsi,
+                               true);
             // NOTICE: when MPI threads are larger than the number of Z grids
             // veff would contain nothing, and nothing should be done in real space
             // but the 3DFFT can not be skipped, it will cause hanging
-            veff_op()(this->ctx, this->veff_col, this->porter, this->veff + current_spin * this->veff_col);
-            wfcpw->real_to_recip<T, Device>(this->porter, tmhpsi, this->ik, true);
             tmhpsi   += psi_offset;
             tmpsi_in += psi_offset;
         }
     }
     else if (npol == 2)
     {
-        const Real* current_veff[4]={nullptr};
-        for (int is = 0; is < 4; is++)
-        {
-            current_veff[is] = this->veff + is * this->veff_col;
-        }
+        rearrange<Real,Device>()(this->ctx, this->veff_col, this->veff, this->nspin_4_veff);
         for (int ib = 0; ib < nbands; ib += npol)
         {
             // FFT to real space and do things.
             wfcpw->recip_to_real<T, Device>(tmpsi_in, this->porter, this->ik);
             wfcpw->recip_to_real<T, Device>(tmpsi_in + max_npw, this->porter1, this->ik);
-            veff_op()(this->ctx, this->veff_col, this->porter, this->porter1, current_veff);
+            veff_op()(this->ctx, this->veff_col, this->porter, this->porter1, nspin_4_veff);
             // FFT back to G space.
             wfcpw->real_to_recip<T, Device>(this->porter, tmhpsi, this->ik, true);
             wfcpw->real_to_recip<T, Device>(this->porter1, tmhpsi + max_npw, this->ik, true);
diff --git a/source/source_pw/module_pwdft/operator_pw/veff_pw.h b/source/source_pw/module_pwdft/operator_pw/veff_pw.h
index 3c2024edc5..3568189470 100644
--- a/source/source_pw/module_pwdft/operator_pw/veff_pw.h
+++ b/source/source_pw/module_pwdft/operator_pw/veff_pw.h
@@ -27,6 +27,7 @@ class Veff<OperatorPW<T, Device>> : public OperatorPW<T, Device>
   public:
     Veff(const int* isk_in,
          const Real* veff_in,
+         const int nspin_in,
          const int veff_row,
          const int veff_col,
          const ModulePW::PW_Basis_K* wfcpw_in);
@@ -65,9 +66,11 @@ class Veff<OperatorPW<T, Device>> : public OperatorPW<T, Device>
 
     int veff_col = 0;
     int veff_row = 0;
+    int nspin = 1;
     const Real *veff = nullptr, *h_veff = nullptr, *d_veff = nullptr;
     T *porter = nullptr;
     T *porter1 = nullptr;
+    mutable T* nspin_4_veff=nullptr;
     base_device::AbacusDevice_t device = {};
     using veff_op = veff_pw_op<Real, Device>;