diff --git a/source/module_hamilt_general/module_xc/xc_functional_gradcorr.cpp b/source/module_hamilt_general/module_xc/xc_functional_gradcorr.cpp
index 11b69a8580..4250146510 100644
--- a/source/module_hamilt_general/module_xc/xc_functional_gradcorr.cpp
+++ b/source/module_hamilt_general/module_xc/xc_functional_gradcorr.cpp
@@ -184,7 +184,7 @@ void XC_Functional::gradcorr(double &etxc, double &vtxc, ModuleBase::matrix &v,
 		}
 
 		gdr2 = new ModuleBase::Vector3<double>[rhopw->nrxx];
-		h2 = new ModuleBase::Vector3<double>[rhopw->nrxx];
+		if(!is_stress) h2 = new ModuleBase::Vector3<double>[rhopw->nrxx];
 
 		XC_Functional::grad_rho( rhogsum1 , gdr1, rhopw, ucell->tpiba);
 		XC_Functional::grad_rho( rhogsum2 , gdr2, rhopw, ucell->tpiba);
diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces_nl.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces_nl.cpp
index 04caefd1bf..92733745b4 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/forces_nl.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/forces_nl.cpp
@@ -45,7 +45,7 @@ void Forces<FPTYPE, Device>::cal_force_nl(ModuleBase::matrix& forcenl,
                 break;
             }
         }
-        const int npm = ucell_in.get_npol() * nbands_occ;
+        const int npm = nbands_occ;
         // calculate becp = <psi|beta> for all beta functions
         nl_tools.cal_becp(ik, npm);
         for (int ipol = 0; ipol < 3; ipol++)
diff --git a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp
index cc05bc8e52..4dbdbc558f 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp
@@ -108,12 +108,6 @@ void FS_Nonlocal_tools<FPTYPE, Device>::allocate_memory(const ModuleBase::matrix
         this->atom_na = h_atom_na.data();
         this->ppcell_vkb = this->nlpp_->vkb.c;
     }
-
-    // prepare the memory of the becp and dbecp:
-    // becp: <Beta(nkb,npw)|psi(nbnd,npw)>
-    // dbecp: <dBeta(nkb,npw)/dG|psi(nbnd,npw)>
-    resmem_complex_op()(this->ctx, becp, this->nbands * nkb, "Stress::becp");
-    resmem_complex_op()(this->ctx, dbecp, 6 * this->nbands * nkb, "Stress::dbecp");
 }
 
 template <typename FPTYPE, typename Device>
@@ -163,9 +157,12 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_becp(int ik, int npm)
 {
     ModuleBase::TITLE("FS_Nonlocal_tools", "cal_becp");
     ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_becp");
+    const int npol = this->ucell_->get_npol();
+    const int size_becp = this->nbands * npol * this->nkb;
+    const int size_becp_act = npm * npol * this->nkb;
     if (this->becp == nullptr)
     {
-        resmem_complex_op()(this->ctx, becp, this->nbands * this->nkb);
+        resmem_complex_op()(this->ctx, becp, size_becp);
     }
 
     // prepare math tools
@@ -249,11 +246,12 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_becp(int ik, int npm)
     }
     const char transa = 'C';
     const char transb = 'N';
+    int npm_npol = npm * npol;
     gemm_op()(this->ctx,
               transa,
               transb,
               nkb,
-              npm,
+              npm_npol,
               npw,
               &ModuleBase::ONE,
               ppcell_vkb,
@@ -268,15 +266,15 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_becp(int ik, int npm)
     if (this->device == base_device::GpuDevice)
     {
         std::complex<FPTYPE>* h_becp = nullptr;
-        resmem_complex_h_op()(this->cpu_ctx, h_becp, this->nbands * nkb);
-        syncmem_complex_d2h_op()(this->cpu_ctx, this->ctx, h_becp, becp, this->nbands * nkb);
-        Parallel_Reduce::reduce_pool(h_becp, this->nbands * nkb);
-        syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, becp, h_becp, this->nbands * nkb);
+        resmem_complex_h_op()(this->cpu_ctx, h_becp, size_becp_act);
+        syncmem_complex_d2h_op()(this->cpu_ctx, this->ctx, h_becp, becp, size_becp_act);
+        Parallel_Reduce::reduce_pool(h_becp, size_becp_act);
+        syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, becp, h_becp, size_becp_act);
         delmem_complex_h_op()(this->cpu_ctx, h_becp);
     }
     else
     {
-        Parallel_Reduce::reduce_pool(becp, this->nbands * this->nkb);
+        Parallel_Reduce::reduce_pool(becp, size_becp_act);
     }
     ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_becp");
 }
@@ -287,9 +285,12 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_dbecp_s(int ik, int npm, int ipol, i
 {
     ModuleBase::TITLE("FS_Nonlocal_tools", "cal_dbecp_s");
     ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_dbecp_s");
+    const int npol = this->ucell_->get_npol();
+    const int size_becp = this->nbands * npol * this->nkb;
+    const int npm_npol = npm * npol;
     if (this->dbecp == nullptr)
     {
-        resmem_complex_op()(this->ctx, dbecp, this->nbands * this->nkb);
+        resmem_complex_op()(this->ctx, dbecp, size_becp);
     }
 
     // prepare math tools
@@ -401,7 +402,7 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_dbecp_s(int ik, int npm, int ipol, i
               transa,
               transb,
               nkb,
-              npm,
+              npm_npol,
               npw,
               &ModuleBase::ONE,
               ppcell_vkb,
@@ -412,43 +413,68 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_dbecp_s(int ik, int npm, int ipol, i
               dbecp,
               nkb);
     // calculate stress for target (ipol, jpol)
-    const int current_spin = this->kv_->isk[ik];
-    cal_stress_nl_op()(this->ctx,
-                       nondiagonal,
-                       ipol,
-                       jpol,
-                       nkb,
-                       npm,
-                       this->ntype,
-                       current_spin, // uspp only
-                       this->nbands,
-                       ik,
-                       this->nlpp_->deeq.getBound2(),
-                       this->nlpp_->deeq.getBound3(),
-                       this->nlpp_->deeq.getBound4(),
-                       atom_nh,
-                       atom_na,
-                       d_wg,
-                       d_ekb,
-                       qq_nt,
-                       deeq,
-                       becp,
-                       dbecp,
-                       stress);
+    if(npol == 1)
+    {
+        const int current_spin = this->kv_->isk[ik];
+        cal_stress_nl_op()(this->ctx,
+                        nondiagonal,
+                        ipol,
+                        jpol,
+                        nkb,
+                        npm,
+                        this->ntype,
+                        current_spin, // uspp only
+                        this->nlpp_->deeq.getBound2(),
+                        this->nlpp_->deeq.getBound3(),
+                        this->nlpp_->deeq.getBound4(),
+                        atom_nh,
+                        atom_na,
+                        d_wg + this->nbands * ik,
+                        d_ekb + this->nbands * ik,
+                        qq_nt,
+                        deeq,
+                        becp,
+                        dbecp,
+                        stress);
+    }
+    else
+    {
+        cal_stress_nl_op()(this->ctx,
+                           ipol,
+                           jpol,
+                           nkb,
+                           npm,
+                           this->ntype,
+                           this->nlpp_->deeq_nc.getBound2(),
+                           this->nlpp_->deeq_nc.getBound3(),
+                           this->nlpp_->deeq_nc.getBound4(),
+                           atom_nh,
+                           atom_na,
+                           d_wg + this->nbands * ik,
+                           d_ekb + this->nbands * ik,
+                           qq_nt,
+                           this->nlpp_->template get_deeq_nc_data<FPTYPE>(),
+                           becp,
+                           dbecp,
+                           stress);
+    }
     ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_dbecp_s");
 }
 
 template <typename FPTYPE, typename Device>
 void FS_Nonlocal_tools<FPTYPE, Device>::cal_dbecp_f(int ik, int npm, int ipol)
 {
-    ModuleBase::TITLE("FS_Nonlocal_tools", "cal_dbecp_s");
+    ModuleBase::TITLE("FS_Nonlocal_tools", "cal_dbecp_f");
     ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_dbecp_f");
+    const int npol = this->ucell_->get_npol();
+    const int npm_npol = npm * npol;
+    const int size_becp = this->nbands * npol * this->nkb;
     if (this->dbecp == nullptr)
     {
-        resmem_complex_op()(this->ctx, dbecp, 3 * this->nbands * this->nkb);
+        resmem_complex_op()(this->ctx, dbecp, 3 * size_becp);
     }
 
-    std::complex<FPTYPE>* dbecp_ptr = this->dbecp + ipol * this->nbands * this->nkb;
+    std::complex<FPTYPE>* dbecp_ptr = this->dbecp + ipol * size_becp;
     const std::complex<FPTYPE>* vkb_ptr = this->ppcell_vkb;
     std::complex<FPTYPE>* vkb_deri_ptr = this->ppcell_vkb;
 
@@ -481,7 +507,7 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_dbecp_f(int ik, int npm, int ipol)
               transa,
               transb,
               this->nkb,
-              npm,
+              npm_npol,
               npw,
               &ModuleBase::ONE,
               vkb_deri_ptr,
@@ -491,7 +517,6 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_dbecp_f(int ik, int npm, int ipol)
               &ModuleBase::ZERO,
               dbecp_ptr,
               nkb);
-
     this->revert_vkb(npw, ipol);
     this->pre_ik_f = ik;
     ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_dbecp_f");
@@ -634,29 +659,52 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_force(int ik, int npm, FPTYPE* force
     const int current_spin = this->kv_->isk[ik];
     const int force_nc = 3;
     // calculate the force
-    cal_force_nl_op<FPTYPE, Device>()(this->ctx,
-                                      nondiagonal,
-                                      npm,
-                                      this->nbands,
-                                      this->ntype,
-                                      current_spin,
-                                      this->nlpp_->deeq.getBound2(),
-                                      this->nlpp_->deeq.getBound3(),
-                                      this->nlpp_->deeq.getBound4(),
-                                      force_nc,
-                                      this->nbands,
-                                      ik,
-                                      nkb,
-                                      atom_nh,
-                                      atom_na,
-                                      this->ucell_->tpiba,
-                                      d_wg,
-                                      d_ekb,
-                                      qq_nt,
-                                      deeq,
-                                      becp,
-                                      dbecp,
-                                      force);
+    if(this->ucell_->get_npol() == 1)
+    {
+        cal_force_nl_op<FPTYPE, Device>()(this->ctx,
+                                        nondiagonal,
+                                        npm,
+                                        this->ntype,
+                                        current_spin,
+                                        this->nlpp_->deeq.getBound2(),
+                                        this->nlpp_->deeq.getBound3(),
+                                        this->nlpp_->deeq.getBound4(),
+                                        force_nc,
+                                        this->nbands,
+                                        nkb,
+                                        atom_nh,
+                                        atom_na,
+                                        this->ucell_->tpiba,
+                                        d_wg + this->nbands * ik,
+                                        d_ekb + this->nbands * ik,
+                                        qq_nt,
+                                        deeq,
+                                        becp,
+                                        dbecp,
+                                        force);
+    }
+    else
+    {
+        cal_force_nl_op<FPTYPE, Device>()(this->ctx,
+                                          npm,
+                                          this->ntype,
+                                          this->nlpp_->deeq_nc.getBound2(),
+                                          this->nlpp_->deeq_nc.getBound3(),
+                                          this->nlpp_->deeq_nc.getBound4(),
+                                          force_nc,
+                                          this->nbands,
+                                          nkb,
+                                          atom_nh,
+                                          atom_na,
+                                          this->ucell_->tpiba,
+                                          d_wg + this->nbands * ik,
+                                          d_ekb + this->nbands * ik,
+                                          qq_nt,
+                                          this->nlpp_->template get_deeq_nc_data<FPTYPE>(),
+                                          becp,
+                                          dbecp,
+                                          force);
+    }
 }
 
 // template instantiation
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
index f8df7b3029..dddf889de1 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
@@ -35,7 +35,6 @@ __global__ void cal_vkb1_nl(
 template <typename FPTYPE>
 __global__ void cal_force_nl(
         const bool nondiagonal,
-        const int wg_nc,
         const int ntype,
         const int spin,
         const int deeq_2,
@@ -43,7 +42,6 @@ __global__ void cal_force_nl(
         const int deeq_4,
         const int forcenl_nc,
         const int nbands,
-        const int ik,
         const int nkb,
         const int *atom_nh,
         const int *atom_na,
@@ -65,11 +63,11 @@ __global__ void cal_force_nl(
         sum += atom_na[ii] * atom_nh[ii];
     }
 
-    int Nprojs = atom_nh[it];
-    FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba;
-    FPTYPE ekb_now = d_ekb[ik * wg_nc + ib];
+    int nproj = atom_nh[it];
+    FPTYPE fac = d_wg[ib] * 2.0 * tpiba;
+    FPTYPE ekb_now = d_ekb[ib];
     for (int ia = 0; ia < atom_na[it]; ia++) {
-        for (int ip = threadIdx.x; ip < Nprojs; ip += blockDim.x) {
+        for (int ip = threadIdx.x; ip < nproj; ip += blockDim.x) {
             // FPTYPE ps = GlobalC::ppcell.deeq[spin, iat, ip, ip];
             FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip]
                         - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip];
@@ -85,8 +83,8 @@ __global__ void cal_force_nl(
             }
 
             if (nondiagonal) {
-                //for (int ip2=0; ip2<Nprojs; ip2++)
-                for (int ip2 = 0; ip2 < Nprojs; ip2++) {
+                //for (int ip2=0; ip2<nproj; ip2++)
+                for (int ip2 = 0; ip2 < nproj; ip2++) {
                     if (ip != ip2) {
                         const int jnkb = sum + ip2;
                         ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2]
@@ -101,7 +99,7 @@ __global__ void cal_force_nl(
             }
         }
         iat += 1;
-        sum += Nprojs;
+        sum += nproj;
     }
 }
 
@@ -134,7 +132,6 @@ template <typename FPTYPE>
 void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* ctx,
                                                                   const bool& nondiagonal,
                                                                   const int& nbands_occ,
-                                                                  const int& wg_nc,
                                                                   const int& ntype,
                                                                   const int& spin,
                                                                   const int& deeq_2,
@@ -142,7 +139,6 @@ void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_dev
                                                                   const int& deeq_4,
                                                                   const int& forcenl_nc,
                                                                   const int& nbands,
-                                                                  const int& ik,
                                                                   const int& nkb,
                                                                   const int* atom_nh,
                                                                   const int* atom_na,
@@ -157,9 +153,9 @@ void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_dev
 {
     cal_force_nl<FPTYPE><<<nbands_occ * ntype, THREADS_PER_BLOCK>>>(
             nondiagonal,
-            wg_nc, ntype, spin,
+            ntype, spin,
             deeq_2, deeq_3, deeq_4,
-            forcenl_nc, nbands, ik, nkb,
+            forcenl_nc, nbands, nkb,
             atom_nh, atom_na,
             tpiba,
             d_wg, d_ekb, qq_nt, deeq,
@@ -170,6 +166,106 @@ void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_dev
     cudaCheckOnDebug();
 }
 
+template <typename FPTYPE>
+__global__ void cal_force_nl(
+        const int ntype,
+        const int deeq_2,
+        const int deeq_3,
+        const int deeq_4,
+        const int forcenl_nc,
+        const int nbands,
+        const int nkb,
+        const int *atom_nh,
+        const int *atom_na,
+        const FPTYPE tpiba,
+        const FPTYPE *d_wg,
+        const FPTYPE* d_ekb,
+        const FPTYPE* qq_nt,
+        const thrust::complex<FPTYPE> *deeq_nc,
+        const thrust::complex<FPTYPE> *becp,
+        const thrust::complex<FPTYPE> *dbecp,
+        FPTYPE *force)
+{
+    const int ib = blockIdx.x / ntype;
+    const int ib2 = ib * 2;
+    const int it = blockIdx.x % ntype;
+
+    int iat = 0, sum = 0;
+    for (int ii = 0; ii < it; ii++) {
+        iat += atom_na[ii];
+        sum += atom_na[ii] * atom_nh[ii];
+    }
+
+    int nproj = atom_nh[it];
+    FPTYPE fac = d_wg[ib] * 2.0 * tpiba;
+    FPTYPE ekb_now = d_ekb[ib];
+    for (int ia = 0; ia < atom_na[it]; ia++) {
+        for (int ip = threadIdx.x; ip < nproj; ip += blockDim.x) {
+            const int inkb = sum + ip;
+            for (int ip2 = 0; ip2 < nproj; ip2++) 
+            {
+                // Effective values of the D-eS coefficients
+                const thrust::complex<FPTYPE> ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2];
+                const int jnkb = sum + ip2;
+                const thrust::complex<FPTYPE> ps0 = deeq_nc[((0 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq;
+                const thrust::complex<FPTYPE> ps1 = deeq_nc[((1 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2];
+                const thrust::complex<FPTYPE> ps2 = deeq_nc[((2 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2];
+                const thrust::complex<FPTYPE> ps3 = deeq_nc[((3 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq;
+
+                for (int ipol = 0; ipol < 3; ipol++) {
+                    const int index0 = ipol * nbands * 2 * nkb + ib2 * nkb + inkb;
+                    const int index1 = ib2 * nkb + jnkb;
+                    const thrust::complex<FPTYPE> dbb0 = conj(dbecp[index0]) * becp[index1];
+                    const thrust::complex<FPTYPE> dbb1 = conj(dbecp[index0]) * becp[index1 + nkb];
+                    const thrust::complex<FPTYPE> dbb2 = conj(dbecp[index0 + nkb]) * becp[index1];
+                    const thrust::complex<FPTYPE> dbb3 = conj(dbecp[index0 + nkb]) * becp[index1 + nkb];
+                    const FPTYPE tmp = - fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real();
+                    atomicAdd(force + iat * forcenl_nc + ipol, tmp);
+                }
+            }
+        }
+        iat += 1;
+        sum += nproj;
+    }
+}
+
+// interface for nspin=4 only
+template <typename FPTYPE>
+void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* ctx,
+                    const int& nbands_occ,
+                    const int& ntype,
+                    const int& deeq_2,
+                    const int& deeq_3,
+                    const int& deeq_4,
+                    const int& forcenl_nc,
+                    const int& nbands,
+                    const int& nkb,
+                    const int* atom_nh,
+                    const int* atom_na,
+                    const FPTYPE& tpiba,
+                    const FPTYPE* d_wg,
+                    const FPTYPE* d_ekb,
+                    const FPTYPE* qq_nt,
+                    const std::complex<FPTYPE>* deeq_nc,
+                    const std::complex<FPTYPE>* becp,
+                    const std::complex<FPTYPE>* dbecp,
+                    FPTYPE* force)
+{
+    cal_force_nl<FPTYPE><<<nbands_occ * ntype, THREADS_PER_BLOCK>>>(
+            ntype,
+            deeq_2, deeq_3, deeq_4,
+            forcenl_nc, nbands, nkb,
+            atom_nh, atom_na,
+            tpiba,
+            d_wg, d_ekb, qq_nt, 
+            reinterpret_cast<const thrust::complex<FPTYPE>*>(deeq_nc),
+            reinterpret_cast<const thrust::complex<FPTYPE>*>(becp),
+            reinterpret_cast<const thrust::complex<FPTYPE>*>(dbecp),
+            force);// array of data
+
+    cudaCheckOnDebug();
+}
+
 template <typename FPTYPE>
 __global__ void saveVkbValues_(
     const int *gcar_zero_ptrs, 
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/nonlocal_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/nonlocal_op.cu
index 275fbfe136..89a74f3c5f 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/nonlocal_op.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/nonlocal_op.cu
@@ -54,8 +54,8 @@ __global__ void nonlocal_pw(
     thrust::complex<FPTYPE>* ps,
     const thrust::complex<FPTYPE>* becp)
 {
-  const int ii = blockIdx.x / l2;
-  const int jj = blockIdx.x % l2;
+  const int ii = blockIdx.x * 2 / l2;
+  const int jj = blockIdx.x * 2 % l2;
   for (int kk = threadIdx.x; kk < l3; kk += blockDim.x) {
     thrust::complex<FPTYPE> res1(0.0, 0.0);
     thrust::complex<FPTYPE> res2(0.0, 0.0);
@@ -121,7 +121,7 @@ void hamilt::nonlocal_pw_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const b
 {
   // denghui implement 20221109
   // <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-  nonlocal_pw<FPTYPE><<<l1 * l2, THREADS_PER_BLOCK>>>(
+  nonlocal_pw<FPTYPE><<<l1 * l2 / 2, THREADS_PER_BLOCK>>>(
     l1, l2, l3, // loop size
     sum, iat, nkb,   // control params
     deeq_x, deeq_y, deeq_z,
@@ -138,4 +138,4 @@ void hamilt::nonlocal_pw_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const b
 template struct nonlocal_pw_op<float, base_device::DEVICE_GPU>;
 template struct nonlocal_pw_op<double, base_device::DEVICE_GPU>;
 
-}  // namespace hamilt
\ No newline at end of file
+}  // namespace hamilt
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu
index e965446d77..e566126876 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu
@@ -107,8 +107,6 @@ __global__ void cal_stress_nl(
         const int nkb,
         const int ntype,
         const int spin,
-        const int wg_nc,
-        const int ik,
         const int deeq_2,
         const int deeq_3,
         const int deeq_4,
@@ -125,22 +123,25 @@ __global__ void cal_stress_nl(
     int ib = blockIdx.x / ntype;
     int it = blockIdx.x % ntype;
 
-    int iat = 0, sum = 0;
+    int iat = 0;
+    int sum = 0;
     for (int ii = 0; ii < it; ii++) {
         iat += atom_na[ii];
         sum += atom_na[ii] * atom_nh[ii];
     }
 
-    FPTYPE stress_var = 0, fac = d_wg[ik * wg_nc + ib] * 1.0, ekb_now = d_ekb[ik * wg_nc + ib];
-    const int Nprojs = atom_nh[it];
+    FPTYPE stress_var = 0;
+    const FPTYPE fac = d_wg[ib];
+    const FPTYPE ekb_now = d_ekb[ib];
+    const int nproj = atom_nh[it];
     for (int ia = 0; ia < atom_na[it]; ia++)
     {
-        for (int ii = threadIdx.x; ii < Nprojs * Nprojs; ii += blockDim.x) {
-            int ip1 = ii / Nprojs, ip2 = ii % Nprojs;
+        for (int ii = threadIdx.x; ii < nproj * nproj; ii += blockDim.x) {
+            const int ip1 = ii / nproj, ip2 = ii % nproj;
             if(!nondiagonal && ip1 != ip2) {
                 continue;
             }
-            FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip1) * deeq_4 + ip2]
+            const FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip1) * deeq_4 + ip2]
                         - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2];
             const int inkb1 = sum + ip1;
             const int inkb2 = sum + ip2;
@@ -149,7 +150,7 @@ __global__ void cal_stress_nl(
             stress_var -= ps * fac * dbb;
         }
         ++iat;
-        sum+=Nprojs;
+        sum+=nproj;
     }//ia
     __syncwarp();
     warp_reduce(stress_var);
@@ -204,8 +205,6 @@ void cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_de
                                                                    const int& nbands_occ,
                                                                    const int& ntype,
                                                                    const int& spin,
-                                                                   const int& wg_nc,
-                                                                   const int& ik,
                                                                    const int& deeq_2,
                                                                    const int& deeq_3,
                                                                    const int& deeq_4,
@@ -226,8 +225,6 @@ void cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_de
              nkb,
              ntype,
              spin,
-             wg_nc,
-             ik,
              deeq_2,
              deeq_3,
              deeq_4,
@@ -244,6 +241,110 @@ void cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_de
     cudaCheckOnDebug();
 }
 
+template <typename FPTYPE>
+__global__ void cal_stress_nl(
+        const int ipol,
+        const int jpol,
+        const int nkb,
+        const int ntype,
+        const int deeq_2,
+        const int deeq_3,
+        const int deeq_4,
+        const int *atom_nh,
+        const int *atom_na,
+        const FPTYPE *d_wg,
+        const FPTYPE* d_ekb,
+        const FPTYPE* qq_nt,
+        const thrust::complex<FPTYPE> *deeq_nc,
+        const thrust::complex<FPTYPE> *becp,
+        const thrust::complex<FPTYPE> *dbecp,
+        FPTYPE *stress)
+{
+    const int ib = blockIdx.x / ntype; // index of loop-nbands
+    const int ib2  = ib * 2;
+    const int it = blockIdx.x % ntype; // index of loop-ntype
+
+    int iat = 0; // calculate the begin of atomic index
+    int sum = 0; // calculate the begin of atomic-orbital index
+    for (int ii = 0; ii < it; ii++) {
+        iat += atom_na[ii];
+        sum += atom_na[ii] * atom_nh[ii];
+    }
+
+    FPTYPE stress_var = 0;
+    const FPTYPE fac = d_wg[ib];
+    const FPTYPE ekb_now = d_ekb[ib];
+    const int nproj = atom_nh[it];
+    for (int ia = 0; ia < atom_na[it]; ia++)
+    {
+        for (int ii = threadIdx.x; ii < nproj * nproj; ii += blockDim.x) {
+            const int ip1 = ii / nproj;
+	        const int ip2 = ii % nproj;
+            const thrust::complex<FPTYPE> ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2];
+            const thrust::complex<FPTYPE> ps0 = deeq_nc[((iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq;
+            const thrust::complex<FPTYPE> ps1 = deeq_nc[((1 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2];
+            const thrust::complex<FPTYPE> ps2 = deeq_nc[((2 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2];
+            const thrust::complex<FPTYPE> ps3 = deeq_nc[((3 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq;
+            const int inkb1 = sum + ip1;
+            const int inkb2 = sum + ip2;
+            //out<<"\n ps = "<<ps;
+            const thrust::complex<FPTYPE> dbb0 = conj(dbecp[ib2 * nkb + inkb1]) * becp[ib2 * nkb + inkb2];
+            const thrust::complex<FPTYPE> dbb1 = conj(dbecp[ib2 * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2];
+            const thrust::complex<FPTYPE> dbb2 = conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[ib2 * nkb + inkb2];
+            const thrust::complex<FPTYPE> dbb3 = conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2];
+            stress_var -= fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real();
+        }
+        ++iat;
+        sum+=nproj;
+    }//ia
+    __syncwarp();
+    warp_reduce(stress_var);
+    if (threadIdx.x % WARP_SIZE == 0) {
+        atomicAdd(stress + ipol * 3 + jpol, stress_var);
+    }
+}
+
+template <typename FPTYPE>
+void cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* ctx,
+                                                                   const int& ipol,
+                                                                   const int& jpol,
+                                                                   const int& nkb,
+                                                                   const int& nbands_occ,
+                                                                   const int& ntype,
+                                                                   const int& deeq_2,
+                                                                   const int& deeq_3,
+                                                                   const int& deeq_4,
+                                                                   const int* atom_nh,
+                                                                   const int* atom_na,
+                                                                   const FPTYPE* d_wg,
+                                                                   const FPTYPE* d_ekb,
+                                                                   const FPTYPE* qq_nt,
+                                                                   const std::complex<FPTYPE>* deeq_nc,
+                                                                   const std::complex<FPTYPE>* becp,
+                                                                   const std::complex<FPTYPE>* dbecp,
+                                                                   FPTYPE* stress)
+{
+     cal_stress_nl<FPTYPE><<<nbands_occ * ntype, THREADS_PER_BLOCK>>>(
+             ipol,
+             jpol,
+             nkb,
+             ntype,
+             deeq_2,
+             deeq_3,
+             deeq_4,
+             atom_nh,
+             atom_na,
+             d_wg,
+             d_ekb,
+             qq_nt,
+             reinterpret_cast<const thrust::complex<FPTYPE>*>(deeq_nc),
+             reinterpret_cast<const thrust::complex<FPTYPE>*>(becp),
+             reinterpret_cast<const thrust::complex<FPTYPE>*>(dbecp),
+             stress);// array of data
+
+    cudaCheckOnDebug();
+}
+
 template <typename T, typename Device>
 void cal_stress_mgga_op<T, Device>::operator()(
     const int& spin,
@@ -710,34 +811,6 @@ void cal_force_npw_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
 }
 
 
-// template <typename FPTYPE>
-// void prepare_vkb_deri_ptr_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
-//         const base_device::DEVICE_GPU* ctx,
-//         int nbeta, double* nhtol, int nhtol_nc, int npw, int it,
-//         int ipol, int jpol,
-//         std::complex<FPTYPE>*vkb_out, std::complex<FPTYPE>** vkb_ptrs,
-//         FPTYPE* ylm_in, FPTYPE** ylm_ptrs,
-//         FPTYPE* ylm_deri_in, FPTYPE** ylm_deri_ptr1s, FPTYPE** ylm_deri_ptr2s,
-//         FPTYPE* vq_in, FPTYPE** vq_ptrs,
-//         FPTYPE* vq_deri_in, FPTYPE** vq_deri_ptrs
-//     )
-// {
-//     const int block = (npw + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
-//     dim3 gridsize(block,nbeta);
-
-
-//     prepare_vkb_deri_ptr<FPTYPE><<<1,1>>>(
-//         nbeta, nhtol, nhtol_nc, npw, it, ipol, jpol,
-//         reinterpret_cast<const thrust::complex<FPTYPE>*>(vkb_out), 
-//         reinterpret_cast<const thrust::complex<FPTYPE>*>(vkb_ptrs), 
-//         ylm_in, ylm_ptrs, ylm_deri_in, ylm_deri_ptr1s, ylm_deri_ptr2s,
-//         vq_in, vq_ptrs, vq_deri_in, vq_deri_ptrs
-//     );
-
-//     return ;
-// }
-
-
 template <>
 void pointer_array_malloc<base_device::DEVICE_GPU>::operator()(
     void **ptr,
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
index 1584c5c681..2ee9cb4530 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
@@ -42,7 +42,6 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_CPU>
     void operator()(const base_device::DEVICE_CPU* ctx,
                     const bool& nondiagonal,
                     const int& nbands_occ,
-                    const int& wg_nc,
                     const int& ntype,
                     const int& spin,
                     const int& deeq_2,
@@ -50,7 +49,6 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_CPU>
                     const int& deeq_4,
                     const int& forcenl_nc,
                     const int& nbands,
-                    const int& ik,
                     const int& nkb,
                     const int* atom_nh,
                     const int* atom_na,
@@ -71,7 +69,7 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_CPU>
             int sum0 = 0;
             for (int it = 0; it < ntype; it++)
             {
-                const int Nprojs = atom_nh[it];
+                const int nproj = atom_nh[it];
 #ifdef _OPENMP
 #pragma omp for collapse(2)
 #endif
@@ -80,11 +78,11 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_CPU>
                     for (int ib = 0; ib < nbands_occ; ib++)
                     {
                         FPTYPE local_force[3] = {0, 0, 0};
-                        FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba;
-                        FPTYPE ekb_now = d_ekb[ik * wg_nc + ib];
+                        FPTYPE fac = d_wg[ib] * 2.0 * tpiba;
+                        FPTYPE ekb_now = d_ekb[ib];
                         int iat = iat0 + ia;
-                        int sum = sum0 + ia * Nprojs;
-                        for (int ip = 0; ip < Nprojs; ip++)
+                        int sum = sum0 + ia * nproj;
+                        for (int ip = 0; ip < nproj; ip++)
                         {
                             // Effective values of the D-eS coefficients
                             FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip]
@@ -102,7 +100,7 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_CPU>
                             }
                             if (nondiagonal)
                             {
-                                for (int ip2 = 0; ip2 < Nprojs; ip2++)
+                                for (int ip2 = 0; ip2 < nproj; ip2++)
                                 {
                                     if (ip != ip2)
                                     {
@@ -141,7 +139,104 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_CPU>
                     }
                 } // end ia
                 iat0 += atom_na[it];
-                sum0 += atom_na[it] * Nprojs;
+                sum0 += atom_na[it] * nproj;
+            } // end it
+#ifdef _OPENMP
+        }
+#endif
+    };
+
+    void operator()(const base_device::DEVICE_CPU* ctx,
+                    const int& nbands_occ,
+                    const int& ntype,
+                    const int& deeq_2,
+                    const int& deeq_3,
+                    const int& deeq_4,
+                    const int& forcenl_nc,
+                    const int& nbands,
+                    const int& nkb,
+                    const int* atom_nh,
+                    const int* atom_na,
+                    const FPTYPE& tpiba,
+                    const FPTYPE* d_wg,
+                    const FPTYPE* d_ekb,
+                    const FPTYPE* qq_nt,
+                    const std::complex<FPTYPE>* deeq_nc,
+                    const std::complex<FPTYPE>* becp,
+                    const std::complex<FPTYPE>* dbecp,
+                    FPTYPE* force)
+    {
+#ifdef _OPENMP
+#pragma omp parallel
+        {
+#endif
+            int iat0 = 0;
+            int sum0 = 0;
+            for (int it = 0; it < ntype; it++)
+            {
+                const int nprojs = atom_nh[it];
+#ifdef _OPENMP
+#pragma omp for collapse(2)
+#endif
+                for (int ia = 0; ia < atom_na[it]; ia++)
+                {
+                    for (int ib = 0; ib < nbands_occ; ib++)
+                    {
+                        const int ib2 = ib*2;
+                        FPTYPE local_force[3] = {0, 0, 0};
+                        FPTYPE fac = d_wg[ib] * 2.0 * tpiba;
+                        FPTYPE ekb_now = d_ekb[ib];
+                        int iat = iat0 + ia;
+                        int sum = sum0 + ia * nprojs;
+                        for (int ip = 0; ip < nprojs; ip++)
+                        {
+                            const int inkb = sum + ip;
+                            // out<<"\n ps = "<<ps;
+                            for (int ip2 = 0; ip2 < nprojs; ip2++)
+                            {
+                                // Effective values of the D-eS coefficients
+                                std::complex<FPTYPE> ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2];
+                                const int jnkb = sum + ip2;
+                                std::complex<FPTYPE> ps0 = deeq_nc[((0 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq;
+                                std::complex<FPTYPE> ps1 = deeq_nc[((1 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2];
+                                std::complex<FPTYPE> ps2 = deeq_nc[((2 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2];
+                                std::complex<FPTYPE> ps3 = deeq_nc[((3 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq;
+                                
+
+                                for (int ipol = 0; ipol < 3; ipol++)
+                                {
+                                    const int index0 = ipol * nbands * 2 * nkb + ib2 * nkb + inkb;
+                                    const int index1 = ib2 * nkb + jnkb;
+                                    const std::complex<FPTYPE> dbb0 = conj(dbecp[index0]) * becp[index1];
+                                    const std::complex<FPTYPE> dbb1 = conj(dbecp[index0]) * becp[index1 + nkb];
+                                    const std::complex<FPTYPE> dbb2 = conj(dbecp[index0 + nkb]) * becp[index1];
+                                    const std::complex<FPTYPE> dbb3 = conj(dbecp[index0 + nkb]) * becp[index1 + nkb];
+
+                                    local_force[ipol] -= fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real();
+                                }
+                            }
+                        }
+#ifdef _OPENMP
+                        if (omp_get_num_threads() > 1)
+                        {
+                            for (int ipol = 0; ipol < 3; ++ipol)
+                            {
+#pragma omp atomic
+                                force[iat * forcenl_nc + ipol] += local_force[ipol];
+                            }
+                        }
+                        else
+#endif
+                        {
+                            for (int ipol = 0; ipol < 3; ++ipol)
+                            {
+                                force[iat * forcenl_nc + ipol] += local_force[ipol];
+                            }
+                        }
+                    }
+                } // end ia
+                iat0 += atom_na[it];
+                sum0 += atom_na[it] * nprojs;
             } // end it
 #ifdef _OPENMP
         }
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
index 885997222c..c68df7bd0f 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
@@ -48,7 +48,6 @@ struct cal_force_nl_op
     /// @param ctx - which device this function runs on
     /// @param nondiagonal - control flag
     /// @param nbands_occ - number of occupied bands
-    /// @param wg_nc - the second dimension of matrix wg
     /// @param ntype - total atomic type
     /// @param spin - current spin
     /// @param deeq_2 - the second dimension of deeq
@@ -56,7 +55,6 @@ struct cal_force_nl_op
     /// @param deeq_4 - the forth dimension of deeq
     /// @param forcenl_nc - the second dimension of matrix forcenl
     /// @param nbands - NBANDS
-    /// @param ik - current k point
     /// @param nkb - number of k point
     /// @param atom_nh - GlobalC::ucell.atoms[ii].ncpp.nh
     /// @param atom_na - GlobalC::ucell.atoms[ii].na
@@ -73,7 +71,6 @@ struct cal_force_nl_op
     void operator()(const base_device::DEVICE_CPU* ctx,
                     const bool& nondiagonal,
                     const int& nbands_occ,
-                    const int& wg_nc,
                     const int& ntype,
                     const int& spin,
                     const int& deeq_2,
@@ -81,7 +78,6 @@ struct cal_force_nl_op
                     const int& deeq_4,
                     const int& forcenl_nc,
                     const int& nbands,
-                    const int& ik,
                     const int& nkb,
                     const int* atom_nh,
                     const int* atom_na,
@@ -93,6 +89,26 @@ struct cal_force_nl_op
                     const std::complex<FPTYPE>* becp,
                     const std::complex<FPTYPE>* dbecp,
                     FPTYPE* force);
+    // interface for nspin=4 only
+    void operator()(const base_device::DEVICE_CPU* ctx,
+                    const int& nbands_occ,
+                    const int& ntype,
+                    const int& deeq_2,
+                    const int& deeq_3,
+                    const int& deeq_4,
+                    const int& forcenl_nc,
+                    const int& nbands,
+                    const int& nkb,
+                    const int* atom_nh,
+                    const int* atom_na,
+                    const FPTYPE& tpiba,
+                    const FPTYPE* d_wg,
+                    const FPTYPE* d_ekb,
+                    const FPTYPE* qq_nt,
+                    const std::complex<FPTYPE>* deeq_nc,
+                    const std::complex<FPTYPE>* becp,
+                    const std::complex<FPTYPE>* dbecp,
+                    FPTYPE* force);
 };
 
 #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
@@ -117,7 +133,6 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>
     void operator()(const base_device::DEVICE_GPU* ctx,
                     const bool& nondiagonal,
                     const int& nbands_occ,
-                    const int& wg_nc,
                     const int& ntype,
                     const int& spin,
                     const int& deeq_2,
@@ -125,7 +140,6 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>
                     const int& deeq_4,
                     const int& forcenl_nc,
                     const int& nbands,
-                    const int& ik,
                     const int& nkb,
                     const int* atom_nh,
                     const int* atom_na,
@@ -137,6 +151,26 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>
                     const std::complex<FPTYPE>* becp,
                     const std::complex<FPTYPE>* dbecp,
                     FPTYPE* force);
+    // interface for nspin=4 only
+    void operator()(const base_device::DEVICE_GPU* ctx,
+                    const int& nbands_occ,
+                    const int& ntype,
+                    const int& deeq_2,
+                    const int& deeq_3,
+                    const int& deeq_4,
+                    const int& forcenl_nc,
+                    const int& nbands,
+                    const int& nkb,
+                    const int* atom_nh,
+                    const int* atom_na,
+                    const FPTYPE& tpiba,
+                    const FPTYPE* d_wg,
+                    const FPTYPE* d_ekb,
+                    const FPTYPE* qq_nt,
+                    const std::complex<FPTYPE>* deeq_nc,
+                    const std::complex<FPTYPE>* becp,
+                    const std::complex<FPTYPE>* dbecp,
+                    FPTYPE* force);
 };
 
 /**
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu
index fa03fa2a5d..da45a375e7 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu
@@ -31,7 +31,6 @@ __global__ void cal_vkb1_nl(
 template <typename FPTYPE>
 __global__ void cal_force_nl(
         const bool nondiagonal,
-        const int wg_nc,
         const int ntype,
         const int spin,
         const int deeq_2,
@@ -39,7 +38,6 @@ __global__ void cal_force_nl(
         const int deeq_4,
         const int forcenl_nc,
         const int nbands,
-        const int ik,
         const int nkb,
         const int *atom_nh,
         const int *atom_na,
@@ -55,17 +53,18 @@ __global__ void cal_force_nl(
     const int ib = blockIdx.x / ntype;
     const int it = blockIdx.x % ntype;
 
-    int iat = 0, sum = 0;
+    int iat = 0;
+    int sum = 0;
     for (int ii = 0; ii < it; ii++) {
         iat += atom_na[ii];
         sum += atom_na[ii] * atom_nh[ii];
     }
 
-    int Nprojs = atom_nh[it];
-    FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba;
-    FPTYPE ekb_now = d_ekb[ik * wg_nc + ib];
+    int nproj = atom_nh[it];
+    FPTYPE fac = d_wg[ib] * 2.0 * tpiba;
+    FPTYPE ekb_now = d_ekb[ib];
     for (int ia = 0; ia < atom_na[it]; ia++) {
-        for (int ip = threadIdx.x; ip < Nprojs; ip += blockDim.x) {
+        for (int ip = threadIdx.x; ip < nproj; ip += blockDim.x) {
             // FPTYPE ps = GlobalC::ppcell.deeq[spin, iat, ip, ip];
             FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip]
                         - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip];
@@ -81,8 +80,8 @@ __global__ void cal_force_nl(
             }
 
             if (nondiagonal) {
-                //for (int ip2=0; ip2<Nprojs; ip2++)
-                for (int ip2 = 0; ip2 < Nprojs; ip2++) {
+                //for (int ip2=0; ip2<nproj; ip2++)
+                for (int ip2 = 0; ip2 < nproj; ip2++) {
                     if (ip != ip2) {
                         const int jnkb = sum + ip2;
                         ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2]
@@ -97,7 +96,7 @@ __global__ void cal_force_nl(
             }
         }
         iat += 1;
-        sum += Nprojs;
+        sum += nproj;
     }
 }
 
@@ -130,7 +129,6 @@ template <typename FPTYPE>
 void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* ctx,
                                                                   const bool& nondiagonal,
                                                                   const int& nbands_occ,
-                                                                  const int& wg_nc,
                                                                   const int& ntype,
                                                                   const int& spin,
                                                                   const int& deeq_2,
@@ -138,7 +136,6 @@ void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_dev
                                                                   const int& deeq_4,
                                                                   const int& forcenl_nc,
                                                                   const int& nbands,
-                                                                  const int& ik,
                                                                   const int& nkb,
                                                                   const int* atom_nh,
                                                                   const int* atom_na,
@@ -153,9 +150,9 @@ void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_dev
 {
     hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_force_nl<FPTYPE>), dim3(nbands_occ * ntype), dim3(THREADS_PER_BLOCK), 0, 0,
             nondiagonal,
-            wg_nc, ntype, spin,
+            ntype, spin,
             deeq_2, deeq_3, deeq_4,
-            forcenl_nc, nbands, ik, nkb,
+            forcenl_nc, nbands, nkb,
             atom_nh, atom_na,
             tpiba,
             d_wg, d_ekb, qq_nt, deeq,
@@ -166,6 +163,107 @@ void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_dev
     hipCheckOnDebug();
 }
 
+template <typename FPTYPE>
+__global__ void cal_force_nl(
+        const int ntype,
+        const int deeq_2,
+        const int deeq_3,
+        const int deeq_4,
+        const int forcenl_nc,
+        const int nbands,
+        const int nkb,
+        const int *atom_nh,
+        const int *atom_na,
+        const FPTYPE tpiba,
+        const FPTYPE *d_wg,
+        const FPTYPE* d_ekb,
+        const FPTYPE* qq_nt,
+        const thrust::complex<FPTYPE> *deeq_nc,
+        const thrust::complex<FPTYPE> *becp,
+        const thrust::complex<FPTYPE> *dbecp,
+        FPTYPE *force)
+{
+    const int ib = blockIdx.x / ntype;
+    const int ib2 = ib * 2;
+    const int it = blockIdx.x % ntype;
+
+    int iat = 0;
+    int sum = 0;
+    for (int ii = 0; ii < it; ii++) {
+        iat += atom_na[ii];
+        sum += atom_na[ii] * atom_nh[ii];
+    }
+
+    int nproj = atom_nh[it];
+    FPTYPE fac = d_wg[ib] * 2.0 * tpiba;
+    FPTYPE ekb_now = d_ekb[ib];
+    for (int ia = 0; ia < atom_na[it]; ia++) {
+        for (int ip = threadIdx.x; ip < nproj; ip += blockDim.x) {
+            const int inkb = sum + ip;
+            for (int ip2 = 0; ip2 < nproj; ip2++) 
+            {
+                // Effective values of the D-eS coefficients
+                const thrust::complex<FPTYPE> ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2];
+                const int jnkb = sum + ip2;
+                const thrust::complex<FPTYPE> ps0 = deeq_nc[((0 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq;
+                const thrust::complex<FPTYPE> ps1 = deeq_nc[((1 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2];
+                const thrust::complex<FPTYPE> ps2 = deeq_nc[((2 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2];
+                const thrust::complex<FPTYPE> ps3 = deeq_nc[((3 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq;
+
+                for (int ipol = 0; ipol < 3; ipol++) {
+                    const int index0 = ipol * nbands * 2 * nkb + ib2 * nkb + inkb;
+                    const int index1 = ib2 * nkb + jnkb;
+                    const thrust::complex<FPTYPE> dbb0 = conj(dbecp[index0]) * becp[index1];
+                    const thrust::complex<FPTYPE> dbb1 = conj(dbecp[index0]) * becp[index1 + nkb];
+                    const thrust::complex<FPTYPE> dbb2 = conj(dbecp[index0 + nkb]) * becp[index1];
+                    const thrust::complex<FPTYPE> dbb3 = conj(dbecp[index0 + nkb]) * becp[index1 + nkb];
+                    const FPTYPE tmp = - fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real();
+                    atomicAdd(force + iat * forcenl_nc + ipol, tmp);
+                }
+            }
+        }
+        iat += 1;
+        sum += nproj;
+    }
+}
+
+// interface for nspin=4 only
+template <typename FPTYPE>
+void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* ctx,
+                    const int& nbands_occ,
+                    const int& ntype,
+                    const int& deeq_2,
+                    const int& deeq_3,
+                    const int& deeq_4,
+                    const int& forcenl_nc,
+                    const int& nbands,
+                    const int& nkb,
+                    const int* atom_nh,
+                    const int* atom_na,
+                    const FPTYPE& tpiba,
+                    const FPTYPE* d_wg,
+                    const FPTYPE* d_ekb,
+                    const FPTYPE* qq_nt,
+                    const std::complex<FPTYPE>* deeq_nc,
+                    const std::complex<FPTYPE>* becp,
+                    const std::complex<FPTYPE>* dbecp,
+                    FPTYPE* force)
+{
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_force_nl<FPTYPE>), dim3(nbands_occ * ntype), dim3(THREADS_PER_BLOCK), 0, 0,
+            ntype,
+            deeq_2, deeq_3, deeq_4,
+            forcenl_nc, nbands, nkb,
+            atom_nh, atom_na,
+            tpiba,
+            d_wg, d_ekb, qq_nt, 
+            reinterpret_cast<const thrust::complex<FPTYPE>*>(deeq_nc),
+            reinterpret_cast<const thrust::complex<FPTYPE>*>(becp),
+            reinterpret_cast<const thrust::complex<FPTYPE>*>(dbecp),
+            force);// array of data
+
+    hipCheckOnDebug();
+}
+
 template <typename FPTYPE>
 __global__ void saveVkbValues_(
     const int *gcar_zero_ptrs, 
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/nonlocal_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/nonlocal_op.hip.cu
index 93df9fe58d..9a43f64ea6 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/nonlocal_op.hip.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/nonlocal_op.hip.cu
@@ -54,8 +54,8 @@ __global__ void nonlocal_pw(
     thrust::complex<FPTYPE>* ps,
     const thrust::complex<FPTYPE>* becp)
 {
-  const int ii = blockIdx.x / l2;
-  const int jj = blockIdx.x % l2;
+  const int ii = blockIdx.x * 2 / l2;
+  const int jj = blockIdx.x * 2 % l2;
   for (int kk = threadIdx.x; kk < l3; kk += blockDim.x) {
     thrust::complex<FPTYPE> res1(0.0, 0.0);
     thrust::complex<FPTYPE> res2(0.0, 0.0);
@@ -122,7 +122,7 @@ void hamilt::nonlocal_pw_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const b
 {
   // denghui implement 20221109
   // <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-  hipLaunchKernelGGL(HIP_KERNEL_NAME(nonlocal_pw<FPTYPE>), dim3(l1 * l2), dim3(THREADS_PER_BLOCK), 0, 0,
+  hipLaunchKernelGGL(HIP_KERNEL_NAME(nonlocal_pw<FPTYPE>), dim3(l1 * l2 / 2), dim3(THREADS_PER_BLOCK), 0, 0,
     l1, l2, l3, // loop size
     sum, iat, nkb,   // control params
     deeq_x, deeq_y, deeq_z,
@@ -140,4 +140,4 @@ void hamilt::nonlocal_pw_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const b
 namespace hamilt{
 template struct nonlocal_pw_op<float, base_device::DEVICE_GPU>;
 template struct nonlocal_pw_op<double, base_device::DEVICE_GPU>;
-}
\ No newline at end of file
+}
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu
index 30a735ee57..6880987546 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu
@@ -102,8 +102,6 @@ __global__ void cal_stress_nl(
         const int nkb,
         const int ntype,
         const int spin,
-        const int wg_nc,
-        const int ik,
         const int deeq_2,
         const int deeq_3,
         const int deeq_4,
@@ -120,18 +118,21 @@ __global__ void cal_stress_nl(
     int ib = blockIdx.x / ntype;
     int it = blockIdx.x % ntype;
 
-    int iat = 0, sum = 0;
+    int iat = 0;
+    int sum = 0;
     for (int ii = 0; ii < it; ii++) {
         iat += atom_na[ii];
         sum += atom_na[ii] * atom_nh[ii];
     }
 
-    FPTYPE stress_var = 0, fac = d_wg[ik * wg_nc + ib] * 1.0, ekb_now = d_ekb[ik * wg_nc + ib];
-    const int Nprojs = atom_nh[it];
+    FPTYPE stress_var = 0;
+    FPTYPE fac = d_wg[ib];
+    FPTYPE ekb_now = d_ekb[ib];
+    const int nproj = atom_nh[it];
     for (int ia = 0; ia < atom_na[it]; ia++)
     {
-        for (int ii = threadIdx.x; ii < Nprojs * Nprojs; ii += blockDim.x) {
-            int ip1 = ii / Nprojs, ip2 = ii % Nprojs;
+        for (int ii = threadIdx.x; ii < nproj * nproj; ii += blockDim.x) {
+            int ip1 = ii / nproj, ip2 = ii % nproj;
             if(!nondiagonal && ip1 != ip2) {
                 continue;
             }
@@ -144,7 +145,7 @@ __global__ void cal_stress_nl(
             stress_var -= ps * fac * dbb;
         }
         ++iat;
-        sum+=Nprojs;
+        sum+=nproj;
     }//ia
     warp_reduce(stress_var);
     if (threadIdx.x % WARP_SIZE == 0) {
@@ -198,8 +199,6 @@ void cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_de
                                                                    const int& nbands_occ,
                                                                    const int& ntype,
                                                                    const int& spin,
-                                                                   const int& wg_nc,
-                                                                   const int& ik,
                                                                    const int& deeq_2,
                                                                    const int& deeq_3,
                                                                    const int& deeq_4,
@@ -220,8 +219,6 @@ void cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_de
              nkb,
              ntype,
              spin,
-             wg_nc,
-             ik,
              deeq_2,
              deeq_3,
              deeq_4,
@@ -238,6 +235,110 @@ void cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_de
     hipCheckOnDebug();
 }
 
+template <typename FPTYPE>
+__global__ void cal_stress_nl(
+        const int ipol,
+        const int jpol,
+        const int nkb,
+        const int ntype,
+        const int deeq_2,
+        const int deeq_3,
+        const int deeq_4,
+        const int *atom_nh,
+        const int *atom_na,
+        const FPTYPE *d_wg,
+        const FPTYPE* d_ekb,
+        const FPTYPE* qq_nt,
+        const thrust::complex<FPTYPE> *deeq_nc,
+        const thrust::complex<FPTYPE> *becp,
+        const thrust::complex<FPTYPE> *dbecp,
+        FPTYPE *stress)
+{
+    int ib = blockIdx.x / ntype;
+    const int ib2  = ib * 2;
+    int it = blockIdx.x % ntype;
+
+    int iat = 0;
+    int sum = 0;
+    for (int ii = 0; ii < it; ii++) {
+        iat += atom_na[ii];
+        sum += atom_na[ii] * atom_nh[ii];
+    }
+
+    FPTYPE stress_var = 0;
+    FPTYPE fac = d_wg[ib];
+    FPTYPE ekb_now = d_ekb[ib];
+    const int nproj = atom_nh[it];
+    for (int ia = 0; ia < atom_na[it]; ia++)
+    {
+        for (int ii = threadIdx.x; ii < nproj * nproj; ii += blockDim.x) {
+            const int ip1 = ii / nproj;
+            const int ip2 = ii % nproj;
+            const thrust::complex<FPTYPE> ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2];
+            const thrust::complex<FPTYPE> ps0 = deeq_nc[((iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq;
+            const thrust::complex<FPTYPE> ps1 = deeq_nc[((1 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2];
+            const thrust::complex<FPTYPE> ps2 = deeq_nc[((2 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2];
+            const thrust::complex<FPTYPE> ps3 = deeq_nc[((3 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq;
+            const int inkb1 = sum + ip1;
+            const int inkb2 = sum + ip2;
+            //out<<"\n ps = "<<ps;
+            const thrust::complex<FPTYPE> dbb0 = conj(dbecp[ib2 * nkb + inkb1]) * becp[ib2 * nkb + inkb2];
+            const thrust::complex<FPTYPE> dbb1 = conj(dbecp[ib2 * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2];
+            const thrust::complex<FPTYPE> dbb2 = conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[ib2 * nkb + inkb2];
+            const thrust::complex<FPTYPE> dbb3 = conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2];
+            stress_var -= fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real();
+        }
+        ++iat;
+        sum+=nproj;
+    }//ia
+    __syncwarp();
+    warp_reduce(stress_var);
+    if (threadIdx.x % WARP_SIZE == 0) {
+        atomicAdd(stress + ipol * 3 + jpol, stress_var);
+    }
+}
+
+template <typename FPTYPE>
+void cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* ctx,
+                                                                   const int& ipol,
+                                                                   const int& jpol,
+                                                                   const int& nkb,
+                                                                   const int& nbands_occ,
+                                                                   const int& ntype,
+                                                                   const int& deeq_2,
+                                                                   const int& deeq_3,
+                                                                   const int& deeq_4,
+                                                                   const int* atom_nh,
+                                                                   const int* atom_na,
+                                                                   const FPTYPE* d_wg,
+                                                                   const FPTYPE* d_ekb,
+                                                                   const FPTYPE* qq_nt,
+                                                                   const std::complex<FPTYPE>* deeq_nc,
+                                                                   const std::complex<FPTYPE>* becp,
+                                                                   const std::complex<FPTYPE>* dbecp,
+                                                                   FPTYPE* stress)
+{
+     hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_stress_nl<FPTYPE>), dim3(nbands_occ * ntype), dim3(THREADS_PER_BLOCK), 0, 0,
+             ipol,
+             jpol,
+             nkb,
+             ntype,
+             deeq_2,
+             deeq_3,
+             deeq_4,
+             atom_nh,
+             atom_na,
+             d_wg,
+             d_ekb,
+             qq_nt,
+             reinterpret_cast<const thrust::complex<FPTYPE>*>(deeq_nc),
+             reinterpret_cast<const thrust::complex<FPTYPE>*>(becp),
+             reinterpret_cast<const thrust::complex<FPTYPE>*>(dbecp),
+             stress);// array of data
+
+    hipCheckOnDebug();
+}
+
 template <typename T, typename Device>
 void cal_stress_mgga_op<T, Device>::operator()(
     const int& spin,
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp
index c3a8008e04..f36f2e8316 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp
@@ -77,8 +77,6 @@ struct cal_stress_nl_op<FPTYPE, base_device::DEVICE_CPU>
                     const int& nbands_occ,
                     const int& ntype,
                     const int& spin,
-                    const int& wg_nc,
-                    const int& ik,
                     const int& deeq_2,
                     const int& deeq_3,
                     const int& deeq_4,
@@ -97,10 +95,11 @@ struct cal_stress_nl_op<FPTYPE, base_device::DEVICE_CPU>
 #pragma omp parallel reduction(+ : local_stress)
         {
 #endif
-            int iat = 0, sum = 0;
+            int iat = 0;
+            int sum = 0;
             for (int it = 0; it < ntype; it++)
             {
-                const int Nprojs = atom_nh[it];
+                const int nproj = atom_nh[it];
 #ifdef _OPENMP
 #pragma omp for collapse(4)
 #endif
@@ -108,20 +107,20 @@ struct cal_stress_nl_op<FPTYPE, base_device::DEVICE_CPU>
                 {
                     for (int ia = 0; ia < atom_na[it]; ia++)
                     {
-                        for (int ip1 = 0; ip1 < Nprojs; ip1++)
+                        for (int ip1 = 0; ip1 < nproj; ip1++)
                         {
-                            for (int ip2 = 0; ip2 < Nprojs; ip2++)
+                            for (int ip2 = 0; ip2 < nproj; ip2++)
                             {
                                 if (!nondiagonal && ip1 != ip2)
                                 {
                                     continue;
                                 }
-                                FPTYPE fac = d_wg[ik * wg_nc + ib] * 1.0;
-                                FPTYPE ekb_now = d_ekb[ik * wg_nc + ib];
+                                FPTYPE fac = d_wg[ib] * 1.0;
+                                FPTYPE ekb_now = d_ekb[ib];
                                 FPTYPE ps = deeq[((spin * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2]
                                             - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2];
-                                const int inkb1 = sum + ia * Nprojs + ip1;
-                                const int inkb2 = sum + ia * Nprojs + ip2;
+                                const int inkb1 = sum + ia * nproj + ip1;
+                                const int inkb2 = sum + ia * nproj + ip2;
                                 // out<<"\n ps = "<<ps;
 
                                 const FPTYPE dbb = (conj(dbecp[ib * nkb + inkb1]) * becp[ib * nkb + inkb2]).real();
@@ -130,7 +129,76 @@ struct cal_stress_nl_op<FPTYPE, base_device::DEVICE_CPU>
                         } // end ip
                     }     // ia
                 }
-                sum += atom_na[it] * Nprojs;
+                sum += atom_na[it] * nproj;
+                iat += atom_na[it];
+            } // end it
+#ifdef _OPENMP
+        }
+#endif
+        stress[ipol * 3 + jpol] += local_stress;
+    };
+
+    void operator()(const base_device::DEVICE_CPU* ctx,
+                    const int& ipol,
+                    const int& jpol,
+                    const int& nkb,
+                    const int& nbands_occ,
+                    const int& ntype,
+                    const int& deeq_2,
+                    const int& deeq_3,
+                    const int& deeq_4,
+                    const int* atom_nh,
+                    const int* atom_na,
+                    const FPTYPE* d_wg,
+                    const FPTYPE* d_ekb,
+                    const FPTYPE* qq_nt,
+                    const std::complex<FPTYPE>* deeq_nc,
+                    const std::complex<FPTYPE>* becp,
+                    const std::complex<FPTYPE>* dbecp,
+                    FPTYPE* stress)
+    {
+        FPTYPE local_stress = 0;
+#ifdef _OPENMP
+#pragma omp parallel reduction(+ : local_stress)
+        {
+#endif
+            int iat = 0, sum = 0;
+            for (int it = 0; it < ntype; it++)
+            {
+                const int nproj = atom_nh[it];
+#ifdef _OPENMP
+#pragma omp for collapse(4)
+#endif
+                for (int ib = 0; ib < nbands_occ; ib++)
+                {
+                    for (int ia = 0; ia < atom_na[it]; ia++)
+                    {
+                        for (int ip1 = 0; ip1 < nproj; ip1++)
+                        {
+                            for (int ip2 = 0; ip2 < nproj; ip2++)
+                            {
+                                const int ib2 = ib*2;
+                                FPTYPE fac = d_wg[ib] * 1.0;
+                                FPTYPE ekb_now = d_ekb[ib];
+                                std::complex<FPTYPE> ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2];
+                                std::complex<FPTYPE> ps0 = deeq_nc[((iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq;
+                                std::complex<FPTYPE> ps1 = deeq_nc[((1 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2];
+                                std::complex<FPTYPE> ps2 = deeq_nc[((2 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2];
+                                std::complex<FPTYPE> ps3 = deeq_nc[((3 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq;
+                                const int inkb1 = sum + ia * nproj + ip1;
+                                const int inkb2 = sum + ia * nproj + ip2;
+                                // out<<"\n ps = "<<ps;
+
+                                const std::complex<FPTYPE> dbb0 = conj(dbecp[ib2 * nkb + inkb1]) * becp[ib2 * nkb + inkb2];
+                                const std::complex<FPTYPE> dbb1 = conj(dbecp[ib2 * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2];
+                                const std::complex<FPTYPE> dbb2 = conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[ib2 * nkb + inkb2];
+                                const std::complex<FPTYPE> dbb3 = conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2];
+                                local_stress -= fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real();
+                            }
+                        } // end ip
+                    }     // ia
+                }
+                sum += atom_na[it] * nproj;
                 iat += atom_na[it];
             } // end it
 #ifdef _OPENMP
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h
index 3603b16cac..2f4bbd0989 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h
@@ -66,8 +66,6 @@ struct cal_stress_nl_op
     /// @param nbands_occ - number of occupied bands
     /// @param ntype - total atomic type
     /// @param spin - current spin
-    /// @param wg_nc - the second dimension of matrix wg
-    /// @param ik - current k point
     /// @param deeq_2 - the second dimension of deeq
     /// @param deeq_3 - the third dimension of deeq
     /// @param deeq_4 - the forth dimension of deeq
@@ -90,8 +88,6 @@ struct cal_stress_nl_op
                     const int& nbands_occ,
                     const int& ntype,
                     const int& spin,
-                    const int& wg_nc,
-                    const int& ik,
                     const int& deeq_2,
                     const int& deeq_3,
                     const int& deeq_4,
@@ -104,6 +100,25 @@ struct cal_stress_nl_op
                     const std::complex<FPTYPE>* becp,
                     const std::complex<FPTYPE>* dbecp,
                     FPTYPE* stress);
+    // interface for nspin=4 only
+    void operator()(const Device* ctx,
+                    const int& ipol,
+                    const int& jpol,
+                    const int& nkb,
+                    const int& nbands_occ,
+                    const int& ntype,
+                    const int& deeq_2,
+                    const int& deeq_3,
+                    const int& deeq_4,
+                    const int* atom_nh,
+                    const int* atom_na,
+                    const FPTYPE* d_wg,
+                    const FPTYPE* d_ekb,
+                    const FPTYPE* qq_nt,
+                    const std::complex<FPTYPE>* deeq_nc,
+                    const std::complex<FPTYPE>* becp,
+                    const std::complex<FPTYPE>* dbecp,
+                    FPTYPE* stress);
 };
 
 template <typename T, typename Device>
@@ -237,8 +252,6 @@ struct cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>
                     const int& nbands_occ,
                     const int& ntype,
                     const int& spin,
-                    const int& wg_nc,
-                    const int& ik,
                     const int& deeq_2,
                     const int& deeq_3,
                     const int& deeq_4,
@@ -251,6 +264,25 @@ struct cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>
                     const std::complex<FPTYPE>* becp,
                     const std::complex<FPTYPE>* dbecp,
                     FPTYPE* stress);
+    // interface for nspin=4 only
+    void operator()(const base_device::DEVICE_GPU* ctx,
+                    const int& ipol,
+                    const int& jpol,
+                    const int& nkb,
+                    const int& nbands_occ,
+                    const int& ntype,
+                    const int& deeq_2,
+                    const int& deeq_3,
+                    const int& deeq_4,
+                    const int* atom_nh,
+                    const int* atom_na,
+                    const FPTYPE* d_wg,
+                    const FPTYPE* d_ekb,
+                    const FPTYPE* qq_nt,
+                    const std::complex<FPTYPE>* deeq_nc,
+                    const std::complex<FPTYPE>* becp,
+                    const std::complex<FPTYPE>* dbecp,
+                    FPTYPE* stress);
 };
 
 // cpu version first, gpu version later
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/force_op_test.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/force_op_test.cpp
index a791fd933b..3da3ca3b19 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/force_op_test.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/force_op_test.cpp
@@ -2887,7 +2887,6 @@ TEST_F(TestSrcPWForceMultiDevice, cal_force_nl_op_cpu)
     hamilt::cal_force_nl_op<double, base_device::DEVICE_CPU>()(cpu_ctx,
                                                                multi_proj,
                                                                nbands_occ,
-                                                               wg_nc,
                                                                ntype,
                                                                spin,
                                                                deeq_2,
@@ -2895,7 +2894,6 @@ TEST_F(TestSrcPWForceMultiDevice, cal_force_nl_op_cpu)
                                                                deeq_4,
                                                                forcenl_nc,
                                                                nbands,
-                                                               ik,
                                                                nkb,
                                                                atom_nh.data(),
                                                                atom_na.data(),
@@ -2980,7 +2978,6 @@ TEST_F(TestSrcPWForceMultiDevice, cal_force_nl_op_gpu)
     hamilt::cal_force_nl_op<double, base_device::DEVICE_GPU>()(gpu_ctx,
                                                                multi_proj,
                                                                nbands_occ,
-                                                               wg_nc,
                                                                ntype,
                                                                spin,
                                                                deeq_2,
@@ -2988,7 +2985,6 @@ TEST_F(TestSrcPWForceMultiDevice, cal_force_nl_op_gpu)
                                                                deeq_4,
                                                                forcenl_nc,
                                                                nbands,
-                                                               ik,
                                                                nkb,
                                                                d_atom_nh,
                                                                d_atom_na,
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/stress_op_test.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/stress_op_test.cpp
index 81cd5de858..4f84fa098d 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/stress_op_test.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/stress_op_test.cpp
@@ -96,8 +96,6 @@ TEST(TestSrcPWStressMultiDevice, cal_stress_nl_op_cpu)
                                                                 nbands_occ,
                                                                 ntype,
                                                                 spin,
-                                                                wg_nc,
-                                                                ik,
                                                                 deeq_2,
                                                                 deeq_3,
                                                                 deeq_4,
@@ -271,8 +269,6 @@ TEST(TestSrcPWStressMultiDevice, cal_stress_nl_op_gpu)
                                                                 nbands_occ,
                                                                 ntype,
                                                                 spin,
-                                                                wg_nc,
-                                                                ik,
                                                                 deeq_2,
                                                                 deeq_3,
                                                                 deeq_4,
diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp
index 48a212d3ef..489e1fe0f8 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp
@@ -30,8 +30,9 @@ void Stress_Func<FPTYPE, Device>::stress_kin(ModuleBase::matrix& sigma,
 	int npwx=0;
     for (int ik = 0; ik < p_kv->get_nks(); ik++)
     {
-        if (npwx < p_kv->ngk[ik])
+        if (npwx < p_kv->ngk[ik]) {
             npwx = p_kv->ngk[ik];
+}
     }
 
     gk[0]= new FPTYPE[npwx];
@@ -40,6 +41,7 @@ void Stress_Func<FPTYPE, Device>::stress_kin(ModuleBase::matrix& sigma,
     FPTYPE tpiba = ModuleBase::TWO_PI / GlobalC::ucell.lat0;
     FPTYPE twobysqrtpi = 2.0 / std::sqrt(ModuleBase::PI);
     FPTYPE* kfac = new FPTYPE[npwx];
+    const int npol = GlobalC::ucell.get_npol();
 
     for (int ik = 0; ik < p_kv->get_nks(); ik++)
     {
@@ -72,8 +74,8 @@ void Stress_Func<FPTYPE, Device>::stress_kin(ModuleBase::matrix& sigma,
             {
                 for (int ibnd = 0; ibnd < PARAM.inp.nbands; ibnd++)
                 {
-                    if (std::fabs(wg(ik, ibnd)) < ModuleBase::threshold_wg * wg(ik, 0))
-                        continue;
+                    if (wg(ik, ibnd) == 0.0) { continue;
+}
                     const std::complex<FPTYPE>* ppsi = nullptr;
                     ppsi = &(psi_in[0](ik, ibnd, 0));
 
@@ -86,6 +88,18 @@ void Stress_Func<FPTYPE, Device>::stress_kin(ModuleBase::matrix& sigma,
                         sum += wg(ik, ibnd) * gk[l][i] * gk[m][i] * kfac[i]
                                * (FPTYPE((conj(ppsi[i]) * ppsi[i]).real()));
                     }
+                    if(npol == 2)
+                    {
+                        ppsi = &(psi_in[0](ik, ibnd, npwx));
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+ : sum)
+#endif
+                        for (int i = 0; i < npw; i++)
+                        {
+                            sum += wg(ik, ibnd) * gk[l][i] * gk[m][i] * kfac[i]
+                                   * (FPTYPE((conj(ppsi[i]) * ppsi[i]).real()));
+                        }
+                    }
                     s_kin[l][m] += sum;
                 }
             }
diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp
index 7a0d614f6f..d214c99ba1 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp
@@ -23,10 +23,10 @@ void Stress_Func<FPTYPE, Device>::stress_loc(ModuleBase::matrix& sigma,
 
 	const int nspin_rho = (PARAM.inp.nspin == 2) ? 2 : 1;
 
-	if (PARAM.globalv.gamma_only_pw && is_pw) { fact=2.0;
-}
-
-    
+	if (PARAM.inp.gamma_only && is_pw) 
+	{ 
+		fact=2.0;
+	}
 
 	std::vector<std::complex<FPTYPE>> aux(rho_basis->nmaxgr);
 
@@ -52,11 +52,11 @@ void Stress_Func<FPTYPE, Device>::stress_loc(ModuleBase::matrix& sigma,
 				aux[ir] = std::complex<FPTYPE>(chr->rho[0][ir], 0.0 );
 			}
 		}
-		for (int is = 1; is < nspin_rho; is++)
+		if(nspin_rho == 2)
 		{
 			for (int ir = irb; ir < ir_end; ++ir)
 			{ // accumulate aux
-				aux[ir] += std::complex<FPTYPE>(chr->rho[is][ir], 0.0 );
+				aux[ir] += std::complex<FPTYPE>(chr->rho[1][ir], 0.0 );
 			}
 		}
  	}
diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func_nl.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_func_nl.cpp
index d3cbb06885..5b177b37b6 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/stress_func_nl.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func_nl.cpp
@@ -49,7 +49,7 @@ void Stress_Func<FPTYPE, Device>::stress_nl(ModuleBase::matrix& sigma,
                 break;
             }
         }
-        const int npm = ucell_in.get_npol() * nbands_occ;
+        const int npm = nbands_occ;
 
         // calculate becp = <psi|beta> for all beta functions
         nl_tools.cal_becp(ik, npm);
diff --git a/tests/integrate/104_PW_NC_magnetic/INPUT b/tests/integrate/104_PW_NC_magnetic/INPUT
index 622af75b86..d761feb2c1 100644
--- a/tests/integrate/104_PW_NC_magnetic/INPUT
+++ b/tests/integrate/104_PW_NC_magnetic/INPUT
@@ -4,10 +4,9 @@ suffix    autotest
 #nbands   40
 
 calculation   scf
-nspin         2
-ecutwfc       15
-scf_thr           1.0e-8
-scf_nmax         200
+ecutwfc       20
+scf_thr           1.0e-6
+scf_nmax         50
 out_chg    0
 
 #init_chg    file
@@ -16,14 +15,17 @@ out_chg    0
 #out_band     1
 noncolin	1
 smearing_method    gaussian
-smearing_sigma       0.07
+smearing_sigma       0.02
 
 mixing_type    broyden 
-mixing_beta    0.5
+mixing_beta    0.2
 mixing_ndim 10
 
-ks_solver     cg
+ks_solver     dav_subspace
+pw_diag_ndim  2
 basis_type    pw
-symmetry      1
+symmetry      0
+cal_force     1
+cal_stress    1
 pseudo_dir	../../PP_ORB
 orbital_dir	../../PP_ORB
diff --git a/tests/integrate/104_PW_NC_magnetic/KPT b/tests/integrate/104_PW_NC_magnetic/KPT
index db31432445..28006d5e2d 100644
--- a/tests/integrate/104_PW_NC_magnetic/KPT
+++ b/tests/integrate/104_PW_NC_magnetic/KPT
@@ -1,4 +1,4 @@
 K_POINTS
 0
 Gamma
-1 1 1  0 0 0
+2 2 2  0 0 0
diff --git a/tests/integrate/104_PW_NC_magnetic/STRU b/tests/integrate/104_PW_NC_magnetic/STRU
index d1981a2504..5b61e75c50 100644
--- a/tests/integrate/104_PW_NC_magnetic/STRU
+++ b/tests/integrate/104_PW_NC_magnetic/STRU
@@ -18,5 +18,5 @@ Fe
 1.0
 2
 0.00 0.00 0.00 1 1 1 angle1 90 angle2 0 
-0.50 0.50 0.50 1 1 1 angle1 90 angle2 180
+0.51 0.52 0.53 1 1 1 angle1 90 angle2 0
 
diff --git a/tests/integrate/104_PW_NC_magnetic/jd b/tests/integrate/104_PW_NC_magnetic/jd
new file mode 100644
index 0000000000..a4292b9c28
--- /dev/null
+++ b/tests/integrate/104_PW_NC_magnetic/jd
@@ -0,0 +1 @@
+Test noncolliner-spin calculation for FM-Fe2 , test force and stress for k-222
diff --git a/tests/integrate/104_PW_NC_magnetic/result.ref b/tests/integrate/104_PW_NC_magnetic/result.ref
index 384804bb8d..21c5c36613 100644
--- a/tests/integrate/104_PW_NC_magnetic/result.ref
+++ b/tests/integrate/104_PW_NC_magnetic/result.ref
@@ -1,6 +1,5 @@
-etotref -5866.197307043378
-etotperatomref -2933.0986535217
-pointgroupref O_h
-spacegroupref O_h
-nksibzref 1
-totaltimeref 2.23
+etotref -6156.9375288905130219
+etotperatomref -3078.4687644453
+totalforceref 2.221910
+totalstressref 76009.325784
+totaltimeref 3.03
diff --git a/tests/integrate/140_PW_15_SO/INPUT b/tests/integrate/140_PW_15_SO/INPUT
index fc09132c7c..1c58840435 100644
--- a/tests/integrate/140_PW_15_SO/INPUT
+++ b/tests/integrate/140_PW_15_SO/INPUT
@@ -25,12 +25,16 @@ ecutwfc             20
 scf_thr                 1e-6
 scf_nmax               100
 
+cal_force           1
+cal_stress          1
+
 #cal_stress             1
 #noncolin           1
 lspinorb            1
 
 basis_type          pw
-ks_solver           dav
+ks_solver           dav_subspace
+pw_diag_ndim        2
 chg_extrap       second-order
 out_dm              0
 pw_diag_thr                0.00001
diff --git a/tests/integrate/140_PW_15_SO/KPT b/tests/integrate/140_PW_15_SO/KPT
index c289c0158a..e769af7638 100644
--- a/tests/integrate/140_PW_15_SO/KPT
+++ b/tests/integrate/140_PW_15_SO/KPT
@@ -1,4 +1,4 @@
 K_POINTS
 0
 Gamma
-1 1 1 0 0 0
+2 1 1 0 0 0
diff --git a/tests/integrate/140_PW_15_SO/result.ref b/tests/integrate/140_PW_15_SO/result.ref
index 2f6201a31f..6bb325aacb 100644
--- a/tests/integrate/140_PW_15_SO/result.ref
+++ b/tests/integrate/140_PW_15_SO/result.ref
@@ -1,3 +1,5 @@
-etotref -1668.616833271131
-etotperatomref -834.3084166356
-totaltimeref 5.15
+etotref -1678.3650981686614614
+etotperatomref -839.1825490843
+totalforceref 1.739332
+totalstressref 34372.194072
+totaltimeref 0.99