From cda0e5d483e57eeb7a69b7a5d99a91a968f5e6ce Mon Sep 17 00:00:00 2001
From: dyzheng <zhengdy@bjaisi.com>
Date: Wed, 31 Jul 2024 13:09:54 +0800
Subject: [PATCH 1/8] Fix: noncollinear-spin force and stress with PW

---
 .../module_xc/xc_functional_gradcorr.cpp      |   2 +-
 .../hamilt_pwdft/forces_nl.cpp                |   2 +-
 .../hamilt_pwdft/fs_nonlocal_tools.cpp        | 138 +++++++++++++-----
 .../hamilt_pwdft/kernels/force_op.cpp         | 102 +++++++++++++
 .../hamilt_pwdft/kernels/force_op.h           |  23 +++
 .../hamilt_pwdft/kernels/stress_op.cpp        |  76 ++++++++++
 .../hamilt_pwdft/kernels/stress_op.h          |  22 +++
 .../hamilt_pwdft/stress_func_kin.cpp          |  16 +-
 .../hamilt_pwdft/stress_func_loc.cpp          |   4 +-
 .../hamilt_pwdft/stress_func_nl.cpp           |   2 +-
 10 files changed, 343 insertions(+), 44 deletions(-)
diff --git a/source/module_hamilt_general/module_xc/xc_functional_gradcorr.cpp b/source/module_hamilt_general/module_xc/xc_functional_gradcorr.cpp
index 4d94f68ec4..d1082c809b 100644
--- a/source/module_hamilt_general/module_xc/xc_functional_gradcorr.cpp
+++ b/source/module_hamilt_general/module_xc/xc_functional_gradcorr.cpp
@@ -186,7 +186,7 @@ void XC_Functional::gradcorr(double &etxc, double &vtxc, ModuleBase::matrix &v,
 		}
 
 		gdr2 = new ModuleBase::Vector3<double>[rhopw->nrxx];
-		h2 = new ModuleBase::Vector3<double>[rhopw->nrxx];
+		if(!is_stress) h2 = new ModuleBase::Vector3<double>[rhopw->nrxx];
 
 		XC_Functional::grad_rho( rhogsum1 , gdr1, rhopw, ucell->tpiba);
 		XC_Functional::grad_rho( rhogsum2 , gdr2, rhopw, ucell->tpiba);
diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces_nl.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces_nl.cpp
index 04caefd1bf..92733745b4 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/forces_nl.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/forces_nl.cpp
@@ -45,7 +45,7 @@ void Forces<FPTYPE, Device>::cal_force_nl(ModuleBase::matrix& forcenl,
                 break;
             }
         }
-        const int npm = ucell_in.get_npol() * nbands_occ;
+        const int npm = nbands_occ;
         // calculate becp = <psi|beta> for all beta functions
         nl_tools.cal_becp(ik, npm);
         for (int ipol = 0; ipol < 3; ipol++)
diff --git a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp
index f6773b3ddb..e189f937d9 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp
@@ -162,9 +162,12 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_becp(int ik, int npm)
 {
     ModuleBase::TITLE("FS_Nonlocal_tools", "cal_becp");
     ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_becp");
+    const int npol = this->ucell_->get_npol();
+    const int size_becp = this->nbands * npol * this->nkb;
+    const int size_becp_act = npm * npol * this->nkb;
     if (this->becp == nullptr)
     {
-        resmem_complex_op()(this->ctx, becp, this->nbands * this->nkb);
+        resmem_complex_op()(this->ctx, becp, size_becp);
     }
 
     // prepare math tools
@@ -248,11 +251,12 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_becp(int ik, int npm)
     }
     const char transa = 'C';
     const char transb = 'N';
+    int npm_npol = npm * npol;
     gemm_op()(this->ctx,
               transa,
               transb,
               nkb,
-              npm,
+              npm_npol,
               npw,
               &ModuleBase::ONE,
               ppcell_vkb,
@@ -267,15 +271,15 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_becp(int ik, int npm)
     if (this->device == base_device::GpuDevice)
     {
         std::complex<FPTYPE>* h_becp = nullptr;
-        resmem_complex_h_op()(this->cpu_ctx, h_becp, this->nbands * nkb);
-        syncmem_complex_d2h_op()(this->cpu_ctx, this->ctx, h_becp, becp, this->nbands * nkb);
-        Parallel_Reduce::reduce_pool(h_becp, this->nbands * nkb);
-        syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, becp, h_becp, this->nbands * nkb);
+        resmem_complex_h_op()(this->cpu_ctx, h_becp, size_becp_act);
+        syncmem_complex_d2h_op()(this->cpu_ctx, this->ctx, h_becp, becp, size_becp_act);
+        Parallel_Reduce::reduce_pool(h_becp, size_becp_act);
+        syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, becp, h_becp, size_becp_act);
         delmem_complex_h_op()(this->cpu_ctx, h_becp);
     }
     else
     {
-        Parallel_Reduce::reduce_pool(becp, this->nbands * this->nkb);
+        Parallel_Reduce::reduce_pool(becp, size_becp_act);
     }
     ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_becp");
 }
@@ -286,9 +290,12 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_dbecp_s(int ik, int npm, int ipol, i
 {
     ModuleBase::TITLE("FS_Nonlocal_tools", "cal_dbecp_s");
     ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_dbecp_s");
+    const int npol = this->ucell_->get_npol();
+    const int size_becp = this->nbands * npol * this->nkb;
+    const int npm_npol = npm * npol;
     if (this->dbecp == nullptr)
     {
-        resmem_complex_op()(this->ctx, dbecp, this->nbands * this->nkb);
+        resmem_complex_op()(this->ctx, dbecp, size_becp);
     }
 
     // prepare math tools
@@ -400,7 +407,7 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_dbecp_s(int ik, int npm, int ipol, i
               transa,
               transb,
               nkb,
-              npm,
+              npm_npol,
               npw,
               &ModuleBase::ONE,
               ppcell_vkb,
@@ -411,6 +418,8 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_dbecp_s(int ik, int npm, int ipol, i
               dbecp,
               nkb);
     // calculate stress for target (ipol, jpol)
+    if(npol == 1)
+    {
     const int current_spin = this->kv_->isk[ik];
     cal_stress_nl_op()(this->ctx,
                        nondiagonal,
@@ -434,20 +443,48 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_dbecp_s(int ik, int npm, int ipol, i
                        becp,
                        dbecp,
                        stress);
+    }
+    else
+    {
+        cal_stress_nl_op()(this->ctx,
+                           nondiagonal,
+                           ipol,
+                           jpol,
+                           nkb,
+                           npm,
+                           this->ntype,
+                           this->nbands,
+                           ik,
+                           this->nlpp_->deeq_nc.getBound2(),
+                           this->nlpp_->deeq_nc.getBound3(),
+                           this->nlpp_->deeq_nc.getBound4(),
+                           atom_nh,
+                           atom_na,
+                           d_wg,
+                           d_ekb,
+                           qq_nt,
+                           this->nlpp_->template get_deeq_nc_data<FPTYPE>(),
+                           becp,
+                           dbecp,
+                           stress);
+    }
     ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_dbecp_s");
 }
 
 template <typename FPTYPE, typename Device>
 void FS_Nonlocal_tools<FPTYPE, Device>::cal_dbecp_f(int ik, int npm, int ipol)
 {
-    ModuleBase::TITLE("FS_Nonlocal_tools", "cal_dbecp_s");
+    ModuleBase::TITLE("FS_Nonlocal_tools", "cal_dbecp_f");
     ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_dbecp_f");
+    const int npol = this->ucell_->get_npol();
+    const int npm_npol = npm * npol;
+    const int size_becp = this->nbands * npol * this->nkb;
     if (this->dbecp == nullptr)
     {
-        resmem_complex_op()(this->ctx, dbecp, 3 * this->nbands * this->nkb);
+        resmem_complex_op()(this->ctx, dbecp, 3 * size_becp);
     }
 
-    std::complex<FPTYPE>* dbecp_ptr = this->dbecp + ipol * this->nbands * this->nkb;
+    std::complex<FPTYPE>* dbecp_ptr = this->dbecp + ipol * size_becp;
     const std::complex<FPTYPE>* vkb_ptr = this->ppcell_vkb;
     std::complex<FPTYPE>* vkb_deri_ptr = this->ppcell_vkb;
 
@@ -480,7 +517,7 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_dbecp_f(int ik, int npm, int ipol)
               transa,
               transb,
               this->nkb,
-              npm,
+              npm_npol,
               npw,
               &ModuleBase::ONE,
               vkb_deri_ptr,
@@ -490,7 +527,6 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_dbecp_f(int ik, int npm, int ipol)
               &ModuleBase::ZERO,
               dbecp_ptr,
               nkb);
-
     this->revert_vkb(npw, ipol);
     this->pre_ik_f = ik;
     ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_dbecp_f");
@@ -633,29 +669,57 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_force(int ik, int npm, FPTYPE* force
     const int current_spin = this->kv_->isk[ik];
     const int force_nc = 3;
     // calculate the force
-    cal_force_nl_op<FPTYPE, Device>()(this->ctx,
-                                      nondiagonal,
-                                      npm,
-                                      this->nbands,
-                                      this->ntype,
-                                      current_spin,
-                                      this->nlpp_->deeq.getBound2(),
-                                      this->nlpp_->deeq.getBound3(),
-                                      this->nlpp_->deeq.getBound4(),
-                                      force_nc,
-                                      this->nbands,
-                                      ik,
-                                      nkb,
-                                      atom_nh,
-                                      atom_na,
-                                      this->ucell_->tpiba,
-                                      d_wg,
-                                      d_ekb,
-                                      qq_nt,
-                                      deeq,
-                                      becp,
-                                      dbecp,
-                                      force);
+    if(this->ucell_->get_npol() == 1)
+    {
+        cal_force_nl_op<FPTYPE, Device>()(this->ctx,
+                                        nondiagonal,
+                                        npm,
+                                        this->nbands,
+                                        this->ntype,
+                                        current_spin,
+                                        this->nlpp_->deeq.getBound2(),
+                                        this->nlpp_->deeq.getBound3(),
+                                        this->nlpp_->deeq.getBound4(),
+                                        force_nc,
+                                        this->nbands,
+                                        ik,
+                                        nkb,
+                                        atom_nh,
+                                        atom_na,
+                                        this->ucell_->tpiba,
+                                        d_wg,
+                                        d_ekb,
+                                        qq_nt,
+                                        deeq,
+                                        becp,
+                                        dbecp,
+                                        force);
+    }
+    else
+    {
+        cal_force_nl_op<FPTYPE, Device>()(this->ctx,
+                                          nondiagonal,
+                                          npm,
+                                          this->nbands,
+                                          this->ntype,
+                                          this->nlpp_->deeq_nc.getBound2(),
+                                          this->nlpp_->deeq_nc.getBound3(),
+                                          this->nlpp_->deeq_nc.getBound4(),
+                                          force_nc,
+                                          this->nbands,
+                                          ik,
+                                          nkb,
+                                          atom_nh,
+                                          atom_na,
+                                          this->ucell_->tpiba,
+                                          d_wg,
+                                          d_ekb,
+                                          qq_nt,
+                                          this->nlpp_->template get_deeq_nc_data<FPTYPE>(),
+                                          becp,
+                                          dbecp,
+                                          force);
+    }
 }
 
 // template instantiation
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
index 1584c5c681..041bfc3b19 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
@@ -145,6 +145,108 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_CPU>
             } // end it
 #ifdef _OPENMP
         }
+#endif
+    };
+
+    void operator()(const base_device::DEVICE_CPU* ctx,
+                    const bool& nondiagonal,
+                    const int& nbands_occ,
+                    const int& wg_nc,
+                    const int& ntype,
+                    const int& deeq_2,
+                    const int& deeq_3,
+                    const int& deeq_4,
+                    const int& forcenl_nc,
+                    const int& nbands,
+                    const int& ik,
+                    const int& nkb,
+                    const int* atom_nh,
+                    const int* atom_na,
+                    const FPTYPE& tpiba,
+                    const FPTYPE* d_wg,
+                    const FPTYPE* d_ekb,
+                    const FPTYPE* qq_nt,
+                    const std::complex<FPTYPE>* deeq_nc,
+                    const std::complex<FPTYPE>* becp,
+                    const std::complex<FPTYPE>* dbecp,
+                    FPTYPE* force)
+    {
+#ifdef _OPENMP
+#pragma omp parallel
+        {
+#endif
+            int iat0 = 0;
+            int sum0 = 0;
+            for (int it = 0; it < ntype; it++)
+            {
+                const int Nprojs = atom_nh[it];
+#ifdef _OPENMP
+#pragma omp for collapse(2)
+#endif
+                for (int ia = 0; ia < atom_na[it]; ia++)
+                {
+                    for (int ib = 0; ib < nbands_occ; ib++)
+                    {
+                        const int ib2 = ib*2;
+                        FPTYPE local_force[3] = {0, 0, 0};
+                        FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba;
+                        FPTYPE ekb_now = d_ekb[ik * wg_nc + ib];
+                        int iat = iat0 + ia;
+                        int sum = sum0 + ia * Nprojs;
+                        for (int ip = 0; ip < Nprojs; ip++)
+                        {
+                            const int inkb = sum + ip;
+                            // out<<"\n ps = "<<ps;
+                            for (int ip2 = 0; ip2 < Nprojs; ip2++)
+                            {
+                                if(ip2 != ip && !nondiagonal) continue;
+
+                                // Effective values of the D-eS coefficients
+                                std::complex<FPTYPE> ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2];
+                                const int jnkb = sum + ip2;
+                                std::complex<FPTYPE> ps0 = deeq_nc[((0 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq;
+                                std::complex<FPTYPE> ps1 = deeq_nc[((1 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2];
+                                std::complex<FPTYPE> ps2 = deeq_nc[((2 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2];
+                                std::complex<FPTYPE> ps3 = deeq_nc[((3 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq;
+                                
+
+                                for (int ipol = 0; ipol < 3; ipol++)
+                                {
+                                    const int index0 = ipol * nbands * 2 * nkb + ib2 * nkb + inkb;
+                                    const int index1 = ib2 * nkb + jnkb;
+                                    const std::complex<FPTYPE> dbb0 = conj(dbecp[index0]) * becp[index1];
+                                    const std::complex<FPTYPE> dbb1 = conj(dbecp[index0]) * becp[index1 + nkb];
+                                    const std::complex<FPTYPE> dbb2 = conj(dbecp[index0 + nkb]) * becp[index1];
+                                    const std::complex<FPTYPE> dbb3 = conj(dbecp[index0 + nkb]) * becp[index1 + nkb];
+
+                                    local_force[ipol] -= fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real();
+                                }
+                            }
+                        }
+#ifdef _OPENMP
+                        if (omp_get_num_threads() > 1)
+                        {
+                            for (int ipol = 0; ipol < 3; ipol++)
+                            {
+#pragma omp atomic
+                                force[iat * forcenl_nc + ipol] += local_force[ipol];
+                            }
+                        }
+                        else
+#endif
+                        {
+                            for (int ipol = 0; ipol < 3; ipol++)
+                            {
+                                force[iat * forcenl_nc + ipol] += local_force[ipol];
+                            }
+                        }
+                    }
+                } // end ia
+                iat0 += atom_na[it];
+                sum0 += atom_na[it] * Nprojs;
+            } // end it
+#ifdef _OPENMP
+        }
 #endif
     }
 };
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
index b04b769df8..b9fb84344a 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
@@ -92,6 +92,29 @@ struct cal_force_nl_op
                     const std::complex<FPTYPE>* becp,
                     const std::complex<FPTYPE>* dbecp,
                     FPTYPE* force);
+    // interface for nspin=4 only
+    void operator()(const base_device::DEVICE_CPU* ctx,
+                    const bool& nondiagonal,
+                    const int& nbands_occ,
+                    const int& wg_nc,
+                    const int& ntype,
+                    const int& deeq_2,
+                    const int& deeq_3,
+                    const int& deeq_4,
+                    const int& forcenl_nc,
+                    const int& nbands,
+                    const int& ik,
+                    const int& nkb,
+                    const int* atom_nh,
+                    const int* atom_na,
+                    const FPTYPE& tpiba,
+                    const FPTYPE* d_wg,
+                    const FPTYPE* d_ekb,
+                    const FPTYPE* qq_nt,
+                    const std::complex<FPTYPE>* deeq_nc,
+                    const std::complex<FPTYPE>* becp,
+                    const std::complex<FPTYPE>* dbecp,
+                    FPTYPE* force);
 };
 
 #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp
index 2f551db297..e58cfb3e7b 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp
@@ -135,6 +135,82 @@ struct cal_stress_nl_op<FPTYPE, base_device::DEVICE_CPU>
             } // end it
 #ifdef _OPENMP
         }
+#endif
+        stress[ipol * 3 + jpol] += local_stress;
+    };
+
+    void operator()(const base_device::DEVICE_CPU* ctx,
+                    const bool& nondiagonal,
+                    const int& ipol,
+                    const int& jpol,
+                    const int& nkb,
+                    const int& nbands_occ,
+                    const int& ntype,
+                    const int& wg_nc,
+                    const int& ik,
+                    const int& deeq_2,
+                    const int& deeq_3,
+                    const int& deeq_4,
+                    const int* atom_nh,
+                    const int* atom_na,
+                    const FPTYPE* d_wg,
+                    const FPTYPE* d_ekb,
+                    const FPTYPE* qq_nt,
+                    const std::complex<FPTYPE>* deeq_nc,
+                    const std::complex<FPTYPE>* becp,
+                    const std::complex<FPTYPE>* dbecp,
+                    FPTYPE* stress)
+    {
+        FPTYPE local_stress = 0;
+#ifdef _OPENMP
+#pragma omp parallel reduction(+ : local_stress)
+        {
+#endif
+            int iat = 0, sum = 0;
+            for (int it = 0; it < ntype; it++)
+            {
+                const int Nprojs = atom_nh[it];
+#ifdef _OPENMP
+#pragma omp for collapse(4)
+#endif
+                for (int ib = 0; ib < nbands_occ; ib++)
+                {
+                    for (int ia = 0; ia < atom_na[it]; ia++)
+                    {
+                        for (int ip1 = 0; ip1 < Nprojs; ip1++)
+                        {
+                            for (int ip2 = 0; ip2 < Nprojs; ip2++)
+                            {
+                                if (!nondiagonal && ip1 != ip2)
+                                {
+                                    continue;
+                                }
+                                const int ib2 = ib*2;
+                                FPTYPE fac = d_wg[ik * wg_nc + ib] * 1.0;
+                                FPTYPE ekb_now = d_ekb[ik * wg_nc + ib];
+                                std::complex<FPTYPE> ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2];
+                                std::complex<FPTYPE> ps0 = deeq_nc[((iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq;
+                                std::complex<FPTYPE> ps1 = deeq_nc[((1 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2];
+                                std::complex<FPTYPE> ps2 = deeq_nc[((2 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2];
+                                std::complex<FPTYPE> ps3 = deeq_nc[((3 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq;
+                                const int inkb1 = sum + ia * Nprojs + ip1;
+                                const int inkb2 = sum + ia * Nprojs + ip2;
+                                // out<<"\n ps = "<<ps;
+
+                                const std::complex<FPTYPE> dbb0 = (conj(dbecp[ib2 * nkb + inkb1]) * becp[ib2 * nkb + inkb2]).real();
+                                const std::complex<FPTYPE> dbb1 = (conj(dbecp[ib2 * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2]).real();
+                                const std::complex<FPTYPE> dbb2 = (conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[ib2 * nkb + inkb2]).real();
+                                const std::complex<FPTYPE> dbb3 = (conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2]).real();
+                                local_stress -= fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real();
+                            }
+                        } // end ip
+                    }     // ia
+                }
+                sum += atom_na[it] * Nprojs;
+                iat += atom_na[it];
+            } // end it
+#ifdef _OPENMP
+        }
 #endif
         stress[ipol * 3 + jpol] += local_stress;
     }
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h
index b4dfc35ff8..a2852c00c9 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h
@@ -103,6 +103,28 @@ struct cal_stress_nl_op
                     const std::complex<FPTYPE>* becp,
                     const std::complex<FPTYPE>* dbecp,
                     FPTYPE* stress);
+    // interface for nspin=4 only
+    void operator()(const Device* ctx,
+                    const bool& nondiagonal,
+                    const int& ipol,
+                    const int& jpol,
+                    const int& nkb,
+                    const int& nbands_occ,
+                    const int& ntype,
+                    const int& wg_nc,
+                    const int& ik,
+                    const int& deeq_2,
+                    const int& deeq_3,
+                    const int& deeq_4,
+                    const int* atom_nh,
+                    const int* atom_na,
+                    const FPTYPE* d_wg,
+                    const FPTYPE* d_ekb,
+                    const FPTYPE* qq_nt,
+                    const std::complex<FPTYPE>* deeq_nc,
+                    const std::complex<FPTYPE>* becp,
+                    const std::complex<FPTYPE>* dbecp,
+                    FPTYPE* stress);
 };
 
 template <typename T, typename Device>
diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp
index 7a97be57d7..79d2d1ec79 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp
@@ -39,6 +39,7 @@ void Stress_Func<FPTYPE, Device>::stress_kin(ModuleBase::matrix& sigma,
     FPTYPE tpiba = ModuleBase::TWO_PI / GlobalC::ucell.lat0;
     FPTYPE twobysqrtpi = 2.0 / std::sqrt(ModuleBase::PI);
     FPTYPE* kfac = new FPTYPE[npwx];
+    const int npol = GlobalC::ucell.get_npol();
 
     for (int ik = 0; ik < p_kv->get_nks(); ik++)
     {
@@ -71,8 +72,7 @@ void Stress_Func<FPTYPE, Device>::stress_kin(ModuleBase::matrix& sigma,
             {
                 for (int ibnd = 0; ibnd < GlobalV::NBANDS; ibnd++)
                 {
-                    if (std::fabs(wg(ik, ibnd)) < ModuleBase::threshold_wg * wg(ik, 0))
-                        continue;
+                    if (wg(ik, ibnd) == 0.0) continue;
                     const std::complex<FPTYPE>* ppsi = nullptr;
                     ppsi = &(psi_in[0](ik, ibnd, 0));
 
@@ -85,6 +85,18 @@ void Stress_Func<FPTYPE, Device>::stress_kin(ModuleBase::matrix& sigma,
                         sum += wg(ik, ibnd) * gk[l][i] * gk[m][i] * kfac[i]
                                * (FPTYPE((conj(ppsi[i]) * ppsi[i]).real()));
                     }
+                    if(npol == 2)
+                    {
+                        ppsi = &(psi_in[0](ik, ibnd, npwx));
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+ : sum)
+#endif
+                        for (int i = 0; i < npw; i++)
+                        {
+                            sum += wg(ik, ibnd) * gk[l][i] * gk[m][i] * kfac[i]
+                                   * (FPTYPE((conj(ppsi[i]) * ppsi[i]).real()));
+                        }
+                    }
                     s_kin[l][m] += sum;
                 }
             }
diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp
index be64b4e573..b3e5eab6ba 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp
@@ -50,11 +50,11 @@ void Stress_Func<FPTYPE, Device>::stress_loc(ModuleBase::matrix& sigma,
 				aux[ir] = std::complex<FPTYPE>(chr->rho[0][ir], 0.0 );
 			}
 		}
-		for (int is = 1; is < nspin_rho; is++)
+		if(nspin_rho == 2)
 		{
 			for (int ir = irb; ir < ir_end; ++ir)
 			{ // accumulate aux
-				aux[ir] += std::complex<FPTYPE>(chr->rho[is][ir], 0.0 );
+				aux[ir] += std::complex<FPTYPE>(chr->rho[1][ir], 0.0 );
 			}
 		}
  	}
diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func_nl.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_func_nl.cpp
index b243d1ff5d..1ff91afe6c 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/stress_func_nl.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func_nl.cpp
@@ -48,7 +48,7 @@ void Stress_Func<FPTYPE, Device>::stress_nl(ModuleBase::matrix& sigma,
                 break;
             }
         }
-        const int npm = ucell_in.get_npol() * nbands_occ;
+        const int npm = nbands_occ;
 
         // calculate becp = <psi|beta> for all beta functions
         nl_tools.cal_becp(ik, npm);

From 8937146fc459069241fef148c134494ff785608a Mon Sep 17 00:00:00 2001
From: dyzheng <zhengdy@dp.tech>
Date: Tue, 6 Aug 2024 07:14:26 +0000
Subject: [PATCH 2/8] Fix: bug of pw force&stress calculation

---
 source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp
index e189f937d9..c01aae9e96 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp
@@ -107,12 +107,6 @@ void FS_Nonlocal_tools<FPTYPE, Device>::allocate_memory(const ModuleBase::matrix
         this->atom_na = h_atom_na.data();
         this->ppcell_vkb = this->nlpp_->vkb.c;
     }
-
-    // prepare the memory of the becp and dbecp:
-    // becp: <Beta(nkb,npw)|psi(nbnd,npw)>
-    // dbecp: <dBeta(nkb,npw)/dG|psi(nbnd,npw)>
-    resmem_complex_op()(this->ctx, becp, this->nbands * nkb, "Stress::becp");
-    resmem_complex_op()(this->ctx, dbecp, 6 * this->nbands * nkb, "Stress::dbecp");
 }
 
 template <typename FPTYPE, typename Device>

From 65a93f7534695bf783d94746bd1994fce5545ba8 Mon Sep 17 00:00:00 2001
From: dyzheng <zhengdy@dp.tech>
Date: Wed, 7 Aug 2024 06:40:54 +0000
Subject: [PATCH 3/8] fix: soc error and cuda kernels

---
 .../hamilt_pwdft/fs_nonlocal_tools.cpp        |   2 -
 .../hamilt_pwdft/kernels/cuda/force_op.cu     | 103 +++++++++++++
 .../hamilt_pwdft/kernels/cuda/stress_op.cu    | 135 ++++++++++++++----
 .../hamilt_pwdft/kernels/force_op.cpp         |   3 -
 .../hamilt_pwdft/kernels/force_op.h           |  23 ++-
 .../hamilt_pwdft/kernels/stress_op.cpp        |  13 +-
 .../hamilt_pwdft/kernels/stress_op.h          |  22 ++-
 7 files changed, 258 insertions(+), 43 deletions(-)

diff --git a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp
index c01aae9e96..f81f5d0880 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp
@@ -441,7 +441,6 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_dbecp_s(int ik, int npm, int ipol, i
     else
     {
         cal_stress_nl_op()(this->ctx,
-                           nondiagonal,
                            ipol,
                            jpol,
                            nkb,
@@ -692,7 +691,6 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_force(int ik, int npm, FPTYPE* force
     else
     {
         cal_force_nl_op<FPTYPE, Device>()(this->ctx,
-                                          nondiagonal,
                                           npm,
                                           this->nbands,
                                           this->ntype,
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
index f8df7b3029..fb690754c9 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
@@ -170,6 +170,109 @@ void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_dev
     cudaCheckOnDebug();
 }
 
+template <typename FPTYPE>
+__global__ void cal_force_nl(
+        const int wg_nc,
+        const int ntype,
+        const int deeq_2,
+        const int deeq_3,
+        const int deeq_4,
+        const int forcenl_nc,
+        const int nbands,
+        const int ik,
+        const int nkb,
+        const int *atom_nh,
+        const int *atom_na,
+        const FPTYPE tpiba,
+        const FPTYPE *d_wg,
+        const FPTYPE* d_ekb,
+        const FPTYPE* qq_nt,
+        const thrust::complex<FPTYPE> *deeq_nc,
+        const thrust::complex<FPTYPE> *becp,
+        const thrust::complex<FPTYPE> *dbecp,
+        FPTYPE *force)
+{
+    const int ib = blockIdx.x / ntype;
+    const int it = blockIdx.x % ntype;
+
+    int iat = 0, sum = 0;
+    for (int ii = 0; ii < it; ii++) {
+        iat += atom_na[ii];
+        sum += atom_na[ii] * atom_nh[ii];
+    }
+
+    int Nprojs = atom_nh[it];
+    FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba;
+    FPTYPE ekb_now = d_ekb[ik * wg_nc + ib];
+    for (int ia = 0; ia < atom_na[it]; ia++) {
+        for (int ip = threadIdx.x; ip < Nprojs; ip += blockDim.x) {
+            const int inkb = sum + ip;
+            for (int ip2 = 0; ip2 < Nprojs; ip2++) 
+            {
+                // Effective values of the D-eS coefficients
+                const thrust::complex<FPTYPE> ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2];
+                const int jnkb = sum + ip2;
+                const thrust::complex<FPTYPE> ps0 = deeq_nc[((0 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq;
+                const thrust::complex<FPTYPE> ps1 = deeq_nc[((1 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2];
+                const thrust::complex<FPTYPE> ps2 = deeq_nc[((2 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2];
+                const thrust::complex<FPTYPE> ps3 = deeq_nc[((3 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq;
+
+                const int jnkb = sum + ip2;
+                for (int ipol = 0; ipol < 3; ipol++) {
+                    const int index0 = ipol * nbands * 2 * nkb + ib2 * nkb + inkb;
+                    const int index1 = ib2 * nkb + jnkb;
+                    const std::complex<FPTYPE> dbb0 = conj(dbecp[index0]) * becp[index1];
+                    const std::complex<FPTYPE> dbb1 = conj(dbecp[index0]) * becp[index1 + nkb];
+                    const std::complex<FPTYPE> dbb2 = conj(dbecp[index0 + nkb]) * becp[index1];
+                    const std::complex<FPTYPE> dbb3 = conj(dbecp[index0 + nkb]) * becp[index1 + nkb];
+                    const FPTYPE tmp = - fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real();
+                    atomicAdd(force + iat * forcenl_nc + ipol, tmp);
+                }
+            }
+        }
+        iat += 1;
+        sum += Nprojs;
+    }
+}
+
+// interface for nspin=4 only
+void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* ctx,
+                    const int& nbands_occ,
+                    const int& wg_nc,
+                    const int& ntype,
+                    const int& deeq_2,
+                    const int& deeq_3,
+                    const int& deeq_4,
+                    const int& forcenl_nc,
+                    const int& nbands,
+                    const int& ik,
+                    const int& nkb,
+                    const int* atom_nh,
+                    const int* atom_na,
+                    const FPTYPE& tpiba,
+                    const FPTYPE* d_wg,
+                    const FPTYPE* d_ekb,
+                    const FPTYPE* qq_nt,
+                    const std::complex<FPTYPE>* deeq_nc,
+                    const std::complex<FPTYPE>* becp,
+                    const std::complex<FPTYPE>* dbecp,
+                    FPTYPE* force)
+{
+    cal_force_nl<FPTYPE><<<nbands_occ * ntype, THREADS_PER_BLOCK>>>(
+            wg_nc, ntype,
+            deeq_2, deeq_3, deeq_4,
+            forcenl_nc, nbands, ik, nkb,
+            atom_nh, atom_na,
+            tpiba,
+            d_wg, d_ekb, qq_nt, 
+            reinterpret_cast<const thrust::complex<FPTYPE>*>(deeq_nc),
+            reinterpret_cast<const thrust::complex<FPTYPE>*>(becp),
+            reinterpret_cast<const thrust::complex<FPTYPE>*>(dbecp),
+            force);// array of data
+
+    cudaCheckOnDebug();
+}
+
 template <typename FPTYPE>
 __global__ void saveVkbValues_(
     const int *gcar_zero_ptrs, 
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu
index 769d6351e9..c379fc1082 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu
@@ -240,6 +240,114 @@ void cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_de
     cudaCheckOnDebug();
 }
 
+template <typename FPTYPE>
+__global__ void cal_stress_nl(
+        const int ipol,
+        const int jpol,
+        const int nkb,
+        const int ntype,
+        const int wg_nc,
+        const int ik,
+        const int deeq_2,
+        const int deeq_3,
+        const int deeq_4,
+        const int *atom_nh,
+        const int *atom_na,
+        const FPTYPE *d_wg,
+        const FPTYPE* d_ekb,
+        const FPTYPE* qq_nt,
+        const thrust::complex<FPTYPE> *deeq_nc,
+        const thrust::complex<FPTYPE> *becp,
+        const thrust::complex<FPTYPE> *dbecp,
+        FPTYPE *stress)
+{
+    int ib = blockIdx.x / ntype;
+    const int ib2  = ib * 2;
+    int it = blockIdx.x % ntype;
+
+    int iat = 0, sum = 0;
+    for (int ii = 0; ii < it; ii++) {
+        iat += atom_na[ii];
+        sum += atom_na[ii] * atom_nh[ii];
+    }
+
+    FPTYPE stress_var = 0, fac = d_wg[ik * wg_nc + ib] * 1.0, ekb_now = d_ekb[ik * wg_nc + ib];
+    const int Nprojs = atom_nh[it];
+    for (int ia = 0; ia < atom_na[it]; ia++)
+    {
+        for (int ii = threadIdx.x; ii < Nprojs * Nprojs; ii += blockDim.x) {
+            int ip1 = ii / Nprojs, ip2 = ii % Nprojs;
+            const thrust::complex<FPTYPE> ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2];
+            const thrust::complex<FPTYPE> ps0 = deeq_nc[((iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq;
+            const thrust::complex<FPTYPE> ps1 = deeq_nc[((1 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2];
+            const thrust::complex<FPTYPE> ps2 = deeq_nc[((2 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2];
+            const thrust::complex<FPTYPE> ps3 = deeq_nc[((3 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq;
+            FPTYPE ps = deeq_nc[((spin * deeq_2 + iat) * deeq_3 + ip1) * deeq_4 + ip2]
+                        - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2];
+            const int inkb1 = sum + ip1;
+            const int inkb2 = sum + ip2;
+            //out<<"\n ps = "<<ps;
+            const thrust::complex<FPTYPE> dbb0 = conj(dbecp[ib2 * nkb + inkb1]) * becp[ib2 * nkb + inkb2];
+            const thrust::complex<FPTYPE> dbb1 = conj(dbecp[ib2 * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2];
+            const thrust::complex<FPTYPE> dbb2 = conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[ib2 * nkb + inkb2];
+            const thrust::complex<FPTYPE> dbb3 = conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2];
+            stress_var -= fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real();
+        }
+        ++iat;
+        sum+=Nprojs;
+    }//ia
+    __syncwarp();
+    warp_reduce(stress_var);
+    if (threadIdx.x % WARP_SIZE == 0) {
+        atomicAdd(stress + ipol * 3 + jpol, stress_var);
+    }
+}
+
+template <typename FPTYPE>
+void cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* ctx,
+                                                                   const int& ipol,
+                                                                   const int& jpol,
+                                                                   const int& nkb,
+                                                                   const int& nbands_occ,
+                                                                   const int& ntype,
+                                                                   const int& wg_nc,
+                                                                   const int& ik,
+                                                                   const int& deeq_2,
+                                                                   const int& deeq_3,
+                                                                   const int& deeq_4,
+                                                                   const int* atom_nh,
+                                                                   const int* atom_na,
+                                                                   const FPTYPE* d_wg,
+                                                                   const FPTYPE* d_ekb,
+                                                                   const FPTYPE* qq_nt,
+                                                                   const std::complex<FPTYPE>* deeq_nc,
+                                                                   const std::complex<FPTYPE>* becp,
+                                                                   const std::complex<FPTYPE>* dbecp,
+                                                                   FPTYPE* stress)
+{
+     cal_stress_nl<FPTYPE><<<nbands_occ * ntype, THREADS_PER_BLOCK>>>(
+             ipol,
+             jpol,
+             nkb,
+             ntype,
+             wg_nc,
+             ik,
+             deeq_2,
+             deeq_3,
+             deeq_4,
+             atom_nh,
+             atom_na,
+             d_wg,
+             d_ekb,
+             qq_nt,
+             reinterpret_cast<const thrust::complex<FPTYPE>*>(deeq_nc),
+             reinterpret_cast<const thrust::complex<FPTYPE>*>(becp),
+             reinterpret_cast<const thrust::complex<FPTYPE>*>(dbecp),
+             stress);// array of data
+
+    cudaCheckOnDebug();
+}
+
 template <typename T, typename Device>
 void cal_stress_mgga_op<T, Device>::operator()(
     const int& spin,
@@ -638,33 +746,6 @@ void cal_stress_drhoc_aux_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
     return ;
 }
 
-// template <typename FPTYPE>
-// void prepare_vkb_deri_ptr_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
-//         const base_device::DEVICE_GPU* ctx,
-//         int nbeta, double* nhtol, int nhtol_nc, int npw, int it,
-//         int ipol, int jpol,
-//         std::complex<FPTYPE>*vkb_out, std::complex<FPTYPE>** vkb_ptrs,
-//         FPTYPE* ylm_in, FPTYPE** ylm_ptrs,
-//         FPTYPE* ylm_deri_in, FPTYPE** ylm_deri_ptr1s, FPTYPE** ylm_deri_ptr2s,
-//         FPTYPE* vq_in, FPTYPE** vq_ptrs,
-//         FPTYPE* vq_deri_in, FPTYPE** vq_deri_ptrs
-//     )
-// {
-//     const int block = (npw + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
-//     dim3 gridsize(block,nbeta);
-
-
-//     prepare_vkb_deri_ptr<FPTYPE><<<1,1>>>(
-//         nbeta, nhtol, nhtol_nc, npw, it, ipol, jpol,
-//         reinterpret_cast<const thrust::complex<FPTYPE>*>(vkb_out), 
-//         reinterpret_cast<const thrust::complex<FPTYPE>*>(vkb_ptrs), 
-//         ylm_in, ylm_ptrs, ylm_deri_in, ylm_deri_ptr1s, ylm_deri_ptr2s,
-//         vq_in, vq_ptrs, vq_deri_in, vq_deri_ptrs
-//     );
-
-//     return ;
-// }
-
 
 template <>
 void pointer_array_malloc<base_device::DEVICE_GPU>::operator()(
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
index 041bfc3b19..d152b25a7d 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
@@ -149,7 +149,6 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_CPU>
     };
 
     void operator()(const base_device::DEVICE_CPU* ctx,
-                    const bool& nondiagonal,
                     const int& nbands_occ,
                     const int& wg_nc,
                     const int& ntype,
@@ -199,8 +198,6 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_CPU>
                             // out<<"\n ps = "<<ps;
                             for (int ip2 = 0; ip2 < Nprojs; ip2++)
                             {
-                                if(ip2 != ip && !nondiagonal) continue;
-
                                 // Effective values of the D-eS coefficients
                                 std::complex<FPTYPE> ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2];
                                 const int jnkb = sum + ip2;
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
index b9fb84344a..f729280ef6 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
@@ -94,7 +94,6 @@ struct cal_force_nl_op
                     FPTYPE* force);
     // interface for nspin=4 only
     void operator()(const base_device::DEVICE_CPU* ctx,
-                    const bool& nondiagonal,
                     const int& nbands_occ,
                     const int& wg_nc,
                     const int& ntype,
@@ -159,6 +158,28 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>
                     const std::complex<FPTYPE>* becp,
                     const std::complex<FPTYPE>* dbecp,
                     FPTYPE* force);
+    // interface for nspin=4 only
+    void operator()(const base_device::DEVICE_GPU* ctx,
+                    const int& nbands_occ,
+                    const int& wg_nc,
+                    const int& ntype,
+                    const int& deeq_2,
+                    const int& deeq_3,
+                    const int& deeq_4,
+                    const int& forcenl_nc,
+                    const int& nbands,
+                    const int& ik,
+                    const int& nkb,
+                    const int* atom_nh,
+                    const int* atom_na,
+                    const FPTYPE& tpiba,
+                    const FPTYPE* d_wg,
+                    const FPTYPE* d_ekb,
+                    const FPTYPE* qq_nt,
+                    const std::complex<FPTYPE>* deeq_nc,
+                    const std::complex<FPTYPE>* becp,
+                    const std::complex<FPTYPE>* dbecp,
+                    FPTYPE* force);
 };
 
 /**
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp
index e58cfb3e7b..b951714e00 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp
@@ -140,7 +140,6 @@ struct cal_stress_nl_op<FPTYPE, base_device::DEVICE_CPU>
     };
 
     void operator()(const base_device::DEVICE_CPU* ctx,
-                    const bool& nondiagonal,
                     const int& ipol,
                     const int& jpol,
                     const int& nkb,
@@ -181,10 +180,6 @@ struct cal_stress_nl_op<FPTYPE, base_device::DEVICE_CPU>
                         {
                             for (int ip2 = 0; ip2 < Nprojs; ip2++)
                             {
-                                if (!nondiagonal && ip1 != ip2)
-                                {
-                                    continue;
-                                }
                                 const int ib2 = ib*2;
                                 FPTYPE fac = d_wg[ik * wg_nc + ib] * 1.0;
                                 FPTYPE ekb_now = d_ekb[ik * wg_nc + ib];
@@ -197,10 +192,10 @@ struct cal_stress_nl_op<FPTYPE, base_device::DEVICE_CPU>
                                 const int inkb2 = sum + ia * Nprojs + ip2;
                                 // out<<"\n ps = "<<ps;
 
-                                const std::complex<FPTYPE> dbb0 = (conj(dbecp[ib2 * nkb + inkb1]) * becp[ib2 * nkb + inkb2]).real();
-                                const std::complex<FPTYPE> dbb1 = (conj(dbecp[ib2 * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2]).real();
-                                const std::complex<FPTYPE> dbb2 = (conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[ib2 * nkb + inkb2]).real();
-                                const std::complex<FPTYPE> dbb3 = (conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2]).real();
+                                const std::complex<FPTYPE> dbb0 = conj(dbecp[ib2 * nkb + inkb1]) * becp[ib2 * nkb + inkb2];
+                                const std::complex<FPTYPE> dbb1 = conj(dbecp[ib2 * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2];
+                                const std::complex<FPTYPE> dbb2 = conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[ib2 * nkb + inkb2];
+                                const std::complex<FPTYPE> dbb3 = conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2];
                                 local_stress -= fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real();
                             }
                         } // end ip
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h
index a2852c00c9..177761a95d 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h
@@ -105,7 +105,6 @@ struct cal_stress_nl_op
                     FPTYPE* stress);
     // interface for nspin=4 only
     void operator()(const Device* ctx,
-                    const bool& nondiagonal,
                     const int& ipol,
                     const int& jpol,
                     const int& nkb,
@@ -259,6 +258,27 @@ struct cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>
                     const std::complex<FPTYPE>* becp,
                     const std::complex<FPTYPE>* dbecp,
                     FPTYPE* stress);
+    // interface for nspin=4 only
+    void operator()(const base_device::DEVICE_GPU* ctx,
+                    const int& ipol,
+                    const int& jpol,
+                    const int& nkb,
+                    const int& nbands_occ,
+                    const int& ntype,
+                    const int& wg_nc,
+                    const int& ik,
+                    const int& deeq_2,
+                    const int& deeq_3,
+                    const int& deeq_4,
+                    const int* atom_nh,
+                    const int* atom_na,
+                    const FPTYPE* d_wg,
+                    const FPTYPE* d_ekb,
+                    const FPTYPE* qq_nt,
+                    const std::complex<FPTYPE>* deeq_nc,
+                    const std::complex<FPTYPE>* becp,
+                    const std::complex<FPTYPE>* dbecp,
+                    FPTYPE* stress);
 };
 
 // cpu version first, gpu version later

From 5b629550d1e9495ea915e31ffcdfa5a4d63e2cf5 Mon Sep 17 00:00:00 2001
From: dyzheng <zhengdy@dp.tech>
Date: Wed, 7 Aug 2024 08:30:37 +0000
Subject: [PATCH 4/8] Fix: CUDA and ROCM

---
 .../hamilt_pwdft/kernels/cuda/force_op.cu     |  11 +-
 .../hamilt_pwdft/kernels/cuda/stress_op.cu    |   2 -
 .../hamilt_pwdft/kernels/rocm/force_op.hip.cu | 104 +++++++++++++++++
 .../kernels/rocm/stress_op.hip.cu             | 106 ++++++++++++++++++
 4 files changed, 216 insertions(+), 7 deletions(-)

diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
index fb690754c9..e23e0ccbc3 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
@@ -193,6 +193,7 @@ __global__ void cal_force_nl(
         FPTYPE *force)
 {
     const int ib = blockIdx.x / ntype;
+    const int ib2 = ib * 2;
     const int it = blockIdx.x % ntype;
 
     int iat = 0, sum = 0;
@@ -217,14 +218,13 @@ __global__ void cal_force_nl(
                 const thrust::complex<FPTYPE> ps2 = deeq_nc[((2 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2];
                 const thrust::complex<FPTYPE> ps3 = deeq_nc[((3 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq;
 
-                const int jnkb = sum + ip2;
                 for (int ipol = 0; ipol < 3; ipol++) {
                     const int index0 = ipol * nbands * 2 * nkb + ib2 * nkb + inkb;
                     const int index1 = ib2 * nkb + jnkb;
-                    const std::complex<FPTYPE> dbb0 = conj(dbecp[index0]) * becp[index1];
-                    const std::complex<FPTYPE> dbb1 = conj(dbecp[index0]) * becp[index1 + nkb];
-                    const std::complex<FPTYPE> dbb2 = conj(dbecp[index0 + nkb]) * becp[index1];
-                    const std::complex<FPTYPE> dbb3 = conj(dbecp[index0 + nkb]) * becp[index1 + nkb];
+                    const thrust::complex<FPTYPE> dbb0 = conj(dbecp[index0]) * becp[index1];
+                    const thrust::complex<FPTYPE> dbb1 = conj(dbecp[index0]) * becp[index1 + nkb];
+                    const thrust::complex<FPTYPE> dbb2 = conj(dbecp[index0 + nkb]) * becp[index1];
+                    const thrust::complex<FPTYPE> dbb3 = conj(dbecp[index0 + nkb]) * becp[index1 + nkb];
                     const FPTYPE tmp = - fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real();
                     atomicAdd(force + iat * forcenl_nc + ipol, tmp);
                 }
@@ -236,6 +236,7 @@ __global__ void cal_force_nl(
 }
 
 // interface for nspin=4 only
+template <typename FPTYPE>
 void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* ctx,
                     const int& nbands_occ,
                     const int& wg_nc,
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu
index c379fc1082..98918c6523 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu
@@ -282,8 +282,6 @@ __global__ void cal_stress_nl(
             const thrust::complex<FPTYPE> ps1 = deeq_nc[((1 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2];
             const thrust::complex<FPTYPE> ps2 = deeq_nc[((2 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2];
             const thrust::complex<FPTYPE> ps3 = deeq_nc[((3 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq;
-            FPTYPE ps = deeq_nc[((spin * deeq_2 + iat) * deeq_3 + ip1) * deeq_4 + ip2]
-                        - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2];
             const int inkb1 = sum + ip1;
             const int inkb2 = sum + ip2;
             //out<<"\n ps = "<<ps;
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu
index fa03fa2a5d..786a6b8ebf 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu
@@ -166,6 +166,110 @@ void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_dev
     hipCheckOnDebug();
 }
 
+template <typename FPTYPE>
+__global__ void cal_force_nl(
+        const int wg_nc,
+        const int ntype,
+        const int deeq_2,
+        const int deeq_3,
+        const int deeq_4,
+        const int forcenl_nc,
+        const int nbands,
+        const int ik,
+        const int nkb,
+        const int *atom_nh,
+        const int *atom_na,
+        const FPTYPE tpiba,
+        const FPTYPE *d_wg,
+        const FPTYPE* d_ekb,
+        const FPTYPE* qq_nt,
+        const thrust::complex<FPTYPE> *deeq_nc,
+        const thrust::complex<FPTYPE> *becp,
+        const thrust::complex<FPTYPE> *dbecp,
+        FPTYPE *force)
+{
+    const int ib = blockIdx.x / ntype;
+    const int ib2 = ib * 2;
+    const int it = blockIdx.x % ntype;
+
+    int iat = 0, sum = 0;
+    for (int ii = 0; ii < it; ii++) {
+        iat += atom_na[ii];
+        sum += atom_na[ii] * atom_nh[ii];
+    }
+
+    int Nprojs = atom_nh[it];
+    FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba;
+    FPTYPE ekb_now = d_ekb[ik * wg_nc + ib];
+    for (int ia = 0; ia < atom_na[it]; ia++) {
+        for (int ip = threadIdx.x; ip < Nprojs; ip += blockDim.x) {
+            const int inkb = sum + ip;
+            for (int ip2 = 0; ip2 < Nprojs; ip2++) 
+            {
+                // Effective values of the D-eS coefficients
+                const thrust::complex<FPTYPE> ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2];
+                const int jnkb = sum + ip2;
+                const thrust::complex<FPTYPE> ps0 = deeq_nc[((0 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq;
+                const thrust::complex<FPTYPE> ps1 = deeq_nc[((1 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2];
+                const thrust::complex<FPTYPE> ps2 = deeq_nc[((2 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2];
+                const thrust::complex<FPTYPE> ps3 = deeq_nc[((3 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq;
+
+                for (int ipol = 0; ipol < 3; ipol++) {
+                    const int index0 = ipol * nbands * 2 * nkb + ib2 * nkb + inkb;
+                    const int index1 = ib2 * nkb + jnkb;
+                    const thrust::complex<FPTYPE> dbb0 = conj(dbecp[index0]) * becp[index1];
+                    const thrust::complex<FPTYPE> dbb1 = conj(dbecp[index0]) * becp[index1 + nkb];
+                    const thrust::complex<FPTYPE> dbb2 = conj(dbecp[index0 + nkb]) * becp[index1];
+                    const thrust::complex<FPTYPE> dbb3 = conj(dbecp[index0 + nkb]) * becp[index1 + nkb];
+                    const FPTYPE tmp = - fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real();
+                    atomicAdd(force + iat * forcenl_nc + ipol, tmp);
+                }
+            }
+        }
+        iat += 1;
+        sum += Nprojs;
+    }
+}
+
+// interface for nspin=4 only
+template <typename FPTYPE>
+void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* ctx,
+                    const int& nbands_occ,
+                    const int& wg_nc,
+                    const int& ntype,
+                    const int& deeq_2,
+                    const int& deeq_3,
+                    const int& deeq_4,
+                    const int& forcenl_nc,
+                    const int& nbands,
+                    const int& ik,
+                    const int& nkb,
+                    const int* atom_nh,
+                    const int* atom_na,
+                    const FPTYPE& tpiba,
+                    const FPTYPE* d_wg,
+                    const FPTYPE* d_ekb,
+                    const FPTYPE* qq_nt,
+                    const std::complex<FPTYPE>* deeq_nc,
+                    const std::complex<FPTYPE>* becp,
+                    const std::complex<FPTYPE>* dbecp,
+                    FPTYPE* force)
+{
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_force_nl<FPTYPE>), dim3(nbands_occ * ntype), dim3(THREADS_PER_BLOCK), 0, 0,
+            wg_nc, ntype,
+            deeq_2, deeq_3, deeq_4,
+            forcenl_nc, nbands, ik, nkb,
+            atom_nh, atom_na,
+            tpiba,
+            d_wg, d_ekb, qq_nt, 
+            reinterpret_cast<const thrust::complex<FPTYPE>*>(deeq_nc),
+            reinterpret_cast<const thrust::complex<FPTYPE>*>(becp),
+            reinterpret_cast<const thrust::complex<FPTYPE>*>(dbecp),
+            force);// array of data
+
+    hipCheckOnDebug();
+}
+
 template <typename FPTYPE>
 __global__ void saveVkbValues_(
     const int *gcar_zero_ptrs, 
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu
index d7db3b3eea..1afba3a5b1 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu
@@ -238,6 +238,112 @@ void cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_de
     hipCheckOnDebug();
 }
 
+template <typename FPTYPE>
+__global__ void cal_stress_nl(
+        const int ipol,
+        const int jpol,
+        const int nkb,
+        const int ntype,
+        const int wg_nc,
+        const int ik,
+        const int deeq_2,
+        const int deeq_3,
+        const int deeq_4,
+        const int *atom_nh,
+        const int *atom_na,
+        const FPTYPE *d_wg,
+        const FPTYPE* d_ekb,
+        const FPTYPE* qq_nt,
+        const thrust::complex<FPTYPE> *deeq_nc,
+        const thrust::complex<FPTYPE> *becp,
+        const thrust::complex<FPTYPE> *dbecp,
+        FPTYPE *stress)
+{
+    int ib = blockIdx.x / ntype;
+    const int ib2  = ib * 2;
+    int it = blockIdx.x % ntype;
+
+    int iat = 0, sum = 0;
+    for (int ii = 0; ii < it; ii++) {
+        iat += atom_na[ii];
+        sum += atom_na[ii] * atom_nh[ii];
+    }
+
+    FPTYPE stress_var = 0, fac = d_wg[ik * wg_nc + ib] * 1.0, ekb_now = d_ekb[ik * wg_nc + ib];
+    const int Nprojs = atom_nh[it];
+    for (int ia = 0; ia < atom_na[it]; ia++)
+    {
+        for (int ii = threadIdx.x; ii < Nprojs * Nprojs; ii += blockDim.x) {
+            int ip1 = ii / Nprojs, ip2 = ii % Nprojs;
+            const thrust::complex<FPTYPE> ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2];
+            const thrust::complex<FPTYPE> ps0 = deeq_nc[((iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq;
+            const thrust::complex<FPTYPE> ps1 = deeq_nc[((1 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2];
+            const thrust::complex<FPTYPE> ps2 = deeq_nc[((2 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2];
+            const thrust::complex<FPTYPE> ps3 = deeq_nc[((3 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq;
+            const int inkb1 = sum + ip1;
+            const int inkb2 = sum + ip2;
+            //out<<"\n ps = "<<ps;
+            const thrust::complex<FPTYPE> dbb0 = conj(dbecp[ib2 * nkb + inkb1]) * becp[ib2 * nkb + inkb2];
+            const thrust::complex<FPTYPE> dbb1 = conj(dbecp[ib2 * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2];
+            const thrust::complex<FPTYPE> dbb2 = conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[ib2 * nkb + inkb2];
+            const thrust::complex<FPTYPE> dbb3 = conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2];
+            stress_var -= fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real();
+        }
+        ++iat;
+        sum+=Nprojs;
+    }//ia
+    __syncwarp();
+    warp_reduce(stress_var);
+    if (threadIdx.x % WARP_SIZE == 0) {
+        atomicAdd(stress + ipol * 3 + jpol, stress_var);
+    }
+}
+
+template <typename FPTYPE>
+void cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* ctx,
+                                                                   const int& ipol,
+                                                                   const int& jpol,
+                                                                   const int& nkb,
+                                                                   const int& nbands_occ,
+                                                                   const int& ntype,
+                                                                   const int& wg_nc,
+                                                                   const int& ik,
+                                                                   const int& deeq_2,
+                                                                   const int& deeq_3,
+                                                                   const int& deeq_4,
+                                                                   const int* atom_nh,
+                                                                   const int* atom_na,
+                                                                   const FPTYPE* d_wg,
+                                                                   const FPTYPE* d_ekb,
+                                                                   const FPTYPE* qq_nt,
+                                                                   const std::complex<FPTYPE>* deeq_nc,
+                                                                   const std::complex<FPTYPE>* becp,
+                                                                   const std::complex<FPTYPE>* dbecp,
+                                                                   FPTYPE* stress)
+{
+     hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_stress_nl<FPTYPE>), dim3(nbands_occ * ntype), dim3(THREADS_PER_BLOCK), 0, 0,
+             ipol,
+             jpol,
+             nkb,
+             ntype,
+             wg_nc,
+             ik,
+             deeq_2,
+             deeq_3,
+             deeq_4,
+             atom_nh,
+             atom_na,
+             d_wg,
+             d_ekb,
+             qq_nt,
+             reinterpret_cast<const thrust::complex<FPTYPE>*>(deeq_nc),
+             reinterpret_cast<const thrust::complex<FPTYPE>*>(becp),
+             reinterpret_cast<const thrust::complex<FPTYPE>*>(dbecp),
+             stress);// array of data
+
+    hipCheckOnDebug();
+}
+
 template <typename T, typename Device>
 void cal_stress_mgga_op<T, Device>::operator()(
     const int& spin,

From 32ddfaf7e48b37a8f74ce7c83ef0fd404231d7f2 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci-lite[bot]"
 <117423508+pre-commit-ci-lite[bot]@users.noreply.github.com>
Date: Wed, 7 Aug 2024 09:12:02 +0000
Subject: [PATCH 5/8] [pre-commit.ci lite] apply automatic fixes

---
 source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp | 6 ++++--
 source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp | 8 +++++---
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp
index 79d2d1ec79..f05e64458a 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp
@@ -29,8 +29,9 @@ void Stress_Func<FPTYPE, Device>::stress_kin(ModuleBase::matrix& sigma,
 	int npwx=0;
     for (int ik = 0; ik < p_kv->get_nks(); ik++)
     {
-        if (npwx < p_kv->ngk[ik])
+        if (npwx < p_kv->ngk[ik]) {
             npwx = p_kv->ngk[ik];
+}
     }
 
     gk[0]= new FPTYPE[npwx];
@@ -72,7 +73,8 @@ void Stress_Func<FPTYPE, Device>::stress_kin(ModuleBase::matrix& sigma,
             {
                 for (int ibnd = 0; ibnd < GlobalV::NBANDS; ibnd++)
                 {
-                    if (wg(ik, ibnd) == 0.0) continue;
+                    if (wg(ik, ibnd) == 0.0) { continue;
+}
                     const std::complex<FPTYPE>* ppsi = nullptr;
                     ppsi = &(psi_in[0](ik, ibnd, 0));
 
diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp
index b3e5eab6ba..cb306fceeb 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp
@@ -22,7 +22,8 @@ void Stress_Func<FPTYPE, Device>::stress_loc(ModuleBase::matrix& sigma,
 
 	const int nspin_rho = (GlobalV::NSPIN == 2) ? 2 : 1;
 
-	if (PARAM.inp.gamma_only && is_pw) fact=2.0;
+	if (PARAM.inp.gamma_only && is_pw) { fact=2.0;
+}
 
     
 
@@ -70,12 +71,13 @@ void Stress_Func<FPTYPE, Device>::stress_loc(ModuleBase::matrix& sigma,
 		{
 			for (int ig=0; ig<rho_basis->npw; ig++)
 			{
-                if (rho_basis->ig_gge0 == ig)
+                if (rho_basis->ig_gge0 == ig) {
                     evloc += GlobalC::ppcell.vloc(it, rho_basis->ig2igg[ig])
                              * (p_sf->strucFac(it, ig) * conj(aux[ig])).real();
-                else
+                } else {
                     evloc += GlobalC::ppcell.vloc(it, rho_basis->ig2igg[ig])
                              * (p_sf->strucFac(it, ig) * conj(aux[ig]) * fact).real();
+}
             }
 		}
     }

From 34d349545d4324aef955f57ee91c6d0460c278f0 Mon Sep 17 00:00:00 2001
From: dyzheng <zhengdy@bjaisi.com>
Date: Mon, 26 Aug 2024 16:15:35 +0800
Subject: [PATCH 6/8] Fix: nspin=4 error on DCU/GPU

---
 .../hamilt_pwdft/kernels/cuda/nonlocal_op.cu       |  8 ++++----
 .../hamilt_pwdft/kernels/cuda/stress_op.cu         | 14 +++++++++-----
 .../hamilt_pwdft/kernels/force_op.cpp              | 14 +++++++-------
 .../hamilt_pwdft/kernels/rocm/nonlocal_op.hip.cu   |  8 ++++----
 4 files changed, 24 insertions(+), 20 deletions(-)

diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/nonlocal_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/nonlocal_op.cu
index 275fbfe136..89a74f3c5f 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/nonlocal_op.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/nonlocal_op.cu
@@ -54,8 +54,8 @@ __global__ void nonlocal_pw(
     thrust::complex<FPTYPE>* ps,
     const thrust::complex<FPTYPE>* becp)
 {
-  const int ii = blockIdx.x / l2;
-  const int jj = blockIdx.x % l2;
+  const int ii = blockIdx.x * 2 / l2;
+  const int jj = blockIdx.x * 2 % l2;
   for (int kk = threadIdx.x; kk < l3; kk += blockDim.x) {
     thrust::complex<FPTYPE> res1(0.0, 0.0);
     thrust::complex<FPTYPE> res2(0.0, 0.0);
@@ -121,7 +121,7 @@ void hamilt::nonlocal_pw_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const b
 {
   // denghui implement 20221109
   // <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-  nonlocal_pw<FPTYPE><<<l1 * l2, THREADS_PER_BLOCK>>>(
+  nonlocal_pw<FPTYPE><<<l1 * l2 / 2, THREADS_PER_BLOCK>>>(
     l1, l2, l3, // loop size
     sum, iat, nkb,   // control params
     deeq_x, deeq_y, deeq_z,
@@ -138,4 +138,4 @@ void hamilt::nonlocal_pw_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const b
 template struct nonlocal_pw_op<float, base_device::DEVICE_GPU>;
 template struct nonlocal_pw_op<double, base_device::DEVICE_GPU>;
 
-}  // namespace hamilt
\ No newline at end of file
+}  // namespace hamilt
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu
index 98918c6523..f451abd26c 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu
@@ -261,22 +261,26 @@ __global__ void cal_stress_nl(
         const thrust::complex<FPTYPE> *dbecp,
         FPTYPE *stress)
 {
-    int ib = blockIdx.x / ntype;
+    const int ib = blockIdx.x / ntype; // index of loop-nbands
     const int ib2  = ib * 2;
-    int it = blockIdx.x % ntype;
+    const int it = blockIdx.x % ntype; // index of loop-ntype
 
-    int iat = 0, sum = 0;
+    int iat = 0; // calculate the begin of atomic index
+    int sum = 0; // calculate the begin of atomic-orbital index
     for (int ii = 0; ii < it; ii++) {
         iat += atom_na[ii];
         sum += atom_na[ii] * atom_nh[ii];
     }
 
-    FPTYPE stress_var = 0, fac = d_wg[ik * wg_nc + ib] * 1.0, ekb_now = d_ekb[ik * wg_nc + ib];
+    FPTYPE stress_var = 0;
+    const FPTYPE fac = d_wg[ik * wg_nc + ib] * 1.0;
+    const FPTYPE ekb_now = d_ekb[ik * wg_nc + ib];
     const int Nprojs = atom_nh[it];
     for (int ia = 0; ia < atom_na[it]; ia++)
     {
         for (int ii = threadIdx.x; ii < Nprojs * Nprojs; ii += blockDim.x) {
-            int ip1 = ii / Nprojs, ip2 = ii % Nprojs;
+            const int ip1 = ii / Nprojs;
+	        const int ip2 = ii % Nprojs;
             const thrust::complex<FPTYPE> ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2];
             const thrust::complex<FPTYPE> ps0 = deeq_nc[((iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq;
             const thrust::complex<FPTYPE> ps1 = deeq_nc[((1 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2];
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
index d152b25a7d..90a4641123 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
@@ -178,7 +178,7 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_CPU>
             int sum0 = 0;
             for (int it = 0; it < ntype; it++)
             {
-                const int Nprojs = atom_nh[it];
+                const int nprojs = atom_nh[it];
 #ifdef _OPENMP
 #pragma omp for collapse(2)
 #endif
@@ -191,12 +191,12 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_CPU>
                         FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba;
                         FPTYPE ekb_now = d_ekb[ik * wg_nc + ib];
                         int iat = iat0 + ia;
-                        int sum = sum0 + ia * Nprojs;
-                        for (int ip = 0; ip < Nprojs; ip++)
+                        int sum = sum0 + ia * nprojs;
+                        for (int ip = 0; ip < nprojs; ip++)
                         {
                             const int inkb = sum + ip;
                             // out<<"\n ps = "<<ps;
-                            for (int ip2 = 0; ip2 < Nprojs; ip2++)
+                            for (int ip2 = 0; ip2 < nprojs; ip2++)
                             {
                                 // Effective values of the D-eS coefficients
                                 std::complex<FPTYPE> ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2];
@@ -223,7 +223,7 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_CPU>
 #ifdef _OPENMP
                         if (omp_get_num_threads() > 1)
                         {
-                            for (int ipol = 0; ipol < 3; ipol++)
+                            for (int ipol = 0; ipol < 3; ++ipol)
                             {
 #pragma omp atomic
                                 force[iat * forcenl_nc + ipol] += local_force[ipol];
@@ -232,7 +232,7 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_CPU>
                         else
 #endif
                         {
-                            for (int ipol = 0; ipol < 3; ipol++)
+                            for (int ipol = 0; ipol < 3; ++ipol)
                             {
                                 force[iat * forcenl_nc + ipol] += local_force[ipol];
                             }
@@ -240,7 +240,7 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_CPU>
                     }
                 } // end ia
                 iat0 += atom_na[it];
-                sum0 += atom_na[it] * Nprojs;
+                sum0 += atom_na[it] * nprojs;
             } // end it
 #ifdef _OPENMP
         }
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/nonlocal_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/nonlocal_op.hip.cu
index 93df9fe58d..9a43f64ea6 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/nonlocal_op.hip.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/nonlocal_op.hip.cu
@@ -54,8 +54,8 @@ __global__ void nonlocal_pw(
     thrust::complex<FPTYPE>* ps,
     const thrust::complex<FPTYPE>* becp)
 {
-  const int ii = blockIdx.x / l2;
-  const int jj = blockIdx.x % l2;
+  const int ii = blockIdx.x * 2 / l2;
+  const int jj = blockIdx.x * 2 % l2;
   for (int kk = threadIdx.x; kk < l3; kk += blockDim.x) {
     thrust::complex<FPTYPE> res1(0.0, 0.0);
     thrust::complex<FPTYPE> res2(0.0, 0.0);
@@ -122,7 +122,7 @@ void hamilt::nonlocal_pw_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const b
 {
   // denghui implement 20221109
   // <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-  hipLaunchKernelGGL(HIP_KERNEL_NAME(nonlocal_pw<FPTYPE>), dim3(l1 * l2), dim3(THREADS_PER_BLOCK), 0, 0,
+  hipLaunchKernelGGL(HIP_KERNEL_NAME(nonlocal_pw<FPTYPE>), dim3(l1 * l2 / 2), dim3(THREADS_PER_BLOCK), 0, 0,
     l1, l2, l3, // loop size
     sum, iat, nkb,   // control params
     deeq_x, deeq_y, deeq_z,
@@ -140,4 +140,4 @@ void hamilt::nonlocal_pw_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const b
 namespace hamilt{
 template struct nonlocal_pw_op<float, base_device::DEVICE_GPU>;
 template struct nonlocal_pw_op<double, base_device::DEVICE_GPU>;
-}
\ No newline at end of file
+}

From ebc3aed16003bae567a4c46056b76f6b4292a049 Mon Sep 17 00:00:00 2001
From: dyzheng <zhengdy@bjaisi.com>
Date: Thu, 31 Oct 2024 14:21:55 +0800
Subject: [PATCH 7/8] Refactor: format and less parameters for nonlocal fs
 kernels

---
 .../hamilt_pwdft/fs_nonlocal_tools.cpp        | 62 ++++++++-----------
 .../hamilt_pwdft/kernels/cuda/force_op.cu     | 42 +++++--------
 .../hamilt_pwdft/kernels/cuda/stress_op.cu    | 43 +++++--------
 .../hamilt_pwdft/kernels/force_op.cpp         | 22 +++----
 .../hamilt_pwdft/kernels/force_op.h           | 10 ---
 .../hamilt_pwdft/kernels/rocm/force_op.hip.cu | 48 +++++++-------
 .../kernels/rocm/stress_op.hip.cu             | 43 ++++++-------
 .../hamilt_pwdft/kernels/stress_op.cpp        | 39 ++++++------
 .../hamilt_pwdft/kernels/stress_op.h          | 10 ---
 .../kernels/test/force_op_test.cpp            |  4 --
 .../kernels/test/stress_op_test.cpp           |  4 --
 11 files changed, 128 insertions(+), 199 deletions(-)

diff --git a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp
index bfdf79e94a..4dbdbc558f 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp
@@ -415,29 +415,27 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_dbecp_s(int ik, int npm, int ipol, i
     // calculate stress for target (ipol, jpol)
     if(npol == 1)
     {
-    const int current_spin = this->kv_->isk[ik];
-    cal_stress_nl_op()(this->ctx,
-                       nondiagonal,
-                       ipol,
-                       jpol,
-                       nkb,
-                       npm,
-                       this->ntype,
-                       current_spin, // uspp only
-                       this->nbands,
-                       ik,
-                       this->nlpp_->deeq.getBound2(),
-                       this->nlpp_->deeq.getBound3(),
-                       this->nlpp_->deeq.getBound4(),
-                       atom_nh,
-                       atom_na,
-                       d_wg,
-                       d_ekb,
-                       qq_nt,
-                       deeq,
-                       becp,
-                       dbecp,
-                       stress);
+        const int current_spin = this->kv_->isk[ik];
+        cal_stress_nl_op()(this->ctx,
+                        nondiagonal,
+                        ipol,
+                        jpol,
+                        nkb,
+                        npm,
+                        this->ntype,
+                        current_spin, // uspp only
+                        this->nlpp_->deeq.getBound2(),
+                        this->nlpp_->deeq.getBound3(),
+                        this->nlpp_->deeq.getBound4(),
+                        atom_nh,
+                        atom_na,
+                        d_wg + this->nbands * ik,
+                        d_ekb + this->nbands * ik,
+                        qq_nt,
+                        deeq,
+                        becp,
+                        dbecp,
+                        stress);
     }
     else
     {
@@ -447,15 +445,13 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_dbecp_s(int ik, int npm, int ipol, i
                            nkb,
                            npm,
                            this->ntype,
-                           this->nbands,
-                           ik,
                            this->nlpp_->deeq_nc.getBound2(),
                            this->nlpp_->deeq_nc.getBound3(),
                            this->nlpp_->deeq_nc.getBound4(),
                            atom_nh,
                            atom_na,
-                           d_wg,
-                           d_ekb,
+                           d_wg + this->nbands * ik,
+                           d_ekb + this->nbands * ik,
                            qq_nt,
                            this->nlpp_->template get_deeq_nc_data<FPTYPE>(),
                            becp,
@@ -668,7 +664,6 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_force(int ik, int npm, FPTYPE* force
         cal_force_nl_op<FPTYPE, Device>()(this->ctx,
                                         nondiagonal,
                                         npm,
-                                        this->nbands,
                                         this->ntype,
                                         current_spin,
                                         this->nlpp_->deeq.getBound2(),
@@ -676,13 +671,12 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_force(int ik, int npm, FPTYPE* force
                                         this->nlpp_->deeq.getBound4(),
                                         force_nc,
                                         this->nbands,
-                                        ik,
                                         nkb,
                                         atom_nh,
                                         atom_na,
                                         this->ucell_->tpiba,
-                                        d_wg,
-                                        d_ekb,
+                                        d_wg + this->nbands * ik,
+                                        d_ekb + this->nbands * ik,
                                         qq_nt,
                                         deeq,
                                         becp,
@@ -693,20 +687,18 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_force(int ik, int npm, FPTYPE* force
     {
         cal_force_nl_op<FPTYPE, Device>()(this->ctx,
                                           npm,
-                                          this->nbands,
                                           this->ntype,
                                           this->nlpp_->deeq_nc.getBound2(),
                                           this->nlpp_->deeq_nc.getBound3(),
                                           this->nlpp_->deeq_nc.getBound4(),
                                           force_nc,
                                           this->nbands,
-                                          ik,
                                           nkb,
                                           atom_nh,
                                           atom_na,
                                           this->ucell_->tpiba,
-                                          d_wg,
-                                          d_ekb,
+                                          d_wg + this->nbands * ik,
+                                          d_ekb + this->nbands * ik,
                                           qq_nt,
                                           this->nlpp_->template get_deeq_nc_data<FPTYPE>(),
                                           becp,
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
index e23e0ccbc3..dddf889de1 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
@@ -35,7 +35,6 @@ __global__ void cal_vkb1_nl(
 template <typename FPTYPE>
 __global__ void cal_force_nl(
         const bool nondiagonal,
-        const int wg_nc,
         const int ntype,
         const int spin,
         const int deeq_2,
@@ -43,7 +42,6 @@ __global__ void cal_force_nl(
         const int deeq_4,
         const int forcenl_nc,
         const int nbands,
-        const int ik,
         const int nkb,
         const int *atom_nh,
         const int *atom_na,
@@ -65,11 +63,11 @@ __global__ void cal_force_nl(
         sum += atom_na[ii] * atom_nh[ii];
     }
 
-    int Nprojs = atom_nh[it];
-    FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba;
-    FPTYPE ekb_now = d_ekb[ik * wg_nc + ib];
+    int nproj = atom_nh[it];
+    FPTYPE fac = d_wg[ib] * 2.0 * tpiba;
+    FPTYPE ekb_now = d_ekb[ib];
     for (int ia = 0; ia < atom_na[it]; ia++) {
-        for (int ip = threadIdx.x; ip < Nprojs; ip += blockDim.x) {
+        for (int ip = threadIdx.x; ip < nproj; ip += blockDim.x) {
             // FPTYPE ps = GlobalC::ppcell.deeq[spin, iat, ip, ip];
             FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip]
                         - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip];
@@ -85,8 +83,8 @@ __global__ void cal_force_nl(
             }
 
             if (nondiagonal) {
-                //for (int ip2=0; ip2<Nprojs; ip2++)
-                for (int ip2 = 0; ip2 < Nprojs; ip2++) {
+                //for (int ip2=0; ip2<nproj; ip2++)
+                for (int ip2 = 0; ip2 < nproj; ip2++) {
                     if (ip != ip2) {
                         const int jnkb = sum + ip2;
                         ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2]
@@ -101,7 +99,7 @@ __global__ void cal_force_nl(
             }
         }
         iat += 1;
-        sum += Nprojs;
+        sum += nproj;
     }
 }
 
@@ -134,7 +132,6 @@ template <typename FPTYPE>
 void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* ctx,
                                                                   const bool& nondiagonal,
                                                                   const int& nbands_occ,
-                                                                  const int& wg_nc,
                                                                   const int& ntype,
                                                                   const int& spin,
                                                                   const int& deeq_2,
@@ -142,7 +139,6 @@ void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_dev
                                                                   const int& deeq_4,
                                                                   const int& forcenl_nc,
                                                                   const int& nbands,
-                                                                  const int& ik,
                                                                   const int& nkb,
                                                                   const int* atom_nh,
                                                                   const int* atom_na,
@@ -157,9 +153,9 @@ void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_dev
 {
     cal_force_nl<FPTYPE><<<nbands_occ * ntype, THREADS_PER_BLOCK>>>(
             nondiagonal,
-            wg_nc, ntype, spin,
+            ntype, spin,
             deeq_2, deeq_3, deeq_4,
-            forcenl_nc, nbands, ik, nkb,
+            forcenl_nc, nbands, nkb,
             atom_nh, atom_na,
             tpiba,
             d_wg, d_ekb, qq_nt, deeq,
@@ -172,14 +168,12 @@ void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_dev
 
 template <typename FPTYPE>
 __global__ void cal_force_nl(
-        const int wg_nc,
         const int ntype,
         const int deeq_2,
         const int deeq_3,
         const int deeq_4,
         const int forcenl_nc,
         const int nbands,
-        const int ik,
         const int nkb,
         const int *atom_nh,
         const int *atom_na,
@@ -202,13 +196,13 @@ __global__ void cal_force_nl(
         sum += atom_na[ii] * atom_nh[ii];
     }
 
-    int Nprojs = atom_nh[it];
-    FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba;
-    FPTYPE ekb_now = d_ekb[ik * wg_nc + ib];
+    int nproj = atom_nh[it];
+    FPTYPE fac = d_wg[ib] * 2.0 * tpiba;
+    FPTYPE ekb_now = d_ekb[ib];
     for (int ia = 0; ia < atom_na[it]; ia++) {
-        for (int ip = threadIdx.x; ip < Nprojs; ip += blockDim.x) {
+        for (int ip = threadIdx.x; ip < nproj; ip += blockDim.x) {
             const int inkb = sum + ip;
-            for (int ip2 = 0; ip2 < Nprojs; ip2++) 
+            for (int ip2 = 0; ip2 < nproj; ip2++) 
             {
                 // Effective values of the D-eS coefficients
                 const thrust::complex<FPTYPE> ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2];
@@ -231,7 +225,7 @@ __global__ void cal_force_nl(
             }
         }
         iat += 1;
-        sum += Nprojs;
+        sum += nproj;
     }
 }
 
@@ -239,14 +233,12 @@ __global__ void cal_force_nl(
 template <typename FPTYPE>
 void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* ctx,
                     const int& nbands_occ,
-                    const int& wg_nc,
                     const int& ntype,
                     const int& deeq_2,
                     const int& deeq_3,
                     const int& deeq_4,
                     const int& forcenl_nc,
                     const int& nbands,
-                    const int& ik,
                     const int& nkb,
                     const int* atom_nh,
                     const int* atom_na,
@@ -260,9 +252,9 @@ void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_dev
                     FPTYPE* force)
 {
     cal_force_nl<FPTYPE><<<nbands_occ * ntype, THREADS_PER_BLOCK>>>(
-            wg_nc, ntype,
+            ntype,
             deeq_2, deeq_3, deeq_4,
-            forcenl_nc, nbands, ik, nkb,
+            forcenl_nc, nbands, nkb,
             atom_nh, atom_na,
             tpiba,
             d_wg, d_ekb, qq_nt, 
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu
index 4f134e0042..e566126876 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu
@@ -107,8 +107,6 @@ __global__ void cal_stress_nl(
         const int nkb,
         const int ntype,
         const int spin,
-        const int wg_nc,
-        const int ik,
         const int deeq_2,
         const int deeq_3,
         const int deeq_4,
@@ -125,22 +123,25 @@ __global__ void cal_stress_nl(
     int ib = blockIdx.x / ntype;
     int it = blockIdx.x % ntype;
 
-    int iat = 0, sum = 0;
+    int iat = 0;
+    int sum = 0;
     for (int ii = 0; ii < it; ii++) {
         iat += atom_na[ii];
         sum += atom_na[ii] * atom_nh[ii];
     }
 
-    FPTYPE stress_var = 0, fac = d_wg[ik * wg_nc + ib] * 1.0, ekb_now = d_ekb[ik * wg_nc + ib];
-    const int Nprojs = atom_nh[it];
+    FPTYPE stress_var = 0;
+    const FPTYPE fac = d_wg[ib];
+    const FPTYPE ekb_now = d_ekb[ib];
+    const int nproj = atom_nh[it];
     for (int ia = 0; ia < atom_na[it]; ia++)
     {
-        for (int ii = threadIdx.x; ii < Nprojs * Nprojs; ii += blockDim.x) {
-            int ip1 = ii / Nprojs, ip2 = ii % Nprojs;
+        for (int ii = threadIdx.x; ii < nproj * nproj; ii += blockDim.x) {
+            const int ip1 = ii / nproj, ip2 = ii % nproj;
             if(!nondiagonal && ip1 != ip2) {
                 continue;
             }
-            FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip1) * deeq_4 + ip2]
+            const FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip1) * deeq_4 + ip2]
                         - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2];
             const int inkb1 = sum + ip1;
             const int inkb2 = sum + ip2;
@@ -149,7 +150,7 @@ __global__ void cal_stress_nl(
             stress_var -= ps * fac * dbb;
         }
         ++iat;
-        sum+=Nprojs;
+        sum+=nproj;
     }//ia
     __syncwarp();
     warp_reduce(stress_var);
@@ -204,8 +205,6 @@ void cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_de
                                                                    const int& nbands_occ,
                                                                    const int& ntype,
                                                                    const int& spin,
-                                                                   const int& wg_nc,
-                                                                   const int& ik,
                                                                    const int& deeq_2,
                                                                    const int& deeq_3,
                                                                    const int& deeq_4,
@@ -226,8 +225,6 @@ void cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_de
              nkb,
              ntype,
              spin,
-             wg_nc,
-             ik,
              deeq_2,
              deeq_3,
              deeq_4,
@@ -250,8 +247,6 @@ __global__ void cal_stress_nl(
         const int jpol,
         const int nkb,
         const int ntype,
-        const int wg_nc,
-        const int ik,
         const int deeq_2,
         const int deeq_3,
         const int deeq_4,
@@ -277,14 +272,14 @@ __global__ void cal_stress_nl(
     }
 
     FPTYPE stress_var = 0;
-    const FPTYPE fac = d_wg[ik * wg_nc + ib] * 1.0;
-    const FPTYPE ekb_now = d_ekb[ik * wg_nc + ib];
-    const int Nprojs = atom_nh[it];
+    const FPTYPE fac = d_wg[ib];
+    const FPTYPE ekb_now = d_ekb[ib];
+    const int nproj = atom_nh[it];
     for (int ia = 0; ia < atom_na[it]; ia++)
     {
-        for (int ii = threadIdx.x; ii < Nprojs * Nprojs; ii += blockDim.x) {
-            const int ip1 = ii / Nprojs;
-	        const int ip2 = ii % Nprojs;
+        for (int ii = threadIdx.x; ii < nproj * nproj; ii += blockDim.x) {
+            const int ip1 = ii / nproj;
+	        const int ip2 = ii % nproj;
             const thrust::complex<FPTYPE> ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2];
             const thrust::complex<FPTYPE> ps0 = deeq_nc[((iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq;
             const thrust::complex<FPTYPE> ps1 = deeq_nc[((1 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2];
@@ -300,7 +295,7 @@ __global__ void cal_stress_nl(
             stress_var -= fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real();
         }
         ++iat;
-        sum+=Nprojs;
+        sum+=nproj;
     }//ia
     __syncwarp();
     warp_reduce(stress_var);
@@ -316,8 +311,6 @@ void cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_de
                                                                    const int& nkb,
                                                                    const int& nbands_occ,
                                                                    const int& ntype,
-                                                                   const int& wg_nc,
-                                                                   const int& ik,
                                                                    const int& deeq_2,
                                                                    const int& deeq_3,
                                                                    const int& deeq_4,
@@ -336,8 +329,6 @@ void cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_de
              jpol,
              nkb,
              ntype,
-             wg_nc,
-             ik,
              deeq_2,
              deeq_3,
              deeq_4,
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
index 90a4641123..2ee9cb4530 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
@@ -42,7 +42,6 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_CPU>
     void operator()(const base_device::DEVICE_CPU* ctx,
                     const bool& nondiagonal,
                     const int& nbands_occ,
-                    const int& wg_nc,
                     const int& ntype,
                     const int& spin,
                     const int& deeq_2,
@@ -50,7 +49,6 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_CPU>
                     const int& deeq_4,
                     const int& forcenl_nc,
                     const int& nbands,
-                    const int& ik,
                     const int& nkb,
                     const int* atom_nh,
                     const int* atom_na,
@@ -71,7 +69,7 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_CPU>
             int sum0 = 0;
             for (int it = 0; it < ntype; it++)
             {
-                const int Nprojs = atom_nh[it];
+                const int nproj = atom_nh[it];
 #ifdef _OPENMP
 #pragma omp for collapse(2)
 #endif
@@ -80,11 +78,11 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_CPU>
                     for (int ib = 0; ib < nbands_occ; ib++)
                     {
                         FPTYPE local_force[3] = {0, 0, 0};
-                        FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba;
-                        FPTYPE ekb_now = d_ekb[ik * wg_nc + ib];
+                        FPTYPE fac = d_wg[ib] * 2.0 * tpiba;
+                        FPTYPE ekb_now = d_ekb[ib];
                         int iat = iat0 + ia;
-                        int sum = sum0 + ia * Nprojs;
-                        for (int ip = 0; ip < Nprojs; ip++)
+                        int sum = sum0 + ia * nproj;
+                        for (int ip = 0; ip < nproj; ip++)
                         {
                             // Effective values of the D-eS coefficients
                             FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip]
@@ -102,7 +100,7 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_CPU>
                             }
                             if (nondiagonal)
                             {
-                                for (int ip2 = 0; ip2 < Nprojs; ip2++)
+                                for (int ip2 = 0; ip2 < nproj; ip2++)
                                 {
                                     if (ip != ip2)
                                     {
@@ -141,7 +139,7 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_CPU>
                     }
                 } // end ia
                 iat0 += atom_na[it];
-                sum0 += atom_na[it] * Nprojs;
+                sum0 += atom_na[it] * nproj;
             } // end it
 #ifdef _OPENMP
         }
@@ -150,14 +148,12 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_CPU>
 
     void operator()(const base_device::DEVICE_CPU* ctx,
                     const int& nbands_occ,
-                    const int& wg_nc,
                     const int& ntype,
                     const int& deeq_2,
                     const int& deeq_3,
                     const int& deeq_4,
                     const int& forcenl_nc,
                     const int& nbands,
-                    const int& ik,
                     const int& nkb,
                     const int* atom_nh,
                     const int* atom_na,
@@ -188,8 +184,8 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_CPU>
                     {
                         const int ib2 = ib*2;
                         FPTYPE local_force[3] = {0, 0, 0};
-                        FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba;
-                        FPTYPE ekb_now = d_ekb[ik * wg_nc + ib];
+                        FPTYPE fac = d_wg[ib] * 2.0 * tpiba;
+                        FPTYPE ekb_now = d_ekb[ib];
                         int iat = iat0 + ia;
                         int sum = sum0 + ia * nprojs;
                         for (int ip = 0; ip < nprojs; ip++)
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
index 05c419dd5e..c68df7bd0f 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
@@ -48,7 +48,6 @@ struct cal_force_nl_op
     /// @param ctx - which device this function runs on
     /// @param nondiagonal - control flag
     /// @param nbands_occ - number of occupied bands
-    /// @param wg_nc - the second dimension of matrix wg
     /// @param ntype - total atomic type
     /// @param spin - current spin
     /// @param deeq_2 - the second dimension of deeq
@@ -56,7 +55,6 @@ struct cal_force_nl_op
     /// @param deeq_4 - the forth dimension of deeq
     /// @param forcenl_nc - the second dimension of matrix forcenl
     /// @param nbands - NBANDS
-    /// @param ik - current k point
     /// @param nkb - number of k point
     /// @param atom_nh - GlobalC::ucell.atoms[ii].ncpp.nh
     /// @param atom_na - GlobalC::ucell.atoms[ii].na
@@ -73,7 +71,6 @@ struct cal_force_nl_op
     void operator()(const base_device::DEVICE_CPU* ctx,
                     const bool& nondiagonal,
                     const int& nbands_occ,
-                    const int& wg_nc,
                     const int& ntype,
                     const int& spin,
                     const int& deeq_2,
@@ -81,7 +78,6 @@ struct cal_force_nl_op
                     const int& deeq_4,
                     const int& forcenl_nc,
                     const int& nbands,
-                    const int& ik,
                     const int& nkb,
                     const int* atom_nh,
                     const int* atom_na,
@@ -96,14 +92,12 @@ struct cal_force_nl_op
     // interface for nspin=4 only
     void operator()(const base_device::DEVICE_CPU* ctx,
                     const int& nbands_occ,
-                    const int& wg_nc,
                     const int& ntype,
                     const int& deeq_2,
                     const int& deeq_3,
                     const int& deeq_4,
                     const int& forcenl_nc,
                     const int& nbands,
-                    const int& ik,
                     const int& nkb,
                     const int* atom_nh,
                     const int* atom_na,
@@ -139,7 +133,6 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>
     void operator()(const base_device::DEVICE_GPU* ctx,
                     const bool& nondiagonal,
                     const int& nbands_occ,
-                    const int& wg_nc,
                     const int& ntype,
                     const int& spin,
                     const int& deeq_2,
@@ -147,7 +140,6 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>
                     const int& deeq_4,
                     const int& forcenl_nc,
                     const int& nbands,
-                    const int& ik,
                     const int& nkb,
                     const int* atom_nh,
                     const int* atom_na,
@@ -162,14 +154,12 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>
     // interface for nspin=4 only
     void operator()(const base_device::DEVICE_GPU* ctx,
                     const int& nbands_occ,
-                    const int& wg_nc,
                     const int& ntype,
                     const int& deeq_2,
                     const int& deeq_3,
                     const int& deeq_4,
                     const int& forcenl_nc,
                     const int& nbands,
-                    const int& ik,
                     const int& nkb,
                     const int* atom_nh,
                     const int* atom_na,
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu
index 786a6b8ebf..da45a375e7 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu
@@ -31,7 +31,6 @@ __global__ void cal_vkb1_nl(
 template <typename FPTYPE>
 __global__ void cal_force_nl(
         const bool nondiagonal,
-        const int wg_nc,
         const int ntype,
         const int spin,
         const int deeq_2,
@@ -39,7 +38,6 @@ __global__ void cal_force_nl(
         const int deeq_4,
         const int forcenl_nc,
         const int nbands,
-        const int ik,
         const int nkb,
         const int *atom_nh,
         const int *atom_na,
@@ -55,17 +53,18 @@ __global__ void cal_force_nl(
     const int ib = blockIdx.x / ntype;
     const int it = blockIdx.x % ntype;
 
-    int iat = 0, sum = 0;
+    int iat = 0;
+    int sum = 0;
     for (int ii = 0; ii < it; ii++) {
         iat += atom_na[ii];
         sum += atom_na[ii] * atom_nh[ii];
     }
 
-    int Nprojs = atom_nh[it];
-    FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba;
-    FPTYPE ekb_now = d_ekb[ik * wg_nc + ib];
+    int nproj = atom_nh[it];
+    FPTYPE fac = d_wg[ib] * 2.0 * tpiba;
+    FPTYPE ekb_now = d_ekb[ib];
     for (int ia = 0; ia < atom_na[it]; ia++) {
-        for (int ip = threadIdx.x; ip < Nprojs; ip += blockDim.x) {
+        for (int ip = threadIdx.x; ip < nproj; ip += blockDim.x) {
             // FPTYPE ps = GlobalC::ppcell.deeq[spin, iat, ip, ip];
             FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip]
                         - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip];
@@ -81,8 +80,8 @@ __global__ void cal_force_nl(
             }
 
             if (nondiagonal) {
-                //for (int ip2=0; ip2<Nprojs; ip2++)
-                for (int ip2 = 0; ip2 < Nprojs; ip2++) {
+                //for (int ip2=0; ip2<nproj; ip2++)
+                for (int ip2 = 0; ip2 < nproj; ip2++) {
                     if (ip != ip2) {
                         const int jnkb = sum + ip2;
                         ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2]
@@ -97,7 +96,7 @@ __global__ void cal_force_nl(
             }
         }
         iat += 1;
-        sum += Nprojs;
+        sum += nproj;
     }
 }
 
@@ -130,7 +129,6 @@ template <typename FPTYPE>
 void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* ctx,
                                                                   const bool& nondiagonal,
                                                                   const int& nbands_occ,
-                                                                  const int& wg_nc,
                                                                   const int& ntype,
                                                                   const int& spin,
                                                                   const int& deeq_2,
@@ -138,7 +136,6 @@ void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_dev
                                                                   const int& deeq_4,
                                                                   const int& forcenl_nc,
                                                                   const int& nbands,
-                                                                  const int& ik,
                                                                   const int& nkb,
                                                                   const int* atom_nh,
                                                                   const int* atom_na,
@@ -153,9 +150,9 @@ void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_dev
 {
     hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_force_nl<FPTYPE>), dim3(nbands_occ * ntype), dim3(THREADS_PER_BLOCK), 0, 0,
             nondiagonal,
-            wg_nc, ntype, spin,
+            ntype, spin,
             deeq_2, deeq_3, deeq_4,
-            forcenl_nc, nbands, ik, nkb,
+            forcenl_nc, nbands, nkb,
             atom_nh, atom_na,
             tpiba,
             d_wg, d_ekb, qq_nt, deeq,
@@ -168,14 +165,12 @@ void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_dev
 
 template <typename FPTYPE>
 __global__ void cal_force_nl(
-        const int wg_nc,
         const int ntype,
         const int deeq_2,
         const int deeq_3,
         const int deeq_4,
         const int forcenl_nc,
         const int nbands,
-        const int ik,
         const int nkb,
         const int *atom_nh,
         const int *atom_na,
@@ -192,19 +187,20 @@ __global__ void cal_force_nl(
     const int ib2 = ib * 2;
     const int it = blockIdx.x % ntype;
 
-    int iat = 0, sum = 0;
+    int iat = 0;
+    int sum = 0;
     for (int ii = 0; ii < it; ii++) {
         iat += atom_na[ii];
         sum += atom_na[ii] * atom_nh[ii];
     }
 
-    int Nprojs = atom_nh[it];
-    FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba;
-    FPTYPE ekb_now = d_ekb[ik * wg_nc + ib];
+    int nproj = atom_nh[it];
+    FPTYPE fac = d_wg[ib] * 2.0 * tpiba;
+    FPTYPE ekb_now = d_ekb[ib];
     for (int ia = 0; ia < atom_na[it]; ia++) {
-        for (int ip = threadIdx.x; ip < Nprojs; ip += blockDim.x) {
+        for (int ip = threadIdx.x; ip < nproj; ip += blockDim.x) {
             const int inkb = sum + ip;
-            for (int ip2 = 0; ip2 < Nprojs; ip2++) 
+            for (int ip2 = 0; ip2 < nproj; ip2++) 
             {
                 // Effective values of the D-eS coefficients
                 const thrust::complex<FPTYPE> ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2];
@@ -227,7 +223,7 @@ __global__ void cal_force_nl(
             }
         }
         iat += 1;
-        sum += Nprojs;
+        sum += nproj;
     }
 }
 
@@ -235,14 +231,12 @@ __global__ void cal_force_nl(
 template <typename FPTYPE>
 void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* ctx,
                     const int& nbands_occ,
-                    const int& wg_nc,
                     const int& ntype,
                     const int& deeq_2,
                     const int& deeq_3,
                     const int& deeq_4,
                     const int& forcenl_nc,
                     const int& nbands,
-                    const int& ik,
                     const int& nkb,
                     const int* atom_nh,
                     const int* atom_na,
@@ -256,9 +250,9 @@ void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_dev
                     FPTYPE* force)
 {
     hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_force_nl<FPTYPE>), dim3(nbands_occ * ntype), dim3(THREADS_PER_BLOCK), 0, 0,
-            wg_nc, ntype,
+            ntype,
             deeq_2, deeq_3, deeq_4,
-            forcenl_nc, nbands, ik, nkb,
+            forcenl_nc, nbands, nkb,
             atom_nh, atom_na,
             tpiba,
             d_wg, d_ekb, qq_nt, 
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu
index 96675e359d..6880987546 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu
@@ -102,8 +102,6 @@ __global__ void cal_stress_nl(
         const int nkb,
         const int ntype,
         const int spin,
-        const int wg_nc,
-        const int ik,
         const int deeq_2,
         const int deeq_3,
         const int deeq_4,
@@ -120,18 +118,21 @@ __global__ void cal_stress_nl(
     int ib = blockIdx.x / ntype;
     int it = blockIdx.x % ntype;
 
-    int iat = 0, sum = 0;
+    int iat = 0;
+    int sum = 0;
     for (int ii = 0; ii < it; ii++) {
         iat += atom_na[ii];
         sum += atom_na[ii] * atom_nh[ii];
     }
 
-    FPTYPE stress_var = 0, fac = d_wg[ik * wg_nc + ib] * 1.0, ekb_now = d_ekb[ik * wg_nc + ib];
-    const int Nprojs = atom_nh[it];
+    FPTYPE stress_var = 0;
+    FPTYPE fac = d_wg[ib];
+    FPTYPE ekb_now = d_ekb[ib];
+    const int nproj = atom_nh[it];
     for (int ia = 0; ia < atom_na[it]; ia++)
     {
-        for (int ii = threadIdx.x; ii < Nprojs * Nprojs; ii += blockDim.x) {
-            int ip1 = ii / Nprojs, ip2 = ii % Nprojs;
+        for (int ii = threadIdx.x; ii < nproj * nproj; ii += blockDim.x) {
+            int ip1 = ii / nproj, ip2 = ii % nproj;
             if(!nondiagonal && ip1 != ip2) {
                 continue;
             }
@@ -144,7 +145,7 @@ __global__ void cal_stress_nl(
             stress_var -= ps * fac * dbb;
         }
         ++iat;
-        sum+=Nprojs;
+        sum+=nproj;
     }//ia
     warp_reduce(stress_var);
     if (threadIdx.x % WARP_SIZE == 0) {
@@ -198,8 +199,6 @@ void cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_de
                                                                    const int& nbands_occ,
                                                                    const int& ntype,
                                                                    const int& spin,
-                                                                   const int& wg_nc,
-                                                                   const int& ik,
                                                                    const int& deeq_2,
                                                                    const int& deeq_3,
                                                                    const int& deeq_4,
@@ -220,8 +219,6 @@ void cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_de
              nkb,
              ntype,
              spin,
-             wg_nc,
-             ik,
              deeq_2,
              deeq_3,
              deeq_4,
@@ -244,8 +241,6 @@ __global__ void cal_stress_nl(
         const int jpol,
         const int nkb,
         const int ntype,
-        const int wg_nc,
-        const int ik,
         const int deeq_2,
         const int deeq_3,
         const int deeq_4,
@@ -263,18 +258,22 @@ __global__ void cal_stress_nl(
     const int ib2  = ib * 2;
     int it = blockIdx.x % ntype;
 
-    int iat = 0, sum = 0;
+    int iat = 0;
+    int sum = 0;
     for (int ii = 0; ii < it; ii++) {
         iat += atom_na[ii];
         sum += atom_na[ii] * atom_nh[ii];
     }
 
-    FPTYPE stress_var = 0, fac = d_wg[ik * wg_nc + ib] * 1.0, ekb_now = d_ekb[ik * wg_nc + ib];
-    const int Nprojs = atom_nh[it];
+    FPTYPE stress_var = 0;
+    FPTYPE fac = d_wg[ib];
+    FPTYPE ekb_now = d_ekb[ib];
+    const int nproj = atom_nh[it];
     for (int ia = 0; ia < atom_na[it]; ia++)
     {
-        for (int ii = threadIdx.x; ii < Nprojs * Nprojs; ii += blockDim.x) {
-            int ip1 = ii / Nprojs, ip2 = ii % Nprojs;
+        for (int ii = threadIdx.x; ii < nproj * nproj; ii += blockDim.x) {
+            const int ip1 = ii / nproj;
+            const int ip2 = ii % nproj;
             const thrust::complex<FPTYPE> ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2];
             const thrust::complex<FPTYPE> ps0 = deeq_nc[((iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq;
             const thrust::complex<FPTYPE> ps1 = deeq_nc[((1 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2];
@@ -290,7 +289,7 @@ __global__ void cal_stress_nl(
             stress_var -= fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real();
         }
         ++iat;
-        sum+=Nprojs;
+        sum+=nproj;
     }//ia
     __syncwarp();
     warp_reduce(stress_var);
@@ -306,8 +305,6 @@ void cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_de
                                                                    const int& nkb,
                                                                    const int& nbands_occ,
                                                                    const int& ntype,
-                                                                   const int& wg_nc,
-                                                                   const int& ik,
                                                                    const int& deeq_2,
                                                                    const int& deeq_3,
                                                                    const int& deeq_4,
@@ -326,8 +323,6 @@ void cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_de
              jpol,
              nkb,
              ntype,
-             wg_nc,
-             ik,
              deeq_2,
              deeq_3,
              deeq_4,
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp
index 142ec6e4a8..f36f2e8316 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp
@@ -77,8 +77,6 @@ struct cal_stress_nl_op<FPTYPE, base_device::DEVICE_CPU>
                     const int& nbands_occ,
                     const int& ntype,
                     const int& spin,
-                    const int& wg_nc,
-                    const int& ik,
                     const int& deeq_2,
                     const int& deeq_3,
                     const int& deeq_4,
@@ -97,10 +95,11 @@ struct cal_stress_nl_op<FPTYPE, base_device::DEVICE_CPU>
 #pragma omp parallel reduction(+ : local_stress)
         {
 #endif
-            int iat = 0, sum = 0;
+            int iat = 0;
+            int sum = 0;
             for (int it = 0; it < ntype; it++)
             {
-                const int Nprojs = atom_nh[it];
+                const int nproj = atom_nh[it];
 #ifdef _OPENMP
 #pragma omp for collapse(4)
 #endif
@@ -108,20 +107,20 @@ struct cal_stress_nl_op<FPTYPE, base_device::DEVICE_CPU>
                 {
                     for (int ia = 0; ia < atom_na[it]; ia++)
                     {
-                        for (int ip1 = 0; ip1 < Nprojs; ip1++)
+                        for (int ip1 = 0; ip1 < nproj; ip1++)
                         {
-                            for (int ip2 = 0; ip2 < Nprojs; ip2++)
+                            for (int ip2 = 0; ip2 < nproj; ip2++)
                             {
                                 if (!nondiagonal && ip1 != ip2)
                                 {
                                     continue;
                                 }
-                                FPTYPE fac = d_wg[ik * wg_nc + ib] * 1.0;
-                                FPTYPE ekb_now = d_ekb[ik * wg_nc + ib];
+                                FPTYPE fac = d_wg[ib] * 1.0;
+                                FPTYPE ekb_now = d_ekb[ib];
                                 FPTYPE ps = deeq[((spin * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2]
                                             - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2];
-                                const int inkb1 = sum + ia * Nprojs + ip1;
-                                const int inkb2 = sum + ia * Nprojs + ip2;
+                                const int inkb1 = sum + ia * nproj + ip1;
+                                const int inkb2 = sum + ia * nproj + ip2;
                                 // out<<"\n ps = "<<ps;
 
                                 const FPTYPE dbb = (conj(dbecp[ib * nkb + inkb1]) * becp[ib * nkb + inkb2]).real();
@@ -130,7 +129,7 @@ struct cal_stress_nl_op<FPTYPE, base_device::DEVICE_CPU>
                         } // end ip
                     }     // ia
                 }
-                sum += atom_na[it] * Nprojs;
+                sum += atom_na[it] * nproj;
                 iat += atom_na[it];
             } // end it
 #ifdef _OPENMP
@@ -145,8 +144,6 @@ struct cal_stress_nl_op<FPTYPE, base_device::DEVICE_CPU>
                     const int& nkb,
                     const int& nbands_occ,
                     const int& ntype,
-                    const int& wg_nc,
-                    const int& ik,
                     const int& deeq_2,
                     const int& deeq_3,
                     const int& deeq_4,
@@ -168,7 +165,7 @@ struct cal_stress_nl_op<FPTYPE, base_device::DEVICE_CPU>
             int iat = 0, sum = 0;
             for (int it = 0; it < ntype; it++)
             {
-                const int Nprojs = atom_nh[it];
+                const int nproj = atom_nh[it];
 #ifdef _OPENMP
 #pragma omp for collapse(4)
 #endif
@@ -176,20 +173,20 @@ struct cal_stress_nl_op<FPTYPE, base_device::DEVICE_CPU>
                 {
                     for (int ia = 0; ia < atom_na[it]; ia++)
                     {
-                        for (int ip1 = 0; ip1 < Nprojs; ip1++)
+                        for (int ip1 = 0; ip1 < nproj; ip1++)
                         {
-                            for (int ip2 = 0; ip2 < Nprojs; ip2++)
+                            for (int ip2 = 0; ip2 < nproj; ip2++)
                             {
                                 const int ib2 = ib*2;
-                                FPTYPE fac = d_wg[ik * wg_nc + ib] * 1.0;
-                                FPTYPE ekb_now = d_ekb[ik * wg_nc + ib];
+                                FPTYPE fac = d_wg[ib] * 1.0;
+                                FPTYPE ekb_now = d_ekb[ib];
                                 std::complex<FPTYPE> ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2];
                                 std::complex<FPTYPE> ps0 = deeq_nc[((iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq;
                                 std::complex<FPTYPE> ps1 = deeq_nc[((1 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2];
                                 std::complex<FPTYPE> ps2 = deeq_nc[((2 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2];
                                 std::complex<FPTYPE> ps3 = deeq_nc[((3 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq;
-                                const int inkb1 = sum + ia * Nprojs + ip1;
-                                const int inkb2 = sum + ia * Nprojs + ip2;
+                                const int inkb1 = sum + ia * nproj + ip1;
+                                const int inkb2 = sum + ia * nproj + ip2;
                                 // out<<"\n ps = "<<ps;
 
                                 const std::complex<FPTYPE> dbb0 = conj(dbecp[ib2 * nkb + inkb1]) * becp[ib2 * nkb + inkb2];
@@ -201,7 +198,7 @@ struct cal_stress_nl_op<FPTYPE, base_device::DEVICE_CPU>
                         } // end ip
                     }     // ia
                 }
-                sum += atom_na[it] * Nprojs;
+                sum += atom_na[it] * nproj;
                 iat += atom_na[it];
             } // end it
 #ifdef _OPENMP
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h
index 725bfdb981..2f4bbd0989 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h
@@ -66,8 +66,6 @@ struct cal_stress_nl_op
     /// @param nbands_occ - number of occupied bands
     /// @param ntype - total atomic type
     /// @param spin - current spin
-    /// @param wg_nc - the second dimension of matrix wg
-    /// @param ik - current k point
     /// @param deeq_2 - the second dimension of deeq
     /// @param deeq_3 - the third dimension of deeq
     /// @param deeq_4 - the forth dimension of deeq
@@ -90,8 +88,6 @@ struct cal_stress_nl_op
                     const int& nbands_occ,
                     const int& ntype,
                     const int& spin,
-                    const int& wg_nc,
-                    const int& ik,
                     const int& deeq_2,
                     const int& deeq_3,
                     const int& deeq_4,
@@ -111,8 +107,6 @@ struct cal_stress_nl_op
                     const int& nkb,
                     const int& nbands_occ,
                     const int& ntype,
-                    const int& wg_nc,
-                    const int& ik,
                     const int& deeq_2,
                     const int& deeq_3,
                     const int& deeq_4,
@@ -258,8 +252,6 @@ struct cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>
                     const int& nbands_occ,
                     const int& ntype,
                     const int& spin,
-                    const int& wg_nc,
-                    const int& ik,
                     const int& deeq_2,
                     const int& deeq_3,
                     const int& deeq_4,
@@ -279,8 +271,6 @@ struct cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>
                     const int& nkb,
                     const int& nbands_occ,
                     const int& ntype,
-                    const int& wg_nc,
-                    const int& ik,
                     const int& deeq_2,
                     const int& deeq_3,
                     const int& deeq_4,
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/force_op_test.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/force_op_test.cpp
index a791fd933b..3da3ca3b19 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/force_op_test.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/force_op_test.cpp
@@ -2887,7 +2887,6 @@ TEST_F(TestSrcPWForceMultiDevice, cal_force_nl_op_cpu)
     hamilt::cal_force_nl_op<double, base_device::DEVICE_CPU>()(cpu_ctx,
                                                                multi_proj,
                                                                nbands_occ,
-                                                               wg_nc,
                                                                ntype,
                                                                spin,
                                                                deeq_2,
@@ -2895,7 +2894,6 @@ TEST_F(TestSrcPWForceMultiDevice, cal_force_nl_op_cpu)
                                                                deeq_4,
                                                                forcenl_nc,
                                                                nbands,
-                                                               ik,
                                                                nkb,
                                                                atom_nh.data(),
                                                                atom_na.data(),
@@ -2980,7 +2978,6 @@ TEST_F(TestSrcPWForceMultiDevice, cal_force_nl_op_gpu)
     hamilt::cal_force_nl_op<double, base_device::DEVICE_GPU>()(gpu_ctx,
                                                                multi_proj,
                                                                nbands_occ,
-                                                               wg_nc,
                                                                ntype,
                                                                spin,
                                                                deeq_2,
@@ -2988,7 +2985,6 @@ TEST_F(TestSrcPWForceMultiDevice, cal_force_nl_op_gpu)
                                                                deeq_4,
                                                                forcenl_nc,
                                                                nbands,
-                                                               ik,
                                                                nkb,
                                                                d_atom_nh,
                                                                d_atom_na,
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/stress_op_test.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/stress_op_test.cpp
index 81cd5de858..4f84fa098d 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/stress_op_test.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/stress_op_test.cpp
@@ -96,8 +96,6 @@ TEST(TestSrcPWStressMultiDevice, cal_stress_nl_op_cpu)
                                                                 nbands_occ,
                                                                 ntype,
                                                                 spin,
-                                                                wg_nc,
-                                                                ik,
                                                                 deeq_2,
                                                                 deeq_3,
                                                                 deeq_4,
@@ -271,8 +269,6 @@ TEST(TestSrcPWStressMultiDevice, cal_stress_nl_op_gpu)
                                                                 nbands_occ,
                                                                 ntype,
                                                                 spin,
-                                                                wg_nc,
-                                                                ik,
                                                                 deeq_2,
                                                                 deeq_3,
                                                                 deeq_4,

From 92e8f581dbb7396bf4e14fbe8384a27b40f0828d Mon Sep 17 00:00:00 2001
From: dyzheng <zhengdy@dp.tech>
Date: Thu, 31 Oct 2024 06:59:52 +0000
Subject: [PATCH 8/8] Test: add force and stress test for noncollinear-spin and
 soc

---
 tests/integrate/104_PW_NC_magnetic/INPUT      | 18 ++++++++++--------
 tests/integrate/104_PW_NC_magnetic/KPT        |  2 +-
 tests/integrate/104_PW_NC_magnetic/STRU       |  2 +-
 tests/integrate/104_PW_NC_magnetic/jd         |  1 +
 tests/integrate/104_PW_NC_magnetic/result.ref | 11 +++++------
 tests/integrate/140_PW_15_SO/INPUT            |  6 +++++-
 tests/integrate/140_PW_15_SO/KPT              |  2 +-
 tests/integrate/140_PW_15_SO/result.ref       |  8 +++++---
 8 files changed, 29 insertions(+), 21 deletions(-)
 create mode 100644 tests/integrate/104_PW_NC_magnetic/jd

diff --git a/tests/integrate/104_PW_NC_magnetic/INPUT b/tests/integrate/104_PW_NC_magnetic/INPUT
index 622af75b86..d761feb2c1 100644
--- a/tests/integrate/104_PW_NC_magnetic/INPUT
+++ b/tests/integrate/104_PW_NC_magnetic/INPUT
@@ -4,10 +4,9 @@ suffix    autotest
 #nbands   40
 
 calculation   scf
-nspin         2
-ecutwfc       15
-scf_thr           1.0e-8
-scf_nmax         200
+ecutwfc       20
+scf_thr           1.0e-6
+scf_nmax         50
 out_chg    0
 
 #init_chg    file
@@ -16,14 +15,17 @@ out_chg    0
 #out_band     1
 noncolin	1
 smearing_method    gaussian
-smearing_sigma       0.07
+smearing_sigma       0.02
 
 mixing_type    broyden 
-mixing_beta    0.5
+mixing_beta    0.2
 mixing_ndim 10
 
-ks_solver     cg
+ks_solver     dav_subspace
+pw_diag_ndim  2
 basis_type    pw
-symmetry      1
+symmetry      0
+cal_force     1
+cal_stress    1
 pseudo_dir	../../PP_ORB
 orbital_dir	../../PP_ORB
diff --git a/tests/integrate/104_PW_NC_magnetic/KPT b/tests/integrate/104_PW_NC_magnetic/KPT
index db31432445..28006d5e2d 100644
--- a/tests/integrate/104_PW_NC_magnetic/KPT
+++ b/tests/integrate/104_PW_NC_magnetic/KPT
@@ -1,4 +1,4 @@
 K_POINTS
 0
 Gamma
-1 1 1  0 0 0
+2 2 2  0 0 0
diff --git a/tests/integrate/104_PW_NC_magnetic/STRU b/tests/integrate/104_PW_NC_magnetic/STRU
index d1981a2504..5b61e75c50 100644
--- a/tests/integrate/104_PW_NC_magnetic/STRU
+++ b/tests/integrate/104_PW_NC_magnetic/STRU
@@ -18,5 +18,5 @@ Fe
 1.0
 2
 0.00 0.00 0.00 1 1 1 angle1 90 angle2 0 
-0.50 0.50 0.50 1 1 1 angle1 90 angle2 180
+0.51 0.52 0.53 1 1 1 angle1 90 angle2 0
 
diff --git a/tests/integrate/104_PW_NC_magnetic/jd b/tests/integrate/104_PW_NC_magnetic/jd
new file mode 100644
index 0000000000..a4292b9c28
--- /dev/null
+++ b/tests/integrate/104_PW_NC_magnetic/jd
@@ -0,0 +1 @@
+Test noncolliner-spin calculation for FM-Fe2 , test force and stress for k-222
diff --git a/tests/integrate/104_PW_NC_magnetic/result.ref b/tests/integrate/104_PW_NC_magnetic/result.ref
index 384804bb8d..21c5c36613 100644
--- a/tests/integrate/104_PW_NC_magnetic/result.ref
+++ b/tests/integrate/104_PW_NC_magnetic/result.ref
@@ -1,6 +1,5 @@
-etotref -5866.197307043378
-etotperatomref -2933.0986535217
-pointgroupref O_h
-spacegroupref O_h
-nksibzref 1
-totaltimeref 2.23
+etotref -6156.9375288905130219
+etotperatomref -3078.4687644453
+totalforceref 2.221910
+totalstressref 76009.325784
+totaltimeref 3.03
diff --git a/tests/integrate/140_PW_15_SO/INPUT b/tests/integrate/140_PW_15_SO/INPUT
index fc09132c7c..1c58840435 100644
--- a/tests/integrate/140_PW_15_SO/INPUT
+++ b/tests/integrate/140_PW_15_SO/INPUT
@@ -25,12 +25,16 @@ ecutwfc             20
 scf_thr                 1e-6
 scf_nmax               100
 
+cal_force           1
+cal_stress          1
+
 #cal_stress             1
 #noncolin           1
 lspinorb            1
 
 basis_type          pw
-ks_solver           dav
+ks_solver           dav_subspace
+pw_diag_ndim        2
 chg_extrap       second-order
 out_dm              0
 pw_diag_thr                0.00001
diff --git a/tests/integrate/140_PW_15_SO/KPT b/tests/integrate/140_PW_15_SO/KPT
index c289c0158a..e769af7638 100644
--- a/tests/integrate/140_PW_15_SO/KPT
+++ b/tests/integrate/140_PW_15_SO/KPT
@@ -1,4 +1,4 @@
 K_POINTS
 0
 Gamma
-1 1 1 0 0 0
+2 1 1 0 0 0
diff --git a/tests/integrate/140_PW_15_SO/result.ref b/tests/integrate/140_PW_15_SO/result.ref
index 2f6201a31f..6bb325aacb 100644
--- a/tests/integrate/140_PW_15_SO/result.ref
+++ b/tests/integrate/140_PW_15_SO/result.ref
@@ -1,3 +1,5 @@
-etotref -1668.616833271131
-etotperatomref -834.3084166356
-totaltimeref 5.15
+etotref -1678.3650981686614614
+etotperatomref -839.1825490843
+totalforceref 1.739332
+totalstressref 34372.194072
+totaltimeref 0.99