From cda0e5d483e57eeb7a69b7a5d99a91a968f5e6ce Mon Sep 17 00:00:00 2001 From: dyzheng Date: Wed, 31 Jul 2024 13:09:54 +0800 Subject: [PATCH 1/8] Fix: noncollinear-spin force and stress with PW --- .../module_xc/xc_functional_gradcorr.cpp | 2 +- .../hamilt_pwdft/forces_nl.cpp | 2 +- .../hamilt_pwdft/fs_nonlocal_tools.cpp | 138 +++++++++++++----- .../hamilt_pwdft/kernels/force_op.cpp | 102 +++++++++++++ .../hamilt_pwdft/kernels/force_op.h | 23 +++ .../hamilt_pwdft/kernels/stress_op.cpp | 76 ++++++++++ .../hamilt_pwdft/kernels/stress_op.h | 22 +++ .../hamilt_pwdft/stress_func_kin.cpp | 16 +- .../hamilt_pwdft/stress_func_loc.cpp | 4 +- .../hamilt_pwdft/stress_func_nl.cpp | 2 +- 10 files changed, 343 insertions(+), 44 deletions(-) diff --git a/source/module_hamilt_general/module_xc/xc_functional_gradcorr.cpp b/source/module_hamilt_general/module_xc/xc_functional_gradcorr.cpp index 4d94f68ec4..d1082c809b 100644 --- a/source/module_hamilt_general/module_xc/xc_functional_gradcorr.cpp +++ b/source/module_hamilt_general/module_xc/xc_functional_gradcorr.cpp @@ -186,7 +186,7 @@ void XC_Functional::gradcorr(double &etxc, double &vtxc, ModuleBase::matrix &v, } gdr2 = new ModuleBase::Vector3[rhopw->nrxx]; - h2 = new ModuleBase::Vector3[rhopw->nrxx]; + if(!is_stress) h2 = new ModuleBase::Vector3[rhopw->nrxx]; XC_Functional::grad_rho( rhogsum1 , gdr1, rhopw, ucell->tpiba); XC_Functional::grad_rho( rhogsum2 , gdr2, rhopw, ucell->tpiba); diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces_nl.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces_nl.cpp index 04caefd1bf..92733745b4 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/forces_nl.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/forces_nl.cpp @@ -45,7 +45,7 @@ void Forces::cal_force_nl(ModuleBase::matrix& forcenl, break; } } - const int npm = ucell_in.get_npol() * nbands_occ; + const int npm = nbands_occ; // calculate becp = for all beta functions nl_tools.cal_becp(ik, npm); for (int ipol = 0; ipol < 3; ipol++) diff --git a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp index f6773b3ddb..e189f937d9 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp @@ -162,9 +162,12 @@ void FS_Nonlocal_tools::cal_becp(int ik, int npm) { ModuleBase::TITLE("FS_Nonlocal_tools", "cal_becp"); ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_becp"); + const int npol = this->ucell_->get_npol(); + const int size_becp = this->nbands * npol * this->nkb; + const int size_becp_act = npm * npol * this->nkb; if (this->becp == nullptr) { - resmem_complex_op()(this->ctx, becp, this->nbands * this->nkb); + resmem_complex_op()(this->ctx, becp, size_becp); } // prepare math tools @@ -248,11 +251,12 @@ void FS_Nonlocal_tools::cal_becp(int ik, int npm) } const char transa = 'C'; const char transb = 'N'; + int npm_npol = npm * npol; gemm_op()(this->ctx, transa, transb, nkb, - npm, + npm_npol, npw, &ModuleBase::ONE, ppcell_vkb, @@ -267,15 +271,15 @@ void FS_Nonlocal_tools::cal_becp(int ik, int npm) if (this->device == base_device::GpuDevice) { std::complex* h_becp = nullptr; - resmem_complex_h_op()(this->cpu_ctx, h_becp, this->nbands * nkb); - syncmem_complex_d2h_op()(this->cpu_ctx, this->ctx, h_becp, becp, this->nbands * nkb); - Parallel_Reduce::reduce_pool(h_becp, this->nbands * nkb); - syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, becp, h_becp, this->nbands * nkb); + resmem_complex_h_op()(this->cpu_ctx, h_becp, size_becp_act); + syncmem_complex_d2h_op()(this->cpu_ctx, this->ctx, h_becp, becp, size_becp_act); + Parallel_Reduce::reduce_pool(h_becp, size_becp_act); + syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, becp, h_becp, size_becp_act); delmem_complex_h_op()(this->cpu_ctx, h_becp); } else { - Parallel_Reduce::reduce_pool(becp, this->nbands * this->nkb); + Parallel_Reduce::reduce_pool(becp, size_becp_act); } ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_becp"); } @@ -286,9 +290,12 @@ void FS_Nonlocal_tools::cal_dbecp_s(int ik, int npm, int ipol, i { ModuleBase::TITLE("FS_Nonlocal_tools", "cal_dbecp_s"); ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_dbecp_s"); + const int npol = this->ucell_->get_npol(); + const int size_becp = this->nbands * npol * this->nkb; + const int npm_npol = npm * npol; if (this->dbecp == nullptr) { - resmem_complex_op()(this->ctx, dbecp, this->nbands * this->nkb); + resmem_complex_op()(this->ctx, dbecp, size_becp); } // prepare math tools @@ -400,7 +407,7 @@ void FS_Nonlocal_tools::cal_dbecp_s(int ik, int npm, int ipol, i transa, transb, nkb, - npm, + npm_npol, npw, &ModuleBase::ONE, ppcell_vkb, @@ -411,6 +418,8 @@ void FS_Nonlocal_tools::cal_dbecp_s(int ik, int npm, int ipol, i dbecp, nkb); // calculate stress for target (ipol, jpol) + if(npol == 1) + { const int current_spin = this->kv_->isk[ik]; cal_stress_nl_op()(this->ctx, nondiagonal, @@ -434,20 +443,48 @@ void FS_Nonlocal_tools::cal_dbecp_s(int ik, int npm, int ipol, i becp, dbecp, stress); + } + else + { + cal_stress_nl_op()(this->ctx, + nondiagonal, + ipol, + jpol, + nkb, + npm, + this->ntype, + this->nbands, + ik, + this->nlpp_->deeq_nc.getBound2(), + this->nlpp_->deeq_nc.getBound3(), + this->nlpp_->deeq_nc.getBound4(), + atom_nh, + atom_na, + d_wg, + d_ekb, + qq_nt, + this->nlpp_->template get_deeq_nc_data(), + becp, + dbecp, + stress); + } ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_dbecp_s"); } template void FS_Nonlocal_tools::cal_dbecp_f(int ik, int npm, int ipol) { - ModuleBase::TITLE("FS_Nonlocal_tools", "cal_dbecp_s"); + ModuleBase::TITLE("FS_Nonlocal_tools", "cal_dbecp_f"); ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_dbecp_f"); + const int npol = this->ucell_->get_npol(); + const int npm_npol = npm * npol; + const int size_becp = this->nbands * npol * this->nkb; if (this->dbecp == nullptr) { - resmem_complex_op()(this->ctx, dbecp, 3 * this->nbands * this->nkb); + resmem_complex_op()(this->ctx, dbecp, 3 * size_becp); } - std::complex* dbecp_ptr = this->dbecp + ipol * this->nbands * this->nkb; + std::complex* dbecp_ptr = this->dbecp + ipol * size_becp; const std::complex* vkb_ptr = this->ppcell_vkb; std::complex* vkb_deri_ptr = this->ppcell_vkb; @@ -480,7 +517,7 @@ void FS_Nonlocal_tools::cal_dbecp_f(int ik, int npm, int ipol) transa, transb, this->nkb, - npm, + npm_npol, npw, &ModuleBase::ONE, vkb_deri_ptr, @@ -490,7 +527,6 @@ void FS_Nonlocal_tools::cal_dbecp_f(int ik, int npm, int ipol) &ModuleBase::ZERO, dbecp_ptr, nkb); - this->revert_vkb(npw, ipol); this->pre_ik_f = ik; ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_dbecp_f"); @@ -633,29 +669,57 @@ void FS_Nonlocal_tools::cal_force(int ik, int npm, FPTYPE* force const int current_spin = this->kv_->isk[ik]; const int force_nc = 3; // calculate the force - cal_force_nl_op()(this->ctx, - nondiagonal, - npm, - this->nbands, - this->ntype, - current_spin, - this->nlpp_->deeq.getBound2(), - this->nlpp_->deeq.getBound3(), - this->nlpp_->deeq.getBound4(), - force_nc, - this->nbands, - ik, - nkb, - atom_nh, - atom_na, - this->ucell_->tpiba, - d_wg, - d_ekb, - qq_nt, - deeq, - becp, - dbecp, - force); + if(this->ucell_->get_npol() == 1) + { + cal_force_nl_op()(this->ctx, + nondiagonal, + npm, + this->nbands, + this->ntype, + current_spin, + this->nlpp_->deeq.getBound2(), + this->nlpp_->deeq.getBound3(), + this->nlpp_->deeq.getBound4(), + force_nc, + this->nbands, + ik, + nkb, + atom_nh, + atom_na, + this->ucell_->tpiba, + d_wg, + d_ekb, + qq_nt, + deeq, + becp, + dbecp, + force); + } + else + { + cal_force_nl_op()(this->ctx, + nondiagonal, + npm, + this->nbands, + this->ntype, + this->nlpp_->deeq_nc.getBound2(), + this->nlpp_->deeq_nc.getBound3(), + this->nlpp_->deeq_nc.getBound4(), + force_nc, + this->nbands, + ik, + nkb, + atom_nh, + atom_na, + this->ucell_->tpiba, + d_wg, + d_ekb, + qq_nt, + this->nlpp_->template get_deeq_nc_data(), + becp, + dbecp, + force); + } } // template instantiation diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp index 1584c5c681..041bfc3b19 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp @@ -145,6 +145,108 @@ struct cal_force_nl_op } // end it #ifdef _OPENMP } +#endif + }; + + void operator()(const base_device::DEVICE_CPU* ctx, + const bool& nondiagonal, + const int& nbands_occ, + const int& wg_nc, + const int& ntype, + const int& deeq_2, + const int& deeq_3, + const int& deeq_4, + const int& forcenl_nc, + const int& nbands, + const int& ik, + const int& nkb, + const int* atom_nh, + const int* atom_na, + const FPTYPE& tpiba, + const FPTYPE* d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const std::complex* deeq_nc, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* force) + { +#ifdef _OPENMP +#pragma omp parallel + { +#endif + int iat0 = 0; + int sum0 = 0; + for (int it = 0; it < ntype; it++) + { + const int Nprojs = atom_nh[it]; +#ifdef _OPENMP +#pragma omp for collapse(2) +#endif + for (int ia = 0; ia < atom_na[it]; ia++) + { + for (int ib = 0; ib < nbands_occ; ib++) + { + const int ib2 = ib*2; + FPTYPE local_force[3] = {0, 0, 0}; + FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba; + FPTYPE ekb_now = d_ekb[ik * wg_nc + ib]; + int iat = iat0 + ia; + int sum = sum0 + ia * Nprojs; + for (int ip = 0; ip < Nprojs; ip++) + { + const int inkb = sum + ip; + // out<<"\n ps = "< ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2]; + const int jnkb = sum + ip2; + std::complex ps0 = deeq_nc[((0 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq; + std::complex ps1 = deeq_nc[((1 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2]; + std::complex ps2 = deeq_nc[((2 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2]; + std::complex ps3 = deeq_nc[((3 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq; + + + for (int ipol = 0; ipol < 3; ipol++) + { + const int index0 = ipol * nbands * 2 * nkb + ib2 * nkb + inkb; + const int index1 = ib2 * nkb + jnkb; + const std::complex dbb0 = conj(dbecp[index0]) * becp[index1]; + const std::complex dbb1 = conj(dbecp[index0]) * becp[index1 + nkb]; + const std::complex dbb2 = conj(dbecp[index0 + nkb]) * becp[index1]; + const std::complex dbb3 = conj(dbecp[index0 + nkb]) * becp[index1 + nkb]; + + local_force[ipol] -= fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real(); + } + } + } +#ifdef _OPENMP + if (omp_get_num_threads() > 1) + { + for (int ipol = 0; ipol < 3; ipol++) + { +#pragma omp atomic + force[iat * forcenl_nc + ipol] += local_force[ipol]; + } + } + else +#endif + { + for (int ipol = 0; ipol < 3; ipol++) + { + force[iat * forcenl_nc + ipol] += local_force[ipol]; + } + } + } + } // end ia + iat0 += atom_na[it]; + sum0 += atom_na[it] * Nprojs; + } // end it +#ifdef _OPENMP + } #endif } }; diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h index b04b769df8..b9fb84344a 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h @@ -92,6 +92,29 @@ struct cal_force_nl_op const std::complex* becp, const std::complex* dbecp, FPTYPE* force); + // interface for nspin=4 only + void operator()(const base_device::DEVICE_CPU* ctx, + const bool& nondiagonal, + const int& nbands_occ, + const int& wg_nc, + const int& ntype, + const int& deeq_2, + const int& deeq_3, + const int& deeq_4, + const int& forcenl_nc, + const int& nbands, + const int& ik, + const int& nkb, + const int* atom_nh, + const int* atom_na, + const FPTYPE& tpiba, + const FPTYPE* d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const std::complex* deeq_nc, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* force); }; #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp index 2f551db297..e58cfb3e7b 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp @@ -135,6 +135,82 @@ struct cal_stress_nl_op } // end it #ifdef _OPENMP } +#endif + stress[ipol * 3 + jpol] += local_stress; + }; + + void operator()(const base_device::DEVICE_CPU* ctx, + const bool& nondiagonal, + const int& ipol, + const int& jpol, + const int& nkb, + const int& nbands_occ, + const int& ntype, + const int& wg_nc, + const int& ik, + const int& deeq_2, + const int& deeq_3, + const int& deeq_4, + const int* atom_nh, + const int* atom_na, + const FPTYPE* d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const std::complex* deeq_nc, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* stress) + { + FPTYPE local_stress = 0; +#ifdef _OPENMP +#pragma omp parallel reduction(+ : local_stress) + { +#endif + int iat = 0, sum = 0; + for (int it = 0; it < ntype; it++) + { + const int Nprojs = atom_nh[it]; +#ifdef _OPENMP +#pragma omp for collapse(4) +#endif + for (int ib = 0; ib < nbands_occ; ib++) + { + for (int ia = 0; ia < atom_na[it]; ia++) + { + for (int ip1 = 0; ip1 < Nprojs; ip1++) + { + for (int ip2 = 0; ip2 < Nprojs; ip2++) + { + if (!nondiagonal && ip1 != ip2) + { + continue; + } + const int ib2 = ib*2; + FPTYPE fac = d_wg[ik * wg_nc + ib] * 1.0; + FPTYPE ekb_now = d_ekb[ik * wg_nc + ib]; + std::complex ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2]; + std::complex ps0 = deeq_nc[((iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq; + std::complex ps1 = deeq_nc[((1 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2]; + std::complex ps2 = deeq_nc[((2 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2]; + std::complex ps3 = deeq_nc[((3 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq; + const int inkb1 = sum + ia * Nprojs + ip1; + const int inkb2 = sum + ia * Nprojs + ip2; + // out<<"\n ps = "< dbb0 = (conj(dbecp[ib2 * nkb + inkb1]) * becp[ib2 * nkb + inkb2]).real(); + const std::complex dbb1 = (conj(dbecp[ib2 * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2]).real(); + const std::complex dbb2 = (conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[ib2 * nkb + inkb2]).real(); + const std::complex dbb3 = (conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2]).real(); + local_stress -= fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real(); + } + } // end ip + } // ia + } + sum += atom_na[it] * Nprojs; + iat += atom_na[it]; + } // end it +#ifdef _OPENMP + } #endif stress[ipol * 3 + jpol] += local_stress; } diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h index b4dfc35ff8..a2852c00c9 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h @@ -103,6 +103,28 @@ struct cal_stress_nl_op const std::complex* becp, const std::complex* dbecp, FPTYPE* stress); + // interface for nspin=4 only + void operator()(const Device* ctx, + const bool& nondiagonal, + const int& ipol, + const int& jpol, + const int& nkb, + const int& nbands_occ, + const int& ntype, + const int& wg_nc, + const int& ik, + const int& deeq_2, + const int& deeq_3, + const int& deeq_4, + const int* atom_nh, + const int* atom_na, + const FPTYPE* d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const std::complex* deeq_nc, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* stress); }; template diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp index 7a97be57d7..79d2d1ec79 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp @@ -39,6 +39,7 @@ void Stress_Func::stress_kin(ModuleBase::matrix& sigma, FPTYPE tpiba = ModuleBase::TWO_PI / GlobalC::ucell.lat0; FPTYPE twobysqrtpi = 2.0 / std::sqrt(ModuleBase::PI); FPTYPE* kfac = new FPTYPE[npwx]; + const int npol = GlobalC::ucell.get_npol(); for (int ik = 0; ik < p_kv->get_nks(); ik++) { @@ -71,8 +72,7 @@ void Stress_Func::stress_kin(ModuleBase::matrix& sigma, { for (int ibnd = 0; ibnd < GlobalV::NBANDS; ibnd++) { - if (std::fabs(wg(ik, ibnd)) < ModuleBase::threshold_wg * wg(ik, 0)) - continue; + if (wg(ik, ibnd) == 0.0) continue; const std::complex* ppsi = nullptr; ppsi = &(psi_in[0](ik, ibnd, 0)); @@ -85,6 +85,18 @@ void Stress_Func::stress_kin(ModuleBase::matrix& sigma, sum += wg(ik, ibnd) * gk[l][i] * gk[m][i] * kfac[i] * (FPTYPE((conj(ppsi[i]) * ppsi[i]).real())); } + if(npol == 2) + { + ppsi = &(psi_in[0](ik, ibnd, npwx)); +#ifdef _OPENMP +#pragma omp parallel for reduction(+ : sum) +#endif + for (int i = 0; i < npw; i++) + { + sum += wg(ik, ibnd) * gk[l][i] * gk[m][i] * kfac[i] + * (FPTYPE((conj(ppsi[i]) * ppsi[i]).real())); + } + } s_kin[l][m] += sum; } } diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp index be64b4e573..b3e5eab6ba 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp @@ -50,11 +50,11 @@ void Stress_Func::stress_loc(ModuleBase::matrix& sigma, aux[ir] = std::complex(chr->rho[0][ir], 0.0 ); } } - for (int is = 1; is < nspin_rho; is++) + if(nspin_rho == 2) { for (int ir = irb; ir < ir_end; ++ir) { // accumulate aux - aux[ir] += std::complex(chr->rho[is][ir], 0.0 ); + aux[ir] += std::complex(chr->rho[1][ir], 0.0 ); } } } diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func_nl.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_func_nl.cpp index b243d1ff5d..1ff91afe6c 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/stress_func_nl.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func_nl.cpp @@ -48,7 +48,7 @@ void Stress_Func::stress_nl(ModuleBase::matrix& sigma, break; } } - const int npm = ucell_in.get_npol() * nbands_occ; + const int npm = nbands_occ; // calculate becp = for all beta functions nl_tools.cal_becp(ik, npm); From 8937146fc459069241fef148c134494ff785608a Mon Sep 17 00:00:00 2001 From: dyzheng Date: Tue, 6 Aug 2024 07:14:26 +0000 Subject: [PATCH 2/8] Fix: bug of pw force&stress calculation --- source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp index e189f937d9..c01aae9e96 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp @@ -107,12 +107,6 @@ void FS_Nonlocal_tools::allocate_memory(const ModuleBase::matrix this->atom_na = h_atom_na.data(); this->ppcell_vkb = this->nlpp_->vkb.c; } - - // prepare the memory of the becp and dbecp: - // becp: - // dbecp: - resmem_complex_op()(this->ctx, becp, this->nbands * nkb, "Stress::becp"); - resmem_complex_op()(this->ctx, dbecp, 6 * this->nbands * nkb, "Stress::dbecp"); } template From 65a93f7534695bf783d94746bd1994fce5545ba8 Mon Sep 17 00:00:00 2001 From: dyzheng Date: Wed, 7 Aug 2024 06:40:54 +0000 Subject: [PATCH 3/8] fix: soc error and cuda kernels --- .../hamilt_pwdft/fs_nonlocal_tools.cpp | 2 - .../hamilt_pwdft/kernels/cuda/force_op.cu | 103 +++++++++++++ .../hamilt_pwdft/kernels/cuda/stress_op.cu | 135 ++++++++++++++---- .../hamilt_pwdft/kernels/force_op.cpp | 3 - .../hamilt_pwdft/kernels/force_op.h | 23 ++- .../hamilt_pwdft/kernels/stress_op.cpp | 13 +- .../hamilt_pwdft/kernels/stress_op.h | 22 ++- 7 files changed, 258 insertions(+), 43 deletions(-) diff --git a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp index c01aae9e96..f81f5d0880 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp @@ -441,7 +441,6 @@ void FS_Nonlocal_tools::cal_dbecp_s(int ik, int npm, int ipol, i else { cal_stress_nl_op()(this->ctx, - nondiagonal, ipol, jpol, nkb, @@ -692,7 +691,6 @@ void FS_Nonlocal_tools::cal_force(int ik, int npm, FPTYPE* force else { cal_force_nl_op()(this->ctx, - nondiagonal, npm, this->nbands, this->ntype, diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu index f8df7b3029..fb690754c9 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu @@ -170,6 +170,109 @@ void cal_force_nl_op::operator()(const base_dev cudaCheckOnDebug(); } +template +__global__ void cal_force_nl( + const int wg_nc, + const int ntype, + const int deeq_2, + const int deeq_3, + const int deeq_4, + const int forcenl_nc, + const int nbands, + const int ik, + const int nkb, + const int *atom_nh, + const int *atom_na, + const FPTYPE tpiba, + const FPTYPE *d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const thrust::complex *deeq_nc, + const thrust::complex *becp, + const thrust::complex *dbecp, + FPTYPE *force) +{ + const int ib = blockIdx.x / ntype; + const int it = blockIdx.x % ntype; + + int iat = 0, sum = 0; + for (int ii = 0; ii < it; ii++) { + iat += atom_na[ii]; + sum += atom_na[ii] * atom_nh[ii]; + } + + int Nprojs = atom_nh[it]; + FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba; + FPTYPE ekb_now = d_ekb[ik * wg_nc + ib]; + for (int ia = 0; ia < atom_na[it]; ia++) { + for (int ip = threadIdx.x; ip < Nprojs; ip += blockDim.x) { + const int inkb = sum + ip; + for (int ip2 = 0; ip2 < Nprojs; ip2++) + { + // Effective values of the D-eS coefficients + const thrust::complex ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2]; + const int jnkb = sum + ip2; + const thrust::complex ps0 = deeq_nc[((0 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq; + const thrust::complex ps1 = deeq_nc[((1 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2]; + const thrust::complex ps2 = deeq_nc[((2 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2]; + const thrust::complex ps3 = deeq_nc[((3 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq; + + const int jnkb = sum + ip2; + for (int ipol = 0; ipol < 3; ipol++) { + const int index0 = ipol * nbands * 2 * nkb + ib2 * nkb + inkb; + const int index1 = ib2 * nkb + jnkb; + const std::complex dbb0 = conj(dbecp[index0]) * becp[index1]; + const std::complex dbb1 = conj(dbecp[index0]) * becp[index1 + nkb]; + const std::complex dbb2 = conj(dbecp[index0 + nkb]) * becp[index1]; + const std::complex dbb3 = conj(dbecp[index0 + nkb]) * becp[index1 + nkb]; + const FPTYPE tmp = - fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real(); + atomicAdd(force + iat * forcenl_nc + ipol, tmp); + } + } + } + iat += 1; + sum += Nprojs; + } +} + +// interface for nspin=4 only +void cal_force_nl_op::operator()(const base_device::DEVICE_GPU* ctx, + const int& nbands_occ, + const int& wg_nc, + const int& ntype, + const int& deeq_2, + const int& deeq_3, + const int& deeq_4, + const int& forcenl_nc, + const int& nbands, + const int& ik, + const int& nkb, + const int* atom_nh, + const int* atom_na, + const FPTYPE& tpiba, + const FPTYPE* d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const std::complex* deeq_nc, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* force) +{ + cal_force_nl<<>>( + wg_nc, ntype, + deeq_2, deeq_3, deeq_4, + forcenl_nc, nbands, ik, nkb, + atom_nh, atom_na, + tpiba, + d_wg, d_ekb, qq_nt, + reinterpret_cast*>(deeq_nc), + reinterpret_cast*>(becp), + reinterpret_cast*>(dbecp), + force);// array of data + + cudaCheckOnDebug(); +} + template __global__ void saveVkbValues_( const int *gcar_zero_ptrs, diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu index 769d6351e9..c379fc1082 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu @@ -240,6 +240,114 @@ void cal_stress_nl_op::operator()(const base_de cudaCheckOnDebug(); } +template +__global__ void cal_stress_nl( + const int ipol, + const int jpol, + const int nkb, + const int ntype, + const int wg_nc, + const int ik, + const int deeq_2, + const int deeq_3, + const int deeq_4, + const int *atom_nh, + const int *atom_na, + const FPTYPE *d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const thrust::complex *deeq_nc, + const thrust::complex *becp, + const thrust::complex *dbecp, + FPTYPE *stress) +{ + int ib = blockIdx.x / ntype; + const int ib2 = ib * 2; + int it = blockIdx.x % ntype; + + int iat = 0, sum = 0; + for (int ii = 0; ii < it; ii++) { + iat += atom_na[ii]; + sum += atom_na[ii] * atom_nh[ii]; + } + + FPTYPE stress_var = 0, fac = d_wg[ik * wg_nc + ib] * 1.0, ekb_now = d_ekb[ik * wg_nc + ib]; + const int Nprojs = atom_nh[it]; + for (int ia = 0; ia < atom_na[it]; ia++) + { + for (int ii = threadIdx.x; ii < Nprojs * Nprojs; ii += blockDim.x) { + int ip1 = ii / Nprojs, ip2 = ii % Nprojs; + const thrust::complex ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2]; + const thrust::complex ps0 = deeq_nc[((iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq; + const thrust::complex ps1 = deeq_nc[((1 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2]; + const thrust::complex ps2 = deeq_nc[((2 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2]; + const thrust::complex ps3 = deeq_nc[((3 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq; + FPTYPE ps = deeq_nc[((spin * deeq_2 + iat) * deeq_3 + ip1) * deeq_4 + ip2] + - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2]; + const int inkb1 = sum + ip1; + const int inkb2 = sum + ip2; + //out<<"\n ps = "< dbb0 = conj(dbecp[ib2 * nkb + inkb1]) * becp[ib2 * nkb + inkb2]; + const thrust::complex dbb1 = conj(dbecp[ib2 * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2]; + const thrust::complex dbb2 = conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[ib2 * nkb + inkb2]; + const thrust::complex dbb3 = conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2]; + stress_var -= fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real(); + } + ++iat; + sum+=Nprojs; + }//ia + __syncwarp(); + warp_reduce(stress_var); + if (threadIdx.x % WARP_SIZE == 0) { + atomicAdd(stress + ipol * 3 + jpol, stress_var); + } +} + +template +void cal_stress_nl_op::operator()(const base_device::DEVICE_GPU* ctx, + const int& ipol, + const int& jpol, + const int& nkb, + const int& nbands_occ, + const int& ntype, + const int& wg_nc, + const int& ik, + const int& deeq_2, + const int& deeq_3, + const int& deeq_4, + const int* atom_nh, + const int* atom_na, + const FPTYPE* d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const std::complex* deeq_nc, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* stress) +{ + cal_stress_nl<<>>( + ipol, + jpol, + nkb, + ntype, + wg_nc, + ik, + deeq_2, + deeq_3, + deeq_4, + atom_nh, + atom_na, + d_wg, + d_ekb, + qq_nt, + reinterpret_cast*>(deeq_nc), + reinterpret_cast*>(becp), + reinterpret_cast*>(dbecp), + stress);// array of data + + cudaCheckOnDebug(); +} + template void cal_stress_mgga_op::operator()( const int& spin, @@ -638,33 +746,6 @@ void cal_stress_drhoc_aux_op::operator()( return ; } -// template -// void prepare_vkb_deri_ptr_op::operator()( -// const base_device::DEVICE_GPU* ctx, -// int nbeta, double* nhtol, int nhtol_nc, int npw, int it, -// int ipol, int jpol, -// std::complex*vkb_out, std::complex** vkb_ptrs, -// FPTYPE* ylm_in, FPTYPE** ylm_ptrs, -// FPTYPE* ylm_deri_in, FPTYPE** ylm_deri_ptr1s, FPTYPE** ylm_deri_ptr2s, -// FPTYPE* vq_in, FPTYPE** vq_ptrs, -// FPTYPE* vq_deri_in, FPTYPE** vq_deri_ptrs -// ) -// { -// const int block = (npw + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; -// dim3 gridsize(block,nbeta); - - -// prepare_vkb_deri_ptr<<<1,1>>>( -// nbeta, nhtol, nhtol_nc, npw, it, ipol, jpol, -// reinterpret_cast*>(vkb_out), -// reinterpret_cast*>(vkb_ptrs), -// ylm_in, ylm_ptrs, ylm_deri_in, ylm_deri_ptr1s, ylm_deri_ptr2s, -// vq_in, vq_ptrs, vq_deri_in, vq_deri_ptrs -// ); - -// return ; -// } - template <> void pointer_array_malloc::operator()( diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp index 041bfc3b19..d152b25a7d 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp @@ -149,7 +149,6 @@ struct cal_force_nl_op }; void operator()(const base_device::DEVICE_CPU* ctx, - const bool& nondiagonal, const int& nbands_occ, const int& wg_nc, const int& ntype, @@ -199,8 +198,6 @@ struct cal_force_nl_op // out<<"\n ps = "< ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2]; const int jnkb = sum + ip2; diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h index b9fb84344a..f729280ef6 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h @@ -94,7 +94,6 @@ struct cal_force_nl_op FPTYPE* force); // interface for nspin=4 only void operator()(const base_device::DEVICE_CPU* ctx, - const bool& nondiagonal, const int& nbands_occ, const int& wg_nc, const int& ntype, @@ -159,6 +158,28 @@ struct cal_force_nl_op const std::complex* becp, const std::complex* dbecp, FPTYPE* force); + // interface for nspin=4 only + void operator()(const base_device::DEVICE_GPU* ctx, + const int& nbands_occ, + const int& wg_nc, + const int& ntype, + const int& deeq_2, + const int& deeq_3, + const int& deeq_4, + const int& forcenl_nc, + const int& nbands, + const int& ik, + const int& nkb, + const int* atom_nh, + const int* atom_na, + const FPTYPE& tpiba, + const FPTYPE* d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const std::complex* deeq_nc, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* force); }; /** diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp index e58cfb3e7b..b951714e00 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp @@ -140,7 +140,6 @@ struct cal_stress_nl_op }; void operator()(const base_device::DEVICE_CPU* ctx, - const bool& nondiagonal, const int& ipol, const int& jpol, const int& nkb, @@ -181,10 +180,6 @@ struct cal_stress_nl_op { for (int ip2 = 0; ip2 < Nprojs; ip2++) { - if (!nondiagonal && ip1 != ip2) - { - continue; - } const int ib2 = ib*2; FPTYPE fac = d_wg[ik * wg_nc + ib] * 1.0; FPTYPE ekb_now = d_ekb[ik * wg_nc + ib]; @@ -197,10 +192,10 @@ struct cal_stress_nl_op const int inkb2 = sum + ia * Nprojs + ip2; // out<<"\n ps = "< dbb0 = (conj(dbecp[ib2 * nkb + inkb1]) * becp[ib2 * nkb + inkb2]).real(); - const std::complex dbb1 = (conj(dbecp[ib2 * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2]).real(); - const std::complex dbb2 = (conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[ib2 * nkb + inkb2]).real(); - const std::complex dbb3 = (conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2]).real(); + const std::complex dbb0 = conj(dbecp[ib2 * nkb + inkb1]) * becp[ib2 * nkb + inkb2]; + const std::complex dbb1 = conj(dbecp[ib2 * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2]; + const std::complex dbb2 = conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[ib2 * nkb + inkb2]; + const std::complex dbb3 = conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2]; local_stress -= fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real(); } } // end ip diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h index a2852c00c9..177761a95d 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h @@ -105,7 +105,6 @@ struct cal_stress_nl_op FPTYPE* stress); // interface for nspin=4 only void operator()(const Device* ctx, - const bool& nondiagonal, const int& ipol, const int& jpol, const int& nkb, @@ -259,6 +258,27 @@ struct cal_stress_nl_op const std::complex* becp, const std::complex* dbecp, FPTYPE* stress); + // interface for nspin=4 only + void operator()(const base_device::DEVICE_GPU* ctx, + const int& ipol, + const int& jpol, + const int& nkb, + const int& nbands_occ, + const int& ntype, + const int& wg_nc, + const int& ik, + const int& deeq_2, + const int& deeq_3, + const int& deeq_4, + const int* atom_nh, + const int* atom_na, + const FPTYPE* d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const std::complex* deeq_nc, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* stress); }; // cpu version first, gpu version later From 5b629550d1e9495ea915e31ffcdfa5a4d63e2cf5 Mon Sep 17 00:00:00 2001 From: dyzheng Date: Wed, 7 Aug 2024 08:30:37 +0000 Subject: [PATCH 4/8] Fix: CUDA and ROCM --- .../hamilt_pwdft/kernels/cuda/force_op.cu | 11 +- .../hamilt_pwdft/kernels/cuda/stress_op.cu | 2 - .../hamilt_pwdft/kernels/rocm/force_op.hip.cu | 104 +++++++++++++++++ .../kernels/rocm/stress_op.hip.cu | 106 ++++++++++++++++++ 4 files changed, 216 insertions(+), 7 deletions(-) diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu index fb690754c9..e23e0ccbc3 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu @@ -193,6 +193,7 @@ __global__ void cal_force_nl( FPTYPE *force) { const int ib = blockIdx.x / ntype; + const int ib2 = ib * 2; const int it = blockIdx.x % ntype; int iat = 0, sum = 0; @@ -217,14 +218,13 @@ __global__ void cal_force_nl( const thrust::complex ps2 = deeq_nc[((2 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2]; const thrust::complex ps3 = deeq_nc[((3 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq; - const int jnkb = sum + ip2; for (int ipol = 0; ipol < 3; ipol++) { const int index0 = ipol * nbands * 2 * nkb + ib2 * nkb + inkb; const int index1 = ib2 * nkb + jnkb; - const std::complex dbb0 = conj(dbecp[index0]) * becp[index1]; - const std::complex dbb1 = conj(dbecp[index0]) * becp[index1 + nkb]; - const std::complex dbb2 = conj(dbecp[index0 + nkb]) * becp[index1]; - const std::complex dbb3 = conj(dbecp[index0 + nkb]) * becp[index1 + nkb]; + const thrust::complex dbb0 = conj(dbecp[index0]) * becp[index1]; + const thrust::complex dbb1 = conj(dbecp[index0]) * becp[index1 + nkb]; + const thrust::complex dbb2 = conj(dbecp[index0 + nkb]) * becp[index1]; + const thrust::complex dbb3 = conj(dbecp[index0 + nkb]) * becp[index1 + nkb]; const FPTYPE tmp = - fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real(); atomicAdd(force + iat * forcenl_nc + ipol, tmp); } @@ -236,6 +236,7 @@ __global__ void cal_force_nl( } // interface for nspin=4 only +template void cal_force_nl_op::operator()(const base_device::DEVICE_GPU* ctx, const int& nbands_occ, const int& wg_nc, diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu index c379fc1082..98918c6523 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu @@ -282,8 +282,6 @@ __global__ void cal_stress_nl( const thrust::complex ps1 = deeq_nc[((1 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2]; const thrust::complex ps2 = deeq_nc[((2 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2]; const thrust::complex ps3 = deeq_nc[((3 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq; - FPTYPE ps = deeq_nc[((spin * deeq_2 + iat) * deeq_3 + ip1) * deeq_4 + ip2] - - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2]; const int inkb1 = sum + ip1; const int inkb2 = sum + ip2; //out<<"\n ps = "<::operator()(const base_dev hipCheckOnDebug(); } +template +__global__ void cal_force_nl( + const int wg_nc, + const int ntype, + const int deeq_2, + const int deeq_3, + const int deeq_4, + const int forcenl_nc, + const int nbands, + const int ik, + const int nkb, + const int *atom_nh, + const int *atom_na, + const FPTYPE tpiba, + const FPTYPE *d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const thrust::complex *deeq_nc, + const thrust::complex *becp, + const thrust::complex *dbecp, + FPTYPE *force) +{ + const int ib = blockIdx.x / ntype; + const int ib2 = ib * 2; + const int it = blockIdx.x % ntype; + + int iat = 0, sum = 0; + for (int ii = 0; ii < it; ii++) { + iat += atom_na[ii]; + sum += atom_na[ii] * atom_nh[ii]; + } + + int Nprojs = atom_nh[it]; + FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba; + FPTYPE ekb_now = d_ekb[ik * wg_nc + ib]; + for (int ia = 0; ia < atom_na[it]; ia++) { + for (int ip = threadIdx.x; ip < Nprojs; ip += blockDim.x) { + const int inkb = sum + ip; + for (int ip2 = 0; ip2 < Nprojs; ip2++) + { + // Effective values of the D-eS coefficients + const thrust::complex ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2]; + const int jnkb = sum + ip2; + const thrust::complex ps0 = deeq_nc[((0 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq; + const thrust::complex ps1 = deeq_nc[((1 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2]; + const thrust::complex ps2 = deeq_nc[((2 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2]; + const thrust::complex ps3 = deeq_nc[((3 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq; + + for (int ipol = 0; ipol < 3; ipol++) { + const int index0 = ipol * nbands * 2 * nkb + ib2 * nkb + inkb; + const int index1 = ib2 * nkb + jnkb; + const thrust::complex dbb0 = conj(dbecp[index0]) * becp[index1]; + const thrust::complex dbb1 = conj(dbecp[index0]) * becp[index1 + nkb]; + const thrust::complex dbb2 = conj(dbecp[index0 + nkb]) * becp[index1]; + const thrust::complex dbb3 = conj(dbecp[index0 + nkb]) * becp[index1 + nkb]; + const FPTYPE tmp = - fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real(); + atomicAdd(force + iat * forcenl_nc + ipol, tmp); + } + } + } + iat += 1; + sum += Nprojs; + } +} + +// interface for nspin=4 only +template +void cal_force_nl_op::operator()(const base_device::DEVICE_GPU* ctx, + const int& nbands_occ, + const int& wg_nc, + const int& ntype, + const int& deeq_2, + const int& deeq_3, + const int& deeq_4, + const int& forcenl_nc, + const int& nbands, + const int& ik, + const int& nkb, + const int* atom_nh, + const int* atom_na, + const FPTYPE& tpiba, + const FPTYPE* d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const std::complex* deeq_nc, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* force) +{ + hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_force_nl), dim3(nbands_occ * ntype), dim3(THREADS_PER_BLOCK), 0, 0, + wg_nc, ntype, + deeq_2, deeq_3, deeq_4, + forcenl_nc, nbands, ik, nkb, + atom_nh, atom_na, + tpiba, + d_wg, d_ekb, qq_nt, + reinterpret_cast*>(deeq_nc), + reinterpret_cast*>(becp), + reinterpret_cast*>(dbecp), + force);// array of data + + hipCheckOnDebug(); +} + template __global__ void saveVkbValues_( const int *gcar_zero_ptrs, diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu index d7db3b3eea..1afba3a5b1 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu @@ -238,6 +238,112 @@ void cal_stress_nl_op::operator()(const base_de hipCheckOnDebug(); } +template +__global__ void cal_stress_nl( + const int ipol, + const int jpol, + const int nkb, + const int ntype, + const int wg_nc, + const int ik, + const int deeq_2, + const int deeq_3, + const int deeq_4, + const int *atom_nh, + const int *atom_na, + const FPTYPE *d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const thrust::complex *deeq_nc, + const thrust::complex *becp, + const thrust::complex *dbecp, + FPTYPE *stress) +{ + int ib = blockIdx.x / ntype; + const int ib2 = ib * 2; + int it = blockIdx.x % ntype; + + int iat = 0, sum = 0; + for (int ii = 0; ii < it; ii++) { + iat += atom_na[ii]; + sum += atom_na[ii] * atom_nh[ii]; + } + + FPTYPE stress_var = 0, fac = d_wg[ik * wg_nc + ib] * 1.0, ekb_now = d_ekb[ik * wg_nc + ib]; + const int Nprojs = atom_nh[it]; + for (int ia = 0; ia < atom_na[it]; ia++) + { + for (int ii = threadIdx.x; ii < Nprojs * Nprojs; ii += blockDim.x) { + int ip1 = ii / Nprojs, ip2 = ii % Nprojs; + const thrust::complex ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2]; + const thrust::complex ps0 = deeq_nc[((iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq; + const thrust::complex ps1 = deeq_nc[((1 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2]; + const thrust::complex ps2 = deeq_nc[((2 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2]; + const thrust::complex ps3 = deeq_nc[((3 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq; + const int inkb1 = sum + ip1; + const int inkb2 = sum + ip2; + //out<<"\n ps = "< dbb0 = conj(dbecp[ib2 * nkb + inkb1]) * becp[ib2 * nkb + inkb2]; + const thrust::complex dbb1 = conj(dbecp[ib2 * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2]; + const thrust::complex dbb2 = conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[ib2 * nkb + inkb2]; + const thrust::complex dbb3 = conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2]; + stress_var -= fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real(); + } + ++iat; + sum+=Nprojs; + }//ia + __syncwarp(); + warp_reduce(stress_var); + if (threadIdx.x % WARP_SIZE == 0) { + atomicAdd(stress + ipol * 3 + jpol, stress_var); + } +} + +template +void cal_stress_nl_op::operator()(const base_device::DEVICE_GPU* ctx, + const int& ipol, + const int& jpol, + const int& nkb, + const int& nbands_occ, + const int& ntype, + const int& wg_nc, + const int& ik, + const int& deeq_2, + const int& deeq_3, + const int& deeq_4, + const int* atom_nh, + const int* atom_na, + const FPTYPE* d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const std::complex* deeq_nc, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* stress) +{ + hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_stress_nl), dim3(nbands_occ * ntype), dim3(THREADS_PER_BLOCK), 0, 0, + ipol, + jpol, + nkb, + ntype, + wg_nc, + ik, + deeq_2, + deeq_3, + deeq_4, + atom_nh, + atom_na, + d_wg, + d_ekb, + qq_nt, + reinterpret_cast*>(deeq_nc), + reinterpret_cast*>(becp), + reinterpret_cast*>(dbecp), + stress);// array of data + + hipCheckOnDebug(); +} + template void cal_stress_mgga_op::operator()( const int& spin, From 32ddfaf7e48b37a8f74ce7c83ef0fd404231d7f2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci-lite[bot]" <117423508+pre-commit-ci-lite[bot]@users.noreply.github.com> Date: Wed, 7 Aug 2024 09:12:02 +0000 Subject: [PATCH 5/8] [pre-commit.ci lite] apply automatic fixes --- source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp | 6 ++++-- source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp | 8 +++++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp index 79d2d1ec79..f05e64458a 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp @@ -29,8 +29,9 @@ void Stress_Func::stress_kin(ModuleBase::matrix& sigma, int npwx=0; for (int ik = 0; ik < p_kv->get_nks(); ik++) { - if (npwx < p_kv->ngk[ik]) + if (npwx < p_kv->ngk[ik]) { npwx = p_kv->ngk[ik]; +} } gk[0]= new FPTYPE[npwx]; @@ -72,7 +73,8 @@ void Stress_Func::stress_kin(ModuleBase::matrix& sigma, { for (int ibnd = 0; ibnd < GlobalV::NBANDS; ibnd++) { - if (wg(ik, ibnd) == 0.0) continue; + if (wg(ik, ibnd) == 0.0) { continue; +} const std::complex* ppsi = nullptr; ppsi = &(psi_in[0](ik, ibnd, 0)); diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp index b3e5eab6ba..cb306fceeb 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp @@ -22,7 +22,8 @@ void Stress_Func::stress_loc(ModuleBase::matrix& sigma, const int nspin_rho = (GlobalV::NSPIN == 2) ? 2 : 1; - if (PARAM.inp.gamma_only && is_pw) fact=2.0; + if (PARAM.inp.gamma_only && is_pw) { fact=2.0; +} @@ -70,12 +71,13 @@ void Stress_Func::stress_loc(ModuleBase::matrix& sigma, { for (int ig=0; ignpw; ig++) { - if (rho_basis->ig_gge0 == ig) + if (rho_basis->ig_gge0 == ig) { evloc += GlobalC::ppcell.vloc(it, rho_basis->ig2igg[ig]) * (p_sf->strucFac(it, ig) * conj(aux[ig])).real(); - else + } else { evloc += GlobalC::ppcell.vloc(it, rho_basis->ig2igg[ig]) * (p_sf->strucFac(it, ig) * conj(aux[ig]) * fact).real(); +} } } } From 34d349545d4324aef955f57ee91c6d0460c278f0 Mon Sep 17 00:00:00 2001 From: dyzheng Date: Mon, 26 Aug 2024 16:15:35 +0800 Subject: [PATCH 6/8] Fix: nspin=4 error on DCU/GPU --- .../hamilt_pwdft/kernels/cuda/nonlocal_op.cu | 8 ++++---- .../hamilt_pwdft/kernels/cuda/stress_op.cu | 14 +++++++++----- .../hamilt_pwdft/kernels/force_op.cpp | 14 +++++++------- .../hamilt_pwdft/kernels/rocm/nonlocal_op.hip.cu | 8 ++++---- 4 files changed, 24 insertions(+), 20 deletions(-) diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/nonlocal_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/nonlocal_op.cu index 275fbfe136..89a74f3c5f 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/nonlocal_op.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/nonlocal_op.cu @@ -54,8 +54,8 @@ __global__ void nonlocal_pw( thrust::complex* ps, const thrust::complex* becp) { - const int ii = blockIdx.x / l2; - const int jj = blockIdx.x % l2; + const int ii = blockIdx.x * 2 / l2; + const int jj = blockIdx.x * 2 % l2; for (int kk = threadIdx.x; kk < l3; kk += blockDim.x) { thrust::complex res1(0.0, 0.0); thrust::complex res2(0.0, 0.0); @@ -121,7 +121,7 @@ void hamilt::nonlocal_pw_op::operator()(const b { // denghui implement 20221109 // <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - nonlocal_pw<<>>( + nonlocal_pw<<>>( l1, l2, l3, // loop size sum, iat, nkb, // control params deeq_x, deeq_y, deeq_z, @@ -138,4 +138,4 @@ void hamilt::nonlocal_pw_op::operator()(const b template struct nonlocal_pw_op; template struct nonlocal_pw_op; -} // namespace hamilt \ No newline at end of file +} // namespace hamilt diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu index 98918c6523..f451abd26c 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu @@ -261,22 +261,26 @@ __global__ void cal_stress_nl( const thrust::complex *dbecp, FPTYPE *stress) { - int ib = blockIdx.x / ntype; + const int ib = blockIdx.x / ntype; // index of loop-nbands const int ib2 = ib * 2; - int it = blockIdx.x % ntype; + const int it = blockIdx.x % ntype; // index of loop-ntype - int iat = 0, sum = 0; + int iat = 0; // calculate the begin of atomic index + int sum = 0; // calculate the begin of atomic-orbital index for (int ii = 0; ii < it; ii++) { iat += atom_na[ii]; sum += atom_na[ii] * atom_nh[ii]; } - FPTYPE stress_var = 0, fac = d_wg[ik * wg_nc + ib] * 1.0, ekb_now = d_ekb[ik * wg_nc + ib]; + FPTYPE stress_var = 0; + const FPTYPE fac = d_wg[ik * wg_nc + ib] * 1.0; + const FPTYPE ekb_now = d_ekb[ik * wg_nc + ib]; const int Nprojs = atom_nh[it]; for (int ia = 0; ia < atom_na[it]; ia++) { for (int ii = threadIdx.x; ii < Nprojs * Nprojs; ii += blockDim.x) { - int ip1 = ii / Nprojs, ip2 = ii % Nprojs; + const int ip1 = ii / Nprojs; + const int ip2 = ii % Nprojs; const thrust::complex ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2]; const thrust::complex ps0 = deeq_nc[((iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq; const thrust::complex ps1 = deeq_nc[((1 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2]; diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp index d152b25a7d..90a4641123 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp @@ -178,7 +178,7 @@ struct cal_force_nl_op int sum0 = 0; for (int it = 0; it < ntype; it++) { - const int Nprojs = atom_nh[it]; + const int nprojs = atom_nh[it]; #ifdef _OPENMP #pragma omp for collapse(2) #endif @@ -191,12 +191,12 @@ struct cal_force_nl_op FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba; FPTYPE ekb_now = d_ekb[ik * wg_nc + ib]; int iat = iat0 + ia; - int sum = sum0 + ia * Nprojs; - for (int ip = 0; ip < Nprojs; ip++) + int sum = sum0 + ia * nprojs; + for (int ip = 0; ip < nprojs; ip++) { const int inkb = sum + ip; // out<<"\n ps = "< ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2]; @@ -223,7 +223,7 @@ struct cal_force_nl_op #ifdef _OPENMP if (omp_get_num_threads() > 1) { - for (int ipol = 0; ipol < 3; ipol++) + for (int ipol = 0; ipol < 3; ++ipol) { #pragma omp atomic force[iat * forcenl_nc + ipol] += local_force[ipol]; @@ -232,7 +232,7 @@ struct cal_force_nl_op else #endif { - for (int ipol = 0; ipol < 3; ipol++) + for (int ipol = 0; ipol < 3; ++ipol) { force[iat * forcenl_nc + ipol] += local_force[ipol]; } @@ -240,7 +240,7 @@ struct cal_force_nl_op } } // end ia iat0 += atom_na[it]; - sum0 += atom_na[it] * Nprojs; + sum0 += atom_na[it] * nprojs; } // end it #ifdef _OPENMP } diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/nonlocal_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/nonlocal_op.hip.cu index 93df9fe58d..9a43f64ea6 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/nonlocal_op.hip.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/nonlocal_op.hip.cu @@ -54,8 +54,8 @@ __global__ void nonlocal_pw( thrust::complex* ps, const thrust::complex* becp) { - const int ii = blockIdx.x / l2; - const int jj = blockIdx.x % l2; + const int ii = blockIdx.x * 2 / l2; + const int jj = blockIdx.x * 2 % l2; for (int kk = threadIdx.x; kk < l3; kk += blockDim.x) { thrust::complex res1(0.0, 0.0); thrust::complex res2(0.0, 0.0); @@ -122,7 +122,7 @@ void hamilt::nonlocal_pw_op::operator()(const b { // denghui implement 20221109 // <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - hipLaunchKernelGGL(HIP_KERNEL_NAME(nonlocal_pw), dim3(l1 * l2), dim3(THREADS_PER_BLOCK), 0, 0, + hipLaunchKernelGGL(HIP_KERNEL_NAME(nonlocal_pw), dim3(l1 * l2 / 2), dim3(THREADS_PER_BLOCK), 0, 0, l1, l2, l3, // loop size sum, iat, nkb, // control params deeq_x, deeq_y, deeq_z, @@ -140,4 +140,4 @@ void hamilt::nonlocal_pw_op::operator()(const b namespace hamilt{ template struct nonlocal_pw_op; template struct nonlocal_pw_op; -} \ No newline at end of file +} From ebc3aed16003bae567a4c46056b76f6b4292a049 Mon Sep 17 00:00:00 2001 From: dyzheng Date: Thu, 31 Oct 2024 14:21:55 +0800 Subject: [PATCH 7/8] Refactor: format and less parameters for nonlocal fs kernels --- .../hamilt_pwdft/fs_nonlocal_tools.cpp | 62 ++++++++----------- .../hamilt_pwdft/kernels/cuda/force_op.cu | 42 +++++-------- .../hamilt_pwdft/kernels/cuda/stress_op.cu | 43 +++++-------- .../hamilt_pwdft/kernels/force_op.cpp | 22 +++---- .../hamilt_pwdft/kernels/force_op.h | 10 --- .../hamilt_pwdft/kernels/rocm/force_op.hip.cu | 48 +++++++------- .../kernels/rocm/stress_op.hip.cu | 43 ++++++------- .../hamilt_pwdft/kernels/stress_op.cpp | 39 ++++++------ .../hamilt_pwdft/kernels/stress_op.h | 10 --- .../kernels/test/force_op_test.cpp | 4 -- .../kernels/test/stress_op_test.cpp | 4 -- 11 files changed, 128 insertions(+), 199 deletions(-) diff --git a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp index bfdf79e94a..4dbdbc558f 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp @@ -415,29 +415,27 @@ void FS_Nonlocal_tools::cal_dbecp_s(int ik, int npm, int ipol, i // calculate stress for target (ipol, jpol) if(npol == 1) { - const int current_spin = this->kv_->isk[ik]; - cal_stress_nl_op()(this->ctx, - nondiagonal, - ipol, - jpol, - nkb, - npm, - this->ntype, - current_spin, // uspp only - this->nbands, - ik, - this->nlpp_->deeq.getBound2(), - this->nlpp_->deeq.getBound3(), - this->nlpp_->deeq.getBound4(), - atom_nh, - atom_na, - d_wg, - d_ekb, - qq_nt, - deeq, - becp, - dbecp, - stress); + const int current_spin = this->kv_->isk[ik]; + cal_stress_nl_op()(this->ctx, + nondiagonal, + ipol, + jpol, + nkb, + npm, + this->ntype, + current_spin, // uspp only + this->nlpp_->deeq.getBound2(), + this->nlpp_->deeq.getBound3(), + this->nlpp_->deeq.getBound4(), + atom_nh, + atom_na, + d_wg + this->nbands * ik, + d_ekb + this->nbands * ik, + qq_nt, + deeq, + becp, + dbecp, + stress); } else { @@ -447,15 +445,13 @@ void FS_Nonlocal_tools::cal_dbecp_s(int ik, int npm, int ipol, i nkb, npm, this->ntype, - this->nbands, - ik, this->nlpp_->deeq_nc.getBound2(), this->nlpp_->deeq_nc.getBound3(), this->nlpp_->deeq_nc.getBound4(), atom_nh, atom_na, - d_wg, - d_ekb, + d_wg + this->nbands * ik, + d_ekb + this->nbands * ik, qq_nt, this->nlpp_->template get_deeq_nc_data(), becp, @@ -668,7 +664,6 @@ void FS_Nonlocal_tools::cal_force(int ik, int npm, FPTYPE* force cal_force_nl_op()(this->ctx, nondiagonal, npm, - this->nbands, this->ntype, current_spin, this->nlpp_->deeq.getBound2(), @@ -676,13 +671,12 @@ void FS_Nonlocal_tools::cal_force(int ik, int npm, FPTYPE* force this->nlpp_->deeq.getBound4(), force_nc, this->nbands, - ik, nkb, atom_nh, atom_na, this->ucell_->tpiba, - d_wg, - d_ekb, + d_wg + this->nbands * ik, + d_ekb + this->nbands * ik, qq_nt, deeq, becp, @@ -693,20 +687,18 @@ void FS_Nonlocal_tools::cal_force(int ik, int npm, FPTYPE* force { cal_force_nl_op()(this->ctx, npm, - this->nbands, this->ntype, this->nlpp_->deeq_nc.getBound2(), this->nlpp_->deeq_nc.getBound3(), this->nlpp_->deeq_nc.getBound4(), force_nc, this->nbands, - ik, nkb, atom_nh, atom_na, this->ucell_->tpiba, - d_wg, - d_ekb, + d_wg + this->nbands * ik, + d_ekb + this->nbands * ik, qq_nt, this->nlpp_->template get_deeq_nc_data(), becp, diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu index e23e0ccbc3..dddf889de1 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu @@ -35,7 +35,6 @@ __global__ void cal_vkb1_nl( template __global__ void cal_force_nl( const bool nondiagonal, - const int wg_nc, const int ntype, const int spin, const int deeq_2, @@ -43,7 +42,6 @@ __global__ void cal_force_nl( const int deeq_4, const int forcenl_nc, const int nbands, - const int ik, const int nkb, const int *atom_nh, const int *atom_na, @@ -65,11 +63,11 @@ __global__ void cal_force_nl( sum += atom_na[ii] * atom_nh[ii]; } - int Nprojs = atom_nh[it]; - FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba; - FPTYPE ekb_now = d_ekb[ik * wg_nc + ib]; + int nproj = atom_nh[it]; + FPTYPE fac = d_wg[ib] * 2.0 * tpiba; + FPTYPE ekb_now = d_ekb[ib]; for (int ia = 0; ia < atom_na[it]; ia++) { - for (int ip = threadIdx.x; ip < Nprojs; ip += blockDim.x) { + for (int ip = threadIdx.x; ip < nproj; ip += blockDim.x) { // FPTYPE ps = GlobalC::ppcell.deeq[spin, iat, ip, ip]; FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip] - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip]; @@ -85,8 +83,8 @@ __global__ void cal_force_nl( } if (nondiagonal) { - //for (int ip2=0; ip2 void cal_force_nl_op::operator()(const base_device::DEVICE_GPU* ctx, const bool& nondiagonal, const int& nbands_occ, - const int& wg_nc, const int& ntype, const int& spin, const int& deeq_2, @@ -142,7 +139,6 @@ void cal_force_nl_op::operator()(const base_dev const int& deeq_4, const int& forcenl_nc, const int& nbands, - const int& ik, const int& nkb, const int* atom_nh, const int* atom_na, @@ -157,9 +153,9 @@ void cal_force_nl_op::operator()(const base_dev { cal_force_nl<<>>( nondiagonal, - wg_nc, ntype, spin, + ntype, spin, deeq_2, deeq_3, deeq_4, - forcenl_nc, nbands, ik, nkb, + forcenl_nc, nbands, nkb, atom_nh, atom_na, tpiba, d_wg, d_ekb, qq_nt, deeq, @@ -172,14 +168,12 @@ void cal_force_nl_op::operator()(const base_dev template __global__ void cal_force_nl( - const int wg_nc, const int ntype, const int deeq_2, const int deeq_3, const int deeq_4, const int forcenl_nc, const int nbands, - const int ik, const int nkb, const int *atom_nh, const int *atom_na, @@ -202,13 +196,13 @@ __global__ void cal_force_nl( sum += atom_na[ii] * atom_nh[ii]; } - int Nprojs = atom_nh[it]; - FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba; - FPTYPE ekb_now = d_ekb[ik * wg_nc + ib]; + int nproj = atom_nh[it]; + FPTYPE fac = d_wg[ib] * 2.0 * tpiba; + FPTYPE ekb_now = d_ekb[ib]; for (int ia = 0; ia < atom_na[it]; ia++) { - for (int ip = threadIdx.x; ip < Nprojs; ip += blockDim.x) { + for (int ip = threadIdx.x; ip < nproj; ip += blockDim.x) { const int inkb = sum + ip; - for (int ip2 = 0; ip2 < Nprojs; ip2++) + for (int ip2 = 0; ip2 < nproj; ip2++) { // Effective values of the D-eS coefficients const thrust::complex ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2]; @@ -231,7 +225,7 @@ __global__ void cal_force_nl( } } iat += 1; - sum += Nprojs; + sum += nproj; } } @@ -239,14 +233,12 @@ __global__ void cal_force_nl( template void cal_force_nl_op::operator()(const base_device::DEVICE_GPU* ctx, const int& nbands_occ, - const int& wg_nc, const int& ntype, const int& deeq_2, const int& deeq_3, const int& deeq_4, const int& forcenl_nc, const int& nbands, - const int& ik, const int& nkb, const int* atom_nh, const int* atom_na, @@ -260,9 +252,9 @@ void cal_force_nl_op::operator()(const base_dev FPTYPE* force) { cal_force_nl<<>>( - wg_nc, ntype, + ntype, deeq_2, deeq_3, deeq_4, - forcenl_nc, nbands, ik, nkb, + forcenl_nc, nbands, nkb, atom_nh, atom_na, tpiba, d_wg, d_ekb, qq_nt, diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu index 4f134e0042..e566126876 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu @@ -107,8 +107,6 @@ __global__ void cal_stress_nl( const int nkb, const int ntype, const int spin, - const int wg_nc, - const int ik, const int deeq_2, const int deeq_3, const int deeq_4, @@ -125,22 +123,25 @@ __global__ void cal_stress_nl( int ib = blockIdx.x / ntype; int it = blockIdx.x % ntype; - int iat = 0, sum = 0; + int iat = 0; + int sum = 0; for (int ii = 0; ii < it; ii++) { iat += atom_na[ii]; sum += atom_na[ii] * atom_nh[ii]; } - FPTYPE stress_var = 0, fac = d_wg[ik * wg_nc + ib] * 1.0, ekb_now = d_ekb[ik * wg_nc + ib]; - const int Nprojs = atom_nh[it]; + FPTYPE stress_var = 0; + const FPTYPE fac = d_wg[ib]; + const FPTYPE ekb_now = d_ekb[ib]; + const int nproj = atom_nh[it]; for (int ia = 0; ia < atom_na[it]; ia++) { - for (int ii = threadIdx.x; ii < Nprojs * Nprojs; ii += blockDim.x) { - int ip1 = ii / Nprojs, ip2 = ii % Nprojs; + for (int ii = threadIdx.x; ii < nproj * nproj; ii += blockDim.x) { + const int ip1 = ii / nproj, ip2 = ii % nproj; if(!nondiagonal && ip1 != ip2) { continue; } - FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip1) * deeq_4 + ip2] + const FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip1) * deeq_4 + ip2] - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2]; const int inkb1 = sum + ip1; const int inkb2 = sum + ip2; @@ -149,7 +150,7 @@ __global__ void cal_stress_nl( stress_var -= ps * fac * dbb; } ++iat; - sum+=Nprojs; + sum+=nproj; }//ia __syncwarp(); warp_reduce(stress_var); @@ -204,8 +205,6 @@ void cal_stress_nl_op::operator()(const base_de const int& nbands_occ, const int& ntype, const int& spin, - const int& wg_nc, - const int& ik, const int& deeq_2, const int& deeq_3, const int& deeq_4, @@ -226,8 +225,6 @@ void cal_stress_nl_op::operator()(const base_de nkb, ntype, spin, - wg_nc, - ik, deeq_2, deeq_3, deeq_4, @@ -250,8 +247,6 @@ __global__ void cal_stress_nl( const int jpol, const int nkb, const int ntype, - const int wg_nc, - const int ik, const int deeq_2, const int deeq_3, const int deeq_4, @@ -277,14 +272,14 @@ __global__ void cal_stress_nl( } FPTYPE stress_var = 0; - const FPTYPE fac = d_wg[ik * wg_nc + ib] * 1.0; - const FPTYPE ekb_now = d_ekb[ik * wg_nc + ib]; - const int Nprojs = atom_nh[it]; + const FPTYPE fac = d_wg[ib]; + const FPTYPE ekb_now = d_ekb[ib]; + const int nproj = atom_nh[it]; for (int ia = 0; ia < atom_na[it]; ia++) { - for (int ii = threadIdx.x; ii < Nprojs * Nprojs; ii += blockDim.x) { - const int ip1 = ii / Nprojs; - const int ip2 = ii % Nprojs; + for (int ii = threadIdx.x; ii < nproj * nproj; ii += blockDim.x) { + const int ip1 = ii / nproj; + const int ip2 = ii % nproj; const thrust::complex ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2]; const thrust::complex ps0 = deeq_nc[((iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq; const thrust::complex ps1 = deeq_nc[((1 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2]; @@ -300,7 +295,7 @@ __global__ void cal_stress_nl( stress_var -= fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real(); } ++iat; - sum+=Nprojs; + sum+=nproj; }//ia __syncwarp(); warp_reduce(stress_var); @@ -316,8 +311,6 @@ void cal_stress_nl_op::operator()(const base_de const int& nkb, const int& nbands_occ, const int& ntype, - const int& wg_nc, - const int& ik, const int& deeq_2, const int& deeq_3, const int& deeq_4, @@ -336,8 +329,6 @@ void cal_stress_nl_op::operator()(const base_de jpol, nkb, ntype, - wg_nc, - ik, deeq_2, deeq_3, deeq_4, diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp index 90a4641123..2ee9cb4530 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp @@ -42,7 +42,6 @@ struct cal_force_nl_op void operator()(const base_device::DEVICE_CPU* ctx, const bool& nondiagonal, const int& nbands_occ, - const int& wg_nc, const int& ntype, const int& spin, const int& deeq_2, @@ -50,7 +49,6 @@ struct cal_force_nl_op const int& deeq_4, const int& forcenl_nc, const int& nbands, - const int& ik, const int& nkb, const int* atom_nh, const int* atom_na, @@ -71,7 +69,7 @@ struct cal_force_nl_op int sum0 = 0; for (int it = 0; it < ntype; it++) { - const int Nprojs = atom_nh[it]; + const int nproj = atom_nh[it]; #ifdef _OPENMP #pragma omp for collapse(2) #endif @@ -80,11 +78,11 @@ struct cal_force_nl_op for (int ib = 0; ib < nbands_occ; ib++) { FPTYPE local_force[3] = {0, 0, 0}; - FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba; - FPTYPE ekb_now = d_ekb[ik * wg_nc + ib]; + FPTYPE fac = d_wg[ib] * 2.0 * tpiba; + FPTYPE ekb_now = d_ekb[ib]; int iat = iat0 + ia; - int sum = sum0 + ia * Nprojs; - for (int ip = 0; ip < Nprojs; ip++) + int sum = sum0 + ia * nproj; + for (int ip = 0; ip < nproj; ip++) { // Effective values of the D-eS coefficients FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip] @@ -102,7 +100,7 @@ struct cal_force_nl_op } if (nondiagonal) { - for (int ip2 = 0; ip2 < Nprojs; ip2++) + for (int ip2 = 0; ip2 < nproj; ip2++) { if (ip != ip2) { @@ -141,7 +139,7 @@ struct cal_force_nl_op } } // end ia iat0 += atom_na[it]; - sum0 += atom_na[it] * Nprojs; + sum0 += atom_na[it] * nproj; } // end it #ifdef _OPENMP } @@ -150,14 +148,12 @@ struct cal_force_nl_op void operator()(const base_device::DEVICE_CPU* ctx, const int& nbands_occ, - const int& wg_nc, const int& ntype, const int& deeq_2, const int& deeq_3, const int& deeq_4, const int& forcenl_nc, const int& nbands, - const int& ik, const int& nkb, const int* atom_nh, const int* atom_na, @@ -188,8 +184,8 @@ struct cal_force_nl_op { const int ib2 = ib*2; FPTYPE local_force[3] = {0, 0, 0}; - FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba; - FPTYPE ekb_now = d_ekb[ik * wg_nc + ib]; + FPTYPE fac = d_wg[ib] * 2.0 * tpiba; + FPTYPE ekb_now = d_ekb[ib]; int iat = iat0 + ia; int sum = sum0 + ia * nprojs; for (int ip = 0; ip < nprojs; ip++) diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h index 05c419dd5e..c68df7bd0f 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h @@ -48,7 +48,6 @@ struct cal_force_nl_op /// @param ctx - which device this function runs on /// @param nondiagonal - control flag /// @param nbands_occ - number of occupied bands - /// @param wg_nc - the second dimension of matrix wg /// @param ntype - total atomic type /// @param spin - current spin /// @param deeq_2 - the second dimension of deeq @@ -56,7 +55,6 @@ struct cal_force_nl_op /// @param deeq_4 - the forth dimension of deeq /// @param forcenl_nc - the second dimension of matrix forcenl /// @param nbands - NBANDS - /// @param ik - current k point /// @param nkb - number of k point /// @param atom_nh - GlobalC::ucell.atoms[ii].ncpp.nh /// @param atom_na - GlobalC::ucell.atoms[ii].na @@ -73,7 +71,6 @@ struct cal_force_nl_op void operator()(const base_device::DEVICE_CPU* ctx, const bool& nondiagonal, const int& nbands_occ, - const int& wg_nc, const int& ntype, const int& spin, const int& deeq_2, @@ -81,7 +78,6 @@ struct cal_force_nl_op const int& deeq_4, const int& forcenl_nc, const int& nbands, - const int& ik, const int& nkb, const int* atom_nh, const int* atom_na, @@ -96,14 +92,12 @@ struct cal_force_nl_op // interface for nspin=4 only void operator()(const base_device::DEVICE_CPU* ctx, const int& nbands_occ, - const int& wg_nc, const int& ntype, const int& deeq_2, const int& deeq_3, const int& deeq_4, const int& forcenl_nc, const int& nbands, - const int& ik, const int& nkb, const int* atom_nh, const int* atom_na, @@ -139,7 +133,6 @@ struct cal_force_nl_op void operator()(const base_device::DEVICE_GPU* ctx, const bool& nondiagonal, const int& nbands_occ, - const int& wg_nc, const int& ntype, const int& spin, const int& deeq_2, @@ -147,7 +140,6 @@ struct cal_force_nl_op const int& deeq_4, const int& forcenl_nc, const int& nbands, - const int& ik, const int& nkb, const int* atom_nh, const int* atom_na, @@ -162,14 +154,12 @@ struct cal_force_nl_op // interface for nspin=4 only void operator()(const base_device::DEVICE_GPU* ctx, const int& nbands_occ, - const int& wg_nc, const int& ntype, const int& deeq_2, const int& deeq_3, const int& deeq_4, const int& forcenl_nc, const int& nbands, - const int& ik, const int& nkb, const int* atom_nh, const int* atom_na, diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu index 786a6b8ebf..da45a375e7 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu @@ -31,7 +31,6 @@ __global__ void cal_vkb1_nl( template __global__ void cal_force_nl( const bool nondiagonal, - const int wg_nc, const int ntype, const int spin, const int deeq_2, @@ -39,7 +38,6 @@ __global__ void cal_force_nl( const int deeq_4, const int forcenl_nc, const int nbands, - const int ik, const int nkb, const int *atom_nh, const int *atom_na, @@ -55,17 +53,18 @@ __global__ void cal_force_nl( const int ib = blockIdx.x / ntype; const int it = blockIdx.x % ntype; - int iat = 0, sum = 0; + int iat = 0; + int sum = 0; for (int ii = 0; ii < it; ii++) { iat += atom_na[ii]; sum += atom_na[ii] * atom_nh[ii]; } - int Nprojs = atom_nh[it]; - FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba; - FPTYPE ekb_now = d_ekb[ik * wg_nc + ib]; + int nproj = atom_nh[it]; + FPTYPE fac = d_wg[ib] * 2.0 * tpiba; + FPTYPE ekb_now = d_ekb[ib]; for (int ia = 0; ia < atom_na[it]; ia++) { - for (int ip = threadIdx.x; ip < Nprojs; ip += blockDim.x) { + for (int ip = threadIdx.x; ip < nproj; ip += blockDim.x) { // FPTYPE ps = GlobalC::ppcell.deeq[spin, iat, ip, ip]; FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip] - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip]; @@ -81,8 +80,8 @@ __global__ void cal_force_nl( } if (nondiagonal) { - //for (int ip2=0; ip2 void cal_force_nl_op::operator()(const base_device::DEVICE_GPU* ctx, const bool& nondiagonal, const int& nbands_occ, - const int& wg_nc, const int& ntype, const int& spin, const int& deeq_2, @@ -138,7 +136,6 @@ void cal_force_nl_op::operator()(const base_dev const int& deeq_4, const int& forcenl_nc, const int& nbands, - const int& ik, const int& nkb, const int* atom_nh, const int* atom_na, @@ -153,9 +150,9 @@ void cal_force_nl_op::operator()(const base_dev { hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_force_nl), dim3(nbands_occ * ntype), dim3(THREADS_PER_BLOCK), 0, 0, nondiagonal, - wg_nc, ntype, spin, + ntype, spin, deeq_2, deeq_3, deeq_4, - forcenl_nc, nbands, ik, nkb, + forcenl_nc, nbands, nkb, atom_nh, atom_na, tpiba, d_wg, d_ekb, qq_nt, deeq, @@ -168,14 +165,12 @@ void cal_force_nl_op::operator()(const base_dev template __global__ void cal_force_nl( - const int wg_nc, const int ntype, const int deeq_2, const int deeq_3, const int deeq_4, const int forcenl_nc, const int nbands, - const int ik, const int nkb, const int *atom_nh, const int *atom_na, @@ -192,19 +187,20 @@ __global__ void cal_force_nl( const int ib2 = ib * 2; const int it = blockIdx.x % ntype; - int iat = 0, sum = 0; + int iat = 0; + int sum = 0; for (int ii = 0; ii < it; ii++) { iat += atom_na[ii]; sum += atom_na[ii] * atom_nh[ii]; } - int Nprojs = atom_nh[it]; - FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba; - FPTYPE ekb_now = d_ekb[ik * wg_nc + ib]; + int nproj = atom_nh[it]; + FPTYPE fac = d_wg[ib] * 2.0 * tpiba; + FPTYPE ekb_now = d_ekb[ib]; for (int ia = 0; ia < atom_na[it]; ia++) { - for (int ip = threadIdx.x; ip < Nprojs; ip += blockDim.x) { + for (int ip = threadIdx.x; ip < nproj; ip += blockDim.x) { const int inkb = sum + ip; - for (int ip2 = 0; ip2 < Nprojs; ip2++) + for (int ip2 = 0; ip2 < nproj; ip2++) { // Effective values of the D-eS coefficients const thrust::complex ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2]; @@ -227,7 +223,7 @@ __global__ void cal_force_nl( } } iat += 1; - sum += Nprojs; + sum += nproj; } } @@ -235,14 +231,12 @@ __global__ void cal_force_nl( template void cal_force_nl_op::operator()(const base_device::DEVICE_GPU* ctx, const int& nbands_occ, - const int& wg_nc, const int& ntype, const int& deeq_2, const int& deeq_3, const int& deeq_4, const int& forcenl_nc, const int& nbands, - const int& ik, const int& nkb, const int* atom_nh, const int* atom_na, @@ -256,9 +250,9 @@ void cal_force_nl_op::operator()(const base_dev FPTYPE* force) { hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_force_nl), dim3(nbands_occ * ntype), dim3(THREADS_PER_BLOCK), 0, 0, - wg_nc, ntype, + ntype, deeq_2, deeq_3, deeq_4, - forcenl_nc, nbands, ik, nkb, + forcenl_nc, nbands, nkb, atom_nh, atom_na, tpiba, d_wg, d_ekb, qq_nt, diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu index 96675e359d..6880987546 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu @@ -102,8 +102,6 @@ __global__ void cal_stress_nl( const int nkb, const int ntype, const int spin, - const int wg_nc, - const int ik, const int deeq_2, const int deeq_3, const int deeq_4, @@ -120,18 +118,21 @@ __global__ void cal_stress_nl( int ib = blockIdx.x / ntype; int it = blockIdx.x % ntype; - int iat = 0, sum = 0; + int iat = 0; + int sum = 0; for (int ii = 0; ii < it; ii++) { iat += atom_na[ii]; sum += atom_na[ii] * atom_nh[ii]; } - FPTYPE stress_var = 0, fac = d_wg[ik * wg_nc + ib] * 1.0, ekb_now = d_ekb[ik * wg_nc + ib]; - const int Nprojs = atom_nh[it]; + FPTYPE stress_var = 0; + FPTYPE fac = d_wg[ib]; + FPTYPE ekb_now = d_ekb[ib]; + const int nproj = atom_nh[it]; for (int ia = 0; ia < atom_na[it]; ia++) { - for (int ii = threadIdx.x; ii < Nprojs * Nprojs; ii += blockDim.x) { - int ip1 = ii / Nprojs, ip2 = ii % Nprojs; + for (int ii = threadIdx.x; ii < nproj * nproj; ii += blockDim.x) { + int ip1 = ii / nproj, ip2 = ii % nproj; if(!nondiagonal && ip1 != ip2) { continue; } @@ -144,7 +145,7 @@ __global__ void cal_stress_nl( stress_var -= ps * fac * dbb; } ++iat; - sum+=Nprojs; + sum+=nproj; }//ia warp_reduce(stress_var); if (threadIdx.x % WARP_SIZE == 0) { @@ -198,8 +199,6 @@ void cal_stress_nl_op::operator()(const base_de const int& nbands_occ, const int& ntype, const int& spin, - const int& wg_nc, - const int& ik, const int& deeq_2, const int& deeq_3, const int& deeq_4, @@ -220,8 +219,6 @@ void cal_stress_nl_op::operator()(const base_de nkb, ntype, spin, - wg_nc, - ik, deeq_2, deeq_3, deeq_4, @@ -244,8 +241,6 @@ __global__ void cal_stress_nl( const int jpol, const int nkb, const int ntype, - const int wg_nc, - const int ik, const int deeq_2, const int deeq_3, const int deeq_4, @@ -263,18 +258,22 @@ __global__ void cal_stress_nl( const int ib2 = ib * 2; int it = blockIdx.x % ntype; - int iat = 0, sum = 0; + int iat = 0; + int sum = 0; for (int ii = 0; ii < it; ii++) { iat += atom_na[ii]; sum += atom_na[ii] * atom_nh[ii]; } - FPTYPE stress_var = 0, fac = d_wg[ik * wg_nc + ib] * 1.0, ekb_now = d_ekb[ik * wg_nc + ib]; - const int Nprojs = atom_nh[it]; + FPTYPE stress_var = 0; + FPTYPE fac = d_wg[ib]; + FPTYPE ekb_now = d_ekb[ib]; + const int nproj = atom_nh[it]; for (int ia = 0; ia < atom_na[it]; ia++) { - for (int ii = threadIdx.x; ii < Nprojs * Nprojs; ii += blockDim.x) { - int ip1 = ii / Nprojs, ip2 = ii % Nprojs; + for (int ii = threadIdx.x; ii < nproj * nproj; ii += blockDim.x) { + const int ip1 = ii / nproj; + const int ip2 = ii % nproj; const thrust::complex ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2]; const thrust::complex ps0 = deeq_nc[((iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq; const thrust::complex ps1 = deeq_nc[((1 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2]; @@ -290,7 +289,7 @@ __global__ void cal_stress_nl( stress_var -= fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real(); } ++iat; - sum+=Nprojs; + sum+=nproj; }//ia __syncwarp(); warp_reduce(stress_var); @@ -306,8 +305,6 @@ void cal_stress_nl_op::operator()(const base_de const int& nkb, const int& nbands_occ, const int& ntype, - const int& wg_nc, - const int& ik, const int& deeq_2, const int& deeq_3, const int& deeq_4, @@ -326,8 +323,6 @@ void cal_stress_nl_op::operator()(const base_de jpol, nkb, ntype, - wg_nc, - ik, deeq_2, deeq_3, deeq_4, diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp index 142ec6e4a8..f36f2e8316 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp @@ -77,8 +77,6 @@ struct cal_stress_nl_op const int& nbands_occ, const int& ntype, const int& spin, - const int& wg_nc, - const int& ik, const int& deeq_2, const int& deeq_3, const int& deeq_4, @@ -97,10 +95,11 @@ struct cal_stress_nl_op #pragma omp parallel reduction(+ : local_stress) { #endif - int iat = 0, sum = 0; + int iat = 0; + int sum = 0; for (int it = 0; it < ntype; it++) { - const int Nprojs = atom_nh[it]; + const int nproj = atom_nh[it]; #ifdef _OPENMP #pragma omp for collapse(4) #endif @@ -108,20 +107,20 @@ struct cal_stress_nl_op { for (int ia = 0; ia < atom_na[it]; ia++) { - for (int ip1 = 0; ip1 < Nprojs; ip1++) + for (int ip1 = 0; ip1 < nproj; ip1++) { - for (int ip2 = 0; ip2 < Nprojs; ip2++) + for (int ip2 = 0; ip2 < nproj; ip2++) { if (!nondiagonal && ip1 != ip2) { continue; } - FPTYPE fac = d_wg[ik * wg_nc + ib] * 1.0; - FPTYPE ekb_now = d_ekb[ik * wg_nc + ib]; + FPTYPE fac = d_wg[ib] * 1.0; + FPTYPE ekb_now = d_ekb[ib]; FPTYPE ps = deeq[((spin * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2]; - const int inkb1 = sum + ia * Nprojs + ip1; - const int inkb2 = sum + ia * Nprojs + ip2; + const int inkb1 = sum + ia * nproj + ip1; + const int inkb2 = sum + ia * nproj + ip2; // out<<"\n ps = "< } // end ip } // ia } - sum += atom_na[it] * Nprojs; + sum += atom_na[it] * nproj; iat += atom_na[it]; } // end it #ifdef _OPENMP @@ -145,8 +144,6 @@ struct cal_stress_nl_op const int& nkb, const int& nbands_occ, const int& ntype, - const int& wg_nc, - const int& ik, const int& deeq_2, const int& deeq_3, const int& deeq_4, @@ -168,7 +165,7 @@ struct cal_stress_nl_op int iat = 0, sum = 0; for (int it = 0; it < ntype; it++) { - const int Nprojs = atom_nh[it]; + const int nproj = atom_nh[it]; #ifdef _OPENMP #pragma omp for collapse(4) #endif @@ -176,20 +173,20 @@ struct cal_stress_nl_op { for (int ia = 0; ia < atom_na[it]; ia++) { - for (int ip1 = 0; ip1 < Nprojs; ip1++) + for (int ip1 = 0; ip1 < nproj; ip1++) { - for (int ip2 = 0; ip2 < Nprojs; ip2++) + for (int ip2 = 0; ip2 < nproj; ip2++) { const int ib2 = ib*2; - FPTYPE fac = d_wg[ik * wg_nc + ib] * 1.0; - FPTYPE ekb_now = d_ekb[ik * wg_nc + ib]; + FPTYPE fac = d_wg[ib] * 1.0; + FPTYPE ekb_now = d_ekb[ib]; std::complex ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2]; std::complex ps0 = deeq_nc[((iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq; std::complex ps1 = deeq_nc[((1 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2]; std::complex ps2 = deeq_nc[((2 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2]; std::complex ps3 = deeq_nc[((3 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq; - const int inkb1 = sum + ia * Nprojs + ip1; - const int inkb2 = sum + ia * Nprojs + ip2; + const int inkb1 = sum + ia * nproj + ip1; + const int inkb2 = sum + ia * nproj + ip2; // out<<"\n ps = "< dbb0 = conj(dbecp[ib2 * nkb + inkb1]) * becp[ib2 * nkb + inkb2]; @@ -201,7 +198,7 @@ struct cal_stress_nl_op } // end ip } // ia } - sum += atom_na[it] * Nprojs; + sum += atom_na[it] * nproj; iat += atom_na[it]; } // end it #ifdef _OPENMP diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h index 725bfdb981..2f4bbd0989 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h @@ -66,8 +66,6 @@ struct cal_stress_nl_op /// @param nbands_occ - number of occupied bands /// @param ntype - total atomic type /// @param spin - current spin - /// @param wg_nc - the second dimension of matrix wg - /// @param ik - current k point /// @param deeq_2 - the second dimension of deeq /// @param deeq_3 - the third dimension of deeq /// @param deeq_4 - the forth dimension of deeq @@ -90,8 +88,6 @@ struct cal_stress_nl_op const int& nbands_occ, const int& ntype, const int& spin, - const int& wg_nc, - const int& ik, const int& deeq_2, const int& deeq_3, const int& deeq_4, @@ -111,8 +107,6 @@ struct cal_stress_nl_op const int& nkb, const int& nbands_occ, const int& ntype, - const int& wg_nc, - const int& ik, const int& deeq_2, const int& deeq_3, const int& deeq_4, @@ -258,8 +252,6 @@ struct cal_stress_nl_op const int& nbands_occ, const int& ntype, const int& spin, - const int& wg_nc, - const int& ik, const int& deeq_2, const int& deeq_3, const int& deeq_4, @@ -279,8 +271,6 @@ struct cal_stress_nl_op const int& nkb, const int& nbands_occ, const int& ntype, - const int& wg_nc, - const int& ik, const int& deeq_2, const int& deeq_3, const int& deeq_4, diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/force_op_test.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/force_op_test.cpp index a791fd933b..3da3ca3b19 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/force_op_test.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/force_op_test.cpp @@ -2887,7 +2887,6 @@ TEST_F(TestSrcPWForceMultiDevice, cal_force_nl_op_cpu) hamilt::cal_force_nl_op()(cpu_ctx, multi_proj, nbands_occ, - wg_nc, ntype, spin, deeq_2, @@ -2895,7 +2894,6 @@ TEST_F(TestSrcPWForceMultiDevice, cal_force_nl_op_cpu) deeq_4, forcenl_nc, nbands, - ik, nkb, atom_nh.data(), atom_na.data(), @@ -2980,7 +2978,6 @@ TEST_F(TestSrcPWForceMultiDevice, cal_force_nl_op_gpu) hamilt::cal_force_nl_op()(gpu_ctx, multi_proj, nbands_occ, - wg_nc, ntype, spin, deeq_2, @@ -2988,7 +2985,6 @@ TEST_F(TestSrcPWForceMultiDevice, cal_force_nl_op_gpu) deeq_4, forcenl_nc, nbands, - ik, nkb, d_atom_nh, d_atom_na, diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/stress_op_test.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/stress_op_test.cpp index 81cd5de858..4f84fa098d 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/stress_op_test.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/stress_op_test.cpp @@ -96,8 +96,6 @@ TEST(TestSrcPWStressMultiDevice, cal_stress_nl_op_cpu) nbands_occ, ntype, spin, - wg_nc, - ik, deeq_2, deeq_3, deeq_4, @@ -271,8 +269,6 @@ TEST(TestSrcPWStressMultiDevice, cal_stress_nl_op_gpu) nbands_occ, ntype, spin, - wg_nc, - ik, deeq_2, deeq_3, deeq_4, From 92e8f581dbb7396bf4e14fbe8384a27b40f0828d Mon Sep 17 00:00:00 2001 From: dyzheng Date: Thu, 31 Oct 2024 06:59:52 +0000 Subject: [PATCH 8/8] Test: add force and stress test for noncollinear-spin and soc --- tests/integrate/104_PW_NC_magnetic/INPUT | 18 ++++++++++-------- tests/integrate/104_PW_NC_magnetic/KPT | 2 +- tests/integrate/104_PW_NC_magnetic/STRU | 2 +- tests/integrate/104_PW_NC_magnetic/jd | 1 + tests/integrate/104_PW_NC_magnetic/result.ref | 11 +++++------ tests/integrate/140_PW_15_SO/INPUT | 6 +++++- tests/integrate/140_PW_15_SO/KPT | 2 +- tests/integrate/140_PW_15_SO/result.ref | 8 +++++--- 8 files changed, 29 insertions(+), 21 deletions(-) create mode 100644 tests/integrate/104_PW_NC_magnetic/jd diff --git a/tests/integrate/104_PW_NC_magnetic/INPUT b/tests/integrate/104_PW_NC_magnetic/INPUT index 622af75b86..d761feb2c1 100644 --- a/tests/integrate/104_PW_NC_magnetic/INPUT +++ b/tests/integrate/104_PW_NC_magnetic/INPUT @@ -4,10 +4,9 @@ suffix autotest #nbands 40 calculation scf -nspin 2 -ecutwfc 15 -scf_thr 1.0e-8 -scf_nmax 200 +ecutwfc 20 +scf_thr 1.0e-6 +scf_nmax 50 out_chg 0 #init_chg file @@ -16,14 +15,17 @@ out_chg 0 #out_band 1 noncolin 1 smearing_method gaussian -smearing_sigma 0.07 +smearing_sigma 0.02 mixing_type broyden -mixing_beta 0.5 +mixing_beta 0.2 mixing_ndim 10 -ks_solver cg +ks_solver dav_subspace +pw_diag_ndim 2 basis_type pw -symmetry 1 +symmetry 0 +cal_force 1 +cal_stress 1 pseudo_dir ../../PP_ORB orbital_dir ../../PP_ORB diff --git a/tests/integrate/104_PW_NC_magnetic/KPT b/tests/integrate/104_PW_NC_magnetic/KPT index db31432445..28006d5e2d 100644 --- a/tests/integrate/104_PW_NC_magnetic/KPT +++ b/tests/integrate/104_PW_NC_magnetic/KPT @@ -1,4 +1,4 @@ K_POINTS 0 Gamma -1 1 1 0 0 0 +2 2 2 0 0 0 diff --git a/tests/integrate/104_PW_NC_magnetic/STRU b/tests/integrate/104_PW_NC_magnetic/STRU index d1981a2504..5b61e75c50 100644 --- a/tests/integrate/104_PW_NC_magnetic/STRU +++ b/tests/integrate/104_PW_NC_magnetic/STRU @@ -18,5 +18,5 @@ Fe 1.0 2 0.00 0.00 0.00 1 1 1 angle1 90 angle2 0 -0.50 0.50 0.50 1 1 1 angle1 90 angle2 180 +0.51 0.52 0.53 1 1 1 angle1 90 angle2 0 diff --git a/tests/integrate/104_PW_NC_magnetic/jd b/tests/integrate/104_PW_NC_magnetic/jd new file mode 100644 index 0000000000..a4292b9c28 --- /dev/null +++ b/tests/integrate/104_PW_NC_magnetic/jd @@ -0,0 +1 @@ +Test noncolliner-spin calculation for FM-Fe2 , test force and stress for k-222 diff --git a/tests/integrate/104_PW_NC_magnetic/result.ref b/tests/integrate/104_PW_NC_magnetic/result.ref index 384804bb8d..21c5c36613 100644 --- a/tests/integrate/104_PW_NC_magnetic/result.ref +++ b/tests/integrate/104_PW_NC_magnetic/result.ref @@ -1,6 +1,5 @@ -etotref -5866.197307043378 -etotperatomref -2933.0986535217 -pointgroupref O_h -spacegroupref O_h -nksibzref 1 -totaltimeref 2.23 +etotref -6156.9375288905130219 +etotperatomref -3078.4687644453 +totalforceref 2.221910 +totalstressref 76009.325784 +totaltimeref 3.03 diff --git a/tests/integrate/140_PW_15_SO/INPUT b/tests/integrate/140_PW_15_SO/INPUT index fc09132c7c..1c58840435 100644 --- a/tests/integrate/140_PW_15_SO/INPUT +++ b/tests/integrate/140_PW_15_SO/INPUT @@ -25,12 +25,16 @@ ecutwfc 20 scf_thr 1e-6 scf_nmax 100 +cal_force 1 +cal_stress 1 + #cal_stress 1 #noncolin 1 lspinorb 1 basis_type pw -ks_solver dav +ks_solver dav_subspace +pw_diag_ndim 2 chg_extrap second-order out_dm 0 pw_diag_thr 0.00001 diff --git a/tests/integrate/140_PW_15_SO/KPT b/tests/integrate/140_PW_15_SO/KPT index c289c0158a..e769af7638 100644 --- a/tests/integrate/140_PW_15_SO/KPT +++ b/tests/integrate/140_PW_15_SO/KPT @@ -1,4 +1,4 @@ K_POINTS 0 Gamma -1 1 1 0 0 0 +2 1 1 0 0 0 diff --git a/tests/integrate/140_PW_15_SO/result.ref b/tests/integrate/140_PW_15_SO/result.ref index 2f6201a31f..6bb325aacb 100644 --- a/tests/integrate/140_PW_15_SO/result.ref +++ b/tests/integrate/140_PW_15_SO/result.ref @@ -1,3 +1,5 @@ -etotref -1668.616833271131 -etotperatomref -834.3084166356 -totaltimeref 5.15 +etotref -1678.3650981686614614 +etotperatomref -839.1825490843 +totalforceref 1.739332 +totalstressref 34372.194072 +totaltimeref 0.99