diff --git a/source/module_hamilt_general/module_xc/xc_functional_gradcorr.cpp b/source/module_hamilt_general/module_xc/xc_functional_gradcorr.cpp index 11b69a8580..4250146510 100644 --- a/source/module_hamilt_general/module_xc/xc_functional_gradcorr.cpp +++ b/source/module_hamilt_general/module_xc/xc_functional_gradcorr.cpp @@ -184,7 +184,7 @@ void XC_Functional::gradcorr(double &etxc, double &vtxc, ModuleBase::matrix &v, } gdr2 = new ModuleBase::Vector3[rhopw->nrxx]; - h2 = new ModuleBase::Vector3[rhopw->nrxx]; + if(!is_stress) h2 = new ModuleBase::Vector3[rhopw->nrxx]; XC_Functional::grad_rho( rhogsum1 , gdr1, rhopw, ucell->tpiba); XC_Functional::grad_rho( rhogsum2 , gdr2, rhopw, ucell->tpiba); diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces_nl.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces_nl.cpp index 04caefd1bf..92733745b4 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/forces_nl.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/forces_nl.cpp @@ -45,7 +45,7 @@ void Forces::cal_force_nl(ModuleBase::matrix& forcenl, break; } } - const int npm = ucell_in.get_npol() * nbands_occ; + const int npm = nbands_occ; // calculate becp = for all beta functions nl_tools.cal_becp(ik, npm); for (int ipol = 0; ipol < 3; ipol++) diff --git a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp index cc05bc8e52..4dbdbc558f 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp @@ -108,12 +108,6 @@ void FS_Nonlocal_tools::allocate_memory(const ModuleBase::matrix this->atom_na = h_atom_na.data(); this->ppcell_vkb = this->nlpp_->vkb.c; } - - // prepare the memory of the becp and dbecp: - // becp: - // dbecp: - resmem_complex_op()(this->ctx, becp, this->nbands * nkb, "Stress::becp"); - resmem_complex_op()(this->ctx, dbecp, 6 * this->nbands * nkb, "Stress::dbecp"); } template @@ -163,9 +157,12 @@ void FS_Nonlocal_tools::cal_becp(int ik, int npm) { ModuleBase::TITLE("FS_Nonlocal_tools", "cal_becp"); ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_becp"); + const int npol = this->ucell_->get_npol(); + const int size_becp = this->nbands * npol * this->nkb; + const int size_becp_act = npm * npol * this->nkb; if (this->becp == nullptr) { - resmem_complex_op()(this->ctx, becp, this->nbands * this->nkb); + resmem_complex_op()(this->ctx, becp, size_becp); } // prepare math tools @@ -249,11 +246,12 @@ void FS_Nonlocal_tools::cal_becp(int ik, int npm) } const char transa = 'C'; const char transb = 'N'; + int npm_npol = npm * npol; gemm_op()(this->ctx, transa, transb, nkb, - npm, + npm_npol, npw, &ModuleBase::ONE, ppcell_vkb, @@ -268,15 +266,15 @@ void FS_Nonlocal_tools::cal_becp(int ik, int npm) if (this->device == base_device::GpuDevice) { std::complex* h_becp = nullptr; - resmem_complex_h_op()(this->cpu_ctx, h_becp, this->nbands * nkb); - syncmem_complex_d2h_op()(this->cpu_ctx, this->ctx, h_becp, becp, this->nbands * nkb); - Parallel_Reduce::reduce_pool(h_becp, this->nbands * nkb); - syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, becp, h_becp, this->nbands * nkb); + resmem_complex_h_op()(this->cpu_ctx, h_becp, size_becp_act); + syncmem_complex_d2h_op()(this->cpu_ctx, this->ctx, h_becp, becp, size_becp_act); + Parallel_Reduce::reduce_pool(h_becp, size_becp_act); + syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, becp, h_becp, size_becp_act); delmem_complex_h_op()(this->cpu_ctx, h_becp); } else { - Parallel_Reduce::reduce_pool(becp, this->nbands * this->nkb); + Parallel_Reduce::reduce_pool(becp, size_becp_act); } ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_becp"); } @@ -287,9 +285,12 @@ void FS_Nonlocal_tools::cal_dbecp_s(int ik, int npm, int ipol, i { ModuleBase::TITLE("FS_Nonlocal_tools", "cal_dbecp_s"); ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_dbecp_s"); + const int npol = this->ucell_->get_npol(); + const int size_becp = this->nbands * npol * this->nkb; + const int npm_npol = npm * npol; if (this->dbecp == nullptr) { - resmem_complex_op()(this->ctx, dbecp, this->nbands * this->nkb); + resmem_complex_op()(this->ctx, dbecp, size_becp); } // prepare math tools @@ -401,7 +402,7 @@ void FS_Nonlocal_tools::cal_dbecp_s(int ik, int npm, int ipol, i transa, transb, nkb, - npm, + npm_npol, npw, &ModuleBase::ONE, ppcell_vkb, @@ -412,43 +413,68 @@ void FS_Nonlocal_tools::cal_dbecp_s(int ik, int npm, int ipol, i dbecp, nkb); // calculate stress for target (ipol, jpol) - const int current_spin = this->kv_->isk[ik]; - cal_stress_nl_op()(this->ctx, - nondiagonal, - ipol, - jpol, - nkb, - npm, - this->ntype, - current_spin, // uspp only - this->nbands, - ik, - this->nlpp_->deeq.getBound2(), - this->nlpp_->deeq.getBound3(), - this->nlpp_->deeq.getBound4(), - atom_nh, - atom_na, - d_wg, - d_ekb, - qq_nt, - deeq, - becp, - dbecp, - stress); + if(npol == 1) + { + const int current_spin = this->kv_->isk[ik]; + cal_stress_nl_op()(this->ctx, + nondiagonal, + ipol, + jpol, + nkb, + npm, + this->ntype, + current_spin, // uspp only + this->nlpp_->deeq.getBound2(), + this->nlpp_->deeq.getBound3(), + this->nlpp_->deeq.getBound4(), + atom_nh, + atom_na, + d_wg + this->nbands * ik, + d_ekb + this->nbands * ik, + qq_nt, + deeq, + becp, + dbecp, + stress); + } + else + { + cal_stress_nl_op()(this->ctx, + ipol, + jpol, + nkb, + npm, + this->ntype, + this->nlpp_->deeq_nc.getBound2(), + this->nlpp_->deeq_nc.getBound3(), + this->nlpp_->deeq_nc.getBound4(), + atom_nh, + atom_na, + d_wg + this->nbands * ik, + d_ekb + this->nbands * ik, + qq_nt, + this->nlpp_->template get_deeq_nc_data(), + becp, + dbecp, + stress); + } ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_dbecp_s"); } template void FS_Nonlocal_tools::cal_dbecp_f(int ik, int npm, int ipol) { - ModuleBase::TITLE("FS_Nonlocal_tools", "cal_dbecp_s"); + ModuleBase::TITLE("FS_Nonlocal_tools", "cal_dbecp_f"); ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_dbecp_f"); + const int npol = this->ucell_->get_npol(); + const int npm_npol = npm * npol; + const int size_becp = this->nbands * npol * this->nkb; if (this->dbecp == nullptr) { - resmem_complex_op()(this->ctx, dbecp, 3 * this->nbands * this->nkb); + resmem_complex_op()(this->ctx, dbecp, 3 * size_becp); } - std::complex* dbecp_ptr = this->dbecp + ipol * this->nbands * this->nkb; + std::complex* dbecp_ptr = this->dbecp + ipol * size_becp; const std::complex* vkb_ptr = this->ppcell_vkb; std::complex* vkb_deri_ptr = this->ppcell_vkb; @@ -481,7 +507,7 @@ void FS_Nonlocal_tools::cal_dbecp_f(int ik, int npm, int ipol) transa, transb, this->nkb, - npm, + npm_npol, npw, &ModuleBase::ONE, vkb_deri_ptr, @@ -491,7 +517,6 @@ void FS_Nonlocal_tools::cal_dbecp_f(int ik, int npm, int ipol) &ModuleBase::ZERO, dbecp_ptr, nkb); - this->revert_vkb(npw, ipol); this->pre_ik_f = ik; ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_dbecp_f"); @@ -634,29 +659,52 @@ void FS_Nonlocal_tools::cal_force(int ik, int npm, FPTYPE* force const int current_spin = this->kv_->isk[ik]; const int force_nc = 3; // calculate the force - cal_force_nl_op()(this->ctx, - nondiagonal, - npm, - this->nbands, - this->ntype, - current_spin, - this->nlpp_->deeq.getBound2(), - this->nlpp_->deeq.getBound3(), - this->nlpp_->deeq.getBound4(), - force_nc, - this->nbands, - ik, - nkb, - atom_nh, - atom_na, - this->ucell_->tpiba, - d_wg, - d_ekb, - qq_nt, - deeq, - becp, - dbecp, - force); + if(this->ucell_->get_npol() == 1) + { + cal_force_nl_op()(this->ctx, + nondiagonal, + npm, + this->ntype, + current_spin, + this->nlpp_->deeq.getBound2(), + this->nlpp_->deeq.getBound3(), + this->nlpp_->deeq.getBound4(), + force_nc, + this->nbands, + nkb, + atom_nh, + atom_na, + this->ucell_->tpiba, + d_wg + this->nbands * ik, + d_ekb + this->nbands * ik, + qq_nt, + deeq, + becp, + dbecp, + force); + } + else + { + cal_force_nl_op()(this->ctx, + npm, + this->ntype, + this->nlpp_->deeq_nc.getBound2(), + this->nlpp_->deeq_nc.getBound3(), + this->nlpp_->deeq_nc.getBound4(), + force_nc, + this->nbands, + nkb, + atom_nh, + atom_na, + this->ucell_->tpiba, + d_wg + this->nbands * ik, + d_ekb + this->nbands * ik, + qq_nt, + this->nlpp_->template get_deeq_nc_data(), + becp, + dbecp, + force); + } } // template instantiation diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu index f8df7b3029..dddf889de1 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu @@ -35,7 +35,6 @@ __global__ void cal_vkb1_nl( template __global__ void cal_force_nl( const bool nondiagonal, - const int wg_nc, const int ntype, const int spin, const int deeq_2, @@ -43,7 +42,6 @@ __global__ void cal_force_nl( const int deeq_4, const int forcenl_nc, const int nbands, - const int ik, const int nkb, const int *atom_nh, const int *atom_na, @@ -65,11 +63,11 @@ __global__ void cal_force_nl( sum += atom_na[ii] * atom_nh[ii]; } - int Nprojs = atom_nh[it]; - FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba; - FPTYPE ekb_now = d_ekb[ik * wg_nc + ib]; + int nproj = atom_nh[it]; + FPTYPE fac = d_wg[ib] * 2.0 * tpiba; + FPTYPE ekb_now = d_ekb[ib]; for (int ia = 0; ia < atom_na[it]; ia++) { - for (int ip = threadIdx.x; ip < Nprojs; ip += blockDim.x) { + for (int ip = threadIdx.x; ip < nproj; ip += blockDim.x) { // FPTYPE ps = GlobalC::ppcell.deeq[spin, iat, ip, ip]; FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip] - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip]; @@ -85,8 +83,8 @@ __global__ void cal_force_nl( } if (nondiagonal) { - //for (int ip2=0; ip2 void cal_force_nl_op::operator()(const base_device::DEVICE_GPU* ctx, const bool& nondiagonal, const int& nbands_occ, - const int& wg_nc, const int& ntype, const int& spin, const int& deeq_2, @@ -142,7 +139,6 @@ void cal_force_nl_op::operator()(const base_dev const int& deeq_4, const int& forcenl_nc, const int& nbands, - const int& ik, const int& nkb, const int* atom_nh, const int* atom_na, @@ -157,9 +153,9 @@ void cal_force_nl_op::operator()(const base_dev { cal_force_nl<<>>( nondiagonal, - wg_nc, ntype, spin, + ntype, spin, deeq_2, deeq_3, deeq_4, - forcenl_nc, nbands, ik, nkb, + forcenl_nc, nbands, nkb, atom_nh, atom_na, tpiba, d_wg, d_ekb, qq_nt, deeq, @@ -170,6 +166,106 @@ void cal_force_nl_op::operator()(const base_dev cudaCheckOnDebug(); } +template +__global__ void cal_force_nl( + const int ntype, + const int deeq_2, + const int deeq_3, + const int deeq_4, + const int forcenl_nc, + const int nbands, + const int nkb, + const int *atom_nh, + const int *atom_na, + const FPTYPE tpiba, + const FPTYPE *d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const thrust::complex *deeq_nc, + const thrust::complex *becp, + const thrust::complex *dbecp, + FPTYPE *force) +{ + const int ib = blockIdx.x / ntype; + const int ib2 = ib * 2; + const int it = blockIdx.x % ntype; + + int iat = 0, sum = 0; + for (int ii = 0; ii < it; ii++) { + iat += atom_na[ii]; + sum += atom_na[ii] * atom_nh[ii]; + } + + int nproj = atom_nh[it]; + FPTYPE fac = d_wg[ib] * 2.0 * tpiba; + FPTYPE ekb_now = d_ekb[ib]; + for (int ia = 0; ia < atom_na[it]; ia++) { + for (int ip = threadIdx.x; ip < nproj; ip += blockDim.x) { + const int inkb = sum + ip; + for (int ip2 = 0; ip2 < nproj; ip2++) + { + // Effective values of the D-eS coefficients + const thrust::complex ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2]; + const int jnkb = sum + ip2; + const thrust::complex ps0 = deeq_nc[((0 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq; + const thrust::complex ps1 = deeq_nc[((1 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2]; + const thrust::complex ps2 = deeq_nc[((2 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2]; + const thrust::complex ps3 = deeq_nc[((3 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq; + + for (int ipol = 0; ipol < 3; ipol++) { + const int index0 = ipol * nbands * 2 * nkb + ib2 * nkb + inkb; + const int index1 = ib2 * nkb + jnkb; + const thrust::complex dbb0 = conj(dbecp[index0]) * becp[index1]; + const thrust::complex dbb1 = conj(dbecp[index0]) * becp[index1 + nkb]; + const thrust::complex dbb2 = conj(dbecp[index0 + nkb]) * becp[index1]; + const thrust::complex dbb3 = conj(dbecp[index0 + nkb]) * becp[index1 + nkb]; + const FPTYPE tmp = - fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real(); + atomicAdd(force + iat * forcenl_nc + ipol, tmp); + } + } + } + iat += 1; + sum += nproj; + } +} + +// interface for nspin=4 only +template +void cal_force_nl_op::operator()(const base_device::DEVICE_GPU* ctx, + const int& nbands_occ, + const int& ntype, + const int& deeq_2, + const int& deeq_3, + const int& deeq_4, + const int& forcenl_nc, + const int& nbands, + const int& nkb, + const int* atom_nh, + const int* atom_na, + const FPTYPE& tpiba, + const FPTYPE* d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const std::complex* deeq_nc, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* force) +{ + cal_force_nl<<>>( + ntype, + deeq_2, deeq_3, deeq_4, + forcenl_nc, nbands, nkb, + atom_nh, atom_na, + tpiba, + d_wg, d_ekb, qq_nt, + reinterpret_cast*>(deeq_nc), + reinterpret_cast*>(becp), + reinterpret_cast*>(dbecp), + force);// array of data + + cudaCheckOnDebug(); +} + template __global__ void saveVkbValues_( const int *gcar_zero_ptrs, diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/nonlocal_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/nonlocal_op.cu index 275fbfe136..89a74f3c5f 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/nonlocal_op.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/nonlocal_op.cu @@ -54,8 +54,8 @@ __global__ void nonlocal_pw( thrust::complex* ps, const thrust::complex* becp) { - const int ii = blockIdx.x / l2; - const int jj = blockIdx.x % l2; + const int ii = blockIdx.x * 2 / l2; + const int jj = blockIdx.x * 2 % l2; for (int kk = threadIdx.x; kk < l3; kk += blockDim.x) { thrust::complex res1(0.0, 0.0); thrust::complex res2(0.0, 0.0); @@ -121,7 +121,7 @@ void hamilt::nonlocal_pw_op::operator()(const b { // denghui implement 20221109 // <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - nonlocal_pw<<>>( + nonlocal_pw<<>>( l1, l2, l3, // loop size sum, iat, nkb, // control params deeq_x, deeq_y, deeq_z, @@ -138,4 +138,4 @@ void hamilt::nonlocal_pw_op::operator()(const b template struct nonlocal_pw_op; template struct nonlocal_pw_op; -} // namespace hamilt \ No newline at end of file +} // namespace hamilt diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu index e965446d77..e566126876 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu @@ -107,8 +107,6 @@ __global__ void cal_stress_nl( const int nkb, const int ntype, const int spin, - const int wg_nc, - const int ik, const int deeq_2, const int deeq_3, const int deeq_4, @@ -125,22 +123,25 @@ __global__ void cal_stress_nl( int ib = blockIdx.x / ntype; int it = blockIdx.x % ntype; - int iat = 0, sum = 0; + int iat = 0; + int sum = 0; for (int ii = 0; ii < it; ii++) { iat += atom_na[ii]; sum += atom_na[ii] * atom_nh[ii]; } - FPTYPE stress_var = 0, fac = d_wg[ik * wg_nc + ib] * 1.0, ekb_now = d_ekb[ik * wg_nc + ib]; - const int Nprojs = atom_nh[it]; + FPTYPE stress_var = 0; + const FPTYPE fac = d_wg[ib]; + const FPTYPE ekb_now = d_ekb[ib]; + const int nproj = atom_nh[it]; for (int ia = 0; ia < atom_na[it]; ia++) { - for (int ii = threadIdx.x; ii < Nprojs * Nprojs; ii += blockDim.x) { - int ip1 = ii / Nprojs, ip2 = ii % Nprojs; + for (int ii = threadIdx.x; ii < nproj * nproj; ii += blockDim.x) { + const int ip1 = ii / nproj, ip2 = ii % nproj; if(!nondiagonal && ip1 != ip2) { continue; } - FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip1) * deeq_4 + ip2] + const FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip1) * deeq_4 + ip2] - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2]; const int inkb1 = sum + ip1; const int inkb2 = sum + ip2; @@ -149,7 +150,7 @@ __global__ void cal_stress_nl( stress_var -= ps * fac * dbb; } ++iat; - sum+=Nprojs; + sum+=nproj; }//ia __syncwarp(); warp_reduce(stress_var); @@ -204,8 +205,6 @@ void cal_stress_nl_op::operator()(const base_de const int& nbands_occ, const int& ntype, const int& spin, - const int& wg_nc, - const int& ik, const int& deeq_2, const int& deeq_3, const int& deeq_4, @@ -226,8 +225,6 @@ void cal_stress_nl_op::operator()(const base_de nkb, ntype, spin, - wg_nc, - ik, deeq_2, deeq_3, deeq_4, @@ -244,6 +241,110 @@ void cal_stress_nl_op::operator()(const base_de cudaCheckOnDebug(); } +template +__global__ void cal_stress_nl( + const int ipol, + const int jpol, + const int nkb, + const int ntype, + const int deeq_2, + const int deeq_3, + const int deeq_4, + const int *atom_nh, + const int *atom_na, + const FPTYPE *d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const thrust::complex *deeq_nc, + const thrust::complex *becp, + const thrust::complex *dbecp, + FPTYPE *stress) +{ + const int ib = blockIdx.x / ntype; // index of loop-nbands + const int ib2 = ib * 2; + const int it = blockIdx.x % ntype; // index of loop-ntype + + int iat = 0; // calculate the begin of atomic index + int sum = 0; // calculate the begin of atomic-orbital index + for (int ii = 0; ii < it; ii++) { + iat += atom_na[ii]; + sum += atom_na[ii] * atom_nh[ii]; + } + + FPTYPE stress_var = 0; + const FPTYPE fac = d_wg[ib]; + const FPTYPE ekb_now = d_ekb[ib]; + const int nproj = atom_nh[it]; + for (int ia = 0; ia < atom_na[it]; ia++) + { + for (int ii = threadIdx.x; ii < nproj * nproj; ii += blockDim.x) { + const int ip1 = ii / nproj; + const int ip2 = ii % nproj; + const thrust::complex ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2]; + const thrust::complex ps0 = deeq_nc[((iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq; + const thrust::complex ps1 = deeq_nc[((1 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2]; + const thrust::complex ps2 = deeq_nc[((2 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2]; + const thrust::complex ps3 = deeq_nc[((3 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq; + const int inkb1 = sum + ip1; + const int inkb2 = sum + ip2; + //out<<"\n ps = "< dbb0 = conj(dbecp[ib2 * nkb + inkb1]) * becp[ib2 * nkb + inkb2]; + const thrust::complex dbb1 = conj(dbecp[ib2 * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2]; + const thrust::complex dbb2 = conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[ib2 * nkb + inkb2]; + const thrust::complex dbb3 = conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2]; + stress_var -= fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real(); + } + ++iat; + sum+=nproj; + }//ia + __syncwarp(); + warp_reduce(stress_var); + if (threadIdx.x % WARP_SIZE == 0) { + atomicAdd(stress + ipol * 3 + jpol, stress_var); + } +} + +template +void cal_stress_nl_op::operator()(const base_device::DEVICE_GPU* ctx, + const int& ipol, + const int& jpol, + const int& nkb, + const int& nbands_occ, + const int& ntype, + const int& deeq_2, + const int& deeq_3, + const int& deeq_4, + const int* atom_nh, + const int* atom_na, + const FPTYPE* d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const std::complex* deeq_nc, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* stress) +{ + cal_stress_nl<<>>( + ipol, + jpol, + nkb, + ntype, + deeq_2, + deeq_3, + deeq_4, + atom_nh, + atom_na, + d_wg, + d_ekb, + qq_nt, + reinterpret_cast*>(deeq_nc), + reinterpret_cast*>(becp), + reinterpret_cast*>(dbecp), + stress);// array of data + + cudaCheckOnDebug(); +} + template void cal_stress_mgga_op::operator()( const int& spin, @@ -710,34 +811,6 @@ void cal_force_npw_op::operator()( } -// template -// void prepare_vkb_deri_ptr_op::operator()( -// const base_device::DEVICE_GPU* ctx, -// int nbeta, double* nhtol, int nhtol_nc, int npw, int it, -// int ipol, int jpol, -// std::complex*vkb_out, std::complex** vkb_ptrs, -// FPTYPE* ylm_in, FPTYPE** ylm_ptrs, -// FPTYPE* ylm_deri_in, FPTYPE** ylm_deri_ptr1s, FPTYPE** ylm_deri_ptr2s, -// FPTYPE* vq_in, FPTYPE** vq_ptrs, -// FPTYPE* vq_deri_in, FPTYPE** vq_deri_ptrs -// ) -// { -// const int block = (npw + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; -// dim3 gridsize(block,nbeta); - - -// prepare_vkb_deri_ptr<<<1,1>>>( -// nbeta, nhtol, nhtol_nc, npw, it, ipol, jpol, -// reinterpret_cast*>(vkb_out), -// reinterpret_cast*>(vkb_ptrs), -// ylm_in, ylm_ptrs, ylm_deri_in, ylm_deri_ptr1s, ylm_deri_ptr2s, -// vq_in, vq_ptrs, vq_deri_in, vq_deri_ptrs -// ); - -// return ; -// } - - template <> void pointer_array_malloc::operator()( void **ptr, diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp index 1584c5c681..2ee9cb4530 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp @@ -42,7 +42,6 @@ struct cal_force_nl_op void operator()(const base_device::DEVICE_CPU* ctx, const bool& nondiagonal, const int& nbands_occ, - const int& wg_nc, const int& ntype, const int& spin, const int& deeq_2, @@ -50,7 +49,6 @@ struct cal_force_nl_op const int& deeq_4, const int& forcenl_nc, const int& nbands, - const int& ik, const int& nkb, const int* atom_nh, const int* atom_na, @@ -71,7 +69,7 @@ struct cal_force_nl_op int sum0 = 0; for (int it = 0; it < ntype; it++) { - const int Nprojs = atom_nh[it]; + const int nproj = atom_nh[it]; #ifdef _OPENMP #pragma omp for collapse(2) #endif @@ -80,11 +78,11 @@ struct cal_force_nl_op for (int ib = 0; ib < nbands_occ; ib++) { FPTYPE local_force[3] = {0, 0, 0}; - FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba; - FPTYPE ekb_now = d_ekb[ik * wg_nc + ib]; + FPTYPE fac = d_wg[ib] * 2.0 * tpiba; + FPTYPE ekb_now = d_ekb[ib]; int iat = iat0 + ia; - int sum = sum0 + ia * Nprojs; - for (int ip = 0; ip < Nprojs; ip++) + int sum = sum0 + ia * nproj; + for (int ip = 0; ip < nproj; ip++) { // Effective values of the D-eS coefficients FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip] @@ -102,7 +100,7 @@ struct cal_force_nl_op } if (nondiagonal) { - for (int ip2 = 0; ip2 < Nprojs; ip2++) + for (int ip2 = 0; ip2 < nproj; ip2++) { if (ip != ip2) { @@ -141,7 +139,104 @@ struct cal_force_nl_op } } // end ia iat0 += atom_na[it]; - sum0 += atom_na[it] * Nprojs; + sum0 += atom_na[it] * nproj; + } // end it +#ifdef _OPENMP + } +#endif + }; + + void operator()(const base_device::DEVICE_CPU* ctx, + const int& nbands_occ, + const int& ntype, + const int& deeq_2, + const int& deeq_3, + const int& deeq_4, + const int& forcenl_nc, + const int& nbands, + const int& nkb, + const int* atom_nh, + const int* atom_na, + const FPTYPE& tpiba, + const FPTYPE* d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const std::complex* deeq_nc, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* force) + { +#ifdef _OPENMP +#pragma omp parallel + { +#endif + int iat0 = 0; + int sum0 = 0; + for (int it = 0; it < ntype; it++) + { + const int nprojs = atom_nh[it]; +#ifdef _OPENMP +#pragma omp for collapse(2) +#endif + for (int ia = 0; ia < atom_na[it]; ia++) + { + for (int ib = 0; ib < nbands_occ; ib++) + { + const int ib2 = ib*2; + FPTYPE local_force[3] = {0, 0, 0}; + FPTYPE fac = d_wg[ib] * 2.0 * tpiba; + FPTYPE ekb_now = d_ekb[ib]; + int iat = iat0 + ia; + int sum = sum0 + ia * nprojs; + for (int ip = 0; ip < nprojs; ip++) + { + const int inkb = sum + ip; + // out<<"\n ps = "< ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2]; + const int jnkb = sum + ip2; + std::complex ps0 = deeq_nc[((0 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq; + std::complex ps1 = deeq_nc[((1 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2]; + std::complex ps2 = deeq_nc[((2 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2]; + std::complex ps3 = deeq_nc[((3 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq; + + + for (int ipol = 0; ipol < 3; ipol++) + { + const int index0 = ipol * nbands * 2 * nkb + ib2 * nkb + inkb; + const int index1 = ib2 * nkb + jnkb; + const std::complex dbb0 = conj(dbecp[index0]) * becp[index1]; + const std::complex dbb1 = conj(dbecp[index0]) * becp[index1 + nkb]; + const std::complex dbb2 = conj(dbecp[index0 + nkb]) * becp[index1]; + const std::complex dbb3 = conj(dbecp[index0 + nkb]) * becp[index1 + nkb]; + + local_force[ipol] -= fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real(); + } + } + } +#ifdef _OPENMP + if (omp_get_num_threads() > 1) + { + for (int ipol = 0; ipol < 3; ++ipol) + { +#pragma omp atomic + force[iat * forcenl_nc + ipol] += local_force[ipol]; + } + } + else +#endif + { + for (int ipol = 0; ipol < 3; ++ipol) + { + force[iat * forcenl_nc + ipol] += local_force[ipol]; + } + } + } + } // end ia + iat0 += atom_na[it]; + sum0 += atom_na[it] * nprojs; } // end it #ifdef _OPENMP } diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h index 885997222c..c68df7bd0f 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h @@ -48,7 +48,6 @@ struct cal_force_nl_op /// @param ctx - which device this function runs on /// @param nondiagonal - control flag /// @param nbands_occ - number of occupied bands - /// @param wg_nc - the second dimension of matrix wg /// @param ntype - total atomic type /// @param spin - current spin /// @param deeq_2 - the second dimension of deeq @@ -56,7 +55,6 @@ struct cal_force_nl_op /// @param deeq_4 - the forth dimension of deeq /// @param forcenl_nc - the second dimension of matrix forcenl /// @param nbands - NBANDS - /// @param ik - current k point /// @param nkb - number of k point /// @param atom_nh - GlobalC::ucell.atoms[ii].ncpp.nh /// @param atom_na - GlobalC::ucell.atoms[ii].na @@ -73,7 +71,6 @@ struct cal_force_nl_op void operator()(const base_device::DEVICE_CPU* ctx, const bool& nondiagonal, const int& nbands_occ, - const int& wg_nc, const int& ntype, const int& spin, const int& deeq_2, @@ -81,7 +78,6 @@ struct cal_force_nl_op const int& deeq_4, const int& forcenl_nc, const int& nbands, - const int& ik, const int& nkb, const int* atom_nh, const int* atom_na, @@ -93,6 +89,26 @@ struct cal_force_nl_op const std::complex* becp, const std::complex* dbecp, FPTYPE* force); + // interface for nspin=4 only + void operator()(const base_device::DEVICE_CPU* ctx, + const int& nbands_occ, + const int& ntype, + const int& deeq_2, + const int& deeq_3, + const int& deeq_4, + const int& forcenl_nc, + const int& nbands, + const int& nkb, + const int* atom_nh, + const int* atom_na, + const FPTYPE& tpiba, + const FPTYPE* d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const std::complex* deeq_nc, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* force); }; #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM @@ -117,7 +133,6 @@ struct cal_force_nl_op void operator()(const base_device::DEVICE_GPU* ctx, const bool& nondiagonal, const int& nbands_occ, - const int& wg_nc, const int& ntype, const int& spin, const int& deeq_2, @@ -125,7 +140,6 @@ struct cal_force_nl_op const int& deeq_4, const int& forcenl_nc, const int& nbands, - const int& ik, const int& nkb, const int* atom_nh, const int* atom_na, @@ -137,6 +151,26 @@ struct cal_force_nl_op const std::complex* becp, const std::complex* dbecp, FPTYPE* force); + // interface for nspin=4 only + void operator()(const base_device::DEVICE_GPU* ctx, + const int& nbands_occ, + const int& ntype, + const int& deeq_2, + const int& deeq_3, + const int& deeq_4, + const int& forcenl_nc, + const int& nbands, + const int& nkb, + const int* atom_nh, + const int* atom_na, + const FPTYPE& tpiba, + const FPTYPE* d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const std::complex* deeq_nc, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* force); }; /** diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu index fa03fa2a5d..da45a375e7 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu @@ -31,7 +31,6 @@ __global__ void cal_vkb1_nl( template __global__ void cal_force_nl( const bool nondiagonal, - const int wg_nc, const int ntype, const int spin, const int deeq_2, @@ -39,7 +38,6 @@ __global__ void cal_force_nl( const int deeq_4, const int forcenl_nc, const int nbands, - const int ik, const int nkb, const int *atom_nh, const int *atom_na, @@ -55,17 +53,18 @@ __global__ void cal_force_nl( const int ib = blockIdx.x / ntype; const int it = blockIdx.x % ntype; - int iat = 0, sum = 0; + int iat = 0; + int sum = 0; for (int ii = 0; ii < it; ii++) { iat += atom_na[ii]; sum += atom_na[ii] * atom_nh[ii]; } - int Nprojs = atom_nh[it]; - FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba; - FPTYPE ekb_now = d_ekb[ik * wg_nc + ib]; + int nproj = atom_nh[it]; + FPTYPE fac = d_wg[ib] * 2.0 * tpiba; + FPTYPE ekb_now = d_ekb[ib]; for (int ia = 0; ia < atom_na[it]; ia++) { - for (int ip = threadIdx.x; ip < Nprojs; ip += blockDim.x) { + for (int ip = threadIdx.x; ip < nproj; ip += blockDim.x) { // FPTYPE ps = GlobalC::ppcell.deeq[spin, iat, ip, ip]; FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip] - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip]; @@ -81,8 +80,8 @@ __global__ void cal_force_nl( } if (nondiagonal) { - //for (int ip2=0; ip2 void cal_force_nl_op::operator()(const base_device::DEVICE_GPU* ctx, const bool& nondiagonal, const int& nbands_occ, - const int& wg_nc, const int& ntype, const int& spin, const int& deeq_2, @@ -138,7 +136,6 @@ void cal_force_nl_op::operator()(const base_dev const int& deeq_4, const int& forcenl_nc, const int& nbands, - const int& ik, const int& nkb, const int* atom_nh, const int* atom_na, @@ -153,9 +150,9 @@ void cal_force_nl_op::operator()(const base_dev { hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_force_nl), dim3(nbands_occ * ntype), dim3(THREADS_PER_BLOCK), 0, 0, nondiagonal, - wg_nc, ntype, spin, + ntype, spin, deeq_2, deeq_3, deeq_4, - forcenl_nc, nbands, ik, nkb, + forcenl_nc, nbands, nkb, atom_nh, atom_na, tpiba, d_wg, d_ekb, qq_nt, deeq, @@ -166,6 +163,107 @@ void cal_force_nl_op::operator()(const base_dev hipCheckOnDebug(); } +template +__global__ void cal_force_nl( + const int ntype, + const int deeq_2, + const int deeq_3, + const int deeq_4, + const int forcenl_nc, + const int nbands, + const int nkb, + const int *atom_nh, + const int *atom_na, + const FPTYPE tpiba, + const FPTYPE *d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const thrust::complex *deeq_nc, + const thrust::complex *becp, + const thrust::complex *dbecp, + FPTYPE *force) +{ + const int ib = blockIdx.x / ntype; + const int ib2 = ib * 2; + const int it = blockIdx.x % ntype; + + int iat = 0; + int sum = 0; + for (int ii = 0; ii < it; ii++) { + iat += atom_na[ii]; + sum += atom_na[ii] * atom_nh[ii]; + } + + int nproj = atom_nh[it]; + FPTYPE fac = d_wg[ib] * 2.0 * tpiba; + FPTYPE ekb_now = d_ekb[ib]; + for (int ia = 0; ia < atom_na[it]; ia++) { + for (int ip = threadIdx.x; ip < nproj; ip += blockDim.x) { + const int inkb = sum + ip; + for (int ip2 = 0; ip2 < nproj; ip2++) + { + // Effective values of the D-eS coefficients + const thrust::complex ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2]; + const int jnkb = sum + ip2; + const thrust::complex ps0 = deeq_nc[((0 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq; + const thrust::complex ps1 = deeq_nc[((1 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2]; + const thrust::complex ps2 = deeq_nc[((2 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2]; + const thrust::complex ps3 = deeq_nc[((3 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq; + + for (int ipol = 0; ipol < 3; ipol++) { + const int index0 = ipol * nbands * 2 * nkb + ib2 * nkb + inkb; + const int index1 = ib2 * nkb + jnkb; + const thrust::complex dbb0 = conj(dbecp[index0]) * becp[index1]; + const thrust::complex dbb1 = conj(dbecp[index0]) * becp[index1 + nkb]; + const thrust::complex dbb2 = conj(dbecp[index0 + nkb]) * becp[index1]; + const thrust::complex dbb3 = conj(dbecp[index0 + nkb]) * becp[index1 + nkb]; + const FPTYPE tmp = - fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real(); + atomicAdd(force + iat * forcenl_nc + ipol, tmp); + } + } + } + iat += 1; + sum += nproj; + } +} + +// interface for nspin=4 only +template +void cal_force_nl_op::operator()(const base_device::DEVICE_GPU* ctx, + const int& nbands_occ, + const int& ntype, + const int& deeq_2, + const int& deeq_3, + const int& deeq_4, + const int& forcenl_nc, + const int& nbands, + const int& nkb, + const int* atom_nh, + const int* atom_na, + const FPTYPE& tpiba, + const FPTYPE* d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const std::complex* deeq_nc, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* force) +{ + hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_force_nl), dim3(nbands_occ * ntype), dim3(THREADS_PER_BLOCK), 0, 0, + ntype, + deeq_2, deeq_3, deeq_4, + forcenl_nc, nbands, nkb, + atom_nh, atom_na, + tpiba, + d_wg, d_ekb, qq_nt, + reinterpret_cast*>(deeq_nc), + reinterpret_cast*>(becp), + reinterpret_cast*>(dbecp), + force);// array of data + + hipCheckOnDebug(); +} + template __global__ void saveVkbValues_( const int *gcar_zero_ptrs, diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/nonlocal_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/nonlocal_op.hip.cu index 93df9fe58d..9a43f64ea6 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/nonlocal_op.hip.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/nonlocal_op.hip.cu @@ -54,8 +54,8 @@ __global__ void nonlocal_pw( thrust::complex* ps, const thrust::complex* becp) { - const int ii = blockIdx.x / l2; - const int jj = blockIdx.x % l2; + const int ii = blockIdx.x * 2 / l2; + const int jj = blockIdx.x * 2 % l2; for (int kk = threadIdx.x; kk < l3; kk += blockDim.x) { thrust::complex res1(0.0, 0.0); thrust::complex res2(0.0, 0.0); @@ -122,7 +122,7 @@ void hamilt::nonlocal_pw_op::operator()(const b { // denghui implement 20221109 // <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - hipLaunchKernelGGL(HIP_KERNEL_NAME(nonlocal_pw), dim3(l1 * l2), dim3(THREADS_PER_BLOCK), 0, 0, + hipLaunchKernelGGL(HIP_KERNEL_NAME(nonlocal_pw), dim3(l1 * l2 / 2), dim3(THREADS_PER_BLOCK), 0, 0, l1, l2, l3, // loop size sum, iat, nkb, // control params deeq_x, deeq_y, deeq_z, @@ -140,4 +140,4 @@ void hamilt::nonlocal_pw_op::operator()(const b namespace hamilt{ template struct nonlocal_pw_op; template struct nonlocal_pw_op; -} \ No newline at end of file +} diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu index 30a735ee57..6880987546 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu @@ -102,8 +102,6 @@ __global__ void cal_stress_nl( const int nkb, const int ntype, const int spin, - const int wg_nc, - const int ik, const int deeq_2, const int deeq_3, const int deeq_4, @@ -120,18 +118,21 @@ __global__ void cal_stress_nl( int ib = blockIdx.x / ntype; int it = blockIdx.x % ntype; - int iat = 0, sum = 0; + int iat = 0; + int sum = 0; for (int ii = 0; ii < it; ii++) { iat += atom_na[ii]; sum += atom_na[ii] * atom_nh[ii]; } - FPTYPE stress_var = 0, fac = d_wg[ik * wg_nc + ib] * 1.0, ekb_now = d_ekb[ik * wg_nc + ib]; - const int Nprojs = atom_nh[it]; + FPTYPE stress_var = 0; + FPTYPE fac = d_wg[ib]; + FPTYPE ekb_now = d_ekb[ib]; + const int nproj = atom_nh[it]; for (int ia = 0; ia < atom_na[it]; ia++) { - for (int ii = threadIdx.x; ii < Nprojs * Nprojs; ii += blockDim.x) { - int ip1 = ii / Nprojs, ip2 = ii % Nprojs; + for (int ii = threadIdx.x; ii < nproj * nproj; ii += blockDim.x) { + int ip1 = ii / nproj, ip2 = ii % nproj; if(!nondiagonal && ip1 != ip2) { continue; } @@ -144,7 +145,7 @@ __global__ void cal_stress_nl( stress_var -= ps * fac * dbb; } ++iat; - sum+=Nprojs; + sum+=nproj; }//ia warp_reduce(stress_var); if (threadIdx.x % WARP_SIZE == 0) { @@ -198,8 +199,6 @@ void cal_stress_nl_op::operator()(const base_de const int& nbands_occ, const int& ntype, const int& spin, - const int& wg_nc, - const int& ik, const int& deeq_2, const int& deeq_3, const int& deeq_4, @@ -220,8 +219,6 @@ void cal_stress_nl_op::operator()(const base_de nkb, ntype, spin, - wg_nc, - ik, deeq_2, deeq_3, deeq_4, @@ -238,6 +235,110 @@ void cal_stress_nl_op::operator()(const base_de hipCheckOnDebug(); } +template +__global__ void cal_stress_nl( + const int ipol, + const int jpol, + const int nkb, + const int ntype, + const int deeq_2, + const int deeq_3, + const int deeq_4, + const int *atom_nh, + const int *atom_na, + const FPTYPE *d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const thrust::complex *deeq_nc, + const thrust::complex *becp, + const thrust::complex *dbecp, + FPTYPE *stress) +{ + int ib = blockIdx.x / ntype; + const int ib2 = ib * 2; + int it = blockIdx.x % ntype; + + int iat = 0; + int sum = 0; + for (int ii = 0; ii < it; ii++) { + iat += atom_na[ii]; + sum += atom_na[ii] * atom_nh[ii]; + } + + FPTYPE stress_var = 0; + FPTYPE fac = d_wg[ib]; + FPTYPE ekb_now = d_ekb[ib]; + const int nproj = atom_nh[it]; + for (int ia = 0; ia < atom_na[it]; ia++) + { + for (int ii = threadIdx.x; ii < nproj * nproj; ii += blockDim.x) { + const int ip1 = ii / nproj; + const int ip2 = ii % nproj; + const thrust::complex ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2]; + const thrust::complex ps0 = deeq_nc[((iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq; + const thrust::complex ps1 = deeq_nc[((1 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2]; + const thrust::complex ps2 = deeq_nc[((2 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2]; + const thrust::complex ps3 = deeq_nc[((3 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq; + const int inkb1 = sum + ip1; + const int inkb2 = sum + ip2; + //out<<"\n ps = "< dbb0 = conj(dbecp[ib2 * nkb + inkb1]) * becp[ib2 * nkb + inkb2]; + const thrust::complex dbb1 = conj(dbecp[ib2 * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2]; + const thrust::complex dbb2 = conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[ib2 * nkb + inkb2]; + const thrust::complex dbb3 = conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2]; + stress_var -= fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real(); + } + ++iat; + sum+=nproj; + }//ia + __syncwarp(); + warp_reduce(stress_var); + if (threadIdx.x % WARP_SIZE == 0) { + atomicAdd(stress + ipol * 3 + jpol, stress_var); + } +} + +template +void cal_stress_nl_op::operator()(const base_device::DEVICE_GPU* ctx, + const int& ipol, + const int& jpol, + const int& nkb, + const int& nbands_occ, + const int& ntype, + const int& deeq_2, + const int& deeq_3, + const int& deeq_4, + const int* atom_nh, + const int* atom_na, + const FPTYPE* d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const std::complex* deeq_nc, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* stress) +{ + hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_stress_nl), dim3(nbands_occ * ntype), dim3(THREADS_PER_BLOCK), 0, 0, + ipol, + jpol, + nkb, + ntype, + deeq_2, + deeq_3, + deeq_4, + atom_nh, + atom_na, + d_wg, + d_ekb, + qq_nt, + reinterpret_cast*>(deeq_nc), + reinterpret_cast*>(becp), + reinterpret_cast*>(dbecp), + stress);// array of data + + hipCheckOnDebug(); +} + template void cal_stress_mgga_op::operator()( const int& spin, diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp index c3a8008e04..f36f2e8316 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp @@ -77,8 +77,6 @@ struct cal_stress_nl_op const int& nbands_occ, const int& ntype, const int& spin, - const int& wg_nc, - const int& ik, const int& deeq_2, const int& deeq_3, const int& deeq_4, @@ -97,10 +95,11 @@ struct cal_stress_nl_op #pragma omp parallel reduction(+ : local_stress) { #endif - int iat = 0, sum = 0; + int iat = 0; + int sum = 0; for (int it = 0; it < ntype; it++) { - const int Nprojs = atom_nh[it]; + const int nproj = atom_nh[it]; #ifdef _OPENMP #pragma omp for collapse(4) #endif @@ -108,20 +107,20 @@ struct cal_stress_nl_op { for (int ia = 0; ia < atom_na[it]; ia++) { - for (int ip1 = 0; ip1 < Nprojs; ip1++) + for (int ip1 = 0; ip1 < nproj; ip1++) { - for (int ip2 = 0; ip2 < Nprojs; ip2++) + for (int ip2 = 0; ip2 < nproj; ip2++) { if (!nondiagonal && ip1 != ip2) { continue; } - FPTYPE fac = d_wg[ik * wg_nc + ib] * 1.0; - FPTYPE ekb_now = d_ekb[ik * wg_nc + ib]; + FPTYPE fac = d_wg[ib] * 1.0; + FPTYPE ekb_now = d_ekb[ib]; FPTYPE ps = deeq[((spin * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2]; - const int inkb1 = sum + ia * Nprojs + ip1; - const int inkb2 = sum + ia * Nprojs + ip2; + const int inkb1 = sum + ia * nproj + ip1; + const int inkb2 = sum + ia * nproj + ip2; // out<<"\n ps = "< } // end ip } // ia } - sum += atom_na[it] * Nprojs; + sum += atom_na[it] * nproj; + iat += atom_na[it]; + } // end it +#ifdef _OPENMP + } +#endif + stress[ipol * 3 + jpol] += local_stress; + }; + + void operator()(const base_device::DEVICE_CPU* ctx, + const int& ipol, + const int& jpol, + const int& nkb, + const int& nbands_occ, + const int& ntype, + const int& deeq_2, + const int& deeq_3, + const int& deeq_4, + const int* atom_nh, + const int* atom_na, + const FPTYPE* d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const std::complex* deeq_nc, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* stress) + { + FPTYPE local_stress = 0; +#ifdef _OPENMP +#pragma omp parallel reduction(+ : local_stress) + { +#endif + int iat = 0, sum = 0; + for (int it = 0; it < ntype; it++) + { + const int nproj = atom_nh[it]; +#ifdef _OPENMP +#pragma omp for collapse(4) +#endif + for (int ib = 0; ib < nbands_occ; ib++) + { + for (int ia = 0; ia < atom_na[it]; ia++) + { + for (int ip1 = 0; ip1 < nproj; ip1++) + { + for (int ip2 = 0; ip2 < nproj; ip2++) + { + const int ib2 = ib*2; + FPTYPE fac = d_wg[ib] * 1.0; + FPTYPE ekb_now = d_ekb[ib]; + std::complex ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2]; + std::complex ps0 = deeq_nc[((iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq; + std::complex ps1 = deeq_nc[((1 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2]; + std::complex ps2 = deeq_nc[((2 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2]; + std::complex ps3 = deeq_nc[((3 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq; + const int inkb1 = sum + ia * nproj + ip1; + const int inkb2 = sum + ia * nproj + ip2; + // out<<"\n ps = "< dbb0 = conj(dbecp[ib2 * nkb + inkb1]) * becp[ib2 * nkb + inkb2]; + const std::complex dbb1 = conj(dbecp[ib2 * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2]; + const std::complex dbb2 = conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[ib2 * nkb + inkb2]; + const std::complex dbb3 = conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2]; + local_stress -= fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real(); + } + } // end ip + } // ia + } + sum += atom_na[it] * nproj; iat += atom_na[it]; } // end it #ifdef _OPENMP diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h index 3603b16cac..2f4bbd0989 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h @@ -66,8 +66,6 @@ struct cal_stress_nl_op /// @param nbands_occ - number of occupied bands /// @param ntype - total atomic type /// @param spin - current spin - /// @param wg_nc - the second dimension of matrix wg - /// @param ik - current k point /// @param deeq_2 - the second dimension of deeq /// @param deeq_3 - the third dimension of deeq /// @param deeq_4 - the forth dimension of deeq @@ -90,8 +88,6 @@ struct cal_stress_nl_op const int& nbands_occ, const int& ntype, const int& spin, - const int& wg_nc, - const int& ik, const int& deeq_2, const int& deeq_3, const int& deeq_4, @@ -104,6 +100,25 @@ struct cal_stress_nl_op const std::complex* becp, const std::complex* dbecp, FPTYPE* stress); + // interface for nspin=4 only + void operator()(const Device* ctx, + const int& ipol, + const int& jpol, + const int& nkb, + const int& nbands_occ, + const int& ntype, + const int& deeq_2, + const int& deeq_3, + const int& deeq_4, + const int* atom_nh, + const int* atom_na, + const FPTYPE* d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const std::complex* deeq_nc, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* stress); }; template @@ -237,8 +252,6 @@ struct cal_stress_nl_op const int& nbands_occ, const int& ntype, const int& spin, - const int& wg_nc, - const int& ik, const int& deeq_2, const int& deeq_3, const int& deeq_4, @@ -251,6 +264,25 @@ struct cal_stress_nl_op const std::complex* becp, const std::complex* dbecp, FPTYPE* stress); + // interface for nspin=4 only + void operator()(const base_device::DEVICE_GPU* ctx, + const int& ipol, + const int& jpol, + const int& nkb, + const int& nbands_occ, + const int& ntype, + const int& deeq_2, + const int& deeq_3, + const int& deeq_4, + const int* atom_nh, + const int* atom_na, + const FPTYPE* d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const std::complex* deeq_nc, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* stress); }; // cpu version first, gpu version later diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/force_op_test.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/force_op_test.cpp index a791fd933b..3da3ca3b19 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/force_op_test.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/force_op_test.cpp @@ -2887,7 +2887,6 @@ TEST_F(TestSrcPWForceMultiDevice, cal_force_nl_op_cpu) hamilt::cal_force_nl_op()(cpu_ctx, multi_proj, nbands_occ, - wg_nc, ntype, spin, deeq_2, @@ -2895,7 +2894,6 @@ TEST_F(TestSrcPWForceMultiDevice, cal_force_nl_op_cpu) deeq_4, forcenl_nc, nbands, - ik, nkb, atom_nh.data(), atom_na.data(), @@ -2980,7 +2978,6 @@ TEST_F(TestSrcPWForceMultiDevice, cal_force_nl_op_gpu) hamilt::cal_force_nl_op()(gpu_ctx, multi_proj, nbands_occ, - wg_nc, ntype, spin, deeq_2, @@ -2988,7 +2985,6 @@ TEST_F(TestSrcPWForceMultiDevice, cal_force_nl_op_gpu) deeq_4, forcenl_nc, nbands, - ik, nkb, d_atom_nh, d_atom_na, diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/stress_op_test.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/stress_op_test.cpp index 81cd5de858..4f84fa098d 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/stress_op_test.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/stress_op_test.cpp @@ -96,8 +96,6 @@ TEST(TestSrcPWStressMultiDevice, cal_stress_nl_op_cpu) nbands_occ, ntype, spin, - wg_nc, - ik, deeq_2, deeq_3, deeq_4, @@ -271,8 +269,6 @@ TEST(TestSrcPWStressMultiDevice, cal_stress_nl_op_gpu) nbands_occ, ntype, spin, - wg_nc, - ik, deeq_2, deeq_3, deeq_4, diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp index 48a212d3ef..489e1fe0f8 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp @@ -30,8 +30,9 @@ void Stress_Func::stress_kin(ModuleBase::matrix& sigma, int npwx=0; for (int ik = 0; ik < p_kv->get_nks(); ik++) { - if (npwx < p_kv->ngk[ik]) + if (npwx < p_kv->ngk[ik]) { npwx = p_kv->ngk[ik]; +} } gk[0]= new FPTYPE[npwx]; @@ -40,6 +41,7 @@ void Stress_Func::stress_kin(ModuleBase::matrix& sigma, FPTYPE tpiba = ModuleBase::TWO_PI / GlobalC::ucell.lat0; FPTYPE twobysqrtpi = 2.0 / std::sqrt(ModuleBase::PI); FPTYPE* kfac = new FPTYPE[npwx]; + const int npol = GlobalC::ucell.get_npol(); for (int ik = 0; ik < p_kv->get_nks(); ik++) { @@ -72,8 +74,8 @@ void Stress_Func::stress_kin(ModuleBase::matrix& sigma, { for (int ibnd = 0; ibnd < PARAM.inp.nbands; ibnd++) { - if (std::fabs(wg(ik, ibnd)) < ModuleBase::threshold_wg * wg(ik, 0)) - continue; + if (wg(ik, ibnd) == 0.0) { continue; +} const std::complex* ppsi = nullptr; ppsi = &(psi_in[0](ik, ibnd, 0)); @@ -86,6 +88,18 @@ void Stress_Func::stress_kin(ModuleBase::matrix& sigma, sum += wg(ik, ibnd) * gk[l][i] * gk[m][i] * kfac[i] * (FPTYPE((conj(ppsi[i]) * ppsi[i]).real())); } + if(npol == 2) + { + ppsi = &(psi_in[0](ik, ibnd, npwx)); +#ifdef _OPENMP +#pragma omp parallel for reduction(+ : sum) +#endif + for (int i = 0; i < npw; i++) + { + sum += wg(ik, ibnd) * gk[l][i] * gk[m][i] * kfac[i] + * (FPTYPE((conj(ppsi[i]) * ppsi[i]).real())); + } + } s_kin[l][m] += sum; } } diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp index 7a0d614f6f..d214c99ba1 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp @@ -23,10 +23,10 @@ void Stress_Func::stress_loc(ModuleBase::matrix& sigma, const int nspin_rho = (PARAM.inp.nspin == 2) ? 2 : 1; - if (PARAM.globalv.gamma_only_pw && is_pw) { fact=2.0; -} - - + if (PARAM.inp.gamma_only && is_pw) + { + fact=2.0; + } std::vector> aux(rho_basis->nmaxgr); @@ -52,11 +52,11 @@ void Stress_Func::stress_loc(ModuleBase::matrix& sigma, aux[ir] = std::complex(chr->rho[0][ir], 0.0 ); } } - for (int is = 1; is < nspin_rho; is++) + if(nspin_rho == 2) { for (int ir = irb; ir < ir_end; ++ir) { // accumulate aux - aux[ir] += std::complex(chr->rho[is][ir], 0.0 ); + aux[ir] += std::complex(chr->rho[1][ir], 0.0 ); } } } diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func_nl.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_func_nl.cpp index d3cbb06885..5b177b37b6 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/stress_func_nl.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func_nl.cpp @@ -49,7 +49,7 @@ void Stress_Func::stress_nl(ModuleBase::matrix& sigma, break; } } - const int npm = ucell_in.get_npol() * nbands_occ; + const int npm = nbands_occ; // calculate becp = for all beta functions nl_tools.cal_becp(ik, npm); diff --git a/tests/integrate/104_PW_NC_magnetic/INPUT b/tests/integrate/104_PW_NC_magnetic/INPUT index 622af75b86..d761feb2c1 100644 --- a/tests/integrate/104_PW_NC_magnetic/INPUT +++ b/tests/integrate/104_PW_NC_magnetic/INPUT @@ -4,10 +4,9 @@ suffix autotest #nbands 40 calculation scf -nspin 2 -ecutwfc 15 -scf_thr 1.0e-8 -scf_nmax 200 +ecutwfc 20 +scf_thr 1.0e-6 +scf_nmax 50 out_chg 0 #init_chg file @@ -16,14 +15,17 @@ out_chg 0 #out_band 1 noncolin 1 smearing_method gaussian -smearing_sigma 0.07 +smearing_sigma 0.02 mixing_type broyden -mixing_beta 0.5 +mixing_beta 0.2 mixing_ndim 10 -ks_solver cg +ks_solver dav_subspace +pw_diag_ndim 2 basis_type pw -symmetry 1 +symmetry 0 +cal_force 1 +cal_stress 1 pseudo_dir ../../PP_ORB orbital_dir ../../PP_ORB diff --git a/tests/integrate/104_PW_NC_magnetic/KPT b/tests/integrate/104_PW_NC_magnetic/KPT index db31432445..28006d5e2d 100644 --- a/tests/integrate/104_PW_NC_magnetic/KPT +++ b/tests/integrate/104_PW_NC_magnetic/KPT @@ -1,4 +1,4 @@ K_POINTS 0 Gamma -1 1 1 0 0 0 +2 2 2 0 0 0 diff --git a/tests/integrate/104_PW_NC_magnetic/STRU b/tests/integrate/104_PW_NC_magnetic/STRU index d1981a2504..5b61e75c50 100644 --- a/tests/integrate/104_PW_NC_magnetic/STRU +++ b/tests/integrate/104_PW_NC_magnetic/STRU @@ -18,5 +18,5 @@ Fe 1.0 2 0.00 0.00 0.00 1 1 1 angle1 90 angle2 0 -0.50 0.50 0.50 1 1 1 angle1 90 angle2 180 +0.51 0.52 0.53 1 1 1 angle1 90 angle2 0 diff --git a/tests/integrate/104_PW_NC_magnetic/jd b/tests/integrate/104_PW_NC_magnetic/jd new file mode 100644 index 0000000000..a4292b9c28 --- /dev/null +++ b/tests/integrate/104_PW_NC_magnetic/jd @@ -0,0 +1 @@ +Test noncolliner-spin calculation for FM-Fe2 , test force and stress for k-222 diff --git a/tests/integrate/104_PW_NC_magnetic/result.ref b/tests/integrate/104_PW_NC_magnetic/result.ref index 384804bb8d..21c5c36613 100644 --- a/tests/integrate/104_PW_NC_magnetic/result.ref +++ b/tests/integrate/104_PW_NC_magnetic/result.ref @@ -1,6 +1,5 @@ -etotref -5866.197307043378 -etotperatomref -2933.0986535217 -pointgroupref O_h -spacegroupref O_h -nksibzref 1 -totaltimeref 2.23 +etotref -6156.9375288905130219 +etotperatomref -3078.4687644453 +totalforceref 2.221910 +totalstressref 76009.325784 +totaltimeref 3.03 diff --git a/tests/integrate/140_PW_15_SO/INPUT b/tests/integrate/140_PW_15_SO/INPUT index fc09132c7c..1c58840435 100644 --- a/tests/integrate/140_PW_15_SO/INPUT +++ b/tests/integrate/140_PW_15_SO/INPUT @@ -25,12 +25,16 @@ ecutwfc 20 scf_thr 1e-6 scf_nmax 100 +cal_force 1 +cal_stress 1 + #cal_stress 1 #noncolin 1 lspinorb 1 basis_type pw -ks_solver dav +ks_solver dav_subspace +pw_diag_ndim 2 chg_extrap second-order out_dm 0 pw_diag_thr 0.00001 diff --git a/tests/integrate/140_PW_15_SO/KPT b/tests/integrate/140_PW_15_SO/KPT index c289c0158a..e769af7638 100644 --- a/tests/integrate/140_PW_15_SO/KPT +++ b/tests/integrate/140_PW_15_SO/KPT @@ -1,4 +1,4 @@ K_POINTS 0 Gamma -1 1 1 0 0 0 +2 1 1 0 0 0 diff --git a/tests/integrate/140_PW_15_SO/result.ref b/tests/integrate/140_PW_15_SO/result.ref index 2f6201a31f..6bb325aacb 100644 --- a/tests/integrate/140_PW_15_SO/result.ref +++ b/tests/integrate/140_PW_15_SO/result.ref @@ -1,3 +1,5 @@ -etotref -1668.616833271131 -etotperatomref -834.3084166356 -totaltimeref 5.15 +etotref -1678.3650981686614614 +etotperatomref -839.1825490843 +totalforceref 1.739332 +totalstressref 34372.194072 +totaltimeref 0.99