diff --git a/source/Makefile.Objects b/source/Makefile.Objects index 46aa744f24..d0a42b1c1e 100644 --- a/source/Makefile.Objects +++ b/source/Makefile.Objects @@ -332,6 +332,8 @@ OBJS_HAMILT=hamilt_pw.o\ op_exx_pw.o\ ekinetic_pw.o\ ekinetic_op.o\ + exx_pw_ace.o\ + exx_pw_pot.o\ hpsi_norm_op.o\ veff_pw.o\ veff_op.o\ diff --git a/source/source_cell/pseudo.cpp b/source/source_cell/pseudo.cpp index 233632069d..c6dc9de2f7 100644 --- a/source/source_cell/pseudo.cpp +++ b/source/source_cell/pseudo.cpp @@ -1,5 +1,6 @@ #include "pseudo.h" #include "source_base/tool_title.h" +#include pseudo::pseudo() { diff --git a/source/source_esolver/esolver_ks_pw.cpp b/source/source_esolver/esolver_ks_pw.cpp index 6ed705eb1c..059c49212d 100644 --- a/source/source_esolver/esolver_ks_pw.cpp +++ b/source/source_esolver/esolver_ks_pw.cpp @@ -645,8 +645,9 @@ void ESolver_KS_PW::iter_finish(UnitCell& ucell, const int istep, int //---------------------------------------------------------- // 3) Print out electronic wavefunctions in pw basis //---------------------------------------------------------- - if (iter % PARAM.inp.out_freq_elec == 0 || iter == PARAM.inp.scf_nmax || conv_esolver) + if (iter % PARAM.inp.out_freq_elec == 0 || iter == PARAM.inp.scf_nmax) { + // conv_esolver == true has already been dealt with in after_scf ModuleIO::write_wfc_pw(GlobalV::KPAR, GlobalV::MY_POOL, GlobalV::MY_RANK, diff --git a/source/source_pw/module_pwdft/CMakeLists.txt b/source/source_pw/module_pwdft/CMakeLists.txt index 03e808f6e6..81ad6ddc2c 100644 --- a/source/source_pw/module_pwdft/CMakeLists.txt +++ b/source/source_pw/module_pwdft/CMakeLists.txt @@ -10,6 +10,8 @@ list(APPEND objects operator_pw/operator_pw.cpp operator_pw/onsite_proj_pw.cpp operator_pw/op_exx_pw.cpp + operator_pw/exx_pw_ace.cpp + operator_pw/exx_pw_pot.cpp forces_nl.cpp forces_cc.cpp forces_scc.cpp diff --git a/source/source_pw/module_pwdft/operator_pw/CMakeLists.txt b/source/source_pw/module_pwdft/operator_pw/CMakeLists.txt index 
e4f2ef4f10..0a0e923ee0 100644 --- a/source/source_pw/module_pwdft/operator_pw/CMakeLists.txt +++ b/source/source_pw/module_pwdft/operator_pw/CMakeLists.txt @@ -6,6 +6,9 @@ list(APPEND operator_ks_pw_srcs meta_pw.cpp velocity_pw.cpp onsite_proj_pw.cpp + op_exx_pw.cpp + exx_pw_ace.cpp + exx_pw_pot.cpp ) # this library is included in module_pwdft now diff --git a/source/source_pw/module_pwdft/operator_pw/exx_pw_ace.cpp b/source/source_pw/module_pwdft/operator_pw/exx_pw_ace.cpp new file mode 100644 index 0000000000..e7a4eb6ef1 --- /dev/null +++ b/source/source_pw/module_pwdft/operator_pw/exx_pw_ace.cpp @@ -0,0 +1,247 @@ +#include "op_exx_pw.h" + +namespace hamilt +{ +template +void OperatorEXXPW::act_op_ace(const int nbands, + const int nbasis, + const int npol, + const T *tmpsi_in, + T *tmhpsi, + const int ngk_ik, + const bool is_first_node) const +{ + ModuleBase::timer::tick("OperatorEXXPW", "act_op_ace"); + // std::cout << "act_op_ace" << std::endl; + // hpsi += -Xi^\dagger * Xi * psi + T* Xi_ace = Xi_ace_k[this->ik]; + int nbands_tot = psi.get_nbands(); + int nbasis_max = psi.get_nbasis(); + // T* hpsi = nullptr; + // resmem_complex_op()(hpsi, nbands_tot * nbasis); + // setmem_complex_op()(hpsi, 0, nbands_tot * nbasis); + T* Xi_psi = nullptr; + resmem_complex_op()(Xi_psi, nbands_tot * nbands); + setmem_complex_op()(Xi_psi, 0, nbands_tot * nbands); + + char trans_N = 'N', trans_T = 'T', trans_C = 'C'; + T intermediate_one = 1.0, intermediate_zero = 0.0, intermediate_minus_one = -1.0; + // Xi * psi + gemm_complex_op()(trans_N, + trans_N, + nbands_tot, + nbands, + nbasis, + &intermediate_one, + Xi_ace, + nbands_tot, + tmpsi_in, + nbasis, + &intermediate_zero, + Xi_psi, + nbands_tot + ); + + Parallel_Reduce::reduce_pool(Xi_psi, nbands_tot * nbands); + + // Xi^\dagger * (Xi * psi) + gemm_complex_op()(trans_C, + trans_N, + nbasis, + nbands, + nbands_tot, + &intermediate_minus_one, + Xi_ace, + nbands_tot, + Xi_psi, + nbands_tot, + &intermediate_one, + tmhpsi, + nbasis + 
); + + + // // negative sign, add to hpsi + // vec_add_vec_complex_op()(this->ctx, nbands * nbasis, tmhpsi, hpsi, -1, tmhpsi, 1); + // delmem_complex_op()(hpsi); + delmem_complex_op()(Xi_psi); + ModuleBase::timer::tick("OperatorEXXPW", "act_op_ace"); + +} + +template +void OperatorEXXPW::construct_ace() const +{ + ModuleBase::timer::tick("OperatorEXXPW", "construct_ace"); + // int nkb = p_exx_helper->psi.get_nbands() * p_exx_helper->psi.get_nk(); + int nbands = psi.get_nbands(); + int nbasis = psi.get_nbasis(); + int nk = psi.get_nk(); + + int ik_save = this->ik; + int * ik_ = const_cast(&this->ik); + + T intermediate_one = 1.0, intermediate_zero = 0.0; + + if (h_psi_ace == nullptr) + { + resmem_complex_op()(h_psi_ace, nbands * nbasis); + setmem_complex_op()(h_psi_ace, 0, nbands * nbasis); + } + + if (Xi_ace_k.size() != nk) + { + Xi_ace_k.resize(nk); + for (int i = 0; i < nk; i++) + { + resmem_complex_op()(Xi_ace_k[i], nbands * nbasis); + } + } + + for (int i = 0; i < nk; i++) + { + setmem_complex_op()(Xi_ace_k[i], 0, nbands * nbasis); + } + + if (L_ace == nullptr) + { + resmem_complex_op()(L_ace, nbands * nbands); + setmem_complex_op()(L_ace, 0, nbands * nbands); + } + + if (psi_h_psi_ace == nullptr) + { + resmem_complex_op()(psi_h_psi_ace, nbands * nbands); + } + + if (first_iter) return; + + for (int ik = 0; ik < nk; ik++) + { + int npwk = wfcpw->npwk[ik]; + + T* Xi_ace = Xi_ace_k[ik]; + psi.fix_kb(ik, 0); + T* p_psi = psi.get_pointer(); + + setmem_complex_op()(h_psi_ace, 0, nbands * nbasis); + + *ik_ = ik; + + act_op( + nbands, + nbasis, + 1, + p_psi, + h_psi_ace, + nbasis, + false + ); + + // psi_h_psi_ace = psi^\dagger * h_psi_ace + // p_exx_helper->psi.fix_kb(0, 0); + gemm_complex_op()('C', + 'N', + nbands, + nbands, + npwk, + &intermediate_one, + p_psi, + nbasis, + h_psi_ace, + nbasis, + &intermediate_zero, + psi_h_psi_ace, + nbands); + + // reduction of psi_h_psi_ace, due to distributed memory + Parallel_Reduce::reduce_pool(psi_h_psi_ace, nbands * nbands); 
+ + T intermediate_minus_one = -1.0; + axpy_complex_op()(nbands * nbands, + &intermediate_minus_one, + psi_h_psi_ace, + 1, + L_ace, + 1); + + + int info = 0; + char up = 'U', lo = 'L'; + + lapack_potrf()(lo, nbands, L_ace, nbands); + + // expand for-loop + for (int i = 0; i < nbands; ++i) { + setmem_complex_op()(L_ace + i * nbands, 0, i); + } + + // L_ace inv in place + char non = 'N'; + lapack_trtri()(lo, non, nbands, L_ace, nbands); + + // Xi_ace = L_ace^-1 * h_psi_ace^dagger + gemm_complex_op()('N', + 'C', + nbands, + npwk, + nbands, + &intermediate_one, + L_ace, + nbands, + h_psi_ace, + nbasis, + &intermediate_zero, + Xi_ace, + nbands); + + // clear mem + setmem_complex_op()(h_psi_ace, 0, nbands * nbasis); + setmem_complex_op()(psi_h_psi_ace, 0, nbands * nbands); + setmem_complex_op()(L_ace, 0, nbands * nbands); + + } + + *ik_ = ik_save; + ModuleBase::timer::tick("OperatorEXXPW", "construct_ace"); + +} + +template +double OperatorEXXPW::cal_exx_energy_ace(psi::Psi* ppsi_) const +{ + double Eexx = 0; + + psi::Psi psi_ = *ppsi_; + int* ik_ = const_cast(&this->ik); + int ik_save = this->ik; + for (int i = 0; i < wfcpw->nks; i++) + { + setmem_complex_op()(h_psi_ace, 0, psi_.get_nbands() * psi_.get_nbasis()); + *ik_ = i; + psi_.fix_kb(i, 0); + T* psi_i = psi_.get_pointer(); + act_op_ace(psi_.get_nbands(), psi_.get_nbasis(), 1, psi_i, h_psi_ace, 0, true); + + for (int nband = 0; nband < psi_.get_nbands(); nband++) + { + psi_.fix_kb(i, nband); + T* psi_i_n = psi_.get_pointer(); + T* hpsi_i_n = h_psi_ace + nband * psi_.get_nbasis(); + double wg_i_n = (*wg)(i, nband); + // Eexx += dot(psi_i_n, h_psi_i_n) + Eexx += dot_op()(psi_.get_nbasis(), psi_i_n, hpsi_i_n, false) * wg_i_n * 2; + } + } + + Parallel_Reduce::reduce_pool(Eexx); + *ik_ = ik_save; + return Eexx; +} +template class OperatorEXXPW, base_device::DEVICE_CPU>; +template class OperatorEXXPW, base_device::DEVICE_CPU>; +#if ((defined __CUDA) || (defined __ROCM)) +template class OperatorEXXPW, 
base_device::DEVICE_GPU>; +template class OperatorEXXPW, base_device::DEVICE_GPU>; +#endif +} \ No newline at end of file diff --git a/source/source_pw/module_pwdft/operator_pw/exx_pw_pot.cpp b/source/source_pw/module_pwdft/operator_pw/exx_pw_pot.cpp new file mode 100644 index 0000000000..1e0f2d00b9 --- /dev/null +++ b/source/source_pw/module_pwdft/operator_pw/exx_pw_pot.cpp @@ -0,0 +1,294 @@ +#include "op_exx_pw.h" +#include "source_io/module_parameter/parameter.h" +#include "source_pw/module_pwdft/global.h" + +namespace hamilt +{ +template +void OperatorEXXPW::get_potential() const +{ + Real nqs_half1 = 0.5 * kv->nmp[0]; + Real nqs_half2 = 0.5 * kv->nmp[1]; + Real nqs_half3 = 0.5 * kv->nmp[2]; + + Real* pot_cpu = nullptr; + int nks = wfcpw->nks, npw = rhopw_dev->npw; + double tpiba2 = tpiba * tpiba; + pot_cpu = new Real[npw * wfcpw->nks * wfcpw->nks]; + // fill zero + setmem_real_cpu_op()(pot_cpu, 0, npw * nks * nks); + + // calculate Fock pot + auto param_fock = GlobalC::exx_info.info_global.coulomb_param[Conv_Coulomb_Pot_K::Coulomb_Type::Fock]; + for (auto param: param_fock) + { + double exx_div = exx_divergence(Conv_Coulomb_Pot_K::Coulomb_Type::Fock); + double alpha = std::stod(param["alpha"]); + for (int ik = 0; ik < nks; ik++) + { + for (int iq = 0; iq < nks; iq++) + { + const ModuleBase::Vector3 k_c = wfcpw->kvec_c[ik]; + const ModuleBase::Vector3 k_d = wfcpw->kvec_d[ik]; + const ModuleBase::Vector3 q_c = wfcpw->kvec_c[iq]; + const ModuleBase::Vector3 q_d = wfcpw->kvec_d[iq]; + +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif + for (int ig = 0; ig < rhopw_dev->npw; ig++) + { + const ModuleBase::Vector3 g_d = rhopw_dev->gdirect[ig]; + const ModuleBase::Vector3 kqg_d = k_d - q_d + g_d; + // For gamma_extrapolation (https://doi.org/10.1103/PhysRevB.79.205114) + // 7/8 of the points in the grid are "activated" and 1/8 are disabled. 
+ // grid_factor is designed for the 7/8 of the grid to function like all of the points + Real grid_factor = 1; + double extrapolate_grid = 8.0 / 7.0; + if (gamma_extrapolation) + { + // if isint(kqg_d[0] * nqs_half1) && isint(kqg_d[1] * nqs_half2) && isint(kqg_d[2] * nqs_half3) + auto isint = [](double x) { + double epsilon = 1e-6; // this follows the isint judgement in q-e + return std::abs(x - std::round(x)) < epsilon; + }; + if (isint(kqg_d[0] * nqs_half1) && isint(kqg_d[1] * nqs_half2) && isint(kqg_d[2] * nqs_half3)) + { + grid_factor = 0; + } + else + { + grid_factor = extrapolate_grid; + } + } + + const int nk_fac = PARAM.inp.nspin == 2 ? 2 : 1; + const int nk = nks / nk_fac; + const int ig_kq = ik * nks * npw + iq * npw + ig; + + Real gg = (k_c - q_c + rhopw_dev->gcar[ig]).norm2() * tpiba2; + // if (kqgcar2 > 1e-12) // vasp uses 1/40 of the smallest (k spacing)**2 + if (gg >= 1e-8) + { + Real fac = -ModuleBase::FOUR_PI * ModuleBase::e2 / gg; + pot_cpu[ig_kq] += fac * grid_factor * alpha; + } + // } + else + { + pot_cpu[ig_kq] += exx_div * alpha; + } + // assert(is_finite(density_recip[ig])); + } + } + } + } + + // calculate erfc pot + auto param_erfc = GlobalC::exx_info.info_global.coulomb_param[Conv_Coulomb_Pot_K::Coulomb_Type::Erfc]; + for (auto param: param_erfc) + { + double erfc_omega = std::stod(param["omega"]); + double erfc_omega2 = erfc_omega * erfc_omega; + double alpha = std::stod(param["alpha"]); + double exx_div = exx_divergence(Conv_Coulomb_Pot_K::Coulomb_Type::Erfc, erfc_omega); + for (int ik = 0; ik < nks; ik++) + { + for (int iq = 0; iq < nks; iq++) + { + const ModuleBase::Vector3 k_c = wfcpw->kvec_c[ik]; + const ModuleBase::Vector3 k_d = wfcpw->kvec_d[ik]; + const ModuleBase::Vector3 q_c = wfcpw->kvec_c[iq]; + const ModuleBase::Vector3 q_d = wfcpw->kvec_d[iq]; + +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif + for (int ig = 0; ig < rhopw_dev->npw; ig++) + { + const ModuleBase::Vector3 g_d = rhopw_dev->gdirect[ig]; + 
const ModuleBase::Vector3 kqg_d = k_d - q_d + g_d; + // For gamma_extrapolation (https://doi.org/10.1103/PhysRevB.79.205114) + // 7/8 of the points in the grid are "activated" and 1/8 are disabled. + // grid_factor is designed for the 7/8 of the grid to function like all of the points + Real grid_factor = 1; + double extrapolate_grid = 8.0 / 7.0; + if (gamma_extrapolation) + { + // if isint(kqg_d[0] * nqs_half1) && isint(kqg_d[1] * nqs_half2) && isint(kqg_d[2] * nqs_half3) + auto isint = [](double x) { + double epsilon = 1e-6; // this follows the isint judgement in q-e + return std::abs(x - std::round(x)) < epsilon; + }; + if (isint(kqg_d[0] * nqs_half1) && isint(kqg_d[1] * nqs_half2) && isint(kqg_d[2] * nqs_half3)) + { + grid_factor = 0; + } + else + { + grid_factor = extrapolate_grid; + } + } + + const int nk_fac = PARAM.inp.nspin == 2 ? 2 : 1; + const int nk = nks / nk_fac; + const int ig_kq = ik * nks * npw + iq * npw + ig; + + Real gg = (k_c - q_c + rhopw_dev->gcar[ig]).norm2() * tpiba2; + // if (kqgcar2 > 1e-12) // vasp uses 1/40 of the smallest (k spacing)**2 + if (gg >= 1e-8) + { + Real fac = -ModuleBase::FOUR_PI * ModuleBase::e2 / gg; + pot_cpu[ig_kq] += fac * (1.0 - std::exp(-gg / 4.0 / erfc_omega2)) * grid_factor * alpha; + } + // } + else + { + // if (PARAM.inp.dft_functional == "hse") + if (!gamma_extrapolation) + { + pot_cpu[ig_kq] += (exx_div - ModuleBase::PI * ModuleBase::e2 / erfc_omega2) * alpha; + } + else + { + pot_cpu[ig_kq] += exx_div * alpha; + } + } + // assert(is_finite(density_recip[ig])); + } + } + } + } + + // copy the potential to the device memory + syncmem_real_c2d_op()(pot, pot_cpu, rhopw_dev->npw * wfcpw->nks * wfcpw->nks); + + delete[] pot_cpu; // pot_cpu was allocated with new Real[...], so the array form of delete is required +} + +template +double OperatorEXXPW::exx_divergence(Conv_Coulomb_Pot_K::Coulomb_Type coulomb_type, double erfc_omega) const +{ + double exx_div = 0; + + Real nqs_half1 = 0.5 * kv->nmp[0]; + Real nqs_half2 = 0.5 * kv->nmp[1]; + Real nqs_half3 = 0.5 * kv->nmp[2]; + + int nk_fac = 
PARAM.inp.nspin == 2 ? 2 : 1; + + // here we follow the exx_divergence subroutine in q-e (PW/src/exx_base.f90) + double alpha = 10.0 / wfcpw->gk_ecut; + double tpiba2 = tpiba * tpiba; + double div = 0; + + // this is the \sum_q F(q) part + // temporarily for all k points, should be replaced to q points later + for (int ik = 0; ik < wfcpw->nks; ik++) + { + const ModuleBase::Vector3 k_c = wfcpw->kvec_c[ik]; + const ModuleBase::Vector3 k_d = wfcpw->kvec_d[ik]; +#ifdef _OPENMP +#pragma omp parallel for reduction(+ : div) +#endif + for (int ig = 0; ig < rhopw_dev->npw; ig++) + { + const ModuleBase::Vector3 q_c = k_c + rhopw_dev->gcar[ig]; + const ModuleBase::Vector3 q_d = k_d + rhopw_dev->gdirect[ig]; + double qq = q_c.norm2(); + // For gamma_extrapolation (https://doi.org/10.1103/PhysRevB.79.205114) + // 7/8 of the points in the grid are "activated" and 1/8 are disabled. + // grid_factor is designed for the 7/8 of the grid to function like all of the points + Real grid_factor = 1; + double extrapolate_grid = 8.0 / 7.0; + if (gamma_extrapolation) + { + auto isint = [](double x) { + double epsilon = 1e-6; // this follows the isint judgement in q-e + return std::abs(x - std::round(x)) < epsilon; + }; + if (isint(q_d[0] * nqs_half1) && isint(q_d[1] * nqs_half2) && isint(q_d[2] * nqs_half3)) + { + grid_factor = 0; + } + else + { + grid_factor = extrapolate_grid; + } + } + + if (qq <= 1e-8) + continue; + // else if (PARAM.inp.dft_functional == "hse") + else if (coulomb_type == Conv_Coulomb_Pot_K::Coulomb_Type::Erfc) + { + double omega = erfc_omega; + double omega2 = omega * omega; + div += std::exp(-alpha * qq) / qq * (1.0 - std::exp(-qq * tpiba2 / 4.0 / omega2)) * grid_factor; + } + else + { + div += std::exp(-alpha * qq) / qq * grid_factor; + } + } + } + + Parallel_Reduce::reduce_pool(div); + // std::cout << "EXX div: " << div << std::endl; + + // if (PARAM.inp.dft_functional == "hse") + if (!gamma_extrapolation) + { + if (coulomb_type == 
Conv_Coulomb_Pot_K::Coulomb_Type::Erfc) + { + double omega = erfc_omega; + div += tpiba2 / 4.0 / omega / omega; // compensate for the finite value when qq = 0 + } + else + { + div -= alpha; + } + } + + div *= ModuleBase::e2 * ModuleBase::FOUR_PI / tpiba2 / wfcpw->nks; + // std::cout << "div: " << div << std::endl; + + // numerically value the mean value of F(q) in the reciprocal space + // This means we need to calculate the average of F(q) in the first brillouin zone + alpha /= tpiba2; + int nqq = 100000; + double dq = 5.0 / std::sqrt(alpha) / nqq; + double aa = 0.0; + // if (PARAM.inp.dft_functional == "hse") + if (coulomb_type == Conv_Coulomb_Pot_K::Coulomb_Type::Erfc) + { + double omega = erfc_omega; + double omega2 = omega * omega; +#ifdef _OPENMP +#pragma omp parallel for reduction(+ : aa) +#endif + for (int i = 0; i < nqq; i++) + { + double q = dq * (i + 0.5); + aa -= exp(-alpha * q * q) * exp(-q * q / 4.0 / omega2) * dq; + } + } + aa *= 8 / ModuleBase::FOUR_PI; + aa += 1.0 / std::sqrt(alpha * ModuleBase::PI); + + // printf("ucell: %p\n", ucell); + double omega = ucell->omega; + div -= ModuleBase::e2 * omega * aa; + exx_div = div * wfcpw->nks / nk_fac; + // exx_div = 0; + // std::cout << "EXX divergence: " << exx_div << std::endl; + + return exx_div; +} +template class OperatorEXXPW, base_device::DEVICE_CPU>; +template class OperatorEXXPW, base_device::DEVICE_CPU>; +#if ((defined __CUDA) || (defined __ROCM)) +template class OperatorEXXPW, base_device::DEVICE_GPU>; +template class OperatorEXXPW, base_device::DEVICE_GPU>; +#endif +} \ No newline at end of file diff --git a/source/source_pw/module_pwdft/operator_pw/op_exx_pw.cpp b/source/source_pw/module_pwdft/operator_pw/op_exx_pw.cpp index 68bfca3158..9e5d1d82e6 100644 --- a/source/source_pw/module_pwdft/operator_pw/op_exx_pw.cpp +++ b/source/source_pw/module_pwdft/operator_pw/op_exx_pw.cpp @@ -263,211 +263,6 @@ void OperatorEXXPW::act_op(const int nbands, } -template -void OperatorEXXPW::act_op_ace(const int 
nbands, - const int nbasis, - const int npol, - const T *tmpsi_in, - T *tmhpsi, - const int ngk_ik, - const bool is_first_node) const -{ - ModuleBase::timer::tick("OperatorEXXPW", "act_op_ace"); -// std::cout << "act_op_ace" << std::endl; - // hpsi += -Xi^\dagger * Xi * psi - T* Xi_ace = Xi_ace_k[this->ik]; - int nbands_tot = psi.get_nbands(); - int nbasis_max = psi.get_nbasis(); -// T* hpsi = nullptr; -// resmem_complex_op()(hpsi, nbands_tot * nbasis); -// setmem_complex_op()(hpsi, 0, nbands_tot * nbasis); - T* Xi_psi = nullptr; - resmem_complex_op()(Xi_psi, nbands_tot * nbands); - setmem_complex_op()(Xi_psi, 0, nbands_tot * nbands); - - char trans_N = 'N', trans_T = 'T', trans_C = 'C'; - T intermediate_one = 1.0, intermediate_zero = 0.0, intermediate_minus_one = -1.0; - // Xi * psi - gemm_complex_op()(trans_N, - trans_N, - nbands_tot, - nbands, - nbasis, - &intermediate_one, - Xi_ace, - nbands_tot, - tmpsi_in, - nbasis, - &intermediate_zero, - Xi_psi, - nbands_tot - ); - - Parallel_Reduce::reduce_pool(Xi_psi, nbands_tot * nbands); - - // Xi^\dagger * (Xi * psi) - gemm_complex_op()(trans_C, - trans_N, - nbasis, - nbands, - nbands_tot, - &intermediate_minus_one, - Xi_ace, - nbands_tot, - Xi_psi, - nbands_tot, - &intermediate_one, - tmhpsi, - nbasis - ); - - -// // negative sign, add to hpsi -// vec_add_vec_complex_op()(this->ctx, nbands * nbasis, tmhpsi, hpsi, -1, tmhpsi, 1); -// delmem_complex_op()(hpsi); - delmem_complex_op()(Xi_psi); - ModuleBase::timer::tick("OperatorEXXPW", "act_op_ace"); - -} - -template -void OperatorEXXPW::construct_ace() const -{ - ModuleBase::timer::tick("OperatorEXXPW", "construct_ace"); -// int nkb = p_exx_helper->psi.get_nbands() * p_exx_helper->psi.get_nk(); - int nbands = psi.get_nbands(); - int nbasis = psi.get_nbasis(); - int nk = psi.get_nk(); - - int ik_save = this->ik; - int * ik_ = const_cast(&this->ik); - - T intermediate_one = 1.0, intermediate_zero = 0.0; - - if (h_psi_ace == nullptr) - { - resmem_complex_op()(h_psi_ace, 
nbands * nbasis); - setmem_complex_op()(h_psi_ace, 0, nbands * nbasis); - } - - if (Xi_ace_k.size() != nk) - { - Xi_ace_k.resize(nk); - for (int i = 0; i < nk; i++) - { - resmem_complex_op()(Xi_ace_k[i], nbands * nbasis); - } - } - - for (int i = 0; i < nk; i++) - { - setmem_complex_op()(Xi_ace_k[i], 0, nbands * nbasis); - } - - if (L_ace == nullptr) - { - resmem_complex_op()(L_ace, nbands * nbands); - setmem_complex_op()(L_ace, 0, nbands * nbands); - } - - if (psi_h_psi_ace == nullptr) - { - resmem_complex_op()(psi_h_psi_ace, nbands * nbands); - } - - if (first_iter) return; - - for (int ik = 0; ik < nk; ik++) - { - int npwk = wfcpw->npwk[ik]; - - T* Xi_ace = Xi_ace_k[ik]; - psi.fix_kb(ik, 0); - T* p_psi = psi.get_pointer(); - - setmem_complex_op()(h_psi_ace, 0, nbands * nbasis); - - *ik_ = ik; - - act_op( - nbands, - nbasis, - 1, - p_psi, - h_psi_ace, - nbasis, - false - ); - - // psi_h_psi_ace = psi^\dagger * h_psi_ace - // p_exx_helper->psi.fix_kb(0, 0); - gemm_complex_op()('C', - 'N', - nbands, - nbands, - npwk, - &intermediate_one, - p_psi, - nbasis, - h_psi_ace, - nbasis, - &intermediate_zero, - psi_h_psi_ace, - nbands); - - // reduction of psi_h_psi_ace, due to distributed memory - Parallel_Reduce::reduce_pool(psi_h_psi_ace, nbands * nbands); - - T intermediate_minus_one = -1.0; - axpy_complex_op()(nbands * nbands, - &intermediate_minus_one, - psi_h_psi_ace, - 1, - L_ace, - 1); - - - int info = 0; - char up = 'U', lo = 'L'; - - lapack_potrf()(lo, nbands, L_ace, nbands); - - // expand for-loop - for (int i = 0; i < nbands; ++i) { - setmem_complex_op()(L_ace + i * nbands, 0, i); - } - - // L_ace inv in place - char non = 'N'; - lapack_trtri()(lo, non, nbands, L_ace, nbands); - - // Xi_ace = L_ace^-1 * h_psi_ace^dagger - gemm_complex_op()('N', - 'C', - nbands, - npwk, - nbands, - &intermediate_one, - L_ace, - nbands, - h_psi_ace, - nbasis, - &intermediate_zero, - Xi_ace, - nbands); - - // clear mem - setmem_complex_op()(h_psi_ace, 0, nbands * nbasis); - 
setmem_complex_op()(psi_h_psi_ace, 0, nbands * nbands); - setmem_complex_op()(L_ace, 0, nbands * nbands); - - } - - *ik_ = ik_save; - ModuleBase::timer::tick("OperatorEXXPW", "construct_ace"); - -} - template std::vector OperatorEXXPW::get_q_points(const int ik) const { @@ -560,291 +355,6 @@ OperatorEXXPW::OperatorEXXPW(const OperatorEXXPW *op } -template -void OperatorEXXPW::get_potential() const -{ - Real nqs_half1 = 0.5 * kv->nmp[0]; - Real nqs_half2 = 0.5 * kv->nmp[1]; - Real nqs_half3 = 0.5 * kv->nmp[2]; - - Real* pot_cpu = nullptr; - int nks = wfcpw->nks, npw = rhopw_dev->npw; - double tpiba2 = tpiba * tpiba; - pot_cpu = new Real[npw * wfcpw->nks * wfcpw->nks]; - // fill zero - setmem_real_cpu_op()(pot_cpu, 0, npw * nks * nks); - - // calculate Fock pot - auto param_fock = GlobalC::exx_info.info_global.coulomb_param[Conv_Coulomb_Pot_K::Coulomb_Type::Fock]; - for (auto param : param_fock) - { - double exx_div = exx_divergence(Conv_Coulomb_Pot_K::Coulomb_Type::Fock); - double alpha = std::stod(param["alpha"]); - for (int ik = 0; ik < nks; ik++) - { - for (int iq = 0; iq < nks; iq++) - { - const ModuleBase::Vector3 k_c = wfcpw->kvec_c[ik]; - const ModuleBase::Vector3 k_d = wfcpw->kvec_d[ik]; - const ModuleBase::Vector3 q_c = wfcpw->kvec_c[iq]; - const ModuleBase::Vector3 q_d = wfcpw->kvec_d[iq]; - -#ifdef _OPENMP -#pragma omp parallel for schedule(static) -#endif - for (int ig = 0; ig < rhopw_dev->npw; ig++) - { - const ModuleBase::Vector3 g_d = rhopw_dev->gdirect[ig]; - const ModuleBase::Vector3 kqg_d = k_d - q_d + g_d; - // For gamma_extrapolation (https://doi.org/10.1103/PhysRevB.79.205114) - // 7/8 of the points in the grid are "activated" and 1/8 are disabled. 
- // grid_factor is designed for the 7/8 of the grid to function like all of the points - Real grid_factor = 1; - double extrapolate_grid = 8.0 / 7.0; - if (gamma_extrapolation) - { - // if isint(kqg_d[0] * nqs_half1) && isint(kqg_d[1] * nqs_half2) && isint(kqg_d[2] * nqs_half3) - auto isint = [](double x) { - double epsilon = 1e-6; // this follows the isint judgement in q-e - return std::abs(x - std::round(x)) < epsilon; - }; - if (isint(kqg_d[0] * nqs_half1) && isint(kqg_d[1] * nqs_half2) && isint(kqg_d[2] * nqs_half3)) - { - grid_factor = 0; - } - else - { - grid_factor = extrapolate_grid; - } - } - - const int nk_fac = PARAM.inp.nspin == 2 ? 2 : 1; - const int nk = nks / nk_fac; - const int ig_kq = ik * nks * npw + iq * npw + ig; - - Real gg = (k_c - q_c + rhopw_dev->gcar[ig]).norm2() * tpiba2; - // if (kqgcar2 > 1e-12) // vasp uses 1/40 of the smallest (k spacing)**2 - if (gg >= 1e-8) - { - Real fac = -ModuleBase::FOUR_PI * ModuleBase::e2 / gg; - pot_cpu[ig_kq] += fac * grid_factor * alpha; - } - // } - else - { - pot_cpu[ig_kq] += exx_div * alpha; - } - // assert(is_finite(density_recip[ig])); - } - } - } - } - - // calculate erfc pot - auto param_erfc = GlobalC::exx_info.info_global.coulomb_param[Conv_Coulomb_Pot_K::Coulomb_Type::Erfc]; - for (auto param : param_erfc) - { - double erfc_omega = std::stod(param["omega"]); - double erfc_omega2 = erfc_omega * erfc_omega; - double alpha = std::stod(param["alpha"]); - double exx_div = exx_divergence(Conv_Coulomb_Pot_K::Coulomb_Type::Erfc, erfc_omega); - for (int ik = 0; ik < nks; ik++) - { - for (int iq = 0; iq < nks; iq++) - { - const ModuleBase::Vector3 k_c = wfcpw->kvec_c[ik]; - const ModuleBase::Vector3 k_d = wfcpw->kvec_d[ik]; - const ModuleBase::Vector3 q_c = wfcpw->kvec_c[iq]; - const ModuleBase::Vector3 q_d = wfcpw->kvec_d[iq]; - -#ifdef _OPENMP -#pragma omp parallel for schedule(static) -#endif - for (int ig = 0; ig < rhopw_dev->npw; ig++) - { - const ModuleBase::Vector3 g_d = rhopw_dev->gdirect[ig]; - 
const ModuleBase::Vector3 kqg_d = k_d - q_d + g_d; - // For gamma_extrapolation (https://doi.org/10.1103/PhysRevB.79.205114) - // 7/8 of the points in the grid are "activated" and 1/8 are disabled. - // grid_factor is designed for the 7/8 of the grid to function like all of the points - Real grid_factor = 1; - double extrapolate_grid = 8.0 / 7.0; - if (gamma_extrapolation) - { - // if isint(kqg_d[0] * nqs_half1) && isint(kqg_d[1] * nqs_half2) && isint(kqg_d[2] * nqs_half3) - auto isint = [](double x) { - double epsilon = 1e-6; // this follows the isint judgement in q-e - return std::abs(x - std::round(x)) < epsilon; - }; - if (isint(kqg_d[0] * nqs_half1) && isint(kqg_d[1] * nqs_half2) && isint(kqg_d[2] * nqs_half3)) - { - grid_factor = 0; - } - else - { - grid_factor = extrapolate_grid; - } - } - - const int nk_fac = PARAM.inp.nspin == 2 ? 2 : 1; - const int nk = nks / nk_fac; - const int ig_kq = ik * nks * npw + iq * npw + ig; - - Real gg = (k_c - q_c + rhopw_dev->gcar[ig]).norm2() * tpiba2; - // if (kqgcar2 > 1e-12) // vasp uses 1/40 of the smallest (k spacing)**2 - if (gg >= 1e-8) - { - Real fac = -ModuleBase::FOUR_PI * ModuleBase::e2 / gg; - pot_cpu[ig_kq] += fac * (1.0 - std::exp(-gg / 4.0 / erfc_omega2)) * grid_factor * alpha; - } - // } - else - { - // if (PARAM.inp.dft_functional == "hse") - if (!gamma_extrapolation) - { - pot_cpu[ig_kq] += (exx_div - ModuleBase::PI * ModuleBase::e2 / erfc_omega2) * alpha; - } - else - { - pot_cpu[ig_kq] += exx_div * alpha; - } - } - // assert(is_finite(density_recip[ig])); - } - } - } - } - - // copy the potential to the device memory - syncmem_real_c2d_op()(pot, pot_cpu, rhopw_dev->npw * wfcpw->nks * wfcpw->nks); - - delete pot_cpu; -} - -template -double OperatorEXXPW::exx_divergence(Conv_Coulomb_Pot_K::Coulomb_Type coulomb_type, double erfc_omega) const -{ - double exx_div = 0; - - Real nqs_half1 = 0.5 * kv->nmp[0]; - Real nqs_half2 = 0.5 * kv->nmp[1]; - Real nqs_half3 = 0.5 * kv->nmp[2]; - - int nk_fac = 
PARAM.inp.nspin == 2 ? 2 : 1; - - // here we follow the exx_divergence subroutine in q-e (PW/src/exx_base.f90) - double alpha = 10.0 / wfcpw->gk_ecut; - double tpiba2 = tpiba * tpiba; - double div = 0; - - // this is the \sum_q F(q) part - // temporarily for all k points, should be replaced to q points later - for (int ik = 0; ik < wfcpw->nks; ik++) - { - const ModuleBase::Vector3 k_c = wfcpw->kvec_c[ik]; - const ModuleBase::Vector3 k_d = wfcpw->kvec_d[ik]; -#ifdef _OPENMP -#pragma omp parallel for reduction(+:div) -#endif - for (int ig = 0; ig < rhopw_dev->npw; ig++) - { - const ModuleBase::Vector3 q_c = k_c + rhopw_dev->gcar[ig]; - const ModuleBase::Vector3 q_d = k_d + rhopw_dev->gdirect[ig]; - double qq = q_c.norm2(); - // For gamma_extrapolation (https://doi.org/10.1103/PhysRevB.79.205114) - // 7/8 of the points in the grid are "activated" and 1/8 are disabled. - // grid_factor is designed for the 7/8 of the grid to function like all of the points - Real grid_factor = 1; - double extrapolate_grid = 8.0/7.0; - if (gamma_extrapolation) - { - auto isint = [](double x) - { - double epsilon = 1e-6; // this follows the isint judgement in q-e - return std::abs(x - std::round(x)) < epsilon; - }; - if (isint(q_d[0] * nqs_half1) && - isint(q_d[1] * nqs_half2) && - isint(q_d[2] * nqs_half3)) - { - grid_factor = 0; - } - else - { - grid_factor = extrapolate_grid; - } - } - - if (qq <= 1e-8) continue; - // else if (PARAM.inp.dft_functional == "hse") - else if (coulomb_type == Conv_Coulomb_Pot_K::Coulomb_Type::Erfc) - { - double omega = erfc_omega; - double omega2 = omega * omega; - div += std::exp(-alpha * qq) / qq * (1.0 - std::exp(-qq*tpiba2 / 4.0 / omega2)) * grid_factor; - } - else - { - div += std::exp(-alpha * qq) / qq * grid_factor; - } - } - } - - Parallel_Reduce::reduce_pool(div); - // std::cout << "EXX div: " << div << std::endl; - - // if (PARAM.inp.dft_functional == "hse") - if (!gamma_extrapolation) - { - if (coulomb_type == 
Conv_Coulomb_Pot_K::Coulomb_Type::Erfc) - { - double omega = erfc_omega; - div += tpiba2 / 4.0 / omega / omega; // compensate for the finite value when qq = 0 - } - else - { - div -= alpha; - } - - } - - div *= ModuleBase::e2 * ModuleBase::FOUR_PI / tpiba2 / wfcpw->nks; -// std::cout << "div: " << div << std::endl; - - // numerically value the mean value of F(q) in the reciprocal space - // This means we need to calculate the average of F(q) in the first brillouin zone - alpha /= tpiba2; - int nqq = 100000; - double dq = 5.0 / std::sqrt(alpha) / nqq; - double aa = 0.0; - // if (PARAM.inp.dft_functional == "hse") - if (coulomb_type == Conv_Coulomb_Pot_K::Coulomb_Type::Erfc) - { - double omega = erfc_omega; - double omega2 = omega * omega; -#ifdef _OPENMP -#pragma omp parallel for reduction(+:aa) -#endif - for (int i = 0; i < nqq; i++) - { - double q = dq * (i+0.5); - aa -= exp(-alpha * q * q) * exp(-q*q / 4.0 / omega2) * dq; - } - } - aa *= 8 / ModuleBase::FOUR_PI; - aa += 1.0 / std::sqrt(alpha * ModuleBase::PI); - - // printf("ucell: %p\n", ucell); - double omega = ucell->omega; - div -= ModuleBase::e2 * omega * aa; - exx_div = div * wfcpw->nks / nk_fac; -// exx_div = 0; -// std::cout << "EXX divergence: " << exx_div << std::endl; - - return exx_div; -} - template double OperatorEXXPW::cal_exx_energy(psi::Psi *psi_) const { @@ -858,41 +368,6 @@ double OperatorEXXPW::cal_exx_energy(psi::Psi *psi_) const } } -template -double OperatorEXXPW::cal_exx_energy_ace(psi::Psi *ppsi_) const -{ - double Eexx = 0; - - psi::Psi psi_ = *ppsi_; - int *ik_ = const_cast(&this->ik); - int ik_save = this->ik; - for (int i = 0; i < wfcpw->nks; i++) - { - setmem_complex_op()(h_psi_ace, 0, psi_.get_nbands() * psi_.get_nbasis()); - *ik_ = i; - psi_.fix_kb(i, 0); - T* psi_i = psi_.get_pointer(); - act_op_ace(psi_.get_nbands(), psi_.get_nbasis(), 1, psi_i, h_psi_ace, 0, true); - - for (int nband = 0; nband < psi_.get_nbands(); nband++) - { - psi_.fix_kb(i, nband); - T* psi_i_n = 
psi_.get_pointer(); - T* hpsi_i_n = h_psi_ace + nband * psi_.get_nbasis(); - double wg_i_n = (*wg)(i, nband); - // Eexx += dot(psi_i_n, h_psi_i_n) - Eexx += dot_op()(psi_.get_nbasis(), psi_i_n, hpsi_i_n, false) * wg_i_n * 2; - - } - - - } - - Parallel_Reduce::reduce_pool(Eexx); - *ik_ = ik_save; - return Eexx; -} - template double OperatorEXXPW::cal_exx_energy_op(psi::Psi *ppsi_) const {