diff --git a/CMakeLists.txt b/CMakeLists.txt index 00d59548c4..6e921dce23 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -253,9 +253,6 @@ if(ENABLE_LCAO) add_compile_definitions(__PEXSI) set(CMAKE_CXX_STANDARD 14) endif() - if(OLD_GINT) - add_compile_definitions(__OLD_GINT) - endif() else() set(ENABLE_MLALGO OFF) set(ENABLE_LIBRI OFF) diff --git a/source/Makefile.Objects b/source/Makefile.Objects index 8b185f2011..8353a0f754 100644 --- a/source/Makefile.Objects +++ b/source/Makefile.Objects @@ -62,7 +62,6 @@ VPATH=./src_global:\ ./source_lcao/module_deltaspin:\ ./source_lcao/module_operator_lcao:\ ./source_lcao/module_gint:\ -./source_lcao/module_gint/temp_gint:\ ./source_relax:\ ./source_hamilt/module_vdw:\ ./source_io:\ @@ -282,32 +281,7 @@ OBJS_ESOLVER_LCAO=esolver_ks_lcao.o\ esolver_dm2rho.o\ esolver_double_xc.o\ -OBJS_GINT=gint_old.o\ - gint_gamma_env.o\ - gint_gamma_vl.o\ - gint_fvl_old.o\ - gint_rho_old.o\ - gint_tau_old.o\ - gint_vl_old.o\ - gint_k_env.o\ - gint_k_sparse1.o\ - gint_k_pvpr.o\ - gint_k_pvdpr.o\ - gint_tools.o\ - grid_bigcell.o\ - grid_meshball.o\ - grid_meshcell.o\ - grid_meshk.o\ - grid_technique.o\ - gint_force_cpu_interface.o\ - gint_rho_cpu_interface.o\ - gint_vl_cpu_interface.o\ - cal_psir_ylm.o\ - cal_dpsir_ylm.o\ - cal_ddpsir_ylm.o\ - mult_psi_dmr.o\ - init_orb.o\ - batch_biggrid.o\ +OBJS_GINT=batch_biggrid.o\ big_grid.o\ biggrid_info.o\ divide_info.o\ @@ -655,7 +629,6 @@ OBJS_LCAO=evolve_elec.o\ stress_tools.o\ edm.o\ pulay_fs_center2.o\ - grid_init.o\ spar_dh.o\ spar_exx.o\ spar_hsr.o\ diff --git a/source/source_esolver/esolver_double_xc.cpp b/source/source_esolver/esolver_double_xc.cpp index 2658444bec..0174331a04 100644 --- a/source/source_esolver/esolver_double_xc.cpp +++ b/source/source_esolver/esolver_double_xc.cpp @@ -51,8 +51,6 @@ void ESolver_DoubleXC::before_all_runners(UnitCell& ucell, const Input_p this->pelec_base = new elecstate::ElecStateLCAO(&(this->chr_base), // use which parameter? &(this->kv), this->kv.get_nks(), - &(this->GG), - &(this->GK), this->pw_rho, this->pw_big); } @@ -145,8 +143,6 @@ void ESolver_DoubleXC::before_scf(UnitCell& ucell, const int istep) elecstate::DensityMatrix* DM = dynamic_cast*>(this->pelec_base)->get_DM(); this->p_hamilt_base = new hamilt::HamiltLCAO( - PARAM.globalv.gamma_only_local ? &(this->GG) : nullptr, - PARAM.globalv.gamma_only_local ? nullptr : &(this->GK), ucell, this->gd, &this->pv, diff --git a/source/source_esolver/esolver_gets.cpp b/source/source_esolver/esolver_gets.cpp index d503876212..4e79849367 100644 --- a/source/source_esolver/esolver_gets.cpp +++ b/source/source_esolver/esolver_gets.cpp @@ -53,8 +53,6 @@ void ESolver_GetS::before_all_runners(UnitCell& ucell, const Input_para& inp) this->pelec = new elecstate::ElecStateLCAO>(&(this->chr), // use which parameter? &(this->kv), this->kv.get_nks(), - nullptr, // mohan add 2024-04-01 - nullptr, // mohan add 2024-04-01 this->pw_rho, this->pw_big); } diff --git a/source/source_esolver/esolver_ks_lcao.cpp b/source/source_esolver/esolver_ks_lcao.cpp index 5b7797169f..29388b04c1 100644 --- a/source/source_esolver/esolver_ks_lcao.cpp +++ b/source/source_esolver/esolver_ks_lcao.cpp @@ -57,7 +57,7 @@ void ESolver_KS_LCAO::before_all_runners(UnitCell& ucell, const Input_pa { // TK stands for double and std::complex? this->pelec = new elecstate::ElecStateLCAO(&(this->chr), &(this->kv), - this->kv.get_nks(), &(this->GG), &(this->GK), this->pw_rho, this->pw_big); + this->kv.get_nks(), this->pw_rho, this->pw_big); } // 3) read LCAO orbitals/projectors and construct the interpolation tables. @@ -136,7 +136,7 @@ void ESolver_KS_LCAO::before_all_runners(UnitCell& ucell, const Input_pa // 16) init rdmft, added by jghan if (inp.rdmft == true) { - rdmft_solver.init(this->GG, this->GK, this->pv, ucell, + rdmft_solver.init(this->pv, ucell, this->gd, this->kv, *(this->pelec), this->orb_, two_center_bundle_, inp.dft_functional, inp.rdmft_power_alpha); } @@ -198,8 +198,6 @@ void ESolver_KS_LCAO::before_scf(UnitCell& ucell, const int istep) elecstate::DensityMatrix* DM = estate->get_DM(); this->p_hamilt = new hamilt::HamiltLCAO( - PARAM.globalv.gamma_only_local ? &(this->GG) : nullptr, - PARAM.globalv.gamma_only_local ? nullptr : &(this->GK), ucell, this->gd, &this->pv, this->pelec->pot, this->kv, two_center_bundle_, orb_, DM, this->deepks #ifdef __EXX @@ -371,7 +369,7 @@ void ESolver_KS_LCAO::after_all_runners(UnitCell& ucell) ModuleIO::ctrl_runner_lcao(ucell, PARAM.inp, this->kv, estate, this->pv, this->Pgrid, this->gd, this->psi, this->chr, hamilt_lcao, - this->two_center_bundle_, this->GG, this->GK, + this->two_center_bundle_, this->orb_, this->pw_rho, this->pw_rhod, this->sf, this->locpp.vloc, this->exx_nao, this->solvent); @@ -639,9 +637,9 @@ void ESolver_KS_LCAO::after_scf(UnitCell& ucell, const int istep, const ModuleIO::ctrl_scf_lcao(ucell, PARAM.inp, this->kv, estate, this->pv, this->gd, this->psi, hamilt_lcao, - this->two_center_bundle_, this->GK, + this->two_center_bundle_, this->orb_, this->pw_wfc, this->pw_rho, - this->GridT, this->pw_big, this->sf, + this->pw_big, this->sf, this->rdmft_solver, this->deepks, this->exx_nao, this->conv_esolver, this->scf_nmax_flag, istep); diff --git a/source/source_esolver/esolver_ks_lcao.h b/source/source_esolver/esolver_ks_lcao.h index 422048c59c..16fb6aa8df 100644 --- a/source/source_esolver/esolver_ks_lcao.h +++ b/source/source_esolver/esolver_ks_lcao.h @@ -4,10 +4,8 @@ #include "esolver_ks.h" #include "source_lcao/record_adj.h" // adjacent atoms #include "source_basis/module_nao/two_center_bundle.h" // nao basis -#include "source_lcao/module_gint/gint_gamma.h" // gint for gamma-only k-points -#include "source_lcao/module_gint/gint_k.h" // gint for multi k-points -#include "source_lcao/module_gint/temp_gint/gint.h" // gint -#include "source_lcao/module_gint/temp_gint/gint_info.h" +#include "source_lcao/module_gint/gint.h" // gint +#include "source_lcao/module_gint/gint_info.h" #include "source_lcao/setup_deepks.h" // for deepks, mohan add 20251008 #include "source_lcao/setup_exx.h" // for exx, mohan add 20251008 #include "source_lcao/module_rdmft/rdmft.h" // rdmft @@ -67,15 +65,6 @@ class ESolver_KS_LCAO : public ESolver_KS //! NAO orbitals: 2d block-cyclic distribution info Parallel_Orbitals pv; - //! Grid integration: used for k-point-dependent algorithm - Gint_k GK; - - //! Grid integration: used for gamma only algorithms. - Gint_Gamma GG; - - //! Grid integration: used to store some basic information - Grid_Technique GridT; - //! GintInfo: used to store some basic infomation about module_gint std::unique_ptr gint_info_; @@ -107,12 +96,7 @@ class ESolver_KS_LCAO : public ESolver_KS const Record_adj & get_RA() const { return RA; } const Grid_Driver & get_gd() const { return gd; } const Parallel_Orbitals & get_pv() const { return pv; } - const Gint_k & get_GK() const { return GK; } - const Gint_Gamma & get_GG() const { return GG; } - const Grid_Technique & get_GridT() const { return GridT; } - #ifndef __OLD_GINT const std::unique_ptr & get_gint_info() const { return gint_info_; } - #endif const TwoCenterBundle & get_two_center_bundle() const { return two_center_bundle_; } const rdmft::RDMFT & get_rdmft_solver() const { return rdmft_solver; } const LCAO_Orbitals & get_orb() const { return orb_; } diff --git a/source/source_esolver/lcao_others.cpp b/source/source_esolver/lcao_others.cpp index 1d4891954b..72aabae78e 100644 --- a/source/source_esolver/lcao_others.cpp +++ b/source/source_esolver/lcao_others.cpp @@ -175,8 +175,6 @@ void ESolver_KS_LCAO::others(UnitCell& ucell, const int istep) { elecstate::DensityMatrix* DM = dynamic_cast*>(this->pelec)->get_DM(); this->p_hamilt = new hamilt::HamiltLCAO( - PARAM.globalv.gamma_only_local ? &(this->GG) : nullptr, - PARAM.globalv.gamma_only_local ? nullptr : &(this->GK), ucell, this->gd, &this->pv, @@ -235,8 +233,7 @@ void ESolver_KS_LCAO::others(UnitCell& ucell, const int istep) Get_pchg_lcao get_pchg(this->psi, &(this->pv)); if (PARAM.globalv.gamma_only_local) { - get_pchg.begin(this->GG, - this->chr.rho, + get_pchg.begin(this->chr.rho, this->pelec->wg, this->pelec->eferm.get_all_ef(), this->pw_rhod->nrxx, @@ -253,8 +250,7 @@ void ESolver_KS_LCAO::others(UnitCell& ucell, const int istep) } else { - get_pchg.begin(this->GK, - this->chr.rho, + get_pchg.begin(this->chr.rho, this->chr.rhog, this->pelec->wg, this->pelec->eferm.get_all_ef(), @@ -286,7 +282,6 @@ void ESolver_KS_LCAO::others(UnitCell& ucell, const int istep) this->pw_wfc, this->Pgrid, this->pv, - this->GG, PARAM.inp.out_wfc_pw, this->kv, PARAM.inp.nelec, @@ -305,7 +300,6 @@ void ESolver_KS_LCAO::others(UnitCell& ucell, const int istep) this->pw_wfc, this->Pgrid, this->pv, - this->GK, PARAM.inp.out_wfc_pw, this->kv, PARAM.inp.nelec, diff --git a/source/source_estate/elecstate_lcao.cpp b/source/source_estate/elecstate_lcao.cpp index 7eaef537b0..fe3bb11758 100644 --- a/source/source_estate/elecstate_lcao.cpp +++ b/source/source_estate/elecstate_lcao.cpp @@ -5,11 +5,10 @@ #include "source_estate/module_dm/cal_dm_psi.h" #include "source_hamilt/module_xc/xc_functional.h" #include "source_lcao/module_deltaspin/spin_constrain.h" -#include "source_lcao/module_gint/grid_technique.h" #include "source_pw/module_pwdft/global.h" #include "source_io/module_parameter/parameter.h" -#include "source_lcao/module_gint/temp_gint/gint_interface.h" +#include "source_lcao/module_gint/gint_interface.h" #include @@ -34,13 +33,7 @@ void ElecStateLCAO>::psiToRho(const psi::Psigint_k->transfer_DM2DtoGrid(this->DM->get_DMR_vector()); // transfer dm2d to DM_grid in gint - Gint_inout inout(this->charge->rho, Gint_Tools::job_type::rho, PARAM.inp.nspin); - this->gint_k->cal_gint(&inout); -#else ModuleGint::cal_gint_rho(this->DM->get_DMR_vector(), PARAM.inp.nspin, this->charge->rho); -#endif if (XC_Functional::get_ked_flag()) { @@ -71,13 +64,7 @@ void ElecStateLCAO::psiToRho(const psi::Psi& psi) //------------------------------------------------------------ ModuleBase::GlobalFunc::NOTE("Calculate the charge on real space grid!"); -#ifdef __OLD_GINT - this->gint_gamma->transfer_DM2DtoGrid(this->DM->get_DMR_vector()); // transfer dm2d to DM_grid in gint - Gint_inout inout(this->charge->rho, Gint_Tools::job_type::rho, PARAM.inp.nspin); - this->gint_gamma->cal_gint(&inout); -#else ModuleGint::cal_gint_rho(this->DM->get_DMR_vector(), PARAM.inp.nspin, this->charge->rho); -#endif if (XC_Functional::get_ked_flag()) { @@ -139,25 +126,14 @@ void ElecStateLCAO::dmToRho(std::vector pexsi_DM, std::vectorgint_gamma->transfer_DM2DtoGrid(this->DM->get_DMR_vector()); // transfer dm2d to DM_grid in gint - Gint_inout inout(this->charge->rho, Gint_Tools::job_type::rho, PARAM.inp.nspin); - this->gint_gamma->cal_gint(&inout); -#else ModuleGint::cal_gint_rho(this->DM->get_DMR_vector(), PARAM.inp.nspin, this->charge->rho); -#endif if (XC_Functional::get_ked_flag()) { for (int is = 0; is < PARAM.inp.nspin; is++) { ModuleBase::GlobalFunc::ZEROS(this->charge->kin_r[0], this->charge->nrxx); } -#ifdef __OLD_GINT - Gint_inout inout1(this->charge->kin_r, Gint_Tools::job_type::tau); - this->gint_gamma->cal_gint(&inout1); -#else ModuleGint::cal_gint_tau(this->DM->get_DMR_vector(), PARAM.inp.nspin, this->charge->kin_r); -#endif } this->charge->renormalize_rho(); diff --git a/source/source_estate/elecstate_lcao.h b/source/source_estate/elecstate_lcao.h index b56ec31f18..7c19f2c39b 100644 --- a/source/source_estate/elecstate_lcao.h +++ b/source/source_estate/elecstate_lcao.h @@ -3,8 +3,6 @@ #include "elecstate.h" #include "source_estate/module_dm/density_matrix.h" -#include "source_lcao/module_gint/gint_gamma.h" -#include "source_lcao/module_gint/gint_k.h" #include @@ -20,14 +18,10 @@ class ElecStateLCAO : public ElecState ElecStateLCAO(Charge* chg_in, const K_Vectors* klist_in, int nks_in, - Gint_Gamma* gint_gamma_in, // mohan add 2024-04-01 - Gint_k* gint_k_in, // mohan add 2024-04-01 ModulePW::PW_Basis* rhopw_in, ModulePW::PW_Basis_Big* bigpw_in) { init_ks(chg_in, klist_in, nks_in, rhopw_in, bigpw_in); - this->gint_gamma = gint_gamma_in; // mohan add 2024-04-01 - this->gint_k = gint_k_in; // mohan add 2024-04-01 this->classname = "ElecStateLCAO"; } @@ -85,8 +79,6 @@ class ElecStateLCAO : public ElecState // calcualte rho for each k // void rhoBandK(const psi::Psi>& psi); - Gint_Gamma* gint_gamma = nullptr; // mohan add 2024-04-01 - Gint_k* gint_k = nullptr; // mohan add 2024-04-01 }; template diff --git a/source/source_estate/elecstate_lcao_cal_tau.cpp b/source/source_estate/elecstate_lcao_cal_tau.cpp index db85c314fd..a2a4210002 100644 --- a/source/source_estate/elecstate_lcao_cal_tau.cpp +++ b/source/source_estate/elecstate_lcao_cal_tau.cpp @@ -1,5 +1,5 @@ #include "elecstate_lcao.h" -#include "source_lcao/module_gint/temp_gint/gint_interface.h" +#include "source_lcao/module_gint/gint_interface.h" #include "source_base/timer.h" @@ -16,12 +16,7 @@ void ElecStateLCAO>::cal_tau(const psi::Psicharge->kin_r[is], this->charge->nrxx); } -#ifdef __OLD_GINT - Gint_inout inout1(this->charge->kin_r, Gint_Tools::job_type::tau, PARAM.inp.nspin); - this->gint_k->cal_gint(&inout1); -#else ModuleGint::cal_gint_tau(this->DM->get_DMR_vector(), PARAM.inp.nspin, this->charge->kin_r); -#endif ModuleBase::timer::tick("ElecStateLCAO", "cal_tau"); return; } @@ -36,12 +31,7 @@ void ElecStateLCAO::cal_tau(const psi::Psi& psi) { ModuleBase::GlobalFunc::ZEROS(this->charge->kin_r[is], this->charge->nrxx); } -#ifdef __OLD_GINT - Gint_inout inout1(this->charge->kin_r, Gint_Tools::job_type::tau, PARAM.inp.nspin); - this->gint_gamma->cal_gint(&inout1); -#else ModuleGint::cal_gint_tau(this->DM->get_DMR_vector(), PARAM.inp.nspin, this->charge->kin_r); -#endif ModuleBase::timer::tick("ElecStateLCAO", "cal_tau"); return; diff --git a/source/source_io/berryphase.cpp b/source/source_io/berryphase.cpp index 8d31edec91..c98913b35d 100644 --- a/source/source_io/berryphase.cpp +++ b/source/source_io/berryphase.cpp @@ -44,11 +44,10 @@ void berryphase::get_occupation_bands() void berryphase::lcao_init(const UnitCell& ucell, const Grid_Driver& gd, const K_Vectors& kv, - const Grid_Technique& grid_tech, const LCAO_Orbitals& orb) { ModuleBase::TITLE("berryphase", "lcao_init"); - lcao_method.init(ucell,grid_tech, kv.get_nkstot(), orb); + lcao_method.init(ucell, kv.get_nkstot(), orb); lcao_method.cal_R_number(ucell, gd); lcao_method.cal_orb_overlap(ucell); return; diff --git a/source/source_io/berryphase.h b/source/source_io/berryphase.h index 1685ccdefd..a040fe758c 100644 --- a/source/source_io/berryphase.h +++ b/source/source_io/berryphase.h @@ -39,7 +39,6 @@ class berryphase void lcao_init(const UnitCell& ucell, const Grid_Driver& gd, const K_Vectors& kv, - const Grid_Technique& grid_tech, const LCAO_Orbitals& orb); #endif void set_kpoints(const K_Vectors& kv, const int direction); diff --git a/source/source_io/cal_ldos.cpp b/source/source_io/cal_ldos.cpp index ec2f00bfc7..f12fc9e391 100644 --- a/source/source_io/cal_ldos.cpp +++ b/source/source_io/cal_ldos.cpp @@ -3,7 +3,7 @@ #include "cal_dos.h" #include "cube_io.h" #include "source_estate/module_dm/cal_dm_psi.h" -#include "source_lcao/module_gint/temp_gint/gint_interface.h" +#include "source_lcao/module_gint/gint_interface.h" #include @@ -60,12 +60,7 @@ void Cal_ldos::cal_ldos_lcao(const elecstate::ElecStateLCAO* pelec, } // calculate ldos -#ifdef __OLD_GINT - ModuleBase::WARNING_QUIT("Cal_ldos::dm2ldos", - "do not support old grid integral, please recompile with __NEW_GINT"); -#else ModuleGint::cal_gint_rho(dm_ldos.get_DMR_vector(), PARAM.inp.nspin, ldos); -#endif // I'm not sure whether ldos should be output for each spin or not // ldos[0] += ldos[1] for nspin_dm == 2 diff --git a/source/source_io/ctrl_runner_lcao.cpp b/source/source_io/ctrl_runner_lcao.cpp index c6e2c0e447..9cf550830c 100644 --- a/source/source_io/ctrl_runner_lcao.cpp +++ b/source/source_io/ctrl_runner_lcao.cpp @@ -24,8 +24,6 @@ void ctrl_runner_lcao(UnitCell& ucell, // unitcell Charge &chr, // charge density hamilt::HamiltLCAO* p_hamilt, // hamiltonian TwoCenterBundle &two_center_bundle, // use two-center integration - Gint_Gamma &gg, // gint for Gamma-only - Gint_k &gk, // gint for multi k-points LCAO_Orbitals &orb, // LCAO orbitals ModulePW::PW_Basis* pw_rho, // charge density ModulePW::PW_Basis* pw_rhod, // dense charge density @@ -64,8 +62,6 @@ void ctrl_runner_lcao(UnitCell& ucell, // unitcell *pw_rhod, vloc, chr, - gg, - gk, kv, orb.cutoffs(), pelec->wg, @@ -89,8 +85,6 @@ void ctrl_runner_lcao(UnitCell& ucell, // unitcell *pw_rhod, vloc, chr, - gg, - gk, kv, orb.cutoffs(), gd @@ -118,8 +112,6 @@ void ctrl_runner_lcao(UnitCell& ucell, // unitcell *pw_rhod, vloc, chr, - gg, - gk, kv, pelec->wg, gd, @@ -150,8 +142,6 @@ template void ModuleIO::ctrl_runner_lcao(UnitCell& ucell, / Charge &chr, // charge density hamilt::HamiltLCAO* p_hamilt, // hamiltonian TwoCenterBundle &two_center_bundle, // use two-center integration - Gint_Gamma &gg, // gint for Gamma-only - Gint_k &gk, // gint for multi k-points LCAO_Orbitals &orb, // LCAO orbitals ModulePW::PW_Basis* pw_rho, // charge density ModulePW::PW_Basis* pw_rhod, // dense charge density @@ -172,8 +162,6 @@ template void ctrl_runner_lcao, double>(UnitCell& ucell, Charge &chr, // charge density hamilt::HamiltLCAO, double>* p_hamilt, // hamiltonian TwoCenterBundle &two_center_bundle, // use two-center integration - Gint_Gamma &gg, // gint for Gamma-only - Gint_k &gk, // gint for multi k-points LCAO_Orbitals &orb, // LCAO orbitals ModulePW::PW_Basis* pw_rho, // charge density ModulePW::PW_Basis* pw_rhod, // dense charge density @@ -194,8 +182,6 @@ template void ctrl_runner_lcao, std::complex>(UnitC Charge &chr, // charge density hamilt::HamiltLCAO, std::complex>* p_hamilt, // hamiltonian TwoCenterBundle &two_center_bundle, // use two-center integration - Gint_Gamma &gg, // gint for Gamma-only - Gint_k &gk, // gint for multi k-points LCAO_Orbitals &orb, // LCAO orbitals ModulePW::PW_Basis* pw_rho, // charge density ModulePW::PW_Basis* pw_rhod, // dense charge density diff --git a/source/source_io/ctrl_runner_lcao.h b/source/source_io/ctrl_runner_lcao.h index 34eae3d26a..2b57c1800f 100644 --- a/source/source_io/ctrl_runner_lcao.h +++ b/source/source_io/ctrl_runner_lcao.h @@ -7,7 +7,6 @@ #include "source_psi/psi.h" // use Psi #include "source_lcao/hamilt_lcao.h" // use hamilt::HamiltLCAO #include "source_basis/module_nao/two_center_bundle.h" // use TwoCenterBundle -#include "source_lcao/module_gint/gint_k.h" // use Gint_k #include "source_lcao/setup_exx.h" // for exx, mohan add 20251018 namespace ModuleIO @@ -25,8 +24,6 @@ void ctrl_runner_lcao(UnitCell& ucell, // unitcell Charge &chr, // charge density hamilt::HamiltLCAO* p_hamilt, // hamiltonian TwoCenterBundle &two_center_bundle, // use two-center integration - Gint_Gamma &gg, // gint for Gamma-only - Gint_k &gk, // gint for multi k-points LCAO_Orbitals &orb, // LCAO orbitals ModulePW::PW_Basis* pw_rho, // charge density ModulePW::PW_Basis* pw_rhod, // dense charge density diff --git a/source/source_io/ctrl_scf_lcao.cpp b/source/source_io/ctrl_scf_lcao.cpp index 7ab02002c2..9fd14afbd6 100644 --- a/source/source_io/ctrl_scf_lcao.cpp +++ b/source/source_io/ctrl_scf_lcao.cpp @@ -41,11 +41,9 @@ void ModuleIO::ctrl_scf_lcao(UnitCell& ucell, psi::Psi* psi, hamilt::HamiltLCAO* p_hamilt, TwoCenterBundle &two_center_bundle, - Gint_k &gk, LCAO_Orbitals &orb, const ModulePW::PW_Basis_K* pw_wfc, // for berryphase const ModulePW::PW_Basis* pw_rho, // for berryphase - Grid_Technique >, // for berryphase const ModulePW::PW_Basis_Big* pw_big, // for Wannier90 const Structure_Factor& sf, // for Wannier90 rdmft::RDMFT &rdmft_solver, // for RDMFT @@ -219,7 +217,6 @@ void ModuleIO::ctrl_scf_lcao(UnitCell& ucell, istep, pelec->pot->get_effective_v(), pv, - gk, two_center_bundle, orb, ucell, @@ -329,7 +326,7 @@ void ModuleIO::ctrl_scf_lcao(UnitCell& ucell, { std::cout << FmtCore::format("\n * * * * * *\n << Start %s.\n", "Berry phase calculation"); berryphase bp(&pv); - bp.lcao_init(ucell, gd, kv, gt, orb); + bp.lcao_init(ucell, gd, kv, orb); // additional step before calling macroscopic_polarization bp.Macroscopic_polarization(ucell, pw_wfc->npwk_max, psi, pw_rho, pw_wfc, kv); std::cout << FmtCore::format(" >> Finish %s.\n * * * * * *\n", "Berry phase calculation"); @@ -471,11 +468,9 @@ template void ModuleIO::ctrl_scf_lcao( psi::Psi* psi, hamilt::HamiltLCAO* p_hamilt, TwoCenterBundle &two_center_bundle, - Gint_k &gk, LCAO_Orbitals &orb, const ModulePW::PW_Basis_K* pw_wfc, // for berryphase const ModulePW::PW_Basis* pw_rho, // for berryphase - Grid_Technique >, // for berryphase const ModulePW::PW_Basis_Big* pw_big, // for Wannier90 const Structure_Factor& sf, // for Wannier90 rdmft::RDMFT &rdmft_solver, // for RDMFT @@ -496,11 +491,9 @@ template void ModuleIO::ctrl_scf_lcao, double>( psi::Psi>* psi, hamilt::HamiltLCAO, double>* p_hamilt, TwoCenterBundle &two_center_bundle, - Gint_k &gk, LCAO_Orbitals &orb, const ModulePW::PW_Basis_K* pw_wfc, // for berryphase const ModulePW::PW_Basis* pw_rho, // for berryphase - Grid_Technique >, // for berryphase const ModulePW::PW_Basis_Big* pw_big, // for Wannier90 const Structure_Factor& sf, // for Wannier90 rdmft::RDMFT, double> &rdmft_solver, // for RDMFT @@ -520,11 +513,9 @@ template void ModuleIO::ctrl_scf_lcao, std::complex psi::Psi>* psi, hamilt::HamiltLCAO, std::complex>* p_hamilt, TwoCenterBundle &two_center_bundle, - Gint_k &gk, LCAO_Orbitals &orb, const ModulePW::PW_Basis_K* pw_wfc, // for berryphase const ModulePW::PW_Basis* pw_rho, // for berryphase - Grid_Technique >, // for berryphase const ModulePW::PW_Basis_Big* pw_big, // for Wannier90 const Structure_Factor& sf, // for Wannier90 rdmft::RDMFT, std::complex> &rdmft_solver, // for RDMFT diff --git a/source/source_io/ctrl_scf_lcao.h b/source/source_io/ctrl_scf_lcao.h index ee1dcfdada..98ee5e18a5 100644 --- a/source/source_io/ctrl_scf_lcao.h +++ b/source/source_io/ctrl_scf_lcao.h @@ -9,7 +9,6 @@ #include "source_psi/psi.h" // use Psi #include "source_lcao/hamilt_lcao.h" // use hamilt::HamiltLCAO #include "source_basis/module_nao/two_center_bundle.h" // use TwoCenterBundle -#include "source_lcao/module_gint/gint_k.h" // use Gint_k #include "source_basis/module_pw/pw_basis_k.h" // use ModulePW::PW_Basis_K and ModulePW::PW_Basis #include "source_pw/module_pwdft/structure_factor.h" // use Structure_Factor #include "source_lcao/module_rdmft/rdmft.h" // use RDMFT codes @@ -30,11 +29,9 @@ namespace ModuleIO psi::Psi* psi, hamilt::HamiltLCAO* p_hamilt, TwoCenterBundle &two_center_bundle, - Gint_k &gk, LCAO_Orbitals &orb, const ModulePW::PW_Basis_K* pw_wfc, // for berryphase const ModulePW::PW_Basis* pw_rho, // for berryphase - Grid_Technique >, // for berryphase const ModulePW::PW_Basis_Big* pw_big, // for Wannier90 const Structure_Factor& sf, // for Wannier90 rdmft::RDMFT &rdmft_solver, // for RDMFT diff --git a/source/source_io/get_pchg_lcao.cpp b/source/source_io/get_pchg_lcao.cpp index e293a51312..346b56f12a 100644 --- a/source/source_io/get_pchg_lcao.cpp +++ b/source/source_io/get_pchg_lcao.cpp @@ -3,7 +3,7 @@ #include "source_io/cube_io.h" #include "source_estate/module_charge/symmetry_rho.h" #include "source_estate/module_dm/cal_dm_psi.h" -#include "source_lcao/module_gint/temp_gint/gint_interface.h" +#include "source_lcao/module_gint/gint_interface.h" Get_pchg_lcao::Get_pchg_lcao(psi::Psi* psi_gamma_in, const Parallel_Orbitals* ParaV_in) : psi_gamma(psi_gamma_in), ParaV(ParaV_in) @@ -20,8 +20,7 @@ Get_pchg_lcao::~Get_pchg_lcao() } // For gamma_only -void Get_pchg_lcao::begin(Gint_Gamma& gg, - double** rho, +void Get_pchg_lcao::begin(double** rho, const ModuleBase::matrix& wg, const std::vector& ef_all_spin, const int rhopw_nrxx, @@ -70,14 +69,7 @@ void Get_pchg_lcao::begin(Gint_Gamma& gg, DM.init_DMR(GridD_in, ucell_in); DM.cal_DMR(); -#ifdef __OLD_GINT - gg.initialize_pvpR(*ucell_in, GridD_in, nspin); - gg.transfer_DM2DtoGrid(DM.get_DMR_vector()); - Gint_inout inout(rho, Gint_Tools::job_type::rho, nspin); - gg.cal_gint(&inout); -#else ModuleGint::cal_gint_rho(DM.get_DMR_vector(), nspin, rho); -#endif // A solution to replace the original implementation of the following code: // pelec->charge->save_rho_before_sum_band(); @@ -109,8 +101,7 @@ void Get_pchg_lcao::begin(Gint_Gamma& gg, } // For multi-k -void Get_pchg_lcao::begin(Gint_k& gk, - double** rho, +void Get_pchg_lcao::begin(double** rho, std::complex** rhog, const ModuleBase::matrix& wg, const std::vector& ef_all_spin, @@ -169,14 +160,7 @@ void Get_pchg_lcao::begin(Gint_k& gk, DM.init_DMR(GridD_in, ucell_in); DM.cal_DMR(ik); -#ifdef __OLD_GINT - gk.initialize_pvpR(*ucell_in, GridD_in, nspin); - gk.transfer_DM2DtoGrid(DM.get_DMR_vector()); - Gint_inout inout(rho, Gint_Tools::job_type::rho, nspin); - gk.cal_gint(&inout); -#else ModuleGint::cal_gint_rho(DM.get_DMR_vector(), nspin, rho); -#endif // Using std::vector to replace the original double** rho_save @@ -216,14 +200,7 @@ void Get_pchg_lcao::begin(Gint_k& gk, DM.init_DMR(GridD_in, ucell_in); DM.cal_DMR(); -#ifdef __OLD_GINT - gk.initialize_pvpR(*ucell_in, GridD_in, nspin); - gk.transfer_DM2DtoGrid(DM.get_DMR_vector()); - Gint_inout inout(rho, Gint_Tools::job_type::rho, nspin); - gk.cal_gint(&inout); -#else ModuleGint::cal_gint_rho(DM.get_DMR_vector(), nspin, rho); -#endif // Using std::vector to replace the original double** rho_save std::vector> rho_save(nspin, std::vector(rhopw_nrxx)); diff --git a/source/source_io/get_pchg_lcao.h b/source/source_io/get_pchg_lcao.h index 1c34219ade..130637c775 100644 --- a/source/source_io/get_pchg_lcao.h +++ b/source/source_io/get_pchg_lcao.h @@ -1,8 +1,6 @@ #ifndef GET_PCHG_LCAO_H #define GET_PCHG_LCAO_H -#include "source_lcao/module_gint/gint_gamma.h" -#include "source_lcao/module_gint/gint_k.h" #include "source_cell/klist.h" #include "source_estate/module_dm/density_matrix.h" @@ -22,8 +20,7 @@ class Get_pchg_lcao ~Get_pchg_lcao(); // For gamma_only - void begin(Gint_Gamma& gg, - double** rho, + void begin(double** rho, const ModuleBase::matrix& wg, const std::vector& ef_all_spin, const int rhopw_nrxx, @@ -39,8 +36,7 @@ class Get_pchg_lcao std::ofstream& ofs_running); // For multi-k - void begin(Gint_k& gk, - double** rho, + void begin(double** rho, std::complex** rhog, const ModuleBase::matrix& wg, const std::vector& ef_all_spin, diff --git a/source/source_io/get_wf_lcao.cpp b/source/source_io/get_wf_lcao.cpp index 03e40ae21a..7d6cd0d15c 100644 --- a/source/source_io/get_wf_lcao.cpp +++ b/source/source_io/get_wf_lcao.cpp @@ -4,10 +4,8 @@ #include "source_io/write_wfc_pw.h" #include "source_base/memory.h" -#ifndef __OLD_GINT -#include "source_lcao/module_gint/temp_gint/gint_env_gamma.h" -#include "source_lcao/module_gint/temp_gint/gint_env_k.h" -#endif +#include "source_lcao/module_gint/gint_env_gamma.h" +#include "source_lcao/module_gint/gint_env_k.h" Get_wf_lcao::Get_wf_lcao(const elecstate::ElecState* pes) { @@ -24,7 +22,6 @@ void Get_wf_lcao::begin(const UnitCell& ucell, const ModulePW::PW_Basis_K* pw_wfc, const Parallel_Grid& pgrid, const Parallel_Orbitals& para_orb, - Gint_Gamma& gg, const int& out_wfc_pw, const K_Vectors& kv, const double nelec, @@ -45,31 +42,12 @@ void Get_wf_lcao::begin(const UnitCell& ucell, prepare_get_wf(ofs_running); -#ifdef __OLD_GINT - // allocate grid wave functions for gamma_only - std::vector wfc_gamma_grid(nspin); - for (int is = 0; is < nspin; ++is) - { - wfc_gamma_grid[is] = new double*[nbands]; - for (int ib = 0; ib < nbands; ++ib) - { - wfc_gamma_grid[is][ib] = new double[gg.gridt->lgd]; - } - } -#endif - // for pw_wfc in G space psi::Psi> psi_g; // if (out_wfc_pw || out_wfc_r) psi_g.resize(nspin, nbands, kv.ngk[0]); -#ifdef __OLD_GINT - const double mem_size = sizeof(double) * double(gg.gridt->lgd) * double(nbands) * double(nspin) / 1024.0 / 1024.0; - ModuleBase::Memory::record("Get_wf_lcao::begin", mem_size); - ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running, "On-the-fly memory consumption (MB)", mem_size); -#endif - // Set this->bands_picked_ this->select_bands(out_wfc_norm, nbands, fermi_band); @@ -77,35 +55,12 @@ void Get_wf_lcao::begin(const UnitCell& ucell, for (int is = 0; is < nspin; ++is) { psid->fix_k(is); -#ifdef __OLD_GINT - #ifdef __MPI - wfc_2d_to_grid(psid->get_pointer(), para_orb, wfc_gamma_grid[is], gg.gridt->trace_lo); - #else - // if not MPI enabled, it is the case psid holds a global matrix. - // use fix_k to switch between different spin channels (actually kpoints, - // because now the same kpoint in different spin channels are treated - // as distinct kpoints) - for (int i = 0; i < nbands; ++i) - { - for (int j = 0; j < nlocal; ++j) - { - wfc_gamma_grid[is][i][j] = psid[0](i, j); - } - } - #endif -#else ModuleGint::Gint_env_gamma gint_env(psid->get_pointer(), ¶_orb, nbands, nlocal, pes_->charge->rho[is]); -#endif for (int ib = 0; ib < nbands; ++ib) { if (bands_picked_[ib]) { - #ifdef __OLD_GINT - ModuleBase::GlobalFunc::ZEROS(pes_->charge->rho[is], pw_wfc->nrxx); - gg.cal_env(wfc_gamma_grid[is][ib], pes_->charge->rho[is], ucell); - #else gint_env.cal_env_band(ib); - #endif pes_->charge->save_rho_before_sum_band(); // pint out information @@ -140,34 +95,12 @@ void Get_wf_lcao::begin(const UnitCell& ucell, for (int is = 0; is < nspin; ++is) { psid->fix_k(is); -#ifdef __OLD_GINT - #ifdef __MPI - wfc_2d_to_grid(psid->get_pointer(), para_orb, wfc_gamma_grid[is], gg.gridt->trace_lo); - #else - // if not MPI enabled, it is the case psid holds a global matrix. use fix_k to switch between - // different spin channels (actually kpoints, because now the same kpoint in different spin channels - // are treated as distinct kpoints) - for (int i = 0; i < nbands; ++i) - { - for (int j = 0; j < nlocal; ++j) - { - wfc_gamma_grid[is][i][j] = psid[0](i, j); - } - } - #endif -#else ModuleGint::Gint_env_gamma gint_env(psid->get_pointer(), ¶_orb, nbands, nlocal, pes_->charge->rho[is]); -#endif for (int ib = 0; ib < nbands; ++ib) { if (bands_picked_[ib]) { -#ifdef __OLD_GINT - ModuleBase::GlobalFunc::ZEROS(pes_->charge->rho[is], pw_wfc->nrxx); - gg.cal_env(wfc_gamma_grid[is][ib], pes_->charge->rho[is], ucell); -#else gint_env.cal_env_band(ib); -#endif pes_->charge->save_rho_before_sum_band(); const double ef_tmp = this->pes_->eferm.get_efval(is); @@ -222,16 +155,6 @@ void Get_wf_lcao::begin(const UnitCell& ucell, pw_wfc, ofs_running); -#ifdef __OLD_GINT - for (int is = 0; is < nspin; ++is) - { - for (int ib = 0; ib < nbands; ++ib) - { - delete[] wfc_gamma_grid[is][ib]; - } - delete[] wfc_gamma_grid[is]; - } -#endif return; } @@ -241,7 +164,6 @@ void Get_wf_lcao::begin(const UnitCell& ucell, const ModulePW::PW_Basis_K* pw_wfc, const Parallel_Grid& pgrid, const Parallel_Orbitals& para_orb, - Gint_k& gk, const int& out_wfc_pw, const K_Vectors& kv, const double nelec, @@ -262,21 +184,6 @@ void Get_wf_lcao::begin(const UnitCell& ucell, // allocate grid wave functions for multi-k const int nks = kv.get_nks(); std::vector**> wfc_k_grid(nks); -#ifdef __OLD_GINT - for (int ik = 0; ik < nks; ++ik) - { - wfc_k_grid[ik] = new std::complex*[nbands]; - for (int ib = 0; ib < nbands; ++ib) - { - wfc_k_grid[ik][ib] = new std::complex[gk.gridt->lgd]; - } - } - - const double mem_size - = sizeof(std::complex) * double(gk.gridt->lgd) * double(nbands) * double(nks) / 1024.0 / 1024.0; - ModuleBase::Memory::record("Get_wf_lcao::begin", mem_size); - ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running, "On-the-fly memory consumption (MB)", mem_size); -#endif // for pw_wfc in G space psi::Psi> psi_g; @@ -295,36 +202,14 @@ void Get_wf_lcao::begin(const UnitCell& ucell, // 2d-to-grid conversion is unified into `wfc_2d_to_grid`. psi->fix_k(ik); -#ifdef __OLD_GINT - #ifdef __MPI // need to deal with NSPIN=4 !!!! - wfc_2d_to_grid(psi->get_pointer(), para_orb, wfc_k_grid[ik], gk.gridt->trace_lo); - #else - for (int i = 0; i < nbands; ++i) - { - for (int j = 0; j < nlocal; ++j) - { - wfc_k_grid[ik][i][j] = psi[0](i, j); - } - } - #endif -#else ModuleGint::Gint_env_k gint_env(psi->get_pointer(), ¶_orb, kv.kvec_c, kv.kvec_d, nbands, nlocal, ik, PARAM.inp.nspin, PARAM.globalv.npol, pes_->charge->rho[ispin]); -#endif for (int ib = 0; ib < nbands; ++ib) { if (bands_picked_[ib]) { -#ifdef __OLD_GINT - ModuleBase::GlobalFunc::ZEROS(pes_->charge->rho[ispin], - pw_wfc->nrxx); // terrible, you make changes on another instance's data??? - - // deal with NSPIN=4 - gk.cal_env_k(ik, wfc_k_grid[ik][ib], pes_->charge->rho[ispin], kv.kvec_c, kv.kvec_d, ucell); -#else gint_env.cal_env_band(ib); -#endif // ik0 is the real k-point index, starting from 0 int ik0 = kv.ik2iktot[ik]; @@ -440,16 +325,6 @@ void Get_wf_lcao::begin(const UnitCell& ucell, } } } -#ifdef __OLD_GINT - for (int ik = 0; ik < nks; ++ik) - { - for (int ib = 0; ib < nbands; ++ib) - { - delete[] wfc_k_grid[ik][ib]; - } - delete[] wfc_k_grid[ik]; - } -#endif return; } diff --git a/source/source_io/get_wf_lcao.h b/source/source_io/get_wf_lcao.h index b182e352bd..94afb2cb64 100644 --- a/source/source_io/get_wf_lcao.h +++ b/source/source_io/get_wf_lcao.h @@ -1,9 +1,8 @@ #ifndef GET_WF_LCAO_H #define GET_WF_LCAO_H -#include "source_lcao/module_gint/gint_gamma.h" -#include "source_lcao/module_gint/gint_k.h" #include "source_estate/elecstate.h" +#include "source_basis/module_ao/parallel_orbitals.h" class Get_wf_lcao { @@ -17,7 +16,6 @@ class Get_wf_lcao const ModulePW::PW_Basis_K* pw_wfc, const Parallel_Grid& pgrid, const Parallel_Orbitals& para_orb, - Gint_Gamma& gg, const int& out_wfc_pw, const K_Vectors& kv, const double nelec, @@ -29,34 +27,12 @@ class Get_wf_lcao const std::string& global_out_dir, std::ofstream& ofs_running); - /// tmp, delete after Gint is refactored. - void begin(const UnitCell& ucell, - const psi::Psi* psid, - const ModulePW::PW_Basis_K* pw_wfc, - const Parallel_Grid& pgrid, - const Parallel_Orbitals& para_orb, - Gint_k& gg, - const int& out_wfc_pw, - const K_Vectors& kv, - const double nelec, - const std::vector& out_wfc_norm, - const std::vector& out_wfc_re_im, - const int nbands, - const int nspin, - const int nlocal, - const std::string& global_out_dir, - std::ofstream& ofs_running) - { - throw std::logic_error("gint_k should use with complex psi."); - }; - /// For multi-k void begin(const UnitCell& ucell, const psi::Psi>* psi, const ModulePW::PW_Basis_K* pw_wfc, const Parallel_Grid& pgrid, const Parallel_Orbitals& para_orb, - Gint_k& gk, const int& out_wfc_pw, const K_Vectors& kv, const double nelec, @@ -68,27 +44,6 @@ class Get_wf_lcao const std::string& global_out_dir, std::ofstream& ofs_running); - /// tmp, delete after Gint is refactored. - void begin(const UnitCell& ucell, - const psi::Psi>* psi, - const ModulePW::PW_Basis_K* pw_wfc, - const Parallel_Grid& pgrid, - const Parallel_Orbitals& para_orb, - Gint_Gamma& gk, - const int& out_wfc_pw, - const K_Vectors& kv, - const double nelec, - const std::vector& out_wfc_norm, - const std::vector& out_wfc_re_im, - const int nbands, - const int nspin, - const int nlocal, - const std::string& global_out_dir, - std::ofstream& ofs_running) - { - throw std::logic_error("gint_gamma should use with real psi."); - }; - private: void prepare_get_wf(std::ofstream& ofs_running); diff --git a/source/source_io/output_mat_sparse.cpp b/source/source_io/output_mat_sparse.cpp index 12d65edd61..7381c61391 100644 --- a/source/source_io/output_mat_sparse.cpp +++ b/source/source_io/output_mat_sparse.cpp @@ -15,7 +15,6 @@ void output_mat_sparse(const bool& out_mat_hsR, const int& istep, const ModuleBase::matrix& v_eff, const Parallel_Orbitals& pv, - Gint_k& gint_k, const TwoCenterBundle& two_center_bundle, const LCAO_Orbitals& orb, UnitCell& ucell, @@ -34,7 +33,6 @@ void output_mat_sparse(const bool& out_mat_hsR, const int& istep, const ModuleBase::matrix& v_eff, const Parallel_Orbitals& pv, - Gint_k& gint_k, const TwoCenterBundle& two_center_bundle, const LCAO_Orbitals& orb, UnitCell& ucell, @@ -61,7 +59,6 @@ void output_mat_sparse(const bool& out_mat_hsR, { output_dHR(istep, v_eff, - gint_k, // mohan add 2024-04-01 ucell, pv, HS_Arrays, diff --git a/source/source_io/output_mat_sparse.h b/source/source_io/output_mat_sparse.h index 065f510214..bce47f7fb0 100644 --- a/source/source_io/output_mat_sparse.h +++ b/source/source_io/output_mat_sparse.h @@ -5,8 +5,7 @@ #include "source_basis/module_nao/two_center_bundle.h" #include "source_cell/klist.h" #include "source_hamilt/hamilt.h" -#include "source_lcao/module_gint/gint_k.h" - +#include "source_cell/module_neighbor/sltk_grid_driver.h" namespace ModuleIO { /// @brief the output interface to write the sparse matrix of H, S, T, and r @@ -19,7 +18,6 @@ void output_mat_sparse(const bool& out_mat_hsR, const int& istep, const ModuleBase::matrix& v_eff, const Parallel_Orbitals& pv, - Gint_k& gint_k, // mohan add 2024-04-01 const TwoCenterBundle& two_center_bundle, const LCAO_Orbitals& orb, UnitCell& ucell, diff --git a/source/source_io/to_wannier90_lcao.h b/source/source_io/to_wannier90_lcao.h index 50560464e9..fa75293d9b 100644 --- a/source/source_io/to_wannier90_lcao.h +++ b/source/source_io/to_wannier90_lcao.h @@ -39,7 +39,6 @@ #include "fR_overlap.h" #include "source_base/abfs-vector3_order.h" #include "source_base/math_lebedev_laikov.h" -#include "source_lcao/module_gint/grid_technique.h" #include "source_lcao/module_hcontainer/hcontainer.h" class Coordinate_3D diff --git a/source/source_io/to_wannier90_lcao_in_pw.h b/source/source_io/to_wannier90_lcao_in_pw.h index d7a728a209..cf6d5fc915 100644 --- a/source/source_io/to_wannier90_lcao_in_pw.h +++ b/source/source_io/to_wannier90_lcao_in_pw.h @@ -30,7 +30,6 @@ #ifdef __LCAO #include "source_basis/module_ao/parallel_orbitals.h" -#include "source_lcao/module_gint/grid_technique.h" #include "source_psi/psi_initializer.h" class toWannier90_LCAO_IN_PW : public toWannier90_PW diff --git a/source/source_io/unk_overlap_lcao.cpp b/source/source_io/unk_overlap_lcao.cpp index dbd734f7e2..a352995a79 100644 --- a/source/source_io/unk_overlap_lcao.cpp +++ b/source/source_io/unk_overlap_lcao.cpp @@ -25,7 +25,6 @@ unkOverlap_lcao::~unkOverlap_lcao() } void unkOverlap_lcao::init(const UnitCell& ucell, - const Grid_Technique& gt, const int nkstot, const LCAO_Orbitals& orb) { diff --git a/source/source_io/unk_overlap_lcao.h b/source/source_io/unk_overlap_lcao.h index a867a4b0c6..7abc37d337 100644 --- a/source/source_io/unk_overlap_lcao.h +++ b/source/source_io/unk_overlap_lcao.h @@ -12,7 +12,7 @@ #include "source_lcao/center2_orb-orb11.h" #include "source_lcao/center2_orb-orb21.h" #include "source_lcao/center2_orb.h" -#include "source_lcao/module_gint/grid_technique.h" +#include "source_cell/module_neighbor/sltk_grid_driver.h" #include #include @@ -48,7 +48,7 @@ class unkOverlap_lcao unkOverlap_lcao(); ~unkOverlap_lcao(); - void init(const UnitCell& ucell, const Grid_Technique& gt, const int nkstot, const LCAO_Orbitals& orb); + void init(const UnitCell& ucell, const int nkstot, const LCAO_Orbitals& orb); int iw2it(const UnitCell& ucell, int iw); int iw2ia(const UnitCell& ucell, int iw); int iw2iL(const UnitCell& ucell, int iw); diff --git a/source/source_io/write_HS_R.cpp b/source/source_io/write_HS_R.cpp index f88a8476f2..b98b74ef80 100644 --- a/source/source_io/write_HS_R.cpp +++ b/source/source_io/write_HS_R.cpp @@ -126,7 +126,6 @@ void ModuleIO::output_dSR(const int& istep, void ModuleIO::output_dHR(const int& istep, const ModuleBase::matrix& v_eff, - Gint_k& gint_k, // mohan add 2024-04-01 const UnitCell& ucell, const Parallel_Orbitals& pv, LCAO_HS_Arrays& HS_Arrays, @@ -161,8 +160,7 @@ void ModuleIO::output_dHR(const int& istep, orb, cspin, sparse_thr, - v_eff, - gint_k); + v_eff); } else if (nspin == 2) { @@ -176,8 +174,7 @@ void ModuleIO::output_dHR(const int& istep, orb, cspin, sparse_thr, - v_eff, - gint_k); + v_eff); } } // mohan update 2024-04-01 diff --git a/source/source_io/write_HS_R.h b/source/source_io/write_HS_R.h index bf95c2d648..2f831d2baa 100644 --- a/source/source_io/write_HS_R.h +++ b/source/source_io/write_HS_R.h @@ -5,8 +5,8 @@ #include "source_basis/module_nao/two_center_bundle.h" #include "source_cell/klist.h" #include "source_hamilt/hamilt.h" -#include "source_lcao/module_gint/gint_k.h" #include "source_pw/module_pwdft/global.h" +#include "source_lcao/LCAO_HS_arrays.hpp" namespace ModuleIO { @@ -31,7 +31,6 @@ namespace ModuleIO void output_dHR(const int& istep, const ModuleBase::matrix& v_eff, - Gint_k& gint_k, // mohan add 2024-04-01 const UnitCell& ucell, const Parallel_Orbitals& pv, LCAO_HS_Arrays& HS_Arrays, diff --git a/source/source_io/write_eband_terms.hpp b/source/source_io/write_eband_terms.hpp index 0aa48770e0..701b097b25 100644 --- a/source/source_io/write_eband_terms.hpp +++ b/source/source_io/write_eband_terms.hpp @@ -21,8 +21,6 @@ void write_eband_terms(const int nspin, const ModulePW::PW_Basis& rhod_basis, const ModuleBase::matrix& vloc, const Charge& chg, - Gint_Gamma& gint_gamma, // mohan add 2024-04-01 - Gint_k& gint_k, // mohan add 2024-04-01 const K_Vectors& kv, const ModuleBase::matrix& wg, Grid_Driver& gd, @@ -45,10 +43,6 @@ void write_eband_terms(const int nspin, set_para2d_MO(*pv, nbands, p2d); - typename TGint::type* gint = nullptr; - - set_gint_pointer(gint_gamma, gint_k, gint); - auto if_gamma_fix = [](hamilt::HContainer& hR) { if (std::is_same::value) @@ -110,7 +104,7 @@ void write_eband_terms(const int nspin, if_gamma_fix(v_pp_local_R_ao); std::vector> e_orb_pp_local; - hamilt::Veff> v_pp_local_op(gint, + hamilt::Veff> v_pp_local_op( &v_pp_local_k_ao, kv.kvec_d, &pot_local, @@ -167,7 +161,7 @@ void write_eband_terms(const int nspin, std::vector>*> v_hartree_op(nspin0); for (int is = 0; is < nspin0; ++is) { - v_hartree_op[is] = new hamilt::Veff>(gint, + v_hartree_op[is] = new hamilt::Veff>( &v_hartree_k_ao, kv.kvec_d, &pot_hartree, &v_hartree_R_ao[is], &ucell, orb_cutoff, &gd, nspin); v_hartree_op[is]->contributeHR(); } @@ -199,8 +193,6 @@ void write_eband_terms(const int nspin, rhod_basis, vloc, chg, - gint_gamma, - gint_k, kv, orb_cutoff, wg, diff --git a/source/source_io/write_vxc.hpp b/source/source_io/write_vxc.hpp index 43fd803bb7..ad503265c0 100644 --- a/source/source_io/write_vxc.hpp +++ b/source/source_io/write_vxc.hpp @@ -10,24 +10,6 @@ #include "source_io/write_HS.h" #include "source_io/filename.h" // use filename_output function -#ifndef TGINT_H -#define TGINT_H -template -struct TGint; - -template <> -struct TGint -{ - using type = Gint_Gamma; -}; - -template <> -struct TGint> -{ - using type = Gint_k; -}; -#endif - namespace ModuleIO { @@ -125,29 +107,6 @@ std::vector orbital_energy(const int ik, const int nbands, const std::ve return e; } -#ifndef SET_GINT_POINTER_H -#define SET_GINT_POINTER_H -// mohan update 2024-04-01 -template -void set_gint_pointer(Gint_Gamma& gint_gamma, Gint_k& gint_k, typename TGint::type*& gint); - -// mohan update 2024-04-01 -template <> -void set_gint_pointer(Gint_Gamma& gint_gamma, Gint_k& gint_k, typename TGint::type*& gint) -{ - gint = &gint_gamma; -} - -// mohan update 2024-04-01 -template <> -void set_gint_pointer>(Gint_Gamma& gint_gamma, - Gint_k& gint_k, - typename TGint>::type*& gint) -{ - gint = &gint_k; -} -#endif - inline void write_orb_energy(const K_Vectors& kv, const int nspin0, const int nbands, const std::vector>& e_orb, @@ -187,8 +146,6 @@ void write_Vxc(const int nspin, const ModulePW::PW_Basis& rhod_basis, const ModuleBase::matrix& vloc, const Charge& chg, - Gint_Gamma& gint_gamma, // mohan add 2024-04-01 - Gint_k& gint_k, // mohan add 2024-04-01 const K_Vectors& kv, const std::vector& orb_cutoff, const ModuleBase::matrix& wg, @@ -227,14 +184,11 @@ void write_Vxc(const int nspin, // 3. allocate operators and contribute HR // op (corresponding to hR) - typename TGint::type* gint = nullptr; - - set_gint_pointer(gint_gamma, gint_k, gint); std::vector>*> vxcs_op_ao(nspin0); for (int is = 0; is < nspin0; ++is) { - vxcs_op_ao[is] = new hamilt::Veff>(gint, + vxcs_op_ao[is] = new hamilt::Veff>( &vxc_k_ao, kv.kvec_d, potxc, &vxcs_R_ao[is], &ucell, orb_cutoff, &gd, nspin); vxcs_op_ao[is]->contributeHR(); diff --git a/source/source_io/write_vxc_r.hpp b/source/source_io/write_vxc_r.hpp index 183d032760..0adfddb299 100644 --- a/source/source_io/write_vxc_r.hpp +++ b/source/source_io/write_vxc_r.hpp @@ -10,47 +10,8 @@ #include "source_lcao/module_ri/RI_2D_Comm.h" #endif -#ifndef TGINT_H -#define TGINT_H -template -struct TGint; - -template <> -struct TGint -{ - using type = Gint_Gamma; -}; - -template <> -struct TGint> -{ - using type = Gint_k; -}; -#endif - namespace ModuleIO { - -#ifndef SET_GINT_POINTER_H -#define SET_GINT_POINTER_H -template -void set_gint_pointer(Gint_Gamma& gint_gamma, Gint_k& gint_k, typename TGint::type*& gint); - -template <> -void set_gint_pointer(Gint_Gamma& gint_gamma, Gint_k& gint_k, typename TGint::type*& gint) -{ - gint = &gint_gamma; -} - -template <> -void set_gint_pointer>(Gint_Gamma& gint_gamma, - Gint_k& gint_k, - typename TGint>::type*& gint) -{ - gint = &gint_k; -} -#endif - template std::set> get_R_range(const hamilt::HContainer& hR) { std::set> all_R_coor; @@ -97,8 +58,6 @@ void write_Vxc_R(const int nspin, const ModulePW::PW_Basis& rhod_basis, const ModuleBase::matrix& vloc, const Charge& chg, - Gint_Gamma& gint_gamma, - Gint_k& gint_k, const K_Vectors& kv, const std::vector& orb_cutoff, Grid_Driver& gd, @@ -144,12 +103,10 @@ const double sparse_thr=1e-10) // 3. calculate the Vxc(R) hamilt::HS_Matrix_K vxc_k_ao(pv, 1); // only hk is needed, sk is skipped - typename TGint::type* gint = nullptr; - set_gint_pointer(gint_gamma, gint_k, gint); std::vector>*> vxcs_op_ao(nspin0); for (int is = 0; is < nspin0; ++is) { - vxcs_op_ao[is] = new hamilt::Veff>(gint, + vxcs_op_ao[is] = new hamilt::Veff>( &vxc_k_ao, kv.kvec_d, potxc, &vxcs_R_ao[is], &ucell, orb_cutoff, &gd, nspin); vxcs_op_ao[is]->contributeHR(); #ifdef __EXX diff --git a/source/source_lcao/CMakeLists.txt b/source/source_lcao/CMakeLists.txt index 1831ac6522..118e877239 100644 --- a/source/source_lcao/CMakeLists.txt +++ b/source/source_lcao/CMakeLists.txt @@ -28,7 +28,6 @@ if(ENABLE_LCAO) FORCE_k.cpp stress_tools.cpp edm.cpp - grid_init.cpp spar_dh.cpp spar_exx.cpp spar_hsr.cpp diff --git a/source/source_lcao/FORCE.h b/source/source_lcao/FORCE.h index ee16afd8b0..5eba250181 100644 --- a/source/source_lcao/FORCE.h +++ b/source/source_lcao/FORCE.h @@ -13,7 +13,6 @@ #include "source_psi/psi.h" #include "source_lcao/setup_deepks.h" - template class Force_Stress_LCAO; diff --git a/source/source_lcao/LCAO_domain.h b/source/source_lcao/LCAO_domain.h index 77281f7efb..cf4af3ace9 100644 --- a/source/source_lcao/LCAO_domain.h +++ b/source/source_lcao/LCAO_domain.h @@ -9,9 +9,7 @@ #include "source_lcao/LCAO_HS_arrays.hpp" #include "source_lcao/force_stress_arrays.h" #include "source_lcao/module_deepks/LCAO_deepks.h" -#include "source_lcao/module_gint/gint_gamma.h" -#include "source_lcao/module_gint/gint_k.h" -#include "source_lcao/module_gint/grid_technique.h" +#include "source_basis/module_ao/parallel_orbitals.h" namespace LCAO_domain { @@ -35,17 +33,6 @@ void build_Nonlocal_mu_new(const Parallel_Orbitals& pv, const TwoCenterIntegrator& intor_orb_beta, const Grid_Driver* GridD); -/** - * @brief prepare gird integration - */ -void grid_prepare(const Grid_Technique& gt, - Gint_Gamma& gint_gamma, - Gint_k& gint_k, - const UnitCell& ucell, - const LCAO_Orbitals& orb, - const ModulePW::PW_Basis& rhopw, - const ModulePW::PW_Basis_Big& bigpw); - /** * @brief set the elements of force-related matrices in LCAO method */ diff --git a/source/source_lcao/LCAO_init_basis.cpp b/source/source_lcao/LCAO_init_basis.cpp index 7743a68f1f..f8b60b6298 100644 --- a/source/source_lcao/LCAO_init_basis.cpp +++ b/source/source_lcao/LCAO_init_basis.cpp @@ -1,6 +1,7 @@ #include "LCAO_domain.h" #include "source_io/module_parameter/parameter.h" +#include "source_base/parallel_comm.h" /// once the GlobalC::exx_info has been deleted, this include can be gone /// mohan note 2024-07-21 #ifdef __EXX diff --git a/source/source_lcao/grid_init.cpp b/source/source_lcao/grid_init.cpp deleted file mode 100644 index 517f39a273..0000000000 --- a/source/source_lcao/grid_init.cpp +++ /dev/null @@ -1,71 +0,0 @@ -#include "source_lcao/LCAO_domain.h" -#include "source_pw/module_pwdft/global.h" -#include "source_io/module_parameter/parameter.h" -#include "source_base/global_variable.h" -#include "source_base/parallel_reduce.h" -#include "source_base/timer.h" - -namespace LCAO_domain -{ - -//-------------------------------------------- -// prepare grid network for Gint(grid integral) -//-------------------------------------------- -void grid_prepare( - const Grid_Technique& gt, - Gint_Gamma &gint_gamma, - Gint_k &gint_k, - const UnitCell& ucell, - const LCAO_Orbitals& orb, - const ModulePW::PW_Basis& rhopw, - const ModulePW::PW_Basis_Big& bigpw) -{ - ModuleBase::TITLE("LCAO_domain","grid_prepare"); - ModuleBase::timer::tick("LCAO_domain","grid_prepare"); - if(PARAM.globalv.gamma_only_local) - { - gint_gamma.prep_grid( - gt, - bigpw.nbx, - bigpw.nby, - bigpw.nbzp, - bigpw.nbzp_start, - rhopw.nxyz, - bigpw.bx, - bigpw.by, - bigpw.bz, - bigpw.bxyz, - bigpw.nbxx, - rhopw.ny, - rhopw.nplane, - rhopw.startz_current, - &ucell, - &orb); - } - else // multiple k-points - { - // cal the grid integration of 'Vl' matrix for l-points algorithms. - gint_k.prep_grid( - gt, - bigpw.nbx, - bigpw.nby, - bigpw.nbzp, - bigpw.nbzp_start, - rhopw.nxyz, - bigpw.bx, - bigpw.by, - bigpw.bz, - bigpw.bxyz, - bigpw.nbxx, - rhopw.ny, - rhopw.nplane, - rhopw.startz_current, - &ucell, - &orb); - } - - ModuleBase::timer::tick("LCAO_domain","grid_prepare"); - return; -} - -} diff --git a/source/source_lcao/hamilt_lcao.cpp b/source/source_lcao/hamilt_lcao.cpp index 389b9812cf..920d66ac11 100644 --- a/source/source_lcao/hamilt_lcao.cpp +++ b/source/source_lcao/hamilt_lcao.cpp @@ -70,9 +70,7 @@ HamiltLCAO::HamiltLCAO(const UnitCell& ucell, } template -HamiltLCAO::HamiltLCAO(Gint_Gamma* GG_in, - Gint_k* GK_in, - const UnitCell& ucell, +HamiltLCAO::HamiltLCAO(const UnitCell& ucell, const Grid_Driver& grid_d, const Parallel_Orbitals* paraV, elecstate::Potential* pot_in, @@ -186,8 +184,7 @@ HamiltLCAO::HamiltLCAO(Gint_Gamma* GG_in, // register Potential by gathered operator pot_in->pot_register(pot_register_in); // effective potential term - Operator* veff = new Veff>(GG_in, - this->hsk, + Operator* veff = new Veff>(this->hsk, this->kv->kvec_d, pot_in, this->hR, // no explicit call yet @@ -256,8 +253,7 @@ HamiltLCAO::HamiltLCAO(Gint_Gamma* GG_in, // register Potential by gathered operator pot_in->pot_register(pot_register_in); // Veff term - this->getOperator() = new Veff>(GK_in, - this->hsk, + this->getOperator() = new Veff>(this->hsk, this->kv->kvec_d, pot_in, this->hR, diff --git a/source/source_lcao/hamilt_lcao.h b/source/source_lcao/hamilt_lcao.h index 7e888ded3d..acd838ca2f 100644 --- a/source/source_lcao/hamilt_lcao.h +++ b/source/source_lcao/hamilt_lcao.h @@ -8,8 +8,6 @@ #include "source_estate/module_pot/potential_new.h" #include "source_hamilt/hamilt.h" #include "source_lcao/hs_matrix_k.hpp" -#include "source_lcao/module_gint/gint_gamma.h" -#include "source_lcao/module_gint/gint_k.h" #include "source_lcao/module_hcontainer/hcontainer.h" #include @@ -39,9 +37,7 @@ class HamiltLCAO : public Hamilt * @brief Constructor of Hamiltonian for LCAO base * HR and SR will be allocated with Operators */ - HamiltLCAO(Gint_Gamma* GG_in, - Gint_k* GK_in, - const UnitCell& ucell, + HamiltLCAO(const UnitCell& ucell, const Grid_Driver& grid_d, const Parallel_Orbitals* paraV, elecstate::Potential* pot_in, diff --git a/source/source_lcao/module_gint/CMakeLists.txt b/source/source_lcao/module_gint/CMakeLists.txt index 0505957b9c..6969abd7e0 100644 --- a/source/source_lcao/module_gint/CMakeLists.txt +++ b/source/source_lcao/module_gint/CMakeLists.txt @@ -2,103 +2,46 @@ if(ENABLE_LCAO) list(APPEND objects - gint_old.cpp - gint_gamma_env.cpp - gint_gamma_vl.cpp - gint_fvl_old.cpp - gint_rho_old.cpp - gint_tau_old.cpp - gint_vl_old.cpp - gint_k_env.cpp - gint_k_sparse1.cpp - gint_k_pvpr.cpp - gint_k_pvdpr.cpp - gint_tools.cpp - grid_bigcell.cpp - grid_meshball.cpp - grid_meshcell.cpp - grid_meshk.cpp - grid_technique.cpp - gint_force_cpu_interface.cpp - gint_rho_cpu_interface.cpp - gint_vl_cpu_interface.cpp - cal_psir_ylm.cpp - cal_dpsir_ylm.cpp - cal_ddpsir_ylm.cpp - mult_psi_dmr.cpp - init_orb.cpp -) - -if(NOT DEFINED OLD_GINT) - list(APPEND objects - temp_gint/biggrid_info.cpp - temp_gint/big_grid.cpp - temp_gint/divide_info.cpp - temp_gint/gint_atom.cpp - temp_gint/gint_info.cpp - temp_gint/gint.cpp - temp_gint/gint_vl.cpp - temp_gint/gint_vl_metagga.cpp - temp_gint/gint_vl_nspin4.cpp - temp_gint/gint_vl_metagga_nspin4.cpp - temp_gint/gint_rho.cpp - temp_gint/gint_tau.cpp - temp_gint/gint_fvl.cpp - temp_gint/gint_fvl_meta.cpp - temp_gint/gint_env_gamma.cpp - temp_gint/gint_env_k.cpp - temp_gint/gint_dvlocal.cpp - temp_gint/localcell_info.cpp - temp_gint/phi_operator.cpp - temp_gint/set_ddphi.cpp - temp_gint/unitcell_info.cpp - temp_gint/gint_common.cpp - temp_gint/gint_interface.cpp - ) - if(USE_CUDA) - list(APPEND objects - temp_gint/kernel/gint_gpu_vars.cpp - temp_gint/kernel/phi_operator_gpu.cu - temp_gint/kernel/phi_operator_kernel.cu - temp_gint/kernel/set_const_mem.cu - temp_gint/batch_biggrid.cpp - temp_gint/gint_vl_gpu.cpp - temp_gint/gint_rho_gpu.cpp - temp_gint/gint_fvl_gpu.cpp - temp_gint/gint_vl_metagga_gpu.cpp - temp_gint/gint_vl_nspin4_gpu.cpp - temp_gint/gint_vl_metagga_nspin4_gpu.cpp - temp_gint/gint_tau_gpu.cpp - temp_gint/gint_fvl_meta_gpu.cpp - temp_gint/kernel/dgemm_vbatch.cu + biggrid_info.cpp + big_grid.cpp + divide_info.cpp + gint_atom.cpp + gint_info.cpp + gint.cpp + gint_vl.cpp + gint_vl_metagga.cpp + gint_vl_nspin4.cpp + gint_vl_metagga_nspin4.cpp + gint_rho.cpp + gint_tau.cpp + gint_fvl.cpp + gint_fvl_meta.cpp + gint_env_gamma.cpp + gint_env_k.cpp + gint_dvlocal.cpp + localcell_info.cpp + phi_operator.cpp + set_ddphi.cpp + unitcell_info.cpp + gint_common.cpp + gint_interface.cpp ) - endif() -endif() - if(USE_CUDA) list(APPEND objects - gint_gpu_interface.cpp - kernels/cuda/cuda_tools.cu - kernels/cuda/gint_vl.cu - kernels/cuda/gint_rho.cu - kernels/cuda/gint_force.cu - gint_vl_gpu.cu - gint_rho_gpu.cu - gint_force_gpu.cu - kernels/cuda/gemm_selector.cu - kernels/cuda/code_gen_00.cu - kernels/cuda/code_gen_01.cu - kernels/cuda/code_gen_02.cu - kernels/cuda/code_gen_03.cu - kernels/cuda/code_gen_04.cu - kernels/cuda/code_gen_05.cu - kernels/cuda/code_gen_06.cu - kernels/cuda/code_gen_07.cu - kernels/cuda/code_gen_08.cu - kernels/cuda/code_gen_09.cu - gtask_vl.cpp - gtask_rho.cpp - gtask_force.cpp + kernel/gint_gpu_vars.cpp + kernel/phi_operator_gpu.cu + kernel/phi_operator_kernel.cu + kernel/set_const_mem.cu + batch_biggrid.cpp + gint_vl_gpu.cpp + gint_rho_gpu.cpp + gint_fvl_gpu.cpp + gint_vl_metagga_gpu.cpp + gint_vl_nspin4_gpu.cpp + gint_vl_metagga_nspin4_gpu.cpp + gint_tau_gpu.cpp + gint_fvl_meta_gpu.cpp + kernel/dgemm_vbatch.cu ) endif() @@ -112,10 +55,4 @@ if(ENABLE_COVERAGE) add_coverage(gint) endif() -IF (BUILD_TESTING) - if(ENABLE_MPI) - add_subdirectory(test) - endif() -endif() - endif() \ No newline at end of file diff --git a/source/source_lcao/module_gint/temp_gint/batch_biggrid.cpp b/source/source_lcao/module_gint/batch_biggrid.cpp similarity index 100% rename from source/source_lcao/module_gint/temp_gint/batch_biggrid.cpp rename to source/source_lcao/module_gint/batch_biggrid.cpp diff --git a/source/source_lcao/module_gint/temp_gint/batch_biggrid.h b/source/source_lcao/module_gint/batch_biggrid.h similarity index 100% rename from source/source_lcao/module_gint/temp_gint/batch_biggrid.h rename to source/source_lcao/module_gint/batch_biggrid.h diff --git a/source/source_lcao/module_gint/temp_gint/big_grid.cpp b/source/source_lcao/module_gint/big_grid.cpp similarity index 100% rename from source/source_lcao/module_gint/temp_gint/big_grid.cpp rename to source/source_lcao/module_gint/big_grid.cpp diff --git a/source/source_lcao/module_gint/temp_gint/big_grid.h b/source/source_lcao/module_gint/big_grid.h similarity index 100% rename from source/source_lcao/module_gint/temp_gint/big_grid.h rename to source/source_lcao/module_gint/big_grid.h diff --git a/source/source_lcao/module_gint/temp_gint/biggrid_info.cpp b/source/source_lcao/module_gint/biggrid_info.cpp similarity index 100% rename from source/source_lcao/module_gint/temp_gint/biggrid_info.cpp rename to source/source_lcao/module_gint/biggrid_info.cpp diff --git a/source/source_lcao/module_gint/temp_gint/biggrid_info.h b/source/source_lcao/module_gint/biggrid_info.h similarity index 100% rename from source/source_lcao/module_gint/temp_gint/biggrid_info.h rename to source/source_lcao/module_gint/biggrid_info.h diff --git a/source/source_lcao/module_gint/cal_ddpsir_ylm.cpp b/source/source_lcao/module_gint/cal_ddpsir_ylm.cpp deleted file mode 100644 index 206c6f95e8..0000000000 --- a/source/source_lcao/module_gint/cal_ddpsir_ylm.cpp +++ /dev/null @@ -1,316 +0,0 @@ -#include "gint_tools.h" -#include "source_base/timer.h" -#include "source_base/ylm.h" -namespace Gint_Tools{ -void cal_ddpsir_ylm( - const Grid_Technique& gt, const int bxyz, - const int na_grid, // number of atoms on this grid - const int grid_index, // 1d index of FFT index (i,j,k) - const double delta_r, // delta_r of the uniform FFT grid - const int* const block_index, // block_index[na_grid+1], count total number of atomis orbitals - const int* const block_size, // block_size[na_grid], number of columns of a band - const bool* const* const cal_flag, // cal_flag[bxyz][na_grid], whether the atom-grid distance is larger than cutoff - double* const* const ddpsir_ylm_xx, double* const* const ddpsir_ylm_xy, double* const* const ddpsir_ylm_xz, - double* const* const ddpsir_ylm_yy, double* const* const ddpsir_ylm_yz, double* const* const ddpsir_ylm_zz) -{ - ModuleBase::timer::tick("Gint_Tools", "cal_ddpsir_ylm"); - const UnitCell& ucell = *gt.ucell; - std::vector it_psi_uniform(gt.nwmax); - std::vector it_dpsi_uniform(gt.nwmax); - std::vector it_d2psi_uniform(gt.nwmax); - std::vector it_psi_nr_uniform(gt.nwmax); - // array to store spherical harmonics and its derivatives - // the first dimension equals 36 because the maximum nwl is 5. - double rly[36]; - ModuleBase::Array_Pool grly(36, 3); - - for (int id = 0; id < na_grid; id++) - { - const int mcell_index = gt.bcell_start[grid_index] + id; - const int imcell = gt.which_bigcell[mcell_index]; - int iat = gt.which_atom[mcell_index]; - const int it = ucell.iat2it[iat]; - const int ia = ucell.iat2ia[iat]; - Atom* atom = &ucell.atoms[it]; - - const double mt[3] = {gt.meshball_positions[imcell][0] - gt.tau_in_bigcell[iat][0], - gt.meshball_positions[imcell][1] - gt.tau_in_bigcell[iat][1], - gt.meshball_positions[imcell][2] - gt.tau_in_bigcell[iat][2]}; - - for (int iw=0; iw< atom->nw; ++iw) - { - if ( atom->iw2_new[iw] ) - { - it_psi_uniform[iw]= gt.psi_u[it*gt.nwmax + iw].data(); - it_dpsi_uniform[iw] = gt.dpsi_u[it*gt.nwmax + iw].data(); - it_psi_nr_uniform[iw]= gt.psi_u[it*gt.nwmax + iw].size(); - } - } - - for (int ib = 0; ib < bxyz; ib++) - { - double* const p_ddpsi_xx = &ddpsir_ylm_xx[ib][block_index[id]]; - double* const p_ddpsi_xy = &ddpsir_ylm_xy[ib][block_index[id]]; - double* const p_ddpsi_xz = &ddpsir_ylm_xz[ib][block_index[id]]; - double* const p_ddpsi_yy = &ddpsir_ylm_yy[ib][block_index[id]]; - double* const p_ddpsi_yz = &ddpsir_ylm_yz[ib][block_index[id]]; - double* const p_ddpsi_zz = &ddpsir_ylm_zz[ib][block_index[id]]; - if (!cal_flag[ib][id]) - { - ModuleBase::GlobalFunc::ZEROS(p_ddpsi_xx, block_size[id]); - ModuleBase::GlobalFunc::ZEROS(p_ddpsi_xy, block_size[id]); - ModuleBase::GlobalFunc::ZEROS(p_ddpsi_xz, block_size[id]); - ModuleBase::GlobalFunc::ZEROS(p_ddpsi_yy, block_size[id]); - ModuleBase::GlobalFunc::ZEROS(p_ddpsi_yz, block_size[id]); - ModuleBase::GlobalFunc::ZEROS(p_ddpsi_zz, block_size[id]); - } - else - { - const double dr[3] - = {// vectors between atom and grid - gt.meshcell_pos[ib][0] + mt[0], gt.meshcell_pos[ib][1] + mt[1], gt.meshcell_pos[ib][2] + mt[2]}; - double distance = std::sqrt(dr[0] * dr[0] + dr[1] * dr[1] + dr[2] * dr[2]); - - // for some unknown reason, the finite difference between dpsi and ddpsi - // using analytical expression is always wrong; as a result, - // I switch to explicit finite difference method for evaluating - // the second derivatives of the orbitals - if (/*distance < 1e-9*/ true) - { - double*** dpsi = new double**[atom->nw]; - for (int i = 0; i < atom->nw; i++) - { - dpsi[i] = new double*[6]; - for (int j = 0; j < 6; j++) - { - dpsi[i][j] = new double[3]; - ModuleBase::GlobalFunc::ZEROS(dpsi[i][j], 3); - } - } - - double* dr1 = new double[3]; - - double** displ = new double*[6]; - for (int i = 0; i < 6; i++) - { - displ[i] = new double[3]; - ModuleBase::GlobalFunc::ZEROS(displ[i], 3); - } - displ[0][0] = 0.0001; // in x direction - displ[1][0] = -0.0001; - displ[2][1] = 0.0001; // in y direction - displ[3][1] = -0.0001; - displ[4][2] = 0.0001; // in z direction - displ[5][2] = -0.0001; - - for (int i = 0; i < 6; i++) - { - dr1[0] = dr[0] + displ[i][0]; - dr1[1] = dr[1] + displ[i][1]; - dr1[2] = dr[2] + displ[i][2]; - - ModuleBase::Ylm::grad_rl_sph_harm(ucell.atoms[it].nwl, dr1[0], dr1[1], dr1[2], rly, grly.get_ptr_2D()); - - double distance1 = std::sqrt(dr1[0] * dr1[0] + dr1[1] * dr1[1] + dr1[2] * dr1[2]); - if (distance1 < 1e-9) { - distance1 = 1e-9; -} - - const double position = distance1 / delta_r; - - const int ip = static_cast(position); - const double iq = static_cast(position); - const double x0 = position - iq; - const double x1 = 1.0 - x0; - const double x2 = 2.0 - x0; - const double x3 = 3.0 - x0; - const double x12 = x1 * x2 / 6; - const double x03 = x0 * x3 / 2; - - double tmp, dtmp; - - for (int iw = 0; iw < atom->nw; ++iw) - { - // this is a new 'l', we need 1D orbital wave - // function from interpolation method. - if (atom->iw2_new[iw]) - { - auto psi_uniform = it_psi_uniform[iw]; - auto dpsi_uniform = it_dpsi_uniform[iw]; - - // if ( iq[id] >= philn.nr_uniform-4) - if (iq >= it_psi_nr_uniform[iw]-4) - { - tmp = dtmp = 0.0; - } - else - { - // use Polynomia Interpolation method to get the - // wave functions - - tmp = x12 * (psi_uniform[ip] * x3 + psi_uniform[ip + 3] * x0) - + x03 * (psi_uniform[ip + 1] * x2 - psi_uniform[ip + 2] * x1); - - dtmp = x12 * (dpsi_uniform[ip] * x3 + dpsi_uniform[ip + 3] * x0) - + x03 * (dpsi_uniform[ip + 1] * x2 - dpsi_uniform[ip + 2] * x1); - } - } // new l is used. - - // get the 'l' of this localized wave function - const int ll = atom->iw2l[iw]; - const int idx_lm = atom->iw2_ylm[iw]; - - const double rl = pow_int(distance1, ll); - - // derivative of wave functions with respect to atom positions. - const double tmpdphi_rly = (dtmp - tmp * ll / distance1) / rl * rly[idx_lm] / distance1; - const double tmprl = tmp / rl; - - dpsi[iw][i][0] = tmpdphi_rly * dr1[0] + tmprl * grly[idx_lm][0]; - dpsi[iw][i][1] = tmpdphi_rly * dr1[1] + tmprl * grly[idx_lm][1]; - dpsi[iw][i][2] = tmpdphi_rly * dr1[2] + tmprl * grly[idx_lm][2]; - } // end iw - } // end i = 0-6 - - for (int iw = 0; iw < atom->nw; iw++) - { - p_ddpsi_xx[iw] = (dpsi[iw][0][0] - dpsi[iw][1][0]) / 0.0002; - p_ddpsi_xy[iw] - = ((dpsi[iw][2][0] - dpsi[iw][3][0]) + (dpsi[iw][0][1] - dpsi[iw][1][1])) / 0.0004; - p_ddpsi_xz[iw] - = ((dpsi[iw][4][0] - dpsi[iw][5][0]) + (dpsi[iw][0][2] - dpsi[iw][1][2])) / 0.0004; - p_ddpsi_yy[iw] = (dpsi[iw][2][1] - dpsi[iw][3][1]) / 0.0002; - p_ddpsi_yz[iw] - = ((dpsi[iw][4][1] - dpsi[iw][5][1]) + (dpsi[iw][2][2] - dpsi[iw][3][2])) / 0.0004; - p_ddpsi_zz[iw] = (dpsi[iw][4][2] - dpsi[iw][5][2]) / 0.0002; - } - - for (int i = 0; i < atom->nw; i++) - { - for (int j = 0; j < 6; j++) - { - delete[] dpsi[i][j]; - } - delete[] dpsi[i]; - } - delete[] dpsi; - - delete[] dr1; - for (int i = 0; i < 6; i++) - { - delete[] displ[i]; - } - delete[] displ; - } - else - // the analytical method for evaluating 2nd derivatives - // it is not used currently - { - // Add it here, but do not run it. If there is a need to run this code - // in the future, include it in the previous initialization process. - for (int iw=0; iw< atom->nw; ++iw) - { - if ( atom->iw2_new[iw] ) - { - it_d2psi_uniform[iw] = gt.d2psi_u[it*gt.nwmax + iw].data(); - } - } - // End of code addition section. - - std::vector> hrly; - ModuleBase::Ylm::grad_rl_sph_harm(ucell.atoms[it].nwl, dr[0], dr[1], dr[2], rly, grly.get_ptr_2D()); - ModuleBase::Ylm::hes_rl_sph_harm(ucell.atoms[it].nwl, dr[0], dr[1], dr[2], hrly); - const double position = distance / delta_r; - - const double iq = static_cast(position); - const int ip = static_cast(position); - const double x0 = position - iq; - const double x1 = 1.0 - x0; - const double x2 = 2.0 - x0; - const double x3 = 3.0 - x0; - const double x12 = x1 * x2 / 6; - const double x03 = x0 * x3 / 2; - - double tmp, dtmp, ddtmp; - - for (int iw = 0; iw < atom->nw; ++iw) - { - // this is a new 'l', we need 1D orbital wave - // function from interpolation method. - if (atom->iw2_new[iw]) - { - auto psi_uniform = it_psi_uniform[iw]; - auto dpsi_uniform = it_dpsi_uniform[iw]; - auto ddpsi_uniform = it_d2psi_uniform[iw]; - - // if ( iq[id] >= philn.nr_uniform-4) - if (iq >= it_psi_nr_uniform[iw]-4) - { - tmp = dtmp = ddtmp = 0.0; - } - else - { - // use Polynomia Interpolation method to get the - // wave functions - - tmp = x12 * (psi_uniform[ip] * x3 + psi_uniform[ip + 3] * x0) - + x03 * (psi_uniform[ip + 1] * x2 - psi_uniform[ip + 2] * x1); - - dtmp = x12 * (dpsi_uniform[ip] * x3 + dpsi_uniform[ip + 3] * x0) - + x03 * (dpsi_uniform[ip + 1] * x2 - dpsi_uniform[ip + 2] * x1); - - ddtmp = x12 * (ddpsi_uniform[ip] * x3 + ddpsi_uniform[ip + 3] * x0) - + x03 * (ddpsi_uniform[ip + 1] * x2 - ddpsi_uniform[ip + 2] * x1); - } - } // new l is used. - - // get the 'l' of this localized wave function - const int ll = atom->iw2l[iw]; - const int idx_lm = atom->iw2_ylm[iw]; - - const double rl = pow_int(distance, ll); - const double r_lp2 =rl * distance * distance; - - // d/dr (R_l / r^l) - const double tmpdphi = (dtmp - tmp * ll / distance) / rl; - const double term1 = ddtmp / r_lp2; - const double term2 = (2 * ll + 1) * dtmp / r_lp2 / distance; - const double term3 = ll * (ll + 2) * tmp / r_lp2 / distance / distance; - const double term4 = tmpdphi / distance; - const double term5 = term1 - term2 + term3; - - // hessian of (R_l / r^l) - const double term_xx = term4 + dr[0] * dr[0] * term5; - const double term_xy = dr[0] * dr[1] * term5; - const double term_xz = dr[0] * dr[2] * term5; - const double term_yy = term4 + dr[1] * dr[1] * term5; - const double term_yz = dr[1] * dr[2] * term5; - const double term_zz = term4 + dr[2] * dr[2] * term5; - - // d/dr (R_l / r^l) * alpha / r - const double term_1x = dr[0] * term4; - const double term_1y = dr[1] * term4; - const double term_1z = dr[2] * term4; - - p_ddpsi_xx[iw] - = term_xx * rly[idx_lm] + 2.0 * term_1x * grly[idx_lm][0] + tmp / rl * hrly[idx_lm][0]; - p_ddpsi_xy[iw] = term_xy * rly[idx_lm] + term_1x * grly[idx_lm][1] + term_1y * grly[idx_lm][0] - + tmp / rl * hrly[idx_lm][1]; - p_ddpsi_xz[iw] = term_xz * rly[idx_lm] + term_1x * grly[idx_lm][2] + term_1z * grly[idx_lm][0] - + tmp / rl * hrly[idx_lm][2]; - p_ddpsi_yy[iw] - = term_yy * rly[idx_lm] + 2.0 * term_1y * grly[idx_lm][1] + tmp / rl * hrly[idx_lm][3]; - p_ddpsi_yz[iw] = term_yz * rly[idx_lm] + term_1y * grly[idx_lm][2] + term_1z * grly[idx_lm][1] - + tmp / rl * hrly[idx_lm][4]; - p_ddpsi_zz[iw] - = term_zz * rly[idx_lm] + 2.0 * term_1z * grly[idx_lm][2] + tmp / rl * hrly[idx_lm][5]; - - } // iw - } // end if - } // else - } // end ib - } // end id(atom) - ModuleBase::timer::tick("Gint_Tools", "cal_ddpsir_ylm"); - return; -} -} \ No newline at end of file diff --git a/source/source_lcao/module_gint/cal_dpsir_ylm.cpp b/source/source_lcao/module_gint/cal_dpsir_ylm.cpp deleted file mode 100644 index 8b32b2fc05..0000000000 --- a/source/source_lcao/module_gint/cal_dpsir_ylm.cpp +++ /dev/null @@ -1,138 +0,0 @@ -#include "gint_tools.h" -#include "source_base/timer.h" -#include "source_base/ylm.h" -#include "source_base/array_pool.h" -namespace Gint_Tools{ -void cal_dpsir_ylm( - const Grid_Technique& gt, const int bxyz, - const int na_grid, // number of atoms on this grid - const int grid_index, // 1d index of FFT index (i,j,k) - const double delta_r, // delta_r of the uniform FFT grid - const int* const block_index, // block_index[na_grid+1], count total number of atomis orbitals - const int* const block_size, // block_size[na_grid], number of columns of a band - const bool* const* const cal_flag, // cal_flag[bxyz][na_grid], whether the atom-grid distance is larger than cutoff - double* const* const psir_ylm, double* const* const dpsir_ylm_x, double* const* const dpsir_ylm_y, - double* const* const dpsir_ylm_z) -{ - ModuleBase::timer::tick("Gint_Tools", "cal_dpsir_ylm"); - const UnitCell& ucell = *gt.ucell; - std::vector it_psi_uniform(gt.nwmax); - std::vector it_dpsi_uniform(gt.nwmax); - std::vector it_psi_nr_uniform(gt.nwmax); - // array to store spherical harmonics and its derivatives - // the first dimension equals 36 because the maximum nwl is 5. - double rly[36]; - ModuleBase::Array_Pool grly(36, 3); - - for (int id = 0; id < na_grid; id++) - { - const int mcell_index = gt.bcell_start[grid_index] + id; - const int imcell = gt.which_bigcell[mcell_index]; - int iat = gt.which_atom[mcell_index]; - const int it = ucell.iat2it[iat]; - const int ia = ucell.iat2ia[iat]; - Atom* atom = &ucell.atoms[it]; - - const double mt[3] = {gt.meshball_positions[imcell][0] - gt.tau_in_bigcell[iat][0], - gt.meshball_positions[imcell][1] - gt.tau_in_bigcell[iat][1], - gt.meshball_positions[imcell][2] - gt.tau_in_bigcell[iat][2]}; - // preprocess index - for (int iw=0; iw< atom->nw; ++iw) - { - if ( atom->iw2_new[iw] ) - { - it_psi_uniform[iw]= gt.psi_u[it*gt.nwmax + iw].data(); - it_dpsi_uniform[iw] = gt.dpsi_u[it*gt.nwmax + iw].data(); - it_psi_nr_uniform[iw]= gt.psi_u[it*gt.nwmax + iw].size(); - } - } - - for (int ib = 0; ib < bxyz; ib++) - { - double* const p_psi = &psir_ylm[ib][block_index[id]]; - double* const p_dpsi_x = &dpsir_ylm_x[ib][block_index[id]]; - double* const p_dpsi_y = &dpsir_ylm_y[ib][block_index[id]]; - double* const p_dpsi_z = &dpsir_ylm_z[ib][block_index[id]]; - if (!cal_flag[ib][id]) - { - ModuleBase::GlobalFunc::ZEROS(p_psi, block_size[id]); - ModuleBase::GlobalFunc::ZEROS(p_dpsi_x, block_size[id]); - ModuleBase::GlobalFunc::ZEROS(p_dpsi_y, block_size[id]); - ModuleBase::GlobalFunc::ZEROS(p_dpsi_z, block_size[id]); - } - else - { - const double dr[3] - = {// vectors between atom and grid - gt.meshcell_pos[ib][0] + mt[0], gt.meshcell_pos[ib][1] + mt[1], gt.meshcell_pos[ib][2] + mt[2]}; - double distance = std::sqrt(dr[0] * dr[0] + dr[1] * dr[1] + dr[2] * dr[2]); - - ModuleBase::Ylm::grad_rl_sph_harm(ucell.atoms[it].nwl, dr[0], dr[1], dr[2], rly, grly.get_ptr_2D()); - if (distance < 1e-9) { - distance = 1e-9; -} - - const double position = distance / delta_r; - - const double iq = static_cast(position); - const int ip = static_cast(position); - const double x0 = position - iq; - const double x1 = 1.0 - x0; - const double x2 = 2.0 - x0; - const double x3 = 3.0 - x0; - const double x12 = x1 * x2 / 6; - const double x03 = x0 * x3 / 2; - - double tmp, dtmp; - - for (int iw = 0; iw < atom->nw; ++iw) - { - - // this is a new 'l', we need 1D orbital wave - // function from interpolation method. - if (atom->iw2_new[iw]) - { - auto psi_uniform = it_psi_uniform[iw]; - auto dpsi_uniform = it_dpsi_uniform[iw]; - // if ( iq[id] >= philn.nr_uniform-4) - if (iq >= it_psi_nr_uniform[iw] - 4) - { - tmp = dtmp = 0.0; - } - else - { - // use Polynomia Interpolation method to get the - // wave functions - - tmp = x12 * (psi_uniform[ip] * x3 + psi_uniform[ip + 3] * x0) - + x03 * (psi_uniform[ip + 1] * x2 - psi_uniform[ip + 2] * x1); - - dtmp = x12 * (dpsi_uniform[ip] * x3 + dpsi_uniform[ip + 3] * x0) - + x03 * (dpsi_uniform[ip + 1] * x2 - dpsi_uniform[ip + 2] * x1); - } - } // new l is used. - - // get the 'l' of this localized wave function - const int ll = atom->iw2l[iw]; - const int idx_lm = atom->iw2_ylm[iw]; - - const double rl = pow_int(distance, ll); - const double tmprl = tmp / rl; - - // 3D wave functions - p_psi[iw] = tmprl * rly[idx_lm]; - - // derivative of wave functions with respect to atom positions. - const double tmpdphi_rly = (dtmp - tmp * ll / distance) / rl * rly[idx_lm] / distance; - - p_dpsi_x[iw] = tmpdphi_rly * dr[0] + tmprl * grly[idx_lm][0]; - p_dpsi_y[iw] = tmpdphi_rly * dr[1] + tmprl * grly[idx_lm][1]; - p_dpsi_z[iw] = tmpdphi_rly * dr[2] + tmprl * grly[idx_lm][2]; - } // iw - } // else - } - } - ModuleBase::timer::tick("Gint_Tools", "cal_dpsir_ylm"); - return; -} -} \ No newline at end of file diff --git a/source/source_lcao/module_gint/cal_psir_ylm.cpp b/source/source_lcao/module_gint/cal_psir_ylm.cpp deleted file mode 100644 index 4eeedd19a5..0000000000 --- a/source/source_lcao/module_gint/cal_psir_ylm.cpp +++ /dev/null @@ -1,113 +0,0 @@ -#include "gint_tools.h" -#include "source_base/timer.h" -#include "source_base/ylm.h" -namespace Gint_Tools{ -void cal_psir_ylm( - const Grid_Technique& gt, - const int bxyz, - const int na_grid, // number of atoms on this grid - const int grid_index, // 1d index of FFT index (i,j,k) - const double delta_r, // delta_r of the uniform FFT grid - const int* const block_index, // block_index[na_grid+1], count total number of atomis orbitals - const int* const block_size, // block_size[na_grid], number of columns of a band - const bool* const* const cal_flag, - double* const* const psir_ylm) // cal_flag[bxyz][na_grid], whether the atom-grid distance is larger than cutoff -{ -// ModuleBase::timer::tick("Gint_Tools", "cal_psir_ylm"); - std::vector ylma; - const UnitCell& ucell = *gt.ucell; - std::vector it_psi_uniform(gt.nwmax); - std::vector it_dpsi_uniform(gt.nwmax); - - for (int id = 0; id < na_grid; id++) - { - // there are two parameters we want to know here: - // in which bigcell of the meshball the atom is in? - // what's the cartesian coordinate of the bigcell? - const int mcell_index = gt.bcell_start[grid_index] + id; - - const int iat = gt.which_atom[mcell_index]; // index of atom - const int it = ucell.iat2it[iat]; // index of atom type - const Atom* const atom = &ucell.atoms[it]; - std::vector it_psi_uniform(atom->nw); - std::vector it_dpsi_uniform(atom->nw); - // preprocess index - for (int iw = 0; iw < atom->nw; ++iw) - { - if (atom->iw2_new[iw]) - { - it_psi_uniform[iw]= gt.psi_u[it*gt.nwmax + iw].data(); - it_dpsi_uniform[iw] = gt.dpsi_u[it*gt.nwmax + iw].data(); - } - } - - // meshball_positions should be the bigcell position in meshball - // to the center of meshball. - // calculated in cartesian coordinates - // the std::vector from the grid which is now being operated to the atom position. - // in meshball language, is the std::vector from imcell to the center cel, plus - // tau_in_bigcell. - const int imcell = gt.which_bigcell[mcell_index]; - const double mt[3] = {gt.meshball_positions[imcell][0] - gt.tau_in_bigcell[iat][0], - gt.meshball_positions[imcell][1] - gt.tau_in_bigcell[iat][1], - gt.meshball_positions[imcell][2] - gt.tau_in_bigcell[iat][2]}; - - // number of grids in each big cell (bxyz) - for (int ib = 0; ib < bxyz; ib++) - { - double* p = &psir_ylm[ib][block_index[id]]; - if (!cal_flag[ib][id]) - { - ModuleBase::GlobalFunc::ZEROS(p, block_size[id]); - } - else - { - // meshcell_pos: z is the fastest - const double dr[3] - = {gt.meshcell_pos[ib][0] + mt[0], gt.meshcell_pos[ib][1] + mt[1], gt.meshcell_pos[ib][2] + mt[2]}; - double distance - = std::sqrt(dr[0] * dr[0] + dr[1] * dr[1] + dr[2] * dr[2]); // distance between atom and grid - // if(distance[id] > gt.orbital_rmax) continue; - if (distance < 1.0E-9) - distance += 1.0E-9; - - //------------------------------------------------------ - // spherical harmonic functions Ylm - //------------------------------------------------------ - // Ylm::get_ylm_real(this->nnn[it], this->dr[id], ylma); - ModuleBase::Ylm::sph_harm(ucell.atoms[it].nwl, dr[0] / distance, dr[1] / distance, dr[2] / distance, - ylma); - // these parameters are related to interpolation - // because once the distance from atom to grid point is known, - // we can obtain the parameters for interpolation and - // store them first! these operations can save lots of efforts. - const double position = distance / delta_r; - const int ip = static_cast(position); - const double dx = position - ip; - const double dx2 = dx * dx; - const double dx3 = dx2 * dx; - - const double c3 = 3.0 * dx2 - 2.0 * dx3; - const double c1 = 1.0 - c3; - const double c2 = (dx - 2.0 * dx2 + dx3) * delta_r; - const double c4 = (dx3 - dx2) * delta_r; - - double phi = 0; - for (int iw = 0; iw < atom->nw; ++iw) - { - if (atom->iw2_new[iw]) - { - auto psi_uniform = it_psi_uniform[iw]; - auto dpsi_uniform = it_dpsi_uniform[iw]; - phi = c1 * psi_uniform[ip] + c2 * dpsi_uniform[ip] // radial wave functions - + c3 * psi_uniform[ip + 1] + c4 * dpsi_uniform[ip + 1]; - } - p[iw] = phi * ylma[atom->iw2_ylm[iw]]; - } // end iw - } // end distance<=(rcuts[it]-1.0e-15) - } // end ib - } // end id -// ModuleBase::timer::tick("Gint_Tools", "cal_psir_ylm"); - return; -} -} diff --git a/source/source_lcao/module_gint/temp_gint/divide_info.cpp b/source/source_lcao/module_gint/divide_info.cpp similarity index 100% rename from source/source_lcao/module_gint/temp_gint/divide_info.cpp rename to source/source_lcao/module_gint/divide_info.cpp diff --git a/source/source_lcao/module_gint/temp_gint/divide_info.h b/source/source_lcao/module_gint/divide_info.h similarity index 100% rename from source/source_lcao/module_gint/temp_gint/divide_info.h rename to source/source_lcao/module_gint/divide_info.h diff --git a/source/source_lcao/module_gint/temp_gint/gint.cpp b/source/source_lcao/module_gint/gint.cpp similarity index 100% rename from source/source_lcao/module_gint/temp_gint/gint.cpp rename to source/source_lcao/module_gint/gint.cpp diff --git a/source/source_lcao/module_gint/gint.h b/source/source_lcao/module_gint/gint.h index 3c447b7e4f..1255bae971 100644 --- a/source/source_lcao/module_gint/gint.h +++ b/source/source_lcao/module_gint/gint.h @@ -1,275 +1,26 @@ -#ifndef GINT_INTERFACE -#define GINT_INTERFACE - -#include "gint_tools.h" -#include "source_cell/module_neighbor/sltk_grid_driver.h" -#include "source_lcao/module_gint/grid_technique.h" -#include "source_lcao/module_hcontainer/hcontainer.h" -#include - -//---------------------------------------------------------- -//!This class provides a unified interface to the -//!grid intergration operation used to calculate -//!electron density, and the contribution of local -//!potential to Hamiltonian and force/stress. -//!There are two derived classes of this class -//! namely Gint_Gamma and Gint_k, which contain -//! specific operations for gamma point/multi-k calculations -//---------------------------------------------------------- - -class Gint { - public: - ~Gint(); - - //! move operator for the next ESolver to directly use its infomation - Gint& operator=(Gint&& rhs); - - hamilt::HContainer* get_hRGint() const { return hRGint; } - - std::vector*> get_DMRGint() const { return dmr_gint; } - - int get_ncxyz() const { return ncxyz; } - - //! the unified interface to grid integration - void cal_gint(Gint_inout* inout); - - //! preparing FFT grid - void prep_grid(const Grid_Technique& gt, - const int& nbx_in, - const int& nby_in, - const int& nbz_in, - const int& nbz_start_in, - const int& ncxyz_in, - const int& bx_in, - const int& by_in, - const int& bz_in, - const int& bxyz_in, - const int& nbxx_in, - const int& ny_in, - const int& nplane_in, - const int& startz_current_in, - const UnitCell* ucell_in, - const LCAO_Orbitals* orb_in); - - /** - * @brief calculate the neighbor atoms of each atom in this processor - * size of BaseMatrix with be the non-parallel version - */ - void initialize_pvpR(const UnitCell& unitcell, const Grid_Driver* gd, const int& nspin); - - /** - * @brief resize dmr_gint to nspin and reallocate the memory - */ - void reset_DMRGint(const int& nspin); - - /** - * @brief transfer DMR (2D para) to DMR (Grid para) in elecstate_lcao.cpp - */ - void transfer_DM2DtoGrid(std::vector*> dm2d); - - const Grid_Technique* gridt = nullptr; - const UnitCell* ucell; - - // psir_ylm_new = psir_func(psir_ylm) - // psir_func==nullptr means psir_ylm_new=psir_ylm - using T_psir_func = std::function< - const ModuleBase::Array_Pool&( - const ModuleBase::Array_Pool &psir_ylm, - const Grid_Technique >, - const int grid_index, - const int is, - const std::vector &block_iw, - const std::vector &block_size, - const std::vector &block_index, - const ModuleBase::Array_Pool &cal_flag)>; - - T_psir_func psir_func_1 = nullptr; - T_psir_func psir_func_2 = nullptr; - - protected: - - //! variables related to FFT grid - int nbx; - int nby; - int nbz; - int ncxyz; - int nbz_start; - int bx; - int by; - int bz; - int bxyz; - int nbxx; - int ny; - int nplane; - int startz_current; // from rhopw - - //! in cal_gint_gpu.cpp - void gpu_vlocal_interface(Gint_inout* inout); - - void gpu_rho_interface(Gint_inout* inout); - - void gpu_force_interface(Gint_inout* inout); - - //! in cal_gint_cpu.cpp - void gint_kernel_vlocal(Gint_inout* inout); - - //! calculate H_mu_nu(local)= - void gint_kernel_dvlocal(Gint_inout* inout); - - //! calculate vlocal in meta-GGA functionals - void gint_kernel_vlocal_meta(Gint_inout* inout); - - //! calculate charge density rho(r)=\int D_munu \phi_mu \phi_nu - void gint_kernel_rho(Gint_inout* inout); - - //! used in meta-GGA functional - void gint_kernel_tau(Gint_inout* inout); - - //! compute forces - void gint_kernel_force(Gint_inout* inout); - - //! compute forces related to meta-GGA functionals - void gint_kernel_force_meta(Gint_inout* inout); - - //! calculate local potential contribution to the Hamiltonian - //! na_grid: how many atoms on this (i,j,k) grid - //! block_size: dim is [block_size], number of columns of a band - //! block_index: dim is [na_grid+1], total number of atomic orbitals - //! grid_index: index of grid group, for tracing iat - //! cal_flag: dim is [bxyz][na_grid], whether the atom-grid distance is larger than cutoff - //! psir_ylm: dim is [bxyz][LD_pool] - //! psir_vlbr3: dim is [bxyz][LD_pool] - //! hR: HContainer for storing the matrix elements - //! cal_meshball_vlocal is thread-safe! - void cal_meshball_vlocal( - const int na_grid, - const int LD_pool, - const int* const block_size, - const int* const block_index, - const int grid_index, - const bool* const* const cal_flag, - const double* const* const psir_ylm, - const double* const* const psir_vlbr3, - hamilt::HContainer* hR); - - //! in gint_fvl.cpp - //! calculate vl contributuion to force & stress via grid integrals - void gint_kernel_force(const int na_grid, - const int grid_index, - const double delta_r, - double* vldr3, - const int is, - const bool isforce, - const bool isstress, - ModuleBase::matrix* fvl_dphi, - ModuleBase::matrix* svl_dphi, - const UnitCell& ucell); - - //! in gint_fvl.cpp - //! calculate vl contributuion to force & stress via grid integrals - //! used in meta-GGA calculations - void gint_kernel_force_meta(const int na_grid, - const int grid_index, - const double delta_r, - double* vldr3, - double* vkdr3, - const int is, - const bool isforce, - const bool isstress, - ModuleBase::matrix* fvl_dphi, - ModuleBase::matrix* svl_dphi, - const UnitCell& ucell); - - //! Use grid integrals to compute the atomic force contributions - //! na_grid: how many atoms on this (i,j,k) grid - //! block_size: dim is [na_grid], number of columns of a band - //! block_index: dim is [na_grid+1], total number of atomis orbitals - //! psir_vlbr3_DMR: dim is [bxyz][LD_pool] - //! dpsir_x: dim is [bxyz][LD_pool] - //! dpsir_y: dim is [bxyz][LD_pool] - //! dpsir_z: dim is [bxyz][LD_pool] - void cal_meshball_force( - const int grid_index, - const int na_grid, - const int* const block_size, - const int* const block_index, - const double* const* const psir_vlbr3_DMR, - const double* const* const dpsir_x, // psir_vlbr3[bxyz][LD_pool] - const double* const* const dpsir_y, // psir_vlbr3[bxyz][LD_pool] - const double* const* const dpsir_z, // psir_vlbr3[bxyz][LD_pool] - ModuleBase::matrix* force); - - //! Use grid integrals to compute the stress contributions - //! na_grid: how many atoms on this (i,j,k) grid - //! block_index: dim is [na_grid+1], total number of atomis orbitals - void cal_meshball_stress( - const int na_grid, - const int*const block_index, - const double*const psir_vlbr3_DMR, - const double*const dpsirr, - ModuleBase::matrix *stress); - - //! Use grid integrals to compute charge density - //! in gint_k_rho.cpp - //! calculate the charge density & kinetic energy density (tau) via grid integrals - void gint_kernel_rho(const int na_grid, - const int grid_index, - const double delta_r, - int* vindex, - const int LD_pool, - const UnitCell& ucell, - Gint_inout* inout); - - //! Use grid integrals to compute charge density in a meshball - void cal_meshball_rho(const int na_grid, - const int*const block_index, - const int*const vindex, - const double*const*const psir_ylm, - const double*const*const psir_DMR, - double*const rho); - - //! Use grid integrals to compute kinetic energy density tau - //!in meta-GGA functional - void gint_kernel_tau(const int na_grid, - const int grid_index, - const double delta_r, - int* vindex, - const int LD_pool, - Gint_inout* inout, - const UnitCell& ucell); - - //! Use grid integrals to compute kinetic energy density tau - //!in a meshball, used in meta-GGA functional calculations - void cal_meshball_tau(const int na_grid, - int* block_index, - int* vindex, - double** dpsix, - double** dpsiy, - double** dpsiz, - double** dpsix_dm, - double** dpsiy_dm, - double** dpsiz_dm, - double* rho); - - //! save the < phi_0i | V | phi_Rj > in sparse H matrix. - //! stores Hamiltonian in sparse format - hamilt::HContainer* hRGint = nullptr; - - //! size of vec is 4, only used when nspin = 4 - std::vector*> hr_gint_tmp; - - //! stores Hamiltonian in sparse format - hamilt::HContainer>* hRGintCd = nullptr; - - //! stores DMR in sparse format - std::vector*> dmr_gint; - - //! tmp tools used in transfer_DM2DtoGrid - hamilt::HContainer* dm2d_tmp = nullptr; - - std::vector> pvdpRx_reduced; - std::vector> pvdpRy_reduced; - std::vector> pvdpRz_reduced; +#pragma once +#include +#include "gint_info.h" +#include "gint_type.h" + +namespace ModuleGint +{ + +class Gint +{ + public: + Gint() = default; + virtual ~Gint() = default; + + // note that gint_info_ is a static member variable + // it is shared by all instances of Gint + static void set_gint_info(GintInfo* gint_info) + { + gint_info_ = gint_info; + } + + protected: + static GintInfo* gint_info_; }; -#endif +} \ No newline at end of file diff --git a/source/source_lcao/module_gint/temp_gint/gint_atom.cpp b/source/source_lcao/module_gint/gint_atom.cpp similarity index 100% rename from source/source_lcao/module_gint/temp_gint/gint_atom.cpp rename to source/source_lcao/module_gint/gint_atom.cpp diff --git a/source/source_lcao/module_gint/temp_gint/gint_atom.h b/source/source_lcao/module_gint/gint_atom.h similarity index 100% rename from source/source_lcao/module_gint/temp_gint/gint_atom.h rename to source/source_lcao/module_gint/gint_atom.h diff --git a/source/source_lcao/module_gint/temp_gint/gint_common.cpp b/source/source_lcao/module_gint/gint_common.cpp similarity index 100% rename from source/source_lcao/module_gint/temp_gint/gint_common.cpp rename to source/source_lcao/module_gint/gint_common.cpp diff --git a/source/source_lcao/module_gint/temp_gint/gint_common.h b/source/source_lcao/module_gint/gint_common.h similarity index 94% rename from source/source_lcao/module_gint/temp_gint/gint_common.h rename to source/source_lcao/module_gint/gint_common.h index 0e04a7cffc..180bc9e8ea 100644 --- a/source/source_lcao/module_gint/temp_gint/gint_common.h +++ b/source/source_lcao/module_gint/gint_common.h @@ -1,6 +1,6 @@ #pragma once #include "source_lcao/module_hcontainer/hcontainer.h" -#include "source_lcao/module_gint/temp_gint/gint_info.h" +#include "source_lcao/module_gint/gint_info.h" namespace ModuleGint { diff --git a/source/source_lcao/module_gint/temp_gint/gint_dvlocal.cpp b/source/source_lcao/module_gint/gint_dvlocal.cpp similarity index 100% rename from source/source_lcao/module_gint/temp_gint/gint_dvlocal.cpp rename to source/source_lcao/module_gint/gint_dvlocal.cpp diff --git a/source/source_lcao/module_gint/temp_gint/gint_dvlocal.h b/source/source_lcao/module_gint/gint_dvlocal.h similarity index 100% rename from source/source_lcao/module_gint/temp_gint/gint_dvlocal.h rename to source/source_lcao/module_gint/gint_dvlocal.h diff --git a/source/source_lcao/module_gint/temp_gint/gint_env_gamma.cpp b/source/source_lcao/module_gint/gint_env_gamma.cpp similarity index 100% rename from source/source_lcao/module_gint/temp_gint/gint_env_gamma.cpp rename to source/source_lcao/module_gint/gint_env_gamma.cpp diff --git a/source/source_lcao/module_gint/temp_gint/gint_env_gamma.h b/source/source_lcao/module_gint/gint_env_gamma.h similarity index 100% rename from source/source_lcao/module_gint/temp_gint/gint_env_gamma.h rename to source/source_lcao/module_gint/gint_env_gamma.h diff --git a/source/source_lcao/module_gint/temp_gint/gint_env_k.cpp b/source/source_lcao/module_gint/gint_env_k.cpp similarity index 100% rename from source/source_lcao/module_gint/temp_gint/gint_env_k.cpp rename to source/source_lcao/module_gint/gint_env_k.cpp diff --git a/source/source_lcao/module_gint/temp_gint/gint_env_k.h b/source/source_lcao/module_gint/gint_env_k.h similarity index 100% rename from source/source_lcao/module_gint/temp_gint/gint_env_k.h rename to source/source_lcao/module_gint/gint_env_k.h diff --git a/source/source_lcao/module_gint/gint_force_cpu_interface.cpp b/source/source_lcao/module_gint/gint_force_cpu_interface.cpp deleted file mode 100644 index f4f346783d..0000000000 --- a/source/source_lcao/module_gint/gint_force_cpu_interface.cpp +++ /dev/null @@ -1,313 +0,0 @@ -#include "gint.h" -#include "source_base/memory.h" -#include "source_base/timer.h" - -void Gint::gint_kernel_force(Gint_inout* inout) { - ModuleBase::TITLE("Gint_interface", "cal_gint_force"); - ModuleBase::timer::tick("Gint_interface", "cal_gint_force"); - const UnitCell& ucell = *this->ucell; - const int max_size = this->gridt->max_atom; - const int ncyz = this->ny * this->nplane; - const double dv = ucell.omega / this->ncxyz; - const double delta_r = this->gridt->dr_uniform; - - -#pragma omp parallel -{ - ModuleBase::matrix* fvl_dphi_thread=inout->fvl_dphi; - ModuleBase::matrix* svl_dphi_thread=inout->svl_dphi; - if (inout->isforce) { - fvl_dphi_thread=new ModuleBase::matrix(*inout->fvl_dphi); - fvl_dphi_thread->zero_out(); - } - if (inout->isstress) { - svl_dphi_thread=new ModuleBase::matrix(*inout->svl_dphi); - svl_dphi_thread->zero_out(); - } - std::vector block_iw(max_size,0); - std::vector block_index(max_size+1,0); - std::vector block_size(max_size,0); - std::vector vldr3(this->bxyz,0.0); -#pragma omp for schedule(dynamic) - for (int grid_index = 0; grid_index < this->nbxx; grid_index++) { - const int na_grid = this->gridt->how_many_atoms[grid_index]; - if (na_grid == 0) { - continue; - } - Gint_Tools::get_gint_vldr3(vldr3.data(), - inout->vl, - this->bxyz, - this->bx, - this->by, - this->bz, - this->nplane, - this->gridt->start_ind[grid_index], - ncyz, - dv); - //prepare block information - ModuleBase::Array_Pool cal_flag(this->bxyz,max_size); - Gint_Tools::get_block_info(*this->gridt, this->bxyz, na_grid, grid_index, - block_iw.data(), block_index.data(), block_size.data(), - cal_flag.get_ptr_2D()); - const int LD_pool = block_index[na_grid]; - - //evaluate psi and dpsi on grids - ModuleBase::Array_Pool psir_ylm(this->bxyz, LD_pool); - ModuleBase::Array_Pool dpsir_ylm_x(this->bxyz, LD_pool); - ModuleBase::Array_Pool dpsir_ylm_y(this->bxyz, LD_pool); - ModuleBase::Array_Pool dpsir_ylm_z(this->bxyz, LD_pool); - - Gint_Tools::cal_dpsir_ylm(*this->gridt, this->bxyz, na_grid, grid_index, delta_r, - block_index.data(), block_size.data(), - cal_flag.get_ptr_2D(),psir_ylm.get_ptr_2D(), - dpsir_ylm_x.get_ptr_2D(), dpsir_ylm_y.get_ptr_2D(), dpsir_ylm_z.get_ptr_2D()); - - //calculating f_mu(r) = v(r)*psi_mu(r)*dv - const ModuleBase::Array_Pool psir_vlbr3 = - Gint_Tools::get_psir_vlbr3(this->bxyz, na_grid, LD_pool, block_index.data(), - cal_flag.get_ptr_2D(), vldr3.data(), psir_ylm.get_ptr_2D()); - - ModuleBase::Array_Pool psir_vlbr3_DM(this->bxyz, LD_pool); - ModuleBase::GlobalFunc::ZEROS(psir_vlbr3_DM.get_ptr_1D(), this->bxyz*LD_pool); - - //calculating g_mu(r) = sum_nu rho_mu,nu f_nu(r) - Gint_Tools::mult_psi_DMR( - *this->gridt, - this->bxyz, - LD_pool, - grid_index, - na_grid, - block_index.data(), - block_size.data(), - cal_flag.get_ptr_2D(), - psir_vlbr3.get_ptr_2D(), - psir_vlbr3_DM.get_ptr_2D(), - this->dmr_gint[inout->ispin], - false); - - if(inout->isforce) - { - //do integration to get force - this-> cal_meshball_force(grid_index, na_grid, block_size.data(), block_index.data(), - psir_vlbr3_DM.get_ptr_2D(), dpsir_ylm_x.get_ptr_2D(), - dpsir_ylm_y.get_ptr_2D(), dpsir_ylm_z.get_ptr_2D(), - fvl_dphi_thread); - } - if(inout->isstress) - { - //calculating g_mu(r)*(r-R) where R is the location of atom - - // The array dpsirr contains derivatives of psir in the xx, xy, xz, yy, yz, zz directions, - // with each set of six numbers representing the derivatives in these respective directions. - ModuleBase::Array_Pool dpsirr_ylm(this->bxyz, LD_pool * 6); - Gint_Tools::cal_dpsirr_ylm(*this->gridt, this->bxyz, na_grid, grid_index, block_index.data(), - block_size.data(), cal_flag.get_ptr_2D(),dpsir_ylm_x.get_ptr_2D(), - dpsir_ylm_y.get_ptr_2D(),dpsir_ylm_z.get_ptr_2D(), - dpsirr_ylm.get_ptr_2D()); - - //do integration to get stress - this-> cal_meshball_stress(na_grid, block_index.data(), psir_vlbr3_DM.get_ptr_1D(), - dpsirr_ylm.get_ptr_1D(), svl_dphi_thread); - } - } -#pragma omp critical(gint) - { - if (inout->isforce) { - inout->fvl_dphi[0] += fvl_dphi_thread[0]; - delete fvl_dphi_thread; - } - if (inout->isstress) { - inout->svl_dphi[0] += svl_dphi_thread[0]; - delete svl_dphi_thread; - } - } -} - ModuleBase::TITLE("Gint_interface", "cal_gint_force"); - ModuleBase::timer::tick("Gint_interface", "cal_gint_force"); -} - -void Gint::gint_kernel_force_meta(Gint_inout* inout) { - ModuleBase::TITLE("Gint_interface", "cal_gint_force_meta"); - ModuleBase::timer::tick("Gint_interface", "cal_gint_force_meta"); - const UnitCell& ucell = *this->ucell; - const int max_size = this->gridt->max_atom; - const int ncyz = this->ny * this->nplane; - const double dv = ucell.omega / this->ncxyz; - const double delta_r = this->gridt->dr_uniform; - - -#pragma omp parallel -{ - ModuleBase::matrix* fvl_dphi_thread=inout->fvl_dphi; - ModuleBase::matrix* svl_dphi_thread=inout->svl_dphi; - if (inout->isforce) { - fvl_dphi_thread=new ModuleBase::matrix(*inout->fvl_dphi); - fvl_dphi_thread->zero_out(); - } - if (inout->isstress) { - svl_dphi_thread=new ModuleBase::matrix(*inout->svl_dphi); - svl_dphi_thread->zero_out(); - } - std::vector block_iw(max_size,0); - std::vector block_index(max_size+1,0); - std::vector block_size(max_size,0); - std::vector vldr3(this->bxyz,0.0); - std::vector vkdr3(this->bxyz,0.0); -#pragma omp for schedule(dynamic) - for (int grid_index = 0; grid_index < this->nbxx; grid_index++) { - const int na_grid = this->gridt->how_many_atoms[grid_index]; - if (na_grid == 0) { - continue; - } - Gint_Tools::get_gint_vldr3(vldr3.data(), - inout->vl, - this->bxyz, - this->bx, - this->by, - this->bz, - this->nplane, - this->gridt->start_ind[grid_index], - ncyz, - dv); - - Gint_Tools::get_gint_vldr3(vkdr3.data(), - inout->vofk, - this->bxyz, - this->bx, - this->by, - this->bz, - this->nplane, - this->gridt->start_ind[grid_index], - ncyz, - dv); - //prepare block information - ModuleBase::Array_Pool cal_flag(this->bxyz,max_size); - Gint_Tools::get_block_info(*this->gridt, this->bxyz, na_grid, grid_index, - block_iw.data(), block_index.data(), block_size.data(), cal_flag.get_ptr_2D()); - const int LD_pool = block_index[na_grid]; - - //evaluate psi and dpsi on grids - ModuleBase::Array_Pool psir_ylm(this->bxyz, LD_pool); - ModuleBase::Array_Pool dpsir_ylm_x(this->bxyz, LD_pool); - ModuleBase::Array_Pool dpsir_ylm_y(this->bxyz, LD_pool); - ModuleBase::Array_Pool dpsir_ylm_z(this->bxyz, LD_pool); - ModuleBase::Array_Pool ddpsir_ylm_xx(this->bxyz, LD_pool); - ModuleBase::Array_Pool ddpsir_ylm_xy(this->bxyz, LD_pool); - ModuleBase::Array_Pool ddpsir_ylm_xz(this->bxyz, LD_pool); - ModuleBase::Array_Pool ddpsir_ylm_yy(this->bxyz, LD_pool); - ModuleBase::Array_Pool ddpsir_ylm_yz(this->bxyz, LD_pool); - ModuleBase::Array_Pool ddpsir_ylm_zz(this->bxyz, LD_pool); - - //psi and gradient of psi - Gint_Tools::cal_dpsir_ylm(*this->gridt, this->bxyz, na_grid, grid_index, delta_r, block_index.data(), block_size.data(), cal_flag.get_ptr_2D(), - psir_ylm.get_ptr_2D(), dpsir_ylm_x.get_ptr_2D(), dpsir_ylm_y.get_ptr_2D(), dpsir_ylm_z.get_ptr_2D()); - - //hessian of psi - Gint_Tools::cal_ddpsir_ylm(*this->gridt, this->bxyz, na_grid, grid_index, delta_r, block_index.data(), block_size.data(), cal_flag.get_ptr_2D(), - ddpsir_ylm_xx.get_ptr_2D(), ddpsir_ylm_xy.get_ptr_2D(), ddpsir_ylm_xz.get_ptr_2D(), - ddpsir_ylm_yy.get_ptr_2D(), ddpsir_ylm_yz.get_ptr_2D(), ddpsir_ylm_zz.get_ptr_2D()); - - //calculating f_mu(r) = v(r)*psi_mu(r)*dv - const ModuleBase::Array_Pool psir_vlbr3 - = Gint_Tools::get_psir_vlbr3(this->bxyz, na_grid, LD_pool, block_index.data(), cal_flag.get_ptr_2D(), vldr3.data(), psir_ylm.get_ptr_2D()); - const ModuleBase::Array_Pool dpsir_x_vlbr3 - = Gint_Tools::get_psir_vlbr3(this->bxyz, na_grid, LD_pool, block_index.data(), cal_flag.get_ptr_2D(), vkdr3.data(), dpsir_ylm_x.get_ptr_2D()); - const ModuleBase::Array_Pool dpsir_y_vlbr3 - = Gint_Tools::get_psir_vlbr3(this->bxyz, na_grid, LD_pool, block_index.data(), cal_flag.get_ptr_2D(), vkdr3.data(), dpsir_ylm_y.get_ptr_2D()); - const ModuleBase::Array_Pool dpsir_z_vlbr3 - = Gint_Tools::get_psir_vlbr3(this->bxyz, na_grid, LD_pool, block_index.data(), cal_flag.get_ptr_2D(), vkdr3.data(), dpsir_ylm_z.get_ptr_2D()); - - ModuleBase::Array_Pool psir_vlbr3_DM(this->bxyz, LD_pool); - ModuleBase::Array_Pool dpsirx_v_DM(this->bxyz, LD_pool); - ModuleBase::Array_Pool dpsiry_v_DM(this->bxyz, LD_pool); - ModuleBase::Array_Pool dpsirz_v_DM(this->bxyz, LD_pool); - - ModuleBase::GlobalFunc::ZEROS(psir_vlbr3_DM.get_ptr_1D(), this->bxyz*LD_pool); - ModuleBase::GlobalFunc::ZEROS(dpsirx_v_DM.get_ptr_1D(), this->bxyz*LD_pool); - ModuleBase::GlobalFunc::ZEROS(dpsiry_v_DM.get_ptr_1D(), this->bxyz*LD_pool); - ModuleBase::GlobalFunc::ZEROS(dpsirz_v_DM.get_ptr_1D(), this->bxyz*LD_pool); - - //calculating g_mu(r) = sum_nu rho_mu,nu f_nu(r) - Gint_Tools::mult_psi_DMR(*this->gridt, this->bxyz, LD_pool, grid_index, - na_grid, block_index.data(), block_size.data(), cal_flag.get_ptr_2D(), - psir_vlbr3.get_ptr_2D(), psir_vlbr3_DM.get_ptr_2D(), this->dmr_gint[inout->ispin], false); - - Gint_Tools::mult_psi_DMR(*this->gridt, this->bxyz, LD_pool, grid_index, - na_grid, block_index.data(), block_size.data(), cal_flag.get_ptr_2D(), - dpsir_x_vlbr3.get_ptr_2D(), dpsirx_v_DM.get_ptr_2D(), this->dmr_gint[inout->ispin], false); - - Gint_Tools::mult_psi_DMR(*this->gridt, this->bxyz, LD_pool, grid_index, - na_grid, block_index.data(), block_size.data(), cal_flag.get_ptr_2D(), - dpsir_y_vlbr3.get_ptr_2D(), dpsiry_v_DM.get_ptr_2D(), this->dmr_gint[inout->ispin], false); - - Gint_Tools::mult_psi_DMR(*this->gridt, this->bxyz, LD_pool, grid_index, - na_grid, block_index.data(), block_size.data(), cal_flag.get_ptr_2D(), - dpsir_z_vlbr3.get_ptr_2D(), dpsirz_v_DM.get_ptr_2D(), this->dmr_gint[inout->ispin], false); - - if(inout->isforce) - { - //do integration to get force - this-> cal_meshball_force(grid_index, na_grid, block_size.data(), block_index.data(), - psir_vlbr3_DM.get_ptr_2D(), dpsir_ylm_x.get_ptr_2D(), dpsir_ylm_y.get_ptr_2D(), dpsir_ylm_z.get_ptr_2D(), - fvl_dphi_thread); - - this-> cal_meshball_force(grid_index, na_grid, block_size.data(), block_index.data(), - dpsirx_v_DM.get_ptr_2D(), ddpsir_ylm_xx.get_ptr_2D(), ddpsir_ylm_xy.get_ptr_2D(), ddpsir_ylm_xz.get_ptr_2D(), - fvl_dphi_thread); - this-> cal_meshball_force(grid_index, na_grid, block_size.data(), block_index.data(), - dpsiry_v_DM.get_ptr_2D(), ddpsir_ylm_xy.get_ptr_2D(), ddpsir_ylm_yy.get_ptr_2D(), ddpsir_ylm_yz.get_ptr_2D(), - fvl_dphi_thread); - this-> cal_meshball_force(grid_index, na_grid, block_size.data(), block_index.data(), - dpsirz_v_DM.get_ptr_2D(), ddpsir_ylm_xz.get_ptr_2D(), ddpsir_ylm_yz.get_ptr_2D(), ddpsir_ylm_zz.get_ptr_2D(), - fvl_dphi_thread); - - } - if(inout->isstress) - { - //calculating g_mu(r)*(r-R) where R is the location of atom - ModuleBase::Array_Pool array(this->bxyz, LD_pool * 6); - - //the vxc part - Gint_Tools::cal_dpsirr_ylm(*this->gridt, this->bxyz, na_grid, grid_index, block_index.data(), block_size.data(), cal_flag.get_ptr_2D(), - dpsir_ylm_x.get_ptr_2D(), dpsir_ylm_y.get_ptr_2D(), dpsir_ylm_z.get_ptr_2D(), array.get_ptr_2D()); - //do integration to get stress - this-> cal_meshball_stress(na_grid, block_index.data(), psir_vlbr3_DM.get_ptr_1D(), - array.get_ptr_1D(), svl_dphi_thread); - - //partial x of vtau part - Gint_Tools::cal_dpsirr_ylm(*this->gridt, this->bxyz, na_grid, grid_index, block_index.data(), block_size.data(), cal_flag.get_ptr_2D(), - ddpsir_ylm_xx.get_ptr_2D(), ddpsir_ylm_xy.get_ptr_2D(), ddpsir_ylm_xz.get_ptr_2D(), array.get_ptr_2D()); - //do integration to get stress - this-> cal_meshball_stress(na_grid, block_index.data(), dpsirx_v_DM.get_ptr_1D(), - array.get_ptr_1D(), svl_dphi_thread); - - //partial y of vtau part - Gint_Tools::cal_dpsirr_ylm(*this->gridt, this->bxyz, na_grid, grid_index, block_index.data(), block_size.data(), cal_flag.get_ptr_2D(), - ddpsir_ylm_xy.get_ptr_2D(), ddpsir_ylm_yy.get_ptr_2D(), ddpsir_ylm_yz.get_ptr_2D(), array.get_ptr_2D()); - //do integration to get stress - this-> cal_meshball_stress(na_grid, block_index.data(), dpsiry_v_DM.get_ptr_1D(), - array.get_ptr_1D(), svl_dphi_thread); - - //partial z of vtau part - Gint_Tools::cal_dpsirr_ylm(*this->gridt, this->bxyz, na_grid, grid_index, block_index.data(), block_size.data(), cal_flag.get_ptr_2D(), - ddpsir_ylm_xz.get_ptr_2D(), ddpsir_ylm_yz.get_ptr_2D(), ddpsir_ylm_zz.get_ptr_2D(), array.get_ptr_2D()); - //do integration to get stress - this-> cal_meshball_stress(na_grid, block_index.data(), dpsirz_v_DM.get_ptr_1D(), - array.get_ptr_1D(), svl_dphi_thread); - } - } -#pragma omp critical(gint) - { - if (inout->isforce) { - inout->fvl_dphi[0] += fvl_dphi_thread[0]; - delete fvl_dphi_thread; - } - if (inout->isstress) { - inout->svl_dphi[0] += svl_dphi_thread[0]; - delete svl_dphi_thread; - } - } -} - ModuleBase::TITLE("Gint_interface", "cal_gint_force_meta"); - ModuleBase::timer::tick("Gint_interface", "cal_gint_force_meta"); -} diff --git a/source/source_lcao/module_gint/gint_force_gpu.cu b/source/source_lcao/module_gint/gint_force_gpu.cu deleted file mode 100644 index cb3390aacc..0000000000 --- a/source/source_lcao/module_gint/gint_force_gpu.cu +++ /dev/null @@ -1,301 +0,0 @@ -#ifdef _OPENMP -#include -#endif - -#include "gint_force_gpu.h" -#include "kernels/cuda/cuda_tools.cuh" -#include "kernels/cuda/gint_force.cuh" -#include "source_base/ylm.h" -#include "gint_tools.h" - -namespace GintKernel -{ -/** - * @brief Calculate forces and stresses - * @note The grid integration on the GPU is mainly divided into the following - * steps: - * 1. Use the CPU to divide the grid integration into subtasks. - * 2. Copy the subtask information to the GPU. - * 3. Calculate the matrix elements on the GPU. - * 4. Perform matrix multiplication on the GPU. - * 5. stress dot on the GPU. - * 6. force dot on the GPU. - * 7. Copy the results back to the host. - */ -void gint_fvl_gpu(const hamilt::HContainer* dm, - const double* vlocal, - double* force_in, - double* stress_in, - double dr, - const double* rcut, - const int isforce, - const int isstress, - const Grid_Technique& gridt, - const UnitCell& ucell) -{ - checkCuda(cudaSetDevice(gridt.dev_id)); - // checkCuda(cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync)); - - const int nbzp = gridt.nbzp; - const int max_atom = gridt.max_atom; - const int nwmax = ucell.nwmax; - const int bxyz = gridt.bxyz; - const int max_atom_per_bcell = max_atom * bxyz; - const int max_atom_per_z = max_atom_per_bcell * nbzp; - const int max_phi_per_z = max_atom_per_z * ucell.nwmax; - const int max_atompair_per_z = max_atom * max_atom * nbzp; - const double vfactor = ucell.omega / gridt.ncxyz; - const int nczp = nbzp * gridt.bz; - const int nat=ucell.nat; - - const int num_streams = gridt.nstreams; - - std::vector streams(num_streams); - std::vector events(num_streams); - for (int i = 0; i < num_streams; i++) - { - checkCuda(cudaStreamCreate(&streams[i])); - checkCuda(cudaEventCreateWithFlags(&events[i], cudaEventDisableTiming)); - } - - Cuda_Mem_Wrapper dr_part(3 * max_atom_per_z, num_streams, true); - Cuda_Mem_Wrapper atoms_type(max_atom_per_z, num_streams, true); - Cuda_Mem_Wrapper iat_on_nbz(max_atom_per_z, num_streams, true); - // The first number in every group of two represents the number of atoms on that bigcell. - // The second number represents the cumulative number of atoms up to that bigcell. - Cuda_Mem_Wrapper atoms_num_info(2 * nbzp, num_streams, true); - Cuda_Mem_Wrapper vldr3(nbzp * gridt.bxyz, num_streams, true); - - Cuda_Mem_Wrapper psi(max_phi_per_z, num_streams, false); - Cuda_Mem_Wrapper psi_dm(max_phi_per_z, num_streams, false); - Cuda_Mem_Wrapper dpsi(3 * max_phi_per_z, num_streams, false); - Cuda_Mem_Wrapper d2psi(6 * max_phi_per_z, num_streams, false); - - Cuda_Mem_Wrapper gemm_alpha(max_atompair_per_z, num_streams, true); - Cuda_Mem_Wrapper gemm_m(max_atompair_per_z, num_streams, true); - Cuda_Mem_Wrapper gemm_n(max_atompair_per_z, num_streams, true); - Cuda_Mem_Wrapper gemm_k(max_atompair_per_z, num_streams, true); - Cuda_Mem_Wrapper gemm_lda(max_atompair_per_z, num_streams, true); - Cuda_Mem_Wrapper gemm_ldb(max_atompair_per_z, num_streams, true); - Cuda_Mem_Wrapper gemm_ldc(max_atompair_per_z, num_streams, true); - Cuda_Mem_Wrapper gemm_A(max_atompair_per_z, num_streams, true); - Cuda_Mem_Wrapper gemm_B(max_atompair_per_z, num_streams, true); - Cuda_Mem_Wrapper gemm_C(max_atompair_per_z, num_streams, true); - - Cuda_Mem_Wrapper force(3 * nat, num_streams, true); - Cuda_Mem_Wrapper stress(6, num_streams, true); - - Cuda_Mem_Wrapper dm_matrix(dm->get_nnr(), 1, false); - // retrieve the density matrix on the host - checkCuda(cudaMemcpy(dm_matrix.get_device_pointer(), - dm->get_wrapper(), - dm->get_nnr() * sizeof(double), - cudaMemcpyHostToDevice)); - -#ifdef _OPENMP -const int max_thread_num = std::min(omp_get_max_threads(), num_streams); -#endif -#pragma omp parallel num_threads(max_thread_num) -{ -#ifdef _OPENMP - const int tid = omp_get_thread_num(); - const int num_threads = omp_get_num_threads(); - const int sid_start = tid * num_streams / num_threads; - const int thread_num_streams = tid == num_threads - 1 ? num_streams - sid_start : num_streams / num_threads; -#else - const int sid_start = 0; - const int thread_num_streams = num_streams; -#endif -#pragma omp for collapse(2) schedule(dynamic) - for (int i = 0; i < gridt.nbx; i++) - { - for (int j = 0; j < gridt.nby; j++) - { - // 20240620 Note that it must be set again here because - // cuda's device is not safe in a multi-threaded environment. - checkCuda(cudaSetDevice(gridt.dev_id)); - - const int sid = (i * gridt.nby + j) % thread_num_streams + sid_start; - checkCuda(cudaEventSynchronize(events[sid])); - - int max_m = 0; - int max_n = 0; - int atom_pair_num = 0; - int atoms_per_z = 0; - const int grid_index_ij = i * gridt.nby * nbzp + j * nbzp; - - gtask_force(gridt, - ucell, - grid_index_ij, - nczp, - vfactor, - vlocal, - atoms_per_z, - atoms_num_info.get_host_pointer(sid), - iat_on_nbz.get_host_pointer(sid), - atoms_type.get_host_pointer(sid), - dr_part.get_host_pointer(sid), - vldr3.get_host_pointer(sid)); - - alloc_mult_force(dm, - gridt, - ucell, - grid_index_ij, - max_atom, - atoms_num_info.get_host_pointer(sid), - psi.get_device_pointer(sid), - psi_dm.get_device_pointer(sid), - dm_matrix.get_device_pointer(), - max_m, - max_n, - atom_pair_num, - gemm_m.get_host_pointer(sid), - gemm_n.get_host_pointer(sid), - gemm_k.get_host_pointer(sid), - gemm_lda.get_host_pointer(sid), - gemm_ldb.get_host_pointer(sid), - gemm_ldc.get_host_pointer(sid), - gemm_A.get_host_pointer(sid), - gemm_B.get_host_pointer(sid), - gemm_C.get_host_pointer(sid)); - - dr_part.copy_host_to_device_async(streams[sid], sid, 3 * atoms_per_z); - atoms_type.copy_host_to_device_async(streams[sid], sid, atoms_per_z); - iat_on_nbz.copy_host_to_device_async(streams[sid], sid, atoms_per_z); - vldr3.copy_host_to_device_async(streams[sid], sid); - atoms_num_info.copy_host_to_device_async(streams[sid], sid); - - gemm_m.copy_host_to_device_async(streams[sid], sid, atom_pair_num); - gemm_n.copy_host_to_device_async(streams[sid], sid, atom_pair_num); - gemm_k.copy_host_to_device_async(streams[sid], sid, atom_pair_num); - gemm_lda.copy_host_to_device_async(streams[sid], sid, atom_pair_num); - gemm_ldb.copy_host_to_device_async(streams[sid], sid, atom_pair_num); - gemm_ldc.copy_host_to_device_async(streams[sid], sid, atom_pair_num); - gemm_A.copy_host_to_device_async(streams[sid], sid, atom_pair_num); - gemm_B.copy_host_to_device_async(streams[sid], sid, atom_pair_num); - gemm_C.copy_host_to_device_async(streams[sid], sid, atom_pair_num); - checkCuda(cudaEventRecord(events[sid], streams[sid])); - - psi.memset_device_async(streams[sid], sid, 0); - psi_dm.memset_device_async(streams[sid], sid, 0); - dpsi.memset_device_async(streams[sid], sid, 0); - d2psi.memset_device_async(streams[sid], sid, 0); - - dim3 grid_psi(nbzp, gridt.bxyz); - dim3 block_psi(64); - get_psi_force<<>>( - gridt.ylmcoef_g, - dr, - bxyz, - nwmax, - max_atom, - gridt.atom_nwl_g, - gridt.atom_new_g, - gridt.atom_ylm_g, - gridt.atom_l_g, - gridt.atom_nw_g, - gridt.rcut_g, - gridt.nr_max, - gridt.psi_u_g, - gridt.mcell_pos_g, - dr_part.get_device_pointer(sid), - vldr3.get_device_pointer(sid), - atoms_type.get_device_pointer(sid), - atoms_num_info.get_device_pointer(sid), - psi.get_device_pointer(sid), - dpsi.get_device_pointer(sid), - d2psi.get_device_pointer(sid)); - checkCudaLastError(); - - gridt.fastest_matrix_mul(max_m, - max_n, - gemm_m.get_device_pointer(sid), - gemm_n.get_device_pointer(sid), - gemm_k.get_device_pointer(sid), - gemm_A.get_device_pointer(sid), - gemm_lda.get_device_pointer(sid), - gemm_B.get_device_pointer(sid), - gemm_ldb.get_device_pointer(sid), - gemm_C.get_device_pointer(sid), - gemm_ldc.get_device_pointer(sid), - atom_pair_num, - streams[sid], - nullptr); - - if (isforce){ - dim3 grid_force(nbzp); - dim3 block_force(64); - dot_product_force<<>>( - bxyz, - nwmax, - atoms_num_info.get_device_pointer(sid), - iat_on_nbz.get_device_pointer(sid), - dpsi.get_device_pointer(sid), - psi_dm.get_device_pointer(sid), - force.get_device_pointer(sid)); - checkCudaLastError(); - } - - if (isstress){ - dim3 grid_stress(nbzp); - dim3 block_stress(64); - dot_product_stress<<>>( - d2psi.get_device_pointer(sid), - psi_dm.get_device_pointer(sid), - atoms_per_z * nwmax * bxyz, - stress.get_device_pointer(sid)); - checkCudaLastError(); - } - } - } -} - - for(int i = 0; i < num_streams; i++) - { - stress.copy_device_to_host_async(streams[i], i); - force.copy_device_to_host_async(streams[i], i); - } - - for (int i = 0; i < num_streams; i++) - { - checkCuda(cudaStreamSynchronize(streams[i])); - checkCuda(cudaEventDestroy(events[i])); - } - - if (isstress){ - for (int i = 0; i < num_streams; i++) - { - const int offset = 6 * i; - for (int j = 0; j < 6; j++) - { - stress_in[j] += stress.get_host_pointer()[offset + j]; - } - } - } - if (isforce){ - for (int i = 0; i < num_streams; i++) - { - const int offset = 3 * i * nat; - for (int j = 0; j < 3 * nat; j++) - { - force_in[j] += force.get_host_pointer()[offset + j]; - } - } - } - - for (int i = 0; i < num_streams; i++) - { - checkCuda(cudaStreamDestroy(streams[i])); - } -} - -} // namespace GintKernel diff --git a/source/source_lcao/module_gint/gint_force_gpu.h b/source/source_lcao/module_gint/gint_force_gpu.h deleted file mode 100644 index 0dac4a99d6..0000000000 --- a/source/source_lcao/module_gint/gint_force_gpu.h +++ /dev/null @@ -1,55 +0,0 @@ -#ifndef W_ABACUS_DEVELOP_ABACUS_DEVELOP_SOURCE_MODULE_HAMILT_LCAO_MODULE_GINT_GINT_FORCE_GPU_H -#define W_ABACUS_DEVELOP_ABACUS_DEVELOP_SOURCE_MODULE_HAMILT_LCAO_MODULE_GINT_GINT_FORCE_GPU_H - -#include "source_lcao/module_gint/gint.h" -#include "source_lcao/module_gint/grid_technique.h" -namespace GintKernel -{ -void gint_fvl_gpu(const hamilt::HContainer* dm, - const double* vlocal, - double* force_in, - double* stress_in, - double dr, - const double* rcut, - const int isforce, - const int isstress, - const Grid_Technique& gridt, - const UnitCell& ucell); - -void gtask_force(const Grid_Technique& gridt, - const UnitCell& ucell, - const int grid_index_ij, - const int nczp, - const double vfactor, - const double* vlocal_global_value, - int& atoms_per_z, - int* atoms_num_info, - int* iat_on_nbz, - uint8_t* atoms_type, - double* dr_part, - double* vldr3); - -void alloc_mult_force(const hamilt::HContainer* dm, - const Grid_Technique& gridt, - const UnitCell& ucell, - const int grid_index_ij, - const int max_atom, - const int *atoms_num_info, - double* const psi_g, - double* const psi_dm_g, - double* const dm_matrix_g, - int& max_m, - int& max_n, - int& atom_pair_num, - int* mat_m, - int* mat_n, - int* mat_k, - int* mat_lda, - int* mat_ldb, - int* mat_ldc, - double** mat_A, - double** mat_B, - double** mat_C); - -} // namespace GintKernel -#endif diff --git a/source/source_lcao/module_gint/temp_gint/gint_fvl.cpp b/source/source_lcao/module_gint/gint_fvl.cpp similarity index 100% rename from source/source_lcao/module_gint/temp_gint/gint_fvl.cpp rename to source/source_lcao/module_gint/gint_fvl.cpp diff --git a/source/source_lcao/module_gint/temp_gint/gint_fvl.h b/source/source_lcao/module_gint/gint_fvl.h similarity index 100% rename from source/source_lcao/module_gint/temp_gint/gint_fvl.h rename to source/source_lcao/module_gint/gint_fvl.h diff --git a/source/source_lcao/module_gint/temp_gint/gint_fvl_gpu.cpp b/source/source_lcao/module_gint/gint_fvl_gpu.cpp similarity index 100% rename from source/source_lcao/module_gint/temp_gint/gint_fvl_gpu.cpp rename to source/source_lcao/module_gint/gint_fvl_gpu.cpp diff --git a/source/source_lcao/module_gint/temp_gint/gint_fvl_gpu.h b/source/source_lcao/module_gint/gint_fvl_gpu.h similarity index 95% rename from source/source_lcao/module_gint/temp_gint/gint_fvl_gpu.h rename to source/source_lcao/module_gint/gint_fvl_gpu.h index b613333e7a..cdbcd40aa9 100644 --- a/source/source_lcao/module_gint/temp_gint/gint_fvl_gpu.h +++ b/source/source_lcao/module_gint/gint_fvl_gpu.h @@ -6,7 +6,7 @@ #include "source_base/matrix.h" #include "gint.h" #include "gint_info.h" -#include "source_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h" +#include "source_lcao/module_gint/kernel/cuda_mem_wrapper.h" namespace ModuleGint { diff --git a/source/source_lcao/module_gint/temp_gint/gint_fvl_meta.cpp b/source/source_lcao/module_gint/gint_fvl_meta.cpp similarity index 100% rename from source/source_lcao/module_gint/temp_gint/gint_fvl_meta.cpp rename to source/source_lcao/module_gint/gint_fvl_meta.cpp diff --git a/source/source_lcao/module_gint/temp_gint/gint_fvl_meta.h b/source/source_lcao/module_gint/gint_fvl_meta.h similarity index 100% rename from source/source_lcao/module_gint/temp_gint/gint_fvl_meta.h rename to source/source_lcao/module_gint/gint_fvl_meta.h diff --git a/source/source_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.cpp b/source/source_lcao/module_gint/gint_fvl_meta_gpu.cpp similarity index 100% rename from source/source_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.cpp rename to source/source_lcao/module_gint/gint_fvl_meta_gpu.cpp diff --git a/source/source_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.h b/source/source_lcao/module_gint/gint_fvl_meta_gpu.h similarity index 95% rename from source/source_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.h rename to source/source_lcao/module_gint/gint_fvl_meta_gpu.h index 2b9d88aec2..a1b41cbd61 100644 --- a/source/source_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.h +++ b/source/source_lcao/module_gint/gint_fvl_meta_gpu.h @@ -6,7 +6,7 @@ #include "source_base/matrix.h" #include "gint.h" #include "gint_info.h" -#include "source_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h" +#include "source_lcao/module_gint/kernel/cuda_mem_wrapper.h" namespace ModuleGint { diff --git a/source/source_lcao/module_gint/gint_fvl_old.cpp b/source/source_lcao/module_gint/gint_fvl_old.cpp deleted file mode 100644 index 663a7ddce6..0000000000 --- a/source/source_lcao/module_gint/gint_fvl_old.cpp +++ /dev/null @@ -1,76 +0,0 @@ -#include "gint_k.h" -#include "source_base/timer.h" -#include "source_base/ylm.h" -#include "source_pw/module_pwdft/global.h" -#include "source_base/array_pool.h" - -// This function utilizes the cache more effectively than calling the ddot function, thus performing faster. -void Gint::cal_meshball_force( - const int grid_index, - const int na_grid, // how many atoms on this (i,j,k) grid - const int*const block_size, // block_size[na_grid], number of columns of a band - const int*const block_index, // block_index[na_grid+1], count total number of atomis orbitals - const double*const*const psir_vlbr3_DMR, // psir_vlbr3[this->bxyz][LD_pool] - const double*const*const dpsir_x, // psir_vlbr3[this->bxyz][LD_pool] - const double*const*const dpsir_y, // psir_vlbr3[this->bxyz][LD_pool] - const double*const*const dpsir_z, // psir_vlbr3[this->bxyz][LD_pool] - ModuleBase::matrix *force) -{ - for(int ia1=0;ia1gridt->bcell_start[grid_index] + ia1; - const int iat=this->gridt->which_atom[mcell_index]; // index of atom - double rx = 0; - double ry = 0; - double rz = 0; - for(int ib=0; ibbxyz; ib++) - { - for(int iw=0; iwbxyz; - - for(int i=0; i -#endif - -//========================================================= -// ModuleBase::Integral On 3D Grids, different from Grid_Integral -// Feature : Matrix Elements Of Local Potential For -// Numerical Orbitals -//========================================================= - -class Gint_Gamma : public Gint -{ - public: - - //! @brief move operator for the next ESolver to directly use its infomation - //! @param rhs - //! @return *this - Gint_Gamma& operator=(Gint_Gamma&& rhs); - - //! in gint_gamma_vl.cpp - //! there is an additional step in calculating vlocal for gamma point - //! namely the redistribution of Hamiltonian from grid to 2D block format - //! hence we have an additional layer outside the unified interface - void cal_vlocal(Gint_inout* inout, const bool new_e_iteration); - - //! in gint_gamma_env.cpp - //! calcualte the electronic wave functions via grid integral - void cal_env(const double* wfc, double* rho,const UnitCell &ucell); - - //! transfer this->hRGint to Veff::hR - void transfer_pvpR(hamilt::HContainer* hR,const UnitCell* ucell); - -private: - - //! pointer to density matrix - double*** DM = nullptr; - -}; - -#endif diff --git a/source/source_lcao/module_gint/gint_gamma_env.cpp b/source/source_lcao/module_gint/gint_gamma_env.cpp deleted file mode 100644 index 76ae6e506a..0000000000 --- a/source/source_lcao/module_gint/gint_gamma_env.cpp +++ /dev/null @@ -1,101 +0,0 @@ -#include "gint_gamma.h" -#include "grid_technique.h" -#include "source_base/timer.h" -#include "source_base/ylm.h" -#include "source_base/array_pool.h" -#include "source_basis/module_ao/ORB_read.h" -#include "source_pw/module_pwdft/global.h" - -void Gint_Gamma::cal_env(const double* wfc, double* rho,const UnitCell& ucell) -{ - ModuleBase::TITLE("Grid_Integral", "cal_env"); - - // it's a uniform grid to save orbital values, so the delta_r is a constant. - const double delta_r = this->gridt->dr_uniform; - const int max_size = this->gridt->max_atom; - if (max_size <= 0){ - ModuleBase::WARNING_QUIT("Gint_Gamma::cal_env", - "the max_size is less than 0!"); - } - const int nbx = this->gridt->nbx; - const int nby = this->gridt->nby; - const int nbz = this->gridt->nbzp; - const int ncyz = this->ny * this->nplane; // mohan add 2012-03-25 - const int bxyz = this->bxyz; - - #pragma omp parallel - { - std::vector block_iw(max_size, 0); - std::vector block_index(max_size+1, 0); - std::vector block_size(max_size, 0); - std::vector vindex(bxyz,0); - #pragma omp for - for (int grid_index = 0; grid_index < this->nbxx; grid_index++) - { - - // get the value: how many atoms has orbital value on this grid. - const int size = this->gridt->how_many_atoms[grid_index]; - if (size == 0) - continue; - - // int *block_iw, *block_index, *block_size; - ModuleBase::Array_Pool cal_flag(bxyz, size); - Gint_Tools::get_block_info(*this->gridt, - this->bxyz, - size, - grid_index, - block_iw.data(), - block_index.data(), - block_size.data(), - cal_flag.get_ptr_2D()); - const int LD_pool = block_index[size]; - - // evaluate psi on grids - ModuleBase::Array_Pool psir_ylm(this->bxyz, LD_pool); - Gint_Tools::cal_psir_ylm(*this->gridt, - this->bxyz, - size, - grid_index, - delta_r, - block_index.data(), - block_size.data(), - cal_flag.get_ptr_2D(), - psir_ylm.get_ptr_2D()); - - Gint_Tools::get_vindex(this->bxyz, - this->bx, - this->by, - this->bz, - this->nplane, - this->gridt->start_ind[grid_index], - ncyz, - vindex.data()); - - for (int ia1 = 0; ia1 < size; ia1++) - { - const int mcell_index1 = this->gridt->bcell_start[grid_index] + ia1; - const int iat = this->gridt->which_atom[mcell_index1]; - const int T1 = ucell.iat2it[iat]; - Atom* atom1 = &ucell.atoms[T1]; - const int I1 = ucell.iat2ia[iat]; - // get the start index of local orbitals. - const int start1 = ucell.itiaiw2iwt(T1, I1, 0); - for (int ib = 0; ib < this->bxyz; ib++) - { - if (cal_flag[ib][ia1]) - { - int iw1_lo = this->gridt->trace_lo[start1]; - double* psi1 = &psir_ylm[ib][block_index[ia1]]; - double tmp = 0.0; - for (int iw = 0; iw < atom1->nw; ++iw, ++iw1_lo) - { - tmp += psi1[iw] * wfc[iw1_lo]; - } // iw - rho[vindex[ib]] += tmp; - } // cal_flag - } // ib - } // ia1 - } - } - return; -} diff --git a/source/source_lcao/module_gint/gint_gamma_vl.cpp b/source/source_lcao/module_gint/gint_gamma_vl.cpp deleted file mode 100644 index 161a3e7083..0000000000 --- a/source/source_lcao/module_gint/gint_gamma_vl.cpp +++ /dev/null @@ -1,95 +0,0 @@ -//========================================================= -// REFACTOR : Peize Lin, 2021.06.28 -//========================================================= -#include "gint_gamma.h" -#include "gint_tools.h" -#include "grid_technique.h" -#include "source_base/memory.h" -#include "source_base/timer.h" -#include "source_basis/module_ao/ORB_read.h" -#include "source_lcao/module_hcontainer/hcontainer_funcs.h" -#include "source_pw/module_pwdft/global.h" - -#ifdef _OPENMP -#include -#endif - -#ifdef __MKL -#include -#endif - -extern "C" -{ - void Cblacs_gridinfo(int icontxt, int* nprow, int* npcol, int* myprow, int* mypcol); - void Cblacs_pinfo(int* myid, int* nprocs); - void Cblacs_pcoord(int icontxt, int pnum, int* prow, int* pcol); -} - -void Gint_Gamma::cal_vlocal(Gint_inout* inout, bool new_e_iteration) -{ - const int max_size = this->gridt->max_atom; - const int lgd = this->gridt->lgd; - - if (inout->job == Gint_Tools::job_type::vlocal || inout->job == Gint_Tools::job_type::vlocal_meta) - { - if (max_size > 0 && lgd > 0) - { - this->hRGint->set_zero(); - } - - this->cal_gint(inout); - } -} - -#ifdef __MPI -#include "source_lcao/module_hcontainer/hcontainer_funcs.h" -#endif -void Gint_Gamma::transfer_pvpR(hamilt::HContainer* hR, const UnitCell* ucell) -{ - ModuleBase::TITLE("Gint_Gamma", "transfer_pvpR"); - ModuleBase::timer::tick("Gint_Gamma", "transfer_pvpR"); - - for (int iap = 0; iap < this->hRGint->size_atom_pairs(); iap++) - { - auto& ap = this->hRGint->get_atom_pair(iap); - const int iat1 = ap.get_atom_i(); - const int iat2 = ap.get_atom_j(); - if (iat1 > iat2) - { - // fill lower triangle matrix with upper triangle matrix - // gamma_only case, only 1 R_index in each AtomPair - // the upper is - const hamilt::AtomPair* upper_ap = this->hRGint->find_pair(iat2, iat1); -#ifdef __DEBUG - assert(upper_ap != nullptr); -#endif - double* lower_matrix = ap.get_pointer(0); - for (int irow = 0; irow < ap.get_row_size(); ++irow) - { - for (int icol = 0; icol < ap.get_col_size(); ++icol) - { - *lower_matrix++ = upper_ap->get_value(icol, irow); - } - } - } - } - -#ifdef __MPI - int size = 0; - MPI_Comm_size(MPI_COMM_WORLD, &size); - if (size == 1) - { - hR->add(*this->hRGint); - } - else - { - hamilt::transferSerials2Parallels(*this->hRGint, hR); - } -#else - hR->add(*this->hRGint); -#endif - - ModuleBase::timer::tick("Gint_Gamma", "transfer_pvpR"); - - return; -} diff --git a/source/source_lcao/module_gint/gint_gpu_interface.cpp b/source/source_lcao/module_gint/gint_gpu_interface.cpp deleted file mode 100644 index 8e8e362f23..0000000000 --- a/source/source_lcao/module_gint/gint_gpu_interface.cpp +++ /dev/null @@ -1,108 +0,0 @@ -#include "gint.h" -#include "gint_force_gpu.h" -#include "source_io/module_parameter/parameter.h" -#include "gint_rho_gpu.h" -#include "gint_vl_gpu.h" -#include "source_base/memory.h" -#include "source_base/timer.h" - -void Gint::gpu_vlocal_interface(Gint_inout* inout) { - ModuleBase::TITLE("Gint_interface", "cal_gint_vlocal"); - ModuleBase::timer::tick("Gint_interface", "cal_gint_vlocal"); - - const UnitCell& ucell = *this->ucell; - const double dr = this->gridt->dr_uniform; - double ylmcoef[100]; - ModuleBase::GlobalFunc::ZEROS(ylmcoef, 100); - for (int i = 0; i < 100; i++) { - ylmcoef[i] = ModuleBase::Ylm::ylmcoef[i]; - } - - hamilt::HContainer* hRGint_kernel = PARAM.inp.nspin != 4 ? this->hRGint : this->hr_gint_tmp[inout->ispin]; - GintKernel::gint_vl_gpu(hRGint_kernel, - inout->vl, - ylmcoef, - dr, - this->gridt->rcuts.data(), - *this->gridt, - ucell); - - ModuleBase::TITLE("Gint_interface", "cal_gint_vlocal"); - ModuleBase::timer::tick("Gint_interface", "cal_gint_vlocal"); -} - -void Gint::gpu_rho_interface(Gint_inout* inout) { - ModuleBase::TITLE("Gint_interface", "cal_gint_rho"); - ModuleBase::timer::tick("Gint_interface", "cal_gint_rho"); - - const UnitCell& ucell = *this->ucell; - const double dr = this->gridt->dr_uniform; - double ylmcoef[100]; - ModuleBase::GlobalFunc::ZEROS(ylmcoef, 100); - for (int i = 0; i < 100; i++) { - ylmcoef[i] = ModuleBase::Ylm::ylmcoef[i]; - } - int nrxx = this->gridt->ncx * this->gridt->ncy * this->nplane; - for (int is = 0; is < PARAM.inp.nspin; ++is) { - ModuleBase::GlobalFunc::ZEROS(inout->rho[is], nrxx); - GintKernel::gint_rho_gpu(this->dmr_gint[is], - ylmcoef, - dr, - this->gridt->rcuts.data(), - *this->gridt, - ucell, - inout->rho[is]); - } - ModuleBase::TITLE("Gint_interface", "cal_gint_rho"); - ModuleBase::timer::tick("Gint_interface", "cal_gint_rho"); -} - -void Gint::gpu_force_interface(Gint_inout* inout) { - ModuleBase::TITLE("Gint_interface", "cal_gint_force"); - ModuleBase::timer::tick("Gint_interface", "cal_gint_force"); - - const UnitCell& ucell = *this->ucell; - const double dr = this->gridt->dr_uniform; - double ylmcoef[100]; - ModuleBase::GlobalFunc::ZEROS(ylmcoef, 100); - for (int i = 0; i < 100; i++) { - ylmcoef[i] = ModuleBase::Ylm::ylmcoef[i]; - } - - const int ncyz = this->ny * this->nplane; - int nat = ucell.nat; - const int isforce = inout->isforce; - const int isstress = inout->isstress; - if (isforce || isstress) { - std::vector force(nat * 3, 0.0); - std::vector stress(6, 0.0); - GintKernel::gint_fvl_gpu(this->dmr_gint[inout->ispin], - inout->vl, - force.data(), - stress.data(), - dr, - this->gridt->rcuts.data(), - isforce, - isstress, - *this->gridt, - ucell); - if (inout->isforce) { - for (int iat = 0; iat < nat; iat++) { - inout->fvl_dphi[0](iat, 0) += force[iat * 3]; - inout->fvl_dphi[0](iat, 1) += force[iat * 3 + 1]; - inout->fvl_dphi[0](iat, 2) += force[iat * 3 + 2]; - } - } - if (inout->isstress) { - inout->svl_dphi[0](0, 0) += stress[0]; - inout->svl_dphi[0](0, 1) += stress[1]; - inout->svl_dphi[0](0, 2) += stress[2]; - inout->svl_dphi[0](1, 1) += stress[3]; - inout->svl_dphi[0](1, 2) += stress[4]; - inout->svl_dphi[0](2, 2) += stress[5]; - } - } - - ModuleBase::TITLE("Gint_interface", "cal_gint_force"); - ModuleBase::timer::tick("Gint_interface", "cal_gint_force"); -} \ No newline at end of file diff --git a/source/source_lcao/module_gint/temp_gint/gint_helper.h b/source/source_lcao/module_gint/gint_helper.h similarity index 100% rename from source/source_lcao/module_gint/temp_gint/gint_helper.h rename to source/source_lcao/module_gint/gint_helper.h diff --git a/source/source_lcao/module_gint/temp_gint/gint_info.cpp b/source/source_lcao/module_gint/gint_info.cpp similarity index 100% rename from source/source_lcao/module_gint/temp_gint/gint_info.cpp rename to source/source_lcao/module_gint/gint_info.cpp diff --git a/source/source_lcao/module_gint/temp_gint/gint_info.h b/source/source_lcao/module_gint/gint_info.h similarity index 98% rename from source/source_lcao/module_gint/temp_gint/gint_info.h rename to source/source_lcao/module_gint/gint_info.h index 0f311c1bcc..a2e35b6642 100644 --- a/source/source_lcao/module_gint/temp_gint/gint_info.h +++ b/source/source_lcao/module_gint/gint_info.h @@ -15,7 +15,7 @@ #ifdef __CUDA #include "batch_biggrid.h" -#include "source_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.h" +#include "source_lcao/module_gint/kernel/gint_gpu_vars.h" #endif namespace ModuleGint diff --git a/source/source_lcao/module_gint/temp_gint/gint_interface.cpp b/source/source_lcao/module_gint/gint_interface.cpp similarity index 100% rename from source/source_lcao/module_gint/temp_gint/gint_interface.cpp rename to source/source_lcao/module_gint/gint_interface.cpp diff --git a/source/source_lcao/module_gint/temp_gint/gint_interface.h b/source/source_lcao/module_gint/gint_interface.h similarity index 100% rename from source/source_lcao/module_gint/temp_gint/gint_interface.h rename to source/source_lcao/module_gint/gint_interface.h diff --git a/source/source_lcao/module_gint/gint_k.h b/source/source_lcao/module_gint/gint_k.h deleted file mode 100644 index ec2de50730..0000000000 --- a/source/source_lcao/module_gint/gint_k.h +++ /dev/null @@ -1,86 +0,0 @@ -#ifndef W_ABACUS_DEVELOP_ABACUS_DEVELOP_SOURCE_MODULE_HAMILT_LCAO_MODULE_GINT_GINT_K_H -#define W_ABACUS_DEVELOP_ABACUS_DEVELOP_SOURCE_MODULE_HAMILT_LCAO_MODULE_GINT_GINT_K_H - -#include "gint.h" -#include "grid_technique.h" -#include "source_basis/module_ao/ORB_atomic_lm.h" -#include "source_estate/module_charge/charge.h" -#include "source_lcao/LCAO_HS_arrays.hpp" - -// add by jingan for map<> in 2021-12-2, will be deleted in the future -#include "source_base/abfs-vector3_order.h" - -class Gint_k : public Gint { - public: - /// @brief move operator for the next ESolver to directly use its infomation - /// @param rhs - /// @return *this - Gint_k& operator=(Gint_k&& rhs); - - //------------------------------------------------------ - // in gint_k_pvpr.cpp - //------------------------------------------------------ - // pvpR and reset_spin/get_spin : auxilliary methods - // for calculating hamiltonian - - // allocate the matrix element. - void allocate_pvdpR(); - // destroy the temporary matrix element. - void destroy_pvdpR(); - - /** - * @brief transfer pvpR to this->hRGint - * then pass this->hRGint to Veff::hR - */ - void transfer_pvpR(hamilt::HContainer* hR, const UnitCell* ucell_in, const Grid_Driver* gd); - void transfer_pvpR(hamilt::HContainer>* hR, const UnitCell* ucell_in, const Grid_Driver* gd); - - //------------------------------------------------------ - // in gint_k_env.cpp - //------------------------------------------------------ - // calculate the envelop function via grid integrals - void cal_env_k(int ik, - const std::complex* psi_k, - double* rho, - const std::vector>& kvec_c, - const std::vector>& kvec_d, - const UnitCell& ucell); - - //------------------------------------------------------ - // in gint_k_sparse1.cpp - //------------------------------------------------------ - // similar to the above 3, just for the derivative - void distribute_pvdpR_sparseMatrix( - const int current_spin, - const int dim, - const double& sparse_threshold, - const std::map, - std::map>>& - pvdpR_sparseMatrix, - LCAO_HS_Arrays& HS_Arrays, - const Parallel_Orbitals* pv); - - void distribute_pvdpR_soc_sparseMatrix( - const int dim, - const double& sparse_threshold, - const std::map< - Abfs::Vector3_Order, - std::map>>>& - pvdpR_soc_sparseMatrix, - LCAO_HS_Arrays& HS_Arrays, - const Parallel_Orbitals* pv); - - void cal_dvlocal_R_sparseMatrix(const int& current_spin, - const double& sparse_threshold, - LCAO_HS_Arrays& HS_Arrays, - const Parallel_Orbitals* pv, - const UnitCell& ucell, - const Grid_Driver& gdriver); - - private: - //---------------------------- - // key variable - //---------------------------- -}; - -#endif diff --git a/source/source_lcao/module_gint/gint_k_env.cpp b/source/source_lcao/module_gint/gint_k_env.cpp deleted file mode 100644 index 67ce701461..0000000000 --- a/source/source_lcao/module_gint/gint_k_env.cpp +++ /dev/null @@ -1,138 +0,0 @@ -#include "gint_k.h" -#include "grid_technique.h" -#include "source_io/module_parameter/parameter.h" -#include "source_base/timer.h" -#include "source_base/ylm.h" -#include "source_basis/module_ao/ORB_read.h" -#include "source_pw/module_pwdft/global.h" -#include "source_base/array_pool.h" -#include "source_base/vector3.h" - -void Gint_k::cal_env_k(int ik, - const std::complex* psi_k, - double* rho, - const std::vector>& kvec_c, - const std::vector>& kvec_d, - const UnitCell& ucell) -{ - ModuleBase::TITLE("Gint_k", "cal_env_k"); - ModuleBase::timer::tick("Gint_k", "cal_env_k"); - - // it's a uniform grid to save orbital values, so the delta_r is a constant. - const double delta_r = this->gridt->dr_uniform; - const int max_size = this->gridt->max_atom; - if (max_size <= 0){ - ModuleBase::WARNING_QUIT("Gint_Gamma::cal_env", - "the max_size is less than 0!"); - } - const int nbx = this->gridt->nbx; - const int nby = this->gridt->nby; - const int nbz = this->gridt->nbzp; - const int ncyz = this->ny * this->nplane; // mohan add 2012-03-25 - - #pragma omp parallel - { - std::vector vindex(this->bxyz, 0); - std::vector block_iw(max_size, 0); - std::vector block_index(max_size + 1, 0); - std::vector block_size(max_size, 0); - #pragma omp for - for (int grid_index = 0; grid_index < this->nbxx; grid_index++) - { - - // get the value: how many atoms has orbital value on this grid. - const int size = this->gridt->how_many_atoms[grid_index]; - if (size == 0) - { - continue; - } - ModuleBase::Array_Pool cal_flag(this->bxyz, max_size); - Gint_Tools::get_block_info(*this->gridt, - this->bxyz, - size, - grid_index, - block_iw.data(), - block_index.data(), - block_size.data(), - cal_flag.get_ptr_2D()); - const int LD_pool = block_index[size]; - - // evaluate psi on grids - ModuleBase::Array_Pool psir_ylm(this->bxyz, LD_pool); - Gint_Tools::cal_psir_ylm(*this->gridt, - this->bxyz, - size, - grid_index, - delta_r, - block_index.data(), - block_size.data(), - cal_flag.get_ptr_2D(), - psir_ylm.get_ptr_2D()); - - Gint_Tools::get_vindex(this->bxyz, - this->bx, - this->by, - this->bz, - this->nplane, - this->gridt->start_ind[grid_index], - ncyz, - vindex.data()); - - for (int ia1 = 0; ia1 < size; ia1++) - { - const int mcell_index1 = this->gridt->bcell_start[grid_index] + ia1; - const int iat = this->gridt->which_atom[mcell_index1]; - const int T1 = ucell.iat2it[iat]; - Atom* atom1 = &ucell.atoms[T1]; - const int I1 = ucell.iat2ia[iat]; - - // find R by which_unitcell and cal kphase - const int id_ucell = this->gridt->which_unitcell[mcell_index1]; - ModuleBase::Vector3 R(this->gridt->get_ucell_coords(id_ucell)); - // std::cout << "kvec_d: " << kvec_d[ik].x << " " << kvec_d[ik].y << " " << kvec_d[ik].z << std::endl; - // std::cout << "kvec_c: " << kvec_c[ik].x << " " << kvec_c[ik].y << " " << kvec_c[ik].z << std::endl; - // std::cout << "R: " << R.x << " " << R.y << " " << R.z << std::endl; - const double arg = (kvec_d[ik] * R) * ModuleBase::TWO_PI; - const double arg1 - = (kvec_c[ik] * (R.x * ucell.a1 + R.y * ucell.a2 + R.z * ucell.a3)) * ModuleBase::TWO_PI; - // std::cout << "arg0=" << arg << ", arg1=" << arg1 << std::endl; - const std::complex kphase = std::complex(cos(arg), sin(arg)); - - // get the start index of local orbitals. - const int start1 = ucell.itiaiw2iwt(T1, I1, 0); - for (int ib = 0; ib < this->bxyz; ib++) - { - if (cal_flag[ib][ia1]) - { - int iw1_lo = 0; - double* psi1 = &psir_ylm[ib][block_index[ia1]]; - std::complex tmp{0.0, 0.0}; - if (PARAM.inp.nspin == 4) // is it a simple add of 2 spins? - { - for (int is = 0; is < 2; ++is) - { - iw1_lo = this->gridt->trace_lo[start1] / PARAM.globalv.npol - + this->gridt->lgd / PARAM.globalv.npol * is; - for (int iw = 0; iw < atom1->nw; ++iw, ++iw1_lo) - { - tmp += std::complex(psi1[iw], 0.0) * psi_k[iw1_lo] * kphase; - } - } - } - else - { - iw1_lo = this->gridt->trace_lo[start1]; - for (int iw = 0; iw < atom1->nw; ++iw, ++iw1_lo) - { - tmp += std::complex(psi1[iw], 0.0) * psi_k[iw1_lo] * kphase; - } - } - rho[vindex[ib]] += tmp.real(); - } // cal_flag - } // ib - } // ia1 - } // i - } - ModuleBase::timer::tick("Gint_k", "cal_env_k"); - return; -} diff --git a/source/source_lcao/module_gint/gint_k_pvdpr.cpp b/source/source_lcao/module_gint/gint_k_pvdpr.cpp deleted file mode 100644 index b03f012a66..0000000000 --- a/source/source_lcao/module_gint/gint_k_pvdpr.cpp +++ /dev/null @@ -1,53 +0,0 @@ -#include "gint_k.h" -#include "grid_technique.h" -#include "source_io/module_parameter/parameter.h" -#include "source_base/global_function.h" -#include "source_base/global_variable.h" -#include "source_base/memory.h" -#include "source_base/parallel_reduce.h" -#include "source_base/timer.h" -#include "source_base/tool_threading.h" -#include "source_base/ylm.h" -#include "source_basis/module_ao/ORB_read.h" -#include "source_pw/module_pwdft/global.h" - -void Gint_k::allocate_pvdpR(void) -{ - ModuleBase::TITLE("Gint_k","allocate_pvpR"); - - const int nspin = PARAM.inp.nspin; - assert(nspin>0); - - //xiaohui modify 2015-05-30 - // the number of matrix element is this->gridt->nnrg. - for(int is =0;ispvdpRx_reduced.push_back(hamilt::HContainer(this->ucell->nat)); - pvdpRx_reduced[is].insert_ijrs(this->gridt->get_ijr_info(), *this->ucell); - pvdpRx_reduced[is].allocate(nullptr, true); - this->pvdpRy_reduced.push_back(hamilt::HContainer(this->ucell->nat)); - pvdpRy_reduced[is].insert_ijrs(this->gridt->get_ijr_info(), *this->ucell); - pvdpRy_reduced[is].allocate(nullptr, true); - this->pvdpRz_reduced.push_back(hamilt::HContainer(this->ucell->nat)); - pvdpRz_reduced[is].insert_ijrs(this->gridt->get_ijr_info(), *this->ucell); - pvdpRz_reduced[is].allocate(nullptr, true); - } - - ModuleBase::Memory::record("pvdpR_reduced", 3 * sizeof(double) * this->gridt->nnrg * nspin); - return; -} - -void Gint_k::destroy_pvdpR(void) -{ - ModuleBase::TITLE("Gint_k","destroy_pvpR"); - - const int nspin = PARAM.inp.nspin; - assert(nspin>0); - pvdpRx_reduced.clear(); - pvdpRy_reduced.clear(); - pvdpRz_reduced.clear(); - pvdpRx_reduced.shrink_to_fit(); - pvdpRy_reduced.shrink_to_fit(); - pvdpRz_reduced.shrink_to_fit(); - return; -} diff --git a/source/source_lcao/module_gint/gint_k_pvpr.cpp b/source/source_lcao/module_gint/gint_k_pvpr.cpp deleted file mode 100644 index 8f98e1dcaf..0000000000 --- a/source/source_lcao/module_gint/gint_k_pvpr.cpp +++ /dev/null @@ -1,163 +0,0 @@ -#include "gint_k.h" -#include "grid_technique.h" -#include "source_io/module_parameter/parameter.h" -#include "source_base/global_function.h" -#include "source_base/global_variable.h" -#include "source_base/libm/libm.h" -#include "source_base/memory.h" -#include "source_base/parallel_reduce.h" -#include "source_base/timer.h" -#include "source_base/tool_threading.h" -#include "source_base/ylm.h" -#include "source_basis/module_ao/ORB_read.h" -#include "source_cell/module_neighbor/sltk_grid_driver.h" -#include "source_pw/module_pwdft/global.h" -#include "source_lcao/module_hcontainer/hcontainer_funcs.h" -#ifdef __MPI -#include -#endif - -// transfer_pvpR, NSPIN = 1 or 2 -void Gint_k::transfer_pvpR(hamilt::HContainer* hR, const UnitCell* ucell, const Grid_Driver* gd) -{ - ModuleBase::TITLE("Gint_k", "transfer_pvpR"); - ModuleBase::timer::tick("Gint_k", "transfer_pvpR"); - - for (int iap = 0; iap < this->hRGint->size_atom_pairs(); iap++) - { - auto& ap = this->hRGint->get_atom_pair(iap); - const int iat1 = ap.get_atom_i(); - const int iat2 = ap.get_atom_j(); - if (iat1 > iat2) - { - // fill lower triangle matrix with upper triangle matrix - // the upper is - const hamilt::AtomPair* upper_ap = this->hRGint->find_pair(iat2, iat1); - const hamilt::AtomPair* lower_ap = this->hRGint->find_pair(iat1, iat2); -#ifdef __DEBUG - assert(upper_ap != nullptr); -#endif - for (int ir = 0; ir < ap.get_R_size(); ir++) - { - auto R_index = ap.get_R_index(ir); - auto upper_mat = upper_ap->find_matrix(-R_index); - auto lower_mat = lower_ap->find_matrix(R_index); - for (int irow = 0; irow < upper_mat->get_row_size(); ++irow) - { - for (int icol = 0; icol < upper_mat->get_col_size(); ++icol) - { - lower_mat->get_value(icol, irow) = upper_ap->get_value(irow, icol); - } - } - } - } - } -#ifdef __MPI - int size = 0; - MPI_Comm_size(MPI_COMM_WORLD, &size); - if (size == 1) - { - hR->add(*this->hRGint); - } - else - { - hamilt::transferSerials2Parallels(*this->hRGint, hR); - } -#else - hR->add(*this->hRGint); -#endif - ModuleBase::timer::tick("Gint_k", "transfer_pvpR"); - return; -} - -// transfer_pvpR, NSPIN = 4 -void Gint_k::transfer_pvpR(hamilt::HContainer>* hR, - const UnitCell* ucell_in, - const Grid_Driver* gd) -{ - ModuleBase::TITLE("Gint_k", "transfer_pvpR"); - ModuleBase::timer::tick("Gint_k", "transfer_pvpR"); - - this->hRGintCd->set_zero(); - - for (int iap = 0; iap < this->hRGintCd->size_atom_pairs(); iap++) - { - auto* ap = &this->hRGintCd->get_atom_pair(iap); - const int iat1 = ap->get_atom_i(); - const int iat2 = ap->get_atom_j(); - if (iat1 <= iat2) - { - hamilt::AtomPair>* upper_ap = ap; - hamilt::AtomPair>* lower_ap = this->hRGintCd->find_pair(iat2, iat1); - const hamilt::AtomPair* ap_nspin_0 = this->hr_gint_tmp[0]->find_pair(iat1, iat2); - const hamilt::AtomPair* ap_nspin_3 = this->hr_gint_tmp[3]->find_pair(iat1, iat2); - for (int ir = 0; ir < upper_ap->get_R_size(); ir++) - { - const auto R_index = upper_ap->get_R_index(ir); - auto upper_mat = upper_ap->find_matrix(R_index); - auto mat_nspin_0 = ap_nspin_0->find_matrix(R_index); - auto mat_nspin_3 = ap_nspin_3->find_matrix(R_index); - - // The row size and the col size of upper_matrix is double that of matrix_nspin_0 - for (int irow = 0; irow < mat_nspin_0->get_row_size(); ++irow) - { - for (int icol = 0; icol < mat_nspin_0->get_col_size(); ++icol) - { - upper_mat->get_value(2*irow, 2*icol) = mat_nspin_0->get_value(irow, icol) + mat_nspin_3->get_value(irow, icol); - upper_mat->get_value(2*irow+1, 2*icol+1) = mat_nspin_0->get_value(irow, icol) - mat_nspin_3->get_value(irow, icol); - } - } - - if (PARAM.globalv.domag) - { - const hamilt::AtomPair* ap_nspin_1 = this->hr_gint_tmp[1]->find_pair(iat1, iat2); - const hamilt::AtomPair* ap_nspin_2 = this->hr_gint_tmp[2]->find_pair(iat1, iat2); - const auto mat_nspin_1 = ap_nspin_1->find_matrix(R_index); - const auto mat_nspin_2 = ap_nspin_2->find_matrix(R_index); - for (int irow = 0; irow < mat_nspin_1->get_row_size(); ++irow) - { - for (int icol = 0; icol < mat_nspin_1->get_col_size(); ++icol) - { - upper_mat->get_value(2*irow, 2*icol+1) = mat_nspin_1->get_value(irow, icol) + std::complex(0.0, 1.0) * mat_nspin_2->get_value(irow, icol); - upper_mat->get_value(2*irow+1, 2*icol) = mat_nspin_1->get_value(irow, icol) - std::complex(0.0, 1.0) * mat_nspin_2->get_value(irow, icol); - } - } - } - - // fill the lower triangle matrix - if (iat1 < iat2) - { - auto lower_mat = lower_ap->find_matrix(-R_index); - for (int irow = 0; irow < upper_mat->get_row_size(); ++irow) - { - for (int icol = 0; icol < upper_mat->get_col_size(); ++icol) - { - lower_mat->get_value(icol, irow) = conj(upper_mat->get_value(irow, icol)); - } - } - } - } - } - } - - // =================================== - // transfer HR from Gint to Veff, std::complex>> - // =================================== -#ifdef __MPI - int size; - MPI_Comm_size(MPI_COMM_WORLD, &size); - if (size == 1) - { - hR->add(*this->hRGintCd); - } - else - { - hamilt::transferSerials2Parallels>(*this->hRGintCd, hR); - } -#else - hR->add(*this->hRGintCd); -#endif - - ModuleBase::timer::tick("Gint_k", "transfer_pvpR"); - return; -} diff --git a/source/source_lcao/module_gint/gint_k_sparse1.cpp b/source/source_lcao/module_gint/gint_k_sparse1.cpp deleted file mode 100644 index ab0d8b60ef..0000000000 --- a/source/source_lcao/module_gint/gint_k_sparse1.cpp +++ /dev/null @@ -1,554 +0,0 @@ -#include "gint_k.h" -#include "grid_technique.h" -#include "source_io/module_parameter/parameter.h" -#include "source_base/global_function.h" -#include "source_base/global_variable.h" -#include "source_base/memory.h" -#include "source_base/parallel_reduce.h" -#include "source_base/timer.h" -#include "source_base/ylm.h" -#include "source_basis/module_ao/ORB_read.h" -#include "source_cell/module_neighbor/sltk_grid_driver.h" -#include "source_pw/module_pwdft/global.h" - -void Gint_k::distribute_pvdpR_sparseMatrix( - const int current_spin, - const int dim, - const double& sparse_threshold, - const std::map, std::map>>& pvdpR_sparseMatrix, - LCAO_HS_Arrays& HS_Arrays, - const Parallel_Orbitals* pv) -{ - ModuleBase::TITLE("Gint_k", "distribute_pvdpR_sparseMatrix"); - - int total_R_num = HS_Arrays.all_R_coor.size(); - int* nonzero_num = new int[total_R_num]; - int* minus_nonzero_num = new int[total_R_num]; - ModuleBase::GlobalFunc::ZEROS(nonzero_num, total_R_num); - ModuleBase::GlobalFunc::ZEROS(minus_nonzero_num, total_R_num); - int count = 0; - for (auto& R_coor: HS_Arrays.all_R_coor) - { - auto iter = pvdpR_sparseMatrix.find(R_coor); - if (iter != pvdpR_sparseMatrix.end()) - { - for (auto& row_loop: iter->second) - { - nonzero_num[count] += row_loop.second.size(); - } - } - - auto minus_R_coor = -1 * R_coor; - - iter = pvdpR_sparseMatrix.find(minus_R_coor); - if (iter != pvdpR_sparseMatrix.end()) - { - for (auto& row_loop: iter->second) - { - minus_nonzero_num[count] += row_loop.second.size(); - } - } - - count++; - } - - Parallel_Reduce::reduce_all(nonzero_num, total_R_num); - Parallel_Reduce::reduce_all(minus_nonzero_num, total_R_num); - // Parallel_Reduce::reduce_pool(nonzero_num, total_R_num); - // Parallel_Reduce::reduce_pool(minus_nonzero_num, total_R_num); - - double* tmp = nullptr; - tmp = new double[PARAM.globalv.nlocal]; - - count = 0; - for (auto& R_coor: HS_Arrays.all_R_coor) - { - if (nonzero_num[count] != 0 || minus_nonzero_num[count] != 0) - { - auto minus_R_coor = -1 * R_coor; - - for (int row = 0; row < PARAM.globalv.nlocal; ++row) - { - ModuleBase::GlobalFunc::ZEROS(tmp, PARAM.globalv.nlocal); - - auto iter = pvdpR_sparseMatrix.find(R_coor); - if (iter != pvdpR_sparseMatrix.end()) - { - - if (this->gridt->trace_lo[row] >= 0) - { - auto row_iter = iter->second.find(row); - if (row_iter != iter->second.end()) - { - for (auto& value: row_iter->second) - { - tmp[value.first] = value.second; - } - } - } - } - - auto minus_R_iter = pvdpR_sparseMatrix.find(minus_R_coor); - if (minus_R_iter != pvdpR_sparseMatrix.end()) - { - for (int col = 0; col < row; ++col) - { - if (this->gridt->trace_lo[col] >= 0) - { - auto row_iter = minus_R_iter->second.find(col); - if (row_iter != minus_R_iter->second.end()) - { - auto col_iter = row_iter->second.find(row); - if (col_iter != row_iter->second.end()) - { - tmp[col] = col_iter->second; - } - } - } - } - } - - Parallel_Reduce::reduce_pool(tmp, PARAM.globalv.nlocal); - - if (pv->global2local_row(row) >= 0) - { - for (int col = 0; col < PARAM.globalv.nlocal; ++col) - { - if (pv->global2local_col(col) >= 0) - { - if (std::abs(tmp[col]) > sparse_threshold) - { - if (dim == 0) - { - double& value = HS_Arrays.dHRx_sparse[current_spin][R_coor][row][col]; - value += tmp[col]; - if (std::abs(value) <= sparse_threshold) - { - HS_Arrays.dHRx_sparse[current_spin][R_coor][row].erase(col); - } - } - if (dim == 1) - { - double& value = HS_Arrays.dHRy_sparse[current_spin][R_coor][row][col]; - value += tmp[col]; - if (std::abs(value) <= sparse_threshold) - { - HS_Arrays.dHRy_sparse[current_spin][R_coor][row].erase(col); - } - } - if (dim == 2) - { - double& value = HS_Arrays.dHRz_sparse[current_spin][R_coor][row][col]; - value += tmp[col]; - if (std::abs(value) <= sparse_threshold) - { - HS_Arrays.dHRz_sparse[current_spin][R_coor][row].erase(col); - } - } - } - } - } - } - } - } - - count++; - } - - delete[] nonzero_num; - delete[] minus_nonzero_num; - delete[] tmp; - nonzero_num = nullptr; - minus_nonzero_num = nullptr; - tmp = nullptr; - - return; -} - -void Gint_k::distribute_pvdpR_soc_sparseMatrix( - const int dim, - const double& sparse_threshold, - const std::map, std::map>>>& - pvdpR_soc_sparseMatrix, - LCAO_HS_Arrays& HS_Arrays, - const Parallel_Orbitals* pv) -{ - ModuleBase::TITLE("Gint_k", "distribute_pvdpR_soc_sparseMatrix"); - - int total_R_num = HS_Arrays.all_R_coor.size(); - int* nonzero_num = new int[total_R_num]; - int* minus_nonzero_num = new int[total_R_num]; - ModuleBase::GlobalFunc::ZEROS(nonzero_num, total_R_num); - ModuleBase::GlobalFunc::ZEROS(minus_nonzero_num, total_R_num); - int count = 0; - for (auto& R_coor: HS_Arrays.all_R_coor) - { - auto iter = pvdpR_soc_sparseMatrix.find(R_coor); - if (iter != pvdpR_soc_sparseMatrix.end()) - { - for (auto& row_loop: iter->second) - { - nonzero_num[count] += row_loop.second.size(); - } - } - - auto minus_R_coor = -1 * R_coor; - - iter = pvdpR_soc_sparseMatrix.find(minus_R_coor); - if (iter != pvdpR_soc_sparseMatrix.end()) - { - for (auto& row_loop: iter->second) - { - minus_nonzero_num[count] += row_loop.second.size(); - } - } - - count++; - } - - Parallel_Reduce::reduce_all(nonzero_num, total_R_num); - Parallel_Reduce::reduce_all(minus_nonzero_num, total_R_num); - // Parallel_Reduce::reduce_pool(nonzero_num, total_R_num); - // Parallel_Reduce::reduce_pool(minus_nonzero_num, total_R_num); - - std::complex* tmp_soc = nullptr; - tmp_soc = new std::complex[PARAM.globalv.nlocal]; - - count = 0; - for (auto& R_coor: HS_Arrays.all_R_coor) - { - if (nonzero_num[count] != 0 || minus_nonzero_num[count] != 0) - { - auto minus_R_coor = -1 * R_coor; - - for (int row = 0; row < PARAM.globalv.nlocal; ++row) - { - ModuleBase::GlobalFunc::ZEROS(tmp_soc, PARAM.globalv.nlocal); - - auto iter = pvdpR_soc_sparseMatrix.find(R_coor); - if (iter != pvdpR_soc_sparseMatrix.end()) - { - if (this->gridt->trace_lo[row] >= 0) - { - auto row_iter = iter->second.find(row); - if (row_iter != iter->second.end()) - { - for (auto& value: row_iter->second) - { - tmp_soc[value.first] = value.second; - } - } - } - } - - auto minus_R_iter = pvdpR_soc_sparseMatrix.find(minus_R_coor); - if (minus_R_iter != pvdpR_soc_sparseMatrix.end()) - { - for (int col = 0; col < row; ++col) - { - if (this->gridt->trace_lo[col] >= 0) - { - auto row_iter = minus_R_iter->second.find(col); - if (row_iter != minus_R_iter->second.end()) - { - auto col_iter = row_iter->second.find(row); - if (col_iter != row_iter->second.end()) - { - tmp_soc[col] = conj(col_iter->second); - } - } - } - } - } - - Parallel_Reduce::reduce_pool(tmp_soc, PARAM.globalv.nlocal); - - if (pv->global2local_row(row) >= 0) - { - for (int col = 0; col < PARAM.globalv.nlocal; ++col) - { - if (pv->global2local_col(col) >= 0) - { - if (std::abs(tmp_soc[col]) > sparse_threshold) - { - if (dim == 0) - { - std::complex& value = HS_Arrays.dHRx_soc_sparse[R_coor][row][col]; - value += tmp_soc[col]; - if (std::abs(value) <= sparse_threshold) - { - HS_Arrays.dHRx_soc_sparse[R_coor][row].erase(col); - } - } - if (dim == 1) - { - std::complex& value = HS_Arrays.dHRy_soc_sparse[R_coor][row][col]; - value += tmp_soc[col]; - if (std::abs(value) <= sparse_threshold) - { - HS_Arrays.dHRy_soc_sparse[R_coor][row].erase(col); - } - } - if (dim == 2) - { - std::complex& value = HS_Arrays.dHRz_soc_sparse[R_coor][row][col]; - value += tmp_soc[col]; - if (std::abs(value) <= sparse_threshold) - { - HS_Arrays.dHRz_soc_sparse[R_coor][row].erase(col); - } - } - } - } - } - } - } - } - - count++; - } - - delete[] nonzero_num; - delete[] minus_nonzero_num; - delete[] tmp_soc; - nonzero_num = nullptr; - minus_nonzero_num = nullptr; - tmp_soc = nullptr; - - return; -} - -void Gint_k::cal_dvlocal_R_sparseMatrix(const int& current_spin, - const double& sparse_threshold, - LCAO_HS_Arrays& HS_Arrays, - const Parallel_Orbitals* pv, - const UnitCell& ucell, - const Grid_Driver& gdriver) -{ - ModuleBase::TITLE("Gint_k", "cal_dvlocal_R_sparseMatrix"); - - std::map, std::map>> pvdpRx_sparseMatrix; - std::map, std::map>> pvdpRy_sparseMatrix; - std::map, std::map>> pvdpRz_sparseMatrix; - std::map, std::map>>> - pvdpRx_soc_sparseMatrix; - std::map, std::map>>> - pvdpRy_soc_sparseMatrix; - std::map, std::map>>> - pvdpRz_soc_sparseMatrix; - - double temp_value_double; - std::complex temp_value_complex; - - ModuleBase::Vector3 tau1, dtau; - for (int iap = 0; iap < pvdpRx_reduced[0].size_atom_pairs(); iap++) - { - const auto& ap = pvdpRx_reduced[0].get_atom_pair(iap); - const int iat1 = ap.get_atom_i(); - const int iat2 = ap.get_atom_j(); - const int it1 = ucell.iat2it[iat1]; - const int it2 = ucell.iat2it[iat2]; - const Atom* atom1 = &ucell.atoms[it1]; - const Atom* atom2 = &ucell.atoms[it2]; - const int start1 = ucell.itiaiw2iwt(it1, ucell.iat2ia[iat1], 0); - const int start2 = ucell.itiaiw2iwt(it2, ucell.iat2ia[iat2], 0); - - for (int ir = 0; ir < ap.get_R_size(); ir++) - { - const ModuleBase::Vector3 R = ap.get_R_index(ir); - Abfs::Vector3_Order dR(R.x, R.y, R.z); - std::vector pvdpRx; - std::vector pvdpRy; - std::vector pvdpRz; - for(int i = 0; i < PARAM.inp.nspin; i++) - { - pvdpRx.push_back(pvdpRx_reduced[i].get_atom_pair(iap).get_pointer(ir)); - pvdpRy.push_back(pvdpRy_reduced[i].get_atom_pair(iap).get_pointer(ir)); - pvdpRz.push_back(pvdpRz_reduced[i].get_atom_pair(iap).get_pointer(ir)); - } - - for (int iw = 0; iw < atom1->nw * PARAM.globalv.npol; iw++) - { - for (int iw2 = 0; iw2 < atom2->nw * PARAM.globalv.npol; iw2++) - { - const int nw = atom2->nw; - const int mug0 = iw / PARAM.globalv.npol; - const int nug0 = iw2 / PARAM.globalv.npol; - const int iw_nowg = mug0 * nw + nug0; - - if (PARAM.inp.nspin == 4) - { - // pvp is symmetric, only half is calculated. - - if (iw % 2 == 0 && iw2 % 2 == 0) - { - // spin = 0; - temp_value_complex - = std::complex(1.0, 0.0) * pvdpRx[0][iw_nowg] - + std::complex(1.0, 0.0) * pvdpRx[3][iw_nowg]; - - if (std::abs(temp_value_complex) > sparse_threshold) - { - pvdpRx_soc_sparseMatrix[dR][start1 + iw][start2 + iw2] - = temp_value_complex; - } - - temp_value_complex - = std::complex(1.0, 0.0) * pvdpRy[0][iw_nowg] - + std::complex(1.0, 0.0) * pvdpRy[3][iw_nowg]; - - if (std::abs(temp_value_complex) > sparse_threshold) - { - pvdpRy_soc_sparseMatrix[dR][start1 + iw][start2 + iw2] - = temp_value_complex; - } - temp_value_complex - = std::complex(1.0, 0.0) * pvdpRz[0][iw_nowg] - + std::complex(1.0, 0.0) * pvdpRz[3][iw_nowg]; - - if (std::abs(temp_value_complex) > sparse_threshold) - { - pvdpRz_soc_sparseMatrix[dR][start1 + iw][start2 + iw2] - = temp_value_complex; - } - } - else if (iw % 2 == 1 && iw2 % 2 == 1) - { - // spin = 3; - temp_value_complex - = std::complex(1.0, 0.0) * pvdpRx[0][iw_nowg] - - std::complex(1.0, 0.0) * pvdpRx[3][iw_nowg]; - if (std::abs(temp_value_complex) > sparse_threshold) - { - pvdpRx_soc_sparseMatrix[dR][start1 + iw][start2 + iw2] - = temp_value_complex; - } - temp_value_complex - = std::complex(1.0, 0.0) * pvdpRy[0][iw_nowg] - - std::complex(1.0, 0.0) * pvdpRy[3][iw_nowg]; - if (std::abs(temp_value_complex) > sparse_threshold) - { - pvdpRy_soc_sparseMatrix[dR][start1 + iw][start2 + iw2] - = temp_value_complex; - } - temp_value_complex - = std::complex(1.0, 0.0) * pvdpRz[0][iw_nowg] - - std::complex(1.0, 0.0) * pvdpRz[3][iw_nowg]; - if (std::abs(temp_value_complex) > sparse_threshold) - { - pvdpRz_soc_sparseMatrix[dR][start1 + iw][start2 + iw2] - = temp_value_complex; - } - } - else if (iw % 2 == 0 && iw2 % 2 == 1) - { - // spin = 1; - if (!PARAM.globalv.domag) - { - // do nothing - } - else - { - temp_value_complex - = pvdpRx[1][iw_nowg] - - std::complex(0.0, 1.0) * pvdpRx[2][iw_nowg]; - if (std::abs(temp_value_complex) > sparse_threshold) - { - pvdpRx_soc_sparseMatrix[dR][start1 + iw][start2 + iw2] - = temp_value_complex; - } - temp_value_complex - = pvdpRy[1][iw_nowg] - - std::complex(0.0, 1.0) * pvdpRy[2][iw_nowg]; - if (std::abs(temp_value_complex) > sparse_threshold) - { - pvdpRy_soc_sparseMatrix[dR][start1 + iw][start2 + iw2] - = temp_value_complex; - } - temp_value_complex - = pvdpRz[1][iw_nowg] - - std::complex(0.0, 1.0) * pvdpRz[2][iw_nowg]; - if (std::abs(temp_value_complex) > sparse_threshold) - { - pvdpRz_soc_sparseMatrix[dR][start1 + iw][start2 + iw2] - = temp_value_complex; - } - } - } - else if (iw % 2 == 1 && iw2 % 2 == 0) - { - // spin = 2; - if (!PARAM.globalv.domag) - { - // do nothing - } - else - { - temp_value_complex - = pvdpRx[1][iw_nowg] - + std::complex(0.0, 1.0) * pvdpRx[2][iw_nowg]; - if (std::abs(temp_value_complex) > sparse_threshold) - { - pvdpRx_soc_sparseMatrix[dR][start1 + iw][start2 + iw2] - = temp_value_complex; - } - temp_value_complex - = pvdpRy[1][iw_nowg] - + std::complex(0.0, 1.0) * pvdpRy[2][iw_nowg]; - if (std::abs(temp_value_complex) > sparse_threshold) - { - pvdpRy_soc_sparseMatrix[dR][start1 + iw][start2 + iw2] - = temp_value_complex; - } - temp_value_complex - = pvdpRz[1][iw_nowg] - + std::complex(0.0, 1.0) * pvdpRz[2][iw_nowg]; - if (std::abs(temp_value_complex) > sparse_threshold) - { - pvdpRz_soc_sparseMatrix[dR][start1 + iw][start2 + iw2] - = temp_value_complex; - } - } - } - else - { - ModuleBase::WARNING_QUIT("Gint_k::folding_vl_k_nc", "index is wrong!"); - } - } // endif NC - else - { - temp_value_double = pvdpRx[current_spin][iw_nowg]; - if (std::abs(temp_value_double) > sparse_threshold) - { - pvdpRx_sparseMatrix[dR][start1 + iw][start2 + iw2] = temp_value_double; - } - temp_value_double = pvdpRy[current_spin][iw_nowg]; - if (std::abs(temp_value_double) > sparse_threshold) - { - pvdpRy_sparseMatrix[dR][start1 + iw][start2 + iw2] = temp_value_double; - } - temp_value_double = pvdpRz[current_spin][iw_nowg]; - if (std::abs(temp_value_double) > sparse_threshold) - { - pvdpRz_sparseMatrix[dR][start1 + iw][start2 + iw2] = temp_value_double; - } - } // endif normal - } - } - } - } - if (PARAM.inp.nspin != 4) - { - distribute_pvdpR_sparseMatrix(current_spin, 0, sparse_threshold, pvdpRx_sparseMatrix, HS_Arrays, pv); - distribute_pvdpR_sparseMatrix(current_spin, 1, sparse_threshold, pvdpRy_sparseMatrix, HS_Arrays, pv); - distribute_pvdpR_sparseMatrix(current_spin, 2, sparse_threshold, pvdpRz_sparseMatrix, HS_Arrays, pv); - } - else - { - distribute_pvdpR_soc_sparseMatrix(0, sparse_threshold, pvdpRx_soc_sparseMatrix, HS_Arrays, pv); - distribute_pvdpR_soc_sparseMatrix(1, sparse_threshold, pvdpRy_soc_sparseMatrix, HS_Arrays, pv); - distribute_pvdpR_soc_sparseMatrix(2, sparse_threshold, pvdpRz_soc_sparseMatrix, HS_Arrays, pv); - } - - return; -} diff --git a/source/source_lcao/module_gint/gint_old.cpp b/source/source_lcao/module_gint/gint_old.cpp deleted file mode 100644 index 73b666581c..0000000000 --- a/source/source_lcao/module_gint/gint_old.cpp +++ /dev/null @@ -1,306 +0,0 @@ -#include "gint.h" - -#include "source_io/module_parameter/parameter.h" -#if ((defined __CUDA)) -#include "gint_force_gpu.h" -#include "gint_rho_gpu.h" -#include "gint_vl_gpu.h" -#endif - -#include "source_base/memory.h" -#include "source_base/timer.h" -#include "source_basis/module_ao/ORB_read.h" -#include "source_lcao/module_hcontainer/hcontainer_funcs.h" -#include "source_pw/module_pwdft/global.h" -#ifdef _OPENMP -#include -#endif - -#ifdef __MKL -#include -#endif - -Gint::~Gint() { - - delete this->hRGint; - delete this->hRGintCd; - // in gamma_only case, dmr_gint.size()=0, - // in multi-k case, dmr_gint.size()=nspin - for (int is = 0; is < this->dmr_gint.size(); is++) { - delete this->dmr_gint[is]; - } - for(int is = 0; is < this->hr_gint_tmp .size(); is++) { - delete this->hr_gint_tmp [is]; - } -#ifdef __MPI - delete this->dm2d_tmp; -#endif -} - -void Gint::cal_gint(Gint_inout* inout) { - ModuleBase::TITLE("Gint_interface", "cal_gint"); - ModuleBase::timer::tick("Gint_interface", "cal_gint"); - // In multi-process environments, - // some processes may not be allocated any data. - if (this->gridt->get_init_malloced() == false) { - ModuleBase::WARNING_QUIT("Gint_interface::cal_gint", - "gridt has not been allocated yet!"); - } - if (this->gridt->max_atom > 0) { -#ifdef __CUDA - if (PARAM.inp.device == "gpu" - && (inout->job == Gint_Tools::job_type::vlocal - || inout->job == Gint_Tools::job_type::rho - || inout->job == Gint_Tools::job_type::force)) { - if (inout->job == Gint_Tools::job_type::vlocal) { - gpu_vlocal_interface(inout); - } else if (inout->job == Gint_Tools::job_type::rho) { - gpu_rho_interface(inout); - } else if (inout->job == Gint_Tools::job_type::force) { - gpu_force_interface(inout); - } - } else -#endif - { -#ifdef __MKL - const int mkl_threads = mkl_get_max_threads(); - mkl_set_num_threads(mkl_threads); -#endif - { - if (inout->job == Gint_Tools::job_type::vlocal) { - gint_kernel_vlocal(inout); - } else if (inout->job == Gint_Tools::job_type::dvlocal) { - gint_kernel_dvlocal(inout); - } else if (inout->job == Gint_Tools::job_type::vlocal_meta) { - gint_kernel_vlocal_meta(inout); - } else if (inout->job == Gint_Tools::job_type::rho) { - gint_kernel_rho(inout); - } else if (inout->job == Gint_Tools::job_type::tau) { - gint_kernel_tau(inout); - } else if (inout->job == Gint_Tools::job_type::force) { - gint_kernel_force(inout); - } else if (inout->job == Gint_Tools::job_type::force_meta) { - gint_kernel_force_meta(inout); - } - } - } - } - ModuleBase::timer::tick("Gint_interface", "cal_gint"); - return; -} -void Gint::prep_grid(const Grid_Technique& gt, - const int& nbx_in, - const int& nby_in, - const int& nbz_in, - const int& nbz_start_in, - const int& ncxyz_in, - const int& bx_in, - const int& by_in, - const int& bz_in, - const int& bxyz_in, - const int& nbxx_in, - const int& ny_in, - const int& nplane_in, - const int& startz_current_in, - const UnitCell* ucell_in, - const LCAO_Orbitals* orb_in) { - ModuleBase::TITLE(GlobalV::ofs_running, "Gint_k", "prep_grid"); - - this->gridt = > - this->nbx = nbx_in; - this->nby = nby_in; - this->nbz = nbz_in; - this->ncxyz = ncxyz_in; - this->nbz_start = nbz_start_in; - this->bx = bx_in; - this->by = by_in; - this->bz = bz_in; - this->bxyz = bxyz_in; - this->nbxx = nbxx_in; - this->ny = ny_in; - this->nplane = nplane_in; - this->startz_current = startz_current_in; - this->ucell = ucell_in; - assert(nbx > 0); - assert(nby > 0); - assert(nbz >= 0); - assert(ncxyz > 0); - assert(bx > 0); - assert(by > 0); - assert(bz > 0); - assert(bxyz > 0); - assert(nbxx >= 0); - assert(ny > 0); - assert(nplane >= 0); - assert(startz_current >= 0); - assert(this->ucell->omega > 0.0); - - return; -} - -void Gint::initialize_pvpR(const UnitCell& ucell_in, const Grid_Driver* gd, const int& nspin) -{ - ModuleBase::TITLE("Gint", "initialize_pvpR"); - int npol = 1; - // there is the only resize code of dmr_gint - if (this->dmr_gint.size() == 0) { - this->dmr_gint.resize(nspin); - } - hr_gint_tmp.resize(nspin); - if (nspin != 4) { - if (this->hRGint != nullptr) { - delete this->hRGint; - } - this->hRGint = new hamilt::HContainer(ucell_in.nat); - } else { - npol = 2; - if (this->hRGintCd != nullptr) { - delete this->hRGintCd; - } - this->hRGintCd - = new hamilt::HContainer>(ucell_in.nat); - for (int is = 0; is < nspin; is++) { - if (this->dmr_gint[is] != nullptr) { - delete this->dmr_gint[is]; - } - if (this->hr_gint_tmp[is] != nullptr) { - delete this->hr_gint_tmp[is]; - } - this->dmr_gint[is] = new hamilt::HContainer(ucell_in.nat); - this->hr_gint_tmp[is] = new hamilt::HContainer(ucell_in.nat); - } -#ifdef __MPI - if (this->dm2d_tmp != nullptr) { - delete this->dm2d_tmp; - } -#endif - } - if (PARAM.globalv.gamma_only_local && nspin != 4) { - this->hRGint->fix_gamma(); - } - if (npol == 1) { - this->hRGint->insert_ijrs(this->gridt->get_ijr_info(), ucell_in); - this->hRGint->allocate(nullptr, true); - ModuleBase::Memory::record("Gint::hRGint", - this->hRGint->get_memory_size()); - // initialize dmr_gint with hRGint when NSPIN != 4 - for (int is = 0; is < this->dmr_gint.size(); is++) { - if (this->dmr_gint[is] != nullptr) { - delete this->dmr_gint[is]; - } - this->dmr_gint[is] = new hamilt::HContainer(*this->hRGint); - } - ModuleBase::Memory::record("Gint::dmr_gint", - this->dmr_gint[0]->get_memory_size() - * this->dmr_gint.size()); - } else { - this->hRGintCd->insert_ijrs(this->gridt->get_ijr_info(), ucell_in, npol); - this->hRGintCd->allocate(nullptr, true); - for(int is = 0; is < nspin; is++) { - this->hr_gint_tmp[is]->insert_ijrs(this->gridt->get_ijr_info(), ucell_in); - this->dmr_gint[is]->insert_ijrs(this->gridt->get_ijr_info(), ucell_in); - this->hr_gint_tmp[is]->allocate(nullptr, true); - this->dmr_gint[is]->allocate(nullptr, true); - } - ModuleBase::Memory::record("Gint::hr_gint_tmp", - this->hr_gint_tmp[0]->get_memory_size()*nspin); - ModuleBase::Memory::record("Gint::dmr_gint", - this->dmr_gint[0]->get_memory_size() - * this->dmr_gint.size()*nspin); - } -} - -void Gint::reset_DMRGint(const int& nspin) -{ - if (this->hRGint) - { - for (auto& d : this->dmr_gint) { delete d; } - this->dmr_gint.resize(nspin); - this->dmr_gint.shrink_to_fit(); - for (auto& d : this->dmr_gint) { d = new hamilt::HContainer(*this->hRGint); } - if (nspin == 4) - { - for (auto& d : this->dmr_gint) { d->allocate(nullptr, false); } -#ifdef __MPI - delete this->dm2d_tmp; -#endif - } - } -} - -void Gint::transfer_DM2DtoGrid(std::vector*> dm2d) { - ModuleBase::TITLE("Gint", "transfer_DMR"); - // To check whether input parameter dm2d has been initialized -#ifdef __DEBUG - assert(!dm2d.empty() - && "Input parameter dm2d has not been initialized while calling " - "function transfer_DM2DtoGrid!"); -#endif - ModuleBase::timer::tick("Gint", "transfer_DMR"); - if (PARAM.inp.nspin != 4) { - for (int is = 0; is < this->dmr_gint.size(); is++) { -#ifdef __MPI - hamilt::transferParallels2Serials(*dm2d[is], dmr_gint[is]); -#else - this->dmr_gint[is]->set_zero(); - this->dmr_gint[is]->add(*dm2d[is]); -#endif - } - } else // NSPIN=4 case - { - // is=0:↑↑, 1:↑↓, 2:↓↑, 3:↓↓ - const int row_set[4] = {0, 0, 1, 1}; - const int col_set[4] = {0, 1, 0, 1}; - int mg = dm2d[0]->get_paraV()->get_global_row_size()/2; - int ng = dm2d[0]->get_paraV()->get_global_col_size()/2; - int nb = dm2d[0]->get_paraV()->get_block_size()/2; - auto ijr_info = dm2d[0]->get_ijr_info(); -#ifdef __MPI - int blacs_ctxt = dm2d[0]->get_paraV()->blacs_ctxt; - std::vector iat2iwt(ucell->nat); - for (int iat = 0; iat < ucell->nat; iat++) { - iat2iwt[iat] = ucell->get_iat2iwt()[iat]/2; - } - Parallel_Orbitals pv{}; - pv.set(mg, ng, nb, blacs_ctxt); - pv.set_atomic_trace(iat2iwt.data(), ucell->nat, mg); - this-> dm2d_tmp = new hamilt::HContainer(&pv, nullptr, &ijr_info); -#else - if (this->dm2d_tmp != nullptr) { - delete this->dm2d_tmp; - } - this-> dm2d_tmp = new hamilt::HContainer(*this->hRGint); - this-> dm2d_tmp -> insert_ijrs(this->gridt->get_ijr_info(), *(this->ucell)); - this-> dm2d_tmp -> allocate(nullptr, true); -#endif - ModuleBase::Memory::record("Gint::dm2d_tmp", this->dm2d_tmp->get_memory_size()); - for (int is = 0; is < 4; is++){ - for (int iap = 0; iap < dm2d[0]->size_atom_pairs(); ++iap) { - auto& ap = dm2d[0]->get_atom_pair(iap); - int iat1 = ap.get_atom_i(); - int iat2 = ap.get_atom_j(); - for (int ir = 0; ir < ap.get_R_size(); ++ir) { - const ModuleBase::Vector3 r_index = ap.get_R_index(ir); - double* matrix_out = this -> dm2d_tmp -> find_matrix(iat1, iat2, r_index)->get_pointer(); - double* matrix_in = ap.get_pointer(ir); - for (int irow = 0; irow < ap.get_row_size()/2; irow ++) { - for (int icol = 0; icol < ap.get_col_size()/2; icol++){ - int index_i = irow* ap.get_col_size()/2 + icol; - int index_j = (irow*2+row_set[is]) * ap.get_col_size() + icol*2+col_set[is]; - matrix_out[index_i] = matrix_in[index_j]; - } - } - } - } -#ifdef __MPI - hamilt::transferParallels2Serials( *(this->dm2d_tmp), this->dmr_gint[is]); -#else - this->dmr_gint[is]->set_zero(); - this->dmr_gint[is]->add(*(this->dm2d_tmp)); -#endif - }//is=4 - delete this->dm2d_tmp; - this->dm2d_tmp = nullptr; - } - ModuleBase::timer::tick("Gint", "transfer_DMR"); -} \ No newline at end of file diff --git a/source/source_lcao/module_gint/temp_gint/gint_rho.cpp b/source/source_lcao/module_gint/gint_rho.cpp similarity index 100% rename from source/source_lcao/module_gint/temp_gint/gint_rho.cpp rename to source/source_lcao/module_gint/gint_rho.cpp diff --git a/source/source_lcao/module_gint/temp_gint/gint_rho.h b/source/source_lcao/module_gint/gint_rho.h similarity index 100% rename from source/source_lcao/module_gint/temp_gint/gint_rho.h rename to source/source_lcao/module_gint/gint_rho.h diff --git a/source/source_lcao/module_gint/gint_rho_cpu_interface.cpp b/source/source_lcao/module_gint/gint_rho_cpu_interface.cpp deleted file mode 100644 index 2f41152fc3..0000000000 --- a/source/source_lcao/module_gint/gint_rho_cpu_interface.cpp +++ /dev/null @@ -1,197 +0,0 @@ -#include "gint.h" -#include "source_base/memory.h" -#include "source_io/module_parameter/parameter.h" -#include "source_base/timer.h" - -void Gint::gint_kernel_rho(Gint_inout* inout) { - ModuleBase::TITLE("Gint_interface", "cal_gint_rho"); - ModuleBase::timer::tick("Gint_interface", "cal_gint_rho"); - const int max_size = this->gridt->max_atom; - const int ncyz = this->ny * this->nplane; - const double delta_r = this->gridt->dr_uniform; - -#pragma omp parallel -{ - std::vector block_iw(max_size, 0); - std::vector block_index(max_size+1, 0); - std::vector block_size(max_size, 0); - std::vector vindex(this->bxyz, 0); -#pragma omp for schedule(dynamic) - for (int grid_index = 0; grid_index < this->nbxx; grid_index++) - { - const int na_grid = this->gridt->how_many_atoms[grid_index]; - if (na_grid == 0) { - continue; - } - Gint_Tools::get_vindex(this->bxyz, - this->bx, - this->by, - this->bz, - this->nplane, - this->gridt->start_ind[grid_index], - ncyz, - vindex.data()); - // prepare block information - ModuleBase::Array_Pool cal_flag(this->bxyz,max_size); - Gint_Tools::get_block_info(*this->gridt, - this->bxyz, - na_grid, - grid_index, - block_iw.data(), - block_index.data(), - block_size.data(), - cal_flag.get_ptr_2D()); - - // evaluate psi on grids - const int LD_pool = block_index[na_grid]; - ModuleBase::Array_Pool psir_ylm(this->bxyz, LD_pool); - Gint_Tools::cal_psir_ylm(*this->gridt, - this->bxyz, - na_grid, - grid_index, - delta_r, - block_index.data(), - block_size.data(), - cal_flag.get_ptr_2D(), - psir_ylm.get_ptr_2D()); - - for (int is = 0; is < inout->nspin_rho; ++is) - { - // psir_ylm_new = psir_func(psir_ylm) - // psir_func==nullptr means psir_ylm_new=psir_ylm - const ModuleBase::Array_Pool &psir_ylm_1 = (!this->psir_func_1) ? psir_ylm : this->psir_func_1(psir_ylm, *this->gridt, grid_index, is, block_iw, block_size, block_index, cal_flag); - const ModuleBase::Array_Pool &psir_ylm_2 = (!this->psir_func_2) ? psir_ylm : this->psir_func_2(psir_ylm, *this->gridt, grid_index, is, block_iw, block_size, block_index, cal_flag); - - ModuleBase::Array_Pool psir_DM(this->bxyz, LD_pool); - ModuleBase::GlobalFunc::ZEROS(psir_DM.get_ptr_1D(), this->bxyz * LD_pool); - - // calculating g_mu(r) = sum_nu rho_mu,nu psi_nu(r) - Gint_Tools::mult_psi_DMR(*this->gridt, - this->bxyz, - LD_pool, - grid_index, - na_grid, - block_index.data(), - block_size.data(), - cal_flag.get_ptr_2D(), - psir_ylm_1.get_ptr_2D(), - psir_DM.get_ptr_2D(), - this->dmr_gint[is], - inout->if_symm); - - // do sum_mu g_mu(r)psi_mu(r) to get electron density on grid - this->cal_meshball_rho(na_grid, block_index.data(), vindex.data(), psir_ylm_2.get_ptr_2D(), psir_DM.get_ptr_2D(), inout->rho[is]); - } - } -} - ModuleBase::TITLE("Gint_interface", "cal_gint_rho"); - ModuleBase::timer::tick("Gint_interface", "cal_gint_rho"); -} - -void Gint::gint_kernel_tau(Gint_inout* inout) { - ModuleBase::TITLE("Gint_interface", "cal_gint_tau"); - ModuleBase::timer::tick("Gint_interface", "cal_gint_tau"); - const int max_size = this->gridt->max_atom; - const int ncyz = this->ny * this->nplane; - const double delta_r = this->gridt->dr_uniform; - - -#pragma omp parallel -{ - std::vector block_iw(max_size, 0); - std::vector block_index(max_size+1, 0); - std::vector block_size(max_size, 0); - std::vector vindex(bxyz, 0); -#pragma omp for schedule(dynamic) - for (int grid_index = 0; grid_index < this->nbxx; grid_index++) - { - const int na_grid = this->gridt->how_many_atoms[grid_index]; - if (na_grid == 0) { - continue; - } - Gint_Tools::get_vindex(this->bxyz, - this->bx, - this->by, - this->bz, - this->nplane, - this->gridt->start_ind[grid_index], - ncyz, - vindex.data()); - //prepare block information - ModuleBase::Array_Pool cal_flag(this->bxyz,max_size); - Gint_Tools::get_block_info(*this->gridt, this->bxyz, na_grid, grid_index, - block_iw.data(), block_index.data(), block_size.data(), cal_flag.get_ptr_2D()); - - //evaluate psi and dpsi on grids - const int LD_pool = block_index[na_grid]; - ModuleBase::Array_Pool psir_ylm(this->bxyz, LD_pool); - ModuleBase::Array_Pool dpsir_ylm_x(this->bxyz, LD_pool); - ModuleBase::Array_Pool dpsir_ylm_y(this->bxyz, LD_pool); - ModuleBase::Array_Pool dpsir_ylm_z(this->bxyz, LD_pool); - - Gint_Tools::cal_dpsir_ylm(*this->gridt, - this->bxyz, na_grid, grid_index, delta_r, - block_index.data(), block_size.data(), - cal_flag.get_ptr_2D(), - psir_ylm.get_ptr_2D(), - dpsir_ylm_x.get_ptr_2D(), - dpsir_ylm_y.get_ptr_2D(), - dpsir_ylm_z.get_ptr_2D()); - - for(int is=0; is dpsix_DM(this->bxyz, LD_pool); - ModuleBase::Array_Pool dpsiy_DM(this->bxyz, LD_pool); - ModuleBase::Array_Pool dpsiz_DM(this->bxyz, LD_pool); - ModuleBase::GlobalFunc::ZEROS(dpsix_DM.get_ptr_1D(), this->bxyz*LD_pool); - ModuleBase::GlobalFunc::ZEROS(dpsiy_DM.get_ptr_1D(), this->bxyz*LD_pool); - ModuleBase::GlobalFunc::ZEROS(dpsiz_DM.get_ptr_1D(), this->bxyz*LD_pool); - - //calculating g_i,mu(r) = sum_nu rho_mu,nu d/dx_i psi_nu(r), x_i=x,y,z - Gint_Tools::mult_psi_DMR( - *this->gridt, this->bxyz, - LD_pool, - grid_index, na_grid, - block_index.data(), block_size.data(), - cal_flag.get_ptr_2D(), - dpsir_ylm_x.get_ptr_2D(), - dpsix_DM.get_ptr_2D(), - this->dmr_gint[is], - true); - Gint_Tools::mult_psi_DMR( - *this->gridt, this->bxyz, - LD_pool, - grid_index, na_grid, - block_index.data(), block_size.data(), - cal_flag.get_ptr_2D(), - dpsir_ylm_y.get_ptr_2D(), - dpsiy_DM.get_ptr_2D(), - this->dmr_gint[is], - true); - Gint_Tools::mult_psi_DMR( - *this->gridt, this->bxyz, - LD_pool, - grid_index, na_grid, - block_index.data(), block_size.data(), - cal_flag.get_ptr_2D(), - dpsir_ylm_z.get_ptr_2D(), - dpsiz_DM.get_ptr_2D(), - this->dmr_gint[is], - true); - - //do sum_i,mu g_i,mu(r) * d/dx_i psi_mu(r) to get kinetic energy density on grid - if(inout->job==Gint_Tools::job_type::tau) - { - this->cal_meshball_tau( - na_grid, block_index.data(), - vindex.data(), - dpsir_ylm_x.get_ptr_2D(), dpsir_ylm_y.get_ptr_2D(), dpsir_ylm_z.get_ptr_2D(), - dpsix_DM.get_ptr_2D(), dpsiy_DM.get_ptr_2D(), dpsiz_DM.get_ptr_2D(), - inout->rho[is]); - } - } - } -} - ModuleBase::TITLE("Gint_interface", "cal_gint_tau"); - ModuleBase::timer::tick("Gint_interface", "cal_gint_tau"); -} diff --git a/source/source_lcao/module_gint/temp_gint/gint_rho_gpu.cpp b/source/source_lcao/module_gint/gint_rho_gpu.cpp similarity index 100% rename from source/source_lcao/module_gint/temp_gint/gint_rho_gpu.cpp rename to source/source_lcao/module_gint/gint_rho_gpu.cpp diff --git a/source/source_lcao/module_gint/gint_rho_gpu.cu b/source/source_lcao/module_gint/gint_rho_gpu.cu deleted file mode 100644 index c5591e662e..0000000000 --- a/source/source_lcao/module_gint/gint_rho_gpu.cu +++ /dev/null @@ -1,234 +0,0 @@ -#include "kernels/cuda/cuda_tools.cuh" -#include "source_base/ylm.h" -#include "gint_rho_gpu.h" -#include "gint_tools.h" -#include "kernels/cuda/gint_rho.cuh" - -#ifdef _OPENMP -#include -#endif - -namespace GintKernel -{ - -void gint_rho_gpu(const hamilt::HContainer* dm, - const double* ylmcoef_now, - const double dr, - const double* rcut, - const Grid_Technique& gridt, - const UnitCell& ucell, - double* rho) -{ - checkCuda(cudaSetDevice(gridt.dev_id)); - // checkCuda(cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync)); - - const int nbzp = gridt.nbzp; - const int nczp =nbzp * gridt.bz; - const int num_mcell_on_proc = nczp * gridt.ncx * gridt.ncy; - const int lgd = gridt.lgd; - const int max_atom = gridt.max_atom; - const int num_streams = gridt.nstreams; - const int max_atom_per_bcell = max_atom * gridt.bxyz; - const int max_atom_per_z = max_atom * nbzp; - const int max_phi_per_z = max_atom_per_bcell * nbzp * ucell.nwmax; - const int max_atompair_per_z = max_atom * max_atom * nbzp; - - std::vector streams(num_streams); - std::vector events(num_streams); - for (int i = 0; i < num_streams; i++) - { - checkCuda(cudaStreamCreate(&streams[i])); - checkCuda(cudaEventCreateWithFlags(&events[i], cudaEventDisableTiming)); - } - - Cuda_Mem_Wrapper dr_part(max_atom_per_z * 3, num_streams, true); - Cuda_Mem_Wrapper atoms_type(max_atom_per_z, num_streams, true); - // The first number in every group of two represents the number of atoms on that bigcell. - // The second number represents the cumulative number of atoms up to that bigcell. - Cuda_Mem_Wrapper atoms_num_info(2 * nbzp, num_streams, true); - - Cuda_Mem_Wrapper psi(max_phi_per_z, num_streams, false); - Cuda_Mem_Wrapper psi_dm(max_phi_per_z, num_streams, false); - - Cuda_Mem_Wrapper gemm_alpha(max_atompair_per_z, num_streams, true); - Cuda_Mem_Wrapper gemm_m(max_atompair_per_z, num_streams, true); - Cuda_Mem_Wrapper gemm_n(max_atompair_per_z, num_streams, true); - Cuda_Mem_Wrapper gemm_k(max_atompair_per_z, num_streams, true); - Cuda_Mem_Wrapper gemm_lda(max_atompair_per_z, num_streams, true); - Cuda_Mem_Wrapper gemm_ldb(max_atompair_per_z, num_streams, true); - Cuda_Mem_Wrapper gemm_ldc(max_atompair_per_z, num_streams, true); - Cuda_Mem_Wrapper gemm_A(max_atompair_per_z, num_streams, true); - Cuda_Mem_Wrapper gemm_B(max_atompair_per_z, num_streams, true); - Cuda_Mem_Wrapper gemm_C(max_atompair_per_z, num_streams, true); - - Cuda_Mem_Wrapper rho_g(num_mcell_on_proc, 1, false); - Cuda_Mem_Wrapper dot_product(nbzp * gridt.bxyz, num_streams, true); - - Cuda_Mem_Wrapper dm_matrix(dm->get_nnr(), 1, false); - // retrieve the density matrix on the host - checkCuda(cudaMemcpy(dm_matrix.get_device_pointer(), - dm->get_wrapper(), - dm->get_nnr() * sizeof(double), - cudaMemcpyHostToDevice)); - -// calculate the rho for every nbzp bigcells -#ifdef _OPENMP -const int max_thread_num = std::min(omp_get_max_threads(), num_streams); -#endif -#pragma omp parallel num_threads(max_thread_num) -{ -#ifdef _OPENMP - const int tid = omp_get_thread_num(); - const int num_threads = omp_get_num_threads(); - const int sid_start = tid * num_streams / num_threads; - const int thread_num_streams = tid == num_threads - 1 ? num_streams - sid_start : num_streams / num_threads; -#else - const int sid_start = 0; - const int thread_num_streams = num_streams; -#endif -#pragma omp for collapse(2) schedule(dynamic) - for (int i = 0; i < gridt.nbx; i++) - { - for (int j = 0; j < gridt.nby; j++) - { - // 20240620 Note that it must be set again here because - // cuda's device is not safe in a multi-threaded environment. - - checkCuda(cudaSetDevice(gridt.dev_id)); - - const int sid = (i * gridt.nby + j) % thread_num_streams + sid_start; - checkCuda(cudaEventSynchronize(events[sid])); - - int max_m = 0; - int max_n = 0; - int atom_pair_num = 0; - int atoms_per_z = 0; - const int grid_index_ij = i * gridt.nby * nbzp + j * nbzp; - - // generate GPU tasks, including the calculation of psir, matrix - // multiplication, and dot product - gtask_rho(gridt, - grid_index_ij, - ucell, - dr_part.get_host_pointer(sid), - atoms_type.get_host_pointer(sid), - atoms_num_info.get_host_pointer(sid), - atoms_per_z); - - alloc_mult_dot_rho( - dm, - gridt, - ucell, - grid_index_ij, - max_atom, - lgd, - nczp, - atoms_num_info.get_host_pointer(sid), - psi.get_device_pointer(sid), - psi_dm.get_device_pointer(sid), - dm_matrix.get_device_pointer(), - gemm_alpha.get_host_pointer(sid), - gemm_m.get_host_pointer(sid), - gemm_n.get_host_pointer(sid), - gemm_k.get_host_pointer(sid), - gemm_lda.get_host_pointer(sid), - gemm_ldb.get_host_pointer(sid), - gemm_ldc.get_host_pointer(sid), - gemm_A.get_host_pointer(sid), - gemm_B.get_host_pointer(sid), - gemm_C.get_host_pointer(sid), - max_m, - max_n, - atom_pair_num, - rho_g.get_device_pointer(), - dot_product.get_host_pointer(sid)); - - dr_part.copy_host_to_device_async(streams[sid], sid, atoms_per_z * 3); - atoms_type.copy_host_to_device_async(streams[sid], sid, atoms_per_z); - atoms_num_info.copy_host_to_device_async(streams[sid], sid); - - gemm_alpha.copy_host_to_device_async(streams[sid], sid, atom_pair_num); - gemm_m.copy_host_to_device_async(streams[sid], sid, atom_pair_num); - gemm_n.copy_host_to_device_async(streams[sid], sid, atom_pair_num); - gemm_k.copy_host_to_device_async(streams[sid], sid, atom_pair_num); - gemm_lda.copy_host_to_device_async(streams[sid], sid, atom_pair_num); - gemm_ldb.copy_host_to_device_async(streams[sid], sid, atom_pair_num); - gemm_ldc.copy_host_to_device_async(streams[sid], sid, atom_pair_num); - gemm_A.copy_host_to_device_async(streams[sid], sid, atom_pair_num); - gemm_B.copy_host_to_device_async(streams[sid], sid, atom_pair_num); - gemm_C.copy_host_to_device_async(streams[sid], sid, atom_pair_num); - dot_product.copy_host_to_device_async(streams[sid], sid); - checkCuda(cudaEventRecord(events[sid], streams[sid])); - - psi.memset_device_async(streams[sid], sid, 0); - psi_dm.memset_device_async(streams[sid], sid, 0); - - // Launching kernel to calculate psi - dim3 grid_psi(nbzp, gridt.bxyz); - dim3 block_psi(64); - get_psi<<>>( - gridt.ylmcoef_g, - dr, - gridt.bxyz, - ucell.nwmax, - max_atom, - gridt.atom_nwl_g, - gridt.atom_new_g, - gridt.atom_ylm_g, - gridt.atom_nw_g, - gridt.rcut_g, - gridt.nr_max, - gridt.psi_u_g, - gridt.mcell_pos_g, - dr_part.get_device_pointer(sid), - atoms_type.get_device_pointer(sid), - atoms_num_info.get_device_pointer(sid), - psi.get_device_pointer(sid)); - checkCudaLastError(); - - // Performing matrix multiplication alpha * mat_dm * mat_psir - gridt.fastest_matrix_mul(max_m, - max_n, - gemm_m.get_device_pointer(sid), - gemm_n.get_device_pointer(sid), - gemm_k.get_device_pointer(sid), - gemm_A.get_device_pointer(sid), - gemm_lda.get_device_pointer(sid), - gemm_B.get_device_pointer(sid), - gemm_ldb.get_device_pointer(sid), - gemm_C.get_device_pointer(sid), - gemm_ldc.get_device_pointer(sid), - atom_pair_num, - streams[sid], - gemm_alpha.get_device_pointer(sid)); - checkCudaLastError(); - - // Launching kernel to calculate dot product psir * psir_dm - // if warpSize is not eauql to 32, the psir_dot kernel should be modified - dim3 grid_dot(nbzp, gridt.bxyz); - dim3 block_dot(64); - psir_dot<<>>( - gridt.bxyz, - ucell.nwmax, - atoms_num_info.get_device_pointer(sid), - psi.get_device_pointer(sid), - psi_dm.get_device_pointer(sid), - dot_product.get_device_pointer(sid)); - checkCudaLastError(); - } - } -} - - // Copy rho from device to host - checkCuda(cudaMemcpy(rho, - rho_g.get_device_pointer(), - num_mcell_on_proc * sizeof(double), - cudaMemcpyDeviceToHost)); - - for (int i = 0; i < num_streams; i++) - { - checkCuda(cudaStreamDestroy(streams[i])); - checkCuda(cudaEventDestroy(events[i])); - } -} -} // namespace GintKernel diff --git a/source/source_lcao/module_gint/gint_rho_gpu.h b/source/source_lcao/module_gint/gint_rho_gpu.h index 7dba352a84..d8a8fe6e01 100644 --- a/source/source_lcao/module_gint/gint_rho_gpu.h +++ b/source/source_lcao/module_gint/gint_rho_gpu.h @@ -1,68 +1,52 @@ -#ifndef GINT_RHO_H -#define GINT_RHO_H -#include -#include // for CUDA_VERSION -#include +#pragma once -#include "source_lcao/module_gint/gint.h" -#include "source_lcao/module_gint/grid_technique.h" +#include +#include +#include "source_lcao/module_hcontainer/hcontainer.h" +#include "gint.h" +#include "gint_info.h" +#include "source_lcao/module_gint/kernel/cuda_mem_wrapper.h" -namespace GintKernel +namespace ModuleGint { -/** - * calculate the rho by GPU - * - * @param dm density matrix. - * @param ylmcoef_now coefficients for the spherical harmonics expansion. - * @param dr The grid spacing. - * @param rcut Pointer to the cutoff radius array. - * @param gridt Grid_Technique object containing grid information. - * @param ucell UnitCell. - * @param rho rho. - */ -void gint_rho_gpu(const hamilt::HContainer* dm, - const double* ylmcoef_now, - const double dr, - const double* rcut, - const Grid_Technique& gridt, - const UnitCell& ucell, - double* rho); +class Gint_rho_gpu: public Gint +{ + public: + Gint_rho_gpu( + const std::vector*>& dm_vec, + const int nspin, + double **rho, + bool is_dm_symm = true) + : dm_vec_(dm_vec), nspin_(nspin), rho_(rho), is_dm_symm_(is_dm_symm) {} + + void cal_gint(); + + private: + void init_dm_gint_(); + + void cal_rho_(); + + void transfer_cpu_to_gpu_(); + + void transfer_gpu_to_cpu_(); + + // input + const std::vector*> dm_vec_; + const int nspin_; + + // if true, it means the DMR matrix is symmetric, + // which leads to faster computations compared to the asymmetric case. + const bool is_dm_symm_; + + // output + double **rho_; -void gtask_rho(const Grid_Technique& gridt, - const int grid_index_ij, - const UnitCell& ucell, - double* dr_part, - uint8_t* atoms_type, - int* atoms_num_info, - int& atoms_per_z); + // Intermediate variables + std::vector> dm_gint_vec_; -void alloc_mult_dot_rho(const hamilt::HContainer* dm, - const Grid_Technique& gridt, - const UnitCell& ucell, - const int grid_index_ij, - const int max_atom, - const int lgd, - const int nczp, - const int* atoms_num_info, - double* const psir_ylm_g, - double* const psir_dm_g, - double* const dm_matrix_g, - double* mat_alpha, - int* mat_m, - int* mat_n, - int* mat_k, - int* mat_lda, - int* mat_ldb, - int* mat_ldc, - double** mat_A, - double** mat_B, - double** mat_C, - int& max_m, - int& max_n, - int& atom_pair_num, - double* rho_g, - double** dot_product); + std::vector> dm_gint_d_vec_; + std::vector> rho_d_vec_; +}; -} // namespace GintKernel -#endif \ No newline at end of file +} \ No newline at end of file diff --git a/source/source_lcao/module_gint/gint_rho_old.cpp b/source/source_lcao/module_gint/gint_rho_old.cpp deleted file mode 100644 index b3027d6b12..0000000000 --- a/source/source_lcao/module_gint/gint_rho_old.cpp +++ /dev/null @@ -1,27 +0,0 @@ -#include "gint_k.h" -#include "gint_tools.h" -#include "grid_technique.h" -#include "source_base/global_function.h" -#include "source_base/global_variable.h" -#include "source_base/timer.h" -#include "source_base/array_pool.h" -#include "source_base/ylm.h" -#include "source_basis/module_ao/ORB_read.h" -#include "source_pw/module_pwdft/global.h" - -void Gint::cal_meshball_rho(const int na_grid, - const int*const block_index, - const int*const vindex, - const double*const*const psir_ylm, - const double*const*const psir_DMR, - double*const rho) -{ - const int inc = 1; - // sum over mu to get density on grid - for (int ib = 0; ib < this->bxyz; ++ib) - { - const double r = ddot_(&block_index[na_grid], psir_ylm[ib], &inc, psir_DMR[ib], &inc); - const int grid = vindex[ib]; - rho[grid] += r; - } -} diff --git a/source/source_lcao/module_gint/temp_gint/gint_tau.cpp b/source/source_lcao/module_gint/gint_tau.cpp similarity index 100% rename from source/source_lcao/module_gint/temp_gint/gint_tau.cpp rename to source/source_lcao/module_gint/gint_tau.cpp diff --git a/source/source_lcao/module_gint/temp_gint/gint_tau.h b/source/source_lcao/module_gint/gint_tau.h similarity index 100% rename from source/source_lcao/module_gint/temp_gint/gint_tau.h rename to source/source_lcao/module_gint/gint_tau.h diff --git a/source/source_lcao/module_gint/temp_gint/gint_tau_gpu.cpp b/source/source_lcao/module_gint/gint_tau_gpu.cpp similarity index 100% rename from source/source_lcao/module_gint/temp_gint/gint_tau_gpu.cpp rename to source/source_lcao/module_gint/gint_tau_gpu.cpp diff --git a/source/source_lcao/module_gint/temp_gint/gint_tau_gpu.h b/source/source_lcao/module_gint/gint_tau_gpu.h similarity index 92% rename from source/source_lcao/module_gint/temp_gint/gint_tau_gpu.h rename to source/source_lcao/module_gint/gint_tau_gpu.h index da19c98828..638892ff13 100644 --- a/source/source_lcao/module_gint/temp_gint/gint_tau_gpu.h +++ b/source/source_lcao/module_gint/gint_tau_gpu.h @@ -5,7 +5,7 @@ #include "source_lcao/module_hcontainer/hcontainer.h" #include "gint.h" #include "gint_info.h" -#include "source_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h" +#include "source_lcao/module_gint/kernel/cuda_mem_wrapper.h" namespace ModuleGint { diff --git a/source/source_lcao/module_gint/gint_tau_old.cpp b/source/source_lcao/module_gint/gint_tau_old.cpp deleted file mode 100644 index adf20d45b5..0000000000 --- a/source/source_lcao/module_gint/gint_tau_old.cpp +++ /dev/null @@ -1,38 +0,0 @@ -#include "source_base/global_function.h" -#include "source_base/global_variable.h" -#include "gint_k.h" -#include "source_basis/module_ao/ORB_read.h" -#include "grid_technique.h" -#include "source_base/ylm.h" -#include "source_pw/module_pwdft/global.h" -#include "source_base/module_external/blas_connector.h" -#include "source_base/timer.h" -#include "source_base/array_pool.h" -#include "gint_tools.h" -#include "source_base/memory.h" -#include "source_lcao/module_gint/grid_technique.h" - - -void Gint::cal_meshball_tau( - const int na_grid, - int* block_index, - int* vindex, - double** dpsix, - double** dpsiy, - double** dpsiz, - double** dpsix_dm, - double** dpsiy_dm, - double** dpsiz_dm, - double* rho) -{ - const int inc = 1; - // sum over mu to get density on grid - for(int ib=0; ibbxyz; ++ib) - { - double rx=ddot_(&block_index[na_grid], dpsix[ib], &inc, dpsix_dm[ib], &inc); - double ry=ddot_(&block_index[na_grid], dpsiy[ib], &inc, dpsiy_dm[ib], &inc); - double rz=ddot_(&block_index[na_grid], dpsiz[ib], &inc, dpsiz_dm[ib], &inc); - const int grid = vindex[ib]; - rho[ grid ] += rx + ry + rz; - } -} \ No newline at end of file diff --git a/source/source_lcao/module_gint/gint_tools.cpp b/source/source_lcao/module_gint/gint_tools.cpp deleted file mode 100644 index d60db04a1a..0000000000 --- a/source/source_lcao/module_gint/gint_tools.cpp +++ /dev/null @@ -1,234 +0,0 @@ -//========================================================= -//REFACTOR : Peize Lin, 2021.06.28 -//========================================================= -#include "gint_tools.h" - -#include -#include // for std::pair - -#include "source_base/timer.h" -#include "source_base/ylm.h" -#include "source_base/array_pool.h" -#include "source_basis/module_ao/ORB_read.h" -#include "source_pw/module_pwdft/global.h" - -namespace Gint_Tools{ -void get_vindex(const int bxyz, const int bx, const int by, const int bz, - const int nplane, const int start_ind, - const int ncyz,int* vindex) -{ - int bindex = 0; - - for(int ii=0; ii vindex(bxyz,0); - Gint_Tools::get_vindex(bxyz, bx, by, bz, nplane, start_ind, ncyz,vindex.data()); - for(int ib=0; ib gt.rcuts[it] - 1.0e-10) { - cal_flag[ib][id] = false; - } else { - cal_flag[ib][id] = true; - } - } // end ib - } - } - - -void cal_dpsirr_ylm( - const Grid_Technique& gt, const int bxyz, - const int na_grid, // number of atoms on this grid - const int grid_index, // 1d index of FFT index (i,j,k) - const int* const block_index, // block_index[na_grid+1], count total number of atomis orbitals - const int* const block_size, // block_size[na_grid], number of columns of a band - const bool* const* const cal_flag, // cal_flag[bxyz][na_grid], whether the atom-grid distance is larger than cutoff - double* const* const dpsir_ylm_x, double* const* const dpsir_ylm_y, double* const* const dpsir_ylm_z, - double* const* const dpsirr_ylm) -{ - ModuleBase::timer::tick("Gint_Tools", "cal_dpsirr_ylm"); - const UnitCell& ucell = *gt.ucell; - for (int id = 0; id < na_grid; id++) - { - const int mcell_index = gt.bcell_start[grid_index] + id; - const int imcell = gt.which_bigcell[mcell_index]; - int iat = gt.which_atom[mcell_index]; - const int it = ucell.iat2it[iat]; - Atom* atom = &ucell.atoms[it]; - - const double mt[3]={ - gt.meshball_positions[imcell][0] - gt.tau_in_bigcell[iat][0], - gt.meshball_positions[imcell][1] - gt.tau_in_bigcell[iat][1], - gt.meshball_positions[imcell][2] - gt.tau_in_bigcell[iat][2]}; - - for(int ib=0; ibnw; ++iw) - { - p_dpsirr[iw * 6] = p_dpsi_x[iw]*dr[0]; - p_dpsirr[iw * 6 + 1] = p_dpsi_x[iw]*dr[1]; - p_dpsirr[iw * 6 + 2] = p_dpsi_x[iw]*dr[2]; - p_dpsirr[iw * 6 + 3] = p_dpsi_y[iw]*dr[1]; - p_dpsirr[iw * 6 + 4] = p_dpsi_y[iw]*dr[2]; - p_dpsirr[iw * 6 + 5] = p_dpsi_z[iw]*dr[2]; - }//iw - }//else - } - } - ModuleBase::timer::tick("Gint_Tools", "cal_dpsirr_ylm"); - return; - } - - // atomic basis sets - // psir_vlbr3[bxyz][LD_pool] - ModuleBase::Array_Pool get_psir_vlbr3( - const int bxyz, - const int na_grid, // how many atoms on this (i,j,k) grid - const int LD_pool, - const int*const block_index, // block_index[na_grid+1], count total number of atomis orbitals - const bool*const*const cal_flag, // cal_flag[bxyz][na_grid], whether the atom-grid distance is larger than cutoff - const double*const vldr3, // vldr3[bxyz] - const double*const*const psir_ylm) // psir_ylm[bxyz][LD_pool] - { - ModuleBase::Array_Pool psir_vlbr3(bxyz, LD_pool); - for(int ib=0; ib cal_info(const int bxyz, - const int ia1, - const int ia2, - const bool* const* const cal_flag) -{ - int ib_start = bxyz; - int ib_end = 0; - int ib_length = 0; - for(int ib=0; ib=0; --ib) - { - if(cal_flag[ib][ia1] && cal_flag[ib][ia2]) - { - ib_end = ib; - break; - } - } - } - - ib_length = ib_end - ib_start + 1; - return std::make_pair(ib_start, ib_length); -} - -} // namespace Gint_Tools diff --git a/source/source_lcao/module_gint/gint_tools.h b/source/source_lcao/module_gint/gint_tools.h deleted file mode 100644 index a7f0e1b0d0..0000000000 --- a/source/source_lcao/module_gint/gint_tools.h +++ /dev/null @@ -1,311 +0,0 @@ -//========================================================= -// REFACTOR : Peize Lin, 2021.06.28 -//========================================================= -#ifndef GINT_TOOLS_H -#define GINT_TOOLS_H -#include "grid_technique.h" -#include "source_estate/module_charge/charge.h" -#include "source_lcao/module_hcontainer/hcontainer.h" -#include "source_base/array_pool.h" - -#include -#include // for std::pair - -namespace Gint_Tools -{ -enum class job_type -{ - vlocal, - rho, - force, - tau, - vlocal_meta, - force_meta, - dvlocal -}; -// Hamiltonian, electron density, force, kinetic energy density, Hamiltonian for mGGA -} // namespace Gint_Tools - -// the class is used to pass input/output variables -// into the unified interface gint -// not sure if this is the best practice though .. -class Gint_inout -{ - public: - // input - double*** DM=nullptr; - const double* vl=nullptr; - const double* vofk=nullptr; - bool isforce=false; - bool isstress=false; - int ispin=0; - int nspin_rho=0; // usually, but not always, equal to global nspin - bool if_symm = false; // if true, use dsymv in gint_kernel_rho; if false, use dgemv. - - // output - double** rho=nullptr; - ModuleBase::matrix* fvl_dphi=nullptr; - ModuleBase::matrix* svl_dphi=nullptr; - Gint_Tools::job_type job; - - // electron density and kin_r, multi-k - Gint_inout(double** rho_in, Gint_Tools::job_type job_in, const int& nspin_rho_in, bool if_symm_in = true) - { - rho = rho_in; - job = job_in; - nspin_rho = nspin_rho_in; - if_symm = if_symm_in; - } - - // force - Gint_inout(const int ispin_in, - const double* vl_in, - bool isforce_in, - bool isstress_in, - ModuleBase::matrix* fvl_dphi_in, - ModuleBase::matrix* svl_dphi_in, - Gint_Tools::job_type job_in) - { - vl = vl_in; - isforce = isforce_in; - isstress = isstress_in; - fvl_dphi = fvl_dphi_in; - svl_dphi = svl_dphi_in; - job = job_in; - ispin = ispin_in; - } - - // force (mGGA) - Gint_inout(const int ispin_in, - const double* vl_in, - const double* vofk_in, - const bool isforce_in, - const bool isstress_in, - ModuleBase::matrix* fvl_dphi_in, - ModuleBase::matrix* svl_dphi_in, - Gint_Tools::job_type job_in) - { - vl = vl_in; - vofk = vofk_in; - isforce = isforce_in; - isstress = isstress_in; - fvl_dphi = fvl_dphi_in; - svl_dphi = svl_dphi_in; - job = job_in; - ispin = ispin_in; - } - - // vlocal, multi-k - Gint_inout(const double* vl_in, int ispin_in, Gint_Tools::job_type job_in) - { - vl = vl_in; - ispin = ispin_in; - job = job_in; - } - - // mGGA vlocal, multi-k - Gint_inout(const double* vl_in, const double* vofk_in, int ispin_in, Gint_Tools::job_type job_in) - { - vl = vl_in; - vofk = vofk_in; - ispin = ispin_in; - job = job_in; - } - - // vlocal, gamma point - Gint_inout(const double* vl_in, Gint_Tools::job_type job_in) - { - vl = vl_in; - job = job_in; - } - - // mGGA vlocal, gamma point - Gint_inout(const double* vl_in, const double* vofk_in, Gint_Tools::job_type job_in) - { - vl = vl_in; - vofk = vofk_in; - job = job_in; - } -}; - -namespace Gint_Tools -{ -// if exponent is an integer between 0 and 5 (the most common cases in gint), -// pow_int is much faster than std::pow -inline double pow_int(const double base, const int exp) -{ - switch (exp) - { - case 0: - return 1.0; - case 1: - return base; - case 2: - return base * base; - case 3: - return base * base * base; - case 4: - return base * base * base * base; - case 5: - return base * base * base * base * base; - default: - double result = std::pow(base, exp); - return result; - } -} -// vindex[pw.bxyz] - -/** - * @brief Get the vindex form the grid index - * @param bxyz number of big grids - * @param bx number of big grids in x direction - * @param by number of big grids in y direction - * @param bz number of big grids in z direction - * @param nplane Currently using Z-axis 1D division, - * recording the number of the Z-axis process - * (nbz in the current process). - * @param start_ind start index of the grid in the 1D FFT grid - * @param ncyz number of grids in yz plane - * @param vindex the index of the grid -*/ -void get_vindex(const int bxyz, const int bx, const int by, - const int bz, const int nplane, - const int start_ind,const int ncyz,int* vindex); - -/** - * @brief Get the vldr3 form the grid index - * @param vldr3 the local potential multiplied by the grid volume - * @param vlocal the local potential - * @param bxyz number of grids - * @param bx number of grids in x direction - * @param by number of grids in y direction - * @param bz number of grids in z direction - * @param nplane Currently using Z-axis 1D division, - * recording the number of the Z-axis process - * (nbz in the current process). - * @param start_ind start index of the grid in the 1D FFT grid - * @param ncyz number of grids in yz plane - * @param dv the volume of the grid -*/ -void get_gint_vldr3(double* vldr3, - const double* const vlocal, - const int bxyz, - const int bx, - const int by, - const int bz, - const int nplane, - const int start_ind, - const int ncyz, - const double dv); - -/** - * @brief Get the information of a big grid index - * @param gt the grid technique, which contains the tools of the grid intergration - * @param bxyz number of grids - * @param na_grid number of atoms on this grid - * @param grid_index 1d index of FFT index (i,j,k) - * @param block_iw track the atom orbitals in all atoms - * @param block_index count total number of atomis orbitals - * @param block_size count the number of atomis orbitals in each atom - * @param cal_flag whether the atom-grid distance is larger than cutoff -*/ -void get_block_info(const Grid_Technique& gt, const int bxyz, const int na_grid, const int grid_index, - int* block_iw, int* block_index, int* block_size, bool** cal_flag); - -void init_orb(double& dr_uniform, - std::vector& rcuts, - UnitCell& ucell, - const LCAO_Orbitals& orb, - std::vector>& psi_u, - std::vector>& dpsi_u, - std::vector>& d2psi_u); - -// psir_ylm[pw.bxyz][LD_pool] -void cal_psir_ylm(const Grid_Technique& gt, - const int bxyz, - const int na_grid, // number of atoms on this grid - const int grid_index, // 1d index of FFT index (i,j,k) - const double delta_r, // delta_r of the uniform FFT grid - const int* const block_index, // count total number of atomis orbitals - const int* const block_size, - const bool* const* const cal_flag, - double* const* const psir_ylm); // whether the atom-grid distance is larger than cutoff - -// psir_ylm and dpsir_ylm, both[pw.bxyz][LD_pool] -void cal_dpsir_ylm( - const Grid_Technique& gt, - const int bxyz, - const int na_grid, // number of atoms on this grid - const int grid_index, // 1d index of FFT index (i,j,k) - const double delta_r, // delta_r of the uniform FFT grid - const int* const block_index, // block_index[na_grid+1], count total number of atomis orbitals - const int* const block_size, // block_size[na_grid], number of columns of a band - const bool* const* const cal_flag, // cal_flag[bxyz][na_grid], whether the atom-grid distance is larger than cutoff - double* const* const psir_ylm, - double* const* const dpsir_ylm_x, - double* const* const dpsir_ylm_y, - double* const* const dpsir_ylm_z); - -// dpsir_ylm * (r-R), R is the atomic position -void cal_dpsirr_ylm( - const Grid_Technique& gt, const int bxyz, - const int na_grid, // number of atoms on this grid - const int grid_index, // 1d index of FFT index (i,j,k) - const int* const block_index, // block_index[na_grid+1], count total number of atomis orbitals - const int* const block_size, // block_size[na_grid], number of columns of a band - const bool* const* const cal_flag, // cal_flag[bxyz][na_grid], whether the atom-grid distance is larger than cutoff - double* const* const dpsir_ylm_x, double* const* const dpsir_ylm_y, double* const* const dpsir_ylm_z, - double* const* const dpsir_ylm); - -void cal_ddpsir_ylm( - const Grid_Technique& gt, - const int bxyz, - const int na_grid, // number of atoms on this grid - const int grid_index, // 1d index of FFT index (i,j,k) - const double delta_r, // delta_r of the uniform FFT grid - const int* const block_index, // block_index[na_grid+1], count total number of atomis orbitals - const int* const block_size, // block_size[na_grid], number of columns of a band - const bool* const* const cal_flag, // cal_flag[bxyz][na_grid], whether the atom-grid distance is larger than cutoff - double* const* const ddpsir_ylm_xx, - double* const* const ddpsir_ylm_xy, - double* const* const ddpsir_ylm_xz, - double* const* const ddpsir_ylm_yy, - double* const* const ddpsir_ylm_yz, - double* const* const ddpsir_ylm_zz); - -// psir_ylm * vldr3 -ModuleBase::Array_Pool get_psir_vlbr3( - const int bxyz, - const int na_grid, // how many atoms on this (i,j,k) grid - const int LD_pool, - const int* const block_index, // block_index[na_grid+1], count total number of atomis orbitals - const bool* const* const cal_flag, // cal_flag[bxyz][na_grid], whether the atom-grid distance is larger than cutoff - const double* const vldr3, // vldr3[bxyz] - const double* const* const psir_ylm); // psir_ylm[bxyz][LD_pool] - -// sum_nu,R rho_mu,nu(R) psi_nu, for multi-k and gamma point -void mult_psi_DMR( - const Grid_Technique& gt, - const int bxyz, - const int LD_pool, - const int &grid_index, - const int &na_grid, - const int*const block_index, - const int*const block_size, - const bool*const*const cal_flag, - const double*const*const psi, - double*const*const psi_DMR, - const hamilt::HContainer*const DM, - const bool if_symm); - - -// pair.first is the first index of the meshcell which is inside atoms ia1 and ia2. -// pair.second is the number of meshcells which should be calculated in the following gemm. -// If no meshcell is inside both ia1 and ia2, return [bxyz, 0]. -std::pair cal_info(const int bxyz, - const int ia1, - const int ia2, - const bool* const* const cal_flag); - -} // namespace Gint_Tools -#endif diff --git a/source/source_lcao/module_gint/temp_gint/gint_type.h b/source/source_lcao/module_gint/gint_type.h similarity index 100% rename from source/source_lcao/module_gint/temp_gint/gint_type.h rename to source/source_lcao/module_gint/gint_type.h diff --git a/source/source_lcao/module_gint/temp_gint/gint_vl.cpp b/source/source_lcao/module_gint/gint_vl.cpp similarity index 100% rename from source/source_lcao/module_gint/temp_gint/gint_vl.cpp rename to source/source_lcao/module_gint/gint_vl.cpp diff --git a/source/source_lcao/module_gint/temp_gint/gint_vl.h b/source/source_lcao/module_gint/gint_vl.h similarity index 100% rename from source/source_lcao/module_gint/temp_gint/gint_vl.h rename to source/source_lcao/module_gint/gint_vl.h diff --git a/source/source_lcao/module_gint/gint_vl_cpu_interface.cpp b/source/source_lcao/module_gint/gint_vl_cpu_interface.cpp deleted file mode 100644 index f913fab83e..0000000000 --- a/source/source_lcao/module_gint/gint_vl_cpu_interface.cpp +++ /dev/null @@ -1,265 +0,0 @@ -#include "gint.h" -#include "source_base/memory.h" -#include "source_io/module_parameter/parameter.h" -#include "source_base/timer.h" - -void Gint::gint_kernel_vlocal(Gint_inout* inout) { - ModuleBase::TITLE("Gint_interface", "cal_gint_vlocal"); - ModuleBase::timer::tick("Gint_interface", "cal_gint_vlocal"); - const UnitCell& ucell = *this->ucell; - const int max_size = this->gridt->max_atom; - const int lgd = this->gridt->lgd; - const int ncyz = this->ny * this->nplane; - const double dv = ucell.omega / this->ncxyz; - const double delta_r = this->gridt->dr_uniform; - hamilt::HContainer* hRGint_kernel = PARAM.inp.nspin != 4 ? this->hRGint : this->hr_gint_tmp[inout->ispin]; - hRGint_kernel->set_zero(); - -#pragma omp parallel - { /** - * @brief When in OpenMP, it points to a newly allocated memory, - */ - std::vector block_iw(max_size,0); - std::vector block_index(max_size+1,0); - std::vector block_size(max_size,0); - std::vector vldr3(this->bxyz,0.0); - #pragma omp for schedule(dynamic) - for (int grid_index = 0; grid_index < this->nbxx; grid_index++) { - const int na_grid = this->gridt->how_many_atoms[grid_index]; - if (na_grid == 0) { - continue; - } - /** - * @brief Prepare block information - */ - ModuleBase::Array_Pool cal_flag(this->bxyz,max_size); - - Gint_Tools::get_gint_vldr3(vldr3.data(), - inout->vl, - this->bxyz, - this->bx, - this->by, - this->bz, - this->nplane, - this->gridt->start_ind[grid_index], - ncyz, - dv); - - Gint_Tools::get_block_info(*this->gridt, this->bxyz, na_grid, grid_index, - block_iw.data(), block_index.data(), block_size.data(), cal_flag.get_ptr_2D()); - - /** - * @brief Evaluate psi and dpsi on grids - */ - const int LD_pool = block_index[na_grid]; - ModuleBase::Array_Pool psir_ylm(this->bxyz, LD_pool); - Gint_Tools::cal_psir_ylm(*this->gridt, - this->bxyz, na_grid, grid_index, delta_r, - block_index.data(), block_size.data(), - cal_flag.get_ptr_2D(),psir_ylm.get_ptr_2D()); - - // psir_ylm_new=psir_func(psir_ylm) - // psir_func==nullptr means psir_ylm_new=psir_ylm - const ModuleBase::Array_Pool &psir_ylm_1 = (!this->psir_func_1) ? psir_ylm : this->psir_func_1(psir_ylm, *this->gridt, grid_index, 0, block_iw, block_size, block_index, cal_flag); - const ModuleBase::Array_Pool &psir_ylm_2 = (!this->psir_func_2) ? psir_ylm : this->psir_func_2(psir_ylm, *this->gridt, grid_index, 0, block_iw, block_size, block_index, cal_flag); - - //calculating f_mu(r) = v(r)*psi_mu(r)*dv - const ModuleBase::Array_Pool psir_vlbr3 = Gint_Tools::get_psir_vlbr3( - this->bxyz, na_grid, LD_pool, block_index.data(), - cal_flag.get_ptr_2D(), vldr3.data(), psir_ylm_1.get_ptr_2D()); - - //integrate (psi_mu*v(r)*dv) * psi_nu on grid - //and accumulates to the corresponding element in Hamiltonian - this->cal_meshball_vlocal( - na_grid, LD_pool, block_size.data(), block_index.data(), grid_index, - cal_flag.get_ptr_2D(),psir_ylm.get_ptr_2D(), psir_vlbr3.get_ptr_2D(), - hRGint_kernel); - } - ModuleBase::TITLE("Gint_interface", "cal_gint_vlocal"); - ModuleBase::timer::tick("Gint_interface", "cal_gint_vlocal"); - } -} - -void Gint::gint_kernel_dvlocal(Gint_inout* inout) { - ModuleBase::TITLE("Gint_interface", "cal_gint_dvlocal"); - ModuleBase::timer::tick("Gint_interface", "cal_gint_dvlocal"); - const UnitCell& ucell = *this->ucell; - const int max_size = this->gridt->max_atom; - const int lgd = this->gridt->lgd; - const int nnrg = pvdpRx_reduced[inout->ispin].get_nnr(); - const int ncyz = this->ny * this->nplane; - const double dv = ucell.omega / this->ncxyz; - const double delta_r = this->gridt->dr_uniform; - - if (PARAM.globalv.gamma_only_local) { - ModuleBase::WARNING_QUIT("Gint_interface::cal_gint","dvlocal only for k point!"); - } - pvdpRx_reduced[inout->ispin].set_zero(); - pvdpRy_reduced[inout->ispin].set_zero(); - pvdpRz_reduced[inout->ispin].set_zero(); - -#pragma omp parallel -{ - std::vector block_iw(max_size,0); - std::vector block_index(max_size+1,0); - std::vector block_size(max_size,0); - std::vector vldr3(this->bxyz,0.0); -#pragma omp for schedule(dynamic) - for (int grid_index = 0; grid_index < this->nbxx; grid_index++) { - const int na_grid = this->gridt->how_many_atoms[grid_index]; - if (na_grid == 0) { - continue; - } - Gint_Tools::get_gint_vldr3(vldr3.data(), - inout->vl, - this->bxyz, - this->bx, - this->by, - this->bz, - this->nplane, - this->gridt->start_ind[grid_index], - ncyz, - dv); - //prepare block information - ModuleBase::Array_Pool cal_flag(this->bxyz,max_size); - Gint_Tools::get_block_info(*this->gridt, this->bxyz, na_grid, grid_index, - block_iw.data(), block_index.data(), block_size.data(), cal_flag.get_ptr_2D()); - - //evaluate psi and dpsi on grids - const int LD_pool = block_index[na_grid]; - - ModuleBase::Array_Pool psir_ylm(this->bxyz, LD_pool); - ModuleBase::Array_Pool dpsir_ylm_x(this->bxyz, LD_pool); - ModuleBase::Array_Pool dpsir_ylm_y(this->bxyz, LD_pool); - ModuleBase::Array_Pool dpsir_ylm_z(this->bxyz, LD_pool); - Gint_Tools::cal_dpsir_ylm(*this->gridt, this->bxyz, na_grid, grid_index, delta_r, - block_index.data(), block_size.data(), cal_flag.get_ptr_2D(),psir_ylm.get_ptr_2D(), - dpsir_ylm_x.get_ptr_2D(), dpsir_ylm_y.get_ptr_2D(), dpsir_ylm_z.get_ptr_2D()); - - //calculating f_mu(r) = v(r)*psi_mu(r)*dv - const ModuleBase::Array_Pool psir_vlbr3 = Gint_Tools::get_psir_vlbr3( - this->bxyz, na_grid, LD_pool, block_index.data(), cal_flag.get_ptr_2D(), vldr3.data(), psir_ylm.get_ptr_2D()); - - //integrate (psi_mu*v(r)*dv) * psi_nu on grid - //and accumulates to the corresponding element in Hamiltonian - this->cal_meshball_vlocal(na_grid, LD_pool, block_size.data(), block_index.data(), - grid_index, cal_flag.get_ptr_2D(),psir_vlbr3.get_ptr_2D(), - dpsir_ylm_x.get_ptr_2D(), &this->pvdpRx_reduced[inout->ispin]); - this->cal_meshball_vlocal(na_grid, LD_pool, block_size.data(), block_index.data(), - grid_index, cal_flag.get_ptr_2D(),psir_vlbr3.get_ptr_2D(), - dpsir_ylm_y.get_ptr_2D(), &this->pvdpRy_reduced[inout->ispin]); - this->cal_meshball_vlocal(na_grid, LD_pool, block_size.data(), block_index.data(), - grid_index, cal_flag.get_ptr_2D(),psir_vlbr3.get_ptr_2D(), - dpsir_ylm_z.get_ptr_2D(), &this->pvdpRz_reduced[inout->ispin]); - } -} - ModuleBase::TITLE("Gint_interface", "cal_gint_dvlocal"); - ModuleBase::timer::tick("Gint_interface", "cal_gint_dvlocal"); -} - -void Gint::gint_kernel_vlocal_meta(Gint_inout* inout) { - ModuleBase::TITLE("Gint_interface", "cal_gint_vlocal_meta"); - ModuleBase::timer::tick("Gint_interface", "cal_gint_vlocal_meta"); - const UnitCell& ucell = *this->ucell; - const int max_size = this->gridt->max_atom; - const int lgd = this->gridt->lgd; - const int ncyz = this->ny * this->nplane; - const double dv = ucell.omega / this->ncxyz; - const double delta_r = this->gridt->dr_uniform; - hamilt::HContainer* hRGint_kernel = PARAM.inp.nspin != 4 ? this->hRGint : this->hr_gint_tmp[inout->ispin]; - hRGint_kernel->set_zero(); - const int nnrg = hRGint_kernel->get_nnr(); - -#pragma omp parallel -{ - // define HContainer here to reference. - //Under the condition of gamma_only, hRGint will be instantiated. - std::vector block_iw(max_size,0); - std::vector block_index(max_size+1,0); - std::vector block_size(max_size,0); - std::vector vldr3(this->bxyz,0.0); - std::vector vkdr3(this->bxyz,0.0); - -#pragma omp for schedule(dynamic) - for (int grid_index = 0; grid_index < this->nbxx; grid_index++) { - const int na_grid = this->gridt->how_many_atoms[grid_index]; - if (na_grid == 0) { - continue; - } - Gint_Tools::get_gint_vldr3(vldr3.data(), - inout->vl, - this->bxyz, - this->bx, - this->by, - this->bz, - this->nplane, - this->gridt->start_ind[grid_index], - ncyz, - dv); - Gint_Tools::get_gint_vldr3(vkdr3.data(), - inout->vofk, - this->bxyz, - this->bx, - this->by, - this->bz, - this->nplane, - this->gridt->start_ind[grid_index], - ncyz, - dv); - //prepare block information - ModuleBase::Array_Pool cal_flag(this->bxyz,max_size); - Gint_Tools::get_block_info(*this->gridt, this->bxyz, na_grid, grid_index, - block_iw.data(), block_index.data(), block_size.data(), cal_flag.get_ptr_2D()); - - //evaluate psi and dpsi on grids - const int LD_pool = block_index[na_grid]; - ModuleBase::Array_Pool psir_ylm(this->bxyz, LD_pool); - ModuleBase::Array_Pool dpsir_ylm_x(this->bxyz, LD_pool); - ModuleBase::Array_Pool dpsir_ylm_y(this->bxyz, LD_pool); - ModuleBase::Array_Pool dpsir_ylm_z(this->bxyz, LD_pool); - - Gint_Tools::cal_dpsir_ylm(*this->gridt, - this->bxyz, na_grid, grid_index, delta_r, - block_index.data(), block_size.data(), - cal_flag.get_ptr_2D(), - psir_ylm.get_ptr_2D(), - dpsir_ylm_x.get_ptr_2D(), - dpsir_ylm_y.get_ptr_2D(), - dpsir_ylm_z.get_ptr_2D() - ); - - //calculating f_mu(r) = v(r)*psi_mu(r)*dv - const ModuleBase::Array_Pool psir_vlbr3 = Gint_Tools::get_psir_vlbr3( - this->bxyz, na_grid, LD_pool, block_index.data(), cal_flag.get_ptr_2D(), vldr3.data(), psir_ylm.get_ptr_2D()); - - //calculating df_mu(r) = vofk(r) * dpsi_mu(r) * dv - const ModuleBase::Array_Pool dpsix_vlbr3 = Gint_Tools::get_psir_vlbr3( - this->bxyz, na_grid, LD_pool, block_index.data(), cal_flag.get_ptr_2D(), vkdr3.data(), dpsir_ylm_x.get_ptr_2D()); - const ModuleBase::Array_Pool dpsiy_vlbr3 = Gint_Tools::get_psir_vlbr3( - this->bxyz, na_grid, LD_pool, block_index.data(), cal_flag.get_ptr_2D(), vkdr3.data(), dpsir_ylm_y.get_ptr_2D()); - const ModuleBase::Array_Pool dpsiz_vlbr3 = Gint_Tools::get_psir_vlbr3( - this->bxyz, na_grid, LD_pool, block_index.data(), cal_flag.get_ptr_2D(), vkdr3.data(), dpsir_ylm_z.get_ptr_2D()); - - - //integrate (psi_mu*v(r)*dv) * psi_nu on grid - //and accumulates to the corresponding element in Hamiltonian - this->cal_meshball_vlocal( - na_grid, LD_pool, block_size.data(), block_index.data(), grid_index, cal_flag.get_ptr_2D(), - psir_ylm.get_ptr_2D(), psir_vlbr3.get_ptr_2D(), hRGint_kernel); - //integrate (d/dx_i psi_mu*vk(r)*dv) * (d/dx_i psi_nu) on grid (x_i=x,y,z) - //and accumulates to the corresponding element in Hamiltonian - this->cal_meshball_vlocal( - na_grid, LD_pool, block_size.data(), block_index.data(), grid_index, cal_flag.get_ptr_2D(), - dpsir_ylm_x.get_ptr_2D(), dpsix_vlbr3.get_ptr_2D(), hRGint_kernel); - this->cal_meshball_vlocal( - na_grid, LD_pool, block_size.data(), block_index.data(), grid_index, cal_flag.get_ptr_2D(), - dpsir_ylm_y.get_ptr_2D(), dpsiy_vlbr3.get_ptr_2D(), hRGint_kernel); - this->cal_meshball_vlocal( - na_grid, LD_pool, block_size.data(), block_index.data(), grid_index, cal_flag.get_ptr_2D(), - dpsir_ylm_z.get_ptr_2D(), dpsiz_vlbr3.get_ptr_2D(), hRGint_kernel); - } -} - - ModuleBase::TITLE("Gint_interface", "cal_gint_vlocal_meta"); - ModuleBase::timer::tick("Gint_interface", "cal_gint_vlocal_meta"); -} \ No newline at end of file diff --git a/source/source_lcao/module_gint/temp_gint/gint_vl_gpu.cpp b/source/source_lcao/module_gint/gint_vl_gpu.cpp similarity index 100% rename from source/source_lcao/module_gint/temp_gint/gint_vl_gpu.cpp rename to source/source_lcao/module_gint/gint_vl_gpu.cpp diff --git a/source/source_lcao/module_gint/gint_vl_gpu.cu b/source/source_lcao/module_gint/gint_vl_gpu.cu deleted file mode 100644 index ddbca83a60..0000000000 --- a/source/source_lcao/module_gint/gint_vl_gpu.cu +++ /dev/null @@ -1,219 +0,0 @@ -#ifdef _OPENMP -#include -#endif - -#include "kernels/cuda/cuda_tools.cuh" -#include "source_base/ylm.h" -#include "gint_vl_gpu.h" -#include "kernels/cuda/gint_vl.cuh" - -namespace GintKernel -{ - -/** - * Computes the gamma component of the VL (Vlocal) integral on the GPU. - * - * @note The grid integration on the GPU is mainly divided into the following - * steps: - * 1. Use the CPU to divide the grid integration into subtasks. - * 2. Copy the subtask information to the GPU. - * 3. Calculate the matrix elements on the GPU. - * 4. Perform matrix multiplication on the GPU. - * 5. Copy the results back to the host. - */ -void gint_vl_gpu(hamilt::HContainer* hRGint, - const double* vlocal, - const double* ylmcoef_now, - const double dr, - const double* rcut, - const Grid_Technique& gridt, - const UnitCell& ucell) -{ - checkCuda(cudaSetDevice(gridt.dev_id)); - // checkCuda(cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync)); - const int nbzp = gridt.nbzp; - const int num_streams = gridt.nstreams; - const int max_atom = gridt.max_atom; - const int max_atom_per_bcell = max_atom * gridt.bxyz; - const int max_atom_per_z = max_atom_per_bcell * nbzp; - const int max_phi_per_z = max_atom_per_z * ucell.nwmax; - const int max_atompair_per_z = max_atom * max_atom * nbzp; - const double vfactor = ucell.omega / gridt.ncxyz; - const int nczp = nbzp * gridt.bz; - std::vector streams(num_streams); - std::vector events(num_streams); - - for (int i = 0; i < num_streams; i++) - { - checkCuda(cudaStreamCreate(&streams[i])); - checkCuda(cudaEventCreateWithFlags(&events[i], cudaEventDisableTiming)); - } - - const int nnrg = hRGint->get_nnr(); - hRGint->set_zero(); - Cuda_Mem_Wrapper grid_vlocal_g(nnrg, 1, false); - grid_vlocal_g.memset_device_sync(); - - Cuda_Mem_Wrapper dr_part(max_atom_per_z * 3, num_streams, true); - Cuda_Mem_Wrapper atoms_type(max_atom_per_z, num_streams, true); - // The first number in every group of two represents the number of atoms on that bigcell. - // The second number represents the cumulative number of atoms up to that bigcell. - Cuda_Mem_Wrapper atoms_num_info(2 * nbzp, num_streams, true); - Cuda_Mem_Wrapper vldr3(nbzp * gridt.bxyz, num_streams, true); - - Cuda_Mem_Wrapper psi(max_phi_per_z, num_streams, false); - Cuda_Mem_Wrapper psi_vldr3(max_phi_per_z, num_streams, false); - - Cuda_Mem_Wrapper gemm_m(max_atompair_per_z, num_streams, true); - Cuda_Mem_Wrapper gemm_n(max_atompair_per_z, num_streams, true); - Cuda_Mem_Wrapper gemm_k(max_atompair_per_z, num_streams, true); - Cuda_Mem_Wrapper gemm_lda(max_atompair_per_z, num_streams, true); - Cuda_Mem_Wrapper gemm_ldb(max_atompair_per_z, num_streams, true); - Cuda_Mem_Wrapper gemm_ldc(max_atompair_per_z, num_streams, true); - Cuda_Mem_Wrapper gemm_A(max_atompair_per_z, num_streams, true); - Cuda_Mem_Wrapper gemm_B(max_atompair_per_z, num_streams, true); - Cuda_Mem_Wrapper gemm_C(max_atompair_per_z, num_streams, true); - -#ifdef _OPENMP -const int max_thread_num = std::min(omp_get_max_threads(), num_streams); -#endif -#pragma omp parallel num_threads(max_thread_num) -{ -#ifdef _OPENMP - const int tid = omp_get_thread_num(); - const int num_threads = omp_get_num_threads(); - const int sid_start = tid * num_streams / num_threads; - const int thread_num_streams = tid == num_threads - 1 ? num_streams - sid_start : num_streams / num_threads; -#else - const int sid_start = 0; - const int thread_num_streams = num_streams; -#endif -#pragma omp for collapse(2) schedule(dynamic) - for (int i = 0; i < gridt.nbx; i++) - { - for (int j = 0; j < gridt.nby; j++) - { - // 20240620 Note that it must be set again here because - // cuda's device is not safe in a multi-threaded environment. - checkCuda(cudaSetDevice(gridt.dev_id)); - - const int sid = (i * gridt.nby + j) % thread_num_streams + sid_start; - checkCuda(cudaEventSynchronize(events[sid])); - int max_m = 0; - int max_n = 0; - int atom_pair_num = 0; - int atoms_per_z = 0; - const int grid_index_ij = i * gridt.nby * nbzp + j * nbzp; - - gtask_vlocal(gridt, - ucell, - grid_index_ij, - nczp, - vfactor, - vlocal, - atoms_per_z, - atoms_num_info.get_host_pointer(sid), - atoms_type.get_host_pointer(sid), - dr_part.get_host_pointer(sid), - vldr3.get_host_pointer(sid)); - - alloc_mult_vlocal(hRGint, - gridt, - ucell, - grid_index_ij, - max_atom, - psi.get_device_pointer(sid), - psi_vldr3.get_device_pointer(sid), - grid_vlocal_g.get_device_pointer(), - gemm_m.get_host_pointer(sid), - gemm_n.get_host_pointer(sid), - gemm_k.get_host_pointer(sid), - gemm_lda.get_host_pointer(sid), - gemm_ldb.get_host_pointer(sid), - gemm_ldc.get_host_pointer(sid), - gemm_A.get_host_pointer(sid), - gemm_B.get_host_pointer(sid), - gemm_C.get_host_pointer(sid), - atom_pair_num, - max_m, - max_n); - - dr_part.copy_host_to_device_async(streams[sid], sid, atoms_per_z * 3); - atoms_type.copy_host_to_device_async(streams[sid], sid, atoms_per_z); - vldr3.copy_host_to_device_async(streams[sid], sid); - atoms_num_info.copy_host_to_device_async(streams[sid], sid, 2 * nbzp); - - gemm_m.copy_host_to_device_async(streams[sid], sid, atom_pair_num); - gemm_n.copy_host_to_device_async(streams[sid], sid, atom_pair_num); - gemm_k.copy_host_to_device_async(streams[sid], sid, atom_pair_num); - gemm_lda.copy_host_to_device_async(streams[sid], sid, atom_pair_num); - gemm_ldb.copy_host_to_device_async(streams[sid], sid, atom_pair_num); - gemm_ldc.copy_host_to_device_async(streams[sid], sid, atom_pair_num); - gemm_A.copy_host_to_device_async(streams[sid], sid, atom_pair_num); - gemm_B.copy_host_to_device_async(streams[sid], sid, atom_pair_num); - gemm_C.copy_host_to_device_async(streams[sid], sid, atom_pair_num); - checkCuda(cudaEventRecord(events[sid], streams[sid])); - - psi.memset_device_async(streams[sid], sid, 0); - psi_vldr3.memset_device_async(streams[sid], sid, 0); - - dim3 grid_psi(nbzp, gridt.bxyz); - dim3 block_psi(64); - get_psi_and_vldr3<<>>( - gridt.ylmcoef_g, - dr, - gridt.bxyz, - ucell.nwmax, - max_atom, - gridt.atom_nwl_g, - gridt.atom_new_g, - gridt.atom_ylm_g, - gridt.atom_nw_g, - gridt.rcut_g, - gridt.nr_max, - gridt.psi_u_g, - gridt.mcell_pos_g, - dr_part.get_device_pointer(sid), - vldr3.get_device_pointer(sid), - atoms_type.get_device_pointer(sid), - atoms_num_info.get_device_pointer(sid), - psi.get_device_pointer(sid), - psi_vldr3.get_device_pointer(sid)); - checkCudaLastError(); - - gridt.fastest_matrix_mul(max_m, - max_n, - gemm_m.get_device_pointer(sid), - gemm_n.get_device_pointer(sid), - gemm_k.get_device_pointer(sid), - gemm_A.get_device_pointer(sid), - gemm_lda.get_device_pointer(sid), - gemm_B.get_device_pointer(sid), - gemm_ldb.get_device_pointer(sid), - gemm_C.get_device_pointer(sid), - gemm_ldc.get_device_pointer(sid), - atom_pair_num, - streams[sid], - nullptr); - checkCudaLastError(); - } - } -} - - checkCuda(cudaMemcpy( - hRGint->get_wrapper(), - grid_vlocal_g.get_device_pointer(), - nnrg * sizeof(double), - cudaMemcpyDeviceToHost)); - - for (int i = 0; i < num_streams; i++) - { - checkCuda(cudaStreamDestroy(streams[i])); - checkCuda(cudaEventDestroy(events[i])); - } -} - -} // namespace GintKernel \ No newline at end of file diff --git a/source/source_lcao/module_gint/gint_vl_gpu.h b/source/source_lcao/module_gint/gint_vl_gpu.h index a04b6a130d..a671b6b33a 100644 --- a/source/source_lcao/module_gint/gint_vl_gpu.h +++ b/source/source_lcao/module_gint/gint_vl_gpu.h @@ -1,53 +1,49 @@ -#ifndef GINT_VL_GPU_H -#define GINT_VL_GPU_H +#pragma once +#include +#include +#include "source_lcao/module_hcontainer/hcontainer.h" #include "gint.h" -#include "grid_technique.h" -#include "kernels/cuda/cuda_tools.cuh" +#include "gint_info.h" +#include "source_lcao/module_gint/kernel/cuda_mem_wrapper.h" -namespace GintKernel +namespace ModuleGint { -void gint_vl_gpu(hamilt::HContainer* hRGint, - const double* vlocal, - const double* ylmcoef_now, - const double dr, - const double* rcut, - const Grid_Technique& gridt, - const UnitCell& ucell); - -void gtask_vlocal(const Grid_Technique& gridt, - const UnitCell& ucell, - const int grid_index_ij, - const int nczp, - const double vfactor, - const double* vlocal_global_value, - int& atoms_per_z, - int* atoms_num_info, - uint8_t* atoms_type, - double* dr_part, - double* vldr3); - -void alloc_mult_vlocal(const hamilt::HContainer* hRGint, - const Grid_Technique& gridt, - const UnitCell& ucell, - const int grid_index_ij, - const int max_atom, - double* const psi, - double* const psi_vldr3, - double* const grid_vlocal_g, - int* mat_m, - int* mat_n, - int* mat_k, - int* mat_lda, - int* mat_ldb, - int* mat_ldc, - double** mat_A, - double** mat_B, - double** mat_C, - int& atom_pair_num, - int& max_m, - int& max_n); -} // namespace GintKernel - -#endif \ No newline at end of file +class Gint_vl_gpu : public Gint +{ + public: + Gint_vl_gpu( + const double* vr_eff, + HContainer* hR) + : vr_eff_(vr_eff), hR_(hR), dr3_(gint_info_->get_mgrid_volume()) {} + + void cal_gint(); + + private: + + void init_hr_gint_(); + + void transfer_cpu_to_gpu_(); + + void transfer_gpu_to_cpu_(); + + void cal_hr_gint_(); + + // input + const double* vr_eff_; + + + // output + HContainer* hR_; + + // Intermediate variables + double dr3_; + + HContainer hr_gint_; + + CudaMemWrapper hr_gint_d_; + CudaMemWrapper vr_eff_d_; +}; + +} \ No newline at end of file diff --git a/source/source_lcao/module_gint/temp_gint/gint_vl_metagga.cpp b/source/source_lcao/module_gint/gint_vl_metagga.cpp similarity index 100% rename from source/source_lcao/module_gint/temp_gint/gint_vl_metagga.cpp rename to source/source_lcao/module_gint/gint_vl_metagga.cpp diff --git a/source/source_lcao/module_gint/temp_gint/gint_vl_metagga.h b/source/source_lcao/module_gint/gint_vl_metagga.h similarity index 100% rename from source/source_lcao/module_gint/temp_gint/gint_vl_metagga.h rename to source/source_lcao/module_gint/gint_vl_metagga.h diff --git a/source/source_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.cpp b/source/source_lcao/module_gint/gint_vl_metagga_gpu.cpp similarity index 100% rename from source/source_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.cpp rename to source/source_lcao/module_gint/gint_vl_metagga_gpu.cpp diff --git a/source/source_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.h b/source/source_lcao/module_gint/gint_vl_metagga_gpu.h similarity index 93% rename from source/source_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.h rename to source/source_lcao/module_gint/gint_vl_metagga_gpu.h index aabae7e52f..f55c409c66 100644 --- a/source/source_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.h +++ b/source/source_lcao/module_gint/gint_vl_metagga_gpu.h @@ -5,7 +5,7 @@ #include "source_lcao/module_hcontainer/hcontainer.h" #include "gint.h" #include "gint_info.h" -#include "source_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h" +#include "source_lcao/module_gint/kernel/cuda_mem_wrapper.h" namespace ModuleGint { diff --git a/source/source_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4.cpp b/source/source_lcao/module_gint/gint_vl_metagga_nspin4.cpp similarity index 100% rename from source/source_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4.cpp rename to source/source_lcao/module_gint/gint_vl_metagga_nspin4.cpp diff --git a/source/source_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4.h b/source/source_lcao/module_gint/gint_vl_metagga_nspin4.h similarity index 100% rename from source/source_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4.h rename to source/source_lcao/module_gint/gint_vl_metagga_nspin4.h diff --git a/source/source_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.cpp b/source/source_lcao/module_gint/gint_vl_metagga_nspin4_gpu.cpp similarity index 100% rename from source/source_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.cpp rename to source/source_lcao/module_gint/gint_vl_metagga_nspin4_gpu.cpp diff --git a/source/source_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.h b/source/source_lcao/module_gint/gint_vl_metagga_nspin4_gpu.h similarity index 93% rename from source/source_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.h rename to source/source_lcao/module_gint/gint_vl_metagga_nspin4_gpu.h index c5f6f7c729..9c1b8ca166 100644 --- a/source/source_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.h +++ b/source/source_lcao/module_gint/gint_vl_metagga_nspin4_gpu.h @@ -5,7 +5,7 @@ #include "source_lcao/module_hcontainer/hcontainer.h" #include "gint.h" #include "gint_info.h" -#include "source_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h" +#include "source_lcao/module_gint/kernel/cuda_mem_wrapper.h" namespace ModuleGint { diff --git a/source/source_lcao/module_gint/temp_gint/gint_vl_nspin4.cpp b/source/source_lcao/module_gint/gint_vl_nspin4.cpp similarity index 100% rename from source/source_lcao/module_gint/temp_gint/gint_vl_nspin4.cpp rename to source/source_lcao/module_gint/gint_vl_nspin4.cpp diff --git a/source/source_lcao/module_gint/temp_gint/gint_vl_nspin4.h b/source/source_lcao/module_gint/gint_vl_nspin4.h similarity index 100% rename from source/source_lcao/module_gint/temp_gint/gint_vl_nspin4.h rename to source/source_lcao/module_gint/gint_vl_nspin4.h diff --git a/source/source_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.cpp b/source/source_lcao/module_gint/gint_vl_nspin4_gpu.cpp similarity index 100% rename from source/source_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.cpp rename to source/source_lcao/module_gint/gint_vl_nspin4_gpu.cpp diff --git a/source/source_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.h b/source/source_lcao/module_gint/gint_vl_nspin4_gpu.h similarity index 94% rename from source/source_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.h rename to source/source_lcao/module_gint/gint_vl_nspin4_gpu.h index 6d17a9a1bb..2e1aa1a475 100644 --- a/source/source_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.h +++ b/source/source_lcao/module_gint/gint_vl_nspin4_gpu.h @@ -5,7 +5,7 @@ #include "source_lcao/module_hcontainer/hcontainer.h" #include "gint.h" #include "gint_info.h" -#include "source_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h" +#include "source_lcao/module_gint/kernel/cuda_mem_wrapper.h" namespace ModuleGint { diff --git a/source/source_lcao/module_gint/gint_vl_old.cpp b/source/source_lcao/module_gint/gint_vl_old.cpp deleted file mode 100644 index 9ebc341d7f..0000000000 --- a/source/source_lcao/module_gint/gint_vl_old.cpp +++ /dev/null @@ -1,94 +0,0 @@ -#include "source_base/global_function.h" -#include "source_base/global_variable.h" -#include "gint_k.h" -#include "source_basis/module_ao/ORB_read.h" -#include "grid_technique.h" -#include "source_base/ylm.h" -#include "source_pw/module_pwdft/global.h" -#include "source_base/timer.h" -#include "source_base/array_pool.h" -#include "source_base/vector3.h" -//#include - -#ifdef _OPENMP -#include -#endif - -#ifdef __MKL -#include -#endif - -// this is a thread-safe function -void Gint::cal_meshball_vlocal( - const int na_grid, // how many atoms on this (i,j,k) grid - const int LD_pool, - const int*const block_size, // block_size[na_grid], number of columns of a band - const int*const block_index, // block_index[na_grid+1], count total number of atomis orbitals - const int grid_index, // index of grid group, for tracing global atom index - const bool*const*const cal_flag, // cal_flag[this->bxyz][na_grid], whether the atom-grid distance is larger than cutoff - const double*const*const psir_ylm, // psir_ylm[this->bxyz][LD_pool] - const double*const*const psir_vlbr3, // psir_vlbr3[this->bxyz][LD_pool] - hamilt::HContainer* hR) // this->hRGint is the container of matrix element. -{ - const char transa='N', transb='T'; - const double alpha=1, beta=1; - const int lgd_now = this->gridt->lgd; - - const int mcell_index = this->gridt->bcell_start[grid_index]; - std::vector hr_tmp; - for(int ia1=0; ia1gridt->which_atom[bcell1]; - const int id1 = this->gridt->which_unitcell[bcell1]; - const ModuleBase::Vector3 r1 = this->gridt->get_ucell_coords(id1); - - for(int ia2=0; ia2gridt->which_atom[bcell2]; - const int id2 = this->gridt->which_unitcell[bcell2]; - const ModuleBase::Vector3 r2 = this->gridt->get_ucell_coords(id2); - - if(iat1<=iat2) - { - int first_ib=0; - for(int ib=0; ibbxyz; ++ib) - { - if(cal_flag[ib][ia1] && cal_flag[ib][ia2]) - { - first_ib=ib; - break; - } - } - int last_ib=0; - for(int ib=this->bxyz-1; ib>=0; --ib) - { - if(cal_flag[ib][ia1] && cal_flag[ib][ia2]) - { - last_ib=ib+1; - break; - } - } - const int ib_length = last_ib-first_ib; - if(ib_length<=0) { continue; } - - const auto tmp_matrix = hR->find_matrix(iat1, iat2, r1-r2); - if (tmp_matrix == nullptr) - { - continue; - } - const int m = tmp_matrix->get_row_size(); - const int n = tmp_matrix->get_col_size(); - hr_tmp.resize(m * n); - ModuleBase::GlobalFunc::ZEROS(hr_tmp.data(), m*n); - - dgemm_(&transa, &transb, &n, &m, &ib_length, &alpha, - &psir_vlbr3[first_ib][block_index[ia2]], &LD_pool, - &psir_ylm[first_ib][block_index[ia1]], &LD_pool, - &beta, hr_tmp.data(), &n); - tmp_matrix->add_array_ts(hr_tmp.data()); - } - } - } -} \ No newline at end of file diff --git a/source/source_lcao/module_gint/grid_bigcell.cpp b/source/source_lcao/module_gint/grid_bigcell.cpp deleted file mode 100644 index ec5b29970c..0000000000 --- a/source/source_lcao/module_gint/grid_bigcell.cpp +++ /dev/null @@ -1,363 +0,0 @@ -#include "grid_bigcell.h" - -#include "source_io/module_parameter/parameter.h" -#include "source_base/memory.h" -#include "source_base/timer.h" -#include "source_basis/module_ao/ORB_read.h" -#include "source_pw/module_pwdft/global.h" -#include "source_cell/unitcell.h" -Grid_BigCell::Grid_BigCell() -{ - this->orbital_rmax = 0.0; - this->nxe = this->nye = this->nze = 0; - this->dxe = 0; - this->dye = 0; - this->dze = 0; - this->nxe = 0; - this->nye = 0; - this->nze = 0; - this->nxyze = 0; -} - -Grid_BigCell::~Grid_BigCell() -{ -} - -void Grid_BigCell::init_big_latvec(const UnitCell& ucell) -{ - ModuleBase::TITLE("Grid_BigCell","init_big_latvec"); - // initialize the mesh cell vectors. - assert(nbx>0); - assert(nby>0); - assert(nbz>=0); - - this->nat=ucell.nat; - //size of each big room (same shape with unitcell) - this->bigcell_vec1=std::vector(3,0.0); - this->bigcell_vec1[0]=ucell.a1.x / (double)nbx * ucell.lat0; - this->bigcell_vec1[1]=ucell.a1.y / (double)nbx * ucell.lat0; - this->bigcell_vec1[2]=ucell.a1.z / (double)nbx * ucell.lat0; - - this->bigcell_vec2=std::vector(3,0.0); - this->bigcell_vec2[0]=ucell.a2.x / (double)nby * ucell.lat0; - this->bigcell_vec2[1]=ucell.a2.y / (double)nby * ucell.lat0; - this->bigcell_vec2[2]=ucell.a2.z / (double)nby * ucell.lat0; - - this->bigcell_vec3=std::vector(3,0.0); - this->bigcell_vec3[0]=ucell.a3.x / (double)nbz * ucell.lat0; - this->bigcell_vec3[1]=ucell.a3.y / (double)nbz * ucell.lat0; - this->bigcell_vec3[2]=ucell.a3.z / (double)nbz * ucell.lat0; - - this->bigcell_latvec0.e11 = this->bigcell_vec1[0]; - this->bigcell_latvec0.e12 = this->bigcell_vec1[1]; - this->bigcell_latvec0.e13 = this->bigcell_vec1[2]; - - this->bigcell_latvec0.e21 = this->bigcell_vec2[0]; - this->bigcell_latvec0.e22 = this->bigcell_vec2[1]; - this->bigcell_latvec0.e23 = this->bigcell_vec2[2]; - - this->bigcell_latvec0.e31 = this->bigcell_vec3[0]; - this->bigcell_latvec0.e32 = this->bigcell_vec3[1]; - this->bigcell_latvec0.e33 = this->bigcell_vec3[2]; - - // why we need GT = bigcell_latvec0^(-1)? - // note that (i,j,k) is a grid point. - // (x,y,z) is the cartesian coordinates. - // because - // (x,y,z) = (i,j,k) * bigcell_latvec0 - // once we know (x,y,z) and bigcell_latvec0 - // we need to transform the formula to - // (x,y,z) * bigcell_latvec0^(-1) = (i,j,k) - this->bigcell_GT = this->bigcell_latvec0.Inverse(); - - if(PARAM.inp.test_gridt) - { - GlobalV::ofs_running << " the VECTORS of BIGCELL are (Bohr): " << std::endl; - GlobalV::ofs_running << " vec1( " - << std::setw(15) << bigcell_vec1[0] - << std::setw(15) << bigcell_vec1[1] - << std::setw(15) << bigcell_vec1[2] - << ")" << std::endl; - - GlobalV::ofs_running << " vec2( " - << std::setw(15) << bigcell_vec2[0] - << std::setw(15) << bigcell_vec2[1] - << std::setw(15) << bigcell_vec2[2] - << ")" << std::endl; - - GlobalV::ofs_running << " vec3( " - << std::setw(15) << bigcell_vec3[0] - << std::setw(15) << bigcell_vec3[1] - << std::setw(15) << bigcell_vec3[2] - << ")" << std::endl; - } - return; -} - - -void Grid_BigCell::init_grid_expansion(const UnitCell& ucell,double* rcut) -{ - ModuleBase::TITLE("Grid_BigCell","init_grid_expansion"); - - // calculate the max cutoff radius among all orbitals. - // then we will use this parameter to generate grid expansion. - - for(int T=0; Torbital_rmax = std::max( rcut[T], this->orbital_rmax); - } - if(PARAM.inp.test_gridt)ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running,"rmax of periodic grid (bohr)",orbital_rmax); - - // mohan fixed serious bug 2010-03-06 - // G = GT^T - // g1 = the norm of first std::vector of G - // g2 = the norm of second std::vector of G - // g3 = the norm of third std::vector of G - double g1 = sqrt(bigcell_GT.e11 * bigcell_GT.e11 - + bigcell_GT.e21 * bigcell_GT.e21 - + bigcell_GT.e31 * bigcell_GT.e31); - - double g2 = sqrt(bigcell_GT.e12 * bigcell_GT.e12 - + bigcell_GT.e22 * bigcell_GT.e22 - + bigcell_GT.e32 * bigcell_GT.e32); - - double g3 = sqrt(bigcell_GT.e13 * bigcell_GT.e13 - + bigcell_GT.e23 * bigcell_GT.e23 - + bigcell_GT.e33 * bigcell_GT.e33); - - // we assume the added bigcell can present even the atom - // is at the edge of the origin grid. - // mohan add +1, 2011-04-23 - this->dxe = static_cast( this->orbital_rmax * g1) +1; - this->dye = static_cast( this->orbital_rmax * g2) +1; - this->dze = static_cast( this->orbital_rmax * g3) +1; - //xiaohui add 'PARAM.inp.out_level' line, 2015-09-16 - if(PARAM.inp.out_level != "m") ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running,"extended fft grid",dxe,dye,dze); - - // calculate the dimension of expanded grid. - // +1 in order to cover the spillage atom on the right side. - assert(nbx>0); - assert(nby>0); - assert(nbz>=0); - - this->nxe = nbx + 2*dxe +1; - this->nye = nby + 2*dye +1; - this->nze = nbz + 2*dze +1; - this->nxyze = this->nxe * this->nye * this->nze; - - if(PARAM.inp.out_level != "m") ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running,"dimension of extened grid",nxe,nye,nze); - return; -} - - -void Grid_BigCell::init_tau_in_bigcell(const UnitCell& ucell) -{ - ModuleBase::TITLE("Grid_BigCell","init_tau_in_bigcell"); - - // allcoate space for atom positions relative - // to meshcell. - this->tau_in_bigcell = std::vector>(ucell.nat,std::vector(3,0.0)); - ModuleBase::Memory::record("tau_in_bigcell", sizeof(double) * ucell.nat*3); - // allocate space, these arrays record which meshcell - // the atom is in. - this->index_atom = std::vector(ucell.nat, 0); - ModuleBase::Memory::record("index_atom", sizeof(double) * ucell.nat); - - // get the fraction number of (i,j,k) - ModuleBase::Vector3 fraction; - int iat=0; - int ii,jj,kk; - double delta[3]; - for(int it=0; itbigcell_GT; - - //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - // mohan add 2012-07-03, - // this can make sure faction are always larger than 0. - //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - fraction.x = ucell.atoms[it].taud[ia].x / (1.0/(double)nbx); - fraction.y = ucell.atoms[it].taud[ia].y / (1.0/(double)nby); - fraction.z = ucell.atoms[it].taud[ia].z / (1.0/(double)nbz); - - // never use the following, especially for k-algorithm, - // it may move the atom to a cell that it doesn't belong - // to - //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - // mohan add 2012-06-07 - // fraction may be very very small, about -1.0e-15, - // and the fraction must > 0, so I use periodic boundary condition -// if( fraction.x < 0.0 ) fraction.x += nxe; -// if( fraction.y < 0.0 ) fraction.y += nye; -// if( fraction.z < 0.0 ) fraction.z += nze; - - - - if( fraction.x < 0 || fraction.y < 0 || fraction.z < 0) - { - std::cout << " Atom positions " << std::endl; - std::cout << ucell.atoms[it].tau[ia].x << " " ; - std::cout << ucell.atoms[it].tau[ia].y << " " ; - std::cout << ucell.atoms[it].tau[ia].z << " " ; - std::cout << " fraction " << std::endl; - std::cout << fraction.x << " "; - std::cout << fraction.y << " "; - std::cout << fraction.z << " "; - std::cout << std::endl; - ModuleBase::WARNING_QUIT("Grid_BigCell::init_tau_in_bigcell","fraction.x<0 || fraction.y<0 || fraction.z<0"); - } - - assert(fraction.x >= 0.0); - assert(fraction.y >= 0.0); - assert(fraction.z >= 0.0); - - // make clean which meshcell the atom is in. - ii = static_cast(fraction.x+1.0e-8); - jj = static_cast(fraction.y+1.0e-8); - kk = static_cast(fraction.z+1.0e-8); - - // calculate the index of each corresponding meshcell. - // Notice ! In fact, we need to minus ii,jj,kk by 1. - // to label the atom belong to which meshcell - // in a usual way: left, down corner. - // if we dont' do this, means the start position - // of atom is another tyep: right,up corner. - // which cause minus atom position in grid integration. - - // index_atom: atom 'iat' index in extended grid. - this->index_atom[iat] = (kk+dze) + (jj+dye) * this->nze + (ii+dxe) * this->nye * this->nze; - - /* - if(index_atom[iat]==3483935) - { - std::cout << "\n i=" << kk+dze << " j=" << jj+dye << " k=" << ii+dxe; - BLOCK_HERE("check index atom"); - } - */ - - // get the relative position in direct coordinate. - delta[0] = fraction.x - (double)ii; - delta[1] = fraction.y - (double)jj; - delta[2] = fraction.z - (double)kk; - - if( std::abs(delta[0]) < 1.0e-8) delta[0] = 0.0; - if( std::abs(delta[1]) < 1.0e-8) delta[1] = 0.0; - if( std::abs(delta[2]) < 1.0e-8) delta[2] = 0.0; - -// std::cout << " fraction=" << fraction.x << " " << fraction.y << " " << fraction.z << std::endl; -// std::cout << " delta=" << delta[0] << " " << delta[1] << " " << delta[2] << std::endl; - - // get the true relative cartesian coordinate of each atom to the coresponding - // meshcell. - for(int ic=0; ic<3; ic++) - { - this->tau_in_bigcell[iat][ic] = - delta[0] * this->bigcell_vec1[ic] + - delta[1] * this->bigcell_vec2[ic] + - delta[2] * this->bigcell_vec3[ic]; - } - - ++iat; - } - } - - return; -} - -// (3) -// if f2normal == true, calculate the index2normal. -// if f2normal == false, calculate the index2cell. -void Grid_BigCell::grid_expansion_index(bool f2normal, int *target)const -{ - ModuleBase::TITLE("Grid_BigCell","grid_expansion_index"); - ModuleBase::timer::tick("Grid_BigCell","grid_expansion_index"); - - int ii,jj,kk,in_ext,in_normal; - for(int i=0; inxe; i++) - { - for(int j=0; jnye; j++) - { - for(int k=0; knze; k++) - { - in_ext = k + j * this->nze + i * this->nye * this->nze; - - // range from [-dxe,ncx+dxe] - ii = i - this->dxe; - jj = j - this->dye; - kk = k - this->dze; - - //--------------------------------------------------- - // mohan add 2010-10-28 - // be careful of the box. - // it's useful only when k points are used in LCAO. - // for example, we construct a 2D supercell - // and using 32 * 32 FFT grid (bigcell ) to do - // grid integration, - // then the first cell (0,0) along x is [0,31) - // others are: - // cell index: (-2,0) , (-1,0) , (0,0), (0,1) - // fft index: [-64,-33], [-32,-1], [0,31], [32,63]. - // look at the formulas below, - // at first, we take grid_index2ucell1=(ii/nbx) - // but then we found it is wrong if ii < 0. - // for example, if ii is -31, the box is -1, - // so we add -1, the formula turns to ii/nbx-1, - // but if ii is -32, the box is -1-1 = -2, not correct. - // so we add 1 to ii, the box will be -31/32-1=-1, correct! - // the formula is (ii+1)/nbx-1, - // if ii is -1, the box is still -1, correct! - // if ii is -33, the box is -2, correct! - //--------------------------------------------------- - - int cel1, cel2, cel3; - - if(ii<0) cel1 = (ii+1) / nbx - 1; - else cel1 = ii / nbx; - if(jj<0) cel2 = (jj+1) / nby - 1; - else cel2 = jj / nby; - if(kk<0) cel3 = (kk+1) / nbz - 1; - else cel3 = kk / nbz; - - if(!f2normal) - { - // target: index2ucell - target[in_ext] = this->cal_Rindex(cel1, cel2, cel3); - } - else - { - // if ii < 0, we need to make ii > 0. - // so we add 10000 layers. It should be enough. - // ii, jj, kk shoudl -- ????????????? - ii = (ii + 10000 * nbx) % nbx; - jj = (jj + 10000 * nby) % nby; - kk = (kk + 10000 * nbz) % nbz; - - assert(ii>=0); - assert(jj>=0); - assert(kk>=0); - - assert( in_ext < nxyze); - - if(ii> tau_in_bigcell; - - /// move operator for the next ESolver to directly use its infomation - Grid_BigCell& operator=(Grid_BigCell&& rhs) = default; - - protected: - // get the max radius of all orbitals - // which will use to generate grid expansion, - // and the meshball. - double orbital_rmax; - - // the added number of bigcelli each direction. - int dxe; - int dye; - int dze; - - // expansion grid dimension. - int nxe; - int nye; - int nze; - int nxyze; - - std::vector index_atom; - - // save the position of base vector of bigcell. - std::vector bigcell_vec1; - std::vector bigcell_vec2; - std::vector bigcell_vec3; - - ModuleBase::Matrix3 bigcell_latvec0; - ModuleBase::Matrix3 bigcell_GT; - - //--------------------------------- - void grid_expansion_index(bool f2normal, int *target)const; - //--------------------------------- - void init_big_latvec(const UnitCell &ucell); - //--------------------------------- - void init_tau_in_bigcell(const UnitCell& ucell); - //--------------------------------- - void init_grid_expansion(const UnitCell& ucell,double* rcut); -}; -#endif diff --git a/source/source_lcao/module_gint/grid_meshball.cpp b/source/source_lcao/module_gint/grid_meshball.cpp deleted file mode 100644 index 464ea8d962..0000000000 --- a/source/source_lcao/module_gint/grid_meshball.cpp +++ /dev/null @@ -1,142 +0,0 @@ -#include "grid_meshball.h" -#include "source_base/memory.h" -#include "source_io/module_parameter/parameter.h" - -Grid_MeshBall::Grid_MeshBall() -{ -} - -Grid_MeshBall::~Grid_MeshBall() -{ -} - -void Grid_MeshBall::init_meshball() -{ - ModuleBase::TITLE("Grid_MeshBall","init_meshball"); - - // init meshball_radius, generally the value - // is same as orbital_rmax, of course you can - // incrase meshball_radius, but there will be - // no atoms in the added bigcells. - // (in case subcell are too many). - this->meshball_radius = this->orbital_rmax; - - // select a ball in a cubic. - double pos[3]; - double r2=0.0; - - //------------------------------------------------------------------ - // const double rcut2 = this->meshball_radius * this->meshball_radius; - // qianrui fix a bug and add 0.001 2022-4-30 - // Sometimes r2 is equal to rcut2, for example they are 36. - // However, r2 is either 35.99.. or 36.0..001, which makes count != this->meshball_ncells - // and segment fault. - // I do not know how to solve it and this may occurs in somewhere else in ABACUS. - // May some genius can give a better solution. - //------------------------------------------------------------------ - const double rcut2 = this->meshball_radius * this->meshball_radius + 0.001; - - //------------------------------------------------------------------- - // calculate twice, the first time find the number of mesh points, - // then allocate array and save each bigcell's cartesian coordinate. - // plus one because we need to cover atom spillage. - // meshball_ncells: How many cells in mesh ball. - //------------------------------------------------------------------- - this->meshball_ncells = 0; - for(int i=-dxe; ideal_with_atom_spillage( pos ); - //r2 = pos[0]*pos[0]+pos[1]*pos[1]+pos[2]*pos[2]; - - // calculate the distance. - if( r2 < rcut2 ) - { - ++meshball_ncells; - } - } - } - } - if(PARAM.inp.test_gridt) {ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running, "how many cells in meshball",this->meshball_ncells); -} - - // prepare for the second calculation. - this->meshball_positions = std::vector>(meshball_ncells, std::vector(3, 0.0)); - ModuleBase::Memory::record("meshball_pos", sizeof(double) * meshball_ncells*3); - this->index_ball = std::vector(meshball_ncells); - ModuleBase::Memory::record("index_ball", sizeof(int) * meshball_ncells); - - // second time. - int count = 0; - for(int i=-dxe; idxe+1; i++) - { - for(int j=-dye; jdye+1; j++) - { - for(int k=-dze; kdze+1; k++) - { - // caclculate the std::vector away from 'zero point'. - // change to cartesian coordinates. - for(int ip=0; ip<3; ip++) - { - pos[ip] = i*bigcell_vec1[ip]+j*bigcell_vec2[ip]+k*bigcell_vec3[ip]; - } - r2 = this->deal_with_atom_spillage( pos ); - - // calculate the distance. - if( r2 < rcut2 ) - { - for(int ip=0; ip<3; ip++) - { - this->meshball_positions[count][ip] = pos[ip]; - } - - // record each position. - this->index_ball[count] = k + j * this->nze + i * this->nye * this->nze; - ++count; - } - } - } - } - - assert(count == this->meshball_ncells); - return; -} - -double Grid_MeshBall::deal_with_atom_spillage(const double *pos) -{ - double dx; - double r2 = 100000; - double *cell=new double[3]; - - for(int i=-1; i<=1; i++) - { - for(int j=-1; j<=1; j++) - { - for(int k=-1; k<=1; k++) - { - dx = 0.0; - for(int ip=0; ip<3; ip++) - { - // change to cartesian coordinates. - cell[ip] = i*this->bigcell_vec1[ip] + - j*this->bigcell_vec2[ip] + - k*this->bigcell_vec3[ip]; - dx += (cell[ip] - pos[ip]) * (cell[ip] - pos[ip]); - } - r2 = std::min(dx, r2); - } - } - } - delete[] cell; - return r2; -} - - diff --git a/source/source_lcao/module_gint/grid_meshball.h b/source/source_lcao/module_gint/grid_meshball.h deleted file mode 100644 index 571d59126e..0000000000 --- a/source/source_lcao/module_gint/grid_meshball.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef GRID_MESHBALL_H -#define GRID_MESHBALL_H - -#include "grid_bigcell.h" - -class Grid_MeshBall : public Grid_BigCell -{ - public: - Grid_MeshBall(); - ~Grid_MeshBall(); - // cartesian coordinates of meshball. - std::vector> meshball_positions; - - /// move operator for the next ESolver to directly use its infomation - Grid_MeshBall& operator=(Grid_MeshBall&& rhs) = default; - - protected: - // number of meshcells in meshball. - int meshball_ncells=0; - // used in index2normal - std::vector index_ball; - // search each meshcell of this meshball. - void init_meshball(void); - - private: - // init the meshball radius. - double meshball_radius=0.0; - // Handle as a truncation function. - double deal_with_atom_spillage(const double* pos); - -}; -#endif diff --git a/source/source_lcao/module_gint/grid_meshcell.cpp b/source/source_lcao/module_gint/grid_meshcell.cpp deleted file mode 100644 index 77e933c55d..0000000000 --- a/source/source_lcao/module_gint/grid_meshcell.cpp +++ /dev/null @@ -1,168 +0,0 @@ -#include "grid_meshcell.h" - -#include "source_io/module_parameter/parameter.h" -#include "source_base/memory.h" -#include "source_pw/module_pwdft/global.h" - -Grid_MeshCell::Grid_MeshCell() -{ -} - -Grid_MeshCell::~Grid_MeshCell() -{ -} - -void Grid_MeshCell::set_grid_dim( - const int &ncx_in, - const int &ncy_in, - const int &ncz_in, - const int &bx_in, - const int &by_in, - const int &bz_in, - const int &nbx_in, - const int &nby_in, - const int &nbz_in, - const int &nbxx_in, - const int &nbzp_start_in, - const int &nbzp_in - ) -{ - this->ncx = ncx_in; - this->ncy = ncy_in; - this->ncz = ncz_in; - this->ncxyz = ncx * ncy * ncz; - this->bx = bx_in; - this->by = by_in; - this->bz = bz_in; - this->bxyz = bx*by*bz; - this->nbx = nbx_in; - this->nby = nby_in; - this->nbz = nbz_in; - this->nbxyz = nbx*nby*nbz; - this->nbxx = nbxx_in; - this->nbzp_start = nbzp_start_in; - this->nbzp = nbzp_in; - - - //xiaohui add 'PARAM.inp.out_level' line, 2015-09-16 - if(PARAM.inp.out_level != "m") - { - ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running,"real space grid",ncx,ncy,ncz); // real space uniform grid - } - - if(PARAM.inp.out_level != "m") - { - ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running,"big cell numbers in grid",nbx,nby,nbz); // reduced by BIG_CELL - } - - if(PARAM.inp.out_level != "m") - { - ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running,"meshcell numbers in big cell",bx,by,bz); // is small integer, typical number 2*2*2 - } - - return; -} - - - -// (1) -void Grid_MeshCell::init_latvec(const UnitCell &ucell) -{ - ModuleBase::TITLE("Grid_MeshCell","init_latvec"); - // initialize the mesh cell vectors. - assert(ncx>0); - assert(ncy>0); - assert(ncz>0); - - //size of each room (same shape with unitcell) - this->meshcell_vec1=std::vector(3,0.0); - this->meshcell_vec1[0]=ucell.a1.x / (double)ncx * ucell.lat0; - this->meshcell_vec1[1]=ucell.a1.y / (double)ncx * ucell.lat0; - this->meshcell_vec1[2]=ucell.a1.z / (double)ncx * ucell.lat0; - - this->meshcell_vec2=std::vector(3,0.0); - this->meshcell_vec2[0]=ucell.a2.x / (double)ncy * ucell.lat0; - this->meshcell_vec2[1]=ucell.a2.y / (double)ncy * ucell.lat0; - this->meshcell_vec2[2]=ucell.a2.z / (double)ncy * ucell.lat0; - - this->meshcell_vec3=std::vector(3,0.0); - this->meshcell_vec3[0]=ucell.a3.x / (double)ncz * ucell.lat0; - this->meshcell_vec3[1]=ucell.a3.y / (double)ncz * ucell.lat0; - this->meshcell_vec3[2]=ucell.a3.z / (double)ncz * ucell.lat0; - - this->meshcell_latvec0.e11 = this->meshcell_vec1[0]; - this->meshcell_latvec0.e12 = this->meshcell_vec1[1]; - this->meshcell_latvec0.e13 = this->meshcell_vec1[2]; - - this->meshcell_latvec0.e21 = this->meshcell_vec2[0]; - this->meshcell_latvec0.e22 = this->meshcell_vec2[1]; - this->meshcell_latvec0.e23 = this->meshcell_vec2[2]; - - this->meshcell_latvec0.e31 = this->meshcell_vec3[0]; - this->meshcell_latvec0.e32 = this->meshcell_vec3[1]; - this->meshcell_latvec0.e33 = this->meshcell_vec3[2]; - - // why we need GT = meshcell_latvec0^(-1)? - // note that (i,j,k) is a grid point. - // (x,y,z) is the cartesian coordinates. - // because - // (x,y,z) = (i,j,k) * meshcell_latvec0 - // once we know (x,y,z) and meshcell_latvec0 - // we need to transform the formula to - // (x,y,z) * meshcell_latvec0^(-1) = (i,j,k) - this->meshcell_GT = this->meshcell_latvec0.Inverse(); - - if(PARAM.inp.test_gridt) - { - GlobalV::ofs_running << " the VECTORS of MESHCELL are (Bohr): " << std::endl; - GlobalV::ofs_running << " vec1( " - << std::setw(15) << meshcell_vec1[0] - << std::setw(15) << meshcell_vec1[1] - << std::setw(15) << meshcell_vec1[2] - << ")" << std::endl; - - GlobalV::ofs_running << " vec2( " - << std::setw(15) << meshcell_vec2[0] - << std::setw(15) << meshcell_vec2[1] - << std::setw(15) << meshcell_vec2[2] - << ")" << std::endl; - - GlobalV::ofs_running << " vec3( " - << std::setw(15) << meshcell_vec3[0] - << std::setw(15) << meshcell_vec3[1] - << std::setw(15) << meshcell_vec3[2] - << ")" << std::endl; - } - - return; -} - -void Grid_MeshCell::init_meshcell_pos(void) -{ - assert(bx>0); - assert(by>0); - assert(bz>0); - assert(bxyz>0); - - meshcell_pos = std::vector>(bxyz,std::vector(3,0.0)); - ModuleBase::Memory::record("meshcell_pos", sizeof(double) * bxyz*3); - - int index=0; - for(int i=0; i> meshcell_pos; - - private: - // latvec0 and GT are not used in current code. - // these two variables may be removed in the future. - ModuleBase::Matrix3 meshcell_latvec0; - ModuleBase::Matrix3 meshcell_GT; - - protected: - - std::vector meshcell_vec1; - std::vector meshcell_vec2; - std::vector meshcell_vec3; - - /// move operator for the next ESolver to directly use its infomation - Grid_MeshCell& operator=(Grid_MeshCell&& rhs) = default; - - void set_grid_dim( - const int &ncx_in, - const int &ncy_in, - const int &ncz_in, - const int &bx_in, - const int &by_in, - const int &bz_in, - const int &nbx_in, - const int &nby_in, - const int &nbz_in, - const int &nbxx_in, - const int &nbzp_start_in, - const int &nbzp_in); - - void init_latvec(const UnitCell &ucell); - void init_meshcell_pos(); - -}; - -#endif diff --git a/source/source_lcao/module_gint/grid_meshk.cpp b/source/source_lcao/module_gint/grid_meshk.cpp deleted file mode 100644 index e1451a31d8..0000000000 --- a/source/source_lcao/module_gint/grid_meshk.cpp +++ /dev/null @@ -1,101 +0,0 @@ -#include "grid_meshk.h" -#include "source_pw/module_pwdft/global.h" -#include "source_io/module_parameter/parameter.h" - -Grid_MeshK::Grid_MeshK() -{ -} - -Grid_MeshK::~Grid_MeshK() -{ -} - -int Grid_MeshK::cal_Rindex(const int &u1, const int &u2, const int &u3)const -{ - const int x1 = u1 - this->minu1; - const int x2 = u2 - this->minu2; - const int x3 = u3 - this->minu3; - - if(x1<0 || x2<0 || x3<0) - { - std::cout << " u1=" << u1 << " minu1=" << minu1 << std::endl; - std::cout << " u2=" << u2 << " minu2=" << minu2 << std::endl; - std::cout << " u3=" << u3 << " minu3=" << minu3 << std::endl; - ModuleBase::WARNING_QUIT("Grid_MeshK::cal_Rindex","x1<0 || x2<0 || x3<0 !"); - } - - assert(x1>=0); - assert(x2>=0); - assert(x3>=0); - - return (x3 + x2 * this->nu3 + x1 * this->nu2 * this->nu3); -} - -ModuleBase::Vector3 Grid_MeshK::get_ucell_coords(const int &Rindex)const -{ - const int x = ucell_index2x[Rindex]; - const int y = ucell_index2y[Rindex]; - const int z = ucell_index2z[Rindex]; - - return ModuleBase::Vector3(x, y, z); -} - -void Grid_MeshK::cal_extended_cell(const int &dxe, const int &dye, const int &dze,const int& nbx, const int& nby, const int& nbz) -{ - ModuleBase::TITLE("Grid_MeshK","cal_extended_cell"); - - //-------------------------------------- - // max and min unitcell in expaned grid. - //-------------------------------------- - this->maxu1 = dxe / nbx + 1; - this->maxu2 = dye / nby + 1; - this->maxu3 = dze / nbz + 1; - - this->minu1 = (-dxe+1) / nbx - 1; - this->minu2 = (-dye+1) / nby - 1; - this->minu3 = (-dze+1) / nbz - 1; - - if(PARAM.inp.test_gridt) {ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running,"MaxUnitcell",maxu1,maxu2,maxu3); -} - if(PARAM.inp.test_gridt) {ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running,"MinUnitcell",minu1,minu2,minu3); -} - - //-------------------------------------- - // number of unitcell in each direction. - //-------------------------------------- - this->nu1 = maxu1 - minu1 + 1; - this->nu2 = maxu2 - minu2 + 1; - this->nu3 = maxu3 - minu3 + 1; - this->nutot = nu1 * nu2 * nu3; - - if(PARAM.inp.test_gridt) {ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running,"UnitCellNumber",nu1,nu2,nu3); -} - if(PARAM.inp.out_level != "m") { ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running,"UnitCellTotal",nutot); -} - - - this->ucell_index2x = std::vector(nutot, 0); - this->ucell_index2y = std::vector(nutot, 0); - this->ucell_index2z = std::vector(nutot, 0); - - this->nutot = nu1 * nu2 * nu3; - - for(int i=minu1; i<=maxu1; i++) - { - for(int j=minu2; j<=maxu2; j++) - { - for(int k=minu3; k<=maxu3; k++) - { - const int cell = cal_Rindex(i,j,k); - assert(cellucell_index2x[cell] = i; - this->ucell_index2y[cell] = j; - this->ucell_index2z[cell] = k; - - } - } - } - - return; -} \ No newline at end of file diff --git a/source/source_lcao/module_gint/grid_meshk.h b/source/source_lcao/module_gint/grid_meshk.h deleted file mode 100644 index fb8d458bb0..0000000000 --- a/source/source_lcao/module_gint/grid_meshk.h +++ /dev/null @@ -1,48 +0,0 @@ -#ifndef GRID_MESHK_H -#define GRID_MESHK_H -#include "source_base/global_function.h" -#include "source_base/global_variable.h" -#include "source_base/vector3.h" - -class Grid_MeshK -{ - public: - Grid_MeshK(); - ~Grid_MeshK(); - - // calculate the index of unitcell. - int cal_Rindex(const int& u1, const int& u2, const int& u3)const; - - ModuleBase::Vector3 get_ucell_coords(const int& Rindex)const; - - /// move operator for the next ESolver to directly use its infomation - Grid_MeshK& operator=(Grid_MeshK&& rhs) = default; - - private: - // the max and the min unitcell. - int maxu1; - int maxu2; - int maxu3; - - int minu1; - int minu2; - int minu3; - - // the number of unitcells. - int nu1; - int nu2; - int nu3; - int nutot; - - // from 1D index to unitcell. - std::vector ucell_index2x; - std::vector ucell_index2y; - std::vector ucell_index2z; - - protected: - // calculate the extended unitcell. - void cal_extended_cell(const int &dxe, const int &dye, const int &dze, - const int& nbx, const int& nby, const int& nbz); -}; - -#endif diff --git a/source/source_lcao/module_gint/grid_technique.cpp b/source/source_lcao/module_gint/grid_technique.cpp deleted file mode 100644 index de52f4d5f9..0000000000 --- a/source/source_lcao/module_gint/grid_technique.cpp +++ /dev/null @@ -1,784 +0,0 @@ -#if ((defined __CUDA) /* || (defined __ROCM) */) -#include -#include "source_io/module_parameter/parameter.h" -#endif -#include "grid_technique.h" -#include "source_io/module_parameter/parameter.h" -#include "source_base/memory.h" -#include "source_base/parallel_reduce.h" -#include "source_base/timer.h" -#include "source_pw/module_pwdft/global.h" -#include "source_hsolver/kernels/cuda/helper_cuda.h" - -#include "source_lcao/module_gint/temp_gint/gint_helper.h" - -Grid_Technique::Grid_Technique() { -#if ((defined __CUDA) /* || (defined __ROCM) */) - if (PARAM.inp.device == "gpu") { - is_malloced = false; - } -#endif -} - -Grid_Technique::~Grid_Technique() { - -#if ((defined __CUDA) /* || (defined __ROCM) */) - if (PARAM.inp.device == "gpu") { - free_gpu_gint_variables(this->nat); - } -#endif -} - -// This function is called in esolver_ks_lcao_elec.cpp -// after the orbital information has been read, -// this function control the routinue to generate -// grid technique parameters. -void Grid_Technique::set_pbc_grid(const int& ncx_in, - const int& ncy_in, - const int& ncz_in, - const int& bx_in, - const int& by_in, - const int& bz_in, - const int& nbx_in, - const int& nby_in, - const int& nbz_in, - const int& nbxx_in, - const int& nbzp_start_in, - const int& nbzp_in, - const int& ny, - const int& nplane, - const int& startz_current, - const UnitCell& ucell, - const Grid_Driver& gd, - const double& dr_uniform, - const std::vector& rcuts, - const std::vector>& psi_u, - const std::vector>& dpsi_u, - const std::vector>& d2psi_u, - const int& num_stream) -{ - ModuleBase::TITLE("Grid_Technique", "init"); - ModuleBase::timer::tick("Grid_Technique", "init"); - - if (PARAM.inp.out_level != "m") { - GlobalV::ofs_running - << "\n SETUP EXTENDED REAL SPACE GRID FOR GRID INTEGRATION" - << std::endl; - } - this->init_malloced = true; - - // copy ucell and orb parameters - this->ucell = &ucell; - this->dr_uniform = dr_uniform; - - this->nwmax = ucell.nwmax; - this->ntype = ucell.ntype; - - this->rcuts = rcuts; - double max_cut = *std::max_element(this->rcuts.begin(), this->rcuts.end()); - this->nr_max = static_cast(1 / this->dr_uniform * max_cut) + 10; - this->psi_u = psi_u; - this->dpsi_u = dpsi_u; - this->d2psi_u = d2psi_u; - - // (1) init_meshcell cell and big cell. - this->set_grid_dim(ncx_in, - ncy_in, - ncz_in, - bx_in, - by_in, - bz_in, - nbx_in, - nby_in, - nbz_in, - nbxx_in, - nbzp_start_in, - nbzp_in); - this->init_latvec(ucell); - - this->init_big_latvec(ucell); - - this->init_meshcell_pos(); - - // (2) expand the grid - - this->init_grid_expansion(ucell, this->rcuts.data()); - - // (3) calculate the extended grid. - this->cal_extended_cell(this->dxe, - this->dye, - this->dze, - this->nbx, - this->nby, - this->nbz); - - this->init_tau_in_bigcell(ucell); - - this->init_meshball(); - - this->init_atoms_on_grid(ny, nplane, ucell); - - this->init_ijr_and_nnrg(ucell, gd); - this->cal_trace_lo(ucell); -#if ((defined __CUDA) /* || (defined __ROCM) */) - if (PARAM.inp.device == "gpu") { - this->init_gpu_gint_variables(ucell, num_stream); - } -#endif - - ModuleBase::timer::tick("Grid_Technique", "init"); - return; -} - -void Grid_Technique::get_startind(const int& ny, - const int& nplane) { - ModuleBase::TITLE("Grid_Technique", "get_startind"); - - assert(nbxx >= 0); - - // calculates start_ind, which stores the - // starting index of each bigcell - this->start_ind = std::vector(nbxx, 0); - ModuleBase::Memory::record("GT::start_ind", sizeof(int) * nbxx); - - for (int i = 0; i < nbxx; i++) { - int ibx = 0; - int iby = 0; - int ibz = 0; - - int ix = 0; - int iy = 0; - int iz = 0; - - ibx = i / (nby * nbzp); - iby = (i - ibx * nby * nbzp) / nbzp; - ibz = i % nbzp; - - ix = ibx * this->bx; - iy = iby * this->by; - iz = ibz * this->bz; - - int ind = iz + iy * nplane + ix * ny * nplane; - - start_ind[i] = ind; - } - - return; -} - -// PLEASE update this 'init_atoms_on_grid' to make -// it adapted to 'cuboid' shape of grid -// mohan add 2021-04-06 -void Grid_Technique::init_atoms_on_grid(const int& ny, - const int& nplane, - const UnitCell& ucell) { - ModuleBase::TITLE("Grid_Technique", "init_atoms_on_grid"); - - assert(nbxx >= 0); - this->get_startind(ny, nplane); - - // (1) prepare data. - // counting the number of atoms whose orbitals have - // values on the bigcell. - this->how_many_atoms = std::vector(nbxx, 0); - ModuleBase::Memory::record("GT::how_many_atoms", sizeof(int) * nbxx); - - // (2) information about gloabl grid - // and local grid. - // mohan add 2010-07-02 - std::vector ind_bigcell = std::vector(nbxyz, 0); - ModuleBase::Memory::record("GT::ind_bigcell", sizeof(int) * this->nxyze); - std::vector bigcell_on_processor = std::vector(nbxyz, 0); - ModuleBase::Memory::record("GT::bigcell_on_processor", - sizeof(char) * this->nxyze); - this->check_bigcell(ind_bigcell.data(), bigcell_on_processor.data()); - - // (3) Find the atoms using - // when doing grid integration. - this->in_this_processor = std::vector(ucell.nat, false); - ModuleBase::Memory::record("GT::in_this_processor", - sizeof(int) * this->nxyze); - - // (4) init atoms on grid - std::vector index2normal = std::vector(this->nxyze, 0); - ModuleBase::Memory::record("GT::index2normal", sizeof(int) * this->nxyze); - this->grid_expansion_index(true, index2normal.data()); - - // (5) record how many atoms on - // each local grid point (ix,iy,iz) - int nat_local = 0; - this->total_atoms_on_grid = 0; - for (int iat = 0; iat < ucell.nat; iat++) - { - const int it = ucell.iat2it[iat]; - const double rcut_square = this->rcuts[it] * this->rcuts[it]; - for (int im = 0; im < this->meshball_ncells; im++) - { - // bcell[iat]: which bcell iat atom is in. - // ball[im]: relative position of adjacent bcell. - const int normal = index2normal[this->index_atom[iat] + this->index_ball[im]]; -#ifdef __DEBUG - if (normal >= nbxyz) - { - #pragma omp critical - { - std::cout << " index_atom=" << index_atom[iat] << std::endl; - std::cout << " index_ball=" << index_ball[im] << std::endl; - std::cout << " normal=" << normal << std::endl; - std::cout << " nbxyz=" << nbxyz << std::endl; - ModuleBase::WARNING_QUIT( - "Grid_Technique::init_atoms_on_grid", - "normal >= nbxyz"); - } - } -#endif - assert(normal >= 0); - const int bcell_idx_on_proc = ind_bigcell[normal]; - if (!bigcell_on_processor[normal]) - { - continue; - } - - bool is_atom_on_bcell = false; - const double dr_x_part = this->meshball_positions[im][0] - this->tau_in_bigcell[iat][0]; - const double dr_y_part = this->meshball_positions[im][1] - this->tau_in_bigcell[iat][1]; - const double dr_z_part = this->meshball_positions[im][2] - this->tau_in_bigcell[iat][2]; - for(int imcell = 0; imcell < this -> bxyz; imcell++) - { - const double dr_x = this->meshcell_pos[imcell][0] + dr_x_part; - const double dr_y = this->meshcell_pos[imcell][1] + dr_y_part; - const double dr_z = this->meshcell_pos[imcell][2] + dr_z_part; - const double dist_square = dr_x * dr_x + dr_y * dr_y + dr_z * dr_z; - if(dist_square <= rcut_square) - { - is_atom_on_bcell = true; - break; - } - } - if(is_atom_on_bcell) - { - ++how_many_atoms[bcell_idx_on_proc]; - ++this->total_atoms_on_grid; - this->in_this_processor[iat] = true; - } - } - if (this->in_this_processor[iat]) - { - ++nat_local; - } - } - - if (PARAM.inp.test_gridt) { - ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running, - "Total_atoms_on_grid", - total_atoms_on_grid); -} - - int stop = 0; - if (total_atoms_on_grid == 0) { - GlobalV::ofs_running << " No atoms on this sub-FFT-mesh." << std::endl; - stop = 1; - } - Parallel_Reduce::reduce_all(stop); - if (stop) { - ModuleBase::WARNING("Grid_Technique::init_atoms_on_grid", - "No atom on this sub-FFT-mesh."); - } - - // calculate the trach of local ia to global iat - if (nat_local > 0) { - this->trace_iat.resize(nat_local); - for (int iat = ucell.nat - 1; iat >= 0; iat--) { - if (this->in_this_processor[iat]) { - this->trace_iat[--nat_local] = iat; - } - } - } - - // need how_many_atoms first. - this->cal_grid_integration_index(); - // bcell_start is needed. - this->init_atoms_on_grid2(index2normal.data(), ucell); - return; -} - -void Grid_Technique::check_bigcell(int* ind_bigcell, - char* bigcell_on_processor) { - // check if a given bigcell is treated on this processor - const int zstart = nbzp_start; - const int zend = nbzp + zstart; - const int nbyz = nby * nbz; - const int nz = nbzp; - - int iz_now = 0; - int ix = 0; - int iy = 0; - int iz = 0; - int ind = 0; - bool flag = false; - - for (int i = 0; i < nbxyz; i++) { - int iz_now = i % nbz; - if (iz_now < zstart || iz_now >= zend) { - flag = false; - } else { - flag = true; - ix = i / nbyz; - iy = (i - ix * nbyz) / nbz; - iz = iz_now - zstart; - ind = ix * nby * nz + iy * nz + iz; - // no need to calculate index if bigcell is - // not on this processor - } - - ind_bigcell[i] = ind; - bigcell_on_processor[i] = flag; - } - return; -} - -void Grid_Technique::init_atoms_on_grid2(const int* index2normal, - const UnitCell& ucell) { - ModuleBase::TITLE("Grid_Techinique", "init_atoms_on_grid2"); - - if (total_atoms_on_grid == 0) { - ModuleBase::WARNING("Grid_Technique::init_atoms_on_grid2", - "no atom on this sub FFT grid."); - return; - } - - std::vector index2ucell = std::vector(this->nxyze, 0); - ModuleBase::Memory::record("GT::index2ucell", sizeof(int) * this->nxyze); - this->grid_expansion_index(false, index2ucell.data()); - - std::vector ind_bigcell = std::vector(nbxyz, 0); - ModuleBase::Memory::record("GT::ind_bigcell", sizeof(int) * nbxyz); - std::vector bigcell_on_processor = std::vector(nbxyz, 0); - this->check_bigcell(ind_bigcell.data(), bigcell_on_processor.data()); - - //-------------------------------------- - // save which atom is in the bigcell,unitcell - //-------------------------------------- - assert(total_atoms_on_grid != 0); - this->which_atom = std::vector(total_atoms_on_grid, 0); - ModuleBase::Memory::record("GT::which_atom", - sizeof(int) * total_atoms_on_grid); - - this->which_bigcell = std::vector(total_atoms_on_grid, 0); - ModuleBase::Memory::record("GT::which_bigcell", - sizeof(int) * total_atoms_on_grid); - - this->which_unitcell = std::vector(total_atoms_on_grid, 0); - ModuleBase::Memory::record("GT::which_unitcell", - sizeof(int) * total_atoms_on_grid); - - // for each atom, first we need to locate which cell - // the atom is in, then we search meshball aroung this - // grid, and record each grid's atom position. - int count = 0; - this->how_many_atoms = std::vector(nbxx, 0); - ModuleBase::Memory::record("GT::how many atoms", sizeof(int) * nbxx); - std::vector coord_x(total_atoms_on_grid* bxyz, 0.0); - std::vector coords3(bxyz * 3, 0.0); - for(int iat = 0; iat < ucell.nat; iat++) - { - const int it = ucell.iat2it[iat]; - const double rcut_square = this->rcuts[it] * this->rcuts[it]; - // zero bigcell of meshball indicate ? - for (int im = 0; im < this->meshball_ncells; im++) - { - const int extgrid = this->index_atom[iat] + this->index_ball[im]; - const int normal = index2normal[extgrid]; - - // mohan add 2010-07-01 - const int bcell_idx_on_proc = ind_bigcell[normal]; - if (!bigcell_on_processor[normal]) - { - continue; - } - - bool is_atom_on_bcell = false; - const double dr_x_part = this->meshball_positions[im][0] - this->tau_in_bigcell[iat][0]; - const double dr_y_part = this->meshball_positions[im][1] - this->tau_in_bigcell[iat][1]; - const double dr_z_part = this->meshball_positions[im][2] - this->tau_in_bigcell[iat][2]; - for(int imcell = 0; imcell < this -> bxyz; imcell++) - { - const double dr_x = this->meshcell_pos[imcell][0] + dr_x_part; - const double dr_y = this->meshcell_pos[imcell][1] + dr_y_part; - const double dr_z = this->meshcell_pos[imcell][2] + dr_z_part; - const double dist_square = dr_x * dr_x + dr_y * dr_y + dr_z * dr_z; - if(dist_square <= rcut_square) - { - is_atom_on_bcell = true; - break; - } - } - - if(is_atom_on_bcell) - { - // it's not the normal order to calculate which_atom - // and which_bigcell, especailly in 1D array. - // Each grid's adjacent atom number is different, - // so, first we need to locate which grid, using - // bcell_start, then we need to count which adjacent atom. - // using how_many_atoms. - const int index = this->bcell_start[bcell_idx_on_proc] + this->how_many_atoms[bcell_idx_on_proc]; - - // we save which_atom and which_bigcell in 1D array, - // once you want to use this in grid integration, - // the only information you got is the 'normal' index, - // so you need to use bcell_start - // to get the 'mesh_index', then you can you this mesh_index - // to use which_atom or which_bigcell. - this->which_atom[index] = iat; - this->which_bigcell[index] = im; - this->which_unitcell[index] = index2ucell[extgrid]; - for(int imcell = 0; imcell < this -> bxyz; imcell++) - { - const double dr_x = this->meshcell_pos[imcell][0] + dr_x_part; - coord_x[index * bxyz + imcell] = dr_x; - } - - ++count; - ++how_many_atoms[bcell_idx_on_proc]; - } - } - } - for(int i = 0; i < this->bxyz; i++) - { - for(int j = 0; j < 3; j++) - { - coords3[i * 3 + j] = this->meshcell_pos[i][j]; - } - } - assert(count == total_atoms_on_grid); - return; -} - -void Grid_Technique::cal_grid_integration_index() { - // save the start - this->bcell_start = std::vector(nbxx, 0); - ModuleBase::Memory::record("GT::bcell_start", sizeof(int) * nbxx); - for (int i = 1; i < nbxx; i++) { - this->bcell_start[i] - = this->bcell_start[i - 1] + this->how_many_atoms[i - 1]; - } - - // calculate which grid has the largest number of atoms, - // and how many atoms. - this->max_atom = 0; - for (int i = 0; i < nbxx; i++) { - this->max_atom = std::max(this->max_atom, this->how_many_atoms[i]); - } - -#ifdef __MPI - int* all = new int[GlobalV::NPROC]; - ModuleBase::GlobalFunc::ZEROS(all, GlobalV::NPROC); - Parallel_Reduce::gather_int_all(max_atom, all); - if (GlobalV::MY_RANK == 0) { - GlobalV::ofs_warning << std::setw(15) << "Processor" << std::setw(15) - << "Atom" << std::endl; - for (int i = 0; i < GlobalV::NPROC; i++) { - GlobalV::ofs_warning << std::setw(15) << i + 1 << std::setw(15) - << all[i] << std::endl; - } - } - delete[] all; -#endif - - if (PARAM.inp.test_gridt) { - ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running, - "Max atom on bigcell", - max_atom); - } - return; -} - -// set 'lgd' variable -void Grid_Technique::cal_trace_lo(const UnitCell& ucell) { - ModuleBase::TITLE("Grid_Technique", "cal_trace_lo"); - // save the atom information in trace_lo, - // in fact the trace_lo dimension can be reduced - // to ucell.nat, but I think this is another way. - this->trace_lo = std::vector(PARAM.globalv.nlocal, -1); - ModuleBase::Memory::record("GT::trace_lo", sizeof(int) * PARAM.globalv.nlocal); - - this->lnat = 0; - this->lgd = 0; - int iat = 0; - int iw_all = 0; - int iw_local = 0; - - for (int it = 0; it < ucell.ntype; it++) { - for (int ia = 0; ia < ucell.atoms[it].na; ia++) { - if (this->in_this_processor[iat]) { - ++lnat; - int nw0 = ucell.atoms[it].nw; - if (PARAM.inp.nspin - == 4) { // added by zhengdy-soc, need to be double in soc - nw0 *= 2; - this->lgd += nw0; - } else { - this->lgd += ucell.atoms[it].nw; - } - - for (int iw = 0; iw < nw0; iw++) { - this->trace_lo[iw_all] = iw_local; - ++iw_local; - ++iw_all; - } - } else { - // global index of atomic orbitals - iw_all += ucell.atoms[it].nw; - if (PARAM.inp.nspin == 4) { - iw_all += ucell.atoms[it].nw; -} - } - ++iat; - } - } - - if (PARAM.inp.out_level != "m") { - ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running, - "Atom number in sub-FFT-grid", - lnat); - ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running, - "Local orbitals number in sub-FFT-grid", - lgd); - } - - assert(iw_local == lgd); - assert(iw_all == PARAM.globalv.nlocal); - return; -} - -void Grid_Technique::init_ijr_and_nnrg(const UnitCell& ucell, const Grid_Driver& gd) -{ - ModuleBase::TITLE("Grid_Technique", "init_ijr_and_nnrg"); - - hamilt::HContainer hr_gint_tmp(ucell.nat); - // prepare the row_index and col_index for construct AtomPairs, they are - // same, name as orb_index - std::vector orb_index(ucell.nat + 1); - orb_index[0] = 0; - for (int i = 1; i < orb_index.size(); i++) { - int type = ucell.iat2it[i - 1]; - orb_index[i] = orb_index[i - 1] + ucell.atoms[type].nw; - } - - for (int T1 = 0; T1 < ucell.ntype; ++T1) { - const Atom* atom1 = &(ucell.atoms[T1]); - for (int I1 = 0; I1 < atom1->na; ++I1) { - auto& tau1 = atom1->tau[I1]; - - gd.Find_atom(ucell, tau1, T1, I1); - - const int iat1 = ucell.itia2iat(T1, I1); - // whether this atom is in this processor. - if (this->in_this_processor[iat1]) { - for (int ad = 0; ad < gd.getAdjacentNum() + 1; ++ad) { - const int T2 = gd.getType(ad); - const int I2 = gd.getNatom(ad); - const int iat2 = ucell.itia2iat(T2, I2); - const Atom* atom2 = &(ucell.atoms[T2]); - - // NOTE: hRGint wil save total number of atom pairs, - // if only upper triangle is saved, the lower triangle will - // be lost in 2D-block parallelization. if the adjacent atom - // is in this processor. - if (this->in_this_processor[iat2]) { - ModuleBase::Vector3 dtau - = gd.getAdjacentTau(ad) - tau1; - double distance = dtau.norm() * ucell.lat0; - double rcut - = this->rcuts[T1] + this->rcuts[T2]; - - // if(distance < rcut) - // mohan reset this 2013-07-02 in Princeton - // we should make absolutely sure that the distance is - // smaller than rcuts[it] this should be consistant - // with LCAO_nnr::cal_nnrg function typical example : 7 - // Bohr cutoff Si orbital in 14 Bohr length of cell. - // distance = 7.0000000000000000 - // rcuts[it] = 7.0000000000000008 - if (distance < rcut - 1.0e-15) { - // calculate R index - auto& R_index = gd.getBox(ad); - // insert this atom-pair into this->hRGint - hamilt::AtomPair tmp_atom_pair( - iat1, - iat2, - R_index.x, - R_index.y, - R_index.z, - orb_index.data(), - orb_index.data(), - ucell.nat); - hr_gint_tmp.insert_pair(tmp_atom_pair); - } - } - } - } - } - } - this->ijr_info = hr_gint_tmp.get_ijr_info(); - this->nnrg = hr_gint_tmp.get_nnr(); - return; -} - -#if ((defined __CUDA) /* || (defined __ROCM) */) - -void Grid_Technique::init_gpu_gint_variables(const UnitCell& ucell, - const int num_stream) { -#ifdef __MPI - dev_id = base_device::information::set_device_by_rank(); -#endif - if (is_malloced) { - free_gpu_gint_variables(this->nat); - } - nstreams = num_stream; - double ylmcoef[100]; - ModuleBase::GlobalFunc::ZEROS(ylmcoef, 100); - for (int i = 0; i < 100; i++) { - ylmcoef[i] = ModuleBase::Ylm::ylmcoef[i]; - } - checkCudaErrors(cudaMalloc((void**)&ylmcoef_g, 100 * sizeof(double))); - checkCudaErrors(cudaMemcpy(ylmcoef_g, - ylmcoef, - 100 * sizeof(double), - cudaMemcpyHostToDevice)); - - double max_cut = *std::max_element(this->rcuts.begin(), this->rcuts.end()); - - int atom_nw_now[ucell.ntype]; - int ucell_atom_nwl_now[ucell.ntype]; - for (int i = 0; i < ucell.ntype; i++) { - atom_nw_now[i] = ucell.atoms[i].nw; - ucell_atom_nwl_now[i] = ucell.atoms[i].nwl; - } - - // double psi_u_now[ucell.ntype * ucell.nwmax * nr_max * - // 2]; - double* psi_u_now = (double*)malloc(ucell.ntype * ucell.nwmax * this->nr_max * 2 * sizeof(double)); - memset(psi_u_now, 0, ucell.ntype * ucell.nwmax * this->nr_max * 2 * sizeof(double)); - bool* atom_iw2_new_now = (bool*)malloc(ucell.ntype * ucell.nwmax * sizeof(bool)); - memset(atom_iw2_new_now, 0, ucell.ntype * ucell.nwmax * sizeof(bool)); - int* atom_iw2_ylm_now - = (int*)malloc(ucell.ntype * ucell.nwmax * sizeof(int)); - memset(atom_iw2_ylm_now, 0, ucell.ntype * ucell.nwmax * sizeof(int)); - int* atom_iw2_l_now = (int*)malloc(ucell.ntype * ucell.nwmax * sizeof(int)); - memset(atom_iw2_l_now, 0, ucell.ntype * ucell.nwmax * sizeof(int)); - - Atom* atomx; - for (int i = 0; i < ucell.ntype; i++) { - atomx = &ucell.atoms[i]; - for (int j = 0; j < ucell.nwmax; j++) { - if (j < atomx->nw) { - atom_iw2_new_now[i * ucell.nwmax + j] = atomx->iw2_new[j]; - atom_iw2_ylm_now[i * ucell.nwmax + j] = atomx->iw2_ylm[j]; - atom_iw2_l_now[i * ucell.nwmax + j] = atomx->iw2l[j]; - for (int k = 0; k < this->nr_max; k++) { - int index_temp = (i * ucell.nwmax * this->nr_max - + j * this->nr_max + k) - * 2; - if (k < this->psi_u[i * this->nwmax + j].size()) { - psi_u_now[index_temp] - = this->psi_u[i * this->nwmax + j].data()[k]; - psi_u_now[index_temp + 1] - = this->dpsi_u[i * this->nwmax + j].data()[k]; - } - } - } - } - } - - checkCudaErrors(cudaMalloc((void**)&atom_nw_g, ucell.ntype * sizeof(int))); - checkCudaErrors(cudaMemcpy(atom_nw_g, - atom_nw_now, - ucell.ntype * sizeof(int), - cudaMemcpyHostToDevice)); - - checkCudaErrors(cudaMalloc((void**)&atom_nwl_g, ucell.ntype * sizeof(int))); - checkCudaErrors(cudaMemcpy(atom_nwl_g, ucell_atom_nwl_now, ucell.ntype * sizeof(int), cudaMemcpyHostToDevice)); - - checkCudaErrors(cudaMalloc((void**)&psi_u_g, ucell.ntype * ucell.nwmax * this->nr_max * sizeof(double) * 2)); - checkCudaErrors(cudaMemcpy(psi_u_g, - psi_u_now, - ucell.ntype * ucell.nwmax * this->nr_max * sizeof(double) * 2, - cudaMemcpyHostToDevice)); - - checkCudaErrors(cudaMalloc((void**)&psi_u_g, - ucell.ntype * ucell.nwmax * nr_max * sizeof(double) * 2)); - checkCudaErrors(cudaMemcpy(psi_u_g, - psi_u_now, - ucell.ntype * ucell.nwmax * nr_max * sizeof(double) * 2, - cudaMemcpyHostToDevice)); - - checkCudaErrors(cudaMalloc((void**)&atom_new_g, - ucell.ntype * ucell.nwmax * sizeof(bool))); - checkCudaErrors(cudaMemcpy(atom_new_g, - atom_iw2_new_now, - ucell.ntype * ucell.nwmax * sizeof(bool), - cudaMemcpyHostToDevice)); - - checkCudaErrors(cudaMalloc((void**)&atom_ylm_g, - ucell.ntype * ucell.nwmax * sizeof(int))); - - checkCudaErrors(cudaMemcpy(atom_ylm_g, - atom_iw2_ylm_now, - ucell.ntype * ucell.nwmax * sizeof(int), - cudaMemcpyHostToDevice)); - - checkCudaErrors(cudaMalloc((void**)&atom_l_g, - ucell.ntype * ucell.nwmax * sizeof(int))); - checkCudaErrors(cudaMemcpy(atom_l_g, - atom_iw2_l_now, - ucell.ntype * ucell.nwmax * sizeof(int), - cudaMemcpyHostToDevice)); - - checkCudaErrors(cudaMalloc((void**)&rcut_g, ucell.ntype * sizeof(double))); - checkCudaErrors(cudaMemcpy(rcut_g, - rcuts.data(), - ucell.ntype * sizeof(double), - cudaMemcpyHostToDevice)); - std::vector mcell_pos(bxyz * 3, 0); - for (int i = 0; i < bxyz; i++) - { - mcell_pos[3 * i] = meshcell_pos[i][0]; - mcell_pos[3 * i + 1] = meshcell_pos[i][1]; - mcell_pos[3 * i + 2] = meshcell_pos[i][2]; - } - checkCudaErrors(cudaMalloc((void**)&mcell_pos_g, - bxyz * 3 * sizeof(double))); - checkCudaErrors(cudaMemcpy(mcell_pos_g, - mcell_pos.data(), - bxyz * 3 * sizeof(double), - cudaMemcpyHostToDevice)); - - gemm_algo_selector(bxyz, fastest_matrix_mul, ucell); - - is_malloced = true; - - free(psi_u_now); - free(atom_iw2_new_now); - free(atom_iw2_ylm_now); -} - -void Grid_Technique::free_gpu_gint_variables(int nat) { - if (!is_malloced) { - return; - } - - checkCudaErrors(cudaFree(ylmcoef_g)); - checkCudaErrors(cudaFree(atom_nwl_g)); - checkCudaErrors(cudaFree(psi_u_g)); - checkCudaErrors(cudaFree(atom_new_g)); - checkCudaErrors(cudaFree(atom_ylm_g)); - checkCudaErrors(cudaFree(atom_nw_g)); - checkCudaErrors(cudaFree(atom_l_g)); - checkCudaErrors(cudaFree(rcut_g)); - checkCudaErrors(cudaFree(mcell_pos_g)); - - is_malloced = false; -} -#endif diff --git a/source/source_lcao/module_gint/grid_technique.h b/source/source_lcao/module_gint/grid_technique.h deleted file mode 100644 index 947b8d9337..0000000000 --- a/source/source_lcao/module_gint/grid_technique.h +++ /dev/null @@ -1,172 +0,0 @@ -#ifndef GRID_TECHNIQUE_H -#define GRID_TECHNIQUE_H - -#include "grid_meshball.h" -#include "source_basis/module_ao/ORB_read.h" -#include "source_basis/module_ao/parallel_orbitals.h" -#include "source_cell/module_neighbor/sltk_grid_driver.h" -#include "source_cell/unitcell.h" -#include "source_lcao/module_hcontainer/hcontainer.h" -#if ((defined __CUDA) /* || (defined __ROCM) */) -#include "kernels/cuda/gemm_selector.cuh" - -#include -#endif - -// Author: mohan -// Date: 2009-10-17 -class Grid_Technique : public Grid_MeshBall { - // public variables. - public: - Grid_Technique(); - ~Grid_Technique(); - - /// move operator for the next ESolver to directly use its infomation - Grid_Technique& operator=(Grid_Technique&& rhs) = default; - //------------------------------------ - // 1: Info about atom number on grid. - //------------------------------------ - // record how many atoms on each grid. - std::vector how_many_atoms; - // max atom on grid - int max_atom=0; - // sum of how_many_atoms - int total_atoms_on_grid=0; - std::vector start_ind; - - //------------------------------------ - // 2: Info about which atom on grid. - //------------------------------------ - // save the start position of each big cell's adjacent - // atoms in 1D grid. - std::vector bcell_start; - // save the 'iat' atom. - // dim: total_atoms_on_grid. - std::vector which_atom; - - //-------------------------------------- - // save the bigcell index in meshball. - // dim: total_atoms_on_grid. - //-------------------------------------- - std::vector which_bigcell; - std::vector which_unitcell; - - //------------------------------------ - // 3: which atom on local grid. - //------------------------------------ - int lnat=0; // local nat. - int lgd=0; // local grid dimension. lgd * lgd symmetry matrix. - std::vector in_this_processor; - std::vector trace_iat; - std::vector trace_lo; // trace local orbital. - - //--------------------------------------- - // nnrg: number of matrix elements on - // each processor's real space grid. - // use: GridT.in_this_processor - //--------------------------------------- - int nnrg = 0; - - // UnitCell and LCAO_Obrbitals - const UnitCell* ucell=nullptr; - const LCAO_Orbitals* orb=nullptr; - - // UnitCell parameters - int nwmax=0; - int nr_max=0; - int ntype=0; - - // LCAO Orbitals - double dr_uniform={0.0}; - std::vector rcuts; - std::vector> psi_u; - std::vector> dpsi_u; - std::vector> d2psi_u; - - // Determine whether the grid point integration is initialized. - bool init_malloced=false; - - bool get_init_malloced() const { return init_malloced; } - - void set_pbc_grid(const int& ncx_in, - const int& ncy_in, - const int& ncz_in, - const int& bx_in, - const int& by_in, - const int& bz_in, - const int& nbx_in, - const int& nby_in, - const int& nbz_in, - const int& nbxx_in, - const int& nbzp_start_in, - const int& nbzp_in, - const int& ny, - const int& nplane, - const int& startz_current, - const UnitCell& ucell, - const Grid_Driver& gd, - const double& dr_uniform, - const std::vector& rcuts, - const std::vector>& psi_u, - const std::vector>& dpsi_u, - const std::vector>& d2psi_u, - const int& num_stream); - - const std::vector* get_ijr_info() const { return &ijr_info; } - - /// number of elements(basis-pairs) in this processon - /// on all adjacent atoms-pairs(Grid division) - int cal_RindexAtom(const int& u1, - const int& u2, - const int& u3, - const int& iat2) const; - - int find_offset(const int id1, const int id2, const int iat1, const int iat2) const; - - private: - - // store the information of atom pairs on this processor, used to initialize hcontainer. - // The meaning of ijr can be referred to in the get_ijr_info function in hcontainer.cpp. - std::vector ijr_info; - - void cal_max_box_index(); - // atoms on meshball - void init_atoms_on_grid(const int& ny, - const int& nplane, - const UnitCell& ucell); - void init_atoms_on_grid2(const int* index2normal, const UnitCell& ucell); - // initialize the ijr_info and nnrg - void init_ijr_and_nnrg(const UnitCell& ucell, const Grid_Driver& gd); - void cal_grid_integration_index(); - void cal_trace_lo(const UnitCell& ucell); - void check_bigcell(int* ind_bigcell, char* bigcell_on_processor); - void get_startind(const int& ny, - const int& nplane); - -#if ((defined __CUDA) /* || (defined __ROCM) */) - public: - double* ylmcoef_g; - bool is_malloced; - - int* atom_nw_g; - int* atom_nwl_g; - double* psi_u_g; - bool* atom_new_g; - int* atom_ylm_g; - int* atom_l_g; - double* rcut_g; - double*mcell_pos_g; - - int dev_id = 0; - int nstreams = 4; - // streams[nstreams] - // TODO it needs to be implemented through configuration files - matrix_multiple_func_type fastest_matrix_mul; - - private: - void init_gpu_gint_variables(const UnitCell& ucell, const int num_stream); - void free_gpu_gint_variables(int nat); - -#endif -}; -#endif diff --git a/source/source_lcao/module_gint/gtask_force.cpp b/source/source_lcao/module_gint/gtask_force.cpp deleted file mode 100644 index 2fab74907e..0000000000 --- a/source/source_lcao/module_gint/gtask_force.cpp +++ /dev/null @@ -1,152 +0,0 @@ -#include - -#include "gint_force_gpu.h" -#include "source_base/ylm.h" -#include "source_lcao/module_gint/gint_tools.h" -#include "source_base/vector3.h" -namespace GintKernel -{ - -void gtask_force(const Grid_Technique& gridt, - const UnitCell& ucell, - const int grid_index_ij, - const int nczp, - const double vfactor, - const double* vlocal_global_value, - int& atoms_per_z, - int* atoms_num_info, - int* iat_on_nbz, - uint8_t* atoms_type, - double* dr_part, - double* vldr3) -{ - atoms_per_z = 0; - for (int z_index = 0; z_index < gridt.nbzp; z_index++) - { - const int grid_index = grid_index_ij + z_index; - const int bcell_start_index = gridt.bcell_start[grid_index]; - const int na_grid = gridt.how_many_atoms[grid_index]; - atoms_num_info[z_index * 2] = na_grid; - atoms_num_info[z_index * 2 + 1] = atoms_per_z; - for (int id = 0; id < na_grid; id++) - { - const int mcell_index = bcell_start_index + id; - const int imcell = gridt.which_bigcell[mcell_index]; - const int iat = gridt.which_atom[mcell_index]; - const int it_temp = ucell.iat2it[iat]; - - dr_part[atoms_per_z * 3] = gridt.meshball_positions[imcell][0] - - gridt.tau_in_bigcell[iat][0]; - dr_part[atoms_per_z * 3 + 1] = gridt.meshball_positions[imcell][1] - - gridt.tau_in_bigcell[iat][1]; - dr_part[atoms_per_z * 3 + 2] = gridt.meshball_positions[imcell][2] - - gridt.tau_in_bigcell[iat][2]; - atoms_type[atoms_per_z] = it_temp; - iat_on_nbz[atoms_per_z] = iat; - atoms_per_z++; - } - - const int start_ind_grid = gridt.start_ind[grid_index]; - int id = z_index * gridt.bxyz; - for (int bx_index = 0; bx_index < gridt.bx; bx_index++) - { - for (int by_index = 0; by_index < gridt.by; by_index++) - { - for (int bz_index = 0; bz_index < gridt.bz; bz_index++) - { - int vindex_global = bx_index * gridt.ncy * nczp - + by_index * nczp + bz_index - + start_ind_grid; - vldr3[id]= vlocal_global_value[vindex_global] * vfactor; - id++; - } - } - } - } -} - -void alloc_mult_force(const hamilt::HContainer* dm, - const Grid_Technique& gridt, - const UnitCell& ucell, - const int grid_index_ij, - const int max_atom, - const int *atoms_num_info, - double* const psi_g, - double* const psi_dm_g, - double* const dm_matrix_g, - int& max_m, - int& max_n, - int& atom_pair_num, - int* mat_m, - int* mat_n, - int* mat_k, - int* mat_lda, - int* mat_ldb, - int* mat_ldc, - double** mat_A, - double** mat_B, - double** mat_C) -{ - int tid = 0; - max_m = 0; - max_n = 0; - const int nwmax = ucell.nwmax; - const int lgd = gridt.lgd; - for (int z_index = 0; z_index < gridt.nbzp; z_index++) - { - const int grid_index = grid_index_ij + z_index; - const int bcell_start_index = gridt.bcell_start[grid_index]; - const int pre_atoms = atoms_num_info[z_index * 2 + 1]; - - for (int atom1 = 0; atom1 < gridt.how_many_atoms[grid_index]; atom1++) - { - const int mcell_index1 = bcell_start_index + atom1; - const int iat1 = gridt.which_atom[mcell_index1]; - const int uc1 = gridt.which_unitcell[mcell_index1]; - const ModuleBase::Vector3 r1 = gridt.get_ucell_coords(uc1); - const int it1 = ucell.iat2it[iat1]; - const int nw1 = ucell.atoms[it1].nw; - - for (int atom2 = 0; atom2 < gridt.how_many_atoms[grid_index];atom2++) - { - const int mcell_index2 = bcell_start_index + atom2; - const int iat2 = gridt.which_atom[mcell_index2]; - const int uc2 = gridt.which_unitcell[mcell_index2]; - const ModuleBase::Vector3 r2 = gridt.get_ucell_coords(uc2); - const int offset = dm->find_matrix_offset(iat1, iat2, r1-r2); - if (offset == -1) - { - continue; - } - const int it2 = ucell.iat2it[iat2]; - const int nw2 = ucell.atoms[it2].nw; - - const int mat_A_idx = (pre_atoms + atom2) * nwmax * gridt.bxyz; - const int mat_C_idx = (pre_atoms + atom1) * nwmax * gridt.bxyz; - mat_m[tid] = gridt.bxyz; - mat_n[tid] = nw1; - mat_k[tid] = nw2; - mat_lda[tid] = nwmax; - mat_ldb[tid] = nw2; - mat_ldc[tid] = nwmax; - mat_A[tid] = psi_g + mat_A_idx; - mat_B[tid] = dm_matrix_g + offset; - mat_C[tid] = psi_dm_g + mat_C_idx; - - if (mat_m[tid] > max_m) - { - max_m = mat_m[tid]; - } - - if (mat_n[tid] > max_n) - { - max_n = mat_n[tid]; - } - - tid++; - } - } - } - atom_pair_num = tid; -} -} // namespace GintKernel diff --git a/source/source_lcao/module_gint/gtask_rho.cpp b/source/source_lcao/module_gint/gtask_rho.cpp deleted file mode 100644 index 691504d943..0000000000 --- a/source/source_lcao/module_gint/gtask_rho.cpp +++ /dev/null @@ -1,155 +0,0 @@ -#include "gint_rho_gpu.h" -#include "source_base/ylm.h" -#include "source_lcao/module_gint/gint_tools.h" -#include "source_base/vector3.h" -#include "omp.h" -namespace GintKernel -{ - -void gtask_rho(const Grid_Technique& gridt, - const int grid_index_ij, - const UnitCell& ucell, - double* dr_part, - uint8_t* atoms_type, - int* atoms_num_info, - int& atoms_per_z) -{ - atoms_per_z = 0; - for (int z_index = 0; z_index < gridt.nbzp; z_index++) - { - const int grid_index = grid_index_ij + z_index; - const int bcell_start_index = gridt.bcell_start[grid_index]; - const int na_grid = gridt.how_many_atoms[grid_index]; - atoms_num_info[2 * z_index] = na_grid; - atoms_num_info[2 * z_index + 1] = atoms_per_z; - for (int id = 0; id < na_grid; id++) - { - const int mcell_index = bcell_start_index + id; - const int imcell = gridt.which_bigcell[mcell_index]; - const int iat = gridt.which_atom[mcell_index]; - const int it_temp = ucell.iat2it[iat]; - - dr_part[atoms_per_z * 3] = gridt.meshball_positions[imcell][0] - - gridt.tau_in_bigcell[iat][0]; - dr_part[atoms_per_z * 3 + 1] = gridt.meshball_positions[imcell][1] - - gridt.tau_in_bigcell[iat][1]; - dr_part[atoms_per_z * 3 + 2] = gridt.meshball_positions[imcell][2] - - gridt.tau_in_bigcell[iat][2]; - atoms_type[atoms_per_z] = it_temp; - atoms_per_z++; - } - } -} - -void alloc_mult_dot_rho(const hamilt::HContainer* dm, - const Grid_Technique& gridt, - const UnitCell& ucell, - const int grid_index_ij, - const int max_atom, - const int lgd, - const int nczp, - const int* atoms_num_info, - double* const psir_ylm_g, - double* const psir_dm_g, - double* const dm_matrix_g, - double* mat_alpha, - int* mat_m, - int* mat_n, - int* mat_k, - int* mat_lda, - int* mat_ldb, - int* mat_ldc, - double** mat_A, - double** mat_B, - double** mat_C, - int& max_m, - int& max_n, - int& atom_pair_num, - double* rho_g, - double** dot_product) -{ - int tid = 0; - int dot_count = 0; - max_m = 0; - max_n = 0; - const int nwmax=ucell.nwmax; - // generate matrix multiplication tasks - for (int z_index = 0; z_index < gridt.nbzp; z_index++) - { - const int grid_index = grid_index_ij + z_index; - const int bcell_start_index = gridt.bcell_start[grid_index]; - const int bcell_start_psir = atoms_num_info[2 * z_index + 1] * gridt.bxyz * nwmax; - const int na_grid = atoms_num_info[2 * z_index]; - - for (int atom1 = 0; atom1 < gridt.how_many_atoms[grid_index]; atom1++) - { - const int mcell_index1 = bcell_start_index + atom1; - const int iat1 = gridt.which_atom[mcell_index1]; - const int uc1 = gridt.which_unitcell[mcell_index1]; - const ModuleBase::Vector3 r1 = gridt.get_ucell_coords(uc1); - const int it1 = ucell.iat2it[iat1]; - const int nw1 = ucell.atoms[it1].nw; - - for (int atom2 = atom1; atom2 < gridt.how_many_atoms[grid_index]; - atom2++) - { - const int mcell_index2 = bcell_start_index + atom2; - const int iat2 = gridt.which_atom[mcell_index2]; - const int uc2 = gridt.which_unitcell[mcell_index2]; - const ModuleBase::Vector3 r2 = gridt.get_ucell_coords(uc2); - const int offset = dm->find_matrix_offset(iat1, iat2, r1-r2); - if (offset == -1) - { - continue; - } - const int it2 = ucell.iat2it[iat2]; - const int nw2 = ucell.atoms[it2].nw; - - const int mat_A_idx = bcell_start_psir + atom2 * nwmax; - const int mat_C_idx = bcell_start_psir + atom1 * nwmax; - - mat_alpha[tid] = atom2 == atom1 ? 1 : 2; - mat_m[tid] = gridt.bxyz; - mat_n[tid] = nw1; - mat_k[tid] = nw2; - mat_lda[tid] = nwmax * na_grid; - mat_ldb[tid] = nw2; - mat_ldc[tid] = nwmax * na_grid; - mat_A[tid] = psir_ylm_g + mat_A_idx; - mat_B[tid] = dm_matrix_g + offset; - mat_C[tid] = psir_dm_g + mat_C_idx; - - if (mat_m[tid] > max_m) - { - max_m = mat_m[tid]; - } - - if (mat_n[tid] > max_n) - { - max_n = mat_n[tid]; - } - - tid++; - } - } - - // generate vec dot product tasks - std::vector vindex(gridt.bxyz); - Gint_Tools::get_vindex(gridt.bxyz, - gridt.bx, - gridt.by, - gridt.bz, - nczp, - gridt.start_ind[grid_index], - gridt.ncy * nczp, - vindex.data()); - for (int i = 0; i < gridt.bxyz; i++) - { - dot_product[dot_count] = rho_g + vindex[i]; - dot_count++; - } - } - atom_pair_num = tid; -} - -} // namespace GintKernel \ No newline at end of file diff --git a/source/source_lcao/module_gint/gtask_vl.cpp b/source/source_lcao/module_gint/gtask_vl.cpp deleted file mode 100644 index 026ed3ffab..0000000000 --- a/source/source_lcao/module_gint/gtask_vl.cpp +++ /dev/null @@ -1,154 +0,0 @@ -#include - -#include "gint_vl_gpu.h" -#include "source_base/ylm.h" -#include "source_lcao/module_gint/gint_tools.h" -#include "source_base/vector3.h" -namespace GintKernel -{ - -void gtask_vlocal(const Grid_Technique& gridt, - const UnitCell& ucell, - const int grid_index_ij, - const int nczp, - const double vfactor, - const double* vlocal_global_value, - int& atoms_per_z, - int* atoms_num_info, - uint8_t* atoms_type, - double* dr_part, - double* vldr3) -{ - atoms_per_z = 0; - for (int z_index = 0; z_index < gridt.nbzp; z_index++) - { - const int grid_index = grid_index_ij + z_index; - const int bcell_start_index = gridt.bcell_start[grid_index]; - const int na_grid = gridt.how_many_atoms[grid_index]; - atoms_num_info[2 * z_index] = na_grid; - atoms_num_info[2 * z_index + 1] = atoms_per_z; - for (int id = 0; id < na_grid; id++) - { - const int mcell_index = bcell_start_index + id; - const int imcell = gridt.which_bigcell[mcell_index]; - const int iat = gridt.which_atom[mcell_index]; - const int it_temp = ucell.iat2it[iat]; - - dr_part[atoms_per_z * 3] = gridt.meshball_positions[imcell][0] - - gridt.tau_in_bigcell[iat][0]; - dr_part[atoms_per_z * 3 + 1] = gridt.meshball_positions[imcell][1] - - gridt.tau_in_bigcell[iat][1]; - dr_part[atoms_per_z * 3 + 2] = gridt.meshball_positions[imcell][2] - - gridt.tau_in_bigcell[iat][2]; - atoms_type[atoms_per_z] = it_temp; - atoms_per_z++; - } - - const int start_ind_grid = gridt.start_ind[grid_index]; - int id = z_index * gridt.bxyz; - for (int bx_index = 0; bx_index < gridt.bx; bx_index++) - { - for (int by_index = 0; by_index < gridt.by; by_index++) - { - for (int bz_index = 0; bz_index < gridt.bz; bz_index++) - { - int vindex_global = bx_index * gridt.ncy * nczp - + by_index * nczp + bz_index - + start_ind_grid; - vldr3[id]= vlocal_global_value[vindex_global] * vfactor; - id++; - } - } - } - } -} - -void alloc_mult_vlocal(const hamilt::HContainer* hRGint, - const Grid_Technique& gridt, - const UnitCell& ucell, - const int grid_index_ij, - const int max_atom, - double* const psi, - double* const psi_vldr3, - double* const grid_vlocal_g, - int* mat_m, - int* mat_n, - int* mat_k, - int* mat_lda, - int* mat_ldb, - int* mat_ldc, - double** mat_A, - double** mat_B, - double** mat_C, - int& atom_pair_num, - int& max_m, - int& max_n) -{ - atom_pair_num = 0; - max_m = 0; - max_n = 0; - const int nwmax = ucell.nwmax; - for (int z_index = 0; z_index < gridt.nbzp; z_index++) - { - const int grid_index = grid_index_ij + z_index; - const int atom_num = gridt.how_many_atoms[grid_index]; - const int vldr3_index = z_index * max_atom * nwmax * gridt.bxyz; - const int bcell_start_index = gridt.bcell_start[grid_index]; - for (int atom1 = 0; atom1 < atom_num; atom1++) - { - const int iat1 = gridt.which_atom[bcell_start_index + atom1]; - const int uc1 = gridt.which_unitcell[bcell_start_index + atom1]; - const ModuleBase::Vector3 r1 = gridt.get_ucell_coords(uc1); - const int it1 = ucell.iat2it[iat1]; - - for (int atom2 = 0; atom2 < atom_num; atom2++) - { - const int iat2 = gridt.which_atom[bcell_start_index + atom2]; - const int uc2 = gridt.which_unitcell[bcell_start_index + atom2]; - const ModuleBase::Vector3 r2 = gridt.get_ucell_coords(uc2); - int offset = hRGint->find_matrix_offset(iat1, iat2, r1-r2); - if (offset == -1) - { - continue; - } - const int it2 = ucell.iat2it[iat2]; - - if (iat1 <= iat2) - { - const int atom_pair_nw - = ucell.atoms[it1].nw * ucell.atoms[it2].nw; - - const int calc_index1 = vldr3_index + atom1 * nwmax * gridt.bxyz; - const int calc_index2 = vldr3_index + atom2 * nwmax * gridt.bxyz; - - mat_A[atom_pair_num] - = psi + calc_index1; - mat_B[atom_pair_num] - = psi_vldr3 + calc_index2; - mat_C[atom_pair_num] - = grid_vlocal_g + offset; - - mat_lda[atom_pair_num] = gridt.bxyz; - mat_ldb[atom_pair_num] = gridt.bxyz; - mat_ldc[atom_pair_num] = ucell.atoms[it2].nw; - - mat_m[atom_pair_num] = ucell.atoms[it1].nw; - mat_n[atom_pair_num] = ucell.atoms[it2].nw; - mat_k[atom_pair_num] = gridt.bxyz; - - if (mat_m[atom_pair_num] > max_m) - { - max_m = mat_m[atom_pair_num]; - } - if (mat_n[atom_pair_num] > max_n) - { - max_n = mat_n[atom_pair_num]; - } - atom_pair_num++; - } - } - } - } -} - -} // namespace GintKernel \ No newline at end of file diff --git a/source/source_lcao/module_gint/init_orb.cpp b/source/source_lcao/module_gint/init_orb.cpp deleted file mode 100644 index 4ad04e08d6..0000000000 --- a/source/source_lcao/module_gint/init_orb.cpp +++ /dev/null @@ -1,62 +0,0 @@ -#include "gint_tools.h" -#include "source_base/memory.h" -#include "source_basis/module_ao/ORB_read.h" -#include "source_cell/unitcell.h" - -namespace Gint_Tools{ - -void init_orb(double& dr_uniform, - std::vector& rcuts, - UnitCell& ucell, - const LCAO_Orbitals& orb, - std::vector>& psi_u, - std::vector>& dpsi_u, - std::vector>& d2psi_u) -{ - //! set the grid parameters - dr_uniform=orb.dr_uniform; - - assert(dr_uniform>0.0); - - const int nwmax=ucell.nwmax; - const int ntype=ucell.ntype; - - assert(nwmax>0); - assert(ntype>0); - - rcuts=std::vector(ntype); - ModuleBase::Memory::record("rcuts", sizeof(double)*ntype*3); - - for(int it=0; it(1/dr_uniform * max_cut) + 10; - psi_u=std::vector>(ntype * nwmax); - dpsi_u=std::vector>(ntype * nwmax); - d2psi_u=std::vector>(ntype * nwmax); - ModuleBase::Memory::record("psi_u", sizeof(double)*nwmax*ntype*3); - - Atom* atomx = nullptr; - const Numerical_Orbital_Lm* pointer = nullptr; - - for (int i = 0; i < ntype; i++) - { - atomx = &ucell.atoms[i]; - for (int j = 0; j < nwmax; j++) - { - const int k=i*nwmax+j; - if (j < atomx->nw) - { - pointer = &orb.Phi[i].PhiLN(atomx->iw2l[j],atomx->iw2n[j]); - psi_u[k]=pointer->psi_uniform; - dpsi_u[k]=pointer->dpsi_uniform; - d2psi_u[k]=pointer->ddpsi_uniform; - } - } - } -}// End of init_orb() - -}// End of Gint_Tools diff --git a/source/source_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h b/source/source_lcao/module_gint/kernel/cuda_mem_wrapper.h similarity index 100% rename from source/source_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h rename to source/source_lcao/module_gint/kernel/cuda_mem_wrapper.h diff --git a/source/source_lcao/module_gint/temp_gint/kernel/dgemm_vbatch.cu b/source/source_lcao/module_gint/kernel/dgemm_vbatch.cu similarity index 100% rename from source/source_lcao/module_gint/temp_gint/kernel/dgemm_vbatch.cu rename to source/source_lcao/module_gint/kernel/dgemm_vbatch.cu diff --git a/source/source_lcao/module_gint/temp_gint/kernel/dgemm_vbatch.h b/source/source_lcao/module_gint/kernel/dgemm_vbatch.h similarity index 100% rename from source/source_lcao/module_gint/temp_gint/kernel/dgemm_vbatch.h rename to source/source_lcao/module_gint/kernel/dgemm_vbatch.h diff --git a/source/source_lcao/module_gint/temp_gint/kernel/gemm_nn_vbatch.cuh b/source/source_lcao/module_gint/kernel/gemm_nn_vbatch.cuh similarity index 100% rename from source/source_lcao/module_gint/temp_gint/kernel/gemm_nn_vbatch.cuh rename to source/source_lcao/module_gint/kernel/gemm_nn_vbatch.cuh diff --git a/source/source_lcao/module_gint/temp_gint/kernel/gemm_tn_vbatch.cuh b/source/source_lcao/module_gint/kernel/gemm_tn_vbatch.cuh similarity index 100% rename from source/source_lcao/module_gint/temp_gint/kernel/gemm_tn_vbatch.cuh rename to source/source_lcao/module_gint/kernel/gemm_tn_vbatch.cuh diff --git a/source/source_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.cpp b/source/source_lcao/module_gint/kernel/gint_gpu_vars.cpp similarity index 98% rename from source/source_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.cpp rename to source/source_lcao/module_gint/kernel/gint_gpu_vars.cpp index f4443762f0..f81af2779c 100644 --- a/source/source_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.cpp +++ b/source/source_lcao/module_gint/kernel/gint_gpu_vars.cpp @@ -101,8 +101,6 @@ GintGpuVars::GintGpuVars(std::shared_ptr biggrid_info, checkCuda(cudaMalloc((void**)&iat2it_d, sizeof(int) * ucell.nat)); checkCuda(cudaMemcpy(iat2it_d, ucell.iat2it, sizeof(int) * ucell.nat, cudaMemcpyHostToDevice)); - - gemm_algo_selector(mgrid_num, fastest_matrix_mul, ucell); } GintGpuVars::~GintGpuVars() diff --git a/source/source_lcao/module_gint/kernel/gint_gpu_vars.h b/source/source_lcao/module_gint/kernel/gint_gpu_vars.h new file mode 100644 index 0000000000..5f711aa6a0 --- /dev/null +++ b/source/source_lcao/module_gint/kernel/gint_gpu_vars.h @@ -0,0 +1,44 @@ +#pragma once + +#include +#include "set_const_mem.cuh" +#include "source_base/ylm.h" +#include "source_cell/unitcell.h" +#include "source_cell/atom_spec.h" +#include "source_lcao/module_gint/biggrid_info.h" +#include "gint_helper.cuh" + +namespace ModuleGint +{ + +class GintGpuVars +{ + public: + GintGpuVars(std::shared_ptr bgrid_info, + const UnitCell& ucell, + const Numerical_Orbital* Phi); + ~GintGpuVars(); + + int nwmax; + double dr_uniform; + double nr_max; + // ylmcoef_d is __constant__ memory, no need to cudaFree + double* ylmcoef_d = nullptr; + double* rcut_d = nullptr; + int* atom_nw_d = nullptr; + int* ucell_atom_nwl_d = nullptr; + bool* atom_iw2_new_d = nullptr; + int* atom_iw2_ylm_d = nullptr; + int* atom_iw2_l_d = nullptr; + double* psi_u_d = nullptr; + double* dpsi_u_d = nullptr; + double* d2psi_u_d = nullptr; + double3* mgrids_pos_d = nullptr; + int* iat2it_d = nullptr; + + // the index of gpu device + int dev_id_ = 0; + +}; + +} \ No newline at end of file diff --git a/source/source_lcao/module_gint/temp_gint/kernel/gint_helper.cuh b/source/source_lcao/module_gint/kernel/gint_helper.cuh similarity index 100% rename from source/source_lcao/module_gint/temp_gint/kernel/gint_helper.cuh rename to source/source_lcao/module_gint/kernel/gint_helper.cuh diff --git a/source/source_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cu b/source/source_lcao/module_gint/kernel/phi_operator_gpu.cu similarity index 100% rename from source/source_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cu rename to source/source_lcao/module_gint/kernel/phi_operator_gpu.cu diff --git a/source/source_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.h b/source/source_lcao/module_gint/kernel/phi_operator_gpu.h similarity index 98% rename from source/source_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.h rename to source/source_lcao/module_gint/kernel/phi_operator_gpu.h index 897218a8dd..27568e5ec9 100644 --- a/source/source_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.h +++ b/source/source_lcao/module_gint/kernel/phi_operator_gpu.h @@ -2,7 +2,7 @@ #include #include -#include "source_lcao/module_gint/temp_gint/batch_biggrid.h" +#include "source_lcao/module_gint/batch_biggrid.h" #include "gint_helper.cuh" #include "gint_gpu_vars.h" #include "cuda_mem_wrapper.h" diff --git a/source/source_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cu b/source/source_lcao/module_gint/kernel/phi_operator_kernel.cu similarity index 100% rename from source/source_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cu rename to source/source_lcao/module_gint/kernel/phi_operator_kernel.cu diff --git a/source/source_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cuh b/source/source_lcao/module_gint/kernel/phi_operator_kernel.cuh similarity index 100% rename from source/source_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cuh rename to source/source_lcao/module_gint/kernel/phi_operator_kernel.cuh diff --git a/source/source_lcao/module_gint/temp_gint/kernel/set_const_mem.cu b/source/source_lcao/module_gint/kernel/set_const_mem.cu similarity index 100% rename from source/source_lcao/module_gint/temp_gint/kernel/set_const_mem.cu rename to source/source_lcao/module_gint/kernel/set_const_mem.cu diff --git a/source/source_lcao/module_gint/temp_gint/kernel/set_const_mem.cuh b/source/source_lcao/module_gint/kernel/set_const_mem.cuh similarity index 100% rename from source/source_lcao/module_gint/temp_gint/kernel/set_const_mem.cuh rename to source/source_lcao/module_gint/kernel/set_const_mem.cuh diff --git a/source/source_lcao/module_gint/temp_gint/kernel/sph.cuh b/source/source_lcao/module_gint/kernel/sph.cuh similarity index 100% rename from source/source_lcao/module_gint/temp_gint/kernel/sph.cuh rename to source/source_lcao/module_gint/kernel/sph.cuh diff --git a/source/source_lcao/module_gint/kernels/cuda/code_gen.cpp b/source/source_lcao/module_gint/kernels/cuda/code_gen.cpp deleted file mode 100644 index 42e8c4f0c5..0000000000 --- a/source/source_lcao/module_gint/kernels/cuda/code_gen.cpp +++ /dev/null @@ -1,4426 +0,0 @@ -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); - -gemm_time_measure(max_m, - max_n, - d_m, - d_n, - d_k, - d_global_A_array, - d_global_lda, - d_global_B_array, - d_global_ldb, - d_global_C_array, - d_global_ldc, - batchCount, - temp_stream, - fastest_time, - fastest_algo, - cpu_result, - h_global_C, - d_global_C); diff --git a/source/source_lcao/module_gint/kernels/cuda/code_gen.cuh b/source/source_lcao/module_gint/kernels/cuda/code_gen.cuh deleted file mode 100644 index a4b1a75916..0000000000 --- a/source/source_lcao/module_gint/kernels/cuda/code_gen.cuh +++ /dev/null @@ -1,473 +0,0 @@ -#ifndef CODE_GEN_CUH -#define CODE_GEN_CUH - -#include "gemm_selector.cuh" -#include - -extern template void gemm_time_measure(int, int, int*, int*, int*, double**, int*, double**, int*, double**, int*, int, cudaStream_t, float&, matrix_multiple_func_type&, double*, double*, double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -#endif \ No newline at end of file diff --git a/source/source_lcao/module_gint/kernels/cuda/code_gen_00.cu b/source/source_lcao/module_gint/kernels/cuda/code_gen_00.cu deleted file mode 100644 index a07c411485..0000000000 --- a/source/source_lcao/module_gint/kernels/cuda/code_gen_00.cu +++ /dev/null @@ -1,48 +0,0 @@ -#include "vbatch_matrix_mul.cuh" - -template void gemm_time_measure(int, int, int*, int*, int*, double**, int*, double**, int*, double**, int*, int, cudaStream_t, float&, matrix_multiple_func_type&, double*, double*, double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - diff --git a/source/source_lcao/module_gint/kernels/cuda/code_gen_01.cu b/source/source_lcao/module_gint/kernels/cuda/code_gen_01.cu deleted file mode 100644 index 9f725c23c6..0000000000 --- a/source/source_lcao/module_gint/kernels/cuda/code_gen_01.cu +++ /dev/null @@ -1,48 +0,0 @@ -#include "vbatch_matrix_mul.cuh" - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - diff --git a/source/source_lcao/module_gint/kernels/cuda/code_gen_02.cu b/source/source_lcao/module_gint/kernels/cuda/code_gen_02.cu deleted file mode 100644 index 090eab0709..0000000000 --- a/source/source_lcao/module_gint/kernels/cuda/code_gen_02.cu +++ /dev/null @@ -1,48 +0,0 @@ -#include "vbatch_matrix_mul.cuh" - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - diff --git a/source/source_lcao/module_gint/kernels/cuda/code_gen_03.cu b/source/source_lcao/module_gint/kernels/cuda/code_gen_03.cu deleted file mode 100644 index 046d0e5063..0000000000 --- a/source/source_lcao/module_gint/kernels/cuda/code_gen_03.cu +++ /dev/null @@ -1,48 +0,0 @@ -#include "vbatch_matrix_mul.cuh" - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - diff --git a/source/source_lcao/module_gint/kernels/cuda/code_gen_04.cu b/source/source_lcao/module_gint/kernels/cuda/code_gen_04.cu deleted file mode 100644 index f74209d829..0000000000 --- a/source/source_lcao/module_gint/kernels/cuda/code_gen_04.cu +++ /dev/null @@ -1,48 +0,0 @@ -#include "vbatch_matrix_mul.cuh" - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - diff --git a/source/source_lcao/module_gint/kernels/cuda/code_gen_05.cu b/source/source_lcao/module_gint/kernels/cuda/code_gen_05.cu deleted file mode 100644 index c9cb81bd7c..0000000000 --- a/source/source_lcao/module_gint/kernels/cuda/code_gen_05.cu +++ /dev/null @@ -1,48 +0,0 @@ -#include "vbatch_matrix_mul.cuh" - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - diff --git a/source/source_lcao/module_gint/kernels/cuda/code_gen_06.cu b/source/source_lcao/module_gint/kernels/cuda/code_gen_06.cu deleted file mode 100644 index f5fac39df2..0000000000 --- a/source/source_lcao/module_gint/kernels/cuda/code_gen_06.cu +++ /dev/null @@ -1,48 +0,0 @@ -#include "vbatch_matrix_mul.cuh" - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - diff --git a/source/source_lcao/module_gint/kernels/cuda/code_gen_07.cu b/source/source_lcao/module_gint/kernels/cuda/code_gen_07.cu deleted file mode 100644 index 971c6eb0c0..0000000000 --- a/source/source_lcao/module_gint/kernels/cuda/code_gen_07.cu +++ /dev/null @@ -1,48 +0,0 @@ -#include "vbatch_matrix_mul.cuh" - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - diff --git a/source/source_lcao/module_gint/kernels/cuda/code_gen_08.cu b/source/source_lcao/module_gint/kernels/cuda/code_gen_08.cu deleted file mode 100644 index 8643faae70..0000000000 --- a/source/source_lcao/module_gint/kernels/cuda/code_gen_08.cu +++ /dev/null @@ -1,48 +0,0 @@ -#include "vbatch_matrix_mul.cuh" - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - diff --git a/source/source_lcao/module_gint/kernels/cuda/code_gen_09.cu b/source/source_lcao/module_gint/kernels/cuda/code_gen_09.cu deleted file mode 100644 index 8cf333bf6f..0000000000 --- a/source/source_lcao/module_gint/kernels/cuda/code_gen_09.cu +++ /dev/null @@ -1,53 +0,0 @@ -#include "vbatch_matrix_mul.cuh" - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); - -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); \ No newline at end of file diff --git a/source/source_lcao/module_gint/kernels/cuda/cuda_tools.cu b/source/source_lcao/module_gint/kernels/cuda/cuda_tools.cu deleted file mode 100644 index c9bf122628..0000000000 --- a/source/source_lcao/module_gint/kernels/cuda/cuda_tools.cu +++ /dev/null @@ -1,292 +0,0 @@ -#include -#include -#include - -#include "cuda_tools.cuh" - -void dump_cuda_array_to_file(const double* cuda_array, - int width, - int hight, - const std::string& filename) -{ - double* h_data = new double[width * hight]; - cudaMemcpy(h_data, - cuda_array, - width * hight * sizeof(double), - cudaMemcpyDeviceToHost); - - std::ofstream outFile(filename); - if (!outFile.is_open()) - { - std::cerr << "Failed to open file for writing." << std::endl; - } - for (int j = 0; j < hight; ++j) - { - for (int i = 0; i < width; ++i) - { - outFile << "hight" << j << " width:" << i << " " - << h_data[j * width + i] << std::endl; - } - } - outFile.close(); - delete[] h_data; -} - -template -Cuda_Mem_Wrapper::Cuda_Mem_Wrapper() -{ - this->device_pointer = nullptr; - this->host_pointer = nullptr; - this->one_stream_size = 0; - this->one_stream_size_aligned = 0; - this->stream_number = 1; - this->total_size_aligned = 0; -} - -template -Cuda_Mem_Wrapper::Cuda_Mem_Wrapper(int one_stream_size_in, - int one_stream_size_aligned_in, - int stream_number_in, - bool malloc_host_in) -{ - this->stream_number = stream_number_in; - this->one_stream_size = one_stream_size_in; - this->one_stream_size_aligned = one_stream_size_aligned_in; - this->total_size_aligned - = this->one_stream_size_aligned * this->stream_number; - - checkCuda(cudaMalloc((void**)&this->device_pointer, - this->total_size_aligned * sizeof(T))); - checkCuda(cudaMemset(this->device_pointer, - 0, - this->total_size_aligned * sizeof(T))); - this->host_pointer = nullptr; - - if (malloc_host_in) - { - checkCuda(cudaMallocHost((void**)&this->host_pointer, - this->total_size_aligned * sizeof(T))); - memset(this->host_pointer, 0, this->total_size_aligned * sizeof(T)); - } -} - -template -Cuda_Mem_Wrapper::Cuda_Mem_Wrapper(int one_stream_size_in, - int stream_number_in, - bool malloc_host_in) - : Cuda_Mem_Wrapper(one_stream_size_in, - one_stream_size_in, - stream_number_in, - malloc_host_in) -{ -} - -template -Cuda_Mem_Wrapper::Cuda_Mem_Wrapper(Cuda_Mem_Wrapper&& other) noexcept -{ - this->device_pointer = other.device_pointer; - this->host_pointer = other.host_pointer; - this->one_stream_size = other.one_stream_size; - this->one_stream_size_aligned = other.one_stream_size_aligned; - this->stream_number = other.stream_number; - this->total_size_aligned = other.total_size_aligned; - - other.device_pointer = nullptr; - other.host_pointer = nullptr; - other.one_stream_size = 0; - other.one_stream_size_aligned = 0; - other.stream_number = 0; - other.total_size_aligned = 0; -} - -template -Cuda_Mem_Wrapper& Cuda_Mem_Wrapper::operator=(Cuda_Mem_Wrapper&& other) noexcept -{ - if (this != &other) - { - this->free_all(); - this->device_pointer = other.device_pointer; - this->host_pointer = other.host_pointer; - this->one_stream_size = other.one_stream_size; - this->one_stream_size_aligned = other.one_stream_size_aligned; - this->stream_number = other.stream_number; - this->total_size_aligned = other.total_size_aligned; - - other.device_pointer = nullptr; - other.host_pointer = nullptr; - other.one_stream_size = 0; - other.one_stream_size_aligned = 0; - other.stream_number = 0; - other.total_size_aligned = 0; - } - return *this; -} - -template -void Cuda_Mem_Wrapper::free_all() -{ - checkCuda(cudaFree(this->device_pointer)); - if (this->host_pointer != nullptr) - { - checkCuda(cudaFreeHost(this->host_pointer)); - } -} - -template -Cuda_Mem_Wrapper::~Cuda_Mem_Wrapper() -{ - this->free_all(); -} - -template -inline void Cuda_Mem_Wrapper::copy_host_to_device_sync(const int stream_id) -{ - if (this->host_pointer == nullptr || this->device_pointer == nullptr) - { - std::cerr << "host_pointer is nullptr, can not copy host to device" - << std::endl; - exit(1); - } - checkCuda(cudaMemcpy( - this->device_pointer + stream_id * this->one_stream_size_aligned, - this->host_pointer + stream_id * this->one_stream_size_aligned, - this->one_stream_size * sizeof(T), - cudaMemcpyHostToDevice)); -} - -template -inline void Cuda_Mem_Wrapper::copy_host_to_device_async(const cudaStream_t stream, - const int stream_id) -{ - if (this->host_pointer == nullptr || this->device_pointer == nullptr) - { - std::cerr << "host_pointer is nullptr, can not copy host to device" - << std::endl; - exit(1); - } - checkCuda(cudaMemcpyAsync( - this->device_pointer + stream_id * this->one_stream_size_aligned, - this->host_pointer + stream_id * this->one_stream_size_aligned, - this->one_stream_size * sizeof(T), - cudaMemcpyHostToDevice, - stream)); -} - -template -inline void Cuda_Mem_Wrapper::copy_host_to_device_async(const cudaStream_t stream, - const int stream_id, - const int size) -{ - if (this->host_pointer == nullptr || this->device_pointer == nullptr) - { - std::cerr << "host_pointer is nullptr, can not copy host to device" - << std::endl; - exit(1); - } - checkCuda(cudaMemcpyAsync( - this->device_pointer + stream_id * this->one_stream_size_aligned, - this->host_pointer + stream_id * this->one_stream_size_aligned, - size * sizeof(T), - cudaMemcpyHostToDevice, - stream)); -} - -template -inline void Cuda_Mem_Wrapper::copy_device_to_host_sync(const int stream_id) -{ - if (this->host_pointer == nullptr || this->device_pointer == nullptr) - { - std::cerr << "host_pointer is nullptr, can not copy device to host" - << std::endl; - exit(1); - } - checkCuda(cudaMemcpy( - this->host_pointer + stream_id * this->one_stream_size_aligned, - this->device_pointer + stream_id * this->one_stream_size_aligned, - this->one_stream_size * sizeof(T), - cudaMemcpyDeviceToHost)); -} - -template -inline void Cuda_Mem_Wrapper::copy_device_to_host_async(const cudaStream_t stream, - const int stream_id) -{ - if (this->host_pointer == nullptr || this->device_pointer == nullptr) - { - std::cerr << "host_pointer is nullptr, can not copy device to host" - << std::endl; - exit(1); - } - checkCuda(cudaMemcpyAsync( - this->host_pointer + stream_id * this->one_stream_size_aligned, - this->device_pointer + stream_id * this->one_stream_size_aligned, - this->one_stream_size * sizeof(T), - cudaMemcpyDeviceToHost, - stream)); -} - -template -inline void Cuda_Mem_Wrapper::copy_device_to_host_async(const cudaStream_t stream, - const int stream_id, - const int size) -{ - if (this->host_pointer == nullptr || this->device_pointer == nullptr) - { - std::cerr << "host_pointer is nullptr, can not copy device to host" - << std::endl; - exit(1); - } - checkCuda(cudaMemcpyAsync( - this->host_pointer + stream_id * this->one_stream_size_aligned, - this->device_pointer + stream_id * this->one_stream_size_aligned, - size * sizeof(T), - cudaMemcpyDeviceToHost, - stream)); -} - -template -inline void Cuda_Mem_Wrapper::memset_device_sync(const int stream_id, const int value) -{ - checkCuda(cudaMemset(this->device_pointer - + stream_id * this->one_stream_size_aligned, - value, - this->one_stream_size * sizeof(T))); -} - -template -inline void Cuda_Mem_Wrapper::memset_device_async(const cudaStream_t stream, - const int stream_id, - const int value) -{ - checkCuda(cudaMemsetAsync(this->device_pointer - + stream_id * this->one_stream_size_aligned, - value, - this->one_stream_size * sizeof(T), - stream)); -} - -template -inline void Cuda_Mem_Wrapper::memset_host(const int stream_id, const int value) -{ - memset(this->host_pointer + stream_id * this->one_stream_size_aligned, - value, - this->one_stream_size * sizeof(T)); -} - -template -inline T* Cuda_Mem_Wrapper::get_device_pointer(const int stream_id) -{ - return this->device_pointer + stream_id * this->one_stream_size_aligned; -} - -template -inline T* Cuda_Mem_Wrapper::get_host_pointer(const int stream_id) -{ - return this->host_pointer + stream_id * this->one_stream_size_aligned; -} -template class Cuda_Mem_Wrapper; -template class Cuda_Mem_Wrapper; -template class Cuda_Mem_Wrapper; -template class Cuda_Mem_Wrapper; -template class Cuda_Mem_Wrapper; - diff --git a/source/source_lcao/module_gint/kernels/cuda/cuda_tools.cuh b/source/source_lcao/module_gint/kernels/cuda/cuda_tools.cuh deleted file mode 100644 index dab697df8c..0000000000 --- a/source/source_lcao/module_gint/kernels/cuda/cuda_tools.cuh +++ /dev/null @@ -1,123 +0,0 @@ -#ifndef CUDA_TOOLS_CUH -#define CUDA_TOOLS_CUH -#include // for assert -#include -#include // for CUDA_VERSION -#include - -#include -#include -#include - -#define checkCuda(val) check((val), #val, __FILE__, __LINE__) -#define checkCudaLastError() __getLastCudaError(__FILE__, __LINE__) - -inline void check(cudaError_t result, char const *const func, const char *const file, - int const line) { - if (result) { - fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line, - static_cast(result), cudaGetErrorString(result), func); - exit(EXIT_FAILURE); - } -} - -inline void __getLastCudaError(const char *file, - const int line) -{ - cudaError_t err = cudaGetLastError(); - - if (cudaSuccess != err) { - fprintf(stderr, - "%s(%i) : getLastCudaError() CUDA error :" - " (%d) %s.\n", - file, line, static_cast(err), - cudaGetErrorString(err)); - exit(EXIT_FAILURE); - } -} - -static inline int ceildiv(int x, int y) -{ - return (x + y - 1) / y; -} - -void dump_cuda_array_to_file(const double* cuda_array, - int width, - int hight, - const std::string& filename); - -// inline int ceil_div(int a, int b) -// { -// return (a + b - 1) / b; -// } - -/* - * @brief: A simple wrapper for cudaMalloc and cudaFree, sync and async CUDA - * memory copy - * @param: T: the type of the data - * - * @note: - * Manual management of CUDA memory is a very delicate task; complex pointers - * and malloc/free operations make it easy for us to encounter memory bugs. The - * severity of the issues increases significantly when introducing multi-node, - * multi-GPU, and multi-stream parallelism. - * Debugging after encountering bugs is also very difficult, finding the leaking - * pointer from dozens of variables can be quite a headache. - * Therefore, considering that our use and management of memory have some - * homogeneity, we have abstracted these needs into the following encapsulations - * to reduce the cost of maintenance and development. The memory is allocated in - * the constructor and freed in the destructor. - * - * The following interface is primarily designed for the following requirements: - * 1. We need to split a large task into multiple subtasks to run on multiple - * streams across multiple GPUs on multiple nodes. - * 2. It is necessary to allocate memory of the same shape on both host and - * device. - * 3. Data copying between host and device sync or async is required. - */ - -template -class Cuda_Mem_Wrapper -{ - public: - - Cuda_Mem_Wrapper(); - Cuda_Mem_Wrapper(int one_stream_size, - int one_stream_size_aligned, - int stream_number = 1, - bool malloc_host = true); - Cuda_Mem_Wrapper(int one_stream_size, - int stream_number = 1, - bool malloc_host = true); - - Cuda_Mem_Wrapper(const Cuda_Mem_Wrapper& other) = delete; - Cuda_Mem_Wrapper& operator=(const Cuda_Mem_Wrapper& other) = delete; - Cuda_Mem_Wrapper(Cuda_Mem_Wrapper&& other) noexcept; - Cuda_Mem_Wrapper& operator=(Cuda_Mem_Wrapper&& other) noexcept; - - ~Cuda_Mem_Wrapper(); - void copy_host_to_device_sync(const int stream_id = 0); - void copy_host_to_device_async(const cudaStream_t stream, const int stream_id); - void copy_host_to_device_async(const cudaStream_t stream, const int stream_id, const int size); - void copy_device_to_host_sync(const int stream_id = 0); - void copy_device_to_host_async(const cudaStream_t stream, const int stream_id); - void copy_device_to_host_async(const cudaStream_t stream, const int stream_id, const int size); - void memset_device_sync(const int stream_id = 0, const int value = 0); - void memset_device_async(const cudaStream_t stream, - const int stream_id = 0, - const int value = 0); - void memset_host(const int stream_id = 0, const int value = 0); - T* get_device_pointer(const int stream_id = 0); - T* get_host_pointer(const int stream_id = 0); - void free_all(); - - private: - T* device_pointer; - T* host_pointer; - int one_stream_size; - int one_stream_size_aligned; - int stream_number; - int total_size_aligned; -}; - -#endif // CUDA_TOOLS_CUH \ No newline at end of file diff --git a/source/source_lcao/module_gint/kernels/cuda/gemm_selector.cu b/source/source_lcao/module_gint/kernels/cuda/gemm_selector.cu deleted file mode 100644 index 6550b21edb..0000000000 --- a/source/source_lcao/module_gint/kernels/cuda/gemm_selector.cu +++ /dev/null @@ -1,138 +0,0 @@ -#include - -#include "gemm_selector.cuh" -#include "vbatch_matrix_mul.cuh" -#include "cuda_tools.cuh" -#include "source_base/module_external/blas_connector.h" -#include "code_gen.cuh" - -/* - * Here we have utilized a very straightforward and brute-force method to select - * the optimal matrix multiplication kernel for a given scale of computation: we - * compute with all scales of kernels under the current computational task to - * find the fastest parameter combination. This approach can lead to an increase - * in compilation time. - */ -void gemm_algo_selector(int matrix_k, matrix_multiple_func_type& fastest_algo,const UnitCell& ucell) -{ - int batchCount_per_type = 32; - int batchCount - = batchCount_per_type * ucell.ntype * ucell.ntype; - - Cuda_Mem_Wrapper m(batchCount); - Cuda_Mem_Wrapper n(batchCount); - Cuda_Mem_Wrapper k(batchCount); - - int max_m = ucell.nwmax, max_n = ucell.nwmax; - - Cuda_Mem_Wrapper A(batchCount * max_m * matrix_k); - Cuda_Mem_Wrapper B(batchCount * max_n * matrix_k); - Cuda_Mem_Wrapper C(batchCount * max_m * max_n); - - Cuda_Mem_Wrapper lda(batchCount); - Cuda_Mem_Wrapper ldb(batchCount); - Cuda_Mem_Wrapper ldc(batchCount); - - Cuda_Mem_Wrapper A_array(batchCount); - Cuda_Mem_Wrapper B_array(batchCount); - Cuda_Mem_Wrapper C_array(batchCount); - - for (int i = 0; i < batchCount * max_m * matrix_k; ++i) - { - A.get_host_pointer()[i] = i * 0.001; - } - for (int i = 0; i < batchCount * max_n * matrix_k; ++i) - { - B.get_host_pointer()[i] = i * 0.002; - } - - double* cpu_result = new double[batchCount * max_m * max_n]; - memset(cpu_result, 0, batchCount * max_m * max_n * sizeof(double)); - int index = 0; - for (int i = 0; i < batchCount_per_type; ++i) - { - for (int j = 0; j < ucell.ntype; j++) - { - for (int l = 0; l < ucell.ntype; l++) - { - m.get_host_pointer()[index] = ucell.atoms[j].nw; - n.get_host_pointer()[index] = ucell.atoms[l].nw; - k.get_host_pointer()[index] = matrix_k; - - lda.get_host_pointer()[index] = matrix_k; - ldb.get_host_pointer()[index] = matrix_k; - ldc.get_host_pointer()[index] = ucell.atoms[l].nw; - - A_array.get_host_pointer()[index] - = &A.get_device_pointer()[index * max_m * matrix_k]; - B_array.get_host_pointer()[index] - = &B.get_device_pointer()[index * max_n * matrix_k]; - C_array.get_host_pointer()[index] - = &C.get_device_pointer()[index * max_n - * max_m]; // test atom add - BlasConnector::gemm( - 'N', - 'T', - m.get_host_pointer()[index], - n.get_host_pointer()[index], - matrix_k, - 1.0, - &A.get_host_pointer()[index * max_m * matrix_k], - matrix_k, - &B.get_host_pointer()[index * max_n * matrix_k], - matrix_k, - 1.0, - &cpu_result[index * max_m * max_n], - n.get_host_pointer()[index]); - index++; - } - } - } - - m.copy_host_to_device_sync(); - n.copy_host_to_device_sync(); - k.copy_host_to_device_sync(); - - lda.copy_host_to_device_sync(); - ldb.copy_host_to_device_sync(); - ldc.copy_host_to_device_sync(); - - A.copy_host_to_device_sync(); - B.copy_host_to_device_sync(); - A_array.copy_host_to_device_sync(); - B_array.copy_host_to_device_sync(); - C_array.copy_host_to_device_sync(); - - cudaStream_t temp_stream; - checkCuda(cudaStreamCreate(&temp_stream)); - - float fastest_time = 1000000; - fastest_algo = vbatched_gemm_impl; - - int* d_m = m.get_device_pointer(); - int* d_n = n.get_device_pointer(); - int* d_k = k.get_device_pointer(); - - double** d_global_A_array = A_array.get_device_pointer(); - double** d_global_B_array = B_array.get_device_pointer(); - double** d_global_C_array = C_array.get_device_pointer(); - - double* h_global_C = C.get_host_pointer(); - double* d_global_C = C.get_device_pointer(); - - int* d_global_lda = lda.get_device_pointer(); - int* d_global_ldb = ldb.get_device_pointer(); - int* d_global_ldc = ldc.get_device_pointer(); - -/* - * Please do not manually modify the code in the following file; - * it should simply be generated through a loop using a short Python program. - */ -#include "code_gen.cpp" - checkCuda(cudaStreamDestroy(temp_stream)); - std::cout << " gemm_algo_selector::Fastest time: " << fastest_time << " ms" - << std::endl; - // fastest_algo = vbatched_gemm_impl; - delete[] cpu_result; -} \ No newline at end of file diff --git a/source/source_lcao/module_gint/kernels/cuda/gemm_selector.cuh b/source/source_lcao/module_gint/kernels/cuda/gemm_selector.cuh deleted file mode 100644 index 744f3c887d..0000000000 --- a/source/source_lcao/module_gint/kernels/cuda/gemm_selector.cuh +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef GEMM_SELECTOR_H -#define GEMM_SELECTOR_H - -#include "cuda_runtime.h" -#include "source_cell/unitcell.h" -typedef std::function< - void(int, int, int*, int*, int*, double**, int*, double**, int*, double**, int*, int, cudaStream_t, double* alpha)> - matrix_multiple_func_type; - -void gemm_algo_selector(int k, matrix_multiple_func_type& func, const UnitCell& ucell); - -#endif \ No newline at end of file diff --git a/source/source_lcao/module_gint/kernels/cuda/gint_force.cu b/source/source_lcao/module_gint/kernels/cuda/gint_force.cu deleted file mode 100644 index 0199c9e37a..0000000000 --- a/source/source_lcao/module_gint/kernels/cuda/gint_force.cu +++ /dev/null @@ -1,225 +0,0 @@ -#include "sph.cuh" -#include "interp.cuh" -#include "gint_force.cuh" -#include "cuda_tools.cuh" -#include "source_base/module_device/device.h" -// CUDA kernel to calculate psi and force -namespace GintKernel -{ -__inline__ __device__ double warpReduceSum(double val) -{ - val += __shfl_xor_sync(0xffffffff, val, 16, 32); - val += __shfl_xor_sync(0xffffffff, val, 8, 32); - val += __shfl_xor_sync(0xffffffff, val, 4, 32); - val += __shfl_xor_sync(0xffffffff, val, 2, 32); - val += __shfl_xor_sync(0xffffffff, val, 1, 32); - return val; -} - - -__global__ void get_psi_force(double* ylmcoef, - double delta_r, - int bxyz, - const int nwmax, - const int max_atom, - const int* const ucell_atom_nwl, - const bool* const atom_iw2_new, - const int* const atom_iw2_ylm, - const int* const atom_iw2_l, - const int* const atom_nw, - const double* const rcut, - const int nr_max, - const double* const psi_u, - const double* const mcell_pos, - const double* const dr_part, - const double* const vldr3, - const uint8_t* const atoms_type, - const int* const atoms_num_info, - double* psi, - double* dpsi, - double* d2psi) -{ - const int bcell_id = blockIdx.x; - const int num_atoms = atoms_num_info[2 * bcell_id]; - const int pre_atoms = atoms_num_info[2 * bcell_id + 1]; - const int mcell_id = blockIdx.y; - const double vldr3_value = vldr3[bcell_id*bxyz + mcell_id]; - const double mcell_pos_x = mcell_pos[3 * mcell_id]; - const double mcell_pos_y = mcell_pos[3 * mcell_id + 1]; - const double mcell_pos_z = mcell_pos[3 * mcell_id + 2]; - - for(int atom_id = threadIdx.x; atom_id < num_atoms; atom_id += blockDim.x) - { - const int dr_start = 3 * (pre_atoms + atom_id); - const double dr_x = dr_part[dr_start] + mcell_pos_x; - const double dr_y = dr_part[dr_start + 1] + mcell_pos_y; - const double dr_z = dr_part[dr_start + 2] + mcell_pos_z; - double dist = sqrt(dr_x * dr_x + dr_y * dr_y + dr_z * dr_z); - const int atype = __ldg(atoms_type + pre_atoms + atom_id); - if(dist < rcut[atype]) - { - if (dist < 1.0E-9) - { - dist += 1.0E-9; - } - // dr is different from that in interp_rho and interp_vl - double dr[3] = {dr_x, dr_y, dr_z}; - double ylma[49]; - double grly[49][3]; - const int nwl = __ldg(ucell_atom_nwl + atype); - spherical_harmonics_d(dr, dist*dist, grly, nwl, ylma, ylmcoef); - int psi_idx = ((pre_atoms + atom_id) * bxyz + mcell_id) * nwmax; - interp_f(dist, - delta_r, - atype, - nwmax, - nr_max, - atom_nw, - atom_iw2_new, - psi_u, - ylma, - atom_iw2_l, - atom_iw2_ylm, - vldr3_value, - dr, - grly, - psi_idx, - psi, - dpsi, - d2psi); - } - } -} - - -__global__ void dot_product_stress(const double* d2psi, - const double* psi_dm, - const int size, - double* stress) -{ - __shared__ double cache[32 * 6]; - const int tid = threadIdx.x; - const int stride = blockDim.x * gridDim.x; - const int warp_id = tid / 32; - const int lane_id = tid % 32; - double tmp[6] = {0.0}; - for(int id = threadIdx.x + blockIdx.x * blockDim.x; id < size; id += stride) - { - const double psi_dm_2 = psi_dm[id] * 2; - const int id_stress = id * 6; - tmp[0] += d2psi[id_stress] * psi_dm_2; - tmp[1] += d2psi[id_stress + 1] * psi_dm_2; - tmp[2] += d2psi[id_stress + 2] * psi_dm_2; - tmp[3] += d2psi[id_stress + 3] * psi_dm_2; - tmp[4] += d2psi[id_stress + 4] * psi_dm_2; - tmp[5] += d2psi[id_stress + 5] * psi_dm_2; - } - - for(int i = 0; i<6; i++) - { - tmp[i] = warpReduceSum(tmp[i]); - } - - if (lane_id == 0) - { - for (int i = 0; i < 6; i++) - { - cache[warp_id * 6 + i] = tmp[i]; - } - } - __syncthreads(); - - for (int i = 0; i < 6; i++) - { - tmp[i] = (tid < blockDim.x / 32) ? cache[tid * 6 + i] : 0; - } - - if(warp_id == 0) - { - for (int i = 0; i < 6; i++) - { - tmp[i] = warpReduceSum(tmp[i]); - } - } - - if (tid == 0) - { - for (int i = 0; i < 6; i++) - { - atomicAdd(&stress[i], tmp[i]); // Use atomicAdd() instead of atomic_add(). - } - } -} - - -__global__ void dot_product_force(const int bxyz, - const int nwmax, - const int *atoms_num_info, - const int *iat_on_nbz, - const double* dpsi, - const double* psi_dm, - double* force) -{ - __shared__ double cache[32 * 3]; - const int tid = threadIdx.x; - const int bcell_id = blockIdx.x; - const int warp_id = tid / 32; - const int lane_id = tid % 32; - const int vec_size = bxyz * nwmax; - const int atom_num = atoms_num_info[2 * bcell_id]; - const int pre_atoms = atoms_num_info[2 * bcell_id + 1]; - - for(int k = 0; k < atom_num; k++) - { - const int atom_id = pre_atoms + k; - const int offset = atom_id * vec_size; - const int iat = iat_on_nbz[atom_id]; - double force_iat[3] = {0.0}; - - for(int i =tid; i < vec_size; i += blockDim.x) - { - int psi_offset = offset + i; - double psi_dm_2 = psi_dm[psi_offset] * 2; - force_iat[0] += dpsi[psi_offset * 3] * psi_dm_2; - force_iat[1] += dpsi[psi_offset * 3 + 1] * psi_dm_2; - force_iat[2] += dpsi[psi_offset * 3 + 2] * psi_dm_2; - } - - for (int i = 0; i < 3; i++) - { - force_iat[i] = warpReduceSum(force_iat[i]); - } - - if (lane_id == 0) - { - for (int i = 0; i < 3; i++) - { - cache[warp_id * 3 + i] = force_iat[i]; - } - } - __syncthreads(); - - for (int i = 0; i < 3; i++) - { - force_iat[i] = (tid < blockDim.x / 32) ? cache[tid * 3 + i] : 0; - } - - if (warp_id == 0) - { - for (int i = 0; i < 3; i++) - { - force_iat[i] = warpReduceSum(force_iat[i]); - } - } - - if (tid == 0) - { - for (int i = 0; i < 3; i++) - { - atomicAdd(&force[iat * 3 + i], force_iat[i]); - } - } - } -} - -} // namespace GintKernel diff --git a/source/source_lcao/module_gint/kernels/cuda/gint_force.cuh b/source/source_lcao/module_gint/kernels/cuda/gint_force.cuh deleted file mode 100644 index 74b941f32a..0000000000 --- a/source/source_lcao/module_gint/kernels/cuda/gint_force.cuh +++ /dev/null @@ -1,46 +0,0 @@ -#ifndef GINT_FORCE_CUH -#define GINT_FORCE_CUH - -#include -#include -namespace GintKernel -{ - -__global__ void get_psi_force(double* ylmcoef, - double delta_r, - int bxyz, - const int nwmax, - const int max_atom, - const int* const ucell_atom_nwl, - const bool* const atom_iw2_new, - const int* const atom_iw2_ylm, - const int* const atom_iw2_l, - const int* const atom_nw, - const double* const rcut, - const int nr_max, - const double* const psi_u, - const double* const mcell_pos, - const double* const dr_part, - const double* const vldr3, - const uint8_t* const atoms_type, - const int* const atoms_num_info, - double* psi, - double* dpsi, - double* d2psi); - - -__global__ void dot_product_stress(const double* d2psi, - const double* psi_dm, - const int size, - double* stress); - -__global__ void dot_product_force(const int bxyz, - const int nwmax, - const int *atoms_num_info, - const int *iat_on_nbz, - const double* dpsi, - const double* psi_dm, - double* force); - -} // namespace GintKernel -#endif // GINT_VL_CUH diff --git a/source/source_lcao/module_gint/kernels/cuda/gint_rho.cu b/source/source_lcao/module_gint/kernels/cuda/gint_rho.cu deleted file mode 100644 index 6b4069c40b..0000000000 --- a/source/source_lcao/module_gint/kernels/cuda/gint_rho.cu +++ /dev/null @@ -1,130 +0,0 @@ -#include "interp.cuh" -#include "gint_rho.cuh" -#include "sph.cuh" -#include "cuda_tools.cuh" - -namespace GintKernel -{ -__inline__ __device__ double warpReduceSum(double val) -{ - val += __shfl_xor_sync(0xffffffff, val, 16, 32); - val += __shfl_xor_sync(0xffffffff, val, 8, 32); - val += __shfl_xor_sync(0xffffffff, val, 4, 32); - val += __shfl_xor_sync(0xffffffff, val, 2, 32); - val += __shfl_xor_sync(0xffffffff, val, 1, 32); - return val; -} - - -/* - each block calculates the wavefunction on a meshcell, - and each thread loops over the atoms on a meshcell. -*/ -__global__ void get_psi(const double* const ylmcoef, - const double delta_r, - const int bxyz, - const int nwmax, - const int max_atom, - const int* const ucell_atom_nwl, - const bool* const atom_iw2_new, - const int* const atom_iw2_ylm, - const int* const atom_nw, - const double* const rcut, - const int nr_max, - const double* const psi_u, - const double* const mcell_pos, - const double* const dr_part, - const uint8_t* const atoms_type, - const int* const atoms_num_info, - double* psi) -{ - const int bcell_id = blockIdx.x; - const int num_atoms = atoms_num_info[2 * bcell_id]; - const int pre_atoms = atoms_num_info[2 * bcell_id + 1]; - const int mcell_id = blockIdx.y; - const double mcell_pos_x = mcell_pos[3 * mcell_id]; - const double mcell_pos_y = mcell_pos[3 * mcell_id + 1]; - const double mcell_pos_z = mcell_pos[3 * mcell_id + 2]; - - for(int atom_id = threadIdx.x; atom_id < num_atoms; atom_id += blockDim.x) - { - const int aid = pre_atoms + atom_id; - const double dr_x = dr_part[aid * 3] + mcell_pos_x; - const double dr_y = dr_part[aid * 3 + 1] + mcell_pos_y; - const double dr_z = dr_part[aid * 3 + 2] + mcell_pos_z; - double dist = sqrt(dr_x * dr_x + dr_y * dr_y + dr_z * dr_z); - const int atype = __ldg(atoms_type + aid); - if(dist < rcut[atype]) - { - if (dist < 1.0E-9) - { - dist += 1.0E-9; - } - double dr[3] = {dr_x / dist, dr_y / dist, dr_z / dist}; - double ylma[49]; - const int nwl = __ldg(ucell_atom_nwl + atype); - int psi_idx = (pre_atoms * bxyz + mcell_id * num_atoms + atom_id) * nwmax; - spherical_harmonics(dr, nwl, ylma, ylmcoef); - interp_rho(dist, - delta_r, - atype, - nwmax, - nr_max, - atom_nw, - atom_iw2_new, - psi_u, - ylma, - atom_iw2_ylm, - psi, - psi_idx); - } - } -} - -/* - Each block calculates the dot product on a meshcell, - and each thread loops over the wavefunction of atoms on a meshcell. -*/ -__global__ void psir_dot(const int bxyz, - const int nwmax, - const int* atoms_num_info, - const double* __restrict__ vec_a_g, - const double* __restrict__ vec_b_g, - double** results_g) -{ - __shared__ double s_data[32]; - const int tid = threadIdx.x; - const int bcell_id = blockIdx.x; - const int mcell_id = blockIdx.y; - const int vec_size = atoms_num_info[2 * bcell_id] * nwmax; - const int offset = atoms_num_info[2 * bcell_id + 1] * nwmax * bxyz + mcell_id * vec_size; - const double* vec_a_mcell = vec_a_g + offset; - const double* vec_b_mcell = vec_b_g + offset; - const int warp_id = tid / 32; - const int lane_id = tid % 32; - double mySum = 0; - - for (int k = tid; k < vec_size; k += blockDim.x) - { - mySum += vec_a_mcell[k] * vec_b_mcell[k]; - } - - mySum = warpReduceSum(mySum); - - if (lane_id == 0) - { - s_data[warp_id] = mySum; - } - __syncthreads(); - - mySum = (tid < blockDim.x / 32) ? s_data[tid] : 0; - if (warp_id == 0) - { - mySum = warpReduceSum(mySum); - } - - if (tid == 0) { - *results_g[bcell_id*bxyz + mcell_id] = mySum; - } -} -} // namespace GintKernel \ No newline at end of file diff --git a/source/source_lcao/module_gint/kernels/cuda/gint_rho.cuh b/source/source_lcao/module_gint/kernels/cuda/gint_rho.cuh deleted file mode 100644 index 70cbbb7692..0000000000 --- a/source/source_lcao/module_gint/kernels/cuda/gint_rho.cuh +++ /dev/null @@ -1,41 +0,0 @@ -#ifndef GINT_RHO_CUH -#define GINT_RHO_CUH - -#include -#include -namespace GintKernel -{ - -/** - * @brief CUDA kernel to calculate psir. - * - * This kernel calculates the wave function psi using the provided input - * parameters. - */ -__global__ void get_psi(const double* const ylmcoef, - const double delta_r, - const int bxyz, - const int nwmax, - const int max_atom, - const int* const ucell_atom_nwl, - const bool* const atom_iw2_new, - const int* const atom_iw2_ylm, - const int* const atom_nw, - const double* const rcut, - const int nr_max, - const double* const psi_u, - const double* const mcell_pos, - const double* const dr_part, - const uint8_t* const atoms_type, - const int* const atoms_num_info, - double* psi); - -__global__ void psir_dot(const int bxyz, - const int nwmax, - const int* atoms_num_info, - const double* __restrict__ vec_a_g, - const double* __restrict__ vec_b_g, - double** results_g); - -} // namespace GintKernel -#endif // GINT_RHO_CUH \ No newline at end of file diff --git a/source/source_lcao/module_gint/kernels/cuda/gint_vl.cu b/source/source_lcao/module_gint/kernels/cuda/gint_vl.cu deleted file mode 100644 index 3b92455e60..0000000000 --- a/source/source_lcao/module_gint/kernels/cuda/gint_vl.cu +++ /dev/null @@ -1,75 +0,0 @@ -#include "gint_vl.cuh" -#include "interp.cuh" -#include "cuda_tools.cuh" -#include "sph.cuh" -namespace GintKernel -{ - -__global__ void get_psi_and_vldr3(const double* const ylmcoef, - const double delta_r, - const int bxyz, - const double nwmax, - const double max_atom, - const int* const ucell_atom_nwl, - const bool* const atom_iw2_new, - const int* const atom_iw2_ylm, - const int* const atom_nw, - const double* const rcut, - const int nr_max, - const double* const psi_u, - const double* const mcell_pos, - const double* const dr_part, - const double* const vldr3, - const uint8_t* const atoms_type, - const int* const atoms_num_info, - double* psi, - double* psi_vldr3) -{ - const int bcell_id = blockIdx.x; - const int num_atoms = atoms_num_info[2 * bcell_id]; - const int pre_atoms = atoms_num_info[2 * bcell_id + 1]; - const int mcell_id = blockIdx.y; - const double vldr3_value = vldr3[bcell_id * bxyz + mcell_id]; - const double mcell_pos_x = mcell_pos[3 * mcell_id]; - const double mcell_pos_y = mcell_pos[3 * mcell_id + 1]; - const double mcell_pos_z = mcell_pos[3 * mcell_id + 2]; - - for(int atom_id = threadIdx.x; atom_id < num_atoms; atom_id += blockDim.x) - { - const int dr_start = 3 * (pre_atoms + atom_id); - const double dr_x = dr_part[dr_start] + mcell_pos_x; - const double dr_y = dr_part[dr_start + 1] + mcell_pos_y; - const double dr_z = dr_part[dr_start + 2] + mcell_pos_z; - double dist = sqrt(dr_x * dr_x + dr_y * dr_y + dr_z * dr_z); - const int atype = __ldg(atoms_type + pre_atoms + atom_id); - if(dist < rcut[atype]) - { - if (dist < 1.0E-9) - { - dist += 1.0E-9; - } - double dr[3] = {dr_x / dist, dr_y / dist, dr_z / dist}; - double ylma[49]; - const int nwl = __ldg(ucell_atom_nwl + atype); - spherical_harmonics(dr, nwl, ylma, ylmcoef); - int psi_idx = (bcell_id * max_atom + atom_id) * bxyz * nwmax + mcell_id; - interp_vl(dist, - delta_r, - atype, - nwmax, - bxyz, - nr_max, - atom_nw, - atom_iw2_new, - psi_u, - ylma, - atom_iw2_ylm, - vldr3_value, - psi, - psi_vldr3, - psi_idx); - } - } -} - -} // namespace GintKernel \ No newline at end of file diff --git a/source/source_lcao/module_gint/kernels/cuda/gint_vl.cuh b/source/source_lcao/module_gint/kernels/cuda/gint_vl.cuh deleted file mode 100644 index ada7954968..0000000000 --- a/source/source_lcao/module_gint/kernels/cuda/gint_vl.cuh +++ /dev/null @@ -1,40 +0,0 @@ -#ifndef GINT_VL_CUH -#define GINT_VL_CUH - -#include -#include -namespace GintKernel -{ -/* - * @brief: get the value of the spherical harmonics - * - * - * @note the left and right matrix elements of the grid point integral. - * We can understand the grid point integral of the local potential term - * as the following operation: - * H = psi * vlocal * psi * dr^3. - * Here, the matrix element of the left matrix is psi, and the matrix - * element of the right matrix is vlocal * psi * dr^3. - */ -__global__ void get_psi_and_vldr3(const double* const ylmcoef, - const double delta_r, - const int bxyz, - const double nwmax, - const double max_atom, - const int* const ucell_atom_nwl, - const bool* const atom_iw2_new, - const int* const atom_iw2_ylm, - const int* const atom_nw, - const double* const rcut, - const int nr_max, - const double* const psi_u, - const double* const mcell_pos, - const double* const dr_part, - const double* const vldr3, - const uint8_t* const atoms_type, - const int* const atoms_num_info, - double* psi, - double* psi_vldr3); - -} // namespace GintKernel -#endif // GINT_VL_CUH \ No newline at end of file diff --git a/source/source_lcao/module_gint/kernels/cuda/interp.cuh b/source/source_lcao/module_gint/kernels/cuda/interp.cuh deleted file mode 100644 index 31ccf3ca2c..0000000000 --- a/source/source_lcao/module_gint/kernels/cuda/interp.cuh +++ /dev/null @@ -1,204 +0,0 @@ -#ifndef INTERP_CUH -#define INTERP_CUH - -#include - -namespace GintKernel -{ -// if exponent is an integer between 0 and 5 (the most common cases in gint), -// pow_int is much faster than std::pow -static __device__ double pow_int(double base, int exp) -{ - switch (exp) - { - case 0: - return 1.0; - case 1: - return base; - case 2: - return base * base; - case 3: - return base * base * base; - case 4: - return base * base * base * base; - case 5: - return base * base * base * base * base; - default: - double result = pow(base, exp); - return result; - } -} - -static __device__ void interp_rho(const double dist, - const double delta_r, - const int atype, - const double nwmax, - const int nr_max, - const int* __restrict__ atom_nw, - const bool* __restrict__ atom_iw2_new, - const double* __restrict__ psi_u, - const double ylma[49], - const int* __restrict__ atom_iw2_ylm, - double* psi, - int psi_idx) -{ - const double distance = dist / delta_r; - - const int ip = (int)(distance); - const double dx = distance - ip; - const double dx2 = dx * dx; - const double dx3 = dx2 * dx; - - const double c3 = 3.0 * dx2 - 2.0 * dx3; - const double c1 = 1.0 - c3; - const double c2 = (dx - 2.0 * dx2 + dx3) * delta_r; - const double c4 = (dx3 - dx2) * delta_r; - - double phi = 0.0; - const int it_nw = atype * nwmax; - int iw_nr = (it_nw * nr_max + ip) * 2; - int it_nw_iw = it_nw; - for (int iw = 0; iw < atom_nw[atype]; ++iw) - { - if (atom_iw2_new[it_nw_iw]) - { - phi = c1 * psi_u[iw_nr] + c2 * psi_u[iw_nr + 1] - + c3 * psi_u[iw_nr + 2] + c4 * psi_u[iw_nr + 3]; - } - psi[psi_idx] = phi * ylma[atom_iw2_ylm[it_nw_iw]]; - psi_idx += 1; - iw_nr += 2 * nr_max; - it_nw_iw++; - } -} - -static __device__ void interp_vl(const double dist, - const double delta_r, - const int atype, - const double nwmax, - const int bxyz, - const int nr_max, - const int* __restrict__ atom_nw, - const bool* __restrict__ atom_iw2_new, - const double* __restrict__ psi_u, - const double ylma[49], - const int* __restrict__ atom_iw2_ylm, - const double vldr3_value, - double* psi, - double* psi_vldr3, - int psi_idx) -{ - const double distance = dist / delta_r; - - const int ip = (int)(distance); - const double dx = distance - ip; - const double dx2 = dx * dx; - const double dx3 = dx2 * dx; - - const double c3 = 3.0 * dx2 - 2.0 * dx3; - const double c1 = 1.0 - c3; - const double c2 = (dx - 2.0 * dx2 + dx3) * delta_r; - const double c4 = (dx3 - dx2) * delta_r; - - double phi = 0.0; - const int it_nw = atype * nwmax; - int iw_nr = (it_nw * nr_max + ip) * 2; - int it_nw_iw = it_nw; - for (int iw = 0; iw < atom_nw[atype]; ++iw) - { - if (atom_iw2_new[it_nw_iw]) - { - phi = c1 * psi_u[iw_nr] + c2 * psi_u[iw_nr + 1] - + c3 * psi_u[iw_nr + 2] + c4 * psi_u[iw_nr + 3]; - } - psi[psi_idx] = phi * ylma[atom_iw2_ylm[it_nw_iw]]; - psi_vldr3[psi_idx] = psi[psi_idx] * vldr3_value; - psi_idx += bxyz; - iw_nr += 2 * nr_max; - it_nw_iw++; - } -} - -static __device__ void interp_f(const double dist, - const double delta_r, - const int atype, - const double nwmax, - const int nr_max, - const int* __restrict__ atom_nw, - const bool* __restrict__ atom_iw2_new, - const double* __restrict__ psi_u, - const double ylma[49], - const int* __restrict__ atom_iw2_l, - const int* __restrict__ atom_iw2_ylm, - const double vldr3_value, - const double * __restrict__ dr, - const double grly[49][3], - int psi_idx, - double* psi, - double* dpsi, - double* d2psi) -{ - // Calculate normalized position for interpolation - const double postion = dist / delta_r; - // Extract integer part and fractional part of the position - const double ip = static_cast(postion); - const double x0 = postion - ip; - const double x1 = 1.0 - x0; - const double x2 = 2.0 - x0; - const double x3 = 3.0 - x0; - const double x12 = x1 * x2 / 6; - const double x03 = x0 * x3 / 2; - // Temporary variables for interpolation - double tmp = 0.0; - double dtmp = 0.0; - // Loop over non-zero elements in atom_nw array - const int it_nw = atype * nwmax; - int iw_nr = (it_nw * nr_max + ip) * 2; - int it_nw_iw = it_nw; - for (int iw = 0; iw < atom_nw[atype]; ++iw) - { - if (atom_iw2_new[it_nw_iw]) - { - // Perform interpolation using cubic B-spline - // basis functions - tmp = x12 * (psi_u[iw_nr] * x3 + psi_u[iw_nr + 6] * x0) - + x03 * (psi_u[iw_nr + 2] * x2 - psi_u[iw_nr + 4] * x1); - dtmp = x12 * (psi_u[iw_nr + 1] * x3 + psi_u[iw_nr + 7] * x0) - + x03 * (psi_u[iw_nr + 3] * x2 - psi_u[iw_nr + 5] * x1); - } - // Extract information from atom_iw2_* arrays - const int ll = atom_iw2_l[it_nw_iw]; - const int idx_lm = atom_iw2_ylm[it_nw_iw]; - const double rl = pow_int(dist, ll); - const double rl_r = 1.0 / rl; - const double dist_r = 1 / dist; - const int dpsi_idx = psi_idx * 3; - const int d2psi_idx = psi_idx * 6; - // Compute derivatives with respect to spatial - // coordinates - const double tmpdphi_rly - = (dtmp - tmp * ll * dist_r) * rl_r * ylma[idx_lm] * dist_r; - const double tmprl = tmp * rl_r; - const double dpsirx = tmpdphi_rly * dr[0] + tmprl * grly[idx_lm][0]; - const double dpsiry = tmpdphi_rly * dr[1] + tmprl * grly[idx_lm][1]; - const double dpsirz = tmpdphi_rly * dr[2] + tmprl * grly[idx_lm][2]; - - psi[psi_idx] = tmprl * ylma[idx_lm] * vldr3_value; - dpsi[dpsi_idx] = dpsirx; - dpsi[dpsi_idx + 1] = dpsiry; - dpsi[dpsi_idx + 2] = dpsirz; - d2psi[d2psi_idx] = dpsirx * dr[0]; - d2psi[d2psi_idx + 1] = dpsirx * dr[1]; - d2psi[d2psi_idx + 2] = dpsirx * dr[2]; - d2psi[d2psi_idx + 3] = dpsiry * dr[1]; - d2psi[d2psi_idx + 4] = dpsiry * dr[2]; - d2psi[d2psi_idx + 5] = dpsirz * dr[2]; - // Update loop counters and indices - psi_idx += 1; - iw_nr += 2 * nr_max; - it_nw_iw++; - } -} -} // namespace GintKernel - -#endif \ No newline at end of file diff --git a/source/source_lcao/module_gint/kernels/cuda/sph.cuh b/source/source_lcao/module_gint/kernels/cuda/sph.cuh deleted file mode 100644 index fec963d9fd..0000000000 --- a/source/source_lcao/module_gint/kernels/cuda/sph.cuh +++ /dev/null @@ -1,519 +0,0 @@ -#ifndef SPH_CUH -#define SPH_CUH - -#include "cuda_runtime.h" -#include "device_launch_parameters.h" - -namespace GintKernel -{ - -static __device__ void spherical_harmonics(const double* const dr, - const int nwl, - double (&ylma)[49], - const double* const ylmcoef) -{ - /*************************** - L = 0 - ***************************/ - ylma[0] = ylmcoef[0]; // l=0, m=0 - double tmp0; - if (nwl == 0) - return; - - /*************************** - L = 1 - ***************************/ - ylma[1] = ylmcoef[1] * dr[2]; // l=1, m=0 - ylma[2] = -ylmcoef[1] * dr[0]; // l=1, m=1 - ylma[3] = -ylmcoef[1] * dr[1]; // l=1, m=-1 - if (nwl == 1) - return; - - /*************************** - L = 2 - ***************************/ - tmp0=ylmcoef[3] * ylma[0]; - ylma[4] = ylmcoef[2] * dr[2] * ylma[1] - tmp0 ; // l=2, m=0 - tmp0 = ylmcoef[4] * dr[2]; - ylma[5] = tmp0 * ylma[2]; // l=2,m=1 - ylma[6] = tmp0 * ylma[3]; // l=2,m=-1 - - tmp0 = ylmcoef[4] * dr[0]; - ylma[7] = ylmcoef[5] * ylma[4] - ylmcoef[6] * ylma[0] - - tmp0 * ylma[2]; // l=2,m=2 - ylma[8] = -tmp0 * ylma[3]; - if (nwl == 2) - return; - - /*************************** - L = 3 - ***************************/ - tmp0=ylmcoef[8] * ylma[1]; - ylma[9] = ylmcoef[7] * dr[2] * ylma[4] - tmp0; // l=3, m=0 - - tmp0 = ylmcoef[9] * dr[2]; - ylma[10] = tmp0 * ylma[5] - ylmcoef[10] * ylma[2]; // l=3,m=1 - ylma[11] = tmp0 * ylma[6] - ylmcoef[10] * ylma[3]; // l=3,m=-1 - - tmp0 = ylmcoef[11] * dr[2]; - ylma[12] = tmp0 * ylma[7]; // l=3,m=2 - ylma[13] = tmp0 * ylma[8]; // l=3,m=-2 - - tmp0 = ylmcoef[14] * dr[0]; - ylma[14] = ylmcoef[12] * ylma[10] - ylmcoef[13] * ylma[2] - - tmp0 * ylma[7]; // l=3,m=3 - ylma[15] = ylmcoef[12] * ylma[11] - ylmcoef[13] * ylma[3] - - tmp0 * ylma[8]; // l=3,m=-3 - if (nwl == 3) - return; - - /*************************** - L = 4 - ***************************/ - tmp0=ylmcoef[16] * ylma[4]; - ylma[16] = ylmcoef[15] * dr[2] * ylma[9] - tmp0; // l=4,m=0 - - tmp0 = ylmcoef[17] * dr[2]; - ylma[17] = tmp0 * ylma[10] - ylmcoef[18] * ylma[5]; // l=4,m=1 - ylma[18] = tmp0 * ylma[11] - ylmcoef[18] * ylma[6]; // l=4,m=-1 - - tmp0 = ylmcoef[19] * dr[2]; - ylma[19] = tmp0 * ylma[12] - ylmcoef[20] * ylma[7]; // l=4,m=2 - ylma[20] = tmp0 * ylma[13] - ylmcoef[20] * ylma[8]; // l=4,m=-2 - - tmp0 = 3.0 * dr[2]; - ylma[21] = tmp0 * ylma[14]; // l=4,m=3 - ylma[22] = tmp0 * ylma[15]; // l=4,m=-3 - - tmp0 = ylmcoef[23] * dr[0]; - ylma[23] = ylmcoef[21] * ylma[19] - ylmcoef[22] * ylma[7] - - tmp0 * ylma[14]; // l=4,m=4 - ylma[24] = ylmcoef[21] * ylma[20] - ylmcoef[22] * ylma[8] - - tmp0 * ylma[15]; // l=4,m=-4 - if (nwl == 4) - return; - - /*************************** - L = 5 - ***************************/ - tmp0=ylmcoef[25] * ylma[9]; - ylma[25] - = ylmcoef[24] * dr[2] * ylma[16] - tmp0; // l=5,m=0 - - tmp0 = ylmcoef[26] * dr[2]; - ylma[26] = tmp0 * ylma[17] - ylmcoef[27] * ylma[10]; // l=5,m=1 - ylma[27] = tmp0 * ylma[18] - ylmcoef[27] * ylma[11]; // l=5,m=-1 - - tmp0 = ylmcoef[28] * dr[2]; - ylma[28] = tmp0 * ylma[19] - ylmcoef[29] * ylma[12]; // l=5,m=2 - ylma[29] = tmp0 * ylma[20] - ylmcoef[29] * ylma[13]; // l=5,m=-2 - - tmp0 = ylmcoef[30] * dr[2]; - ylma[30] = tmp0 * ylma[21] - ylmcoef[31] * ylma[14]; // l=5,m=3 - ylma[31] = tmp0 * ylma[22] - ylmcoef[31] * ylma[15]; // l=5,m=-3 - - tmp0 = ylmcoef[32] * dr[2]; - ylma[32] = tmp0 * ylma[23]; // l=5,m=4 - ylma[33] = tmp0 * ylma[24]; // l=5,m=-4 - - tmp0 = ylmcoef[35] * dr[0]; - ylma[34] = ylmcoef[33] * ylma[30] - ylmcoef[34] * ylma[14] - - tmp0 * ylma[23]; // l=5,m=5 - ylma[35] = ylmcoef[33] * ylma[31] - ylmcoef[34] * ylma[15] - - tmp0 * ylma[24]; // l=5,m=-5 - if (nwl == 5) - return; - /* - // if nwl > 5 - for (int il = 6; il <= nwl; il++) - { - int istart = il * il; - int istart1 = (il - 1) * (il - 1); - int istart2 = (il - 2) * (il - 2); - - double fac2 = sqrt(4.0 * istart - 1.0); - double fac4 = sqrt(4.0 * istart1 - 1.0); - - for (int im = 0; im < 2 * il - 1; im++) - { - int imm = (im + 1) / 2; - ylma[istart + im] = fac2 / sqrt((double)istart - imm * imm) * (dr[2] - * ylma[istart1 + im] - sqrt((double)istart1 - imm * imm) / fac4 * - ylma[istart2 + im]); - } - - double bl1 = sqrt(2.0 * il / (2.0 * il + 1.0)); - double bl2 = sqrt((2.0 * il - 2.0) / (2.0 * il - 1.0)); - double bl3 = sqrt(2.0) / fac2; - - ylma[istart + 2 * il - 1] = (bl3 * ylma[istart + 2 * il - 5] - bl2 * - ylma[istart2 + 2 * il - 5] - 2.0 * dr[0] * ylma[istart1 + 2 * il - 3]) / - bl1; ylma[istart + 2 * il] = (bl3 * ylma[istart + 2 * il - 4] - bl2 * - ylma[istart2 + 2 * il - 4] - 2.0 * dr[0] * ylma[istart1 + 2 * il - 2]) / - bl1; - }*/ -} - -static __device__ void spherical_harmonics_d(const double* const dr, - const double distance, - double (&grly)[49][3], - const int nwl, - double (&ylma)[49], - const double* const ylmcoef) -{ - double tmp0; - double tx = 2.0 * dr[0]; - double ty = 2.0 * dr[1]; - double tz = 2.0 * dr[2]; - ylma[0] = ylmcoef[0]; // l=0, m=0 - grly[0][0] = grly[0][1] = grly[0][2] = 0.0; - if (nwl == 0) - return; - - /*************************** - L = 1 - ***************************/ - ylma[1] = ylmcoef[1] * dr[2]; // l=1, m=0 - grly[1][0] = grly[1][1] = 0.0; - grly[1][2] = ylmcoef[1]; - ylma[2] = -ylmcoef[1] * dr[0]; // l=1, m=1 - grly[2][1] = grly[2][2] = 0.0; - grly[2][0] = -ylmcoef[1]; - ylma[3] = -ylmcoef[1] * dr[1]; // l=1, m=-1 - grly[3][0] = grly[3][2] = 0.0; - grly[3][1] = -ylmcoef[1]; - if (nwl == 1) - return; - - /*************************** - L = 2 - ***************************/ - ylma[4] = ylmcoef[2] * dr[2] * ylma[1] - - ylmcoef[3] * ylma[0] * distance; // l=2, m=0 - grly[4][0] - = ylmcoef[2] * dr[2] * grly[1][0] - - ylmcoef[3] * (grly[0][0] * distance + ylma[0] * tx); // l=2, m=0 - grly[4][1] - = ylmcoef[2] * dr[2] * grly[1][1] - - ylmcoef[3] * (grly[0][1] * distance + ylma[0] * ty); // l=2, m=0 - grly[4][2] - = ylmcoef[2] * (dr[2] * grly[1][2] + ylma[1]) - - ylmcoef[3] * (grly[0][2] * distance + ylma[0] * tz); // l=2, m=0 - - tmp0 = ylmcoef[4] * dr[2]; - ylma[5] = tmp0 * ylma[2]; // l=2,m=1 - grly[5][0] = tmp0 * grly[2][0]; - grly[5][1] = tmp0 * grly[2][1]; - grly[5][2] = ylmcoef[4] * (ylma[2] + dr[2] * grly[2][2]); - - ylma[6] = tmp0 * ylma[3]; // l=2,m=-1 - grly[6][0] = tmp0 * grly[3][0]; - grly[6][1] = tmp0 * grly[3][1]; - grly[6][2] = ylmcoef[4] * (ylma[3] + dr[2] * grly[3][2]); - - tmp0 = ylmcoef[4] * dr[0]; - ylma[7] = ylmcoef[5] * ylma[4] - ylmcoef[6] * ylma[0] * distance - - tmp0 * ylma[2]; // l=2,m=2 - grly[7][0] = ylmcoef[5] * grly[4][0] - - ylmcoef[6] * (ylma[0] * tx + grly[0][0] * distance) - - ylmcoef[4] * (dr[0] * grly[2][0] + ylma[2]); - grly[7][1] = ylmcoef[5] * grly[4][1] - - ylmcoef[6] * (ylma[0] * ty + grly[0][1] * distance) - - tmp0 * grly[2][1]; - grly[7][2] = ylmcoef[5] * grly[4][2] - - ylmcoef[6] * (ylma[0] * tz + grly[0][2] * distance) - - tmp0 * grly[2][2]; - - ylma[8] = -tmp0 * ylma[3]; - grly[8][0] = -ylmcoef[4] * (ylma[3] + dr[0] * grly[3][0]); - grly[8][1] = -tmp0 * grly[3][1]; - grly[8][2] = -tmp0 * grly[3][2]; - if (nwl == 2) - return; - - /*************************** - L = 3 - ***************************/ - ylma[9] = ylmcoef[7] * dr[2] * ylma[4] - - ylmcoef[8] * ylma[1] * distance; // l=3, m=0 - grly[9][0] = ylmcoef[7] * dr[2] * grly[4][0] - - ylmcoef[8] * (ylma[1] * tx + grly[1][0] * distance); - grly[9][1] = ylmcoef[7] * dr[2] * grly[4][1] - - ylmcoef[8] * (ylma[1] * ty + grly[1][1] * distance); - grly[9][2] = ylmcoef[7] * (ylma[4] + dr[2] * grly[4][2]) - - ylmcoef[8] * (ylma[1] * tz + grly[1][2] * distance); - - tmp0 = ylmcoef[9] * dr[2]; - ylma[10] = tmp0 * ylma[5] - ylmcoef[10] * ylma[2] * distance; // l=3,m=1 - grly[10][0] = tmp0 * grly[5][0] - - ylmcoef[10] * (grly[2][0] * distance + ylma[2] * tx); - grly[10][1] = tmp0 * grly[5][1] - - ylmcoef[10] * (grly[2][1] * distance + ylma[2] * ty); - grly[10][2] = ylmcoef[9] * (dr[2] * grly[5][2] + ylma[5]) - - ylmcoef[10] * (grly[2][2] * distance + ylma[2] * tz); - - ylma[11] = tmp0 * ylma[6] - ylmcoef[10] * ylma[3] * distance; // l=3,m=-1 - grly[11][0] = tmp0 * grly[6][0] - - ylmcoef[10] * (grly[3][0] * distance + ylma[3] * tx); - grly[11][1] = tmp0 * grly[6][1] - - ylmcoef[10] * (grly[3][1] * distance + ylma[3] * ty); - grly[11][2] = ylmcoef[9] * (dr[2] * grly[6][2] + ylma[6]) - - ylmcoef[10] * (grly[3][2] * distance + ylma[3] * tz); - - tmp0 = ylmcoef[11] * dr[2]; - ylma[12] = tmp0 * ylma[7]; // l=3,m=2 - grly[12][0] = tmp0 * grly[7][0]; - grly[12][1] = tmp0 * grly[7][1]; - grly[12][2] = ylmcoef[11] * (dr[2] * grly[7][2] + ylma[7]); - - ylma[13] = tmp0 * ylma[8]; // l=3,m=-2 - grly[13][0] = tmp0 * grly[8][0]; - grly[13][1] = tmp0 * grly[8][1]; - grly[13][2] = ylmcoef[11] * (dr[2] * grly[8][2] + ylma[8]); - - tmp0 = ylmcoef[14] * dr[0]; - ylma[14] = ylmcoef[12] * ylma[10] - ylmcoef[13] * ylma[2] * distance - - tmp0 * ylma[7]; // l=3,m=3 - grly[14][0] = ylmcoef[12] * grly[10][0] - - ylmcoef[13] * (ylma[2] * tx + grly[2][0] * distance) - - ylmcoef[14] * (ylma[7] + dr[0] * grly[7][0]); - grly[14][1] = ylmcoef[12] * grly[10][1] - - ylmcoef[13] * (ylma[2] * ty + grly[2][1] * distance) - - tmp0 * grly[7][1]; - grly[14][2] = ylmcoef[12] * grly[10][2] - - ylmcoef[13] * (ylma[2] * tz + grly[2][2] * distance) - - tmp0 * grly[7][2]; - - ylma[15] = ylmcoef[12] * ylma[11] - ylmcoef[13] * ylma[3] * distance - - tmp0 * ylma[8]; // l=3,m=-3 - grly[15][0] = ylmcoef[12] * grly[11][0] - - ylmcoef[13] * (ylma[3] * tx + grly[3][0] * distance) - - ylmcoef[14] * (ylma[8] + dr[0] * grly[8][0]); - grly[15][1] = ylmcoef[12] * grly[11][1] - - ylmcoef[13] * (ylma[3] * ty + grly[3][1] * distance) - - tmp0 * grly[8][1]; - grly[15][2] = ylmcoef[12] * grly[11][2] - - ylmcoef[13] * (ylma[3] * tz + grly[3][2] * distance) - - tmp0 * grly[8][2]; - if (nwl == 3) - return; - - /*************************** - L = 4 - ***************************/ - ylma[16] = ylmcoef[15] * dr[2] * ylma[9] - - ylmcoef[16] * ylma[4] * distance; // l=4,m=0 - grly[16][0] = ylmcoef[15] * dr[2] * grly[9][0] - - ylmcoef[16] * (ylma[4] * tx + grly[4][0] * distance); - grly[16][1] = ylmcoef[15] * dr[2] * grly[9][1] - - ylmcoef[16] * (ylma[4] * ty + grly[4][1] * distance); - grly[16][2] = ylmcoef[15] * (dr[2] * grly[9][2] + ylma[9]) - - ylmcoef[16] * (ylma[4] * tz + grly[4][2] * distance); - - tmp0 = ylmcoef[17] * dr[2]; - ylma[17] = tmp0 * ylma[10] - ylmcoef[18] * ylma[5] * distance; // l=4,m=1 - grly[17][0] = tmp0 * grly[10][0] - - ylmcoef[18] * (ylma[5] * tx + grly[5][0] * distance); - grly[17][1] = tmp0 * grly[10][1] - - ylmcoef[18] * (ylma[5] * ty + grly[5][1] * distance); - grly[17][2] = ylmcoef[17] * (dr[2] * grly[10][2] + ylma[10]) - - ylmcoef[18] * (ylma[5] * tz + grly[5][2] * distance); - - ylma[18] = tmp0 * ylma[11] - ylmcoef[18] * ylma[6] * distance; // l=4,m=-1 - grly[18][0] = tmp0 * grly[11][0] - - ylmcoef[18] * (ylma[6] * tx + grly[6][0] * distance); - grly[18][1] = tmp0 * grly[11][1] - - ylmcoef[18] * (ylma[6] * ty + grly[6][1] * distance); - grly[18][2] = ylmcoef[17] * (dr[2] * grly[11][2] + ylma[11]) - - ylmcoef[18] * (ylma[6] * tz + grly[6][2] * distance); - - tmp0 = ylmcoef[19] * dr[2]; - ylma[19] = tmp0 * ylma[12] - ylmcoef[20] * ylma[7] * distance; // l=4,m=2 - grly[19][0] = tmp0 * grly[12][0] - - ylmcoef[20] * (ylma[7] * tx + grly[7][0] * distance); - grly[19][1] = tmp0 * grly[12][1] - - ylmcoef[20] * (ylma[7] * ty + grly[7][1] * distance); - grly[19][2] = ylmcoef[19] * (dr[2] * grly[12][2] + ylma[12]) - - ylmcoef[20] * (ylma[7] * tz + grly[7][2] * distance); - - ylma[20] = tmp0 * ylma[13] - ylmcoef[20] * ylma[8] * distance; // l=4,m=-2 - grly[20][0] = tmp0 * grly[13][0] - - ylmcoef[20] * (ylma[8] * tx + grly[8][0] * distance); - grly[20][1] = tmp0 * grly[13][1] - - ylmcoef[20] * (ylma[8] * ty + grly[8][1] * distance); - grly[20][2] = ylmcoef[19] * (dr[2] * grly[13][2] + ylma[13]) - - ylmcoef[20] * (ylma[8] * tz + grly[8][2] * distance); - - tmp0 = 3.0 * dr[2]; - ylma[21] = tmp0 * ylma[14]; // l=4,m=3 - grly[21][0] = tmp0 * grly[14][0]; - grly[21][1] = tmp0 * grly[14][1]; - grly[21][2] = 3.0 * (dr[2] * grly[14][2] + ylma[14]); - - ylma[22] = tmp0 * ylma[15]; // l=4,m=-3 - grly[22][0] = tmp0 * grly[15][0]; - grly[22][1] = tmp0 * grly[15][1]; - grly[22][2] = 3.0 * (dr[2] * grly[15][2] + ylma[15]); - - tmp0 = ylmcoef[23] * dr[0]; - ylma[23] = ylmcoef[21] * ylma[19] - ylmcoef[22] * ylma[7] * distance - - tmp0 * ylma[14]; // l=4,m=4 - grly[23][0] = ylmcoef[21] * grly[19][0] - - ylmcoef[22] * (ylma[7] * tx + grly[7][0] * distance) - - ylmcoef[23] * (dr[0] * grly[14][0] + ylma[14]); - grly[23][1] = ylmcoef[21] * grly[19][1] - - ylmcoef[22] * (ylma[7] * ty + grly[7][1] * distance) - - tmp0 * grly[14][1]; - grly[23][2] = ylmcoef[21] * grly[19][2] - - ylmcoef[22] * (ylma[7] * tz + grly[7][2] * distance) - - tmp0 * grly[14][2]; - - ylma[24] = ylmcoef[21] * ylma[20] - ylmcoef[22] * ylma[8] * distance - - tmp0 * ylma[15]; // l=4,m=-4 - grly[24][0] = ylmcoef[21] * grly[20][0] - - ylmcoef[22] * (ylma[8] * tx + grly[8][0] * distance) - - ylmcoef[23] * (dr[0] * grly[15][0] + ylma[15]); - grly[24][1] = ylmcoef[21] * grly[20][1] - - ylmcoef[22] * (ylma[8] * ty + grly[8][1] * distance) - - tmp0 * grly[15][1]; - grly[24][2] = ylmcoef[21] * grly[20][2] - - ylmcoef[22] * (ylma[8] * tz + grly[8][2] * distance) - - tmp0 * grly[15][2]; - if (nwl == 4) - return; - - /*************************** - L = 5 - ***************************/ - ylma[25] = ylmcoef[24] * dr[2] * ylma[16] - - ylmcoef[25] * ylma[9] * distance; // l=5,m=0 - grly[25][0] = ylmcoef[24] * dr[2] * grly[16][0] - - ylmcoef[25] * (ylma[9] * tx + grly[9][0] * distance); - grly[25][1] = ylmcoef[24] * dr[2] * grly[16][1] - - ylmcoef[25] * (ylma[9] * ty + grly[9][1] * distance); - grly[25][2] = ylmcoef[24] * (dr[2] * grly[16][2] + ylma[16]) - - ylmcoef[25] * (ylma[9] * tz + grly[9][2] * distance); - - tmp0 = ylmcoef[26] * dr[2]; - ylma[26] = tmp0 * ylma[17] - ylmcoef[27] * ylma[10] * distance; // l=5,m=1 - grly[26][0] = tmp0 * grly[17][0] - - ylmcoef[27] * (ylma[10] * tx + grly[10][0] * distance); - grly[26][1] = tmp0 * grly[17][1] - - ylmcoef[27] * (ylma[10] * ty + grly[10][1] * distance); - grly[26][2] = ylmcoef[26] * (dr[2] * grly[17][2] + ylma[17]) - - ylmcoef[27] * (ylma[10] * tz + grly[10][2] * distance); - - ylma[27] = tmp0 * ylma[18] - ylmcoef[27] * ylma[11] * distance; // l=5,m=-1 - grly[27][0] = tmp0 * grly[18][0] - - ylmcoef[27] * (ylma[11] * tx + grly[11][0] * distance); - grly[27][1] = tmp0 * grly[18][1] - - ylmcoef[27] * (ylma[11] * ty + grly[11][1] * distance); - grly[27][2] = ylmcoef[26] * (dr[2] * grly[18][2] + ylma[18]) - - ylmcoef[27] * (ylma[11] * tz + grly[11][2] * distance); - - tmp0 = ylmcoef[28] * dr[2]; - ylma[28] = tmp0 * ylma[19] - ylmcoef[29] * ylma[12] * distance; // l=5,m=2 - grly[28][0] = tmp0 * grly[19][0] - - ylmcoef[29] * (ylma[12] * tx + grly[12][0] * distance); - grly[28][1] = tmp0 * grly[19][1] - - ylmcoef[29] * (ylma[12] * ty + grly[12][1] * distance); - grly[28][2] = ylmcoef[28] * (dr[2] * grly[19][2] + ylma[19]) - - ylmcoef[29] * (ylma[12] * tz + grly[12][2] * distance); - - ylma[29] = tmp0 * ylma[20] - ylmcoef[29] * ylma[13] * distance; // l=5,m=-2 - grly[29][0] = tmp0 * grly[20][0] - - ylmcoef[29] * (ylma[13] * tx + grly[13][0] * distance); - grly[29][1] = tmp0 * grly[20][1] - - ylmcoef[29] * (ylma[13] * ty + grly[13][1] * distance); - grly[29][2] = ylmcoef[28] * (dr[2] * grly[20][2] + ylma[20]) - - ylmcoef[29] * (ylma[13] * tz + grly[13][2] * distance); - - tmp0 = ylmcoef[30] * dr[2]; - ylma[30] = tmp0 * ylma[21] - ylmcoef[31] * ylma[14] * distance; // l=5,m=3 - grly[30][0] = tmp0 * grly[21][0] - - ylmcoef[31] * (grly[14][0] * distance + ylma[14] * tx); - grly[30][1] = tmp0 * grly[21][1] - - ylmcoef[31] * (grly[14][1] * distance + ylma[14] * ty); - grly[30][2] = ylmcoef[30] * (dr[2] * grly[21][2] + ylma[21]) - - ylmcoef[31] * (ylma[14] * tz + grly[14][2] * distance); - - ylma[31] = tmp0 * ylma[22] - ylmcoef[31] * ylma[15] * distance; // l=5,m=-3 - grly[31][0] = tmp0 * grly[22][0] - - ylmcoef[31] * (grly[15][0] * distance + ylma[15] * tx); - grly[31][1] = tmp0 * grly[22][1] - - ylmcoef[31] * (grly[15][1] * distance + ylma[15] * ty); - grly[31][2] = ylmcoef[30] * (dr[2] * grly[22][2] + ylma[22]) - - ylmcoef[31] * (ylma[15] * tz + grly[15][2] * distance); - - tmp0 = ylmcoef[32] * dr[2]; - ylma[32] = tmp0 * ylma[23]; // l=5,m=4 - grly[32][0] = tmp0 * grly[23][0]; - grly[32][1] = tmp0 * grly[23][1]; - grly[32][2] = ylmcoef[32] * (ylma[23] + dr[2] * grly[23][2]); - - ylma[33] = tmp0 * ylma[24]; // l=5,m=-4 - grly[33][0] = tmp0 * grly[24][0]; - grly[33][1] = tmp0 * grly[24][1]; - grly[33][2] = ylmcoef[32] * (ylma[24] + dr[2] * grly[24][2]); - - tmp0 = ylmcoef[35] * dr[0]; - ylma[34] = ylmcoef[33] * ylma[30] - ylmcoef[34] * ylma[14] * distance - - tmp0 * ylma[23]; // l=5,m=5 - grly[34][0] = ylmcoef[33] * grly[30][0] - - ylmcoef[34] * (ylma[14] * tx + grly[14][0] * distance) - - ylmcoef[35] * (dr[0] * grly[23][0] + ylma[23]); - grly[34][1] = ylmcoef[33] * grly[30][1] - - ylmcoef[34] * (ylma[14] * ty + grly[14][1] * distance) - - tmp0 * grly[23][1]; - grly[34][2] = ylmcoef[33] * grly[30][2] - - ylmcoef[34] * (ylma[14] * tz + grly[14][2] * distance) - - tmp0 * grly[23][2]; - - ylma[35] = ylmcoef[33] * ylma[31] - ylmcoef[34] * ylma[15] * distance - - tmp0 * ylma[24]; // l=5,m=-5 - grly[35][0] = ylmcoef[33] * grly[31][0] - - ylmcoef[34] * (ylma[15] * tx + grly[15][0] * distance) - - ylmcoef[35] * (dr[0] * grly[24][0] + ylma[24]); - grly[35][1] = ylmcoef[33] * grly[31][1] - - ylmcoef[34] * (ylma[15] * ty + grly[15][1] * distance) - - tmp0 * grly[24][1]; - grly[35][2] = ylmcoef[33] * grly[31][2] - - ylmcoef[34] * (ylma[15] * tz + grly[15][2] * distance) - - tmp0 * grly[24][2]; - - if (nwl == 5) - return; - /* - // if nwl > 5 - for (int il = 6; il <= nwl; il++) - { - int istart = il * il; - int istart1 = (il - 1) * (il - 1); - int istart2 = (il - 2) * (il - 2); - - double fac2 = sqrt(4.0 * istart - 1.0); - double fac4 = sqrt(4.0 * istart1 - 1.0); - - for (int im = 0; im < 2 * il - 1; im++) - { - int imm = (im + 1) / 2; - ylma[istart + im] = fac2 / sqrt((double)istart - imm * imm) * (dr[2] - * ylma[istart1 + im] - sqrt((double)istart1 - imm * imm) / fac4 * - ylma[istart2 + im]); - } - - double bl1 = sqrt(2.0 * il / (2.0 * il + 1.0)); - double bl2 = sqrt((2.0 * il - 2.0) / (2.0 * il - 1.0)); - double bl3 = sqrt(2.0) / fac2; - - ylma[istart + 2 * il - 1] = (bl3 * ylma[istart + 2 * il - 5] - bl2 * - ylma[istart2 + 2 * il - 5] - 2.0 * dr[0] * ylma[istart1 + 2 * il - 3]) / - bl1; ylma[istart + 2 * il] = (bl3 * ylma[istart + 2 * il - 4] - bl2 * - ylma[istart2 + 2 * il - 4] - 2.0 * dr[0] * ylma[istart1 + 2 * il - 2]) / - bl1; - }*/ -} - -} // namespace GintKernel - -#endif \ No newline at end of file diff --git a/source/source_lcao/module_gint/kernels/cuda/vbatch_matrix_mul.cuh b/source/source_lcao/module_gint/kernels/cuda/vbatch_matrix_mul.cuh deleted file mode 100644 index 77cbec17f6..0000000000 --- a/source/source_lcao/module_gint/kernels/cuda/vbatch_matrix_mul.cuh +++ /dev/null @@ -1,545 +0,0 @@ -#ifndef VBATCH_MATRIX_MUL_CUH -#define VBATCH_MATRIX_MUL_CUH -#include "cuda_tools.cuh" -#include "source_pw/module_pwdft/global.h" -#include "source_base/module_device/device.h" -#include "source_cell/unitcell.h" - -#include // for assert -#include -#include // for CUDA_VERSION -#include -#include -#include // for fprintf and stderr - -#define sA(i, j) sA[(j)*slda + (i)] -#define sB(i, j) sB[(j)*sldb + (i)] -#define fetch(A, m, n, bound) offs_d##A[min(n * LD##A + m, bound)] - -template -static __device__ void vbatched_gemm_device(int M, - int N, - int K, - T* __restrict__ A, - int LDA, - T* __restrict__ B, - int LDB, - T* __restrict__ C, - int LDC, - T* sA, - int slda, - T* sB, - int sldb, - T alpha) -{ - int idx = threadIdx.x; // thread's m dimension - int idy = threadIdx.y; // thread's n dimension - - int idt = DIM_X * idy + idx; // thread's global number - - int idxA = idt % DIM_XA; // idx within A - int idyA = idt / DIM_XA; // idy within A - - int idxB = idt % DIM_XB; // idx within B - int idyB = idt / DIM_XB; // idy within B - - int blx = blockIdx.x; // block's m dimension - int bly = blockIdx.y; // block's n dimension - - // Registers for the innermost loop - T rC[THR_N][THR_M]; - T rA[THR_M]; - T rB[THR_N]; - - // Registers for the dev->shmem copy - T ra[BLK_M / DIM_YA][BLK_K / DIM_XA]; - T rb[BLK_N / DIM_YB][BLK_K / DIM_XB]; - - // bound is the correction to offs_d in order to not get out of memory bound - // so bound could be negative value since offs_d could be out of bound - T* offs_dA = A + blx * BLK_M * LDA + idyA * LDA + idxA; - int boundA = (LDA * (M - 1) + K) - (blx * BLK_M * LDA + idyA * LDA + idxA) - 1; - - T* offs_dB = B + bly * BLK_N * LDB + idyB * LDB + idxB; - int boundB = (LDB * (N - 1) + K) - (bly * BLK_N * LDB + idyB * LDB + idxB) - 1; - - int m, n, k, kk; - -// Zero C -#pragma unroll - for (n = 0; n < THR_N; n++) - { -#pragma unroll - for (m = 0; m < THR_M; m++) - { - rC[n][m] = 0.0; - } - } - -// Load A dev->shmem -#pragma unroll - for (n = 0; n < BLK_M; n += DIM_YA) - { -#pragma unroll - for (m = 0; m < BLK_K; m += DIM_XA) - { - sA(n + idyA, m + idxA) = fetch(A, m, n, boundA); - } - } - -#pragma unroll - for (n = 0; n < BLK_N; n += DIM_YB) - { -#pragma unroll - for (m = 0; m < BLK_K; m += DIM_XB) - { - sB(m + idxB, n + idyB) = fetch(B, m, n, boundB); - } - } - - __syncthreads(); - - for (kk = 0; kk < K - BLK_K; kk += BLK_K) - { - offs_dA += BLK_K; - boundA -= BLK_K; - - offs_dB += BLK_K; - boundB -= BLK_K; - -// Load A dev->regs -#pragma unroll - for (n = 0; n < BLK_M / DIM_YA; n++) - { -#pragma unroll - for (m = 0; m < BLK_K / DIM_XA; m++) - { - ra[n][m] = fetch(A, m * DIM_XA, n * DIM_YA, boundA); - } - } - -// Load B dev->regs -#pragma unroll - for (n = 0; n < BLK_N / DIM_YB; n++) - { -#pragma unroll - for (m = 0; m < BLK_K / DIM_XB; m++) - { - rb[n][m] = fetch(B, m * DIM_XB, n * DIM_YB, boundB); - } - } - -// Multiply -#pragma unroll - for (k = 0; k < BLK_K; k++) - { -// Load A shmem->regs -#pragma unroll - for (m = 0; m < THR_M; m++) - { - rA[m] = sA(m * DIM_X + idx, k); - } - -// Load B shmem->regs -#pragma unroll - for (n = 0; n < THR_N; n++) - { - rB[n] = sB(k, n * DIM_Y + idy); - } - -// Compute -#pragma unroll - for (n = 0; n < THR_N; n++) - { -#pragma unroll - for (m = 0; m < THR_M; m++) - { - rC[n][m] += rA[m] * rB[n]; - } - } - } - - __syncthreads(); - -// Load A regs->shmem -#pragma unroll - for (n = 0; n < BLK_M / DIM_YA; n++) - { -#pragma unroll - for (m = 0; m < BLK_K / DIM_XA; m++) - { - sA(n * DIM_YA + idyA, m * DIM_XA + idxA) = ra[n][m]; - } - } - -// Load B regs->shmem -#pragma unroll - for (n = 0; n < BLK_N / DIM_YB; n++) - { -#pragma unroll - for (m = 0; m < BLK_K / DIM_XB; m++) - { - sB(m * DIM_XB + idxB, n * DIM_YB + idyB) = rb[n][m]; - } - } - __syncthreads(); - } - - // Multiply last full (BLK_K) or partial block of - // columns of op(A) and rows of op(B). - // It's okay that m,n exceed matrix bounds as all work is in registers - // or shared memory, and out-of-bounds rC[n][m] will not be saved later. - kk = K - kk; -#pragma unroll - for (k = 0; k < kk; k++) - { -// Load A shmem->regs -#pragma unroll - for (m = 0; m < THR_M; m++) - { - rA[m] = sA(m * DIM_X + idx, k); - } - -// Load B shmem->regs -#pragma unroll - for (n = 0; n < THR_N; n++) - { - rB[n] = sB(k, n * DIM_Y + idy); - } - -// Compute -#pragma unroll - for (n = 0; n < THR_N; n++) - { -#pragma unroll - for (m = 0; m < THR_M; m++) - { - rC[n][m] += rA[m] * rB[n]; - } - } - } - -// Store C regs->dev -#pragma unroll - for (n = 0; n < THR_N; n++) - { - int coord_dCn = bly * BLK_N + n * DIM_Y + idy; -#pragma unroll - for (m = 0; m < THR_M; m++) - { - int coord_dCm = blx * BLK_M + m * DIM_X + idx; - if (coord_dCm < M && coord_dCn < N) - { - int offsC = coord_dCn * LDC + coord_dCm; - - atomicAdd(C + offsC, rC[n][m] * alpha); - } - } - } -} - -/******************************************************************************/ -template -static __global__ void vbatched_gemm_kernel(int* M, - int* N, - int* K, - T** global_A_array, - int* global_lda, - T** global_B_array, - int* global_ldb, - T** global_C_array, - int* global_ldc, - T* alpha) -{ - extern __shared__ __align__(sizeof(T)) unsigned char smem[]; - T* shared_mem = reinterpret_cast(smem); - - int batchid = blockIdx.z; - int local_M = (int)M[batchid]; - int local_N = (int)N[batchid]; - int local_K = (int)K[batchid]; - - if (blockIdx.x >= (local_M + BLK_M - 1) / BLK_M) - return; - if (blockIdx.y >= (local_N + BLK_N - 1) / BLK_N) - return; - - int shared_lda = BLK_M + 1; - int shared_ldb = BLK_K + 1; - T* shared_A = (T*)shared_mem; - T* shared_B = shared_A + shared_lda * BLK_K; - double alpha_tmp = 1.0; - if (alpha != nullptr) - { - alpha_tmp = alpha[batchid]; - } - vbatched_gemm_device(local_M, - local_N, - local_K, - global_A_array[batchid], - (int)global_lda[batchid], - global_B_array[batchid], - (int)global_ldb[batchid], - global_C_array[batchid], - (int)global_ldc[batchid], - shared_A, - shared_lda, - shared_B, - shared_ldb, - alpha_tmp); -} - -/** - * Performs a batched matrix multiplication using the vbatched_gemm_impl - * function. - * - * C = alpha * A * B + C - * @tparam T The data type of the matrices. - * @tparam DIM_X The number of threads in the x-dimension of each block. - * @tparam DIM_Y The number of threads in the y-dimension of each block. - * @tparam BLK_M The number of rows processed by each thread block. - * @tparam BLK_N The number of columns processed by each thread block. - * @tparam BLK_K The number of elements processed by each thread block along the - * K dimension. - * @tparam DIM_XA The number of threads in the x-dimension used for loading - * matrix A. - * @tparam DIM_YA The number of threads in the y-dimension used for loading - * matrix A. - * @tparam DIM_XB The number of threads in the x-dimension used for loading - * matrix B. - * @tparam DIM_YB The number of threads in the y-dimension used for loading - * matrix B. - * @param max_m The maximum number of rows in the matrices. - * @param max_n The maximum number of columns in the matrices. - * @param m An array of batch sizes for the number of rows in each matrix. - * @param n An array of batch sizes for the number of columns in each matrix. - * @param k An array of batch sizes for the number of elements in each matrix - * along the K dimension. - * @param global_A_array An array of pointers to the input matrices A. - * @param global_lda An array of leading dimensions for the input matrices A. - * @param global_B_array An array of pointers to the input matrices B. - * @param global_ldb An array of leading dimensions for the input matrices B. - * @param global_C_array An array of pointers to the output matrices C. - * @param global_ldc An array of leading dimensions for the output matrices C. - * @param batchCount The number of matrices in the batch. - * @param stream The CUDA stream to use for the computation. - * @param alpha The scalar value to multiply the matrices by (optional, default - * is nullptr). generate by copilot - */ - -/* - * Why do we need to implement our own matrix multiplication based on the magma - * code? There are two main reasons. First is when we are doing batch matrix - * multiplication, since we need to accumulate the results of the - * multiplications, it is necessary to pass the same memory address of matrix C - * to different multiplications. This way, the accumulation can be done directly - * through atomic operations during the matrix multiplication, avoiding the - * reduction operations after the multiplication. Secondly, when calculating the - * charge density, where C = alpha * A * B + C, the value of alpha might be - * different for the same batch of matrices. Using the standard matrix - * multiplication interface would require breaking down the batch matrix - * multiplication into smaller batches. In practice, it is difficult to - * accumulate a batch. - * - * Moreover, taking into account the specific requirements of our application, - * especially the fact that we can relatively easily control the arrangement of - * the matrix elements, we have only implemented one type of requirement for - * matrix transposition. That is, we have implemented the operation C = alpha * - * trans(A) * B + C under the constraint of column-major order. - * - * Finally, we would like to thank Magma for its contributions to the field of - * scientific computing. - */ - -template -void vbatched_gemm_impl(int max_m, - int max_n, - int* m, - int* n, - int* k, - T** global_A_array, - int* global_lda, - T** global_B_array, - int* global_ldb, - T** global_C_array, - int* global_ldc, - int batchCount, - cudaStream_t stream, - T* alpha = nullptr) -{ - // The positions of A and B have been swapped here. - // This is because the original code is for column-major matrices. - // We use row-major matrices, so we need to swap A and B. - // The vbatched_gemm_impl is for C = trans(A) * B + C, but we need trans(C). - // Which means: trans(C) = trans(trans(A)*B + C) = trans(B) * A + trans(C) - // Then, ldc should be N, lda and ldb should be K - - size_t shared_mem_size = 0; - shared_mem_size += (BLK_M + 1) * BLK_K * sizeof(T); - shared_mem_size += (BLK_K + 1) * BLK_N * sizeof(T); - dim3 dimBlock(DIM_X, DIM_Y); - const int max_batch_count = 32768; - const int loop_num = batchCount / max_batch_count; - const int remain_num = batchCount % max_batch_count; - - for (int i = 0; i < loop_num; ++i) - { - dim3 dimGrid(ceildiv(max_n, BLK_M), ceildiv(max_m, BLK_N), max_batch_count); - T* alpha_tmp = nullptr; - if (alpha != nullptr) - { - alpha_tmp = alpha + i * max_batch_count; - } - - vbatched_gemm_kernel - <<>>(n + i * max_batch_count, - m + i * max_batch_count, - k + i * max_batch_count, - global_B_array + i * max_batch_count, - global_ldb + i * max_batch_count, - global_A_array + i * max_batch_count, - global_lda + i * max_batch_count, - global_C_array + i * max_batch_count, - global_ldc + i * max_batch_count, - alpha_tmp); - checkCudaLastError(); - } - if (remain_num > 0) - { - dim3 dimGrid(ceildiv(max_n, BLK_M), ceildiv(max_m, BLK_N), remain_num); - T* alpha_tmp = nullptr; - if (alpha != nullptr) - { - alpha_tmp = alpha + loop_num * max_batch_count; - } - vbatched_gemm_kernel - <<>>(n + loop_num * max_batch_count, - m + loop_num * max_batch_count, - k + loop_num * max_batch_count, - global_B_array + loop_num * max_batch_count, - global_ldb + loop_num * max_batch_count, - global_A_array + loop_num * max_batch_count, - global_lda + loop_num * max_batch_count, - global_C_array + loop_num * max_batch_count, - global_ldc + loop_num * max_batch_count, - alpha_tmp); - checkCudaLastError(); - } -} - -template -void gemm_time_measure(int max_m, - int max_n, - int* m, - int* n, - int* k, - T** global_A_array, - int* global_lda, - T** global_B_array, - int* global_ldb, - T** global_C_array, - int* global_ldc, - int batchCount, - cudaStream_t stream, - float& fast_time, - matrix_multiple_func_type& fastest_algo, - double* cpu_result, - double* h_global_C, - double* d_global_C) -{ - cudaEvent_t start, stop; - checkCuda(cudaMemset(d_global_C, 0, batchCount * max_m * max_n * sizeof(double))); - checkCuda(cudaEventCreate(&start)); - checkCuda(cudaEventCreate(&stop)); - checkCuda(cudaEventRecord(start, stream)); - vbatched_gemm_impl(max_m, - max_n, - m, - n, - k, - global_A_array, - global_lda, - global_B_array, - global_ldb, - global_C_array, - global_ldc, - batchCount, - stream); - checkCuda(cudaEventRecord(stop, stream)); - cudaError_t cuda_status = cudaGetLastError(); - checkCuda(cudaStreamSynchronize(stream)); - float milliseconds = 0; - checkCuda(cudaEventElapsedTime(&milliseconds, start, stop)); - - // WARNING !!!!! Here we assume that all m and n are the same - checkCuda(cudaMemcpy(h_global_C, d_global_C, batchCount * max_m * max_n * sizeof(double), cudaMemcpyDeviceToHost)); - bool check_result = true; - for (int i = 0; i < batchCount * max_m * max_n; ++i) - { - if (abs(cpu_result[i] - h_global_C[i]) > 0.001) - { - check_result = false; - break; - } - } - if (milliseconds < fast_time && cuda_status == cudaSuccess && check_result) - { - fast_time = milliseconds; - fastest_algo = vbatched_gemm_impl; -#ifdef __DEBUG - std::cout << "found! fastest time: " << fast_time << std::endl; - std::cout << DIM_X << "," << DIM_Y << "," << BLK_M << "," << BLK_N << "," << BLK_K << "," << DIM_XA << "," - << DIM_YA << "," << DIM_XB << "," << DIM_YB << std::endl; -#endif - } -} -#endif // VBATCH_MATRIX_MUL_CUH \ No newline at end of file diff --git a/source/source_lcao/module_gint/temp_gint/localcell_info.cpp b/source/source_lcao/module_gint/localcell_info.cpp similarity index 100% rename from source/source_lcao/module_gint/temp_gint/localcell_info.cpp rename to source/source_lcao/module_gint/localcell_info.cpp diff --git a/source/source_lcao/module_gint/temp_gint/localcell_info.h b/source/source_lcao/module_gint/localcell_info.h similarity index 100% rename from source/source_lcao/module_gint/temp_gint/localcell_info.h rename to source/source_lcao/module_gint/localcell_info.h diff --git a/source/source_lcao/module_gint/temp_gint/meshgrid_info.h b/source/source_lcao/module_gint/meshgrid_info.h similarity index 100% rename from source/source_lcao/module_gint/temp_gint/meshgrid_info.h rename to source/source_lcao/module_gint/meshgrid_info.h diff --git a/source/source_lcao/module_gint/mult_psi_dmr.cpp b/source/source_lcao/module_gint/mult_psi_dmr.cpp deleted file mode 100644 index fab47c1aee..0000000000 --- a/source/source_lcao/module_gint/mult_psi_dmr.cpp +++ /dev/null @@ -1,105 +0,0 @@ -#include "gint_tools.h" -#include "source_base/timer.h" -#include "source_base/ylm.h" -#include "source_base/module_external/blas_connector.h" - -namespace Gint_Tools{ - -void mult_psi_DMR( - const Grid_Technique& gt, - const int bxyz, - const int LD_pool, - const int &grid_index, - const int &na_grid, - const int*const block_index, - const int*const block_size, - const bool*const*const cal_flag, - const double*const*const psi, - double*const*const psi_DMR, - const hamilt::HContainer*const DM, - const bool if_symm) -{ - const UnitCell& ucell = *gt.ucell; - - // parameters for lapack subroutines - constexpr char side = 'L'; - constexpr char uplo = 'U'; - const char trans = 'N'; - const double alpha = 1.0; - const double beta = 1.0; - const double alpha1 = if_symm ? 2.0 : 1.0; - - for (int ia1 = 0; ia1 < na_grid; ia1++) - { - const int bcell1 = gt.bcell_start[grid_index] + ia1; - const int iat1 = gt.which_atom[bcell1]; - - //! get cell R1, this step is redundant in gamma_only case. - const int id1 = gt.which_unitcell[bcell1]; - const ModuleBase::Vector3 r1 = gt.get_ucell_coords(id1); - - //! density - if (if_symm) - { - //! ia2==ia1 - const auto tmp_matrix = DM->find_matrix(iat1, iat1, 0, 0, 0); - - //! maybe checking "tmp_matrix == nullptr" is not necessary - if(tmp_matrix == nullptr) - { - continue; - } - - const auto cal_info = Gint_Tools::cal_info(bxyz, ia1, ia1, cal_flag); - const int ib_start = cal_info.first; - const int ib_len = cal_info.second; - - if(ib_len == 0) - { - continue; - } - - const auto tmp_matrix_ptr = tmp_matrix->get_pointer(); - const int idx1 = block_index[ia1]; - BlasConnector::symm_cm(side, uplo, block_size[ia1], ib_len, alpha, tmp_matrix_ptr, block_size[ia1], - &psi[ib_start][idx1], LD_pool, beta, &psi_DMR[ib_start][idx1], LD_pool); - } - - //! get (j,beta,R2) - const int start = if_symm ? ia1 + 1 : 0; - - for (int ia2 = start; ia2 < na_grid; ia2++) - { - const int bcell2 = gt.bcell_start[grid_index] + ia2; - const int iat2 = gt.which_atom[bcell2]; - const int id2 = gt.which_unitcell[bcell2]; - - //! get cell R2, this step is redundant in gamma_only case. - const ModuleBase::Vector3 r2 = gt.get_ucell_coords(id2); - - // get AtomPair - const auto tmp_matrix = DM->find_matrix(iat1, iat2, r1-r2); - if (tmp_matrix == nullptr) - { - continue; - } - const auto tmp_matrix_ptr = tmp_matrix->get_pointer(); - - const auto cal_info = Gint_Tools::cal_info(bxyz, ia1, ia1, cal_flag); - const int ib_start = cal_info.first; - const int ib_len = cal_info.second; - if(ib_len == 0) - { - continue; - } - const int idx1 = block_index[ia1]; - const int idx2 = block_index[ia2]; - - dgemm_(&trans, &trans, &block_size[ia2], &ib_len, &block_size[ia1], &alpha1, tmp_matrix_ptr, &block_size[ia2], - &psi[ib_start][idx1], &LD_pool, &beta, &psi_DMR[ib_start][idx2], &LD_pool); - - } // ia2 - } // ia1 -}// End of mult_psi_DMR - -}// End of Gint_Tools diff --git a/source/source_lcao/module_gint/temp_gint/phi_operator.cpp b/source/source_lcao/module_gint/phi_operator.cpp similarity index 100% rename from source/source_lcao/module_gint/temp_gint/phi_operator.cpp rename to source/source_lcao/module_gint/phi_operator.cpp diff --git a/source/source_lcao/module_gint/temp_gint/phi_operator.h b/source/source_lcao/module_gint/phi_operator.h similarity index 100% rename from source/source_lcao/module_gint/temp_gint/phi_operator.h rename to source/source_lcao/module_gint/phi_operator.h diff --git a/source/source_lcao/module_gint/temp_gint/phi_operator.hpp b/source/source_lcao/module_gint/phi_operator.hpp similarity index 100% rename from source/source_lcao/module_gint/temp_gint/phi_operator.hpp rename to source/source_lcao/module_gint/phi_operator.hpp diff --git a/source/source_lcao/module_gint/temp_gint/set_ddphi.cpp b/source/source_lcao/module_gint/set_ddphi.cpp similarity index 100% rename from source/source_lcao/module_gint/temp_gint/set_ddphi.cpp rename to source/source_lcao/module_gint/set_ddphi.cpp diff --git a/source/source_lcao/module_gint/temp_gint/gint.h b/source/source_lcao/module_gint/temp_gint/gint.h deleted file mode 100644 index 1255bae971..0000000000 --- a/source/source_lcao/module_gint/temp_gint/gint.h +++ /dev/null @@ -1,26 +0,0 @@ -#pragma once -#include -#include "gint_info.h" -#include "gint_type.h" - -namespace ModuleGint -{ - -class Gint -{ - public: - Gint() = default; - virtual ~Gint() = default; - - // note that gint_info_ is a static member variable - // it is shared by all instances of Gint - static void set_gint_info(GintInfo* gint_info) - { - gint_info_ = gint_info; - } - - protected: - static GintInfo* gint_info_; -}; - -} \ No newline at end of file diff --git a/source/source_lcao/module_gint/temp_gint/gint_rho_gpu.h b/source/source_lcao/module_gint/temp_gint/gint_rho_gpu.h deleted file mode 100644 index 07bbf0eaed..0000000000 --- a/source/source_lcao/module_gint/temp_gint/gint_rho_gpu.h +++ /dev/null @@ -1,52 +0,0 @@ -#pragma once - -#include -#include -#include "source_lcao/module_hcontainer/hcontainer.h" -#include "gint.h" -#include "gint_info.h" -#include "source_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h" - -namespace ModuleGint -{ - -class Gint_rho_gpu: public Gint -{ - public: - Gint_rho_gpu( - const std::vector*>& dm_vec, - const int nspin, - double **rho, - bool is_dm_symm = true) - : dm_vec_(dm_vec), nspin_(nspin), rho_(rho), is_dm_symm_(is_dm_symm) {} - - void cal_gint(); - - private: - void init_dm_gint_(); - - void cal_rho_(); - - void transfer_cpu_to_gpu_(); - - void transfer_gpu_to_cpu_(); - - // input - const std::vector*> dm_vec_; - const int nspin_; - - // if true, it means the DMR matrix is symmetric, - // which leads to faster computations compared to the asymmetric case. - const bool is_dm_symm_; - - // output - double **rho_; - - // Intermediate variables - std::vector> dm_gint_vec_; - - std::vector> dm_gint_d_vec_; - std::vector> rho_d_vec_; -}; - -} \ No newline at end of file diff --git a/source/source_lcao/module_gint/temp_gint/gint_vl_gpu.h b/source/source_lcao/module_gint/temp_gint/gint_vl_gpu.h deleted file mode 100644 index de113b2ea0..0000000000 --- a/source/source_lcao/module_gint/temp_gint/gint_vl_gpu.h +++ /dev/null @@ -1,49 +0,0 @@ -#pragma once - -#include -#include -#include "source_lcao/module_hcontainer/hcontainer.h" -#include "gint.h" -#include "gint_info.h" -#include "source_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h" - -namespace ModuleGint -{ - -class Gint_vl_gpu : public Gint -{ - public: - Gint_vl_gpu( - const double* vr_eff, - HContainer* hR) - : vr_eff_(vr_eff), hR_(hR), dr3_(gint_info_->get_mgrid_volume()) {} - - void cal_gint(); - - private: - - void init_hr_gint_(); - - void transfer_cpu_to_gpu_(); - - void transfer_gpu_to_cpu_(); - - void cal_hr_gint_(); - - // input - const double* vr_eff_; - - - // output - HContainer* hR_; - - // Intermediate variables - double dr3_; - - HContainer hr_gint_; - - CudaMemWrapper hr_gint_d_; - CudaMemWrapper vr_eff_d_; -}; - -} \ No newline at end of file diff --git a/source/source_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.h b/source/source_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.h index 07139c82db..5f711aa6a0 100644 --- a/source/source_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.h +++ b/source/source_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.h @@ -5,9 +5,8 @@ #include "source_base/ylm.h" #include "source_cell/unitcell.h" #include "source_cell/atom_spec.h" -#include "source_lcao/module_gint/temp_gint/biggrid_info.h" +#include "source_lcao/module_gint/biggrid_info.h" #include "gint_helper.cuh" -#include "source_lcao/module_gint/kernels/cuda/gemm_selector.cuh" namespace ModuleGint { @@ -39,7 +38,6 @@ class GintGpuVars // the index of gpu device int dev_id_ = 0; - matrix_multiple_func_type fastest_matrix_mul; }; diff --git a/source/source_lcao/module_gint/test/CMakeLists.txt b/source/source_lcao/module_gint/test/CMakeLists.txt deleted file mode 100644 index 2030b04a12..0000000000 --- a/source/source_lcao/module_gint/test/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -if(ENABLE_LCAO AND USE_CUDA) - AddTest( - TARGET gint_gpu_test - LIBS parameter ${math_libs} psi base device - SOURCES test_sph.cu test_sph.cpp -) -endif() \ No newline at end of file diff --git a/source/source_lcao/module_gint/test/test_sph.cpp b/source/source_lcao/module_gint/test/test_sph.cpp deleted file mode 100644 index e13a4d5675..0000000000 --- a/source/source_lcao/module_gint/test/test_sph.cpp +++ /dev/null @@ -1,597 +0,0 @@ -#include "test_sph.h" -using namespace std; - -void sph_harm(const int& Lmax, // max momentum of l - const double& xdr, - const double& ydr, - const double& zdr, - std::vector& rly, - double* ylmcoef) -{ - - // begin calculation - /*************************** - L = 0 - ***************************/ - rly[0] = ylmcoef[0]; // l=0, m=0 - if (Lmax == 0) - return; - - /*************************** - L = 1 - ***************************/ - rly[1] = ylmcoef[1] * zdr; // l=1, m=0 - rly[2] = -ylmcoef[1] * xdr; // l=1, m=1 - rly[3] = -ylmcoef[1] * ydr; // l=1, m=-1 - if (Lmax == 1) - return; - - /*************************** - L = 2 - ***************************/ - double tmp0 = ylmcoef[3] * rly[0]; - rly[4] = ylmcoef[2] * zdr * rly[1] - tmp0; // l=2, m=0 - - tmp0 = ylmcoef[4] * zdr; - rly[5] = tmp0 * rly[2]; // l=2,m=1 - rly[6] = tmp0 * rly[3]; // l=2,m=-1 - - double tmp2 = ylmcoef[4] * xdr; - rly[7] - = ylmcoef[5] * rly[4] - ylmcoef[6] * rly[0] - tmp2 * rly[2]; // l=2,m=2 - rly[8] = -tmp2 * rly[3]; - // rly[8] = tmp1+tmp2*rly[3];//l=2,m=-2 - if (Lmax == 2) - return; - - /*************************** - L = 3 - ***************************/ - tmp0 = ylmcoef[8] * rly[1]; - rly[9] = ylmcoef[7] * zdr * rly[4] - tmp0; // l=3, m=0 - - double tmp3 = ylmcoef[9] * zdr; - rly[10] = tmp3 * rly[5] - ylmcoef[10] * rly[2]; // l=3,m=1 - rly[11] = tmp3 * rly[6] - ylmcoef[10] * rly[3]; // l=3,m=-1 - - double tmp4 = ylmcoef[11] * zdr; - rly[12] = tmp4 * rly[7]; // l=3,m=2 - rly[13] = tmp4 * rly[8]; // l=3,m=-2 - - double tmp5 = ylmcoef[14] * xdr; - rly[14] = ylmcoef[12] * rly[10] - ylmcoef[13] * rly[2] - - tmp5 * rly[7]; // l=3,m=3 - rly[15] = ylmcoef[12] * rly[11] - ylmcoef[13] * rly[3] - - tmp5 * rly[8]; // l=3,m=-3 - if (Lmax == 3) - return; - - /*************************** - L = 4 - ***************************/ - tmp0 = ylmcoef[16] * rly[4]; - rly[16] = ylmcoef[15] * zdr * rly[9] - tmp0; // l=4,m=0 - - double tmp6 = ylmcoef[17] * zdr; - rly[17] = tmp6 * rly[10] - ylmcoef[18] * rly[5]; // l=4,m=1 - rly[18] = tmp6 * rly[11] - ylmcoef[18] * rly[6]; // l=4,m=-1 - - double tmp7 = ylmcoef[19] * zdr; - rly[19] = tmp7 * rly[12] - ylmcoef[20] * rly[7]; // l=4,m=2 - rly[20] = tmp7 * rly[13] - ylmcoef[20] * rly[8]; // l=4,m=-2 - - double tmp8 = 3.0 * zdr; - rly[21] = tmp8 * rly[14]; // l=4,m=3 - rly[22] = tmp8 * rly[15]; // l=4,m=-3 - - double tmp9 = ylmcoef[23] * xdr; - rly[23] = ylmcoef[21] * rly[19] - ylmcoef[22] * rly[7] - - tmp9 * rly[14]; // l=4,m=4 - rly[24] = ylmcoef[21] * rly[20] - ylmcoef[22] * rly[8] - - tmp9 * rly[15]; // l=4,m=-4 - if (Lmax == 4) - return; - - /*************************** - L = 5 - ***************************/ - tmp0 = ylmcoef[25] * rly[9]; - rly[25] = ylmcoef[24] * zdr * rly[16] - tmp0; // l=5,m=0 - - double tmp10 = ylmcoef[26] * zdr; - rly[26] = tmp10 * rly[17] - ylmcoef[27] * rly[10]; // l=5,m=1 - rly[27] = tmp10 * rly[18] - ylmcoef[27] * rly[11]; // l=5,m=-1 - - double tmp11 = ylmcoef[28] * zdr; - rly[28] = tmp11 * rly[19] - ylmcoef[29] * rly[12]; // l=5,m=2 - rly[29] = tmp11 * rly[20] - ylmcoef[29] * rly[13]; // l=5,m=-2 - - double tmp12 = ylmcoef[30] * zdr; - rly[30] = tmp12 * rly[21] - ylmcoef[31] * rly[14]; // l=5,m=3 - rly[31] = tmp12 * rly[22] - ylmcoef[31] * rly[15]; // l=5,m=-3 - - double tmp13 = ylmcoef[32] * zdr; - rly[32] = tmp13 * rly[23]; // l=5,m=4 - rly[33] = tmp13 * rly[24]; // l=5,m=-4 - - double tmp14 = ylmcoef[35] * xdr; - rly[34] = ylmcoef[33] * rly[30] - ylmcoef[34] * rly[14] - - tmp14 * rly[23]; // l=5,m=5 - rly[35] = ylmcoef[33] * rly[31] - ylmcoef[34] * rly[15] - - tmp14 * rly[24]; // l=5,m=-5 - if (Lmax == 5) - return; - - // if Lmax > 5 - for (int il = 6; il <= Lmax; il++) - { - int istart = il * il; - int istart1 = (il - 1) * (il - 1); - int istart2 = (il - 2) * (il - 2); - - double fac2 = sqrt(4.0 * istart - 1.0); - double fac4 = sqrt(4.0 * istart1 - 1.0); - - for (int im = 0; im < 2 * il - 1; im++) - { - int imm = (im + 1) / 2; - // if (im % 2 == 0) imm *= -1; - - rly[istart + im] = fac2 / sqrt((double)istart - imm * imm) - * (zdr * rly[istart1 + im] - - sqrt((double)istart1 - imm * imm) / fac4 - * rly[istart2 + im]); - } - - double bl1 = sqrt(2.0 * il / (2.0 * il + 1.0)); - double bl2 = sqrt((2.0 * il - 2.0) / (2.0 * il - 1.0)); - double bl3 = sqrt(2.0) / fac2; - - rly[istart + 2 * il - 1] - = (bl3 * rly[istart + 2 * il - 5] - bl2 * rly[istart2 + 2 * il - 5] - - 2.0 * xdr * rly[istart1 + 2 * il - 3]) - / bl1; - rly[istart + 2 * il] - = (bl3 * rly[istart + 2 * il - 4] - bl2 * rly[istart2 + 2 * il - 4] - - 2.0 * xdr * rly[istart1 + 2 * il - 2]) - / bl1; - } - - return; -} -void grad_rl_sph_harm(const int& Lmax, // max momentum of L - const double& x, - const double& y, - const double& z, - double* rly, - double** grly, - const double* ylmcoef) -{ - double radius2 = x * x + y * y + z * z; - double tx = 2.0 * x; - double ty = 2.0 * y; - double tz = 2.0 * z; - - // begin calculation - /*************************** - L = 0 - ***************************/ - rly[0] = ylmcoef[0]; // l=0, m=0 - grly[0][0] = grly[0][1] = grly[0][2] = 0.0; - if (Lmax == 0) - return; - - /*************************** - L = 1 - ***************************/ - rly[1] = ylmcoef[1] * z; // l=1, m=0 - grly[1][0] = grly[1][1] = 0.0; - grly[1][2] = ylmcoef[1]; - - rly[2] = -ylmcoef[1] * x; // l=1, m=1 - grly[2][1] = grly[2][2] = 0.0; - grly[2][0] = -ylmcoef[1]; - - rly[3] = -ylmcoef[1] * y; // l=1, m=-1 - grly[3][0] = grly[3][2] = 0.0; - grly[3][1] = -ylmcoef[1]; - - if (Lmax == 1) - return; - - /*************************** - L = 2 - ***************************/ - rly[4] - = ylmcoef[2] * z * rly[1] - ylmcoef[3] * rly[0] * radius2; // l=2, m=0 - grly[4][0] - = ylmcoef[2] * z * grly[1][0] - - ylmcoef[3] * (grly[0][0] * radius2 + rly[0] * tx); // l=2, m=0 - grly[4][1] - = ylmcoef[2] * z * grly[1][1] - - ylmcoef[3] * (grly[0][1] * radius2 + rly[0] * ty); // l=2, m=0 - grly[4][2] - = ylmcoef[2] * (z * grly[1][2] + rly[1]) - - ylmcoef[3] * (grly[0][2] * radius2 + rly[0] * tz); // l=2, m=0 - - double tmp0 = ylmcoef[4] * z; - rly[5] = tmp0 * rly[2]; // l=2,m=1 - grly[5][0] = tmp0 * grly[2][0]; - grly[5][1] = tmp0 * grly[2][1]; - grly[5][2] = ylmcoef[4] * (rly[2] + z * grly[2][2]); - - rly[6] = tmp0 * rly[3]; // l=2,m=-1 - grly[6][0] = tmp0 * grly[3][0]; - grly[6][1] = tmp0 * grly[3][1]; - grly[6][2] = ylmcoef[4] * (rly[3] + z * grly[3][2]); - - double tmp2 = ylmcoef[4] * x; - rly[7] = ylmcoef[5] * rly[4] - ylmcoef[6] * rly[0] * radius2 - - tmp2 * rly[2]; // l=2,m=2 - grly[7][0] = ylmcoef[5] * grly[4][0] - - ylmcoef[6] * (rly[0] * tx + grly[0][0] * radius2) - - ylmcoef[4] * (x * grly[2][0] + rly[2]); - - // std::cout << "\np1 = "<< ylmcoef[5]*grly[4][0] << " p2 = " << - //-ylmcoef[6]*rly[0]*tx - // << " p3 = " << -ylmcoef[4]*x*grly[2][0] << " p4 = " - //<< -ylmcoef[4]*rly[2] << std::endl; - - grly[7][1] = ylmcoef[5] * grly[4][1] - - ylmcoef[6] * (rly[0] * ty + grly[0][1] * radius2) - - tmp2 * grly[2][1]; - grly[7][2] = ylmcoef[5] * grly[4][2] - - ylmcoef[6] * (rly[0] * tz + grly[0][2] * radius2) - - tmp2 * grly[2][2]; - - rly[8] = -tmp2 * rly[3]; - grly[8][0] = -ylmcoef[4] * (rly[3] + x * grly[3][0]); - grly[8][1] = -tmp2 * grly[3][1]; - grly[8][2] = -tmp2 * grly[3][2]; - // rly[8] = tmp1+tmp2*rly[3];//l=2,m=-2 - if (Lmax == 2) - return; - - /*************************** - L = 3 - ***************************/ - rly[9] - = ylmcoef[7] * z * rly[4] - ylmcoef[8] * rly[1] * radius2; // l=3, m=0 - grly[9][0] = ylmcoef[7] * z * grly[4][0] - - ylmcoef[8] * (rly[1] * tx + grly[1][0] * radius2); - grly[9][1] = ylmcoef[7] * z * grly[4][1] - - ylmcoef[8] * (rly[1] * ty + grly[1][1] * radius2); - grly[9][2] = ylmcoef[7] * (rly[4] + z * grly[4][2]) - - ylmcoef[8] * (rly[1] * tz + grly[1][2] * radius2); - - double tmp3 = ylmcoef[9] * z; - rly[10] = tmp3 * rly[5] - ylmcoef[10] * rly[2] * radius2; // l=3,m=1 - grly[10][0] = tmp3 * grly[5][0] - - ylmcoef[10] * (grly[2][0] * radius2 + rly[2] * tx); - grly[10][1] = tmp3 * grly[5][1] - - ylmcoef[10] * (grly[2][1] * radius2 + rly[2] * ty); - grly[10][2] = ylmcoef[9] * (z * grly[5][2] + rly[5]) - - ylmcoef[10] * (grly[2][2] * radius2 + rly[2] * tz); - - rly[11] = tmp3 * rly[6] - ylmcoef[10] * rly[3] * radius2; // l=3,m=-1 - grly[11][0] = tmp3 * grly[6][0] - - ylmcoef[10] * (grly[3][0] * radius2 + rly[3] * tx); - grly[11][1] = tmp3 * grly[6][1] - - ylmcoef[10] * (grly[3][1] * radius2 + rly[3] * ty); - grly[11][2] = ylmcoef[9] * (z * grly[6][2] + rly[6]) - - ylmcoef[10] * (grly[3][2] * radius2 + rly[3] * tz); - - double tmp4 = ylmcoef[11] * z; - rly[12] = tmp4 * rly[7]; // l=3,m=2 - grly[12][0] = tmp4 * grly[7][0]; - grly[12][1] = tmp4 * grly[7][1]; - grly[12][2] = ylmcoef[11] * (z * grly[7][2] + rly[7]); - - rly[13] = tmp4 * rly[8]; // l=3,m=-2 - grly[13][0] = tmp4 * grly[8][0]; - grly[13][1] = tmp4 * grly[8][1]; - grly[13][2] = ylmcoef[11] * (z * grly[8][2] + rly[8]); - - double tmp5 = ylmcoef[14] * x; - rly[14] = ylmcoef[12] * rly[10] - ylmcoef[13] * rly[2] * radius2 - - tmp5 * rly[7]; // l=3,m=3 - grly[14][0] = ylmcoef[12] * grly[10][0] - - ylmcoef[13] * (rly[2] * tx + grly[2][0] * radius2) - - ylmcoef[14] * (rly[7] + x * grly[7][0]); - grly[14][1] = ylmcoef[12] * grly[10][1] - - ylmcoef[13] * (rly[2] * ty + grly[2][1] * radius2) - - tmp5 * grly[7][1]; - grly[14][2] = ylmcoef[12] * grly[10][2] - - ylmcoef[13] * (rly[2] * tz + grly[2][2] * radius2) - - tmp5 * grly[7][2]; - - rly[15] = ylmcoef[12] * rly[11] - ylmcoef[13] * rly[3] * radius2 - - tmp5 * rly[8]; // l=3,m=-3 - grly[15][0] = ylmcoef[12] * grly[11][0] - - ylmcoef[13] * (rly[3] * tx + grly[3][0] * radius2) - - ylmcoef[14] * (rly[8] + x * grly[8][0]); - grly[15][1] = ylmcoef[12] * grly[11][1] - - ylmcoef[13] * (rly[3] * ty + grly[3][1] * radius2) - - tmp5 * grly[8][1]; - grly[15][2] = ylmcoef[12] * grly[11][2] - - ylmcoef[13] * (rly[3] * tz + grly[3][2] * radius2) - - tmp5 * grly[8][2]; - if (Lmax == 3) - return; - - /*************************** - L = 4 - ***************************/ - rly[16] - = ylmcoef[15] * z * rly[9] - ylmcoef[16] * rly[4] * radius2; // l=4,m=0 - grly[16][0] = ylmcoef[15] * z * grly[9][0] - - ylmcoef[16] * (rly[4] * tx + grly[4][0] * radius2); - grly[16][1] = ylmcoef[15] * z * grly[9][1] - - ylmcoef[16] * (rly[4] * ty + grly[4][1] * radius2); - grly[16][2] = ylmcoef[15] * (z * grly[9][2] + rly[9]) - - ylmcoef[16] * (rly[4] * tz + grly[4][2] * radius2); - - double tmp6 = ylmcoef[17] * z; - rly[17] = tmp6 * rly[10] - ylmcoef[18] * rly[5] * radius2; // l=4,m=1 - grly[17][0] = tmp6 * grly[10][0] - - ylmcoef[18] * (rly[5] * tx + grly[5][0] * radius2); - grly[17][1] = tmp6 * grly[10][1] - - ylmcoef[18] * (rly[5] * ty + grly[5][1] * radius2); - grly[17][2] = ylmcoef[17] * (z * grly[10][2] + rly[10]) - - ylmcoef[18] * (rly[5] * tz + grly[5][2] * radius2); - - rly[18] = tmp6 * rly[11] - ylmcoef[18] * rly[6] * radius2; // l=4,m=-1 - grly[18][0] = tmp6 * grly[11][0] - - ylmcoef[18] * (rly[6] * tx + grly[6][0] * radius2); - grly[18][1] = tmp6 * grly[11][1] - - ylmcoef[18] * (rly[6] * ty + grly[6][1] * radius2); - grly[18][2] = ylmcoef[17] * (z * grly[11][2] + rly[11]) - - ylmcoef[18] * (rly[6] * tz + grly[6][2] * radius2); - - double tmp7 = ylmcoef[19] * z; - rly[19] = tmp7 * rly[12] - ylmcoef[20] * rly[7] * radius2; // l=4,m=2 - grly[19][0] = tmp7 * grly[12][0] - - ylmcoef[20] * (rly[7] * tx + grly[7][0] * radius2); - grly[19][1] = tmp7 * grly[12][1] - - ylmcoef[20] * (rly[7] * ty + grly[7][1] * radius2); - grly[19][2] = ylmcoef[19] * (z * grly[12][2] + rly[12]) - - ylmcoef[20] * (rly[7] * tz + grly[7][2] * radius2); - - rly[20] = tmp7 * rly[13] - ylmcoef[20] * rly[8] * radius2; // l=4,m=-2 - grly[20][0] = tmp7 * grly[13][0] - - ylmcoef[20] * (rly[8] * tx + grly[8][0] * radius2); - grly[20][1] = tmp7 * grly[13][1] - - ylmcoef[20] * (rly[8] * ty + grly[8][1] * radius2); - grly[20][2] = ylmcoef[19] * (z * grly[13][2] + rly[13]) - - ylmcoef[20] * (rly[8] * tz + grly[8][2] * radius2); - - double tmp8 = 3.0 * z; - rly[21] = tmp8 * rly[14]; // l=4,m=3 - grly[21][0] = tmp8 * grly[14][0]; - grly[21][1] = tmp8 * grly[14][1]; - grly[21][2] = 3.0 * (z * grly[14][2] + rly[14]); - - rly[22] = tmp8 * rly[15]; // l=4,m=-3 - grly[22][0] = tmp8 * grly[15][0]; - grly[22][1] = tmp8 * grly[15][1]; - grly[22][2] = 3.0 * (z * grly[15][2] + rly[15]); - - double tmp9 = ylmcoef[23] * x; - rly[23] = ylmcoef[21] * rly[19] - ylmcoef[22] * rly[7] * radius2 - - tmp9 * rly[14]; // l=4,m=4 - grly[23][0] = ylmcoef[21] * grly[19][0] - - ylmcoef[22] * (rly[7] * tx + grly[7][0] * radius2) - - ylmcoef[23] * (x * grly[14][0] + rly[14]); - grly[23][1] = ylmcoef[21] * grly[19][1] - - ylmcoef[22] * (rly[7] * ty + grly[7][1] * radius2) - - tmp9 * grly[14][1]; - grly[23][2] = ylmcoef[21] * grly[19][2] - - ylmcoef[22] * (rly[7] * tz + grly[7][2] * radius2) - - tmp9 * grly[14][2]; - - rly[24] = ylmcoef[21] * rly[20] - ylmcoef[22] * rly[8] * radius2 - - tmp9 * rly[15]; // l=4,m=-4 - grly[24][0] = ylmcoef[21] * grly[20][0] - - ylmcoef[22] * (rly[8] * tx + grly[8][0] * radius2) - - ylmcoef[23] * (x * grly[15][0] + rly[15]); - grly[24][1] = ylmcoef[21] * grly[20][1] - - ylmcoef[22] * (rly[8] * ty + grly[8][1] * radius2) - - tmp9 * grly[15][1]; - grly[24][2] = ylmcoef[21] * grly[20][2] - - ylmcoef[22] * (rly[8] * tz + grly[8][2] * radius2) - - tmp9 * grly[15][2]; - - if (Lmax == 4) - return; - - /*************************** - L = 5 - ***************************/ - rly[25] - = ylmcoef[24] * z * rly[16] - ylmcoef[25] * rly[9] * radius2; // l=5,m=0 - grly[25][0] = ylmcoef[24] * z * grly[16][0] - - ylmcoef[25] * (rly[9] * tx + grly[9][0] * radius2); - grly[25][1] = ylmcoef[24] * z * grly[16][1] - - ylmcoef[25] * (rly[9] * ty + grly[9][1] * radius2); - grly[25][2] = ylmcoef[24] * (z * grly[16][2] + rly[16]) - - ylmcoef[25] * (rly[9] * tz + grly[9][2] * radius2); - - double tmp10 = ylmcoef[26] * z; - rly[26] = tmp10 * rly[17] - ylmcoef[27] * rly[10] * radius2; // l=5,m=1 - grly[26][0] = tmp10 * grly[17][0] - - ylmcoef[27] * (rly[10] * tx + grly[10][0] * radius2); - grly[26][1] = tmp10 * grly[17][1] - - ylmcoef[27] * (rly[10] * ty + grly[10][1] * radius2); - grly[26][2] = ylmcoef[26] * (z * grly[17][2] + rly[17]) - - ylmcoef[27] * (rly[10] * tz + grly[10][2] * radius2); - - rly[27] = tmp10 * rly[18] - ylmcoef[27] * rly[11] * radius2; // l=5,m=-1 - grly[27][0] = tmp10 * grly[18][0] - - ylmcoef[27] * (rly[11] * tx + grly[11][0] * radius2); - grly[27][1] = tmp10 * grly[18][1] - - ylmcoef[27] * (rly[11] * ty + grly[11][1] * radius2); - grly[27][2] = ylmcoef[26] * (z * grly[18][2] + rly[18]) - - ylmcoef[27] * (rly[11] * tz + grly[11][2] * radius2); - - double tmp11 = ylmcoef[28] * z; - rly[28] = tmp11 * rly[19] - ylmcoef[29] * rly[12] * radius2; // l=5,m=2 - grly[28][0] = tmp11 * grly[19][0] - - ylmcoef[29] * (rly[12] * tx + grly[12][0] * radius2); - grly[28][1] = tmp11 * grly[19][1] - - ylmcoef[29] * (rly[12] * ty + grly[12][1] * radius2); - grly[28][2] = ylmcoef[28] * (z * grly[19][2] + rly[19]) - - ylmcoef[29] * (rly[12] * tz + grly[12][2] * radius2); - - rly[29] = tmp11 * rly[20] - ylmcoef[29] * rly[13] * radius2; // l=5,m=-2 - grly[29][0] = tmp11 * grly[20][0] - - ylmcoef[29] * (rly[13] * tx + grly[13][0] * radius2); - grly[29][1] = tmp11 * grly[20][1] - - ylmcoef[29] * (rly[13] * ty + grly[13][1] * radius2); - grly[29][2] = ylmcoef[28] * (z * grly[20][2] + rly[20]) - - ylmcoef[29] * (rly[13] * tz + grly[13][2] * radius2); - - double tmp12 = ylmcoef[30] * z; - rly[30] = tmp12 * rly[21] - ylmcoef[31] * rly[14] * radius2; // l=5,m=3 - grly[30][0] = tmp12 * grly[21][0] - - ylmcoef[31] * (grly[14][0] * radius2 + rly[14] * tx); - grly[30][1] = tmp12 * grly[21][1] - - ylmcoef[31] * (grly[14][1] * radius2 + rly[14] * ty); - grly[30][2] = ylmcoef[30] * (z * grly[21][2] + rly[21]) - - ylmcoef[31] * (grly[14][2] * radius2 + rly[14] * tz); - - rly[31] = tmp12 * rly[22] - ylmcoef[31] * rly[15] * radius2; // l=5,m=-3 - grly[31][0] = tmp12 * grly[22][0] - - ylmcoef[31] * (grly[15][0] * radius2 + rly[15] * tx); - grly[31][1] = tmp12 * grly[22][1] - - ylmcoef[31] * (grly[15][1] * radius2 + rly[15] * ty); - grly[31][2] = ylmcoef[30] * (z * grly[22][2] + rly[22]) - - ylmcoef[31] * (grly[15][2] * radius2 + rly[15] * tz); - - double tmp13 = ylmcoef[32] * z; - rly[32] = tmp13 * rly[23]; // l=5,m=4 - grly[32][0] = tmp13 * grly[23][0]; - grly[32][1] = tmp13 * grly[23][1]; - grly[32][2] = ylmcoef[32] * (rly[23] + z * grly[23][2]); - - rly[33] = tmp13 * rly[24]; // l=5,m=-4 - grly[33][0] = tmp13 * grly[24][0]; - grly[33][1] = tmp13 * grly[24][1]; - grly[33][2] = ylmcoef[32] * (rly[24] + z * grly[24][2]); - - double tmp14 = ylmcoef[35] * x; - rly[34] = ylmcoef[33] * rly[30] - ylmcoef[34] * rly[14] * radius2 - - tmp14 * rly[23]; // l=5,m=5 - grly[34][0] = ylmcoef[33] * grly[30][0] - - ylmcoef[34] * (rly[14] * tx + grly[14][0] * radius2) - - ylmcoef[35] * (x * grly[23][0] + rly[23]); - grly[34][1] = ylmcoef[33] * grly[30][1] - - ylmcoef[34] * (rly[14] * ty + grly[14][1] * radius2) - - tmp14 * grly[23][1]; - grly[34][2] = ylmcoef[33] * grly[30][2] - - ylmcoef[34] * (rly[14] * tz + grly[14][2] * radius2) - - tmp14 * grly[23][2]; - - rly[35] = ylmcoef[33] * rly[31] - ylmcoef[34] * rly[15] * radius2 - - tmp14 * rly[24]; // l=5,m=-5 - grly[35][0] = ylmcoef[33] * grly[31][0] - - ylmcoef[34] * (rly[15] * tx + grly[15][0] * radius2) - - ylmcoef[35] * (x * grly[24][0] + rly[24]); - grly[35][1] = ylmcoef[33] * grly[31][1] - - ylmcoef[34] * (rly[15] * ty + grly[15][1] * radius2) - - tmp14 * grly[24][1]; - grly[35][2] = ylmcoef[33] * grly[31][2] - - ylmcoef[34] * (rly[15] * tz + grly[15][2] * radius2) - - tmp14 * grly[24][2]; - - if (Lmax == 5) - return; - - // if Lmax > 5 - for (int il = 6; il <= Lmax; il++) - { - int istart = il * il; - int istart1 = (il - 1) * (il - 1); - int istart2 = (il - 2) * (il - 2); - - double fac2 = sqrt(4.0 * istart - 1.0); - double fac4 = sqrt(4.0 * istart1 - 1.0); - - for (int im = 0; im < 2 * il - 1; im++) - { - int imm = (im + 1) / 2; - // if (im % 2 == 0) imm *= -1; - - double var1 = fac2 / sqrt((double)istart - imm * imm); - double var2 = sqrt((double)istart1 - imm * imm) / fac4; - - rly[istart + im] = var1 - * (z * rly[istart1 + im] - - var2 * rly[istart2 + im] * radius2); - - grly[istart + im][0] - = var1 - * (z * grly[istart1 + im][0] - - var2 - * (rly[istart2 + im] * tx - + grly[istart2 + im][0] * radius2)); - grly[istart + im][1] - = var1 - * (z * grly[istart1 + im][1] - - var2 - * (rly[istart2 + im] * ty - + grly[istart2 + im][1] * radius2)); - grly[istart + im][2] - = var1 - * (z * grly[istart1 + im][2] + rly[istart1 + im] - - var2 - * (rly[istart2 + im] * tz - + grly[istart2 + im][2] * radius2)); - } - - double bl1 = sqrt(2.0 * il / (2.0 * il + 1.0)); - double bl2 = sqrt((2.0 * il - 2.0) / (2.0 * il - 1.0)); - double bl3 = sqrt(2.0) / fac2; - - int id1 = istart + 2 * il - 1; - int id2 = istart + 2 * il - 5; - int id3 = istart2 + 2 * il - 5; - int id4 = istart1 + 2 * il - 3; - - rly[id1] - = (bl3 * rly[id2] - bl2 * rly[id3] * radius2 - 2.0 * x * rly[id4]) - / bl1; - grly[id1][0] = (bl3 * grly[id2][0] - - bl2 * (grly[id3][0] * radius2 + rly[id3] * tx) - - 2.0 * (rly[id4] + x * grly[id4][0])) - / bl1; - grly[id1][1] = (bl3 * grly[id2][1] - - bl2 * (grly[id3][1] * radius2 + rly[id3] * ty) - - 2.0 * x * grly[id4][1]) - / bl1; - grly[id1][2] = (bl3 * grly[id2][2] - - bl2 * (grly[id3][2] * radius2 + rly[id3] * tz) - - 2.0 * x * grly[id4][2]) - / bl1; - - rly[id1 + 1] = (bl3 * rly[id2 + 1] - bl2 * rly[id3 + 1] * radius2 - - 2.0 * x * rly[id4 + 1]) - / bl1; - grly[id1 + 1][0] - = (bl3 * grly[id2 + 1][0] - - bl2 * (grly[id3 + 1][0] * radius2 + rly[id3 + 1] * tx) - - 2.0 * (rly[id4 + 1] + x * grly[id4 + 1][0])) - / bl1; - grly[id1 + 1][1] - = (bl3 * grly[id2 + 1][1] - - bl2 * (grly[id3 + 1][1] * radius2 + rly[id3 + 1] * ty) - - 2.0 * x * grly[id4 + 1][1]) - / bl1; - grly[id1 + 1][2] - = (bl3 * grly[id2 + 1][2] - - bl2 * (grly[id3 + 1][2] * radius2 + rly[id3 + 1] * tz) - - 2.0 * x * grly[id4 + 1][2]) - / bl1; - } - - return; -} \ No newline at end of file diff --git a/source/source_lcao/module_gint/test/test_sph.cu b/source/source_lcao/module_gint/test/test_sph.cu deleted file mode 100644 index 9d41705667..0000000000 --- a/source/source_lcao/module_gint/test/test_sph.cu +++ /dev/null @@ -1,138 +0,0 @@ -#include -#include "../kernels/cuda/sph.cuh" - -#include "float.h" -#include "cuda_runtime.h" -#include "device_launch_parameters.h" -#include "gtest/gtest.h" -#include "source_lcao/module_hcontainer/hcontainer.h" -#include "test_sph.h" -#include "source_base/array_pool.h" -using namespace std; - -class gintTest : public ::testing::Test -{ - public: -}; - -__global__ void cuda_test(double* dr, int nwl, double* ylma_g, double* ylmcoef) -{ - double ylma[49] = {0.0}; - GintKernel::spherical_harmonics(dr, nwl, ylma, ylmcoef); - for (int i = 0; i < 49; i++) - { - ylma_g[i] = ylma[i]; - } -} -__global__ void cuda_test2(double* dr, double distance, int nwl, double* dylma_g, double* ylmcoef) -{ - double ylma[49] = {0.0}; - double grly[49][3] = {0.0}; - GintKernel::spherical_harmonics_d(dr, distance, grly, nwl, ylma, ylmcoef); - for (int i = 0; i < 49; i++) - { - dylma_g[i] = ylma[i]; - } -} - -void get_random_double(int min, int max, double* result, int length) -{ - std::random_device rd; - std::default_random_engine eng(rd()); - std::uniform_real_distribution distribution(0, 10); - for (int i = 0; i < 3; i++) - { - result[i] = distribution(eng); - } -} -void get_random_int(int min, int max, int& result) -{ - std::random_device rd; - std::default_random_engine eng(rd()); - std::uniform_int_distribution distribution(min, max); - result = distribution(eng); -} -// __global__ void cuda_test -TEST_F(gintTest, test) -{ - int nwl; - double distance; - - double* dr = new double[3]; - double* dr_g; - - double ylma[49]; - double dylma[49]; - double ylma_ans[49]; - - double* ylmcoef_g; - double* ylma_g; - double* dylma_g; - double* ylmcoef = new double[100]; - - std::vector ylma_cpu(49, 0.0); - std::vector ylma_cpu_dpsir(49, 0.0); - ModuleBase::Array_Pool ylma_cpu_ddpsir(49, 3); - - nwl=3; - for (int i=0;i<3;i++){ - dr[i]=i*1.0; - distance += dr[i] * dr[i]; - } - for (int i=0;i<100;i++) - { - ylmcoef[i]=i*0.1; - } - - cudaMalloc((void**)&ylmcoef_g, 100 * sizeof(double)); - cudaMalloc((void**)&dr_g, 3 * sizeof(double)); - cudaMalloc((void**)&ylma_g, 49 * sizeof(double)); - cudaMalloc((void**)&dylma_g, 49 * 3 * sizeof(double)); - - cudaMemcpy(ylmcoef_g, ylmcoef, 100 * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(dr_g, dr, 3 * sizeof(double), cudaMemcpyHostToDevice); - cudaMemset(ylma_g, 0, 49 * sizeof(double)); - cudaMemset(dylma_g, 0, 49 * sizeof(double)); - - cuda_test<<<1, 1>>>(dr_g, nwl, ylma_g, ylmcoef_g); - cuda_test2<<<1, 1>>>(dr_g, distance, nwl, dylma_g, ylmcoef_g); - sph_harm(nwl, dr[0], dr[1], dr[2], ylma_cpu, ylmcoef); - grad_rl_sph_harm(nwl, dr[0], dr[1], dr[2], ylma_cpu_dpsir.data(), ylma_cpu_ddpsir.get_ptr_2D(), ylmcoef); - cudaMemcpy(ylma, ylma_g, 49 * sizeof(double), cudaMemcpyDeviceToHost); - cudaMemcpy(dylma, dylma_g, 49 * sizeof(double), cudaMemcpyDeviceToHost); - cudaDeviceReset(); - - for (int i = 0; i < 49; i++) - { - ylma_ans[i] = ylma_cpu[i]; - if ((abs(ylma[i])!= 0) && (ylma_ans[i]==ylma_ans[i]) && (ylma[i]==ylma[i])) - { - EXPECT_LT(abs(ylma_ans[i] - ylma[i]) / abs(ylma[i]), 1e-15); - } - ylma_ans[i] = ylma_cpu_dpsir[i]; - if ((abs(dylma[i]) != 0) &&(ylma_ans[i]==ylma_ans[i]) && (dylma[i]==dylma[i])) - { - EXPECT_LT(abs(ylma_ans[i] - dylma[i]) / abs(dylma[i]), 1e-15); - } - } - delete[] dr; - delete[] ylmcoef; - -} - -int main(int argc, char** argv) -{ -#ifdef __MPI - MPI_Init(&argc, &argv); - MPI_Comm_size(MPI_COMM_WORLD, &GlobalV::NPROC); - MPI_Comm_rank(MPI_COMM_WORLD, &GlobalV::MY_RANK); -#endif - testing::InitGoogleTest(&argc, argv); - int result = RUN_ALL_TESTS(); - -#ifdef __MPI - MPI_Finalize(); -#endif - - return result; -} \ No newline at end of file diff --git a/source/source_lcao/module_gint/test/test_sph.h b/source/source_lcao/module_gint/test/test_sph.h deleted file mode 100644 index 141e917200..0000000000 --- a/source/source_lcao/module_gint/test/test_sph.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef TEST_SPH_H -#define TEST_SPH_H -#include -// using namespace std; -void sph_harm(const int& Lmax, - const double& xdr, - const double& ydr, - const double& zdr, - std::vector& rly, - double* ylmcoef); - -void grad_rl_sph_harm(const int& Lmax, // max momentum of L - const double& x, - const double& y, - const double& z, - double* rly, - double** grly, - const double* ylmcoef); -#endif \ No newline at end of file diff --git a/source/source_lcao/module_gint/temp_gint/unitcell_info.cpp b/source/source_lcao/module_gint/unitcell_info.cpp similarity index 100% rename from source/source_lcao/module_gint/temp_gint/unitcell_info.cpp rename to source/source_lcao/module_gint/unitcell_info.cpp diff --git a/source/source_lcao/module_gint/temp_gint/unitcell_info.h b/source/source_lcao/module_gint/unitcell_info.h similarity index 100% rename from source/source_lcao/module_gint/temp_gint/unitcell_info.h rename to source/source_lcao/module_gint/unitcell_info.h diff --git a/source/source_lcao/module_lr/esolver_lrtd_lcao.cpp b/source/source_lcao/module_lr/esolver_lrtd_lcao.cpp index 90bcab0c47..f8cafe4902 100644 --- a/source/source_lcao/module_lr/esolver_lrtd_lcao.cpp +++ b/source/source_lcao/module_lr/esolver_lrtd_lcao.cpp @@ -1,5 +1,4 @@ #include "esolver_lrtd_lcao.h" -#include "utils/gint_move.hpp" #include "utils/lr_util.h" #include "hamilt_casida.h" #include "hamilt_ulr.hpp" @@ -44,8 +43,6 @@ void LR::ESolver_LR::move_exx_lri(std::shared_ptr::move_exx_lri: cannot move std::complex to double"); } #endif -template<>void LR::ESolver_LR::set_gint() { this->gint_ = &this->gint_g_;this->gint_g_.gridt = &this->gt_; } -template<>void LR::ESolver_LR>::set_gint() { this->gint_ = &this->gint_k_; this->gint_k_.gridt = &this->gt_; } inline int cal_nupdown_form_occ(const ModuleBase::matrix& wg) { // only for nspin=2 @@ -241,23 +238,7 @@ LR::ESolver_LR::ESolver_LR(ModuleESolver::ESolver_KS_LCAO&& ks_sol this->nupdown = cal_nupdown_form_occ(ks_sol.pelec->wg); reset_dim_spin2(); } -#ifdef __OLD_GINT - //grid integration - this->gt_ = std::move(ks_sol.GridT); - - if (std::is_same::value) - { - this->gint_g_ = std::move(ks_sol.GG); - } - else - { - this->gint_k_ = std::move(ks_sol.GK); - } - this->set_gint(); - this->gint_->reset_DMRGint(1); -#else this->gint_info_ = std::move(ks_sol.gint_info_); -#endif // move pw basis if (this->pw_rho_flag) { @@ -395,66 +376,6 @@ LR::ESolver_LR::ESolver_LR(const Input_para& inp, UnitCell& ucell) : inpu this->ucell, search_radius, PARAM.inp.test_atom_input); -#ifdef __OLD_GINT - this->set_gint(); - this->gint_->gridt = &this->gt_; - - // (3) Periodic condition search for each grid. - double dr_uniform = 0.001; - std::vector rcuts; - std::vector> psi_u; - std::vector> dpsi_u; - std::vector> d2psi_u; - - Gint_Tools::init_orb(dr_uniform, rcuts, ucell, orb, psi_u, dpsi_u, d2psi_u); - this->gt_.set_pbc_grid(this->pw_rho->nx, - this->pw_rho->ny, - this->pw_rho->nz, - this->pw_big->bx, - this->pw_big->by, - this->pw_big->bz, - this->pw_big->nbx, - this->pw_big->nby, - this->pw_big->nbz, - this->pw_big->nbxx, - this->pw_big->nbzp_start, - this->pw_big->nbzp, - this->pw_rho->ny, - this->pw_rho->nplane, - this->pw_rho->startz_current, - ucell, - this->gd, - dr_uniform, - rcuts, - psi_u, - dpsi_u, - d2psi_u, - PARAM.inp.nstream); - psi_u.clear(); - psi_u.shrink_to_fit(); - dpsi_u.clear(); - dpsi_u.shrink_to_fit(); - d2psi_u.clear(); - d2psi_u.shrink_to_fit(); - - this->gint_->prep_grid(this->gt_, - this->pw_big->nbx, - this->pw_big->nby, - this->pw_big->nbzp, - this->pw_big->nbzp_start, - this->pw_rho->nxyz, - this->pw_big->bx, - this->pw_big->by, - this->pw_big->bz, - this->pw_big->bxyz, - this->pw_big->nbxx, - this->pw_rho->ny, - this->pw_rho->nplane, - this->pw_rho->startz_current, - &ucell, - &orb); - this->gint_->initialize_pvpR(ucell, &this->gd, 1); // always use nspin=1 for transition density -#else gint_info_.reset( new ModuleGint::GintInfo( this->pw_big->nbx, @@ -473,7 +394,6 @@ LR::ESolver_LR::ESolver_LR(const Input_para& inp, UnitCell& ucell) : inpu ucell, this->gd)); ModuleGint::Gint::set_gint_info(gint_info_.get()); -#endif // if EXX from scratch, init 2-center integral and calculate Cs, Vs #ifdef __EXX if ((xc_kernel == "hf" || xc_kernel == "hse") && this->input.lr_solver != "spectrum") @@ -533,7 +453,6 @@ void LR::ESolver_LR::runner(UnitCell& ucell, const int istep) this->exx_lri, this->exx_info.info_global.hybrid_alpha, #endif - this->gint_, this->pot, this->kv, this->paraX_, @@ -564,7 +483,6 @@ void LR::ESolver_LR::runner(UnitCell& ucell, const int istep) this->exx_lri, this->exx_info.info_global.hybrid_alpha, #endif - this->gint_, this->pot[is], this->kv, this->paraX_, @@ -621,7 +539,7 @@ void LR::ESolver_LR::after_all_runners(UnitCell& ucell) auto spin_types = (nspin == 2 && !openshell) ? std::vector({ "singlet", "triplet" }) : std::vector({ "updown" }); for (int is = 0;is < this->X.size();++is) { - LR_Spectrum spectrum(nspin, this->nbasis, this->nocc, this->nvirt, this->gint_, *this->pw_rho, *this->psi_ks, + LR_Spectrum spectrum(nspin, this->nbasis, this->nocc, this->nvirt, *this->pw_rho, *this->psi_ks, this->ucell, this->kv, this->gd, this->orb_cutoff_, this->two_center_bundle_, this->paraX_, this->paraC_, this->paraMat_, &this->pelec->ekb.c[is * nstates], this->X[is].template data(), nstates, openshell, diff --git a/source/source_lcao/module_lr/esolver_lrtd_lcao.h b/source/source_lcao/module_lr/esolver_lrtd_lcao.h index f08ddec52b..3f2d040501 100644 --- a/source/source_lcao/module_lr/esolver_lrtd_lcao.h +++ b/source/source_lcao/module_lr/esolver_lrtd_lcao.h @@ -11,13 +11,10 @@ #include #include "source_esolver/esolver_ks_lcao.h" //for the move constructor -#include "source_lcao/module_gint/gint_gamma.h" -#include "source_lcao/module_gint/gint_k.h" -#include "source_lcao/module_gint/grid_technique.h" #include "source_estate/module_dm/density_matrix.h" #include "source_lcao/module_lr/potentials/pot_hxc_lrtd.h" #include "source_lcao/module_lr/hamilt_casida.h" -#include "source_lcao/module_gint/temp_gint/gint_info.h" +#include "source_lcao/module_gint/gint_info.h" #ifdef __EXX // #include #include "source_lcao/module_ri/Exx_LRI.h" @@ -90,13 +87,7 @@ namespace LR bool openshell = false; std::string xc_kernel; - Grid_Technique gt_; - Gint_Gamma gint_g_; - Gint_k gint_k_; - typename TGint::type* gint_ = nullptr; - #ifndef __OLD_GINT std::unique_ptr gint_info_ = nullptr; - #endif void set_gint(); /// @brief variables for parallel distribution of KS orbitals diff --git a/source/source_lcao/module_lr/hamilt_casida.h b/source/source_lcao/module_lr/hamilt_casida.h index e692dc051a..d835fad2d3 100644 --- a/source/source_lcao/module_lr/hamilt_casida.h +++ b/source/source_lcao/module_lr/hamilt_casida.h @@ -17,7 +17,6 @@ namespace LR class HamiltLR { public: - template HamiltLR(std::string& xc_kernel, const int& nspin, const int& naos, @@ -32,7 +31,6 @@ namespace LR std::weak_ptr> exx_lri_in, const double& exx_alpha, #endif - TGint* gint_in, std::weak_ptr pot_in, const K_Vectors& kv_in, const std::vector& pX_in, @@ -95,7 +93,7 @@ namespace LR #endif { OperatorLRHxc* lr_hxc = new OperatorLRHxc(nspin, naos, nocc, nvirt, psi_ks_in, - this->DM_trans, gint_in, pot_in, ucell_in, orb_cutoff, gd_in, kv_in, pX_in, pc_in, pmat_in); + this->DM_trans, pot_in, ucell_in, orb_cutoff, gd_in, kv_in, pX_in, pc_in, pmat_in); this->ops->add(lr_hxc); } #ifdef __EXX diff --git a/source/source_lcao/module_lr/hamilt_ulr.hpp b/source/source_lcao/module_lr/hamilt_ulr.hpp index 838a3d4999..4f5fdfbfd9 100644 --- a/source/source_lcao/module_lr/hamilt_ulr.hpp +++ b/source/source_lcao/module_lr/hamilt_ulr.hpp @@ -15,7 +15,6 @@ namespace LR class HamiltULR { public: - template HamiltULR(std::string& xc_kernel, const int& nspin, const int& naos, @@ -30,7 +29,6 @@ namespace LR std::weak_ptr> exx_lri_in, const double& exx_alpha, #endif - TGint* gint_in, std::vector>& pot_in, const K_Vectors& kv_in, const std::vector& pX_in, ///< {up, down} @@ -49,7 +47,7 @@ namespace LR this->ops[3] = new OperatorLRDiag(eig_ks.c + nk * (nocc[0] + nvirt[0]), pX_in[1], nk, nocc[1], nvirt[1]); auto newHxc = [&](const int& sl, const int& sr) { return new OperatorLRHxc(nspin, naos, nocc, nvirt, psi_ks_in, - this->DM_trans, gint_in, pot_in[sl], ucell_in, orb_cutoff, gd_in, kv_in, pX_in, pc_in, pmat_in, { sl,sr }); }; + this->DM_trans, pot_in[sl], ucell_in, orb_cutoff, gd_in, kv_in, pX_in, pc_in, pmat_in, { sl,sr }); }; this->ops[0]->add(newHxc(0, 0)); this->ops[1] = newHxc(0, 1); this->ops[2] = newHxc(1, 0); diff --git a/source/source_lcao/module_lr/lr_spectrum.cpp b/source/source_lcao/module_lr/lr_spectrum.cpp index 235a9829e2..f698541c30 100644 --- a/source/source_lcao/module_lr/lr_spectrum.cpp +++ b/source/source_lcao/module_lr/lr_spectrum.cpp @@ -6,7 +6,7 @@ #include "source_lcao/module_lr/utils/lr_util.h" #include "source_lcao/module_lr/utils/lr_util_hcontainer.h" #include "source_lcao/module_lr/utils/lr_util_print.h" -#include "source_lcao/module_gint/temp_gint/gint_interface.h" +#include "source_lcao/module_gint/gint_interface.h" template elecstate::DensityMatrix LR::LR_Spectrum::cal_transition_density_matrix(const int istate, const T* X_in, const bool need_R) @@ -35,16 +35,6 @@ elecstate::DensityMatrix LR::LR_Spectrum::cal_transition_density_matrix return DM_trans; } -#ifdef __OLD_GINT -template -void LR::LR_Spectrum::cal_gint_rho(double** rho, const int& nrxx) -{ - ModuleBase::GlobalFunc::ZEROS(rho[0], nrxx); - Gint_inout inout_rho(rho, Gint_Tools::job_type::rho, 1, false); - this->gint->cal_gint(&inout_rho); -} -#endif - inline void check_sum_rule(const double& osc_tot) { if (std::abs(osc_tot - 1.0) > 1e-3) { @@ -65,13 +55,8 @@ ModuleBase::Vector3 LR::LR_Spectrum::cal_transition_dipole_istat // 2. transition density double** rho_trans; LR_Util::_allocate_2order_nested_ptr(rho_trans, 1, this->rho_basis.nrxx); -#ifdef __OLD_GINT - this->gint->transfer_DM2DtoGrid({ DM_trans.get_DMR_vector().at(is) }); - this->cal_gint_rho(rho_trans, this->rho_basis.nrxx); -#else ModuleBase::GlobalFunc::ZEROS(rho_trans[0], this->rho_basis.nrxx); ModuleGint::cal_gint_rho({ DM_trans.get_DMR_vector().at(is) }, 1, rho_trans, false); -#endif // 3. transition dipole moment for (int ir = 0; ir < rho_basis.nrxx; ++ir) @@ -115,24 +100,14 @@ ModuleBase::Vector3> LR::LR_Spectrum>: // real part LR_Util::get_DMR_real_imag_part(DM_trans, DM_trans_real_imag, ucell.nat, 'R'); -#ifdef __OLD_GINT - this->gint->transfer_DM2DtoGrid(DM_trans_real_imag.get_DMR_vector()); - this->cal_gint_rho(rho_trans_real, this->rho_basis.nrxx); -#else ModuleBase::GlobalFunc::ZEROS(rho_trans_real[0], this->rho_basis.nrxx); ModuleGint::cal_gint_rho(DM_trans_real_imag.get_DMR_vector(), 1, rho_trans_real, false); -#endif // LR_Util::print_grid_nonzero(rho_trans_real[0], this->rho_basis.nrxx, 10, "rho_trans"); // imag part LR_Util::get_DMR_real_imag_part(DM_trans, DM_trans_real_imag, ucell.nat, 'I'); -#ifdef __OLD_GINT - this->gint->transfer_DM2DtoGrid(DM_trans_real_imag.get_DMR_vector()); - this->cal_gint_rho(rho_trans_imag, this->rho_basis.nrxx); -#else ModuleBase::GlobalFunc::ZEROS(rho_trans_imag[0], this->rho_basis.nrxx); ModuleGint::cal_gint_rho(DM_trans_real_imag.get_DMR_vector(), 1, rho_trans_imag, false); -#endif // LR_Util::print_grid_nonzero(rho_trans_imag[0], this->rho_basis.nrxx, 10, "rho_trans"); // 3. transition dipole moment diff --git a/source/source_lcao/module_lr/lr_spectrum.h b/source/source_lcao/module_lr/lr_spectrum.h index bd2d1f489d..79a188621b 100644 --- a/source/source_lcao/module_lr/lr_spectrum.h +++ b/source/source_lcao/module_lr/lr_spectrum.h @@ -1,6 +1,5 @@ #pragma once #include "source_cell/klist.h" -#include "source_lcao/module_lr/utils/gint_template.h" #include "source_psi/psi.h" #include "source_estate/module_dm/density_matrix.h" #include "source_lcao/module_lr/utils/lr_util.h" @@ -13,14 +12,14 @@ namespace LR { public: LR_Spectrum(const int& nspin_global, const int& naos, const std::vector& nocc, const std::vector& nvirt, - typename TGint::type* gint, const ModulePW::PW_Basis& rho_basis, psi::Psi& psi_ks_in, + const ModulePW::PW_Basis& rho_basis, psi::Psi& psi_ks_in, const UnitCell& ucell, const K_Vectors& kv_in, const Grid_Driver& gd, const std::vector& orb_cutoff, const TwoCenterBundle& two_center_bundle_, const std::vector& pX_in, const Parallel_2D& pc_in, const Parallel_Orbitals& pmat_in, const double* eig, const T* X, const int& nstate, const bool& openshell, const std::string& gauge = "length") : nspin_x(openshell ? 2 : 1), naos(naos), nocc(nocc), nvirt(nvirt), nk(kv_in.get_nks() / nspin_global), - gint(gint), rho_basis(rho_basis), ucell(ucell), kv(kv_in), gd_(gd), + rho_basis(rho_basis), ucell(ucell), kv(kv_in), gd_(gd), orb_cutoff_(orb_cutoff), two_center_bundle_(two_center_bundle_), pX(pX_in), pc(pc_in), pmat(pmat_in), eig(eig), X(X), nstate(nstate), @@ -75,7 +74,6 @@ namespace LR const std::vector& pX; const Parallel_2D& pc; const Parallel_Orbitals& pmat; - typename TGint::type* gint = nullptr; const ModulePW::PW_Basis& rho_basis; const Grid_Driver& gd_; const UnitCell& ucell; diff --git a/source/source_lcao/module_lr/operator_casida/operator_lr_hxc.cpp b/source/source_lcao/module_lr/operator_casida/operator_lr_hxc.cpp index 56d2e4fda7..4aed4244f4 100644 --- a/source/source_lcao/module_lr/operator_casida/operator_lr_hxc.cpp +++ b/source/source_lcao/module_lr/operator_casida/operator_lr_hxc.cpp @@ -9,7 +9,7 @@ #include "source_lcao/module_hcontainer/hcontainer_funcs.h" #include "source_lcao/module_lr/ao_to_mo_transformer/ao_to_mo.h" #include "source_pw/module_pwdft/global.h" -#include "source_lcao/module_gint/temp_gint/gint_interface.h" +#include "source_lcao/module_gint/gint_interface.h" inline double conj(double a) { return a; } inline std::complex conj(std::complex a) { return std::conj(a); } @@ -61,13 +61,7 @@ namespace LR const int& nrxx = this->pot.lock()->nrxx; LR_Util::_allocate_2order_nested_ptr(rho_trans, 1, nrxx); // currently gint_kernel_rho uses PARAM.inp.nspin, it needs refactor ModuleBase::GlobalFunc::ZEROS(rho_trans[0], nrxx); -#ifdef __OLD_GINT - this->gint->transfer_DM2DtoGrid(this->DM_trans->get_DMR_vector()); // 2d block to grid - Gint_inout inout_rho(rho_trans, Gint_Tools::job_type::rho, 1, false); - this->gint->cal_gint(&inout_rho); -#else ModuleGint::cal_gint_rho(this->DM_trans->get_DMR_vector(), 1, rho_trans, false); -#endif // 3. v_hxc = f_hxc * rho_trans ModuleBase::matrix vr_hxc(1, nrxx); //grid this->pot.lock()->cal_v_eff(rho_trans, ucell, vr_hxc, ispin_ks); @@ -75,14 +69,7 @@ namespace LR // 4. V^{Hxc}_{\mu,\nu}=\int{dr} \phi_\mu(r) v_{Hxc}(r) \phi_\mu(r) this->hR->set_zero(); // clear hR for each bands -#ifdef __OLD_GINT - Gint_inout inout_vlocal(vr_hxc.c, 0, Gint_Tools::job_type::vlocal); - this->gint->get_hRGint()->set_zero(); - this->gint->cal_gint(&inout_vlocal); - this->gint->transfer_pvpR(&*this->hR, &ucell); //grid to 2d block -#else ModuleGint::cal_gint_vl(vr_hxc.c, &*this->hR); -#endif ModuleBase::timer::tick("OperatorLRHxc", "grid_calculation"); } @@ -109,14 +96,7 @@ namespace LR LR_Util::_allocate_2order_nested_ptr(rho_trans, 1, nrxx); // nspin=1 for transition density ModuleBase::GlobalFunc::ZEROS(rho_trans[0], nrxx); -#ifdef __OLD_GINT - this->gint->transfer_DM2DtoGrid(DM_trans_real_imag.get_DMR_vector()); - // LR_Util::print_HR(*this->gint->get_DMRGint()[0], this->ucell.nat, "DMR(grid, real)"); - Gint_inout inout_rho(rho_trans, Gint_Tools::job_type::rho, 1, false); - this->gint->cal_gint(&inout_rho); -#else ModuleGint::cal_gint_rho(DM_trans_real_imag.get_DMR_vector(), 1, rho_trans, false); -#endif // print_grid_nonzero(rho_trans[0], nrxx, 10, "rho_trans"); // 3. v_hxc = f_hxc * rho_trans @@ -128,15 +108,7 @@ namespace LR // 4. V^{Hxc}_{\mu,\nu}=\int{dr} \phi_\mu(r) v_{Hxc}(r) \phi_\mu(r) HR_real_imag.set_zero(); -#ifdef __OLD_GINT - Gint_inout inout_vlocal(vr_hxc.c, 0, Gint_Tools::job_type::vlocal); - this->gint->get_hRGint()->set_zero(); - this->gint->cal_gint(&inout_vlocal); - // LR_Util::print_HR(*this->gint->get_hRGint(), this->ucell.nat, "VR(grid)"); - this->gint->transfer_pvpR(&HR_real_imag, &ucell, &this->gd); -#else ModuleGint::cal_gint_vl(vr_hxc.c, &HR_real_imag); -#endif // LR_Util::print_HR(HR_real_imag, this->ucell.nat, "VR(real, 2d)"); LR_Util::set_HR_real_imag_part(HR_real_imag, *this->hR, ucell.nat, type); }; diff --git a/source/source_lcao/module_lr/operator_casida/operator_lr_hxc.h b/source/source_lcao/module_lr/operator_casida/operator_lr_hxc.h index e201561380..bb82780e14 100644 --- a/source/source_lcao/module_lr/operator_casida/operator_lr_hxc.h +++ b/source/source_lcao/module_lr/operator_casida/operator_lr_hxc.h @@ -1,8 +1,6 @@ #pragma once #include "source_cell/klist.h" #include "source_hamilt/operator.h" -#include "source_lcao/module_lr/utils/gint_template.h" -#include "source_lcao/module_gint/grid_technique.h" #include "source_estate/module_dm/density_matrix.h" #include "source_lcao/module_lr/potentials/pot_hxc_lrtd.h" #include "source_lcao/module_lr/utils/lr_util.h" @@ -21,7 +19,6 @@ namespace LR const std::vector& nvirt, const psi::Psi& psi_ks_in, std::unique_ptr>& DM_trans_in, - typename TGint::type* gint_in, std::weak_ptr pot_in, const UnitCell& ucell_in, const std::vector& orb_cutoff, @@ -32,7 +29,7 @@ namespace LR const Parallel_Orbitals& pmat_in, const std::vector& ispin_ks = {0}) : nspin(nspin), naos(naos), nocc(nocc), nvirt(nvirt), nk(kv_in.get_nks() / nspin), psi_ks(psi_ks_in), - DM_trans(DM_trans_in), gint(gint_in), pot(pot_in), ucell(ucell_in), orb_cutoff_(orb_cutoff), gd(gd_in), + DM_trans(DM_trans_in), pot(pot_in), ucell(ucell_in), orb_cutoff_(orb_cutoff), gd(gd_in), kv(kv_in), pX(pX_in), pc(pc_in), pmat(pmat_in), ispin_ks(ispin_ks) { ModuleBase::TITLE("OperatorLRHxc", "OperatorLRHxc"); @@ -82,8 +79,6 @@ namespace LR std::weak_ptr pot; - typename TGint::type* gint = nullptr; - const UnitCell& ucell; std::vector orb_cutoff_; const Grid_Driver& gd; diff --git a/source/source_lcao/module_lr/utils/gint_move.hpp b/source/source_lcao/module_lr/utils/gint_move.hpp deleted file mode 100644 index b7c01118ef..0000000000 --- a/source/source_lcao/module_lr/utils/gint_move.hpp +++ /dev/null @@ -1,87 +0,0 @@ -#include "lr_util.h" -#include "source_lcao/module_gint/gint_gamma.h" -#include "source_io/module_parameter/parameter.h" -#include "source_lcao/module_gint/gint_k.h" -#include "source_lcao/module_gint/grid_technique.h" - -// Here will be the only place where GlobalCs are used (to be moved) in source_lcao/module_lr -#include "source_pw/module_pwdft/global.h" - -template -using D2 = void(*) (T**, size_t); -// template -// using D3 = void(*) (T***, size_t, size_t); -// template -// D2 d2 = LR_Util::_deallocate_2order_nested_ptr; -// template -// D3 d3 = LR_Util::delete_p3; -// Change to C++ 11 -D2 d2 = LR_Util::_deallocate_2order_nested_ptr; -// D3 d3 = LR_Util::delete_p3; - - -Gint& Gint::operator=(Gint&& rhs) -{ - if (this == &rhs) {return *this; -} - - this->nbx = rhs.nbx; - this->nby = rhs.nby; - this->nbz = rhs.nbz; - this->ncxyz = rhs.ncxyz; - this->nbz_start = rhs.nbz_start; - this->bx = rhs.bx; - this->by = rhs.by; - this->bz = rhs.bz; - this->bxyz = rhs.bxyz; - this->nbxx = rhs.nbxx; - this->ny = rhs.ny; - this->nplane = rhs.nplane; - this->startz_current = rhs.startz_current; - - this->gridt = rhs.gridt; - this->ucell = rhs.ucell; - - // move hR after refactor - this->hRGint = rhs.hRGint; - rhs.hRGint = nullptr; - this->hRGintCd = rhs.hRGintCd; - rhs.hRGintCd = nullptr; - for (int i = 0; i < this->dmr_gint.size(); i++) - { - delete this->dmr_gint[i]; - } - for (int i = 0; i < this->hr_gint_tmp .size(); i++) - { - delete this->hr_gint_tmp [i]; - } - this->pvdpRx_reduced = std::move(rhs.pvdpRx_reduced); - this->pvdpRy_reduced = std::move(rhs.pvdpRy_reduced); - this->pvdpRz_reduced = std::move(rhs.pvdpRz_reduced); - this->dmr_gint = std::move(rhs.dmr_gint); - this->hr_gint_tmp = std::move(rhs.hr_gint_tmp ); - this->dm2d_tmp = rhs.dm2d_tmp; - rhs.dm2d_tmp = nullptr; - - return *this; -} - -Gint_Gamma& Gint_Gamma::operator=(Gint_Gamma&& rhs) -{ - if (this == &rhs) {return *this; -} - Gint::operator=(std::move(rhs)); - - // DM may not needed in beyond DFT ESolver - // if (this->DM != nullptr) d3(this->DM, PARAM.inp.nspin, gridt.lgd); - assert(this->DM == nullptr); - return *this; -} - -Gint_k& Gint_k::operator=(Gint_k&& rhs) -{ - if (this == &rhs) {return *this; -} - this->Gint::operator=(std::move(rhs)); - return *this; -} \ No newline at end of file diff --git a/source/source_lcao/module_lr/utils/gint_template.h b/source/source_lcao/module_lr/utils/gint_template.h deleted file mode 100644 index e56bb33961..0000000000 --- a/source/source_lcao/module_lr/utils/gint_template.h +++ /dev/null @@ -1,15 +0,0 @@ -#pragma once -#include "source_lcao/module_gint/gint_gamma.h" -#include "source_lcao/module_gint/gint_k.h" -namespace LR -{ - template struct TGint; - template <> - struct TGint { - using type = Gint_Gamma; - }; - template <> - struct TGint> { - using type = Gint_k; - }; -} \ No newline at end of file diff --git a/source/source_lcao/module_operator_lcao/meta_lcao.h b/source/source_lcao/module_operator_lcao/meta_lcao.h index 53dca154c4..61bcf7510a 100644 --- a/source/source_lcao/module_operator_lcao/meta_lcao.h +++ b/source/source_lcao/module_operator_lcao/meta_lcao.h @@ -1,8 +1,6 @@ #ifndef METALCAO_H #define METALCAO_H #include "source_base/timer.h" -#include "source_lcao/module_gint/gint_gamma.h" -#include "source_lcao/module_gint/gint_k.h" #include "operator_lcao.h" namespace hamilt diff --git a/source/source_lcao/module_operator_lcao/veff_lcao.cpp b/source/source_lcao/module_operator_lcao/veff_lcao.cpp index 576d9c3a5b..0df6ed33a5 100644 --- a/source/source_lcao/module_operator_lcao/veff_lcao.cpp +++ b/source/source_lcao/module_operator_lcao/veff_lcao.cpp @@ -4,7 +4,7 @@ #include "source_base/tool_title.h" #include "source_hamilt/module_xc/xc_functional.h" #include "source_cell/unitcell.h" -#include "source_lcao/module_gint/temp_gint/gint_interface.h" +#include "source_lcao/module_gint/gint_interface.h" namespace hamilt { @@ -68,20 +68,6 @@ void Veff>::contributeHR() double* vr_eff1 = this->pot->get_effective_v(this->current_spin); double* vofk_eff1 = this->pot->get_effective_vofk(this->current_spin); -#ifdef __OLD_GINT - if(XC_Functional::get_ked_flag()) - { - Gint_inout inout(vr_eff1, vofk_eff1, Gint_Tools::job_type::vlocal_meta); - this->GG->cal_vlocal(&inout, this->new_e_iteration); - } - else - { - Gint_inout inout(vr_eff1, Gint_Tools::job_type::vlocal); - this->GG->cal_vlocal(&inout, this->new_e_iteration); - } - this->GG->transfer_pvpR(this->hR,this->ucell); - this->new_e_iteration = false; -#else if(XC_Functional::get_ked_flag()) { ModuleGint::cal_gint_vl_metagga(vr_eff1, vofk_eff1, this->hR); @@ -90,7 +76,6 @@ void Veff>::contributeHR() { ModuleGint::cal_gint_vl(vr_eff1, this->hR); } -#endif if(this->nspin == 2) { @@ -113,23 +98,6 @@ void Veff, double>>::contributeHR() double* vr_eff1 = this->pot->get_effective_v(this->current_spin); double* vofk_eff1 = this->pot->get_effective_vofk(this->current_spin); -#ifdef __OLD_GINT - // if you change the place of the following code, - // rememeber to delete the #include - if(XC_Functional::get_ked_flag()) - { - Gint_inout inout(vr_eff1, vofk_eff1, 0, Gint_Tools::job_type::vlocal_meta); - this->GK->cal_gint(&inout); - } - else - { - // vlocal = Vh[rho] + Vxc[rho] + Vl(pseudo) - Gint_inout inout(vr_eff1, 0, Gint_Tools::job_type::vlocal); - this->GK->cal_gint(&inout); - } - - this->GK->transfer_pvpR(this->hR,this->ucell,this->gd); -#else if(XC_Functional::get_ked_flag()) { ModuleGint::cal_gint_vl_metagga(vr_eff1, vofk_eff1, this->hR); @@ -138,7 +106,6 @@ void Veff, double>>::contributeHR() { ModuleGint::cal_gint_vl(vr_eff1, this->hR); } -#endif if(this->nspin == 2) { @@ -155,30 +122,6 @@ void Veff, std::complex>>::contributeH ModuleBase::TITLE("Veff", "contributeHR"); ModuleBase::timer::tick("Veff", "contributeHR"); -#ifdef __OLD_GINT - double* vr_eff1 = nullptr; - double* vofk_eff1 = nullptr; - for (int is = 0; is < 4; is++) - { - vr_eff1 = this->pot->get_effective_v(is); - if(XC_Functional::get_ked_flag()) - { - vofk_eff1 = this->pot->get_effective_vofk(is); - } - - if(XC_Functional::get_ked_flag()) - { - Gint_inout inout(vr_eff1, vofk_eff1, is, Gint_Tools::job_type::vlocal_meta); - this->GK->cal_gint(&inout); - } - else - { - Gint_inout inout(vr_eff1, is, Gint_Tools::job_type::vlocal); - this->GK->cal_gint(&inout); - } - } - this->GK->transfer_pvpR(this->hR,this->ucell,this->gd); -#else std::vector vr_eff(4, nullptr); std::vector vofk_eff(4, nullptr); for (int is = 0; is < 4; is++) @@ -196,7 +139,6 @@ void Veff, std::complex>>::contributeH { ModuleGint::cal_gint_vl(vr_eff, this->hR); } -#endif ModuleBase::timer::tick("Veff", "contributeHR"); return; diff --git a/source/source_lcao/module_operator_lcao/veff_lcao.h b/source/source_lcao/module_operator_lcao/veff_lcao.h index a621f71fc6..8ec1265a00 100644 --- a/source/source_lcao/module_operator_lcao/veff_lcao.h +++ b/source/source_lcao/module_operator_lcao/veff_lcao.h @@ -2,8 +2,6 @@ #define VEFFLCAO_H #include "source_base/timer.h" #include "source_estate/module_pot/potential_new.h" -#include "source_lcao/module_gint/gint_gamma.h" -#include "source_lcao/module_gint/gint_k.h" #include "operator_lcao.h" #include "source_cell/module_neighbor/sltk_grid_driver.h" #include "source_cell/unitcell.h" @@ -32,11 +30,9 @@ class Veff> : public OperatorLCAO { public: /** - * @brief Construct a new Veff object for multi-kpoint calculation - * @param GK_in: the pointer of Gint_k object, used for grid integration + * @brief Construct a new Veff object */ - Veff>(Gint_k* GK_in, - HS_Matrix_K* hsk_in, + Veff>(HS_Matrix_K* hsk_in, const std::vector>& kvec_d_in, elecstate::Potential* pot_in, hamilt::HContainer* hR_in, @@ -44,36 +40,12 @@ class Veff> : public OperatorLCAO const std::vector& orb_cutoff, const Grid_Driver* GridD_in, const int& nspin) - : GK(GK_in), orb_cutoff_(orb_cutoff), pot(pot_in), ucell(ucell_in), + : orb_cutoff_(orb_cutoff), pot(pot_in), ucell(ucell_in), gd(GridD_in), OperatorLCAO(hsk_in, kvec_d_in, hR_in) { this->cal_type = calculation_type::lcao_gint; this->initialize_HR(ucell_in, GridD_in); -#ifdef __OLD_GINT - GK_in->initialize_pvpR(*ucell_in, GridD_in, nspin); -#endif - } - /** - * @brief Construct a new Veff object for Gamma-only calculation - * @param GG_in: the pointer of Gint_Gamma object, used for grid integration - */ - Veff>(Gint_Gamma* GG_in, - HS_Matrix_K* hsk_in, - const std::vector>& kvec_d_in, - elecstate::Potential* pot_in, - hamilt::HContainer* hR_in, - const UnitCell* ucell_in, - const std::vector& orb_cutoff, - const Grid_Driver* GridD_in, - const int& nspin) - : GG(GG_in), orb_cutoff_(orb_cutoff), pot(pot_in), OperatorLCAO(hsk_in, kvec_d_in, hR_in) - { - this->cal_type = calculation_type::lcao_gint; - this->initialize_HR(ucell_in, GridD_in); -#ifdef __OLD_GINT - GG_in->initialize_pvpR(*ucell_in, GridD_in, nspin); -#endif } ~Veff>(){}; @@ -90,11 +62,6 @@ class Veff> : public OperatorLCAO const Grid_Driver* gd; private: - // used for k-dependent grid integration. - Gint_k* GK = nullptr; - - // used for gamma only algorithms. - Gint_Gamma* GG = nullptr; std::vector orb_cutoff_; diff --git a/source/source_lcao/module_rdmft/rdmft.cpp b/source/source_lcao/module_rdmft/rdmft.cpp index 837128fcf9..34549fb671 100644 --- a/source/source_lcao/module_rdmft/rdmft.cpp +++ b/source/source_lcao/module_rdmft/rdmft.cpp @@ -55,9 +55,7 @@ RDMFT::~RDMFT() } template -void RDMFT::init(Gint_Gamma& GG_in, - Gint_k& GK_in, - Parallel_Orbitals& ParaV_in, +void RDMFT::init(Parallel_Orbitals& ParaV_in, UnitCell& ucell_in, const Grid_Driver& gd_in, K_Vectors& kv_in, @@ -67,8 +65,6 @@ void RDMFT::init(Gint_Gamma& GG_in, std::string XC_func_rdmft_in, double alpha_power_in) { - GG = &GG_in; - GK = &GK_in; ParaV = &ParaV_in; ucell = &ucell_in; kv = &kv_in; diff --git a/source/source_lcao/module_rdmft/rdmft.h b/source/source_lcao/module_rdmft/rdmft.h index a8bf7ea97e..0e6b532d6e 100644 --- a/source/source_lcao/module_rdmft/rdmft.h +++ b/source/source_lcao/module_rdmft/rdmft.h @@ -13,8 +13,6 @@ #include "source_base/parallel_2d.h" #include "source_basis/module_ao/parallel_orbitals.h" #include "source_cell/unitcell.h" -#include "source_lcao/module_gint/gint_gamma.h" -#include "source_lcao/module_gint/gint_k.h" #include "source_basis/module_ao/ORB_read.h" #include "source_basis/module_nao/two_center_bundle.h" @@ -80,9 +78,7 @@ class RDMFT // std::vector E_RDMFT(4); //! initialization of rdmft calculation - void init(Gint_Gamma& GG_in, - Gint_k& GK_in, - Parallel_Orbitals& ParaV_in, + void init(Parallel_Orbitals& ParaV_in, UnitCell& ucell_in, const Grid_Driver& gd_in, K_Vectors& kv_in, @@ -190,9 +186,6 @@ class RDMFT const int cal_E_type = 1; // cal_type = 2 just support XC-functional without exx /****** these parameters are passed in from outside, don't need delete ******/ - // GK and GG are used for multi-k grid integration and gamma only algorithms respectively - Gint_k* GK = nullptr; - Gint_Gamma* GG = nullptr; Charge* charge = nullptr; // update after ion step diff --git a/source/source_lcao/module_rdmft/rdmft_pot.cpp b/source/source_lcao/module_rdmft/rdmft_pot.cpp index ba29d9ebc6..a962c2cac1 100644 --- a/source/source_lcao/module_rdmft/rdmft_pot.cpp +++ b/source/source_lcao/module_rdmft/rdmft_pot.cpp @@ -69,8 +69,7 @@ void RDMFT::cal_V_TV() if( PARAM.inp.gamma_only ) { - V_local = new rdmft::Veff_rdmft(GG, - hsk_TV, + V_local = new rdmft::Veff_rdmft(hsk_TV, kv->kvec_d, this->pelec->pot, HR_TV, @@ -86,8 +85,7 @@ void RDMFT::cal_V_TV() } else { - V_local = new rdmft::Veff_rdmft(GK, - hsk_TV, + V_local = new rdmft::Veff_rdmft(hsk_TV, kv->kvec_d, this->pelec->pot, HR_TV, @@ -117,8 +115,7 @@ void RDMFT::cal_V_hartree() if( PARAM.inp.gamma_only ) { - V_hartree = new rdmft::Veff_rdmft(GG, - hsk_hartree, + V_hartree = new rdmft::Veff_rdmft(hsk_hartree, kv->kvec_d, this->pelec->pot, HR_hartree, @@ -135,8 +132,7 @@ void RDMFT::cal_V_hartree() else { // this can be optimized, use potHartree.update_from_charge() - V_hartree = new rdmft::Veff_rdmft(GK, - hsk_hartree, + V_hartree = new rdmft::Veff_rdmft(hsk_hartree, kv->kvec_d, this->pelec->pot, HR_hartree, @@ -197,8 +193,7 @@ void RDMFT::cal_V_XC(const UnitCell& ucell) if( PARAM.inp.gamma_only ) { // this can be optimized, use potXC.update_from_charge() - V_dft_XC = new rdmft::Veff_rdmft(GG, - hsk_dft_XC, + V_dft_XC = new rdmft::Veff_rdmft(hsk_dft_XC, kv->kvec_d, this->pelec->pot, HR_dft_XC, @@ -217,8 +212,7 @@ void RDMFT::cal_V_XC(const UnitCell& ucell) else { // this can be optimized, use potXC.update_from_charge() - V_dft_XC = new rdmft::Veff_rdmft(GK, - hsk_dft_XC, + V_dft_XC = new rdmft::Veff_rdmft(hsk_dft_XC, kv->kvec_d, this->pelec->pot, HR_dft_XC, diff --git a/source/source_lcao/module_rdmft/rdmft_tools.cpp b/source/source_lcao/module_rdmft/rdmft_tools.cpp index 32c22dfef2..f8725b204c 100644 --- a/source/source_lcao/module_rdmft/rdmft_tools.cpp +++ b/source/source_lcao/module_rdmft/rdmft_tools.cpp @@ -12,7 +12,7 @@ #include "source_estate/module_pot/pot_local.h" #include "source_estate/module_pot/pot_xc.h" #include "source_pw/module_pwdft/structure_factor.h" -#include "source_lcao/module_gint/temp_gint/gint_interface.h" +#include "source_lcao/module_gint/gint_interface.h" #include #include @@ -266,12 +266,7 @@ void Veff_rdmft, double>::contributeHR() vr_eff_rdmft = &v_matrix_hartree(is, 0); // do grid integral calculation to get HR -#ifdef __OLD_GINT - Gint_inout inout(vr_eff_rdmft, is, Gint_Tools::job_type::vlocal); - this->GK->cal_gint(&inout); -#else ModuleGint::cal_gint_vl(vr_eff_rdmft, this->hR); -#endif } } else if( potential_ == "local" ) @@ -285,12 +280,7 @@ void Veff_rdmft, double>::contributeHR() vr_eff_rdmft = &v_matrix_local(0, 0); // do grid integral calculation to get HR -#ifdef __OLD_GINT - Gint_inout inout(vr_eff_rdmft, 0, Gint_Tools::job_type::vlocal); - this->GK->cal_gint(&inout); -#else ModuleGint::cal_gint_vl(vr_eff_rdmft, this->hR); -#endif } else if( potential_ == "xc" ) { @@ -309,12 +299,7 @@ void Veff_rdmft, double>::contributeHR() vr_eff_rdmft = &v_matrix_XC(is, 0); // do grid integral calculation to get HR -#ifdef __OLD_GINT - Gint_inout inout(vr_eff_rdmft, is, Gint_Tools::job_type::vlocal); - this->GK->cal_gint(&inout); -#else ModuleGint::cal_gint_vl(vr_eff_rdmft, this->hR); -#endif } } else @@ -323,10 +308,6 @@ void Veff_rdmft, double>::contributeHR() } // get HR for 2D-block parallel format - // this->GK->transfer_pvpR(this->hR); -#ifdef __OLD_GINT - this->GK->transfer_pvpR(this->hR,this->ucell,this->gd); -#endif if(this->nspin == 2) { @@ -351,7 +332,6 @@ void Veff_rdmft::contributeHR() ModuleBase::TITLE("Veff", "contributeHR"); ModuleBase::timer::tick("Veff", "contributeHR"); - // this->GK->reset_spin(this->current_spin); double* vr_eff_rdmft = nullptr; @@ -368,12 +348,7 @@ void Veff_rdmft::contributeHR() vr_eff_rdmft = &v_matrix_hartree(is, 0); // do grid integral calculation to get HR -#ifdef __OLD_GINT - Gint_inout inout(vr_eff_rdmft, is, Gint_Tools::job_type::vlocal); - this->GG->cal_gint(&inout); -#else ModuleGint::cal_gint_vl(vr_eff_rdmft, this->hR); -#endif } } else if( potential_ == "local" ) @@ -387,16 +362,7 @@ void Veff_rdmft::contributeHR() vr_eff_rdmft = &v_matrix_local(0, 0); // do grid integral calculation to get HR -#ifdef __OLD_GINT - Gint_inout inout(vr_eff_rdmft, 0, Gint_Tools::job_type::vlocal); - - // because in gamma_only, cal_gint would not set hRGint zero first - // so must use cal_vlocal(), and in rdmft_test.h, calculate V_hartree->contributeHR() first - - this->GG->cal_vlocal(&inout, false); // cal_gint ??? -#else ModuleGint::cal_gint_vl(vr_eff_rdmft, this->hR); -#endif } else if( potential_ == "xc" ) { @@ -414,12 +380,7 @@ void Veff_rdmft::contributeHR() vr_eff_rdmft = &v_matrix_XC(is, 0); // do grid integral calculation to get HR -#ifdef __OLD_GINT - Gint_inout inout(vr_eff_rdmft, is, Gint_Tools::job_type::vlocal); - this->GG->cal_gint(&inout); -#else ModuleGint::cal_gint_vl(vr_eff_rdmft, this->hR); -#endif } } else @@ -427,10 +388,6 @@ void Veff_rdmft::contributeHR() std::cout << "\n\n!!!!!!\n there may be something wrong when use class Veff_rdmft\n\n!!!!!!\n"; } -#ifdef __OLD_GINT - // get HR for 2D-block parallel format - this->GG->transfer_pvpR(this->hR,this->ucell); -#endif this->new_e_iteration = false; if(this->nspin == 2) diff --git a/source/source_lcao/module_rdmft/rdmft_tools.h b/source/source_lcao/module_rdmft/rdmft_tools.h index 7b1639f8e8..91c69fb8c4 100644 --- a/source/source_lcao/module_rdmft/rdmft_tools.h +++ b/source/source_lcao/module_rdmft/rdmft_tools.h @@ -9,8 +9,6 @@ #include "source_base/matrix.h" #include "source_cell/module_neighbor/sltk_grid_driver.h" #include "source_cell/unitcell.h" -#include "source_lcao/module_gint/gint_gamma.h" -#include "source_lcao/module_gint/gint_k.h" #include "source_estate/module_pot/potential_new.h" #include "source_base/module_external/blas_connector.h" #include "source_base/module_external/scalapack_connector.h" @@ -259,10 +257,8 @@ class Veff_rdmft : public hamilt::OperatorLCAO public: /** * @brief Construct a new Veff object for multi-kpoint calculation - * @param GK_in: the pointer of Gint_k object, used for grid integration */ - Veff_rdmft(Gint_k* GK_in, - hamilt::HS_Matrix_K* hsk_in, + Veff_rdmft(hamilt::HS_Matrix_K* hsk_in, const std::vector>& kvec_d_in, elecstate::Potential* pot_in, hamilt::HContainer* hR_in, @@ -277,45 +273,15 @@ class Veff_rdmft : public hamilt::OperatorLCAO const std::string potential_in, double* etxc_in = nullptr, double* vtxc_in = nullptr) - : GK(GK_in), orb_cutoff_(orb_cutoff), pot(pot_in), ucell(ucell_in), + : orb_cutoff_(orb_cutoff), pot(pot_in), ucell(ucell_in), gd(GridD_in), hamilt::OperatorLCAO(hsk_in, kvec_d_in, hR_in), charge_(charge_in), rho_basis_(rho_basis_in), vloc_(vloc_in), sf_(sf_in), potential_(potential_in), etxc(etxc_in), vtxc(vtxc_in) { this->cal_type = hamilt::calculation_type::lcao_gint; this->initialize_HR(ucell_in, GridD_in); -#ifdef __OLD_GINT - GK_in->initialize_pvpR(*ucell_in, GridD_in, nspin); -#endif } - Veff_rdmft(Gint_Gamma* GG_in, - hamilt::HS_Matrix_K* hsk_in, - const std::vector>& kvec_d_in, - elecstate::Potential* pot_in, - hamilt::HContainer* hR_in, - const UnitCell* ucell_in, - const std::vector& orb_cutoff, - const Grid_Driver* GridD_in, - const int& nspin, - const Charge* charge_in, - const ModulePW::PW_Basis* rho_basis_in, - const ModuleBase::matrix* vloc_in, - const ModuleBase::ComplexMatrix* sf_in, - const std::string potential_in, - double* etxc_in = nullptr, - double* vtxc_in = nullptr) - : GG(GG_in), orb_cutoff_(orb_cutoff), pot(pot_in), hamilt::OperatorLCAO(hsk_in, kvec_d_in, hR_in), - ucell(ucell_in), gd(GridD_in), charge_(charge_in), rho_basis_(rho_basis_in), vloc_(vloc_in), sf_(sf_in), - potential_(potential_in), etxc(etxc_in), vtxc(vtxc_in) - { - this->cal_type = hamilt::calculation_type::lcao_gint; - - this->initialize_HR(ucell_in, GridD_in); -#ifdef __OLD_GINT - GG_in->initialize_pvpR(*ucell_in, GridD_in, nspin); -#endif - } - + ~Veff_rdmft(){}; /** @@ -331,11 +297,6 @@ class Veff_rdmft : public hamilt::OperatorLCAO const Grid_Driver* gd; private: - // used for k-dependent grid integration. - Gint_k* GK = nullptr; - - // used for gamma only algorithms. - Gint_Gamma* GG = nullptr; std::vector orb_cutoff_; diff --git a/source/source_lcao/module_rdmft/update_state_rdmft.cpp b/source/source_lcao/module_rdmft/update_state_rdmft.cpp index 88a6761d1b..7a43c9be91 100644 --- a/source/source_lcao/module_rdmft/update_state_rdmft.cpp +++ b/source/source_lcao/module_rdmft/update_state_rdmft.cpp @@ -8,7 +8,7 @@ #include "source_estate/module_dm/cal_dm_psi.h" #include "source_estate/module_dm/density_matrix.h" #include "source_estate/module_charge/symmetry_rho.h" -#include "source_lcao/module_gint/temp_gint/gint_interface.h" +#include "source_lcao/module_gint/gint_interface.h" namespace rdmft @@ -106,22 +106,10 @@ void RDMFT::update_charge(UnitCell& ucell) { ModuleBase::GlobalFunc::ZEROS(charge->rho[is], charge->nrxx); } -#ifdef __OLD_GINT - GG->transfer_DM2DtoGrid(DM_gamma_only.get_DMR_vector()); - Gint_inout inout(charge->rho, Gint_Tools::job_type::rho, nspin); - GG->cal_gint(&inout); -#else ModuleGint::cal_gint_rho(DM_gamma_only.get_DMR_vector(), nspin, charge->rho); -#endif if (XC_Functional::get_ked_flag()) { - // for (int is = 0; is < nspin; is++) - // { - // ModuleBase::GlobalFunc::ZEROS(charge->kin_r[is], charge->nrxx); - // } - // Gint_inout inout1(charge->kin_r, Gint_Tools::job_type::tau); - // GG->cal_gint(&inout1); this->pelec->cal_tau(wfc); } @@ -140,22 +128,10 @@ void RDMFT::update_charge(UnitCell& ucell) ModuleBase::GlobalFunc::ZEROS(charge->rho[is], charge->nrxx); } -#ifdef __OLD_GINT - GK->transfer_DM2DtoGrid(DM.get_DMR_vector()); - Gint_inout inout(charge->rho, Gint_Tools::job_type::rho, nspin); - GK->cal_gint(&inout); -#else ModuleGint::cal_gint_rho(DM.get_DMR_vector(), nspin, charge->rho); -#endif if (XC_Functional::get_ked_flag()) { - // for (int is = 0; is < nspin; is++) - // { - // ModuleBase::GlobalFunc::ZEROS(charge->kin_r[is], charge->nrxx); - // } - // Gint_inout inout1(charge->kin_r, Gint_Tools::job_type::tau); - // GK->cal_gint(&inout1); this->pelec->cal_tau(wfc); } diff --git a/source/source_lcao/pulay_fs_gint.hpp b/source/source_lcao/pulay_fs_gint.hpp index f097bf8a93..9603a28c34 100644 --- a/source/source_lcao/pulay_fs_gint.hpp +++ b/source/source_lcao/pulay_fs_gint.hpp @@ -3,7 +3,7 @@ #include "source_lcao/stress_tools.h" #include "source_hamilt/module_xc/xc_functional.h" #include "source_io/module_parameter/parameter.h" -#include "source_lcao/module_gint/temp_gint/gint_interface.h" +#include "source_lcao/module_gint/gint_interface.h" namespace PulayForceStress { template @@ -18,7 +18,6 @@ namespace PulayForceStress const bool& set_dmr_gint) { const int nspin = PARAM.inp.nspin; - std::vector vr_eff(nspin, nullptr); std::vector vofk_eff(nspin, nullptr); if (XC_Functional::get_func_type() == 3 || XC_Functional::get_func_type() == 5) diff --git a/source/source_lcao/pulay_fs_temp.hpp b/source/source_lcao/pulay_fs_temp.hpp index 8d60d47255..ba32eedb2e 100644 --- a/source/source_lcao/pulay_fs_temp.hpp +++ b/source/source_lcao/pulay_fs_temp.hpp @@ -1,4 +1,5 @@ #pragma once +#include #include "pulay_fs.h" #include "source_base/timer.h" #include "source_io/module_parameter/parameter.h" diff --git a/source/source_lcao/record_adj.cpp b/source/source_lcao/record_adj.cpp index 7e2b9c7bbe..47118496a5 100644 --- a/source/source_lcao/record_adj.cpp +++ b/source/source_lcao/record_adj.cpp @@ -283,226 +283,4 @@ void Record_adj::for_2d(const UnitCell& ucell, return; } -//-------------------------------------------- -// This will record the orbitals according to -// grid division (cut along z direction) -//-------------------------------------------- -void Record_adj::for_grid(const UnitCell& ucell, - const Grid_Driver& grid_d, - const Grid_Technique& gt, - const std::vector& orb_cutoff) -{ - ModuleBase::TITLE("Record_adj", "for_grid"); - ModuleBase::timer::tick("Record_adj", "for_grid"); - - this->na_proc = 0; - this->iat2ca = new int[ucell.nat]; - for (int iat = 0; iat < ucell.nat; ++iat) - { - { - if (gt.in_this_processor[iat]) - { - iat2ca[iat] = na_proc; - ++na_proc; - } - else - { - iat2ca[iat] = -1; - } - } - } - - // number of adjacents for each atom. - this->na_each = new int[na_proc]; - ModuleBase::GlobalFunc::ZEROS(na_each, na_proc); - this->info = new int**[na_proc]; -#ifdef _OPENMP -#pragma omp parallel - { -#endif - ModuleBase::Vector3 tau1, tau2, dtau; - ModuleBase::Vector3 tau0, dtau1, dtau2; - -#ifdef _OPENMP -#pragma omp for schedule(dynamic) -#endif - for (int iat = 0; iat < ucell.nat; ++iat) - { - const int T1 = ucell.iat2it[iat]; - Atom* atom1 = &ucell.atoms[T1]; - const int I1 = ucell.iat2ia[iat]; - { - const int ca = iat2ca[iat]; - // key in this function - if (gt.in_this_processor[iat]) - { - tau1 = atom1->tau[I1]; - // grid_d.Find_atom(tau1); - AdjacentAtomInfo adjs; - grid_d.Find_atom(ucell, tau1, T1, I1, &adjs); - for (int ad = 0; ad < adjs.adj_num + 1; ad++) - { - const int T2 = adjs.ntype[ad]; - const int I2 = adjs.natom[ad]; - const int iat2 = ucell.itia2iat(T2, I2); - if (gt.in_this_processor[iat2]) - { - // Atom* atom2 = &ucell.atoms[T2]; - tau2 = adjs.adjacent_tau[ad]; - dtau = tau2 - tau1; - double distance = dtau.norm() * ucell.lat0; - double rcut = orb_cutoff[T1] + orb_cutoff[T2]; - - bool is_adj = false; - if (distance < rcut) - { - is_adj = true; - } - /* - else if(distance >= rcut) - { - for (int ad0 = 0; ad0 < grid_d.getAdjacentNum()+1; ++ad0) - { - const int T0 = grid_d.getType(ad0); - const int I0 = grid_d.getNatom(ad0); - const int iat0 = ucell.itia2iat(T0, I0); - const int start0 = ucell.itiaiw2iwt(T0, I0, 0); - - tau0 = grid_d.getAdjacentTau(ad0); - dtau1 = tau0 - tau1; - dtau2 = tau0 - tau2; - - double distance1 = dtau1.norm() * ucell.lat0; - double distance2 = dtau2.norm() * ucell.lat0; - - double rcut1 = orb_cutoff[T1] + ucell.infoNL.Beta[T0].get_rcut_max(); - double rcut2 = orb_cutoff[T2] + ucell.infoNL.Beta[T0].get_rcut_max(); - - if( distance1 < rcut1 && distance2 < rcut2 ) - { - is_adj = true; - break; - } // dis1, dis2 - } - } - */ - - // check the distance - if (is_adj) - { - ++na_each[ca]; - } - } // end judge 2 - } // end ad - } // end judge 1 - } // end I1 - } // end T1 - -#ifdef _OPENMP -#pragma omp for schedule(dynamic) -#endif - for (int i = 0; i < na_proc; i++) - { - assert(na_each[i] > 0); - info[i] = new int*[na_each[i]]; - for (int j = 0; j < na_each[i]; j++) - { - // (Rx, Ry, Rz, T, I) - info[i][j] = new int[5]; - ModuleBase::GlobalFunc::ZEROS(info[i][j], 5); - } - } - -#ifdef _OPENMP -#pragma omp for schedule(dynamic) -#endif - for (int iat = 0; iat < ucell.nat; ++iat) - { - const int T1 = ucell.iat2it[iat]; - Atom* atom1 = &ucell.atoms[T1]; - const int I1 = ucell.iat2ia[iat]; - { - const int ca = iat2ca[iat]; - - // key of this function - if (gt.in_this_processor[iat]) - { - tau1 = atom1->tau[I1]; - // grid_d.Find_atom(tau1); - AdjacentAtomInfo adjs; - grid_d.Find_atom(ucell, tau1, T1, I1, &adjs); - - int cb = 0; - for (int ad = 0; ad < adjs.adj_num + 1; ad++) - { - const int T2 = adjs.ntype[ad]; - const int I2 = adjs.natom[ad]; - const int iat2 = ucell.itia2iat(T2, I2); - // key of this function - if (gt.in_this_processor[iat2]) - { - // Atom* atom2 = &ucell.atoms[T2]; - tau2 = adjs.adjacent_tau[ad]; - dtau = tau2 - tau1; - double distance = dtau.norm() * ucell.lat0; - double rcut = orb_cutoff[T1] + orb_cutoff[T2]; - - // check the distance - if (distance < rcut) - { - info[ca][cb][0] = adjs.box[ad].x; - info[ca][cb][1] = adjs.box[ad].y; - info[ca][cb][2] = adjs.box[ad].z; - info[ca][cb][3] = T2; - info[ca][cb][4] = I2; - ++cb; - } - /* - else if(distance >= rcut) - { - for (int ad0 = 0; ad0 < grid_d.getAdjacentNum()+1; ++ad0) - { - const int T0 = grid_d.getType(ad0); - const int I0 = grid_d.getNatom(ad0); - const int iat0 = ucell.itia2iat(T0, I0); - const int start0 = ucell.itiaiw2iwt(T0, I0, 0); - - tau0 = grid_d.getAdjacentTau(ad0); - dtau1 = tau0 - tau1; - dtau2 = tau0 - tau2; - - double distance1 = dtau1.norm() * ucell.lat0; - double distance2 = dtau2.norm() * ucell.lat0; - - double rcut1 = orb_cutoff[T1] + ucell.infoNL.Beta[T0].get_rcut_max(); - double rcut2 = orb_cutoff[T2] + ucell.infoNL.Beta[T0].get_rcut_max(); - - if( distance1 < rcut1 && distance2 < rcut2 ) - { - info[ca][cb][0] = grid_d.getBox(ad).x; - info[ca][cb][1] = grid_d.getBox(ad).y; - info[ca][cb][2] = grid_d.getBox(ad).z; - info[ca][cb][3] = T2; - info[ca][cb][4] = I2; - ++cb; - break; - } // dis1, dis2 - } - } - */ - } - } // end ad - - assert(cb == na_each[ca]); - } - } - } -#ifdef _OPENMP - } -#endif - ModuleBase::timer::tick("Record_adj", "for_grid"); - info_modified = true; - // std::cout << " after for_grid" << std::endl; - return; -} diff --git a/source/source_lcao/record_adj.h b/source/source_lcao/record_adj.h index 3d1f16a402..871403ca14 100644 --- a/source/source_lcao/record_adj.h +++ b/source/source_lcao/record_adj.h @@ -2,7 +2,8 @@ #define RECORD_ADJ_H #include "source_basis/module_ao/parallel_orbitals.h" -#include "source_lcao/module_gint/grid_technique.h" +#include "source_cell/unitcell.h" +#include "source_cell/module_neighbor/sltk_grid_driver.h" //--------------------------------------------------- // FUNCTION: record the adjacent atoms for each atom @@ -26,14 +27,6 @@ class Record_adj bool gamma_only, const std::vector& orb_cutoff); - //-------------------------------------------- - // This will record the orbitals according to - // grid division (cut along z direction) - //-------------------------------------------- - void for_grid(const UnitCell& ucell, - const Grid_Driver& grid_d, - const Grid_Technique& gt, - const std::vector& orb_cutoff); void delete_grid(); @@ -41,7 +34,7 @@ class Record_adj int* na_each=nullptr; //-------------------------------------------- - // record sparse atom index in for_grid(const Grid_Technique >); + // record sparse atom index in for_grid(); // Map iat(dense atom index) to sparse atom index // Mainly removing the index dependency for OpenMP parallel loop // diff --git a/source/source_lcao/spar_dh.cpp b/source/source_lcao/spar_dh.cpp index 21748e830e..7d5d485d56 100644 --- a/source/source_lcao/spar_dh.cpp +++ b/source/source_lcao/spar_dh.cpp @@ -2,7 +2,7 @@ #include "source_io/module_parameter/parameter.h" #include "source_lcao/LCAO_domain.h" -#include "source_lcao/module_gint/temp_gint/gint_interface.h" +#include "source_lcao/module_gint/gint_interface.h" #include void sparse_format::cal_dS(const UnitCell& ucell, @@ -58,8 +58,7 @@ void sparse_format::cal_dH(const UnitCell& ucell, const LCAO_Orbitals& orb, const int& current_spin, const double& sparse_thr, - const ModuleBase::matrix& v_eff, - Gint_k& gint_k) + const ModuleBase::matrix& v_eff) { ModuleBase::TITLE("sparse_format", "cal_dH"); @@ -109,26 +108,6 @@ void sparse_format::cal_dH(const UnitCell& ucell, if(PARAM.inp.nspin==2) { -#ifdef __OLD_GINT - gint_k.allocate_pvdpR(); - // note: some MPI process will not have grids when MPI cores are too - // many, v_eff in these processes are empty - const double* vr_eff1 - = v_eff.nc * v_eff.nr > 0 ? &(v_eff(current_spin, 0)) : nullptr; - - if (!PARAM.globalv.gamma_only_local) - { - if (PARAM.inp.vl_in_h) - { - Gint_inout inout(vr_eff1, - current_spin, - Gint_Tools::job_type::dvlocal); - gint_k.cal_gint(&inout); - } - } - gint_k.cal_dvlocal_R_sparseMatrix(current_spin, sparse_thr, HS_Arrays, &pv, ucell, grid); - gint_k.destroy_pvdpR(); -#else const double* vr_eff1 = v_eff.nc * v_eff.nr > 0 ? &(v_eff(current_spin, 0)) : nullptr; if (!PARAM.globalv.gamma_only_local) @@ -137,7 +116,6 @@ void sparse_format::cal_dH(const UnitCell& ucell, PARAM.inp.nspin, PARAM.globalv.npol, current_spin, PARAM.globalv.nlocal, sparse_thr, vr_eff1, pv, ucell, grid, HS_Arrays); } -#endif } return; } diff --git a/source/source_lcao/spar_dh.h b/source/source_lcao/spar_dh.h index a71ebe4ec2..9af4fd6009 100644 --- a/source/source_lcao/spar_dh.h +++ b/source/source_lcao/spar_dh.h @@ -19,8 +19,7 @@ void cal_dH(const UnitCell& ucell, const LCAO_Orbitals& orb, const int& current_spin, const double& sparse_thr, - const ModuleBase::matrix& v_eff, - Gint_k& gint_k); + const ModuleBase::matrix& v_eff); // calculated the derivative of the overlap matrix: void cal_dS(const UnitCell& ucell, diff --git a/source/source_lcao/spar_hsr.h b/source/source_lcao/spar_hsr.h index b3e809ceb2..df8478c4bf 100644 --- a/source/source_lcao/spar_hsr.h +++ b/source/source_lcao/spar_hsr.h @@ -2,6 +2,7 @@ #define SPARSE_FORMAT_HSR_H #include "source_lcao/hamilt_lcao.h" +#include "source_lcao/LCAO_HS_arrays.hpp" namespace sparse_format {