diff --git a/CMakeLists.txt b/CMakeLists.txt
index 00d59548c4..6e921dce23 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -253,9 +253,6 @@ if(ENABLE_LCAO)
     add_compile_definitions(__PEXSI)
     set(CMAKE_CXX_STANDARD 14)
   endif()
-  if(OLD_GINT)
-    add_compile_definitions(__OLD_GINT)
-  endif()
 else()
   set(ENABLE_MLALGO OFF)
   set(ENABLE_LIBRI OFF)
diff --git a/source/Makefile.Objects b/source/Makefile.Objects
index 8b185f2011..8353a0f754 100644
--- a/source/Makefile.Objects
+++ b/source/Makefile.Objects
@@ -62,7 +62,6 @@ VPATH=./src_global:\
 ./source_lcao/module_deltaspin:\
 ./source_lcao/module_operator_lcao:\
 ./source_lcao/module_gint:\
-./source_lcao/module_gint/temp_gint:\
 ./source_relax:\
 ./source_hamilt/module_vdw:\
 ./source_io:\
@@ -282,32 +281,7 @@ OBJS_ESOLVER_LCAO=esolver_ks_lcao.o\
       esolver_dm2rho.o\
       esolver_double_xc.o\
 
-OBJS_GINT=gint_old.o\
-      gint_gamma_env.o\
-      gint_gamma_vl.o\
-      gint_fvl_old.o\
-      gint_rho_old.o\
-      gint_tau_old.o\
-      gint_vl_old.o\
-      gint_k_env.o\
-      gint_k_sparse1.o\
-      gint_k_pvpr.o\
-      gint_k_pvdpr.o\
-      gint_tools.o\
-      grid_bigcell.o\
-      grid_meshball.o\
-      grid_meshcell.o\
-      grid_meshk.o\
-      grid_technique.o\
-      gint_force_cpu_interface.o\
-      gint_rho_cpu_interface.o\
-      gint_vl_cpu_interface.o\
-      cal_psir_ylm.o\
-      cal_dpsir_ylm.o\
-      cal_ddpsir_ylm.o\
-      mult_psi_dmr.o\
-      init_orb.o\
-      batch_biggrid.o\
+OBJS_GINT=batch_biggrid.o\
       big_grid.o\
       biggrid_info.o\
       divide_info.o\
@@ -655,7 +629,6 @@ OBJS_LCAO=evolve_elec.o\
       stress_tools.o\
       edm.o\
       pulay_fs_center2.o\
-	  grid_init.o\
       spar_dh.o\
       spar_exx.o\
       spar_hsr.o\
diff --git a/source/source_esolver/esolver_double_xc.cpp b/source/source_esolver/esolver_double_xc.cpp
index 2658444bec..0174331a04 100644
--- a/source/source_esolver/esolver_double_xc.cpp
+++ b/source/source_esolver/esolver_double_xc.cpp
@@ -51,8 +51,6 @@ void ESolver_DoubleXC<TK, TR>::before_all_runners(UnitCell& ucell, const Input_p
         this->pelec_base = new elecstate::ElecStateLCAO<TK>(&(this->chr_base), // use which parameter?
                                                        &(this->kv),
                                                        this->kv.get_nks(),
-                                                       &(this->GG),
-                                                       &(this->GK),
                                                        this->pw_rho,
                                                        this->pw_big);
     }    
@@ -145,8 +143,6 @@ void ESolver_DoubleXC<TK, TR>::before_scf(UnitCell& ucell, const int istep)
         elecstate::DensityMatrix<TK, double>* DM = dynamic_cast<elecstate::ElecStateLCAO<TK>*>(this->pelec_base)->get_DM();
 
         this->p_hamilt_base = new hamilt::HamiltLCAO<TK, TR>(
-            PARAM.globalv.gamma_only_local ? &(this->GG) : nullptr,
-            PARAM.globalv.gamma_only_local ? nullptr : &(this->GK),
             ucell,
             this->gd,
             &this->pv,
diff --git a/source/source_esolver/esolver_gets.cpp b/source/source_esolver/esolver_gets.cpp
index d503876212..4e79849367 100644
--- a/source/source_esolver/esolver_gets.cpp
+++ b/source/source_esolver/esolver_gets.cpp
@@ -53,8 +53,6 @@ void ESolver_GetS::before_all_runners(UnitCell& ucell, const Input_para& inp)
         this->pelec = new elecstate::ElecStateLCAO<std::complex<double>>(&(this->chr), // use which parameter?
                                                                          &(this->kv),
                                                                          this->kv.get_nks(),
-                                                                         nullptr, // mohan add 2024-04-01
-                                                                         nullptr, // mohan add 2024-04-01
                                                                          this->pw_rho,
                                                                          this->pw_big);
     }
diff --git a/source/source_esolver/esolver_ks_lcao.cpp b/source/source_esolver/esolver_ks_lcao.cpp
index 5b7797169f..29388b04c1 100644
--- a/source/source_esolver/esolver_ks_lcao.cpp
+++ b/source/source_esolver/esolver_ks_lcao.cpp
@@ -57,7 +57,7 @@ void ESolver_KS_LCAO<TK, TR>::before_all_runners(UnitCell& ucell, const Input_pa
     {
         // TK stands for double and std::complex<double>?
         this->pelec = new elecstate::ElecStateLCAO<TK>(&(this->chr), &(this->kv),
-          this->kv.get_nks(), &(this->GG), &(this->GK), this->pw_rho, this->pw_big);
+          this->kv.get_nks(), this->pw_rho, this->pw_big);
     }
 
     // 3) read LCAO orbitals/projectors and construct the interpolation tables.
@@ -136,7 +136,7 @@ void ESolver_KS_LCAO<TK, TR>::before_all_runners(UnitCell& ucell, const Input_pa
     // 16) init rdmft, added by jghan
     if (inp.rdmft == true)
     {
-        rdmft_solver.init(this->GG, this->GK, this->pv, ucell,
+        rdmft_solver.init(this->pv, ucell,
           this->gd, this->kv, *(this->pelec), this->orb_,
           two_center_bundle_, inp.dft_functional, inp.rdmft_power_alpha);
     }
@@ -198,8 +198,6 @@ void ESolver_KS_LCAO<TK, TR>::before_scf(UnitCell& ucell, const int istep)
         elecstate::DensityMatrix<TK, double>* DM = estate->get_DM();
 
         this->p_hamilt = new hamilt::HamiltLCAO<TK, TR>(
-            PARAM.globalv.gamma_only_local ? &(this->GG) : nullptr,
-            PARAM.globalv.gamma_only_local ? nullptr : &(this->GK),
             ucell, this->gd, &this->pv, this->pelec->pot, this->kv,
             two_center_bundle_, orb_, DM, this->deepks
 #ifdef __EXX
@@ -371,7 +369,7 @@ void ESolver_KS_LCAO<TK, TR>::after_all_runners(UnitCell& ucell)
 	ModuleIO::ctrl_runner_lcao<TK, TR>(ucell,
 		  PARAM.inp, this->kv, estate, this->pv, this->Pgrid, 
 		  this->gd, this->psi, this->chr, hamilt_lcao,
-          this->two_center_bundle_, this->GG, this->GK,
+          this->two_center_bundle_,
           this->orb_, this->pw_rho, this->pw_rhod,
           this->sf, this->locpp.vloc, this->exx_nao, this->solvent);
 
@@ -639,9 +637,9 @@ void ESolver_KS_LCAO<TK, TR>::after_scf(UnitCell& ucell, const int istep, const
     ModuleIO::ctrl_scf_lcao<TK, TR>(ucell,
             PARAM.inp, this->kv, estate, this->pv,
             this->gd, this->psi, hamilt_lcao,
-            this->two_center_bundle_, this->GK,
+            this->two_center_bundle_,
             this->orb_, this->pw_wfc, this->pw_rho,
-            this->GridT, this->pw_big, this->sf,
+            this->pw_big, this->sf,
             this->rdmft_solver, this->deepks, this->exx_nao, 
             this->conv_esolver, this->scf_nmax_flag,
             istep);
diff --git a/source/source_esolver/esolver_ks_lcao.h b/source/source_esolver/esolver_ks_lcao.h
index 422048c59c..16fb6aa8df 100644
--- a/source/source_esolver/esolver_ks_lcao.h
+++ b/source/source_esolver/esolver_ks_lcao.h
@@ -4,10 +4,8 @@
 #include "esolver_ks.h"
 #include "source_lcao/record_adj.h" // adjacent atoms
 #include "source_basis/module_nao/two_center_bundle.h" // nao basis
-#include "source_lcao/module_gint/gint_gamma.h" // gint for gamma-only k-points
-#include "source_lcao/module_gint/gint_k.h" // gint for multi k-points
-#include "source_lcao/module_gint/temp_gint/gint.h" // gint
-#include "source_lcao/module_gint/temp_gint/gint_info.h"
+#include "source_lcao/module_gint/gint.h" // gint
+#include "source_lcao/module_gint/gint_info.h"
 #include "source_lcao/setup_deepks.h" // for deepks, mohan add 20251008
 #include "source_lcao/setup_exx.h" // for exx, mohan add 20251008
 #include "source_lcao/module_rdmft/rdmft.h" // rdmft
@@ -67,15 +65,6 @@ class ESolver_KS_LCAO : public ESolver_KS<TK>
     //! NAO orbitals: 2d block-cyclic distribution info
     Parallel_Orbitals pv;
 
-    //! Grid integration: used for k-point-dependent algorithm
-    Gint_k GK;
-
-    //! Grid integration: used for gamma only algorithms.
-    Gint_Gamma GG;
-
-    //! Grid integration: used to store some basic information
-    Grid_Technique GridT;
-
     //! GintInfo: used to store some basic infomation about module_gint
     std::unique_ptr<ModuleGint::GintInfo> gint_info_;
 
@@ -107,12 +96,7 @@ class ESolver_KS_LCAO : public ESolver_KS<TK>
     const Record_adj & get_RA() const { return RA; }
     const Grid_Driver & get_gd() const { return gd; }
     const Parallel_Orbitals & get_pv() const { return pv; }
-    const Gint_k & get_GK() const { return GK; }
-    const Gint_Gamma & get_GG() const { return GG; }
-    const Grid_Technique & get_GridT() const { return GridT; }
-  #ifndef __OLD_GINT
     const std::unique_ptr<ModuleGint::GintInfo> & get_gint_info() const { return gint_info_; }
-  #endif
     const TwoCenterBundle & get_two_center_bundle() const { return two_center_bundle_; }
     const rdmft::RDMFT<TK, TR> & get_rdmft_solver() const { return rdmft_solver; }
     const LCAO_Orbitals & get_orb() const { return orb_; }
diff --git a/source/source_esolver/lcao_others.cpp b/source/source_esolver/lcao_others.cpp
index 1d4891954b..72aabae78e 100644
--- a/source/source_esolver/lcao_others.cpp
+++ b/source/source_esolver/lcao_others.cpp
@@ -175,8 +175,6 @@ void ESolver_KS_LCAO<TK, TR>::others(UnitCell& ucell, const int istep)
     {
         elecstate::DensityMatrix<TK, double>* DM = dynamic_cast<elecstate::ElecStateLCAO<TK>*>(this->pelec)->get_DM();
         this->p_hamilt = new hamilt::HamiltLCAO<TK, TR>(
-            PARAM.globalv.gamma_only_local ? &(this->GG) : nullptr,
-            PARAM.globalv.gamma_only_local ? nullptr : &(this->GK),
             ucell,
             this->gd,
             &this->pv,
@@ -235,8 +233,7 @@ void ESolver_KS_LCAO<TK, TR>::others(UnitCell& ucell, const int istep)
         Get_pchg_lcao get_pchg(this->psi, &(this->pv));
         if (PARAM.globalv.gamma_only_local)
         {
-            get_pchg.begin(this->GG,
-                           this->chr.rho,
+            get_pchg.begin(this->chr.rho,
                            this->pelec->wg,
                            this->pelec->eferm.get_all_ef(),
                            this->pw_rhod->nrxx,
@@ -253,8 +250,7 @@ void ESolver_KS_LCAO<TK, TR>::others(UnitCell& ucell, const int istep)
         }
         else
         {
-            get_pchg.begin(this->GK,
-                           this->chr.rho,
+            get_pchg.begin(this->chr.rho,
                            this->chr.rhog,
                            this->pelec->wg,
                            this->pelec->eferm.get_all_ef(),
@@ -286,7 +282,6 @@ void ESolver_KS_LCAO<TK, TR>::others(UnitCell& ucell, const int istep)
                          this->pw_wfc,
                          this->Pgrid,
                          this->pv,
-                         this->GG,
                          PARAM.inp.out_wfc_pw,
                          this->kv,
                          PARAM.inp.nelec,
@@ -305,7 +300,6 @@ void ESolver_KS_LCAO<TK, TR>::others(UnitCell& ucell, const int istep)
                          this->pw_wfc,
                          this->Pgrid,
                          this->pv,
-                         this->GK,
                          PARAM.inp.out_wfc_pw,
                          this->kv,
                          PARAM.inp.nelec,
diff --git a/source/source_estate/elecstate_lcao.cpp b/source/source_estate/elecstate_lcao.cpp
index 7eaef537b0..fe3bb11758 100644
--- a/source/source_estate/elecstate_lcao.cpp
+++ b/source/source_estate/elecstate_lcao.cpp
@@ -5,11 +5,10 @@
 #include "source_estate/module_dm/cal_dm_psi.h"
 #include "source_hamilt/module_xc/xc_functional.h"
 #include "source_lcao/module_deltaspin/spin_constrain.h"
-#include "source_lcao/module_gint/grid_technique.h"
 #include "source_pw/module_pwdft/global.h"
 #include "source_io/module_parameter/parameter.h"
 
-#include "source_lcao/module_gint/temp_gint/gint_interface.h"
+#include "source_lcao/module_gint/gint_interface.h"
 
 #include <vector>
 
@@ -34,13 +33,7 @@ void ElecStateLCAO<std::complex<double>>::psiToRho(const psi::Psi<std::complex<d
     //------------------------------------------------------------
 
     ModuleBase::GlobalFunc::NOTE("Calculate the charge on real space grid!");
-#ifdef __OLD_GINT
-    this->gint_k->transfer_DM2DtoGrid(this->DM->get_DMR_vector()); // transfer dm2d to DM_grid in gint
-    Gint_inout inout(this->charge->rho, Gint_Tools::job_type::rho, PARAM.inp.nspin);
-    this->gint_k->cal_gint(&inout);
-#else
     ModuleGint::cal_gint_rho(this->DM->get_DMR_vector(), PARAM.inp.nspin, this->charge->rho);
-#endif
 
     if (XC_Functional::get_ked_flag())
     {
@@ -71,13 +64,7 @@ void ElecStateLCAO<double>::psiToRho(const psi::Psi<double>& psi)
     //------------------------------------------------------------
     ModuleBase::GlobalFunc::NOTE("Calculate the charge on real space grid!");
 
-#ifdef __OLD_GINT 
-    this->gint_gamma->transfer_DM2DtoGrid(this->DM->get_DMR_vector()); // transfer dm2d to DM_grid in gint
-    Gint_inout inout(this->charge->rho, Gint_Tools::job_type::rho, PARAM.inp.nspin);
-    this->gint_gamma->cal_gint(&inout);
-#else
     ModuleGint::cal_gint_rho(this->DM->get_DMR_vector(), PARAM.inp.nspin, this->charge->rho);
-#endif
 
     if (XC_Functional::get_ked_flag())
     {
@@ -139,25 +126,14 @@ void ElecStateLCAO<double>::dmToRho(std::vector<double*> pexsi_DM, std::vector<d
     }
 
     ModuleBase::GlobalFunc::NOTE("Calculate the charge on real space grid!");
-#ifdef __OLD_GINT
-    this->gint_gamma->transfer_DM2DtoGrid(this->DM->get_DMR_vector()); // transfer dm2d to DM_grid in gint
-    Gint_inout inout(this->charge->rho, Gint_Tools::job_type::rho, PARAM.inp.nspin);
-    this->gint_gamma->cal_gint(&inout);
-#else
     ModuleGint::cal_gint_rho(this->DM->get_DMR_vector(), PARAM.inp.nspin, this->charge->rho);
-#endif
     if (XC_Functional::get_ked_flag())
     {
         for (int is = 0; is < PARAM.inp.nspin; is++)
         {
-            ModuleBase::GlobalFunc::ZEROS(this->charge->kin_r[0], this->charge->nrxx);
+            ModuleBase::GlobalFunc::ZEROS(this->charge->kin_r[is], this->charge->nrxx); // clear every spin channel, not only [0], as cal_tau() does
         }
-#ifdef __OLD_GINT
-        Gint_inout inout1(this->charge->kin_r, Gint_Tools::job_type::tau);
-        this->gint_gamma->cal_gint(&inout1);
-#else
         ModuleGint::cal_gint_tau(this->DM->get_DMR_vector(), PARAM.inp.nspin, this->charge->kin_r);
-#endif
     }
 
     this->charge->renormalize_rho();
diff --git a/source/source_estate/elecstate_lcao.h b/source/source_estate/elecstate_lcao.h
index b56ec31f18..7c19f2c39b 100644
--- a/source/source_estate/elecstate_lcao.h
+++ b/source/source_estate/elecstate_lcao.h
@@ -3,8 +3,6 @@
 
 #include "elecstate.h"
 #include "source_estate/module_dm/density_matrix.h"
-#include "source_lcao/module_gint/gint_gamma.h"
-#include "source_lcao/module_gint/gint_k.h"
 
 #include <vector>
 
@@ -20,14 +18,10 @@ class ElecStateLCAO : public ElecState
     ElecStateLCAO(Charge* chg_in,
                   const K_Vectors* klist_in,
                   int nks_in,
-                  Gint_Gamma* gint_gamma_in, // mohan add 2024-04-01
-                  Gint_k* gint_k_in,         // mohan add 2024-04-01
                   ModulePW::PW_Basis* rhopw_in,
                   ModulePW::PW_Basis_Big* bigpw_in)
     {
         init_ks(chg_in, klist_in, nks_in, rhopw_in, bigpw_in);
-        this->gint_gamma = gint_gamma_in; // mohan add 2024-04-01
-        this->gint_k = gint_k_in;         // mohan add 2024-04-01
         this->classname = "ElecStateLCAO";
     }
 
@@ -85,8 +79,6 @@ class ElecStateLCAO : public ElecState
     // calcualte rho for each k
     // void rhoBandK(const psi::Psi<std::complex<double>>& psi);
 
-    Gint_Gamma* gint_gamma = nullptr; // mohan add 2024-04-01
-    Gint_k* gint_k = nullptr;         // mohan add 2024-04-01
 };
 
 template <typename TK>
diff --git a/source/source_estate/elecstate_lcao_cal_tau.cpp b/source/source_estate/elecstate_lcao_cal_tau.cpp
index db85c314fd..a2a4210002 100644
--- a/source/source_estate/elecstate_lcao_cal_tau.cpp
+++ b/source/source_estate/elecstate_lcao_cal_tau.cpp
@@ -1,5 +1,5 @@
 #include "elecstate_lcao.h"
-#include "source_lcao/module_gint/temp_gint/gint_interface.h"
+#include "source_lcao/module_gint/gint_interface.h"
 
 #include "source_base/timer.h"
 
@@ -16,12 +16,7 @@ void ElecStateLCAO<std::complex<double>>::cal_tau(const psi::Psi<std::complex<do
     {
         ModuleBase::GlobalFunc::ZEROS(this->charge->kin_r[is], this->charge->nrxx);
     }
-#ifdef __OLD_GINT
-    Gint_inout inout1(this->charge->kin_r, Gint_Tools::job_type::tau, PARAM.inp.nspin);
-    this->gint_k->cal_gint(&inout1);
-#else
     ModuleGint::cal_gint_tau(this->DM->get_DMR_vector(), PARAM.inp.nspin, this->charge->kin_r);
-#endif
     ModuleBase::timer::tick("ElecStateLCAO", "cal_tau");
     return;
 }
@@ -36,12 +31,7 @@ void ElecStateLCAO<double>::cal_tau(const psi::Psi<double>& psi)
     {
         ModuleBase::GlobalFunc::ZEROS(this->charge->kin_r[is], this->charge->nrxx);
     }
-#ifdef __OLD_GINT
-    Gint_inout inout1(this->charge->kin_r, Gint_Tools::job_type::tau, PARAM.inp.nspin);
-    this->gint_gamma->cal_gint(&inout1);
-#else
     ModuleGint::cal_gint_tau(this->DM->get_DMR_vector(), PARAM.inp.nspin, this->charge->kin_r);
-#endif
 
     ModuleBase::timer::tick("ElecStateLCAO", "cal_tau");
     return;
diff --git a/source/source_io/berryphase.cpp b/source/source_io/berryphase.cpp
index 8d31edec91..c98913b35d 100644
--- a/source/source_io/berryphase.cpp
+++ b/source/source_io/berryphase.cpp
@@ -44,11 +44,10 @@ void berryphase::get_occupation_bands()
 void berryphase::lcao_init(const UnitCell& ucell,
                            const Grid_Driver& gd,
                            const K_Vectors& kv,
-                           const Grid_Technique& grid_tech,
                            const LCAO_Orbitals& orb)
 {
     ModuleBase::TITLE("berryphase", "lcao_init");
-    lcao_method.init(ucell,grid_tech, kv.get_nkstot(), orb);
+    lcao_method.init(ucell, kv.get_nkstot(), orb);
     lcao_method.cal_R_number(ucell, gd);
     lcao_method.cal_orb_overlap(ucell);
     return;
diff --git a/source/source_io/berryphase.h b/source/source_io/berryphase.h
index 1685ccdefd..a040fe758c 100644
--- a/source/source_io/berryphase.h
+++ b/source/source_io/berryphase.h
@@ -39,7 +39,6 @@ class berryphase
     void lcao_init(const UnitCell& ucell,
                    const Grid_Driver& gd,
                    const K_Vectors& kv,
-                   const Grid_Technique& grid_tech,
                    const LCAO_Orbitals& orb);
 #endif
     void set_kpoints(const K_Vectors& kv, const int direction);
diff --git a/source/source_io/cal_ldos.cpp b/source/source_io/cal_ldos.cpp
index ec2f00bfc7..f12fc9e391 100644
--- a/source/source_io/cal_ldos.cpp
+++ b/source/source_io/cal_ldos.cpp
@@ -3,7 +3,7 @@
 #include "cal_dos.h"
 #include "cube_io.h"
 #include "source_estate/module_dm/cal_dm_psi.h"
-#include "source_lcao/module_gint/temp_gint/gint_interface.h"
+#include "source_lcao/module_gint/gint_interface.h"
 
 #include <type_traits>
 
@@ -60,12 +60,7 @@ void Cal_ldos<T>::cal_ldos_lcao(const elecstate::ElecStateLCAO<T>* pelec,
         }
 
     // calculate ldos
-#ifdef __OLD_GINT
-        ModuleBase::WARNING_QUIT("Cal_ldos::dm2ldos",
-                                 "do not support old grid integral, please recompile with __NEW_GINT");
-#else
         ModuleGint::cal_gint_rho(dm_ldos.get_DMR_vector(), PARAM.inp.nspin, ldos);
-#endif
 
         // I'm not sure whether ldos should be output for each spin or not
         // ldos[0] += ldos[1] for nspin_dm == 2
diff --git a/source/source_io/ctrl_runner_lcao.cpp b/source/source_io/ctrl_runner_lcao.cpp
index c6e2c0e447..9cf550830c 100644
--- a/source/source_io/ctrl_runner_lcao.cpp
+++ b/source/source_io/ctrl_runner_lcao.cpp
@@ -24,8 +24,6 @@ void ctrl_runner_lcao(UnitCell& ucell,      // unitcell
         Charge &chr,                  // charge density
 		hamilt::HamiltLCAO<TK, TR>* p_hamilt, // hamiltonian
 		TwoCenterBundle &two_center_bundle,   // use two-center integration
-        Gint_Gamma &gg,                     // gint for Gamma-only
-		Gint_k &gk,                         // gint for multi k-points
 		LCAO_Orbitals &orb,                 // LCAO orbitals
 		ModulePW::PW_Basis* pw_rho,   // charge density
 		ModulePW::PW_Basis* pw_rhod,  // dense charge density 
@@ -64,8 +62,6 @@ void ctrl_runner_lcao(UnitCell& ucell,      // unitcell
                                     *pw_rhod,
                                     vloc,
                                     chr,
-                                    gg,
-                                    gk,
                                     kv,
                                     orb.cutoffs(),
                                     pelec->wg,
@@ -89,8 +85,6 @@ void ctrl_runner_lcao(UnitCell& ucell,      // unitcell
                                       *pw_rhod,
                                       vloc,
                                       chr,
-                                      gg,
-                                      gk,
                                       kv,
                                       orb.cutoffs(),
                                       gd
@@ -118,8 +112,6 @@ void ctrl_runner_lcao(UnitCell& ucell,      // unitcell
                                             *pw_rhod,
                                             vloc,
                                             chr,
-                                            gg,
-                                            gk,
                                             kv,
                                             pelec->wg,
                                             gd,
@@ -150,8 +142,6 @@ template void ModuleIO::ctrl_runner_lcao<double, double>(UnitCell& ucell,      /
         Charge &chr,                  // charge density
 		hamilt::HamiltLCAO<double, double>* p_hamilt, // hamiltonian
 		TwoCenterBundle &two_center_bundle,   // use two-center integration
-        Gint_Gamma &gg,                     // gint for Gamma-only
-		Gint_k &gk,                         // gint for multi k-points
 		LCAO_Orbitals &orb,                 // LCAO orbitals
 		ModulePW::PW_Basis* pw_rho,   // charge density
 		ModulePW::PW_Basis* pw_rhod,  // dense charge density 
@@ -172,8 +162,6 @@ template void ctrl_runner_lcao<std::complex<double>, double>(UnitCell& ucell,
         Charge &chr,                  // charge density
 		hamilt::HamiltLCAO<std::complex<double>, double>* p_hamilt, // hamiltonian
 		TwoCenterBundle &two_center_bundle,   // use two-center integration
-        Gint_Gamma &gg,                     // gint for Gamma-only
-		Gint_k &gk,                         // gint for multi k-points
 		LCAO_Orbitals &orb,                 // LCAO orbitals
 		ModulePW::PW_Basis* pw_rho,   // charge density
 		ModulePW::PW_Basis* pw_rhod,  // dense charge density 
@@ -194,8 +182,6 @@ template void ctrl_runner_lcao<std::complex<double>, std::complex<double>>(UnitC
         Charge &chr,                  // charge density
 		hamilt::HamiltLCAO<std::complex<double>, std::complex<double>>* p_hamilt, // hamiltonian
 		TwoCenterBundle &two_center_bundle,   // use two-center integration
-        Gint_Gamma &gg,                     // gint for Gamma-only
-		Gint_k &gk,                         // gint for multi k-points
 		LCAO_Orbitals &orb,                 // LCAO orbitals
 		ModulePW::PW_Basis* pw_rho,   // charge density
 		ModulePW::PW_Basis* pw_rhod,  // dense charge density 
diff --git a/source/source_io/ctrl_runner_lcao.h b/source/source_io/ctrl_runner_lcao.h
index 34eae3d26a..2b57c1800f 100644
--- a/source/source_io/ctrl_runner_lcao.h
+++ b/source/source_io/ctrl_runner_lcao.h
@@ -7,7 +7,6 @@
 #include "source_psi/psi.h" // use Psi<TK>
 #include "source_lcao/hamilt_lcao.h" // use hamilt::HamiltLCAO<TK, TR>
 #include "source_basis/module_nao/two_center_bundle.h" // use TwoCenterBundle
-#include "source_lcao/module_gint/gint_k.h" // use Gint_k
 #include "source_lcao/setup_exx.h" // for exx, mohan add 20251018
 
 namespace ModuleIO
@@ -25,8 +24,6 @@ void ctrl_runner_lcao(UnitCell& ucell,      // unitcell
         Charge &chr,                  // charge density
 		hamilt::HamiltLCAO<TK, TR>* p_hamilt, // hamiltonian
 		TwoCenterBundle &two_center_bundle,   // use two-center integration
-        Gint_Gamma &gg,                     // gint for Gamma-only
-		Gint_k &gk,                         // gint for multi k-points
 		LCAO_Orbitals &orb,                 // LCAO orbitals
 		ModulePW::PW_Basis* pw_rho,   // charge density
 		ModulePW::PW_Basis* pw_rhod,  // dense charge density 
diff --git a/source/source_io/ctrl_scf_lcao.cpp b/source/source_io/ctrl_scf_lcao.cpp
index 7ab02002c2..9fd14afbd6 100644
--- a/source/source_io/ctrl_scf_lcao.cpp
+++ b/source/source_io/ctrl_scf_lcao.cpp
@@ -41,11 +41,9 @@ void ModuleIO::ctrl_scf_lcao(UnitCell& ucell,
 		psi::Psi<TK>* psi,
 		hamilt::HamiltLCAO<TK, TR>* p_hamilt,
 		TwoCenterBundle &two_center_bundle,
-		Gint_k &gk,
 		LCAO_Orbitals &orb,
 		const ModulePW::PW_Basis_K* pw_wfc, // for berryphase
 		const ModulePW::PW_Basis* pw_rho, // for berryphase
-		Grid_Technique &gt, // for berryphase
 		const ModulePW::PW_Basis_Big* pw_big, // for Wannier90
 		const Structure_Factor& sf, // for Wannier90
 		rdmft::RDMFT<TK, TR> &rdmft_solver, // for RDMFT
@@ -219,7 +217,6 @@ void ModuleIO::ctrl_scf_lcao(UnitCell& ucell,
 			istep,
 			pelec->pot->get_effective_v(),
 			pv,
-			gk,
 			two_center_bundle,
 			orb,
 			ucell,
@@ -329,7 +326,7 @@ void ModuleIO::ctrl_scf_lcao(UnitCell& ucell,
     {
         std::cout << FmtCore::format("\n * * * * * *\n << Start %s.\n", "Berry phase calculation");
         berryphase bp(&pv);
-        bp.lcao_init(ucell, gd, kv, gt, orb);
+        bp.lcao_init(ucell, gd, kv, orb);
         // additional step before calling macroscopic_polarization
         bp.Macroscopic_polarization(ucell, pw_wfc->npwk_max, psi, pw_rho, pw_wfc, kv);
         std::cout << FmtCore::format(" >> Finish %s.\n * * * * * *\n", "Berry phase calculation");
@@ -471,11 +468,9 @@ template void ModuleIO::ctrl_scf_lcao<double, double>(
 		psi::Psi<double>* psi,
 		hamilt::HamiltLCAO<double, double>* p_hamilt,
 		TwoCenterBundle &two_center_bundle,
-		Gint_k &gk,
 		LCAO_Orbitals &orb,
 		const ModulePW::PW_Basis_K* pw_wfc, // for berryphase
 		const ModulePW::PW_Basis* pw_rho, // for berryphase
-		Grid_Technique &gt, // for berryphase
 		const ModulePW::PW_Basis_Big* pw_big, // for Wannier90
 		const Structure_Factor& sf, // for Wannier90
 		rdmft::RDMFT<double, double> &rdmft_solver, // for RDMFT
@@ -496,11 +491,9 @@ template void ModuleIO::ctrl_scf_lcao<std::complex<double>, double>(
 		psi::Psi<std::complex<double>>* psi,
 		hamilt::HamiltLCAO<std::complex<double>, double>* p_hamilt,
 		TwoCenterBundle &two_center_bundle,
-		Gint_k &gk,
 		LCAO_Orbitals &orb,
 		const ModulePW::PW_Basis_K* pw_wfc, // for berryphase
 		const ModulePW::PW_Basis* pw_rho, // for berryphase
-		Grid_Technique &gt, // for berryphase
 		const ModulePW::PW_Basis_Big* pw_big, // for Wannier90
 		const Structure_Factor& sf, // for Wannier90
 		rdmft::RDMFT<std::complex<double>, double> &rdmft_solver, // for RDMFT
@@ -520,11 +513,9 @@ template void ModuleIO::ctrl_scf_lcao<std::complex<double>, std::complex<double>
 		psi::Psi<std::complex<double>>* psi,
 		hamilt::HamiltLCAO<std::complex<double>, std::complex<double>>* p_hamilt,
 		TwoCenterBundle &two_center_bundle,
-		Gint_k &gk,
 		LCAO_Orbitals &orb,
 		const ModulePW::PW_Basis_K* pw_wfc, // for berryphase
 		const ModulePW::PW_Basis* pw_rho, // for berryphase
-		Grid_Technique &gt, // for berryphase
 		const ModulePW::PW_Basis_Big* pw_big, // for Wannier90
 		const Structure_Factor& sf, // for Wannier90
 		rdmft::RDMFT<std::complex<double>, std::complex<double>> &rdmft_solver, // for RDMFT
diff --git a/source/source_io/ctrl_scf_lcao.h b/source/source_io/ctrl_scf_lcao.h
index ee1dcfdada..98ee5e18a5 100644
--- a/source/source_io/ctrl_scf_lcao.h
+++ b/source/source_io/ctrl_scf_lcao.h
@@ -9,7 +9,6 @@
 #include "source_psi/psi.h" // use Psi<TK>
 #include "source_lcao/hamilt_lcao.h" // use hamilt::HamiltLCAO<TK, TR>
 #include "source_basis/module_nao/two_center_bundle.h" // use TwoCenterBundle
-#include "source_lcao/module_gint/gint_k.h" // use Gint_k
 #include "source_basis/module_pw/pw_basis_k.h" // use ModulePW::PW_Basis_K and ModulePW::PW_Basis
 #include "source_pw/module_pwdft/structure_factor.h" // use Structure_Factor 
 #include "source_lcao/module_rdmft/rdmft.h" // use RDMFT codes
@@ -30,11 +29,9 @@ namespace ModuleIO
 				psi::Psi<TK>* psi,
 				hamilt::HamiltLCAO<TK, TR>* p_hamilt,
 				TwoCenterBundle &two_center_bundle,
-				Gint_k &gk,
 				LCAO_Orbitals &orb,
 				const ModulePW::PW_Basis_K* pw_wfc, // for berryphase
 				const ModulePW::PW_Basis* pw_rho, // for berryphase
-				Grid_Technique &gt, // for berryphase
 				const ModulePW::PW_Basis_Big* pw_big, // for Wannier90
 				const Structure_Factor& sf, // for Wannier90
 				rdmft::RDMFT<TK, TR> &rdmft_solver, // for RDMFT
diff --git a/source/source_io/get_pchg_lcao.cpp b/source/source_io/get_pchg_lcao.cpp
index e293a51312..346b56f12a 100644
--- a/source/source_io/get_pchg_lcao.cpp
+++ b/source/source_io/get_pchg_lcao.cpp
@@ -3,7 +3,7 @@
 #include "source_io/cube_io.h"
 #include "source_estate/module_charge/symmetry_rho.h"
 #include "source_estate/module_dm/cal_dm_psi.h"
-#include "source_lcao/module_gint/temp_gint/gint_interface.h"
+#include "source_lcao/module_gint/gint_interface.h"
 
 Get_pchg_lcao::Get_pchg_lcao(psi::Psi<double>* psi_gamma_in, const Parallel_Orbitals* ParaV_in)
     : psi_gamma(psi_gamma_in), ParaV(ParaV_in)
@@ -20,8 +20,7 @@ Get_pchg_lcao::~Get_pchg_lcao()
 }
 
 // For gamma_only
-void Get_pchg_lcao::begin(Gint_Gamma& gg,
-                          double** rho,
+void Get_pchg_lcao::begin(double** rho,
                           const ModuleBase::matrix& wg,
                           const std::vector<double>& ef_all_spin,
                           const int rhopw_nrxx,
@@ -70,14 +69,7 @@ void Get_pchg_lcao::begin(Gint_Gamma& gg,
 
             DM.init_DMR(GridD_in, ucell_in);
             DM.cal_DMR();
-#ifdef __OLD_GINT
-            gg.initialize_pvpR(*ucell_in, GridD_in, nspin);
-            gg.transfer_DM2DtoGrid(DM.get_DMR_vector());
-            Gint_inout inout(rho, Gint_Tools::job_type::rho, nspin);
-            gg.cal_gint(&inout);
-#else
             ModuleGint::cal_gint_rho(DM.get_DMR_vector(), nspin, rho);
-#endif
 
             // A solution to replace the original implementation of the following code:
             // pelec->charge->save_rho_before_sum_band();
@@ -109,8 +101,7 @@ void Get_pchg_lcao::begin(Gint_Gamma& gg,
 }
 
 // For multi-k
-void Get_pchg_lcao::begin(Gint_k& gk,
-                          double** rho,
+void Get_pchg_lcao::begin(double** rho,
                           std::complex<double>** rhog,
                           const ModuleBase::matrix& wg,
                           const std::vector<double>& ef_all_spin,
@@ -169,14 +160,7 @@ void Get_pchg_lcao::begin(Gint_k& gk,
 
                     DM.init_DMR(GridD_in, ucell_in);
                     DM.cal_DMR(ik);
-#ifdef __OLD_GINT
-                    gk.initialize_pvpR(*ucell_in, GridD_in, nspin);
-                    gk.transfer_DM2DtoGrid(DM.get_DMR_vector());
-                    Gint_inout inout(rho, Gint_Tools::job_type::rho, nspin);
-                    gk.cal_gint(&inout);
-#else
                     ModuleGint::cal_gint_rho(DM.get_DMR_vector(), nspin, rho);
-#endif
                 
 
                     // Using std::vector to replace the original double** rho_save
@@ -216,14 +200,7 @@ void Get_pchg_lcao::begin(Gint_k& gk,
 
                 DM.init_DMR(GridD_in, ucell_in);
                 DM.cal_DMR();
-#ifdef __OLD_GINT
-                gk.initialize_pvpR(*ucell_in, GridD_in, nspin);
-                gk.transfer_DM2DtoGrid(DM.get_DMR_vector());
-                Gint_inout inout(rho, Gint_Tools::job_type::rho, nspin);
-                gk.cal_gint(&inout);
-#else
                 ModuleGint::cal_gint_rho(DM.get_DMR_vector(), nspin, rho);
-#endif
                 // Using std::vector to replace the original double** rho_save
                 std::vector<std::vector<double>> rho_save(nspin, std::vector<double>(rhopw_nrxx));
 
diff --git a/source/source_io/get_pchg_lcao.h b/source/source_io/get_pchg_lcao.h
index 1c34219ade..130637c775 100644
--- a/source/source_io/get_pchg_lcao.h
+++ b/source/source_io/get_pchg_lcao.h
@@ -1,8 +1,6 @@
 #ifndef GET_PCHG_LCAO_H
 #define GET_PCHG_LCAO_H
 
-#include "source_lcao/module_gint/gint_gamma.h"
-#include "source_lcao/module_gint/gint_k.h"
 #include "source_cell/klist.h"
 #include "source_estate/module_dm/density_matrix.h"
 
@@ -22,8 +20,7 @@ class Get_pchg_lcao
     ~Get_pchg_lcao();
 
     // For gamma_only
-    void begin(Gint_Gamma& gg,
-               double** rho,
+    void begin(double** rho,
                const ModuleBase::matrix& wg,
                const std::vector<double>& ef_all_spin,
                const int rhopw_nrxx,
@@ -39,8 +36,7 @@ class Get_pchg_lcao
                std::ofstream& ofs_running);
 
     // For multi-k
-    void begin(Gint_k& gk,
-               double** rho,
+    void begin(double** rho,
                std::complex<double>** rhog,
                const ModuleBase::matrix& wg,
                const std::vector<double>& ef_all_spin,
diff --git a/source/source_io/get_wf_lcao.cpp b/source/source_io/get_wf_lcao.cpp
index 03e40ae21a..7d6cd0d15c 100644
--- a/source/source_io/get_wf_lcao.cpp
+++ b/source/source_io/get_wf_lcao.cpp
@@ -4,10 +4,8 @@
 #include "source_io/write_wfc_pw.h"
 #include "source_base/memory.h"
 
-#ifndef __OLD_GINT
-#include "source_lcao/module_gint/temp_gint/gint_env_gamma.h"
-#include "source_lcao/module_gint/temp_gint/gint_env_k.h"
-#endif
+#include "source_lcao/module_gint/gint_env_gamma.h"
+#include "source_lcao/module_gint/gint_env_k.h"
 
 Get_wf_lcao::Get_wf_lcao(const elecstate::ElecState* pes)
 {
@@ -24,7 +22,6 @@ void Get_wf_lcao::begin(const UnitCell& ucell,
                         const ModulePW::PW_Basis_K* pw_wfc,
                         const Parallel_Grid& pgrid,
                         const Parallel_Orbitals& para_orb,
-                        Gint_Gamma& gg,
                         const int& out_wfc_pw,
                         const K_Vectors& kv,
                         const double nelec,
@@ -45,31 +42,12 @@ void Get_wf_lcao::begin(const UnitCell& ucell,
 
     prepare_get_wf(ofs_running);
 
-#ifdef __OLD_GINT
-    // allocate grid wave functions for gamma_only
-    std::vector<double**> wfc_gamma_grid(nspin);
-    for (int is = 0; is < nspin; ++is)
-    {
-        wfc_gamma_grid[is] = new double*[nbands];
-        for (int ib = 0; ib < nbands; ++ib)
-        {
-            wfc_gamma_grid[is][ib] = new double[gg.gridt->lgd];
-        }
-    }
-#endif
-
     // for pw_wfc in G space
     psi::Psi<std::complex<double>> psi_g;
 
     // if (out_wfc_pw || out_wfc_r)
     psi_g.resize(nspin, nbands, kv.ngk[0]);
 
-#ifdef __OLD_GINT
-    const double mem_size = sizeof(double) * double(gg.gridt->lgd) * double(nbands) * double(nspin) / 1024.0 / 1024.0;
-    ModuleBase::Memory::record("Get_wf_lcao::begin", mem_size);
-    ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running, "On-the-fly memory consumption (MB)", mem_size);
-#endif
-
     // Set this->bands_picked_
     this->select_bands(out_wfc_norm, nbands, fermi_band);
 
@@ -77,35 +55,12 @@ void Get_wf_lcao::begin(const UnitCell& ucell,
     for (int is = 0; is < nspin; ++is)
     {
         psid->fix_k(is);
-#ifdef __OLD_GINT
-    #ifdef __MPI
-        wfc_2d_to_grid(psid->get_pointer(), para_orb, wfc_gamma_grid[is], gg.gridt->trace_lo);
-    #else
-        // if not MPI enabled, it is the case psid holds a global matrix. 
-        // use fix_k to switch between different spin channels (actually kpoints, 
-        // because now the same kpoint in different spin channels are treated
-        // as distinct kpoints)
-        for (int i = 0; i < nbands; ++i)
-        {
-            for (int j = 0; j < nlocal; ++j)
-            {
-                wfc_gamma_grid[is][i][j] = psid[0](i, j);
-            }
-        }
-    #endif
-#else
         ModuleGint::Gint_env_gamma gint_env(psid->get_pointer(), &para_orb, nbands, nlocal, pes_->charge->rho[is]);
-#endif
         for (int ib = 0; ib < nbands; ++ib)
         {
             if (bands_picked_[ib])
             {
-            #ifdef __OLD_GINT
-                ModuleBase::GlobalFunc::ZEROS(pes_->charge->rho[is], pw_wfc->nrxx);
-                gg.cal_env(wfc_gamma_grid[is][ib], pes_->charge->rho[is], ucell);
-            #else
                 gint_env.cal_env_band(ib);
-            #endif
                 pes_->charge->save_rho_before_sum_band();
 
                 // pint out information
@@ -140,34 +95,12 @@ void Get_wf_lcao::begin(const UnitCell& ucell,
     for (int is = 0; is < nspin; ++is)
     {
         psid->fix_k(is);
-#ifdef __OLD_GINT
-    #ifdef __MPI
-        wfc_2d_to_grid(psid->get_pointer(), para_orb, wfc_gamma_grid[is], gg.gridt->trace_lo);
-    #else
-        // if not MPI enabled, it is the case psid holds a global matrix. use fix_k to switch between
-        // different spin channels (actually kpoints, because now the same kpoint in different spin channels
-        // are treated as distinct kpoints)
-        for (int i = 0; i < nbands; ++i)
-        {
-            for (int j = 0; j < nlocal; ++j)
-            {
-                wfc_gamma_grid[is][i][j] = psid[0](i, j);
-            }
-        }
-    #endif
-#else
         ModuleGint::Gint_env_gamma gint_env(psid->get_pointer(), &para_orb, nbands, nlocal, pes_->charge->rho[is]);
-#endif
         for (int ib = 0; ib < nbands; ++ib)
         {
             if (bands_picked_[ib])
             {
-#ifdef __OLD_GINT
-                ModuleBase::GlobalFunc::ZEROS(pes_->charge->rho[is], pw_wfc->nrxx);
-                gg.cal_env(wfc_gamma_grid[is][ib], pes_->charge->rho[is], ucell);
-#else
                 gint_env.cal_env_band(ib);
-#endif
                 pes_->charge->save_rho_before_sum_band();
 
                 const double ef_tmp = this->pes_->eferm.get_efval(is);
@@ -222,16 +155,6 @@ void Get_wf_lcao::begin(const UnitCell& ucell,
                            pw_wfc,
                            ofs_running);
 
-#ifdef __OLD_GINT
-    for (int is = 0; is < nspin; ++is)
-    {
-        for (int ib = 0; ib < nbands; ++ib)
-        {
-            delete[] wfc_gamma_grid[is][ib];
-        }
-        delete[] wfc_gamma_grid[is];
-    }
-#endif
     return;
 }
 
@@ -241,7 +164,6 @@ void Get_wf_lcao::begin(const UnitCell& ucell,
                         const ModulePW::PW_Basis_K* pw_wfc,
                         const Parallel_Grid& pgrid,
                         const Parallel_Orbitals& para_orb,
-                        Gint_k& gk,
                         const int& out_wfc_pw,
                         const K_Vectors& kv,
                         const double nelec,
@@ -262,21 +184,6 @@ void Get_wf_lcao::begin(const UnitCell& ucell,
     // allocate grid wave functions for multi-k
     const int nks = kv.get_nks();
     std::vector<std::complex<double>**> wfc_k_grid(nks);
-#ifdef __OLD_GINT
-    for (int ik = 0; ik < nks; ++ik)
-    {
-        wfc_k_grid[ik] = new std::complex<double>*[nbands];
-        for (int ib = 0; ib < nbands; ++ib)
-        {
-            wfc_k_grid[ik][ib] = new std::complex<double>[gk.gridt->lgd];
-        }
-    }
-
-    const double mem_size
-        = sizeof(std::complex<double>) * double(gk.gridt->lgd) * double(nbands) * double(nks) / 1024.0 / 1024.0;
-    ModuleBase::Memory::record("Get_wf_lcao::begin", mem_size);
-    ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running, "On-the-fly memory consumption (MB)", mem_size);
-#endif
 
     // for pw_wfc in G space
     psi::Psi<std::complex<double>> psi_g;
@@ -295,36 +202,14 @@ void Get_wf_lcao::begin(const UnitCell& ucell,
         //  2d-to-grid conversion is unified into `wfc_2d_to_grid`.
         psi->fix_k(ik);
 
-#ifdef __OLD_GINT
-    #ifdef __MPI // need to deal with NSPIN=4 !!!!
-        wfc_2d_to_grid(psi->get_pointer(), para_orb, wfc_k_grid[ik], gk.gridt->trace_lo);
-    #else
-        for (int i = 0; i < nbands; ++i)
-        {
-            for (int j = 0; j < nlocal; ++j)
-            {
-                wfc_k_grid[ik][i][j] = psi[0](i, j);
-            }
-        }
-    #endif
-#else
         ModuleGint::Gint_env_k gint_env(psi->get_pointer(), &para_orb, kv.kvec_c, kv.kvec_d,
                                         nbands, nlocal, ik, PARAM.inp.nspin, PARAM.globalv.npol, pes_->charge->rho[ispin]);
-#endif
         
         for (int ib = 0; ib < nbands; ++ib)
         {
             if (bands_picked_[ib])
             {
-#ifdef __OLD_GINT
-                ModuleBase::GlobalFunc::ZEROS(pes_->charge->rho[ispin],
-                                              pw_wfc->nrxx); // terrible, you make changes on another instance's data???
-
-                // deal with NSPIN=4
-                gk.cal_env_k(ik, wfc_k_grid[ik][ib], pes_->charge->rho[ispin], kv.kvec_c, kv.kvec_d, ucell);
-#else
                 gint_env.cal_env_band(ib);
-#endif
 
                 // ik0 is the real k-point index, starting from 0
                 int ik0 = kv.ik2iktot[ik];
@@ -440,16 +325,6 @@ void Get_wf_lcao::begin(const UnitCell& ucell,
             }
         }
     }
-#ifdef __OLD_GINT
-    for (int ik = 0; ik < nks; ++ik)
-    {
-        for (int ib = 0; ib < nbands; ++ib)
-        {
-            delete[] wfc_k_grid[ik][ib];
-        }
-        delete[] wfc_k_grid[ik];
-    }
-#endif
     return;
 }
 
diff --git a/source/source_io/get_wf_lcao.h b/source/source_io/get_wf_lcao.h
index b182e352bd..94afb2cb64 100644
--- a/source/source_io/get_wf_lcao.h
+++ b/source/source_io/get_wf_lcao.h
@@ -1,9 +1,8 @@
 #ifndef GET_WF_LCAO_H
 #define GET_WF_LCAO_H
 
-#include "source_lcao/module_gint/gint_gamma.h"
-#include "source_lcao/module_gint/gint_k.h"
 #include "source_estate/elecstate.h"
+#include "source_basis/module_ao/parallel_orbitals.h"
 
 class Get_wf_lcao
 {
@@ -17,7 +16,6 @@ class Get_wf_lcao
                const ModulePW::PW_Basis_K* pw_wfc,
                const Parallel_Grid& pgrid,
                const Parallel_Orbitals& para_orb,
-               Gint_Gamma& gg,
                const int& out_wfc_pw,
                const K_Vectors& kv,
                const double nelec,
@@ -29,34 +27,12 @@ class Get_wf_lcao
                const std::string& global_out_dir,
                std::ofstream& ofs_running);
 
-    /// tmp, delete after Gint is refactored.
-    void begin(const UnitCell& ucell,
-               const psi::Psi<double>* psid,
-               const ModulePW::PW_Basis_K* pw_wfc,
-               const Parallel_Grid& pgrid,
-               const Parallel_Orbitals& para_orb,
-               Gint_k& gg,
-               const int& out_wfc_pw,
-               const K_Vectors& kv,
-               const double nelec,
-               const std::vector<int>& out_wfc_norm,
-               const std::vector<int>& out_wfc_re_im,
-               const int nbands,
-               const int nspin,
-               const int nlocal,
-               const std::string& global_out_dir,
-               std::ofstream& ofs_running)
-    {
-        throw std::logic_error("gint_k should use with complex psi.");
-    };
-
     /// For multi-k
     void begin(const UnitCell& ucell,
                const psi::Psi<std::complex<double>>* psi,
                const ModulePW::PW_Basis_K* pw_wfc,
                const Parallel_Grid& pgrid,
                const Parallel_Orbitals& para_orb,
-               Gint_k& gk,
                const int& out_wfc_pw,
                const K_Vectors& kv,
                const double nelec,
@@ -68,27 +44,6 @@ class Get_wf_lcao
                const std::string& global_out_dir,
                std::ofstream& ofs_running);
 
-    /// tmp, delete after Gint is refactored.
-    void begin(const UnitCell& ucell,
-               const psi::Psi<std::complex<double>>* psi,
-               const ModulePW::PW_Basis_K* pw_wfc,
-               const Parallel_Grid& pgrid,
-               const Parallel_Orbitals& para_orb,
-               Gint_Gamma& gk,
-               const int& out_wfc_pw,
-               const K_Vectors& kv,
-               const double nelec,
-               const std::vector<int>& out_wfc_norm,
-               const std::vector<int>& out_wfc_re_im,
-               const int nbands,
-               const int nspin,
-               const int nlocal,
-               const std::string& global_out_dir,
-               std::ofstream& ofs_running)
-    {
-        throw std::logic_error("gint_gamma should use with real psi.");
-    };
-
   private:
     void prepare_get_wf(std::ofstream& ofs_running);
 
diff --git a/source/source_io/output_mat_sparse.cpp b/source/source_io/output_mat_sparse.cpp
index 12d65edd61..7381c61391 100644
--- a/source/source_io/output_mat_sparse.cpp
+++ b/source/source_io/output_mat_sparse.cpp
@@ -15,7 +15,6 @@ void output_mat_sparse(const bool& out_mat_hsR,
                        const int& istep,
                        const ModuleBase::matrix& v_eff,
                        const Parallel_Orbitals& pv,
-                       Gint_k& gint_k,
                        const TwoCenterBundle& two_center_bundle,
                        const LCAO_Orbitals& orb,
                        UnitCell& ucell,
@@ -34,7 +33,6 @@ void output_mat_sparse(const bool& out_mat_hsR,
                        const int& istep,
                        const ModuleBase::matrix& v_eff,
                        const Parallel_Orbitals& pv,
-                       Gint_k& gint_k,
                        const TwoCenterBundle& two_center_bundle,
                        const LCAO_Orbitals& orb,
                        UnitCell& ucell,
@@ -61,7 +59,6 @@ void output_mat_sparse(const bool& out_mat_hsR,
     {
         output_dHR(istep,
                    v_eff,
-                   gint_k, // mohan add 2024-04-01
                    ucell,
                    pv,
                    HS_Arrays,
diff --git a/source/source_io/output_mat_sparse.h b/source/source_io/output_mat_sparse.h
index 065f510214..bce47f7fb0 100644
--- a/source/source_io/output_mat_sparse.h
+++ b/source/source_io/output_mat_sparse.h
@@ -5,8 +5,7 @@
 #include "source_basis/module_nao/two_center_bundle.h"
 #include "source_cell/klist.h"
 #include "source_hamilt/hamilt.h"
-#include "source_lcao/module_gint/gint_k.h"
-
+#include "source_cell/module_neighbor/sltk_grid_driver.h"
 namespace ModuleIO
 {
 /// @brief the output interface to write the sparse matrix of H, S, T, and r
@@ -19,7 +18,6 @@ void output_mat_sparse(const bool& out_mat_hsR,
                        const int& istep,
                        const ModuleBase::matrix& v_eff,
                        const Parallel_Orbitals& pv,
-                       Gint_k& gint_k, // mohan add 2024-04-01
                        const TwoCenterBundle& two_center_bundle,
                        const LCAO_Orbitals& orb,
                        UnitCell& ucell,
diff --git a/source/source_io/to_wannier90_lcao.h b/source/source_io/to_wannier90_lcao.h
index 50560464e9..fa75293d9b 100644
--- a/source/source_io/to_wannier90_lcao.h
+++ b/source/source_io/to_wannier90_lcao.h
@@ -39,7 +39,6 @@
 #include "fR_overlap.h"
 #include "source_base/abfs-vector3_order.h"
 #include "source_base/math_lebedev_laikov.h"
-#include "source_lcao/module_gint/grid_technique.h"
 #include "source_lcao/module_hcontainer/hcontainer.h"
 
 class Coordinate_3D
diff --git a/source/source_io/to_wannier90_lcao_in_pw.h b/source/source_io/to_wannier90_lcao_in_pw.h
index d7a728a209..cf6d5fc915 100644
--- a/source/source_io/to_wannier90_lcao_in_pw.h
+++ b/source/source_io/to_wannier90_lcao_in_pw.h
@@ -30,7 +30,6 @@
 
 #ifdef __LCAO
 #include "source_basis/module_ao/parallel_orbitals.h"
-#include "source_lcao/module_gint/grid_technique.h"
 #include "source_psi/psi_initializer.h"
 
 class toWannier90_LCAO_IN_PW : public toWannier90_PW
diff --git a/source/source_io/unk_overlap_lcao.cpp b/source/source_io/unk_overlap_lcao.cpp
index dbd734f7e2..a352995a79 100644
--- a/source/source_io/unk_overlap_lcao.cpp
+++ b/source/source_io/unk_overlap_lcao.cpp
@@ -25,7 +25,6 @@ unkOverlap_lcao::~unkOverlap_lcao()
 }
 
 void unkOverlap_lcao::init(const UnitCell& ucell,
-                           const Grid_Technique& gt, 
                            const int nkstot, 
                            const LCAO_Orbitals& orb)
 {
diff --git a/source/source_io/unk_overlap_lcao.h b/source/source_io/unk_overlap_lcao.h
index a867a4b0c6..7abc37d337 100644
--- a/source/source_io/unk_overlap_lcao.h
+++ b/source/source_io/unk_overlap_lcao.h
@@ -12,7 +12,7 @@
 #include "source_lcao/center2_orb-orb11.h"
 #include "source_lcao/center2_orb-orb21.h"
 #include "source_lcao/center2_orb.h"
-#include "source_lcao/module_gint/grid_technique.h"
+#include "source_cell/module_neighbor/sltk_grid_driver.h"
 
 #include <map>
 #include <set>
@@ -48,7 +48,7 @@ class unkOverlap_lcao
     unkOverlap_lcao();
     ~unkOverlap_lcao();
 
-    void init(const UnitCell& ucell, const Grid_Technique& gt, const int nkstot, const LCAO_Orbitals& orb);
+    void init(const UnitCell& ucell, const int nkstot, const LCAO_Orbitals& orb);
     int iw2it(const UnitCell& ucell, int iw);
     int iw2ia(const UnitCell& ucell, int iw);
     int iw2iL(const UnitCell& ucell, int iw);
diff --git a/source/source_io/write_HS_R.cpp b/source/source_io/write_HS_R.cpp
index f88a8476f2..b98b74ef80 100644
--- a/source/source_io/write_HS_R.cpp
+++ b/source/source_io/write_HS_R.cpp
@@ -126,7 +126,6 @@ void ModuleIO::output_dSR(const int& istep,
 
 void ModuleIO::output_dHR(const int& istep,
                           const ModuleBase::matrix& v_eff,
-                          Gint_k& gint_k, // mohan add 2024-04-01
                           const UnitCell& ucell,
                           const Parallel_Orbitals& pv,
                           LCAO_HS_Arrays& HS_Arrays,
@@ -161,8 +160,7 @@ void ModuleIO::output_dHR(const int& istep,
 				orb,
 				cspin,
 				sparse_thr,
-                v_eff,
-				gint_k);
+                v_eff);
 	} 
 	else if (nspin == 2) 
 	{
@@ -176,8 +174,7 @@ void ModuleIO::output_dHR(const int& istep,
                                   orb,
                                   cspin,
                                   sparse_thr,
-                                  v_eff,
-                                  gint_k);
+                                  v_eff);
         }
     }
     // mohan update 2024-04-01
diff --git a/source/source_io/write_HS_R.h b/source/source_io/write_HS_R.h
index bf95c2d648..2f831d2baa 100644
--- a/source/source_io/write_HS_R.h
+++ b/source/source_io/write_HS_R.h
@@ -5,8 +5,8 @@
 #include "source_basis/module_nao/two_center_bundle.h"
 #include "source_cell/klist.h"
 #include "source_hamilt/hamilt.h"
-#include "source_lcao/module_gint/gint_k.h"
 #include "source_pw/module_pwdft/global.h"
+#include "source_lcao/LCAO_HS_arrays.hpp"
 
 namespace ModuleIO
 {
@@ -31,7 +31,6 @@ namespace ModuleIO
 
 	void output_dHR(const int& istep,
 			const ModuleBase::matrix& v_eff,
-			Gint_k& gint_k, // mohan add 2024-04-01
 			const UnitCell& ucell,
 			const Parallel_Orbitals& pv,
 			LCAO_HS_Arrays& HS_Arrays,
diff --git a/source/source_io/write_eband_terms.hpp b/source/source_io/write_eband_terms.hpp
index 0aa48770e0..701b097b25 100644
--- a/source/source_io/write_eband_terms.hpp
+++ b/source/source_io/write_eband_terms.hpp
@@ -21,8 +21,6 @@ void write_eband_terms(const int nspin,
                        const ModulePW::PW_Basis& rhod_basis,
                        const ModuleBase::matrix& vloc,
                        const Charge& chg,
-                       Gint_Gamma& gint_gamma, // mohan add 2024-04-01
-                       Gint_k& gint_k,         // mohan add 2024-04-01
                        const K_Vectors& kv,
                        const ModuleBase::matrix& wg,
                        Grid_Driver& gd,
@@ -45,10 +43,6 @@ void write_eband_terms(const int nspin,
 
         set_para2d_MO(*pv, nbands, p2d);
 
-        typename TGint<TK>::type* gint = nullptr;
-
-        set_gint_pointer<TK>(gint_gamma, gint_k, gint);
-
 		auto if_gamma_fix = [](hamilt::HContainer<TR>& hR) 
 		{
 			if (std::is_same<TK, double>::value) 
@@ -110,7 +104,7 @@ void write_eband_terms(const int nspin,
             if_gamma_fix(v_pp_local_R_ao);
             std::vector<std::vector<double>> e_orb_pp_local;
 
-			hamilt::Veff<hamilt::OperatorLCAO<TK, TR>> v_pp_local_op(gint,
+			hamilt::Veff<hamilt::OperatorLCAO<TK, TR>> v_pp_local_op(
 					&v_pp_local_k_ao, 
 					kv.kvec_d, 
 					&pot_local, 
@@ -167,7 +161,7 @@ void write_eband_terms(const int nspin,
             std::vector<hamilt::Veff<hamilt::OperatorLCAO<TK, TR>>*> v_hartree_op(nspin0);
             for (int is = 0; is < nspin0; ++is)
             {
-                v_hartree_op[is] = new hamilt::Veff<hamilt::OperatorLCAO<TK, TR>>(gint,
+                v_hartree_op[is] = new hamilt::Veff<hamilt::OperatorLCAO<TK, TR>>(
                     &v_hartree_k_ao, kv.kvec_d, &pot_hartree, &v_hartree_R_ao[is], &ucell, orb_cutoff, &gd, nspin);
                 v_hartree_op[is]->contributeHR();
             }
@@ -199,8 +193,6 @@ void write_eband_terms(const int nspin,
                               rhod_basis,
                               vloc,
                               chg,
-                              gint_gamma,
-                              gint_k,
                               kv,
                               orb_cutoff,
                               wg,
diff --git a/source/source_io/write_vxc.hpp b/source/source_io/write_vxc.hpp
index 43fd803bb7..ad503265c0 100644
--- a/source/source_io/write_vxc.hpp
+++ b/source/source_io/write_vxc.hpp
@@ -10,24 +10,6 @@
 #include "source_io/write_HS.h"
 #include "source_io/filename.h" // use filename_output function
 
-#ifndef TGINT_H
-#define TGINT_H
-template <typename T>
-struct TGint;
-
-template <>
-struct TGint<double>
-{
-    using type = Gint_Gamma;
-};
-
-template <>
-struct TGint<std::complex<double>>
-{
-    using type = Gint_k;
-};
-#endif
-
 namespace ModuleIO
 {
 
@@ -125,29 +107,6 @@ std::vector<double> orbital_energy(const int ik, const int nbands, const std::ve
     return e;
 }
 
-#ifndef SET_GINT_POINTER_H
-#define SET_GINT_POINTER_H
-// mohan update 2024-04-01
-template <typename T>
-void set_gint_pointer(Gint_Gamma& gint_gamma, Gint_k& gint_k, typename TGint<T>::type*& gint);
-
-// mohan update 2024-04-01
-template <>
-void set_gint_pointer<double>(Gint_Gamma& gint_gamma, Gint_k& gint_k, typename TGint<double>::type*& gint)
-{
-    gint = &gint_gamma;
-}
-
-// mohan update 2024-04-01
-template <>
-void set_gint_pointer<std::complex<double>>(Gint_Gamma& gint_gamma,
-                                            Gint_k& gint_k,
-                                            typename TGint<std::complex<double>>::type*& gint)
-{
-    gint = &gint_k;
-}
-#endif
-
 inline void write_orb_energy(const K_Vectors& kv,
     const int nspin0, const int nbands,
     const std::vector<std::vector<double>>& e_orb,
@@ -187,8 +146,6 @@ void write_Vxc(const int nspin,
                const ModulePW::PW_Basis& rhod_basis,
                const ModuleBase::matrix& vloc,
                const Charge& chg,
-               Gint_Gamma& gint_gamma, // mohan add 2024-04-01
-               Gint_k& gint_k,         // mohan add 2024-04-01
                const K_Vectors& kv,
                const std::vector<double>& orb_cutoff,
                const ModuleBase::matrix& wg,
@@ -227,14 +184,11 @@ void write_Vxc(const int nspin,
 
     // 3. allocate operators and contribute HR
     // op (corresponding to hR)
-    typename TGint<TK>::type* gint = nullptr;
-
-    set_gint_pointer<TK>(gint_gamma, gint_k, gint);
 
     std::vector<hamilt::Veff<hamilt::OperatorLCAO<TK, TR>>*> vxcs_op_ao(nspin0);
     for (int is = 0; is < nspin0; ++is)
     {
-        vxcs_op_ao[is] = new hamilt::Veff<hamilt::OperatorLCAO<TK, TR>>(gint,
+        vxcs_op_ao[is] = new hamilt::Veff<hamilt::OperatorLCAO<TK, TR>>(
             &vxc_k_ao, kv.kvec_d, potxc, &vxcs_R_ao[is], &ucell, orb_cutoff, &gd, nspin);
 
         vxcs_op_ao[is]->contributeHR();
diff --git a/source/source_io/write_vxc_r.hpp b/source/source_io/write_vxc_r.hpp
index 183d032760..0adfddb299 100644
--- a/source/source_io/write_vxc_r.hpp
+++ b/source/source_io/write_vxc_r.hpp
@@ -10,47 +10,8 @@
 #include "source_lcao/module_ri/RI_2D_Comm.h"
 #endif
 
-#ifndef TGINT_H
-#define TGINT_H
-template <typename T>
-struct TGint;
-
-template <>
-struct TGint<double>
-{
-    using type = Gint_Gamma;
-};
-
-template <>
-struct TGint<std::complex<double>>
-{
-    using type = Gint_k;
-};
-#endif
-
 namespace ModuleIO
 {
-
-#ifndef SET_GINT_POINTER_H
-#define SET_GINT_POINTER_H
-template <typename T>
-void set_gint_pointer(Gint_Gamma& gint_gamma, Gint_k& gint_k, typename TGint<T>::type*& gint);
-
-template <>
-void set_gint_pointer<double>(Gint_Gamma& gint_gamma, Gint_k& gint_k, typename TGint<double>::type*& gint)
-{
-    gint = &gint_gamma;
-}
-
-template <>
-void set_gint_pointer<std::complex<double>>(Gint_Gamma& gint_gamma,
-                                            Gint_k& gint_k,
-                                            typename TGint<std::complex<double>>::type*& gint)
-{
-    gint = &gint_k;
-}
-#endif
-
 template <typename TR> std::set<Abfs::Vector3_Order<int>> get_R_range(const hamilt::HContainer<TR>& hR)
 {
     std::set<Abfs::Vector3_Order<int>> all_R_coor;
@@ -97,8 +58,6 @@ void write_Vxc_R(const int nspin,
     const ModulePW::PW_Basis& rhod_basis,
     const ModuleBase::matrix& vloc,
     const Charge& chg,
-    Gint_Gamma& gint_gamma,
-    Gint_k& gint_k,
     const K_Vectors& kv,
     const std::vector<double>& orb_cutoff,
     Grid_Driver& gd,
@@ -144,12 +103,10 @@ const double sparse_thr=1e-10)
 
     // 3. calculate the Vxc(R)
     hamilt::HS_Matrix_K<TK> vxc_k_ao(pv, 1); // only hk is needed, sk is skipped
-    typename TGint<TK>::type* gint = nullptr;
-    set_gint_pointer<TK>(gint_gamma, gint_k, gint);
     std::vector<hamilt::Veff<hamilt::OperatorLCAO<TK, TR>>*> vxcs_op_ao(nspin0);
     for (int is = 0; is < nspin0; ++is)
     {
-        vxcs_op_ao[is] = new hamilt::Veff<hamilt::OperatorLCAO<TK, TR>>(gint,
+        vxcs_op_ao[is] = new hamilt::Veff<hamilt::OperatorLCAO<TK, TR>>(
             &vxc_k_ao, kv.kvec_d, potxc, &vxcs_R_ao[is], &ucell, orb_cutoff, &gd, nspin);
         vxcs_op_ao[is]->contributeHR();
 #ifdef __EXX
diff --git a/source/source_lcao/CMakeLists.txt b/source/source_lcao/CMakeLists.txt
index 1831ac6522..118e877239 100644
--- a/source/source_lcao/CMakeLists.txt
+++ b/source/source_lcao/CMakeLists.txt
@@ -28,7 +28,6 @@ if(ENABLE_LCAO)
         FORCE_k.cpp
         stress_tools.cpp
         edm.cpp
-        grid_init.cpp
         spar_dh.cpp
         spar_exx.cpp
         spar_hsr.cpp
diff --git a/source/source_lcao/FORCE.h b/source/source_lcao/FORCE.h
index ee16afd8b0..5eba250181 100644
--- a/source/source_lcao/FORCE.h
+++ b/source/source_lcao/FORCE.h
@@ -13,7 +13,6 @@
 #include "source_psi/psi.h"
 #include "source_lcao/setup_deepks.h"
 
-
 template <typename T>
 class Force_Stress_LCAO;
 
diff --git a/source/source_lcao/LCAO_domain.h b/source/source_lcao/LCAO_domain.h
index 77281f7efb..cf4af3ace9 100644
--- a/source/source_lcao/LCAO_domain.h
+++ b/source/source_lcao/LCAO_domain.h
@@ -9,9 +9,7 @@
 #include "source_lcao/LCAO_HS_arrays.hpp"
 #include "source_lcao/force_stress_arrays.h"
 #include "source_lcao/module_deepks/LCAO_deepks.h"
-#include "source_lcao/module_gint/gint_gamma.h"
-#include "source_lcao/module_gint/gint_k.h"
-#include "source_lcao/module_gint/grid_technique.h"
+#include "source_basis/module_ao/parallel_orbitals.h"
 
 namespace LCAO_domain
 {
@@ -35,17 +33,6 @@ void build_Nonlocal_mu_new(const Parallel_Orbitals& pv,
                            const TwoCenterIntegrator& intor_orb_beta,
                            const Grid_Driver* GridD);
 
-/**
- * @brief prepare gird integration
- */
-void grid_prepare(const Grid_Technique& gt,
-                  Gint_Gamma& gint_gamma,
-                  Gint_k& gint_k,
-                  const UnitCell& ucell,
-                  const LCAO_Orbitals& orb,
-                  const ModulePW::PW_Basis& rhopw,
-                  const ModulePW::PW_Basis_Big& bigpw);
-
 /**
  * @brief set the elements of force-related matrices in LCAO method
  */
diff --git a/source/source_lcao/LCAO_init_basis.cpp b/source/source_lcao/LCAO_init_basis.cpp
index 7743a68f1f..f8b60b6298 100644
--- a/source/source_lcao/LCAO_init_basis.cpp
+++ b/source/source_lcao/LCAO_init_basis.cpp
@@ -1,6 +1,7 @@
 #include "LCAO_domain.h"
 
 #include "source_io/module_parameter/parameter.h"
+#include "source_base/parallel_comm.h"
 /// once the GlobalC::exx_info has been deleted, this include can be gone 
 /// mohan note 2024-07-21
 #ifdef __EXX
diff --git a/source/source_lcao/grid_init.cpp b/source/source_lcao/grid_init.cpp
deleted file mode 100644
index 517f39a273..0000000000
--- a/source/source_lcao/grid_init.cpp
+++ /dev/null
@@ -1,71 +0,0 @@
-#include "source_lcao/LCAO_domain.h"
-#include "source_pw/module_pwdft/global.h"
-#include "source_io/module_parameter/parameter.h"
-#include "source_base/global_variable.h"
-#include "source_base/parallel_reduce.h"
-#include "source_base/timer.h"
-
-namespace LCAO_domain
-{
-
-//--------------------------------------------
-// prepare grid network for Gint(grid integral)
-//--------------------------------------------
-void grid_prepare(
-		const Grid_Technique& gt, 
-        Gint_Gamma &gint_gamma,
-        Gint_k &gint_k,
-		const UnitCell& ucell,
-        const LCAO_Orbitals& orb,
-		const ModulePW::PW_Basis& rhopw, 
-		const ModulePW::PW_Basis_Big& bigpw)
-{
-    ModuleBase::TITLE("LCAO_domain","grid_prepare");
-    ModuleBase::timer::tick("LCAO_domain","grid_prepare");
-    if(PARAM.globalv.gamma_only_local)
-    {
-		gint_gamma.prep_grid(
-				gt, 
-				bigpw.nbx, 
-				bigpw.nby, 
-				bigpw.nbzp, 
-				bigpw.nbzp_start,
-				rhopw.nxyz, 
-				bigpw.bx, 
-				bigpw.by, 
-				bigpw.bz, 
-				bigpw.bxyz, 
-				bigpw.nbxx,
-				rhopw.ny, 
-				rhopw.nplane, 
-				rhopw.startz_current,
-				&ucell,
-				&orb);
-	}
-    else // multiple k-points
-    {
-        // cal the grid integration of 'Vl' matrix for l-points algorithms.
-		gint_k.prep_grid(
-				gt, 
-				bigpw.nbx, 
-				bigpw.nby, 
-				bigpw.nbzp, 
-				bigpw.nbzp_start,
-				rhopw.nxyz, 
-				bigpw.bx, 
-				bigpw.by, 
-				bigpw.bz, 
-				bigpw.bxyz, 
-				bigpw.nbxx,
-				rhopw.ny, 
-				rhopw.nplane, 
-				rhopw.startz_current,
-				&ucell,
-				&orb);
-	}
-
-    ModuleBase::timer::tick("LCAO_domain","grid_prepare");
-    return;
-}
-
-}
diff --git a/source/source_lcao/hamilt_lcao.cpp b/source/source_lcao/hamilt_lcao.cpp
index 389b9812cf..920d66ac11 100644
--- a/source/source_lcao/hamilt_lcao.cpp
+++ b/source/source_lcao/hamilt_lcao.cpp
@@ -70,9 +70,7 @@ HamiltLCAO<TK, TR>::HamiltLCAO(const UnitCell& ucell,
 }
 
 template <typename TK, typename TR>
-HamiltLCAO<TK, TR>::HamiltLCAO(Gint_Gamma* GG_in,
-                               Gint_k* GK_in,
-                               const UnitCell& ucell,
+HamiltLCAO<TK, TR>::HamiltLCAO(const UnitCell& ucell,
                                const Grid_Driver& grid_d,
                                const Parallel_Orbitals* paraV,
                                elecstate::Potential* pot_in,
@@ -186,8 +184,7 @@ HamiltLCAO<TK, TR>::HamiltLCAO(Gint_Gamma* GG_in,
                 // register Potential by gathered operator
                 pot_in->pot_register(pot_register_in);
                 // effective potential term
-                Operator<TK>* veff = new Veff<OperatorLCAO<TK, TR>>(GG_in,
-                                                                    this->hsk,
+                Operator<TK>* veff = new Veff<OperatorLCAO<TK, TR>>(this->hsk,
                                                                     this->kv->kvec_d,
                                                                     pot_in,
                                                                     this->hR, // no explicit call yet
@@ -256,8 +253,7 @@ HamiltLCAO<TK, TR>::HamiltLCAO(Gint_Gamma* GG_in,
                 // register Potential by gathered operator
                 pot_in->pot_register(pot_register_in);
                 // Veff term
-                this->getOperator() = new Veff<OperatorLCAO<TK, TR>>(GK_in,
-                                                                     this->hsk,
+                this->getOperator() = new Veff<OperatorLCAO<TK, TR>>(this->hsk,
                                                                      this->kv->kvec_d,
                                                                      pot_in,
                                                                      this->hR,
diff --git a/source/source_lcao/hamilt_lcao.h b/source/source_lcao/hamilt_lcao.h
index 7e888ded3d..acd838ca2f 100644
--- a/source/source_lcao/hamilt_lcao.h
+++ b/source/source_lcao/hamilt_lcao.h
@@ -8,8 +8,6 @@
 #include "source_estate/module_pot/potential_new.h"
 #include "source_hamilt/hamilt.h"
 #include "source_lcao/hs_matrix_k.hpp"
-#include "source_lcao/module_gint/gint_gamma.h"
-#include "source_lcao/module_gint/gint_k.h"
 #include "source_lcao/module_hcontainer/hcontainer.h"
 
 #include <vector>
@@ -39,9 +37,7 @@ class HamiltLCAO : public Hamilt<TK>
      * @brief Constructor of Hamiltonian for LCAO base
      * HR and SR will be allocated with Operators
      */
-    HamiltLCAO(Gint_Gamma* GG_in,
-               Gint_k* GK_in,
-               const UnitCell& ucell,
+    HamiltLCAO(const UnitCell& ucell,
                const Grid_Driver& grid_d,
 			   const Parallel_Orbitals* paraV,
 			   elecstate::Potential* pot_in,
diff --git a/source/source_lcao/module_gint/CMakeLists.txt b/source/source_lcao/module_gint/CMakeLists.txt
index 0505957b9c..6969abd7e0 100644
--- a/source/source_lcao/module_gint/CMakeLists.txt
+++ b/source/source_lcao/module_gint/CMakeLists.txt
@@ -2,103 +2,46 @@
 if(ENABLE_LCAO)
 
 list(APPEND objects
-    gint_old.cpp
-    gint_gamma_env.cpp
-    gint_gamma_vl.cpp
-    gint_fvl_old.cpp
-    gint_rho_old.cpp
-    gint_tau_old.cpp
-    gint_vl_old.cpp
-    gint_k_env.cpp
-    gint_k_sparse1.cpp
-    gint_k_pvpr.cpp
-    gint_k_pvdpr.cpp
-    gint_tools.cpp
-    grid_bigcell.cpp
-    grid_meshball.cpp
-    grid_meshcell.cpp
-    grid_meshk.cpp
-    grid_technique.cpp
-    gint_force_cpu_interface.cpp
-    gint_rho_cpu_interface.cpp
-    gint_vl_cpu_interface.cpp
-    cal_psir_ylm.cpp
-    cal_dpsir_ylm.cpp
-    cal_ddpsir_ylm.cpp
-    mult_psi_dmr.cpp
-    init_orb.cpp
-)
-
-if(NOT DEFINED OLD_GINT)
-  list(APPEND objects
-      temp_gint/biggrid_info.cpp
-      temp_gint/big_grid.cpp
-      temp_gint/divide_info.cpp
-      temp_gint/gint_atom.cpp
-      temp_gint/gint_info.cpp
-      temp_gint/gint.cpp
-      temp_gint/gint_vl.cpp
-      temp_gint/gint_vl_metagga.cpp
-      temp_gint/gint_vl_nspin4.cpp
-      temp_gint/gint_vl_metagga_nspin4.cpp
-      temp_gint/gint_rho.cpp
-      temp_gint/gint_tau.cpp
-      temp_gint/gint_fvl.cpp
-      temp_gint/gint_fvl_meta.cpp
-      temp_gint/gint_env_gamma.cpp
-      temp_gint/gint_env_k.cpp
-      temp_gint/gint_dvlocal.cpp
-      temp_gint/localcell_info.cpp
-      temp_gint/phi_operator.cpp
-      temp_gint/set_ddphi.cpp
-      temp_gint/unitcell_info.cpp
-      temp_gint/gint_common.cpp
-      temp_gint/gint_interface.cpp
-      )
-  if(USE_CUDA)
-    list(APPEND objects
-        temp_gint/kernel/gint_gpu_vars.cpp
-        temp_gint/kernel/phi_operator_gpu.cu
-        temp_gint/kernel/phi_operator_kernel.cu
-        temp_gint/kernel/set_const_mem.cu
-        temp_gint/batch_biggrid.cpp
-        temp_gint/gint_vl_gpu.cpp
-        temp_gint/gint_rho_gpu.cpp
-        temp_gint/gint_fvl_gpu.cpp
-        temp_gint/gint_vl_metagga_gpu.cpp
-        temp_gint/gint_vl_nspin4_gpu.cpp
-        temp_gint/gint_vl_metagga_nspin4_gpu.cpp
-        temp_gint/gint_tau_gpu.cpp
-        temp_gint/gint_fvl_meta_gpu.cpp
-        temp_gint/kernel/dgemm_vbatch.cu
+    biggrid_info.cpp
+    big_grid.cpp
+    divide_info.cpp
+    gint_atom.cpp
+    gint_info.cpp
+    gint.cpp
+    gint_vl.cpp
+    gint_vl_metagga.cpp
+    gint_vl_nspin4.cpp
+    gint_vl_metagga_nspin4.cpp
+    gint_rho.cpp
+    gint_tau.cpp
+    gint_fvl.cpp
+    gint_fvl_meta.cpp
+    gint_env_gamma.cpp
+    gint_env_k.cpp
+    gint_dvlocal.cpp
+    localcell_info.cpp
+    phi_operator.cpp
+    set_ddphi.cpp
+    unitcell_info.cpp
+    gint_common.cpp
+    gint_interface.cpp
     )
-  endif()
-endif()
-
 if(USE_CUDA)
   list(APPEND objects
-      gint_gpu_interface.cpp
-      kernels/cuda/cuda_tools.cu
-      kernels/cuda/gint_vl.cu
-      kernels/cuda/gint_rho.cu
-      kernels/cuda/gint_force.cu
-      gint_vl_gpu.cu
-      gint_rho_gpu.cu
-      gint_force_gpu.cu
-      kernels/cuda/gemm_selector.cu
-      kernels/cuda/code_gen_00.cu
-      kernels/cuda/code_gen_01.cu
-      kernels/cuda/code_gen_02.cu
-      kernels/cuda/code_gen_03.cu
-      kernels/cuda/code_gen_04.cu
-      kernels/cuda/code_gen_05.cu
-      kernels/cuda/code_gen_06.cu
-      kernels/cuda/code_gen_07.cu
-      kernels/cuda/code_gen_08.cu
-      kernels/cuda/code_gen_09.cu
-      gtask_vl.cpp
-      gtask_rho.cpp
-      gtask_force.cpp
+      kernel/gint_gpu_vars.cpp
+      kernel/phi_operator_gpu.cu
+      kernel/phi_operator_kernel.cu
+      kernel/set_const_mem.cu
+      batch_biggrid.cpp
+      gint_vl_gpu.cpp
+      gint_rho_gpu.cpp
+      gint_fvl_gpu.cpp
+      gint_vl_metagga_gpu.cpp
+      gint_vl_nspin4_gpu.cpp
+      gint_vl_metagga_nspin4_gpu.cpp
+      gint_tau_gpu.cpp
+      gint_fvl_meta_gpu.cpp
+      kernel/dgemm_vbatch.cu
   )
 endif()
 
@@ -112,10 +55,4 @@ if(ENABLE_COVERAGE)
   add_coverage(gint)
 endif()
 
-IF (BUILD_TESTING)
-  if(ENABLE_MPI)
-    add_subdirectory(test)
-  endif()
-endif()
-
 endif()
\ No newline at end of file
diff --git a/source/source_lcao/module_gint/temp_gint/batch_biggrid.cpp b/source/source_lcao/module_gint/batch_biggrid.cpp
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/batch_biggrid.cpp
rename to source/source_lcao/module_gint/batch_biggrid.cpp
diff --git a/source/source_lcao/module_gint/temp_gint/batch_biggrid.h b/source/source_lcao/module_gint/batch_biggrid.h
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/batch_biggrid.h
rename to source/source_lcao/module_gint/batch_biggrid.h
diff --git a/source/source_lcao/module_gint/temp_gint/big_grid.cpp b/source/source_lcao/module_gint/big_grid.cpp
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/big_grid.cpp
rename to source/source_lcao/module_gint/big_grid.cpp
diff --git a/source/source_lcao/module_gint/temp_gint/big_grid.h b/source/source_lcao/module_gint/big_grid.h
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/big_grid.h
rename to source/source_lcao/module_gint/big_grid.h
diff --git a/source/source_lcao/module_gint/temp_gint/biggrid_info.cpp b/source/source_lcao/module_gint/biggrid_info.cpp
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/biggrid_info.cpp
rename to source/source_lcao/module_gint/biggrid_info.cpp
diff --git a/source/source_lcao/module_gint/temp_gint/biggrid_info.h b/source/source_lcao/module_gint/biggrid_info.h
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/biggrid_info.h
rename to source/source_lcao/module_gint/biggrid_info.h
diff --git a/source/source_lcao/module_gint/cal_ddpsir_ylm.cpp b/source/source_lcao/module_gint/cal_ddpsir_ylm.cpp
deleted file mode 100644
index 206c6f95e8..0000000000
--- a/source/source_lcao/module_gint/cal_ddpsir_ylm.cpp
+++ /dev/null
@@ -1,316 +0,0 @@
-#include "gint_tools.h"
-#include "source_base/timer.h"
-#include "source_base/ylm.h"
-namespace Gint_Tools{
-void cal_ddpsir_ylm(
-    const Grid_Technique& gt, const int bxyz,
-    const int na_grid,                 // number of atoms on this grid
-    const int grid_index,              // 1d index of FFT index (i,j,k)
-    const double delta_r,              // delta_r of the uniform FFT grid
-    const int* const block_index,      // block_index[na_grid+1], count total number of atomis orbitals
-    const int* const block_size,       // block_size[na_grid],	number of columns of a band
-    const bool* const* const cal_flag, // cal_flag[bxyz][na_grid],	whether the atom-grid distance is larger than cutoff
-    double* const* const ddpsir_ylm_xx, double* const* const ddpsir_ylm_xy, double* const* const ddpsir_ylm_xz,
-    double* const* const ddpsir_ylm_yy, double* const* const ddpsir_ylm_yz, double* const* const ddpsir_ylm_zz)
-{
-    ModuleBase::timer::tick("Gint_Tools", "cal_ddpsir_ylm");
-    const UnitCell& ucell = *gt.ucell;
-    std::vector<const double*> it_psi_uniform(gt.nwmax);
-    std::vector<const double*> it_dpsi_uniform(gt.nwmax);
-    std::vector<const double*> it_d2psi_uniform(gt.nwmax);
-    std::vector<int> it_psi_nr_uniform(gt.nwmax);
-    // array to store spherical harmonics and its derivatives
-    // the first dimension equals 36 because the maximum nwl is 5.
-    double rly[36];
-    ModuleBase::Array_Pool<double> grly(36, 3);
-
-    for (int id = 0; id < na_grid; id++)
-    {
-        const int mcell_index = gt.bcell_start[grid_index] + id;
-        const int imcell = gt.which_bigcell[mcell_index];
-        int iat = gt.which_atom[mcell_index];
-        const int it = ucell.iat2it[iat];
-        const int ia = ucell.iat2ia[iat];
-        Atom* atom = &ucell.atoms[it];
-
-        const double mt[3] = {gt.meshball_positions[imcell][0] - gt.tau_in_bigcell[iat][0],
-                              gt.meshball_positions[imcell][1] - gt.tau_in_bigcell[iat][1],
-                              gt.meshball_positions[imcell][2] - gt.tau_in_bigcell[iat][2]};
-
-        for (int iw=0; iw< atom->nw; ++iw)
-        {
-            if ( atom->iw2_new[iw] )
-            {
-                it_psi_uniform[iw]= gt.psi_u[it*gt.nwmax + iw].data();
-                it_dpsi_uniform[iw] = gt.dpsi_u[it*gt.nwmax + iw].data();
-                it_psi_nr_uniform[iw]= gt.psi_u[it*gt.nwmax + iw].size();
-            }
-        }
-
-        for (int ib = 0; ib < bxyz; ib++)
-        {
-            double* const p_ddpsi_xx = &ddpsir_ylm_xx[ib][block_index[id]];
-            double* const p_ddpsi_xy = &ddpsir_ylm_xy[ib][block_index[id]];
-            double* const p_ddpsi_xz = &ddpsir_ylm_xz[ib][block_index[id]];
-            double* const p_ddpsi_yy = &ddpsir_ylm_yy[ib][block_index[id]];
-            double* const p_ddpsi_yz = &ddpsir_ylm_yz[ib][block_index[id]];
-            double* const p_ddpsi_zz = &ddpsir_ylm_zz[ib][block_index[id]];
-            if (!cal_flag[ib][id])
-            {
-                ModuleBase::GlobalFunc::ZEROS(p_ddpsi_xx, block_size[id]);
-                ModuleBase::GlobalFunc::ZEROS(p_ddpsi_xy, block_size[id]);
-                ModuleBase::GlobalFunc::ZEROS(p_ddpsi_xz, block_size[id]);
-                ModuleBase::GlobalFunc::ZEROS(p_ddpsi_yy, block_size[id]);
-                ModuleBase::GlobalFunc::ZEROS(p_ddpsi_yz, block_size[id]);
-                ModuleBase::GlobalFunc::ZEROS(p_ddpsi_zz, block_size[id]);
-            }
-            else
-            {
-                const double dr[3]
-                    = {// vectors between atom and grid
-                       gt.meshcell_pos[ib][0] + mt[0], gt.meshcell_pos[ib][1] + mt[1], gt.meshcell_pos[ib][2] + mt[2]};
-                double distance = std::sqrt(dr[0] * dr[0] + dr[1] * dr[1] + dr[2] * dr[2]);
-
-                // for some unknown reason, the finite difference between dpsi and ddpsi
-                // using analytical expression is always wrong; as a result,
-                // I switch to explicit finite difference method for evaluating
-                // the second derivatives of the orbitals
-                if (/*distance < 1e-9*/ true)
-                {
-                    double*** dpsi = new double**[atom->nw];
-                    for (int i = 0; i < atom->nw; i++)
-                    {
-                        dpsi[i] = new double*[6];
-                        for (int j = 0; j < 6; j++)
-                        {
-                            dpsi[i][j] = new double[3];
-                            ModuleBase::GlobalFunc::ZEROS(dpsi[i][j], 3);
-                        }
-                    }
-
-                    double* dr1 = new double[3];
-
-                    double** displ = new double*[6];
-                    for (int i = 0; i < 6; i++)
-                    {
-                        displ[i] = new double[3];
-                        ModuleBase::GlobalFunc::ZEROS(displ[i], 3);
-                    }
-                    displ[0][0] = 0.0001; // in x direction
-                    displ[1][0] = -0.0001;
-                    displ[2][1] = 0.0001; // in y direction
-                    displ[3][1] = -0.0001;
-                    displ[4][2] = 0.0001; // in z direction
-                    displ[5][2] = -0.0001;
-
-                    for (int i = 0; i < 6; i++)
-                    {
-                        dr1[0] = dr[0] + displ[i][0];
-                        dr1[1] = dr[1] + displ[i][1];
-                        dr1[2] = dr[2] + displ[i][2];
-
-                        ModuleBase::Ylm::grad_rl_sph_harm(ucell.atoms[it].nwl, dr1[0], dr1[1], dr1[2], rly, grly.get_ptr_2D());
-
-                        double distance1 = std::sqrt(dr1[0] * dr1[0] + dr1[1] * dr1[1] + dr1[2] * dr1[2]);
-                        if (distance1 < 1e-9) {
-                            distance1 = 1e-9;
-}
-
-                        const double position = distance1 / delta_r;
-
-                        const int ip = static_cast<int>(position);
-                        const double iq = static_cast<int>(position);
-                        const double x0 = position - iq;
-                        const double x1 = 1.0 - x0;
-                        const double x2 = 2.0 - x0;
-                        const double x3 = 3.0 - x0;
-                        const double x12 = x1 * x2 / 6;
-                        const double x03 = x0 * x3 / 2;
-
-                        double tmp, dtmp;
-
-                        for (int iw = 0; iw < atom->nw; ++iw)
-                        {
-                            // this is a new 'l', we need 1D orbital wave
-                            // function from interpolation method.
-                            if (atom->iw2_new[iw])
-                            {
-                                auto psi_uniform = it_psi_uniform[iw];
-                                auto dpsi_uniform = it_dpsi_uniform[iw];
-
-                                // if ( iq[id] >= philn.nr_uniform-4)
-                                if (iq >= it_psi_nr_uniform[iw]-4)
-                                {
-                                    tmp = dtmp = 0.0;
-                                }
-                                else
-                                {
-                                    // use Polynomia Interpolation method to get the
-                                    // wave functions
-
-                                    tmp = x12 * (psi_uniform[ip] * x3 + psi_uniform[ip + 3] * x0)
-                                          + x03 * (psi_uniform[ip + 1] * x2 - psi_uniform[ip + 2] * x1);
-
-                                    dtmp = x12 * (dpsi_uniform[ip] * x3 + dpsi_uniform[ip + 3] * x0)
-                                           + x03 * (dpsi_uniform[ip + 1] * x2 - dpsi_uniform[ip + 2] * x1);
-                                }
-                            } // new l is used.
-
-                            // get the 'l' of this localized wave function
-                            const int ll = atom->iw2l[iw];
-                            const int idx_lm = atom->iw2_ylm[iw];
-
-                            const double rl = pow_int(distance1, ll);
-
-                            // derivative of wave functions with respect to atom positions.
-                            const double tmpdphi_rly = (dtmp - tmp * ll / distance1) / rl * rly[idx_lm] / distance1;
-                            const double tmprl = tmp / rl;
-
-                            dpsi[iw][i][0] = tmpdphi_rly * dr1[0] + tmprl * grly[idx_lm][0];
-                            dpsi[iw][i][1] = tmpdphi_rly * dr1[1] + tmprl * grly[idx_lm][1];
-                            dpsi[iw][i][2] = tmpdphi_rly * dr1[2] + tmprl * grly[idx_lm][2];
-                        } // end iw
-                    }     // end i = 0-6
-
-                    for (int iw = 0; iw < atom->nw; iw++)
-                    {
-                        p_ddpsi_xx[iw] = (dpsi[iw][0][0] - dpsi[iw][1][0]) / 0.0002;
-                        p_ddpsi_xy[iw]
-                            = ((dpsi[iw][2][0] - dpsi[iw][3][0]) + (dpsi[iw][0][1] - dpsi[iw][1][1])) / 0.0004;
-                        p_ddpsi_xz[iw]
-                            = ((dpsi[iw][4][0] - dpsi[iw][5][0]) + (dpsi[iw][0][2] - dpsi[iw][1][2])) / 0.0004;
-                        p_ddpsi_yy[iw] = (dpsi[iw][2][1] - dpsi[iw][3][1]) / 0.0002;
-                        p_ddpsi_yz[iw]
-                            = ((dpsi[iw][4][1] - dpsi[iw][5][1]) + (dpsi[iw][2][2] - dpsi[iw][3][2])) / 0.0004;
-                        p_ddpsi_zz[iw] = (dpsi[iw][4][2] - dpsi[iw][5][2]) / 0.0002;
-                    }
-
-                    for (int i = 0; i < atom->nw; i++)
-                    {
-                        for (int j = 0; j < 6; j++)
-                        {
-                            delete[] dpsi[i][j];
-                        }
-                        delete[] dpsi[i];
-                    }
-                    delete[] dpsi;
-
-                    delete[] dr1;
-                    for (int i = 0; i < 6; i++)
-                    {
-                        delete[] displ[i];
-                    }
-                    delete[] displ;
-                }
-                else
-                // the analytical method for evaluating 2nd derivatives
-                // it is not used currently
-                {
-                    // Add it here, but do not run it. If there is a need to run this code 
-                    // in the future, include it in the previous initialization process.
-                    for (int iw=0; iw< atom->nw; ++iw)
-                    {
-                        if ( atom->iw2_new[iw] )
-                        {
-                            it_d2psi_uniform[iw] = gt.d2psi_u[it*gt.nwmax + iw].data();
-                        }
-                    }
-                    // End of code addition section.
-
-                    std::vector<std::vector<double>> hrly;
-                    ModuleBase::Ylm::grad_rl_sph_harm(ucell.atoms[it].nwl, dr[0], dr[1], dr[2], rly, grly.get_ptr_2D());
-                    ModuleBase::Ylm::hes_rl_sph_harm(ucell.atoms[it].nwl, dr[0], dr[1], dr[2], hrly);
-                    const double position = distance / delta_r;
-
-                    const double iq = static_cast<int>(position);
-                    const int ip = static_cast<int>(position);
-                    const double x0 = position - iq;
-                    const double x1 = 1.0 - x0;
-                    const double x2 = 2.0 - x0;
-                    const double x3 = 3.0 - x0;
-                    const double x12 = x1 * x2 / 6;
-                    const double x03 = x0 * x3 / 2;
-
-                    double tmp, dtmp, ddtmp;
-
-                    for (int iw = 0; iw < atom->nw; ++iw)
-                    {
-                        // this is a new 'l', we need 1D orbital wave
-                        // function from interpolation method.
-                        if (atom->iw2_new[iw])
-                        {
-                            auto psi_uniform = it_psi_uniform[iw];
-                            auto dpsi_uniform = it_dpsi_uniform[iw];
-                            auto ddpsi_uniform = it_d2psi_uniform[iw];
-
-                            // if ( iq[id] >= philn.nr_uniform-4)
-                            if (iq >= it_psi_nr_uniform[iw]-4)
-                            {
-                                tmp = dtmp = ddtmp = 0.0;
-                            }
-                            else
-                            {
-                                // use Polynomia Interpolation method to get the
-                                // wave functions
-
-                                tmp = x12 * (psi_uniform[ip] * x3 + psi_uniform[ip + 3] * x0)
-                                      + x03 * (psi_uniform[ip + 1] * x2 - psi_uniform[ip + 2] * x1);
-
-                                dtmp = x12 * (dpsi_uniform[ip] * x3 + dpsi_uniform[ip + 3] * x0)
-                                       + x03 * (dpsi_uniform[ip + 1] * x2 - dpsi_uniform[ip + 2] * x1);
-
-                                ddtmp = x12 * (ddpsi_uniform[ip] * x3 + ddpsi_uniform[ip + 3] * x0)
-                                        + x03 * (ddpsi_uniform[ip + 1] * x2 - ddpsi_uniform[ip + 2] * x1);
-                            }
-                        } // new l is used.
-
-                        // get the 'l' of this localized wave function
-                        const int ll = atom->iw2l[iw];
-                        const int idx_lm = atom->iw2_ylm[iw];
-
-                        const double rl = pow_int(distance, ll);
-                        const double r_lp2 =rl * distance * distance;
-
-                        // d/dr (R_l / r^l)
-                        const double tmpdphi = (dtmp - tmp * ll / distance) / rl;
-                        const double term1 = ddtmp / r_lp2;
-                        const double term2 = (2 * ll + 1) * dtmp / r_lp2 / distance;
-                        const double term3 = ll * (ll + 2) * tmp / r_lp2 / distance / distance;
-                        const double term4 = tmpdphi / distance;
-                        const double term5 = term1 - term2 + term3;
-
-                        // hessian of (R_l / r^l)
-                        const double term_xx = term4 + dr[0] * dr[0] * term5;
-                        const double term_xy = dr[0] * dr[1] * term5;
-                        const double term_xz = dr[0] * dr[2] * term5;
-                        const double term_yy = term4 + dr[1] * dr[1] * term5;
-                        const double term_yz = dr[1] * dr[2] * term5;
-                        const double term_zz = term4 + dr[2] * dr[2] * term5;
-
-                        // d/dr (R_l / r^l) * alpha / r
-                        const double term_1x = dr[0] * term4;
-                        const double term_1y = dr[1] * term4;
-                        const double term_1z = dr[2] * term4;
-
-                        p_ddpsi_xx[iw]
-                            = term_xx * rly[idx_lm] + 2.0 * term_1x * grly[idx_lm][0] + tmp / rl * hrly[idx_lm][0];
-                        p_ddpsi_xy[iw] = term_xy * rly[idx_lm] + term_1x * grly[idx_lm][1] + term_1y * grly[idx_lm][0]
-                                         + tmp / rl * hrly[idx_lm][1];
-                        p_ddpsi_xz[iw] = term_xz * rly[idx_lm] + term_1x * grly[idx_lm][2] + term_1z * grly[idx_lm][0]
-                                         + tmp / rl * hrly[idx_lm][2];
-                        p_ddpsi_yy[iw]
-                            = term_yy * rly[idx_lm] + 2.0 * term_1y * grly[idx_lm][1] + tmp / rl * hrly[idx_lm][3];
-                        p_ddpsi_yz[iw] = term_yz * rly[idx_lm] + term_1y * grly[idx_lm][2] + term_1z * grly[idx_lm][1]
-                                         + tmp / rl * hrly[idx_lm][4];
-                        p_ddpsi_zz[iw]
-                            = term_zz * rly[idx_lm] + 2.0 * term_1z * grly[idx_lm][2] + tmp / rl * hrly[idx_lm][5];
-
-                    } // iw
-                }     // end if
-            }         // else
-        }             // end ib
-    }                 // end id(atom)
-    ModuleBase::timer::tick("Gint_Tools", "cal_ddpsir_ylm");
-    return;
-}
-}
\ No newline at end of file
diff --git a/source/source_lcao/module_gint/cal_dpsir_ylm.cpp b/source/source_lcao/module_gint/cal_dpsir_ylm.cpp
deleted file mode 100644
index 8b32b2fc05..0000000000
--- a/source/source_lcao/module_gint/cal_dpsir_ylm.cpp
+++ /dev/null
@@ -1,138 +0,0 @@
-#include "gint_tools.h"
-#include "source_base/timer.h"
-#include "source_base/ylm.h"
-#include "source_base/array_pool.h"
-namespace Gint_Tools{
-void cal_dpsir_ylm(
-    const Grid_Technique& gt, const int bxyz,
-    const int na_grid,                 // number of atoms on this grid
-    const int grid_index,              // 1d index of FFT index (i,j,k)
-    const double delta_r,              // delta_r of the uniform FFT grid
-    const int* const block_index,      // block_index[na_grid+1], count total number of atomis orbitals
-    const int* const block_size,       // block_size[na_grid],	number of columns of a band
-    const bool* const* const cal_flag, // cal_flag[bxyz][na_grid],	whether the atom-grid distance is larger than cutoff
-    double* const* const psir_ylm, double* const* const dpsir_ylm_x, double* const* const dpsir_ylm_y,
-    double* const* const dpsir_ylm_z)
-{
-    ModuleBase::timer::tick("Gint_Tools", "cal_dpsir_ylm");
-    const UnitCell& ucell = *gt.ucell;
-    std::vector<const double*> it_psi_uniform(gt.nwmax);
-    std::vector<const double*> it_dpsi_uniform(gt.nwmax);
-    std::vector<int> it_psi_nr_uniform(gt.nwmax);
-    // array to store spherical harmonics and its derivatives
-    // the first dimension equals 36 because the maximum nwl is 5.
-    double rly[36];
-    ModuleBase::Array_Pool<double> grly(36, 3);
-
-    for (int id = 0; id < na_grid; id++)
-    {
-        const int mcell_index = gt.bcell_start[grid_index] + id;
-        const int imcell = gt.which_bigcell[mcell_index];
-        int iat = gt.which_atom[mcell_index];
-        const int it = ucell.iat2it[iat];
-        const int ia = ucell.iat2ia[iat];
-        Atom* atom = &ucell.atoms[it];
-
-        const double mt[3] = {gt.meshball_positions[imcell][0] - gt.tau_in_bigcell[iat][0],
-                              gt.meshball_positions[imcell][1] - gt.tau_in_bigcell[iat][1],
-                              gt.meshball_positions[imcell][2] - gt.tau_in_bigcell[iat][2]};
-        // preprocess index
-        for (int iw=0; iw< atom->nw; ++iw)
-        {
-            if ( atom->iw2_new[iw] )
-            {
-                it_psi_uniform[iw]= gt.psi_u[it*gt.nwmax + iw].data();
-                it_dpsi_uniform[iw] = gt.dpsi_u[it*gt.nwmax + iw].data();
-                it_psi_nr_uniform[iw]= gt.psi_u[it*gt.nwmax + iw].size();
-            }
-        }
-
-        for (int ib = 0; ib < bxyz; ib++)
-        {
-            double* const p_psi = &psir_ylm[ib][block_index[id]];
-            double* const p_dpsi_x = &dpsir_ylm_x[ib][block_index[id]];
-            double* const p_dpsi_y = &dpsir_ylm_y[ib][block_index[id]];
-            double* const p_dpsi_z = &dpsir_ylm_z[ib][block_index[id]];
-            if (!cal_flag[ib][id])
-            {
-                ModuleBase::GlobalFunc::ZEROS(p_psi, block_size[id]);
-                ModuleBase::GlobalFunc::ZEROS(p_dpsi_x, block_size[id]);
-                ModuleBase::GlobalFunc::ZEROS(p_dpsi_y, block_size[id]);
-                ModuleBase::GlobalFunc::ZEROS(p_dpsi_z, block_size[id]);
-            }
-            else
-            {
-                const double dr[3]
-                    = {// vectors between atom and grid
-                       gt.meshcell_pos[ib][0] + mt[0], gt.meshcell_pos[ib][1] + mt[1], gt.meshcell_pos[ib][2] + mt[2]};
-                double distance = std::sqrt(dr[0] * dr[0] + dr[1] * dr[1] + dr[2] * dr[2]);
-
-                ModuleBase::Ylm::grad_rl_sph_harm(ucell.atoms[it].nwl, dr[0], dr[1], dr[2], rly, grly.get_ptr_2D());
-                if (distance < 1e-9) {
-                    distance = 1e-9;
-}
-
-                const double position = distance / delta_r;
-
-                const double iq = static_cast<int>(position);
-                const int ip = static_cast<int>(position);
-                const double x0 = position - iq;
-                const double x1 = 1.0 - x0;
-                const double x2 = 2.0 - x0;
-                const double x3 = 3.0 - x0;
-                const double x12 = x1 * x2 / 6;
-                const double x03 = x0 * x3 / 2;
-
-                double tmp, dtmp;
-
-                for (int iw = 0; iw < atom->nw; ++iw)
-                {
-		
-                    // this is a new 'l', we need 1D orbital wave
-                    // function from interpolation method.
-                    if (atom->iw2_new[iw])
-                    {
-                        auto psi_uniform = it_psi_uniform[iw];
-                        auto dpsi_uniform = it_dpsi_uniform[iw];
-                        // if ( iq[id] >= philn.nr_uniform-4)
-                        if (iq >= it_psi_nr_uniform[iw] - 4)
-                        {
-                            tmp = dtmp = 0.0;
-                        }
-                        else
-                        {
-                            // use Polynomia Interpolation method to get the
-                            // wave functions
-
-                            tmp = x12 * (psi_uniform[ip] * x3 + psi_uniform[ip + 3] * x0)
-                                  + x03 * (psi_uniform[ip + 1] * x2 - psi_uniform[ip + 2] * x1);
-
-                            dtmp = x12 * (dpsi_uniform[ip] * x3 + dpsi_uniform[ip + 3] * x0)
-                                   + x03 * (dpsi_uniform[ip + 1] * x2 - dpsi_uniform[ip + 2] * x1);
-                        }
-                    } // new l is used.
-
-                    // get the 'l' of this localized wave function
-                    const int ll = atom->iw2l[iw];
-                    const int idx_lm = atom->iw2_ylm[iw];
-
-                    const double rl = pow_int(distance, ll);
-                    const double tmprl = tmp / rl;
-                    
-                    // 3D wave functions
-                    p_psi[iw] = tmprl * rly[idx_lm];
-
-                    // derivative of wave functions with respect to atom positions.
-                    const double tmpdphi_rly = (dtmp - tmp * ll / distance) / rl * rly[idx_lm] / distance;
-
-                    p_dpsi_x[iw] = tmpdphi_rly * dr[0] + tmprl * grly[idx_lm][0];
-                    p_dpsi_y[iw] = tmpdphi_rly * dr[1] + tmprl * grly[idx_lm][1];
-                    p_dpsi_z[iw] = tmpdphi_rly * dr[2] + tmprl * grly[idx_lm][2];
-                } // iw
-            }     // else
-        }
-    }
-    ModuleBase::timer::tick("Gint_Tools", "cal_dpsir_ylm");
-    return;
-}
-}
\ No newline at end of file
diff --git a/source/source_lcao/module_gint/cal_psir_ylm.cpp b/source/source_lcao/module_gint/cal_psir_ylm.cpp
deleted file mode 100644
index 4eeedd19a5..0000000000
--- a/source/source_lcao/module_gint/cal_psir_ylm.cpp
+++ /dev/null
@@ -1,113 +0,0 @@
-#include "gint_tools.h"
-#include "source_base/timer.h"
-#include "source_base/ylm.h"
-namespace Gint_Tools{
-void cal_psir_ylm(
-    const Grid_Technique& gt,
-    const int bxyz,
-    const int na_grid,            // number of atoms on this grid
-    const int grid_index,         // 1d index of FFT index (i,j,k)
-    const double delta_r,         // delta_r of the uniform FFT grid
-    const int* const block_index, // block_index[na_grid+1], count total number of atomis orbitals
-    const int* const block_size,  // block_size[na_grid],	number of columns of a band
-    const bool* const* const cal_flag,
-    double* const* const psir_ylm) // cal_flag[bxyz][na_grid],	whether the atom-grid distance is larger than cutoff
-{
-//    ModuleBase::timer::tick("Gint_Tools", "cal_psir_ylm");
-    std::vector<double> ylma;
-    const UnitCell& ucell = *gt.ucell;
-    std::vector<const double*> it_psi_uniform(gt.nwmax);
-    std::vector<const double*> it_dpsi_uniform(gt.nwmax);
-
-    for (int id = 0; id < na_grid; id++)
-    {
-        // there are two parameters we want to know here:
-        // in which bigcell of the meshball the atom is in?
-        // what's the cartesian coordinate of the bigcell?
-        const int mcell_index = gt.bcell_start[grid_index] + id;
-
-        const int iat = gt.which_atom[mcell_index]; // index of atom
-        const int it = ucell.iat2it[iat];           // index of atom type
-        const Atom* const atom = &ucell.atoms[it];
-        std::vector<const double*> it_psi_uniform(atom->nw);
-        std::vector<const double*> it_dpsi_uniform(atom->nw);
-        // preprocess index
-        for (int iw = 0; iw < atom->nw; ++iw)
-        {
-            if (atom->iw2_new[iw])
-            {
-                it_psi_uniform[iw]= gt.psi_u[it*gt.nwmax + iw].data();
-                it_dpsi_uniform[iw] = gt.dpsi_u[it*gt.nwmax + iw].data();
-            }
-        }
-
-        // meshball_positions should be the bigcell position in meshball
-        // to the center of meshball.
-        // calculated in cartesian coordinates
-        // the std::vector from the grid which is now being operated to the atom position.
-        // in meshball language, is the std::vector from imcell to the center cel, plus
-        // tau_in_bigcell.
-        const int imcell = gt.which_bigcell[mcell_index];
-        const double mt[3] = {gt.meshball_positions[imcell][0] - gt.tau_in_bigcell[iat][0],
-                              gt.meshball_positions[imcell][1] - gt.tau_in_bigcell[iat][1],
-                              gt.meshball_positions[imcell][2] - gt.tau_in_bigcell[iat][2]};
-
-        // number of grids in each big cell (bxyz)
-        for (int ib = 0; ib < bxyz; ib++)
-        {
-            double* p = &psir_ylm[ib][block_index[id]];
-            if (!cal_flag[ib][id])
-            {
-                ModuleBase::GlobalFunc::ZEROS(p, block_size[id]);
-            }
-            else
-            {
-                // meshcell_pos: z is the fastest
-                const double dr[3]
-                    = {gt.meshcell_pos[ib][0] + mt[0], gt.meshcell_pos[ib][1] + mt[1], gt.meshcell_pos[ib][2] + mt[2]};
-                double distance
-                    = std::sqrt(dr[0] * dr[0] + dr[1] * dr[1] + dr[2] * dr[2]); // distance between atom and grid
-                // if(distance[id] > gt.orbital_rmax) continue;
-                if (distance < 1.0E-9)
-                    distance += 1.0E-9;
-
-                //------------------------------------------------------
-                // spherical harmonic functions Ylm
-                //------------------------------------------------------
-                //	Ylm::get_ylm_real(this->nnn[it], this->dr[id], ylma);
-                ModuleBase::Ylm::sph_harm(ucell.atoms[it].nwl, dr[0] / distance, dr[1] / distance, dr[2] / distance,
-                                          ylma);
-                // these parameters are related to interpolation
-                // because once the distance from atom to grid point is known,
-                // we can obtain the parameters for interpolation and
-                // store them first! these operations can save lots of efforts.
-                const double position = distance / delta_r;
-                const int ip = static_cast<int>(position);
-                const double dx = position - ip;
-                const double dx2 = dx * dx;
-                const double dx3 = dx2 * dx;
-
-                const double c3 = 3.0 * dx2 - 2.0 * dx3;
-                const double c1 = 1.0 - c3;
-                const double c2 = (dx - 2.0 * dx2 + dx3) * delta_r;
-                const double c4 = (dx3 - dx2) * delta_r;
-
-                double phi = 0;
-                for (int iw = 0; iw < atom->nw; ++iw)
-                {
-                    if (atom->iw2_new[iw])
-                    {
-                        auto psi_uniform = it_psi_uniform[iw];
-                        auto dpsi_uniform = it_dpsi_uniform[iw];
-                        phi = c1 * psi_uniform[ip] + c2 * dpsi_uniform[ip] // radial wave functions
-                              + c3 * psi_uniform[ip + 1] + c4 * dpsi_uniform[ip + 1];
-                    }
-                    p[iw] = phi * ylma[atom->iw2_ylm[iw]];
-                } // end iw
-            }     // end distance<=(rcuts[it]-1.0e-15)
-        }         // end ib
-    }             // end id
-//    ModuleBase::timer::tick("Gint_Tools", "cal_psir_ylm");
-    return;
-}
-}
diff --git a/source/source_lcao/module_gint/temp_gint/divide_info.cpp b/source/source_lcao/module_gint/divide_info.cpp
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/divide_info.cpp
rename to source/source_lcao/module_gint/divide_info.cpp
diff --git a/source/source_lcao/module_gint/temp_gint/divide_info.h b/source/source_lcao/module_gint/divide_info.h
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/divide_info.h
rename to source/source_lcao/module_gint/divide_info.h
diff --git a/source/source_lcao/module_gint/temp_gint/gint.cpp b/source/source_lcao/module_gint/gint.cpp
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/gint.cpp
rename to source/source_lcao/module_gint/gint.cpp
diff --git a/source/source_lcao/module_gint/gint.h b/source/source_lcao/module_gint/gint.h
index 3c447b7e4f..1255bae971 100644
--- a/source/source_lcao/module_gint/gint.h
+++ b/source/source_lcao/module_gint/gint.h
@@ -1,275 +1,26 @@
-#ifndef GINT_INTERFACE
-#define GINT_INTERFACE
-
-#include "gint_tools.h"
-#include "source_cell/module_neighbor/sltk_grid_driver.h"
-#include "source_lcao/module_gint/grid_technique.h"
-#include "source_lcao/module_hcontainer/hcontainer.h"
-#include <functional>
-
-//----------------------------------------------------------
-//！This class provides a unified interface to the
-//！grid intergration operation used to calculate
-//！electron density, and the contribution of local
-//！potential to Hamiltonian and force/stress.
-//！There are two derived classes of this class
-//! namely Gint_Gamma and Gint_k, which contain
-//! specific operations for gamma point/multi-k calculations
-//----------------------------------------------------------
-
-class Gint {
-  public:
-    ~Gint();
-
-    //! move operator for the next ESolver to directly use its infomation
-    Gint& operator=(Gint&& rhs);
-
-    hamilt::HContainer<double>* get_hRGint() const { return hRGint; }
-
-    std::vector<hamilt::HContainer<double>*> get_DMRGint() const { return dmr_gint; }
-
-    int get_ncxyz() const { return ncxyz; }
-
-    //! the unified interface to grid integration
-    void cal_gint(Gint_inout* inout);
-
-    //! preparing FFT grid
-    void prep_grid(const Grid_Technique& gt,
-                   const int& nbx_in,
-                   const int& nby_in,
-                   const int& nbz_in,
-                   const int& nbz_start_in,
-                   const int& ncxyz_in,
-                   const int& bx_in,
-                   const int& by_in,
-                   const int& bz_in,
-                   const int& bxyz_in,
-                   const int& nbxx_in,
-                   const int& ny_in,
-                   const int& nplane_in,
-                   const int& startz_current_in,
-                   const UnitCell* ucell_in,
-                   const LCAO_Orbitals* orb_in);
-
-    /**
-     * @brief calculate the neighbor atoms of each atom in this processor
-     * size of BaseMatrix with be the non-parallel version
-     */
-    void initialize_pvpR(const UnitCell& unitcell, const Grid_Driver* gd, const int& nspin);
-
-    /**
-     * @brief resize dmr_gint to nspin and reallocate the memory
-     */
-    void reset_DMRGint(const int& nspin);
-
-    /**
-     * @brief transfer DMR (2D para) to DMR (Grid para) in elecstate_lcao.cpp
-     */
-    void transfer_DM2DtoGrid(std::vector<hamilt::HContainer<double>*> dm2d);
-
-    const Grid_Technique* gridt = nullptr;
-    const UnitCell* ucell;
-
-    // psir_ylm_new = psir_func(psir_ylm)
-    // psir_func==nullptr means psir_ylm_new=psir_ylm
-    using T_psir_func = std::function<
-        const ModuleBase::Array_Pool<double>&(
-            const ModuleBase::Array_Pool<double> &psir_ylm,
-            const Grid_Technique &gt,
-            const int grid_index,
-            const int is,
-            const std::vector<int> &block_iw,
-            const std::vector<int> &block_size,
-            const std::vector<int> &block_index,
-            const ModuleBase::Array_Pool<bool> &cal_flag)>;
-
-    T_psir_func psir_func_1 = nullptr;
-    T_psir_func psir_func_2 = nullptr;
-
-  protected:
-
-    //! variables related to FFT grid
-    int nbx;
-    int nby;
-    int nbz;
-    int ncxyz;
-    int nbz_start;
-    int bx;
-    int by;
-    int bz;
-    int bxyz;
-    int nbxx;
-    int ny;
-    int nplane;
-    int startz_current; // from rhopw
-
-    //! in cal_gint_gpu.cpp
-    void gpu_vlocal_interface(Gint_inout* inout);
-
-    void gpu_rho_interface(Gint_inout* inout);
-
-    void gpu_force_interface(Gint_inout* inout);
-
-    //! in cal_gint_cpu.cpp
-    void gint_kernel_vlocal(Gint_inout* inout);
-
-    //! calculate H_mu_nu(local)=<phi_0|vlocal|dphi_R>
-    void gint_kernel_dvlocal(Gint_inout* inout);
-
-    //! calculate vlocal in meta-GGA functionals
-    void gint_kernel_vlocal_meta(Gint_inout* inout);
-
-    //! calculate charge density rho(r)=\int D_munu \phi_mu \phi_nu
-    void gint_kernel_rho(Gint_inout* inout);
-
-    //! used in meta-GGA functional
-    void gint_kernel_tau(Gint_inout* inout);
-
-    //! compute forces
-    void gint_kernel_force(Gint_inout* inout);
-
-    //! compute forces related to meta-GGA functionals
-    void gint_kernel_force_meta(Gint_inout* inout);
-
-    //! calculate local potential contribution to the Hamiltonian
-    //! na_grid: how many atoms on this (i,j,k) grid
-    //! block_size: dim is [block_size], number of columns of a band
-    //! block_index: dim is [na_grid+1], total number of atomic orbitals
-    //! grid_index: index of grid group, for tracing iat
-    //! cal_flag: dim is [bxyz][na_grid], whether the atom-grid distance is larger than cutoff
-    //! psir_ylm: dim is [bxyz][LD_pool]
-    //! psir_vlbr3: dim is [bxyz][LD_pool]
-    //! hR: HContainer for storing the <phi_0|V|phi_R> matrix elements
-    //! cal_meshball_vlocal is thread-safe!
-    void cal_meshball_vlocal(
-        const int na_grid,
-        const int LD_pool,
-        const int* const block_size,
-        const int* const block_index,
-        const int grid_index,
-        const bool* const* const cal_flag,
-        const double* const* const psir_ylm,
-        const double* const* const psir_vlbr3,
-        hamilt::HContainer<double>* hR);
-
-    //! in gint_fvl.cpp
-    //! calculate vl contributuion to force & stress via grid integrals
-    void gint_kernel_force(const int na_grid,
-                           const int grid_index,
-                           const double delta_r,
-                           double* vldr3,
-                           const int is,
-                           const bool isforce,
-                           const bool isstress,
-                           ModuleBase::matrix* fvl_dphi,
-                           ModuleBase::matrix* svl_dphi,
-                           const UnitCell& ucell);
-
-    //! in gint_fvl.cpp
-    //! calculate vl contributuion to force & stress via grid integrals
-    //! used in meta-GGA calculations
-    void gint_kernel_force_meta(const int na_grid,
-                                const int grid_index,
-                                const double delta_r,
-                                double* vldr3,
-                                double* vkdr3,
-                                const int is,
-                                const bool isforce,
-                                const bool isstress,
-                                ModuleBase::matrix* fvl_dphi,
-                                ModuleBase::matrix* svl_dphi,
-                                const UnitCell& ucell);
-
-    //! Use grid integrals to compute the atomic force contributions
-    //! na_grid: how many atoms on this (i,j,k) grid
-    //! block_size: dim is [na_grid], number of columns of a band
-    //! block_index: dim is [na_grid+1], total number of atomis orbitals
-    //! psir_vlbr3_DMR: dim is [bxyz][LD_pool]
-    //! dpsir_x: dim is [bxyz][LD_pool]
-    //! dpsir_y: dim is [bxyz][LD_pool]
-    //! dpsir_z: dim is [bxyz][LD_pool]
-    void cal_meshball_force(
-        const int grid_index,
-        const int na_grid,
-        const int* const block_size,
-        const int* const block_index,
-        const double* const* const psir_vlbr3_DMR,
-        const double* const* const dpsir_x,        // psir_vlbr3[bxyz][LD_pool]
-        const double* const* const dpsir_y,        // psir_vlbr3[bxyz][LD_pool]
-        const double* const* const dpsir_z,        // psir_vlbr3[bxyz][LD_pool]
-        ModuleBase::matrix* force);
-
-    //! Use grid integrals to compute the stress contributions
-    //! na_grid: how many atoms on this (i,j,k) grid
-    //! block_index: dim is [na_grid+1], total number of atomis orbitals
-    void cal_meshball_stress(
-        const int na_grid,
-        const int*const block_index,
-        const double*const psir_vlbr3_DMR,
-        const double*const dpsirr,
-        ModuleBase::matrix *stress);
-    
-    //! Use grid integrals to compute charge density
-    //! in gint_k_rho.cpp
-    //! calculate the charge density & kinetic energy density (tau) via grid integrals
-    void gint_kernel_rho(const int na_grid,
-                         const int grid_index,
-                         const double delta_r,
-                         int* vindex,
-                         const int LD_pool,
-                         const UnitCell& ucell,
-                         Gint_inout* inout);
-
-    //! Use grid integrals to compute charge density in a meshball
-    void cal_meshball_rho(const int na_grid,
-                          const int*const block_index,
-                          const int*const vindex,
-                          const double*const*const psir_ylm,
-                          const double*const*const psir_DMR,
-                          double*const rho);
-
-    //! Use grid integrals to compute kinetic energy density tau 
-    //！in meta-GGA functional 
-    void gint_kernel_tau(const int na_grid,
-                         const int grid_index,
-                         const double delta_r,
-                         int* vindex,
-                         const int LD_pool,
-                         Gint_inout* inout,
-                         const UnitCell& ucell);
-
-    //! Use grid integrals to compute kinetic energy density tau
-    //！in a meshball, used in meta-GGA functional calculations
-    void cal_meshball_tau(const int na_grid,
-                          int* block_index,
-                          int* vindex,
-                          double** dpsix,
-                          double** dpsiy,
-                          double** dpsiz,
-                          double** dpsix_dm,
-                          double** dpsiy_dm,
-                          double** dpsiz_dm,
-                          double* rho);
-
-    //! save the < phi_0i | V | phi_Rj > in sparse H matrix.
-    //! stores Hamiltonian in sparse format
-    hamilt::HContainer<double>* hRGint = nullptr; 
-
-    //! size of vec is 4, only used when nspin = 4
-    std::vector<hamilt::HContainer<double>*> hr_gint_tmp; 
-
-    //! stores Hamiltonian in sparse format
-    hamilt::HContainer<std::complex<double>>* hRGintCd = nullptr; 
-
-    //! stores DMR in sparse format
-    std::vector<hamilt::HContainer<double>*> dmr_gint; 
-
-    //! tmp tools used in transfer_DM2DtoGrid 
-    hamilt::HContainer<double>* dm2d_tmp = nullptr;
-
-    std::vector<hamilt::HContainer<double>> pvdpRx_reduced;
-    std::vector<hamilt::HContainer<double>> pvdpRy_reduced;
-    std::vector<hamilt::HContainer<double>> pvdpRz_reduced;
+#pragma once
+#include <memory>
+#include "gint_info.h"
+#include "gint_type.h"
+
+namespace ModuleGint
+{
+
+class Gint
+{
+    public:
+    Gint() = default;
+    virtual ~Gint() = default;
+
+    // note that gint_info_ is a static member variable
+    // it is shared by all instances of Gint
+    static void set_gint_info(GintInfo* gint_info)
+    {
+        gint_info_ = gint_info;
+    }
+
+    protected:
+    static GintInfo* gint_info_;
 };
 
-#endif
+}
\ No newline at end of file
diff --git a/source/source_lcao/module_gint/temp_gint/gint_atom.cpp b/source/source_lcao/module_gint/gint_atom.cpp
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/gint_atom.cpp
rename to source/source_lcao/module_gint/gint_atom.cpp
diff --git a/source/source_lcao/module_gint/temp_gint/gint_atom.h b/source/source_lcao/module_gint/gint_atom.h
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/gint_atom.h
rename to source/source_lcao/module_gint/gint_atom.h
diff --git a/source/source_lcao/module_gint/temp_gint/gint_common.cpp b/source/source_lcao/module_gint/gint_common.cpp
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/gint_common.cpp
rename to source/source_lcao/module_gint/gint_common.cpp
diff --git a/source/source_lcao/module_gint/temp_gint/gint_common.h b/source/source_lcao/module_gint/gint_common.h
similarity index 94%
rename from source/source_lcao/module_gint/temp_gint/gint_common.h
rename to source/source_lcao/module_gint/gint_common.h
index 0e04a7cffc..180bc9e8ea 100644
--- a/source/source_lcao/module_gint/temp_gint/gint_common.h
+++ b/source/source_lcao/module_gint/gint_common.h
@@ -1,6 +1,6 @@
 #pragma once
 #include "source_lcao/module_hcontainer/hcontainer.h"
-#include "source_lcao/module_gint/temp_gint/gint_info.h"
+#include "source_lcao/module_gint/gint_info.h"
 
 namespace ModuleGint
 {
diff --git a/source/source_lcao/module_gint/temp_gint/gint_dvlocal.cpp b/source/source_lcao/module_gint/gint_dvlocal.cpp
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/gint_dvlocal.cpp
rename to source/source_lcao/module_gint/gint_dvlocal.cpp
diff --git a/source/source_lcao/module_gint/temp_gint/gint_dvlocal.h b/source/source_lcao/module_gint/gint_dvlocal.h
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/gint_dvlocal.h
rename to source/source_lcao/module_gint/gint_dvlocal.h
diff --git a/source/source_lcao/module_gint/temp_gint/gint_env_gamma.cpp b/source/source_lcao/module_gint/gint_env_gamma.cpp
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/gint_env_gamma.cpp
rename to source/source_lcao/module_gint/gint_env_gamma.cpp
diff --git a/source/source_lcao/module_gint/temp_gint/gint_env_gamma.h b/source/source_lcao/module_gint/gint_env_gamma.h
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/gint_env_gamma.h
rename to source/source_lcao/module_gint/gint_env_gamma.h
diff --git a/source/source_lcao/module_gint/temp_gint/gint_env_k.cpp b/source/source_lcao/module_gint/gint_env_k.cpp
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/gint_env_k.cpp
rename to source/source_lcao/module_gint/gint_env_k.cpp
diff --git a/source/source_lcao/module_gint/temp_gint/gint_env_k.h b/source/source_lcao/module_gint/gint_env_k.h
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/gint_env_k.h
rename to source/source_lcao/module_gint/gint_env_k.h
diff --git a/source/source_lcao/module_gint/gint_force_cpu_interface.cpp b/source/source_lcao/module_gint/gint_force_cpu_interface.cpp
deleted file mode 100644
index f4f346783d..0000000000
--- a/source/source_lcao/module_gint/gint_force_cpu_interface.cpp
+++ /dev/null
@@ -1,313 +0,0 @@
-#include "gint.h"
-#include "source_base/memory.h"
-#include "source_base/timer.h"
-
-void Gint::gint_kernel_force(Gint_inout* inout) {
-    ModuleBase::TITLE("Gint_interface", "cal_gint_force");
-    ModuleBase::timer::tick("Gint_interface", "cal_gint_force");
-    const UnitCell& ucell = *this->ucell;
-    const int max_size = this->gridt->max_atom;
-    const int ncyz = this->ny * this->nplane;
-    const double dv = ucell.omega / this->ncxyz;
-    const double delta_r = this->gridt->dr_uniform;
-
-
-#pragma omp parallel 
-{
-    ModuleBase::matrix* fvl_dphi_thread=inout->fvl_dphi;
-    ModuleBase::matrix* svl_dphi_thread=inout->svl_dphi;
-    if (inout->isforce) {
-        fvl_dphi_thread=new ModuleBase::matrix(*inout->fvl_dphi);
-        fvl_dphi_thread->zero_out();
-    }
-    if (inout->isstress) {
-        svl_dphi_thread=new ModuleBase::matrix(*inout->svl_dphi);
-        svl_dphi_thread->zero_out();
-    }
-    std::vector<int> block_iw(max_size,0);
-    std::vector<int> block_index(max_size+1,0);
-    std::vector<int> block_size(max_size,0);
-    std::vector<double> vldr3(this->bxyz,0.0);
-#pragma omp for schedule(dynamic)
-    for (int grid_index = 0; grid_index < this->nbxx; grid_index++) {
-        const int na_grid = this->gridt->how_many_atoms[grid_index];
-        if (na_grid == 0) {
-            continue;
-        }
-        Gint_Tools::get_gint_vldr3(vldr3.data(),
-                                    inout->vl,
-                                    this->bxyz,
-                                    this->bx,
-                                    this->by,
-                                    this->bz,
-                                    this->nplane,
-                                    this->gridt->start_ind[grid_index],
-                                    ncyz,
-                                    dv);
-         //prepare block information
-        ModuleBase::Array_Pool<bool> cal_flag(this->bxyz,max_size);
-        Gint_Tools::get_block_info(*this->gridt, this->bxyz, na_grid, grid_index,
-                                            block_iw.data(), block_index.data(), block_size.data(), 
-                                            cal_flag.get_ptr_2D());
-        const int LD_pool = block_index[na_grid];
-
-    //evaluate psi and dpsi on grids
-        ModuleBase::Array_Pool<double> psir_ylm(this->bxyz, LD_pool);
-        ModuleBase::Array_Pool<double> dpsir_ylm_x(this->bxyz, LD_pool);
-        ModuleBase::Array_Pool<double> dpsir_ylm_y(this->bxyz, LD_pool);
-        ModuleBase::Array_Pool<double> dpsir_ylm_z(this->bxyz, LD_pool);
-
-        Gint_Tools::cal_dpsir_ylm(*this->gridt, this->bxyz, na_grid, grid_index, delta_r,	
-                                    block_index.data(), block_size.data(),
-                                    cal_flag.get_ptr_2D(),psir_ylm.get_ptr_2D(),
-                                    dpsir_ylm_x.get_ptr_2D(), dpsir_ylm_y.get_ptr_2D(), dpsir_ylm_z.get_ptr_2D());
-
-    //calculating f_mu(r) = v(r)*psi_mu(r)*dv
-        const ModuleBase::Array_Pool<double> psir_vlbr3 = 
-                Gint_Tools::get_psir_vlbr3(this->bxyz, na_grid, LD_pool, block_index.data(), 
-                cal_flag.get_ptr_2D(), vldr3.data(), psir_ylm.get_ptr_2D());
-
-        ModuleBase::Array_Pool<double> psir_vlbr3_DM(this->bxyz, LD_pool);
-        ModuleBase::GlobalFunc::ZEROS(psir_vlbr3_DM.get_ptr_1D(), this->bxyz*LD_pool);
-
-	//calculating g_mu(r) = sum_nu rho_mu,nu f_nu(r)
-        Gint_Tools::mult_psi_DMR(
-                *this->gridt, 
-                this->bxyz,
-                LD_pool, 
-                grid_index, 
-                na_grid, 
-                block_index.data(), 
-                block_size.data(), 
-                cal_flag.get_ptr_2D(),
-                psir_vlbr3.get_ptr_2D(), 
-                psir_vlbr3_DM.get_ptr_2D(), 
-                this->dmr_gint[inout->ispin], 
-                false);
-
-        if(inout->isforce)
-        {
-            //do integration to get force
-            this-> cal_meshball_force(grid_index, na_grid, block_size.data(), block_index.data(),
-                                        psir_vlbr3_DM.get_ptr_2D(), dpsir_ylm_x.get_ptr_2D(),
-                                        dpsir_ylm_y.get_ptr_2D(), dpsir_ylm_z.get_ptr_2D(),
-                                        fvl_dphi_thread);
-        }
-        if(inout->isstress)
-        {
-            //calculating g_mu(r)*(r-R) where R is the location of atom
-
-            // The array dpsirr contains derivatives of psir in the xx, xy, xz, yy, yz, zz directions,
-            // with each set of six numbers representing the derivatives in these respective directions.
-            ModuleBase::Array_Pool<double> dpsirr_ylm(this->bxyz, LD_pool * 6);
-            Gint_Tools::cal_dpsirr_ylm(*this->gridt, this->bxyz, na_grid, grid_index, block_index.data(), 
-                                        block_size.data(), cal_flag.get_ptr_2D(),dpsir_ylm_x.get_ptr_2D(), 
-                                        dpsir_ylm_y.get_ptr_2D(),dpsir_ylm_z.get_ptr_2D(),
-                                        dpsirr_ylm.get_ptr_2D());
-
-            //do integration to get stress
-            this-> cal_meshball_stress(na_grid, block_index.data(), psir_vlbr3_DM.get_ptr_1D(), 
-                                       dpsirr_ylm.get_ptr_1D(), svl_dphi_thread);
-        }
-    }
-#pragma omp critical(gint)
-    {
-        if (inout->isforce) {
-            inout->fvl_dphi[0] += fvl_dphi_thread[0];
-            delete fvl_dphi_thread;
-        }
-        if (inout->isstress) {
-            inout->svl_dphi[0] += svl_dphi_thread[0];
-            delete svl_dphi_thread;
-        }
-    }
-}
-    ModuleBase::TITLE("Gint_interface", "cal_gint_force");
-    ModuleBase::timer::tick("Gint_interface", "cal_gint_force");
-}
-
-void Gint::gint_kernel_force_meta(Gint_inout* inout) {
-    ModuleBase::TITLE("Gint_interface", "cal_gint_force_meta");
-    ModuleBase::timer::tick("Gint_interface", "cal_gint_force_meta");
-    const UnitCell& ucell = *this->ucell;
-    const int max_size = this->gridt->max_atom;
-    const int ncyz = this->ny * this->nplane;
-    const double dv = ucell.omega / this->ncxyz;
-    const double delta_r = this->gridt->dr_uniform;
-
-
-#pragma omp parallel 
-{
-    ModuleBase::matrix* fvl_dphi_thread=inout->fvl_dphi;
-    ModuleBase::matrix* svl_dphi_thread=inout->svl_dphi;
-    if (inout->isforce) {
-        fvl_dphi_thread=new ModuleBase::matrix(*inout->fvl_dphi);
-        fvl_dphi_thread->zero_out();
-    }
-    if (inout->isstress) {
-        svl_dphi_thread=new ModuleBase::matrix(*inout->svl_dphi);
-        svl_dphi_thread->zero_out();
-    }
-    std::vector<int> block_iw(max_size,0);
-    std::vector<int> block_index(max_size+1,0);
-    std::vector<int> block_size(max_size,0);
-    std::vector<double> vldr3(this->bxyz,0.0);
-    std::vector<double> vkdr3(this->bxyz,0.0);
-#pragma omp for schedule(dynamic)
-    for (int grid_index = 0; grid_index < this->nbxx; grid_index++) {
-        const int na_grid = this->gridt->how_many_atoms[grid_index];
-        if (na_grid == 0) {
-            continue;
-        }
-        Gint_Tools::get_gint_vldr3(vldr3.data(),
-                                    inout->vl,
-                                    this->bxyz,
-                                    this->bx,
-                                    this->by,
-                                    this->bz,
-                                    this->nplane,
-                                    this->gridt->start_ind[grid_index],
-                                    ncyz,
-                                    dv);
-
-        Gint_Tools::get_gint_vldr3(vkdr3.data(),
-                                    inout->vofk,
-                                    this->bxyz,
-                                    this->bx,
-                                    this->by,
-                                    this->bz,
-                                    this->nplane,
-                                    this->gridt->start_ind[grid_index],
-                                    ncyz,
-                                    dv);
-         //prepare block information
-        ModuleBase::Array_Pool<bool> cal_flag(this->bxyz,max_size);
-        Gint_Tools::get_block_info(*this->gridt, this->bxyz, na_grid, grid_index, 
-                                            block_iw.data(), block_index.data(), block_size.data(), cal_flag.get_ptr_2D());
-        const int LD_pool = block_index[na_grid];
-
-    //evaluate psi and dpsi on grids
-        ModuleBase::Array_Pool<double> psir_ylm(this->bxyz, LD_pool);
-        ModuleBase::Array_Pool<double> dpsir_ylm_x(this->bxyz, LD_pool);
-        ModuleBase::Array_Pool<double> dpsir_ylm_y(this->bxyz, LD_pool);
-        ModuleBase::Array_Pool<double> dpsir_ylm_z(this->bxyz, LD_pool);
-        ModuleBase::Array_Pool<double> ddpsir_ylm_xx(this->bxyz, LD_pool);
-        ModuleBase::Array_Pool<double> ddpsir_ylm_xy(this->bxyz, LD_pool);
-        ModuleBase::Array_Pool<double> ddpsir_ylm_xz(this->bxyz, LD_pool);
-        ModuleBase::Array_Pool<double> ddpsir_ylm_yy(this->bxyz, LD_pool);
-        ModuleBase::Array_Pool<double> ddpsir_ylm_yz(this->bxyz, LD_pool);
-        ModuleBase::Array_Pool<double> ddpsir_ylm_zz(this->bxyz, LD_pool);
-
-	//psi and gradient of psi
-        Gint_Tools::cal_dpsir_ylm(*this->gridt, this->bxyz, na_grid, grid_index, delta_r,	block_index.data(), block_size.data(), cal_flag.get_ptr_2D(),
-            psir_ylm.get_ptr_2D(), dpsir_ylm_x.get_ptr_2D(), dpsir_ylm_y.get_ptr_2D(), dpsir_ylm_z.get_ptr_2D());
-
-	//hessian of psi
-        Gint_Tools::cal_ddpsir_ylm(*this->gridt, this->bxyz, na_grid, grid_index, delta_r, block_index.data(), block_size.data(), cal_flag.get_ptr_2D(),
-            ddpsir_ylm_xx.get_ptr_2D(), ddpsir_ylm_xy.get_ptr_2D(), ddpsir_ylm_xz.get_ptr_2D(),
-            ddpsir_ylm_yy.get_ptr_2D(), ddpsir_ylm_yz.get_ptr_2D(), ddpsir_ylm_zz.get_ptr_2D());
-
-    //calculating f_mu(r) = v(r)*psi_mu(r)*dv 
-        const ModuleBase::Array_Pool<double> psir_vlbr3 
-            = Gint_Tools::get_psir_vlbr3(this->bxyz, na_grid, LD_pool, block_index.data(), cal_flag.get_ptr_2D(), vldr3.data(), psir_ylm.get_ptr_2D());
-        const ModuleBase::Array_Pool<double> dpsir_x_vlbr3 
-            = Gint_Tools::get_psir_vlbr3(this->bxyz, na_grid, LD_pool, block_index.data(), cal_flag.get_ptr_2D(), vkdr3.data(), dpsir_ylm_x.get_ptr_2D());
-        const ModuleBase::Array_Pool<double> dpsir_y_vlbr3 
-            = Gint_Tools::get_psir_vlbr3(this->bxyz, na_grid, LD_pool, block_index.data(), cal_flag.get_ptr_2D(), vkdr3.data(), dpsir_ylm_y.get_ptr_2D());
-        const ModuleBase::Array_Pool<double> dpsir_z_vlbr3 
-            = Gint_Tools::get_psir_vlbr3(this->bxyz, na_grid, LD_pool, block_index.data(), cal_flag.get_ptr_2D(), vkdr3.data(), dpsir_ylm_z.get_ptr_2D());
-
-        ModuleBase::Array_Pool<double> psir_vlbr3_DM(this->bxyz, LD_pool);
-        ModuleBase::Array_Pool<double> dpsirx_v_DM(this->bxyz, LD_pool);
-        ModuleBase::Array_Pool<double> dpsiry_v_DM(this->bxyz, LD_pool);
-        ModuleBase::Array_Pool<double> dpsirz_v_DM(this->bxyz, LD_pool);
-
-        ModuleBase::GlobalFunc::ZEROS(psir_vlbr3_DM.get_ptr_1D(), this->bxyz*LD_pool);
-        ModuleBase::GlobalFunc::ZEROS(dpsirx_v_DM.get_ptr_1D(), this->bxyz*LD_pool);
-        ModuleBase::GlobalFunc::ZEROS(dpsiry_v_DM.get_ptr_1D(), this->bxyz*LD_pool);
-        ModuleBase::GlobalFunc::ZEROS(dpsirz_v_DM.get_ptr_1D(), this->bxyz*LD_pool);
-
-	//calculating g_mu(r) = sum_nu rho_mu,nu f_nu(r)
-        Gint_Tools::mult_psi_DMR(*this->gridt, this->bxyz, LD_pool, grid_index, 
-            na_grid, block_index.data(), block_size.data(), cal_flag.get_ptr_2D(),
-            psir_vlbr3.get_ptr_2D(), psir_vlbr3_DM.get_ptr_2D(), this->dmr_gint[inout->ispin], false);
-
-        Gint_Tools::mult_psi_DMR(*this->gridt, this->bxyz, LD_pool, grid_index, 
-            na_grid, block_index.data(), block_size.data(), cal_flag.get_ptr_2D(),
-            dpsir_x_vlbr3.get_ptr_2D(), dpsirx_v_DM.get_ptr_2D(), this->dmr_gint[inout->ispin], false);
-
-        Gint_Tools::mult_psi_DMR(*this->gridt, this->bxyz, LD_pool, grid_index,
-            na_grid, block_index.data(), block_size.data(), cal_flag.get_ptr_2D(),
-            dpsir_y_vlbr3.get_ptr_2D(), dpsiry_v_DM.get_ptr_2D(), this->dmr_gint[inout->ispin], false);
-
-        Gint_Tools::mult_psi_DMR(*this->gridt, this->bxyz, LD_pool, grid_index, 
-            na_grid, block_index.data(), block_size.data(), cal_flag.get_ptr_2D(),
-            dpsir_z_vlbr3.get_ptr_2D(), dpsirz_v_DM.get_ptr_2D(), this->dmr_gint[inout->ispin], false);
-
-        if(inout->isforce)
-        {
-            //do integration to get force
-            this-> cal_meshball_force(grid_index, na_grid, block_size.data(), block_index.data(),
-                psir_vlbr3_DM.get_ptr_2D(), dpsir_ylm_x.get_ptr_2D(), dpsir_ylm_y.get_ptr_2D(), dpsir_ylm_z.get_ptr_2D(), 
-                fvl_dphi_thread);
-                
-            this-> cal_meshball_force(grid_index, na_grid, block_size.data(), block_index.data(),
-                dpsirx_v_DM.get_ptr_2D(), ddpsir_ylm_xx.get_ptr_2D(), ddpsir_ylm_xy.get_ptr_2D(), ddpsir_ylm_xz.get_ptr_2D(), 
-                fvl_dphi_thread);
-            this-> cal_meshball_force(grid_index, na_grid, block_size.data(), block_index.data(),
-                dpsiry_v_DM.get_ptr_2D(), ddpsir_ylm_xy.get_ptr_2D(), ddpsir_ylm_yy.get_ptr_2D(), ddpsir_ylm_yz.get_ptr_2D(), 
-                fvl_dphi_thread);
-            this-> cal_meshball_force(grid_index, na_grid, block_size.data(), block_index.data(),
-                dpsirz_v_DM.get_ptr_2D(), ddpsir_ylm_xz.get_ptr_2D(), ddpsir_ylm_yz.get_ptr_2D(), ddpsir_ylm_zz.get_ptr_2D(), 
-                fvl_dphi_thread);		
-            
-        }
-        if(inout->isstress)
-        {
-            //calculating g_mu(r)*(r-R) where R is the location of atom
-            ModuleBase::Array_Pool<double> array(this->bxyz, LD_pool * 6);
-
-            //the vxc part
-            Gint_Tools::cal_dpsirr_ylm(*this->gridt, this->bxyz, na_grid, grid_index, block_index.data(), block_size.data(), cal_flag.get_ptr_2D(),
-                dpsir_ylm_x.get_ptr_2D(), dpsir_ylm_y.get_ptr_2D(),	dpsir_ylm_z.get_ptr_2D(), array.get_ptr_2D());
-            //do integration to get stress
-            this-> cal_meshball_stress(na_grid, block_index.data(), psir_vlbr3_DM.get_ptr_1D(),
-                array.get_ptr_1D(), svl_dphi_thread);
-
-            //partial x of vtau part
-            Gint_Tools::cal_dpsirr_ylm(*this->gridt, this->bxyz, na_grid, grid_index, block_index.data(), block_size.data(), cal_flag.get_ptr_2D(),
-                ddpsir_ylm_xx.get_ptr_2D(), ddpsir_ylm_xy.get_ptr_2D(),	ddpsir_ylm_xz.get_ptr_2D(), array.get_ptr_2D());
-            //do integration to get stress
-            this-> cal_meshball_stress(na_grid, block_index.data(), dpsirx_v_DM.get_ptr_1D(),
-                array.get_ptr_1D(), svl_dphi_thread);
-
-            //partial y of vtau part
-            Gint_Tools::cal_dpsirr_ylm(*this->gridt, this->bxyz, na_grid, grid_index, block_index.data(), block_size.data(), cal_flag.get_ptr_2D(),
-                ddpsir_ylm_xy.get_ptr_2D(), ddpsir_ylm_yy.get_ptr_2D(),	ddpsir_ylm_yz.get_ptr_2D(), array.get_ptr_2D());
-            //do integration to get stress
-            this-> cal_meshball_stress(na_grid, block_index.data(), dpsiry_v_DM.get_ptr_1D(),
-                array.get_ptr_1D(), svl_dphi_thread);
-
-            //partial z of vtau part
-            Gint_Tools::cal_dpsirr_ylm(*this->gridt, this->bxyz, na_grid, grid_index, block_index.data(), block_size.data(), cal_flag.get_ptr_2D(),
-                ddpsir_ylm_xz.get_ptr_2D(), ddpsir_ylm_yz.get_ptr_2D(), ddpsir_ylm_zz.get_ptr_2D(), array.get_ptr_2D());
-            //do integration to get stress
-            this-> cal_meshball_stress(na_grid, block_index.data(), dpsirz_v_DM.get_ptr_1D(),
-                array.get_ptr_1D(), svl_dphi_thread);
-        }
-    }
-#pragma omp critical(gint)
-    {
-        if (inout->isforce) {
-            inout->fvl_dphi[0] += fvl_dphi_thread[0];
-            delete fvl_dphi_thread;
-        }
-        if (inout->isstress) {
-            inout->svl_dphi[0] += svl_dphi_thread[0];
-            delete svl_dphi_thread;
-        }
-    }
-}
-    ModuleBase::TITLE("Gint_interface", "cal_gint_force_meta");
-    ModuleBase::timer::tick("Gint_interface", "cal_gint_force_meta");
-}
diff --git a/source/source_lcao/module_gint/gint_force_gpu.cu b/source/source_lcao/module_gint/gint_force_gpu.cu
deleted file mode 100644
index cb3390aacc..0000000000
--- a/source/source_lcao/module_gint/gint_force_gpu.cu
+++ /dev/null
@@ -1,301 +0,0 @@
-#ifdef _OPENMP
-#include <omp.h>
-#endif
-
-#include "gint_force_gpu.h"
-#include "kernels/cuda/cuda_tools.cuh"
-#include "kernels/cuda/gint_force.cuh"
-#include "source_base/ylm.h"
-#include "gint_tools.h"
-
-namespace GintKernel
-{
-/**
- * @brief Calculate forces and stresses
- * @note The grid integration on the GPU is mainly divided into the following
- * steps:
- * 1. Use the CPU to divide the grid integration into subtasks.
- * 2. Copy the subtask information to the GPU.
- * 3. Calculate the matrix elements on the GPU.
- * 4. Perform matrix multiplication on the GPU.
- * 5. stress dot on the GPU.
- * 6. force dot on the GPU.
- * 7. Copy the results back to the host.
- */
-void gint_fvl_gpu(const hamilt::HContainer<double>* dm,
-                        const double* vlocal,
-                        double* force_in,
-                        double* stress_in,
-                        double dr,
-                        const double* rcut,
-                        const int isforce,
-                        const int isstress,
-                        const Grid_Technique& gridt,
-                        const UnitCell& ucell)
-{ 
-    checkCuda(cudaSetDevice(gridt.dev_id));
-    // checkCuda(cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync));
-
-    const int nbzp = gridt.nbzp;
-    const int max_atom = gridt.max_atom;
-    const int nwmax = ucell.nwmax;
-    const int bxyz = gridt.bxyz;
-    const int max_atom_per_bcell = max_atom * bxyz;
-    const int max_atom_per_z = max_atom_per_bcell * nbzp;
-    const int max_phi_per_z = max_atom_per_z * ucell.nwmax;
-    const int max_atompair_per_z = max_atom * max_atom * nbzp;
-    const double vfactor = ucell.omega / gridt.ncxyz;
-    const int nczp = nbzp * gridt.bz;
-    const int nat=ucell.nat;
-
-    const int num_streams = gridt.nstreams;
-
-    std::vector<cudaStream_t> streams(num_streams);
-    std::vector<cudaEvent_t> events(num_streams);
-    for (int i = 0; i < num_streams; i++)
-    {
-        checkCuda(cudaStreamCreate(&streams[i]));
-        checkCuda(cudaEventCreateWithFlags(&events[i], cudaEventDisableTiming));
-    }
-
-    Cuda_Mem_Wrapper<double> dr_part(3 * max_atom_per_z, num_streams, true);
-    Cuda_Mem_Wrapper<uint8_t> atoms_type(max_atom_per_z, num_streams, true);
-    Cuda_Mem_Wrapper<int> iat_on_nbz(max_atom_per_z, num_streams, true);
-    // The first number in every group of two represents the number of atoms on that bigcell.
-    // The second number represents the cumulative number of atoms up to that bigcell.
-    Cuda_Mem_Wrapper<int> atoms_num_info(2 * nbzp, num_streams, true);
-    Cuda_Mem_Wrapper<double> vldr3(nbzp * gridt.bxyz, num_streams, true);
-
-    Cuda_Mem_Wrapper<double> psi(max_phi_per_z, num_streams, false);
-    Cuda_Mem_Wrapper<double> psi_dm(max_phi_per_z, num_streams, false);
-    Cuda_Mem_Wrapper<double> dpsi(3 * max_phi_per_z, num_streams, false);
-    Cuda_Mem_Wrapper<double> d2psi(6 * max_phi_per_z, num_streams, false);
-
-    Cuda_Mem_Wrapper<double> gemm_alpha(max_atompair_per_z, num_streams, true);
-    Cuda_Mem_Wrapper<int> gemm_m(max_atompair_per_z, num_streams, true);
-    Cuda_Mem_Wrapper<int> gemm_n(max_atompair_per_z, num_streams, true);
-    Cuda_Mem_Wrapper<int> gemm_k(max_atompair_per_z, num_streams, true);
-    Cuda_Mem_Wrapper<int> gemm_lda(max_atompair_per_z, num_streams, true);
-    Cuda_Mem_Wrapper<int> gemm_ldb(max_atompair_per_z, num_streams, true);
-    Cuda_Mem_Wrapper<int> gemm_ldc(max_atompair_per_z, num_streams, true);
-    Cuda_Mem_Wrapper<double*> gemm_A(max_atompair_per_z, num_streams, true);
-    Cuda_Mem_Wrapper<double*> gemm_B(max_atompair_per_z, num_streams, true);
-    Cuda_Mem_Wrapper<double*> gemm_C(max_atompair_per_z, num_streams, true);
-
-    Cuda_Mem_Wrapper<double> force(3 * nat, num_streams, true);
-    Cuda_Mem_Wrapper<double> stress(6, num_streams, true);
-
-    Cuda_Mem_Wrapper<double> dm_matrix(dm->get_nnr(), 1, false);
-    // retrieve the density matrix on the host
-    checkCuda(cudaMemcpy(dm_matrix.get_device_pointer(),
-                         dm->get_wrapper(),
-                         dm->get_nnr() * sizeof(double),
-                         cudaMemcpyHostToDevice));
-
-#ifdef _OPENMP
-const int max_thread_num = std::min(omp_get_max_threads(), num_streams);
-#endif
-#pragma omp parallel num_threads(max_thread_num)
-{
-#ifdef _OPENMP
-    const int tid = omp_get_thread_num();
-    const int num_threads = omp_get_num_threads();
-    const int sid_start = tid * num_streams / num_threads;
-    const int thread_num_streams = tid == num_threads - 1 ? num_streams - sid_start : num_streams / num_threads;
-#else
-    const int sid_start = 0;
-    const int thread_num_streams = num_streams;
-#endif
-#pragma omp for collapse(2) schedule(dynamic)
-    for (int i = 0; i < gridt.nbx; i++)
-    {
-        for (int j = 0; j < gridt.nby; j++)
-        {
-            // 20240620 Note that it must be set again here because 
-            // cuda's device is not safe in a multi-threaded environment.
-            checkCuda(cudaSetDevice(gridt.dev_id));
-
-            const int sid = (i * gridt.nby + j) % thread_num_streams + sid_start;
-            checkCuda(cudaEventSynchronize(events[sid]));
-
-            int max_m = 0;
-            int max_n = 0;
-            int atom_pair_num = 0;
-            int atoms_per_z = 0;
-            const int grid_index_ij = i * gridt.nby * nbzp + j * nbzp;
-
-            gtask_force(gridt,
-                        ucell,
-                        grid_index_ij,
-                        nczp,
-                        vfactor,
-                        vlocal,
-                        atoms_per_z,
-                        atoms_num_info.get_host_pointer(sid),
-                        iat_on_nbz.get_host_pointer(sid),
-                        atoms_type.get_host_pointer(sid),
-                        dr_part.get_host_pointer(sid),
-                        vldr3.get_host_pointer(sid));
-           
-            alloc_mult_force(dm,
-                             gridt,
-                             ucell, 
-                             grid_index_ij,
-                             max_atom,
-                             atoms_num_info.get_host_pointer(sid),
-                             psi.get_device_pointer(sid),
-                             psi_dm.get_device_pointer(sid),
-                             dm_matrix.get_device_pointer(),
-                             max_m,
-                             max_n, 
-                             atom_pair_num,
-                             gemm_m.get_host_pointer(sid),
-                             gemm_n.get_host_pointer(sid),
-                             gemm_k.get_host_pointer(sid),
-                             gemm_lda.get_host_pointer(sid),
-                             gemm_ldb.get_host_pointer(sid),
-                             gemm_ldc.get_host_pointer(sid),
-                             gemm_A.get_host_pointer(sid),
-                             gemm_B.get_host_pointer(sid),
-                             gemm_C.get_host_pointer(sid));
-
-            dr_part.copy_host_to_device_async(streams[sid], sid, 3 * atoms_per_z);
-            atoms_type.copy_host_to_device_async(streams[sid], sid, atoms_per_z);
-            iat_on_nbz.copy_host_to_device_async(streams[sid], sid, atoms_per_z);
-            vldr3.copy_host_to_device_async(streams[sid], sid);
-            atoms_num_info.copy_host_to_device_async(streams[sid], sid);
-            
-            gemm_m.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
-            gemm_n.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
-            gemm_k.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
-            gemm_lda.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
-            gemm_ldb.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
-            gemm_ldc.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
-            gemm_A.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
-            gemm_B.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
-            gemm_C.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
-            checkCuda(cudaEventRecord(events[sid], streams[sid]));
-
-            psi.memset_device_async(streams[sid], sid, 0);
-            psi_dm.memset_device_async(streams[sid], sid, 0);
-            dpsi.memset_device_async(streams[sid], sid, 0);
-            d2psi.memset_device_async(streams[sid], sid, 0);
-
-            dim3 grid_psi(nbzp, gridt.bxyz);
-            dim3 block_psi(64);
-            get_psi_force<<<grid_psi,
-                            block_psi,
-                            0,
-                            streams[sid]>>>(
-                gridt.ylmcoef_g,
-                dr,
-                bxyz,
-                nwmax,
-                max_atom,
-                gridt.atom_nwl_g,
-                gridt.atom_new_g,
-                gridt.atom_ylm_g,
-                gridt.atom_l_g,
-                gridt.atom_nw_g,
-                gridt.rcut_g,
-                gridt.nr_max,
-                gridt.psi_u_g,
-                gridt.mcell_pos_g,
-                dr_part.get_device_pointer(sid),
-                vldr3.get_device_pointer(sid),
-                atoms_type.get_device_pointer(sid),
-                atoms_num_info.get_device_pointer(sid),
-                psi.get_device_pointer(sid),
-                dpsi.get_device_pointer(sid),
-                d2psi.get_device_pointer(sid));
-            checkCudaLastError();
-
-            gridt.fastest_matrix_mul(max_m,
-                                     max_n,
-                                     gemm_m.get_device_pointer(sid),
-                                     gemm_n.get_device_pointer(sid),
-                                     gemm_k.get_device_pointer(sid),
-                                     gemm_A.get_device_pointer(sid),
-                                     gemm_lda.get_device_pointer(sid),
-                                     gemm_B.get_device_pointer(sid),
-                                     gemm_ldb.get_device_pointer(sid),
-                                     gemm_C.get_device_pointer(sid),
-                                     gemm_ldc.get_device_pointer(sid),
-                                     atom_pair_num,
-                                     streams[sid],
-                                     nullptr);
-   
-            if (isforce){
-                dim3 grid_force(nbzp);
-                dim3 block_force(64);
-                dot_product_force<<<grid_force,
-                                    block_force,
-                                    32 * 3 * sizeof(double),
-                                    streams[sid]>>>(
-                                        bxyz,
-                                        nwmax,
-                                        atoms_num_info.get_device_pointer(sid),
-                                        iat_on_nbz.get_device_pointer(sid),
-                                        dpsi.get_device_pointer(sid),
-                                        psi_dm.get_device_pointer(sid),
-                                        force.get_device_pointer(sid));
-                checkCudaLastError();
-            }
-
-            if (isstress){ 
-                dim3 grid_stress(nbzp);
-                dim3 block_stress(64);
-                dot_product_stress<<<grid_stress,
-                                        block_stress,
-                                        32 * 6 * sizeof(double),
-                                        streams[sid]>>>(
-                                    d2psi.get_device_pointer(sid),
-                                    psi_dm.get_device_pointer(sid),
-                                    atoms_per_z * nwmax * bxyz,
-                                    stress.get_device_pointer(sid));
-                checkCudaLastError();
-            }
-        }
-    }
-}
-
-    for(int i = 0; i < num_streams; i++)
-    {
-        stress.copy_device_to_host_async(streams[i], i);
-        force.copy_device_to_host_async(streams[i], i);
-    }
-
-    for (int i = 0; i < num_streams; i++)
-    {
-        checkCuda(cudaStreamSynchronize(streams[i]));
-        checkCuda(cudaEventDestroy(events[i]));
-    }
-
-    if (isstress){
-        for (int i = 0; i < num_streams; i++)
-        {
-            const int offset = 6 * i;
-            for (int j = 0; j < 6; j++)
-            {
-                stress_in[j] += stress.get_host_pointer()[offset + j];
-            }
-        }
-    }
-    if (isforce){
-        for (int i = 0; i < num_streams; i++)
-        {
-            const int offset = 3 * i * nat;
-            for (int j = 0; j < 3 * nat; j++)
-            {
-                force_in[j] += force.get_host_pointer()[offset + j];
-            }
-        }
-    }
-
-    for (int i = 0; i < num_streams; i++)
-    {
-        checkCuda(cudaStreamDestroy(streams[i]));
-    }
-}
-
-} // namespace GintKernel
diff --git a/source/source_lcao/module_gint/gint_force_gpu.h b/source/source_lcao/module_gint/gint_force_gpu.h
deleted file mode 100644
index 0dac4a99d6..0000000000
--- a/source/source_lcao/module_gint/gint_force_gpu.h
+++ /dev/null
@@ -1,55 +0,0 @@
-#ifndef W_ABACUS_DEVELOP_ABACUS_DEVELOP_SOURCE_MODULE_HAMILT_LCAO_MODULE_GINT_GINT_FORCE_GPU_H
-#define W_ABACUS_DEVELOP_ABACUS_DEVELOP_SOURCE_MODULE_HAMILT_LCAO_MODULE_GINT_GINT_FORCE_GPU_H
-
-#include "source_lcao/module_gint/gint.h"
-#include "source_lcao/module_gint/grid_technique.h"
-namespace GintKernel
-{
-void gint_fvl_gpu(const hamilt::HContainer<double>* dm,
-                        const double* vlocal,
-                        double* force_in,
-                        double* stress_in,
-                        double dr,
-                        const double* rcut,
-                        const int isforce,
-                        const int isstress,
-                        const Grid_Technique& gridt,
-                        const UnitCell& ucell);
-
-void gtask_force(const Grid_Technique& gridt,
-                 const UnitCell& ucell,
-                 const int grid_index_ij,
-                 const int nczp,
-                 const double vfactor,
-                 const double* vlocal_global_value,
-                 int& atoms_per_z,
-                 int* atoms_num_info,
-                 int* iat_on_nbz,
-                 uint8_t* atoms_type,
-                 double* dr_part,
-                 double* vldr3);
-
-void alloc_mult_force(const hamilt::HContainer<double>* dm,
-                      const Grid_Technique& gridt,
-                      const UnitCell& ucell,
-                      const int grid_index_ij,
-                      const int max_atom,
-                      const int *atoms_num_info,
-                      double* const psi_g,
-                      double* const psi_dm_g,
-                      double* const dm_matrix_g,
-                      int& max_m,
-                      int& max_n,
-                      int& atom_pair_num,
-                      int* mat_m,
-                      int* mat_n,
-                      int* mat_k,
-                      int* mat_lda,
-                      int* mat_ldb,
-                      int* mat_ldc,
-                      double** mat_A,
-                      double** mat_B,
-                      double** mat_C);
-
-} // namespace GintKernel
-#endif
diff --git a/source/source_lcao/module_gint/temp_gint/gint_fvl.cpp b/source/source_lcao/module_gint/gint_fvl.cpp
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/gint_fvl.cpp
rename to source/source_lcao/module_gint/gint_fvl.cpp
diff --git a/source/source_lcao/module_gint/temp_gint/gint_fvl.h b/source/source_lcao/module_gint/gint_fvl.h
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/gint_fvl.h
rename to source/source_lcao/module_gint/gint_fvl.h
diff --git a/source/source_lcao/module_gint/temp_gint/gint_fvl_gpu.cpp b/source/source_lcao/module_gint/gint_fvl_gpu.cpp
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/gint_fvl_gpu.cpp
rename to source/source_lcao/module_gint/gint_fvl_gpu.cpp
diff --git a/source/source_lcao/module_gint/temp_gint/gint_fvl_gpu.h b/source/source_lcao/module_gint/gint_fvl_gpu.h
similarity index 95%
rename from source/source_lcao/module_gint/temp_gint/gint_fvl_gpu.h
rename to source/source_lcao/module_gint/gint_fvl_gpu.h
index b613333e7a..cdbcd40aa9 100644
--- a/source/source_lcao/module_gint/temp_gint/gint_fvl_gpu.h
+++ b/source/source_lcao/module_gint/gint_fvl_gpu.h
@@ -6,7 +6,7 @@
 #include "source_base/matrix.h"
 #include "gint.h"
 #include "gint_info.h"
-#include "source_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h"
+#include "source_lcao/module_gint/kernel/cuda_mem_wrapper.h"
 
 namespace ModuleGint
 {
diff --git a/source/source_lcao/module_gint/temp_gint/gint_fvl_meta.cpp b/source/source_lcao/module_gint/gint_fvl_meta.cpp
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/gint_fvl_meta.cpp
rename to source/source_lcao/module_gint/gint_fvl_meta.cpp
diff --git a/source/source_lcao/module_gint/temp_gint/gint_fvl_meta.h b/source/source_lcao/module_gint/gint_fvl_meta.h
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/gint_fvl_meta.h
rename to source/source_lcao/module_gint/gint_fvl_meta.h
diff --git a/source/source_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.cpp b/source/source_lcao/module_gint/gint_fvl_meta_gpu.cpp
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.cpp
rename to source/source_lcao/module_gint/gint_fvl_meta_gpu.cpp
diff --git a/source/source_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.h b/source/source_lcao/module_gint/gint_fvl_meta_gpu.h
similarity index 95%
rename from source/source_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.h
rename to source/source_lcao/module_gint/gint_fvl_meta_gpu.h
index 2b9d88aec2..a1b41cbd61 100644
--- a/source/source_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.h
+++ b/source/source_lcao/module_gint/gint_fvl_meta_gpu.h
@@ -6,7 +6,7 @@
 #include "source_base/matrix.h"
 #include "gint.h"
 #include "gint_info.h"
-#include "source_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h"
+#include "source_lcao/module_gint/kernel/cuda_mem_wrapper.h"
 
 namespace ModuleGint
 {
diff --git a/source/source_lcao/module_gint/gint_fvl_old.cpp b/source/source_lcao/module_gint/gint_fvl_old.cpp
deleted file mode 100644
index 663a7ddce6..0000000000
--- a/source/source_lcao/module_gint/gint_fvl_old.cpp
+++ /dev/null
@@ -1,76 +0,0 @@
-#include "gint_k.h"
-#include "source_base/timer.h"
-#include "source_base/ylm.h"
-#include "source_pw/module_pwdft/global.h"
-#include "source_base/array_pool.h"
-
-// This function utilizes the cache more effectively than calling the ddot function, thus performing faster.
-void Gint::cal_meshball_force(
-    const int grid_index,
-    const int na_grid,  					    // how many atoms on this (i,j,k) grid
-	const int*const block_size, 			    // block_size[na_grid],	number of columns of a band
-	const int*const block_index,		    	// block_index[na_grid+1], count total number of atomis orbitals
-	const double*const*const psir_vlbr3_DMR,	    // psir_vlbr3[this->bxyz][LD_pool]
-    const double*const*const dpsir_x,	    // psir_vlbr3[this->bxyz][LD_pool]
-    const double*const*const dpsir_y,	    // psir_vlbr3[this->bxyz][LD_pool]
-    const double*const*const dpsir_z,	    // psir_vlbr3[this->bxyz][LD_pool]
-    ModuleBase::matrix *force)
-{
-    for(int ia1=0;ia1<na_grid;ia1++)
-    {
-        const int mcell_index=this->gridt->bcell_start[grid_index] + ia1;
-        const int iat=this->gridt->which_atom[mcell_index]; // index of atom
-		double rx = 0;
-		double ry = 0;
-		double rz = 0;
-        for(int ib=0; ib<this->bxyz; ib++)
-        {
-            for(int iw=0; iw<block_size[ia1]; iw++)
-			{
-				double psir_vlbr3 = psir_vlbr3_DMR[ib][block_index[ia1]+iw];
-				rx += psir_vlbr3 * dpsir_x[ib][block_index[ia1]+iw];
-				ry += psir_vlbr3 * dpsir_y[ib][block_index[ia1]+iw];
-				rz += psir_vlbr3 * dpsir_z[ib][block_index[ia1]+iw];
-			}
-        }
-        force[0](iat,0) += rx * 2.0;
-        force[0](iat,1) += ry * 2.0;
-		force[0](iat,2) += rz * 2.0;  
-    }
-	return;
-}
-
-// This function utilizes the cache more effectively than calling the ddot function, thus performing faster.
-void Gint::cal_meshball_stress(
-    const int na_grid,  					    // how many atoms on this (i,j,k) grid
-	const int*const block_index,		    	// block_index[na_grid+1], count total number of atomis orbitals
-	const double*const psir_vlbr3_DMR,
-    const double*const dpsirr,
-    ModuleBase::matrix *stress)
-{
-	double rxx = 0;
-	double rxy = 0;
-	double rxz = 0;
-	double ryy = 0;
-	double ryz = 0;
-	double rzz = 0;
-	const int size = block_index[na_grid] * this->bxyz;
-
-    for(int i=0; i<size; ++i)
-    {
-		double psir_vlbr3 = psir_vlbr3_DMR[i];
-		rxx += psir_vlbr3 * dpsirr[i * 6];
-		rxy += psir_vlbr3 * dpsirr[i * 6 + 1];
-		rxz += psir_vlbr3 * dpsirr[i * 6 + 2];
-		ryy += psir_vlbr3 * dpsirr[i * 6 + 3];
-		ryz += psir_vlbr3 * dpsirr[i * 6 + 4];
-		rzz += psir_vlbr3 * dpsirr[i * 6 + 5];
-    }
-	stress[0](0,0) += rxx*2;
-    stress[0](0,1) += rxy*2;
-	stress[0](0,2) += rxz*2;
-    stress[0](1,1) += ryy*2;
-    stress[0](1,2) += ryz*2;
-    stress[0](2,2) += rzz*2;
-    return;
-}
diff --git a/source/source_lcao/module_gint/gint_gamma.h b/source/source_lcao/module_gint/gint_gamma.h
deleted file mode 100644
index 7ed6e2c9b9..0000000000
--- a/source/source_lcao/module_gint/gint_gamma.h
+++ /dev/null
@@ -1,51 +0,0 @@
-//=========================================================
-//AUTHOR : mohan
-//DATE : 2009-09-16
-//REFACTOR : Peize Lin, 2021.06.28
-//=========================================================
-#ifndef GINT_GAMMA_H
-#define GINT_GAMMA_H
-#include "gint.h"
-#include "source_base/global_function.h"
-#include "source_base/global_variable.h"
-#include "grid_technique.h"
-#ifdef _OPENMP
-#include <omp.h>
-#endif
-
-//=========================================================
-// ModuleBase::Integral On 3D Grids, different from Grid_Integral
-// Feature : Matrix Elements Of Local Potential For 
-// Numerical Orbitals
-//=========================================================
-
-class Gint_Gamma : public Gint
-{
-	public:
-
-    //! @brief move operator for the next ESolver to directly use its infomation
-    //! @param rhs 
-    //! @return *this
-    Gint_Gamma& operator=(Gint_Gamma&& rhs);
-
-    //! in gint_gamma_vl.cpp 
-    //! there is an additional step in calculating vlocal for gamma point
-    //! namely the redistribution of Hamiltonian from grid to 2D block format
-    //! hence we have an additional layer outside the unified interface
-    void cal_vlocal(Gint_inout* inout, const bool new_e_iteration);
-
-    //! in gint_gamma_env.cpp 
-	//! calcualte the electronic wave functions via grid integral
-	void cal_env(const double* wfc, double* rho,const UnitCell &ucell);
-
-    //! transfer this->hRGint to Veff::hR
-    void transfer_pvpR(hamilt::HContainer<double>* hR,const UnitCell* ucell);
-
-private:
-
-    //! pointer to density matrix
-    double*** DM = nullptr;
-
-};
-
-#endif
diff --git a/source/source_lcao/module_gint/gint_gamma_env.cpp b/source/source_lcao/module_gint/gint_gamma_env.cpp
deleted file mode 100644
index 76ae6e506a..0000000000
--- a/source/source_lcao/module_gint/gint_gamma_env.cpp
+++ /dev/null
@@ -1,101 +0,0 @@
-#include "gint_gamma.h"
-#include "grid_technique.h"
-#include "source_base/timer.h"
-#include "source_base/ylm.h"
-#include "source_base/array_pool.h"
-#include "source_basis/module_ao/ORB_read.h"
-#include "source_pw/module_pwdft/global.h"
-
-void Gint_Gamma::cal_env(const double* wfc, double* rho,const UnitCell& ucell)
-{
-    ModuleBase::TITLE("Grid_Integral", "cal_env");
-
-    // it's a uniform grid to save orbital values, so the delta_r is a constant.
-    const double delta_r = this->gridt->dr_uniform;
-    const int max_size = this->gridt->max_atom;
-    if (max_size <= 0){
-        ModuleBase::WARNING_QUIT("Gint_Gamma::cal_env",
-                                    "the max_size is less than 0!");
-    }
-    const int nbx = this->gridt->nbx;
-    const int nby = this->gridt->nby;
-    const int nbz = this->gridt->nbzp;
-    const int ncyz = this->ny * this->nplane; // mohan add 2012-03-25
-    const int bxyz = this->bxyz;
-
-    #pragma omp parallel 
-    {
-        std::vector<int> block_iw(max_size, 0);
-        std::vector<int> block_index(max_size+1, 0);
-        std::vector<int> block_size(max_size, 0);
-        std::vector<int> vindex(bxyz,0);
-        #pragma omp for
-        for (int grid_index = 0; grid_index < this->nbxx; grid_index++)
-        {
-
-            // get the value: how many atoms has orbital value on this grid.
-            const int size = this->gridt->how_many_atoms[grid_index];
-            if (size == 0)
-                continue;
-
-            // int *block_iw, *block_index, *block_size;
-            ModuleBase::Array_Pool<bool> cal_flag(bxyz, size);
-            Gint_Tools::get_block_info(*this->gridt,
-                                       this->bxyz,
-                                       size,
-                                       grid_index,
-                                       block_iw.data(),
-                                       block_index.data(),
-                                       block_size.data(),
-                                       cal_flag.get_ptr_2D());
-            const int LD_pool = block_index[size]; 
-
-            // evaluate psi on grids
-            ModuleBase::Array_Pool<double> psir_ylm(this->bxyz, LD_pool);
-            Gint_Tools::cal_psir_ylm(*this->gridt,
-                                     this->bxyz,
-                                     size,
-                                     grid_index,
-                                     delta_r,
-                                     block_index.data(),
-                                     block_size.data(),
-                                     cal_flag.get_ptr_2D(),
-                                     psir_ylm.get_ptr_2D());
-
-             Gint_Tools::get_vindex(this->bxyz,
-                                    this->bx,
-                                    this->by,
-                                    this->bz,
-                                    this->nplane,
-                                    this->gridt->start_ind[grid_index],
-                                    ncyz,
-                                    vindex.data());
-
-            for (int ia1 = 0; ia1 < size; ia1++)
-            {
-                const int mcell_index1 = this->gridt->bcell_start[grid_index] + ia1;
-                const int iat = this->gridt->which_atom[mcell_index1];
-                const int T1 = ucell.iat2it[iat];
-                Atom* atom1 = &ucell.atoms[T1];
-                const int I1 = ucell.iat2ia[iat];
-                // get the start index of local orbitals.
-                const int start1 = ucell.itiaiw2iwt(T1, I1, 0);
-                for (int ib = 0; ib < this->bxyz; ib++)
-                {
-                    if (cal_flag[ib][ia1])
-                    {
-                        int iw1_lo = this->gridt->trace_lo[start1];
-                        double* psi1 = &psir_ylm[ib][block_index[ia1]];
-                        double tmp = 0.0;
-                        for (int iw = 0; iw < atom1->nw; ++iw, ++iw1_lo)
-                        {
-                            tmp += psi1[iw] * wfc[iw1_lo];
-                        } // iw
-                        rho[vindex[ib]] += tmp;
-                    } // cal_flag
-                }     // ib
-            }         // ia1
-        }
-    }
-    return;
-}
diff --git a/source/source_lcao/module_gint/gint_gamma_vl.cpp b/source/source_lcao/module_gint/gint_gamma_vl.cpp
deleted file mode 100644
index 161a3e7083..0000000000
--- a/source/source_lcao/module_gint/gint_gamma_vl.cpp
+++ /dev/null
@@ -1,95 +0,0 @@
-//=========================================================
-// REFACTOR : Peize Lin, 2021.06.28
-//=========================================================
-#include "gint_gamma.h"
-#include "gint_tools.h"
-#include "grid_technique.h"
-#include "source_base/memory.h"
-#include "source_base/timer.h"
-#include "source_basis/module_ao/ORB_read.h"
-#include "source_lcao/module_hcontainer/hcontainer_funcs.h"
-#include "source_pw/module_pwdft/global.h"
-
-#ifdef _OPENMP
-#include <omp.h>
-#endif
-
-#ifdef __MKL
-#include <mkl_service.h>
-#endif
-
-extern "C"
-{
-    void Cblacs_gridinfo(int icontxt, int* nprow, int* npcol, int* myprow, int* mypcol);
-    void Cblacs_pinfo(int* myid, int* nprocs);
-    void Cblacs_pcoord(int icontxt, int pnum, int* prow, int* pcol);
-}
-
-void Gint_Gamma::cal_vlocal(Gint_inout* inout, bool new_e_iteration)
-{
-    const int max_size = this->gridt->max_atom;
-    const int lgd = this->gridt->lgd;
-
-    if (inout->job == Gint_Tools::job_type::vlocal || inout->job == Gint_Tools::job_type::vlocal_meta)
-    {
-        if (max_size > 0 && lgd > 0)
-        {
-            this->hRGint->set_zero();
-        }
-
-        this->cal_gint(inout);
-    }
-}
-
-#ifdef __MPI
-#include "source_lcao/module_hcontainer/hcontainer_funcs.h"
-#endif
-void Gint_Gamma::transfer_pvpR(hamilt::HContainer<double>* hR, const UnitCell* ucell)
-{
-    ModuleBase::TITLE("Gint_Gamma", "transfer_pvpR");
-    ModuleBase::timer::tick("Gint_Gamma", "transfer_pvpR");
-
-    for (int iap = 0; iap < this->hRGint->size_atom_pairs(); iap++)
-    {
-        auto& ap = this->hRGint->get_atom_pair(iap);
-        const int iat1 = ap.get_atom_i();
-        const int iat2 = ap.get_atom_j();
-        if (iat1 > iat2)
-        {
-            // fill lower triangle matrix with upper triangle matrix
-            // gamma_only case, only 1 R_index in each AtomPair
-            // the upper <IJR> is <iat2, iat1, 0>
-            const hamilt::AtomPair<double>* upper_ap = this->hRGint->find_pair(iat2, iat1);
-#ifdef __DEBUG
-            assert(upper_ap != nullptr);
-#endif
-            double* lower_matrix = ap.get_pointer(0);
-            for (int irow = 0; irow < ap.get_row_size(); ++irow)
-            {
-                for (int icol = 0; icol < ap.get_col_size(); ++icol)
-                {
-                    *lower_matrix++ = upper_ap->get_value(icol, irow);
-                }
-            }
-        }
-    }
-
-#ifdef __MPI
-    int size = 0;
-    MPI_Comm_size(MPI_COMM_WORLD, &size);
-    if (size == 1)
-    {
-        hR->add(*this->hRGint);
-    }
-    else
-    {
-        hamilt::transferSerials2Parallels(*this->hRGint, hR);
-    }
-#else
-    hR->add(*this->hRGint);
-#endif
-
-    ModuleBase::timer::tick("Gint_Gamma", "transfer_pvpR");
-
-    return;
-}
diff --git a/source/source_lcao/module_gint/gint_gpu_interface.cpp b/source/source_lcao/module_gint/gint_gpu_interface.cpp
deleted file mode 100644
index 8e8e362f23..0000000000
--- a/source/source_lcao/module_gint/gint_gpu_interface.cpp
+++ /dev/null
@@ -1,108 +0,0 @@
-#include "gint.h"
-#include "gint_force_gpu.h"
-#include "source_io/module_parameter/parameter.h"
-#include "gint_rho_gpu.h"
-#include "gint_vl_gpu.h"
-#include "source_base/memory.h"
-#include "source_base/timer.h"
-
-void Gint::gpu_vlocal_interface(Gint_inout* inout) {
-    ModuleBase::TITLE("Gint_interface", "cal_gint_vlocal");
-    ModuleBase::timer::tick("Gint_interface", "cal_gint_vlocal");
-
-    const UnitCell& ucell = *this->ucell;
-    const double dr = this->gridt->dr_uniform;
-    double ylmcoef[100];
-    ModuleBase::GlobalFunc::ZEROS(ylmcoef, 100);
-    for (int i = 0; i < 100; i++) {
-        ylmcoef[i] = ModuleBase::Ylm::ylmcoef[i];
-    }
-
-    hamilt::HContainer<double>* hRGint_kernel = PARAM.inp.nspin != 4 ? this->hRGint : this->hr_gint_tmp[inout->ispin];
-    GintKernel::gint_vl_gpu(hRGint_kernel,
-                            inout->vl,
-                            ylmcoef,
-                            dr,
-                            this->gridt->rcuts.data(),
-                            *this->gridt,
-                            ucell);
-
-    ModuleBase::TITLE("Gint_interface", "cal_gint_vlocal");
-    ModuleBase::timer::tick("Gint_interface", "cal_gint_vlocal");
-}
-
-void Gint::gpu_rho_interface(Gint_inout* inout) {
-    ModuleBase::TITLE("Gint_interface", "cal_gint_rho");
-    ModuleBase::timer::tick("Gint_interface", "cal_gint_rho");
-
-    const UnitCell& ucell = *this->ucell;
-    const double dr = this->gridt->dr_uniform;
-    double ylmcoef[100];
-    ModuleBase::GlobalFunc::ZEROS(ylmcoef, 100);
-    for (int i = 0; i < 100; i++) {
-        ylmcoef[i] = ModuleBase::Ylm::ylmcoef[i];
-    }
-    int nrxx = this->gridt->ncx * this->gridt->ncy * this->nplane;
-    for (int is = 0; is < PARAM.inp.nspin; ++is) {
-        ModuleBase::GlobalFunc::ZEROS(inout->rho[is], nrxx);
-        GintKernel::gint_rho_gpu(this->dmr_gint[is],
-                                       ylmcoef,
-                                       dr,
-                                       this->gridt->rcuts.data(),
-                                       *this->gridt,
-                                       ucell,
-                                       inout->rho[is]);
-    }
-    ModuleBase::TITLE("Gint_interface", "cal_gint_rho");
-    ModuleBase::timer::tick("Gint_interface", "cal_gint_rho");
-}
-
-void Gint::gpu_force_interface(Gint_inout* inout) {
-    ModuleBase::TITLE("Gint_interface", "cal_gint_force");
-    ModuleBase::timer::tick("Gint_interface", "cal_gint_force");
-
-    const UnitCell& ucell = *this->ucell;
-    const double dr = this->gridt->dr_uniform;
-    double ylmcoef[100];
-    ModuleBase::GlobalFunc::ZEROS(ylmcoef, 100);
-    for (int i = 0; i < 100; i++) {
-        ylmcoef[i] = ModuleBase::Ylm::ylmcoef[i];
-    }
-
-    const int ncyz = this->ny * this->nplane;
-    int nat = ucell.nat;
-    const int isforce = inout->isforce;
-    const int isstress = inout->isstress;
-    if (isforce || isstress) {
-        std::vector<double> force(nat * 3, 0.0);
-        std::vector<double> stress(6, 0.0);
-        GintKernel::gint_fvl_gpu(this->dmr_gint[inout->ispin],
-                                       inout->vl,
-                                       force.data(),
-                                       stress.data(),
-                                       dr,
-                                       this->gridt->rcuts.data(),
-                                       isforce,
-                                       isstress,
-                                       *this->gridt,
-                                       ucell);
-        if (inout->isforce) {
-            for (int iat = 0; iat < nat; iat++) {
-                inout->fvl_dphi[0](iat, 0) += force[iat * 3];
-                inout->fvl_dphi[0](iat, 1) += force[iat * 3 + 1];
-                inout->fvl_dphi[0](iat, 2) += force[iat * 3 + 2];
-            }
-        }
-        if (inout->isstress) {
-            inout->svl_dphi[0](0, 0) += stress[0];
-            inout->svl_dphi[0](0, 1) += stress[1];
-            inout->svl_dphi[0](0, 2) += stress[2];
-            inout->svl_dphi[0](1, 1) += stress[3];
-            inout->svl_dphi[0](1, 2) += stress[4];
-            inout->svl_dphi[0](2, 2) += stress[5];
-        }
-    }
-
-    ModuleBase::TITLE("Gint_interface", "cal_gint_force");
-    ModuleBase::timer::tick("Gint_interface", "cal_gint_force");
-}
\ No newline at end of file
diff --git a/source/source_lcao/module_gint/temp_gint/gint_helper.h b/source/source_lcao/module_gint/gint_helper.h
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/gint_helper.h
rename to source/source_lcao/module_gint/gint_helper.h
diff --git a/source/source_lcao/module_gint/temp_gint/gint_info.cpp b/source/source_lcao/module_gint/gint_info.cpp
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/gint_info.cpp
rename to source/source_lcao/module_gint/gint_info.cpp
diff --git a/source/source_lcao/module_gint/temp_gint/gint_info.h b/source/source_lcao/module_gint/gint_info.h
similarity index 98%
rename from source/source_lcao/module_gint/temp_gint/gint_info.h
rename to source/source_lcao/module_gint/gint_info.h
index 0f311c1bcc..a2e35b6642 100644
--- a/source/source_lcao/module_gint/temp_gint/gint_info.h
+++ b/source/source_lcao/module_gint/gint_info.h
@@ -15,7 +15,7 @@
 
 #ifdef __CUDA
 #include "batch_biggrid.h"
-#include "source_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.h"
+#include "source_lcao/module_gint/kernel/gint_gpu_vars.h"
 #endif
 
 namespace ModuleGint
diff --git a/source/source_lcao/module_gint/temp_gint/gint_interface.cpp b/source/source_lcao/module_gint/gint_interface.cpp
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/gint_interface.cpp
rename to source/source_lcao/module_gint/gint_interface.cpp
diff --git a/source/source_lcao/module_gint/temp_gint/gint_interface.h b/source/source_lcao/module_gint/gint_interface.h
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/gint_interface.h
rename to source/source_lcao/module_gint/gint_interface.h
diff --git a/source/source_lcao/module_gint/gint_k.h b/source/source_lcao/module_gint/gint_k.h
deleted file mode 100644
index ec2de50730..0000000000
--- a/source/source_lcao/module_gint/gint_k.h
+++ /dev/null
@@ -1,86 +0,0 @@
-#ifndef W_ABACUS_DEVELOP_ABACUS_DEVELOP_SOURCE_MODULE_HAMILT_LCAO_MODULE_GINT_GINT_K_H
-#define W_ABACUS_DEVELOP_ABACUS_DEVELOP_SOURCE_MODULE_HAMILT_LCAO_MODULE_GINT_GINT_K_H
-
-#include "gint.h"
-#include "grid_technique.h"
-#include "source_basis/module_ao/ORB_atomic_lm.h"
-#include "source_estate/module_charge/charge.h"
-#include "source_lcao/LCAO_HS_arrays.hpp"
-
-// add by jingan for map<> in 2021-12-2, will be deleted in the future
-#include "source_base/abfs-vector3_order.h"
-
-class Gint_k : public Gint {
-  public:
-    /// @brief move operator for the next ESolver to directly use its infomation
-    /// @param rhs 
-    /// @return *this
-    Gint_k& operator=(Gint_k&& rhs);
-
-    //------------------------------------------------------
-    // in gint_k_pvpr.cpp
-    //------------------------------------------------------
-    // pvpR and reset_spin/get_spin : auxilliary methods
-    // for calculating hamiltonian
-
-    // allocate the <phi_0 | V | dphi_R> matrix element.
-    void allocate_pvdpR();
-    // destroy the temporary <phi_0 | V | dphi_R> matrix element.
-    void destroy_pvdpR();
-
-    /**
-     * @brief transfer pvpR to this->hRGint
-     * then pass this->hRGint to Veff<OperatorLCAO>::hR
-     */
-    void transfer_pvpR(hamilt::HContainer<double>* hR, const UnitCell* ucell_in, const Grid_Driver* gd);
-    void transfer_pvpR(hamilt::HContainer<std::complex<double>>* hR, const UnitCell* ucell_in, const Grid_Driver* gd);
-
-    //------------------------------------------------------
-    // in gint_k_env.cpp
-    //------------------------------------------------------
-    // calculate the envelop function via grid integrals
-    void cal_env_k(int ik,
-                   const std::complex<double>* psi_k,
-                   double* rho,
-                   const std::vector<ModuleBase::Vector3<double>>& kvec_c,
-                   const std::vector<ModuleBase::Vector3<double>>& kvec_d,
-                   const UnitCell& ucell);
-
-    //------------------------------------------------------
-    // in gint_k_sparse1.cpp
-    //------------------------------------------------------
-    // similar to the above 3, just for the derivative
-    void distribute_pvdpR_sparseMatrix(
-        const int current_spin,
-        const int dim,
-        const double& sparse_threshold,
-        const std::map<Abfs::Vector3_Order<int>,
-                       std::map<size_t, std::map<size_t, double>>>&
-            pvdpR_sparseMatrix,
-        LCAO_HS_Arrays& HS_Arrays,
-        const Parallel_Orbitals* pv);
-
-    void distribute_pvdpR_soc_sparseMatrix(
-        const int dim,
-        const double& sparse_threshold,
-        const std::map<
-            Abfs::Vector3_Order<int>,
-            std::map<size_t, std::map<size_t, std::complex<double>>>>&
-            pvdpR_soc_sparseMatrix,
-        LCAO_HS_Arrays& HS_Arrays,
-        const Parallel_Orbitals* pv);
-
-    void cal_dvlocal_R_sparseMatrix(const int& current_spin,
-                                    const double& sparse_threshold,
-                                    LCAO_HS_Arrays& HS_Arrays,
-                                    const Parallel_Orbitals* pv,
-                                    const UnitCell& ucell,
-                                    const Grid_Driver& gdriver);
-
-  private:
-    //----------------------------
-    // key variable
-    //----------------------------
-};
-
-#endif
diff --git a/source/source_lcao/module_gint/gint_k_env.cpp b/source/source_lcao/module_gint/gint_k_env.cpp
deleted file mode 100644
index 67ce701461..0000000000
--- a/source/source_lcao/module_gint/gint_k_env.cpp
+++ /dev/null
@@ -1,138 +0,0 @@
-#include "gint_k.h"
-#include "grid_technique.h"
-#include "source_io/module_parameter/parameter.h"
-#include "source_base/timer.h"
-#include "source_base/ylm.h"
-#include "source_basis/module_ao/ORB_read.h"
-#include "source_pw/module_pwdft/global.h"
-#include "source_base/array_pool.h"
-#include "source_base/vector3.h"
-
-void Gint_k::cal_env_k(int ik,
-                       const std::complex<double>* psi_k,
-                       double* rho,
-                       const std::vector<ModuleBase::Vector3<double>>& kvec_c,
-                       const std::vector<ModuleBase::Vector3<double>>& kvec_d,
-                       const UnitCell& ucell)
-{
-    ModuleBase::TITLE("Gint_k", "cal_env_k");
-    ModuleBase::timer::tick("Gint_k", "cal_env_k");
-
-    // it's a uniform grid to save orbital values, so the delta_r is a constant.
-    const double delta_r = this->gridt->dr_uniform;
-    const int max_size = this->gridt->max_atom;
-    if (max_size <= 0){
-        ModuleBase::WARNING_QUIT("Gint_Gamma::cal_env",
-                                    "the max_size is less than 0!");
-    }
-    const int nbx = this->gridt->nbx;
-    const int nby = this->gridt->nby;
-    const int nbz = this->gridt->nbzp;
-    const int ncyz = this->ny * this->nplane; // mohan add 2012-03-25
-
-    #pragma omp parallel 
-    {
-        std::vector<int> vindex(this->bxyz, 0);
-        std::vector<int> block_iw(max_size, 0);
-        std::vector<int> block_index(max_size + 1, 0);
-        std::vector<int> block_size(max_size, 0);
-        #pragma omp for
-        for (int grid_index = 0; grid_index < this->nbxx; grid_index++)
-        {
-
-            // get the value: how many atoms has orbital value on this grid.
-            const int size = this->gridt->how_many_atoms[grid_index];
-            if (size == 0)
-            {
-                continue;
-            }
-            ModuleBase::Array_Pool<bool> cal_flag(this->bxyz, max_size);
-            Gint_Tools::get_block_info(*this->gridt,
-                                       this->bxyz,
-                                       size,
-                                       grid_index,
-                                       block_iw.data(),
-                                       block_index.data(),
-                                       block_size.data(),
-                                       cal_flag.get_ptr_2D());
-            const int LD_pool = block_index[size];
-
-            // evaluate psi on grids
-            ModuleBase::Array_Pool<double> psir_ylm(this->bxyz, LD_pool);
-            Gint_Tools::cal_psir_ylm(*this->gridt,
-                                     this->bxyz,
-                                     size,
-                                     grid_index,
-                                     delta_r,
-                                     block_index.data(),
-                                     block_size.data(),
-                                     cal_flag.get_ptr_2D(),
-                                     psir_ylm.get_ptr_2D());
-
-            Gint_Tools::get_vindex(this->bxyz,
-                                    this->bx,
-                                    this->by,
-                                    this->bz,
-                                    this->nplane,
-                                    this->gridt->start_ind[grid_index],
-                                    ncyz,
-                                    vindex.data());
-
-            for (int ia1 = 0; ia1 < size; ia1++)
-            {
-                const int mcell_index1 = this->gridt->bcell_start[grid_index] + ia1;
-                const int iat = this->gridt->which_atom[mcell_index1];
-                const int T1 = ucell.iat2it[iat];
-                Atom* atom1 = &ucell.atoms[T1];
-                const int I1 = ucell.iat2ia[iat];
-
-                // find R by which_unitcell and cal kphase
-                const int id_ucell = this->gridt->which_unitcell[mcell_index1];
-                ModuleBase::Vector3<double> R(this->gridt->get_ucell_coords(id_ucell));
-                // std::cout << "kvec_d: " << kvec_d[ik].x << " " << kvec_d[ik].y << " " << kvec_d[ik].z << std::endl;
-                // std::cout << "kvec_c: " << kvec_c[ik].x << " " << kvec_c[ik].y << " " << kvec_c[ik].z << std::endl;
-                // std::cout << "R: " << R.x << " " << R.y << " " << R.z << std::endl;
-                const double arg = (kvec_d[ik] * R) * ModuleBase::TWO_PI;
-                const double arg1
-                    = (kvec_c[ik] * (R.x * ucell.a1 + R.y * ucell.a2 + R.z * ucell.a3)) * ModuleBase::TWO_PI;
-                // std::cout << "arg0=" << arg << ", arg1=" << arg1 << std::endl;
-                const std::complex<double> kphase = std::complex<double>(cos(arg), sin(arg));
-
-                // get the start index of local orbitals.
-                const int start1 = ucell.itiaiw2iwt(T1, I1, 0);
-                for (int ib = 0; ib < this->bxyz; ib++)
-                {
-                    if (cal_flag[ib][ia1])
-                    {
-                        int iw1_lo = 0;
-                        double* psi1 = &psir_ylm[ib][block_index[ia1]];
-                        std::complex<double> tmp{0.0, 0.0};
-                        if (PARAM.inp.nspin == 4) // is it a simple add of 2 spins?
-                        {
-                            for (int is = 0; is < 2; ++is)
-                            {
-                                iw1_lo = this->gridt->trace_lo[start1] / PARAM.globalv.npol
-                                         + this->gridt->lgd / PARAM.globalv.npol * is;
-                                for (int iw = 0; iw < atom1->nw; ++iw, ++iw1_lo)
-                                {
-                                    tmp += std::complex<double>(psi1[iw], 0.0) * psi_k[iw1_lo] * kphase;
-                                }
-                            }
-                        }
-                        else
-                        {
-                            iw1_lo = this->gridt->trace_lo[start1];
-                            for (int iw = 0; iw < atom1->nw; ++iw, ++iw1_lo)
-                            {
-                                tmp += std::complex<double>(psi1[iw], 0.0) * psi_k[iw1_lo] * kphase;
-                            }
-                        }
-                        rho[vindex[ib]] += tmp.real();
-                    } // cal_flag
-                }     // ib
-            }         // ia1
-        } // i
-    }
-    ModuleBase::timer::tick("Gint_k", "cal_env_k");
-    return;
-}
diff --git a/source/source_lcao/module_gint/gint_k_pvdpr.cpp b/source/source_lcao/module_gint/gint_k_pvdpr.cpp
deleted file mode 100644
index b03f012a66..0000000000
--- a/source/source_lcao/module_gint/gint_k_pvdpr.cpp
+++ /dev/null
@@ -1,53 +0,0 @@
-#include "gint_k.h"
-#include "grid_technique.h"
-#include "source_io/module_parameter/parameter.h"
-#include "source_base/global_function.h"
-#include "source_base/global_variable.h"
-#include "source_base/memory.h"
-#include "source_base/parallel_reduce.h"
-#include "source_base/timer.h"
-#include "source_base/tool_threading.h"
-#include "source_base/ylm.h"
-#include "source_basis/module_ao/ORB_read.h"
-#include "source_pw/module_pwdft/global.h"
-
-void Gint_k::allocate_pvdpR(void)
-{
-    ModuleBase::TITLE("Gint_k","allocate_pvpR");
-
-    const int nspin = PARAM.inp.nspin;
-    assert(nspin>0);
-
-    //xiaohui modify 2015-05-30
-    // the number of matrix element <phi_0 | V | dphi_R> is this->gridt->nnrg.
-    for(int is =0;is<nspin;is++)
-    {
-        this->pvdpRx_reduced.push_back(hamilt::HContainer<double>(this->ucell->nat));
-        pvdpRx_reduced[is].insert_ijrs(this->gridt->get_ijr_info(), *this->ucell);
-        pvdpRx_reduced[is].allocate(nullptr, true);
-        this->pvdpRy_reduced.push_back(hamilt::HContainer<double>(this->ucell->nat));
-        pvdpRy_reduced[is].insert_ijrs(this->gridt->get_ijr_info(), *this->ucell);
-        pvdpRy_reduced[is].allocate(nullptr, true);
-        this->pvdpRz_reduced.push_back(hamilt::HContainer<double>(this->ucell->nat));
-        pvdpRz_reduced[is].insert_ijrs(this->gridt->get_ijr_info(), *this->ucell);
-        pvdpRz_reduced[is].allocate(nullptr, true);
-    }
-
-    ModuleBase::Memory::record("pvdpR_reduced", 3 * sizeof(double) * this->gridt->nnrg * nspin);
-    return;
-}
-
-void Gint_k::destroy_pvdpR(void)
-{
-    ModuleBase::TITLE("Gint_k","destroy_pvpR");
-
-    const int nspin = PARAM.inp.nspin;
-    assert(nspin>0);
-    pvdpRx_reduced.clear();
-    pvdpRy_reduced.clear();
-    pvdpRz_reduced.clear();
-    pvdpRx_reduced.shrink_to_fit();
-    pvdpRy_reduced.shrink_to_fit();
-    pvdpRz_reduced.shrink_to_fit();
-    return;
-}
diff --git a/source/source_lcao/module_gint/gint_k_pvpr.cpp b/source/source_lcao/module_gint/gint_k_pvpr.cpp
deleted file mode 100644
index 8f98e1dcaf..0000000000
--- a/source/source_lcao/module_gint/gint_k_pvpr.cpp
+++ /dev/null
@@ -1,163 +0,0 @@
-#include "gint_k.h"
-#include "grid_technique.h"
-#include "source_io/module_parameter/parameter.h"
-#include "source_base/global_function.h"
-#include "source_base/global_variable.h"
-#include "source_base/libm/libm.h"
-#include "source_base/memory.h"
-#include "source_base/parallel_reduce.h"
-#include "source_base/timer.h"
-#include "source_base/tool_threading.h"
-#include "source_base/ylm.h"
-#include "source_basis/module_ao/ORB_read.h"
-#include "source_cell/module_neighbor/sltk_grid_driver.h"
-#include "source_pw/module_pwdft/global.h"
-#include "source_lcao/module_hcontainer/hcontainer_funcs.h"
-#ifdef __MPI
-#include <mpi.h>
-#endif
-
-// transfer_pvpR, NSPIN = 1 or 2
-void Gint_k::transfer_pvpR(hamilt::HContainer<double>* hR, const UnitCell* ucell, const Grid_Driver* gd)
-{
-    ModuleBase::TITLE("Gint_k", "transfer_pvpR");
-    ModuleBase::timer::tick("Gint_k", "transfer_pvpR");
-
-    for (int iap = 0; iap < this->hRGint->size_atom_pairs(); iap++)
-    {
-        auto& ap = this->hRGint->get_atom_pair(iap);
-        const int iat1 = ap.get_atom_i();
-        const int iat2 = ap.get_atom_j();
-        if (iat1 > iat2)
-        {
-            // fill lower triangle matrix with upper triangle matrix
-            // the upper <IJR> is <iat2, iat1>
-            const hamilt::AtomPair<double>* upper_ap = this->hRGint->find_pair(iat2, iat1);
-            const hamilt::AtomPair<double>* lower_ap = this->hRGint->find_pair(iat1, iat2);
-#ifdef __DEBUG
-            assert(upper_ap != nullptr);
-#endif
-            for (int ir = 0; ir < ap.get_R_size(); ir++)
-            {   
-                auto R_index = ap.get_R_index(ir);
-                auto upper_mat = upper_ap->find_matrix(-R_index);
-                auto lower_mat = lower_ap->find_matrix(R_index);
-                for (int irow = 0; irow < upper_mat->get_row_size(); ++irow)
-                {
-                    for (int icol = 0; icol < upper_mat->get_col_size(); ++icol)
-                    {
-                        lower_mat->get_value(icol, irow) = upper_ap->get_value(irow, icol);
-                    }
-                }
-            }
-        }
-    }
-#ifdef __MPI
-    int size = 0;
-    MPI_Comm_size(MPI_COMM_WORLD, &size);
-    if (size == 1)
-    {
-        hR->add(*this->hRGint);
-    }
-    else
-    {
-        hamilt::transferSerials2Parallels(*this->hRGint, hR);
-    }
-#else
-    hR->add(*this->hRGint);
-#endif
-    ModuleBase::timer::tick("Gint_k", "transfer_pvpR");
-    return;
-}
-
-// transfer_pvpR, NSPIN = 4
-void Gint_k::transfer_pvpR(hamilt::HContainer<std::complex<double>>* hR,
-                           const UnitCell* ucell_in,
-                           const Grid_Driver* gd)
-{
-    ModuleBase::TITLE("Gint_k", "transfer_pvpR");
-    ModuleBase::timer::tick("Gint_k", "transfer_pvpR");
-
-    this->hRGintCd->set_zero();
-    
-    for (int iap = 0; iap < this->hRGintCd->size_atom_pairs(); iap++)
-    {
-        auto* ap = &this->hRGintCd->get_atom_pair(iap);
-        const int iat1 = ap->get_atom_i();
-        const int iat2 = ap->get_atom_j();
-        if (iat1 <= iat2)
-        {
-            hamilt::AtomPair<std::complex<double>>* upper_ap = ap;
-            hamilt::AtomPair<std::complex<double>>* lower_ap = this->hRGintCd->find_pair(iat2, iat1);
-            const hamilt::AtomPair<double>* ap_nspin_0 = this->hr_gint_tmp[0]->find_pair(iat1, iat2);
-            const hamilt::AtomPair<double>* ap_nspin_3 = this->hr_gint_tmp[3]->find_pair(iat1, iat2);
-            for (int ir = 0; ir < upper_ap->get_R_size(); ir++)
-            {   
-                const auto R_index = upper_ap->get_R_index(ir);
-                auto upper_mat = upper_ap->find_matrix(R_index);
-                auto mat_nspin_0 = ap_nspin_0->find_matrix(R_index);
-                auto mat_nspin_3 = ap_nspin_3->find_matrix(R_index);
-
-                // The row size and the col size of upper_matrix is double that of matrix_nspin_0
-                for (int irow = 0; irow < mat_nspin_0->get_row_size(); ++irow)
-                {
-                    for (int icol = 0; icol < mat_nspin_0->get_col_size(); ++icol)
-                    {
-                        upper_mat->get_value(2*irow, 2*icol) = mat_nspin_0->get_value(irow, icol) + mat_nspin_3->get_value(irow, icol);
-                        upper_mat->get_value(2*irow+1, 2*icol+1) = mat_nspin_0->get_value(irow, icol) - mat_nspin_3->get_value(irow, icol);
-                    }
-                }
-
-                if (PARAM.globalv.domag)
-                {
-                    const hamilt::AtomPair<double>* ap_nspin_1 = this->hr_gint_tmp[1]->find_pair(iat1, iat2);
-                    const hamilt::AtomPair<double>* ap_nspin_2 = this->hr_gint_tmp[2]->find_pair(iat1, iat2);
-                    const auto mat_nspin_1 = ap_nspin_1->find_matrix(R_index);
-                    const auto mat_nspin_2 = ap_nspin_2->find_matrix(R_index);
-                    for (int irow = 0; irow < mat_nspin_1->get_row_size(); ++irow)
-                    {
-                        for (int icol = 0; icol < mat_nspin_1->get_col_size(); ++icol)
-                        {
-                            upper_mat->get_value(2*irow, 2*icol+1) = mat_nspin_1->get_value(irow, icol) +  std::complex<double>(0.0, 1.0) * mat_nspin_2->get_value(irow, icol);
-                            upper_mat->get_value(2*irow+1, 2*icol) = mat_nspin_1->get_value(irow, icol) -  std::complex<double>(0.0, 1.0) * mat_nspin_2->get_value(irow, icol);
-                        }
-                    }
-                }
-
-                // fill the lower triangle matrix
-                if (iat1 < iat2)
-                {
-                    auto lower_mat = lower_ap->find_matrix(-R_index);
-                    for (int irow = 0; irow < upper_mat->get_row_size(); ++irow)
-                    {
-                        for (int icol = 0; icol < upper_mat->get_col_size(); ++icol)
-                        {
-                            lower_mat->get_value(icol, irow) = conj(upper_mat->get_value(irow, icol));
-                        }
-                    }
-                }
-            }
-        }
-    }
-
-    // ===================================
-    // transfer HR from Gint to Veff<OperatorLCAO<std::complex<double>, std::complex<double>>>
-    // ===================================
-#ifdef __MPI
-    int size;
-    MPI_Comm_size(MPI_COMM_WORLD, &size);
-    if (size == 1)
-    {
-        hR->add(*this->hRGintCd);
-    }
-    else
-    {
-        hamilt::transferSerials2Parallels<std::complex<double>>(*this->hRGintCd, hR);
-    }
-#else
-    hR->add(*this->hRGintCd);
-#endif
-
-    ModuleBase::timer::tick("Gint_k", "transfer_pvpR");
-    return;
-}
diff --git a/source/source_lcao/module_gint/gint_k_sparse1.cpp b/source/source_lcao/module_gint/gint_k_sparse1.cpp
deleted file mode 100644
index ab0d8b60ef..0000000000
--- a/source/source_lcao/module_gint/gint_k_sparse1.cpp
+++ /dev/null
@@ -1,554 +0,0 @@
-#include "gint_k.h"
-#include "grid_technique.h"
-#include "source_io/module_parameter/parameter.h"
-#include "source_base/global_function.h"
-#include "source_base/global_variable.h"
-#include "source_base/memory.h"
-#include "source_base/parallel_reduce.h"
-#include "source_base/timer.h"
-#include "source_base/ylm.h"
-#include "source_basis/module_ao/ORB_read.h"
-#include "source_cell/module_neighbor/sltk_grid_driver.h"
-#include "source_pw/module_pwdft/global.h"
-
-void Gint_k::distribute_pvdpR_sparseMatrix(
-    const int current_spin,
-    const int dim,
-    const double& sparse_threshold,
-    const std::map<Abfs::Vector3_Order<int>, std::map<size_t, std::map<size_t, double>>>& pvdpR_sparseMatrix,
-    LCAO_HS_Arrays& HS_Arrays,
-    const Parallel_Orbitals* pv)
-{
-    ModuleBase::TITLE("Gint_k", "distribute_pvdpR_sparseMatrix");
-
-    int total_R_num = HS_Arrays.all_R_coor.size();
-    int* nonzero_num = new int[total_R_num];
-    int* minus_nonzero_num = new int[total_R_num];
-    ModuleBase::GlobalFunc::ZEROS(nonzero_num, total_R_num);
-    ModuleBase::GlobalFunc::ZEROS(minus_nonzero_num, total_R_num);
-    int count = 0;
-    for (auto& R_coor: HS_Arrays.all_R_coor)
-    {
-        auto iter = pvdpR_sparseMatrix.find(R_coor);
-        if (iter != pvdpR_sparseMatrix.end())
-        {
-            for (auto& row_loop: iter->second)
-            {
-                nonzero_num[count] += row_loop.second.size();
-            }
-        }
-
-        auto minus_R_coor = -1 * R_coor;
-
-        iter = pvdpR_sparseMatrix.find(minus_R_coor);
-        if (iter != pvdpR_sparseMatrix.end())
-        {
-            for (auto& row_loop: iter->second)
-            {
-                minus_nonzero_num[count] += row_loop.second.size();
-            }
-        }
-
-        count++;
-    }
-
-    Parallel_Reduce::reduce_all(nonzero_num, total_R_num);
-    Parallel_Reduce::reduce_all(minus_nonzero_num, total_R_num);
-    // Parallel_Reduce::reduce_pool(nonzero_num, total_R_num);
-    // Parallel_Reduce::reduce_pool(minus_nonzero_num, total_R_num);
-
-    double* tmp = nullptr;
-    tmp = new double[PARAM.globalv.nlocal];
-
-    count = 0;
-    for (auto& R_coor: HS_Arrays.all_R_coor)
-    {
-        if (nonzero_num[count] != 0 || minus_nonzero_num[count] != 0)
-        {
-            auto minus_R_coor = -1 * R_coor;
-
-            for (int row = 0; row < PARAM.globalv.nlocal; ++row)
-            {
-                ModuleBase::GlobalFunc::ZEROS(tmp, PARAM.globalv.nlocal);
-
-                auto iter = pvdpR_sparseMatrix.find(R_coor);
-                if (iter != pvdpR_sparseMatrix.end())
-                {
-
-                    if (this->gridt->trace_lo[row] >= 0)
-                    {
-                        auto row_iter = iter->second.find(row);
-                        if (row_iter != iter->second.end())
-                        {
-                            for (auto& value: row_iter->second)
-                            {
-                                tmp[value.first] = value.second;
-                            }
-                        }
-                    }
-                }
-
-                auto minus_R_iter = pvdpR_sparseMatrix.find(minus_R_coor);
-                if (minus_R_iter != pvdpR_sparseMatrix.end())
-                {
-                    for (int col = 0; col < row; ++col)
-                    {
-                        if (this->gridt->trace_lo[col] >= 0)
-                        {
-                            auto row_iter = minus_R_iter->second.find(col);
-                            if (row_iter != minus_R_iter->second.end())
-                            {
-                                auto col_iter = row_iter->second.find(row);
-                                if (col_iter != row_iter->second.end())
-                                {
-                                    tmp[col] = col_iter->second;
-                                }
-                            }
-                        }
-                    }
-                }
-
-                Parallel_Reduce::reduce_pool(tmp, PARAM.globalv.nlocal);
-
-                if (pv->global2local_row(row) >= 0)
-                {
-                    for (int col = 0; col < PARAM.globalv.nlocal; ++col)
-                    {
-                        if (pv->global2local_col(col) >= 0)
-                        {
-                            if (std::abs(tmp[col]) > sparse_threshold)
-                            {
-                                if (dim == 0)
-                                {
-                                    double& value = HS_Arrays.dHRx_sparse[current_spin][R_coor][row][col];
-                                    value += tmp[col];
-                                    if (std::abs(value) <= sparse_threshold)
-                                    {
-                                        HS_Arrays.dHRx_sparse[current_spin][R_coor][row].erase(col);
-                                    }
-                                }
-                                if (dim == 1)
-                                {
-                                    double& value = HS_Arrays.dHRy_sparse[current_spin][R_coor][row][col];
-                                    value += tmp[col];
-                                    if (std::abs(value) <= sparse_threshold)
-                                    {
-                                        HS_Arrays.dHRy_sparse[current_spin][R_coor][row].erase(col);
-                                    }
-                                }
-                                if (dim == 2)
-                                {
-                                    double& value = HS_Arrays.dHRz_sparse[current_spin][R_coor][row][col];
-                                    value += tmp[col];
-                                    if (std::abs(value) <= sparse_threshold)
-                                    {
-                                        HS_Arrays.dHRz_sparse[current_spin][R_coor][row].erase(col);
-                                    }
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-        }
-
-        count++;
-    }
-
-    delete[] nonzero_num;
-    delete[] minus_nonzero_num;
-    delete[] tmp;
-    nonzero_num = nullptr;
-    minus_nonzero_num = nullptr;
-    tmp = nullptr;
-
-    return;
-}
-
-void Gint_k::distribute_pvdpR_soc_sparseMatrix(
-    const int dim,
-    const double& sparse_threshold,
-    const std::map<Abfs::Vector3_Order<int>, std::map<size_t, std::map<size_t, std::complex<double>>>>&
-        pvdpR_soc_sparseMatrix,
-    LCAO_HS_Arrays& HS_Arrays,
-    const Parallel_Orbitals* pv)
-{
-    ModuleBase::TITLE("Gint_k", "distribute_pvdpR_soc_sparseMatrix");
-
-    int total_R_num = HS_Arrays.all_R_coor.size();
-    int* nonzero_num = new int[total_R_num];
-    int* minus_nonzero_num = new int[total_R_num];
-    ModuleBase::GlobalFunc::ZEROS(nonzero_num, total_R_num);
-    ModuleBase::GlobalFunc::ZEROS(minus_nonzero_num, total_R_num);
-    int count = 0;
-    for (auto& R_coor: HS_Arrays.all_R_coor)
-    {
-        auto iter = pvdpR_soc_sparseMatrix.find(R_coor);
-        if (iter != pvdpR_soc_sparseMatrix.end())
-        {
-            for (auto& row_loop: iter->second)
-            {
-                nonzero_num[count] += row_loop.second.size();
-            }
-        }
-
-        auto minus_R_coor = -1 * R_coor;
-
-        iter = pvdpR_soc_sparseMatrix.find(minus_R_coor);
-        if (iter != pvdpR_soc_sparseMatrix.end())
-        {
-            for (auto& row_loop: iter->second)
-            {
-                minus_nonzero_num[count] += row_loop.second.size();
-            }
-        }
-
-        count++;
-    }
-
-    Parallel_Reduce::reduce_all(nonzero_num, total_R_num);
-    Parallel_Reduce::reduce_all(minus_nonzero_num, total_R_num);
-    // Parallel_Reduce::reduce_pool(nonzero_num, total_R_num);
-    // Parallel_Reduce::reduce_pool(minus_nonzero_num, total_R_num);
-
-    std::complex<double>* tmp_soc = nullptr;
-    tmp_soc = new std::complex<double>[PARAM.globalv.nlocal];
-
-    count = 0;
-    for (auto& R_coor: HS_Arrays.all_R_coor)
-    {
-        if (nonzero_num[count] != 0 || minus_nonzero_num[count] != 0)
-        {
-            auto minus_R_coor = -1 * R_coor;
-
-            for (int row = 0; row < PARAM.globalv.nlocal; ++row)
-            {
-                ModuleBase::GlobalFunc::ZEROS(tmp_soc, PARAM.globalv.nlocal);
-
-                auto iter = pvdpR_soc_sparseMatrix.find(R_coor);
-                if (iter != pvdpR_soc_sparseMatrix.end())
-                {
-                    if (this->gridt->trace_lo[row] >= 0)
-                    {
-                        auto row_iter = iter->second.find(row);
-                        if (row_iter != iter->second.end())
-                        {
-                            for (auto& value: row_iter->second)
-                            {
-                                tmp_soc[value.first] = value.second;
-                            }
-                        }
-                    }
-                }
-
-                auto minus_R_iter = pvdpR_soc_sparseMatrix.find(minus_R_coor);
-                if (minus_R_iter != pvdpR_soc_sparseMatrix.end())
-                {
-                    for (int col = 0; col < row; ++col)
-                    {
-                        if (this->gridt->trace_lo[col] >= 0)
-                        {
-                            auto row_iter = minus_R_iter->second.find(col);
-                            if (row_iter != minus_R_iter->second.end())
-                            {
-                                auto col_iter = row_iter->second.find(row);
-                                if (col_iter != row_iter->second.end())
-                                {
-                                    tmp_soc[col] = conj(col_iter->second);
-                                }
-                            }
-                        }
-                    }
-                }
-
-                Parallel_Reduce::reduce_pool(tmp_soc, PARAM.globalv.nlocal);
-
-                if (pv->global2local_row(row) >= 0)
-                {
-                    for (int col = 0; col < PARAM.globalv.nlocal; ++col)
-                    {
-                        if (pv->global2local_col(col) >= 0)
-                        {
-                            if (std::abs(tmp_soc[col]) > sparse_threshold)
-                            {
-                                if (dim == 0)
-                                {
-                                    std::complex<double>& value = HS_Arrays.dHRx_soc_sparse[R_coor][row][col];
-                                    value += tmp_soc[col];
-                                    if (std::abs(value) <= sparse_threshold)
-                                    {
-                                        HS_Arrays.dHRx_soc_sparse[R_coor][row].erase(col);
-                                    }
-                                }
-                                if (dim == 1)
-                                {
-                                    std::complex<double>& value = HS_Arrays.dHRy_soc_sparse[R_coor][row][col];
-                                    value += tmp_soc[col];
-                                    if (std::abs(value) <= sparse_threshold)
-                                    {
-                                        HS_Arrays.dHRy_soc_sparse[R_coor][row].erase(col);
-                                    }
-                                }
-                                if (dim == 2)
-                                {
-                                    std::complex<double>& value = HS_Arrays.dHRz_soc_sparse[R_coor][row][col];
-                                    value += tmp_soc[col];
-                                    if (std::abs(value) <= sparse_threshold)
-                                    {
-                                        HS_Arrays.dHRz_soc_sparse[R_coor][row].erase(col);
-                                    }
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-        }
-
-        count++;
-    }
-
-    delete[] nonzero_num;
-    delete[] minus_nonzero_num;
-    delete[] tmp_soc;
-    nonzero_num = nullptr;
-    minus_nonzero_num = nullptr;
-    tmp_soc = nullptr;
-
-    return;
-}
-
-void Gint_k::cal_dvlocal_R_sparseMatrix(const int& current_spin,
-                                        const double& sparse_threshold,
-                                        LCAO_HS_Arrays& HS_Arrays,
-                                        const Parallel_Orbitals* pv,
-                                        const UnitCell& ucell,
-                                        const Grid_Driver& gdriver)
-{
-    ModuleBase::TITLE("Gint_k", "cal_dvlocal_R_sparseMatrix");
-
-    std::map<Abfs::Vector3_Order<int>, std::map<size_t, std::map<size_t, double>>> pvdpRx_sparseMatrix;
-    std::map<Abfs::Vector3_Order<int>, std::map<size_t, std::map<size_t, double>>> pvdpRy_sparseMatrix;
-    std::map<Abfs::Vector3_Order<int>, std::map<size_t, std::map<size_t, double>>> pvdpRz_sparseMatrix;
-    std::map<Abfs::Vector3_Order<int>, std::map<size_t, std::map<size_t, std::complex<double>>>>
-        pvdpRx_soc_sparseMatrix;
-    std::map<Abfs::Vector3_Order<int>, std::map<size_t, std::map<size_t, std::complex<double>>>>
-        pvdpRy_soc_sparseMatrix;
-    std::map<Abfs::Vector3_Order<int>, std::map<size_t, std::map<size_t, std::complex<double>>>>
-        pvdpRz_soc_sparseMatrix;
-
-    double temp_value_double;
-    std::complex<double> temp_value_complex;
-
-    ModuleBase::Vector3<double> tau1, dtau;
-    for (int iap = 0; iap < pvdpRx_reduced[0].size_atom_pairs(); iap++)
-    {
-        const auto& ap = pvdpRx_reduced[0].get_atom_pair(iap);
-        const int iat1 = ap.get_atom_i();
-        const int iat2 = ap.get_atom_j();
-        const int it1 = ucell.iat2it[iat1];
-        const int it2 = ucell.iat2it[iat2];
-        const Atom* atom1 = &ucell.atoms[it1];
-        const Atom* atom2 = &ucell.atoms[it2];
-        const int start1 = ucell.itiaiw2iwt(it1, ucell.iat2ia[iat1], 0);
-        const int start2 = ucell.itiaiw2iwt(it2, ucell.iat2ia[iat2], 0);
-
-        for (int ir = 0; ir < ap.get_R_size(); ir++)
-        {
-            const ModuleBase::Vector3<int> R = ap.get_R_index(ir);
-            Abfs::Vector3_Order<int> dR(R.x, R.y, R.z);
-            std::vector<double *> pvdpRx;
-            std::vector<double *> pvdpRy;
-            std::vector<double *> pvdpRz;
-            for(int i = 0; i < PARAM.inp.nspin; i++)
-            {
-                pvdpRx.push_back(pvdpRx_reduced[i].get_atom_pair(iap).get_pointer(ir));
-                pvdpRy.push_back(pvdpRy_reduced[i].get_atom_pair(iap).get_pointer(ir));
-                pvdpRz.push_back(pvdpRz_reduced[i].get_atom_pair(iap).get_pointer(ir));
-            }
-
-            for (int iw = 0; iw < atom1->nw * PARAM.globalv.npol; iw++)
-            {
-                for (int iw2 = 0; iw2 < atom2->nw * PARAM.globalv.npol; iw2++)
-                {
-                    const int nw = atom2->nw;
-                    const int mug0 = iw / PARAM.globalv.npol;
-                    const int nug0 = iw2 / PARAM.globalv.npol;
-                    const int iw_nowg = mug0 * nw + nug0;
-
-                    if (PARAM.inp.nspin == 4)
-                    {
-                        // pvp is symmetric, only half is calculated.
-
-                        if (iw % 2 == 0 && iw2 % 2 == 0)
-                        {
-                            // spin = 0;
-                            temp_value_complex
-                                = std::complex<double>(1.0, 0.0) * pvdpRx[0][iw_nowg]
-                                    + std::complex<double>(1.0, 0.0) * pvdpRx[3][iw_nowg];
-
-                            if (std::abs(temp_value_complex) > sparse_threshold)
-                            {
-                                pvdpRx_soc_sparseMatrix[dR][start1 + iw][start2 + iw2]
-                                    = temp_value_complex;
-                            }
-
-                            temp_value_complex
-                                = std::complex<double>(1.0, 0.0) * pvdpRy[0][iw_nowg]
-                                    + std::complex<double>(1.0, 0.0) * pvdpRy[3][iw_nowg];
-
-                            if (std::abs(temp_value_complex) > sparse_threshold)
-                            {
-                                pvdpRy_soc_sparseMatrix[dR][start1 + iw][start2 + iw2]
-                                    = temp_value_complex;
-                            }
-                            temp_value_complex
-                                = std::complex<double>(1.0, 0.0) * pvdpRz[0][iw_nowg]
-                                    + std::complex<double>(1.0, 0.0) * pvdpRz[3][iw_nowg];
-
-                            if (std::abs(temp_value_complex) > sparse_threshold)
-                            {
-                                pvdpRz_soc_sparseMatrix[dR][start1 + iw][start2 + iw2]
-                                    = temp_value_complex;
-                            }
-                        }
-                        else if (iw % 2 == 1 && iw2 % 2 == 1)
-                        {
-                            // spin = 3;
-                            temp_value_complex
-                                = std::complex<double>(1.0, 0.0) * pvdpRx[0][iw_nowg]
-                                    - std::complex<double>(1.0, 0.0) * pvdpRx[3][iw_nowg];
-                            if (std::abs(temp_value_complex) > sparse_threshold)
-                            {
-                                pvdpRx_soc_sparseMatrix[dR][start1 + iw][start2 + iw2]
-                                    = temp_value_complex;
-                            }
-                            temp_value_complex
-                                = std::complex<double>(1.0, 0.0) * pvdpRy[0][iw_nowg]
-                                    - std::complex<double>(1.0, 0.0) * pvdpRy[3][iw_nowg];
-                            if (std::abs(temp_value_complex) > sparse_threshold)
-                            {
-                                pvdpRy_soc_sparseMatrix[dR][start1 + iw][start2 + iw2]
-                                    = temp_value_complex;
-                            }
-                            temp_value_complex
-                                = std::complex<double>(1.0, 0.0) * pvdpRz[0][iw_nowg]
-                                    - std::complex<double>(1.0, 0.0) * pvdpRz[3][iw_nowg];
-                            if (std::abs(temp_value_complex) > sparse_threshold)
-                            {
-                                pvdpRz_soc_sparseMatrix[dR][start1 + iw][start2 + iw2]
-                                    = temp_value_complex;
-                            }
-                        }
-                        else if (iw % 2 == 0 && iw2 % 2 == 1)
-                        {
-                            // spin = 1;
-                            if (!PARAM.globalv.domag)
-                            {
-                                // do nothing
-                            }
-                            else
-                            {
-                                temp_value_complex
-                                    = pvdpRx[1][iw_nowg]
-                                        - std::complex<double>(0.0, 1.0) * pvdpRx[2][iw_nowg];
-                                if (std::abs(temp_value_complex) > sparse_threshold)
-                                {
-                                    pvdpRx_soc_sparseMatrix[dR][start1 + iw][start2 + iw2]
-                                        = temp_value_complex;
-                                }
-                                temp_value_complex
-                                    = pvdpRy[1][iw_nowg]
-                                        - std::complex<double>(0.0, 1.0) * pvdpRy[2][iw_nowg];
-                                if (std::abs(temp_value_complex) > sparse_threshold)
-                                {
-                                    pvdpRy_soc_sparseMatrix[dR][start1 + iw][start2 + iw2]
-                                        = temp_value_complex;
-                                }
-                                temp_value_complex
-                                    = pvdpRz[1][iw_nowg]
-                                        - std::complex<double>(0.0, 1.0) * pvdpRz[2][iw_nowg];
-                                if (std::abs(temp_value_complex) > sparse_threshold)
-                                {
-                                    pvdpRz_soc_sparseMatrix[dR][start1 + iw][start2 + iw2]
-                                        = temp_value_complex;
-                                }
-                            }
-                        }
-                        else if (iw % 2 == 1 && iw2 % 2 == 0)
-                        {
-                            // spin = 2;
-                            if (!PARAM.globalv.domag)
-                            {
-                                // do nothing
-                            }
-                            else
-                            {
-                                temp_value_complex
-                                    = pvdpRx[1][iw_nowg]
-                                        + std::complex<double>(0.0, 1.0) * pvdpRx[2][iw_nowg];
-                                if (std::abs(temp_value_complex) > sparse_threshold)
-                                {
-                                    pvdpRx_soc_sparseMatrix[dR][start1 + iw][start2 + iw2]
-                                        = temp_value_complex;
-                                }
-                                temp_value_complex
-                                    = pvdpRy[1][iw_nowg]
-                                        + std::complex<double>(0.0, 1.0) * pvdpRy[2][iw_nowg];
-                                if (std::abs(temp_value_complex) > sparse_threshold)
-                                {
-                                    pvdpRy_soc_sparseMatrix[dR][start1 + iw][start2 + iw2]
-                                        = temp_value_complex;
-                                }
-                                temp_value_complex
-                                    = pvdpRz[1][iw_nowg]
-                                        + std::complex<double>(0.0, 1.0) * pvdpRz[2][iw_nowg];
-                                if (std::abs(temp_value_complex) > sparse_threshold)
-                                {
-                                    pvdpRz_soc_sparseMatrix[dR][start1 + iw][start2 + iw2]
-                                        = temp_value_complex;
-                                }
-                            }
-                        }
-                        else
-                        {
-                            ModuleBase::WARNING_QUIT("Gint_k::folding_vl_k_nc", "index is wrong!");
-                        }
-                    } // endif NC
-                    else
-                    {
-                        temp_value_double = pvdpRx[current_spin][iw_nowg];
-                        if (std::abs(temp_value_double) > sparse_threshold)
-                        {
-                            pvdpRx_sparseMatrix[dR][start1 + iw][start2 + iw2] = temp_value_double;
-                        }
-                        temp_value_double = pvdpRy[current_spin][iw_nowg];
-                        if (std::abs(temp_value_double) > sparse_threshold)
-                        {
-                            pvdpRy_sparseMatrix[dR][start1 + iw][start2 + iw2] = temp_value_double;
-                        }
-                        temp_value_double = pvdpRz[current_spin][iw_nowg];
-                        if (std::abs(temp_value_double) > sparse_threshold)
-                        {
-                            pvdpRz_sparseMatrix[dR][start1 + iw][start2 + iw2] = temp_value_double;
-                        }
-                    } // endif normal
-                }
-            }
-        }
-    }
-    if (PARAM.inp.nspin != 4)
-    {
-        distribute_pvdpR_sparseMatrix(current_spin, 0, sparse_threshold, pvdpRx_sparseMatrix, HS_Arrays, pv);
-        distribute_pvdpR_sparseMatrix(current_spin, 1, sparse_threshold, pvdpRy_sparseMatrix, HS_Arrays, pv);
-        distribute_pvdpR_sparseMatrix(current_spin, 2, sparse_threshold, pvdpRz_sparseMatrix, HS_Arrays, pv);
-    }
-    else
-    {
-        distribute_pvdpR_soc_sparseMatrix(0, sparse_threshold, pvdpRx_soc_sparseMatrix, HS_Arrays, pv);
-        distribute_pvdpR_soc_sparseMatrix(1, sparse_threshold, pvdpRy_soc_sparseMatrix, HS_Arrays, pv);
-        distribute_pvdpR_soc_sparseMatrix(2, sparse_threshold, pvdpRz_soc_sparseMatrix, HS_Arrays, pv);
-    }
-
-    return;
-}
diff --git a/source/source_lcao/module_gint/gint_old.cpp b/source/source_lcao/module_gint/gint_old.cpp
deleted file mode 100644
index 73b666581c..0000000000
--- a/source/source_lcao/module_gint/gint_old.cpp
+++ /dev/null
@@ -1,306 +0,0 @@
-#include "gint.h"
-
-#include "source_io/module_parameter/parameter.h"
-#if ((defined __CUDA))
-#include "gint_force_gpu.h"
-#include "gint_rho_gpu.h"
-#include "gint_vl_gpu.h"
-#endif
-
-#include "source_base/memory.h"
-#include "source_base/timer.h"
-#include "source_basis/module_ao/ORB_read.h"
-#include "source_lcao/module_hcontainer/hcontainer_funcs.h"
-#include "source_pw/module_pwdft/global.h"
-#ifdef _OPENMP
-#include <omp.h>
-#endif
-
-#ifdef __MKL
-#include <mkl_service.h>
-#endif
-
-Gint::~Gint() {
-
-    delete this->hRGint;
-    delete this->hRGintCd;
-    // in gamma_only case, dmr_gint.size()=0, 
-    // in multi-k case, dmr_gint.size()=nspin
-    for (int is = 0; is < this->dmr_gint.size(); is++) {
-        delete this->dmr_gint[is];
-    }
-    for(int is = 0; is < this->hr_gint_tmp .size(); is++) {
-        delete this->hr_gint_tmp [is];
-    }
-#ifdef __MPI
-    delete this->dm2d_tmp;
-#endif
-}
-
-void Gint::cal_gint(Gint_inout* inout) {
-    ModuleBase::TITLE("Gint_interface", "cal_gint");
-    ModuleBase::timer::tick("Gint_interface", "cal_gint");
-    // In multi-process environments,
-    // some processes may not be allocated any data.
-    if (this->gridt->get_init_malloced() == false) {
-        ModuleBase::WARNING_QUIT("Gint_interface::cal_gint",
-                                 "gridt has not been allocated yet!");
-    }
-    if (this->gridt->max_atom > 0) {
-#ifdef __CUDA
-        if (PARAM.inp.device == "gpu"
-            && (inout->job == Gint_Tools::job_type::vlocal
-                || inout->job == Gint_Tools::job_type::rho
-                || inout->job == Gint_Tools::job_type::force)) {
-            if (inout->job == Gint_Tools::job_type::vlocal) {
-                gpu_vlocal_interface(inout);
-            } else if (inout->job == Gint_Tools::job_type::rho) {
-                gpu_rho_interface(inout);
-            } else if (inout->job == Gint_Tools::job_type::force) {
-                gpu_force_interface(inout);
-            }
-        } else
-#endif
-        {
-#ifdef __MKL
-            const int mkl_threads = mkl_get_max_threads();
-            mkl_set_num_threads(mkl_threads);
-#endif
-            {
-                if (inout->job == Gint_Tools::job_type::vlocal) {
-                    gint_kernel_vlocal(inout);
-                } else if (inout->job == Gint_Tools::job_type::dvlocal) {
-                    gint_kernel_dvlocal(inout);
-                } else if (inout->job == Gint_Tools::job_type::vlocal_meta) {
-                    gint_kernel_vlocal_meta(inout);
-                } else if (inout->job == Gint_Tools::job_type::rho) {
-                    gint_kernel_rho(inout);
-                } else if (inout->job == Gint_Tools::job_type::tau) {
-                    gint_kernel_tau(inout);
-                } else if (inout->job == Gint_Tools::job_type::force) {
-                    gint_kernel_force(inout);
-                } else if (inout->job == Gint_Tools::job_type::force_meta) {
-                    gint_kernel_force_meta(inout);
-                }
-            }
-        }
-    }
-    ModuleBase::timer::tick("Gint_interface", "cal_gint");
-    return;
-}
-void Gint::prep_grid(const Grid_Technique& gt,
-                     const int& nbx_in,
-                     const int& nby_in,
-                     const int& nbz_in,
-                     const int& nbz_start_in,
-                     const int& ncxyz_in,
-                     const int& bx_in,
-                     const int& by_in,
-                     const int& bz_in,
-                     const int& bxyz_in,
-                     const int& nbxx_in,
-                     const int& ny_in,
-                     const int& nplane_in,
-                     const int& startz_current_in,
-                     const UnitCell* ucell_in,
-                     const LCAO_Orbitals* orb_in) {
-    ModuleBase::TITLE(GlobalV::ofs_running, "Gint_k", "prep_grid");
-
-    this->gridt = &gt;
-    this->nbx = nbx_in;
-    this->nby = nby_in;
-    this->nbz = nbz_in;
-    this->ncxyz = ncxyz_in;
-    this->nbz_start = nbz_start_in;
-    this->bx = bx_in;
-    this->by = by_in;
-    this->bz = bz_in;
-    this->bxyz = bxyz_in;
-    this->nbxx = nbxx_in;
-    this->ny = ny_in;
-    this->nplane = nplane_in;
-    this->startz_current = startz_current_in;
-    this->ucell = ucell_in;
-    assert(nbx > 0);
-    assert(nby > 0);
-    assert(nbz >= 0);
-    assert(ncxyz > 0);
-    assert(bx > 0);
-    assert(by > 0);
-    assert(bz > 0);
-    assert(bxyz > 0);
-    assert(nbxx >= 0);
-    assert(ny > 0);
-    assert(nplane >= 0);
-    assert(startz_current >= 0);
-    assert(this->ucell->omega > 0.0);
-
-    return;
-}
-
-void Gint::initialize_pvpR(const UnitCell& ucell_in, const Grid_Driver* gd, const int& nspin)
-{
-    ModuleBase::TITLE("Gint", "initialize_pvpR");
-    int npol = 1;
-    // there is the only resize code of dmr_gint
-    if (this->dmr_gint.size() == 0) {
-        this->dmr_gint.resize(nspin);
-    }
-    hr_gint_tmp.resize(nspin);
-    if (nspin != 4) {
-        if (this->hRGint != nullptr) {
-            delete this->hRGint;
-        }
-        this->hRGint = new hamilt::HContainer<double>(ucell_in.nat);
-    } else {
-        npol = 2;
-        if (this->hRGintCd != nullptr) {
-            delete this->hRGintCd;
-        }
-        this->hRGintCd
-            = new hamilt::HContainer<std::complex<double>>(ucell_in.nat);
-        for (int is = 0; is < nspin; is++) {
-            if (this->dmr_gint[is] != nullptr) {
-                delete this->dmr_gint[is];
-            }
-            if (this->hr_gint_tmp[is] != nullptr) {
-                delete this->hr_gint_tmp[is];
-            }
-            this->dmr_gint[is] = new hamilt::HContainer<double>(ucell_in.nat);
-            this->hr_gint_tmp[is] = new hamilt::HContainer<double>(ucell_in.nat);
-        }
-#ifdef __MPI
-        if (this->dm2d_tmp != nullptr) {
-            delete this->dm2d_tmp;
-        }
-#endif
-    }
-    if (PARAM.globalv.gamma_only_local && nspin != 4) {
-        this->hRGint->fix_gamma();
-    }
-    if (npol == 1) {
-        this->hRGint->insert_ijrs(this->gridt->get_ijr_info(), ucell_in);
-        this->hRGint->allocate(nullptr, true);
-        ModuleBase::Memory::record("Gint::hRGint",
-                            this->hRGint->get_memory_size());
-        // initialize dmr_gint with hRGint when NSPIN != 4
-        for (int is = 0; is < this->dmr_gint.size(); is++) {
-            if (this->dmr_gint[is] != nullptr) {
-                delete this->dmr_gint[is];
-            }
-            this->dmr_gint[is] = new hamilt::HContainer<double>(*this->hRGint);
-        }
-        ModuleBase::Memory::record("Gint::dmr_gint",
-                                   this->dmr_gint[0]->get_memory_size()
-                                       * this->dmr_gint.size());
-    } else {
-        this->hRGintCd->insert_ijrs(this->gridt->get_ijr_info(), ucell_in, npol);
-        this->hRGintCd->allocate(nullptr, true);
-        for(int is = 0; is < nspin; is++) {
-            this->hr_gint_tmp[is]->insert_ijrs(this->gridt->get_ijr_info(), ucell_in);
-            this->dmr_gint[is]->insert_ijrs(this->gridt->get_ijr_info(), ucell_in);
-            this->hr_gint_tmp[is]->allocate(nullptr, true);
-            this->dmr_gint[is]->allocate(nullptr, true);
-        }
-        ModuleBase::Memory::record("Gint::hr_gint_tmp",
-                                       this->hr_gint_tmp[0]->get_memory_size()*nspin);
-        ModuleBase::Memory::record("Gint::dmr_gint",
-                                       this->dmr_gint[0]->get_memory_size()
-                                           * this->dmr_gint.size()*nspin);
-    }
-}
-
-void Gint::reset_DMRGint(const int& nspin)
-{
-    if (this->hRGint)
-    {
-        for (auto& d : this->dmr_gint) { delete d; }
-        this->dmr_gint.resize(nspin);
-        this->dmr_gint.shrink_to_fit();
-        for (auto& d : this->dmr_gint) { d = new hamilt::HContainer<double>(*this->hRGint); }
-        if (nspin == 4)
-        {
-            for (auto& d : this->dmr_gint) { d->allocate(nullptr, false); }
-#ifdef __MPI
-            delete this->dm2d_tmp;
-#endif
-        }
-    }
-}
-
-void Gint::transfer_DM2DtoGrid(std::vector<hamilt::HContainer<double>*> dm2d) {
-    ModuleBase::TITLE("Gint", "transfer_DMR");
-    // To check whether input parameter dm2d has been initialized
-#ifdef __DEBUG
-    assert(!dm2d.empty()
-           && "Input parameter dm2d has not been initialized while calling "
-              "function transfer_DM2DtoGrid!");
-#endif
-    ModuleBase::timer::tick("Gint", "transfer_DMR");
-    if (PARAM.inp.nspin != 4) {
-        for (int is = 0; is < this->dmr_gint.size(); is++) {
-#ifdef __MPI
-            hamilt::transferParallels2Serials(*dm2d[is], dmr_gint[is]);
-#else
-            this->dmr_gint[is]->set_zero();
-            this->dmr_gint[is]->add(*dm2d[is]);
-#endif
-        }
-    } else // NSPIN=4 case
-    {
-        // is=0:↑↑, 1:↑↓, 2:↓↑, 3:↓↓
-        const int row_set[4] = {0, 0, 1, 1};
-        const int col_set[4] = {0, 1, 0, 1};
-        int mg = dm2d[0]->get_paraV()->get_global_row_size()/2;
-        int ng = dm2d[0]->get_paraV()->get_global_col_size()/2;
-        int nb = dm2d[0]->get_paraV()->get_block_size()/2;
-        auto ijr_info = dm2d[0]->get_ijr_info();
-#ifdef __MPI
-        int blacs_ctxt = dm2d[0]->get_paraV()->blacs_ctxt;
-        std::vector<int> iat2iwt(ucell->nat);
-        for (int iat = 0; iat < ucell->nat; iat++) {
-            iat2iwt[iat] = ucell->get_iat2iwt()[iat]/2;
-        }
-        Parallel_Orbitals pv{};
-        pv.set(mg, ng, nb, blacs_ctxt);
-        pv.set_atomic_trace(iat2iwt.data(), ucell->nat, mg);
-        this-> dm2d_tmp = new hamilt::HContainer<double>(&pv, nullptr, &ijr_info);
-#else
-        if (this->dm2d_tmp != nullptr) {
-            delete this->dm2d_tmp;
-        }
-        this-> dm2d_tmp = new hamilt::HContainer<double>(*this->hRGint);
-        this-> dm2d_tmp -> insert_ijrs(this->gridt->get_ijr_info(), *(this->ucell));
-        this-> dm2d_tmp -> allocate(nullptr, true);
-#endif
-        ModuleBase::Memory::record("Gint::dm2d_tmp", this->dm2d_tmp->get_memory_size());
-        for (int is = 0; is < 4; is++){
-            for (int iap = 0; iap < dm2d[0]->size_atom_pairs(); ++iap) {
-                auto& ap = dm2d[0]->get_atom_pair(iap);
-                int iat1 = ap.get_atom_i();
-                int iat2 = ap.get_atom_j();
-                for (int ir = 0; ir < ap.get_R_size(); ++ir) {
-                    const ModuleBase::Vector3<int> r_index = ap.get_R_index(ir);
-                    double* matrix_out = this -> dm2d_tmp -> find_matrix(iat1, iat2, r_index)->get_pointer();
-                    double* matrix_in = ap.get_pointer(ir);
-                    for (int irow = 0; irow < ap.get_row_size()/2; irow ++) {
-                        for (int icol = 0; icol < ap.get_col_size()/2; icol++){
-                            int index_i = irow* ap.get_col_size()/2 + icol;
-                            int index_j = (irow*2+row_set[is]) * ap.get_col_size() + icol*2+col_set[is];
-                            matrix_out[index_i] = matrix_in[index_j];
-                        }
-                    }
-                }
-            }
-#ifdef __MPI
-            hamilt::transferParallels2Serials( *(this->dm2d_tmp), this->dmr_gint[is]);
-#else
-            this->dmr_gint[is]->set_zero();
-            this->dmr_gint[is]->add(*(this->dm2d_tmp));
-#endif
-        }//is=4
-        delete this->dm2d_tmp;
-        this->dm2d_tmp = nullptr;
-    }
-    ModuleBase::timer::tick("Gint", "transfer_DMR");
-}
\ No newline at end of file
diff --git a/source/source_lcao/module_gint/temp_gint/gint_rho.cpp b/source/source_lcao/module_gint/gint_rho.cpp
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/gint_rho.cpp
rename to source/source_lcao/module_gint/gint_rho.cpp
diff --git a/source/source_lcao/module_gint/temp_gint/gint_rho.h b/source/source_lcao/module_gint/gint_rho.h
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/gint_rho.h
rename to source/source_lcao/module_gint/gint_rho.h
diff --git a/source/source_lcao/module_gint/gint_rho_cpu_interface.cpp b/source/source_lcao/module_gint/gint_rho_cpu_interface.cpp
deleted file mode 100644
index 2f41152fc3..0000000000
--- a/source/source_lcao/module_gint/gint_rho_cpu_interface.cpp
+++ /dev/null
@@ -1,197 +0,0 @@
-#include "gint.h"
-#include "source_base/memory.h"
-#include "source_io/module_parameter/parameter.h"
-#include "source_base/timer.h"
-
-void Gint::gint_kernel_rho(Gint_inout* inout) {
-    ModuleBase::TITLE("Gint_interface", "cal_gint_rho");
-    ModuleBase::timer::tick("Gint_interface", "cal_gint_rho");
-    const int max_size = this->gridt->max_atom;
-    const int ncyz = this->ny * this->nplane;
-    const double delta_r = this->gridt->dr_uniform;
-
-#pragma omp parallel
-{
-    std::vector<int> block_iw(max_size, 0);
-    std::vector<int> block_index(max_size+1, 0);
-    std::vector<int> block_size(max_size, 0);
-    std::vector<int> vindex(this->bxyz, 0);
-#pragma omp for schedule(dynamic)
-    for (int grid_index = 0; grid_index < this->nbxx; grid_index++)
-    {
-        const int na_grid = this->gridt->how_many_atoms[grid_index];
-        if (na_grid == 0) {
-            continue;
-        }
-        Gint_Tools::get_vindex(this->bxyz,
-                                    this->bx,
-                                    this->by,
-                                    this->bz,
-                                    this->nplane,
-                                    this->gridt->start_ind[grid_index],
-                                    ncyz,
-                                    vindex.data());
-         // prepare block information
-        ModuleBase::Array_Pool<bool> cal_flag(this->bxyz,max_size);
-        Gint_Tools::get_block_info(*this->gridt,
-                                this->bxyz,
-                                na_grid,
-                                grid_index,
-                                block_iw.data(),
-                                block_index.data(),
-                                block_size.data(),
-                                cal_flag.get_ptr_2D());
-
-        // evaluate psi on grids
-        const int LD_pool = block_index[na_grid];
-        ModuleBase::Array_Pool<double> psir_ylm(this->bxyz, LD_pool);
-        Gint_Tools::cal_psir_ylm(*this->gridt,
-                                this->bxyz,
-                                na_grid,
-                                grid_index,
-                                delta_r,
-                                block_index.data(),
-                                block_size.data(),
-                                cal_flag.get_ptr_2D(),
-                                psir_ylm.get_ptr_2D());
-
-        for (int is = 0; is < inout->nspin_rho; ++is)
-        {
-            // psir_ylm_new = psir_func(psir_ylm)
-            // psir_func==nullptr means psir_ylm_new=psir_ylm
-            const ModuleBase::Array_Pool<double> &psir_ylm_1 = (!this->psir_func_1) ? psir_ylm : this->psir_func_1(psir_ylm, *this->gridt, grid_index, is, block_iw, block_size, block_index, cal_flag);
-            const ModuleBase::Array_Pool<double> &psir_ylm_2 = (!this->psir_func_2) ? psir_ylm : this->psir_func_2(psir_ylm, *this->gridt, grid_index, is, block_iw, block_size, block_index, cal_flag);
-
-            ModuleBase::Array_Pool<double> psir_DM(this->bxyz, LD_pool);
-            ModuleBase::GlobalFunc::ZEROS(psir_DM.get_ptr_1D(), this->bxyz * LD_pool);
-
-            // calculating g_mu(r) = sum_nu rho_mu,nu psi_nu(r)
-            Gint_Tools::mult_psi_DMR(*this->gridt,
-                                    this->bxyz,
-                                    LD_pool,
-                                    grid_index,
-                                    na_grid,
-                                    block_index.data(),
-                                    block_size.data(),
-                                    cal_flag.get_ptr_2D(),
-                                    psir_ylm_1.get_ptr_2D(),
-                                    psir_DM.get_ptr_2D(),
-                                    this->dmr_gint[is],
-                                    inout->if_symm);
-
-            // do sum_mu g_mu(r)psi_mu(r) to get electron density on grid
-            this->cal_meshball_rho(na_grid, block_index.data(), vindex.data(), psir_ylm_2.get_ptr_2D(), psir_DM.get_ptr_2D(), inout->rho[is]);
-        }
-    }
-}
-    ModuleBase::TITLE("Gint_interface", "cal_gint_rho");
-    ModuleBase::timer::tick("Gint_interface", "cal_gint_rho");
-}
-
-void Gint::gint_kernel_tau(Gint_inout* inout) {
-    ModuleBase::TITLE("Gint_interface", "cal_gint_tau");
-    ModuleBase::timer::tick("Gint_interface", "cal_gint_tau");
-    const int max_size = this->gridt->max_atom;
-    const int ncyz = this->ny * this->nplane;
-    const double delta_r = this->gridt->dr_uniform;
-
-
-#pragma omp parallel
-{
-    std::vector<int> block_iw(max_size, 0);
-    std::vector<int> block_index(max_size+1, 0);
-    std::vector<int> block_size(max_size, 0);
-    std::vector<int> vindex(bxyz, 0);
-#pragma omp for schedule(dynamic)
-    for (int grid_index = 0; grid_index < this->nbxx; grid_index++)
-    {
-        const int na_grid = this->gridt->how_many_atoms[grid_index];
-        if (na_grid == 0) {
-            continue;
-        }
-        Gint_Tools::get_vindex(this->bxyz,
-                                this->bx,
-                                this->by,
-                                this->bz,
-                                this->nplane,
-                                this->gridt->start_ind[grid_index],
-                                ncyz,
-                                vindex.data());
-        //prepare block information
-        ModuleBase::Array_Pool<bool> cal_flag(this->bxyz,max_size);
-        Gint_Tools::get_block_info(*this->gridt, this->bxyz, na_grid, grid_index,
-                                            block_iw.data(), block_index.data(), block_size.data(), cal_flag.get_ptr_2D());
-
-        //evaluate psi and dpsi on grids
-        const int LD_pool = block_index[na_grid];
-        ModuleBase::Array_Pool<double> psir_ylm(this->bxyz, LD_pool);
-        ModuleBase::Array_Pool<double> dpsir_ylm_x(this->bxyz, LD_pool);
-        ModuleBase::Array_Pool<double> dpsir_ylm_y(this->bxyz, LD_pool);
-        ModuleBase::Array_Pool<double> dpsir_ylm_z(this->bxyz, LD_pool);
-
-        Gint_Tools::cal_dpsir_ylm(*this->gridt,
-            this->bxyz, na_grid, grid_index, delta_r,
-            block_index.data(), block_size.data(),
-            cal_flag.get_ptr_2D(),
-            psir_ylm.get_ptr_2D(),
-            dpsir_ylm_x.get_ptr_2D(),
-            dpsir_ylm_y.get_ptr_2D(),
-            dpsir_ylm_z.get_ptr_2D());
-
-        for(int is=0; is<PARAM.inp.nspin; ++is)
-        {
-            ModuleBase::Array_Pool<double> dpsix_DM(this->bxyz, LD_pool);
-            ModuleBase::Array_Pool<double> dpsiy_DM(this->bxyz, LD_pool);
-            ModuleBase::Array_Pool<double> dpsiz_DM(this->bxyz, LD_pool);
-            ModuleBase::GlobalFunc::ZEROS(dpsix_DM.get_ptr_1D(), this->bxyz*LD_pool);
-            ModuleBase::GlobalFunc::ZEROS(dpsiy_DM.get_ptr_1D(), this->bxyz*LD_pool);
-            ModuleBase::GlobalFunc::ZEROS(dpsiz_DM.get_ptr_1D(), this->bxyz*LD_pool);
-
-            //calculating g_i,mu(r) = sum_nu rho_mu,nu d/dx_i psi_nu(r), x_i=x,y,z
-            Gint_Tools::mult_psi_DMR(
-                *this->gridt, this->bxyz,
-                LD_pool,
-                grid_index, na_grid,
-                block_index.data(), block_size.data(),
-                cal_flag.get_ptr_2D(),
-                dpsir_ylm_x.get_ptr_2D(),
-                dpsix_DM.get_ptr_2D(),
-                this->dmr_gint[is],
-                true);
-            Gint_Tools::mult_psi_DMR(
-                *this->gridt, this->bxyz,
-                LD_pool,
-                grid_index, na_grid,
-                block_index.data(), block_size.data(),
-                cal_flag.get_ptr_2D(),
-                dpsir_ylm_y.get_ptr_2D(),
-                dpsiy_DM.get_ptr_2D(),
-                this->dmr_gint[is],
-                true);
-            Gint_Tools::mult_psi_DMR(
-                *this->gridt, this->bxyz,
-                LD_pool,
-                grid_index, na_grid,
-                block_index.data(), block_size.data(),
-                cal_flag.get_ptr_2D(),
-                dpsir_ylm_z.get_ptr_2D(),
-                dpsiz_DM.get_ptr_2D(),
-                this->dmr_gint[is],
-                true);
-
-            //do sum_i,mu g_i,mu(r) * d/dx_i psi_mu(r) to get kinetic energy density on grid
-            if(inout->job==Gint_Tools::job_type::tau)
-            {
-                this->cal_meshball_tau(
-                    na_grid, block_index.data(),
-                    vindex.data(),
-                    dpsir_ylm_x.get_ptr_2D(), dpsir_ylm_y.get_ptr_2D(), dpsir_ylm_z.get_ptr_2D(),
-                    dpsix_DM.get_ptr_2D(), dpsiy_DM.get_ptr_2D(), dpsiz_DM.get_ptr_2D(),
-                    inout->rho[is]);
-            }
-        }
-    }
-}
-    ModuleBase::TITLE("Gint_interface", "cal_gint_tau");
-    ModuleBase::timer::tick("Gint_interface", "cal_gint_tau");
-}
diff --git a/source/source_lcao/module_gint/temp_gint/gint_rho_gpu.cpp b/source/source_lcao/module_gint/gint_rho_gpu.cpp
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/gint_rho_gpu.cpp
rename to source/source_lcao/module_gint/gint_rho_gpu.cpp
diff --git a/source/source_lcao/module_gint/gint_rho_gpu.cu b/source/source_lcao/module_gint/gint_rho_gpu.cu
deleted file mode 100644
index c5591e662e..0000000000
--- a/source/source_lcao/module_gint/gint_rho_gpu.cu
+++ /dev/null
@@ -1,234 +0,0 @@
-#include "kernels/cuda/cuda_tools.cuh"
-#include "source_base/ylm.h"
-#include "gint_rho_gpu.h"
-#include "gint_tools.h"
-#include "kernels/cuda/gint_rho.cuh"
-
-#ifdef _OPENMP
-#include <omp.h>
-#endif
-
-namespace GintKernel
-{
-
-void gint_rho_gpu(const hamilt::HContainer<double>* dm,
-                        const double* ylmcoef_now,
-                        const double dr,
-                        const double* rcut,
-                        const Grid_Technique& gridt,
-                        const UnitCell& ucell,
-                        double* rho)
-{
-    checkCuda(cudaSetDevice(gridt.dev_id));
-    // checkCuda(cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync));
-
-    const int nbzp = gridt.nbzp;
-    const int nczp =nbzp * gridt.bz;
-    const int num_mcell_on_proc = nczp * gridt.ncx * gridt.ncy;
-    const int lgd = gridt.lgd;
-    const int max_atom = gridt.max_atom;
-    const int num_streams = gridt.nstreams;
-    const int max_atom_per_bcell = max_atom * gridt.bxyz;
-    const int max_atom_per_z = max_atom * nbzp;
-    const int max_phi_per_z = max_atom_per_bcell * nbzp * ucell.nwmax;
-    const int max_atompair_per_z = max_atom * max_atom * nbzp;
-
-    std::vector<cudaStream_t> streams(num_streams);
-    std::vector<cudaEvent_t> events(num_streams);
-    for (int i = 0; i < num_streams; i++)
-    {
-        checkCuda(cudaStreamCreate(&streams[i]));
-        checkCuda(cudaEventCreateWithFlags(&events[i], cudaEventDisableTiming));
-    }
-
-    Cuda_Mem_Wrapper<double> dr_part(max_atom_per_z * 3, num_streams, true);
-    Cuda_Mem_Wrapper<uint8_t> atoms_type(max_atom_per_z, num_streams, true);
-    // The first number in every group of two represents the number of atoms on that bigcell.
-    // The second number represents the cumulative number of atoms up to that bigcell.
-    Cuda_Mem_Wrapper<int> atoms_num_info(2 * nbzp, num_streams, true);
-
-    Cuda_Mem_Wrapper<double> psi(max_phi_per_z, num_streams, false);
-    Cuda_Mem_Wrapper<double> psi_dm(max_phi_per_z, num_streams, false);
-
-    Cuda_Mem_Wrapper<double> gemm_alpha(max_atompair_per_z, num_streams, true);
-    Cuda_Mem_Wrapper<int> gemm_m(max_atompair_per_z, num_streams, true);
-    Cuda_Mem_Wrapper<int> gemm_n(max_atompair_per_z, num_streams, true);
-    Cuda_Mem_Wrapper<int> gemm_k(max_atompair_per_z, num_streams, true);
-    Cuda_Mem_Wrapper<int> gemm_lda(max_atompair_per_z, num_streams, true);
-    Cuda_Mem_Wrapper<int> gemm_ldb(max_atompair_per_z, num_streams, true);
-    Cuda_Mem_Wrapper<int> gemm_ldc(max_atompair_per_z, num_streams, true);
-    Cuda_Mem_Wrapper<double*> gemm_A(max_atompair_per_z, num_streams, true);
-    Cuda_Mem_Wrapper<double*> gemm_B(max_atompair_per_z, num_streams, true);
-    Cuda_Mem_Wrapper<double*> gemm_C(max_atompair_per_z, num_streams, true);
-    
-    Cuda_Mem_Wrapper<double> rho_g(num_mcell_on_proc, 1, false);
-    Cuda_Mem_Wrapper<double*> dot_product(nbzp * gridt.bxyz, num_streams, true);
-
-    Cuda_Mem_Wrapper<double> dm_matrix(dm->get_nnr(), 1, false);
-    // retrieve the density matrix on the host
-    checkCuda(cudaMemcpy(dm_matrix.get_device_pointer(),
-                         dm->get_wrapper(),
-                         dm->get_nnr() * sizeof(double),
-                         cudaMemcpyHostToDevice));
-
-// calculate the rho for every nbzp bigcells
-#ifdef _OPENMP
-const int max_thread_num = std::min(omp_get_max_threads(), num_streams);
-#endif
-#pragma omp parallel num_threads(max_thread_num)
-{
-#ifdef _OPENMP
-    const int tid = omp_get_thread_num();
-    const int num_threads = omp_get_num_threads();
-    const int sid_start = tid * num_streams / num_threads;
-    const int thread_num_streams = tid == num_threads - 1 ? num_streams - sid_start : num_streams / num_threads;
-#else
-    const int sid_start = 0;
-    const int thread_num_streams = num_streams;
-#endif
-#pragma omp for collapse(2) schedule(dynamic)
-    for (int i = 0; i < gridt.nbx; i++)
-    {
-        for (int j = 0; j < gridt.nby; j++)
-        {
-            // 20240620 Note that it must be set again here because 
-            // cuda's device is not safe in a multi-threaded environment.
-
-            checkCuda(cudaSetDevice(gridt.dev_id));
-
-            const int sid = (i * gridt.nby + j) % thread_num_streams + sid_start;
-            checkCuda(cudaEventSynchronize(events[sid]));
-
-            int max_m = 0;
-            int max_n = 0;
-            int atom_pair_num = 0;
-            int atoms_per_z = 0;
-            const int grid_index_ij = i * gridt.nby * nbzp + j * nbzp;
-
-            // generate GPU tasks, including the calculation of psir, matrix
-            // multiplication, and dot product
-            gtask_rho(gridt,
-                      grid_index_ij,
-                      ucell,
-                      dr_part.get_host_pointer(sid),
-                      atoms_type.get_host_pointer(sid),
-                      atoms_num_info.get_host_pointer(sid),
-                      atoms_per_z);
-
-            alloc_mult_dot_rho(
-                dm,
-                gridt,
-                ucell,
-                grid_index_ij,
-                max_atom,
-                lgd,
-                nczp,
-                atoms_num_info.get_host_pointer(sid),
-                psi.get_device_pointer(sid),
-                psi_dm.get_device_pointer(sid),
-                dm_matrix.get_device_pointer(),
-                gemm_alpha.get_host_pointer(sid),
-                gemm_m.get_host_pointer(sid),
-                gemm_n.get_host_pointer(sid),
-                gemm_k.get_host_pointer(sid),
-                gemm_lda.get_host_pointer(sid),
-                gemm_ldb.get_host_pointer(sid),
-                gemm_ldc.get_host_pointer(sid),
-                gemm_A.get_host_pointer(sid),
-                gemm_B.get_host_pointer(sid),
-                gemm_C.get_host_pointer(sid),
-                max_m,
-                max_n,
-                atom_pair_num,
-                rho_g.get_device_pointer(),
-                dot_product.get_host_pointer(sid));
-            
-            dr_part.copy_host_to_device_async(streams[sid], sid, atoms_per_z * 3);
-            atoms_type.copy_host_to_device_async(streams[sid], sid, atoms_per_z);
-            atoms_num_info.copy_host_to_device_async(streams[sid], sid);
-
-            gemm_alpha.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
-            gemm_m.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
-            gemm_n.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
-            gemm_k.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
-            gemm_lda.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
-            gemm_ldb.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
-            gemm_ldc.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
-            gemm_A.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
-            gemm_B.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
-            gemm_C.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
-            dot_product.copy_host_to_device_async(streams[sid], sid);
-            checkCuda(cudaEventRecord(events[sid], streams[sid]));
-            
-            psi.memset_device_async(streams[sid], sid, 0);
-            psi_dm.memset_device_async(streams[sid], sid, 0);
-
-            // Launching kernel to calculate psi
-            dim3 grid_psi(nbzp, gridt.bxyz);
-            dim3 block_psi(64);
-            get_psi<<<grid_psi, block_psi, 0, streams[sid]>>>(
-                gridt.ylmcoef_g,
-                dr,
-                gridt.bxyz,
-                ucell.nwmax,
-                max_atom,
-                gridt.atom_nwl_g,
-                gridt.atom_new_g,
-                gridt.atom_ylm_g,
-                gridt.atom_nw_g,
-                gridt.rcut_g,
-                gridt.nr_max,
-                gridt.psi_u_g,
-                gridt.mcell_pos_g,
-                dr_part.get_device_pointer(sid),
-                atoms_type.get_device_pointer(sid),
-                atoms_num_info.get_device_pointer(sid),
-                psi.get_device_pointer(sid));
-            checkCudaLastError();
-
-            // Performing matrix multiplication alpha * mat_dm * mat_psir
-            gridt.fastest_matrix_mul(max_m,
-                                     max_n,
-                                     gemm_m.get_device_pointer(sid),
-                                     gemm_n.get_device_pointer(sid),
-                                     gemm_k.get_device_pointer(sid),
-                                     gemm_A.get_device_pointer(sid),
-                                     gemm_lda.get_device_pointer(sid),
-                                     gemm_B.get_device_pointer(sid),
-                                     gemm_ldb.get_device_pointer(sid),
-                                     gemm_C.get_device_pointer(sid),
-                                     gemm_ldc.get_device_pointer(sid),
-                                     atom_pair_num,
-                                     streams[sid],
-                                     gemm_alpha.get_device_pointer(sid));
-            checkCudaLastError();
-
-            // Launching kernel to calculate dot product psir * psir_dm
-            // if warpSize is not eauql to 32, the psir_dot kernel should be modified
-            dim3 grid_dot(nbzp, gridt.bxyz);
-            dim3 block_dot(64); 
-            psir_dot<<<grid_dot, block_dot, sizeof(double) * 32, streams[sid]>>>(
-                gridt.bxyz,
-                ucell.nwmax,
-                atoms_num_info.get_device_pointer(sid),
-                psi.get_device_pointer(sid),
-                psi_dm.get_device_pointer(sid),
-                dot_product.get_device_pointer(sid));
-            checkCudaLastError();
-        }
-    }
-}
-
-    // Copy rho from device to host
-    checkCuda(cudaMemcpy(rho,
-                         rho_g.get_device_pointer(),
-                         num_mcell_on_proc * sizeof(double),
-                         cudaMemcpyDeviceToHost));
-
-    for (int i = 0; i < num_streams; i++)
-    {
-        checkCuda(cudaStreamDestroy(streams[i]));
-        checkCuda(cudaEventDestroy(events[i]));
-    }
-}
-} // namespace GintKernel
diff --git a/source/source_lcao/module_gint/gint_rho_gpu.h b/source/source_lcao/module_gint/gint_rho_gpu.h
index 7dba352a84..d8a8fe6e01 100644
--- a/source/source_lcao/module_gint/gint_rho_gpu.h
+++ b/source/source_lcao/module_gint/gint_rho_gpu.h
@@ -1,68 +1,52 @@
-#ifndef GINT_RHO_H
-#define GINT_RHO_H
-#include <cublas_v2.h>
-#include <cuda.h> // for CUDA_VERSION
-#include <cuda_runtime.h>
+#pragma once
 
-#include "source_lcao/module_gint/gint.h"
-#include "source_lcao/module_gint/grid_technique.h"
+#include <memory>
+#include <vector>
+#include "source_lcao/module_hcontainer/hcontainer.h"
+#include "gint.h"
+#include "gint_info.h"
+#include "source_lcao/module_gint/kernel/cuda_mem_wrapper.h"
 
-namespace GintKernel
+namespace ModuleGint
 {
 
-/**
- * calculate the rho by GPU
- *
- * @param dm density matrix.
- * @param ylmcoef_now coefficients for the spherical harmonics expansion.
- * @param dr The grid spacing.
- * @param rcut Pointer to the cutoff radius array.
- * @param gridt Grid_Technique object containing grid information.
- * @param ucell UnitCell.
- * @param rho rho.
- */
-void gint_rho_gpu(const hamilt::HContainer<double>* dm,
-                        const double* ylmcoef_now,
-                        const double dr,
-                        const double* rcut,
-                        const Grid_Technique& gridt,
-                        const UnitCell& ucell,
-                        double* rho);
+class Gint_rho_gpu: public Gint
+{
+    public:
+    Gint_rho_gpu(  // builds the rho task; only stores its arguments, no work is done here
+        const std::vector<HContainer<double>*>& dm_vec,  // input DMR pointers; the vector is copied into dm_vec_
+        const int nspin,                                 // stored in nspin_
+        double **rho,                                    // output destination, stored in rho_
+        bool is_dm_symm = true)                          // true: DMR is symmetric (faster computation)
+        : dm_vec_(dm_vec), nspin_(nspin), is_dm_symm_(is_dm_symm), rho_(rho) {}  // listed in member declaration order (avoids -Wreorder)
+    
+    void cal_gint();
+
+    private:
+    void init_dm_gint_();
+
+    void cal_rho_();
+
+    void transfer_cpu_to_gpu_();
+
+    void transfer_gpu_to_cpu_();
+
+    // input
+    const std::vector<HContainer<double>*> dm_vec_;
+    const int nspin_;
+
+    // if true, it means the DMR matrix is symmetric,
+    // which leads to faster computations compared to the asymmetric case.
+    const bool is_dm_symm_;
+
+    // output
+    double **rho_;
 
-void gtask_rho(const Grid_Technique& gridt,
-               const int grid_index_ij,
-               const UnitCell& ucell,
-               double* dr_part,
-               uint8_t* atoms_type,
-               int* atoms_num_info,
-               int& atoms_per_z);
+    // Intermediate variables
+    std::vector<HContainer<double>> dm_gint_vec_;
 
-void alloc_mult_dot_rho(const hamilt::HContainer<double>* dm,
-                        const Grid_Technique& gridt,
-                        const UnitCell& ucell,
-                        const int grid_index_ij,
-                        const int max_atom,
-                        const int lgd,
-                        const int nczp,
-                        const int* atoms_num_info,
-                        double* const psir_ylm_g,
-                        double* const psir_dm_g,
-                        double* const dm_matrix_g,
-                        double* mat_alpha,
-                        int* mat_m,
-                        int* mat_n,
-                        int* mat_k,
-                        int* mat_lda,
-                        int* mat_ldb,
-                        int* mat_ldc,
-                        double** mat_A,
-                        double** mat_B,
-                        double** mat_C,
-                        int& max_m,
-                        int& max_n,
-                        int& atom_pair_num,
-                        double* rho_g,
-                        double** dot_product);
+    std::vector<CudaMemWrapper<double>> dm_gint_d_vec_;
+    std::vector<CudaMemWrapper<double>> rho_d_vec_;
+};
 
-} // namespace GintKernel
-#endif
\ No newline at end of file
+}
\ No newline at end of file
diff --git a/source/source_lcao/module_gint/gint_rho_old.cpp b/source/source_lcao/module_gint/gint_rho_old.cpp
deleted file mode 100644
index b3027d6b12..0000000000
--- a/source/source_lcao/module_gint/gint_rho_old.cpp
+++ /dev/null
@@ -1,27 +0,0 @@
-#include "gint_k.h"
-#include "gint_tools.h"
-#include "grid_technique.h"
-#include "source_base/global_function.h"
-#include "source_base/global_variable.h"
-#include "source_base/timer.h"
-#include "source_base/array_pool.h"
-#include "source_base/ylm.h"
-#include "source_basis/module_ao/ORB_read.h"
-#include "source_pw/module_pwdft/global.h"
-
-void Gint::cal_meshball_rho(const int na_grid,
-                            const int*const block_index,
-                            const int*const vindex,
-                            const double*const*const psir_ylm,
-                            const double*const*const psir_DMR,
-                            double*const rho)
-{
-    const int inc = 1;
-    // sum over mu to get density on grid
-    for (int ib = 0; ib < this->bxyz; ++ib)
-    {
-        const double r = ddot_(&block_index[na_grid], psir_ylm[ib], &inc, psir_DMR[ib], &inc);
-        const int grid = vindex[ib];
-        rho[grid] += r;
-    }
-}
diff --git a/source/source_lcao/module_gint/temp_gint/gint_tau.cpp b/source/source_lcao/module_gint/gint_tau.cpp
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/gint_tau.cpp
rename to source/source_lcao/module_gint/gint_tau.cpp
diff --git a/source/source_lcao/module_gint/temp_gint/gint_tau.h b/source/source_lcao/module_gint/gint_tau.h
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/gint_tau.h
rename to source/source_lcao/module_gint/gint_tau.h
diff --git a/source/source_lcao/module_gint/temp_gint/gint_tau_gpu.cpp b/source/source_lcao/module_gint/gint_tau_gpu.cpp
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/gint_tau_gpu.cpp
rename to source/source_lcao/module_gint/gint_tau_gpu.cpp
diff --git a/source/source_lcao/module_gint/temp_gint/gint_tau_gpu.h b/source/source_lcao/module_gint/gint_tau_gpu.h
similarity index 92%
rename from source/source_lcao/module_gint/temp_gint/gint_tau_gpu.h
rename to source/source_lcao/module_gint/gint_tau_gpu.h
index da19c98828..638892ff13 100644
--- a/source/source_lcao/module_gint/temp_gint/gint_tau_gpu.h
+++ b/source/source_lcao/module_gint/gint_tau_gpu.h
@@ -5,7 +5,7 @@
 #include "source_lcao/module_hcontainer/hcontainer.h"
 #include "gint.h"
 #include "gint_info.h"
-#include "source_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h"
+#include "source_lcao/module_gint/kernel/cuda_mem_wrapper.h"
 
 namespace ModuleGint
 {
diff --git a/source/source_lcao/module_gint/gint_tau_old.cpp b/source/source_lcao/module_gint/gint_tau_old.cpp
deleted file mode 100644
index adf20d45b5..0000000000
--- a/source/source_lcao/module_gint/gint_tau_old.cpp
+++ /dev/null
@@ -1,38 +0,0 @@
-#include "source_base/global_function.h"
-#include "source_base/global_variable.h"
-#include "gint_k.h"
-#include "source_basis/module_ao/ORB_read.h"
-#include "grid_technique.h"
-#include "source_base/ylm.h"
-#include "source_pw/module_pwdft/global.h"
-#include "source_base/module_external/blas_connector.h"
-#include "source_base/timer.h"
-#include "source_base/array_pool.h"
-#include "gint_tools.h"
-#include "source_base/memory.h"
-#include "source_lcao/module_gint/grid_technique.h"
-
-
-void Gint::cal_meshball_tau(
-	const int na_grid,
-	int* block_index,
-	int* vindex,
-	double** dpsix,
-	double** dpsiy,
-	double** dpsiz,
-	double** dpsix_dm,
-	double** dpsiy_dm,
-	double** dpsiz_dm,
-	double* rho)
-{		
-	const int inc = 1;
-	// sum over mu to get density on grid
-	for(int ib=0; ib<this->bxyz; ++ib)
-	{
-		double rx=ddot_(&block_index[na_grid], dpsix[ib], &inc, dpsix_dm[ib], &inc);
-		double ry=ddot_(&block_index[na_grid], dpsiy[ib], &inc, dpsiy_dm[ib], &inc);
-		double rz=ddot_(&block_index[na_grid], dpsiz[ib], &inc, dpsiz_dm[ib], &inc);
-		const int grid = vindex[ib];
-		rho[ grid ] += rx + ry + rz;
-	}
-}
\ No newline at end of file
diff --git a/source/source_lcao/module_gint/gint_tools.cpp b/source/source_lcao/module_gint/gint_tools.cpp
deleted file mode 100644
index d60db04a1a..0000000000
--- a/source/source_lcao/module_gint/gint_tools.cpp
+++ /dev/null
@@ -1,234 +0,0 @@
-//=========================================================
-//REFACTOR : Peize Lin, 2021.06.28
-//=========================================================
-#include "gint_tools.h"
-
-#include <cmath>
-#include <utility> // for std::pair
-
-#include "source_base/timer.h"
-#include "source_base/ylm.h"
-#include "source_base/array_pool.h"
-#include "source_basis/module_ao/ORB_read.h"
-#include "source_pw/module_pwdft/global.h"
-
-namespace Gint_Tools{
-void get_vindex(const int bxyz, const int bx, const int by, const int bz, 
-				const int nplane, const int start_ind,
-				const int ncyz,int* vindex)
-{
-    int bindex = 0;
-
-		for(int ii=0; ii<bx; ii++)
-		{
-			const int ipart = ii*ncyz;
-			for(int jj=0; jj<by; jj++)
-			{
-				const int jpart = jj*nplane + ipart;
-				for(int kk=0; kk<bz; kk++)
-				{
-					vindex[bindex] = start_ind + kk + jpart;
-					++bindex;
-				}
-			}
-		}
-
-}
-
-	// here vindex refers to local potentials
-
-	// extract the local potentials.
-	void get_gint_vldr3(
-		double* vldr3,
-        const double* const vlocal,		// vlocal[ir]
-        const int bxyz,
-        const int bx,
-        const int by,
-        const int bz,
-        const int nplane,
-        const int start_ind,
-		const int ncyz,
-		const double dv)
-	{
-		// set the index for obtaining local potentials
-		std::vector<int> vindex(bxyz,0);
-		Gint_Tools::get_vindex(bxyz, bx, by, bz, nplane, start_ind, ncyz,vindex.data());
-		for(int ib=0; ib<bxyz; ib++)
-		{
-			vldr3[ib]=vlocal[vindex[ib]] * dv;
-		}
-	}
-
-	void get_block_info(const Grid_Technique& gt, const int bxyz, const int na_grid, const int grid_index, int* block_iw,
-						int* block_index, int* block_size, bool** cal_flag)
-	{
-		const UnitCell& ucell = *gt.ucell;
-		block_index[0] = 0;
-		for (int id = 0; id < na_grid; id++)
-		{
-			const int mcell_index = gt.bcell_start[grid_index] + id;
-			const int iat = gt.which_atom[mcell_index];    // index of atom
-			const int it = ucell.iat2it[iat];              // index of atom type
-			const int ia = ucell.iat2ia[iat];              // index of atoms within each type
-			const int start = ucell.itiaiw2iwt(it, ia, 0); // the index of the first wave function for atom (it,ia)
-			block_iw[id] = gt.trace_lo[start];
-			block_index[id + 1] = block_index[id] + ucell.atoms[it].nw;
-			block_size[id] = ucell.atoms[it].nw;
-
-			const int imcell=gt.which_bigcell[mcell_index];
-			const double mt[3] = {
-				gt.meshball_positions[imcell][0] - gt.tau_in_bigcell[iat][0],
-				gt.meshball_positions[imcell][1] - gt.tau_in_bigcell[iat][1],
-				gt.meshball_positions[imcell][2] - gt.tau_in_bigcell[iat][2]};
-
-			for(int ib=0; ib<bxyz; ib++)
-			{
-				// meshcell_pos: z is the fastest
-				const double dr[3] = {
-					gt.meshcell_pos[ib][0] + mt[0],
-					gt.meshcell_pos[ib][1] + mt[1],
-					gt.meshcell_pos[ib][2] + mt[2]};
-				const double distance = std::sqrt(dr[0]*dr[0] + dr[1]*dr[1] + dr[2]*dr[2]);	// distance between atom and grid
-
-			if (distance > gt.rcuts[it] - 1.0e-10) {
-				cal_flag[ib][id] = false;
-			} else {
-				cal_flag[ib][id] = true;
-			}
-			} // end ib
-		}
-	}
-
-
-void cal_dpsirr_ylm(
-    const Grid_Technique& gt, const int bxyz,
-    const int na_grid,                 // number of atoms on this grid
-    const int grid_index,              // 1d index of FFT index (i,j,k)
-    const int* const block_index,      // block_index[na_grid+1], count total number of atomis orbitals
-    const int* const block_size,       // block_size[na_grid],	number of columns of a band
-    const bool* const* const cal_flag, // cal_flag[bxyz][na_grid],	whether the atom-grid distance is larger than cutoff
-    double* const* const dpsir_ylm_x, double* const* const dpsir_ylm_y, double* const* const dpsir_ylm_z,
-    double* const* const dpsirr_ylm)
-{
-    ModuleBase::timer::tick("Gint_Tools", "cal_dpsirr_ylm");
-    const UnitCell& ucell = *gt.ucell;
-    for (int id = 0; id < na_grid; id++)
-    {
-        const int mcell_index = gt.bcell_start[grid_index] + id;
-        const int imcell = gt.which_bigcell[mcell_index];
-        int iat = gt.which_atom[mcell_index];
-        const int it = ucell.iat2it[iat];
-        Atom* atom = &ucell.atoms[it];
-
-			const double mt[3]={
-				gt.meshball_positions[imcell][0] - gt.tau_in_bigcell[iat][0],
-				gt.meshball_positions[imcell][1] - gt.tau_in_bigcell[iat][1],
-				gt.meshball_positions[imcell][2] - gt.tau_in_bigcell[iat][2]};
-
-			for(int ib=0; ib<bxyz; ib++)
-			{
-				double*const p_dpsi_x=&dpsir_ylm_x[ib][block_index[id]];
-				double*const p_dpsi_y=&dpsir_ylm_y[ib][block_index[id]];
-				double*const p_dpsi_z=&dpsir_ylm_z[ib][block_index[id]];
-				double*const p_dpsirr=&dpsirr_ylm[ib][block_index[id] * 6];
-				if(!cal_flag[ib][id])
-				{
-					ModuleBase::GlobalFunc::ZEROS(p_dpsirr, block_size[id] * 6);
-				}
-				else
-				{
-					const double dr[3]={						// vectors between atom and grid
-						gt.meshcell_pos[ib][0] + mt[0],
-						gt.meshcell_pos[ib][1] + mt[1],
-						gt.meshcell_pos[ib][2] + mt[2]};
-
-					for (int iw=0; iw< atom->nw; ++iw)
-					{
-						p_dpsirr[iw * 6] = p_dpsi_x[iw]*dr[0];
-						p_dpsirr[iw * 6 + 1] = p_dpsi_x[iw]*dr[1];
-						p_dpsirr[iw * 6 + 2] = p_dpsi_x[iw]*dr[2];
-						p_dpsirr[iw * 6 + 3] = p_dpsi_y[iw]*dr[1];
-						p_dpsirr[iw * 6 + 4] = p_dpsi_y[iw]*dr[2];
-						p_dpsirr[iw * 6 + 5] = p_dpsi_z[iw]*dr[2];
-					}//iw
-				}//else
-			}
-		}
-		ModuleBase::timer::tick("Gint_Tools", "cal_dpsirr_ylm");
-		return;
-	}
-
-	// atomic basis sets
-	// psir_vlbr3[bxyz][LD_pool]
-    ModuleBase::Array_Pool<double> get_psir_vlbr3(
-        const int bxyz,
-        const int na_grid,  					    // how many atoms on this (i,j,k) grid
-		const int LD_pool,
-		const int*const block_index,		    	// block_index[na_grid+1], count total number of atomis orbitals
-		const bool*const*const cal_flag,	    	// cal_flag[bxyz][na_grid],	whether the atom-grid distance is larger than cutoff
-		const double*const vldr3,			    	// vldr3[bxyz]
-		const double*const*const psir_ylm)		    // psir_ylm[bxyz][LD_pool]
-	{
-		ModuleBase::Array_Pool<double> psir_vlbr3(bxyz, LD_pool);
-		for(int ib=0; ib<bxyz; ++ib)
-		{
-			for(int ia=0; ia<na_grid; ++ia)
-			{
-				if(cal_flag[ib][ia])
-				{
-					for(int i=block_index[ia]; i<block_index[ia+1]; ++i)
-					{
-						psir_vlbr3[ib][i]=psir_ylm[ib][i]*vldr3[ib];
-					}
-				}
-				else
-				{
-					for(int i=block_index[ia]; i<block_index[ia+1]; ++i)
-					{
-						psir_vlbr3[ib][i]=0;
-					}
-				}
-
-			}
-		}
-		return psir_vlbr3;
-	}
-
-std::pair<int, int> cal_info(const int bxyz, 
-			                 const int ia1,
-			                 const int ia2,
-			                 const bool* const* const cal_flag)
-{
-	int ib_start = bxyz;
-	int ib_end = 0;
-	int ib_length = 0;
-	for(int ib=0; ib<bxyz; ++ib)
-	{
-		if(cal_flag[ib][ia1] && cal_flag[ib][ia2])
-		{
-		    ib_start = ib;
-			break;
-		}
-	}
-
-	if(ib_start == bxyz)
-	{
-		return std::make_pair(bxyz, 0);
-	}
-	else
-	{
-		for(int ib=bxyz-1; ib>=0; --ib)
-		{
-			if(cal_flag[ib][ia1] && cal_flag[ib][ia2])
-			{
-				ib_end = ib;
-				break;
-			}
-		}
-	}
-
-	ib_length = ib_end - ib_start + 1;
-	return std::make_pair(ib_start, ib_length);
-}
-
-} // namespace Gint_Tools
diff --git a/source/source_lcao/module_gint/gint_tools.h b/source/source_lcao/module_gint/gint_tools.h
deleted file mode 100644
index a7f0e1b0d0..0000000000
--- a/source/source_lcao/module_gint/gint_tools.h
+++ /dev/null
@@ -1,311 +0,0 @@
-//=========================================================
-// REFACTOR : Peize Lin, 2021.06.28
-//=========================================================
-#ifndef GINT_TOOLS_H
-#define GINT_TOOLS_H
-#include "grid_technique.h"
-#include "source_estate/module_charge/charge.h"
-#include "source_lcao/module_hcontainer/hcontainer.h"
-#include "source_base/array_pool.h"
-
-#include <cstdlib>
-#include <utility> // for std::pair
-
-namespace Gint_Tools
-{
-enum class job_type
-{
-    vlocal,
-    rho,
-    force,
-    tau,
-    vlocal_meta,
-    force_meta,
-    dvlocal
-};
-// Hamiltonian, electron density, force, kinetic energy density, Hamiltonian for mGGA
-} // namespace Gint_Tools
-
-// the class is used to pass input/output variables
-// into the unified interface gint
-// not sure if this is the best practice though ..
-class Gint_inout
-{
-  public:
-    // input
-    double*** DM=nullptr;
-    const double* vl=nullptr;
-    const double* vofk=nullptr;
-    bool isforce=false;
-    bool isstress=false;
-    int ispin=0;
-    int nspin_rho=0;  // usually, but not always, equal to global nspin
-    bool if_symm = false; // if true, use dsymv in gint_kernel_rho; if false, use dgemv.
-
-    // output
-    double** rho=nullptr;
-    ModuleBase::matrix* fvl_dphi=nullptr;
-    ModuleBase::matrix* svl_dphi=nullptr;
-    Gint_Tools::job_type job;
-
-    // electron density and kin_r, multi-k
-    Gint_inout(double** rho_in, Gint_Tools::job_type job_in, const int& nspin_rho_in, bool if_symm_in = true)
-    {
-        rho = rho_in;
-        job = job_in;
-        nspin_rho = nspin_rho_in;
-        if_symm = if_symm_in;
-    }
-
-    // force
-    Gint_inout(const int ispin_in,
-               const double* vl_in,
-               bool isforce_in,
-               bool isstress_in,
-               ModuleBase::matrix* fvl_dphi_in,
-               ModuleBase::matrix* svl_dphi_in,
-               Gint_Tools::job_type job_in)
-    {
-        vl = vl_in;
-        isforce = isforce_in;
-        isstress = isstress_in;
-        fvl_dphi = fvl_dphi_in;
-        svl_dphi = svl_dphi_in;
-        job = job_in;
-        ispin = ispin_in;
-    }
-
-    // force (mGGA)
-    Gint_inout(const int ispin_in,
-               const double* vl_in,
-               const double* vofk_in,
-               const bool isforce_in,
-               const bool isstress_in,
-               ModuleBase::matrix* fvl_dphi_in,
-               ModuleBase::matrix* svl_dphi_in,
-               Gint_Tools::job_type job_in)
-    {
-        vl = vl_in;
-        vofk = vofk_in;
-        isforce = isforce_in;
-        isstress = isstress_in;
-        fvl_dphi = fvl_dphi_in;
-        svl_dphi = svl_dphi_in;
-        job = job_in;
-        ispin = ispin_in;
-    }
-
-    // vlocal, multi-k
-    Gint_inout(const double* vl_in, int ispin_in, Gint_Tools::job_type job_in)
-    {
-        vl = vl_in;
-        ispin = ispin_in;
-        job = job_in;
-    }
-
-    // mGGA vlocal, multi-k
-    Gint_inout(const double* vl_in, const double* vofk_in, int ispin_in, Gint_Tools::job_type job_in)
-    {
-        vl = vl_in;
-        vofk = vofk_in;
-        ispin = ispin_in;
-        job = job_in;
-    }
-
-    // vlocal, gamma point
-    Gint_inout(const double* vl_in, Gint_Tools::job_type job_in)
-    {
-        vl = vl_in;
-        job = job_in;
-    }
-
-    // mGGA vlocal, gamma point
-    Gint_inout(const double* vl_in, const double* vofk_in, Gint_Tools::job_type job_in)
-    {
-        vl = vl_in;
-        vofk = vofk_in;
-        job = job_in;
-    }
-};
-
-namespace Gint_Tools
-{
-// if exponent is an integer between 0 and 5 (the most common cases in gint),
-// pow_int is much faster than std::pow
-inline double pow_int(const double base, const int exp)
-{
-    switch (exp)
-    {
-    case 0:
-        return 1.0;
-    case 1:
-        return base;
-    case 2:
-        return base * base;
-    case 3:
-        return base * base * base;
-    case 4:
-        return base * base * base * base;
-    case 5:
-        return base * base * base * base * base;
-    default:
-        double result = std::pow(base, exp);
-        return result;
-    }
-}
-// vindex[pw.bxyz]
-
-/**
- * @brief Get the vindex form the grid index
- * @param bxyz number of big grids
- * @param bx number of big grids in x direction
- * @param by number of big grids in y direction
- * @param bz number of big grids in z direction
- * @param nplane Currently using Z-axis 1D division, 
- * recording the number of the Z-axis process
- * (nbz in the current process).
- * @param start_ind start index of the grid in the 1D FFT grid
- * @param ncyz number of grids in yz plane
- * @param vindex the index of the grid 
-*/
-void get_vindex(const int bxyz, const int bx, const int by,
-                    const int bz, const int nplane, 
-                    const int start_ind,const int ncyz,int* vindex);
-
-/**
- * @brief Get the vldr3 form the grid index
- * @param vldr3 the local potential multiplied by the grid volume
- * @param vlocal the local potential
- * @param bxyz number of grids
- * @param bx number of grids in x direction
- * @param by number of grids in y direction
- * @param bz number of grids in z direction
- * @param nplane Currently using Z-axis 1D division, 
- * recording the number of the Z-axis process
- * (nbz in the current process).
- * @param start_ind start index of the grid in the 1D FFT grid
- * @param ncyz number of grids in yz plane
- * @param dv the volume of the grid
-*/
-void get_gint_vldr3(double* vldr3,
-                    const double* const vlocal,
-                    const int bxyz,
-                    const int bx,
-                    const int by,
-                    const int bz,
-                    const int nplane,
-                    const int start_ind,
-                    const int ncyz,
-                    const double dv);
-
-/**
- * @brief Get the information of a big grid index
- * @param gt the grid technique, which contains the tools of the grid intergration
- * @param bxyz number of grids
- * @param na_grid number of atoms on this grid
- * @param grid_index 1d index of FFT index (i,j,k)
- * @param block_iw track the atom orbitals in all atoms
- * @param block_index count total number of atomis orbitals
- * @param block_size count the number of atomis orbitals in each atom
- * @param cal_flag whether the atom-grid distance is larger than cutoff
-*/                    
-void get_block_info(const Grid_Technique& gt, const int bxyz, const int na_grid, const int grid_index,
-                    int* block_iw, int* block_index, int* block_size, bool** cal_flag);
-
-void init_orb(double& dr_uniform,
-              std::vector<double>& rcuts,
-              UnitCell& ucell,
-              const LCAO_Orbitals& orb,
-              std::vector<std::vector<double>>& psi_u,
-              std::vector<std::vector<double>>& dpsi_u,
-              std::vector<std::vector<double>>& d2psi_u);
-
-// psir_ylm[pw.bxyz][LD_pool]
-void cal_psir_ylm(const Grid_Technique& gt,
-                  const int bxyz,
-                  const int na_grid,            // number of atoms on this grid
-                  const int grid_index,         // 1d index of FFT index (i,j,k)
-                  const double delta_r,         // delta_r of the uniform FFT grid
-                  const int* const block_index, // count total number of atomis orbitals
-                  const int* const block_size,
-                  const bool* const* const cal_flag,
-                  double* const* const psir_ylm); // whether the atom-grid distance is larger than cutoff
-
-// psir_ylm and dpsir_ylm, both[pw.bxyz][LD_pool]
-void cal_dpsir_ylm(
-    const Grid_Technique& gt,
-    const int bxyz,
-    const int na_grid,                 // number of atoms on this grid
-    const int grid_index,              // 1d index of FFT index (i,j,k)
-    const double delta_r,              // delta_r of the uniform FFT grid
-    const int* const block_index,      // block_index[na_grid+1], count total number of atomis orbitals
-    const int* const block_size,       // block_size[na_grid],	number of columns of a band
-    const bool* const* const cal_flag, // cal_flag[bxyz][na_grid],	whether the atom-grid distance is larger than cutoff
-    double* const* const psir_ylm,
-    double* const* const dpsir_ylm_x,
-    double* const* const dpsir_ylm_y,
-    double* const* const dpsir_ylm_z);
-
-// dpsir_ylm * (r-R), R is the atomic position
-void cal_dpsirr_ylm(
-    const Grid_Technique& gt, const int bxyz,
-    const int na_grid,                 // number of atoms on this grid
-    const int grid_index,              // 1d index of FFT index (i,j,k)
-    const int* const block_index,      // block_index[na_grid+1], count total number of atomis orbitals
-    const int* const block_size,       // block_size[na_grid],	number of columns of a band
-    const bool* const* const cal_flag, // cal_flag[bxyz][na_grid],	whether the atom-grid distance is larger than cutoff
-    double* const* const dpsir_ylm_x, double* const* const dpsir_ylm_y, double* const* const dpsir_ylm_z,
-    double* const* const dpsir_ylm);
-
-void cal_ddpsir_ylm(
-    const Grid_Technique& gt,
-    const int bxyz,
-    const int na_grid,                 // number of atoms on this grid
-    const int grid_index,              // 1d index of FFT index (i,j,k)
-    const double delta_r,              // delta_r of the uniform FFT grid
-    const int* const block_index,      // block_index[na_grid+1], count total number of atomis orbitals
-    const int* const block_size,       // block_size[na_grid],	number of columns of a band
-    const bool* const* const cal_flag, // cal_flag[bxyz][na_grid],	whether the atom-grid distance is larger than cutoff
-    double* const* const ddpsir_ylm_xx,
-    double* const* const ddpsir_ylm_xy,
-    double* const* const ddpsir_ylm_xz,
-    double* const* const ddpsir_ylm_yy,
-    double* const* const ddpsir_ylm_yz,
-    double* const* const ddpsir_ylm_zz);
-
-// psir_ylm * vldr3
-ModuleBase::Array_Pool<double> get_psir_vlbr3(
-    const int bxyz,
-    const int na_grid, // how many atoms on this (i,j,k) grid
-    const int LD_pool,
-    const int* const block_index,      // block_index[na_grid+1], count total number of atomis orbitals
-    const bool* const* const cal_flag, // cal_flag[bxyz][na_grid],	whether the atom-grid distance is larger than cutoff
-    const double* const vldr3,         // vldr3[bxyz]
-    const double* const* const psir_ylm); // psir_ylm[bxyz][LD_pool]
-
-// sum_nu,R rho_mu,nu(R) psi_nu, for multi-k and gamma point
-void mult_psi_DMR(
-    const Grid_Technique& gt,
-    const int bxyz,
-    const int LD_pool,
-    const int &grid_index,
-    const int &na_grid,
-    const int*const block_index,
-    const int*const block_size,
-    const bool*const*const cal_flag,
-    const double*const*const psi,
-    double*const*const psi_DMR,
-    const hamilt::HContainer<double>*const DM,
-    const bool if_symm);
-
-
-// pair.first is the first index of the meshcell which is inside atoms ia1 and ia2.
-// pair.second is the number of meshcells which should be calculated in the following gemm.
-// If no meshcell is inside both ia1 and ia2, return [bxyz, 0].
-std::pair<int, int> cal_info(const int bxyz, 
-			                 const int ia1,
-			                 const int ia2,
-			                 const bool* const* const cal_flag);
-            
-} // namespace Gint_Tools
-#endif
diff --git a/source/source_lcao/module_gint/temp_gint/gint_type.h b/source/source_lcao/module_gint/gint_type.h
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/gint_type.h
rename to source/source_lcao/module_gint/gint_type.h
diff --git a/source/source_lcao/module_gint/temp_gint/gint_vl.cpp b/source/source_lcao/module_gint/gint_vl.cpp
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/gint_vl.cpp
rename to source/source_lcao/module_gint/gint_vl.cpp
diff --git a/source/source_lcao/module_gint/temp_gint/gint_vl.h b/source/source_lcao/module_gint/gint_vl.h
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/gint_vl.h
rename to source/source_lcao/module_gint/gint_vl.h
diff --git a/source/source_lcao/module_gint/gint_vl_cpu_interface.cpp b/source/source_lcao/module_gint/gint_vl_cpu_interface.cpp
deleted file mode 100644
index f913fab83e..0000000000
--- a/source/source_lcao/module_gint/gint_vl_cpu_interface.cpp
+++ /dev/null
@@ -1,265 +0,0 @@
-#include "gint.h"
-#include "source_base/memory.h"
-#include "source_io/module_parameter/parameter.h"
-#include "source_base/timer.h"
-
-void Gint::gint_kernel_vlocal(Gint_inout* inout) {
-    ModuleBase::TITLE("Gint_interface", "cal_gint_vlocal");
-    ModuleBase::timer::tick("Gint_interface", "cal_gint_vlocal");
-    const UnitCell& ucell = *this->ucell;
-    const int max_size = this->gridt->max_atom;
-    const int lgd = this->gridt->lgd;
-    const int ncyz = this->ny * this->nplane;
-    const double dv = ucell.omega / this->ncxyz;
-    const double delta_r = this->gridt->dr_uniform;
-    hamilt::HContainer<double>* hRGint_kernel = PARAM.inp.nspin != 4 ? this->hRGint : this->hr_gint_tmp[inout->ispin];
-    hRGint_kernel->set_zero();
-
-#pragma omp parallel 
-    {   /**
-        * @brief When in OpenMP, it points to a newly allocated memory,
-        */
-        std::vector<int> block_iw(max_size,0);
-        std::vector<int> block_index(max_size+1,0);
-        std::vector<int> block_size(max_size,0);
-        std::vector<double> vldr3(this->bxyz,0.0);
-        #pragma omp for schedule(dynamic)
-        for (int grid_index = 0; grid_index < this->nbxx; grid_index++) {
-            const int na_grid = this->gridt->how_many_atoms[grid_index];
-            if (na_grid == 0) {
-                continue;
-            }
-            /**
-             * @brief Prepare block information
-            */
-            ModuleBase::Array_Pool<bool> cal_flag(this->bxyz,max_size);
-
-            Gint_Tools::get_gint_vldr3(vldr3.data(),
-                                        inout->vl,
-                                        this->bxyz,
-                                        this->bx,
-                                        this->by,
-                                        this->bz,
-                                        this->nplane,
-                                        this->gridt->start_ind[grid_index],
-                                        ncyz,
-                                        dv);
-
-            Gint_Tools::get_block_info(*this->gridt, this->bxyz, na_grid, grid_index, 
-                                                block_iw.data(), block_index.data(), block_size.data(), cal_flag.get_ptr_2D());
-
-        /**
-         * @brief Evaluate psi and dpsi on grids
-        */
-        const int LD_pool = block_index[na_grid];
-        ModuleBase::Array_Pool<double> psir_ylm(this->bxyz, LD_pool);
-	    Gint_Tools::cal_psir_ylm(*this->gridt, 
-            this->bxyz, na_grid, grid_index, delta_r,
-            block_index.data(), block_size.data(), 
-            cal_flag.get_ptr_2D(),psir_ylm.get_ptr_2D());
-
-        // psir_ylm_new=psir_func(psir_ylm)
-        // psir_func==nullptr means psir_ylm_new=psir_ylm
-        const ModuleBase::Array_Pool<double> &psir_ylm_1 = (!this->psir_func_1) ? psir_ylm : this->psir_func_1(psir_ylm, *this->gridt, grid_index, 0, block_iw, block_size, block_index, cal_flag);
-        const ModuleBase::Array_Pool<double> &psir_ylm_2 = (!this->psir_func_2) ? psir_ylm : this->psir_func_2(psir_ylm, *this->gridt, grid_index, 0, block_iw, block_size, block_index, cal_flag);
-
-	//calculating f_mu(r) = v(r)*psi_mu(r)*dv
-        const ModuleBase::Array_Pool<double> psir_vlbr3 = Gint_Tools::get_psir_vlbr3(
-                this->bxyz, na_grid, LD_pool, block_index.data(), 
-                cal_flag.get_ptr_2D(), vldr3.data(), psir_ylm_1.get_ptr_2D());
-
-            //integrate (psi_mu*v(r)*dv) * psi_nu on grid
-            //and accumulates to the corresponding element in Hamiltonian
-            this->cal_meshball_vlocal(
-                na_grid, LD_pool, block_size.data(), block_index.data(), grid_index, 
-                cal_flag.get_ptr_2D(),psir_ylm.get_ptr_2D(), psir_vlbr3.get_ptr_2D(),
-                hRGint_kernel);
-        }
-        ModuleBase::TITLE("Gint_interface", "cal_gint_vlocal");
-        ModuleBase::timer::tick("Gint_interface", "cal_gint_vlocal");
-    }
-}
-
-void Gint::gint_kernel_dvlocal(Gint_inout* inout) {
-    ModuleBase::TITLE("Gint_interface", "cal_gint_dvlocal");
-    ModuleBase::timer::tick("Gint_interface", "cal_gint_dvlocal");
-    const UnitCell& ucell = *this->ucell;
-    const int max_size = this->gridt->max_atom;
-    const int lgd = this->gridt->lgd;
-    const int nnrg = pvdpRx_reduced[inout->ispin].get_nnr();
-    const int ncyz = this->ny * this->nplane;
-    const double dv = ucell.omega / this->ncxyz;
-    const double delta_r = this->gridt->dr_uniform;
-
-    if (PARAM.globalv.gamma_only_local) {
-        ModuleBase::WARNING_QUIT("Gint_interface::cal_gint","dvlocal only for k point!");
-    }
-    pvdpRx_reduced[inout->ispin].set_zero();
-    pvdpRy_reduced[inout->ispin].set_zero();
-    pvdpRz_reduced[inout->ispin].set_zero();
-
-#pragma omp parallel 
-{
-    std::vector<int> block_iw(max_size,0);
-    std::vector<int> block_index(max_size+1,0);
-    std::vector<int> block_size(max_size,0);
-    std::vector<double> vldr3(this->bxyz,0.0);
-#pragma omp for schedule(dynamic)
-    for (int grid_index = 0; grid_index < this->nbxx; grid_index++) {
-        const int na_grid = this->gridt->how_many_atoms[grid_index];
-        if (na_grid == 0) {
-            continue;
-        }
-        Gint_Tools::get_gint_vldr3(vldr3.data(),
-                                    inout->vl,
-                                    this->bxyz,
-                                    this->bx,
-                                    this->by,
-                                    this->bz,
-                                    this->nplane,
-                                    this->gridt->start_ind[grid_index],
-                                    ncyz,
-                                    dv);
-    //prepare block information
-        ModuleBase::Array_Pool<bool> cal_flag(this->bxyz,max_size);
-        Gint_Tools::get_block_info(*this->gridt, this->bxyz, na_grid, grid_index, 
-                                    block_iw.data(), block_index.data(), block_size.data(), cal_flag.get_ptr_2D());
-        
-	//evaluate psi and dpsi on grids
-        const int LD_pool = block_index[na_grid];
-
-        ModuleBase::Array_Pool<double> psir_ylm(this->bxyz, LD_pool);
-        ModuleBase::Array_Pool<double> dpsir_ylm_x(this->bxyz, LD_pool);
-        ModuleBase::Array_Pool<double> dpsir_ylm_y(this->bxyz, LD_pool);
-        ModuleBase::Array_Pool<double> dpsir_ylm_z(this->bxyz, LD_pool);
-        Gint_Tools::cal_dpsir_ylm(*this->gridt, this->bxyz, na_grid, grid_index, delta_r, 
-                                    block_index.data(), block_size.data(), cal_flag.get_ptr_2D(),psir_ylm.get_ptr_2D(),
-                                    dpsir_ylm_x.get_ptr_2D(), dpsir_ylm_y.get_ptr_2D(), dpsir_ylm_z.get_ptr_2D());
-
-	//calculating f_mu(r) = v(r)*psi_mu(r)*dv
-        const ModuleBase::Array_Pool<double> psir_vlbr3 = Gint_Tools::get_psir_vlbr3(
-                this->bxyz, na_grid, LD_pool, block_index.data(), cal_flag.get_ptr_2D(), vldr3.data(), psir_ylm.get_ptr_2D());
-
-	//integrate (psi_mu*v(r)*dv) * psi_nu on grid
-	//and accumulates to the corresponding element in Hamiltonian
-        this->cal_meshball_vlocal(na_grid, LD_pool, block_size.data(), block_index.data(),
-                                    grid_index, cal_flag.get_ptr_2D(),psir_vlbr3.get_ptr_2D(),
-                                    dpsir_ylm_x.get_ptr_2D(), &this->pvdpRx_reduced[inout->ispin]);
-        this->cal_meshball_vlocal(na_grid, LD_pool, block_size.data(), block_index.data(),
-                                    grid_index, cal_flag.get_ptr_2D(),psir_vlbr3.get_ptr_2D(),
-                                    dpsir_ylm_y.get_ptr_2D(), &this->pvdpRy_reduced[inout->ispin]);
-        this->cal_meshball_vlocal(na_grid, LD_pool, block_size.data(), block_index.data(),
-                                    grid_index, cal_flag.get_ptr_2D(),psir_vlbr3.get_ptr_2D(),
-                                    dpsir_ylm_z.get_ptr_2D(), &this->pvdpRz_reduced[inout->ispin]);
-    }
-}
-    ModuleBase::TITLE("Gint_interface", "cal_gint_dvlocal");
-    ModuleBase::timer::tick("Gint_interface", "cal_gint_dvlocal");
-}
-
-void Gint::gint_kernel_vlocal_meta(Gint_inout* inout) {
-    ModuleBase::TITLE("Gint_interface", "cal_gint_vlocal_meta");
-    ModuleBase::timer::tick("Gint_interface", "cal_gint_vlocal_meta");
-    const UnitCell& ucell = *this->ucell;
-    const int max_size = this->gridt->max_atom;
-    const int lgd = this->gridt->lgd;
-    const int ncyz = this->ny * this->nplane;
-    const double dv = ucell.omega / this->ncxyz;
-    const double delta_r = this->gridt->dr_uniform;
-    hamilt::HContainer<double>* hRGint_kernel = PARAM.inp.nspin != 4 ? this->hRGint : this->hr_gint_tmp[inout->ispin];
-    hRGint_kernel->set_zero();
-    const int nnrg = hRGint_kernel->get_nnr();
-
-#pragma omp parallel
-{
-    // define HContainer here to reference.
-    //Under the condition of gamma_only, hRGint will be instantiated.
-    std::vector<int> block_iw(max_size,0);
-    std::vector<int> block_index(max_size+1,0);
-    std::vector<int> block_size(max_size,0);
-    std::vector<double> vldr3(this->bxyz,0.0);
-    std::vector<double> vkdr3(this->bxyz,0.0);
-
-#pragma omp for schedule(dynamic)
-    for (int grid_index = 0; grid_index < this->nbxx; grid_index++) {
-        const int na_grid = this->gridt->how_many_atoms[grid_index];
-        if (na_grid == 0) {
-            continue;
-        }
-        Gint_Tools::get_gint_vldr3(vldr3.data(),
-                                inout->vl,
-                                this->bxyz,
-                                this->bx,
-                                this->by,
-                                this->bz,
-                                this->nplane,
-                                this->gridt->start_ind[grid_index],
-                                ncyz,
-                                dv);
-        Gint_Tools::get_gint_vldr3(vkdr3.data(),
-                                    inout->vofk,
-                                    this->bxyz,
-                                    this->bx,
-                                    this->by,
-                                    this->bz,
-                                    this->nplane,
-                                    this->gridt->start_ind[grid_index],
-                                    ncyz,
-                                    dv);
-        //prepare block information
-        ModuleBase::Array_Pool<bool> cal_flag(this->bxyz,max_size);
-	    Gint_Tools::get_block_info(*this->gridt, this->bxyz, na_grid, grid_index, 
-                                    block_iw.data(), block_index.data(), block_size.data(), cal_flag.get_ptr_2D());
-
-        //evaluate psi and dpsi on grids
-        const int LD_pool = block_index[na_grid];
-        ModuleBase::Array_Pool<double> psir_ylm(this->bxyz, LD_pool);
-        ModuleBase::Array_Pool<double> dpsir_ylm_x(this->bxyz, LD_pool);
-        ModuleBase::Array_Pool<double> dpsir_ylm_y(this->bxyz, LD_pool);
-        ModuleBase::Array_Pool<double> dpsir_ylm_z(this->bxyz, LD_pool);
-
-        Gint_Tools::cal_dpsir_ylm(*this->gridt,
-            this->bxyz, na_grid, grid_index, delta_r,
-            block_index.data(), block_size.data(), 
-            cal_flag.get_ptr_2D(),
-            psir_ylm.get_ptr_2D(),
-            dpsir_ylm_x.get_ptr_2D(),
-            dpsir_ylm_y.get_ptr_2D(),
-            dpsir_ylm_z.get_ptr_2D()
-        );
-	
-	    //calculating f_mu(r) = v(r)*psi_mu(r)*dv
-	    const ModuleBase::Array_Pool<double> psir_vlbr3 = Gint_Tools::get_psir_vlbr3(
-		    	this->bxyz, na_grid, LD_pool, block_index.data(), cal_flag.get_ptr_2D(), vldr3.data(), psir_ylm.get_ptr_2D());
-
-	    //calculating df_mu(r) = vofk(r) * dpsi_mu(r) * dv
-	    const ModuleBase::Array_Pool<double> dpsix_vlbr3 = Gint_Tools::get_psir_vlbr3(
-			this->bxyz, na_grid, LD_pool, block_index.data(), cal_flag.get_ptr_2D(), vkdr3.data(), dpsir_ylm_x.get_ptr_2D());
-	    const ModuleBase::Array_Pool<double> dpsiy_vlbr3 = Gint_Tools::get_psir_vlbr3(
-			this->bxyz, na_grid, LD_pool, block_index.data(), cal_flag.get_ptr_2D(), vkdr3.data(), dpsir_ylm_y.get_ptr_2D());	
-	    const ModuleBase::Array_Pool<double> dpsiz_vlbr3 = Gint_Tools::get_psir_vlbr3(
-			this->bxyz, na_grid, LD_pool, block_index.data(), cal_flag.get_ptr_2D(), vkdr3.data(), dpsir_ylm_z.get_ptr_2D());
-
-
-        //integrate (psi_mu*v(r)*dv) * psi_nu on grid
-        //and accumulates to the corresponding element in Hamiltonian
-        this->cal_meshball_vlocal(
-            na_grid, LD_pool, block_size.data(), block_index.data(), grid_index, cal_flag.get_ptr_2D(),
-            psir_ylm.get_ptr_2D(), psir_vlbr3.get_ptr_2D(), hRGint_kernel);
-        //integrate (d/dx_i psi_mu*vk(r)*dv) * (d/dx_i psi_nu) on grid (x_i=x,y,z)
-        //and accumulates to the corresponding element in Hamiltonian
-        this->cal_meshball_vlocal(
-            na_grid, LD_pool, block_size.data(), block_index.data(), grid_index, cal_flag.get_ptr_2D(),
-            dpsir_ylm_x.get_ptr_2D(), dpsix_vlbr3.get_ptr_2D(), hRGint_kernel);
-        this->cal_meshball_vlocal(
-            na_grid, LD_pool, block_size.data(), block_index.data(), grid_index, cal_flag.get_ptr_2D(),
-            dpsir_ylm_y.get_ptr_2D(), dpsiy_vlbr3.get_ptr_2D(), hRGint_kernel);
-        this->cal_meshball_vlocal(
-            na_grid, LD_pool, block_size.data(), block_index.data(), grid_index, cal_flag.get_ptr_2D(),
-            dpsir_ylm_z.get_ptr_2D(), dpsiz_vlbr3.get_ptr_2D(), hRGint_kernel);
-    }
-}
-
-    ModuleBase::TITLE("Gint_interface", "cal_gint_vlocal_meta");
-    ModuleBase::timer::tick("Gint_interface", "cal_gint_vlocal_meta");
-}
\ No newline at end of file
diff --git a/source/source_lcao/module_gint/temp_gint/gint_vl_gpu.cpp b/source/source_lcao/module_gint/gint_vl_gpu.cpp
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/gint_vl_gpu.cpp
rename to source/source_lcao/module_gint/gint_vl_gpu.cpp
diff --git a/source/source_lcao/module_gint/gint_vl_gpu.cu b/source/source_lcao/module_gint/gint_vl_gpu.cu
deleted file mode 100644
index ddbca83a60..0000000000
--- a/source/source_lcao/module_gint/gint_vl_gpu.cu
+++ /dev/null
@@ -1,219 +0,0 @@
-#ifdef _OPENMP
-#include <omp.h>
-#endif
-
-#include "kernels/cuda/cuda_tools.cuh"
-#include "source_base/ylm.h"
-#include "gint_vl_gpu.h"
-#include "kernels/cuda/gint_vl.cuh"
-
-namespace GintKernel
-{
-
-/**
- * Computes the gamma component of the VL (Vlocal) integral on the GPU.
- *
- * @note The grid integration on the GPU is mainly divided into the following
- * steps:
- * 1. Use the CPU to divide the grid integration into subtasks.
- * 2. Copy the subtask information to the GPU.
- * 3. Calculate the matrix elements on the GPU.
- * 4. Perform matrix multiplication on the GPU.
- * 5. Copy the results back to the host.
- */
-void gint_vl_gpu(hamilt::HContainer<double>* hRGint,
-                 const double* vlocal,
-                 const double* ylmcoef_now,
-                 const double dr,
-                 const double* rcut,
-                 const Grid_Technique& gridt,
-                 const UnitCell& ucell)
-{
-    checkCuda(cudaSetDevice(gridt.dev_id));
-    // checkCuda(cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync));
-    const int nbzp = gridt.nbzp;
-    const int num_streams = gridt.nstreams;
-    const int max_atom = gridt.max_atom;
-    const int max_atom_per_bcell = max_atom * gridt.bxyz;
-    const int max_atom_per_z = max_atom_per_bcell * nbzp;
-    const int max_phi_per_z = max_atom_per_z * ucell.nwmax;
-    const int max_atompair_per_z = max_atom * max_atom * nbzp;
-    const double vfactor = ucell.omega / gridt.ncxyz;
-    const int nczp = nbzp * gridt.bz;
-    std::vector<cudaStream_t> streams(num_streams);
-    std::vector<cudaEvent_t> events(num_streams);
-
-    for (int i = 0; i < num_streams; i++)
-    {
-        checkCuda(cudaStreamCreate(&streams[i]));
-        checkCuda(cudaEventCreateWithFlags(&events[i], cudaEventDisableTiming));
-    }
-
-    const int nnrg = hRGint->get_nnr();
-    hRGint->set_zero();
-    Cuda_Mem_Wrapper<double> grid_vlocal_g(nnrg, 1, false);
-    grid_vlocal_g.memset_device_sync();
-
-    Cuda_Mem_Wrapper<double> dr_part(max_atom_per_z * 3, num_streams, true);
-    Cuda_Mem_Wrapper<uint8_t> atoms_type(max_atom_per_z, num_streams, true);
-    // The first number in every group of two represents the number of atoms on that bigcell.
-    // The second number represents the cumulative number of atoms up to that bigcell.
-    Cuda_Mem_Wrapper<int> atoms_num_info(2 * nbzp, num_streams, true);
-    Cuda_Mem_Wrapper<double> vldr3(nbzp * gridt.bxyz, num_streams, true);
-
-    Cuda_Mem_Wrapper<double> psi(max_phi_per_z, num_streams, false);
-    Cuda_Mem_Wrapper<double> psi_vldr3(max_phi_per_z, num_streams, false);
-
-    Cuda_Mem_Wrapper<int> gemm_m(max_atompair_per_z, num_streams, true);
-    Cuda_Mem_Wrapper<int> gemm_n(max_atompair_per_z, num_streams, true);
-    Cuda_Mem_Wrapper<int> gemm_k(max_atompair_per_z, num_streams, true);
-    Cuda_Mem_Wrapper<int> gemm_lda(max_atompair_per_z, num_streams, true);
-    Cuda_Mem_Wrapper<int> gemm_ldb(max_atompair_per_z, num_streams, true);
-    Cuda_Mem_Wrapper<int> gemm_ldc(max_atompair_per_z, num_streams, true);
-    Cuda_Mem_Wrapper<double*> gemm_A(max_atompair_per_z, num_streams, true);
-    Cuda_Mem_Wrapper<double*> gemm_B(max_atompair_per_z, num_streams, true);
-    Cuda_Mem_Wrapper<double*> gemm_C(max_atompair_per_z, num_streams, true);
-
-#ifdef _OPENMP
-const int max_thread_num = std::min(omp_get_max_threads(), num_streams);
-#endif
-#pragma omp parallel num_threads(max_thread_num)
-{
-#ifdef _OPENMP
-    const int tid = omp_get_thread_num();
-    const int num_threads = omp_get_num_threads();
-    const int sid_start = tid * num_streams / num_threads;
-    const int thread_num_streams = tid == num_threads - 1 ? num_streams - sid_start : num_streams / num_threads;
-#else
-    const int sid_start = 0;
-    const int thread_num_streams = num_streams;
-#endif
-#pragma omp for collapse(2) schedule(dynamic)
-    for (int i = 0; i < gridt.nbx; i++)
-    {
-        for (int j = 0; j < gridt.nby; j++)
-        {
-            // 20240620 Note that it must be set again here because 
-            // cuda's device is not safe in a multi-threaded environment.
-            checkCuda(cudaSetDevice(gridt.dev_id));
-
-            const int sid = (i * gridt.nby + j) % thread_num_streams + sid_start;
-            checkCuda(cudaEventSynchronize(events[sid]));
-            int max_m = 0;
-            int max_n = 0;
-            int atom_pair_num = 0;
-            int atoms_per_z = 0;
-            const int grid_index_ij = i * gridt.nby * nbzp + j * nbzp;
-            
-            gtask_vlocal(gridt,
-                         ucell,
-                         grid_index_ij,
-                         nczp,
-                         vfactor,
-                         vlocal,
-                         atoms_per_z,
-                         atoms_num_info.get_host_pointer(sid),
-                         atoms_type.get_host_pointer(sid),
-                         dr_part.get_host_pointer(sid),
-                         vldr3.get_host_pointer(sid));
-        
-            alloc_mult_vlocal(hRGint,
-                              gridt,
-                              ucell,
-                              grid_index_ij,
-                              max_atom,
-                              psi.get_device_pointer(sid),
-                              psi_vldr3.get_device_pointer(sid),
-                              grid_vlocal_g.get_device_pointer(),
-                              gemm_m.get_host_pointer(sid),
-                              gemm_n.get_host_pointer(sid),
-                              gemm_k.get_host_pointer(sid),
-                              gemm_lda.get_host_pointer(sid),
-                              gemm_ldb.get_host_pointer(sid),
-                              gemm_ldc.get_host_pointer(sid),
-                              gemm_A.get_host_pointer(sid),
-                              gemm_B.get_host_pointer(sid),
-                              gemm_C.get_host_pointer(sid),
-                              atom_pair_num,
-                              max_m,
-                              max_n);
-
-            dr_part.copy_host_to_device_async(streams[sid], sid, atoms_per_z * 3);
-            atoms_type.copy_host_to_device_async(streams[sid], sid, atoms_per_z);
-            vldr3.copy_host_to_device_async(streams[sid], sid);
-            atoms_num_info.copy_host_to_device_async(streams[sid], sid, 2 * nbzp);
-            
-            gemm_m.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
-            gemm_n.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
-            gemm_k.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
-            gemm_lda.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
-            gemm_ldb.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
-            gemm_ldc.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
-            gemm_A.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
-            gemm_B.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
-            gemm_C.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
-            checkCuda(cudaEventRecord(events[sid], streams[sid]));
-            
-            psi.memset_device_async(streams[sid], sid, 0);
-            psi_vldr3.memset_device_async(streams[sid], sid, 0);
-
-            dim3 grid_psi(nbzp, gridt.bxyz);
-            dim3 block_psi(64);
-            get_psi_and_vldr3<<<grid_psi,
-                                block_psi,
-                                0,
-                                streams[sid]>>>(
-                gridt.ylmcoef_g,
-                dr,
-                gridt.bxyz,
-                ucell.nwmax,
-                max_atom,
-                gridt.atom_nwl_g,
-                gridt.atom_new_g,
-                gridt.atom_ylm_g,
-                gridt.atom_nw_g,
-                gridt.rcut_g,
-                gridt.nr_max,
-                gridt.psi_u_g,
-                gridt.mcell_pos_g,
-                dr_part.get_device_pointer(sid),
-                vldr3.get_device_pointer(sid),
-                atoms_type.get_device_pointer(sid),
-                atoms_num_info.get_device_pointer(sid),
-                psi.get_device_pointer(sid),
-                psi_vldr3.get_device_pointer(sid));
-            checkCudaLastError();
-            
-            gridt.fastest_matrix_mul(max_m,
-                                     max_n,
-                                     gemm_m.get_device_pointer(sid),
-                                     gemm_n.get_device_pointer(sid),
-                                     gemm_k.get_device_pointer(sid),
-                                     gemm_A.get_device_pointer(sid),
-                                     gemm_lda.get_device_pointer(sid),
-                                     gemm_B.get_device_pointer(sid),
-                                     gemm_ldb.get_device_pointer(sid),
-                                     gemm_C.get_device_pointer(sid),
-                                     gemm_ldc.get_device_pointer(sid),
-                                     atom_pair_num,
-                                     streams[sid],
-                                     nullptr);
-            checkCudaLastError();
-        }
-    }
-}
-
-    checkCuda(cudaMemcpy(
-        hRGint->get_wrapper(),
-        grid_vlocal_g.get_device_pointer(),
-        nnrg * sizeof(double),
-        cudaMemcpyDeviceToHost));
-
-    for (int i = 0; i < num_streams; i++)
-    {
-        checkCuda(cudaStreamDestroy(streams[i]));
-        checkCuda(cudaEventDestroy(events[i]));
-    }
-}
-
-} // namespace GintKernel
\ No newline at end of file
diff --git a/source/source_lcao/module_gint/gint_vl_gpu.h b/source/source_lcao/module_gint/gint_vl_gpu.h
index a04b6a130d..a671b6b33a 100644
--- a/source/source_lcao/module_gint/gint_vl_gpu.h
+++ b/source/source_lcao/module_gint/gint_vl_gpu.h
@@ -1,53 +1,49 @@
-#ifndef GINT_VL_GPU_H
-#define GINT_VL_GPU_H
+#pragma once
 
+#include <memory>
+#include <vector>
+#include "source_lcao/module_hcontainer/hcontainer.h"
 #include "gint.h"
-#include "grid_technique.h"
-#include "kernels/cuda/cuda_tools.cuh"
+#include "gint_info.h"
+#include "source_lcao/module_gint/kernel/cuda_mem_wrapper.h"
 
-namespace GintKernel
+namespace ModuleGint
 {
 
-void gint_vl_gpu(hamilt::HContainer<double>* hRGint,
-                 const double* vlocal,
-                 const double* ylmcoef_now,
-                 const double dr,
-                 const double* rcut,
-                 const Grid_Technique& gridt,
-                 const UnitCell& ucell);
-
-void gtask_vlocal(const Grid_Technique& gridt,
-                  const UnitCell& ucell,
-                  const int grid_index_ij,
-                  const int nczp,
-                  const double vfactor,
-                  const double* vlocal_global_value,
-                  int& atoms_per_z,
-                  int* atoms_num_info,
-                  uint8_t* atoms_type,
-                  double* dr_part,
-                  double* vldr3);
-
-void alloc_mult_vlocal(const hamilt::HContainer<double>* hRGint,
-                       const Grid_Technique& gridt,
-                       const UnitCell& ucell,
-                       const int grid_index_ij,
-                       const int max_atom,
-                       double* const psi,
-                       double* const psi_vldr3,
-                       double* const grid_vlocal_g,
-                       int* mat_m,
-                       int* mat_n,
-                       int* mat_k,
-                       int* mat_lda,
-                       int* mat_ldb,
-                       int* mat_ldc,
-                       double** mat_A,
-                       double** mat_B,
-                       double** mat_C,
-                       int& atom_pair_num,
-                       int& max_m,
-                       int& max_n);
-} // namespace GintKernel
-
-#endif
\ No newline at end of file
+class Gint_vl_gpu : public Gint  // NOTE(review): name suggests the GPU path of the local-potential grid integration — confirm in gint_vl_gpu.cpp
+{
+    public:
+    Gint_vl_gpu(  // stores both pointers as passed (no copy is made here) and caches dr3_
+        const double* vr_eff,  // read-only input array; presumably the effective potential on the grid — TODO confirm length/layout
+        HContainer<double>* hR)  // HContainer is unqualified here; presumably a using-declaration for hamilt::HContainer comes from an included header — verify
+        : vr_eff_(vr_eff), hR_(hR), dr3_(gint_info_->get_mgrid_volume()) {}  // gint_info_ is not declared in this class; presumably inherited from Gint and must be valid at construction
+    // Public entry point; only declared in this header, defined out of line.
+    void cal_gint();
+
+    private:
+    // Private steps below are declarations only; how cal_gint() sequences them is not visible in this header.
+    void init_hr_gint_();
+
+    void transfer_cpu_to_gpu_();
+
+    void transfer_gpu_to_cpu_();
+
+    void cal_hr_gint_();
+
+    // input: pointer supplied to the constructor; the pointee is const
+    const double* vr_eff_;
+
+    // NOTE(review): hR_ is a raw non-const pointer; nothing in this header takes ownership of it.
+    // output: pointer supplied to the constructor (presumably the container cal_gint() fills — confirm)
+    HContainer<double>* hR_;
+
+    // Intermediate variables: dr3_ caches gint_info_->get_mgrid_volume() at construction
+    double dr3_;
+
+    HContainer<double> hr_gint_;
+    // NOTE(review): the _d_ suffix presumably marks device-side mirrors of hr_gint_ and vr_eff_ — confirm in the .cpp.
+    CudaMemWrapper<double> hr_gint_d_;
+    CudaMemWrapper<double> vr_eff_d_;
+};
+
+}
\ No newline at end of file
diff --git a/source/source_lcao/module_gint/temp_gint/gint_vl_metagga.cpp b/source/source_lcao/module_gint/gint_vl_metagga.cpp
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/gint_vl_metagga.cpp
rename to source/source_lcao/module_gint/gint_vl_metagga.cpp
diff --git a/source/source_lcao/module_gint/temp_gint/gint_vl_metagga.h b/source/source_lcao/module_gint/gint_vl_metagga.h
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/gint_vl_metagga.h
rename to source/source_lcao/module_gint/gint_vl_metagga.h
diff --git a/source/source_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.cpp b/source/source_lcao/module_gint/gint_vl_metagga_gpu.cpp
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.cpp
rename to source/source_lcao/module_gint/gint_vl_metagga_gpu.cpp
diff --git a/source/source_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.h b/source/source_lcao/module_gint/gint_vl_metagga_gpu.h
similarity index 93%
rename from source/source_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.h
rename to source/source_lcao/module_gint/gint_vl_metagga_gpu.h
index aabae7e52f..f55c409c66 100644
--- a/source/source_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.h
+++ b/source/source_lcao/module_gint/gint_vl_metagga_gpu.h
@@ -5,7 +5,7 @@
 #include "source_lcao/module_hcontainer/hcontainer.h"
 #include "gint.h"
 #include "gint_info.h"
-#include "source_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h"
+#include "source_lcao/module_gint/kernel/cuda_mem_wrapper.h"
 
 namespace ModuleGint
 {
diff --git a/source/source_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4.cpp b/source/source_lcao/module_gint/gint_vl_metagga_nspin4.cpp
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4.cpp
rename to source/source_lcao/module_gint/gint_vl_metagga_nspin4.cpp
diff --git a/source/source_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4.h b/source/source_lcao/module_gint/gint_vl_metagga_nspin4.h
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4.h
rename to source/source_lcao/module_gint/gint_vl_metagga_nspin4.h
diff --git a/source/source_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.cpp b/source/source_lcao/module_gint/gint_vl_metagga_nspin4_gpu.cpp
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.cpp
rename to source/source_lcao/module_gint/gint_vl_metagga_nspin4_gpu.cpp
diff --git a/source/source_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.h b/source/source_lcao/module_gint/gint_vl_metagga_nspin4_gpu.h
similarity index 93%
rename from source/source_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.h
rename to source/source_lcao/module_gint/gint_vl_metagga_nspin4_gpu.h
index c5f6f7c729..9c1b8ca166 100644
--- a/source/source_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.h
+++ b/source/source_lcao/module_gint/gint_vl_metagga_nspin4_gpu.h
@@ -5,7 +5,7 @@
 #include "source_lcao/module_hcontainer/hcontainer.h"
 #include "gint.h"
 #include "gint_info.h"
-#include "source_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h"
+#include "source_lcao/module_gint/kernel/cuda_mem_wrapper.h"
 
 namespace ModuleGint
 {
diff --git a/source/source_lcao/module_gint/temp_gint/gint_vl_nspin4.cpp b/source/source_lcao/module_gint/gint_vl_nspin4.cpp
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/gint_vl_nspin4.cpp
rename to source/source_lcao/module_gint/gint_vl_nspin4.cpp
diff --git a/source/source_lcao/module_gint/temp_gint/gint_vl_nspin4.h b/source/source_lcao/module_gint/gint_vl_nspin4.h
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/gint_vl_nspin4.h
rename to source/source_lcao/module_gint/gint_vl_nspin4.h
diff --git a/source/source_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.cpp b/source/source_lcao/module_gint/gint_vl_nspin4_gpu.cpp
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.cpp
rename to source/source_lcao/module_gint/gint_vl_nspin4_gpu.cpp
diff --git a/source/source_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.h b/source/source_lcao/module_gint/gint_vl_nspin4_gpu.h
similarity index 94%
rename from source/source_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.h
rename to source/source_lcao/module_gint/gint_vl_nspin4_gpu.h
index 6d17a9a1bb..2e1aa1a475 100644
--- a/source/source_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.h
+++ b/source/source_lcao/module_gint/gint_vl_nspin4_gpu.h
@@ -5,7 +5,7 @@
 #include "source_lcao/module_hcontainer/hcontainer.h"
 #include "gint.h"
 #include "gint_info.h"
-#include "source_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h"
+#include "source_lcao/module_gint/kernel/cuda_mem_wrapper.h"
 
 namespace ModuleGint
 {
diff --git a/source/source_lcao/module_gint/gint_vl_old.cpp b/source/source_lcao/module_gint/gint_vl_old.cpp
deleted file mode 100644
index 9ebc341d7f..0000000000
--- a/source/source_lcao/module_gint/gint_vl_old.cpp
+++ /dev/null
@@ -1,94 +0,0 @@
-#include "source_base/global_function.h"
-#include "source_base/global_variable.h"
-#include "gint_k.h"
-#include "source_basis/module_ao/ORB_read.h"
-#include "grid_technique.h"
-#include "source_base/ylm.h"
-#include "source_pw/module_pwdft/global.h"
-#include "source_base/timer.h"
-#include "source_base/array_pool.h"
-#include "source_base/vector3.h"
-//#include <mkl_cblas.h>
-
-#ifdef _OPENMP
-#include <omp.h>
-#endif
-
-#ifdef __MKL
-#include <mkl_service.h>
-#endif
-
-// this is a thread-safe function
-void Gint::cal_meshball_vlocal(
-	const int na_grid,  					    // how many atoms on this (i,j,k) grid
-	const int LD_pool,
-	const int*const block_size, 			    // block_size[na_grid],	number of columns of a band
-	const int*const block_index,		    	// block_index[na_grid+1], count total number of atomis orbitals
-	const int grid_index,                       // index of grid group, for tracing global atom index
-	const bool*const*const cal_flag,	    	// cal_flag[this->bxyz][na_grid],	whether the atom-grid distance is larger than cutoff
-	const double*const*const psir_ylm,		    // psir_ylm[this->bxyz][LD_pool]
-	const double*const*const psir_vlbr3,	    // psir_vlbr3[this->bxyz][LD_pool]
-	hamilt::HContainer<double>* hR)	    // this->hRGint is the container of <phi_0 | V | phi_R> matrix element.
-{
-	const char transa='N', transb='T';
-	const double alpha=1, beta=1;
-    const int lgd_now = this->gridt->lgd;
-
-	const int mcell_index = this->gridt->bcell_start[grid_index];
-    std::vector<double> hr_tmp;
-	for(int ia1=0; ia1<na_grid; ++ia1)
-	{
-		const int bcell1 = mcell_index + ia1;
-		const int iat1 = this->gridt->which_atom[bcell1];
-		const int id1 = this->gridt->which_unitcell[bcell1];
-		const ModuleBase::Vector3<int> r1 = this->gridt->get_ucell_coords(id1);
-
-		for(int ia2=0; ia2<na_grid; ++ia2)
-		{
-			const int bcell2 = mcell_index + ia2;
-			const int iat2= this->gridt->which_atom[bcell2];
-			const int id2 = this->gridt->which_unitcell[bcell2];
-			const ModuleBase::Vector3<int> r2 = this->gridt->get_ucell_coords(id2);
-
-			if(iat1<=iat2)
-			{
-                int first_ib=0;
-                for(int ib=0; ib<this->bxyz; ++ib)
-                {
-                    if(cal_flag[ib][ia1] && cal_flag[ib][ia2])
-                    {
-                        first_ib=ib;
-                        break;
-                    }
-                }
-                int last_ib=0;
-                for(int ib=this->bxyz-1; ib>=0; --ib)
-                {
-                    if(cal_flag[ib][ia1] && cal_flag[ib][ia2])
-                    {
-                        last_ib=ib+1;
-                        break;
-                    }
-                }
-                const int ib_length = last_ib-first_ib;
-                if(ib_length<=0) { continue; }
-
-				const auto tmp_matrix = hR->find_matrix(iat1, iat2, r1-r2);
-				if (tmp_matrix == nullptr)
-				{
-					continue;
-				}
-				const int m = tmp_matrix->get_row_size();
-				const int n = tmp_matrix->get_col_size();
-                hr_tmp.resize(m * n);
-                ModuleBase::GlobalFunc::ZEROS(hr_tmp.data(), m*n);
-
-                dgemm_(&transa, &transb, &n, &m, &ib_length, &alpha,
-                    &psir_vlbr3[first_ib][block_index[ia2]], &LD_pool,
-                    &psir_ylm[first_ib][block_index[ia1]], &LD_pool,
-                    &beta, hr_tmp.data(), &n); 
-                tmp_matrix->add_array_ts(hr_tmp.data());
-			}
-		}
-	}
-}
\ No newline at end of file
diff --git a/source/source_lcao/module_gint/grid_bigcell.cpp b/source/source_lcao/module_gint/grid_bigcell.cpp
deleted file mode 100644
index ec5b29970c..0000000000
--- a/source/source_lcao/module_gint/grid_bigcell.cpp
+++ /dev/null
@@ -1,363 +0,0 @@
-#include "grid_bigcell.h"
-
-#include "source_io/module_parameter/parameter.h"
-#include "source_base/memory.h"
-#include "source_base/timer.h"
-#include "source_basis/module_ao/ORB_read.h"
-#include "source_pw/module_pwdft/global.h"
-#include "source_cell/unitcell.h"
-Grid_BigCell::Grid_BigCell()
-{
-    this->orbital_rmax = 0.0;
-    this->nxe = this->nye = this->nze = 0;
-    this->dxe = 0;
-    this->dye = 0;
-    this->dze = 0;
-    this->nxe = 0;
-    this->nye = 0;
-    this->nze = 0;
-    this->nxyze = 0;
-}
-
-Grid_BigCell::~Grid_BigCell()
-{
-}
-
-void Grid_BigCell::init_big_latvec(const UnitCell& ucell)
-{
-	ModuleBase::TITLE("Grid_BigCell","init_big_latvec");
-	// initialize the mesh cell vectors.
-	assert(nbx>0);
-	assert(nby>0);
-	assert(nbz>=0);
-
-	this->nat=ucell.nat;
-	//size of each big room (same shape with unitcell)
-	this->bigcell_vec1=std::vector<double>(3,0.0);
-	this->bigcell_vec1[0]=ucell.a1.x / (double)nbx * ucell.lat0;
-	this->bigcell_vec1[1]=ucell.a1.y / (double)nbx * ucell.lat0;
-	this->bigcell_vec1[2]=ucell.a1.z / (double)nbx * ucell.lat0;
-
-	this->bigcell_vec2=std::vector<double>(3,0.0);
-	this->bigcell_vec2[0]=ucell.a2.x / (double)nby * ucell.lat0;
-	this->bigcell_vec2[1]=ucell.a2.y / (double)nby * ucell.lat0;
-	this->bigcell_vec2[2]=ucell.a2.z / (double)nby * ucell.lat0;
-
-	this->bigcell_vec3=std::vector<double>(3,0.0);
-	this->bigcell_vec3[0]=ucell.a3.x / (double)nbz * ucell.lat0;
-	this->bigcell_vec3[1]=ucell.a3.y / (double)nbz * ucell.lat0;
-	this->bigcell_vec3[2]=ucell.a3.z / (double)nbz * ucell.lat0;
-
-	this->bigcell_latvec0.e11 = this->bigcell_vec1[0];
-	this->bigcell_latvec0.e12 = this->bigcell_vec1[1];
-	this->bigcell_latvec0.e13 = this->bigcell_vec1[2];
-
-	this->bigcell_latvec0.e21 = this->bigcell_vec2[0];
-	this->bigcell_latvec0.e22 = this->bigcell_vec2[1];
-	this->bigcell_latvec0.e23 = this->bigcell_vec2[2];
-
-	this->bigcell_latvec0.e31 = this->bigcell_vec3[0];
-	this->bigcell_latvec0.e32 = this->bigcell_vec3[1];
-	this->bigcell_latvec0.e33 = this->bigcell_vec3[2];
-
-	// why we need GT = bigcell_latvec0^(-1)?
-	// note that (i,j,k) is a grid point.
-	// (x,y,z) is the cartesian coordinates.
-	// because
-	// (x,y,z) = (i,j,k) * bigcell_latvec0
-	// once we know (x,y,z) and bigcell_latvec0
-	// we need to transform the formula to
-	// (x,y,z) * bigcell_latvec0^(-1) = (i,j,k)
-	this->bigcell_GT = this->bigcell_latvec0.Inverse();
-
-	if(PARAM.inp.test_gridt)
-	{
-		GlobalV::ofs_running << " the VECTORS of BIGCELL are (Bohr): " << std::endl;
-		GlobalV::ofs_running << " vec1( " 
-			<< std::setw(15) << bigcell_vec1[0]
-			<< std::setw(15) << bigcell_vec1[1]
-			<< std::setw(15) << bigcell_vec1[2] 
-			<< ")" << std::endl;
-
-		GlobalV::ofs_running << " vec2( " 
-			<< std::setw(15) << bigcell_vec2[0]
-			<< std::setw(15) << bigcell_vec2[1]
-			<< std::setw(15) << bigcell_vec2[2]
-			<< ")" << std::endl;
-
-		GlobalV::ofs_running << " vec3( " 
-			<< std::setw(15) << bigcell_vec3[0]
-			<< std::setw(15) << bigcell_vec3[1]
-			<< std::setw(15) << bigcell_vec3[2]
-			<< ")" << std::endl;
-	}
-	return;
-}
-
-
-void Grid_BigCell::init_grid_expansion(const UnitCell& ucell,double* rcut)
-{
-	ModuleBase::TITLE("Grid_BigCell","init_grid_expansion");
-
-	// calculate the max cutoff radius among all orbitals.
-	// then we will use this parameter to generate grid expansion.
-
-	for(int T=0; T<ucell.ntype; T++)
-	{
-		this->orbital_rmax = std::max( rcut[T], this->orbital_rmax);
-	}
-	if(PARAM.inp.test_gridt)ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running,"rmax of periodic grid (bohr)",orbital_rmax);
-
-	// mohan fixed serious bug 2010-03-06
-	// G = GT^T
-	// g1 = the norm of first std::vector of G 
-	// g2 = the norm of second std::vector of G 
-	// g3 = the norm of third std::vector of G 
-	double g1 = sqrt(bigcell_GT.e11 * bigcell_GT.e11 
-	+ bigcell_GT.e21 * bigcell_GT.e21 
-	+ bigcell_GT.e31 * bigcell_GT.e31);
-	
-	double g2 = sqrt(bigcell_GT.e12 * bigcell_GT.e12 
-	+ bigcell_GT.e22 * bigcell_GT.e22 
-	+ bigcell_GT.e32 * bigcell_GT.e32);
-	
-	double g3 = sqrt(bigcell_GT.e13 * bigcell_GT.e13 
-	+ bigcell_GT.e23 * bigcell_GT.e23 
-	+ bigcell_GT.e33 * bigcell_GT.e33);
-
-	// we assume the added bigcell can present even the atom
-	// is at the edge of the origin grid.
-	// mohan add +1, 2011-04-23
-	this->dxe = static_cast<int>( this->orbital_rmax * g1) +1;
-	this->dye = static_cast<int>( this->orbital_rmax * g2) +1;
-	this->dze = static_cast<int>( this->orbital_rmax * g3) +1;
-	//xiaohui add 'PARAM.inp.out_level' line, 2015-09-16
-	if(PARAM.inp.out_level != "m") ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running,"extended fft grid",dxe,dye,dze);
-
-	// calculate the dimension of expanded grid.
-	// +1 in order to cover the spillage atom on the right side.
-	assert(nbx>0);
-	assert(nby>0);
-	assert(nbz>=0);
-
-	this->nxe = nbx + 2*dxe +1;
-	this->nye = nby + 2*dye +1;
-	this->nze = nbz + 2*dze +1;
-	this->nxyze = this->nxe * this->nye * this->nze;
-
-	if(PARAM.inp.out_level != "m") ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running,"dimension of extened grid",nxe,nye,nze);
-	return;
-}
-
-
-void Grid_BigCell::init_tau_in_bigcell(const UnitCell& ucell)
-{
-	ModuleBase::TITLE("Grid_BigCell","init_tau_in_bigcell");
-	
-	// allcoate space for atom positions relative
-	// to meshcell.
-	this->tau_in_bigcell = std::vector<std::vector<double>>(ucell.nat,std::vector<double>(3,0.0));
-	ModuleBase::Memory::record("tau_in_bigcell", sizeof(double) * ucell.nat*3);
-	// allocate space, these arrays record which meshcell
-	// the atom is in.
-	this->index_atom = std::vector<int>(ucell.nat, 0);
-	ModuleBase::Memory::record("index_atom", sizeof(double) * ucell.nat);
-	
-	// get the fraction number of (i,j,k)
-	ModuleBase::Vector3<double> fraction;
-	int iat=0;
-	int ii,jj,kk;
-	double delta[3];
-	for(int it=0; it<ucell.ntype; it++)
-	{
-		for(int ia=0; ia<ucell.atoms[it].na; ia++)
-		{
-			// direct positions of atoms calculated from cartesian coordinates.
-			// not used because the factrion may be <0 (although very small, such as
-			// -1.0e-15) mohan note 2012-07-03
-			//fraction = ( ucell.atoms[it].tau[ia] * ucell.lat0 )* this->bigcell_GT;
-
-			//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-			// mohan add 2012-07-03,
-			// this can make sure faction are always larger than 0.
-			//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-			fraction.x = ucell.atoms[it].taud[ia].x / (1.0/(double)nbx);
-			fraction.y = ucell.atoms[it].taud[ia].y / (1.0/(double)nby);
-			fraction.z = ucell.atoms[it].taud[ia].z / (1.0/(double)nbz);
-
-			// never use the following, especially for k-algorithm,
-			// it may move the atom to a cell that it doesn't belong 
-			// to
-			//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-			// mohan add 2012-06-07
-			// fraction may be very very small, about -1.0e-15,
-			// and the fraction must > 0, so I use periodic boundary condition
-//			if( fraction.x < 0.0 ) fraction.x += nxe;
-//			if( fraction.y < 0.0 ) fraction.y += nye;
-//			if( fraction.z < 0.0 ) fraction.z += nze;
-
-
-
-			if( fraction.x < 0 || fraction.y < 0 || fraction.z < 0)
-			{
-				std::cout << " Atom positions " << std::endl;
-				std::cout << ucell.atoms[it].tau[ia].x << " " ;
-				std::cout << ucell.atoms[it].tau[ia].y << " " ;
-				std::cout << ucell.atoms[it].tau[ia].z << " " ;
-				std::cout << " fraction " << std::endl;
-				std::cout << fraction.x << " ";
-				std::cout << fraction.y << " ";
-				std::cout << fraction.z << " ";
-				std::cout << std::endl;
-				ModuleBase::WARNING_QUIT("Grid_BigCell::init_tau_in_bigcell","fraction.x<0 || fraction.y<0 || fraction.z<0");
-			}
-
-			assert(fraction.x >= 0.0);
-			assert(fraction.y >= 0.0);
-			assert(fraction.z >= 0.0);
-
-			// make clean which meshcell the atom is in.
-			ii = static_cast<int>(fraction.x+1.0e-8);
-			jj = static_cast<int>(fraction.y+1.0e-8);
-			kk = static_cast<int>(fraction.z+1.0e-8);
-	
-			// calculate the index of each corresponding meshcell.
-			// Notice ! In fact, we need to minus ii,jj,kk by 1.
-			// to label the atom belong to which meshcell
-			// in a usual way: left, down corner.
-			// if we dont' do this, means the start position 
-			// of atom is another tyep: right,up corner.
-			// which cause minus atom position in grid integration.
-
-			// index_atom: atom 'iat' index in extended grid.
-			this->index_atom[iat] = (kk+dze) + (jj+dye) * this->nze + (ii+dxe) * this->nye * this->nze;
-
-			/*
-			if(index_atom[iat]==3483935)
-			{
-				std::cout << "\n i=" << kk+dze << " j=" << jj+dye << " k=" << ii+dxe;
-				BLOCK_HERE("check index atom");
-			}
-			*/
-
-			// get the relative position in direct coordinate.
-			delta[0] = fraction.x - (double)ii;
-			delta[1] = fraction.y - (double)jj;
-			delta[2] = fraction.z - (double)kk;
-			
-			if( std::abs(delta[0]) < 1.0e-8) delta[0] = 0.0;
-			if( std::abs(delta[1]) < 1.0e-8) delta[1] = 0.0;
-			if( std::abs(delta[2]) < 1.0e-8) delta[2] = 0.0;
-
-//			std::cout << " fraction=" << fraction.x << " " << fraction.y << " " << fraction.z << std::endl;
-//			std::cout << " delta=" << delta[0] << " " << delta[1] << " " << delta[2] << std::endl;
-
-			// get the true relative cartesian coordinate of each atom to the coresponding
-			// meshcell.
-			for(int ic=0; ic<3; ic++)
-			{
-				this->tau_in_bigcell[iat][ic] = 
-					delta[0] * this->bigcell_vec1[ic] + 
-					delta[1] * this->bigcell_vec2[ic] + 
-					delta[2] * this->bigcell_vec3[ic];
-			}
-
-			++iat;
-		}
-	}
-
-	return;
-}
-
-// (3)
-// if f2normal == true, calculate the index2normal.
-// if f2normal == false, calculate the index2cell. 
-void Grid_BigCell::grid_expansion_index(bool f2normal, int *target)const
-{
-	ModuleBase::TITLE("Grid_BigCell","grid_expansion_index");
-	ModuleBase::timer::tick("Grid_BigCell","grid_expansion_index");
-	
-	int ii,jj,kk,in_ext,in_normal;
-	for(int i=0; i<this->nxe; i++)
-	{
-		for(int j=0; j<this->nye; j++)
-		{
-			for(int k=0; k<this->nze; k++)
-			{
-				in_ext = k + j * this->nze + i * this->nye * this->nze;
-				
-				// range from [-dxe,ncx+dxe]
-				ii = i - this->dxe;
-				jj = j - this->dye;
-				kk = k - this->dze;
-
-				//---------------------------------------------------
-				// mohan add 2010-10-28	
-				// be careful of the box.
-				// it's useful only when k points are used in LCAO.
-				// for example, we construct a 2D supercell
-				// and using 32 * 32 FFT grid (bigcell ) to do 
-				// grid integration,
-				// then the first cell (0,0) along x is [0,31)
-				// others are:
-				// cell index: (-2,0)   , (-1,0)  , (0,0),  (0,1)
-				// fft index: [-64,-33], [-32,-1], [0,31], [32,63].
-				// look at the formulas below,
-				// at first, we take grid_index2ucell1=(ii/nbx)
-				// but then we found it is wrong if ii < 0.
-				// for example, if ii is -31, the box is -1,
-				// so we add -1, the formula turns to ii/nbx-1,
-				// but if ii is -32, the box is -1-1 = -2, not correct.
-				// so we add 1 to ii, the box will be -31/32-1=-1, correct!
-				// the formula is (ii+1)/nbx-1,
-				// if ii is -1, the box is still -1, correct!
-				// if ii is -33, the box is -2, correct!
-				//---------------------------------------------------
-
-				int cel1, cel2, cel3;
-
-				if(ii<0) cel1 = (ii+1) / nbx - 1;
-				else cel1 = ii / nbx;
-				if(jj<0) cel2 = (jj+1) / nby - 1;
-				else cel2 = jj / nby;
-				if(kk<0) cel3 = (kk+1) / nbz - 1;
-				else cel3 = kk / nbz;
-
-				if(!f2normal)
-				{
-					// target: index2ucell
-					target[in_ext] = this->cal_Rindex(cel1, cel2, cel3);
-				}
-				else
-				{
-					// if ii < 0, we need to make ii > 0.
-					// so we add 10000 layers. It should be enough.
-					// ii, jj, kk shoudl -- ?????????????
-					ii = (ii + 10000 * nbx) % nbx;
-					jj = (jj + 10000 * nby) % nby;
-					kk = (kk + 10000 * nbz) % nbz;
-
-					assert(ii>=0);
-					assert(jj>=0);
-					assert(kk>=0);
-
-					assert( in_ext < nxyze);
-
-					if(ii<nbx && jj<nby && kk<nbz)
-					{
-						in_normal = kk + jj * nbz + ii * nby * nbz;
-						
-						// target: index2normal
-						target[in_ext] = in_normal;
-					}
-					else
-					{
-						ModuleBase::WARNING_QUIT("Grid_BigCell::init_grid_expansion_index","check ii,jj,kk!");
-					}
-				}// f2 normal
-			}// k
-		}// j
-	}// i
-	ModuleBase::timer::tick("Grid_BigCell","grid_expansion_index");
-	return;
-}
diff --git a/source/source_lcao/module_gint/grid_bigcell.h b/source/source_lcao/module_gint/grid_bigcell.h
deleted file mode 100644
index 7d380782dd..0000000000
--- a/source/source_lcao/module_gint/grid_bigcell.h
+++ /dev/null
@@ -1,59 +0,0 @@
-#ifndef GRID_BIGCELL_H
-#define GRID_BIGCELL_H
-
-#include "source_base/global_function.h"
-#include "source_base/global_variable.h"
-#include "source_base/matrix3.h"
-#include "grid_meshcell.h"
-
-class Grid_BigCell: public Grid_MeshCell
-{
-	public:
-		Grid_BigCell();
-		~Grid_BigCell();
-		// number atoms and type.
-		int nat=0;
-		// save the relative cartesian position
-		// to bigcell of each atom.
-        std::vector<std::vector<double>> tau_in_bigcell;
-
-        /// move operator for the next ESolver to directly use its infomation
-        Grid_BigCell& operator=(Grid_BigCell&& rhs) = default;
-
-      protected:
-        // get the max radius of all orbitals
-		// which will use to generate grid expansion,
-		// and the meshball.
-		double orbital_rmax;
-		
-		// the added number of bigcelli each direction.
-		int dxe;
-		int dye;
-		int dze;
-
-		// expansion grid dimension.
-		int nxe;
-		int nye;
-		int nze;
-		int nxyze;
-		
-        std::vector<int> index_atom;
-
-        // save the position of base vector of bigcell.
-        std::vector<double> bigcell_vec1;
-        std::vector<double> bigcell_vec2;
-        std::vector<double> bigcell_vec3;
-
-		ModuleBase::Matrix3 bigcell_latvec0;
-		ModuleBase::Matrix3 bigcell_GT;
-		
-		//---------------------------------
-		void grid_expansion_index(bool f2normal, int *target)const;
-		//---------------------------------
-		void init_big_latvec(const UnitCell &ucell);
-		//---------------------------------
-		void init_tau_in_bigcell(const UnitCell& ucell);
-		//---------------------------------
-		void init_grid_expansion(const UnitCell& ucell,double* rcut);
-};
-#endif
diff --git a/source/source_lcao/module_gint/grid_meshball.cpp b/source/source_lcao/module_gint/grid_meshball.cpp
deleted file mode 100644
index 464ea8d962..0000000000
--- a/source/source_lcao/module_gint/grid_meshball.cpp
+++ /dev/null
@@ -1,142 +0,0 @@
-#include "grid_meshball.h"
-#include "source_base/memory.h"
-#include "source_io/module_parameter/parameter.h"
-
-Grid_MeshBall::Grid_MeshBall()
-{
-}
-
-Grid_MeshBall::~Grid_MeshBall()
-{
-}
-
-void Grid_MeshBall::init_meshball()
-{	
-	ModuleBase::TITLE("Grid_MeshBall","init_meshball");
-
-    // init meshball_radius, generally the value
-    // is same as orbital_rmax, of course you can
-    // incrase meshball_radius, but there will be
-    // no atoms in the added bigcells.
-    // (in case subcell are too many).
-	this->meshball_radius = this->orbital_rmax;
-
-	// select a ball in a cubic.
-	double pos[3];
-	double r2=0.0;
-
-	//------------------------------------------------------------------
-	// const double rcut2 = this->meshball_radius * this->meshball_radius;
-	// qianrui fix a bug and add 0.001 2022-4-30
-	// Sometimes r2 is equal to rcut2, for example they are 36.
-	// However, r2 is either 35.99.. or 36.0..001， which makes  count != this->meshball_ncells
-	// and segment fault.
-	// I do not know how to solve it and this may occurs in somewhere else in ABACUS.
-	// May some genius can give a better solution.
-	//------------------------------------------------------------------
-	const double rcut2 = this->meshball_radius * this->meshball_radius + 0.001;
-	
-	//-------------------------------------------------------------------
-	// calculate twice, the first time find the number of mesh points,
-	// then allocate array and save each bigcell's cartesian coordinate.
-	// plus one because we need to cover atom spillage.
-	// meshball_ncells: How many cells in mesh ball.
-	//-------------------------------------------------------------------
-	this->meshball_ncells = 0;
-	for(int i=-dxe; i<dxe+1; i++) // mohan fix bug 2009-10-21, range should be [-dxe,dxe]
-	{
-		for(int j=-dye; j<dye+1; j++)
-		{
-			for(int k=-dze; k<dze+1; k++)
-			{
-				// caclculate the std::vector away from 'zero point'.
-				for(int ip=0; ip<3; ip++)
-				{
-					pos[ip] = i*bigcell_vec1[ip]+j*bigcell_vec2[ip]+k*bigcell_vec3[ip];
-				}
-				r2 = this->deal_with_atom_spillage( pos );
-				//r2 = pos[0]*pos[0]+pos[1]*pos[1]+pos[2]*pos[2];
-	
-				// calculate the distance.
-				if( r2 < rcut2 )
-				{
-					++meshball_ncells;
-				} 
-			}
-		}
-	}
-	if(PARAM.inp.test_gridt) {ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running, "how many cells in meshball",this->meshball_ncells);
-}
-
-	// prepare for the second calculation.
-	this->meshball_positions = std::vector<std::vector<double>>(meshball_ncells, std::vector<double>(3, 0.0));
-	ModuleBase::Memory::record("meshball_pos", sizeof(double) * meshball_ncells*3);
-    this->index_ball = std::vector<int>(meshball_ncells);
-	ModuleBase::Memory::record("index_ball", sizeof(int) * meshball_ncells);
-
-	// second time.
-	int count = 0;
-	for(int i=-dxe; i<this->dxe+1; i++)
-	{
-		for(int j=-dye; j<this->dye+1; j++)
-		{
-			for(int k=-dze; k<this->dze+1; k++)
-			{
-				// caclculate the std::vector away from 'zero point'.
-				// change to cartesian coordinates.
-				for(int ip=0; ip<3; ip++)
-				{
-					pos[ip] = i*bigcell_vec1[ip]+j*bigcell_vec2[ip]+k*bigcell_vec3[ip];
-				}
-				r2 = this->deal_with_atom_spillage( pos );
-
-				// calculate the distance.
-				if( r2 < rcut2 )
-				{
-					for(int ip=0; ip<3; ip++)
-					{
-						this->meshball_positions[count][ip] = pos[ip];
-					}
-
-					// record each position.
-					this->index_ball[count] = k + j * this->nze + i * this->nye * this->nze;
-					++count;
-				} 
-			}
-		}
-	}
-
-	assert(count == this->meshball_ncells);
-	return;
-}
-
-double Grid_MeshBall::deal_with_atom_spillage(const double *pos)
-{
-	double dx;
-	double r2 = 100000;
-	double *cell=new double[3];
-	
-	for(int i=-1; i<=1; i++)
-	{
-		for(int j=-1; j<=1; j++)
-		{
-			for(int k=-1; k<=1; k++)
-			{
-				dx = 0.0;
-				for(int ip=0; ip<3; ip++)
-				{
-					// change to cartesian coordinates.	
-					cell[ip] = i*this->bigcell_vec1[ip] +
-						j*this->bigcell_vec2[ip] +
-						k*this->bigcell_vec3[ip];
-					dx += (cell[ip] - pos[ip]) * (cell[ip] - pos[ip]);
-				}
-				r2 = std::min(dx, r2);
-			}
-		}
-	}
-	delete[] cell;
-	return r2;
-}
-
-
diff --git a/source/source_lcao/module_gint/grid_meshball.h b/source/source_lcao/module_gint/grid_meshball.h
deleted file mode 100644
index 571d59126e..0000000000
--- a/source/source_lcao/module_gint/grid_meshball.h
+++ /dev/null
@@ -1,32 +0,0 @@
-#ifndef GRID_MESHBALL_H
-#define GRID_MESHBALL_H
-
-#include "grid_bigcell.h"
-
-class Grid_MeshBall : public Grid_BigCell
-{
-	public:
-		Grid_MeshBall();
-		~Grid_MeshBall();
-		// cartesian coordinates of meshball.
-        std::vector<std::vector<double>> meshball_positions;
-
-        /// move operator for the next ESolver to directly use its infomation
-        Grid_MeshBall& operator=(Grid_MeshBall&& rhs) = default;
-
-      protected:
-		// number of meshcells in meshball.
-		int meshball_ncells=0;
-		// used in index2normal
-		std::vector<int> index_ball;
-		// search each meshcell of this meshball.
-		void init_meshball(void);
-
-	private:
-		// init the meshball radius.
-		double meshball_radius=0.0;
-		// Handle as a truncation function.
-		double deal_with_atom_spillage(const double* pos);
-	
-};
-#endif
diff --git a/source/source_lcao/module_gint/grid_meshcell.cpp b/source/source_lcao/module_gint/grid_meshcell.cpp
deleted file mode 100644
index 77e933c55d..0000000000
--- a/source/source_lcao/module_gint/grid_meshcell.cpp
+++ /dev/null
@@ -1,168 +0,0 @@
-#include "grid_meshcell.h"
-
-#include "source_io/module_parameter/parameter.h"
-#include "source_base/memory.h"
-#include "source_pw/module_pwdft/global.h"
-
-Grid_MeshCell::Grid_MeshCell()
-{
-}
-
-Grid_MeshCell::~Grid_MeshCell()
-{
-}
-
-void Grid_MeshCell::set_grid_dim(
-    const int &ncx_in,
-    const int &ncy_in,
-    const int &ncz_in,
-    const int &bx_in,
-    const int &by_in,
-    const int &bz_in,
-    const int &nbx_in,
-    const int &nby_in,
-    const int &nbz_in,
-    const int &nbxx_in,
-    const int &nbzp_start_in,
-    const int &nbzp_in
-    )
-{
-    this->ncx = ncx_in;
-    this->ncy = ncy_in;
-    this->ncz = ncz_in;
-    this->ncxyz = ncx * ncy * ncz;
-    this->bx = bx_in;
-    this->by = by_in;
-    this->bz = bz_in;
-    this->bxyz = bx*by*bz;
-    this->nbx = nbx_in;
-    this->nby = nby_in;
-    this->nbz = nbz_in;
-    this->nbxyz = nbx*nby*nbz;
-    this->nbxx = nbxx_in;
-    this->nbzp_start = nbzp_start_in;
-    this->nbzp = nbzp_in;
-
-
-	//xiaohui add 'PARAM.inp.out_level' line, 2015-09-16
-	if(PARAM.inp.out_level != "m") 
-	{
-		ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running,"real space grid",ncx,ncy,ncz); // real space uniform grid
-	}
-
-	if(PARAM.inp.out_level != "m") 
-	{
-		ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running,"big cell numbers in grid",nbx,nby,nbz); // reduced by BIG_CELL
-	}
-
-	if(PARAM.inp.out_level != "m") 
-	{
-		ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running,"meshcell numbers in big cell",bx,by,bz); // is small integer, typical number 2*2*2
-	}
-
-    return;
-}
-
-
-
-// (1)
-void Grid_MeshCell::init_latvec(const UnitCell &ucell)
-{
-	ModuleBase::TITLE("Grid_MeshCell","init_latvec");
-	// initialize the mesh cell vectors.
-	assert(ncx>0);
-	assert(ncy>0);
-	assert(ncz>0);
-
-	//size of each room (same shape with unitcell)
-	this->meshcell_vec1=std::vector<double>(3,0.0);
-	this->meshcell_vec1[0]=ucell.a1.x / (double)ncx * ucell.lat0;
-	this->meshcell_vec1[1]=ucell.a1.y / (double)ncx * ucell.lat0;
-	this->meshcell_vec1[2]=ucell.a1.z / (double)ncx * ucell.lat0;
-
-	this->meshcell_vec2=std::vector<double>(3,0.0);
-	this->meshcell_vec2[0]=ucell.a2.x / (double)ncy * ucell.lat0;
-	this->meshcell_vec2[1]=ucell.a2.y / (double)ncy * ucell.lat0;
-	this->meshcell_vec2[2]=ucell.a2.z / (double)ncy * ucell.lat0;
-
-	this->meshcell_vec3=std::vector<double>(3,0.0);
-	this->meshcell_vec3[0]=ucell.a3.x / (double)ncz * ucell.lat0;
-	this->meshcell_vec3[1]=ucell.a3.y / (double)ncz * ucell.lat0;
-	this->meshcell_vec3[2]=ucell.a3.z / (double)ncz * ucell.lat0;
-
-	this->meshcell_latvec0.e11 = this->meshcell_vec1[0];
-	this->meshcell_latvec0.e12 = this->meshcell_vec1[1];
-	this->meshcell_latvec0.e13 = this->meshcell_vec1[2];
-
-	this->meshcell_latvec0.e21 = this->meshcell_vec2[0];
-	this->meshcell_latvec0.e22 = this->meshcell_vec2[1];
-	this->meshcell_latvec0.e23 = this->meshcell_vec2[2];
-
-	this->meshcell_latvec0.e31 = this->meshcell_vec3[0];
-	this->meshcell_latvec0.e32 = this->meshcell_vec3[1];
-	this->meshcell_latvec0.e33 = this->meshcell_vec3[2];
-
-	// why we need GT = meshcell_latvec0^(-1)?
-	// note that (i,j,k) is a grid point.
-	// (x,y,z) is the cartesian coordinates.
-	// because
-	// (x,y,z) = (i,j,k) * meshcell_latvec0
-	// once we know (x,y,z) and meshcell_latvec0
-	// we need to transform the formula to
-	// (x,y,z) * meshcell_latvec0^(-1) = (i,j,k)
-	this->meshcell_GT = this->meshcell_latvec0.Inverse();
-
-	if(PARAM.inp.test_gridt)
-	{
-		GlobalV::ofs_running << " the VECTORS of MESHCELL are (Bohr): " << std::endl;
-		GlobalV::ofs_running << " vec1( " 
-			<< std::setw(15) << meshcell_vec1[0]
-			<< std::setw(15) << meshcell_vec1[1]
-			<< std::setw(15) << meshcell_vec1[2] 
-			<< ")" << std::endl;
-
-		GlobalV::ofs_running << " vec2( " 
-			<< std::setw(15) << meshcell_vec2[0]
-			<< std::setw(15) << meshcell_vec2[1]
-			<< std::setw(15) << meshcell_vec2[2]
-			<< ")" << std::endl;
-
-		GlobalV::ofs_running << " vec3( " 
-			<< std::setw(15) << meshcell_vec3[0]
-			<< std::setw(15) << meshcell_vec3[1]
-			<< std::setw(15) << meshcell_vec3[2]
-			<< ")" << std::endl;
-	}
-	
-	return;
-}
-
-void Grid_MeshCell::init_meshcell_pos(void)
-{
-	assert(bx>0);
-	assert(by>0);
-	assert(bz>0);
-	assert(bxyz>0);
-
-	meshcell_pos = std::vector<std::vector<double>>(bxyz,std::vector<double>(3,0.0));
-	ModuleBase::Memory::record("meshcell_pos", sizeof(double) * bxyz*3);
-
-	int index=0;
-	for(int i=0; i<bx; i++)
-	{
-		for(int j=0; j<by; j++)
-		{
-			for(int k=0; k<bz; k++)
-			{
-				for(int p=0; p<3; p++)
-				{
-					meshcell_pos[index][p] = i*meshcell_vec1[p] + j*meshcell_vec2[p] + k*meshcell_vec3[p];
-				}
-				++index;
-			}
-		}
-	}
-	return;
-}
-
-
diff --git a/source/source_lcao/module_gint/grid_meshcell.h b/source/source_lcao/module_gint/grid_meshcell.h
deleted file mode 100644
index 3d0ee3f49a..0000000000
--- a/source/source_lcao/module_gint/grid_meshcell.h
+++ /dev/null
@@ -1,56 +0,0 @@
-#ifndef GRID_MESHCELL_H
-#define GRID_MESHCELL_H
-#include "source_base/global_function.h"
-#include "source_base/global_variable.h"
-#include "source_base/matrix3.h"
-#include "grid_meshk.h"
-#include "source_cell/unitcell.h"
-class Grid_MeshCell: public Grid_MeshK
-{
-	public:
-	Grid_MeshCell();
-	~Grid_MeshCell();
-	
-	int ncx,ncy,ncz,ncxyz;
-	int bx=1,by=1,bz=1,bxyz=1;
-	int nbx,nby,nbz,nbxyz;
-	int nbxx;
-	int nbzp_start,nbzp;
-	// save the position of each meshcell.
-	std::vector<std::vector<double>> meshcell_pos;
-
-	private:
-	// latvec0 and GT are not used in current code.
-	// these two variables may be removed in the future.
-	ModuleBase::Matrix3 meshcell_latvec0;
-	ModuleBase::Matrix3 meshcell_GT;
-	
-	protected:
-
-	std::vector<double> meshcell_vec1;
-	std::vector<double> meshcell_vec2;
-	std::vector<double> meshcell_vec3;
-
-    /// move operator for the next ESolver to directly use its infomation
-    Grid_MeshCell& operator=(Grid_MeshCell&& rhs) = default;
-
-    void set_grid_dim(
-			const int &ncx_in,
-			const int &ncy_in,
-			const int &ncz_in,
-			const int &bx_in,
-			const int &by_in,
-			const int &bz_in,
-			const int &nbx_in,
-			const int &nby_in,
-			const int &nbz_in,
-			const int &nbxx_in,
-			const int &nbzp_start_in,
-			const int &nbzp_in);
-
-	void init_latvec(const UnitCell &ucell);
-    void init_meshcell_pos();
-
-};
-
-#endif
diff --git a/source/source_lcao/module_gint/grid_meshk.cpp b/source/source_lcao/module_gint/grid_meshk.cpp
deleted file mode 100644
index e1451a31d8..0000000000
--- a/source/source_lcao/module_gint/grid_meshk.cpp
+++ /dev/null
@@ -1,101 +0,0 @@
-#include "grid_meshk.h"
-#include "source_pw/module_pwdft/global.h"
-#include "source_io/module_parameter/parameter.h"
-
-Grid_MeshK::Grid_MeshK()
-{
-}
-
-Grid_MeshK::~Grid_MeshK()
-{
-}
-
-int Grid_MeshK::cal_Rindex(const int &u1, const int &u2, const int &u3)const
-{
-	const int x1 = u1 - this->minu1;
-	const int x2 = u2 - this->minu2;
-	const int x3 = u3 - this->minu3;
-	
-	if(x1<0 || x2<0 || x3<0)
-	{
-		std::cout << " u1=" << u1 << " minu1=" << minu1 << std::endl;
-		std::cout << " u2=" << u2 << " minu2=" << minu2 << std::endl;
-		std::cout << " u3=" << u3 << " minu3=" << minu3 << std::endl;
-		ModuleBase::WARNING_QUIT("Grid_MeshK::cal_Rindex","x1<0 || x2<0 || x3<0 !");
-	}
-
-	assert(x1>=0);
-	assert(x2>=0);
-	assert(x3>=0);
-
-	return (x3 + x2 * this->nu3 + x1 * this->nu2 * this->nu3);
-}
-
-ModuleBase::Vector3<int> Grid_MeshK::get_ucell_coords(const int &Rindex)const
-{
-	const int x = ucell_index2x[Rindex];
-	const int y = ucell_index2y[Rindex];
-	const int z = ucell_index2z[Rindex];
-
-	return ModuleBase::Vector3<int>(x, y, z);
-}
-
-void Grid_MeshK::cal_extended_cell(const int &dxe, const int &dye, const int &dze,const int& nbx, const int& nby, const int& nbz)
-{
-	ModuleBase::TITLE("Grid_MeshK","cal_extended_cell");
-
-	//--------------------------------------
-	// max and min unitcell in expaned grid.
-	//--------------------------------------
-	this->maxu1 = dxe / nbx + 1;
-	this->maxu2 = dye / nby + 1;
-	this->maxu3 = dze / nbz + 1;
-
-	this->minu1 = (-dxe+1) / nbx - 1; 
-	this->minu2 = (-dye+1) / nby - 1; 
-	this->minu3 = (-dze+1) / nbz - 1; 
-
-	if(PARAM.inp.test_gridt) {ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running,"MaxUnitcell",maxu1,maxu2,maxu3);
-}
-	if(PARAM.inp.test_gridt) {ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running,"MinUnitcell",minu1,minu2,minu3);
-}
-
-	//--------------------------------------
-	// number of unitcell in each direction.
-	//--------------------------------------
-	this->nu1 = maxu1 - minu1 + 1;
-	this->nu2 = maxu2 - minu2 + 1;
-	this->nu3 = maxu3 - minu3 + 1;
-	this->nutot = nu1 * nu2 * nu3;
-
-	if(PARAM.inp.test_gridt) {ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running,"UnitCellNumber",nu1,nu2,nu3);
-}
-	if(PARAM.inp.out_level != "m") { ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running,"UnitCellTotal",nutot);
-}
-
-
-    this->ucell_index2x = std::vector<int>(nutot, 0);
-    this->ucell_index2y = std::vector<int>(nutot, 0);
-    this->ucell_index2z = std::vector<int>(nutot, 0);
-
-	this->nutot = nu1 * nu2 * nu3;
-
-	for(int i=minu1; i<=maxu1; i++)
-	{
-		for(int j=minu2; j<=maxu2; j++)
-		{
-			for(int k=minu3; k<=maxu3; k++)
-			{
-				const int cell = cal_Rindex(i,j,k);	
-				assert(cell<nutot);
-
-				this->ucell_index2x[cell] = i;
-				this->ucell_index2y[cell] = j;
-				this->ucell_index2z[cell] = k;
-
-			}
-		}
-	}
-
-	return;
-}
\ No newline at end of file
diff --git a/source/source_lcao/module_gint/grid_meshk.h b/source/source_lcao/module_gint/grid_meshk.h
deleted file mode 100644
index fb8d458bb0..0000000000
--- a/source/source_lcao/module_gint/grid_meshk.h
+++ /dev/null
@@ -1,48 +0,0 @@
-#ifndef GRID_MESHK_H
-#define GRID_MESHK_H
-#include "source_base/global_function.h"
-#include "source_base/global_variable.h"
-#include "source_base/vector3.h"
-
-class Grid_MeshK
-{
-	public:
-		Grid_MeshK();
-		~Grid_MeshK();
-
-		// calculate the index of unitcell.
-        int cal_Rindex(const int& u1, const int& u2, const int& u3)const;
-
-		ModuleBase::Vector3<int> get_ucell_coords(const int& Rindex)const;
-
-        /// move operator for the next ESolver to directly use its infomation
-        Grid_MeshK& operator=(Grid_MeshK&& rhs) = default;
-
-      private:
-		// the max and the min unitcell.
-		int maxu1;
-		int maxu2;
-		int maxu3;
-
-		int minu1;
-		int minu2;
-		int minu3;
-
-		// the number of unitcells.
-		int nu1;
-		int nu2;
-		int nu3;
-		int nutot;
-
-		// from 1D index to unitcell.
-		std::vector<int> ucell_index2x;
-		std::vector<int> ucell_index2y;
-		std::vector<int> ucell_index2z;
-
-		protected:
-		// calculate the extended unitcell.
-		void cal_extended_cell(const int &dxe, const int &dye, const int &dze,
-								const int& nbx, const int& nby, const int& nbz);
-};
-
-#endif
diff --git a/source/source_lcao/module_gint/grid_technique.cpp b/source/source_lcao/module_gint/grid_technique.cpp
deleted file mode 100644
index de52f4d5f9..0000000000
--- a/source/source_lcao/module_gint/grid_technique.cpp
+++ /dev/null
@@ -1,784 +0,0 @@
-#if ((defined __CUDA) /* || (defined __ROCM) */)
-#include <cuda_runtime.h>
-#include "source_io/module_parameter/parameter.h"
-#endif
-#include "grid_technique.h"
-#include "source_io/module_parameter/parameter.h"
-#include "source_base/memory.h"
-#include "source_base/parallel_reduce.h"
-#include "source_base/timer.h"
-#include "source_pw/module_pwdft/global.h"
-#include "source_hsolver/kernels/cuda/helper_cuda.h"
-
-#include "source_lcao/module_gint/temp_gint/gint_helper.h"
-
-Grid_Technique::Grid_Technique() {
-#if ((defined __CUDA) /* || (defined __ROCM) */)
-    if (PARAM.inp.device == "gpu") {
-        is_malloced = false;
-    }
-#endif
-}
-
-Grid_Technique::~Grid_Technique() {
-
-#if ((defined __CUDA) /* || (defined __ROCM) */)
-    if (PARAM.inp.device == "gpu") {
-        free_gpu_gint_variables(this->nat);
-    }
-#endif
-}
-
-// This function is called in esolver_ks_lcao_elec.cpp
-// after the orbital information has been read,
-// this function control the routinue to generate
-// grid technique parameters.
-void Grid_Technique::set_pbc_grid(const int& ncx_in,
-                                  const int& ncy_in,
-                                  const int& ncz_in,
-                                  const int& bx_in,
-                                  const int& by_in,
-                                  const int& bz_in,
-                                  const int& nbx_in,
-                                  const int& nby_in,
-                                  const int& nbz_in,
-                                  const int& nbxx_in,
-                                  const int& nbzp_start_in,
-                                  const int& nbzp_in,
-                                  const int& ny,
-                                  const int& nplane,
-                                  const int& startz_current,
-                                  const UnitCell& ucell,
-                                  const Grid_Driver& gd,
-                                  const double& dr_uniform,
-                                  const std::vector<double>& rcuts,
-                                  const std::vector<std::vector<double>>& psi_u,
-                                  const std::vector<std::vector<double>>& dpsi_u,
-                                  const std::vector<std::vector<double>>& d2psi_u,
-                                  const int& num_stream)
-{
-    ModuleBase::TITLE("Grid_Technique", "init");
-    ModuleBase::timer::tick("Grid_Technique", "init");
-
-    if (PARAM.inp.out_level != "m") {
-        GlobalV::ofs_running
-            << "\n SETUP EXTENDED REAL SPACE GRID FOR GRID INTEGRATION"
-            << std::endl;
-    }
-    this->init_malloced = true;
-
-    // copy ucell and orb parameters
-    this->ucell = &ucell;
-    this->dr_uniform = dr_uniform;
-
-    this->nwmax = ucell.nwmax;
-    this->ntype = ucell.ntype;
-
-    this->rcuts = rcuts;
-    double max_cut = *std::max_element(this->rcuts.begin(), this->rcuts.end());
-    this->nr_max = static_cast<int>(1 / this->dr_uniform * max_cut) + 10;
-    this->psi_u = psi_u;
-    this->dpsi_u = dpsi_u;
-    this->d2psi_u = d2psi_u;
-
-    // (1) init_meshcell cell and big cell.
-    this->set_grid_dim(ncx_in,
-                       ncy_in,
-                       ncz_in,
-                       bx_in,
-                       by_in,
-                       bz_in,
-                       nbx_in,
-                       nby_in,
-                       nbz_in,
-                       nbxx_in,
-                       nbzp_start_in,
-                       nbzp_in);
-    this->init_latvec(ucell);
-
-    this->init_big_latvec(ucell);
-
-    this->init_meshcell_pos();
-
-    // (2) expand the grid
-
-    this->init_grid_expansion(ucell, this->rcuts.data());
-
-    // (3) calculate the extended grid.
-    this->cal_extended_cell(this->dxe,
-                            this->dye,
-                            this->dze,
-                            this->nbx,
-                            this->nby,
-                            this->nbz);
-
-    this->init_tau_in_bigcell(ucell);
-
-    this->init_meshball();
-
-    this->init_atoms_on_grid(ny, nplane, ucell);
-
-    this->init_ijr_and_nnrg(ucell, gd);
-    this->cal_trace_lo(ucell);
-#if ((defined __CUDA) /* || (defined __ROCM) */)
-    if (PARAM.inp.device == "gpu") {
-        this->init_gpu_gint_variables(ucell, num_stream);
-    }
-#endif
-
-    ModuleBase::timer::tick("Grid_Technique", "init");
-    return;
-}
-
-void Grid_Technique::get_startind(const int& ny,
-                                  const int& nplane) {
-    ModuleBase::TITLE("Grid_Technique", "get_startind");
-
-    assert(nbxx >= 0);
-
-    // calculates start_ind, which stores the
-    // starting index of each bigcell
-    this->start_ind = std::vector<int>(nbxx, 0);
-    ModuleBase::Memory::record("GT::start_ind", sizeof(int) * nbxx);
-
-    for (int i = 0; i < nbxx; i++) {
-        int ibx = 0;
-        int iby = 0;
-        int ibz = 0;
-
-        int ix = 0;
-        int iy = 0;
-        int iz = 0;
-
-        ibx = i / (nby * nbzp);
-        iby = (i - ibx * nby * nbzp) / nbzp;
-        ibz = i % nbzp;
-
-        ix = ibx * this->bx;
-        iy = iby * this->by;
-        iz = ibz * this->bz;
-
-        int ind = iz + iy * nplane + ix * ny * nplane;
-
-        start_ind[i] = ind;
-    }
-
-    return;
-}
-
-// PLEASE update this 'init_atoms_on_grid' to make
-// it adapted to 'cuboid' shape of grid
-// mohan add 2021-04-06
-void Grid_Technique::init_atoms_on_grid(const int& ny,
-                                        const int& nplane,
-                                        const UnitCell& ucell) {
-    ModuleBase::TITLE("Grid_Technique", "init_atoms_on_grid");
-
-    assert(nbxx >= 0);
-    this->get_startind(ny, nplane);
-
-    // (1) prepare data.
-    // counting the number of atoms whose orbitals have
-    // values on the bigcell.
-    this->how_many_atoms = std::vector<int>(nbxx, 0);
-    ModuleBase::Memory::record("GT::how_many_atoms", sizeof(int) * nbxx);
-
-    // (2) information about gloabl grid
-    // and local grid.
-    // mohan add 2010-07-02
-    std::vector<int> ind_bigcell = std::vector<int>(nbxyz, 0);
-    ModuleBase::Memory::record("GT::ind_bigcell", sizeof(int) * this->nxyze);
-    std::vector<char> bigcell_on_processor = std::vector<char>(nbxyz, 0);
-    ModuleBase::Memory::record("GT::bigcell_on_processor",
-                               sizeof(char) * this->nxyze);
-    this->check_bigcell(ind_bigcell.data(), bigcell_on_processor.data());
-
-    // (3) Find the atoms using
-    // when doing grid integration.
-    this->in_this_processor = std::vector<bool>(ucell.nat, false);
-    ModuleBase::Memory::record("GT::in_this_processor",
-                               sizeof(int) * this->nxyze);
-
-    // (4) init atoms on grid
-    std::vector<int> index2normal = std::vector<int>(this->nxyze, 0);
-    ModuleBase::Memory::record("GT::index2normal", sizeof(int) * this->nxyze);
-    this->grid_expansion_index(true, index2normal.data());
-
-    // (5) record how many atoms on
-    // each local grid point (ix,iy,iz)
-    int nat_local = 0;
-    this->total_atoms_on_grid = 0;
-    for (int iat = 0; iat < ucell.nat; iat++) 
-    {
-        const int it = ucell.iat2it[iat];
-        const double rcut_square = this->rcuts[it] * this->rcuts[it];
-        for (int im = 0; im < this->meshball_ncells; im++)
-        {
-            // bcell[iat]: which bcell iat atom is in.
-            // ball[im]: relative position of adjacent bcell.
-            const int normal = index2normal[this->index_atom[iat] + this->index_ball[im]];
-#ifdef __DEBUG
-            if (normal >= nbxyz)
-            {
-                #pragma omp critical
-                {
-                    std::cout << " index_atom=" << index_atom[iat] << std::endl;
-                    std::cout << " index_ball=" << index_ball[im] << std::endl;
-                    std::cout << " normal=" << normal << std::endl;
-                    std::cout << " nbxyz=" << nbxyz << std::endl;
-                    ModuleBase::WARNING_QUIT(
-                        "Grid_Technique::init_atoms_on_grid",
-                        "normal >= nbxyz");
-                }
-            }
-#endif
-            assert(normal >= 0);
-            const int bcell_idx_on_proc = ind_bigcell[normal];
-            if (!bigcell_on_processor[normal])
-            {    
-                continue;
-            }
-
-            bool is_atom_on_bcell = false;
-            const double dr_x_part = this->meshball_positions[im][0] - this->tau_in_bigcell[iat][0];
-            const double dr_y_part = this->meshball_positions[im][1] - this->tau_in_bigcell[iat][1];
-            const double dr_z_part = this->meshball_positions[im][2] - this->tau_in_bigcell[iat][2];
-            for(int imcell = 0; imcell < this -> bxyz; imcell++)
-            {
-                const double dr_x = this->meshcell_pos[imcell][0] + dr_x_part;
-                const double dr_y = this->meshcell_pos[imcell][1] + dr_y_part;
-                const double dr_z = this->meshcell_pos[imcell][2] + dr_z_part;
-                const double dist_square = dr_x * dr_x + dr_y * dr_y + dr_z * dr_z;
-                if(dist_square <= rcut_square)
-                {
-                    is_atom_on_bcell = true;
-                    break;
-                }
-            }
-            if(is_atom_on_bcell)
-            {
-                ++how_many_atoms[bcell_idx_on_proc];
-                ++this->total_atoms_on_grid;
-                this->in_this_processor[iat] = true;
-            }
-        }
-        if (this->in_this_processor[iat])
-        {
-            ++nat_local;
-        }
-    }
-
-    if (PARAM.inp.test_gridt) {
-        ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running,
-                                    "Total_atoms_on_grid",
-                                    total_atoms_on_grid);
-}
-
-    int stop = 0;
-    if (total_atoms_on_grid == 0) {
-        GlobalV::ofs_running << " No atoms on this sub-FFT-mesh." << std::endl;
-        stop = 1;
-    }
-    Parallel_Reduce::reduce_all(stop);
-    if (stop) {
-        ModuleBase::WARNING("Grid_Technique::init_atoms_on_grid",
-                            "No atom on this sub-FFT-mesh.");
-    }
-
-    // calculate the trach of local ia to global iat
-    if (nat_local > 0) {
-        this->trace_iat.resize(nat_local);
-        for (int iat = ucell.nat - 1; iat >= 0; iat--) {
-            if (this->in_this_processor[iat]) {
-                this->trace_iat[--nat_local] = iat;
-            }
-        }
-    }
-
-    // need how_many_atoms first.
-    this->cal_grid_integration_index();
-    // bcell_start is needed.
-    this->init_atoms_on_grid2(index2normal.data(), ucell);
-    return;
-}
-
-void Grid_Technique::check_bigcell(int* ind_bigcell,
-                                   char* bigcell_on_processor) {
-    // check if a given bigcell is treated on this processor
-    const int zstart = nbzp_start;
-    const int zend = nbzp + zstart;
-    const int nbyz = nby * nbz;
-    const int nz = nbzp;
-
-    int iz_now = 0;
-    int ix = 0;
-    int iy = 0;
-    int iz = 0;
-    int ind = 0;
-    bool flag = false;
-
-    for (int i = 0; i < nbxyz; i++) {
-        int iz_now = i % nbz;
-        if (iz_now < zstart || iz_now >= zend) {
-            flag = false;
-        } else {
-            flag = true;
-            ix = i / nbyz;
-            iy = (i - ix * nbyz) / nbz;
-            iz = iz_now - zstart;
-            ind = ix * nby * nz + iy * nz + iz;
-            // no need to calculate index if bigcell is
-            // not on this processor
-        }
-
-        ind_bigcell[i] = ind;
-        bigcell_on_processor[i] = flag;
-    }
-    return;
-}
-
-void Grid_Technique::init_atoms_on_grid2(const int* index2normal,
-                                         const UnitCell& ucell) {
-    ModuleBase::TITLE("Grid_Techinique", "init_atoms_on_grid2");
-
-    if (total_atoms_on_grid == 0) {
-        ModuleBase::WARNING("Grid_Technique::init_atoms_on_grid2",
-                            "no atom on this sub FFT grid.");
-        return;
-    }
-
-    std::vector<int> index2ucell = std::vector<int>(this->nxyze, 0);
-    ModuleBase::Memory::record("GT::index2ucell", sizeof(int) * this->nxyze);
-    this->grid_expansion_index(false, index2ucell.data());
-
-    std::vector<int> ind_bigcell = std::vector<int>(nbxyz, 0);
-    ModuleBase::Memory::record("GT::ind_bigcell", sizeof(int) * nbxyz);
-    std::vector<char> bigcell_on_processor = std::vector<char>(nbxyz, 0);
-    this->check_bigcell(ind_bigcell.data(), bigcell_on_processor.data());
-
-    //--------------------------------------
-    // save which atom is in the bigcell,unitcell
-    //--------------------------------------
-    assert(total_atoms_on_grid != 0);
-    this->which_atom = std::vector<int>(total_atoms_on_grid, 0);
-    ModuleBase::Memory::record("GT::which_atom",
-                               sizeof(int) * total_atoms_on_grid);
-
-    this->which_bigcell = std::vector<int>(total_atoms_on_grid, 0);
-    ModuleBase::Memory::record("GT::which_bigcell",
-                               sizeof(int) * total_atoms_on_grid);
-
-    this->which_unitcell = std::vector<int>(total_atoms_on_grid, 0);
-    ModuleBase::Memory::record("GT::which_unitcell",
-                               sizeof(int) * total_atoms_on_grid);
-
-    // for each atom, first we need to locate which cell
-    // the atom is in, then we search meshball aroung this
-    // grid, and record each grid's atom position.
-    int count = 0;
-    this->how_many_atoms = std::vector<int>(nbxx, 0);
-    ModuleBase::Memory::record("GT::how many atoms", sizeof(int) * nbxx);
-    std::vector<double> coord_x(total_atoms_on_grid* bxyz, 0.0);
-    std::vector<double> coords3(bxyz * 3, 0.0);
-    for(int iat = 0; iat < ucell.nat; iat++)
-    {
-        const int it = ucell.iat2it[iat];
-        const double rcut_square = this->rcuts[it] * this->rcuts[it];
-        // zero bigcell of meshball indicate ?
-        for (int im = 0; im < this->meshball_ncells; im++)
-        {
-            const int extgrid = this->index_atom[iat] + this->index_ball[im];
-            const int normal = index2normal[extgrid];
-
-            // mohan add 2010-07-01
-            const int bcell_idx_on_proc = ind_bigcell[normal];
-            if (!bigcell_on_processor[normal])
-            {
-                continue;
-            }
-            
-            bool is_atom_on_bcell = false;
-            const double dr_x_part = this->meshball_positions[im][0] - this->tau_in_bigcell[iat][0];
-            const double dr_y_part = this->meshball_positions[im][1] - this->tau_in_bigcell[iat][1];
-            const double dr_z_part = this->meshball_positions[im][2] - this->tau_in_bigcell[iat][2];
-            for(int imcell = 0; imcell < this -> bxyz; imcell++)
-            {
-                const double dr_x = this->meshcell_pos[imcell][0] + dr_x_part;
-                const double dr_y = this->meshcell_pos[imcell][1] + dr_y_part;
-                const double dr_z = this->meshcell_pos[imcell][2] + dr_z_part;
-                const double dist_square = dr_x * dr_x + dr_y * dr_y + dr_z * dr_z;
-                if(dist_square <= rcut_square)
-                {
-                    is_atom_on_bcell = true;
-                    break;
-                }
-            }
-
-            if(is_atom_on_bcell)
-            {
-            // it's not the normal order to calculate which_atom
-            // and which_bigcell, especailly in 1D array.
-            // Each grid's adjacent atom number is different,
-            // so, first we need to locate which grid, using
-            // bcell_start, then we need to count which adjacent atom.
-            // using how_many_atoms.
-            const int index = this->bcell_start[bcell_idx_on_proc] + this->how_many_atoms[bcell_idx_on_proc];
-
-            // we save which_atom and which_bigcell in 1D array,
-            // once you want to use this in grid integration,
-            // the only information you got is the 'normal' index,
-            // so you need to use bcell_start
-            // to get the 'mesh_index', then you can you this mesh_index
-            // to use which_atom or which_bigcell.
-            this->which_atom[index] = iat;
-            this->which_bigcell[index] = im;
-            this->which_unitcell[index] = index2ucell[extgrid];
-            for(int imcell = 0; imcell < this -> bxyz; imcell++)
-            {
-                const double dr_x = this->meshcell_pos[imcell][0] + dr_x_part;
-                coord_x[index * bxyz + imcell] = dr_x;
-            }
-
-            ++count;
-            ++how_many_atoms[bcell_idx_on_proc];
-            }
-        }
-    }
-    for(int i = 0; i < this->bxyz; i++)
-    {
-        for(int j = 0; j < 3; j++)
-        {
-            coords3[i * 3 + j] = this->meshcell_pos[i][j];
-        }
-    }
-    assert(count == total_atoms_on_grid);
-    return;
-}
-
-void Grid_Technique::cal_grid_integration_index() {
-    // save the start
-    this->bcell_start = std::vector<int>(nbxx, 0);
-    ModuleBase::Memory::record("GT::bcell_start", sizeof(int) * nbxx);
-    for (int i = 1; i < nbxx; i++) {
-        this->bcell_start[i]
-            = this->bcell_start[i - 1] + this->how_many_atoms[i - 1];
-    }
-
-    // calculate which grid has the largest number of atoms,
-    // and how many atoms.
-    this->max_atom = 0;
-    for (int i = 0; i < nbxx; i++) {
-        this->max_atom = std::max(this->max_atom, this->how_many_atoms[i]);
-    }
-
-#ifdef __MPI
-    int* all = new int[GlobalV::NPROC];
-    ModuleBase::GlobalFunc::ZEROS(all, GlobalV::NPROC);
-    Parallel_Reduce::gather_int_all(max_atom, all);
-    if (GlobalV::MY_RANK == 0) {
-        GlobalV::ofs_warning << std::setw(15) << "Processor" << std::setw(15)
-                             << "Atom" << std::endl;
-        for (int i = 0; i < GlobalV::NPROC; i++) {
-            GlobalV::ofs_warning << std::setw(15) << i + 1 << std::setw(15)
-                                 << all[i] << std::endl;
-        }
-    }
-    delete[] all;
-#endif
-
-    if (PARAM.inp.test_gridt) {
-        ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running,
-                                    "Max atom on bigcell",
-                                    max_atom);
-    }
-    return;
-}
-
-// set 'lgd' variable
-void Grid_Technique::cal_trace_lo(const UnitCell& ucell) {
-    ModuleBase::TITLE("Grid_Technique", "cal_trace_lo");
-    // save the atom information in trace_lo,
-    // in fact the trace_lo dimension can be reduced
-    // to ucell.nat, but I think this is another way.
-    this->trace_lo = std::vector<int>(PARAM.globalv.nlocal, -1);
-    ModuleBase::Memory::record("GT::trace_lo", sizeof(int) * PARAM.globalv.nlocal);
-
-    this->lnat = 0;
-    this->lgd = 0;
-    int iat = 0;
-    int iw_all = 0;
-    int iw_local = 0;
-
-    for (int it = 0; it < ucell.ntype; it++) {
-        for (int ia = 0; ia < ucell.atoms[it].na; ia++) {
-            if (this->in_this_processor[iat]) {
-                ++lnat;
-                int nw0 = ucell.atoms[it].nw;
-                if (PARAM.inp.nspin
-                    == 4) { // added by zhengdy-soc, need to be double in soc
-                    nw0 *= 2;
-                    this->lgd += nw0;
-                } else {
-                    this->lgd += ucell.atoms[it].nw;
-                }
-
-                for (int iw = 0; iw < nw0; iw++) {
-                    this->trace_lo[iw_all] = iw_local;
-                    ++iw_local;
-                    ++iw_all;
-                }
-            } else {
-                // global index of atomic orbitals
-                iw_all += ucell.atoms[it].nw;
-                if (PARAM.inp.nspin == 4) {
-                    iw_all += ucell.atoms[it].nw;
-}
-            }
-            ++iat;
-        }
-    }
-
-    if (PARAM.inp.out_level != "m") {
-        ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running,
-                                    "Atom number in sub-FFT-grid",
-                                    lnat);
-        ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running,
-                                    "Local orbitals number in sub-FFT-grid",
-                                    lgd);
-    }
-
-    assert(iw_local == lgd);
-    assert(iw_all == PARAM.globalv.nlocal);
-    return;
-}
-
-void Grid_Technique::init_ijr_and_nnrg(const UnitCell& ucell, const Grid_Driver& gd)
-{
-    ModuleBase::TITLE("Grid_Technique", "init_ijr_and_nnrg");
-
-    hamilt::HContainer<double> hr_gint_tmp(ucell.nat);
-    // prepare the row_index and col_index for construct AtomPairs, they are
-    // same, name as orb_index
-    std::vector<int> orb_index(ucell.nat + 1);
-    orb_index[0] = 0;
-    for (int i = 1; i < orb_index.size(); i++) {
-        int type = ucell.iat2it[i - 1];
-        orb_index[i] = orb_index[i - 1] + ucell.atoms[type].nw;
-    }
-
-    for (int T1 = 0; T1 < ucell.ntype; ++T1) {
-            const Atom* atom1 = &(ucell.atoms[T1]);
-            for (int I1 = 0; I1 < atom1->na; ++I1) {
-                auto& tau1 = atom1->tau[I1];
-
-                gd.Find_atom(ucell, tau1, T1, I1);
-
-                const int iat1 = ucell.itia2iat(T1, I1);
-                // whether this atom is in this processor.
-                if (this->in_this_processor[iat1]) {
-                    for (int ad = 0; ad < gd.getAdjacentNum() + 1; ++ad) {
-                        const int T2 = gd.getType(ad);
-                        const int I2 = gd.getNatom(ad);
-                        const int iat2 = ucell.itia2iat(T2, I2);
-                        const Atom* atom2 = &(ucell.atoms[T2]);
-
-                        // NOTE: hRGint wil save total number of atom pairs,
-                        // if only upper triangle is saved, the lower triangle will
-                        // be lost in 2D-block parallelization. if the adjacent atom
-                        // is in this processor.
-                        if (this->in_this_processor[iat2]) {
-                            ModuleBase::Vector3<double> dtau
-                                = gd.getAdjacentTau(ad) - tau1;
-                            double distance = dtau.norm() * ucell.lat0;
-                            double rcut
-                                = this->rcuts[T1] + this->rcuts[T2];
-
-                            // if(distance < rcut)
-                            //  mohan reset this 2013-07-02 in Princeton
-                            //  we should make absolutely sure that the distance is
-                            //  smaller than rcuts[it] this should be consistant
-                            //  with LCAO_nnr::cal_nnrg function typical example : 7
-                            //  Bohr cutoff Si orbital in 14 Bohr length of cell.
-                            //  distance = 7.0000000000000000
-                            //  rcuts[it] = 7.0000000000000008
-                            if (distance < rcut - 1.0e-15) {
-                                // calculate R index
-                                auto& R_index = gd.getBox(ad);
-                                // insert this atom-pair into this->hRGint
-                                    hamilt::AtomPair<double> tmp_atom_pair(
-                                        iat1,
-                                        iat2,
-                                        R_index.x,
-                                        R_index.y,
-                                        R_index.z,
-                                        orb_index.data(),
-                                        orb_index.data(),
-                                        ucell.nat);
-                                    hr_gint_tmp.insert_pair(tmp_atom_pair);
-                            }
-                        }
-                    }
-                }
-            }
-    }
-    this->ijr_info = hr_gint_tmp.get_ijr_info();
-    this->nnrg = hr_gint_tmp.get_nnr();
-    return;
-}
-
-#if ((defined __CUDA) /* || (defined __ROCM) */)
-
-void Grid_Technique::init_gpu_gint_variables(const UnitCell& ucell,
-                                             const int num_stream) {
-#ifdef __MPI
-    dev_id = base_device::information::set_device_by_rank();
-#endif
-    if (is_malloced) {
-        free_gpu_gint_variables(this->nat);
-    }
-    nstreams = num_stream;
-    double ylmcoef[100];
-    ModuleBase::GlobalFunc::ZEROS(ylmcoef, 100);
-    for (int i = 0; i < 100; i++) {
-        ylmcoef[i] = ModuleBase::Ylm::ylmcoef[i];
-    }
-    checkCudaErrors(cudaMalloc((void**)&ylmcoef_g, 100 * sizeof(double)));
-    checkCudaErrors(cudaMemcpy(ylmcoef_g,
-                               ylmcoef,
-                               100 * sizeof(double),
-                               cudaMemcpyHostToDevice));
-
-    double max_cut = *std::max_element(this->rcuts.begin(), this->rcuts.end());
-
-    int atom_nw_now[ucell.ntype];
-    int ucell_atom_nwl_now[ucell.ntype];
-    for (int i = 0; i < ucell.ntype; i++) {
-        atom_nw_now[i] = ucell.atoms[i].nw;
-        ucell_atom_nwl_now[i] = ucell.atoms[i].nwl;
-    }
-
-    // double psi_u_now[ucell.ntype * ucell.nwmax * nr_max *
-    // 2];
-    double* psi_u_now = (double*)malloc(ucell.ntype * ucell.nwmax * this->nr_max * 2 * sizeof(double));
-    memset(psi_u_now, 0, ucell.ntype * ucell.nwmax * this->nr_max * 2 * sizeof(double));
-    bool* atom_iw2_new_now = (bool*)malloc(ucell.ntype * ucell.nwmax * sizeof(bool));
-    memset(atom_iw2_new_now, 0, ucell.ntype * ucell.nwmax * sizeof(bool));
-    int* atom_iw2_ylm_now
-        = (int*)malloc(ucell.ntype * ucell.nwmax * sizeof(int));
-    memset(atom_iw2_ylm_now, 0, ucell.ntype * ucell.nwmax * sizeof(int));
-    int* atom_iw2_l_now = (int*)malloc(ucell.ntype * ucell.nwmax * sizeof(int));
-    memset(atom_iw2_l_now, 0, ucell.ntype * ucell.nwmax * sizeof(int));
-
-    Atom* atomx;
-    for (int i = 0; i < ucell.ntype; i++) {
-        atomx = &ucell.atoms[i];
-        for (int j = 0; j < ucell.nwmax; j++) {
-            if (j < atomx->nw) {
-                atom_iw2_new_now[i * ucell.nwmax + j] = atomx->iw2_new[j];
-                atom_iw2_ylm_now[i * ucell.nwmax + j] = atomx->iw2_ylm[j];
-                atom_iw2_l_now[i * ucell.nwmax + j] = atomx->iw2l[j];
-                for (int k = 0; k < this->nr_max; k++) {
-                    int index_temp = (i * ucell.nwmax * this->nr_max
-                                      + j * this->nr_max + k)
-                                     * 2;
-                    if (k < this->psi_u[i * this->nwmax + j].size()) {
-                        psi_u_now[index_temp]
-                            = this->psi_u[i * this->nwmax + j].data()[k];
-                        psi_u_now[index_temp + 1]
-                            = this->dpsi_u[i * this->nwmax + j].data()[k];
-                    }
-                }
-            }
-        }
-    }
-
-    checkCudaErrors(cudaMalloc((void**)&atom_nw_g, ucell.ntype * sizeof(int)));
-    checkCudaErrors(cudaMemcpy(atom_nw_g,
-                               atom_nw_now,
-                               ucell.ntype * sizeof(int),
-                               cudaMemcpyHostToDevice));
-
-    checkCudaErrors(cudaMalloc((void**)&atom_nwl_g, ucell.ntype * sizeof(int)));
-    checkCudaErrors(cudaMemcpy(atom_nwl_g, ucell_atom_nwl_now, ucell.ntype * sizeof(int), cudaMemcpyHostToDevice));
-
-    checkCudaErrors(cudaMalloc((void**)&psi_u_g, ucell.ntype * ucell.nwmax * this->nr_max * sizeof(double) * 2));
-    checkCudaErrors(cudaMemcpy(psi_u_g,
-                               psi_u_now,
-                               ucell.ntype * ucell.nwmax * this->nr_max * sizeof(double) * 2,
-                               cudaMemcpyHostToDevice));
-
-    checkCudaErrors(cudaMalloc((void**)&psi_u_g,
-                                ucell.ntype * ucell.nwmax * nr_max * sizeof(double) * 2));
-    checkCudaErrors(cudaMemcpy(psi_u_g,
-                                psi_u_now,
-                                ucell.ntype * ucell.nwmax * nr_max * sizeof(double) * 2,
-                                cudaMemcpyHostToDevice));
-
-    checkCudaErrors(cudaMalloc((void**)&atom_new_g,
-                               ucell.ntype * ucell.nwmax * sizeof(bool)));
-    checkCudaErrors(cudaMemcpy(atom_new_g,
-                                 atom_iw2_new_now,
-                                 ucell.ntype * ucell.nwmax * sizeof(bool),
-                                 cudaMemcpyHostToDevice));
-
-    checkCudaErrors(cudaMalloc((void**)&atom_ylm_g,
-                               ucell.ntype * ucell.nwmax * sizeof(int)));
-
-    checkCudaErrors(cudaMemcpy(atom_ylm_g,
-                                atom_iw2_ylm_now,
-                                ucell.ntype * ucell.nwmax * sizeof(int),
-                                cudaMemcpyHostToDevice));
-
-    checkCudaErrors(cudaMalloc((void**)&atom_l_g,
-                                ucell.ntype * ucell.nwmax * sizeof(int)));
-    checkCudaErrors(cudaMemcpy(atom_l_g,
-                                atom_iw2_l_now,
-                                ucell.ntype * ucell.nwmax * sizeof(int),
-                                cudaMemcpyHostToDevice));
-
-    checkCudaErrors(cudaMalloc((void**)&rcut_g, ucell.ntype * sizeof(double)));
-    checkCudaErrors(cudaMemcpy(rcut_g,
-                               rcuts.data(),
-                               ucell.ntype * sizeof(double),
-                               cudaMemcpyHostToDevice));
-    std::vector<double> mcell_pos(bxyz * 3, 0);
-    for (int i = 0; i < bxyz; i++)
-    {
-        mcell_pos[3 * i] = meshcell_pos[i][0];
-        mcell_pos[3 * i + 1] = meshcell_pos[i][1];
-        mcell_pos[3 * i + 2] = meshcell_pos[i][2];
-    }
-    checkCudaErrors(cudaMalloc((void**)&mcell_pos_g,
-                               bxyz * 3 * sizeof(double)));
-    checkCudaErrors(cudaMemcpy(mcell_pos_g,
-                               mcell_pos.data(),
-                               bxyz * 3 * sizeof(double),
-                               cudaMemcpyHostToDevice));
-
-    gemm_algo_selector(bxyz, fastest_matrix_mul, ucell);
-
-    is_malloced = true;
-
-    free(psi_u_now);
-    free(atom_iw2_new_now);
-    free(atom_iw2_ylm_now);
-}
-
-void Grid_Technique::free_gpu_gint_variables(int nat) {
-    if (!is_malloced) {
-        return;
-    }
-
-    checkCudaErrors(cudaFree(ylmcoef_g));
-    checkCudaErrors(cudaFree(atom_nwl_g));
-    checkCudaErrors(cudaFree(psi_u_g));
-    checkCudaErrors(cudaFree(atom_new_g));
-    checkCudaErrors(cudaFree(atom_ylm_g));
-    checkCudaErrors(cudaFree(atom_nw_g));
-    checkCudaErrors(cudaFree(atom_l_g));
-    checkCudaErrors(cudaFree(rcut_g));
-    checkCudaErrors(cudaFree(mcell_pos_g));
-
-    is_malloced = false;
-}
-#endif
diff --git a/source/source_lcao/module_gint/grid_technique.h b/source/source_lcao/module_gint/grid_technique.h
deleted file mode 100644
index 947b8d9337..0000000000
--- a/source/source_lcao/module_gint/grid_technique.h
+++ /dev/null
@@ -1,172 +0,0 @@
-#ifndef GRID_TECHNIQUE_H
-#define GRID_TECHNIQUE_H
-
-#include "grid_meshball.h"
-#include "source_basis/module_ao/ORB_read.h"
-#include "source_basis/module_ao/parallel_orbitals.h"
-#include "source_cell/module_neighbor/sltk_grid_driver.h"
-#include "source_cell/unitcell.h"
-#include "source_lcao/module_hcontainer/hcontainer.h"
-#if ((defined __CUDA) /* || (defined __ROCM) */)
-#include "kernels/cuda/gemm_selector.cuh"
-
-#include <cuda_runtime.h>
-#endif
-
-// Author: mohan
-// Date: 2009-10-17
-class Grid_Technique : public Grid_MeshBall {
-    // public variables.
-  public:
-    Grid_Technique();
-    ~Grid_Technique();
-
-    /// move operator for the next ESolver to directly use its infomation
-    Grid_Technique& operator=(Grid_Technique&& rhs) = default;
-    //------------------------------------
-    // 1: Info about atom number on grid.
-    //------------------------------------
-    // record how many atoms on each grid.
-    std::vector<int> how_many_atoms;
-    // max atom on grid
-    int max_atom=0;
-    // sum of how_many_atoms
-    int total_atoms_on_grid=0;
-    std::vector<int> start_ind;
-
-    //------------------------------------
-    // 2: Info about which atom on grid.
-    //------------------------------------
-    // save the start position of each big cell's adjacent
-    // atoms in 1D grid.
-    std::vector<int> bcell_start;
-    // save the 'iat' atom.
-    // dim: total_atoms_on_grid.
-    std::vector<int> which_atom;
-
-    //--------------------------------------
-    // save the bigcell index in meshball.
-    // dim: total_atoms_on_grid.
-    //--------------------------------------
-    std::vector<int> which_bigcell;
-    std::vector<int> which_unitcell;
-
-    //------------------------------------
-    // 3: which atom on local grid.
-    //------------------------------------
-    int lnat=0; // local nat.
-    int lgd=0;  // local grid dimension.  lgd * lgd symmetry matrix.
-    std::vector<bool> in_this_processor;
-    std::vector<int> trace_iat;
-    std::vector<int> trace_lo; // trace local orbital.
-
-    //---------------------------------------
-    // nnrg: number of matrix elements on
-    // each processor's real space grid.
-    // use: GridT.in_this_processor
-    //---------------------------------------
-    int nnrg = 0;
-
-    // UnitCell and LCAO_Obrbitals
-    const UnitCell* ucell=nullptr;
-    const LCAO_Orbitals* orb=nullptr;
-
-    // UnitCell parameters
-    int nwmax=0;
-    int nr_max=0;
-    int ntype=0;
-
-    // LCAO Orbitals
-    double dr_uniform={0.0};
-    std::vector<double> rcuts;
-    std::vector<std::vector<double>> psi_u;
-    std::vector<std::vector<double>> dpsi_u;
-    std::vector<std::vector<double>> d2psi_u;
-
-    // Determine whether the grid point integration is initialized.
-    bool init_malloced=false;
-
-    bool get_init_malloced() const { return init_malloced; }
-
-    void set_pbc_grid(const int& ncx_in,
-                      const int& ncy_in,
-                      const int& ncz_in,
-                      const int& bx_in,
-                      const int& by_in,
-                      const int& bz_in,
-                      const int& nbx_in,
-                      const int& nby_in,
-                      const int& nbz_in,
-                      const int& nbxx_in,
-                      const int& nbzp_start_in,
-                      const int& nbzp_in,
-                      const int& ny,
-                      const int& nplane,
-                      const int& startz_current,
-                      const UnitCell& ucell,
-                      const Grid_Driver& gd,
-                      const double& dr_uniform,
-                      const std::vector<double>& rcuts,
-                      const std::vector<std::vector<double>>& psi_u,
-                      const std::vector<std::vector<double>>& dpsi_u,
-                      const std::vector<std::vector<double>>& d2psi_u,
-                      const int& num_stream);
-
-    const std::vector<int>* get_ijr_info() const { return &ijr_info; }
-
-    /// number of elements(basis-pairs) in this processon
-    /// on all adjacent atoms-pairs(Grid division)
-    int cal_RindexAtom(const int& u1,
-                       const int& u2,
-                       const int& u3,
-                       const int& iat2) const;
-
-    int find_offset(const int id1, const int id2, const int iat1, const int iat2) const;
-    
-  private:
-
-    // store the information of atom pairs on this processor, used to initialize hcontainer.
-    // The meaning of ijr can be referred to in the get_ijr_info function in hcontainer.cpp.
-    std::vector<int> ijr_info;
-
-    void cal_max_box_index();
-    // atoms on meshball
-    void init_atoms_on_grid(const int& ny,
-                            const int& nplane,
-                            const UnitCell& ucell);
-    void init_atoms_on_grid2(const int* index2normal, const UnitCell& ucell);
-    // initialize the ijr_info and nnrg
-    void init_ijr_and_nnrg(const UnitCell& ucell, const Grid_Driver& gd);
-    void cal_grid_integration_index();
-    void cal_trace_lo(const UnitCell& ucell);
-    void check_bigcell(int* ind_bigcell, char* bigcell_on_processor);
-    void get_startind(const int& ny,
-                      const int& nplane);
-
-#if ((defined __CUDA) /* || (defined __ROCM) */)
-  public:
-    double* ylmcoef_g;
-    bool is_malloced;
-
-    int* atom_nw_g;
-    int* atom_nwl_g;
-    double* psi_u_g;
-    bool* atom_new_g;
-    int* atom_ylm_g;
-    int* atom_l_g;
-    double* rcut_g;
-    double*mcell_pos_g;
-
-    int dev_id = 0;
-    int nstreams = 4;
-    // streams[nstreams]
-    // TODO it needs to be implemented through configuration files
-    matrix_multiple_func_type fastest_matrix_mul;
-
-  private:
-    void init_gpu_gint_variables(const UnitCell& ucell, const int num_stream);
-    void free_gpu_gint_variables(int nat);
-
-#endif
-};
-#endif
diff --git a/source/source_lcao/module_gint/gtask_force.cpp b/source/source_lcao/module_gint/gtask_force.cpp
deleted file mode 100644
index 2fab74907e..0000000000
--- a/source/source_lcao/module_gint/gtask_force.cpp
+++ /dev/null
@@ -1,152 +0,0 @@
-#include <omp.h>
-
-#include "gint_force_gpu.h"
-#include "source_base/ylm.h"
-#include "source_lcao/module_gint/gint_tools.h"
-#include "source_base/vector3.h"
-namespace GintKernel
-{
-
-void gtask_force(const Grid_Technique& gridt,
-                 const UnitCell& ucell,
-                 const int grid_index_ij,
-                 const int nczp,
-                 const double vfactor,
-                 const double* vlocal_global_value,
-                 int& atoms_per_z,
-                 int* atoms_num_info,
-                 int* iat_on_nbz,
-                 uint8_t* atoms_type,
-                 double* dr_part,
-                 double* vldr3)
-{
-    atoms_per_z = 0;
-    for (int z_index = 0; z_index < gridt.nbzp; z_index++)
-    {
-        const int grid_index = grid_index_ij + z_index;
-        const int bcell_start_index = gridt.bcell_start[grid_index];
-        const int na_grid = gridt.how_many_atoms[grid_index];
-        atoms_num_info[z_index * 2] = na_grid;
-        atoms_num_info[z_index * 2 + 1] = atoms_per_z;
-        for (int id = 0; id < na_grid; id++)
-        {
-            const int mcell_index = bcell_start_index + id;
-            const int imcell = gridt.which_bigcell[mcell_index];
-            const int iat = gridt.which_atom[mcell_index];
-            const int it_temp = ucell.iat2it[iat];
-
-            dr_part[atoms_per_z * 3] = gridt.meshball_positions[imcell][0]
-                                       - gridt.tau_in_bigcell[iat][0];
-            dr_part[atoms_per_z * 3 + 1] = gridt.meshball_positions[imcell][1]
-                                           - gridt.tau_in_bigcell[iat][1];
-            dr_part[atoms_per_z * 3 + 2] = gridt.meshball_positions[imcell][2]
-                                           - gridt.tau_in_bigcell[iat][2];
-            atoms_type[atoms_per_z] = it_temp;
-            iat_on_nbz[atoms_per_z] = iat;
-            atoms_per_z++;
-        }
-
-        const int start_ind_grid = gridt.start_ind[grid_index];
-        int id = z_index * gridt.bxyz;
-        for (int bx_index = 0; bx_index < gridt.bx; bx_index++)
-        {
-            for (int by_index = 0; by_index < gridt.by; by_index++)
-            {
-                for (int bz_index = 0; bz_index < gridt.bz; bz_index++)
-                {
-                    int vindex_global = bx_index * gridt.ncy * nczp
-                                        + by_index * nczp + bz_index
-                                        + start_ind_grid;
-                    vldr3[id]= vlocal_global_value[vindex_global] * vfactor;
-                    id++;
-                }
-            }
-        }
-    }
-}
-
-void alloc_mult_force(const hamilt::HContainer<double>* dm,
-                      const Grid_Technique& gridt,
-                      const UnitCell& ucell,
-                      const int grid_index_ij,
-                      const int max_atom,
-                      const int *atoms_num_info,
-                      double* const psi_g,
-                      double* const psi_dm_g,
-                      double* const dm_matrix_g,
-                      int& max_m,
-                      int& max_n,
-                      int& atom_pair_num,
-                      int* mat_m,
-                      int* mat_n,
-                      int* mat_k,
-                      int* mat_lda,
-                      int* mat_ldb,
-                      int* mat_ldc,
-                      double** mat_A,
-                      double** mat_B,
-                      double** mat_C)
-{
-    int tid = 0;
-    max_m = 0;
-    max_n = 0;
-    const int nwmax = ucell.nwmax;
-    const int lgd = gridt.lgd;
-    for (int z_index = 0; z_index < gridt.nbzp; z_index++)
-    {
-        const int grid_index = grid_index_ij + z_index;
-        const int bcell_start_index = gridt.bcell_start[grid_index];
-        const int pre_atoms = atoms_num_info[z_index * 2 + 1];
-
-        for (int atom1 = 0; atom1 < gridt.how_many_atoms[grid_index]; atom1++)
-        {
-            const int mcell_index1 = bcell_start_index + atom1;
-            const int iat1 = gridt.which_atom[mcell_index1];
-            const int uc1 = gridt.which_unitcell[mcell_index1];
-            const ModuleBase::Vector3<int> r1 = gridt.get_ucell_coords(uc1); 
-            const int it1 = ucell.iat2it[iat1];
-            const int nw1 = ucell.atoms[it1].nw;
-
-            for (int atom2 = 0; atom2 < gridt.how_many_atoms[grid_index];atom2++)
-            {
-                const int mcell_index2 = bcell_start_index + atom2;
-                const int iat2 = gridt.which_atom[mcell_index2];
-                const int uc2 = gridt.which_unitcell[mcell_index2];
-                const ModuleBase::Vector3<int> r2 = gridt.get_ucell_coords(uc2);
-                const int offset = dm->find_matrix_offset(iat1, iat2, r1-r2);
-                if (offset == -1)
-                {
-                    continue;
-                }
-                const int it2 = ucell.iat2it[iat2];
-                const int nw2 = ucell.atoms[it2].nw;
-
-                const int mat_A_idx = (pre_atoms + atom2) * nwmax * gridt.bxyz;
-                const int mat_C_idx = (pre_atoms + atom1) * nwmax * gridt.bxyz;
-                mat_m[tid] = gridt.bxyz;
-                mat_n[tid] = nw1;
-                mat_k[tid] = nw2;
-                mat_lda[tid] = nwmax;
-                mat_ldb[tid] = nw2;
-                mat_ldc[tid] = nwmax;
-                mat_A[tid] = psi_g + mat_A_idx;
-                mat_B[tid] = dm_matrix_g + offset;
-                mat_C[tid] = psi_dm_g + mat_C_idx;
-
-                if (mat_m[tid] > max_m)
-                {
-                    max_m = mat_m[tid];
-                }
-
-                if (mat_n[tid] > max_n)
-                {
-                    max_n = mat_n[tid];
-                }
-
-                tid++;
-            }
-        }
-    }
-    atom_pair_num = tid;
-}
-} // namespace GintKernel
diff --git a/source/source_lcao/module_gint/gtask_rho.cpp b/source/source_lcao/module_gint/gtask_rho.cpp
deleted file mode 100644
index 691504d943..0000000000
--- a/source/source_lcao/module_gint/gtask_rho.cpp
+++ /dev/null
@@ -1,155 +0,0 @@
-#include "gint_rho_gpu.h"
-#include "source_base/ylm.h"
-#include "source_lcao/module_gint/gint_tools.h"
-#include "source_base/vector3.h"
-#include "omp.h"
-namespace GintKernel
-{
-
-void gtask_rho(const Grid_Technique& gridt,
-               const int grid_index_ij,
-               const UnitCell& ucell,
-               double* dr_part,
-               uint8_t* atoms_type,
-               int* atoms_num_info,
-               int& atoms_per_z)         
-{
-    atoms_per_z = 0;
-    for (int z_index = 0; z_index < gridt.nbzp; z_index++)
-    {
-        const int grid_index = grid_index_ij + z_index;
-        const int bcell_start_index = gridt.bcell_start[grid_index];
-        const int na_grid = gridt.how_many_atoms[grid_index];
-        atoms_num_info[2 * z_index] = na_grid;
-        atoms_num_info[2 * z_index + 1] = atoms_per_z;
-        for (int id = 0; id < na_grid; id++)
-        {
-            const int mcell_index = bcell_start_index + id;
-            const int imcell = gridt.which_bigcell[mcell_index];
-            const int iat = gridt.which_atom[mcell_index];
-            const int it_temp = ucell.iat2it[iat];
-
-            dr_part[atoms_per_z * 3] = gridt.meshball_positions[imcell][0]
-                      - gridt.tau_in_bigcell[iat][0];
-            dr_part[atoms_per_z * 3 + 1] = gridt.meshball_positions[imcell][1]
-                      - gridt.tau_in_bigcell[iat][1];
-            dr_part[atoms_per_z * 3 + 2] = gridt.meshball_positions[imcell][2]
-                      - gridt.tau_in_bigcell[iat][2];
-            atoms_type[atoms_per_z] = it_temp;
-            atoms_per_z++;
-        }
-    }
-}
-
-void alloc_mult_dot_rho(const hamilt::HContainer<double>* dm,
-                        const Grid_Technique& gridt,
-                        const UnitCell& ucell,
-                        const int grid_index_ij,
-                        const int max_atom,
-                        const int lgd,
-                        const int nczp,
-                        const int* atoms_num_info,
-                        double* const psir_ylm_g,
-                        double* const psir_dm_g,
-                        double* const dm_matrix_g,
-                        double* mat_alpha,
-                        int* mat_m,
-                        int* mat_n,
-                        int* mat_k,
-                        int* mat_lda,
-                        int* mat_ldb,
-                        int* mat_ldc,
-                        double** mat_A,
-                        double** mat_B,
-                        double** mat_C,
-                        int& max_m,
-                        int& max_n,
-                        int& atom_pair_num,
-                        double* rho_g,
-                        double** dot_product)
-{
-    int tid = 0;
-    int dot_count = 0;
-    max_m = 0;
-    max_n = 0;
-    const int nwmax=ucell.nwmax;
-    // generate matrix multiplication tasks
-    for (int z_index = 0; z_index < gridt.nbzp; z_index++)
-    {
-        const int grid_index = grid_index_ij + z_index;
-        const int bcell_start_index = gridt.bcell_start[grid_index];
-        const int bcell_start_psir = atoms_num_info[2 * z_index + 1] * gridt.bxyz * nwmax;
-        const int na_grid = atoms_num_info[2 * z_index];
-
-        for (int atom1 = 0; atom1 < gridt.how_many_atoms[grid_index]; atom1++)
-        {
-            const int mcell_index1 = bcell_start_index + atom1;
-            const int iat1 = gridt.which_atom[mcell_index1];
-            const int uc1 = gridt.which_unitcell[mcell_index1];
-            const ModuleBase::Vector3<int> r1 = gridt.get_ucell_coords(uc1);
-            const int it1 = ucell.iat2it[iat1];
-            const int nw1 = ucell.atoms[it1].nw;
-
-            for (int atom2 = atom1; atom2 < gridt.how_many_atoms[grid_index];
-                 atom2++)
-            {
-                const int mcell_index2 = bcell_start_index + atom2;
-                const int iat2 = gridt.which_atom[mcell_index2];
-                const int uc2 = gridt.which_unitcell[mcell_index2];
-                const ModuleBase::Vector3<int> r2 = gridt.get_ucell_coords(uc2);
-                const int offset = dm->find_matrix_offset(iat1, iat2, r1-r2);
-                if (offset == -1)
-                {
-                    continue;
-                }
-                const int it2 = ucell.iat2it[iat2];
-                const int nw2 = ucell.atoms[it2].nw;
-
-                const int mat_A_idx = bcell_start_psir + atom2 * nwmax;
-                const int mat_C_idx = bcell_start_psir + atom1 * nwmax;
-
-                mat_alpha[tid] = atom2 == atom1 ? 1 : 2;
-                mat_m[tid] = gridt.bxyz;
-                mat_n[tid] = nw1;
-                mat_k[tid] = nw2;
-                mat_lda[tid] = nwmax * na_grid;
-                mat_ldb[tid] = nw2;
-                mat_ldc[tid] = nwmax * na_grid;
-                mat_A[tid] = psir_ylm_g + mat_A_idx;
-                mat_B[tid] = dm_matrix_g + offset;
-                mat_C[tid] = psir_dm_g + mat_C_idx;
-
-                if (mat_m[tid] > max_m)
-                {
-                    max_m = mat_m[tid];
-                }
-
-                if (mat_n[tid] > max_n)
-                {
-                    max_n = mat_n[tid];
-                }
-
-                tid++;
-            }
-        }
-
-        // generate vec dot product tasks
-        std::vector<int> vindex(gridt.bxyz);
-        Gint_Tools::get_vindex(gridt.bxyz,
-                                gridt.bx,
-                                gridt.by,
-                                gridt.bz,
-                                nczp,
-                                gridt.start_ind[grid_index],
-                                gridt.ncy * nczp,
-                                vindex.data());
-        for (int i = 0; i < gridt.bxyz; i++)
-        {
-            dot_product[dot_count] = rho_g + vindex[i];
-            dot_count++;
-        }
-    }
-    atom_pair_num = tid;
-}
-
-} // namespace GintKernel
\ No newline at end of file
diff --git a/source/source_lcao/module_gint/gtask_vl.cpp b/source/source_lcao/module_gint/gtask_vl.cpp
deleted file mode 100644
index 026ed3ffab..0000000000
--- a/source/source_lcao/module_gint/gtask_vl.cpp
+++ /dev/null
@@ -1,154 +0,0 @@
-#include <omp.h>
-
-#include "gint_vl_gpu.h"
-#include "source_base/ylm.h"
-#include "source_lcao/module_gint/gint_tools.h"
-#include "source_base/vector3.h"
-namespace GintKernel
-{
-
-void gtask_vlocal(const Grid_Technique& gridt,
-                  const UnitCell& ucell,
-                  const int grid_index_ij,
-                  const int nczp,
-                  const double vfactor,
-                  const double* vlocal_global_value,
-                  int& atoms_per_z,
-                  int* atoms_num_info,
-                  uint8_t* atoms_type,
-                  double* dr_part,
-                  double* vldr3)
-{
-    atoms_per_z = 0;
-    for (int z_index = 0; z_index < gridt.nbzp; z_index++)
-    {
-        const int grid_index = grid_index_ij + z_index;
-        const int bcell_start_index = gridt.bcell_start[grid_index];
-        const int na_grid = gridt.how_many_atoms[grid_index];
-        atoms_num_info[2 * z_index] = na_grid;
-        atoms_num_info[2 * z_index + 1] = atoms_per_z;
-        for (int id = 0; id < na_grid; id++)
-        {
-            const int mcell_index = bcell_start_index + id;
-            const int imcell = gridt.which_bigcell[mcell_index];
-            const int iat = gridt.which_atom[mcell_index];
-            const int it_temp = ucell.iat2it[iat];
-
-            dr_part[atoms_per_z * 3] = gridt.meshball_positions[imcell][0]
-                                       - gridt.tau_in_bigcell[iat][0];
-            dr_part[atoms_per_z * 3 + 1] = gridt.meshball_positions[imcell][1]
-                                           - gridt.tau_in_bigcell[iat][1];
-            dr_part[atoms_per_z * 3 + 2] = gridt.meshball_positions[imcell][2]
-                                           - gridt.tau_in_bigcell[iat][2];
-            atoms_type[atoms_per_z] = it_temp;
-            atoms_per_z++;
-        }
-
-        const int start_ind_grid = gridt.start_ind[grid_index];
-        int id = z_index * gridt.bxyz;
-        for (int bx_index = 0; bx_index < gridt.bx; bx_index++)
-        {
-            for (int by_index = 0; by_index < gridt.by; by_index++)
-            {
-                for (int bz_index = 0; bz_index < gridt.bz; bz_index++)
-                {
-                    int vindex_global = bx_index * gridt.ncy * nczp
-                                        + by_index * nczp + bz_index
-                                        + start_ind_grid;
-                    vldr3[id]= vlocal_global_value[vindex_global] * vfactor;
-                    id++;
-                }
-            }
-        }
-    }
-}
-
-void alloc_mult_vlocal(const hamilt::HContainer<double>* hRGint,
-                       const Grid_Technique& gridt,
-                       const UnitCell& ucell,
-                       const int grid_index_ij,
-                       const int max_atom,
-                       double* const psi,
-                       double* const psi_vldr3,
-                       double* const grid_vlocal_g,
-                       int* mat_m,
-                       int* mat_n,
-                       int* mat_k,
-                       int* mat_lda,
-                       int* mat_ldb,
-                       int* mat_ldc,
-                       double** mat_A,
-                       double** mat_B,
-                       double** mat_C,
-                       int& atom_pair_num,
-                       int& max_m,
-                       int& max_n)
-{
-    atom_pair_num = 0;
-    max_m = 0;
-    max_n = 0;
-    const int nwmax = ucell.nwmax;
-    for (int z_index = 0; z_index < gridt.nbzp; z_index++)
-    {
-        const int grid_index = grid_index_ij + z_index;
-        const int atom_num = gridt.how_many_atoms[grid_index];
-        const int vldr3_index = z_index * max_atom * nwmax * gridt.bxyz;
-        const int bcell_start_index = gridt.bcell_start[grid_index];
-        for (int atom1 = 0; atom1 < atom_num; atom1++)
-        {
-            const int iat1 = gridt.which_atom[bcell_start_index + atom1];
-            const int uc1 = gridt.which_unitcell[bcell_start_index + atom1];
-            const ModuleBase::Vector3<int> r1 = gridt.get_ucell_coords(uc1);
-            const int it1 = ucell.iat2it[iat1];
-
-            for (int atom2 = 0; atom2 < atom_num; atom2++)
-            {
-                const int iat2 = gridt.which_atom[bcell_start_index + atom2];
-                const int uc2 = gridt.which_unitcell[bcell_start_index + atom2];
-                const ModuleBase::Vector3<int> r2 = gridt.get_ucell_coords(uc2);
-                int offset = hRGint->find_matrix_offset(iat1, iat2, r1-r2);
-                if (offset == -1)
-                {
-                    continue;
-                }
-                const int it2 = ucell.iat2it[iat2];
-
-                if (iat1 <= iat2)
-                {
-                    const int atom_pair_nw
-                        = ucell.atoms[it1].nw * ucell.atoms[it2].nw;
-
-                    const int calc_index1 = vldr3_index + atom1 * nwmax * gridt.bxyz;
-                    const int calc_index2 = vldr3_index + atom2 * nwmax * gridt.bxyz;
-
-                    mat_A[atom_pair_num]
-                        = psi + calc_index1;
-                    mat_B[atom_pair_num]
-                        = psi_vldr3 + calc_index2;
-                    mat_C[atom_pair_num]
-                        = grid_vlocal_g + offset;
-
-                    mat_lda[atom_pair_num] = gridt.bxyz;
-                    mat_ldb[atom_pair_num] = gridt.bxyz;
-                    mat_ldc[atom_pair_num] = ucell.atoms[it2].nw;
-
-                    mat_m[atom_pair_num] = ucell.atoms[it1].nw;
-                    mat_n[atom_pair_num] = ucell.atoms[it2].nw;
-                    mat_k[atom_pair_num] = gridt.bxyz;
-                    
-                    if (mat_m[atom_pair_num] > max_m)
-                    {
-                        max_m = mat_m[atom_pair_num];
-                    }
-                    if (mat_n[atom_pair_num] > max_n)
-                    {
-                        max_n = mat_n[atom_pair_num];
-                    }
-                    atom_pair_num++;
-                }
-            }
-        }
-    }
-}
-
-} // namespace GintKernel
\ No newline at end of file
diff --git a/source/source_lcao/module_gint/init_orb.cpp b/source/source_lcao/module_gint/init_orb.cpp
deleted file mode 100644
index 4ad04e08d6..0000000000
--- a/source/source_lcao/module_gint/init_orb.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-#include "gint_tools.h"
-#include "source_base/memory.h"
-#include "source_basis/module_ao/ORB_read.h"
-#include "source_cell/unitcell.h"
-
-namespace Gint_Tools{
-
-void init_orb(double& dr_uniform, 
-                std::vector<double>& rcuts,
-                UnitCell& ucell,
-                const LCAO_Orbitals& orb,
-                std::vector<std::vector<double>>& psi_u,
-                std::vector<std::vector<double>>& dpsi_u,
-                std::vector<std::vector<double>>& d2psi_u)
-{
-    //! set the grid parameters
-    dr_uniform=orb.dr_uniform;
-
-    assert(dr_uniform>0.0);
-    
-    const int nwmax=ucell.nwmax;
-    const int ntype=ucell.ntype;
-
-    assert(nwmax>0);
-    assert(ntype>0);
-    
-    rcuts=std::vector<double>(ntype);
-    ModuleBase::Memory::record("rcuts", sizeof(double)*ntype*3);
-    
-    for(int it=0; it<ntype; it++)
-	{
-		rcuts[it]=orb.Phi[it].getRcut();
-	}
-    
-    const double max_cut = *std::max_element(rcuts.begin(), rcuts.end());
-    const int nr_max = static_cast<int>(1/dr_uniform * max_cut) + 10;
-    psi_u=std::vector<std::vector<double>>(ntype * nwmax);
-    dpsi_u=std::vector<std::vector<double>>(ntype * nwmax);
-    d2psi_u=std::vector<std::vector<double>>(ntype * nwmax);
-    ModuleBase::Memory::record("psi_u", sizeof(double)*nwmax*ntype*3);
-    
-    Atom* atomx = nullptr;
-    const Numerical_Orbital_Lm* pointer = nullptr;
-    
-    for (int i = 0; i < ntype; i++)
-    {
-        atomx = &ucell.atoms[i];
-        for (int j = 0; j < nwmax; j++)
-        {
-            const int k=i*nwmax+j;
-            if (j < atomx->nw)
-            {
-                pointer = &orb.Phi[i].PhiLN(atomx->iw2l[j],atomx->iw2n[j]);
-                psi_u[k]=pointer->psi_uniform;
-                dpsi_u[k]=pointer->dpsi_uniform;
-                d2psi_u[k]=pointer->ddpsi_uniform;
-            }
-        }
-    }
-}// End of init_orb()
-
-}// End of Gint_Tools
diff --git a/source/source_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h b/source/source_lcao/module_gint/kernel/cuda_mem_wrapper.h
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h
rename to source/source_lcao/module_gint/kernel/cuda_mem_wrapper.h
diff --git a/source/source_lcao/module_gint/temp_gint/kernel/dgemm_vbatch.cu b/source/source_lcao/module_gint/kernel/dgemm_vbatch.cu
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/kernel/dgemm_vbatch.cu
rename to source/source_lcao/module_gint/kernel/dgemm_vbatch.cu
diff --git a/source/source_lcao/module_gint/temp_gint/kernel/dgemm_vbatch.h b/source/source_lcao/module_gint/kernel/dgemm_vbatch.h
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/kernel/dgemm_vbatch.h
rename to source/source_lcao/module_gint/kernel/dgemm_vbatch.h
diff --git a/source/source_lcao/module_gint/temp_gint/kernel/gemm_nn_vbatch.cuh b/source/source_lcao/module_gint/kernel/gemm_nn_vbatch.cuh
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/kernel/gemm_nn_vbatch.cuh
rename to source/source_lcao/module_gint/kernel/gemm_nn_vbatch.cuh
diff --git a/source/source_lcao/module_gint/temp_gint/kernel/gemm_tn_vbatch.cuh b/source/source_lcao/module_gint/kernel/gemm_tn_vbatch.cuh
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/kernel/gemm_tn_vbatch.cuh
rename to source/source_lcao/module_gint/kernel/gemm_tn_vbatch.cuh
diff --git a/source/source_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.cpp b/source/source_lcao/module_gint/kernel/gint_gpu_vars.cpp
similarity index 98%
rename from source/source_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.cpp
rename to source/source_lcao/module_gint/kernel/gint_gpu_vars.cpp
index f4443762f0..f81af2779c 100644
--- a/source/source_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.cpp
+++ b/source/source_lcao/module_gint/kernel/gint_gpu_vars.cpp
@@ -101,8 +101,6 @@ GintGpuVars::GintGpuVars(std::shared_ptr<const BigGridInfo> biggrid_info,
     
     checkCuda(cudaMalloc((void**)&iat2it_d, sizeof(int) * ucell.nat));
     checkCuda(cudaMemcpy(iat2it_d, ucell.iat2it, sizeof(int) * ucell.nat, cudaMemcpyHostToDevice));
-
-    gemm_algo_selector(mgrid_num, fastest_matrix_mul, ucell);
 }
 
 GintGpuVars::~GintGpuVars()
diff --git a/source/source_lcao/module_gint/kernel/gint_gpu_vars.h b/source/source_lcao/module_gint/kernel/gint_gpu_vars.h
new file mode 100644
index 0000000000..5f711aa6a0
--- /dev/null
+++ b/source/source_lcao/module_gint/kernel/gint_gpu_vars.h
@@ -0,0 +1,44 @@
+#pragma once
+
+#include <cuda_runtime.h>
+#include "set_const_mem.cuh"
+#include "source_base/ylm.h"
+#include "source_cell/unitcell.h"
+#include "source_cell/atom_spec.h"
+#include "source_lcao/module_gint/biggrid_info.h"
+#include "gint_helper.cuh"
+
+namespace ModuleGint
+{
+
+class GintGpuVars
+{
+    public:
+    GintGpuVars(std::shared_ptr<const BigGridInfo> bgrid_info,
+                const UnitCell& ucell,
+                const Numerical_Orbital* Phi);
+    ~GintGpuVars();
+    
+    int nwmax;
+    double dr_uniform;
+    double nr_max;
+    // ylmcoef_d is __constant__ memory, no need to cudaFree
+    double* ylmcoef_d = nullptr;
+    double* rcut_d = nullptr;
+    int* atom_nw_d = nullptr;
+    int* ucell_atom_nwl_d = nullptr;
+    bool* atom_iw2_new_d = nullptr;
+    int* atom_iw2_ylm_d = nullptr;
+    int* atom_iw2_l_d = nullptr;
+    double* psi_u_d = nullptr;
+    double* dpsi_u_d = nullptr;
+    double* d2psi_u_d = nullptr;
+    double3* mgrids_pos_d = nullptr;
+    int* iat2it_d = nullptr;
+
+    // the index of gpu device
+    int dev_id_ = 0;
+
+};
+
+}
\ No newline at end of file
diff --git a/source/source_lcao/module_gint/temp_gint/kernel/gint_helper.cuh b/source/source_lcao/module_gint/kernel/gint_helper.cuh
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/kernel/gint_helper.cuh
rename to source/source_lcao/module_gint/kernel/gint_helper.cuh
diff --git a/source/source_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cu b/source/source_lcao/module_gint/kernel/phi_operator_gpu.cu
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cu
rename to source/source_lcao/module_gint/kernel/phi_operator_gpu.cu
diff --git a/source/source_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.h b/source/source_lcao/module_gint/kernel/phi_operator_gpu.h
similarity index 98%
rename from source/source_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.h
rename to source/source_lcao/module_gint/kernel/phi_operator_gpu.h
index 897218a8dd..27568e5ec9 100644
--- a/source/source_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.h
+++ b/source/source_lcao/module_gint/kernel/phi_operator_gpu.h
@@ -2,7 +2,7 @@
 #include <memory>
 #include <cuda_runtime.h>
 
-#include "source_lcao/module_gint/temp_gint/batch_biggrid.h"
+#include "source_lcao/module_gint/batch_biggrid.h"
 #include "gint_helper.cuh"
 #include "gint_gpu_vars.h"
 #include "cuda_mem_wrapper.h"
diff --git a/source/source_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cu b/source/source_lcao/module_gint/kernel/phi_operator_kernel.cu
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cu
rename to source/source_lcao/module_gint/kernel/phi_operator_kernel.cu
diff --git a/source/source_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cuh b/source/source_lcao/module_gint/kernel/phi_operator_kernel.cuh
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cuh
rename to source/source_lcao/module_gint/kernel/phi_operator_kernel.cuh
diff --git a/source/source_lcao/module_gint/temp_gint/kernel/set_const_mem.cu b/source/source_lcao/module_gint/kernel/set_const_mem.cu
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/kernel/set_const_mem.cu
rename to source/source_lcao/module_gint/kernel/set_const_mem.cu
diff --git a/source/source_lcao/module_gint/temp_gint/kernel/set_const_mem.cuh b/source/source_lcao/module_gint/kernel/set_const_mem.cuh
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/kernel/set_const_mem.cuh
rename to source/source_lcao/module_gint/kernel/set_const_mem.cuh
diff --git a/source/source_lcao/module_gint/temp_gint/kernel/sph.cuh b/source/source_lcao/module_gint/kernel/sph.cuh
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/kernel/sph.cuh
rename to source/source_lcao/module_gint/kernel/sph.cuh
diff --git a/source/source_lcao/module_gint/kernels/cuda/code_gen.cpp b/source/source_lcao/module_gint/kernels/cuda/code_gen.cpp
deleted file mode 100644
index 42e8c4f0c5..0000000000
--- a/source/source_lcao/module_gint/kernels/cuda/code_gen.cpp
+++ /dev/null
@@ -1,4426 +0,0 @@
-gemm_time_measure<double, 2, 16, 16, 32, 2, 2, 16, 2, 16>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 2, 16, 16, 32, 4, 2, 16, 2, 16>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 2, 16, 16, 32, 6, 2, 16, 2, 16>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 2, 16, 16, 32, 8, 2, 16, 2, 16>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 2, 16, 16, 48, 2, 2, 16, 2, 16>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 2, 16, 16, 48, 4, 2, 16, 2, 16>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 2, 16, 16, 48, 6, 2, 16, 2, 16>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 4, 8, 8, 24, 4, 4, 8, 4, 8>(max_m,
-                                                      max_n,
-                                                      d_m,
-                                                      d_n,
-                                                      d_k,
-                                                      d_global_A_array,
-                                                      d_global_lda,
-                                                      d_global_B_array,
-                                                      d_global_ldb,
-                                                      d_global_C_array,
-                                                      d_global_ldc,
-                                                      batchCount,
-                                                      temp_stream,
-                                                      fastest_time,
-                                                      fastest_algo,
-                                                      cpu_result,
-                                                      h_global_C,
-                                                      d_global_C);
-
-gemm_time_measure<double, 4, 8, 8, 24, 8, 4, 8, 4, 8>(max_m,
-                                                      max_n,
-                                                      d_m,
-                                                      d_n,
-                                                      d_k,
-                                                      d_global_A_array,
-                                                      d_global_lda,
-                                                      d_global_B_array,
-                                                      d_global_ldb,
-                                                      d_global_C_array,
-                                                      d_global_ldc,
-                                                      batchCount,
-                                                      temp_stream,
-                                                      fastest_time,
-                                                      fastest_algo,
-                                                      cpu_result,
-                                                      h_global_C,
-                                                      d_global_C);
-
-gemm_time_measure<double, 4, 8, 8, 24, 12, 4, 8, 4, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 4, 8, 8, 32, 4, 4, 8, 4, 8>(max_m,
-                                                      max_n,
-                                                      d_m,
-                                                      d_n,
-                                                      d_k,
-                                                      d_global_A_array,
-                                                      d_global_lda,
-                                                      d_global_B_array,
-                                                      d_global_ldb,
-                                                      d_global_C_array,
-                                                      d_global_ldc,
-                                                      batchCount,
-                                                      temp_stream,
-                                                      fastest_time,
-                                                      fastest_algo,
-                                                      cpu_result,
-                                                      h_global_C,
-                                                      d_global_C);
-
-gemm_time_measure<double, 4, 8, 8, 32, 8, 4, 8, 4, 8>(max_m,
-                                                      max_n,
-                                                      d_m,
-                                                      d_n,
-                                                      d_k,
-                                                      d_global_A_array,
-                                                      d_global_lda,
-                                                      d_global_B_array,
-                                                      d_global_ldb,
-                                                      d_global_C_array,
-                                                      d_global_ldc,
-                                                      batchCount,
-                                                      temp_stream,
-                                                      fastest_time,
-                                                      fastest_algo,
-                                                      cpu_result,
-                                                      h_global_C,
-                                                      d_global_C);
-
-gemm_time_measure<double, 4, 8, 8, 40, 4, 4, 8, 4, 8>(max_m,
-                                                      max_n,
-                                                      d_m,
-                                                      d_n,
-                                                      d_k,
-                                                      d_global_A_array,
-                                                      d_global_lda,
-                                                      d_global_B_array,
-                                                      d_global_ldb,
-                                                      d_global_C_array,
-                                                      d_global_ldc,
-                                                      batchCount,
-                                                      temp_stream,
-                                                      fastest_time,
-                                                      fastest_algo,
-                                                      cpu_result,
-                                                      h_global_C,
-                                                      d_global_C);
-
-gemm_time_measure<double, 4, 8, 8, 40, 8, 4, 8, 4, 8>(max_m,
-                                                      max_n,
-                                                      d_m,
-                                                      d_n,
-                                                      d_k,
-                                                      d_global_A_array,
-                                                      d_global_lda,
-                                                      d_global_B_array,
-                                                      d_global_ldb,
-                                                      d_global_C_array,
-                                                      d_global_ldc,
-                                                      batchCount,
-                                                      temp_stream,
-                                                      fastest_time,
-                                                      fastest_algo,
-                                                      cpu_result,
-                                                      h_global_C,
-                                                      d_global_C);
-
-gemm_time_measure<double, 4, 8, 8, 48, 4, 4, 8, 4, 8>(max_m,
-                                                      max_n,
-                                                      d_m,
-                                                      d_n,
-                                                      d_k,
-                                                      d_global_A_array,
-                                                      d_global_lda,
-                                                      d_global_B_array,
-                                                      d_global_ldb,
-                                                      d_global_C_array,
-                                                      d_global_ldc,
-                                                      batchCount,
-                                                      temp_stream,
-                                                      fastest_time,
-                                                      fastest_algo,
-                                                      cpu_result,
-                                                      h_global_C,
-                                                      d_global_C);
-
-gemm_time_measure<double, 4, 8, 8, 56, 4, 4, 8, 4, 8>(max_m,
-                                                      max_n,
-                                                      d_m,
-                                                      d_n,
-                                                      d_k,
-                                                      d_global_A_array,
-                                                      d_global_lda,
-                                                      d_global_B_array,
-                                                      d_global_ldb,
-                                                      d_global_C_array,
-                                                      d_global_ldc,
-                                                      batchCount,
-                                                      temp_stream,
-                                                      fastest_time,
-                                                      fastest_algo,
-                                                      cpu_result,
-                                                      h_global_C,
-                                                      d_global_C);
-
-gemm_time_measure<double, 4, 8, 8, 64, 4, 4, 8, 4, 8>(max_m,
-                                                      max_n,
-                                                      d_m,
-                                                      d_n,
-                                                      d_k,
-                                                      d_global_A_array,
-                                                      d_global_lda,
-                                                      d_global_B_array,
-                                                      d_global_ldb,
-                                                      d_global_C_array,
-                                                      d_global_ldc,
-                                                      batchCount,
-                                                      temp_stream,
-                                                      fastest_time,
-                                                      fastest_algo,
-                                                      cpu_result,
-                                                      h_global_C,
-                                                      d_global_C);
-
-gemm_time_measure<double, 4, 8, 16, 16, 4, 4, 8, 4, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 4, 8, 16, 16, 8, 4, 8, 4, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 4, 8, 16, 16, 12, 4, 8, 4, 8>(max_m,
-                                                        max_n,
-                                                        d_m,
-                                                        d_n,
-                                                        d_k,
-                                                        d_global_A_array,
-                                                        d_global_lda,
-                                                        d_global_B_array,
-                                                        d_global_ldb,
-                                                        d_global_C_array,
-                                                        d_global_ldc,
-                                                        batchCount,
-                                                        temp_stream,
-                                                        fastest_time,
-                                                        fastest_algo,
-                                                        cpu_result,
-                                                        h_global_C,
-                                                        d_global_C);
-
-gemm_time_measure<double, 4, 8, 16, 24, 4, 4, 8, 4, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 4, 8, 16, 24, 8, 4, 8, 4, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 4, 8, 16, 32, 4, 4, 8, 4, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 4, 8, 16, 32, 8, 4, 8, 4, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 4, 8, 16, 40, 4, 4, 8, 4, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 4, 8, 16, 48, 4, 4, 8, 4, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 4, 8, 16, 56, 4, 4, 8, 4, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 4, 8, 24, 16, 4, 4, 8, 4, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 4, 8, 24, 16, 8, 4, 8, 4, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 4, 8, 24, 24, 4, 4, 8, 4, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 4, 8, 24, 24, 8, 4, 8, 4, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 4, 8, 24, 32, 4, 4, 8, 4, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 4, 8, 24, 40, 4, 4, 8, 4, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 4, 8, 32, 16, 4, 4, 8, 4, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 4, 8, 32, 16, 8, 4, 8, 4, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 4, 8, 32, 24, 4, 4, 8, 4, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 4, 8, 40, 16, 4, 4, 8, 4, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 4, 8, 40, 24, 4, 4, 8, 4, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 4, 8, 48, 16, 4, 4, 8, 4, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 4, 8, 56, 16, 4, 4, 8, 4, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 4, 16, 16, 32, 4, 4, 16, 4, 16>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 4, 16, 16, 32, 8, 4, 16, 4, 16>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 4, 16, 16, 32, 12, 4, 16, 4, 16>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 4, 16, 16, 32, 16, 4, 16, 4, 16>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 4, 16, 16, 48, 4, 4, 16, 4, 16>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 4, 16, 16, 48, 8, 4, 16, 4, 16>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 4, 16, 16, 48, 12, 4, 16, 4, 16>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 4, 16, 16, 64, 4, 4, 16, 4, 16>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 4, 16, 16, 64, 8, 4, 16, 4, 16>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 4, 16, 32, 32, 4, 4, 16, 4, 16>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 4, 16, 32, 32, 8, 4, 16, 4, 16>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 4, 16, 32, 32, 12, 4, 16, 4, 16>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 4, 16, 32, 48, 4, 4, 16, 4, 16>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 4, 16, 32, 48, 8, 4, 16, 4, 16>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 4, 16, 48, 32, 4, 4, 16, 4, 16>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 4, 16, 48, 32, 8, 4, 16, 4, 16>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 4, 24, 24, 48, 4, 4, 24, 4, 24>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 4, 24, 24, 48, 8, 4, 24, 4, 24>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 4, 24, 24, 48, 12, 4, 24, 4, 24>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 4, 24, 48, 48, 4, 4, 24, 4, 24>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 4, 24, 48, 48, 8, 4, 24, 4, 24>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 4, 32, 32, 64, 4, 4, 32, 4, 32>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 4, 32, 32, 64, 8, 4, 32, 4, 32>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 4, 32, 32, 64, 12, 4, 32, 4, 32>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 4, 32, 32, 64, 16, 4, 32, 4, 32>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 6, 16, 48, 32, 6, 6, 16, 6, 16>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 6, 16, 48, 32, 12, 6, 16, 6, 16>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 6, 16, 48, 48, 6, 6, 16, 6, 16>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 8, 4, 16, 12, 8, 8, 4, 8, 4>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 8, 4, 16, 16, 8, 8, 4, 8, 4>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 8, 4, 16, 20, 8, 8, 4, 8, 4>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 8, 4, 16, 24, 8, 8, 4, 8, 4>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 8, 4, 16, 28, 8, 8, 4, 8, 4>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 8, 4, 16, 32, 8, 8, 4, 8, 4>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 8, 4, 24, 8, 8, 8, 4, 8, 4>(max_m,
-                                                      max_n,
-                                                      d_m,
-                                                      d_n,
-                                                      d_k,
-                                                      d_global_A_array,
-                                                      d_global_lda,
-                                                      d_global_B_array,
-                                                      d_global_ldb,
-                                                      d_global_C_array,
-                                                      d_global_ldc,
-                                                      batchCount,
-                                                      temp_stream,
-                                                      fastest_time,
-                                                      fastest_algo,
-                                                      cpu_result,
-                                                      h_global_C,
-                                                      d_global_C);
-
-gemm_time_measure<double, 8, 4, 24, 12, 8, 8, 4, 8, 4>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 8, 4, 24, 16, 8, 8, 4, 8, 4>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 8, 4, 24, 20, 8, 8, 4, 8, 4>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 8, 4, 24, 24, 8, 8, 4, 8, 4>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 8, 4, 32, 8, 8, 8, 4, 8, 4>(max_m,
-                                                      max_n,
-                                                      d_m,
-                                                      d_n,
-                                                      d_k,
-                                                      d_global_A_array,
-                                                      d_global_lda,
-                                                      d_global_B_array,
-                                                      d_global_ldb,
-                                                      d_global_C_array,
-                                                      d_global_ldc,
-                                                      batchCount,
-                                                      temp_stream,
-                                                      fastest_time,
-                                                      fastest_algo,
-                                                      cpu_result,
-                                                      h_global_C,
-                                                      d_global_C);
-
-gemm_time_measure<double, 8, 4, 32, 12, 8, 8, 4, 8, 4>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 8, 4, 32, 16, 8, 8, 4, 8, 4>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 8, 4, 40, 8, 8, 8, 4, 8, 4>(max_m,
-                                                      max_n,
-                                                      d_m,
-                                                      d_n,
-                                                      d_k,
-                                                      d_global_A_array,
-                                                      d_global_lda,
-                                                      d_global_B_array,
-                                                      d_global_ldb,
-                                                      d_global_C_array,
-                                                      d_global_ldc,
-                                                      batchCount,
-                                                      temp_stream,
-                                                      fastest_time,
-                                                      fastest_algo,
-                                                      cpu_result,
-                                                      h_global_C,
-                                                      d_global_C);
-
-gemm_time_measure<double, 8, 8, 16, 24, 8, 8, 8, 8, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 8, 8, 16, 24, 16, 8, 8, 8, 8>(max_m,
-                                                        max_n,
-                                                        d_m,
-                                                        d_n,
-                                                        d_k,
-                                                        d_global_A_array,
-                                                        d_global_lda,
-                                                        d_global_B_array,
-                                                        d_global_ldb,
-                                                        d_global_C_array,
-                                                        d_global_ldc,
-                                                        batchCount,
-                                                        temp_stream,
-                                                        fastest_time,
-                                                        fastest_algo,
-                                                        cpu_result,
-                                                        h_global_C,
-                                                        d_global_C);
-
-gemm_time_measure<double, 8, 8, 16, 32, 8, 8, 8, 8, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 8, 8, 16, 32, 16, 8, 8, 8, 8>(max_m,
-                                                        max_n,
-                                                        d_m,
-                                                        d_n,
-                                                        d_k,
-                                                        d_global_A_array,
-                                                        d_global_lda,
-                                                        d_global_B_array,
-                                                        d_global_ldb,
-                                                        d_global_C_array,
-                                                        d_global_ldc,
-                                                        batchCount,
-                                                        temp_stream,
-                                                        fastest_time,
-                                                        fastest_algo,
-                                                        cpu_result,
-                                                        h_global_C,
-                                                        d_global_C);
-
-gemm_time_measure<double, 8, 8, 16, 40, 8, 8, 8, 8, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 8, 8, 16, 48, 8, 8, 8, 8, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 8, 8, 16, 56, 8, 8, 8, 8, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 8, 8, 16, 64, 8, 8, 8, 8, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 8, 8, 24, 16, 8, 8, 8, 8, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 8, 8, 24, 16, 16, 8, 8, 8, 8>(max_m,
-                                                        max_n,
-                                                        d_m,
-                                                        d_n,
-                                                        d_k,
-                                                        d_global_A_array,
-                                                        d_global_lda,
-                                                        d_global_B_array,
-                                                        d_global_ldb,
-                                                        d_global_C_array,
-                                                        d_global_ldc,
-                                                        batchCount,
-                                                        temp_stream,
-                                                        fastest_time,
-                                                        fastest_algo,
-                                                        cpu_result,
-                                                        h_global_C,
-                                                        d_global_C);
-
-gemm_time_measure<double, 8, 8, 24, 24, 8, 8, 8, 8, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 8, 8, 24, 24, 16, 8, 8, 8, 8>(max_m,
-                                                        max_n,
-                                                        d_m,
-                                                        d_n,
-                                                        d_k,
-                                                        d_global_A_array,
-                                                        d_global_lda,
-                                                        d_global_B_array,
-                                                        d_global_ldb,
-                                                        d_global_C_array,
-                                                        d_global_ldc,
-                                                        batchCount,
-                                                        temp_stream,
-                                                        fastest_time,
-                                                        fastest_algo,
-                                                        cpu_result,
-                                                        h_global_C,
-                                                        d_global_C);
-
-gemm_time_measure<double, 8, 8, 24, 32, 8, 8, 8, 8, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 8, 8, 24, 40, 8, 8, 8, 8, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 8, 8, 24, 48, 8, 8, 8, 8, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 8, 8, 24, 56, 8, 8, 8, 8, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 8, 8, 24, 64, 8, 8, 8, 8, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 8, 8, 32, 16, 8, 8, 8, 8, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 8, 8, 32, 16, 16, 8, 8, 8, 8>(max_m,
-                                                        max_n,
-                                                        d_m,
-                                                        d_n,
-                                                        d_k,
-                                                        d_global_A_array,
-                                                        d_global_lda,
-                                                        d_global_B_array,
-                                                        d_global_ldb,
-                                                        d_global_C_array,
-                                                        d_global_ldc,
-                                                        batchCount,
-                                                        temp_stream,
-                                                        fastest_time,
-                                                        fastest_algo,
-                                                        cpu_result,
-                                                        h_global_C,
-                                                        d_global_C);
-
-gemm_time_measure<double, 8, 8, 32, 24, 8, 8, 8, 8, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 8, 8, 32, 32, 8, 8, 8, 8, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 8, 8, 32, 40, 8, 8, 8, 8, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 8, 8, 32, 48, 8, 8, 8, 8, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 8, 8, 32, 56, 8, 8, 8, 8, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 8, 8, 40, 16, 8, 8, 8, 8, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 8, 8, 40, 24, 8, 8, 8, 8, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 8, 8, 40, 32, 8, 8, 8, 8, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 8, 8, 40, 40, 8, 8, 8, 8, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 8, 8, 40, 48, 8, 8, 8, 8, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 8, 8, 48, 16, 8, 8, 8, 8, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 8, 8, 48, 24, 8, 8, 8, 8, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 8, 8, 48, 32, 8, 8, 8, 8, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 8, 8, 48, 40, 8, 8, 8, 8, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 8, 8, 56, 16, 8, 8, 8, 8, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 8, 8, 56, 24, 8, 8, 8, 8, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 8, 8, 56, 32, 8, 8, 8, 8, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 8, 8, 64, 16, 8, 8, 8, 8, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 8, 8, 64, 24, 8, 8, 8, 8, 8>(max_m,
-                                                       max_n,
-                                                       d_m,
-                                                       d_n,
-                                                       d_k,
-                                                       d_global_A_array,
-                                                       d_global_lda,
-                                                       d_global_B_array,
-                                                       d_global_ldb,
-                                                       d_global_C_array,
-                                                       d_global_ldc,
-                                                       batchCount,
-                                                       temp_stream,
-                                                       fastest_time,
-                                                       fastest_algo,
-                                                       cpu_result,
-                                                       h_global_C,
-                                                       d_global_C);
-
-gemm_time_measure<double, 8, 12, 24, 24, 8, 8, 12, 8, 12>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 8, 12, 24, 24, 16, 8, 12, 8, 12>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 8, 12, 24, 36, 8, 8, 12, 8, 12>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 8, 12, 24, 36, 16, 8, 12, 8, 12>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 8, 12, 24, 48, 8, 8, 12, 8, 12>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 8, 12, 24, 60, 8, 8, 12, 8, 12>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 8, 12, 48, 24, 8, 8, 12, 8, 12>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 8, 12, 48, 36, 8, 8, 12, 8, 12>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 8, 12, 48, 48, 8, 8, 12, 8, 12>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 8, 12, 48, 60, 8, 8, 12, 8, 12>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 8, 16, 16, 48, 8, 8, 16, 8, 16>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 8, 16, 16, 48, 16, 8, 16, 8, 16>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 8, 16, 16, 48, 24, 8, 16, 8, 16>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 8, 16, 16, 64, 8, 8, 16, 8, 16>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 8, 16, 16, 64, 16, 8, 16, 8, 16>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 8, 16, 32, 32, 8, 8, 16, 8, 16>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 8, 16, 32, 32, 16, 8, 16, 8, 16>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 8, 16, 32, 32, 24, 8, 16, 8, 16>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 8, 16, 32, 48, 8, 8, 16, 8, 16>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 8, 16, 32, 48, 16, 8, 16, 8, 16>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 8, 16, 32, 64, 8, 8, 16, 8, 16>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 8, 16, 32, 64, 16, 8, 16, 8, 16>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 8, 16, 48, 32, 8, 8, 16, 8, 16>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 8, 16, 48, 32, 16, 8, 16, 8, 16>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 8, 16, 48, 48, 8, 8, 16, 8, 16>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 8, 16, 48, 48, 16, 8, 16, 8, 16>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 8, 16, 48, 64, 8, 8, 16, 8, 16>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 8, 16, 64, 32, 8, 8, 16, 8, 16>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 8, 16, 64, 32, 16, 8, 16, 8, 16>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 8, 16, 64, 48, 8, 8, 16, 8, 16>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 8, 20, 40, 40, 8, 8, 20, 8, 20>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 8, 20, 40, 40, 16, 8, 20, 8, 20>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 8, 20, 40, 60, 8, 8, 20, 8, 20>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 8, 24, 24, 48, 8, 8, 24, 8, 24>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 8, 24, 24, 48, 16, 8, 24, 8, 24>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 8, 24, 24, 48, 24, 8, 24, 8, 24>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 8, 24, 48, 48, 8, 8, 24, 8, 24>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 8, 24, 48, 48, 16, 8, 24, 8, 24>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 8, 28, 56, 56, 8, 8, 28, 8, 28>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 8, 28, 56, 56, 16, 8, 28, 8, 28>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 8, 32, 32, 64, 8, 8, 32, 8, 32>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 8, 32, 32, 64, 16, 8, 32, 8, 32>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 8, 32, 32, 64, 24, 8, 32, 8, 32>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 8, 32, 32, 64, 32, 8, 32, 8, 32>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 8, 32, 64, 64, 8, 8, 32, 8, 32>(max_m,
-                                                          max_n,
-                                                          d_m,
-                                                          d_n,
-                                                          d_k,
-                                                          d_global_A_array,
-                                                          d_global_lda,
-                                                          d_global_B_array,
-                                                          d_global_ldb,
-                                                          d_global_C_array,
-                                                          d_global_ldc,
-                                                          batchCount,
-                                                          temp_stream,
-                                                          fastest_time,
-                                                          fastest_algo,
-                                                          cpu_result,
-                                                          h_global_C,
-                                                          d_global_C);
-
-gemm_time_measure<double, 8, 32, 64, 64, 16, 8, 32, 8, 32>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 8, 32, 64, 64, 24, 8, 32, 8, 32>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 12, 8, 24, 24, 12, 12, 8, 12, 8>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 12, 8, 24, 32, 12, 12, 8, 12, 8>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 12, 8, 24, 40, 12, 12, 8, 12, 8>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 12, 8, 24, 48, 12, 12, 8, 12, 8>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 12, 8, 24, 56, 12, 12, 8, 12, 8>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 12, 8, 48, 16, 12, 12, 8, 12, 8>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 12, 8, 48, 24, 12, 12, 8, 12, 8>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 12, 8, 48, 32, 12, 12, 8, 12, 8>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 12, 16, 48, 32, 12, 12, 16, 12, 16>(max_m,
-                                                              max_n,
-                                                              d_m,
-                                                              d_n,
-                                                              d_k,
-                                                              d_global_A_array,
-                                                              d_global_lda,
-                                                              d_global_B_array,
-                                                              d_global_ldb,
-                                                              d_global_C_array,
-                                                              d_global_ldc,
-                                                              batchCount,
-                                                              temp_stream,
-                                                              fastest_time,
-                                                              fastest_algo,
-                                                              cpu_result,
-                                                              h_global_C,
-                                                              d_global_C);
-
-gemm_time_measure<double, 12, 16, 48, 32, 24, 12, 16, 12, 16>(max_m,
-                                                              max_n,
-                                                              d_m,
-                                                              d_n,
-                                                              d_k,
-                                                              d_global_A_array,
-                                                              d_global_lda,
-                                                              d_global_B_array,
-                                                              d_global_ldb,
-                                                              d_global_C_array,
-                                                              d_global_ldc,
-                                                              batchCount,
-                                                              temp_stream,
-                                                              fastest_time,
-                                                              fastest_algo,
-                                                              cpu_result,
-                                                              h_global_C,
-                                                              d_global_C);
-
-gemm_time_measure<double, 12, 16, 48, 48, 12, 12, 16, 12, 16>(max_m,
-                                                              max_n,
-                                                              d_m,
-                                                              d_n,
-                                                              d_k,
-                                                              d_global_A_array,
-                                                              d_global_lda,
-                                                              d_global_B_array,
-                                                              d_global_ldb,
-                                                              d_global_C_array,
-                                                              d_global_ldc,
-                                                              batchCount,
-                                                              temp_stream,
-                                                              fastest_time,
-                                                              fastest_algo,
-                                                              cpu_result,
-                                                              h_global_C,
-                                                              d_global_C);
-
-gemm_time_measure<double, 12, 16, 48, 64, 12, 12, 16, 12, 16>(max_m,
-                                                              max_n,
-                                                              d_m,
-                                                              d_n,
-                                                              d_k,
-                                                              d_global_A_array,
-                                                              d_global_lda,
-                                                              d_global_B_array,
-                                                              d_global_ldb,
-                                                              d_global_C_array,
-                                                              d_global_ldc,
-                                                              batchCount,
-                                                              temp_stream,
-                                                              fastest_time,
-                                                              fastest_algo,
-                                                              cpu_result,
-                                                              h_global_C,
-                                                              d_global_C);
-
-gemm_time_measure<double, 12, 24, 48, 48, 12, 12, 24, 12, 24>(max_m,
-                                                              max_n,
-                                                              d_m,
-                                                              d_n,
-                                                              d_k,
-                                                              d_global_A_array,
-                                                              d_global_lda,
-                                                              d_global_B_array,
-                                                              d_global_ldb,
-                                                              d_global_C_array,
-                                                              d_global_ldc,
-                                                              batchCount,
-                                                              temp_stream,
-                                                              fastest_time,
-                                                              fastest_algo,
-                                                              cpu_result,
-                                                              h_global_C,
-                                                              d_global_C);
-
-gemm_time_measure<double, 12, 24, 48, 48, 24, 12, 24, 12, 24>(max_m,
-                                                              max_n,
-                                                              d_m,
-                                                              d_n,
-                                                              d_k,
-                                                              d_global_A_array,
-                                                              d_global_lda,
-                                                              d_global_B_array,
-                                                              d_global_ldb,
-                                                              d_global_C_array,
-                                                              d_global_ldc,
-                                                              batchCount,
-                                                              temp_stream,
-                                                              fastest_time,
-                                                              fastest_algo,
-                                                              cpu_result,
-                                                              h_global_C,
-                                                              d_global_C);
-
-gemm_time_measure<double, 16, 4, 32, 12, 16, 16, 4, 16, 4>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 16, 4, 32, 16, 16, 16, 4, 16, 4>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 16, 6, 48, 12, 16, 16, 6, 16, 6>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 16, 8, 32, 24, 16, 16, 8, 16, 8>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 16, 8, 32, 32, 16, 16, 8, 16, 8>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 16, 8, 32, 40, 16, 16, 8, 16, 8>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 16, 8, 32, 48, 16, 16, 8, 16, 8>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 16, 8, 32, 56, 16, 16, 8, 16, 8>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 16, 8, 32, 64, 16, 16, 8, 16, 8>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 16, 8, 48, 16, 16, 16, 8, 16, 8>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 16, 8, 48, 24, 16, 16, 8, 16, 8>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 16, 8, 48, 32, 16, 16, 8, 16, 8>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 16, 8, 48, 40, 16, 16, 8, 16, 8>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 16, 8, 48, 48, 16, 16, 8, 16, 8>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 16, 8, 64, 16, 16, 16, 8, 16, 8>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 16, 8, 64, 24, 16, 16, 8, 16, 8>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 16, 8, 64, 32, 16, 16, 8, 16, 8>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 16, 12, 48, 24, 16, 16, 12, 16, 12>(max_m,
-                                                              max_n,
-                                                              d_m,
-                                                              d_n,
-                                                              d_k,
-                                                              d_global_A_array,
-                                                              d_global_lda,
-                                                              d_global_B_array,
-                                                              d_global_ldb,
-                                                              d_global_C_array,
-                                                              d_global_ldc,
-                                                              batchCount,
-                                                              temp_stream,
-                                                              fastest_time,
-                                                              fastest_algo,
-                                                              cpu_result,
-                                                              h_global_C,
-                                                              d_global_C);
-
-gemm_time_measure<double, 16, 12, 48, 36, 16, 16, 12, 16, 12>(max_m,
-                                                              max_n,
-                                                              d_m,
-                                                              d_n,
-                                                              d_k,
-                                                              d_global_A_array,
-                                                              d_global_lda,
-                                                              d_global_B_array,
-                                                              d_global_ldb,
-                                                              d_global_C_array,
-                                                              d_global_ldc,
-                                                              batchCount,
-                                                              temp_stream,
-                                                              fastest_time,
-                                                              fastest_algo,
-                                                              cpu_result,
-                                                              h_global_C,
-                                                              d_global_C);
-
-gemm_time_measure<double, 16, 12, 48, 48, 16, 16, 12, 16, 12>(max_m,
-                                                              max_n,
-                                                              d_m,
-                                                              d_n,
-                                                              d_k,
-                                                              d_global_A_array,
-                                                              d_global_lda,
-                                                              d_global_B_array,
-                                                              d_global_ldb,
-                                                              d_global_C_array,
-                                                              d_global_ldc,
-                                                              batchCount,
-                                                              temp_stream,
-                                                              fastest_time,
-                                                              fastest_algo,
-                                                              cpu_result,
-                                                              h_global_C,
-                                                              d_global_C);
-
-gemm_time_measure<double, 16, 12, 48, 60, 16, 16, 12, 16, 12>(max_m,
-                                                              max_n,
-                                                              d_m,
-                                                              d_n,
-                                                              d_k,
-                                                              d_global_A_array,
-                                                              d_global_lda,
-                                                              d_global_B_array,
-                                                              d_global_ldb,
-                                                              d_global_C_array,
-                                                              d_global_ldc,
-                                                              batchCount,
-                                                              temp_stream,
-                                                              fastest_time,
-                                                              fastest_algo,
-                                                              cpu_result,
-                                                              h_global_C,
-                                                              d_global_C);
-
-gemm_time_measure<double, 16, 16, 32, 48, 16, 16, 16, 16, 16>(max_m,
-                                                              max_n,
-                                                              d_m,
-                                                              d_n,
-                                                              d_k,
-                                                              d_global_A_array,
-                                                              d_global_lda,
-                                                              d_global_B_array,
-                                                              d_global_ldb,
-                                                              d_global_C_array,
-                                                              d_global_ldc,
-                                                              batchCount,
-                                                              temp_stream,
-                                                              fastest_time,
-                                                              fastest_algo,
-                                                              cpu_result,
-                                                              h_global_C,
-                                                              d_global_C);
-
-gemm_time_measure<double, 16, 16, 32, 48, 32, 16, 16, 16, 16>(max_m,
-                                                              max_n,
-                                                              d_m,
-                                                              d_n,
-                                                              d_k,
-                                                              d_global_A_array,
-                                                              d_global_lda,
-                                                              d_global_B_array,
-                                                              d_global_ldb,
-                                                              d_global_C_array,
-                                                              d_global_ldc,
-                                                              batchCount,
-                                                              temp_stream,
-                                                              fastest_time,
-                                                              fastest_algo,
-                                                              cpu_result,
-                                                              h_global_C,
-                                                              d_global_C);
-
-gemm_time_measure<double, 16, 16, 32, 64, 16, 16, 16, 16, 16>(max_m,
-                                                              max_n,
-                                                              d_m,
-                                                              d_n,
-                                                              d_k,
-                                                              d_global_A_array,
-                                                              d_global_lda,
-                                                              d_global_B_array,
-                                                              d_global_ldb,
-                                                              d_global_C_array,
-                                                              d_global_ldc,
-                                                              batchCount,
-                                                              temp_stream,
-                                                              fastest_time,
-                                                              fastest_algo,
-                                                              cpu_result,
-                                                              h_global_C,
-                                                              d_global_C);
-
-gemm_time_measure<double, 16, 16, 32, 64, 32, 16, 16, 16, 16>(max_m,
-                                                              max_n,
-                                                              d_m,
-                                                              d_n,
-                                                              d_k,
-                                                              d_global_A_array,
-                                                              d_global_lda,
-                                                              d_global_B_array,
-                                                              d_global_ldb,
-                                                              d_global_C_array,
-                                                              d_global_ldc,
-                                                              batchCount,
-                                                              temp_stream,
-                                                              fastest_time,
-                                                              fastest_algo,
-                                                              cpu_result,
-                                                              h_global_C,
-                                                              d_global_C);
-
-gemm_time_measure<double, 16, 16, 48, 32, 16, 16, 16, 16, 16>(max_m,
-                                                              max_n,
-                                                              d_m,
-                                                              d_n,
-                                                              d_k,
-                                                              d_global_A_array,
-                                                              d_global_lda,
-                                                              d_global_B_array,
-                                                              d_global_ldb,
-                                                              d_global_C_array,
-                                                              d_global_ldc,
-                                                              batchCount,
-                                                              temp_stream,
-                                                              fastest_time,
-                                                              fastest_algo,
-                                                              cpu_result,
-                                                              h_global_C,
-                                                              d_global_C);
-
-gemm_time_measure<double, 16, 16, 48, 32, 32, 16, 16, 16, 16>(max_m,
-                                                              max_n,
-                                                              d_m,
-                                                              d_n,
-                                                              d_k,
-                                                              d_global_A_array,
-                                                              d_global_lda,
-                                                              d_global_B_array,
-                                                              d_global_ldb,
-                                                              d_global_C_array,
-                                                              d_global_ldc,
-                                                              batchCount,
-                                                              temp_stream,
-                                                              fastest_time,
-                                                              fastest_algo,
-                                                              cpu_result,
-                                                              h_global_C,
-                                                              d_global_C);
-
-gemm_time_measure<double, 16, 16, 48, 48, 16, 16, 16, 16, 16>(max_m,
-                                                              max_n,
-                                                              d_m,
-                                                              d_n,
-                                                              d_k,
-                                                              d_global_A_array,
-                                                              d_global_lda,
-                                                              d_global_B_array,
-                                                              d_global_ldb,
-                                                              d_global_C_array,
-                                                              d_global_ldc,
-                                                              batchCount,
-                                                              temp_stream,
-                                                              fastest_time,
-                                                              fastest_algo,
-                                                              cpu_result,
-                                                              h_global_C,
-                                                              d_global_C);
-
-gemm_time_measure<double, 16, 16, 48, 48, 32, 16, 16, 16, 16>(max_m,
-                                                              max_n,
-                                                              d_m,
-                                                              d_n,
-                                                              d_k,
-                                                              d_global_A_array,
-                                                              d_global_lda,
-                                                              d_global_B_array,
-                                                              d_global_ldb,
-                                                              d_global_C_array,
-                                                              d_global_ldc,
-                                                              batchCount,
-                                                              temp_stream,
-                                                              fastest_time,
-                                                              fastest_algo,
-                                                              cpu_result,
-                                                              h_global_C,
-                                                              d_global_C);
-
-gemm_time_measure<double, 16, 16, 48, 64, 16, 16, 16, 16, 16>(max_m,
-                                                              max_n,
-                                                              d_m,
-                                                              d_n,
-                                                              d_k,
-                                                              d_global_A_array,
-                                                              d_global_lda,
-                                                              d_global_B_array,
-                                                              d_global_ldb,
-                                                              d_global_C_array,
-                                                              d_global_ldc,
-                                                              batchCount,
-                                                              temp_stream,
-                                                              fastest_time,
-                                                              fastest_algo,
-                                                              cpu_result,
-                                                              h_global_C,
-                                                              d_global_C);
-
-gemm_time_measure<double, 16, 16, 64, 32, 16, 16, 16, 16, 16>(max_m,
-                                                              max_n,
-                                                              d_m,
-                                                              d_n,
-                                                              d_k,
-                                                              d_global_A_array,
-                                                              d_global_lda,
-                                                              d_global_B_array,
-                                                              d_global_ldb,
-                                                              d_global_C_array,
-                                                              d_global_ldc,
-                                                              batchCount,
-                                                              temp_stream,
-                                                              fastest_time,
-                                                              fastest_algo,
-                                                              cpu_result,
-                                                              h_global_C,
-                                                              d_global_C);
-
-gemm_time_measure<double, 16, 16, 64, 32, 32, 16, 16, 16, 16>(max_m,
-                                                              max_n,
-                                                              d_m,
-                                                              d_n,
-                                                              d_k,
-                                                              d_global_A_array,
-                                                              d_global_lda,
-                                                              d_global_B_array,
-                                                              d_global_ldb,
-                                                              d_global_C_array,
-                                                              d_global_ldc,
-                                                              batchCount,
-                                                              temp_stream,
-                                                              fastest_time,
-                                                              fastest_algo,
-                                                              cpu_result,
-                                                              h_global_C,
-                                                              d_global_C);
-
-gemm_time_measure<double, 16, 16, 64, 48, 16, 16, 16, 16, 16>(max_m,
-                                                              max_n,
-                                                              d_m,
-                                                              d_n,
-                                                              d_k,
-                                                              d_global_A_array,
-                                                              d_global_lda,
-                                                              d_global_B_array,
-                                                              d_global_ldb,
-                                                              d_global_C_array,
-                                                              d_global_ldc,
-                                                              batchCount,
-                                                              temp_stream,
-                                                              fastest_time,
-                                                              fastest_algo,
-                                                              cpu_result,
-                                                              h_global_C,
-                                                              d_global_C);
-
-gemm_time_measure<double, 16, 16, 64, 64, 16, 16, 16, 16, 16>(max_m,
-                                                              max_n,
-                                                              d_m,
-                                                              d_n,
-                                                              d_k,
-                                                              d_global_A_array,
-                                                              d_global_lda,
-                                                              d_global_B_array,
-                                                              d_global_ldb,
-                                                              d_global_C_array,
-                                                              d_global_ldc,
-                                                              batchCount,
-                                                              temp_stream,
-                                                              fastest_time,
-                                                              fastest_algo,
-                                                              cpu_result,
-                                                              h_global_C,
-                                                              d_global_C);
-
-gemm_time_measure<double, 16, 24, 48, 48, 16, 16, 24, 16, 24>(max_m,
-                                                              max_n,
-                                                              d_m,
-                                                              d_n,
-                                                              d_k,
-                                                              d_global_A_array,
-                                                              d_global_lda,
-                                                              d_global_B_array,
-                                                              d_global_ldb,
-                                                              d_global_C_array,
-                                                              d_global_ldc,
-                                                              batchCount,
-                                                              temp_stream,
-                                                              fastest_time,
-                                                              fastest_algo,
-                                                              cpu_result,
-                                                              h_global_C,
-                                                              d_global_C);
-
-gemm_time_measure<double, 16, 24, 48, 48, 32, 16, 24, 16, 24>(max_m,
-                                                              max_n,
-                                                              d_m,
-                                                              d_n,
-                                                              d_k,
-                                                              d_global_A_array,
-                                                              d_global_lda,
-                                                              d_global_B_array,
-                                                              d_global_ldb,
-                                                              d_global_C_array,
-                                                              d_global_ldc,
-                                                              batchCount,
-                                                              temp_stream,
-                                                              fastest_time,
-                                                              fastest_algo,
-                                                              cpu_result,
-                                                              h_global_C,
-                                                              d_global_C);
-
-gemm_time_measure<double, 16, 32, 64, 64, 16, 16, 32, 16, 32>(max_m,
-                                                              max_n,
-                                                              d_m,
-                                                              d_n,
-                                                              d_k,
-                                                              d_global_A_array,
-                                                              d_global_lda,
-                                                              d_global_B_array,
-                                                              d_global_ldb,
-                                                              d_global_C_array,
-                                                              d_global_ldc,
-                                                              batchCount,
-                                                              temp_stream,
-                                                              fastest_time,
-                                                              fastest_algo,
-                                                              cpu_result,
-                                                              h_global_C,
-                                                              d_global_C);
-
-gemm_time_measure<double, 16, 32, 64, 64, 32, 16, 32, 16, 32>(max_m,
-                                                              max_n,
-                                                              d_m,
-                                                              d_n,
-                                                              d_k,
-                                                              d_global_A_array,
-                                                              d_global_lda,
-                                                              d_global_B_array,
-                                                              d_global_ldb,
-                                                              d_global_C_array,
-                                                              d_global_ldc,
-                                                              batchCount,
-                                                              temp_stream,
-                                                              fastest_time,
-                                                              fastest_algo,
-                                                              cpu_result,
-                                                              h_global_C,
-                                                              d_global_C);
-
-gemm_time_measure<double, 20, 8, 40, 24, 20, 20, 8, 20, 8>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 20, 8, 40, 32, 20, 20, 8, 20, 8>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 24, 8, 48, 24, 24, 24, 8, 24, 8>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 24, 8, 48, 32, 24, 24, 8, 24, 8>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 24, 12, 48, 36, 24, 24, 12, 24, 12>(max_m,
-                                                              max_n,
-                                                              d_m,
-                                                              d_n,
-                                                              d_k,
-                                                              d_global_A_array,
-                                                              d_global_lda,
-                                                              d_global_B_array,
-                                                              d_global_ldb,
-                                                              d_global_C_array,
-                                                              d_global_ldc,
-                                                              batchCount,
-                                                              temp_stream,
-                                                              fastest_time,
-                                                              fastest_algo,
-                                                              cpu_result,
-                                                              h_global_C,
-                                                              d_global_C);
-
-gemm_time_measure<double, 24, 12, 48, 48, 24, 24, 12, 24, 12>(max_m,
-                                                              max_n,
-                                                              d_m,
-                                                              d_n,
-                                                              d_k,
-                                                              d_global_A_array,
-                                                              d_global_lda,
-                                                              d_global_B_array,
-                                                              d_global_ldb,
-                                                              d_global_C_array,
-                                                              d_global_ldc,
-                                                              batchCount,
-                                                              temp_stream,
-                                                              fastest_time,
-                                                              fastest_algo,
-                                                              cpu_result,
-                                                              h_global_C,
-                                                              d_global_C);
-
-gemm_time_measure<double, 24, 12, 48, 60, 24, 24, 12, 24, 12>(max_m,
-                                                              max_n,
-                                                              d_m,
-                                                              d_n,
-                                                              d_k,
-                                                              d_global_A_array,
-                                                              d_global_lda,
-                                                              d_global_B_array,
-                                                              d_global_ldb,
-                                                              d_global_C_array,
-                                                              d_global_ldc,
-                                                              batchCount,
-                                                              temp_stream,
-                                                              fastest_time,
-                                                              fastest_algo,
-                                                              cpu_result,
-                                                              h_global_C,
-                                                              d_global_C);
-
-gemm_time_measure<double, 24, 16, 48, 48, 24, 24, 16, 24, 16>(max_m,
-                                                              max_n,
-                                                              d_m,
-                                                              d_n,
-                                                              d_k,
-                                                              d_global_A_array,
-                                                              d_global_lda,
-                                                              d_global_B_array,
-                                                              d_global_ldb,
-                                                              d_global_C_array,
-                                                              d_global_ldc,
-                                                              batchCount,
-                                                              temp_stream,
-                                                              fastest_time,
-                                                              fastest_algo,
-                                                              cpu_result,
-                                                              h_global_C,
-                                                              d_global_C);
-
-gemm_time_measure<double, 24, 16, 48, 64, 24, 24, 16, 24, 16>(max_m,
-                                                              max_n,
-                                                              d_m,
-                                                              d_n,
-                                                              d_k,
-                                                              d_global_A_array,
-                                                              d_global_lda,
-                                                              d_global_B_array,
-                                                              d_global_ldb,
-                                                              d_global_C_array,
-                                                              d_global_ldc,
-                                                              batchCount,
-                                                              temp_stream,
-                                                              fastest_time,
-                                                              fastest_algo,
-                                                              cpu_result,
-                                                              h_global_C,
-                                                              d_global_C);
-
-gemm_time_measure<double, 32, 8, 64, 24, 32, 32, 8, 32, 8>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 32, 8, 64, 32, 32, 32, 8, 32, 8>(max_m,
-                                                           max_n,
-                                                           d_m,
-                                                           d_n,
-                                                           d_k,
-                                                           d_global_A_array,
-                                                           d_global_lda,
-                                                           d_global_B_array,
-                                                           d_global_ldb,
-                                                           d_global_C_array,
-                                                           d_global_ldc,
-                                                           batchCount,
-                                                           temp_stream,
-                                                           fastest_time,
-                                                           fastest_algo,
-                                                           cpu_result,
-                                                           h_global_C,
-                                                           d_global_C);
-
-gemm_time_measure<double, 32, 16, 64, 48, 32, 32, 16, 32, 16>(max_m,
-                                                              max_n,
-                                                              d_m,
-                                                              d_n,
-                                                              d_k,
-                                                              d_global_A_array,
-                                                              d_global_lda,
-                                                              d_global_B_array,
-                                                              d_global_ldb,
-                                                              d_global_C_array,
-                                                              d_global_ldc,
-                                                              batchCount,
-                                                              temp_stream,
-                                                              fastest_time,
-                                                              fastest_algo,
-                                                              cpu_result,
-                                                              h_global_C,
-                                                              d_global_C);
-
-gemm_time_measure<double, 32, 16, 64, 64, 32, 32, 16, 32, 16>(max_m,
-                                                              max_n,
-                                                              d_m,
-                                                              d_n,
-                                                              d_k,
-                                                              d_global_A_array,
-                                                              d_global_lda,
-                                                              d_global_B_array,
-                                                              d_global_ldb,
-                                                              d_global_C_array,
-                                                              d_global_ldc,
-                                                              batchCount,
-                                                              temp_stream,
-                                                              fastest_time,
-                                                              fastest_algo,
-                                                              cpu_result,
-                                                              h_global_C,
-                                                              d_global_C);
diff --git a/source/source_lcao/module_gint/kernels/cuda/code_gen.cuh b/source/source_lcao/module_gint/kernels/cuda/code_gen.cuh
deleted file mode 100644
index a4b1a75916..0000000000
--- a/source/source_lcao/module_gint/kernels/cuda/code_gen.cuh
+++ /dev/null
@@ -1,473 +0,0 @@
-#ifndef CODE_GEN_CUH
-#define CODE_GEN_CUH
-
-#include "gemm_selector.cuh"
-#include <cuda_runtime.h>
-
-extern template void gemm_time_measure<double, 2, 16, 16, 32, 2, 2, 16, 2, 16>(int, int, int*, int*, int*, double**, int*, double**, int*, double**, int*, int, cudaStream_t, float&, matrix_multiple_func_type&, double*, double*, double*);
-
-extern template void gemm_time_measure<double, 2, 16, 16, 32, 4, 2, 16, 2, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 2, 16, 16, 32, 6, 2, 16, 2, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 2, 16, 16, 32, 8, 2, 16, 2, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 2, 16, 16, 48, 2, 2, 16, 2, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 2, 16, 16, 48, 4, 2, 16, 2, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 2, 16, 16, 48, 6, 2, 16, 2, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 8, 8, 24, 4, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 8, 8, 24, 8, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 8, 8, 24, 12, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 8, 8, 32, 4, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 8, 8, 32, 8, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 8, 8, 40, 4, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 8, 8, 40, 8, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 8, 8, 48, 4, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 8, 8, 56, 4, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 8, 8, 64, 4, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 8, 16, 16, 4, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 8, 16, 16, 8, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 8, 16, 16, 12, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 8, 16, 24, 4, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 8, 16, 24, 8, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 8, 16, 32, 4, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 8, 16, 32, 8, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 8, 16, 40, 4, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 8, 16, 48, 4, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 8, 16, 56, 4, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 8, 24, 16, 4, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 8, 24, 16, 8, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 8, 24, 24, 4, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 8, 24, 24, 8, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 8, 24, 32, 4, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 8, 24, 40, 4, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 8, 32, 16, 4, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 8, 32, 16, 8, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 8, 32, 24, 4, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 8, 40, 16, 4, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 8, 40, 24, 4, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 8, 48, 16, 4, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 8, 56, 16, 4, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 16, 16, 32, 4, 4, 16, 4, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 16, 16, 32, 8, 4, 16, 4, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 16, 16, 32, 12, 4, 16, 4, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 16, 16, 32, 16, 4, 16, 4, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 16, 16, 48, 4, 4, 16, 4, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 16, 16, 48, 8, 4, 16, 4, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 16, 16, 48, 12, 4, 16, 4, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 16, 16, 64, 4, 4, 16, 4, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 16, 16, 64, 8, 4, 16, 4, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 16, 32, 32, 4, 4, 16, 4, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 16, 32, 32, 8, 4, 16, 4, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 16, 32, 32, 12, 4, 16, 4, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 16, 32, 48, 4, 4, 16, 4, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 16, 32, 48, 8, 4, 16, 4, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 16, 48, 32, 4, 4, 16, 4, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 16, 48, 32, 8, 4, 16, 4, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 24, 24, 48, 4, 4, 24, 4, 24>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 24, 24, 48, 8, 4, 24, 4, 24>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 24, 24, 48, 12, 4, 24, 4, 24>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 24, 48, 48, 4, 4, 24, 4, 24>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 24, 48, 48, 8, 4, 24, 4, 24>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 32, 32, 64, 4, 4, 32, 4, 32>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 32, 32, 64, 8, 4, 32, 4, 32>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 32, 32, 64, 12, 4, 32, 4, 32>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 4, 32, 32, 64, 16, 4, 32, 4, 32>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 6, 16, 48, 32, 6, 6, 16, 6, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 6, 16, 48, 32, 12, 6, 16, 6, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 6, 16, 48, 48, 6, 6, 16, 6, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 4, 16, 12, 8, 8, 4, 8, 4>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 4, 16, 16, 8, 8, 4, 8, 4>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 4, 16, 20, 8, 8, 4, 8, 4>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 4, 16, 24, 8, 8, 4, 8, 4>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 4, 16, 28, 8, 8, 4, 8, 4>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 4, 16, 32, 8, 8, 4, 8, 4>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 4, 24, 8, 8, 8, 4, 8, 4>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 4, 24, 12, 8, 8, 4, 8, 4>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 4, 24, 16, 8, 8, 4, 8, 4>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 4, 24, 20, 8, 8, 4, 8, 4>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 4, 24, 24, 8, 8, 4, 8, 4>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 4, 32, 8, 8, 8, 4, 8, 4>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 4, 32, 12, 8, 8, 4, 8, 4>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 4, 32, 16, 8, 8, 4, 8, 4>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 4, 40, 8, 8, 8, 4, 8, 4>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 8, 16, 24, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 8, 16, 24, 16, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 8, 16, 32, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 8, 16, 32, 16, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 8, 16, 40, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 8, 16, 48, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 8, 16, 56, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 8, 16, 64, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 8, 24, 16, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 8, 24, 16, 16, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 8, 24, 24, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 8, 24, 24, 16, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 8, 24, 32, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 8, 24, 40, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 8, 24, 48, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 8, 24, 56, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 8, 24, 64, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 8, 32, 16, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 8, 32, 16, 16, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 8, 32, 24, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 8, 32, 32, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 8, 32, 40, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 8, 32, 48, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 8, 32, 56, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 8, 40, 16, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 8, 40, 24, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 8, 40, 32, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 8, 40, 40, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 8, 40, 48, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 8, 48, 16, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 8, 48, 24, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 8, 48, 32, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 8, 48, 40, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 8, 56, 16, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 8, 56, 24, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 8, 56, 32, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 8, 64, 16, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 8, 64, 24, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 12, 24, 24, 8, 8, 12, 8, 12>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 12, 24, 24, 16, 8, 12, 8, 12>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 12, 24, 36, 8, 8, 12, 8, 12>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 12, 24, 36, 16, 8, 12, 8, 12>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 12, 24, 48, 8, 8, 12, 8, 12>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 12, 24, 60, 8, 8, 12, 8, 12>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 12, 48, 24, 8, 8, 12, 8, 12>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 12, 48, 36, 8, 8, 12, 8, 12>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 12, 48, 48, 8, 8, 12, 8, 12>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 12, 48, 60, 8, 8, 12, 8, 12>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 16, 16, 48, 8, 8, 16, 8, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 16, 16, 48, 16, 8, 16, 8, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 16, 16, 48, 24, 8, 16, 8, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 16, 16, 64, 8, 8, 16, 8, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 16, 16, 64, 16, 8, 16, 8, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 16, 32, 32, 8, 8, 16, 8, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 16, 32, 32, 16, 8, 16, 8, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 16, 32, 32, 24, 8, 16, 8, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 16, 32, 48, 8, 8, 16, 8, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 16, 32, 48, 16, 8, 16, 8, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 16, 32, 64, 8, 8, 16, 8, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 16, 32, 64, 16, 8, 16, 8, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 16, 48, 32, 8, 8, 16, 8, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 16, 48, 32, 16, 8, 16, 8, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 16, 48, 48, 8, 8, 16, 8, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 16, 48, 48, 16, 8, 16, 8, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 16, 48, 64, 8, 8, 16, 8, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 16, 64, 32, 8, 8, 16, 8, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 16, 64, 32, 16, 8, 16, 8, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 16, 64, 48, 8, 8, 16, 8, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 20, 40, 40, 8, 8, 20, 8, 20>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 20, 40, 40, 16, 8, 20, 8, 20>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 20, 40, 60, 8, 8, 20, 8, 20>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 24, 24, 48, 8, 8, 24, 8, 24>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 24, 24, 48, 16, 8, 24, 8, 24>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 24, 24, 48, 24, 8, 24, 8, 24>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 24, 48, 48, 8, 8, 24, 8, 24>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 24, 48, 48, 16, 8, 24, 8, 24>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 28, 56, 56, 8, 8, 28, 8, 28>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 28, 56, 56, 16, 8, 28, 8, 28>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 32, 32, 64, 8, 8, 32, 8, 32>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 32, 32, 64, 16, 8, 32, 8, 32>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 32, 32, 64, 24, 8, 32, 8, 32>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 32, 32, 64, 32, 8, 32, 8, 32>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 32, 64, 64, 8, 8, 32, 8, 32>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 32, 64, 64, 16, 8, 32, 8, 32>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 8, 32, 64, 64, 24, 8, 32, 8, 32>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 12, 8, 24, 24, 12, 12, 8, 12, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 12, 8, 24, 32, 12, 12, 8, 12, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 12, 8, 24, 40, 12, 12, 8, 12, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 12, 8, 24, 48, 12, 12, 8, 12, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 12, 8, 24, 56, 12, 12, 8, 12, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 12, 8, 48, 16, 12, 12, 8, 12, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 12, 8, 48, 24, 12, 12, 8, 12, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 12, 8, 48, 32, 12, 12, 8, 12, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 12, 16, 48, 32, 12, 12, 16, 12, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 12, 16, 48, 32, 24, 12, 16, 12, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 12, 16, 48, 48, 12, 12, 16, 12, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 12, 16, 48, 64, 12, 12, 16, 12, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 12, 24, 48, 48, 12, 12, 24, 12, 24>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 12, 24, 48, 48, 24, 12, 24, 12, 24>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 16, 4, 32, 12, 16, 16, 4, 16, 4>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 16, 4, 32, 16, 16, 16, 4, 16, 4>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 16, 6, 48, 12, 16, 16, 6, 16, 6>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 16, 8, 32, 24, 16, 16, 8, 16, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 16, 8, 32, 32, 16, 16, 8, 16, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 16, 8, 32, 40, 16, 16, 8, 16, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 16, 8, 32, 48, 16, 16, 8, 16, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 16, 8, 32, 56, 16, 16, 8, 16, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 16, 8, 32, 64, 16, 16, 8, 16, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 16, 8, 48, 16, 16, 16, 8, 16, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 16, 8, 48, 24, 16, 16, 8, 16, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 16, 8, 48, 32, 16, 16, 8, 16, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 16, 8, 48, 40, 16, 16, 8, 16, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 16, 8, 48, 48, 16, 16, 8, 16, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 16, 8, 64, 16, 16, 16, 8, 16, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 16, 8, 64, 24, 16, 16, 8, 16, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 16, 8, 64, 32, 16, 16, 8, 16, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 16, 12, 48, 24, 16, 16, 12, 16, 12>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 16, 12, 48, 36, 16, 16, 12, 16, 12>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 16, 12, 48, 48, 16, 16, 12, 16, 12>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 16, 12, 48, 60, 16, 16, 12, 16, 12>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 16, 16, 32, 48, 16, 16, 16, 16, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 16, 16, 32, 48, 32, 16, 16, 16, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 16, 16, 32, 64, 16, 16, 16, 16, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 16, 16, 32, 64, 32, 16, 16, 16, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 16, 16, 48, 32, 16, 16, 16, 16, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 16, 16, 48, 32, 32, 16, 16, 16, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 16, 16, 48, 48, 16, 16, 16, 16, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 16, 16, 48, 48, 32, 16, 16, 16, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 16, 16, 48, 64, 16, 16, 16, 16, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 16, 16, 64, 32, 16, 16, 16, 16, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 16, 16, 64, 32, 32, 16, 16, 16, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 16, 16, 64, 48, 16, 16, 16, 16, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 16, 16, 64, 64, 16, 16, 16, 16, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 16, 24, 48, 48, 16, 16, 24, 16, 24>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 16, 24, 48, 48, 32, 16, 24, 16, 24>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 16, 32, 64, 64, 16, 16, 32, 16, 32>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 16, 32, 64, 64, 32, 16, 32, 16, 32>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 20, 8, 40, 24, 20, 20, 8, 20, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 20, 8, 40, 32, 20, 20, 8, 20, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 24, 8, 48, 24, 24, 24, 8, 24, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 24, 8, 48, 32, 24, 24, 8, 24, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 24, 12, 48, 36, 24, 24, 12, 24, 12>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 24, 12, 48, 48, 24, 24, 12, 24, 12>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 24, 12, 48, 60, 24, 24, 12, 24, 12>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 24, 16, 48, 48, 24, 24, 16, 24, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 24, 16, 48, 64, 24, 24, 16, 24, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 32, 8, 64, 24, 32, 32, 8, 32, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 32, 8, 64, 32, 32, 32, 8, 32, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 32, 16, 64, 48, 32, 32, 16, 32, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-extern template void gemm_time_measure<double, 32, 16, 64, 64, 32, 32, 16, 32, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-#endif
\ No newline at end of file
diff --git a/source/source_lcao/module_gint/kernels/cuda/code_gen_00.cu b/source/source_lcao/module_gint/kernels/cuda/code_gen_00.cu
deleted file mode 100644
index a07c411485..0000000000
--- a/source/source_lcao/module_gint/kernels/cuda/code_gen_00.cu
+++ /dev/null
@@ -1,48 +0,0 @@
-#include "vbatch_matrix_mul.cuh"
-
-template void gemm_time_measure<double, 2, 16, 16, 32, 2, 2, 16, 2, 16>(int, int, int*, int*, int*, double**, int*, double**, int*, double**, int*, int, cudaStream_t, float&, matrix_multiple_func_type&, double*, double*, double*);
-
-template void gemm_time_measure<double, 2, 16, 16, 32, 4, 2, 16, 2, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 2, 16, 16, 32, 6, 2, 16, 2, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 2, 16, 16, 32, 8, 2, 16, 2, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 2, 16, 16, 48, 2, 2, 16, 2, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 2, 16, 16, 48, 4, 2, 16, 2, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 2, 16, 16, 48, 6, 2, 16, 2, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 8, 8, 24, 4, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 8, 8, 24, 8, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 8, 8, 24, 12, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 8, 8, 32, 4, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 8, 8, 32, 8, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 8, 8, 40, 4, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 8, 8, 40, 8, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 8, 8, 48, 4, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 8, 8, 56, 4, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 8, 8, 64, 4, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 8, 16, 16, 4, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 8, 16, 16, 8, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 8, 16, 16, 12, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 8, 16, 24, 4, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 8, 16, 24, 8, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 8, 16, 32, 4, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
diff --git a/source/source_lcao/module_gint/kernels/cuda/code_gen_01.cu b/source/source_lcao/module_gint/kernels/cuda/code_gen_01.cu
deleted file mode 100644
index 9f725c23c6..0000000000
--- a/source/source_lcao/module_gint/kernels/cuda/code_gen_01.cu
+++ /dev/null
@@ -1,48 +0,0 @@
-#include "vbatch_matrix_mul.cuh"
-
-template void gemm_time_measure<double, 4, 8, 16, 32, 8, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 8, 16, 40, 4, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 8, 16, 48, 4, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 8, 16, 56, 4, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 8, 24, 16, 4, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 8, 24, 16, 8, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 8, 24, 24, 4, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 8, 24, 24, 8, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 8, 24, 32, 4, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 8, 24, 40, 4, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 8, 32, 16, 4, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 8, 32, 16, 8, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 8, 32, 24, 4, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 8, 40, 16, 4, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 8, 40, 24, 4, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 8, 48, 16, 4, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 8, 56, 16, 4, 4, 8, 4, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 16, 16, 32, 4, 4, 16, 4, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 16, 16, 32, 8, 4, 16, 4, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 16, 16, 32, 12, 4, 16, 4, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 16, 16, 32, 16, 4, 16, 4, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 16, 16, 48, 4, 4, 16, 4, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 16, 16, 48, 8, 4, 16, 4, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
diff --git a/source/source_lcao/module_gint/kernels/cuda/code_gen_02.cu b/source/source_lcao/module_gint/kernels/cuda/code_gen_02.cu
deleted file mode 100644
index 090eab0709..0000000000
--- a/source/source_lcao/module_gint/kernels/cuda/code_gen_02.cu
+++ /dev/null
@@ -1,48 +0,0 @@
-#include "vbatch_matrix_mul.cuh"
-
-template void gemm_time_measure<double, 4, 16, 16, 48, 12, 4, 16, 4, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 16, 16, 64, 4, 4, 16, 4, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 16, 16, 64, 8, 4, 16, 4, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 16, 32, 32, 4, 4, 16, 4, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 16, 32, 32, 8, 4, 16, 4, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 16, 32, 32, 12, 4, 16, 4, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 16, 32, 48, 4, 4, 16, 4, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 16, 32, 48, 8, 4, 16, 4, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 16, 48, 32, 4, 4, 16, 4, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 16, 48, 32, 8, 4, 16, 4, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 24, 24, 48, 4, 4, 24, 4, 24>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 24, 24, 48, 8, 4, 24, 4, 24>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 24, 24, 48, 12, 4, 24, 4, 24>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 24, 48, 48, 4, 4, 24, 4, 24>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 24, 48, 48, 8, 4, 24, 4, 24>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 32, 32, 64, 4, 4, 32, 4, 32>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 32, 32, 64, 8, 4, 32, 4, 32>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 32, 32, 64, 12, 4, 32, 4, 32>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 4, 32, 32, 64, 16, 4, 32, 4, 32>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 6, 16, 48, 32, 6, 6, 16, 6, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 6, 16, 48, 32, 12, 6, 16, 6, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 6, 16, 48, 48, 6, 6, 16, 6, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 4, 16, 12, 8, 8, 4, 8, 4>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
diff --git a/source/source_lcao/module_gint/kernels/cuda/code_gen_03.cu b/source/source_lcao/module_gint/kernels/cuda/code_gen_03.cu
deleted file mode 100644
index 046d0e5063..0000000000
--- a/source/source_lcao/module_gint/kernels/cuda/code_gen_03.cu
+++ /dev/null
@@ -1,48 +0,0 @@
-#include "vbatch_matrix_mul.cuh"
-
-template void gemm_time_measure<double, 8, 4, 16, 16, 8, 8, 4, 8, 4>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 4, 16, 20, 8, 8, 4, 8, 4>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 4, 16, 24, 8, 8, 4, 8, 4>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 4, 16, 28, 8, 8, 4, 8, 4>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 4, 16, 32, 8, 8, 4, 8, 4>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 4, 24, 8, 8, 8, 4, 8, 4>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 4, 24, 12, 8, 8, 4, 8, 4>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 4, 24, 16, 8, 8, 4, 8, 4>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 4, 24, 20, 8, 8, 4, 8, 4>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 4, 24, 24, 8, 8, 4, 8, 4>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 4, 32, 8, 8, 8, 4, 8, 4>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 4, 32, 12, 8, 8, 4, 8, 4>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 4, 32, 16, 8, 8, 4, 8, 4>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 4, 40, 8, 8, 8, 4, 8, 4>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 8, 16, 24, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 8, 16, 24, 16, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 8, 16, 32, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 8, 16, 32, 16, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 8, 16, 40, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 8, 16, 48, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 8, 16, 56, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 8, 16, 64, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 8, 24, 16, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
diff --git a/source/source_lcao/module_gint/kernels/cuda/code_gen_04.cu b/source/source_lcao/module_gint/kernels/cuda/code_gen_04.cu
deleted file mode 100644
index f74209d829..0000000000
--- a/source/source_lcao/module_gint/kernels/cuda/code_gen_04.cu
+++ /dev/null
@@ -1,48 +0,0 @@
-#include "vbatch_matrix_mul.cuh"
-
-template void gemm_time_measure<double, 8, 8, 24, 16, 16, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 8, 24, 24, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 8, 24, 24, 16, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 8, 24, 32, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 8, 24, 40, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 8, 24, 48, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 8, 24, 56, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 8, 24, 64, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 8, 32, 16, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 8, 32, 16, 16, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 8, 32, 24, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 8, 32, 32, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 8, 32, 40, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 8, 32, 48, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 8, 32, 56, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 8, 40, 16, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 8, 40, 24, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 8, 40, 32, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 8, 40, 40, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 8, 40, 48, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 8, 48, 16, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 8, 48, 24, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 8, 48, 32, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
diff --git a/source/source_lcao/module_gint/kernels/cuda/code_gen_05.cu b/source/source_lcao/module_gint/kernels/cuda/code_gen_05.cu
deleted file mode 100644
index c9cb81bd7c..0000000000
--- a/source/source_lcao/module_gint/kernels/cuda/code_gen_05.cu
+++ /dev/null
@@ -1,48 +0,0 @@
-#include "vbatch_matrix_mul.cuh"
-
-template void gemm_time_measure<double, 8, 8, 48, 40, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 8, 56, 16, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 8, 56, 24, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 8, 56, 32, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 8, 64, 16, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 8, 64, 24, 8, 8, 8, 8, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 12, 24, 24, 8, 8, 12, 8, 12>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 12, 24, 24, 16, 8, 12, 8, 12>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 12, 24, 36, 8, 8, 12, 8, 12>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 12, 24, 36, 16, 8, 12, 8, 12>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 12, 24, 48, 8, 8, 12, 8, 12>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 12, 24, 60, 8, 8, 12, 8, 12>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 12, 48, 24, 8, 8, 12, 8, 12>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 12, 48, 36, 8, 8, 12, 8, 12>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 12, 48, 48, 8, 8, 12, 8, 12>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 12, 48, 60, 8, 8, 12, 8, 12>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 16, 16, 48, 8, 8, 16, 8, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 16, 16, 48, 16, 8, 16, 8, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 16, 16, 48, 24, 8, 16, 8, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 16, 16, 64, 8, 8, 16, 8, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 16, 16, 64, 16, 8, 16, 8, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 16, 32, 32, 8, 8, 16, 8, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 16, 32, 32, 16, 8, 16, 8, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
diff --git a/source/source_lcao/module_gint/kernels/cuda/code_gen_06.cu b/source/source_lcao/module_gint/kernels/cuda/code_gen_06.cu
deleted file mode 100644
index f5fac39df2..0000000000
--- a/source/source_lcao/module_gint/kernels/cuda/code_gen_06.cu
+++ /dev/null
@@ -1,48 +0,0 @@
-#include "vbatch_matrix_mul.cuh"
-
-template void gemm_time_measure<double, 8, 16, 32, 32, 24, 8, 16, 8, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 16, 32, 48, 8, 8, 16, 8, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 16, 32, 48, 16, 8, 16, 8, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 16, 32, 64, 8, 8, 16, 8, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 16, 32, 64, 16, 8, 16, 8, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 16, 48, 32, 8, 8, 16, 8, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 16, 48, 32, 16, 8, 16, 8, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 16, 48, 48, 8, 8, 16, 8, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 16, 48, 48, 16, 8, 16, 8, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 16, 48, 64, 8, 8, 16, 8, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 16, 64, 32, 8, 8, 16, 8, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 16, 64, 32, 16, 8, 16, 8, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 16, 64, 48, 8, 8, 16, 8, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 20, 40, 40, 8, 8, 20, 8, 20>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 20, 40, 40, 16, 8, 20, 8, 20>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 20, 40, 60, 8, 8, 20, 8, 20>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 24, 24, 48, 8, 8, 24, 8, 24>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 24, 24, 48, 16, 8, 24, 8, 24>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 24, 24, 48, 24, 8, 24, 8, 24>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 24, 48, 48, 8, 8, 24, 8, 24>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 24, 48, 48, 16, 8, 24, 8, 24>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 28, 56, 56, 8, 8, 28, 8, 28>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 28, 56, 56, 16, 8, 28, 8, 28>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
diff --git a/source/source_lcao/module_gint/kernels/cuda/code_gen_07.cu b/source/source_lcao/module_gint/kernels/cuda/code_gen_07.cu
deleted file mode 100644
index 971c6eb0c0..0000000000
--- a/source/source_lcao/module_gint/kernels/cuda/code_gen_07.cu
+++ /dev/null
@@ -1,48 +0,0 @@
-#include "vbatch_matrix_mul.cuh"
-
-template void gemm_time_measure<double, 8, 32, 32, 64, 8, 8, 32, 8, 32>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 32, 32, 64, 16, 8, 32, 8, 32>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 32, 32, 64, 24, 8, 32, 8, 32>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 32, 32, 64, 32, 8, 32, 8, 32>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 32, 64, 64, 8, 8, 32, 8, 32>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 32, 64, 64, 16, 8, 32, 8, 32>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 8, 32, 64, 64, 24, 8, 32, 8, 32>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 12, 8, 24, 24, 12, 12, 8, 12, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 12, 8, 24, 32, 12, 12, 8, 12, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 12, 8, 24, 40, 12, 12, 8, 12, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 12, 8, 24, 48, 12, 12, 8, 12, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 12, 8, 24, 56, 12, 12, 8, 12, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 12, 8, 48, 16, 12, 12, 8, 12, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 12, 8, 48, 24, 12, 12, 8, 12, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 12, 8, 48, 32, 12, 12, 8, 12, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 12, 16, 48, 32, 12, 12, 16, 12, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 12, 16, 48, 32, 24, 12, 16, 12, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 12, 16, 48, 48, 12, 12, 16, 12, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 12, 16, 48, 64, 12, 12, 16, 12, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 12, 24, 48, 48, 12, 12, 24, 12, 24>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 12, 24, 48, 48, 24, 12, 24, 12, 24>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 16, 4, 32, 12, 16, 16, 4, 16, 4>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 16, 4, 32, 16, 16, 16, 4, 16, 4>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
diff --git a/source/source_lcao/module_gint/kernels/cuda/code_gen_08.cu b/source/source_lcao/module_gint/kernels/cuda/code_gen_08.cu
deleted file mode 100644
index 8643faae70..0000000000
--- a/source/source_lcao/module_gint/kernels/cuda/code_gen_08.cu
+++ /dev/null
@@ -1,48 +0,0 @@
-#include "vbatch_matrix_mul.cuh"
-
-template void gemm_time_measure<double, 16, 6, 48, 12, 16, 16, 6, 16, 6>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 16, 8, 32, 24, 16, 16, 8, 16, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 16, 8, 32, 32, 16, 16, 8, 16, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 16, 8, 32, 40, 16, 16, 8, 16, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 16, 8, 32, 48, 16, 16, 8, 16, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 16, 8, 32, 56, 16, 16, 8, 16, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 16, 8, 32, 64, 16, 16, 8, 16, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 16, 8, 48, 16, 16, 16, 8, 16, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 16, 8, 48, 24, 16, 16, 8, 16, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 16, 8, 48, 32, 16, 16, 8, 16, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 16, 8, 48, 40, 16, 16, 8, 16, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 16, 8, 48, 48, 16, 16, 8, 16, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 16, 8, 64, 16, 16, 16, 8, 16, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 16, 8, 64, 24, 16, 16, 8, 16, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 16, 8, 64, 32, 16, 16, 8, 16, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 16, 12, 48, 24, 16, 16, 12, 16, 12>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 16, 12, 48, 36, 16, 16, 12, 16, 12>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 16, 12, 48, 48, 16, 16, 12, 16, 12>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 16, 12, 48, 60, 16, 16, 12, 16, 12>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 16, 16, 32, 48, 16, 16, 16, 16, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 16, 16, 32, 48, 32, 16, 16, 16, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 16, 16, 32, 64, 16, 16, 16, 16, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 16, 16, 32, 64, 32, 16, 16, 16, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
diff --git a/source/source_lcao/module_gint/kernels/cuda/code_gen_09.cu b/source/source_lcao/module_gint/kernels/cuda/code_gen_09.cu
deleted file mode 100644
index 8cf333bf6f..0000000000
--- a/source/source_lcao/module_gint/kernels/cuda/code_gen_09.cu
+++ /dev/null
@@ -1,53 +0,0 @@
-#include "vbatch_matrix_mul.cuh"
-
-template void gemm_time_measure<double, 16, 16, 48, 32, 16, 16, 16, 16, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 16, 16, 48, 32, 32, 16, 16, 16, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 16, 16, 48, 48, 16, 16, 16, 16, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 16, 16, 48, 48, 32, 16, 16, 16, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 16, 16, 48, 64, 16, 16, 16, 16, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 16, 16, 64, 32, 16, 16, 16, 16, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 16, 16, 64, 32, 32, 16, 16, 16, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 16, 16, 64, 48, 16, 16, 16, 16, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 16, 16, 64, 64, 16, 16, 16, 16, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 16, 24, 48, 48, 16, 16, 24, 16, 24>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 16, 24, 48, 48, 32, 16, 24, 16, 24>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 16, 32, 64, 64, 16, 16, 32, 16, 32>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 16, 32, 64, 64, 32, 16, 32, 16, 32>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 20, 8, 40, 24, 20, 20, 8, 20, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 20, 8, 40, 32, 20, 20, 8, 20, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 24, 8, 48, 24, 24, 24, 8, 24, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 24, 8, 48, 32, 24, 24, 8, 24, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 24, 12, 48, 36, 24, 24, 12, 24, 12>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 24, 12, 48, 48, 24, 24, 12, 24, 12>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 24, 12, 48, 60, 24, 24, 12, 24, 12>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 24, 16, 48, 48, 24, 24, 16, 24, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 24, 16, 48, 64, 24, 24, 16, 24, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 32, 8, 64, 24, 32, 32, 8, 32, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 32, 8, 64, 32, 32, 32, 8, 32, 8>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 32, 16, 64, 48, 32, 32, 16, 32, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
-
-template void gemm_time_measure<double, 32, 16, 64, 64, 32, 32, 16, 32, 16>(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*);
\ No newline at end of file
diff --git a/source/source_lcao/module_gint/kernels/cuda/cuda_tools.cu b/source/source_lcao/module_gint/kernels/cuda/cuda_tools.cu
deleted file mode 100644
index c9bf122628..0000000000
--- a/source/source_lcao/module_gint/kernels/cuda/cuda_tools.cu
+++ /dev/null
@@ -1,292 +0,0 @@
-#include <iostream>
-#include <cstring>
-#include <stdint.h>
-
-#include "cuda_tools.cuh"
-
-void dump_cuda_array_to_file(const double* cuda_array,
-                             int width,
-                             int hight,
-                             const std::string& filename)
-{
-    double* h_data = new double[width * hight];
-    cudaMemcpy(h_data,
-               cuda_array,
-               width * hight * sizeof(double),
-               cudaMemcpyDeviceToHost);
-
-    std::ofstream outFile(filename);
-    if (!outFile.is_open())
-    {
-        std::cerr << "Failed to open file for writing." << std::endl;
-    }
-    for (int j = 0; j < hight; ++j)
-    {
-        for (int i = 0; i < width; ++i)
-        {
-            outFile << "hight" << j << "   width:" << i << "   "
-                    << h_data[j * width + i] << std::endl;
-        }
-    }
-    outFile.close();
-    delete[] h_data;
-}
-
-template <typename T>
-Cuda_Mem_Wrapper<T>::Cuda_Mem_Wrapper()
-{
-    this->device_pointer = nullptr;
-    this->host_pointer = nullptr;
-    this->one_stream_size = 0;
-    this->one_stream_size_aligned = 0;
-    this->stream_number = 1;
-    this->total_size_aligned = 0;
-}
-
-template <typename T>
-Cuda_Mem_Wrapper<T>::Cuda_Mem_Wrapper(int one_stream_size_in,
-                                      int one_stream_size_aligned_in,
-                                      int stream_number_in,
-                                      bool malloc_host_in)
-{
-    this->stream_number = stream_number_in;
-    this->one_stream_size = one_stream_size_in;
-    this->one_stream_size_aligned = one_stream_size_aligned_in;
-    this->total_size_aligned
-        = this->one_stream_size_aligned * this->stream_number;
-
-    checkCuda(cudaMalloc((void**)&this->device_pointer,
-                         this->total_size_aligned * sizeof(T)));
-    checkCuda(cudaMemset(this->device_pointer,
-                         0,
-                         this->total_size_aligned * sizeof(T)));
-    this->host_pointer = nullptr;
-
-    if (malloc_host_in)
-    {
-        checkCuda(cudaMallocHost((void**)&this->host_pointer,
-                                 this->total_size_aligned * sizeof(T)));
-        memset(this->host_pointer, 0, this->total_size_aligned * sizeof(T));
-    }
-}
-
-template <typename T>
-Cuda_Mem_Wrapper<T>::Cuda_Mem_Wrapper(int one_stream_size_in,
-                                      int stream_number_in,
-                                      bool malloc_host_in)
-    : Cuda_Mem_Wrapper(one_stream_size_in,
-                       one_stream_size_in,
-                       stream_number_in,
-                       malloc_host_in)
-{
-}
-
-template <typename T>
-Cuda_Mem_Wrapper<T>::Cuda_Mem_Wrapper(Cuda_Mem_Wrapper&& other) noexcept
-{
-    this->device_pointer = other.device_pointer;
-    this->host_pointer = other.host_pointer;
-    this->one_stream_size = other.one_stream_size;
-    this->one_stream_size_aligned = other.one_stream_size_aligned;
-    this->stream_number = other.stream_number;
-    this->total_size_aligned = other.total_size_aligned;
-
-    other.device_pointer = nullptr;
-    other.host_pointer = nullptr;
-    other.one_stream_size = 0;
-    other.one_stream_size_aligned = 0;
-    other.stream_number = 0;
-    other.total_size_aligned = 0;
-}
-
-template <typename T>
-Cuda_Mem_Wrapper<T>& Cuda_Mem_Wrapper<T>::operator=(Cuda_Mem_Wrapper&& other) noexcept
-{
-    if (this != &other)
-    {
-        this->free_all();
-        this->device_pointer = other.device_pointer;
-        this->host_pointer = other.host_pointer;
-        this->one_stream_size = other.one_stream_size;
-        this->one_stream_size_aligned = other.one_stream_size_aligned;
-        this->stream_number = other.stream_number;
-        this->total_size_aligned = other.total_size_aligned;
-
-        other.device_pointer = nullptr;
-        other.host_pointer = nullptr;
-        other.one_stream_size = 0;
-        other.one_stream_size_aligned = 0;
-        other.stream_number = 0;
-        other.total_size_aligned = 0;
-    }
-    return *this;
-}
-
-template <typename T>
-void Cuda_Mem_Wrapper<T>::free_all()
-{
-    checkCuda(cudaFree(this->device_pointer));
-    if (this->host_pointer != nullptr)
-    {
-        checkCuda(cudaFreeHost(this->host_pointer));
-    }
-}
-
-template <typename T>
-Cuda_Mem_Wrapper<T>::~Cuda_Mem_Wrapper()
-{
-    this->free_all();
-}
-
-template <typename T>
-inline void Cuda_Mem_Wrapper<T>::copy_host_to_device_sync(const int stream_id)
-{
-    if (this->host_pointer == nullptr || this->device_pointer == nullptr)
-    {
-        std::cerr << "host_pointer is nullptr, can not copy host to device"
-                  << std::endl;
-        exit(1);
-    }
-    checkCuda(cudaMemcpy(
-        this->device_pointer + stream_id * this->one_stream_size_aligned,
-        this->host_pointer + stream_id * this->one_stream_size_aligned,
-        this->one_stream_size * sizeof(T),
-        cudaMemcpyHostToDevice));
-}
-
-template <typename T>
-inline void Cuda_Mem_Wrapper<T>::copy_host_to_device_async(const cudaStream_t stream,
-                                                    const int stream_id)
-{
-    if (this->host_pointer == nullptr || this->device_pointer == nullptr)
-    {
-        std::cerr << "host_pointer is nullptr, can not copy host to device"
-                  << std::endl;
-        exit(1);
-    }
-    checkCuda(cudaMemcpyAsync(
-        this->device_pointer + stream_id * this->one_stream_size_aligned,
-        this->host_pointer + stream_id * this->one_stream_size_aligned,
-        this->one_stream_size * sizeof(T),
-        cudaMemcpyHostToDevice,
-        stream));
-}
-
-template <typename T>
-inline void Cuda_Mem_Wrapper<T>::copy_host_to_device_async(const cudaStream_t stream,
-                                                    const int stream_id,
-                                                    const int size)
-{
-    if (this->host_pointer == nullptr || this->device_pointer == nullptr)
-    {
-        std::cerr << "host_pointer is nullptr, can not copy host to device"
-                  << std::endl;
-        exit(1);
-    }
-    checkCuda(cudaMemcpyAsync(
-        this->device_pointer + stream_id * this->one_stream_size_aligned,
-        this->host_pointer + stream_id * this->one_stream_size_aligned,
-        size * sizeof(T),
-        cudaMemcpyHostToDevice,
-        stream));
-}
-
-template <typename T>
-inline void Cuda_Mem_Wrapper<T>::copy_device_to_host_sync(const int stream_id)
-{
-    if (this->host_pointer == nullptr || this->device_pointer == nullptr)
-    {
-        std::cerr << "host_pointer is nullptr, can not copy device to host"
-                  << std::endl;
-        exit(1);
-    }
-    checkCuda(cudaMemcpy(
-        this->host_pointer + stream_id * this->one_stream_size_aligned,
-        this->device_pointer + stream_id * this->one_stream_size_aligned,
-        this->one_stream_size * sizeof(T),
-        cudaMemcpyDeviceToHost));
-}
-
-template <typename T>
-inline void Cuda_Mem_Wrapper<T>::copy_device_to_host_async(const cudaStream_t stream,
-                                                    const int stream_id)
-{
-    if (this->host_pointer == nullptr || this->device_pointer == nullptr)
-    {
-        std::cerr << "host_pointer is nullptr, can not copy device to host"
-                  << std::endl;
-        exit(1);
-    }
-    checkCuda(cudaMemcpyAsync(
-        this->host_pointer + stream_id * this->one_stream_size_aligned,
-        this->device_pointer + stream_id * this->one_stream_size_aligned,
-        this->one_stream_size * sizeof(T),
-        cudaMemcpyDeviceToHost,
-        stream));
-}
-
-template <typename T>
-inline void Cuda_Mem_Wrapper<T>::copy_device_to_host_async(const cudaStream_t stream,
-                                                    const int stream_id,
-                                                    const int size)
-{
-    if (this->host_pointer == nullptr || this->device_pointer == nullptr)
-    {
-        std::cerr << "host_pointer is nullptr, can not copy device to host"
-                  << std::endl;
-        exit(1);
-    }
-    checkCuda(cudaMemcpyAsync(
-        this->host_pointer + stream_id * this->one_stream_size_aligned,
-        this->device_pointer + stream_id * this->one_stream_size_aligned,
-        size * sizeof(T),
-        cudaMemcpyDeviceToHost,
-        stream));
-}
-
-template <typename T>
-inline void Cuda_Mem_Wrapper<T>::memset_device_sync(const int stream_id, const int value)
-{
-    checkCuda(cudaMemset(this->device_pointer
-                             + stream_id * this->one_stream_size_aligned,
-                         value,
-                         this->one_stream_size * sizeof(T)));
-}
-
-template <typename T>
-inline void Cuda_Mem_Wrapper<T>::memset_device_async(const cudaStream_t stream,
-                                              const int stream_id,
-                                              const int value)
-{
-    checkCuda(cudaMemsetAsync(this->device_pointer
-                                  + stream_id * this->one_stream_size_aligned,
-                              value,
-                              this->one_stream_size * sizeof(T),
-                              stream));
-}
-
-template <typename T>
-inline void Cuda_Mem_Wrapper<T>::memset_host(const int stream_id, const int value)
-{
-    memset(this->host_pointer + stream_id * this->one_stream_size_aligned,
-           value,
-           this->one_stream_size * sizeof(T));
-}
-
-template <typename T>
-inline T* Cuda_Mem_Wrapper<T>::get_device_pointer(const int stream_id)
-{
-    return this->device_pointer + stream_id * this->one_stream_size_aligned;
-}
-
-template <typename T>
-inline T* Cuda_Mem_Wrapper<T>::get_host_pointer(const int stream_id)
-{
-    return this->host_pointer + stream_id * this->one_stream_size_aligned;
-}
-template class Cuda_Mem_Wrapper<double>;
-template class Cuda_Mem_Wrapper<double*>;
-template class Cuda_Mem_Wrapper<int>;
-template class Cuda_Mem_Wrapper<bool>;
-template class Cuda_Mem_Wrapper<uint8_t>;
-
diff --git a/source/source_lcao/module_gint/kernels/cuda/cuda_tools.cuh b/source/source_lcao/module_gint/kernels/cuda/cuda_tools.cuh
deleted file mode 100644
index dab697df8c..0000000000
--- a/source/source_lcao/module_gint/kernels/cuda/cuda_tools.cuh
+++ /dev/null
@@ -1,123 +0,0 @@
-#ifndef CUDA_TOOLS_CUH
-#define CUDA_TOOLS_CUH
-#include <assert.h> // for assert
-#include <cublas_v2.h>
-#include <cuda.h> // for CUDA_VERSION
-#include <cuda_runtime.h>
-
-#include <fstream>
-#include <iostream>
-#include <sstream>
-
-#define checkCuda(val) check((val), #val, __FILE__, __LINE__)
-#define checkCudaLastError() __getLastCudaError(__FILE__, __LINE__)
-
-inline void check(cudaError_t result, char const *const func, const char *const file,
-           int const line) {
-  if (result) {
-    fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line,
-            static_cast<unsigned int>(result), cudaGetErrorString(result), func);
-    exit(EXIT_FAILURE);
-  }
-}
-
-inline void __getLastCudaError(const char *file,
-                               const int line) 
-{
-  cudaError_t err = cudaGetLastError();
-
-  if (cudaSuccess != err) {
-    fprintf(stderr,
-            "%s(%i) : getLastCudaError() CUDA error :"
-            " (%d) %s.\n",
-            file, line, static_cast<int>(err),
-            cudaGetErrorString(err));
-    exit(EXIT_FAILURE);
-  }
-}
-
-static inline int ceildiv(int x, int y)
-{
-    return (x + y - 1) / y;
-}
-
-void dump_cuda_array_to_file(const double* cuda_array,
-                             int width,
-                             int hight,
-                             const std::string& filename);
-
-// inline int ceil_div(int a, int b)
-// {
-//     return (a + b - 1) / b;
-// }
-
-/*
- * @brief: A simple wrapper for cudaMalloc and cudaFree, sync and async CUDA
- * memory copy
- * @param: T: the type of the data
- *
- * @note:
- * Manual management of CUDA memory is a very delicate task; complex pointers
- * and malloc/free operations make it easy for us to encounter memory bugs. The
- * severity of the issues increases significantly when introducing multi-node,
- * multi-GPU, and multi-stream parallelism.
- * Debugging after encountering bugs is also very difficult, finding the leaking
- * pointer from dozens of variables can be quite a headache.
- * Therefore, considering that our use and management of memory have some
- * homogeneity, we have abstracted these needs into the following encapsulations
- * to reduce the cost of maintenance and development. The memory is allocated in
- * the constructor and freed in the destructor.
- *
- * The following interface is primarily designed for the following requirements:
- * 1. We need to split a large task into multiple subtasks to run on multiple
- *    streams across multiple GPUs on multiple nodes.
- * 2. It is necessary to allocate memory of the same shape on both host and
- * device.
- * 3. Data copying between host and device sync or async is required.
- */
-
-template <typename T>
-class Cuda_Mem_Wrapper
-{
-  public:
-
-    Cuda_Mem_Wrapper();
-    Cuda_Mem_Wrapper(int one_stream_size,
-                     int one_stream_size_aligned,
-                     int stream_number = 1,
-                     bool malloc_host = true);
-    Cuda_Mem_Wrapper(int one_stream_size,
-                     int stream_number = 1,
-                     bool malloc_host = true);
-
-    Cuda_Mem_Wrapper(const Cuda_Mem_Wrapper& other) = delete;
-    Cuda_Mem_Wrapper& operator=(const Cuda_Mem_Wrapper& other) = delete;
-    Cuda_Mem_Wrapper(Cuda_Mem_Wrapper&& other) noexcept;
-    Cuda_Mem_Wrapper& operator=(Cuda_Mem_Wrapper&& other) noexcept;
-    
-    ~Cuda_Mem_Wrapper();
-    void copy_host_to_device_sync(const int stream_id = 0);
-    void copy_host_to_device_async(const cudaStream_t stream, const int stream_id);
-    void copy_host_to_device_async(const cudaStream_t stream, const int stream_id, const int size);
-    void copy_device_to_host_sync(const int stream_id = 0);
-    void copy_device_to_host_async(const cudaStream_t stream, const int stream_id);
-    void copy_device_to_host_async(const cudaStream_t stream, const int stream_id, const int size);
-    void memset_device_sync(const int stream_id = 0, const int value = 0);
-    void memset_device_async(const cudaStream_t stream, 
-                             const int stream_id = 0,
-                             const int value = 0);
-    void memset_host(const int stream_id = 0, const int value = 0);
-    T* get_device_pointer(const int stream_id = 0);
-    T* get_host_pointer(const int stream_id = 0);
-    void free_all();
-
-  private:
-    T* device_pointer;
-    T* host_pointer;
-    int one_stream_size;
-    int one_stream_size_aligned;
-    int stream_number;
-    int total_size_aligned;
-};
-
-#endif // CUDA_TOOLS_CUH
\ No newline at end of file
diff --git a/source/source_lcao/module_gint/kernels/cuda/gemm_selector.cu b/source/source_lcao/module_gint/kernels/cuda/gemm_selector.cu
deleted file mode 100644
index 6550b21edb..0000000000
--- a/source/source_lcao/module_gint/kernels/cuda/gemm_selector.cu
+++ /dev/null
@@ -1,138 +0,0 @@
-#include <iostream>
-
-#include "gemm_selector.cuh"
-#include "vbatch_matrix_mul.cuh"
-#include "cuda_tools.cuh"
-#include "source_base/module_external/blas_connector.h"
-#include "code_gen.cuh"
-
-/*
- * Here we have utilized a very straightforward and brute-force method to select
- * the optimal matrix multiplication kernel for a given scale of computation: we
- * compute with all scales of kernels under the current computational task to
- * find the fastest parameter combination. This approach can lead to an increase
- * in compilation time.
- */
-void gemm_algo_selector(int matrix_k, matrix_multiple_func_type& fastest_algo,const UnitCell& ucell)
-{
-    int batchCount_per_type = 32;
-    int batchCount
-        = batchCount_per_type * ucell.ntype * ucell.ntype;
-
-    Cuda_Mem_Wrapper<int> m(batchCount);
-    Cuda_Mem_Wrapper<int> n(batchCount);
-    Cuda_Mem_Wrapper<int> k(batchCount);
-
-    int max_m = ucell.nwmax, max_n = ucell.nwmax;
-
-    Cuda_Mem_Wrapper<double> A(batchCount * max_m * matrix_k);
-    Cuda_Mem_Wrapper<double> B(batchCount * max_n * matrix_k);
-    Cuda_Mem_Wrapper<double> C(batchCount * max_m * max_n);
-
-    Cuda_Mem_Wrapper<int> lda(batchCount);
-    Cuda_Mem_Wrapper<int> ldb(batchCount);
-    Cuda_Mem_Wrapper<int> ldc(batchCount);
-
-    Cuda_Mem_Wrapper<double*> A_array(batchCount);
-    Cuda_Mem_Wrapper<double*> B_array(batchCount);
-    Cuda_Mem_Wrapper<double*> C_array(batchCount);
-
-    for (int i = 0; i < batchCount * max_m * matrix_k; ++i)
-    {
-        A.get_host_pointer()[i] = i * 0.001;
-    }
-    for (int i = 0; i < batchCount * max_n * matrix_k; ++i)
-    {
-        B.get_host_pointer()[i] = i * 0.002;
-    }
-
-    double* cpu_result = new double[batchCount * max_m * max_n];
-    memset(cpu_result, 0, batchCount * max_m * max_n * sizeof(double));
-    int index = 0;
-    for (int i = 0; i < batchCount_per_type; ++i)
-    {
-        for (int j = 0; j < ucell.ntype; j++)
-        {
-            for (int l = 0; l < ucell.ntype; l++)
-            {
-                m.get_host_pointer()[index] = ucell.atoms[j].nw;
-                n.get_host_pointer()[index] = ucell.atoms[l].nw;
-                k.get_host_pointer()[index] = matrix_k;
-
-                lda.get_host_pointer()[index] = matrix_k;
-                ldb.get_host_pointer()[index] = matrix_k;
-                ldc.get_host_pointer()[index] = ucell.atoms[l].nw;
-
-                A_array.get_host_pointer()[index]
-                    = &A.get_device_pointer()[index * max_m * matrix_k];
-                B_array.get_host_pointer()[index]
-                    = &B.get_device_pointer()[index * max_n * matrix_k];
-                C_array.get_host_pointer()[index]
-                    = &C.get_device_pointer()[index * max_n
-                                              * max_m]; // test atom add
-                BlasConnector::gemm(
-                    'N',
-                    'T',
-                    m.get_host_pointer()[index],
-                    n.get_host_pointer()[index],
-                    matrix_k,
-                    1.0,
-                    &A.get_host_pointer()[index * max_m * matrix_k],
-                    matrix_k,
-                    &B.get_host_pointer()[index * max_n * matrix_k],
-                    matrix_k,
-                    1.0,
-                    &cpu_result[index * max_m * max_n],
-                    n.get_host_pointer()[index]);
-                index++;
-            }
-        }
-    }
-
-    m.copy_host_to_device_sync();
-    n.copy_host_to_device_sync();
-    k.copy_host_to_device_sync();
-
-    lda.copy_host_to_device_sync();
-    ldb.copy_host_to_device_sync();
-    ldc.copy_host_to_device_sync();
-
-    A.copy_host_to_device_sync();
-    B.copy_host_to_device_sync();
-    A_array.copy_host_to_device_sync();
-    B_array.copy_host_to_device_sync();
-    C_array.copy_host_to_device_sync();
-
-    cudaStream_t temp_stream;
-    checkCuda(cudaStreamCreate(&temp_stream));
-
-    float fastest_time = 1000000;
-    fastest_algo = vbatched_gemm_impl<double, 16, 4, 32, 16, 16, 16, 4, 16, 4>;
-
-    int* d_m = m.get_device_pointer();
-    int* d_n = n.get_device_pointer();
-    int* d_k = k.get_device_pointer();
-
-    double** d_global_A_array = A_array.get_device_pointer();
-    double** d_global_B_array = B_array.get_device_pointer();
-    double** d_global_C_array = C_array.get_device_pointer();
-
-    double* h_global_C = C.get_host_pointer();
-    double* d_global_C = C.get_device_pointer();
-
-    int* d_global_lda = lda.get_device_pointer();
-    int* d_global_ldb = ldb.get_device_pointer();
-    int* d_global_ldc = ldc.get_device_pointer();
-
-/*
- * Please do not manually modify the code in the following file;
- * it should simply be generated through a loop using a short Python program.
- */
-#include "code_gen.cpp"
-    checkCuda(cudaStreamDestroy(temp_stream));
-    std::cout << " gemm_algo_selector::Fastest time: " << fastest_time << " ms"
-              << std::endl;
-    // fastest_algo = vbatched_gemm_impl<double, 16, 4, 32, 16, 16, 16, 4, 16,
-    // 4>;
-    delete[] cpu_result;
-}
\ No newline at end of file
diff --git a/source/source_lcao/module_gint/kernels/cuda/gemm_selector.cuh b/source/source_lcao/module_gint/kernels/cuda/gemm_selector.cuh
deleted file mode 100644
index 744f3c887d..0000000000
--- a/source/source_lcao/module_gint/kernels/cuda/gemm_selector.cuh
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef GEMM_SELECTOR_H
-#define GEMM_SELECTOR_H
-
-#include "cuda_runtime.h"
-#include "source_cell/unitcell.h"
-typedef std::function<
-    void(int, int, int*, int*, int*, double**, int*, double**, int*, double**, int*, int, cudaStream_t, double* alpha)>
-    matrix_multiple_func_type;
-
-void gemm_algo_selector(int k, matrix_multiple_func_type& func, const UnitCell& ucell);
-
-#endif
\ No newline at end of file
diff --git a/source/source_lcao/module_gint/kernels/cuda/gint_force.cu b/source/source_lcao/module_gint/kernels/cuda/gint_force.cu
deleted file mode 100644
index 0199c9e37a..0000000000
--- a/source/source_lcao/module_gint/kernels/cuda/gint_force.cu
+++ /dev/null
@@ -1,225 +0,0 @@
-#include "sph.cuh"
-#include "interp.cuh"
-#include "gint_force.cuh"
-#include "cuda_tools.cuh"
-#include "source_base/module_device/device.h"
-// CUDA kernel to calculate psi and force
-namespace GintKernel
-{
-__inline__ __device__ double warpReduceSum(double val)
-{   
-    val += __shfl_xor_sync(0xffffffff, val, 16, 32);
-    val += __shfl_xor_sync(0xffffffff, val, 8, 32);
-    val += __shfl_xor_sync(0xffffffff, val, 4, 32);
-    val += __shfl_xor_sync(0xffffffff, val, 2, 32);
-    val += __shfl_xor_sync(0xffffffff, val, 1, 32);
-    return val;
-}
-
-
-__global__ void get_psi_force(double* ylmcoef,
-                              double delta_r,
-                              int bxyz,
-                              const int nwmax,
-                              const int max_atom,
-                              const int* const ucell_atom_nwl,
-                              const bool* const atom_iw2_new,
-                              const int* const atom_iw2_ylm,
-                              const int* const atom_iw2_l,
-                              const int* const atom_nw,
-                              const double* const rcut,
-                              const int nr_max,
-                              const double* const psi_u,
-                              const double* const mcell_pos,
-                              const double* const dr_part,
-                              const double* const vldr3,
-                              const uint8_t* const atoms_type,
-                              const int* const atoms_num_info,
-                              double* psi,
-                              double* dpsi,
-                              double* d2psi)
-{
-    const int bcell_id = blockIdx.x;
-    const int num_atoms = atoms_num_info[2 * bcell_id];
-    const int pre_atoms = atoms_num_info[2 * bcell_id + 1];
-    const int mcell_id = blockIdx.y;
-    const double vldr3_value = vldr3[bcell_id*bxyz + mcell_id];
-    const double mcell_pos_x = mcell_pos[3 * mcell_id];
-    const double mcell_pos_y = mcell_pos[3 * mcell_id + 1];
-    const double mcell_pos_z = mcell_pos[3 * mcell_id + 2];
-
-    for(int atom_id = threadIdx.x; atom_id < num_atoms; atom_id += blockDim.x)
-    {
-        const int dr_start = 3 * (pre_atoms + atom_id);
-        const double dr_x = dr_part[dr_start] + mcell_pos_x;
-        const double dr_y = dr_part[dr_start + 1] + mcell_pos_y;
-        const double dr_z = dr_part[dr_start + 2] + mcell_pos_z;
-        double dist = sqrt(dr_x * dr_x + dr_y * dr_y + dr_z * dr_z);
-        const int atype = __ldg(atoms_type + pre_atoms + atom_id);
-        if(dist < rcut[atype])
-        {
-            if (dist < 1.0E-9)
-            {
-                dist += 1.0E-9;
-            }
-            // dr is different from that in interp_rho and interp_vl
-            double dr[3] = {dr_x, dr_y, dr_z};
-            double ylma[49];
-            double grly[49][3];
-            const int nwl = __ldg(ucell_atom_nwl + atype);
-            spherical_harmonics_d(dr, dist*dist, grly, nwl, ylma, ylmcoef);
-            int psi_idx = ((pre_atoms + atom_id) * bxyz + mcell_id) * nwmax;
-            interp_f(dist,
-                     delta_r,
-                     atype,
-                     nwmax,
-                     nr_max,
-                     atom_nw,
-                     atom_iw2_new,
-                     psi_u,
-                     ylma,
-                     atom_iw2_l,
-                     atom_iw2_ylm,
-                     vldr3_value,
-                     dr,
-                     grly,
-                     psi_idx,
-                     psi,
-                     dpsi,
-                     d2psi);
-        }
-    }
-}
-
-
-__global__ void dot_product_stress(const double* d2psi,
-                                   const double* psi_dm,
-                                   const int size,
-                                   double* stress)
-{
-    __shared__ double cache[32 * 6];
-    const int tid = threadIdx.x;
-    const int stride = blockDim.x * gridDim.x;
-    const int warp_id = tid / 32;
-    const int lane_id = tid % 32;
-    double tmp[6] = {0.0};
-    for(int id = threadIdx.x + blockIdx.x * blockDim.x; id < size; id += stride)
-    {   
-        const double psi_dm_2 = psi_dm[id] * 2;
-        const int id_stress = id * 6;
-        tmp[0] += d2psi[id_stress] * psi_dm_2;
-        tmp[1] += d2psi[id_stress + 1] * psi_dm_2;
-        tmp[2] += d2psi[id_stress + 2] * psi_dm_2;
-        tmp[3] += d2psi[id_stress + 3] * psi_dm_2;
-        tmp[4] += d2psi[id_stress + 4] * psi_dm_2;
-        tmp[5] += d2psi[id_stress + 5] * psi_dm_2;
-    }
-
-    for(int i = 0; i<6; i++)
-    {
-        tmp[i] = warpReduceSum(tmp[i]);
-    }
-
-    if (lane_id == 0)
-    {
-        for (int i = 0; i < 6; i++)
-        {
-            cache[warp_id * 6 + i] = tmp[i];
-        }
-    }
-    __syncthreads();
-
-    for (int i = 0; i < 6; i++)
-    {
-        tmp[i] = (tid < blockDim.x / 32) ? cache[tid * 6 + i] : 0;
-    }
-
-    if(warp_id == 0)
-    {
-        for (int i = 0; i < 6; i++)
-        {
-            tmp[i] = warpReduceSum(tmp[i]);
-        }
-    }
-
-    if (tid == 0)
-    {
-        for (int i = 0; i < 6; i++)
-        {
-            atomicAdd(&stress[i], tmp[i]); // Use atomicAdd() instead of atomic_add().
-        }
-    }
-}
-
-
-__global__ void dot_product_force(const int bxyz,
-                                  const int nwmax,
-                                  const int *atoms_num_info,
-                                  const int *iat_on_nbz,
-                                  const double* dpsi,
-                                  const double* psi_dm,
-                                  double* force)
-{
-    __shared__ double cache[32 * 3];
-    const int tid = threadIdx.x;
-    const int bcell_id = blockIdx.x;
-    const int warp_id = tid / 32;
-    const int lane_id = tid % 32;
-    const int vec_size = bxyz * nwmax;
-    const int atom_num = atoms_num_info[2 * bcell_id];
-    const int pre_atoms = atoms_num_info[2 * bcell_id + 1];
-
-    for(int k = 0; k < atom_num; k++)
-    {
-        const int atom_id = pre_atoms + k;
-        const int offset = atom_id * vec_size;
-        const int iat = iat_on_nbz[atom_id];
-        double force_iat[3] = {0.0};
-
-        for(int i =tid; i < vec_size; i += blockDim.x)
-        {
-            int psi_offset = offset + i;
-            double psi_dm_2 = psi_dm[psi_offset] * 2;
-            force_iat[0] += dpsi[psi_offset * 3] * psi_dm_2;
-            force_iat[1] += dpsi[psi_offset * 3 + 1] * psi_dm_2;
-            force_iat[2] += dpsi[psi_offset * 3 + 2] * psi_dm_2;
-        }
-
-        for (int i = 0; i < 3; i++)
-        {
-            force_iat[i] = warpReduceSum(force_iat[i]);
-        }
-
-        if (lane_id == 0)
-        {
-            for (int i = 0; i < 3; i++)
-            {
-                cache[warp_id * 3 + i] = force_iat[i];
-            }
-        }
-        __syncthreads();
-
-        for (int i = 0; i < 3; i++)
-        {
-            force_iat[i] = (tid < blockDim.x / 32) ? cache[tid * 3 + i] : 0;
-        }
-
-        if (warp_id == 0)
-        {
-            for (int i = 0; i < 3; i++)
-            {
-                force_iat[i] = warpReduceSum(force_iat[i]);
-            }
-        }
-
-        if (tid == 0)
-        {
-            for (int i = 0; i < 3; i++)
-            {
-                atomicAdd(&force[iat * 3 + i], force_iat[i]);
-            }
-        }
-    }
-}
-
-} // namespace GintKernel
diff --git a/source/source_lcao/module_gint/kernels/cuda/gint_force.cuh b/source/source_lcao/module_gint/kernels/cuda/gint_force.cuh
deleted file mode 100644
index 74b941f32a..0000000000
--- a/source/source_lcao/module_gint/kernels/cuda/gint_force.cuh
+++ /dev/null
@@ -1,46 +0,0 @@
-#ifndef GINT_FORCE_CUH
-#define GINT_FORCE_CUH
-
-#include <cuda_runtime.h>
-#include <cstdint>
-namespace GintKernel
-{
-
-__global__ void get_psi_force(double* ylmcoef,
-                              double delta_r,
-                              int bxyz,
-                              const int nwmax,
-                              const int max_atom,
-                              const int* const ucell_atom_nwl,
-                              const bool* const atom_iw2_new,
-                              const int* const atom_iw2_ylm,
-                              const int* const atom_iw2_l,
-                              const int* const atom_nw,
-                              const double* const rcut,
-                              const int nr_max,
-                              const double* const psi_u,
-                              const double* const mcell_pos,
-                              const double* const dr_part,
-                              const double* const vldr3,
-                              const uint8_t* const atoms_type,
-                              const int* const atoms_num_info,
-                              double* psi,
-                              double* dpsi,
-                              double* d2psi);
-
-
-__global__ void dot_product_stress(const double* d2psi,
-                                   const double* psi_dm,
-                                   const int size,
-                                   double* stress);
-
-__global__ void dot_product_force(const int bxyz,
-                                  const int nwmax,
-                                  const int *atoms_num_info,
-                                  const int *iat_on_nbz,
-                                  const double* dpsi,
-                                  const double* psi_dm,
-                                  double* force);
-
-} // namespace GintKernel
-#endif // GINT_VL_CUH
diff --git a/source/source_lcao/module_gint/kernels/cuda/gint_rho.cu b/source/source_lcao/module_gint/kernels/cuda/gint_rho.cu
deleted file mode 100644
index 6b4069c40b..0000000000
--- a/source/source_lcao/module_gint/kernels/cuda/gint_rho.cu
+++ /dev/null
@@ -1,130 +0,0 @@
-#include "interp.cuh"
-#include "gint_rho.cuh"
-#include "sph.cuh"
-#include "cuda_tools.cuh"
-
-namespace GintKernel
-{
-__inline__ __device__ double warpReduceSum(double val)
-{   
-    val += __shfl_xor_sync(0xffffffff, val, 16, 32);
-    val += __shfl_xor_sync(0xffffffff, val, 8, 32);
-    val += __shfl_xor_sync(0xffffffff, val, 4, 32);
-    val += __shfl_xor_sync(0xffffffff, val, 2, 32);
-    val += __shfl_xor_sync(0xffffffff, val, 1, 32);
-    return val;
-}
-
-
-/*
-    each block calculates the wavefunction on a meshcell,
-    and each thread loops over the atoms on a meshcell.
-*/
-__global__ void get_psi(const double* const ylmcoef,
-                        const double delta_r,
-                        const int bxyz,
-                        const int nwmax,
-                        const int max_atom,
-                        const int* const ucell_atom_nwl,
-                        const bool* const atom_iw2_new,
-                        const int* const atom_iw2_ylm,
-                        const int* const atom_nw,
-                        const double* const rcut,
-                        const int nr_max,
-                        const double* const psi_u,
-                        const double* const mcell_pos,
-                        const double* const dr_part,
-                        const uint8_t* const atoms_type,
-                        const int* const atoms_num_info,
-                        double* psi)
-{
-    const int bcell_id = blockIdx.x;
-    const int num_atoms = atoms_num_info[2 * bcell_id];
-    const int pre_atoms = atoms_num_info[2 * bcell_id + 1];
-    const int mcell_id = blockIdx.y;
-    const double mcell_pos_x = mcell_pos[3 * mcell_id];
-    const double mcell_pos_y = mcell_pos[3 * mcell_id + 1];
-    const double mcell_pos_z = mcell_pos[3 * mcell_id + 2];
-
-    for(int atom_id = threadIdx.x; atom_id < num_atoms; atom_id += blockDim.x)
-    {
-        const int aid = pre_atoms + atom_id;
-        const double dr_x = dr_part[aid * 3] + mcell_pos_x;
-        const double dr_y = dr_part[aid * 3 + 1] + mcell_pos_y;
-        const double dr_z = dr_part[aid * 3 + 2] + mcell_pos_z;
-        double dist = sqrt(dr_x * dr_x + dr_y * dr_y + dr_z * dr_z);
-        const int atype = __ldg(atoms_type + aid);
-        if(dist < rcut[atype])
-        {
-            if (dist < 1.0E-9)
-            {
-                dist += 1.0E-9;
-            }
-            double dr[3] = {dr_x / dist, dr_y / dist, dr_z / dist};
-            double ylma[49];
-            const int nwl = __ldg(ucell_atom_nwl + atype);
-            int psi_idx = (pre_atoms * bxyz + mcell_id * num_atoms + atom_id) * nwmax;
-            spherical_harmonics(dr, nwl, ylma, ylmcoef);
-            interp_rho(dist,
-                       delta_r,
-                       atype,
-                       nwmax,
-                       nr_max,
-                       atom_nw,
-                       atom_iw2_new,
-                       psi_u,
-                       ylma,
-                       atom_iw2_ylm,
-                       psi,
-                       psi_idx);
-        }
-    }
-}
-
-/*
-    Each block calculates the dot product on a meshcell,
-    and each thread loops over the wavefunction of atoms on a meshcell.
-*/
-__global__ void psir_dot(const int bxyz,
-                         const int nwmax,
-                         const int* atoms_num_info,
-                         const double* __restrict__ vec_a_g,
-                         const double* __restrict__  vec_b_g,
-                         double** results_g)
-{
-    __shared__ double s_data[32];
-    const int tid = threadIdx.x;
-    const int bcell_id = blockIdx.x;
-    const int mcell_id = blockIdx.y;
-    const int vec_size = atoms_num_info[2 * bcell_id] * nwmax;
-    const int offset = atoms_num_info[2 * bcell_id + 1] * nwmax * bxyz + mcell_id * vec_size;
-    const double* vec_a_mcell = vec_a_g + offset;
-    const double* vec_b_mcell = vec_b_g + offset;
-    const int warp_id = tid / 32;
-    const int lane_id = tid % 32;
-    double mySum = 0;
-
-    for (int k = tid; k < vec_size; k += blockDim.x)
-    {
-        mySum += vec_a_mcell[k] * vec_b_mcell[k];
-    }    
-
-    mySum = warpReduceSum(mySum);
-
-    if (lane_id == 0)
-    {
-        s_data[warp_id] = mySum;
-    }
-    __syncthreads();
-
-    mySum = (tid < blockDim.x / 32) ? s_data[tid] : 0;
-    if (warp_id == 0)
-    {
-        mySum = warpReduceSum(mySum);
-    }
-
-    if (tid == 0) {
-        *results_g[bcell_id*bxyz + mcell_id] = mySum;
-    }
-}
-} // namespace GintKernel
\ No newline at end of file
diff --git a/source/source_lcao/module_gint/kernels/cuda/gint_rho.cuh b/source/source_lcao/module_gint/kernels/cuda/gint_rho.cuh
deleted file mode 100644
index 70cbbb7692..0000000000
--- a/source/source_lcao/module_gint/kernels/cuda/gint_rho.cuh
+++ /dev/null
@@ -1,41 +0,0 @@
-#ifndef GINT_RHO_CUH
-#define GINT_RHO_CUH
-
-#include <cuda_runtime.h>
-#include <cstdint>
-namespace GintKernel
-{
-
-/**
- * @brief CUDA kernel to calculate psir.
- *
- * This kernel calculates the wave function psi using the provided input
- * parameters.
- */
-__global__ void get_psi(const double* const ylmcoef,
-                        const double delta_r,
-                        const int bxyz,
-                        const int nwmax,
-                        const int max_atom,
-                        const int* const ucell_atom_nwl,
-                        const bool* const atom_iw2_new,
-                        const int* const atom_iw2_ylm,
-                        const int* const atom_nw,
-                        const double* const rcut,
-                        const int nr_max,
-                        const double* const psi_u,
-                        const double* const mcell_pos,
-                        const double* const dr_part,
-                        const uint8_t* const atoms_type,
-                        const int* const atoms_num_info,
-                        double* psi);
-
-__global__ void psir_dot(const int bxyz,
-                         const int nwmax,
-                         const int* atoms_num_info,
-                         const double* __restrict__ vec_a_g,
-                         const double* __restrict__  vec_b_g,
-                         double** results_g);
-
-} // namespace GintKernel
-#endif // GINT_RHO_CUH
\ No newline at end of file
diff --git a/source/source_lcao/module_gint/kernels/cuda/gint_vl.cu b/source/source_lcao/module_gint/kernels/cuda/gint_vl.cu
deleted file mode 100644
index 3b92455e60..0000000000
--- a/source/source_lcao/module_gint/kernels/cuda/gint_vl.cu
+++ /dev/null
@@ -1,75 +0,0 @@
-#include "gint_vl.cuh"
-#include "interp.cuh"
-#include "cuda_tools.cuh"
-#include "sph.cuh"
-namespace GintKernel
-{
-
-__global__ void get_psi_and_vldr3(const double* const ylmcoef,
-                                  const double delta_r,
-                                  const int bxyz,
-                                  const double nwmax,
-                                  const double max_atom,
-                                  const int* const ucell_atom_nwl,
-                                  const bool* const atom_iw2_new,
-                                  const int* const atom_iw2_ylm,
-                                  const int* const atom_nw,
-                                  const double* const rcut,
-                                  const int nr_max,
-                                  const double* const psi_u,
-                                  const double* const mcell_pos,
-                                  const double* const dr_part,
-                                  const double* const vldr3,
-                                  const uint8_t* const atoms_type,
-                                  const int* const atoms_num_info,
-                                  double* psi,
-                                  double* psi_vldr3)
-{
-    const int bcell_id = blockIdx.x;
-    const int num_atoms = atoms_num_info[2 * bcell_id];
-    const int pre_atoms = atoms_num_info[2 * bcell_id + 1];
-    const int mcell_id = blockIdx.y;
-    const double vldr3_value = vldr3[bcell_id * bxyz + mcell_id];
-    const double mcell_pos_x = mcell_pos[3 * mcell_id];
-    const double mcell_pos_y = mcell_pos[3 * mcell_id + 1];
-    const double mcell_pos_z = mcell_pos[3 * mcell_id + 2];
-
-    for(int atom_id = threadIdx.x; atom_id < num_atoms; atom_id += blockDim.x)
-    {
-        const int dr_start = 3 * (pre_atoms + atom_id);
-        const double dr_x = dr_part[dr_start] + mcell_pos_x;
-        const double dr_y = dr_part[dr_start + 1] + mcell_pos_y;
-        const double dr_z = dr_part[dr_start + 2] + mcell_pos_z;
-        double dist = sqrt(dr_x * dr_x + dr_y * dr_y + dr_z * dr_z);
-        const int atype = __ldg(atoms_type + pre_atoms + atom_id);
-        if(dist < rcut[atype])
-        {
-            if (dist < 1.0E-9)
-            {
-                dist += 1.0E-9;
-            }
-            double dr[3] = {dr_x / dist, dr_y / dist, dr_z / dist};
-            double ylma[49];
-            const int nwl = __ldg(ucell_atom_nwl + atype);
-            spherical_harmonics(dr, nwl, ylma, ylmcoef);
-            int psi_idx = (bcell_id * max_atom + atom_id) * bxyz * nwmax + mcell_id;
-            interp_vl(dist,
-                      delta_r,
-                      atype,
-                      nwmax,
-                      bxyz,
-                      nr_max,
-                      atom_nw,
-                      atom_iw2_new,
-                      psi_u,
-                      ylma,
-                      atom_iw2_ylm,
-                      vldr3_value,
-                      psi,
-                      psi_vldr3,
-                      psi_idx);
-        }
-    }
-}
-
-} // namespace GintKernel
\ No newline at end of file
diff --git a/source/source_lcao/module_gint/kernels/cuda/gint_vl.cuh b/source/source_lcao/module_gint/kernels/cuda/gint_vl.cuh
deleted file mode 100644
index ada7954968..0000000000
--- a/source/source_lcao/module_gint/kernels/cuda/gint_vl.cuh
+++ /dev/null
@@ -1,40 +0,0 @@
-#ifndef GINT_VL_CUH
-#define GINT_VL_CUH
-
-#include <cuda_runtime.h>
-#include <cstdint>
-namespace GintKernel
-{
-/*
- * @brief: get the value of the spherical harmonics
- *
- *
- * @note the left and right matrix elements of the grid point integral.
- * We can understand the grid point integral of the local potential term
- * as the following operation:
- * H = psi * vlocal * psi * dr^3.
- * Here, the matrix element of the left matrix is psi, and the matrix
- * element of the right matrix is vlocal * psi * dr^3.
- */
-__global__ void get_psi_and_vldr3(const double* const ylmcoef,
-                                  const double delta_r,
-                                  const int bxyz,
-                                  const double nwmax,
-                                  const double max_atom,
-                                  const int* const ucell_atom_nwl,
-                                  const bool* const atom_iw2_new,
-                                  const int* const atom_iw2_ylm,
-                                  const int* const atom_nw,
-                                  const double* const rcut,
-                                  const int nr_max,
-                                  const double* const psi_u,
-                                  const double* const mcell_pos,
-                                  const double* const dr_part,
-                                  const double* const vldr3,
-                                  const uint8_t* const atoms_type,
-                                  const int* const atoms_num_info,
-                                  double* psi,
-                                  double* psi_vldr3);
-
-} // namespace GintKernel
-#endif // GINT_VL_CUH
\ No newline at end of file
diff --git a/source/source_lcao/module_gint/kernels/cuda/interp.cuh b/source/source_lcao/module_gint/kernels/cuda/interp.cuh
deleted file mode 100644
index 31ccf3ca2c..0000000000
--- a/source/source_lcao/module_gint/kernels/cuda/interp.cuh
+++ /dev/null
@@ -1,204 +0,0 @@
-#ifndef INTERP_CUH
-#define INTERP_CUH
-
-#include <cuda_runtime.h>
-
-namespace GintKernel
-{
-// if exponent is an integer between 0 and 5 (the most common cases in gint),
-// pow_int is much faster than std::pow
-static __device__ double pow_int(double base, int exp)
-{
-    switch (exp)
-    {
-    case 0:
-        return 1.0;
-    case 1:
-        return base;
-    case 2:
-        return base * base;
-    case 3:
-        return base * base * base;
-    case 4:
-        return base * base * base * base;
-    case 5:
-        return base * base * base * base * base;
-    default:
-        double result = pow(base, exp);
-        return result;      
-    }
-}
-
-static __device__ void interp_rho(const double dist,
-                                  const double delta_r,
-                                  const int atype,
-                                  const double nwmax,
-                                  const int nr_max,
-                                  const int* __restrict__ atom_nw,
-                                  const bool* __restrict__ atom_iw2_new,
-                                  const double* __restrict__ psi_u,
-                                  const double ylma[49],
-                                  const int* __restrict__ atom_iw2_ylm,
-                                  double* psi,
-                                  int psi_idx)
-{
-    const double distance = dist / delta_r;
-
-    const int ip = (int)(distance);
-    const double dx = distance - ip;
-    const double dx2 = dx * dx;
-    const double dx3 = dx2 * dx;
-
-    const double c3 = 3.0 * dx2 - 2.0 * dx3;
-    const double c1 = 1.0 - c3;
-    const double c2 = (dx - 2.0 * dx2 + dx3) * delta_r;
-    const double c4 = (dx3 - dx2) * delta_r;
-
-    double phi = 0.0;
-    const int it_nw = atype * nwmax;
-    int iw_nr = (it_nw * nr_max + ip) * 2;
-    int it_nw_iw = it_nw;
-    for (int iw = 0; iw < atom_nw[atype]; ++iw)
-    {
-        if (atom_iw2_new[it_nw_iw])
-        {
-            phi = c1 * psi_u[iw_nr] + c2 * psi_u[iw_nr + 1]
-                  + c3 * psi_u[iw_nr + 2] + c4 * psi_u[iw_nr + 3];
-        }
-        psi[psi_idx] = phi * ylma[atom_iw2_ylm[it_nw_iw]];
-        psi_idx += 1;
-        iw_nr += 2 * nr_max;
-        it_nw_iw++;
-    }
-}
-
-static __device__ void interp_vl(const double dist,
-                                 const double delta_r,
-                                 const int atype,
-                                 const double nwmax,
-                                 const int bxyz,
-                                 const int nr_max,
-                                 const int* __restrict__ atom_nw,
-                                 const bool* __restrict__ atom_iw2_new,
-                                 const double* __restrict__ psi_u,
-                                 const double ylma[49],
-                                 const int* __restrict__ atom_iw2_ylm,
-                                 const double vldr3_value,
-                                 double* psi,
-                                 double* psi_vldr3,
-                                 int psi_idx)
-{
-    const double distance = dist / delta_r;
-
-    const int ip = (int)(distance);
-    const double dx = distance - ip;
-    const double dx2 = dx * dx;
-    const double dx3 = dx2 * dx;
-
-    const double c3 = 3.0 * dx2 - 2.0 * dx3;
-    const double c1 = 1.0 - c3;
-    const double c2 = (dx - 2.0 * dx2 + dx3) * delta_r;
-    const double c4 = (dx3 - dx2) * delta_r;
-
-    double phi = 0.0;
-    const int it_nw = atype * nwmax;
-    int iw_nr = (it_nw * nr_max + ip) * 2;
-    int it_nw_iw = it_nw;
-    for (int iw = 0; iw < atom_nw[atype]; ++iw)
-    {
-        if (atom_iw2_new[it_nw_iw])
-        {
-            phi = c1 * psi_u[iw_nr] + c2 * psi_u[iw_nr + 1]
-                  + c3 * psi_u[iw_nr + 2] + c4 * psi_u[iw_nr + 3];
-        }
-        psi[psi_idx] = phi * ylma[atom_iw2_ylm[it_nw_iw]];
-        psi_vldr3[psi_idx] = psi[psi_idx] * vldr3_value;
-        psi_idx += bxyz;
-        iw_nr += 2 * nr_max;
-        it_nw_iw++;
-    }
-}
-
-static __device__ void interp_f(const double dist,
-                                const double delta_r,
-                                const int atype,
-                                const double nwmax,
-                                const int nr_max,
-                                const int* __restrict__ atom_nw,
-                                const bool* __restrict__ atom_iw2_new,
-                                const double* __restrict__ psi_u,
-                                const double ylma[49],
-                                const int* __restrict__ atom_iw2_l,
-                                const int* __restrict__ atom_iw2_ylm,
-                                const double vldr3_value,
-                                const double * __restrict__ dr,
-                                const double grly[49][3],
-                                int psi_idx,
-                                double* psi,
-                                double* dpsi,
-                                double* d2psi)
-{
-    // Calculate normalized position for interpolation
-    const double postion = dist / delta_r;
-    // Extract integer part and fractional part of the position
-    const double ip = static_cast<int>(postion);
-    const double x0 = postion - ip;
-    const double x1 = 1.0 - x0;
-    const double x2 = 2.0 - x0;
-    const double x3 = 3.0 - x0;
-    const double x12 = x1 * x2 / 6;
-    const double x03 = x0 * x3 / 2;
-    // Temporary variables for interpolation
-    double tmp = 0.0;
-    double dtmp = 0.0;
-    // Loop over non-zero elements in atom_nw array
-    const int it_nw = atype * nwmax;
-    int iw_nr = (it_nw * nr_max + ip) * 2;
-    int it_nw_iw = it_nw;
-    for (int iw = 0; iw < atom_nw[atype]; ++iw)
-    {
-        if (atom_iw2_new[it_nw_iw])
-        {
-            // Perform interpolation using cubic B-spline
-            // basis functions
-            tmp = x12 * (psi_u[iw_nr] * x3 + psi_u[iw_nr + 6] * x0)
-                  + x03 * (psi_u[iw_nr + 2] * x2 - psi_u[iw_nr + 4] * x1);
-            dtmp = x12 * (psi_u[iw_nr + 1] * x3 + psi_u[iw_nr + 7] * x0)
-                   + x03 * (psi_u[iw_nr + 3] * x2 - psi_u[iw_nr + 5] * x1);
-        }
-        // Extract information from atom_iw2_* arrays
-        const int ll = atom_iw2_l[it_nw_iw];
-        const int idx_lm = atom_iw2_ylm[it_nw_iw];
-        const double rl = pow_int(dist, ll);
-        const double rl_r = 1.0 / rl;
-        const double dist_r = 1 / dist;
-        const int dpsi_idx = psi_idx * 3;
-        const int d2psi_idx = psi_idx * 6;
-        // Compute derivatives with respect to spatial
-        // coordinates
-        const double tmpdphi_rly
-            = (dtmp - tmp * ll * dist_r) * rl_r * ylma[idx_lm] * dist_r;
-        const double tmprl = tmp * rl_r;
-        const double dpsirx = tmpdphi_rly * dr[0] + tmprl * grly[idx_lm][0];
-        const double dpsiry = tmpdphi_rly * dr[1] + tmprl * grly[idx_lm][1];
-        const double dpsirz = tmpdphi_rly * dr[2] + tmprl * grly[idx_lm][2];
-
-        psi[psi_idx] = tmprl * ylma[idx_lm] * vldr3_value;
-        dpsi[dpsi_idx] = dpsirx;
-        dpsi[dpsi_idx + 1] = dpsiry;
-        dpsi[dpsi_idx + 2] = dpsirz;
-        d2psi[d2psi_idx] = dpsirx * dr[0];
-        d2psi[d2psi_idx + 1] = dpsirx * dr[1];
-        d2psi[d2psi_idx + 2] = dpsirx * dr[2];
-        d2psi[d2psi_idx + 3] = dpsiry * dr[1];
-        d2psi[d2psi_idx + 4] = dpsiry * dr[2];
-        d2psi[d2psi_idx + 5] = dpsirz * dr[2];
-        // Update loop counters and indices
-        psi_idx += 1;
-        iw_nr += 2 * nr_max;
-        it_nw_iw++;
-    }
-}
-} // namespace GintKernel
-
-#endif
\ No newline at end of file
diff --git a/source/source_lcao/module_gint/kernels/cuda/sph.cuh b/source/source_lcao/module_gint/kernels/cuda/sph.cuh
deleted file mode 100644
index fec963d9fd..0000000000
--- a/source/source_lcao/module_gint/kernels/cuda/sph.cuh
+++ /dev/null
@@ -1,519 +0,0 @@
-#ifndef SPH_CUH
-#define SPH_CUH
-
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-
-namespace GintKernel
-{
-
-static __device__ void spherical_harmonics(const double* const dr,
-                                           const int nwl,
-                                           double (&ylma)[49],
-                                           const double* const ylmcoef)
-{
-    /***************************
-    L = 0
-    ***************************/
-    ylma[0] = ylmcoef[0]; // l=0, m=0
-    double tmp0;
-    if (nwl == 0)
-        return;
-
-    /***************************
-    L = 1
-    ***************************/
-    ylma[1] = ylmcoef[1] * dr[2];  // l=1, m=0
-    ylma[2] = -ylmcoef[1] * dr[0]; // l=1, m=1
-    ylma[3] = -ylmcoef[1] * dr[1]; // l=1, m=-1
-    if (nwl == 1)
-        return;
-
-    /***************************
-    L = 2
-    ***************************/
-    tmp0=ylmcoef[3] * ylma[0];
-    ylma[4] = ylmcoef[2] * dr[2] * ylma[1] - tmp0 ; // l=2, m=0
-    tmp0 = ylmcoef[4] * dr[2];
-    ylma[5] = tmp0 * ylma[2]; // l=2,m=1
-    ylma[6] = tmp0 * ylma[3]; // l=2,m=-1
-
-    tmp0 = ylmcoef[4] * dr[0];
-    ylma[7] = ylmcoef[5] * ylma[4] - ylmcoef[6] * ylma[0]
-              - tmp0 * ylma[2]; // l=2,m=2
-    ylma[8] = -tmp0 * ylma[3];
-    if (nwl == 2)
-        return;
-
-    /***************************
-    L = 3
-    ***************************/
-    tmp0=ylmcoef[8] * ylma[1];
-    ylma[9] = ylmcoef[7] * dr[2] * ylma[4] - tmp0; // l=3, m=0
-
-    tmp0 = ylmcoef[9] * dr[2];
-    ylma[10] = tmp0 * ylma[5] - ylmcoef[10] * ylma[2]; // l=3,m=1
-    ylma[11] = tmp0 * ylma[6] - ylmcoef[10] * ylma[3]; // l=3,m=-1
-
-    tmp0 = ylmcoef[11] * dr[2];
-    ylma[12] = tmp0 * ylma[7]; // l=3,m=2
-    ylma[13] = tmp0 * ylma[8]; // l=3,m=-2
-
-    tmp0 = ylmcoef[14] * dr[0];
-    ylma[14] = ylmcoef[12] * ylma[10] - ylmcoef[13] * ylma[2]
-               - tmp0 * ylma[7]; // l=3,m=3
-    ylma[15] = ylmcoef[12] * ylma[11] - ylmcoef[13] * ylma[3]
-               - tmp0 * ylma[8]; // l=3,m=-3
-    if (nwl == 3)
-        return;
-
-    /***************************
-    L = 4
-    ***************************/
-    tmp0=ylmcoef[16] * ylma[4];
-    ylma[16] = ylmcoef[15] * dr[2] * ylma[9] - tmp0; // l=4,m=0
-
-    tmp0 = ylmcoef[17] * dr[2];
-    ylma[17] = tmp0 * ylma[10] - ylmcoef[18] * ylma[5]; // l=4,m=1
-    ylma[18] = tmp0 * ylma[11] - ylmcoef[18] * ylma[6]; // l=4,m=-1
-
-    tmp0 = ylmcoef[19] * dr[2];
-    ylma[19] = tmp0 * ylma[12] - ylmcoef[20] * ylma[7]; // l=4,m=2
-    ylma[20] = tmp0 * ylma[13] - ylmcoef[20] * ylma[8]; // l=4,m=-2
-
-    tmp0 = 3.0 * dr[2];
-    ylma[21] = tmp0 * ylma[14]; // l=4,m=3
-    ylma[22] = tmp0 * ylma[15]; // l=4,m=-3
-
-    tmp0 = ylmcoef[23] * dr[0];
-    ylma[23] = ylmcoef[21] * ylma[19] - ylmcoef[22] * ylma[7]
-               - tmp0 * ylma[14]; // l=4,m=4
-    ylma[24] = ylmcoef[21] * ylma[20] - ylmcoef[22] * ylma[8]
-               - tmp0 * ylma[15]; // l=4,m=-4
-    if (nwl == 4)
-        return;
-
-    /***************************
-    L = 5
-    ***************************/
-    tmp0=ylmcoef[25] * ylma[9];
-    ylma[25]
-        = ylmcoef[24] * dr[2] * ylma[16] - tmp0; // l=5,m=0
-
-    tmp0 = ylmcoef[26] * dr[2];
-    ylma[26] = tmp0 * ylma[17] - ylmcoef[27] * ylma[10]; // l=5,m=1
-    ylma[27] = tmp0 * ylma[18] - ylmcoef[27] * ylma[11]; // l=5,m=-1
-
-    tmp0 = ylmcoef[28] * dr[2];
-    ylma[28] = tmp0 * ylma[19] - ylmcoef[29] * ylma[12]; // l=5,m=2
-    ylma[29] = tmp0 * ylma[20] - ylmcoef[29] * ylma[13]; // l=5,m=-2
-
-    tmp0 = ylmcoef[30] * dr[2];
-    ylma[30] = tmp0 * ylma[21] - ylmcoef[31] * ylma[14]; // l=5,m=3
-    ylma[31] = tmp0 * ylma[22] - ylmcoef[31] * ylma[15]; // l=5,m=-3
-
-    tmp0 = ylmcoef[32] * dr[2];
-    ylma[32] = tmp0 * ylma[23]; // l=5,m=4
-    ylma[33] = tmp0 * ylma[24]; // l=5,m=-4
-
-    tmp0 = ylmcoef[35] * dr[0];
-    ylma[34] = ylmcoef[33] * ylma[30] - ylmcoef[34] * ylma[14]
-               - tmp0 * ylma[23]; // l=5,m=5
-    ylma[35] = ylmcoef[33] * ylma[31] - ylmcoef[34] * ylma[15]
-               - tmp0 * ylma[24]; // l=5,m=-5
-    if (nwl == 5)
-        return;
-    /*
-    // if nwl > 5
-    for (int il = 6; il <= nwl; il++)
-    {
-        int istart = il * il;
-        int istart1 = (il - 1) * (il - 1);
-        int istart2 = (il - 2) * (il - 2);
-
-        double fac2 = sqrt(4.0 * istart - 1.0);
-        double fac4 = sqrt(4.0 * istart1 - 1.0);
-
-        for (int im = 0; im < 2 * il - 1; im++)
-        {
-            int imm = (im + 1) / 2;
-            ylma[istart + im] = fac2 / sqrt((double)istart - imm * imm) * (dr[2]
-    * ylma[istart1 + im] - sqrt((double)istart1 - imm * imm) / fac4 *
-    ylma[istart2 + im]);
-        }
-
-        double bl1 = sqrt(2.0 * il / (2.0 * il + 1.0));
-        double bl2 = sqrt((2.0 * il - 2.0) / (2.0 * il - 1.0));
-        double bl3 = sqrt(2.0) / fac2;
-
-        ylma[istart + 2 * il - 1] = (bl3 * ylma[istart + 2 * il - 5] - bl2 *
-    ylma[istart2 + 2 * il - 5] - 2.0 * dr[0] * ylma[istart1 + 2 * il - 3]) /
-    bl1; ylma[istart + 2 * il] = (bl3 * ylma[istart + 2 * il - 4] - bl2 *
-    ylma[istart2 + 2 * il - 4] - 2.0 * dr[0] * ylma[istart1 + 2 * il - 2]) /
-    bl1;
-    }*/
-}
-
-static __device__ void spherical_harmonics_d(const double* const dr,
-                                             const double distance,
-                                             double (&grly)[49][3],
-                                             const int nwl,
-                                             double (&ylma)[49],
-                                             const double* const ylmcoef)
-{
-    double tmp0;
-    double tx = 2.0 * dr[0];
-    double ty = 2.0 * dr[1];
-    double tz = 2.0 * dr[2];
-    ylma[0] = ylmcoef[0]; // l=0, m=0
-    grly[0][0] = grly[0][1] = grly[0][2] = 0.0;
-    if (nwl == 0)
-        return;
-
-    /***************************
-    L = 1
-    ***************************/
-    ylma[1] = ylmcoef[1] * dr[2]; // l=1, m=0
-    grly[1][0] = grly[1][1] = 0.0;
-    grly[1][2] = ylmcoef[1];
-    ylma[2] = -ylmcoef[1] * dr[0]; // l=1, m=1
-    grly[2][1] = grly[2][2] = 0.0;
-    grly[2][0] = -ylmcoef[1];
-    ylma[3] = -ylmcoef[1] * dr[1]; // l=1, m=-1
-    grly[3][0] = grly[3][2] = 0.0;
-    grly[3][1] = -ylmcoef[1];
-    if (nwl == 1)
-        return;
-
-    /***************************
-    L = 2
-    ***************************/
-    ylma[4] = ylmcoef[2] * dr[2] * ylma[1]
-              - ylmcoef[3] * ylma[0] * distance; // l=2, m=0
-    grly[4][0]
-        = ylmcoef[2] * dr[2] * grly[1][0]
-          - ylmcoef[3] * (grly[0][0] * distance + ylma[0] * tx); // l=2, m=0
-    grly[4][1]
-        = ylmcoef[2] * dr[2] * grly[1][1]
-          - ylmcoef[3] * (grly[0][1] * distance + ylma[0] * ty); // l=2, m=0
-    grly[4][2]
-        = ylmcoef[2] * (dr[2] * grly[1][2] + ylma[1])
-          - ylmcoef[3] * (grly[0][2] * distance + ylma[0] * tz); // l=2, m=0
-
-    tmp0 = ylmcoef[4] * dr[2];
-    ylma[5] = tmp0 * ylma[2]; // l=2,m=1
-    grly[5][0] = tmp0 * grly[2][0];
-    grly[5][1] = tmp0 * grly[2][1];
-    grly[5][2] = ylmcoef[4] * (ylma[2] + dr[2] * grly[2][2]);
-
-    ylma[6] = tmp0 * ylma[3]; // l=2,m=-1
-    grly[6][0] = tmp0 * grly[3][0];
-    grly[6][1] = tmp0 * grly[3][1];
-    grly[6][2] = ylmcoef[4] * (ylma[3] + dr[2] * grly[3][2]);
-
-    tmp0 = ylmcoef[4] * dr[0];
-    ylma[7] = ylmcoef[5] * ylma[4] - ylmcoef[6] * ylma[0] * distance
-              - tmp0 * ylma[2]; // l=2,m=2
-    grly[7][0] = ylmcoef[5] * grly[4][0]
-                 - ylmcoef[6] * (ylma[0] * tx + grly[0][0] * distance)
-                 - ylmcoef[4] * (dr[0] * grly[2][0] + ylma[2]);
-    grly[7][1] = ylmcoef[5] * grly[4][1]
-                 - ylmcoef[6] * (ylma[0] * ty + grly[0][1] * distance)
-                 - tmp0 * grly[2][1];
-    grly[7][2] = ylmcoef[5] * grly[4][2]
-                 - ylmcoef[6] * (ylma[0] * tz + grly[0][2] * distance)
-                 - tmp0 * grly[2][2];
-
-    ylma[8] = -tmp0 * ylma[3];
-    grly[8][0] = -ylmcoef[4] * (ylma[3] + dr[0] * grly[3][0]);
-    grly[8][1] = -tmp0 * grly[3][1];
-    grly[8][2] = -tmp0 * grly[3][2];
-    if (nwl == 2)
-        return;
-
-    /***************************
-    L = 3
-    ***************************/
-    ylma[9] = ylmcoef[7] * dr[2] * ylma[4]
-              - ylmcoef[8] * ylma[1] * distance; // l=3, m=0
-    grly[9][0] = ylmcoef[7] * dr[2] * grly[4][0]
-                 - ylmcoef[8] * (ylma[1] * tx + grly[1][0] * distance);
-    grly[9][1] = ylmcoef[7] * dr[2] * grly[4][1]
-                 - ylmcoef[8] * (ylma[1] * ty + grly[1][1] * distance);
-    grly[9][2] = ylmcoef[7] * (ylma[4] + dr[2] * grly[4][2])
-                 - ylmcoef[8] * (ylma[1] * tz + grly[1][2] * distance);
-
-    tmp0 = ylmcoef[9] * dr[2];
-    ylma[10] = tmp0 * ylma[5] - ylmcoef[10] * ylma[2] * distance; // l=3,m=1
-    grly[10][0] = tmp0 * grly[5][0]
-                  - ylmcoef[10] * (grly[2][0] * distance + ylma[2] * tx);
-    grly[10][1] = tmp0 * grly[5][1]
-                  - ylmcoef[10] * (grly[2][1] * distance + ylma[2] * ty);
-    grly[10][2] = ylmcoef[9] * (dr[2] * grly[5][2] + ylma[5])
-                  - ylmcoef[10] * (grly[2][2] * distance + ylma[2] * tz);
-
-    ylma[11] = tmp0 * ylma[6] - ylmcoef[10] * ylma[3] * distance; // l=3,m=-1
-    grly[11][0] = tmp0 * grly[6][0]
-                  - ylmcoef[10] * (grly[3][0] * distance + ylma[3] * tx);
-    grly[11][1] = tmp0 * grly[6][1]
-                  - ylmcoef[10] * (grly[3][1] * distance + ylma[3] * ty);
-    grly[11][2] = ylmcoef[9] * (dr[2] * grly[6][2] + ylma[6])
-                  - ylmcoef[10] * (grly[3][2] * distance + ylma[3] * tz);
-
-    tmp0 = ylmcoef[11] * dr[2];
-    ylma[12] = tmp0 * ylma[7]; // l=3,m=2
-    grly[12][0] = tmp0 * grly[7][0];
-    grly[12][1] = tmp0 * grly[7][1];
-    grly[12][2] = ylmcoef[11] * (dr[2] * grly[7][2] + ylma[7]);
-
-    ylma[13] = tmp0 * ylma[8]; // l=3,m=-2
-    grly[13][0] = tmp0 * grly[8][0];
-    grly[13][1] = tmp0 * grly[8][1];
-    grly[13][2] = ylmcoef[11] * (dr[2] * grly[8][2] + ylma[8]);
-
-    tmp0 = ylmcoef[14] * dr[0];
-    ylma[14] = ylmcoef[12] * ylma[10] - ylmcoef[13] * ylma[2] * distance
-               - tmp0 * ylma[7]; // l=3,m=3
-    grly[14][0] = ylmcoef[12] * grly[10][0]
-                  - ylmcoef[13] * (ylma[2] * tx + grly[2][0] * distance)
-                  - ylmcoef[14] * (ylma[7] + dr[0] * grly[7][0]);
-    grly[14][1] = ylmcoef[12] * grly[10][1]
-                  - ylmcoef[13] * (ylma[2] * ty + grly[2][1] * distance)
-                  - tmp0 * grly[7][1];
-    grly[14][2] = ylmcoef[12] * grly[10][2]
-                  - ylmcoef[13] * (ylma[2] * tz + grly[2][2] * distance)
-                  - tmp0 * grly[7][2];
-
-    ylma[15] = ylmcoef[12] * ylma[11] - ylmcoef[13] * ylma[3] * distance
-               - tmp0 * ylma[8]; // l=3,m=-3
-    grly[15][0] = ylmcoef[12] * grly[11][0]
-                  - ylmcoef[13] * (ylma[3] * tx + grly[3][0] * distance)
-                  - ylmcoef[14] * (ylma[8] + dr[0] * grly[8][0]);
-    grly[15][1] = ylmcoef[12] * grly[11][1]
-                  - ylmcoef[13] * (ylma[3] * ty + grly[3][1] * distance)
-                  - tmp0 * grly[8][1];
-    grly[15][2] = ylmcoef[12] * grly[11][2]
-                  - ylmcoef[13] * (ylma[3] * tz + grly[3][2] * distance)
-                  - tmp0 * grly[8][2];
-    if (nwl == 3)
-        return;
-
-    /***************************
-    L = 4
-    ***************************/
-    ylma[16] = ylmcoef[15] * dr[2] * ylma[9]
-               - ylmcoef[16] * ylma[4] * distance; // l=4,m=0
-    grly[16][0] = ylmcoef[15] * dr[2] * grly[9][0]
-                  - ylmcoef[16] * (ylma[4] * tx + grly[4][0] * distance);
-    grly[16][1] = ylmcoef[15] * dr[2] * grly[9][1]
-                  - ylmcoef[16] * (ylma[4] * ty + grly[4][1] * distance);
-    grly[16][2] = ylmcoef[15] * (dr[2] * grly[9][2] + ylma[9])
-                  - ylmcoef[16] * (ylma[4] * tz + grly[4][2] * distance);
-
-    tmp0 = ylmcoef[17] * dr[2];
-    ylma[17] = tmp0 * ylma[10] - ylmcoef[18] * ylma[5] * distance; // l=4,m=1
-    grly[17][0] = tmp0 * grly[10][0]
-                  - ylmcoef[18] * (ylma[5] * tx + grly[5][0] * distance);
-    grly[17][1] = tmp0 * grly[10][1]
-                  - ylmcoef[18] * (ylma[5] * ty + grly[5][1] * distance);
-    grly[17][2] = ylmcoef[17] * (dr[2] * grly[10][2] + ylma[10])
-                  - ylmcoef[18] * (ylma[5] * tz + grly[5][2] * distance);
-
-    ylma[18] = tmp0 * ylma[11] - ylmcoef[18] * ylma[6] * distance; // l=4,m=-1
-    grly[18][0] = tmp0 * grly[11][0]
-                  - ylmcoef[18] * (ylma[6] * tx + grly[6][0] * distance);
-    grly[18][1] = tmp0 * grly[11][1]
-                  - ylmcoef[18] * (ylma[6] * ty + grly[6][1] * distance);
-    grly[18][2] = ylmcoef[17] * (dr[2] * grly[11][2] + ylma[11])
-                  - ylmcoef[18] * (ylma[6] * tz + grly[6][2] * distance);
-
-    tmp0 = ylmcoef[19] * dr[2];
-    ylma[19] = tmp0 * ylma[12] - ylmcoef[20] * ylma[7] * distance; // l=4,m=2
-    grly[19][0] = tmp0 * grly[12][0]
-                  - ylmcoef[20] * (ylma[7] * tx + grly[7][0] * distance);
-    grly[19][1] = tmp0 * grly[12][1]
-                  - ylmcoef[20] * (ylma[7] * ty + grly[7][1] * distance);
-    grly[19][2] = ylmcoef[19] * (dr[2] * grly[12][2] + ylma[12])
-                  - ylmcoef[20] * (ylma[7] * tz + grly[7][2] * distance);
-
-    ylma[20] = tmp0 * ylma[13] - ylmcoef[20] * ylma[8] * distance; // l=4,m=-2
-    grly[20][0] = tmp0 * grly[13][0]
-                  - ylmcoef[20] * (ylma[8] * tx + grly[8][0] * distance);
-    grly[20][1] = tmp0 * grly[13][1]
-                  - ylmcoef[20] * (ylma[8] * ty + grly[8][1] * distance);
-    grly[20][2] = ylmcoef[19] * (dr[2] * grly[13][2] + ylma[13])
-                  - ylmcoef[20] * (ylma[8] * tz + grly[8][2] * distance);
-
-    tmp0 = 3.0 * dr[2];
-    ylma[21] = tmp0 * ylma[14]; // l=4,m=3
-    grly[21][0] = tmp0 * grly[14][0];
-    grly[21][1] = tmp0 * grly[14][1];
-    grly[21][2] = 3.0 * (dr[2] * grly[14][2] + ylma[14]);
-
-    ylma[22] = tmp0 * ylma[15]; // l=4,m=-3
-    grly[22][0] = tmp0 * grly[15][0];
-    grly[22][1] = tmp0 * grly[15][1];
-    grly[22][2] = 3.0 * (dr[2] * grly[15][2] + ylma[15]);
-
-    tmp0 = ylmcoef[23] * dr[0];
-    ylma[23] = ylmcoef[21] * ylma[19] - ylmcoef[22] * ylma[7] * distance
-               - tmp0 * ylma[14]; // l=4,m=4
-    grly[23][0] = ylmcoef[21] * grly[19][0]
-                  - ylmcoef[22] * (ylma[7] * tx + grly[7][0] * distance)
-                  - ylmcoef[23] * (dr[0] * grly[14][0] + ylma[14]);
-    grly[23][1] = ylmcoef[21] * grly[19][1]
-                  - ylmcoef[22] * (ylma[7] * ty + grly[7][1] * distance)
-                  - tmp0 * grly[14][1];
-    grly[23][2] = ylmcoef[21] * grly[19][2]
-                  - ylmcoef[22] * (ylma[7] * tz + grly[7][2] * distance)
-                  - tmp0 * grly[14][2];
-
-    ylma[24] = ylmcoef[21] * ylma[20] - ylmcoef[22] * ylma[8] * distance
-               - tmp0 * ylma[15]; // l=4,m=-4
-    grly[24][0] = ylmcoef[21] * grly[20][0]
-                  - ylmcoef[22] * (ylma[8] * tx + grly[8][0] * distance)
-                  - ylmcoef[23] * (dr[0] * grly[15][0] + ylma[15]);
-    grly[24][1] = ylmcoef[21] * grly[20][1]
-                  - ylmcoef[22] * (ylma[8] * ty + grly[8][1] * distance)
-                  - tmp0 * grly[15][1];
-    grly[24][2] = ylmcoef[21] * grly[20][2]
-                  - ylmcoef[22] * (ylma[8] * tz + grly[8][2] * distance)
-                  - tmp0 * grly[15][2];
-    if (nwl == 4)
-        return;
-
-    /***************************
-    L = 5
-    ***************************/
-    ylma[25] = ylmcoef[24] * dr[2] * ylma[16]
-               - ylmcoef[25] * ylma[9] * distance; // l=5,m=0
-    grly[25][0] = ylmcoef[24] * dr[2] * grly[16][0]
-                  - ylmcoef[25] * (ylma[9] * tx + grly[9][0] * distance);
-    grly[25][1] = ylmcoef[24] * dr[2] * grly[16][1]
-                  - ylmcoef[25] * (ylma[9] * ty + grly[9][1] * distance);
-    grly[25][2] = ylmcoef[24] * (dr[2] * grly[16][2] + ylma[16])
-                  - ylmcoef[25] * (ylma[9] * tz + grly[9][2] * distance);
-
-    tmp0 = ylmcoef[26] * dr[2];
-    ylma[26] = tmp0 * ylma[17] - ylmcoef[27] * ylma[10] * distance; // l=5,m=1
-    grly[26][0] = tmp0 * grly[17][0]
-                  - ylmcoef[27] * (ylma[10] * tx + grly[10][0] * distance);
-    grly[26][1] = tmp0 * grly[17][1]
-                  - ylmcoef[27] * (ylma[10] * ty + grly[10][1] * distance);
-    grly[26][2] = ylmcoef[26] * (dr[2] * grly[17][2] + ylma[17])
-                  - ylmcoef[27] * (ylma[10] * tz + grly[10][2] * distance);
-
-    ylma[27] = tmp0 * ylma[18] - ylmcoef[27] * ylma[11] * distance; // l=5,m=-1
-    grly[27][0] = tmp0 * grly[18][0]
-                  - ylmcoef[27] * (ylma[11] * tx + grly[11][0] * distance);
-    grly[27][1] = tmp0 * grly[18][1]
-                  - ylmcoef[27] * (ylma[11] * ty + grly[11][1] * distance);
-    grly[27][2] = ylmcoef[26] * (dr[2] * grly[18][2] + ylma[18])
-                  - ylmcoef[27] * (ylma[11] * tz + grly[11][2] * distance);
-
-    tmp0 = ylmcoef[28] * dr[2];
-    ylma[28] = tmp0 * ylma[19] - ylmcoef[29] * ylma[12] * distance; // l=5,m=2
-    grly[28][0] = tmp0 * grly[19][0]
-                  - ylmcoef[29] * (ylma[12] * tx + grly[12][0] * distance);
-    grly[28][1] = tmp0 * grly[19][1]
-                  - ylmcoef[29] * (ylma[12] * ty + grly[12][1] * distance);
-    grly[28][2] = ylmcoef[28] * (dr[2] * grly[19][2] + ylma[19])
-                  - ylmcoef[29] * (ylma[12] * tz + grly[12][2] * distance);
-
-    ylma[29] = tmp0 * ylma[20] - ylmcoef[29] * ylma[13] * distance; // l=5,m=-2
-    grly[29][0] = tmp0 * grly[20][0]
-                  - ylmcoef[29] * (ylma[13] * tx + grly[13][0] * distance);
-    grly[29][1] = tmp0 * grly[20][1]
-                  - ylmcoef[29] * (ylma[13] * ty + grly[13][1] * distance);
-    grly[29][2] = ylmcoef[28] * (dr[2] * grly[20][2] + ylma[20])
-                  - ylmcoef[29] * (ylma[13] * tz + grly[13][2] * distance);
-
-    tmp0 = ylmcoef[30] * dr[2];
-    ylma[30] = tmp0 * ylma[21] - ylmcoef[31] * ylma[14] * distance; // l=5,m=3
-    grly[30][0] = tmp0 * grly[21][0]
-                  - ylmcoef[31] * (grly[14][0] * distance + ylma[14] * tx);
-    grly[30][1] = tmp0 * grly[21][1]
-                  - ylmcoef[31] * (grly[14][1] * distance + ylma[14] * ty);
-    grly[30][2] = ylmcoef[30] * (dr[2] * grly[21][2] + ylma[21])
-                  - ylmcoef[31] * (ylma[14] * tz + grly[14][2] * distance);
-
-    ylma[31] = tmp0 * ylma[22] - ylmcoef[31] * ylma[15] * distance; // l=5,m=-3
-    grly[31][0] = tmp0 * grly[22][0]
-                  - ylmcoef[31] * (grly[15][0] * distance + ylma[15] * tx);
-    grly[31][1] = tmp0 * grly[22][1]
-                  - ylmcoef[31] * (grly[15][1] * distance + ylma[15] * ty);
-    grly[31][2] = ylmcoef[30] * (dr[2] * grly[22][2] + ylma[22])
-                  - ylmcoef[31] * (ylma[15] * tz + grly[15][2] * distance);
-
-    tmp0 = ylmcoef[32] * dr[2];
-    ylma[32] = tmp0 * ylma[23]; // l=5,m=4
-    grly[32][0] = tmp0 * grly[23][0];
-    grly[32][1] = tmp0 * grly[23][1];
-    grly[32][2] = ylmcoef[32] * (ylma[23] + dr[2] * grly[23][2]);
-
-    ylma[33] = tmp0 * ylma[24]; // l=5,m=-4
-    grly[33][0] = tmp0 * grly[24][0];
-    grly[33][1] = tmp0 * grly[24][1];
-    grly[33][2] = ylmcoef[32] * (ylma[24] + dr[2] * grly[24][2]);
-
-    tmp0 = ylmcoef[35] * dr[0];
-    ylma[34] = ylmcoef[33] * ylma[30] - ylmcoef[34] * ylma[14] * distance
-               - tmp0 * ylma[23]; // l=5,m=5
-    grly[34][0] = ylmcoef[33] * grly[30][0]
-                  - ylmcoef[34] * (ylma[14] * tx + grly[14][0] * distance)
-                  - ylmcoef[35] * (dr[0] * grly[23][0] + ylma[23]);
-    grly[34][1] = ylmcoef[33] * grly[30][1]
-                  - ylmcoef[34] * (ylma[14] * ty + grly[14][1] * distance)
-                  - tmp0 * grly[23][1];
-    grly[34][2] = ylmcoef[33] * grly[30][2]
-                  - ylmcoef[34] * (ylma[14] * tz + grly[14][2] * distance)
-                  - tmp0 * grly[23][2];
-
-    ylma[35] = ylmcoef[33] * ylma[31] - ylmcoef[34] * ylma[15] * distance
-               - tmp0 * ylma[24]; // l=5,m=-5
-    grly[35][0] = ylmcoef[33] * grly[31][0]
-                  - ylmcoef[34] * (ylma[15] * tx + grly[15][0] * distance)
-                  - ylmcoef[35] * (dr[0] * grly[24][0] + ylma[24]);
-    grly[35][1] = ylmcoef[33] * grly[31][1]
-                  - ylmcoef[34] * (ylma[15] * ty + grly[15][1] * distance)
-                  - tmp0 * grly[24][1];
-    grly[35][2] = ylmcoef[33] * grly[31][2]
-                  - ylmcoef[34] * (ylma[15] * tz + grly[15][2] * distance)
-                  - tmp0 * grly[24][2];
-
-    if (nwl == 5)
-        return;
-    /*
-    // if nwl > 5
-    for (int il = 6; il <= nwl; il++)
-    {
-        int istart = il * il;
-        int istart1 = (il - 1) * (il - 1);
-        int istart2 = (il - 2) * (il - 2);
-
-        double fac2 = sqrt(4.0 * istart - 1.0);
-        double fac4 = sqrt(4.0 * istart1 - 1.0);
-
-        for (int im = 0; im < 2 * il - 1; im++)
-        {
-            int imm = (im + 1) / 2;
-            ylma[istart + im] = fac2 / sqrt((double)istart - imm * imm) * (dr[2]
-    * ylma[istart1 + im] - sqrt((double)istart1 - imm * imm) / fac4 *
-    ylma[istart2 + im]);
-        }
-
-        double bl1 = sqrt(2.0 * il / (2.0 * il + 1.0));
-        double bl2 = sqrt((2.0 * il - 2.0) / (2.0 * il - 1.0));
-        double bl3 = sqrt(2.0) / fac2;
-
-        ylma[istart + 2 * il - 1] = (bl3 * ylma[istart + 2 * il - 5] - bl2 *
-    ylma[istart2 + 2 * il - 5] - 2.0 * dr[0] * ylma[istart1 + 2 * il - 3]) /
-    bl1; ylma[istart + 2 * il] = (bl3 * ylma[istart + 2 * il - 4] - bl2 *
-    ylma[istart2 + 2 * il - 4] - 2.0 * dr[0] * ylma[istart1 + 2 * il - 2]) /
-    bl1;
-    }*/
-}
-
-} // namespace GintKernel
-
-#endif
\ No newline at end of file
diff --git a/source/source_lcao/module_gint/kernels/cuda/vbatch_matrix_mul.cuh b/source/source_lcao/module_gint/kernels/cuda/vbatch_matrix_mul.cuh
deleted file mode 100644
index 77cbec17f6..0000000000
--- a/source/source_lcao/module_gint/kernels/cuda/vbatch_matrix_mul.cuh
+++ /dev/null
@@ -1,545 +0,0 @@
-#ifndef VBATCH_MATRIX_MUL_CUH
-#define VBATCH_MATRIX_MUL_CUH
-#include "cuda_tools.cuh"
-#include "source_pw/module_pwdft/global.h"
-#include "source_base/module_device/device.h"
-#include "source_cell/unitcell.h"
-
-#include <assert.h> // for assert
-#include <cublas_v2.h>
-#include <cuda.h> // for CUDA_VERSION
-#include <cuda_runtime.h>
-#include <functional>
-#include <stdio.h> // for fprintf and stderr
-
-#define sA(i, j) sA[(j)*slda + (i)]
-#define sB(i, j) sB[(j)*sldb + (i)]
-#define fetch(A, m, n, bound) offs_d##A[min(n * LD##A + m, bound)]
-
-template <typename T,
-          int DIM_X,
-          int DIM_Y,
-          int BLK_M,
-          int BLK_N,
-          int BLK_K,
-          int DIM_XA,
-          int DIM_YA,
-          int DIM_XB,
-          int DIM_YB,
-          int THR_M,
-          int THR_N>
-static __device__ void vbatched_gemm_device(int M,
-                                            int N,
-                                            int K,
-                                            T* __restrict__ A,
-                                            int LDA,
-                                            T* __restrict__ B,
-                                            int LDB,
-                                            T* __restrict__ C,
-                                            int LDC,
-                                            T* sA,
-                                            int slda,
-                                            T* sB,
-                                            int sldb,
-                                            T alpha)
-{
-    int idx = threadIdx.x; // thread's m dimension
-    int idy = threadIdx.y; // thread's n dimension
-
-    int idt = DIM_X * idy + idx; // thread's global number
-
-    int idxA = idt % DIM_XA; // idx within A
-    int idyA = idt / DIM_XA; // idy within A
-
-    int idxB = idt % DIM_XB; // idx within B
-    int idyB = idt / DIM_XB; // idy within B
-
-    int blx = blockIdx.x; // block's m dimension
-    int bly = blockIdx.y; // block's n dimension
-
-    // Registers for the innermost loop
-    T rC[THR_N][THR_M];
-    T rA[THR_M];
-    T rB[THR_N];
-
-    // Registers for the dev->shmem copy
-    T ra[BLK_M / DIM_YA][BLK_K / DIM_XA];
-    T rb[BLK_N / DIM_YB][BLK_K / DIM_XB];
-
-    // bound is the correction to offs_d in order to not get out of memory bound
-    // so bound could be negative value since offs_d could be out of bound
-    T* offs_dA = A + blx * BLK_M * LDA + idyA * LDA + idxA;
-    int boundA = (LDA * (M - 1) + K) - (blx * BLK_M * LDA + idyA * LDA + idxA) - 1;
-
-    T* offs_dB = B + bly * BLK_N * LDB + idyB * LDB + idxB;
-    int boundB = (LDB * (N - 1) + K) - (bly * BLK_N * LDB + idyB * LDB + idxB) - 1;
-
-    int m, n, k, kk;
-
-// Zero C
-#pragma unroll
-    for (n = 0; n < THR_N; n++)
-    {
-#pragma unroll
-        for (m = 0; m < THR_M; m++)
-        {
-            rC[n][m] = 0.0;
-        }
-    }
-
-// Load A dev->shmem
-#pragma unroll
-    for (n = 0; n < BLK_M; n += DIM_YA)
-    {
-#pragma unroll
-        for (m = 0; m < BLK_K; m += DIM_XA)
-        {
-            sA(n + idyA, m + idxA) = fetch(A, m, n, boundA);
-        }
-    }
-
-#pragma unroll
-    for (n = 0; n < BLK_N; n += DIM_YB)
-    {
-#pragma unroll
-        for (m = 0; m < BLK_K; m += DIM_XB)
-        {
-            sB(m + idxB, n + idyB) = fetch(B, m, n, boundB);
-        }
-    }
-
-    __syncthreads();
-
-    for (kk = 0; kk < K - BLK_K; kk += BLK_K)
-    {
-        offs_dA += BLK_K;
-        boundA -= BLK_K;
-
-        offs_dB += BLK_K;
-        boundB -= BLK_K;
-
-// Load A dev->regs
-#pragma unroll
-        for (n = 0; n < BLK_M / DIM_YA; n++)
-        {
-#pragma unroll
-            for (m = 0; m < BLK_K / DIM_XA; m++)
-            {
-                ra[n][m] = fetch(A, m * DIM_XA, n * DIM_YA, boundA);
-            }
-        }
-
-// Load B dev->regs
-#pragma unroll
-        for (n = 0; n < BLK_N / DIM_YB; n++)
-        {
-#pragma unroll
-            for (m = 0; m < BLK_K / DIM_XB; m++)
-            {
-                rb[n][m] = fetch(B, m * DIM_XB, n * DIM_YB, boundB);
-            }
-        }
-
-// Multiply
-#pragma unroll
-        for (k = 0; k < BLK_K; k++)
-        {
-// Load A shmem->regs
-#pragma unroll
-            for (m = 0; m < THR_M; m++)
-            {
-                rA[m] = sA(m * DIM_X + idx, k);
-            }
-
-// Load B shmem->regs
-#pragma unroll
-            for (n = 0; n < THR_N; n++)
-            {
-                rB[n] = sB(k, n * DIM_Y + idy);
-            }
-
-// Compute
-#pragma unroll
-            for (n = 0; n < THR_N; n++)
-            {
-#pragma unroll
-                for (m = 0; m < THR_M; m++)
-                {
-                    rC[n][m] += rA[m] * rB[n];
-                }
-            }
-        }
-
-        __syncthreads();
-
-// Load A regs->shmem
-#pragma unroll
-        for (n = 0; n < BLK_M / DIM_YA; n++)
-        {
-#pragma unroll
-            for (m = 0; m < BLK_K / DIM_XA; m++)
-            {
-                sA(n * DIM_YA + idyA, m * DIM_XA + idxA) = ra[n][m];
-            }
-        }
-
-// Load B regs->shmem
-#pragma unroll
-        for (n = 0; n < BLK_N / DIM_YB; n++)
-        {
-#pragma unroll
-            for (m = 0; m < BLK_K / DIM_XB; m++)
-            {
-                sB(m * DIM_XB + idxB, n * DIM_YB + idyB) = rb[n][m];
-            }
-        }
-        __syncthreads();
-    }
-
-    // Multiply last full (BLK_K) or partial block of
-    // columns of op(A) and rows of op(B).
-    // It's okay that m,n exceed matrix bounds as all work is in registers
-    // or shared memory, and out-of-bounds rC[n][m] will not be saved later.
-    kk = K - kk;
-#pragma unroll
-    for (k = 0; k < kk; k++)
-    {
-// Load A shmem->regs
-#pragma unroll
-        for (m = 0; m < THR_M; m++)
-        {
-            rA[m] = sA(m * DIM_X + idx, k);
-        }
-
-// Load B shmem->regs
-#pragma unroll
-        for (n = 0; n < THR_N; n++)
-        {
-            rB[n] = sB(k, n * DIM_Y + idy);
-        }
-
-// Compute
-#pragma unroll
-        for (n = 0; n < THR_N; n++)
-        {
-#pragma unroll
-            for (m = 0; m < THR_M; m++)
-            {
-                rC[n][m] += rA[m] * rB[n];
-            }
-        }
-    }
-
-// Store C regs->dev
-#pragma unroll
-    for (n = 0; n < THR_N; n++)
-    {
-        int coord_dCn = bly * BLK_N + n * DIM_Y + idy;
-#pragma unroll
-        for (m = 0; m < THR_M; m++)
-        {
-            int coord_dCm = blx * BLK_M + m * DIM_X + idx;
-            if (coord_dCm < M && coord_dCn < N)
-            {
-                int offsC = coord_dCn * LDC + coord_dCm;
-
-                atomicAdd(C + offsC, rC[n][m] * alpha);
-            }
-        }
-    }
-}
-
-/******************************************************************************/
-template <typename T,
-          int DIM_X,
-          int DIM_Y,
-          int BLK_M,
-          int BLK_N,
-          int BLK_K,
-          int DIM_XA,
-          int DIM_YA,
-          int DIM_XB,
-          int DIM_YB>
-static __global__ void vbatched_gemm_kernel(int* M,
-                                            int* N,
-                                            int* K,
-                                            T** global_A_array,
-                                            int* global_lda,
-                                            T** global_B_array,
-                                            int* global_ldb,
-                                            T** global_C_array,
-                                            int* global_ldc,
-                                            T* alpha)
-{
-    extern __shared__ __align__(sizeof(T)) unsigned char smem[];
-    T* shared_mem = reinterpret_cast<T*>(smem);
-
-    int batchid = blockIdx.z;
-    int local_M = (int)M[batchid];
-    int local_N = (int)N[batchid];
-    int local_K = (int)K[batchid];
-
-    if (blockIdx.x >= (local_M + BLK_M - 1) / BLK_M)
-        return;
-    if (blockIdx.y >= (local_N + BLK_N - 1) / BLK_N)
-        return;
-
-    int shared_lda = BLK_M + 1;
-    int shared_ldb = BLK_K + 1;
-    T* shared_A = (T*)shared_mem;
-    T* shared_B = shared_A + shared_lda * BLK_K;
-    double alpha_tmp = 1.0;
-    if (alpha != nullptr)
-    {
-        alpha_tmp = alpha[batchid];
-    }
-    vbatched_gemm_device<T,
-                         DIM_X,
-                         DIM_Y,
-                         BLK_M,
-                         BLK_N,
-                         BLK_K,
-                         DIM_XA,
-                         DIM_YA,
-                         DIM_XB,
-                         DIM_YB,
-                         (BLK_M / DIM_X),
-                         (BLK_N / DIM_Y)>(local_M,
-                                          local_N,
-                                          local_K,
-                                          global_A_array[batchid],
-                                          (int)global_lda[batchid],
-                                          global_B_array[batchid],
-                                          (int)global_ldb[batchid],
-                                          global_C_array[batchid],
-                                          (int)global_ldc[batchid],
-                                          shared_A,
-                                          shared_lda,
-                                          shared_B,
-                                          shared_ldb,
-                                          alpha_tmp);
-}
-
-/**
- * Performs a batched matrix multiplication using the vbatched_gemm_impl
- * function.
- *
- * C = alpha * A * B + C
- * @tparam T The data type of the matrices.
- * @tparam DIM_X The number of threads in the x-dimension of each block.
- * @tparam DIM_Y The number of threads in the y-dimension of each block.
- * @tparam BLK_M The number of rows processed by each thread block.
- * @tparam BLK_N The number of columns processed by each thread block.
- * @tparam BLK_K The number of elements processed by each thread block along the
- * K dimension.
- * @tparam DIM_XA The number of threads in the x-dimension used for loading
- * matrix A.
- * @tparam DIM_YA The number of threads in the y-dimension used for loading
- * matrix A.
- * @tparam DIM_XB The number of threads in the x-dimension used for loading
- * matrix B.
- * @tparam DIM_YB The number of threads in the y-dimension used for loading
- * matrix B.
- * @param max_m The maximum number of rows in the matrices.
- * @param max_n The maximum number of columns in the matrices.
- * @param m An array of batch sizes for the number of rows in each matrix.
- * @param n An array of batch sizes for the number of columns in each matrix.
- * @param k An array of batch sizes for the number of elements in each matrix
- * along the K dimension.
- * @param global_A_array An array of pointers to the input matrices A.
- * @param global_lda An array of leading dimensions for the input matrices A.
- * @param global_B_array An array of pointers to the input matrices B.
- * @param global_ldb An array of leading dimensions for the input matrices B.
- * @param global_C_array An array of pointers to the output matrices C.
- * @param global_ldc An array of leading dimensions for the output matrices C.
- * @param batchCount The number of matrices in the batch.
- * @param stream The CUDA stream to use for the computation.
- * @param alpha The scalar value to multiply the matrices by (optional, default
- * is nullptr). generate by copilot
- */
-
-/*
- * Why do we need to implement our own matrix multiplication based on the magma
- * code? There are two main reasons. First is when we are doing batch matrix
- * multiplication, since we need to accumulate the results of the
- * multiplications, it is necessary to pass the same memory address of matrix C
- * to different multiplications. This way, the accumulation can be done directly
- * through atomic operations during the matrix multiplication, avoiding the
- * reduction operations after the multiplication. Secondly, when calculating the
- * charge density, where C = alpha * A * B + C, the value of alpha might be
- * different for the same batch of matrices. Using the standard matrix
- * multiplication interface would require breaking down the batch matrix
- * multiplication into smaller batches. In practice, it is difficult to
- * accumulate a batch.
- *
- * Moreover, taking into account the specific requirements of our application,
- * especially the fact that we can relatively easily control the arrangement of
- * the matrix elements, we have only implemented one type of requirement for
- * matrix transposition. That is, we have implemented the operation C = alpha *
- * trans(A) * B + C under the constraint of column-major order.
- *
- * Finally, we would like to thank Magma for its contributions to the field of
- * scientific computing.
- */
-
-template <typename T,
-          int DIM_X,
-          int DIM_Y,
-          int BLK_M,
-          int BLK_N,
-          int BLK_K,
-          int DIM_XA,
-          int DIM_YA,
-          int DIM_XB,
-          int DIM_YB>
-void vbatched_gemm_impl(int max_m,
-                        int max_n,
-                        int* m,
-                        int* n,
-                        int* k,
-                        T** global_A_array,
-                        int* global_lda,
-                        T** global_B_array,
-                        int* global_ldb,
-                        T** global_C_array,
-                        int* global_ldc,
-                        int batchCount,
-                        cudaStream_t stream,
-                        T* alpha = nullptr)
-{
-    // The positions of A and B have been swapped here.
-    // This is because the original code is for column-major matrices.
-    // We use row-major matrices, so we need to swap A and B.
-    // The vbatched_gemm_impl is for C = trans(A) * B + C, but we need trans(C).
-    // Which means: trans(C) = trans(trans(A)*B + C) = trans(B) * A + trans(C)
-    // Then, ldc should be N, lda and ldb should be K
-
-    size_t shared_mem_size = 0;
-    shared_mem_size += (BLK_M + 1) * BLK_K * sizeof(T);
-    shared_mem_size += (BLK_K + 1) * BLK_N * sizeof(T);
-    dim3 dimBlock(DIM_X, DIM_Y);
-    const int max_batch_count = 32768;
-    const int loop_num = batchCount / max_batch_count;
-    const int remain_num = batchCount % max_batch_count;
-
-    for (int i = 0; i < loop_num; ++i)
-    {
-        dim3 dimGrid(ceildiv(max_n, BLK_M), ceildiv(max_m, BLK_N), max_batch_count);
-        T* alpha_tmp = nullptr;
-        if (alpha != nullptr)
-        {
-            alpha_tmp = alpha + i * max_batch_count;
-        }
-
-        vbatched_gemm_kernel<T, DIM_X, DIM_Y, BLK_M, BLK_N, BLK_K, DIM_XA, DIM_YA, DIM_XB, DIM_YB>
-            <<<dimGrid, dimBlock, shared_mem_size, stream>>>(n + i * max_batch_count,
-                                                             m + i * max_batch_count,
-                                                             k + i * max_batch_count,
-                                                             global_B_array + i * max_batch_count,
-                                                             global_ldb + i * max_batch_count,
-                                                             global_A_array + i * max_batch_count,
-                                                             global_lda + i * max_batch_count,
-                                                             global_C_array + i * max_batch_count,
-                                                             global_ldc + i * max_batch_count,
-                                                             alpha_tmp);
-        checkCudaLastError();
-    }
-    if (remain_num > 0)
-    {
-        dim3 dimGrid(ceildiv(max_n, BLK_M), ceildiv(max_m, BLK_N), remain_num);
-        T* alpha_tmp = nullptr;
-        if (alpha != nullptr)
-        {
-            alpha_tmp = alpha + loop_num * max_batch_count;
-        }
-        vbatched_gemm_kernel<T, DIM_X, DIM_Y, BLK_M, BLK_N, BLK_K, DIM_XA, DIM_YA, DIM_XB, DIM_YB>
-            <<<dimGrid, dimBlock, shared_mem_size, stream>>>(n + loop_num * max_batch_count,
-                                                             m + loop_num * max_batch_count,
-                                                             k + loop_num * max_batch_count,
-                                                             global_B_array + loop_num * max_batch_count,
-                                                             global_ldb + loop_num * max_batch_count,
-                                                             global_A_array + loop_num * max_batch_count,
-                                                             global_lda + loop_num * max_batch_count,
-                                                             global_C_array + loop_num * max_batch_count,
-                                                             global_ldc + loop_num * max_batch_count,
-                                                             alpha_tmp);
-        checkCudaLastError();
-    }
-}
-
-template <typename T,
-          int DIM_X,
-          int DIM_Y,
-          int BLK_M,
-          int BLK_N,
-          int BLK_K,
-          int DIM_XA,
-          int DIM_YA,
-          int DIM_XB,
-          int DIM_YB>
-void gemm_time_measure(int max_m,
-                       int max_n,
-                       int* m,
-                       int* n,
-                       int* k,
-                       T** global_A_array,
-                       int* global_lda,
-                       T** global_B_array,
-                       int* global_ldb,
-                       T** global_C_array,
-                       int* global_ldc,
-                       int batchCount,
-                       cudaStream_t stream,
-                       float& fast_time,
-                       matrix_multiple_func_type& fastest_algo,
-                       double* cpu_result,
-                       double* h_global_C,
-                       double* d_global_C)
-{
-    cudaEvent_t start, stop;
-    checkCuda(cudaMemset(d_global_C, 0, batchCount * max_m * max_n * sizeof(double)));
-    checkCuda(cudaEventCreate(&start));
-    checkCuda(cudaEventCreate(&stop));
-    checkCuda(cudaEventRecord(start, stream));
-    vbatched_gemm_impl<T, DIM_X, DIM_Y, BLK_M, BLK_N, BLK_K, DIM_XA, DIM_YA, DIM_XB, DIM_YB>(max_m,
-                                                                                             max_n,
-                                                                                             m,
-                                                                                             n,
-                                                                                             k,
-                                                                                             global_A_array,
-                                                                                             global_lda,
-                                                                                             global_B_array,
-                                                                                             global_ldb,
-                                                                                             global_C_array,
-                                                                                             global_ldc,
-                                                                                             batchCount,
-                                                                                             stream);
-    checkCuda(cudaEventRecord(stop, stream));
-    cudaError_t cuda_status = cudaGetLastError();
-    checkCuda(cudaStreamSynchronize(stream));
-    float milliseconds = 0;
-    checkCuda(cudaEventElapsedTime(&milliseconds, start, stop));
-
-    // WARNING !!!!! Here we assume that all m and n are the same
-    checkCuda(cudaMemcpy(h_global_C, d_global_C, batchCount * max_m * max_n * sizeof(double), cudaMemcpyDeviceToHost));
-    bool check_result = true;
-    for (int i = 0; i < batchCount * max_m * max_n; ++i)
-    {
-        if (abs(cpu_result[i] - h_global_C[i]) > 0.001)
-        {
-            check_result = false;
-            break;
-        }
-    }
-    if (milliseconds < fast_time && cuda_status == cudaSuccess && check_result)
-    {
-        fast_time = milliseconds;
-        fastest_algo = vbatched_gemm_impl<T, DIM_X, DIM_Y, BLK_M, BLK_N, BLK_K, DIM_XA, DIM_YA, DIM_XB, DIM_YB>;
-#ifdef __DEBUG
-        std::cout << "found! fastest time: " << fast_time << std::endl;
-        std::cout << DIM_X << "," << DIM_Y << "," << BLK_M << "," << BLK_N << "," << BLK_K << "," << DIM_XA << ","
-                  << DIM_YA << "," << DIM_XB << "," << DIM_YB << std::endl;
-#endif
-    }
-}
-#endif // VBATCH_MATRIX_MUL_CUH
\ No newline at end of file
diff --git a/source/source_lcao/module_gint/temp_gint/localcell_info.cpp b/source/source_lcao/module_gint/localcell_info.cpp
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/localcell_info.cpp
rename to source/source_lcao/module_gint/localcell_info.cpp
diff --git a/source/source_lcao/module_gint/temp_gint/localcell_info.h b/source/source_lcao/module_gint/localcell_info.h
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/localcell_info.h
rename to source/source_lcao/module_gint/localcell_info.h
diff --git a/source/source_lcao/module_gint/temp_gint/meshgrid_info.h b/source/source_lcao/module_gint/meshgrid_info.h
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/meshgrid_info.h
rename to source/source_lcao/module_gint/meshgrid_info.h
diff --git a/source/source_lcao/module_gint/mult_psi_dmr.cpp b/source/source_lcao/module_gint/mult_psi_dmr.cpp
deleted file mode 100644
index fab47c1aee..0000000000
--- a/source/source_lcao/module_gint/mult_psi_dmr.cpp
+++ /dev/null
@@ -1,105 +0,0 @@
-#include "gint_tools.h"
-#include "source_base/timer.h"
-#include "source_base/ylm.h"
-#include "source_base/module_external/blas_connector.h"
-
-namespace Gint_Tools{
-
-void mult_psi_DMR(
-    const Grid_Technique& gt,
-    const int bxyz,
-    const int LD_pool,
-    const int &grid_index,
-    const int &na_grid,
-    const int*const block_index,
-    const int*const block_size,
-    const bool*const*const cal_flag,
-    const double*const*const psi,
-    double*const*const psi_DMR,
-    const hamilt::HContainer<double>*const DM,
-    const bool if_symm)
-{
-    const UnitCell& ucell = *gt.ucell;
-
-    // parameters for lapack subroutines
-    constexpr char side = 'L';
-    constexpr char uplo = 'U';
-    const char trans = 'N';
-    const double alpha = 1.0;
-    const double beta = 1.0;
-    const double alpha1 = if_symm ? 2.0 : 1.0;
-
-    for (int ia1 = 0; ia1 < na_grid; ia1++)
-    {
-        const int bcell1 = gt.bcell_start[grid_index] + ia1;
-        const int iat1 = gt.which_atom[bcell1];
-
-        //! get cell R1, this step is redundant in gamma_only case.
-        const int id1 = gt.which_unitcell[bcell1];
-        const ModuleBase::Vector3<int> r1 = gt.get_ucell_coords(id1);
-
-        //! density
-        if (if_symm)
-        {
-            //! ia2==ia1
-            const auto tmp_matrix = DM->find_matrix(iat1, iat1, 0, 0, 0);
-            
-            //! maybe checking "tmp_matrix == nullptr" is not necessary
-            if(tmp_matrix == nullptr)
-            {
-                continue;
-            }
-            
-            const auto cal_info = Gint_Tools::cal_info(bxyz, ia1, ia1, cal_flag);
-            const int ib_start = cal_info.first;
-            const int ib_len = cal_info.second;
-            
-            if(ib_len == 0)
-            {
-                continue;
-            }
-            
-            const auto tmp_matrix_ptr = tmp_matrix->get_pointer();
-            const int idx1 = block_index[ia1];
-            BlasConnector::symm_cm(side, uplo, block_size[ia1], ib_len, alpha, tmp_matrix_ptr, block_size[ia1],
-                    &psi[ib_start][idx1], LD_pool, beta, &psi_DMR[ib_start][idx1], LD_pool);
-        }
-
-        //! get (j,beta,R2)
-        const int start = if_symm ? ia1 + 1 : 0;
-
-        for (int ia2 = start; ia2 < na_grid; ia2++)
-        {
-            const int bcell2 = gt.bcell_start[grid_index] + ia2;
-            const int iat2 = gt.which_atom[bcell2];
-            const int id2 = gt.which_unitcell[bcell2];
-
-            //! get cell R2, this step is redundant in gamma_only case.
-            const ModuleBase::Vector3<int> r2 = gt.get_ucell_coords(id2);
-
-            // get AtomPair
-            const auto tmp_matrix = DM->find_matrix(iat1, iat2, r1-r2);
-            if (tmp_matrix == nullptr)
-            {
-                continue;
-            }
-            const auto tmp_matrix_ptr = tmp_matrix->get_pointer();
-            
-            const auto cal_info = Gint_Tools::cal_info(bxyz, ia1, ia1, cal_flag);
-            const int ib_start = cal_info.first;
-            const int ib_len = cal_info.second;
-            if(ib_len == 0)
-            {
-                continue;
-            }
-            const int idx1 = block_index[ia1];
-            const int idx2 = block_index[ia2];
-            
-            dgemm_(&trans, &trans, &block_size[ia2], &ib_len, &block_size[ia1], &alpha1, tmp_matrix_ptr, &block_size[ia2],
-                    &psi[ib_start][idx1], &LD_pool, &beta, &psi_DMR[ib_start][idx2], &LD_pool);
-
-        }  // ia2
-    } // ia1
-}// End of mult_psi_DMR
-
-}// End of Gint_Tools
diff --git a/source/source_lcao/module_gint/temp_gint/phi_operator.cpp b/source/source_lcao/module_gint/phi_operator.cpp
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/phi_operator.cpp
rename to source/source_lcao/module_gint/phi_operator.cpp
diff --git a/source/source_lcao/module_gint/temp_gint/phi_operator.h b/source/source_lcao/module_gint/phi_operator.h
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/phi_operator.h
rename to source/source_lcao/module_gint/phi_operator.h
diff --git a/source/source_lcao/module_gint/temp_gint/phi_operator.hpp b/source/source_lcao/module_gint/phi_operator.hpp
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/phi_operator.hpp
rename to source/source_lcao/module_gint/phi_operator.hpp
diff --git a/source/source_lcao/module_gint/temp_gint/set_ddphi.cpp b/source/source_lcao/module_gint/set_ddphi.cpp
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/set_ddphi.cpp
rename to source/source_lcao/module_gint/set_ddphi.cpp
diff --git a/source/source_lcao/module_gint/temp_gint/gint.h b/source/source_lcao/module_gint/temp_gint/gint.h
deleted file mode 100644
index 1255bae971..0000000000
--- a/source/source_lcao/module_gint/temp_gint/gint.h
+++ /dev/null
@@ -1,26 +0,0 @@
-#pragma once
-#include <memory>
-#include "gint_info.h"
-#include "gint_type.h"
-
-namespace ModuleGint
-{
-
-class Gint
-{
-    public:
-    Gint() = default;
-    virtual ~Gint() = default;
-
-    // note that gint_info_ is a static member variable
-    // it is shared by all instances of Gint
-    static void set_gint_info(GintInfo* gint_info)
-    {
-        gint_info_ = gint_info;
-    }
-
-    protected:
-    static GintInfo* gint_info_;
-};
-
-}
\ No newline at end of file
diff --git a/source/source_lcao/module_gint/temp_gint/gint_rho_gpu.h b/source/source_lcao/module_gint/temp_gint/gint_rho_gpu.h
deleted file mode 100644
index 07bbf0eaed..0000000000
--- a/source/source_lcao/module_gint/temp_gint/gint_rho_gpu.h
+++ /dev/null
@@ -1,52 +0,0 @@
-#pragma once
-
-#include <memory>
-#include <vector>
-#include "source_lcao/module_hcontainer/hcontainer.h"
-#include "gint.h"
-#include "gint_info.h"
-#include "source_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h"
-
-namespace ModuleGint
-{
-
-class Gint_rho_gpu: public Gint
-{
-    public:
-    Gint_rho_gpu(
-        const std::vector<HContainer<double>*>& dm_vec,
-        const int nspin,
-        double **rho,
-        bool is_dm_symm = true)
-        : dm_vec_(dm_vec), nspin_(nspin), rho_(rho), is_dm_symm_(is_dm_symm) {}
-    
-    void cal_gint();
-
-    private:
-    void init_dm_gint_();
-
-    void cal_rho_();
-
-    void transfer_cpu_to_gpu_();
-
-    void transfer_gpu_to_cpu_();
-
-    // input
-    const std::vector<HContainer<double>*> dm_vec_;
-    const int nspin_;
-
-    // if true, it means the DMR matrix is symmetric,
-    // which leads to faster computations compared to the asymmetric case.
-    const bool is_dm_symm_;
-
-    // output
-    double **rho_;
-
-    // Intermediate variables
-    std::vector<HContainer<double>> dm_gint_vec_;
-
-    std::vector<CudaMemWrapper<double>> dm_gint_d_vec_;
-    std::vector<CudaMemWrapper<double>> rho_d_vec_;
-};
-
-}
\ No newline at end of file
diff --git a/source/source_lcao/module_gint/temp_gint/gint_vl_gpu.h b/source/source_lcao/module_gint/temp_gint/gint_vl_gpu.h
deleted file mode 100644
index de113b2ea0..0000000000
--- a/source/source_lcao/module_gint/temp_gint/gint_vl_gpu.h
+++ /dev/null
@@ -1,49 +0,0 @@
-#pragma once
-
-#include <memory>
-#include <vector>
-#include "source_lcao/module_hcontainer/hcontainer.h"
-#include "gint.h"
-#include "gint_info.h"
-#include "source_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h"
-
-namespace ModuleGint
-{
-
-class Gint_vl_gpu : public Gint
-{
-    public:
-    Gint_vl_gpu(
-        const double* vr_eff,
-        HContainer<double>* hR)
-        : vr_eff_(vr_eff), hR_(hR), dr3_(gint_info_->get_mgrid_volume()) {}
-    
-    void cal_gint();
-
-    private:
-
-    void init_hr_gint_();
-
-    void transfer_cpu_to_gpu_();
-
-    void transfer_gpu_to_cpu_();
-
-    void cal_hr_gint_();
-
-    // input
-    const double* vr_eff_;
-
-        
-    // output
-    HContainer<double>* hR_;
-
-    // Intermediate variables
-    double dr3_;
-
-    HContainer<double> hr_gint_;
-    
-    CudaMemWrapper<double> hr_gint_d_;
-    CudaMemWrapper<double> vr_eff_d_;
-};
-
-}
\ No newline at end of file
diff --git a/source/source_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.h b/source/source_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.h
index 07139c82db..5f711aa6a0 100644
--- a/source/source_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.h
+++ b/source/source_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.h
@@ -5,9 +5,8 @@
 #include "source_base/ylm.h"
 #include "source_cell/unitcell.h"
 #include "source_cell/atom_spec.h"
-#include "source_lcao/module_gint/temp_gint/biggrid_info.h"
+#include "source_lcao/module_gint/biggrid_info.h"
 #include "gint_helper.cuh"
-#include "source_lcao/module_gint/kernels/cuda/gemm_selector.cuh"
 
 namespace ModuleGint
 {
@@ -39,7 +38,6 @@ class GintGpuVars
 
     // the index of gpu device
     int dev_id_ = 0;
-    matrix_multiple_func_type fastest_matrix_mul;
 
 };
 
diff --git a/source/source_lcao/module_gint/test/CMakeLists.txt b/source/source_lcao/module_gint/test/CMakeLists.txt
deleted file mode 100644
index 2030b04a12..0000000000
--- a/source/source_lcao/module_gint/test/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-if(ENABLE_LCAO AND USE_CUDA)
-  AddTest(
-  TARGET gint_gpu_test
-  LIBS parameter ${math_libs} psi base device
-  SOURCES test_sph.cu test_sph.cpp
-)
-endif()
\ No newline at end of file
diff --git a/source/source_lcao/module_gint/test/test_sph.cpp b/source/source_lcao/module_gint/test/test_sph.cpp
deleted file mode 100644
index e13a4d5675..0000000000
--- a/source/source_lcao/module_gint/test/test_sph.cpp
+++ /dev/null
@@ -1,597 +0,0 @@
-#include "test_sph.h"
-using namespace std;
-
-void sph_harm(const int& Lmax, // max momentum of l
-              const double& xdr,
-              const double& ydr,
-              const double& zdr,
-              std::vector<double>& rly,
-              double* ylmcoef)
-{
-
-    // begin calculation
-    /***************************
-             L = 0
-    ***************************/
-    rly[0] = ylmcoef[0]; // l=0, m=0
-    if (Lmax == 0)
-        return;
-
-    /***************************
-             L = 1
-    ***************************/
-    rly[1] = ylmcoef[1] * zdr;  // l=1, m=0
-    rly[2] = -ylmcoef[1] * xdr; // l=1, m=1
-    rly[3] = -ylmcoef[1] * ydr; // l=1, m=-1
-    if (Lmax == 1)
-        return;
-
-    /***************************
-             L = 2
-    ***************************/
-    double tmp0 = ylmcoef[3] * rly[0];
-    rly[4] = ylmcoef[2] * zdr * rly[1] - tmp0; // l=2, m=0
-
-    tmp0 = ylmcoef[4] * zdr;
-    rly[5] = tmp0 * rly[2]; // l=2,m=1
-    rly[6] = tmp0 * rly[3]; // l=2,m=-1
-
-    double tmp2 = ylmcoef[4] * xdr;
-    rly[7]
-        = ylmcoef[5] * rly[4] - ylmcoef[6] * rly[0] - tmp2 * rly[2]; // l=2,m=2
-    rly[8] = -tmp2 * rly[3];
-    //	rly[8] = tmp1+tmp2*rly[3];//l=2,m=-2
-    if (Lmax == 2)
-        return;
-
-    /***************************
-             L = 3
-    ***************************/
-    tmp0 = ylmcoef[8] * rly[1];
-    rly[9] = ylmcoef[7] * zdr * rly[4] - tmp0; // l=3, m=0
-
-    double tmp3 = ylmcoef[9] * zdr;
-    rly[10] = tmp3 * rly[5] - ylmcoef[10] * rly[2]; // l=3,m=1
-    rly[11] = tmp3 * rly[6] - ylmcoef[10] * rly[3]; // l=3,m=-1
-
-    double tmp4 = ylmcoef[11] * zdr;
-    rly[12] = tmp4 * rly[7]; // l=3,m=2
-    rly[13] = tmp4 * rly[8]; // l=3,m=-2
-
-    double tmp5 = ylmcoef[14] * xdr;
-    rly[14] = ylmcoef[12] * rly[10] - ylmcoef[13] * rly[2]
-              - tmp5 * rly[7]; // l=3,m=3
-    rly[15] = ylmcoef[12] * rly[11] - ylmcoef[13] * rly[3]
-              - tmp5 * rly[8]; // l=3,m=-3
-    if (Lmax == 3)
-        return;
-
-    /***************************
-             L = 4
-    ***************************/
-    tmp0 = ylmcoef[16] * rly[4];
-    rly[16] = ylmcoef[15] * zdr * rly[9] - tmp0; // l=4,m=0
-
-    double tmp6 = ylmcoef[17] * zdr;
-    rly[17] = tmp6 * rly[10] - ylmcoef[18] * rly[5]; // l=4,m=1
-    rly[18] = tmp6 * rly[11] - ylmcoef[18] * rly[6]; // l=4,m=-1
-
-    double tmp7 = ylmcoef[19] * zdr;
-    rly[19] = tmp7 * rly[12] - ylmcoef[20] * rly[7]; // l=4,m=2
-    rly[20] = tmp7 * rly[13] - ylmcoef[20] * rly[8]; // l=4,m=-2
-
-    double tmp8 = 3.0 * zdr;
-    rly[21] = tmp8 * rly[14]; // l=4,m=3
-    rly[22] = tmp8 * rly[15]; // l=4,m=-3
-
-    double tmp9 = ylmcoef[23] * xdr;
-    rly[23] = ylmcoef[21] * rly[19] - ylmcoef[22] * rly[7]
-              - tmp9 * rly[14]; // l=4,m=4
-    rly[24] = ylmcoef[21] * rly[20] - ylmcoef[22] * rly[8]
-              - tmp9 * rly[15]; // l=4,m=-4
-    if (Lmax == 4)
-        return;
-
-    /***************************
-             L = 5
-    ***************************/
-    tmp0 = ylmcoef[25] * rly[9];
-    rly[25] = ylmcoef[24] * zdr * rly[16] - tmp0; // l=5,m=0
-
-    double tmp10 = ylmcoef[26] * zdr;
-    rly[26] = tmp10 * rly[17] - ylmcoef[27] * rly[10]; // l=5,m=1
-    rly[27] = tmp10 * rly[18] - ylmcoef[27] * rly[11]; // l=5,m=-1
-
-    double tmp11 = ylmcoef[28] * zdr;
-    rly[28] = tmp11 * rly[19] - ylmcoef[29] * rly[12]; // l=5,m=2
-    rly[29] = tmp11 * rly[20] - ylmcoef[29] * rly[13]; // l=5,m=-2
-
-    double tmp12 = ylmcoef[30] * zdr;
-    rly[30] = tmp12 * rly[21] - ylmcoef[31] * rly[14]; // l=5,m=3
-    rly[31] = tmp12 * rly[22] - ylmcoef[31] * rly[15]; // l=5,m=-3
-
-    double tmp13 = ylmcoef[32] * zdr;
-    rly[32] = tmp13 * rly[23]; // l=5,m=4
-    rly[33] = tmp13 * rly[24]; // l=5,m=-4
-
-    double tmp14 = ylmcoef[35] * xdr;
-    rly[34] = ylmcoef[33] * rly[30] - ylmcoef[34] * rly[14]
-              - tmp14 * rly[23]; // l=5,m=5
-    rly[35] = ylmcoef[33] * rly[31] - ylmcoef[34] * rly[15]
-              - tmp14 * rly[24]; // l=5,m=-5
-    if (Lmax == 5)
-        return;
-
-    // if Lmax > 5
-    for (int il = 6; il <= Lmax; il++)
-    {
-        int istart = il * il;
-        int istart1 = (il - 1) * (il - 1);
-        int istart2 = (il - 2) * (il - 2);
-
-        double fac2 = sqrt(4.0 * istart - 1.0);
-        double fac4 = sqrt(4.0 * istart1 - 1.0);
-
-        for (int im = 0; im < 2 * il - 1; im++)
-        {
-            int imm = (im + 1) / 2;
-            //			if (im % 2 == 0) imm *= -1;
-
-            rly[istart + im] = fac2 / sqrt((double)istart - imm * imm)
-                               * (zdr * rly[istart1 + im]
-                                  - sqrt((double)istart1 - imm * imm) / fac4
-                                        * rly[istart2 + im]);
-        }
-
-        double bl1 = sqrt(2.0 * il / (2.0 * il + 1.0));
-        double bl2 = sqrt((2.0 * il - 2.0) / (2.0 * il - 1.0));
-        double bl3 = sqrt(2.0) / fac2;
-
-        rly[istart + 2 * il - 1]
-            = (bl3 * rly[istart + 2 * il - 5] - bl2 * rly[istart2 + 2 * il - 5]
-               - 2.0 * xdr * rly[istart1 + 2 * il - 3])
-              / bl1;
-        rly[istart + 2 * il]
-            = (bl3 * rly[istart + 2 * il - 4] - bl2 * rly[istart2 + 2 * il - 4]
-               - 2.0 * xdr * rly[istart1 + 2 * il - 2])
-              / bl1;
-    }
-
-    return;
-}
-void grad_rl_sph_harm(const int& Lmax, // max momentum of L
-                      const double& x,
-                      const double& y,
-                      const double& z,
-                      double* rly,
-                      double** grly,
-                      const double* ylmcoef)
-{
-    double radius2 = x * x + y * y + z * z;
-    double tx = 2.0 * x;
-    double ty = 2.0 * y;
-    double tz = 2.0 * z;
-
-    // begin calculation
-    /***************************
-             L = 0
-    ***************************/
-    rly[0] = ylmcoef[0]; // l=0, m=0
-    grly[0][0] = grly[0][1] = grly[0][2] = 0.0;
-    if (Lmax == 0)
-        return;
-
-    /***************************
-             L = 1
-    ***************************/
-    rly[1] = ylmcoef[1] * z; // l=1, m=0
-    grly[1][0] = grly[1][1] = 0.0;
-    grly[1][2] = ylmcoef[1];
-
-    rly[2] = -ylmcoef[1] * x; // l=1, m=1
-    grly[2][1] = grly[2][2] = 0.0;
-    grly[2][0] = -ylmcoef[1];
-
-    rly[3] = -ylmcoef[1] * y; // l=1, m=-1
-    grly[3][0] = grly[3][2] = 0.0;
-    grly[3][1] = -ylmcoef[1];
-
-    if (Lmax == 1)
-        return;
-
-    /***************************
-             L = 2
-    ***************************/
-    rly[4]
-        = ylmcoef[2] * z * rly[1] - ylmcoef[3] * rly[0] * radius2; // l=2, m=0
-    grly[4][0]
-        = ylmcoef[2] * z * grly[1][0]
-          - ylmcoef[3] * (grly[0][0] * radius2 + rly[0] * tx); // l=2, m=0
-    grly[4][1]
-        = ylmcoef[2] * z * grly[1][1]
-          - ylmcoef[3] * (grly[0][1] * radius2 + rly[0] * ty); // l=2, m=0
-    grly[4][2]
-        = ylmcoef[2] * (z * grly[1][2] + rly[1])
-          - ylmcoef[3] * (grly[0][2] * radius2 + rly[0] * tz); // l=2, m=0
-
-    double tmp0 = ylmcoef[4] * z;
-    rly[5] = tmp0 * rly[2]; // l=2,m=1
-    grly[5][0] = tmp0 * grly[2][0];
-    grly[5][1] = tmp0 * grly[2][1];
-    grly[5][2] = ylmcoef[4] * (rly[2] + z * grly[2][2]);
-
-    rly[6] = tmp0 * rly[3]; // l=2,m=-1
-    grly[6][0] = tmp0 * grly[3][0];
-    grly[6][1] = tmp0 * grly[3][1];
-    grly[6][2] = ylmcoef[4] * (rly[3] + z * grly[3][2]);
-
-    double tmp2 = ylmcoef[4] * x;
-    rly[7] = ylmcoef[5] * rly[4] - ylmcoef[6] * rly[0] * radius2
-             - tmp2 * rly[2]; // l=2,m=2
-    grly[7][0] = ylmcoef[5] * grly[4][0]
-                 - ylmcoef[6] * (rly[0] * tx + grly[0][0] * radius2)
-                 - ylmcoef[4] * (x * grly[2][0] + rly[2]);
-
-    //	std::cout << "\np1 = "<< ylmcoef[5]*grly[4][0] << " p2 = " <<
-    //-ylmcoef[6]*rly[0]*tx
-    //						<< " p3 = " << -ylmcoef[4]*x*grly[2][0] << " p4 = "
-    //<< -ylmcoef[4]*rly[2] << std::endl;
-
-    grly[7][1] = ylmcoef[5] * grly[4][1]
-                 - ylmcoef[6] * (rly[0] * ty + grly[0][1] * radius2)
-                 - tmp2 * grly[2][1];
-    grly[7][2] = ylmcoef[5] * grly[4][2]
-                 - ylmcoef[6] * (rly[0] * tz + grly[0][2] * radius2)
-                 - tmp2 * grly[2][2];
-
-    rly[8] = -tmp2 * rly[3];
-    grly[8][0] = -ylmcoef[4] * (rly[3] + x * grly[3][0]);
-    grly[8][1] = -tmp2 * grly[3][1];
-    grly[8][2] = -tmp2 * grly[3][2];
-    //	rly[8] = tmp1+tmp2*rly[3];//l=2,m=-2
-    if (Lmax == 2)
-        return;
-
-    /***************************
-             L = 3
-    ***************************/
-    rly[9]
-        = ylmcoef[7] * z * rly[4] - ylmcoef[8] * rly[1] * radius2; // l=3, m=0
-    grly[9][0] = ylmcoef[7] * z * grly[4][0]
-                 - ylmcoef[8] * (rly[1] * tx + grly[1][0] * radius2);
-    grly[9][1] = ylmcoef[7] * z * grly[4][1]
-                 - ylmcoef[8] * (rly[1] * ty + grly[1][1] * radius2);
-    grly[9][2] = ylmcoef[7] * (rly[4] + z * grly[4][2])
-                 - ylmcoef[8] * (rly[1] * tz + grly[1][2] * radius2);
-
-    double tmp3 = ylmcoef[9] * z;
-    rly[10] = tmp3 * rly[5] - ylmcoef[10] * rly[2] * radius2; // l=3,m=1
-    grly[10][0] = tmp3 * grly[5][0]
-                  - ylmcoef[10] * (grly[2][0] * radius2 + rly[2] * tx);
-    grly[10][1] = tmp3 * grly[5][1]
-                  - ylmcoef[10] * (grly[2][1] * radius2 + rly[2] * ty);
-    grly[10][2] = ylmcoef[9] * (z * grly[5][2] + rly[5])
-                  - ylmcoef[10] * (grly[2][2] * radius2 + rly[2] * tz);
-
-    rly[11] = tmp3 * rly[6] - ylmcoef[10] * rly[3] * radius2; // l=3,m=-1
-    grly[11][0] = tmp3 * grly[6][0]
-                  - ylmcoef[10] * (grly[3][0] * radius2 + rly[3] * tx);
-    grly[11][1] = tmp3 * grly[6][1]
-                  - ylmcoef[10] * (grly[3][1] * radius2 + rly[3] * ty);
-    grly[11][2] = ylmcoef[9] * (z * grly[6][2] + rly[6])
-                  - ylmcoef[10] * (grly[3][2] * radius2 + rly[3] * tz);
-
-    double tmp4 = ylmcoef[11] * z;
-    rly[12] = tmp4 * rly[7]; // l=3,m=2
-    grly[12][0] = tmp4 * grly[7][0];
-    grly[12][1] = tmp4 * grly[7][1];
-    grly[12][2] = ylmcoef[11] * (z * grly[7][2] + rly[7]);
-
-    rly[13] = tmp4 * rly[8]; // l=3,m=-2
-    grly[13][0] = tmp4 * grly[8][0];
-    grly[13][1] = tmp4 * grly[8][1];
-    grly[13][2] = ylmcoef[11] * (z * grly[8][2] + rly[8]);
-
-    double tmp5 = ylmcoef[14] * x;
-    rly[14] = ylmcoef[12] * rly[10] - ylmcoef[13] * rly[2] * radius2
-              - tmp5 * rly[7]; // l=3,m=3
-    grly[14][0] = ylmcoef[12] * grly[10][0]
-                  - ylmcoef[13] * (rly[2] * tx + grly[2][0] * radius2)
-                  - ylmcoef[14] * (rly[7] + x * grly[7][0]);
-    grly[14][1] = ylmcoef[12] * grly[10][1]
-                  - ylmcoef[13] * (rly[2] * ty + grly[2][1] * radius2)
-                  - tmp5 * grly[7][1];
-    grly[14][2] = ylmcoef[12] * grly[10][2]
-                  - ylmcoef[13] * (rly[2] * tz + grly[2][2] * radius2)
-                  - tmp5 * grly[7][2];
-
-    rly[15] = ylmcoef[12] * rly[11] - ylmcoef[13] * rly[3] * radius2
-              - tmp5 * rly[8]; // l=3,m=-3
-    grly[15][0] = ylmcoef[12] * grly[11][0]
-                  - ylmcoef[13] * (rly[3] * tx + grly[3][0] * radius2)
-                  - ylmcoef[14] * (rly[8] + x * grly[8][0]);
-    grly[15][1] = ylmcoef[12] * grly[11][1]
-                  - ylmcoef[13] * (rly[3] * ty + grly[3][1] * radius2)
-                  - tmp5 * grly[8][1];
-    grly[15][2] = ylmcoef[12] * grly[11][2]
-                  - ylmcoef[13] * (rly[3] * tz + grly[3][2] * radius2)
-                  - tmp5 * grly[8][2];
-    if (Lmax == 3)
-        return;
-
-    /***************************
-             L = 4
-    ***************************/
-    rly[16]
-        = ylmcoef[15] * z * rly[9] - ylmcoef[16] * rly[4] * radius2; // l=4,m=0
-    grly[16][0] = ylmcoef[15] * z * grly[9][0]
-                  - ylmcoef[16] * (rly[4] * tx + grly[4][0] * radius2);
-    grly[16][1] = ylmcoef[15] * z * grly[9][1]
-                  - ylmcoef[16] * (rly[4] * ty + grly[4][1] * radius2);
-    grly[16][2] = ylmcoef[15] * (z * grly[9][2] + rly[9])
-                  - ylmcoef[16] * (rly[4] * tz + grly[4][2] * radius2);
-
-    double tmp6 = ylmcoef[17] * z;
-    rly[17] = tmp6 * rly[10] - ylmcoef[18] * rly[5] * radius2; // l=4,m=1
-    grly[17][0] = tmp6 * grly[10][0]
-                  - ylmcoef[18] * (rly[5] * tx + grly[5][0] * radius2);
-    grly[17][1] = tmp6 * grly[10][1]
-                  - ylmcoef[18] * (rly[5] * ty + grly[5][1] * radius2);
-    grly[17][2] = ylmcoef[17] * (z * grly[10][2] + rly[10])
-                  - ylmcoef[18] * (rly[5] * tz + grly[5][2] * radius2);
-
-    rly[18] = tmp6 * rly[11] - ylmcoef[18] * rly[6] * radius2; // l=4,m=-1
-    grly[18][0] = tmp6 * grly[11][0]
-                  - ylmcoef[18] * (rly[6] * tx + grly[6][0] * radius2);
-    grly[18][1] = tmp6 * grly[11][1]
-                  - ylmcoef[18] * (rly[6] * ty + grly[6][1] * radius2);
-    grly[18][2] = ylmcoef[17] * (z * grly[11][2] + rly[11])
-                  - ylmcoef[18] * (rly[6] * tz + grly[6][2] * radius2);
-
-    double tmp7 = ylmcoef[19] * z;
-    rly[19] = tmp7 * rly[12] - ylmcoef[20] * rly[7] * radius2; // l=4,m=2
-    grly[19][0] = tmp7 * grly[12][0]
-                  - ylmcoef[20] * (rly[7] * tx + grly[7][0] * radius2);
-    grly[19][1] = tmp7 * grly[12][1]
-                  - ylmcoef[20] * (rly[7] * ty + grly[7][1] * radius2);
-    grly[19][2] = ylmcoef[19] * (z * grly[12][2] + rly[12])
-                  - ylmcoef[20] * (rly[7] * tz + grly[7][2] * radius2);
-
-    rly[20] = tmp7 * rly[13] - ylmcoef[20] * rly[8] * radius2; // l=4,m=-2
-    grly[20][0] = tmp7 * grly[13][0]
-                  - ylmcoef[20] * (rly[8] * tx + grly[8][0] * radius2);
-    grly[20][1] = tmp7 * grly[13][1]
-                  - ylmcoef[20] * (rly[8] * ty + grly[8][1] * radius2);
-    grly[20][2] = ylmcoef[19] * (z * grly[13][2] + rly[13])
-                  - ylmcoef[20] * (rly[8] * tz + grly[8][2] * radius2);
-
-    double tmp8 = 3.0 * z;
-    rly[21] = tmp8 * rly[14]; // l=4,m=3
-    grly[21][0] = tmp8 * grly[14][0];
-    grly[21][1] = tmp8 * grly[14][1];
-    grly[21][2] = 3.0 * (z * grly[14][2] + rly[14]);
-
-    rly[22] = tmp8 * rly[15]; // l=4,m=-3
-    grly[22][0] = tmp8 * grly[15][0];
-    grly[22][1] = tmp8 * grly[15][1];
-    grly[22][2] = 3.0 * (z * grly[15][2] + rly[15]);
-
-    double tmp9 = ylmcoef[23] * x;
-    rly[23] = ylmcoef[21] * rly[19] - ylmcoef[22] * rly[7] * radius2
-              - tmp9 * rly[14]; // l=4,m=4
-    grly[23][0] = ylmcoef[21] * grly[19][0]
-                  - ylmcoef[22] * (rly[7] * tx + grly[7][0] * radius2)
-                  - ylmcoef[23] * (x * grly[14][0] + rly[14]);
-    grly[23][1] = ylmcoef[21] * grly[19][1]
-                  - ylmcoef[22] * (rly[7] * ty + grly[7][1] * radius2)
-                  - tmp9 * grly[14][1];
-    grly[23][2] = ylmcoef[21] * grly[19][2]
-                  - ylmcoef[22] * (rly[7] * tz + grly[7][2] * radius2)
-                  - tmp9 * grly[14][2];
-
-    rly[24] = ylmcoef[21] * rly[20] - ylmcoef[22] * rly[8] * radius2
-              - tmp9 * rly[15]; // l=4,m=-4
-    grly[24][0] = ylmcoef[21] * grly[20][0]
-                  - ylmcoef[22] * (rly[8] * tx + grly[8][0] * radius2)
-                  - ylmcoef[23] * (x * grly[15][0] + rly[15]);
-    grly[24][1] = ylmcoef[21] * grly[20][1]
-                  - ylmcoef[22] * (rly[8] * ty + grly[8][1] * radius2)
-                  - tmp9 * grly[15][1];
-    grly[24][2] = ylmcoef[21] * grly[20][2]
-                  - ylmcoef[22] * (rly[8] * tz + grly[8][2] * radius2)
-                  - tmp9 * grly[15][2];
-
-    if (Lmax == 4)
-        return;
-
-    /***************************
-             L = 5
-    ***************************/
-    rly[25]
-        = ylmcoef[24] * z * rly[16] - ylmcoef[25] * rly[9] * radius2; // l=5,m=0
-    grly[25][0] = ylmcoef[24] * z * grly[16][0]
-                  - ylmcoef[25] * (rly[9] * tx + grly[9][0] * radius2);
-    grly[25][1] = ylmcoef[24] * z * grly[16][1]
-                  - ylmcoef[25] * (rly[9] * ty + grly[9][1] * radius2);
-    grly[25][2] = ylmcoef[24] * (z * grly[16][2] + rly[16])
-                  - ylmcoef[25] * (rly[9] * tz + grly[9][2] * radius2);
-
-    double tmp10 = ylmcoef[26] * z;
-    rly[26] = tmp10 * rly[17] - ylmcoef[27] * rly[10] * radius2; // l=5,m=1
-    grly[26][0] = tmp10 * grly[17][0]
-                  - ylmcoef[27] * (rly[10] * tx + grly[10][0] * radius2);
-    grly[26][1] = tmp10 * grly[17][1]
-                  - ylmcoef[27] * (rly[10] * ty + grly[10][1] * radius2);
-    grly[26][2] = ylmcoef[26] * (z * grly[17][2] + rly[17])
-                  - ylmcoef[27] * (rly[10] * tz + grly[10][2] * radius2);
-
-    rly[27] = tmp10 * rly[18] - ylmcoef[27] * rly[11] * radius2; // l=5,m=-1
-    grly[27][0] = tmp10 * grly[18][0]
-                  - ylmcoef[27] * (rly[11] * tx + grly[11][0] * radius2);
-    grly[27][1] = tmp10 * grly[18][1]
-                  - ylmcoef[27] * (rly[11] * ty + grly[11][1] * radius2);
-    grly[27][2] = ylmcoef[26] * (z * grly[18][2] + rly[18])
-                  - ylmcoef[27] * (rly[11] * tz + grly[11][2] * radius2);
-
-    double tmp11 = ylmcoef[28] * z;
-    rly[28] = tmp11 * rly[19] - ylmcoef[29] * rly[12] * radius2; // l=5,m=2
-    grly[28][0] = tmp11 * grly[19][0]
-                  - ylmcoef[29] * (rly[12] * tx + grly[12][0] * radius2);
-    grly[28][1] = tmp11 * grly[19][1]
-                  - ylmcoef[29] * (rly[12] * ty + grly[12][1] * radius2);
-    grly[28][2] = ylmcoef[28] * (z * grly[19][2] + rly[19])
-                  - ylmcoef[29] * (rly[12] * tz + grly[12][2] * radius2);
-
-    rly[29] = tmp11 * rly[20] - ylmcoef[29] * rly[13] * radius2; // l=5,m=-2
-    grly[29][0] = tmp11 * grly[20][0]
-                  - ylmcoef[29] * (rly[13] * tx + grly[13][0] * radius2);
-    grly[29][1] = tmp11 * grly[20][1]
-                  - ylmcoef[29] * (rly[13] * ty + grly[13][1] * radius2);
-    grly[29][2] = ylmcoef[28] * (z * grly[20][2] + rly[20])
-                  - ylmcoef[29] * (rly[13] * tz + grly[13][2] * radius2);
-
-    double tmp12 = ylmcoef[30] * z;
-    rly[30] = tmp12 * rly[21] - ylmcoef[31] * rly[14] * radius2; // l=5,m=3
-    grly[30][0] = tmp12 * grly[21][0]
-                  - ylmcoef[31] * (grly[14][0] * radius2 + rly[14] * tx);
-    grly[30][1] = tmp12 * grly[21][1]
-                  - ylmcoef[31] * (grly[14][1] * radius2 + rly[14] * ty);
-    grly[30][2] = ylmcoef[30] * (z * grly[21][2] + rly[21])
-                  - ylmcoef[31] * (grly[14][2] * radius2 + rly[14] * tz);
-
-    rly[31] = tmp12 * rly[22] - ylmcoef[31] * rly[15] * radius2; // l=5,m=-3
-    grly[31][0] = tmp12 * grly[22][0]
-                  - ylmcoef[31] * (grly[15][0] * radius2 + rly[15] * tx);
-    grly[31][1] = tmp12 * grly[22][1]
-                  - ylmcoef[31] * (grly[15][1] * radius2 + rly[15] * ty);
-    grly[31][2] = ylmcoef[30] * (z * grly[22][2] + rly[22])
-                  - ylmcoef[31] * (grly[15][2] * radius2 + rly[15] * tz);
-
-    double tmp13 = ylmcoef[32] * z;
-    rly[32] = tmp13 * rly[23]; // l=5,m=4
-    grly[32][0] = tmp13 * grly[23][0];
-    grly[32][1] = tmp13 * grly[23][1];
-    grly[32][2] = ylmcoef[32] * (rly[23] + z * grly[23][2]);
-
-    rly[33] = tmp13 * rly[24]; // l=5,m=-4
-    grly[33][0] = tmp13 * grly[24][0];
-    grly[33][1] = tmp13 * grly[24][1];
-    grly[33][2] = ylmcoef[32] * (rly[24] + z * grly[24][2]);
-
-    double tmp14 = ylmcoef[35] * x;
-    rly[34] = ylmcoef[33] * rly[30] - ylmcoef[34] * rly[14] * radius2
-              - tmp14 * rly[23]; // l=5,m=5
-    grly[34][0] = ylmcoef[33] * grly[30][0]
-                  - ylmcoef[34] * (rly[14] * tx + grly[14][0] * radius2)
-                  - ylmcoef[35] * (x * grly[23][0] + rly[23]);
-    grly[34][1] = ylmcoef[33] * grly[30][1]
-                  - ylmcoef[34] * (rly[14] * ty + grly[14][1] * radius2)
-                  - tmp14 * grly[23][1];
-    grly[34][2] = ylmcoef[33] * grly[30][2]
-                  - ylmcoef[34] * (rly[14] * tz + grly[14][2] * radius2)
-                  - tmp14 * grly[23][2];
-
-    rly[35] = ylmcoef[33] * rly[31] - ylmcoef[34] * rly[15] * radius2
-              - tmp14 * rly[24]; // l=5,m=-5
-    grly[35][0] = ylmcoef[33] * grly[31][0]
-                  - ylmcoef[34] * (rly[15] * tx + grly[15][0] * radius2)
-                  - ylmcoef[35] * (x * grly[24][0] + rly[24]);
-    grly[35][1] = ylmcoef[33] * grly[31][1]
-                  - ylmcoef[34] * (rly[15] * ty + grly[15][1] * radius2)
-                  - tmp14 * grly[24][1];
-    grly[35][2] = ylmcoef[33] * grly[31][2]
-                  - ylmcoef[34] * (rly[15] * tz + grly[15][2] * radius2)
-                  - tmp14 * grly[24][2];
-
-    if (Lmax == 5)
-        return;
-
-    // if Lmax > 5
-    for (int il = 6; il <= Lmax; il++)
-    {
-        int istart = il * il;
-        int istart1 = (il - 1) * (il - 1);
-        int istart2 = (il - 2) * (il - 2);
-
-        double fac2 = sqrt(4.0 * istart - 1.0);
-        double fac4 = sqrt(4.0 * istart1 - 1.0);
-
-        for (int im = 0; im < 2 * il - 1; im++)
-        {
-            int imm = (im + 1) / 2;
-            //			if (im % 2 == 0) imm *= -1;
-
-            double var1 = fac2 / sqrt((double)istart - imm * imm);
-            double var2 = sqrt((double)istart1 - imm * imm) / fac4;
-
-            rly[istart + im] = var1
-                               * (z * rly[istart1 + im]
-                                  - var2 * rly[istart2 + im] * radius2);
-
-            grly[istart + im][0]
-                = var1
-                  * (z * grly[istart1 + im][0]
-                     - var2
-                           * (rly[istart2 + im] * tx
-                              + grly[istart2 + im][0] * radius2));
-            grly[istart + im][1]
-                = var1
-                  * (z * grly[istart1 + im][1]
-                     - var2
-                           * (rly[istart2 + im] * ty
-                              + grly[istart2 + im][1] * radius2));
-            grly[istart + im][2]
-                = var1
-                  * (z * grly[istart1 + im][2] + rly[istart1 + im]
-                     - var2
-                           * (rly[istart2 + im] * tz
-                              + grly[istart2 + im][2] * radius2));
-        }
-
-        double bl1 = sqrt(2.0 * il / (2.0 * il + 1.0));
-        double bl2 = sqrt((2.0 * il - 2.0) / (2.0 * il - 1.0));
-        double bl3 = sqrt(2.0) / fac2;
-
-        int id1 = istart + 2 * il - 1;
-        int id2 = istart + 2 * il - 5;
-        int id3 = istart2 + 2 * il - 5;
-        int id4 = istart1 + 2 * il - 3;
-
-        rly[id1]
-            = (bl3 * rly[id2] - bl2 * rly[id3] * radius2 - 2.0 * x * rly[id4])
-              / bl1;
-        grly[id1][0] = (bl3 * grly[id2][0]
-                        - bl2 * (grly[id3][0] * radius2 + rly[id3] * tx)
-                        - 2.0 * (rly[id4] + x * grly[id4][0]))
-                       / bl1;
-        grly[id1][1] = (bl3 * grly[id2][1]
-                        - bl2 * (grly[id3][1] * radius2 + rly[id3] * ty)
-                        - 2.0 * x * grly[id4][1])
-                       / bl1;
-        grly[id1][2] = (bl3 * grly[id2][2]
-                        - bl2 * (grly[id3][2] * radius2 + rly[id3] * tz)
-                        - 2.0 * x * grly[id4][2])
-                       / bl1;
-
-        rly[id1 + 1] = (bl3 * rly[id2 + 1] - bl2 * rly[id3 + 1] * radius2
-                        - 2.0 * x * rly[id4 + 1])
-                       / bl1;
-        grly[id1 + 1][0]
-            = (bl3 * grly[id2 + 1][0]
-               - bl2 * (grly[id3 + 1][0] * radius2 + rly[id3 + 1] * tx)
-               - 2.0 * (rly[id4 + 1] + x * grly[id4 + 1][0]))
-              / bl1;
-        grly[id1 + 1][1]
-            = (bl3 * grly[id2 + 1][1]
-               - bl2 * (grly[id3 + 1][1] * radius2 + rly[id3 + 1] * ty)
-               - 2.0 * x * grly[id4 + 1][1])
-              / bl1;
-        grly[id1 + 1][2]
-            = (bl3 * grly[id2 + 1][2]
-               - bl2 * (grly[id3 + 1][2] * radius2 + rly[id3 + 1] * tz)
-               - 2.0 * x * grly[id4 + 1][2])
-              / bl1;
-    }
-
-    return;
-}
\ No newline at end of file
diff --git a/source/source_lcao/module_gint/test/test_sph.cu b/source/source_lcao/module_gint/test/test_sph.cu
deleted file mode 100644
index 9d41705667..0000000000
--- a/source/source_lcao/module_gint/test/test_sph.cu
+++ /dev/null
@@ -1,138 +0,0 @@
-#include <bits/stdc++.h>
-#include "../kernels/cuda/sph.cuh"
-
-#include "float.h"
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-#include "gtest/gtest.h"
-#include "source_lcao/module_hcontainer/hcontainer.h"
-#include "test_sph.h"
-#include "source_base/array_pool.h"
-using namespace std;
-
-class gintTest : public ::testing::Test
-{
-  public:
-};
-
-__global__ void cuda_test(double* dr, int nwl, double* ylma_g, double* ylmcoef)
-{
-    double ylma[49] = {0.0};
-    GintKernel::spherical_harmonics(dr, nwl, ylma, ylmcoef);
-    for (int i = 0; i < 49; i++)
-    {
-        ylma_g[i] = ylma[i];
-    }
-}
-__global__ void cuda_test2(double* dr, double distance, int nwl, double* dylma_g, double* ylmcoef)
-{
-    double ylma[49] = {0.0};
-    double grly[49][3] = {0.0};
-    GintKernel::spherical_harmonics_d(dr, distance, grly, nwl, ylma, ylmcoef);
-    for (int i = 0; i < 49; i++)
-    {
-        dylma_g[i] = ylma[i];
-    }
-}
-
-void get_random_double(int min, int max, double* result, int length)
-{
-    std::random_device rd;
-    std::default_random_engine eng(rd());
-    std::uniform_real_distribution<double> distribution(0, 10);
-    for (int i = 0; i < 3; i++)
-    {
-        result[i] = distribution(eng);
-    }
-}
-void get_random_int(int min, int max, int& result)
-{
-    std::random_device rd;
-    std::default_random_engine eng(rd());
-    std::uniform_int_distribution<int> distribution(min, max);
-    result = distribution(eng);
-}
-// __global__ void cuda_test
-TEST_F(gintTest, test)
-{
-    int nwl;
-    double distance;
-
-    double* dr = new double[3];
-    double* dr_g;
-
-    double ylma[49];
-    double dylma[49];
-    double ylma_ans[49];
-
-    double* ylmcoef_g;
-    double* ylma_g;
-    double* dylma_g;
-    double* ylmcoef = new double[100];
-
-    std::vector<double> ylma_cpu(49, 0.0);
-    std::vector<double> ylma_cpu_dpsir(49, 0.0);
-    ModuleBase::Array_Pool<double> ylma_cpu_ddpsir(49, 3);
-    
-    nwl=3;
-    for (int i=0;i<3;i++){
-        dr[i]=i*1.0;
-        distance += dr[i] * dr[i];
-    }
-    for (int i=0;i<100;i++)
-    {
-        ylmcoef[i]=i*0.1;
-    }
-
-    cudaMalloc((void**)&ylmcoef_g, 100 * sizeof(double));
-    cudaMalloc((void**)&dr_g, 3 * sizeof(double));
-    cudaMalloc((void**)&ylma_g, 49 * sizeof(double));
-    cudaMalloc((void**)&dylma_g, 49 * 3 * sizeof(double));
-
-    cudaMemcpy(ylmcoef_g, ylmcoef, 100 * sizeof(double), cudaMemcpyHostToDevice);
-    cudaMemcpy(dr_g, dr, 3 * sizeof(double), cudaMemcpyHostToDevice);
-    cudaMemset(ylma_g, 0, 49 * sizeof(double));
-    cudaMemset(dylma_g, 0, 49 * sizeof(double));
-
-    cuda_test<<<1, 1>>>(dr_g, nwl, ylma_g, ylmcoef_g);
-    cuda_test2<<<1, 1>>>(dr_g, distance, nwl, dylma_g, ylmcoef_g);
-    sph_harm(nwl, dr[0], dr[1], dr[2], ylma_cpu, ylmcoef);
-    grad_rl_sph_harm(nwl, dr[0], dr[1], dr[2], ylma_cpu_dpsir.data(), ylma_cpu_ddpsir.get_ptr_2D(), ylmcoef);
-    cudaMemcpy(ylma, ylma_g, 49 * sizeof(double), cudaMemcpyDeviceToHost);
-    cudaMemcpy(dylma, dylma_g, 49 * sizeof(double), cudaMemcpyDeviceToHost);
-    cudaDeviceReset();
-
-    for (int i = 0; i < 49; i++)
-    {
-        ylma_ans[i] = ylma_cpu[i];
-        if ((abs(ylma[i])!= 0) && (ylma_ans[i]==ylma_ans[i]) && (ylma[i]==ylma[i]))
-        {
-            EXPECT_LT(abs(ylma_ans[i] - ylma[i]) / abs(ylma[i]), 1e-15);
-        }
-        ylma_ans[i] = ylma_cpu_dpsir[i];
-        if ((abs(dylma[i]) != 0) &&(ylma_ans[i]==ylma_ans[i]) && (dylma[i]==dylma[i]))
-        {
-            EXPECT_LT(abs(ylma_ans[i] - dylma[i]) / abs(dylma[i]), 1e-15);
-        }
-    }
-    delete[] dr;
-    delete[] ylmcoef;
-    
-}
-
-int main(int argc, char** argv)
-{
-#ifdef __MPI
-    MPI_Init(&argc, &argv);
-    MPI_Comm_size(MPI_COMM_WORLD, &GlobalV::NPROC);
-    MPI_Comm_rank(MPI_COMM_WORLD, &GlobalV::MY_RANK);
-#endif
-    testing::InitGoogleTest(&argc, argv);
-    int result = RUN_ALL_TESTS();
-
-#ifdef __MPI
-    MPI_Finalize();
-#endif
-
-    return result;
-}
\ No newline at end of file
diff --git a/source/source_lcao/module_gint/test/test_sph.h b/source/source_lcao/module_gint/test/test_sph.h
deleted file mode 100644
index 141e917200..0000000000
--- a/source/source_lcao/module_gint/test/test_sph.h
+++ /dev/null
@@ -1,19 +0,0 @@
-#ifndef TEST_SPH_H
-#define TEST_SPH_H
-#include <bits/stdc++.h>
-// using namespace std;
-void sph_harm(const int& Lmax, 
-              const double& xdr,
-              const double& ydr,
-              const double& zdr,
-              std::vector<double>& rly,
-              double* ylmcoef);
-              
-void grad_rl_sph_harm(const int& Lmax, // max momentum of L
-                      const double& x,
-                      const double& y,
-                      const double& z,
-                      double* rly,
-                      double** grly,
-                      const double* ylmcoef);
-#endif
\ No newline at end of file
diff --git a/source/source_lcao/module_gint/temp_gint/unitcell_info.cpp b/source/source_lcao/module_gint/unitcell_info.cpp
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/unitcell_info.cpp
rename to source/source_lcao/module_gint/unitcell_info.cpp
diff --git a/source/source_lcao/module_gint/temp_gint/unitcell_info.h b/source/source_lcao/module_gint/unitcell_info.h
similarity index 100%
rename from source/source_lcao/module_gint/temp_gint/unitcell_info.h
rename to source/source_lcao/module_gint/unitcell_info.h
diff --git a/source/source_lcao/module_lr/esolver_lrtd_lcao.cpp b/source/source_lcao/module_lr/esolver_lrtd_lcao.cpp
index 90bcab0c47..f8cafe4902 100644
--- a/source/source_lcao/module_lr/esolver_lrtd_lcao.cpp
+++ b/source/source_lcao/module_lr/esolver_lrtd_lcao.cpp
@@ -1,5 +1,4 @@
 #include "esolver_lrtd_lcao.h"
-#include "utils/gint_move.hpp"
 #include "utils/lr_util.h"
 #include "hamilt_casida.h"
 #include "hamilt_ulr.hpp"
@@ -44,8 +43,6 @@ void LR::ESolver_LR<double>::move_exx_lri(std::shared_ptr<Exx_LRI<std::complex<d
     throw std::runtime_error("ESolver_LR<double>::move_exx_lri: cannot move std::complex<double> to double");
 }
 #endif
-template<>void LR::ESolver_LR<double>::set_gint() { this->gint_ = &this->gint_g_;this->gint_g_.gridt = &this->gt_; }
-template<>void LR::ESolver_LR<std::complex<double>>::set_gint() { this->gint_ = &this->gint_k_; this->gint_k_.gridt = &this->gt_; }
 
 inline int cal_nupdown_form_occ(const ModuleBase::matrix& wg)
 {   // only for nspin=2
@@ -241,23 +238,7 @@ LR::ESolver_LR<T, TR>::ESolver_LR(ModuleESolver::ESolver_KS_LCAO<T, TR>&& ks_sol
         this->nupdown = cal_nupdown_form_occ(ks_sol.pelec->wg);
         reset_dim_spin2();
     }
-#ifdef __OLD_GINT
-    //grid integration
-    this->gt_ = std::move(ks_sol.GridT);
-
-	if (std::is_same<T, double>::value) 
-	{ 
-		this->gint_g_ = std::move(ks_sol.GG); 
-	}
-	else 
-	{ 
-		this->gint_k_ = std::move(ks_sol.GK); 
-	}
-    this->set_gint();
-    this->gint_->reset_DMRGint(1);
-#else
     this->gint_info_ = std::move(ks_sol.gint_info_);
-#endif
     // move pw basis
     if (this->pw_rho_flag)
     {
@@ -395,66 +376,6 @@ LR::ESolver_LR<T, TR>::ESolver_LR(const Input_para& inp, UnitCell& ucell) : inpu
                          this->ucell,
                          search_radius,
                          PARAM.inp.test_atom_input);
-#ifdef __OLD_GINT
-    this->set_gint();
-    this->gint_->gridt = &this->gt_;
-
-    // (3) Periodic condition search for each grid.
-    double dr_uniform = 0.001;
-    std::vector<double> rcuts;
-    std::vector<std::vector<double>> psi_u;
-    std::vector<std::vector<double>> dpsi_u;
-    std::vector<std::vector<double>> d2psi_u;
-
-    Gint_Tools::init_orb(dr_uniform, rcuts, ucell, orb, psi_u, dpsi_u, d2psi_u);
-    this->gt_.set_pbc_grid(this->pw_rho->nx,
-                           this->pw_rho->ny,
-                           this->pw_rho->nz,
-                           this->pw_big->bx,
-                           this->pw_big->by,
-                           this->pw_big->bz,
-                           this->pw_big->nbx,
-                           this->pw_big->nby,
-                           this->pw_big->nbz,
-                           this->pw_big->nbxx,
-                           this->pw_big->nbzp_start,
-                           this->pw_big->nbzp,
-                           this->pw_rho->ny,
-                           this->pw_rho->nplane,
-                           this->pw_rho->startz_current,
-                           ucell,
-                           this->gd,
-                           dr_uniform,
-                           rcuts,
-                           psi_u,
-                           dpsi_u,
-                           d2psi_u,
-                           PARAM.inp.nstream);
-    psi_u.clear();
-    psi_u.shrink_to_fit();
-    dpsi_u.clear();
-    dpsi_u.shrink_to_fit();
-    d2psi_u.clear();
-    d2psi_u.shrink_to_fit();
-
-    this->gint_->prep_grid(this->gt_,
-        this->pw_big->nbx,
-        this->pw_big->nby,
-        this->pw_big->nbzp,
-        this->pw_big->nbzp_start,
-        this->pw_rho->nxyz,
-        this->pw_big->bx,
-        this->pw_big->by,
-        this->pw_big->bz,
-        this->pw_big->bxyz,
-        this->pw_big->nbxx,
-        this->pw_rho->ny,
-        this->pw_rho->nplane,
-        this->pw_rho->startz_current,
-        &ucell,
-        &orb);
-    this->gint_->initialize_pvpR(ucell, &this->gd, 1); // always use nspin=1 for transition density
-#else
     gint_info_.reset(
         new ModuleGint::GintInfo(
         this->pw_big->nbx,
@@ -473,7 +394,6 @@ LR::ESolver_LR<T, TR>::ESolver_LR(const Input_para& inp, UnitCell& ucell) : inpu
         ucell,
         this->gd));
     ModuleGint::Gint::set_gint_info(gint_info_.get());
-#endif
     // if EXX from scratch, init 2-center integral and calculate Cs, Vs 
 #ifdef __EXX
     if ((xc_kernel == "hf" || xc_kernel == "hse") && this->input.lr_solver != "spectrum")
@@ -533,7 +453,6 @@ void LR::ESolver_LR<T, TR>::runner(UnitCell& ucell, const int istep)
                               this->exx_lri,
                               this->exx_info.info_global.hybrid_alpha,
 #endif
-                              this->gint_,
                               this->pot,
                               this->kv,
                               this->paraX_,
@@ -564,7 +483,6 @@ void LR::ESolver_LR<T, TR>::runner(UnitCell& ucell, const int istep)
                                 this->exx_lri,
                                 this->exx_info.info_global.hybrid_alpha,
 #endif
-                                this->gint_,
                                 this->pot[is],
                                 this->kv,
                                 this->paraX_,
@@ -621,7 +539,7 @@ void LR::ESolver_LR<T, TR>::after_all_runners(UnitCell& ucell)
     auto spin_types = (nspin == 2 && !openshell) ? std::vector<std::string>({ "singlet", "triplet" }) : std::vector<std::string>({ "updown" });
     for (int is = 0;is < this->X.size();++is)
     {
-        LR_Spectrum<T> spectrum(nspin, this->nbasis, this->nocc, this->nvirt, this->gint_, *this->pw_rho, *this->psi_ks,
+        LR_Spectrum<T> spectrum(nspin, this->nbasis, this->nocc, this->nvirt, *this->pw_rho, *this->psi_ks,
             this->ucell, this->kv, this->gd, this->orb_cutoff_, this->two_center_bundle_,
             this->paraX_, this->paraC_, this->paraMat_,
             &this->pelec->ekb.c[is * nstates], this->X[is].template data<T>(), nstates, openshell,
diff --git a/source/source_lcao/module_lr/esolver_lrtd_lcao.h b/source/source_lcao/module_lr/esolver_lrtd_lcao.h
index f08ddec52b..3f2d040501 100644
--- a/source/source_lcao/module_lr/esolver_lrtd_lcao.h
+++ b/source/source_lcao/module_lr/esolver_lrtd_lcao.h
@@ -11,13 +11,10 @@
 #include <memory>
 
 #include "source_esolver/esolver_ks_lcao.h" //for the move constructor
-#include "source_lcao/module_gint/gint_gamma.h"
-#include "source_lcao/module_gint/gint_k.h"
-#include "source_lcao/module_gint/grid_technique.h"
 #include "source_estate/module_dm/density_matrix.h"
 #include "source_lcao/module_lr/potentials/pot_hxc_lrtd.h"
 #include "source_lcao/module_lr/hamilt_casida.h"
-#include "source_lcao/module_gint/temp_gint/gint_info.h"
+#include "source_lcao/module_gint/gint_info.h"
 #ifdef __EXX
 // #include <RI/physics/Exx.h>
 #include "source_lcao/module_ri/Exx_LRI.h"
@@ -90,13 +87,6 @@ namespace LR
         bool openshell = false;
         std::string xc_kernel;
 
-        Grid_Technique gt_;
-        Gint_Gamma gint_g_;
-        Gint_k gint_k_;
-        typename TGint<T>::type* gint_ = nullptr;
-        #ifndef __OLD_GINT
         std::unique_ptr<ModuleGint::GintInfo> gint_info_ = nullptr;
-        #endif
-        void set_gint();
 
         /// @brief variables for parallel distribution of KS orbitals
diff --git a/source/source_lcao/module_lr/hamilt_casida.h b/source/source_lcao/module_lr/hamilt_casida.h
index e692dc051a..d835fad2d3 100644
--- a/source/source_lcao/module_lr/hamilt_casida.h
+++ b/source/source_lcao/module_lr/hamilt_casida.h
@@ -17,7 +17,6 @@ namespace LR
     class HamiltLR
     {
     public:
-      template <typename TGint>
       HamiltLR(std::string& xc_kernel,
                const int& nspin,
                const int& naos,
@@ -32,7 +31,6 @@ namespace LR
                std::weak_ptr<Exx_LRI<T>> exx_lri_in,
                const double& exx_alpha,
 #endif
-               TGint* gint_in,
                std::weak_ptr<PotHxcLR> pot_in,
                const K_Vectors& kv_in,
                const std::vector<Parallel_2D>& pX_in,
@@ -95,7 +93,7 @@ namespace LR
 #endif
             {
                 OperatorLRHxc<T>* lr_hxc = new OperatorLRHxc<T>(nspin, naos, nocc, nvirt, psi_ks_in,
-                    this->DM_trans, gint_in, pot_in, ucell_in, orb_cutoff, gd_in, kv_in, pX_in, pc_in, pmat_in);
+                    this->DM_trans, pot_in, ucell_in, orb_cutoff, gd_in, kv_in, pX_in, pc_in, pmat_in);
                 this->ops->add(lr_hxc);
             }
 #ifdef __EXX
diff --git a/source/source_lcao/module_lr/hamilt_ulr.hpp b/source/source_lcao/module_lr/hamilt_ulr.hpp
index 838a3d4999..4f5fdfbfd9 100644
--- a/source/source_lcao/module_lr/hamilt_ulr.hpp
+++ b/source/source_lcao/module_lr/hamilt_ulr.hpp
@@ -15,7 +15,6 @@ namespace LR
     class HamiltULR
     {
     public:
-        template<typename TGint>
         HamiltULR(std::string& xc_kernel,
             const int& nspin,
             const int& naos,
@@ -30,7 +29,6 @@ namespace LR
             std::weak_ptr<Exx_LRI<T>> exx_lri_in,
             const double& exx_alpha,
 #endif 
-            TGint* gint_in,
             std::vector<std::shared_ptr<PotHxcLR>>& pot_in,
             const K_Vectors& kv_in,
             const std::vector<Parallel_2D>& pX_in,   ///< {up, down}
@@ -49,7 +47,7 @@ namespace LR
             this->ops[3] = new OperatorLRDiag<T>(eig_ks.c + nk * (nocc[0] + nvirt[0]), pX_in[1], nk, nocc[1], nvirt[1]);
 
             auto newHxc = [&](const int& sl, const int& sr) { return new OperatorLRHxc<T>(nspin, naos, nocc, nvirt, psi_ks_in,
-                this->DM_trans, gint_in, pot_in[sl], ucell_in, orb_cutoff, gd_in, kv_in, pX_in, pc_in, pmat_in, { sl,sr }); };
+                this->DM_trans, pot_in[sl], ucell_in, orb_cutoff, gd_in, kv_in, pX_in, pc_in, pmat_in, { sl,sr }); };
             this->ops[0]->add(newHxc(0, 0));
             this->ops[1] = newHxc(0, 1);
             this->ops[2] = newHxc(1, 0);
diff --git a/source/source_lcao/module_lr/lr_spectrum.cpp b/source/source_lcao/module_lr/lr_spectrum.cpp
index 235a9829e2..f698541c30 100644
--- a/source/source_lcao/module_lr/lr_spectrum.cpp
+++ b/source/source_lcao/module_lr/lr_spectrum.cpp
@@ -6,7 +6,7 @@
 #include "source_lcao/module_lr/utils/lr_util.h"
 #include "source_lcao/module_lr/utils/lr_util_hcontainer.h"
 #include "source_lcao/module_lr/utils/lr_util_print.h"
-#include "source_lcao/module_gint/temp_gint/gint_interface.h"
+#include "source_lcao/module_gint/gint_interface.h"
 
 template <typename T>
 elecstate::DensityMatrix<T, T> LR::LR_Spectrum<T>::cal_transition_density_matrix(const int istate, const T* X_in, const bool need_R)
@@ -35,16 +35,6 @@ elecstate::DensityMatrix<T, T> LR::LR_Spectrum<T>::cal_transition_density_matrix
     return DM_trans;
 }
 
-#ifdef __OLD_GINT
-template<typename T>
-void LR::LR_Spectrum<T>::cal_gint_rho(double** rho, const int& nrxx)
-{
-    ModuleBase::GlobalFunc::ZEROS(rho[0], nrxx);
-    Gint_inout inout_rho(rho, Gint_Tools::job_type::rho, 1, false);
-    this->gint->cal_gint(&inout_rho);
-}
-#endif
-
 inline void check_sum_rule(const double& osc_tot)
 {
     if (std::abs(osc_tot - 1.0) > 1e-3) {
@@ -65,13 +55,8 @@ ModuleBase::Vector3<double> LR::LR_Spectrum<double>::cal_transition_dipole_istat
         // 2. transition density
         double** rho_trans;
         LR_Util::_allocate_2order_nested_ptr(rho_trans, 1, this->rho_basis.nrxx);
-#ifdef __OLD_GINT
-        this->gint->transfer_DM2DtoGrid({ DM_trans.get_DMR_vector().at(is) });
-        this->cal_gint_rho(rho_trans, this->rho_basis.nrxx);
-#else
         ModuleBase::GlobalFunc::ZEROS(rho_trans[0], this->rho_basis.nrxx);
         ModuleGint::cal_gint_rho({ DM_trans.get_DMR_vector().at(is) }, 1, rho_trans, false);
-#endif
 
         // 3. transition dipole moment
         for (int ir = 0; ir < rho_basis.nrxx; ++ir)
@@ -115,24 +100,14 @@ ModuleBase::Vector3<std::complex<double>> LR::LR_Spectrum<std::complex<double>>:
 
         // real part
         LR_Util::get_DMR_real_imag_part(DM_trans, DM_trans_real_imag, ucell.nat, 'R');
-#ifdef __OLD_GINT
-        this->gint->transfer_DM2DtoGrid(DM_trans_real_imag.get_DMR_vector());
-        this->cal_gint_rho(rho_trans_real, this->rho_basis.nrxx);
-#else
         ModuleBase::GlobalFunc::ZEROS(rho_trans_real[0], this->rho_basis.nrxx);
         ModuleGint::cal_gint_rho(DM_trans_real_imag.get_DMR_vector(), 1, rho_trans_real, false);
-#endif
         // LR_Util::print_grid_nonzero(rho_trans_real[0], this->rho_basis.nrxx, 10, "rho_trans");
 
         // imag part
         LR_Util::get_DMR_real_imag_part(DM_trans, DM_trans_real_imag, ucell.nat, 'I');
-#ifdef __OLD_GINT
-        this->gint->transfer_DM2DtoGrid(DM_trans_real_imag.get_DMR_vector());
-        this->cal_gint_rho(rho_trans_imag, this->rho_basis.nrxx);
-#else
         ModuleBase::GlobalFunc::ZEROS(rho_trans_imag[0], this->rho_basis.nrxx);
         ModuleGint::cal_gint_rho(DM_trans_real_imag.get_DMR_vector(), 1, rho_trans_imag, false);
-#endif
         // LR_Util::print_grid_nonzero(rho_trans_imag[0], this->rho_basis.nrxx, 10, "rho_trans");
 
         // 3. transition dipole moment
diff --git a/source/source_lcao/module_lr/lr_spectrum.h b/source/source_lcao/module_lr/lr_spectrum.h
index bd2d1f489d..79a188621b 100644
--- a/source/source_lcao/module_lr/lr_spectrum.h
+++ b/source/source_lcao/module_lr/lr_spectrum.h
@@ -1,6 +1,5 @@
 #pragma once
 #include "source_cell/klist.h"
-#include "source_lcao/module_lr/utils/gint_template.h"
 #include "source_psi/psi.h"
 #include "source_estate/module_dm/density_matrix.h"
 #include "source_lcao/module_lr/utils/lr_util.h"
@@ -13,14 +12,14 @@ namespace LR
     {
     public:
         LR_Spectrum(const int& nspin_global, const int& naos, const std::vector<int>& nocc, const std::vector<int>& nvirt,
-            typename TGint<T>::type* gint, const ModulePW::PW_Basis& rho_basis, psi::Psi<T>& psi_ks_in,
+            const ModulePW::PW_Basis& rho_basis, psi::Psi<T>& psi_ks_in,
             const UnitCell& ucell, const K_Vectors& kv_in, const Grid_Driver& gd, const std::vector<double>& orb_cutoff,
             const TwoCenterBundle& two_center_bundle_,
             const std::vector<Parallel_2D>& pX_in, const Parallel_2D& pc_in, const Parallel_Orbitals& pmat_in,
             const double* eig, const T* X, const int& nstate, const bool& openshell,
             const std::string& gauge = "length") :
             nspin_x(openshell ? 2 : 1), naos(naos), nocc(nocc), nvirt(nvirt), nk(kv_in.get_nks() / nspin_global),
-            gint(gint), rho_basis(rho_basis), ucell(ucell), kv(kv_in), gd_(gd),
+            rho_basis(rho_basis), ucell(ucell), kv(kv_in), gd_(gd),
             orb_cutoff_(orb_cutoff), two_center_bundle_(two_center_bundle_),
             pX(pX_in), pc(pc_in), pmat(pmat_in),
             eig(eig), X(X), nstate(nstate),
@@ -75,7 +74,6 @@ namespace LR
         const std::vector<Parallel_2D>& pX;
         const Parallel_2D& pc;
         const Parallel_Orbitals& pmat;
-        typename TGint<T>::type* gint = nullptr;
         const ModulePW::PW_Basis& rho_basis;
         const Grid_Driver& gd_;
         const UnitCell& ucell;
diff --git a/source/source_lcao/module_lr/operator_casida/operator_lr_hxc.cpp b/source/source_lcao/module_lr/operator_casida/operator_lr_hxc.cpp
index 56d2e4fda7..4aed4244f4 100644
--- a/source/source_lcao/module_lr/operator_casida/operator_lr_hxc.cpp
+++ b/source/source_lcao/module_lr/operator_casida/operator_lr_hxc.cpp
@@ -9,7 +9,7 @@
 #include "source_lcao/module_hcontainer/hcontainer_funcs.h"
 #include "source_lcao/module_lr/ao_to_mo_transformer/ao_to_mo.h"
 #include "source_pw/module_pwdft/global.h"
-#include "source_lcao/module_gint/temp_gint/gint_interface.h"
+#include "source_lcao/module_gint/gint_interface.h"
 
 inline double conj(double a) { return a; }
 inline std::complex<double> conj(std::complex<double> a) { return std::conj(a); }
@@ -61,13 +61,7 @@ namespace LR
         const int& nrxx = this->pot.lock()->nrxx;
         LR_Util::_allocate_2order_nested_ptr(rho_trans, 1, nrxx); // currently gint_kernel_rho uses PARAM.inp.nspin, it needs refactor
         ModuleBase::GlobalFunc::ZEROS(rho_trans[0], nrxx);
-#ifdef __OLD_GINT
-        this->gint->transfer_DM2DtoGrid(this->DM_trans->get_DMR_vector());     // 2d block to grid
-        Gint_inout inout_rho(rho_trans, Gint_Tools::job_type::rho, 1, false);
-        this->gint->cal_gint(&inout_rho);
-#else
         ModuleGint::cal_gint_rho(this->DM_trans->get_DMR_vector(), 1, rho_trans, false);
-#endif
         // 3. v_hxc = f_hxc * rho_trans
         ModuleBase::matrix vr_hxc(1, nrxx);   //grid
         this->pot.lock()->cal_v_eff(rho_trans, ucell, vr_hxc, ispin_ks);
@@ -75,14 +69,7 @@ namespace LR
 
         // 4. V^{Hxc}_{\mu,\nu}=\int{dr} \phi_\mu(r) v_{Hxc}(r) \phi_\mu(r)
         this->hR->set_zero();   // clear hR for each bands
-#ifdef __OLD_GINT
-        Gint_inout inout_vlocal(vr_hxc.c, 0, Gint_Tools::job_type::vlocal);
-        this->gint->get_hRGint()->set_zero();
-        this->gint->cal_gint(&inout_vlocal);
-        this->gint->transfer_pvpR(&*this->hR, &ucell);    //grid to 2d block
-#else
         ModuleGint::cal_gint_vl(vr_hxc.c, &*this->hR);
-#endif
         ModuleBase::timer::tick("OperatorLRHxc", "grid_calculation");
     }
 
@@ -109,14 +96,7 @@ namespace LR
 
                 LR_Util::_allocate_2order_nested_ptr(rho_trans, 1, nrxx); // nspin=1 for transition density
                 ModuleBase::GlobalFunc::ZEROS(rho_trans[0], nrxx);
-#ifdef __OLD_GINT
-                this->gint->transfer_DM2DtoGrid(DM_trans_real_imag.get_DMR_vector());
-                // LR_Util::print_HR(*this->gint->get_DMRGint()[0], this->ucell.nat, "DMR(grid, real)");
-                Gint_inout inout_rho(rho_trans, Gint_Tools::job_type::rho, 1, false);
-                this->gint->cal_gint(&inout_rho);
-#else
                 ModuleGint::cal_gint_rho(DM_trans_real_imag.get_DMR_vector(), 1, rho_trans, false);
-#endif
                 // print_grid_nonzero(rho_trans[0], nrxx, 10, "rho_trans");
 
                 // 3. v_hxc = f_hxc * rho_trans
@@ -128,15 +108,7 @@ namespace LR
 
                 // 4. V^{Hxc}_{\mu,\nu}=\int{dr} \phi_\mu(r) v_{Hxc}(r) \phi_\mu(r)
                 HR_real_imag.set_zero();
-#ifdef __OLD_GINT
-                Gint_inout inout_vlocal(vr_hxc.c, 0, Gint_Tools::job_type::vlocal);
-                this->gint->get_hRGint()->set_zero();
-                this->gint->cal_gint(&inout_vlocal);
-                // LR_Util::print_HR(*this->gint->get_hRGint(), this->ucell.nat, "VR(grid)");
-                this->gint->transfer_pvpR(&HR_real_imag, &ucell, &this->gd);
-#else
                 ModuleGint::cal_gint_vl(vr_hxc.c, &HR_real_imag);
-#endif
                 // LR_Util::print_HR(HR_real_imag, this->ucell.nat, "VR(real, 2d)");
                 LR_Util::set_HR_real_imag_part(HR_real_imag, *this->hR, ucell.nat, type);
             };
diff --git a/source/source_lcao/module_lr/operator_casida/operator_lr_hxc.h b/source/source_lcao/module_lr/operator_casida/operator_lr_hxc.h
index e201561380..bb82780e14 100644
--- a/source/source_lcao/module_lr/operator_casida/operator_lr_hxc.h
+++ b/source/source_lcao/module_lr/operator_casida/operator_lr_hxc.h
@@ -1,8 +1,6 @@
 #pragma once
 #include "source_cell/klist.h"
 #include "source_hamilt/operator.h"
-#include "source_lcao/module_lr/utils/gint_template.h"
-#include "source_lcao/module_gint/grid_technique.h"
 #include "source_estate/module_dm/density_matrix.h"
 #include "source_lcao/module_lr/potentials/pot_hxc_lrtd.h"
 #include "source_lcao/module_lr/utils/lr_util.h"
@@ -21,7 +19,6 @@ namespace LR
                     const std::vector<int>& nvirt,
                     const psi::Psi<T, Device>& psi_ks_in,
                     std::unique_ptr<elecstate::DensityMatrix<T, T>>& DM_trans_in,
-                    typename TGint<T>::type* gint_in,
                     std::weak_ptr<PotHxcLR> pot_in,
                     const UnitCell& ucell_in,
                     const std::vector<double>& orb_cutoff,
@@ -32,7 +29,7 @@ namespace LR
                     const Parallel_Orbitals& pmat_in,
                     const std::vector<int>& ispin_ks = {0})
           : nspin(nspin), naos(naos), nocc(nocc), nvirt(nvirt), nk(kv_in.get_nks() / nspin), psi_ks(psi_ks_in),
-            DM_trans(DM_trans_in), gint(gint_in), pot(pot_in), ucell(ucell_in), orb_cutoff_(orb_cutoff), gd(gd_in),
+            DM_trans(DM_trans_in), pot(pot_in), ucell(ucell_in), orb_cutoff_(orb_cutoff), gd(gd_in),
             kv(kv_in), pX(pX_in), pc(pc_in), pmat(pmat_in), ispin_ks(ispin_ks)
       {
           ModuleBase::TITLE("OperatorLRHxc", "OperatorLRHxc");
@@ -82,8 +79,6 @@ namespace LR
 
         std::weak_ptr<PotHxcLR> pot;
 
-        typename TGint<T>::type* gint = nullptr;
-
         const UnitCell& ucell;
         std::vector<double> orb_cutoff_;
         const Grid_Driver& gd;
diff --git a/source/source_lcao/module_lr/utils/gint_move.hpp b/source/source_lcao/module_lr/utils/gint_move.hpp
deleted file mode 100644
index b7c01118ef..0000000000
--- a/source/source_lcao/module_lr/utils/gint_move.hpp
+++ /dev/null
@@ -1,87 +0,0 @@
-#include "lr_util.h"
-#include "source_lcao/module_gint/gint_gamma.h"
-#include "source_io/module_parameter/parameter.h"
-#include "source_lcao/module_gint/gint_k.h"
-#include "source_lcao/module_gint/grid_technique.h"
-
-// Here will be  the only place where GlobalCs are used (to be moved) in source_lcao/module_lr
-#include "source_pw/module_pwdft/global.h"
-
-template <typename T>
-using D2 = void(*) (T**, size_t);
-// template <typename T>
-// using D3 = void(*) (T***, size_t, size_t);
-// template <typename T>
-// D2<T> d2 = LR_Util::_deallocate_2order_nested_ptr<T>;
-// template <typename T>
-// D3<T> d3 = LR_Util::delete_p3<T>;
-// Change to C++ 11
-D2<double> d2 = LR_Util::_deallocate_2order_nested_ptr<double>;
-// D3<double> d3 = LR_Util::delete_p3<double>;
-
-
-Gint& Gint::operator=(Gint&& rhs)
-{
-    if (this == &rhs) {return *this;
-}
-
-    this->nbx = rhs.nbx;
-    this->nby = rhs.nby;
-    this->nbz = rhs.nbz;
-    this->ncxyz = rhs.ncxyz;
-    this->nbz_start = rhs.nbz_start;
-    this->bx = rhs.bx;
-    this->by = rhs.by;
-    this->bz = rhs.bz;
-    this->bxyz = rhs.bxyz;
-    this->nbxx = rhs.nbxx;
-    this->ny = rhs.ny;
-    this->nplane = rhs.nplane;
-    this->startz_current = rhs.startz_current;
-
-    this->gridt = rhs.gridt;
-    this->ucell = rhs.ucell;
-
-    // move hR after refactor
-    this->hRGint = rhs.hRGint;
-    rhs.hRGint = nullptr;
-    this->hRGintCd = rhs.hRGintCd;
-    rhs.hRGintCd = nullptr;
-    for (int i = 0; i < this->dmr_gint.size(); i++)
-    {
-        delete this->dmr_gint[i];
-    }
-    for (int i = 0; i < this->hr_gint_tmp .size(); i++)
-    {
-        delete this->hr_gint_tmp [i];
-    }
-    this->pvdpRx_reduced = std::move(rhs.pvdpRx_reduced);
-    this->pvdpRy_reduced = std::move(rhs.pvdpRy_reduced);
-    this->pvdpRz_reduced = std::move(rhs.pvdpRz_reduced);
-    this->dmr_gint = std::move(rhs.dmr_gint);
-    this->hr_gint_tmp  = std::move(rhs.hr_gint_tmp );
-    this->dm2d_tmp = rhs.dm2d_tmp;
-    rhs.dm2d_tmp = nullptr;
-
-    return *this;
-}
-
-Gint_Gamma& Gint_Gamma::operator=(Gint_Gamma&& rhs)
-{
-    if (this == &rhs) {return *this;
-}
-    Gint::operator=(std::move(rhs));
-
-    // DM may not needed in beyond DFT ESolver
-    // if (this->DM != nullptr) d3<double>(this->DM, PARAM.inp.nspin, gridt.lgd);
-    assert(this->DM == nullptr);
-    return *this;
-}
-
-Gint_k& Gint_k::operator=(Gint_k&& rhs)
-{
-    if (this == &rhs) {return *this;
-}
-    this->Gint::operator=(std::move(rhs));
-    return *this;
-}
\ No newline at end of file
diff --git a/source/source_lcao/module_lr/utils/gint_template.h b/source/source_lcao/module_lr/utils/gint_template.h
deleted file mode 100644
index e56bb33961..0000000000
--- a/source/source_lcao/module_lr/utils/gint_template.h
+++ /dev/null
@@ -1,15 +0,0 @@
-#pragma once
-#include "source_lcao/module_gint/gint_gamma.h"
-#include "source_lcao/module_gint/gint_k.h"
-namespace LR
-{
-    template <typename T> struct TGint;
-    template <>
-    struct TGint<double> {
-        using type = Gint_Gamma;
-    };
-    template <>
-    struct TGint<std::complex<double>> {
-        using type = Gint_k;
-    };
-}
\ No newline at end of file
diff --git a/source/source_lcao/module_operator_lcao/meta_lcao.h b/source/source_lcao/module_operator_lcao/meta_lcao.h
index 53dca154c4..61bcf7510a 100644
--- a/source/source_lcao/module_operator_lcao/meta_lcao.h
+++ b/source/source_lcao/module_operator_lcao/meta_lcao.h
@@ -1,8 +1,6 @@
 #ifndef METALCAO_H
 #define METALCAO_H
 #include "source_base/timer.h"
-#include "source_lcao/module_gint/gint_gamma.h"
-#include "source_lcao/module_gint/gint_k.h"
 #include "operator_lcao.h"
 
 namespace hamilt
diff --git a/source/source_lcao/module_operator_lcao/veff_lcao.cpp b/source/source_lcao/module_operator_lcao/veff_lcao.cpp
index 576d9c3a5b..0df6ed33a5 100644
--- a/source/source_lcao/module_operator_lcao/veff_lcao.cpp
+++ b/source/source_lcao/module_operator_lcao/veff_lcao.cpp
@@ -4,7 +4,7 @@
 #include "source_base/tool_title.h"
 #include "source_hamilt/module_xc/xc_functional.h"
 #include "source_cell/unitcell.h"
-#include "source_lcao/module_gint/temp_gint/gint_interface.h"
+#include "source_lcao/module_gint/gint_interface.h"
 namespace hamilt
 {
 
@@ -68,20 +68,6 @@ void Veff<OperatorLCAO<double, double>>::contributeHR()
     double* vr_eff1 = this->pot->get_effective_v(this->current_spin);
     double* vofk_eff1 = this->pot->get_effective_vofk(this->current_spin);
 
-#ifdef __OLD_GINT
-    if(XC_Functional::get_ked_flag())
-    {
-        Gint_inout inout(vr_eff1, vofk_eff1, Gint_Tools::job_type::vlocal_meta);
-        this->GG->cal_vlocal(&inout,  this->new_e_iteration);
-    }
-    else
-    {
-        Gint_inout inout(vr_eff1, Gint_Tools::job_type::vlocal);
-        this->GG->cal_vlocal(&inout,  this->new_e_iteration);
-    }
-    this->GG->transfer_pvpR(this->hR,this->ucell);
-    this->new_e_iteration = false;
-#else
     if(XC_Functional::get_ked_flag())
     {
         ModuleGint::cal_gint_vl_metagga(vr_eff1, vofk_eff1, this->hR);
@@ -90,7 +76,6 @@ void Veff<OperatorLCAO<double, double>>::contributeHR()
     {
         ModuleGint::cal_gint_vl(vr_eff1, this->hR);
     }
-#endif
 
     if(this->nspin == 2) 
     { 
@@ -113,23 +98,6 @@ void Veff<OperatorLCAO<std::complex<double>, double>>::contributeHR()
     double* vr_eff1 = this->pot->get_effective_v(this->current_spin);
     double* vofk_eff1 = this->pot->get_effective_vofk(this->current_spin);
 
-#ifdef __OLD_GINT
-    // if you change the place of the following code,
-    // rememeber to delete the #include
-    if(XC_Functional::get_ked_flag())
-    {
-        Gint_inout inout(vr_eff1, vofk_eff1, 0, Gint_Tools::job_type::vlocal_meta);
-        this->GK->cal_gint(&inout);
-    }
-    else
-    {
-        // vlocal = Vh[rho] + Vxc[rho] + Vl(pseudo)
-        Gint_inout inout(vr_eff1, 0, Gint_Tools::job_type::vlocal);
-        this->GK->cal_gint(&inout);
-    }
-
-    this->GK->transfer_pvpR(this->hR,this->ucell,this->gd);
-#else
     if(XC_Functional::get_ked_flag())
     {
         ModuleGint::cal_gint_vl_metagga(vr_eff1, vofk_eff1, this->hR);
@@ -138,7 +106,6 @@ void Veff<OperatorLCAO<std::complex<double>, double>>::contributeHR()
     {
         ModuleGint::cal_gint_vl(vr_eff1, this->hR);
     }
-#endif
 
     if(this->nspin == 2) 
     { 
@@ -155,30 +122,6 @@ void Veff<OperatorLCAO<std::complex<double>, std::complex<double>>>::contributeH
     ModuleBase::TITLE("Veff", "contributeHR");
     ModuleBase::timer::tick("Veff", "contributeHR");
 
-#ifdef __OLD_GINT
-    double* vr_eff1 = nullptr;
-    double* vofk_eff1 = nullptr;
-    for (int is = 0; is < 4; is++)
-    {
-        vr_eff1 = this->pot->get_effective_v(is);
-        if(XC_Functional::get_ked_flag())
-        {
-            vofk_eff1 = this->pot->get_effective_vofk(is);
-        }
-        
-        if(XC_Functional::get_ked_flag())
-        {
-            Gint_inout inout(vr_eff1, vofk_eff1, is, Gint_Tools::job_type::vlocal_meta);
-            this->GK->cal_gint(&inout);
-        }
-        else
-        {
-            Gint_inout inout(vr_eff1, is, Gint_Tools::job_type::vlocal);
-            this->GK->cal_gint(&inout);
-        }
-    }
-    this->GK->transfer_pvpR(this->hR,this->ucell,this->gd);
-#else
     std::vector<const double*> vr_eff(4, nullptr);
     std::vector<const double*> vofk_eff(4, nullptr);
     for (int is = 0; is < 4; is++)
@@ -196,7 +139,6 @@ void Veff<OperatorLCAO<std::complex<double>, std::complex<double>>>::contributeH
     {
         ModuleGint::cal_gint_vl(vr_eff, this->hR);
     }
-#endif
 
     ModuleBase::timer::tick("Veff", "contributeHR");
     return;
diff --git a/source/source_lcao/module_operator_lcao/veff_lcao.h b/source/source_lcao/module_operator_lcao/veff_lcao.h
index a621f71fc6..8ec1265a00 100644
--- a/source/source_lcao/module_operator_lcao/veff_lcao.h
+++ b/source/source_lcao/module_operator_lcao/veff_lcao.h
@@ -2,8 +2,6 @@
 #define VEFFLCAO_H
 #include "source_base/timer.h"
 #include "source_estate/module_pot/potential_new.h"
-#include "source_lcao/module_gint/gint_gamma.h"
-#include "source_lcao/module_gint/gint_k.h"
 #include "operator_lcao.h"
 #include "source_cell/module_neighbor/sltk_grid_driver.h"
 #include "source_cell/unitcell.h"
@@ -32,11 +30,9 @@ class Veff<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
 {
   public:
     /**
-     * @brief Construct a new Veff object for multi-kpoint calculation
-     * @param GK_in: the pointer of Gint_k object, used for grid integration
+     * @brief Construct a new Veff object
     */
-    Veff<OperatorLCAO<TK, TR>>(Gint_k* GK_in,
-                               HS_Matrix_K<TK>* hsk_in,
+    Veff<OperatorLCAO<TK, TR>>(HS_Matrix_K<TK>* hsk_in,
                                const std::vector<ModuleBase::Vector3<double>>& kvec_d_in,
                                elecstate::Potential* pot_in,
                                hamilt::HContainer<TR>* hR_in,
@@ -44,36 +40,12 @@ class Veff<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
                                const std::vector<double>& orb_cutoff,
                                const Grid_Driver* GridD_in,
                                const int& nspin)
-        : GK(GK_in), orb_cutoff_(orb_cutoff), pot(pot_in), ucell(ucell_in),
+        : orb_cutoff_(orb_cutoff), pot(pot_in), ucell(ucell_in),
           gd(GridD_in), OperatorLCAO<TK, TR>(hsk_in, kvec_d_in, hR_in)
     {
         this->cal_type = calculation_type::lcao_gint;
 
         this->initialize_HR(ucell_in, GridD_in);
-#ifdef __OLD_GINT
-        GK_in->initialize_pvpR(*ucell_in, GridD_in, nspin);
-#endif
-    }
-    /**
-     * @brief Construct a new Veff object for Gamma-only calculation
-     * @param GG_in: the pointer of Gint_Gamma object, used for grid integration
-    */
-    Veff<OperatorLCAO<TK, TR>>(Gint_Gamma* GG_in,
-                               HS_Matrix_K<TK>* hsk_in,
-                               const std::vector<ModuleBase::Vector3<double>>& kvec_d_in,
-                               elecstate::Potential* pot_in,
-                               hamilt::HContainer<TR>* hR_in,
-                               const UnitCell* ucell_in,
-                               const std::vector<double>& orb_cutoff,
-                               const Grid_Driver* GridD_in,
-                               const int& nspin)
-        : GG(GG_in), orb_cutoff_(orb_cutoff), pot(pot_in), OperatorLCAO<TK, TR>(hsk_in, kvec_d_in, hR_in)
-    {
-        this->cal_type = calculation_type::lcao_gint;
-        this->initialize_HR(ucell_in, GridD_in);
-#ifdef __OLD_GINT
-        GG_in->initialize_pvpR(*ucell_in, GridD_in, nspin);
-#endif
     }
 
     ~Veff<OperatorLCAO<TK, TR>>(){};
@@ -90,11 +62,6 @@ class Veff<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
   const Grid_Driver* gd;
 
 private:
-  // used for k-dependent grid integration.
-  Gint_k* GK = nullptr;
-
-  // used for gamma only algorithms.
-  Gint_Gamma* GG = nullptr;
 
   std::vector<double> orb_cutoff_;
 
diff --git a/source/source_lcao/module_rdmft/rdmft.cpp b/source/source_lcao/module_rdmft/rdmft.cpp
index 837128fcf9..34549fb671 100644
--- a/source/source_lcao/module_rdmft/rdmft.cpp
+++ b/source/source_lcao/module_rdmft/rdmft.cpp
@@ -55,9 +55,7 @@ RDMFT<TK, TR>::~RDMFT()
 }
 
 template <typename TK, typename TR>
-void RDMFT<TK, TR>::init(Gint_Gamma& GG_in,
-                         Gint_k& GK_in,
-                         Parallel_Orbitals& ParaV_in,
+void RDMFT<TK, TR>::init(Parallel_Orbitals& ParaV_in,
                          UnitCell& ucell_in,
                          const Grid_Driver& gd_in,
                          K_Vectors& kv_in,
@@ -67,8 +65,6 @@ void RDMFT<TK, TR>::init(Gint_Gamma& GG_in,
                          std::string XC_func_rdmft_in,
                          double alpha_power_in)
 {
-    GG = &GG_in;
-    GK = &GK_in;
     ParaV = &ParaV_in;
     ucell = &ucell_in;
     kv = &kv_in;
diff --git a/source/source_lcao/module_rdmft/rdmft.h b/source/source_lcao/module_rdmft/rdmft.h
index a8bf7ea97e..0e6b532d6e 100644
--- a/source/source_lcao/module_rdmft/rdmft.h
+++ b/source/source_lcao/module_rdmft/rdmft.h
@@ -13,8 +13,6 @@
 #include "source_base/parallel_2d.h"
 #include "source_basis/module_ao/parallel_orbitals.h"
 #include "source_cell/unitcell.h"
-#include "source_lcao/module_gint/gint_gamma.h"
-#include "source_lcao/module_gint/gint_k.h"
 #include "source_basis/module_ao/ORB_read.h"
 #include "source_basis/module_nao/two_center_bundle.h"
 
@@ -80,9 +78,7 @@ class RDMFT
     // std::vector<double> E_RDMFT(4);
 
     //! initialization of rdmft calculation
-    void init(Gint_Gamma& GG_in,
-              Gint_k& GK_in,
-              Parallel_Orbitals& ParaV_in,
+    void init(Parallel_Orbitals& ParaV_in,
               UnitCell& ucell_in,
               const Grid_Driver& gd_in,
               K_Vectors& kv_in,
@@ -190,9 +186,6 @@ class RDMFT
     const int cal_E_type = 1;   // cal_type = 2 just support XC-functional without exx
 
     /****** these parameters are passed in from outside, don't need delete ******/
-    // GK and GG are used for multi-k grid integration and gamma only algorithms respectively
-    Gint_k* GK = nullptr;
-    Gint_Gamma* GG = nullptr;
     Charge* charge = nullptr;
 
     // update after ion step
diff --git a/source/source_lcao/module_rdmft/rdmft_pot.cpp b/source/source_lcao/module_rdmft/rdmft_pot.cpp
index ba29d9ebc6..a962c2cac1 100644
--- a/source/source_lcao/module_rdmft/rdmft_pot.cpp
+++ b/source/source_lcao/module_rdmft/rdmft_pot.cpp
@@ -69,8 +69,7 @@ void RDMFT<TK, TR>::cal_V_TV()
 
     if( PARAM.inp.gamma_only )
     {
-        V_local = new rdmft::Veff_rdmft<TK, TR>(GG,
-                                                hsk_TV,
+        V_local = new rdmft::Veff_rdmft<TK, TR>(hsk_TV,
                                                 kv->kvec_d,
                                                 this->pelec->pot,
                                                 HR_TV,
@@ -86,8 +85,7 @@ void RDMFT<TK, TR>::cal_V_TV()
     }
     else
     {
-        V_local = new rdmft::Veff_rdmft<TK, TR>(GK,
-                                                hsk_TV,
+        V_local = new rdmft::Veff_rdmft<TK, TR>(hsk_TV,
                                                 kv->kvec_d,
                                                 this->pelec->pot,
                                                 HR_TV,
@@ -117,8 +115,7 @@ void RDMFT<TK, TR>::cal_V_hartree()
 
     if( PARAM.inp.gamma_only )
     {
-        V_hartree = new rdmft::Veff_rdmft<TK, TR>(GG,
-                                                  hsk_hartree,
+        V_hartree = new rdmft::Veff_rdmft<TK, TR>(hsk_hartree,
                                                   kv->kvec_d,
                                                   this->pelec->pot,
                                                   HR_hartree,
@@ -135,8 +132,7 @@ void RDMFT<TK, TR>::cal_V_hartree()
     else
     {
         // this can be optimized, use potHartree.update_from_charge()
-        V_hartree = new rdmft::Veff_rdmft<TK, TR>(GK,
-                                                  hsk_hartree,
+        V_hartree = new rdmft::Veff_rdmft<TK, TR>(hsk_hartree,
                                                   kv->kvec_d,
                                                   this->pelec->pot,
                                                   HR_hartree,
@@ -197,8 +193,7 @@ void RDMFT<TK, TR>::cal_V_XC(const UnitCell& ucell)
         if( PARAM.inp.gamma_only )
         {
             // this can be optimized, use potXC.update_from_charge()
-            V_dft_XC = new rdmft::Veff_rdmft<TK, TR>(GG,
-                                                     hsk_dft_XC,
+            V_dft_XC = new rdmft::Veff_rdmft<TK, TR>(hsk_dft_XC,
                                                      kv->kvec_d,
                                                      this->pelec->pot,
                                                      HR_dft_XC,
@@ -217,8 +212,7 @@ void RDMFT<TK, TR>::cal_V_XC(const UnitCell& ucell)
         else
         {   
             // this can be optimized, use potXC.update_from_charge()
-            V_dft_XC = new rdmft::Veff_rdmft<TK, TR>(GK,
-                                                     hsk_dft_XC,
+            V_dft_XC = new rdmft::Veff_rdmft<TK, TR>(hsk_dft_XC,
                                                      kv->kvec_d,
                                                      this->pelec->pot,
                                                      HR_dft_XC,
diff --git a/source/source_lcao/module_rdmft/rdmft_tools.cpp b/source/source_lcao/module_rdmft/rdmft_tools.cpp
index 32c22dfef2..f8725b204c 100644
--- a/source/source_lcao/module_rdmft/rdmft_tools.cpp
+++ b/source/source_lcao/module_rdmft/rdmft_tools.cpp
@@ -12,7 +12,7 @@
 #include "source_estate/module_pot/pot_local.h"
 #include "source_estate/module_pot/pot_xc.h"
 #include "source_pw/module_pwdft/structure_factor.h"
-#include "source_lcao/module_gint/temp_gint/gint_interface.h"
+#include "source_lcao/module_gint/gint_interface.h"
 
 #include <iostream>
 #include <cmath>
@@ -266,12 +266,7 @@ void Veff_rdmft<std::complex<double>, double>::contributeHR()
             vr_eff_rdmft = &v_matrix_hartree(is, 0);
 
             // do grid integral calculation to get HR
-#ifdef __OLD_GINT
-            Gint_inout inout(vr_eff_rdmft, is, Gint_Tools::job_type::vlocal);
-            this->GK->cal_gint(&inout);
-#else
             ModuleGint::cal_gint_vl(vr_eff_rdmft, this->hR);
-#endif
         }
     }
     else if( potential_ == "local" )
@@ -285,12 +280,7 @@ void Veff_rdmft<std::complex<double>, double>::contributeHR()
         vr_eff_rdmft = &v_matrix_local(0, 0);
 
         // do grid integral calculation to get HR
-#ifdef __OLD_GINT
-        Gint_inout inout(vr_eff_rdmft, 0, Gint_Tools::job_type::vlocal);
-        this->GK->cal_gint(&inout);
-#else
         ModuleGint::cal_gint_vl(vr_eff_rdmft, this->hR);
-#endif
     }
     else if( potential_ == "xc" )
     {
@@ -309,12 +299,7 @@ void Veff_rdmft<std::complex<double>, double>::contributeHR()
             vr_eff_rdmft = &v_matrix_XC(is, 0);
 
             // do grid integral calculation to get HR
-#ifdef __OLD_GINT
-            Gint_inout inout(vr_eff_rdmft, is, Gint_Tools::job_type::vlocal);
-            this->GK->cal_gint(&inout);
-#else
             ModuleGint::cal_gint_vl(vr_eff_rdmft, this->hR);
-#endif
         }
     }
     else
@@ -323,10 +308,6 @@ void Veff_rdmft<std::complex<double>, double>::contributeHR()
     }
 
     // get HR for 2D-block parallel format
-    // this->GK->transfer_pvpR(this->hR);
-#ifdef __OLD_GINT
-    this->GK->transfer_pvpR(this->hR,this->ucell,this->gd);
-#endif
 
     if(this->nspin == 2) 
     { 
@@ -351,7 +332,6 @@ void Veff_rdmft<double, double>::contributeHR()
     ModuleBase::TITLE("Veff", "contributeHR");
     ModuleBase::timer::tick("Veff", "contributeHR");
 
-    // this->GK->reset_spin(this->current_spin);
 
     double* vr_eff_rdmft = nullptr;
 
@@ -368,12 +348,7 @@ void Veff_rdmft<double, double>::contributeHR()
             vr_eff_rdmft = &v_matrix_hartree(is, 0);
 
             // do grid integral calculation to get HR
-#ifdef __OLD_GINT
-            Gint_inout inout(vr_eff_rdmft, is, Gint_Tools::job_type::vlocal);
-            this->GG->cal_gint(&inout);
-#else
             ModuleGint::cal_gint_vl(vr_eff_rdmft, this->hR);
-#endif
         }
     }
     else if( potential_ == "local" )
@@ -387,16 +362,7 @@ void Veff_rdmft<double, double>::contributeHR()
         vr_eff_rdmft = &v_matrix_local(0, 0);
 
         // do grid integral calculation to get HR
-#ifdef __OLD_GINT
-        Gint_inout inout(vr_eff_rdmft, 0, Gint_Tools::job_type::vlocal);
-
-        // because in gamma_only, cal_gint would not set hRGint zero first
-        // so must use cal_vlocal(), and in rdmft_test.h, calculate V_hartree->contributeHR() first
-
-        this->GG->cal_vlocal(&inout, false);  // cal_gint ???
-#else
         ModuleGint::cal_gint_vl(vr_eff_rdmft, this->hR);
-#endif
     }
     else if( potential_ == "xc" )
     {
@@ -414,12 +380,7 @@ void Veff_rdmft<double, double>::contributeHR()
             vr_eff_rdmft = &v_matrix_XC(is, 0);
 
             // do grid integral calculation to get HR
-#ifdef __OLD_GINT
-            Gint_inout inout(vr_eff_rdmft, is, Gint_Tools::job_type::vlocal);
-            this->GG->cal_gint(&inout);
-#else
             ModuleGint::cal_gint_vl(vr_eff_rdmft, this->hR);
-#endif
         }
     }
     else
@@ -427,10 +388,6 @@ void Veff_rdmft<double, double>::contributeHR()
         std::cout << "\n\n!!!!!!\n there may be something wrong when use class Veff_rdmft\n\n!!!!!!\n";
     }
 
-#ifdef __OLD_GINT
-    // get HR for 2D-block parallel format
-    this->GG->transfer_pvpR(this->hR,this->ucell);
-#endif
     this->new_e_iteration = false;
 
     if(this->nspin == 2)
diff --git a/source/source_lcao/module_rdmft/rdmft_tools.h b/source/source_lcao/module_rdmft/rdmft_tools.h
index 7b1639f8e8..91c69fb8c4 100644
--- a/source/source_lcao/module_rdmft/rdmft_tools.h
+++ b/source/source_lcao/module_rdmft/rdmft_tools.h
@@ -9,8 +9,6 @@
 #include "source_base/matrix.h"
 #include "source_cell/module_neighbor/sltk_grid_driver.h"
 #include "source_cell/unitcell.h"
-#include "source_lcao/module_gint/gint_gamma.h"
-#include "source_lcao/module_gint/gint_k.h"
 #include "source_estate/module_pot/potential_new.h"
 #include "source_base/module_external/blas_connector.h"
 #include "source_base/module_external/scalapack_connector.h"
@@ -259,10 +257,8 @@ class Veff_rdmft : public hamilt::OperatorLCAO<TK, TR>
   public:
     /**
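-     * @brief Construct a new Veff object for multi-kpoint calculation
+     * @brief Construct a new Veff_rdmft object; this single constructor now serves both gamma-only and multi-kpoint calculations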
-     * @param GK_in: the pointer of Gint_k object, used for grid integration
     */
-    Veff_rdmft(Gint_k* GK_in,
-               hamilt::HS_Matrix_K<TK>* hsk_in,
+    Veff_rdmft(hamilt::HS_Matrix_K<TK>* hsk_in,
                const std::vector<ModuleBase::Vector3<double>>& kvec_d_in,
                elecstate::Potential* pot_in,
                hamilt::HContainer<TR>* hR_in,
@@ -277,45 +273,15 @@ class Veff_rdmft : public hamilt::OperatorLCAO<TK, TR>
                const std::string potential_in,
                double* etxc_in = nullptr,
                double* vtxc_in = nullptr)
-        : GK(GK_in), orb_cutoff_(orb_cutoff), pot(pot_in), ucell(ucell_in),
+        : orb_cutoff_(orb_cutoff), pot(pot_in), ucell(ucell_in),
           gd(GridD_in), hamilt::OperatorLCAO<TK, TR>(hsk_in, kvec_d_in, hR_in), charge_(charge_in),
           rho_basis_(rho_basis_in), vloc_(vloc_in), sf_(sf_in), potential_(potential_in), etxc(etxc_in), vtxc(vtxc_in)
     {
         this->cal_type = hamilt::calculation_type::lcao_gint;
 
         this->initialize_HR(ucell_in, GridD_in);
-#ifdef __OLD_GINT
-        GK_in->initialize_pvpR(*ucell_in, GridD_in, nspin);
-#endif
     }
-    Veff_rdmft(Gint_Gamma* GG_in,
-               hamilt::HS_Matrix_K<TK>* hsk_in,
-               const std::vector<ModuleBase::Vector3<double>>& kvec_d_in,
-               elecstate::Potential* pot_in,
-               hamilt::HContainer<TR>* hR_in,
-               const UnitCell* ucell_in,
-               const std::vector<double>& orb_cutoff,
-               const Grid_Driver* GridD_in,
-               const int& nspin,
-               const Charge* charge_in,
-               const ModulePW::PW_Basis* rho_basis_in,
-               const ModuleBase::matrix* vloc_in,
-               const ModuleBase::ComplexMatrix* sf_in,
-               const std::string potential_in,
-               double* etxc_in = nullptr,
-               double* vtxc_in = nullptr)
-        : GG(GG_in), orb_cutoff_(orb_cutoff), pot(pot_in), hamilt::OperatorLCAO<TK, TR>(hsk_in, kvec_d_in, hR_in),
-          ucell(ucell_in), gd(GridD_in), charge_(charge_in), rho_basis_(rho_basis_in), vloc_(vloc_in), sf_(sf_in),
-          potential_(potential_in), etxc(etxc_in), vtxc(vtxc_in)
-    {
-        this->cal_type = hamilt::calculation_type::lcao_gint;
-
-        this->initialize_HR(ucell_in, GridD_in);
-#ifdef __OLD_GINT
-        GG_in->initialize_pvpR(*ucell_in, GridD_in, nspin);
-#endif
-    }
-
+    
     ~Veff_rdmft<TK, TR>(){};
 
     /**
@@ -331,11 +297,6 @@ class Veff_rdmft : public hamilt::OperatorLCAO<TK, TR>
     const Grid_Driver* gd;
 
   private:
-    // used for k-dependent grid integration.
-    Gint_k* GK = nullptr;
-
-    // used for gamma only algorithms.
-    Gint_Gamma* GG = nullptr;
 
     std::vector<double> orb_cutoff_;
 
diff --git a/source/source_lcao/module_rdmft/update_state_rdmft.cpp b/source/source_lcao/module_rdmft/update_state_rdmft.cpp
index 88a6761d1b..7a43c9be91 100644
--- a/source/source_lcao/module_rdmft/update_state_rdmft.cpp
+++ b/source/source_lcao/module_rdmft/update_state_rdmft.cpp
@@ -8,7 +8,7 @@
 #include "source_estate/module_dm/cal_dm_psi.h"
 #include "source_estate/module_dm/density_matrix.h"
 #include "source_estate/module_charge/symmetry_rho.h"
-#include "source_lcao/module_gint/temp_gint/gint_interface.h"
+#include "source_lcao/module_gint/gint_interface.h"
 
 
 namespace rdmft
@@ -106,22 +106,10 @@ void RDMFT<TK, TR>::update_charge(UnitCell& ucell)
         {
             ModuleBase::GlobalFunc::ZEROS(charge->rho[is], charge->nrxx);
         }
-#ifdef __OLD_GINT
-        GG->transfer_DM2DtoGrid(DM_gamma_only.get_DMR_vector());
-        Gint_inout inout(charge->rho, Gint_Tools::job_type::rho, nspin);
-        GG->cal_gint(&inout);
-#else
         ModuleGint::cal_gint_rho(DM_gamma_only.get_DMR_vector(), nspin, charge->rho);
-#endif
 
         if (XC_Functional::get_ked_flag())
         {
-            // for (int is = 0; is < nspin; is++)
-            // {
-            //     ModuleBase::GlobalFunc::ZEROS(charge->kin_r[is], charge->nrxx);
-            // }
-            // Gint_inout inout1(charge->kin_r, Gint_Tools::job_type::tau);
-            // GG->cal_gint(&inout1);
             this->pelec->cal_tau(wfc);
         }
 
@@ -140,22 +128,10 @@ void RDMFT<TK, TR>::update_charge(UnitCell& ucell)
             ModuleBase::GlobalFunc::ZEROS(charge->rho[is], charge->nrxx);
         }
 
-#ifdef __OLD_GINT
-        GK->transfer_DM2DtoGrid(DM.get_DMR_vector());
-        Gint_inout inout(charge->rho, Gint_Tools::job_type::rho, nspin);
-        GK->cal_gint(&inout);
-#else
         ModuleGint::cal_gint_rho(DM.get_DMR_vector(), nspin, charge->rho);
-#endif
 
         if (XC_Functional::get_ked_flag())
         {
-            // for (int is = 0; is < nspin; is++)
-            // {
-            //     ModuleBase::GlobalFunc::ZEROS(charge->kin_r[is], charge->nrxx);
-            // }
-            // Gint_inout inout1(charge->kin_r, Gint_Tools::job_type::tau);
-            // GK->cal_gint(&inout1);
             this->pelec->cal_tau(wfc);
         }
 
diff --git a/source/source_lcao/pulay_fs_gint.hpp b/source/source_lcao/pulay_fs_gint.hpp
index f097bf8a93..9603a28c34 100644
--- a/source/source_lcao/pulay_fs_gint.hpp
+++ b/source/source_lcao/pulay_fs_gint.hpp
@@ -3,7 +3,7 @@
 #include "source_lcao/stress_tools.h"
 #include "source_hamilt/module_xc/xc_functional.h"
 #include "source_io/module_parameter/parameter.h"
-#include "source_lcao/module_gint/temp_gint/gint_interface.h"
+#include "source_lcao/module_gint/gint_interface.h"
 namespace PulayForceStress
 {
     template<typename TK, typename TR>
@@ -18,7 +18,6 @@ namespace PulayForceStress
         const bool& set_dmr_gint)
     {
         const int nspin = PARAM.inp.nspin;
-
         std::vector<const double*> vr_eff(nspin, nullptr);
         std::vector<const double*> vofk_eff(nspin, nullptr);
         if (XC_Functional::get_func_type() == 3 || XC_Functional::get_func_type() == 5)
diff --git a/source/source_lcao/pulay_fs_temp.hpp b/source/source_lcao/pulay_fs_temp.hpp
index 8d60d47255..ba32eedb2e 100644
--- a/source/source_lcao/pulay_fs_temp.hpp
+++ b/source/source_lcao/pulay_fs_temp.hpp
@@ -1,4 +1,5 @@
 #pragma once
+#include <omp.h>
 #include "pulay_fs.h"
 #include "source_base/timer.h"
 #include "source_io/module_parameter/parameter.h"
diff --git a/source/source_lcao/record_adj.cpp b/source/source_lcao/record_adj.cpp
index 7e2b9c7bbe..47118496a5 100644
--- a/source/source_lcao/record_adj.cpp
+++ b/source/source_lcao/record_adj.cpp
@@ -283,226 +283,4 @@ void Record_adj::for_2d(const UnitCell& ucell,
     return;
 }
 
-//--------------------------------------------
-// This will record the orbitals according to
-// grid division (cut along z direction)
-//--------------------------------------------
-void Record_adj::for_grid(const UnitCell& ucell,
-                          const Grid_Driver& grid_d,
-                          const Grid_Technique& gt,
-                          const std::vector<double>& orb_cutoff)
-{
-    ModuleBase::TITLE("Record_adj", "for_grid");
-    ModuleBase::timer::tick("Record_adj", "for_grid");
-
-    this->na_proc = 0;
-    this->iat2ca = new int[ucell.nat];
-    for (int iat = 0; iat < ucell.nat; ++iat)
-    {
-        {
-            if (gt.in_this_processor[iat])
-            {
-                iat2ca[iat] = na_proc;
-                ++na_proc;
-            }
-            else
-            {
-                iat2ca[iat] = -1;
-            }
-        }
-    }
-
-    // number of adjacents for each atom.
-    this->na_each = new int[na_proc];
-    ModuleBase::GlobalFunc::ZEROS(na_each, na_proc);
-    this->info = new int**[na_proc];
-#ifdef _OPENMP
-#pragma omp parallel
-    {
-#endif
-        ModuleBase::Vector3<double> tau1, tau2, dtau;
-        ModuleBase::Vector3<double> tau0, dtau1, dtau2;
-
-#ifdef _OPENMP
-#pragma omp for schedule(dynamic)
-#endif
-        for (int iat = 0; iat < ucell.nat; ++iat)
-        {
-            const int T1 = ucell.iat2it[iat];
-            Atom* atom1 = &ucell.atoms[T1];
-            const int I1 = ucell.iat2ia[iat];
-            {
-                const int ca = iat2ca[iat];
-                // key in this function
-                if (gt.in_this_processor[iat])
-                {
-                    tau1 = atom1->tau[I1];
-                    // grid_d.Find_atom(tau1);
-                    AdjacentAtomInfo adjs;
-                    grid_d.Find_atom(ucell, tau1, T1, I1, &adjs);
-                    for (int ad = 0; ad < adjs.adj_num + 1; ad++)
-                    {
-                        const int T2 = adjs.ntype[ad];
-                        const int I2 = adjs.natom[ad];
-                        const int iat2 = ucell.itia2iat(T2, I2);
-                        if (gt.in_this_processor[iat2])
-                        {
-                            // Atom* atom2 = &ucell.atoms[T2];
-                            tau2 = adjs.adjacent_tau[ad];
-                            dtau = tau2 - tau1;
-                            double distance = dtau.norm() * ucell.lat0;
-                            double rcut = orb_cutoff[T1] + orb_cutoff[T2];
-
-                            bool is_adj = false;
-                            if (distance < rcut)
-                            {
-                                is_adj = true;
-                            }
-                            /*
-                            else if(distance >= rcut)
-                            {
-                                for (int ad0 = 0; ad0 < grid_d.getAdjacentNum()+1; ++ad0)
-                                {
-                                    const int T0 = grid_d.getType(ad0);
-                                    const int I0 = grid_d.getNatom(ad0);
-                                    const int iat0 = ucell.itia2iat(T0, I0);
-                                    const int start0 = ucell.itiaiw2iwt(T0, I0, 0);
-
-                                    tau0 = grid_d.getAdjacentTau(ad0);
-                                    dtau1 = tau0 - tau1;
-                                    dtau2 = tau0 - tau2;
-
-                                    double distance1 = dtau1.norm() * ucell.lat0;
-                                    double distance2 = dtau2.norm() * ucell.lat0;
-
-                                    double rcut1 = orb_cutoff[T1] + ucell.infoNL.Beta[T0].get_rcut_max();
-                                    double rcut2 = orb_cutoff[T2] + ucell.infoNL.Beta[T0].get_rcut_max();
-
-                                    if( distance1 < rcut1 && distance2 < rcut2 )
-                                    {
-                                        is_adj = true;
-                                        break;
-                                    } // dis1, dis2
-                                }
-                            }
-                            */
-
-                            // check the distance
-                            if (is_adj)
-                            {
-                                ++na_each[ca];
-                            }
-                        } // end judge 2
-                    } // end ad
-                } // end judge 1
-            } // end I1
-        } // end T1
-
-#ifdef _OPENMP
-#pragma omp for schedule(dynamic)
-#endif
-        for (int i = 0; i < na_proc; i++)
-        {
-            assert(na_each[i] > 0);
-            info[i] = new int*[na_each[i]];
-            for (int j = 0; j < na_each[i]; j++)
-            {
-                // (Rx, Ry, Rz, T, I)
-                info[i][j] = new int[5];
-                ModuleBase::GlobalFunc::ZEROS(info[i][j], 5);
-            }
-        }
-
-#ifdef _OPENMP
-#pragma omp for schedule(dynamic)
-#endif
-        for (int iat = 0; iat < ucell.nat; ++iat)
-        {
-            const int T1 = ucell.iat2it[iat];
-            Atom* atom1 = &ucell.atoms[T1];
-            const int I1 = ucell.iat2ia[iat];
-            {
-                const int ca = iat2ca[iat];
-
-                // key of this function
-                if (gt.in_this_processor[iat])
-                {
-                    tau1 = atom1->tau[I1];
-                    // grid_d.Find_atom(tau1);
-                    AdjacentAtomInfo adjs;
-                    grid_d.Find_atom(ucell, tau1, T1, I1, &adjs);
-
-                    int cb = 0;
-                    for (int ad = 0; ad < adjs.adj_num + 1; ad++)
-                    {
-                        const int T2 = adjs.ntype[ad];
-                        const int I2 = adjs.natom[ad];
-                        const int iat2 = ucell.itia2iat(T2, I2);
 
-                        // key of this function
-                        if (gt.in_this_processor[iat2])
-                        {
-                            // Atom* atom2 = &ucell.atoms[T2];
-                            tau2 = adjs.adjacent_tau[ad];
-                            dtau = tau2 - tau1;
-                            double distance = dtau.norm() * ucell.lat0;
-                            double rcut = orb_cutoff[T1] + orb_cutoff[T2];
-
-                            // check the distance
-                            if (distance < rcut)
-                            {
-                                info[ca][cb][0] = adjs.box[ad].x;
-                                info[ca][cb][1] = adjs.box[ad].y;
-                                info[ca][cb][2] = adjs.box[ad].z;
-                                info[ca][cb][3] = T2;
-                                info[ca][cb][4] = I2;
-                                ++cb;
-                            }
-                            /*
-                            else if(distance >= rcut)
-                            {
-                                for (int ad0 = 0; ad0 < grid_d.getAdjacentNum()+1; ++ad0)
-                                {
-                                    const int T0 = grid_d.getType(ad0);
-                                    const int I0 = grid_d.getNatom(ad0);
-                                    const int iat0 = ucell.itia2iat(T0, I0);
-                                    const int start0 = ucell.itiaiw2iwt(T0, I0, 0);
-
-                                    tau0 = grid_d.getAdjacentTau(ad0);
-                                    dtau1 = tau0 - tau1;
-                                    dtau2 = tau0 - tau2;
-
-                                    double distance1 = dtau1.norm() * ucell.lat0;
-                                    double distance2 = dtau2.norm() * ucell.lat0;
-
-                                    double rcut1 = orb_cutoff[T1] + ucell.infoNL.Beta[T0].get_rcut_max();
-                                    double rcut2 = orb_cutoff[T2] + ucell.infoNL.Beta[T0].get_rcut_max();
-
-                                    if( distance1 < rcut1 && distance2 < rcut2 )
-                                    {
-                                        info[ca][cb][0] = grid_d.getBox(ad).x;
-                                        info[ca][cb][1] = grid_d.getBox(ad).y;
-                                        info[ca][cb][2] = grid_d.getBox(ad).z;
-                                        info[ca][cb][3] = T2;
-                                        info[ca][cb][4] = I2;
-                                        ++cb;
-                                        break;
-                                    } // dis1, dis2
-                                }
-                            }
-                            */
-                        }
-                    } // end ad
-
-                    assert(cb == na_each[ca]);
-                }
-            }
-        }
-#ifdef _OPENMP
-    }
-#endif
-    ModuleBase::timer::tick("Record_adj", "for_grid");
-    info_modified = true;
-    //	std::cout << " after for_grid" << std::endl;
-    return;
-}
diff --git a/source/source_lcao/record_adj.h b/source/source_lcao/record_adj.h
index 3d1f16a402..871403ca14 100644
--- a/source/source_lcao/record_adj.h
+++ b/source/source_lcao/record_adj.h
@@ -2,7 +2,8 @@
 #define RECORD_ADJ_H
 
 #include "source_basis/module_ao/parallel_orbitals.h"
-#include "source_lcao/module_gint/grid_technique.h"
+#include "source_cell/unitcell.h"
+#include "source_cell/module_neighbor/sltk_grid_driver.h"
 
 //---------------------------------------------------
 // FUNCTION: record the adjacent atoms for each atom
@@ -26,14 +27,6 @@ class Record_adj
                 bool gamma_only,
                 const std::vector<double>& orb_cutoff);
 
-    //--------------------------------------------
-    // This will record the orbitals according to
-    // grid division (cut along z direction)
-    //--------------------------------------------
-    void for_grid(const UnitCell& ucell,
-                  const Grid_Driver& grid_d,
-                  const Grid_Technique& gt,
-                  const std::vector<double>& orb_cutoff);
 
     void delete_grid();
 
@@ -41,7 +34,7 @@ class Record_adj
     int* na_each=nullptr;
 
     //--------------------------------------------
-    // record sparse atom index in for_grid(const Grid_Technique &gt);
+    // sparse atom index, formerly recorded by the now-removed for_grid();
     // Map iat(dense atom index) to sparse atom index
     // Mainly removing the index dependency for OpenMP parallel loop
     //
diff --git a/source/source_lcao/spar_dh.cpp b/source/source_lcao/spar_dh.cpp
index 21748e830e..7d5d485d56 100644
--- a/source/source_lcao/spar_dh.cpp
+++ b/source/source_lcao/spar_dh.cpp
@@ -2,7 +2,7 @@
 
 #include "source_io/module_parameter/parameter.h"
 #include "source_lcao/LCAO_domain.h"
-#include "source_lcao/module_gint/temp_gint/gint_interface.h"
+#include "source_lcao/module_gint/gint_interface.h"
 #include <vector>
 
 void sparse_format::cal_dS(const UnitCell& ucell,
@@ -58,8 +58,7 @@ void sparse_format::cal_dH(const UnitCell& ucell,
                            const LCAO_Orbitals& orb,
                            const int& current_spin,
                            const double& sparse_thr,
-                           const ModuleBase::matrix& v_eff,
-                           Gint_k& gint_k)
+                           const ModuleBase::matrix& v_eff)
 {
     ModuleBase::TITLE("sparse_format", "cal_dH");
 
@@ -109,26 +108,6 @@ void sparse_format::cal_dH(const UnitCell& ucell,
 
     if(PARAM.inp.nspin==2)
     {
-#ifdef __OLD_GINT
-        gint_k.allocate_pvdpR();
-        // note: some MPI process will not have grids when MPI cores are too
-        // many, v_eff in these processes are empty
-        const double* vr_eff1
-            = v_eff.nc * v_eff.nr > 0 ? &(v_eff(current_spin, 0)) : nullptr;
-
-        if (!PARAM.globalv.gamma_only_local) 
-        {
-            if (PARAM.inp.vl_in_h) 
-            {
-                Gint_inout inout(vr_eff1,
-                                 current_spin,
-                                 Gint_Tools::job_type::dvlocal);
-                gint_k.cal_gint(&inout);
-            }
-        }
-        gint_k.cal_dvlocal_R_sparseMatrix(current_spin, sparse_thr, HS_Arrays, &pv, ucell, grid);
-        gint_k.destroy_pvdpR();
-#else
         const double* vr_eff1
             = v_eff.nc * v_eff.nr > 0 ? &(v_eff(current_spin, 0)) : nullptr;
         if (!PARAM.globalv.gamma_only_local) 
@@ -137,7 +116,6 @@ void sparse_format::cal_dH(const UnitCell& ucell,
                 PARAM.inp.nspin, PARAM.globalv.npol, current_spin, PARAM.globalv.nlocal,
                 sparse_thr, vr_eff1, pv, ucell, grid, HS_Arrays);
         }
-#endif
     }
     return;
 }
diff --git a/source/source_lcao/spar_dh.h b/source/source_lcao/spar_dh.h
index a71ebe4ec2..9af4fd6009 100644
--- a/source/source_lcao/spar_dh.h
+++ b/source/source_lcao/spar_dh.h
@@ -19,8 +19,7 @@ void cal_dH(const UnitCell& ucell,
             const LCAO_Orbitals& orb,
             const int& current_spin,
             const double& sparse_thr,
-            const ModuleBase::matrix& v_eff,
-            Gint_k& gint_k);
+            const ModuleBase::matrix& v_eff);
 
 // calculated the derivative of the overlap matrix: <phi|dphi>
 void cal_dS(const UnitCell& ucell,
diff --git a/source/source_lcao/spar_hsr.h b/source/source_lcao/spar_hsr.h
index b3e809ceb2..df8478c4bf 100644
--- a/source/source_lcao/spar_hsr.h
+++ b/source/source_lcao/spar_hsr.h
@@ -2,6 +2,7 @@
 #define SPARSE_FORMAT_HSR_H
 
 #include "source_lcao/hamilt_lcao.h"
+#include "source_lcao/LCAO_HS_arrays.hpp"
 
 namespace sparse_format
 {