diff --git a/source/Makefile.Objects b/source/Makefile.Objects
index 7a0ff0aadf..0fb198712d 100644
--- a/source/Makefile.Objects
+++ b/source/Makefile.Objects
@@ -189,8 +189,7 @@ OBJS_CELL=atom_pseudo.o\
     check_atomic_stru.o\
 
 OBJS_DEEPKS=LCAO_deepks.o\
-        deepks_fgamma.o\
-        deepks_fk.o\
+        deepks_force.o\
         LCAO_deepks_odelta.o\
         LCAO_deepks_io.o\
         LCAO_deepks_mpi.o\
diff --git a/source/module_esolver/lcao_before_scf.cpp b/source/module_esolver/lcao_before_scf.cpp
index d51731bbde..e1c2fa4652 100644
--- a/source/module_esolver/lcao_before_scf.cpp
+++ b/source/module_esolver/lcao_before_scf.cpp
@@ -210,6 +210,8 @@ void ESolver_KS_LCAO<TK, TR>::before_scf(UnitCell& ucell, const int istep)
     if (PARAM.globalv.deepks_setorb)
     {
         const Parallel_Orbitals* pv = &this->pv;
+        // allocate <psi(0)|alpha(R)>, psialpha is different every ion step, so it is allocated here
+        GlobalC::ld.allocate_psialpha(PARAM.inp.cal_force, ucell, orb_, this->gd);
         // build and save <psi(0)|alpha(R)> at beginning
         GlobalC::ld.build_psialpha(PARAM.inp.cal_force, ucell, orb_, this->gd, *(two_center_bundle_.overlap_orb_alpha));
 
diff --git a/source/module_esolver/lcao_others.cpp b/source/module_esolver/lcao_others.cpp
index fe27014192..0907237682 100644
--- a/source/module_esolver/lcao_others.cpp
+++ b/source/module_esolver/lcao_others.cpp
@@ -216,6 +216,8 @@ void ESolver_KS_LCAO<TK, TR>::others(UnitCell& ucell, const int istep)
     if (PARAM.globalv.deepks_setorb)
     {
         const Parallel_Orbitals* pv = &this->pv;
+        // allocate <psi(0)|alpha(R)>, psialpha is different every ion step, so it is allocated here
+        GlobalC::ld.allocate_psialpha(PARAM.inp.cal_force, ucell, orb_, this->gd);
         // build and save <psi(0)|alpha(R)> at beginning
         GlobalC::ld.build_psialpha(PARAM.inp.cal_force, ucell, orb_, this->gd, *(two_center_bundle_.overlap_orb_alpha));
 
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.cpp
index 988999c777..194da269e7 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.cpp
@@ -511,7 +511,14 @@ void Force_Stress_LCAO<T>::getForceStress(UnitCell& ucell,
                     {
                         const std::vector<std::vector<double>>& dm_gamma
                             = dynamic_cast<const elecstate::ElecStateLCAO<double>*>(pelec)->get_DM()->get_DMK_vector();
-                        GlobalC::ld.cal_gdmx(dm_gamma, ucell, orb, gd, kv.get_nks(), kv.kvec_d, isstress);
+                        GlobalC::ld.cal_gdmx(dm_gamma,
+                                             ucell,
+                                             orb,
+                                             gd,
+                                             kv.get_nks(),
+                                             kv.kvec_d,
+                                             GlobalC::ld.psialpha,
+                                             isstress);
                     }
                     else
                     {
@@ -520,7 +527,8 @@ void Force_Stress_LCAO<T>::getForceStress(UnitCell& ucell,
                                   ->get_DM()
                                   ->get_DMK_vector();
 
-                        GlobalC::ld.cal_gdmx(dm_k, ucell, orb, gd, kv.get_nks(), kv.kvec_d, isstress);
+                        GlobalC::ld
+                            .cal_gdmx(dm_k, ucell, orb, gd, kv.get_nks(), kv.kvec_d, GlobalC::ld.psialpha, isstress);
                     }
                     if (PARAM.inp.deepks_out_unittest)
                     {
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_gamma.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_gamma.cpp
index 41b26e7e9f..59d71d2a4f 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_gamma.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_gamma.cpp
@@ -1,10 +1,10 @@
 #include "FORCE.h"
 #include "module_base/memory.h"
-#include "module_parameter/parameter.h"
 #include "module_base/parallel_reduce.h"
 #include "module_base/timer.h"
 #include "module_cell/module_neighbor/sltk_grid_driver.h"
 #include "module_hamilt_pw/hamilt_pwdft/global.h"
+#include "module_parameter/parameter.h"
 #ifdef __DEEPKS
 #include "module_hamilt_lcao/module_deepks/LCAO_deepks.h" //caoyu add for deepks on 20210813
 #include "module_hamilt_lcao/module_deepks/LCAO_deepks_io.h"
@@ -12,8 +12,8 @@
 #include "module_cell/module_neighbor/sltk_grid_driver.h" //GridD
 #include "module_elecstate/elecstate_lcao.h"
 #include "module_hamilt_lcao/hamilt_lcaodft/LCAO_domain.h"
-#include "module_io/write_HS.h"
 #include "module_hamilt_lcao/hamilt_lcaodft/pulay_force_stress.h"
+#include "module_io/write_HS.h"
 
 template <>
 void Force_LCAO<double>::allocate(const UnitCell& ucell,
@@ -147,28 +147,28 @@ void Force_LCAO<double>::finish_ftable(ForceStressArrays& fsr)
     return;
 }
 
-//template <>
-//void Force_LCAO<double>::test(Parallel_Orbitals& pv, double* mm, const std::string& name)
+// template <>
+// void Force_LCAO<double>::test(Parallel_Orbitals& pv, double* mm, const std::string& name)
 //{
-//    std::cout << "\n PRINT " << name << std::endl;
-//    std::cout << std::setprecision(6) << std::endl;
-//    for (int i = 0; i < PARAM.globalv.nlocal; i++)
-//    {
-//        for (int j = 0; j < PARAM.globalv.nlocal; j++)
-//        {
-//            if (std::abs(mm[i * PARAM.globalv.nlocal + j]) > 1.0e-5)
-//            {
-//                std::cout << std::setw(12) << mm[i * PARAM.globalv.nlocal + j];
-//            }
-//            else
-//            {
-//                std::cout << std::setw(12) << "0";
-//            }
-//        }
-//        std::cout << std::endl;
-//    }
-//    return;
-//}
+//     std::cout << "\n PRINT " << name << std::endl;
+//     std::cout << std::setprecision(6) << std::endl;
+//     for (int i = 0; i < PARAM.globalv.nlocal; i++)
+//     {
+//         for (int j = 0; j < PARAM.globalv.nlocal; j++)
+//         {
+//             if (std::abs(mm[i * PARAM.globalv.nlocal + j]) > 1.0e-5)
+//             {
+//                 std::cout << std::setw(12) << mm[i * PARAM.globalv.nlocal + j];
+//             }
+//             else
+//             {
+//                 std::cout << std::setw(12) << "0";
+//             }
+//         }
+//         std::cout << std::endl;
+//     }
+//     return;
+// }
 
 // be called in force_lo.cpp
 template <>
@@ -210,20 +210,40 @@ void Force_LCAO<double>::ftable(const bool isforce,
     // allocate DHloc_fixed_x, DHloc_fixed_y, DHloc_fixed_z
     this->allocate(ucell, gd, pv, fsr, two_center_bundle, orb);
 
-    const double* dSx[3] = { fsr.DSloc_x, fsr.DSloc_y, fsr.DSloc_z };
-    const double* dSxy[6] = { fsr.DSloc_11, fsr.DSloc_12, fsr.DSloc_13, fsr.DSloc_22, fsr.DSloc_23, fsr.DSloc_33 };
+    const double* dSx[3] = {fsr.DSloc_x, fsr.DSloc_y, fsr.DSloc_z};
+    const double* dSxy[6] = {fsr.DSloc_11, fsr.DSloc_12, fsr.DSloc_13, fsr.DSloc_22, fsr.DSloc_23, fsr.DSloc_33};
     // calculate the force related to 'energy density matrix'.
-    PulayForceStress::cal_pulay_fs(foverlap, soverlap,
+    PulayForceStress::cal_pulay_fs(
+        foverlap,
+        soverlap,
         this->cal_edm(pelec, *psi, *dm, *kv, pv, PARAM.inp.nspin, PARAM.inp.nbands, ucell, *ra),
-        ucell, pv, dSx, dSxy, isforce, isstress);
-
-    const double* dHx[3] = { fsr.DHloc_fixed_x, fsr.DHloc_fixed_y, fsr.DHloc_fixed_z };
-    const double* dHxy[6] = { fsr.DHloc_fixed_11, fsr.DHloc_fixed_12, fsr.DHloc_fixed_13, fsr.DHloc_fixed_22, fsr.DHloc_fixed_23, fsr.DHloc_fixed_33 };
-    //tvnl_dphi
+        ucell,
+        pv,
+        dSx,
+        dSxy,
+        isforce,
+        isstress);
+
+    const double* dHx[3] = {fsr.DHloc_fixed_x, fsr.DHloc_fixed_y, fsr.DHloc_fixed_z};
+    const double* dHxy[6] = {fsr.DHloc_fixed_11,
+                             fsr.DHloc_fixed_12,
+                             fsr.DHloc_fixed_13,
+                             fsr.DHloc_fixed_22,
+                             fsr.DHloc_fixed_23,
+                             fsr.DHloc_fixed_33};
+    // tvnl_dphi
     PulayForceStress::cal_pulay_fs(ftvnl_dphi, stvnl_dphi, *dm, ucell, pv, dHx, dHxy, isforce, isstress);
 
     // vl_dphi
-    PulayForceStress::cal_pulay_fs(fvl_dphi, svl_dphi, *dm, ucell, pelec->pot, gint, isforce, isstress, false/*reset dm to gint*/);
+    PulayForceStress::cal_pulay_fs(fvl_dphi,
+                                   svl_dphi,
+                                   *dm,
+                                   ucell,
+                                   pelec->pot,
+                                   gint,
+                                   isforce,
+                                   isstress,
+                                   false /*reset dm to gint*/);
 
 #ifdef __DEEPKS
     if (PARAM.inp.deepks_scf)
@@ -237,21 +257,21 @@ void Force_LCAO<double>::ftable(const bool isforce,
 
         GlobalC::ld.cal_gedm(ucell.nat);
 
-        const int nks=1;
-		DeePKS_domain::cal_f_delta_gamma(dm_gamma, 
-				                         ucell, 
-				                         orb, 
-				                         gd, 
-                                         *this->ParaV,
-                                         GlobalC::ld.lmaxd,
-                                         nks, 
-				                         kv->kvec_d, 
-                                         GlobalC::ld.nlm_save,
-                                         GlobalC::ld.gedm,
-                                         GlobalC::ld.inl_index,
-                                         GlobalC::ld.F_delta,
-				                         isstress, 
-				                         svnl_dalpha);
+        const int nks = 1;
+        DeePKS_domain::cal_f_delta<double>(dm_gamma,
+                                           ucell,
+                                           orb,
+                                           gd,
+                                           *this->ParaV,
+                                           GlobalC::ld.lmaxd,
+                                           nks,
+                                           kv->kvec_d,
+                                           GlobalC::ld.psialpha,
+                                           GlobalC::ld.gedm,
+                                           GlobalC::ld.inl_index,
+                                           GlobalC::ld.F_delta,
+                                           isstress,
+                                           svnl_dalpha);
 
 #ifdef __MPI
         Parallel_Reduce::reduce_all(GlobalC::ld.F_delta.c, GlobalC::ld.F_delta.nr * GlobalC::ld.F_delta.nc);
@@ -265,7 +285,7 @@ void Force_LCAO<double>::ftable(const bool isforce,
         if (PARAM.inp.deepks_out_unittest)
         {
             const int nks = 1; // 1 for gamma-only
-            LCAO_deepks_io::print_dm(nks, PARAM.globalv.nlocal, this->ParaV->nrow, dm_gamma); 
+            LCAO_deepks_io::print_dm(nks, PARAM.globalv.nlocal, this->ParaV->nrow, dm_gamma);
 
             GlobalC::ld.check_projected_dm();
 
@@ -273,7 +293,7 @@ void Force_LCAO<double>::ftable(const bool isforce,
 
             GlobalC::ld.check_gedm();
 
-            GlobalC::ld.cal_e_delta_band(dm_gamma,nks);
+            GlobalC::ld.cal_e_delta_band(dm_gamma, nks);
 
             std::ofstream ofs("E_delta_bands.dat");
             ofs << std::setprecision(10) << GlobalC::ld.e_delta_band;
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_k.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_k.cpp
index e9203d6352..8aa1c68409 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_k.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_k.cpp
@@ -1,6 +1,5 @@
 #include "FORCE.h"
 #include "module_base/memory.h"
-#include "module_parameter/parameter.h"
 #include "module_base/parallel_reduce.h"
 #include "module_base/timer.h"
 #include "module_base/tool_threading.h"
@@ -10,9 +9,10 @@
 #include "module_elecstate/elecstate_lcao.h"
 #include "module_elecstate/module_dm/cal_dm_psi.h"
 #include "module_hamilt_lcao/hamilt_lcaodft/LCAO_domain.h"
+#include "module_hamilt_lcao/hamilt_lcaodft/pulay_force_stress.h"
 #include "module_hamilt_pw/hamilt_pwdft/global.h"
 #include "module_io/write_HS.h"
-#include "module_hamilt_lcao/hamilt_lcaodft/pulay_force_stress.h"
+#include "module_parameter/parameter.h"
 
 #include <map>
 #include <unordered_map>
@@ -138,8 +138,7 @@ void Force_LCAO<std::complex<double>>::allocate(const UnitCell& ucell,
     {
         cal_deri = false;
 
-        ModuleBase::WARNING_QUIT("cal_syns",
-                                 "This function has been broken and will be fixed later.");
+        ModuleBase::WARNING_QUIT("cal_syns", "This function has been broken and will be fixed later.");
 
         LCAO_domain::build_ST_new(fsr,
                                   'S',
@@ -156,7 +155,7 @@ void Force_LCAO<std::complex<double>>::allocate(const UnitCell& ucell,
 
         for (int ik = 0; ik < nks; ik++)
         {
-            
+
             bool bit = false; // LiuXh, 2017-03-21
         }
     }
@@ -188,83 +187,82 @@ void Force_LCAO<std::complex<double>>::finish_ftable(ForceStressArrays& fsr)
     return;
 }
 
-//template <>
-//void Force_LCAO<std::complex<double>>::test(Parallel_Orbitals& pv, double* mmm, const std::string& name)
+// template <>
+// void Force_LCAO<std::complex<double>>::test(Parallel_Orbitals& pv, double* mmm, const std::string& name)
 //{
-//    // mohan remove 'const' for pv, 2024-03-31
-//    if (GlobalV::NPROC != 1)
-//    {
-//        return;
-//    }
+//     // mohan remove 'const' for pv, 2024-03-31
+//     if (GlobalV::NPROC != 1)
+//     {
+//         return;
+//     }
 //
-//    std::cout << "test!" << std::endl;
+//     std::cout << "test!" << std::endl;
 //
-//    int irr = 0;
-//    int ca = 0;
+//     int irr = 0;
+//     int ca = 0;
 //
-//    GlobalV::ofs_running << " Calculate the test in Force_LCAO_k" << std::endl;
-//    Record_adj RA;
+//     GlobalV::ofs_running << " Calculate the test in Force_LCAO_k" << std::endl;
+//     Record_adj RA;
 //
-//    // mohan update 2024-03-31
-//    RA.for_2d(pv, GlobalV::GAMMA_ONLY_LOCAL, GlobalC::ORB.cutoffs());
+//     // mohan update 2024-03-31
+//     RA.for_2d(pv, GlobalV::GAMMA_ONLY_LOCAL, GlobalC::ORB.cutoffs());
 //
-//    double* test;
-//    test = new double[PARAM.globalv.nlocal * PARAM.globalv.nlocal];
-//    ModuleBase::GlobalFunc::ZEROS(test, PARAM.globalv.nlocal * PARAM.globalv.nlocal);
+//     double* test;
+//     test = new double[PARAM.globalv.nlocal * PARAM.globalv.nlocal];
+//     ModuleBase::GlobalFunc::ZEROS(test, PARAM.globalv.nlocal * PARAM.globalv.nlocal);
 //
-//    for (int T1 = 0; T1 < ucell.ntype; T1++)
-//    {
-//        Atom* atom1 = &ucell.atoms[T1];
-//        for (int I1 = 0; I1 < atom1->na; I1++)
-//        {
-//            // const int iat = ucell.itia2iat(T1,I1);
-//            const int start1 = ucell.itiaiw2iwt(T1, I1, 0);
-//            for (int cb = 0; cb < RA.na_each[ca]; cb++)
-//            {
-//                const int T2 = RA.info[ca][cb][3];
-//                const int I2 = RA.info[ca][cb][4];
-//                Atom* atom2 = &ucell.atoms[T2];
-//                const int start2 = ucell.itiaiw2iwt(T2, I2, 0);
+//     for (int T1 = 0; T1 < ucell.ntype; T1++)
+//     {
+//         Atom* atom1 = &ucell.atoms[T1];
+//         for (int I1 = 0; I1 < atom1->na; I1++)
+//         {
+//             // const int iat = ucell.itia2iat(T1,I1);
+//             const int start1 = ucell.itiaiw2iwt(T1, I1, 0);
+//             for (int cb = 0; cb < RA.na_each[ca]; cb++)
+//             {
+//                 const int T2 = RA.info[ca][cb][3];
+//                 const int I2 = RA.info[ca][cb][4];
+//                 Atom* atom2 = &ucell.atoms[T2];
+//                 const int start2 = ucell.itiaiw2iwt(T2, I2, 0);
 //
-//                for (int jj = 0; jj < atom1->nw; jj++)
-//                {
-//                    const int iw1_all = start1 + jj;
-//                    for (int kk = 0; kk < atom2->nw; kk++)
-//                    {
-//                        const int iw2_all = start2 + kk;
-//                        assert(irr < pv.nnr);
-//                        test[iw1_all * PARAM.globalv.nlocal + iw2_all] += mmm[irr];
-//                        ++irr;
-//                    }
-//                }
-//            }
-//            ++ca;
-//        }
-//    }
+//                 for (int jj = 0; jj < atom1->nw; jj++)
+//                 {
+//                     const int iw1_all = start1 + jj;
+//                     for (int kk = 0; kk < atom2->nw; kk++)
+//                     {
+//                         const int iw2_all = start2 + kk;
+//                         assert(irr < pv.nnr);
+//                         test[iw1_all * PARAM.globalv.nlocal + iw2_all] += mmm[irr];
+//                         ++irr;
+//                     }
+//                 }
+//             }
+//             ++ca;
+//         }
+//     }
 //
-//    std::cout << "\n " << name << std::endl;
-//    std::cout << std::setprecision(4);
-//    for (int i = 0; i < PARAM.globalv.nlocal; i++)
-//    {
-//        for (int j = 0; j < PARAM.globalv.nlocal; j++)
-//        {
-//            if (std::abs(test[i * PARAM.globalv.nlocal + j]) > 1.0e-5)
-//            {
-//                std::cout << std::setw(12) << test[i * PARAM.globalv.nlocal + j];
-//            }
-//            else
-//            {
-//                std::cout << std::setw(12) << "0";
-//            }
-//        }
-//        std::cout << std::endl;
-//    }
-//    delete[] test;
+//     std::cout << "\n " << name << std::endl;
+//     std::cout << std::setprecision(4);
+//     for (int i = 0; i < PARAM.globalv.nlocal; i++)
+//     {
+//         for (int j = 0; j < PARAM.globalv.nlocal; j++)
+//         {
+//             if (std::abs(test[i * PARAM.globalv.nlocal + j]) > 1.0e-5)
+//             {
+//                 std::cout << std::setw(12) << test[i * PARAM.globalv.nlocal + j];
+//             }
+//             else
+//             {
+//                 std::cout << std::setw(12) << "0";
+//             }
+//         }
+//         std::cout << std::endl;
+//     }
+//     delete[] test;
 //
-//    RA.delete_grid(); // xiaohui add 2015-02-04
-//    return;
-//}
-
+//     RA.delete_grid(); // xiaohui add 2015-02-04
+//     return;
+// }
 
 // be called in Force_LCAO::start_force_calculation
 template <>
@@ -308,21 +306,39 @@ void Force_LCAO<std::complex<double>>::ftable(const bool isforce,
                    kv->get_nks(),
                    kv->kvec_d);
 
-    const double* dSx[3] = { fsr.DSloc_Rx, fsr.DSloc_Ry, fsr.DSloc_Rz };
+    const double* dSx[3] = {fsr.DSloc_Rx, fsr.DSloc_Ry, fsr.DSloc_Rz};
     // calculate the energy density matrix
     // and the force related to overlap matrix and energy density matrix.
-    PulayForceStress::cal_pulay_fs(foverlap, soverlap,
+    PulayForceStress::cal_pulay_fs(
+        foverlap,
+        soverlap,
         this->cal_edm(pelec, *psi, *dm, *kv, pv, PARAM.inp.nspin, PARAM.inp.nbands, ucell, *ra),
-        ucell, pv, dSx, fsr.DH_r, isforce, isstress, ra, -1.0, 1.0);
-
-    const double* dHx[3] = { fsr.DHloc_fixedR_x, fsr.DHloc_fixedR_y, fsr.DHloc_fixedR_z }; // T+Vnl
-    const double* dHxy[6] = { fsr.stvnl11, fsr.stvnl12, fsr.stvnl13, fsr.stvnl22, fsr.stvnl23, fsr.stvnl33 };   //T
+        ucell,
+        pv,
+        dSx,
+        fsr.DH_r,
+        isforce,
+        isstress,
+        ra,
+        -1.0,
+        1.0);
+
+    const double* dHx[3] = {fsr.DHloc_fixedR_x, fsr.DHloc_fixedR_y, fsr.DHloc_fixedR_z};                    // T+Vnl
+    const double* dHxy[6] = {fsr.stvnl11, fsr.stvnl12, fsr.stvnl13, fsr.stvnl22, fsr.stvnl23, fsr.stvnl33}; // T
     // tvnl_dphi
     PulayForceStress::cal_pulay_fs(ftvnl_dphi, stvnl_dphi, *dm, ucell, pv, dHx, dHxy, isforce, isstress, ra, 1.0, -1.0);
 
     // doing on the real space grid.
     // vl_dphi
-    PulayForceStress::cal_pulay_fs(fvl_dphi, svl_dphi, *dm, ucell, pelec->pot, gint, isforce, isstress, false/*reset dm to gint*/);
+    PulayForceStress::cal_pulay_fs(fvl_dphi,
+                                   svl_dphi,
+                                   *dm,
+                                   ucell,
+                                   pelec->pot,
+                                   gint,
+                                   isforce,
+                                   isstress,
+                                   false /*reset dm to gint*/);
 
 #ifdef __DEEPKS
     if (PARAM.inp.deepks_scf)
@@ -330,26 +346,26 @@ void Force_LCAO<std::complex<double>>::ftable(const bool isforce,
         const std::vector<std::vector<std::complex<double>>>& dm_k = dm->get_DMK_vector();
 
         // when deepks_scf is on, the init pdm should be same as the out pdm, so we should not recalculate the pdm
-        // GlobalC::ld.cal_projected_DM_k(dm, ucell, orb, gd);
+        // GlobalC::ld.cal_projected_DM(dm, ucell, orb, gd);
 
         GlobalC::ld.cal_descriptor(ucell.nat);
 
         GlobalC::ld.cal_gedm(ucell.nat);
 
-        DeePKS_domain::cal_f_delta_k(dm_k,
-                                     ucell,
-                                     orb,
-                                     gd,
-                                     pv,
-                                     GlobalC::ld.lmaxd,
-                                     kv->get_nks(),
-                                     kv->kvec_d,
-                                     GlobalC::ld.nlm_save_k,
-                                     GlobalC::ld.gedm,
-                                     GlobalC::ld.inl_index,
-                                     GlobalC::ld.F_delta,
-                                     isstress,
-                                     svnl_dalpha);
+        DeePKS_domain::cal_f_delta<std::complex<double>>(dm_k,
+                                                         ucell,
+                                                         orb,
+                                                         gd,
+                                                         pv,
+                                                         GlobalC::ld.lmaxd,
+                                                         kv->get_nks(),
+                                                         kv->kvec_d,
+                                                         GlobalC::ld.psialpha,
+                                                         GlobalC::ld.gedm,
+                                                         GlobalC::ld.inl_index,
+                                                         GlobalC::ld.F_delta,
+                                                         isstress,
+                                                         svnl_dalpha);
 
 #ifdef __MPI
         Parallel_Reduce::reduce_all(GlobalC::ld.F_delta.c, GlobalC::ld.F_delta.nr * GlobalC::ld.F_delta.nc);
diff --git a/source/module_hamilt_lcao/module_deepks/CMakeLists.txt b/source/module_hamilt_lcao/module_deepks/CMakeLists.txt
index f32151bafe..9d0049ee47 100644
--- a/source/module_hamilt_lcao/module_deepks/CMakeLists.txt
+++ b/source/module_hamilt_lcao/module_deepks/CMakeLists.txt
@@ -1,8 +1,7 @@
 if(ENABLE_DEEPKS)
   list(APPEND objects
       LCAO_deepks.cpp
-      deepks_fgamma.cpp
-      deepks_fk.cpp
+      deepks_force.cpp
       LCAO_deepks_odelta.cpp
       LCAO_deepks_io.cpp
       LCAO_deepks_mpi.cpp
diff --git a/source/module_hamilt_lcao/module_deepks/LCAO_deepks.cpp b/source/module_hamilt_lcao/module_deepks/LCAO_deepks.cpp
index 2e34264ab7..9f5163cdf3 100644
--- a/source/module_hamilt_lcao/module_deepks/LCAO_deepks.cpp
+++ b/source/module_hamilt_lcao/module_deepks/LCAO_deepks.cpp
@@ -1,22 +1,21 @@
-//wenfei 2022-1-5
-//This file contains constructor and destructor of the class LCAO_deepks, 
+// wenfei 2022-1-5
+// This file contains constructor and destructor of the class LCAO_deepks,
 #include "module_parameter/parameter.h"
-//as well as subroutines for initializing and releasing relevant data structures 
-
-//Other than the constructor and the destructor, it contains 3 types of subroutines:
-//1. subroutines that are related to calculating descriptors:
-//  - init : allocates some arrays
-//  - init_index : records the index (inl)
-//  - allocate_nlm : allocates data structures (nlm_save) which is used to store <chi|alpha>
-//2. subroutines that are related to calculating force label:
-//  - init_gdmx : allocates gdmx; it is a private subroutine
-//  - del_gdmx : releases gdmx
-//3. subroutines that are related to calculating force label:
-//  - init_gdmepsl : allocates gdm_epsl; it is a private subroutine
-//  - del_gdmepsl : releases gdm_epsl
-//4. subroutines that are related to V_delta:
-//  - allocate_V_delta : allocates H_V_delta; if calculating force, it also calls
-//      init_gdmx, as well as allocating F_delta
+// as well as subroutines for initializing and releasing relevant data structures
+
+// Other than the constructor and the destructor, it contains 3 types of subroutines:
+// 1. subroutines that are related to calculating descriptors:
+//   - init : allocates some arrays
+//   - init_index : records the index (inl)
+// 2. subroutines that are related to calculating force label:
+//   - init_gdmx : allocates gdmx; it is a private subroutine
+//   - del_gdmx : releases gdmx
+// 3. subroutines that are related to calculating force label:
+//   - init_gdmepsl : allocates gdm_epsl; it is a private subroutine
+//   - del_gdmepsl : releases gdm_epsl
+// 4. subroutines that are related to V_delta:
+//   - allocate_V_delta : allocates H_V_delta; if calculating force, it also calls
+//       init_gdmx, as well as allocating F_delta
 
 #ifdef __DEEPKS
 
@@ -25,19 +24,20 @@
 
 namespace GlobalC
 {
-    LCAO_Deepks ld;
+LCAO_Deepks ld;
 }
 
-//Constructor of the class
+// Constructor of the class
 LCAO_Deepks::LCAO_Deepks()
 {
     alpha_index = new ModuleBase::IntArray[1];
     inl_index = new ModuleBase::IntArray[1];
     inl_l = nullptr;
     gedm = nullptr;
+    this->psialpha.resize(1);
 }
 
-//Desctructor of the class
+// Desctructor of the class
 LCAO_Deepks::~LCAO_Deepks()
 {
     delete[] alpha_index;
@@ -45,18 +45,18 @@ LCAO_Deepks::~LCAO_Deepks()
     delete[] inl_l;
 
     //=======1. to use deepks, pdm is required==========
-    //delete pdm**
-    for (int inl = 0;inl < this->inlmax;inl++)
+    // delete pdm**
+    for (int inl = 0; inl < this->inlmax; inl++)
     {
         delete[] pdm[inl];
     }
     delete[] pdm;
     //=======2. "deepks_scf" part==========
-    //if (PARAM.inp.deepks_scf)
+    // if (PARAM.inp.deepks_scf)
     if (gedm)
     {
-        //delete gedm**
-        for (int inl = 0;inl < this->inlmax;inl++)
+        // delete gedm**
+        for (int inl = 0; inl < this->inlmax; inl++)
         {
             delete[] gedm[inl];
         }
@@ -64,16 +64,14 @@ LCAO_Deepks::~LCAO_Deepks()
     }
 
     del_gdmx();
-
 }
 
-void LCAO_Deepks::init(
-    const LCAO_Orbitals& orb,
-    const int nat,
-    const int ntype,
-    const int nks,
-    const Parallel_Orbitals& pv_in,
-    std::vector<int> na)
+void LCAO_Deepks::init(const LCAO_Orbitals& orb,
+                       const int nat,
+                       const int ntype,
+                       const int nks,
+                       const Parallel_Orbitals& pv_in,
+                       std::vector<int> na)
 {
     ModuleBase::TITLE("LCAO_Deepks", "init");
 
@@ -86,51 +84,51 @@ void LCAO_Deepks::init(
     assert(lm >= 0);
     assert(nm >= 0);
     assert(tot_inl_per_atom >= 0);
-    
+
     int tot_inl = tot_inl_per_atom * nat;
 
-    if(PARAM.inp.deepks_equiv) 
+    if (PARAM.inp.deepks_equiv)
     {
         tot_inl = nat;
     }
 
     this->lmaxd = lm;
     this->nmaxd = nm;
-    
+
     GlobalV::ofs_running << " lmax of descriptor = " << this->lmaxd << std::endl;
     GlobalV::ofs_running << " nmax of descriptor = " << nmaxd << std::endl;
 
     int pdm_size = 0;
     this->inlmax = tot_inl;
-    if(!PARAM.inp.deepks_equiv)
+    if (!PARAM.inp.deepks_equiv)
     {
         GlobalV::ofs_running << " total basis (all atoms) for descriptor = " << std::endl;
 
-        //init pdm**
+        // init pdm**
         pdm_size = (this->lmaxd * 2 + 1) * (this->lmaxd * 2 + 1);
     }
     else
     {
-        for(int il = 0; il < this->lmaxd + 1; il++)
+        for (int il = 0; il < this->lmaxd + 1; il++)
         {
             pdm_size += (2 * il + 1) * orb.Alpha[0].getNchi(il);
         }
         pdm_size = pdm_size * pdm_size;
-        this->des_per_atom=pdm_size;
+        this->des_per_atom = pdm_size;
         GlobalV::ofs_running << " Equivariant version, size of pdm matrices : " << pdm_size << std::endl;
     }
 
-    this->pdm = new double* [this->inlmax];
-    for (int inl = 0;inl < this->inlmax;inl++)
+    this->pdm = new double*[this->inlmax];
+    for (int inl = 0; inl < this->inlmax; inl++)
     {
         this->pdm[inl] = new double[pdm_size];
         ModuleBase::GlobalFunc::ZEROS(this->pdm[inl], pdm_size);
     }
 
     // cal n(descriptor) per atom , related to Lmax, nchi(L) and m. (not total_nchi!)
-    if(!PARAM.inp.deepks_equiv)
+    if (!PARAM.inp.deepks_equiv)
     {
-        this->des_per_atom=0; // mohan add 2021-04-21
+        this->des_per_atom = 0; // mohan add 2021-04-21
         for (int l = 0; l <= this->lmaxd; l++)
         {
             this->des_per_atom += orb.Alpha[0].getNchi(l) * (2 * l + 1);
@@ -139,14 +137,17 @@ void LCAO_Deepks::init(
 
         this->init_index(ntype, nat, na, tot_inl, orb);
     }
-    this->allocate_nlm(nat);
 
     this->pv = &pv_in;
 
     return;
 }
 
-void LCAO_Deepks::init_index(const int ntype, const int nat, std::vector<int> na, const int Total_nchi, const LCAO_Orbitals &orb)
+void LCAO_Deepks::init_index(const int ntype,
+                             const int nat,
+                             std::vector<int> na,
+                             const int Total_nchi,
+                             const LCAO_Orbitals& orb)
 {
     delete[] this->alpha_index;
     this->alpha_index = new ModuleBase::IntArray[ntype];
@@ -160,23 +161,18 @@ void LCAO_Deepks::init_index(const int ntype, const int nat, std::vector<int> na
     int alpha = 0;
     for (int it = 0; it < ntype; it++)
     {
-        this->alpha_index[it].create(
-            na[it],
-            this->lmaxd + 1, // l starts from 0
-            this->nmaxd,
-            2 * this->lmaxd + 1); // m ==> 2*l+1
+        this->alpha_index[it].create(na[it],
+                                     this->lmaxd + 1, // l starts from 0
+                                     this->nmaxd,
+                                     2 * this->lmaxd + 1); // m ==> 2*l+1
 
-        this->inl_index[it].create(
-            na[it],
-            this->lmaxd + 1,
-            this->nmaxd); 
+        this->inl_index[it].create(na[it], this->lmaxd + 1, this->nmaxd);
 
-        GlobalV::ofs_running << " Type " << it + 1
-                    << " number_of_atoms " << na[it] << std::endl;
+        GlobalV::ofs_running << " Type " << it + 1 << " number_of_atoms " << na[it] << std::endl;
 
         for (int ia = 0; ia < na[it]; ia++)
         {
-            //alpha
+            // alpha
             for (int l = 0; l < this->lmaxd + 1; l++)
             {
                 for (int n = 0; n < orb.Alpha[0].getNchi(l); n++)
@@ -191,51 +187,39 @@ void LCAO_Deepks::init_index(const int ntype, const int nat, std::vector<int> na
                     inl++;
                 }
             }
-        }//end ia
-    }//end it
+        } // end ia
+    }     // end it
     assert(this->n_descriptor == alpha);
     assert(Total_nchi == inl);
     GlobalV::ofs_running << " descriptors_per_atom " << this->des_per_atom << std::endl;
     GlobalV::ofs_running << " total_descriptors " << this->n_descriptor << std::endl;
-	return;
-}
-
-void LCAO_Deepks::allocate_nlm(const int nat)
-{
-    if(PARAM.globalv.gamma_only_local)
-    {
-        this->nlm_save.resize(nat);
-    }
-    else
-    {
-        this->nlm_save_k.resize(nat);
-    }
+    return;
 }
 
 void LCAO_Deepks::init_gdmx(const int nat)
 {
-    this->gdmx = new double** [nat];
-    this->gdmy = new double** [nat];
-    this->gdmz = new double** [nat];
+    this->gdmx = new double**[nat];
+    this->gdmy = new double**[nat];
+    this->gdmz = new double**[nat];
     int pdm_size = 0;
-    if(!PARAM.inp.deepks_equiv)
+    if (!PARAM.inp.deepks_equiv)
     {
         pdm_size = (this->lmaxd * 2 + 1) * (this->lmaxd * 2 + 1);
     }
     else
     {
-        pdm_size = this -> des_per_atom;
+        pdm_size = this->des_per_atom;
     }
-    
-    for (int iat = 0;iat < nat;iat++)
+
+    for (int iat = 0; iat < nat; iat++)
     {
-        this->gdmx[iat] = new double* [inlmax];
-        this->gdmy[iat] = new double* [inlmax];
-        this->gdmz[iat] = new double* [inlmax];
-        for (int inl = 0;inl < inlmax;inl++)
+        this->gdmx[iat] = new double*[inlmax];
+        this->gdmy[iat] = new double*[inlmax];
+        this->gdmz[iat] = new double*[inlmax];
+        for (int inl = 0; inl < inlmax; inl++)
         {
-            this->gdmx[iat][inl] = new double [pdm_size];
-            this->gdmy[iat][inl] = new double [pdm_size];
+            this->gdmx[iat][inl] = new double[pdm_size];
+            this->gdmy[iat][inl] = new double[pdm_size];
             this->gdmz[iat][inl] = new double[pdm_size];
             ModuleBase::GlobalFunc::ZEROS(gdmx[iat][inl], pdm_size);
             ModuleBase::GlobalFunc::ZEROS(gdmy[iat][inl], pdm_size);
@@ -246,12 +230,12 @@ void LCAO_Deepks::init_gdmx(const int nat)
     return;
 }
 
-//void LCAO_Deepks::del_gdmx(const int nat)
+// void LCAO_Deepks::del_gdmx(const int nat)
 void LCAO_Deepks::del_gdmx()
 {
-    for (int iat = 0;iat < nat_gdm;iat++)
+    for (int iat = 0; iat < nat_gdm; iat++)
     {
-        for (int inl = 0;inl < inlmax;inl++)
+        for (int inl = 0; inl < inlmax; inl++)
         {
             delete[] this->gdmx[iat][inl];
             delete[] this->gdmy[iat][inl];
@@ -269,24 +253,24 @@ void LCAO_Deepks::del_gdmx()
 
 void LCAO_Deepks::init_gdmepsl()
 {
-    this->gdm_epsl = new double** [6];
-    
+    this->gdm_epsl = new double**[6];
+
     int pdm_size = 0;
-    if(!PARAM.inp.deepks_equiv)
+    if (!PARAM.inp.deepks_equiv)
     {
         pdm_size = (this->lmaxd * 2 + 1) * (this->lmaxd * 2 + 1);
     }
     else
     {
-        pdm_size = this -> des_per_atom;
-    }    
+        pdm_size = this->des_per_atom;
+    }
 
-    for (int ipol = 0;ipol < 6;ipol++)
+    for (int ipol = 0; ipol < 6; ipol++)
     {
-        this->gdm_epsl[ipol] = new double* [inlmax];
-        for (int inl = 0;inl < inlmax;inl++)
+        this->gdm_epsl[ipol] = new double*[inlmax];
+        for (int inl = 0; inl < inlmax; inl++)
         {
-            this->gdm_epsl[ipol][inl] = new double [pdm_size];
+            this->gdm_epsl[ipol][inl] = new double[pdm_size];
             ModuleBase::GlobalFunc::ZEROS(gdm_epsl[ipol][inl], pdm_size);
         }
     }
@@ -295,9 +279,9 @@ void LCAO_Deepks::init_gdmepsl()
 
 void LCAO_Deepks::del_gdmepsl()
 {
-    for (int ipol = 0;ipol < 6;ipol++)
+    for (int ipol = 0; ipol < 6; ipol++)
     {
-        for (int inl = 0;inl < inlmax;inl++)
+        for (int inl = 0; inl < inlmax; inl++)
         {
             delete[] this->gdm_epsl[ipol][inl];
         }
@@ -307,14 +291,13 @@ void LCAO_Deepks::del_gdmepsl()
     return;
 }
 
-
 void LCAO_Deepks::allocate_V_delta(const int nat, const int nks)
 {
     ModuleBase::TITLE("LCAO_Deepks", "allocate_V_delta");
     nks_V_delta = nks;
 
-    //initialize the H matrix H_V_delta
-    if(PARAM.globalv.gamma_only_local)
+    // initialize the H matrix H_V_delta
+    if (PARAM.globalv.gamma_only_local)
     {
         H_V_delta.resize(1); // the first dimension is for the consistence with H_V_delta_k
         this->H_V_delta[0].resize(pv->nloc);
@@ -323,47 +306,46 @@ void LCAO_Deepks::allocate_V_delta(const int nat, const int nks)
     else
     {
         H_V_delta_k.resize(nks);
-        for(int ik=0;ik<nks;ik++)
+        for (int ik = 0; ik < nks; ik++)
         {
             this->H_V_delta_k[ik].resize(pv->nloc);
             ModuleBase::GlobalFunc::ZEROS(this->H_V_delta_k[ik].data(), pv->nloc);
         }
     }
 
-    //init gedm**
+    // init gedm**
     int pdm_size = 0;
-    if(!PARAM.inp.deepks_equiv)
+    if (!PARAM.inp.deepks_equiv)
     {
         pdm_size = (this->lmaxd * 2 + 1) * (this->lmaxd * 2 + 1);
     }
     else
     {
-        pdm_size = this -> des_per_atom;
+        pdm_size = this->des_per_atom;
     }
 
-    this->gedm = new double* [this->inlmax];
-    for (int inl = 0;inl < this->inlmax;inl++)
+    this->gedm = new double*[this->inlmax];
+    for (int inl = 0; inl < this->inlmax; inl++)
     {
         this->gedm[inl] = new double[pdm_size];
         ModuleBase::GlobalFunc::ZEROS(this->gedm[inl], pdm_size);
     }
     if (PARAM.inp.cal_force)
     {
-        //init F_delta
+        // init F_delta
         F_delta.create(nat, 3);
-        if(PARAM.inp.deepks_out_labels) 
-        { 
+        if (PARAM.inp.deepks_out_labels)
+        {
             this->init_gdmx(nat);
             this->init_gdmepsl();
         }
-        //gdmx is used only in calculating gvx
+        // gdmx is used only in calculating gvx
     }
 
     if (PARAM.inp.deepks_bandgap)
     {
-        //init o_delta
+        // init o_delta
         o_delta.create(nks, 1);
-        
     }
 
     return;
@@ -371,20 +353,21 @@ void LCAO_Deepks::allocate_V_delta(const int nat, const int nks)
 
 void LCAO_Deepks::init_orbital_pdm_shell(const int nks)
 {
-    
-    this->orbital_pdm_shell = new double*** [nks];
 
-    for (int iks=0; iks<nks; iks++)
+    this->orbital_pdm_shell = new double***[nks];
+
+    for (int iks = 0; iks < nks; iks++)
     {
-        this->orbital_pdm_shell[iks] = new double** [1];
-        for (int hl=0; hl < 1; hl++)
+        this->orbital_pdm_shell[iks] = new double**[1];
+        for (int hl = 0; hl < 1; hl++)
         {
-            this->orbital_pdm_shell[iks][hl] = new double* [this->inlmax];
+            this->orbital_pdm_shell[iks][hl] = new double*[this->inlmax];
 
-            for(int inl = 0; inl < this->inlmax; inl++)
+            for (int inl = 0; inl < this->inlmax; inl++)
             {
-                this->orbital_pdm_shell[iks][hl][inl] = new double [(2 * this->lmaxd + 1) * (2 * this->lmaxd + 1)];
-                ModuleBase::GlobalFunc::ZEROS(orbital_pdm_shell[iks][hl][inl], (2 * this->lmaxd + 1) * (2 * this->lmaxd + 1));
+                this->orbital_pdm_shell[iks][hl][inl] = new double[(2 * this->lmaxd + 1) * (2 * this->lmaxd + 1)];
+                ModuleBase::GlobalFunc::ZEROS(orbital_pdm_shell[iks][hl][inl],
+                                              (2 * this->lmaxd + 1) * (2 * this->lmaxd + 1));
             }
         }
     }
@@ -392,14 +375,13 @@ void LCAO_Deepks::init_orbital_pdm_shell(const int nks)
     return;
 }
 
-
 void LCAO_Deepks::del_orbital_pdm_shell(const int nks)
 {
-    for (int iks=0; iks<nks; iks++)
+    for (int iks = 0; iks < nks; iks++)
     {
-        for (int hl=0; hl<1; hl++)
+        for (int hl = 0; hl < 1; hl++)
         {
-            for (int inl = 0;inl < this->inlmax; inl++)
+            for (int inl = 0; inl < this->inlmax; inl++)
             {
                 delete[] this->orbital_pdm_shell[iks][hl][inl];
             }
@@ -407,57 +389,58 @@ void LCAO_Deepks::del_orbital_pdm_shell(const int nks)
         }
         delete[] this->orbital_pdm_shell[iks];
     }
-     delete[] this->orbital_pdm_shell;    
+    delete[] this->orbital_pdm_shell;
 
     return;
 }
 
-void LCAO_Deepks::init_v_delta_pdm_shell(const int nks,const int nlocal)
+void LCAO_Deepks::init_v_delta_pdm_shell(const int nks, const int nlocal)
 {
-    const int mn_size=(2 * this->lmaxd + 1) * (2 * this->lmaxd + 1);
-    if (nks==1){
-        this->v_delta_pdm_shell = new double**** [nks];
-        for (int iks=0; iks<nks; iks++)
+    const int mn_size = (2 * this->lmaxd + 1) * (2 * this->lmaxd + 1);
+    if (nks == 1)
+    {
+        this->v_delta_pdm_shell = new double****[nks];
+        for (int iks = 0; iks < nks; iks++)
         {
-            this->v_delta_pdm_shell[iks] = new double*** [nlocal];
+            this->v_delta_pdm_shell[iks] = new double***[nlocal];
 
-            for (int mu=0; mu<nlocal; mu++)
+            for (int mu = 0; mu < nlocal; mu++)
             {
-                this->v_delta_pdm_shell[iks][mu] = new double** [nlocal];
+                this->v_delta_pdm_shell[iks][mu] = new double**[nlocal];
 
-                for (int nu=0; nu<nlocal; nu++)
+                for (int nu = 0; nu < nlocal; nu++)
                 {
-                    this->v_delta_pdm_shell[iks][mu][nu] = new double* [this->inlmax];
+                    this->v_delta_pdm_shell[iks][mu][nu] = new double*[this->inlmax];
 
-                    for(int inl = 0; inl < this->inlmax; inl++)
+                    for (int inl = 0; inl < this->inlmax; inl++)
                     {
-                        this->v_delta_pdm_shell[iks][mu][nu][inl] = new double [mn_size];
+                        this->v_delta_pdm_shell[iks][mu][nu][inl] = new double[mn_size];
                         ModuleBase::GlobalFunc::ZEROS(v_delta_pdm_shell[iks][mu][nu][inl], mn_size);
-                    }                
+                    }
                 }
             }
         }
     }
     else
     {
-        this->v_delta_pdm_shell_complex = new std::complex<double>**** [nks];
-        for (int iks=0; iks<nks; iks++)
+        this->v_delta_pdm_shell_complex = new std::complex<double>****[nks];
+        for (int iks = 0; iks < nks; iks++)
         {
-            this->v_delta_pdm_shell_complex[iks] = new std::complex<double>*** [nlocal];
+            this->v_delta_pdm_shell_complex[iks] = new std::complex<double>***[nlocal];
 
-            for (int mu=0; mu<nlocal; mu++)
+            for (int mu = 0; mu < nlocal; mu++)
             {
-                this->v_delta_pdm_shell_complex[iks][mu] = new std::complex<double>** [nlocal];
+                this->v_delta_pdm_shell_complex[iks][mu] = new std::complex<double>**[nlocal];
 
-                for (int nu=0; nu<nlocal; nu++)
+                for (int nu = 0; nu < nlocal; nu++)
                 {
-                    this->v_delta_pdm_shell_complex[iks][mu][nu] = new std::complex<double>* [this->inlmax];
+                    this->v_delta_pdm_shell_complex[iks][mu][nu] = new std::complex<double>*[this->inlmax];
 
-                    for(int inl = 0; inl < this->inlmax; inl++)
+                    for (int inl = 0; inl < this->inlmax; inl++)
                     {
-                        this->v_delta_pdm_shell_complex[iks][mu][nu][inl] = new std::complex<double> [mn_size];
+                        this->v_delta_pdm_shell_complex[iks][mu][nu][inl] = new std::complex<double>[mn_size];
                         ModuleBase::GlobalFunc::ZEROS(v_delta_pdm_shell_complex[iks][mu][nu][inl], mn_size);
-                    }                
+                    }
                 }
             }
         }
@@ -466,47 +449,47 @@ void LCAO_Deepks::init_v_delta_pdm_shell(const int nks,const int nlocal)
     return;
 }
 
-void LCAO_Deepks::del_v_delta_pdm_shell(const int nks,const int nlocal)
+void LCAO_Deepks::del_v_delta_pdm_shell(const int nks, const int nlocal)
 {
-    if (nks==1)
+    if (nks == 1)
     {
-        for (int iks=0; iks<nks; iks++)
+        for (int iks = 0; iks < nks; iks++)
         {
-            for (int mu=0; mu<nlocal; mu++)
+            for (int mu = 0; mu < nlocal; mu++)
             {
-                for (int nu=0; nu<nlocal; nu++)
+                for (int nu = 0; nu < nlocal; nu++)
                 {
-                    for (int inl = 0;inl < this->inlmax; inl++)
+                    for (int inl = 0; inl < this->inlmax; inl++)
                     {
                         delete[] this->v_delta_pdm_shell[iks][mu][nu][inl];
                     }
-                    delete[] this->v_delta_pdm_shell[iks][mu][nu];                
+                    delete[] this->v_delta_pdm_shell[iks][mu][nu];
                 }
-                delete[] this->v_delta_pdm_shell[iks][mu]; 
+                delete[] this->v_delta_pdm_shell[iks][mu];
             }
-            delete[] this->v_delta_pdm_shell[iks]; 
+            delete[] this->v_delta_pdm_shell[iks];
         }
-        delete[] this->v_delta_pdm_shell;    
+        delete[] this->v_delta_pdm_shell;
     }
     else
     {
-        for (int iks=0; iks<nks; iks++)
+        for (int iks = 0; iks < nks; iks++)
         {
-            for (int mu=0; mu<nlocal; mu++)
+            for (int mu = 0; mu < nlocal; mu++)
             {
-                for (int nu=0; nu<nlocal; nu++)
+                for (int nu = 0; nu < nlocal; nu++)
                 {
-                    for (int inl = 0;inl < this->inlmax; inl++)
+                    for (int inl = 0; inl < this->inlmax; inl++)
                     {
                         delete[] this->v_delta_pdm_shell_complex[iks][mu][nu][inl];
                     }
-                    delete[] this->v_delta_pdm_shell_complex[iks][mu][nu];                
+                    delete[] this->v_delta_pdm_shell_complex[iks][mu][nu];
                 }
-                delete[] this->v_delta_pdm_shell_complex[iks][mu]; 
+                delete[] this->v_delta_pdm_shell_complex[iks][mu];
             }
-            delete[] this->v_delta_pdm_shell_complex[iks]; 
+            delete[] this->v_delta_pdm_shell_complex[iks];
         }
-        delete[] this->v_delta_pdm_shell_complex;    
+        delete[] this->v_delta_pdm_shell_complex;
     }
 
     return;
@@ -519,6 +502,8 @@ void LCAO_Deepks::dpks_cal_e_delta_band(const std::vector<std::vector<TK>>& dm,
 }
 
 template void LCAO_Deepks::dpks_cal_e_delta_band<double>(const std::vector<std::vector<double>>& dm, const int nks);
-template void LCAO_Deepks::dpks_cal_e_delta_band<std::complex<double>>(const std::vector<std::vector<std::complex<double>>>& dm, const int nks);
+template void LCAO_Deepks::dpks_cal_e_delta_band<std::complex<double>>(
+    const std::vector<std::vector<std::complex<double>>>& dm,
+    const int nks);
 
 #endif
diff --git a/source/module_hamilt_lcao/module_deepks/LCAO_deepks.h b/source/module_hamilt_lcao/module_deepks/LCAO_deepks.h
index 89276b7835..e0deda6aa4 100644
--- a/source/module_hamilt_lcao/module_deepks/LCAO_deepks.h
+++ b/source/module_hamilt_lcao/module_deepks/LCAO_deepks.h
@@ -1,8 +1,10 @@
-#ifndef LCAO_DEEPKS_H 
-#define LCAO_DEEPKS_H 
+#ifndef LCAO_DEEPKS_H
+#define LCAO_DEEPKS_H
 
 #ifdef __DEEPKS
 
+#include "deepks_force.h"
+#include "deepks_hmat.h"
 #include "module_base/complexmatrix.h"
 #include "module_base/intarray.h"
 #include "module_base/matrix.h"
@@ -11,15 +13,13 @@
 #include "module_basis/module_nao/two_center_integrator.h"
 #include "module_cell/module_neighbor/sltk_grid_driver.h"
 #include "module_elecstate/module_dm/density_matrix.h"
+#include "module_hamilt_lcao/module_hcontainer/hcontainer.h"
 #include "module_io/winput.h"
 
 #include <torch/script.h>
 #include <torch/torch.h>
 #include <unordered_map>
 
-#include "deepks_force.h"
-#include "deepks_hmat.h"
-
 ///
 /// The LCAO_Deepks contains subroutines for implementation of the DeePKS method in atomic basis.
 /// In essential, it is a machine-learned correction term to the XC potential
@@ -56,7 +56,7 @@ class LCAO_Deepks
 
     /// Correction term to the Hamiltonian matrix: \f$\langle\psi|V_\delta|\psi\rangle\f$ (for gamma only)
     /// The size of first dimension is 1, which is used for the consitence with H_V_delta_k
-    std::vector<std::vector<double>> H_V_delta; 
+    std::vector<std::vector<double>> H_V_delta;
     /// Correction term to Hamiltonian, for multi-k
     std::vector<std::vector<std::complex<double>>> H_V_delta_k;
 
@@ -97,8 +97,8 @@ class LCAO_Deepks
     //-------------------
     // private variables
     //-------------------
-//  private:
-  public: // change to public to reconstuct the code, 2024-07-22 by mohan
+    //  private:
+  public:           // change to public to reconstuct the code, 2024-07-22 by mohan
     int lmaxd = 0;  // max l of descirptors
     int nmaxd = 0;  //#. descriptors per l
     int inlmax = 0; // tot. number {i,n,l} - atom, n, l
@@ -111,12 +111,9 @@ class LCAO_Deepks
     // related derivatives.
     torch::jit::script::Module module;
 
-    // saves <psi(0)|alpha(R)>, for gamma only
-    std::vector<std::vector<std::unordered_map<int, std::vector<std::vector<double>>>>> nlm_save;
-
-    // saves <psi(0)|alpha(R)>, for multi_k
-    typedef std::tuple<int, int, int, int> key_tuple;
-    std::vector<std::map<key_tuple, std::unordered_map<int, std::vector<std::vector<double>>>>> nlm_save_k;
+    // saves <psi(0)|alpha(R)> and its derivatives
+    // index 0 for itself and index 1-3 for derivatives over x,y,z
+    std::vector<hamilt::HContainer<double>*> psialpha;
 
     // projected density matrix
     double** pdm; //[tot_Inl][2l+1][2l+1]	caoyu modified 2021-05-07; if equivariant version: [nat][nlm*nlm]
@@ -164,7 +161,7 @@ class LCAO_Deepks
     std::complex<double>***** v_delta_pdm_shell_complex; // for multi-k
     // v_delta_precalc[nks,nlocal,nlocal,NAt,NDscrpt] = gvdm * v_delta_pdm_shell;
     torch::Tensor v_delta_precalc_tensor;
-    //for v_delta==2 , new v_delta_precalc storage method
+    // for v_delta==2 , new v_delta_precalc storage method
     torch::Tensor psialpha_tensor;
     torch::Tensor gevdm_tensor;
 
@@ -198,7 +195,6 @@ class LCAO_Deepks
     // 1. subroutines that are related to calculating descriptors:
     //   - init : allocates some arrays
     //   - init_index : records the index (inl)
-    //   - allocate_nlm : allocates data structures (nlm_save) which is used to store <chi|alpha>
     // 2. subroutines that are related to calculating force label:
     //   - init_gdmx : allocates gdmx; it is a private subroutine
     //   - del_gdmx : releases gdmx
@@ -240,25 +236,31 @@ class LCAO_Deepks
     // for bandgap label calculation; QO added on 2022-1-7
     void init_orbital_pdm_shell(const int nks);
     void del_orbital_pdm_shell(const int nks);
-  
-    //for v_delta label calculation; xinyuan added on 2023-2-22
-    void init_v_delta_pdm_shell(const int nks,const int nlocal);
-    void del_v_delta_pdm_shell(const int nks,const int nlocal);
+
+    // for v_delta label calculation; xinyuan added on 2023-2-22
+    void init_v_delta_pdm_shell(const int nks, const int nlocal);
+    void del_v_delta_pdm_shell(const int nks, const int nlocal);
 
     //-------------------
     // LCAO_deepks_psialpha.cpp
     //-------------------
 
-    // wenfei 2022-1-5
-    // This file contains 2 subroutines:
-    // 1. build_psialpha, which calculates the overlap
+    // E.Wu 2024-12-24
+    // This file contains 3 subroutines:
+    // 1. allocate_psialpha, which allocates memory for psialpha
+    // 2. build_psialpha, which calculates the overlap
     // between atomic basis and projector alpha : <psi_mu|alpha>
     // which will be used in calculating pdm, gdmx, H_V_delta, F_delta;
-    // 2. check_psialpha, which prints the results into .dat files
+    // 3. check_psialpha, which prints the results into .dat files
     // for checking
 
   public:
     // calculates <chi|alpha>
+    void allocate_psialpha(const bool& cal_deri,
+                           const UnitCell& ucell,
+                           const LCAO_Orbitals& orb,
+                           const Grid_Driver& GridD);
+
     void build_psialpha(const bool& cal_deri /**< [in] 0 for 2-center intergration, 1 for its derivation*/,
                         const UnitCell& ucell,
                         const LCAO_Orbitals& orb,
@@ -282,21 +284,22 @@ class LCAO_Deepks
     // It also contains subroutines for printing pdm and gdmx
     // for checking purpose
 
-    // There are 6 subroutines in this file:
-    // 1. cal_projected_DM, which is used for calculating pdm for gamma point calculation
+    // There are 4 subroutines in this file:
+    // 1. cal_projected_DM, which is used for calculating pdm
     // 2. check_projected_dm, which prints pdm to descriptor.dat
 
-    // 3. cal_gdmx, calculating gdmx (and optionally gdm_epsl for stress) for gamma point
+    // 3. cal_gdmx, calculating gdmx (and optionally gdm_epsl for stress)
     // 4. check_gdmx, which prints gdmx to a series of .dat files
 
   public:
-    /** 
+    /**
      * @brief calculate projected density matrix:
      * pdm = sum_i,occ <phi_i|alpha1><alpha2|phi_k>
      * 3 cases to skip calculation of pdm:
      *    1. NSCF calculation of DeePKS, init_chg = file and pdm has been read
      *    2. SCF calculation of DeePKS with init_chg = file and pdm has been read for restarting SCF
-     *    3. Relax/Cell-Relax/MD calculation, non-first step will use the convergence pdm from the last step as initial pdm
+     *    3. Relax/Cell-Relax/MD calculation, non-first step will use the convergence pdm from the last step as initial
+     * pdm
      */
     template <typename TK>
     void cal_projected_DM(const elecstate::DensityMatrix<TK, double>* dm,
@@ -316,11 +319,12 @@ class LCAO_Deepks
         const Grid_Driver& GridD,
         const int nks,
         const std::vector<ModuleBase::Vector3<double>>& kvec_d,
+        std::vector<hamilt::HContainer<double>*> psialpha,
         const bool isstress);
 
     void check_gdmx(const int nat);
 
-    /** 
+    /**
      * @brief set init_pdm to skip the calculation of pdm in SCF iteration
      */
     void set_init_pdm(bool ipdm)
@@ -354,7 +358,6 @@ class LCAO_Deepks
     //! a temporary interface for cal_e_delta_band and cal_e_delta_band_k
     template <typename TK>
     void dpks_cal_e_delta_band(const std::vector<std::vector<TK>>& dm, const int nks);
-    
 
     //-------------------
     // LCAO_deepks_odelta.cpp
@@ -365,9 +368,9 @@ class LCAO_Deepks
 
   public:
     template <typename TK, typename TH>
-    void cal_o_delta(const std::vector<std::vector<TH>>&
-                         dm_hl /**<[in] modified density matrix that contains HOMO and LUMO only*/,
-                     const int nks);
+    void cal_o_delta(
+        const std::vector<std::vector<TH>>& dm_hl /**<[in] modified density matrix that contains HOMO and LUMO only*/,
+        const int nks);
 
     //-------------------
     // LCAO_deepks_torch.cpp
@@ -411,7 +414,7 @@ class LCAO_Deepks
     /// which are eigenvalues of pdm in blocks of I_n_l
     void cal_descriptor(const int nat);
     /// print descriptors based on LCAO basis
-    void check_descriptor(const UnitCell& ucell, const std::string &out_dir);
+    void check_descriptor(const UnitCell& ucell, const std::string& out_dir);
 
     void cal_descriptor_equiv(const int nat);
 
@@ -448,7 +451,7 @@ class LCAO_Deepks
                              const LCAO_Orbitals& orb,
                              const Grid_Driver& GridD);
 
-    //calculates v_delta_precalc
+    // calculates v_delta_precalc
     template <typename TK>
     void cal_v_delta_precalc(const int nlocal,
                              const int nat,
@@ -473,11 +476,9 @@ class LCAO_Deepks
 
     template <typename TK>
     void check_vdp_psialpha(const int nat, const int nks, const int nlocal);
-    
+
     // prepare gevdm for outputting npy file
-    void prepare_gevdm(
-        const int nat,
-        const LCAO_Orbitals &orb);
+    void prepare_gevdm(const int nat, const LCAO_Orbitals& orb);
     void check_vdp_gevdm(const int nat);
 
   private:
@@ -492,15 +493,12 @@ class LCAO_Deepks
                        int ndim,      // second dimension
                        double** mat); // the array being reduced
 #endif
-
-  
 };
 
 namespace GlobalC
 {
-    extern LCAO_Deepks ld;
+extern LCAO_Deepks ld;
 }
 
-
 #endif
 #endif
diff --git a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_pdm.cpp b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_pdm.cpp
index 62dfc76aef..7e7365c069 100644
--- a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_pdm.cpp
+++ b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_pdm.cpp
@@ -1,40 +1,40 @@
-//wenfei 2022-1-11
-//This file contains subroutines for calculating pdm,
+// wenfei 2022-1-11
+// This file contains subroutines for calculating pdm,
 #include "module_parameter/parameter.h"
-//which is defind as sum_mu,nu rho_mu,nu (<chi_mu|alpha><alpha|chi_nu>);
-//as well as gdmx, which is the gradient of pdm, defined as
-//sum_mu,nu rho_mu,nu d/dX(<chi_mu|alpha><alpha|chi_nu>)
+// which is defind as sum_mu,nu rho_mu,nu (<chi_mu|alpha><alpha|chi_nu>);
+// as well as gdmx, which is the gradient of pdm, defined as
+// sum_mu,nu rho_mu,nu d/dX(<chi_mu|alpha><alpha|chi_nu>)
 
-//It also contains subroutines for printing pdm and gdmx
-//for checking purpose
+// It also contains subroutines for printing pdm and gdmx
+// for checking purpose
 
-//There are 3 subroutines in this file:
-//1. read_projected_DM, which reads pdm from file
-//2. cal_projected_DM, which is used for calculating pdm 
-//3. check_projected_dm, which prints pdm to descriptor.dat
+// There are 3 subroutines in this file:
+// 1. read_projected_DM, which reads pdm from file
+// 2. cal_projected_DM, which is used for calculating pdm
+// 3. check_projected_dm, which prints pdm to descriptor.dat
 
 #ifdef __DEEPKS
 
 #include "LCAO_deepks.h"
-#include "module_base/vector3.h"
-#include "module_base/timer.h"
 #include "module_base/constants.h"
-#include "module_hamilt_lcao/module_hcontainer/atom_pair.h"
 #include "module_base/libm/libm.h"
+#include "module_base/timer.h"
+#include "module_base/vector3.h"
+#include "module_hamilt_lcao/module_hcontainer/atom_pair.h"
 
 void LCAO_Deepks::read_projected_DM(bool read_pdm_file, bool is_equiv, const Numerical_Orbital& alpha)
 {
-    if (read_pdm_file && !this->init_pdm) //for DeePKS NSCF calculation 
+    if (read_pdm_file && !this->init_pdm) // for DeePKS NSCF calculation
     {
         int pdm_size = 0;
-        if(!is_equiv)
+        if (!is_equiv)
         {
             pdm_size = (this->lmaxd * 2 + 1) * (this->lmaxd * 2 + 1);
         }
         else
         {
             int nproj = 0;
-            for(int il = 0; il < this->lmaxd + 1; il++)
+            for (int il = 0; il < this->lmaxd + 1; il++)
             {
                 nproj += (2 * il + 1) * alpha.getNchi(il);
             }
@@ -48,12 +48,12 @@ void LCAO_Deepks::read_projected_DM(bool read_pdm_file, bool is_equiv, const Num
         {
             ModuleBase::WARNING_QUIT("LCAO_Deepks::read_projected_DM", "Cannot find the file deepks_projdm.dat");
         }
-        for(int inl=0;inl<this->inlmax;inl++)
+        for (int inl = 0; inl < this->inlmax; inl++)
         {
-            for(int ind=0;ind<pdm_size;ind++)
+            for (int ind = 0; ind < pdm_size; ind++)
             {
                 double c;
-			    ifs >> c;
+                ifs >> c;
                 pdm[inl][ind] = c;
             }
         }
@@ -61,95 +61,94 @@ void LCAO_Deepks::read_projected_DM(bool read_pdm_file, bool is_equiv, const Num
     }
 }
 
-//this subroutine performs the calculation of projected density matrices
-//pdm_m,m'=\sum_{mu,nu} rho_{mu,nu} <chi_mu|alpha_m><alpha_m'|chi_nu>
+// this subroutine performs the calculation of projected density matrices
+// pdm_m,m'=\sum_{mu,nu} rho_{mu,nu} <chi_mu|alpha_m><alpha_m'|chi_nu>
 template <typename TK>
-void LCAO_Deepks::cal_projected_DM(
-    const elecstate::DensityMatrix<TK, double>* dm, 
-    const UnitCell &ucell,
-    const LCAO_Orbitals &orb,
-    const Grid_Driver& GridD)
+void LCAO_Deepks::cal_projected_DM(const elecstate::DensityMatrix<TK, double>* dm,
+                                   const UnitCell& ucell,
+                                   const LCAO_Orbitals& orb,
+                                   const Grid_Driver& GridD)
 
 {
     ModuleBase::TITLE("LCAO_Deepks", "cal_projected_DM");
 
     // if pdm has been initialized, skip the calculation
-    if(this->init_pdm)
+    if (this->init_pdm)
     {
         this->init_pdm = false;
         return;
     }
     int pdm_size = 0;
-    if(!PARAM.inp.deepks_equiv)
+    if (!PARAM.inp.deepks_equiv)
     {
         pdm_size = (this->lmaxd * 2 + 1) * (this->lmaxd * 2 + 1);
     }
     else
     {
         int nproj = 0;
-        for(int il = 0; il < this->lmaxd + 1; il++)
+        for (int il = 0; il < this->lmaxd + 1; il++)
         {
             nproj += (2 * il + 1) * orb.Alpha[0].getNchi(il);
         }
         pdm_size = nproj * nproj;
     }
 
-    //if(dm.size() == 0 || dm[0].size() == 0)
+    // if(dm.size() == 0 || dm[0].size() == 0)
     //{
-    //    return;
-    //}
-    ModuleBase::timer::tick("LCAO_Deepks","cal_projected_DM");
+    //     return;
+    // }
+    ModuleBase::timer::tick("LCAO_Deepks", "cal_projected_DM");
 
-    for(int inl=0;inl<inlmax;inl++)
+    for (int inl = 0; inl < inlmax; inl++)
     {
-        ModuleBase::GlobalFunc::ZEROS(pdm[inl],pdm_size);
+        ModuleBase::GlobalFunc::ZEROS(pdm[inl], pdm_size);
     }
 
     const double Rcut_Alpha = orb.Alpha[0].getRcut();
     for (int T0 = 0; T0 < ucell.ntype; T0++)
     {
-		Atom* atom0 = &ucell.atoms[T0]; 
-        for (int I0 =0; I0< atom0->na; I0++)
+        Atom* atom0 = &ucell.atoms[T0];
+        for (int I0 = 0; I0 < atom0->na; I0++)
         {
-            const int iat = ucell.itia2iat(T0,I0);
+            const int iat = ucell.itia2iat(T0, I0);
             const ModuleBase::Vector3<double> tau0 = atom0->tau[I0];
-            GridD.Find_atom(ucell, atom0->tau[I0] ,T0, I0);
+            GridD.Find_atom(ucell, atom0->tau[I0], T0, I0);
 
-            //trace alpha orbital
+            // trace alpha orbital
             std::vector<int> trace_alpha_row;
             std::vector<int> trace_alpha_col;
-            if(!PARAM.inp.deepks_equiv)
+            if (!PARAM.inp.deepks_equiv)
             {
-                int ib=0;
-                for (int L0 = 0; L0 <= orb.Alpha[0].getLmax();++L0)
+                int ib = 0;
+                for (int L0 = 0; L0 <= orb.Alpha[0].getLmax(); ++L0)
                 {
-                    for (int N0 = 0;N0 < orb.Alpha[0].getNchi(L0);++N0)
+                    for (int N0 = 0; N0 < orb.Alpha[0].getNchi(L0); ++N0)
                     {
                         const int inl = this->inl_index[T0](I0, L0, N0);
-                        const int nm = 2*L0+1;
-                
-                        for (int m1=0; m1<nm; ++m1) // m1 = 1 for s, 3 for p, 5 for d
+                        const int nm = 2 * L0 + 1;
+
+                        for (int m1 = 0; m1 < nm; ++m1) // m1 = 1 for s, 3 for p, 5 for d
                         {
-                            for (int m2=0; m2<nm; ++m2) // m1 = 1 for s, 3 for p, 5 for d
+                            for (int m2 = 0; m2 < nm; ++m2) // m1 = 1 for s, 3 for p, 5 for d
                             {
-                                trace_alpha_row.push_back(ib+m1);
-                                trace_alpha_col.push_back(ib+m2);
+                                trace_alpha_row.push_back(ib + m1);
+                                trace_alpha_col.push_back(ib + m2);
                             }
                         }
-                        ib+=nm;
+                        ib += nm;
                     }
                 }
             }
             else
             {
                 int nproj = 0;
-                for(int il = 0; il < this->lmaxd + 1; il++)
+                for (int il = 0; il < this->lmaxd + 1; il++)
                 {
                     nproj += (2 * il + 1) * orb.Alpha[0].getNchi(il);
                 }
-                for(int iproj = 0; iproj < nproj; iproj ++)
+                for (int iproj = 0; iproj < nproj; iproj++)
                 {
-                    for(int jproj = 0; jproj < nproj; jproj ++)
+                    for (int jproj = 0; jproj < nproj; jproj++)
                     {
                         trace_alpha_row.push_back(iproj);
                         trace_alpha_col.push_back(jproj);
@@ -158,115 +157,97 @@ void LCAO_Deepks::cal_projected_DM(
             }
             const int trace_alpha_size = trace_alpha_row.size();
 
-            for (int ad1=0; ad1<GridD.getAdjacentNum()+1 ; ++ad1)
+            for (int ad1 = 0; ad1 < GridD.getAdjacentNum() + 1; ++ad1)
             {
                 const int T1 = GridD.getType(ad1);
                 const int I1 = GridD.getNatom(ad1);
-                const int ibt1 = ucell.itia2iat(T1,I1);
+                const int ibt1 = ucell.itia2iat(T1, I1);
                 const ModuleBase::Vector3<double> tau1 = GridD.getAdjacentTau(ad1);
-				const Atom* atom1 = &ucell.atoms[T1];
-				const int nw1_tot = atom1->nw*PARAM.globalv.npol;
-				const double Rcut_AO1 = orb.Phi[T1].getRcut(); 
-                const double dist1 = (tau1-tau0).norm() * ucell.lat0;
+                const Atom* atom1 = &ucell.atoms[T1];
+                const int nw1_tot = atom1->nw * PARAM.globalv.npol;
+                const double Rcut_AO1 = orb.Phi[T1].getRcut();
+                const double dist1 = (tau1 - tau0).norm() * ucell.lat0;
                 if (dist1 >= Rcut_Alpha + Rcut_AO1)
                 {
                     continue;
                 }
 
-                ModuleBase::Vector3<double> dR1(GridD.getBox(ad1).x, GridD.getBox(ad1).y, GridD.getBox(ad1).z); 
-                key_tuple key_1(ibt1,dR1.x,dR1.y,dR1.z);
+                ModuleBase::Vector3<int> dR1(GridD.getBox(ad1).x, GridD.getBox(ad1).y, GridD.getBox(ad1).z);
                 if constexpr (std::is_same<TK, std::complex<double>>::value)
                 {
-                    if(this->nlm_save_k[iat].find(key_1) == this->nlm_save_k[iat].end()) 
-                    { 
+                    if (this->psialpha[0]->find_matrix(iat, ibt1, dR1.x, dR1.y, dR1.z) == nullptr)
+                    {
                         continue;
                     }
                 }
 
                 auto row_indexes = pv->get_indexes_row(ibt1);
                 const int row_size = row_indexes.size();
-                if(row_size == 0) 
-                { 
+                if (row_size == 0)
+                {
                     continue;
                 }
 
                 // no possible to unexist key
                 std::vector<double> s_1t(trace_alpha_size * row_size);
                 std::vector<double> g_1dmt(trace_alpha_size * row_size, 0.0);
-                for(int irow=0;irow<row_size;irow++)
+                for (int irow = 0; irow < row_size; irow++)
                 {
-                    const double* row_ptr;
-                    if constexpr (std::is_same<TK, double>::value)
-                    {
-                        row_ptr = this->nlm_save[iat][ad1][row_indexes[irow]][0].data();
-                    }
-                    else
-                    {
-                        row_ptr = this->nlm_save_k[iat][key_1][row_indexes[irow]][0].data();
-                    }
+                    hamilt::BaseMatrix<double>* row_ptr = this->psialpha[0]->find_matrix(iat, ibt1, dR1);
 
-                    for(int i=0;i<trace_alpha_size;i++)
+                    for (int i = 0; i < trace_alpha_size; i++)
                     {
-                        s_1t[i * row_size + irow] = row_ptr[trace_alpha_row[i]];
+                        s_1t[i * row_size + irow] = row_ptr->get_value(row_indexes[irow], trace_alpha_row[i]);
                     }
                 }
 
-				for (int ad2=0; ad2 < GridD.getAdjacentNum()+1 ; ad2++)
-				{
-					const int T2 = GridD.getType(ad2);
-					const int I2 = GridD.getNatom(ad2);
-                    const int ibt2 = ucell.itia2iat(T2,I2);
-					const ModuleBase::Vector3<double> tau2 = GridD.getAdjacentTau(ad2);
-					const Atom* atom2 = &ucell.atoms[T2];
-					const int nw2_tot = atom2->nw*PARAM.globalv.npol;
-					
-                    ModuleBase::Vector3<double> dR2(GridD.getBox(ad2).x, GridD.getBox(ad2).y, GridD.getBox(ad2).z);
-					key_tuple key_2(ibt2,dR2.x,dR2.y,dR2.z);
+                for (int ad2 = 0; ad2 < GridD.getAdjacentNum() + 1; ad2++)
+                {
+                    const int T2 = GridD.getType(ad2);
+                    const int I2 = GridD.getNatom(ad2);
+                    const int ibt2 = ucell.itia2iat(T2, I2);
+                    const ModuleBase::Vector3<double> tau2 = GridD.getAdjacentTau(ad2);
+                    const Atom* atom2 = &ucell.atoms[T2];
+                    const int nw2_tot = atom2->nw * PARAM.globalv.npol;
+
+                    ModuleBase::Vector3<int> dR2(GridD.getBox(ad2).x, GridD.getBox(ad2).y, GridD.getBox(ad2).z);
                     if constexpr (std::is_same<TK, std::complex<double>>::value)
                     {
-                        if (this->nlm_save_k[iat].find(key_2) == this->nlm_save_k[iat].end()) 
-                        { 
+                        if (this->psialpha[0]->find_matrix(iat, ibt2, dR2.x, dR2.y, dR2.z) == nullptr)
+                        {
                             continue;
                         }
                     }
 
-					const double Rcut_AO2 = orb.Phi[T2].getRcut();
-                	const double dist2 = (tau2-tau0).norm() * ucell.lat0;
+                    const double Rcut_AO2 = orb.Phi[T2].getRcut();
+                    const double dist2 = (tau2 - tau0).norm() * ucell.lat0;
 
-					if (dist2 >= Rcut_Alpha + Rcut_AO2)
-					{
-						continue;
-					}
+                    if (dist2 >= Rcut_Alpha + Rcut_AO2)
+                    {
+                        continue;
+                    }
 
                     auto col_indexes = pv->get_indexes_col(ibt2);
                     const int col_size = col_indexes.size();
-                    if(col_size == 0) 
-                    { 
+                    if (col_size == 0)
+                    {
                         continue;
                     }
 
                     std::vector<double> s_2t(trace_alpha_size * col_size);
                     // no possible to unexist key
-                    for(int icol=0;icol<col_size;icol++)
+                    for (int icol = 0; icol < col_size; icol++)
                     {
-                        const double* col_ptr;
-                        if constexpr (std::is_same<TK, double>::value)
-                        {
-                            col_ptr = this->nlm_save[iat][ad2][col_indexes[icol]][0].data();
-                        }
-                        else
-                        {
-                            col_ptr = this->nlm_save_k[iat][key_2][col_indexes[icol]][0].data();
-                        }
-                        for(int i=0;i<trace_alpha_size;i++)
+                        hamilt::BaseMatrix<double>* col_ptr = this->psialpha[0]->find_matrix(iat, ibt2, dR2);
+                        for (int i = 0; i < trace_alpha_size; i++)
                         {
-                            s_2t[i * col_size + icol] = col_ptr[trace_alpha_col[i]];
+                            s_2t[i * col_size + icol] = col_ptr->get_value(col_indexes[icol], trace_alpha_col[i]);
                         }
                     }
                     // prepare DM_gamma from DMR
-                    std::vector<double> dm_array(row_size*col_size, 0.0);
+                    std::vector<double> dm_array(row_size * col_size, 0.0);
                     const double* dm_current = nullptr;
-                    for(int is=0;is<dm->get_DMR_vector().size();is++)
+                    for (int is = 0; is < dm->get_DMR_vector().size(); is++)
                     {
                         int dRx, dRy, dRz;
                         if constexpr (std::is_same<TK, double>::value)
@@ -282,92 +263,99 @@ void LCAO_Deepks::cal_projected_DM(
                             dRz = dR2.z - dR1.z;
                         }
                         auto* tmp = dm->get_DMR_vector()[is]->find_matrix(ibt1, ibt2, dRx, dRy, dRz);
-                        if(tmp == nullptr)
+                        if (tmp == nullptr)
                         {
-                            // in case of no deepks_scf but out_deepks_label, size of DMR would mismatch with deepks-orbitals
-                            dm_current = nullptr; 
+                            // in case of no deepks_scf but out_deepks_label, size of DMR would mismatch with
+                            // deepks-orbitals
+                            dm_current = nullptr;
                             break;
                         }
                         dm_current = tmp->get_pointer();
-                        for(int idm=0;idm<row_size*col_size;idm++)
+                        for (int idm = 0; idm < row_size * col_size; idm++)
                         {
                             dm_array[idm] += dm_current[idm];
                         }
                     }
-                    if(dm_current == nullptr) 
-                    { 
-                        continue; //skip the long range DM pair more than nonlocal term
+                    if (dm_current == nullptr)
+                    {
+                        continue; // skip the long range DM pair more than nonlocal term
                     }
 
                     dm_current = dm_array.data();
-                    //dgemm for s_2t and dm_current to get g_1dmt
-                    constexpr char transa='T', transb='N';
+                    // dgemm for s_2t and dm_current to get g_1dmt
+                    constexpr char transa = 'T', transb = 'N';
                     const double gemm_alpha = 1.0, gemm_beta = 1.0;
-                    dgemm_(
-                        &transa, &transb, 
-                        &row_size, 
-                        &trace_alpha_size, 
-                        &col_size, 
-                        &gemm_alpha, 
-                        dm_current, 
-                        &col_size,
-                        s_2t.data(),  
-                        &col_size,     
-                        &gemm_beta,      
-                        g_1dmt.data(),    
-                        &row_size);
-				}//ad2
+                    dgemm_(&transa,
+                           &transb,
+                           &row_size,
+                           &trace_alpha_size,
+                           &col_size,
+                           &gemm_alpha,
+                           dm_current,
+                           &col_size,
+                           s_2t.data(),
+                           &col_size,
+                           &gemm_beta,
+                           g_1dmt.data(),
+                           &row_size);
+                } // ad2
                 // do dot of g_1dmt and s_1t to get orbital_pdm_shell
-                if(!PARAM.inp.deepks_equiv)
+                if (!PARAM.inp.deepks_equiv)
                 {
-                    int ib=0, index=0, inc=1;
-                    for (int L0 = 0; L0 <= orb.Alpha[0].getLmax();++L0)
+                    int ib = 0, index = 0, inc = 1;
+                    for (int L0 = 0; L0 <= orb.Alpha[0].getLmax(); ++L0)
                     {
-                        for (int N0 = 0;N0 < orb.Alpha[0].getNchi(L0);++N0)
+                        for (int N0 = 0; N0 < orb.Alpha[0].getNchi(L0); ++N0)
                         {
                             const int inl = this->inl_index[T0](I0, L0, N0);
-                            const int nm = 2*L0+1;
-                    
-                            for (int m1=0; m1<nm; ++m1) // m1 = 1 for s, 3 for p, 5 for d
+                            const int nm = 2 * L0 + 1;
+
+                            for (int m1 = 0; m1 < nm; ++m1) // m1 = 1 for s, 3 for p, 5 for d
                             {
-                                for (int m2=0; m2<nm; ++m2) // m1 = 1 for s, 3 for p, 5 for d
+                                for (int m2 = 0; m2 < nm; ++m2) // m1 = 1 for s, 3 for p, 5 for d
                                 {
-                                    int ind = m1*nm + m2;
-                                    pdm[inl][ind] += 
-                                        ddot_(&row_size, g_1dmt.data()+index*row_size, &inc, s_1t.data()+index*row_size, &inc);
+                                    int ind = m1 * nm + m2;
+                                    pdm[inl][ind] += ddot_(&row_size,
+                                                           g_1dmt.data() + index * row_size,
+                                                           &inc,
+                                                           s_1t.data() + index * row_size,
+                                                           &inc);
                                     index++;
                                 }
                             }
-                            ib+=nm;
+                            ib += nm;
                         }
                     }
                 }
                 else
                 {
-                    int index = 0, inc=1;
+                    int index = 0, inc = 1;
                     int nproj = 0;
-                    for(int il = 0; il < this->lmaxd + 1; il++)
+                    for (int il = 0; il < this->lmaxd + 1; il++)
                     {
                         nproj += (2 * il + 1) * orb.Alpha[0].getNchi(il);
                     }
-                    for(int iproj = 0; iproj < nproj; iproj ++)
+                    for (int iproj = 0; iproj < nproj; iproj++)
                     {
-                        for(int jproj = 0; jproj < nproj; jproj ++)
+                        for (int jproj = 0; jproj < nproj; jproj++)
                         {
-                            pdm[iat][iproj * nproj + jproj] += 
-                                ddot_(&row_size, g_1dmt.data()+index*row_size, &inc, s_1t.data()+index*row_size, &inc);
-                            index ++;
+                            pdm[iat][iproj * nproj + jproj] += ddot_(&row_size,
+                                                                     g_1dmt.data() + index * row_size,
+                                                                     &inc,
+                                                                     s_1t.data() + index * row_size,
+                                                                     &inc);
+                            index++;
                         }
                     }
                 }
-			}//ad1
-        }//I0
-    }//T0
+            } // ad1
+        }     // I0
+    }         // T0
 
 #ifdef __MPI
-    allsum_deepks(this->inlmax,pdm_size,this->pdm);
+    allsum_deepks(this->inlmax, pdm_size, this->pdm);
 #endif
-    ModuleBase::timer::tick("LCAO_Deepks","cal_projected_DM"); 
+    ModuleBase::timer::tick("LCAO_Deepks", "cal_projected_DM");
     return;
 }
 
@@ -377,10 +365,10 @@ void LCAO_Deepks::check_projected_dm()
     std::ofstream ofs(file_projdm.c_str());
 
     const int pdm_size = (this->lmaxd * 2 + 1) * (this->lmaxd * 2 + 1);
-    ofs<<std::setprecision(10);
-    for(int inl=0;inl<inlmax;inl++)
+    ofs << std::setprecision(10);
+    for (int inl = 0; inl < inlmax; inl++)
     {
-        for(int ind=0;ind<pdm_size;ind++)
+        for (int ind = 0; ind < pdm_size; ind++)
         {
             ofs << pdm[inl][ind] << " ";
         }
@@ -388,16 +376,15 @@ void LCAO_Deepks::check_projected_dm()
     }
 }
 
-template void LCAO_Deepks::cal_projected_DM<double>(
-    const elecstate::DensityMatrix<double, double>* dm, 
-    const UnitCell& ucell, 
-    const LCAO_Orbitals& orb, 
-    const Grid_Driver& GridD);
+template void LCAO_Deepks::cal_projected_DM<double>(const elecstate::DensityMatrix<double, double>* dm,
+                                                    const UnitCell& ucell,
+                                                    const LCAO_Orbitals& orb,
+                                                    const Grid_Driver& GridD);
 
 template void LCAO_Deepks::cal_projected_DM<std::complex<double>>(
-    const elecstate::DensityMatrix<std::complex<double>, double>* dm, 
-    const UnitCell& ucell, 
-    const LCAO_Orbitals& orb, 
+    const elecstate::DensityMatrix<std::complex<double>, double>* dm,
+    const UnitCell& ucell,
+    const LCAO_Orbitals& orb,
     const Grid_Driver& GridD);
 
 #endif
diff --git a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_psialpha.cpp b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_psialpha.cpp
index dcd8440eea..71704eba3a 100644
--- a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_psialpha.cpp
+++ b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_psialpha.cpp
@@ -13,7 +13,74 @@
 #include "module_base/vector3.h"
 #include "module_parameter/parameter.h"
 
-void LCAO_Deepks::build_psialpha(const bool& calc_deri,
+void LCAO_Deepks::allocate_psialpha(const bool& cal_deri,
+                                    const UnitCell& ucell,
+                                    const LCAO_Orbitals& orb,
+                                    const Grid_Driver& GridD)
+{
+    ModuleBase::TITLE("LCAO_Deepks", "allocate_psialpha");
+
+    this->psialpha.resize(cal_deri ? 4 : 1);
+
+    this->psialpha[0] = new hamilt::HContainer<double>(pv); // psialpha is always real
+    // Do not use fix_gamma, since it may find wrong matrix for gamma-only case in DeePKS
+
+    // cutoff for alpha is same for all types of atoms
+    const double Rcut_Alpha = orb.Alpha[0].getRcut();
+
+    // Total number of atomic basis of projected orbitals
+    int nw_alpha = 0;
+    for (int l = 0; l <= orb.Alpha[0].getLmax(); l++)
+    {
+        nw_alpha += orb.Alpha[0].getNchi(l) * (2 * l + 1);
+    }
+
+    for (int T0 = 0; T0 < ucell.ntype; T0++)
+    {
+        Atom* atom0 = &ucell.atoms[T0];
+        for (int I0 = 0; I0 < atom0->na; I0++)
+        {
+            const int iat = ucell.itia2iat(T0, I0);
+            auto tau_a = atom0->tau[I0];
+            GridD.Find_atom(ucell, tau_a, T0, I0);
+            for (int ad = 0; ad < GridD.getAdjacentNum() + 1; ++ad)
+            {
+                const int T1 = GridD.getType(ad);
+                const int I1 = GridD.getNatom(ad);
+                const int ibt = ucell.itia2iat(T1, I1);
+                auto tau_b = GridD.getAdjacentTau(ad);
+                const int iw1_start = ucell.itiaiw2iwt(T1, I1, 0);
+                const Atom* atom1 = &ucell.atoms[T1];
+                auto R_index = GridD.getBox(ad);
+
+                const double Rcut_AO1 = orb.Phi[T1].getRcut();
+                const double dist = (tau_b - tau_a).norm() * ucell.lat0;
+                if (dist > Rcut_Alpha + Rcut_AO1)
+                {
+                    continue;
+                }
+
+                hamilt::AtomPair<double> pair(iat, ibt, R_index, pv);
+                // Notice: in AtomPair, the usage is set_size(ncol, nrow)
+                pair.set_size(nw_alpha, atom1->nw * PARAM.globalv.npol);
+                this->psialpha[0]->insert_pair(pair);
+            }
+        }
+    }
+
+    this->psialpha[0]->allocate(nullptr, true);
+    // whether to calculate the derivative of psialpha
+    if (cal_deri)
+    {
+        for (int i = 1; i < 4; ++i)
+        {
+            this->psialpha[i] = new hamilt::HContainer<double>(*this->psialpha[0], nullptr); // copy constructor
+        }
+    }
+    return;
+}
+
+void LCAO_Deepks::build_psialpha(const bool& cal_deri,
                                  const UnitCell& ucell,
                                  const LCAO_Orbitals& orb,
                                  const Grid_Driver& GridD,
@@ -25,14 +92,12 @@ void LCAO_Deepks::build_psialpha(const bool& calc_deri,
     // cutoff for alpha is same for all types of atoms
     const double Rcut_Alpha = orb.Alpha[0].getRcut();
 
-    int job;
-    if (!calc_deri)
+    // Total number of atomic basis of projected orbitals
+    // nw_alpha will be used frequently, better to add a function in Numerical Orbital to get it
+    int nw_alpha = 0;
+    for (int l = 0; l <= orb.Alpha[0].getLmax(); l++)
     {
-        job = 0;
-    }
-    else
-    {
-        job = 1;
+        nw_alpha += orb.Alpha[0].getNchi(l) * (2 * l + 1);
     }
 
     for (int T0 = 0; T0 < ucell.ntype; T0++)
@@ -40,97 +105,98 @@ void LCAO_Deepks::build_psialpha(const bool& calc_deri,
         Atom* atom0 = &ucell.atoms[T0];
         for (int I0 = 0; I0 < atom0->na; I0++)
         {
-            // iat: atom index on which |alpha> is located
             const int iat = ucell.itia2iat(T0, I0);
-            const ModuleBase::Vector3<double> tau0 = atom0->tau[I0];
-            GridD.Find_atom(ucell, atom0->tau[I0], T0, I0);
-
-            if (PARAM.globalv.gamma_only_local)
-            {
-                this->nlm_save[iat].resize(GridD.getAdjacentNum() + 1);
-            }
-            // outermost loop : find all adjacent atoms
+            auto tau_a = atom0->tau[I0];
+            GridD.Find_atom(ucell, tau_a, T0, I0);
             for (int ad = 0; ad < GridD.getAdjacentNum() + 1; ++ad)
             {
                 const int T1 = GridD.getType(ad);
                 const int I1 = GridD.getNatom(ad);
                 const int ibt = ucell.itia2iat(T1, I1);
-                const double Rcut_AO1 = orb.Phi[T1].getRcut();
-
-                const ModuleBase::Vector3<double> tau1 = GridD.getAdjacentTau(ad);
+                auto tau_b = GridD.getAdjacentTau(ad);
+                const int iw1_start = ucell.itiaiw2iwt(T1, I1, 0);
                 const Atom* atom1 = &ucell.atoms[T1];
+                auto R_index = GridD.getBox(ad);
+                int R[3] = {R_index.x, R_index.y, R_index.z};
 
-                std::unordered_map<int, std::vector<std::vector<double>>> nlm_cur;
-                if (PARAM.globalv.gamma_only_local)
-                {
-                    this->nlm_save[iat][ad].clear();
-                }
-                else
+                const double Rcut_AO1 = orb.Phi[T1].getRcut();
+                const double dist = (tau_b - tau_a).norm() * ucell.lat0;
+                if (dist > Rcut_Alpha + Rcut_AO1)
                 {
-                    nlm_cur.clear();
+                    continue;
                 }
 
-                const double dist1 = (tau1 - tau0).norm() * ucell.lat0;
-
-                if (dist1 > Rcut_Alpha + Rcut_AO1)
+                double* data_pointer = this->psialpha[0]->data(iat, ibt, R);
+                std::vector<double*> grad_pointer(3);
+                if (cal_deri)
                 {
-                    continue;
+                    for (int i = 0; i < 3; ++i)
+                    {
+                        grad_pointer[i] = this->psialpha[i + 1]->data(iat, ibt, R);
+                    }
                 }
 
+                // Calculate psialpha
+                // Get all indexes of atomic basis on the neighbour atom in this core
+                // Notice that atom pair (a,b) and (b,a) are different when the former is changed to projected orbitals
+                // So we need both row and col indexes for complete calculation
                 auto all_indexes = pv->get_indexes_row(ibt);
                 auto col_indexes = pv->get_indexes_col(ibt);
-                // insert col_indexes into all_indexes to get universal set with no repeat elements
                 all_indexes.insert(all_indexes.end(), col_indexes.begin(), col_indexes.end());
                 std::sort(all_indexes.begin(), all_indexes.end());
-                all_indexes.erase(std::unique(all_indexes.begin(), all_indexes.end()), all_indexes.end());
+                all_indexes.erase(std::unique(all_indexes.begin(), all_indexes.end()), all_indexes.end()); // for unique
 
-                // middle loop : all atomic basis on the adjacent atom ad
+                // inner loop : all atomic basis on the adjacent atom ad
                 for (int iw1l = 0; iw1l < all_indexes.size(); iw1l += PARAM.globalv.npol)
                 {
                     const int iw1 = all_indexes[iw1l] / PARAM.globalv.npol;
+
                     std::vector<std::vector<double>> nlm;
-                    // 2D, dim 0 contains the overlap <psi|alpha>
+                    // 2D, dim 0 contains the overlap <psi_{iw1}|alpha_{all}>
                     // dim 1-3 contains the gradient of overlap
 
-                    int L1 = atom1->iw2l[iw1];
-                    int N1 = atom1->iw2n[iw1];
-                    int m1 = atom1->iw2m[iw1];
+                    const int L1 = atom1->iw2l[iw1];
+                    const int N1 = atom1->iw2n[iw1];
+                    const int m1 = atom1->iw2m[iw1];
 
                     // convert m (0,1,...2l) to M (-l, -l+1, ..., l-1, l)
-                    int M1 = (m1 % 2 == 0) ? -m1 / 2 : (m1 + 1) / 2;
+                    const int M1 = (m1 % 2 == 0) ? -m1 / 2 : (m1 + 1) / 2;
 
-                    ModuleBase::Vector3<double> dtau = ucell.atoms[T0].tau[I0] - tau1;
-                    overlap_orb_alpha.snap(T1, L1, N1, M1, 0, dtau * ucell.lat0, calc_deri, nlm);
+                    const ModuleBase::Vector3<double> dtau = tau_a - tau_b;
+                    const int T0_fixed = 0; // All the projected orbitals are the same, use 0 here
+                    overlap_orb_alpha.snap(T1, L1, N1, M1, T0_fixed, dtau * ucell.lat0, cal_deri, nlm);
 
-                    if (PARAM.globalv.gamma_only_local)
-                    {
-                        this->nlm_save[iat][ad].insert({all_indexes[iw1l], nlm});
-                    }
-                    else
+                    const int index_begin = all_indexes[iw1l] * nw_alpha * PARAM.globalv.npol;
+                    for (int iw0 = 0; iw0 < nw_alpha; iw0++)
                     {
-                        nlm_cur.insert({all_indexes[iw1l], nlm});
+                        data_pointer[index_begin + iw0] = nlm[0][iw0];
+                        if (cal_deri)
+                        {
+                            grad_pointer[0][index_begin + iw0] = nlm[1][iw0];
+                            grad_pointer[1][index_begin + iw0] = nlm[2][iw0];
+                            grad_pointer[2][index_begin + iw0] = nlm[3][iw0];
+                        }
                         if (PARAM.globalv.npol == 2)
-                            nlm_cur.insert({all_indexes[iw1l + 1], nlm});
+                        {
+                            data_pointer[index_begin + iw0 + nw_alpha] = nlm[0][iw0];
+                            if (cal_deri)
+                            {
+                                grad_pointer[0][index_begin + iw0 + nw_alpha] = nlm[1][iw0];
+                                grad_pointer[1][index_begin + iw0 + nw_alpha] = nlm[2][iw0];
+                                grad_pointer[2][index_begin + iw0 + nw_alpha] = nlm[3][iw0];
+                            }
+                        }
                     }
                 } // end iw
-
-                if (!PARAM.globalv.gamma_only_local)
-                {
-                    const int rx = GridD.getBox(ad).x;
-                    const int ry = GridD.getBox(ad).y;
-                    const int rz = GridD.getBox(ad).z;
-                    key_tuple key_1(ibt, rx, ry, rz);
-                    this->nlm_save_k[iat][key_1] = nlm_cur;
-                }
-            } // end ad
-        }     // end I0
-    }         // end T0
+            }
+        }
+    }
 
     ModuleBase::timer::tick("LCAO_Deepks", "build_psialpha");
     return;
 }
 
-void LCAO_Deepks::check_psialpha(const bool& calc_deri,
+void LCAO_Deepks::check_psialpha(const bool& cal_deri,
                                  const UnitCell& ucell,
                                  const LCAO_Orbitals& orb,
                                  const Grid_Driver& GridD)
@@ -140,14 +206,10 @@ void LCAO_Deepks::check_psialpha(const bool& calc_deri,
 
     const double Rcut_Alpha = orb.Alpha[0].getRcut();
     // same for all types of atoms
-    int job;
-    if (!calc_deri)
+    int nw_alpha = 0;
+    for (int l = 0; l <= orb.Alpha[0].getLmax(); l++)
     {
-        job = 0;
-    }
-    else
-    {
-        job = 1;
+        nw_alpha += orb.Alpha[0].getNchi(l) * (2 * l + 1);
     }
 
     std::ofstream ofs("psialpha.dat");
@@ -198,24 +260,33 @@ void LCAO_Deepks::check_psialpha(const bool& calc_deri,
                     continue;
                 }
 
-                int ibt, rx, ry, rz;
-                if (PARAM.globalv.gamma_only_local)
+                int ibt = ucell.itia2iat(T1, I1);
+                int R[3];
+
+                ofs << "ad : " << ad << " " << dist1 << std::endl;
+                ofs_x << "ad : " << ad << " " << dist1 << std::endl;
+                ofs_y << "ad : " << ad << " " << dist1 << std::endl;
+                ofs_z << "ad : " << ad << " " << dist1 << std::endl;
+
+                R[0] = GridD.getBox(ad).x;
+                R[1] = GridD.getBox(ad).y;
+                R[2] = GridD.getBox(ad).z;
+
+                if (!PARAM.globalv.gamma_only_local)
                 {
-                    ofs << "ad : " << ad << " " << dist1 << std::endl;
-                    ofs_x << "ad : " << ad << " " << dist1 << std::endl;
-                    ofs_y << "ad : " << ad << " " << dist1 << std::endl;
-                    ofs_z << "ad : " << ad << " " << dist1 << std::endl;
+                    ofs << "R : " << R[0] << " " << R[1] << " " << R[2] << std::endl;
+                    ofs_x << "R : " << R[0] << " " << R[1] << " " << R[2] << std::endl;
+                    ofs_y << "R : " << R[0] << " " << R[1] << " " << R[2] << std::endl;
+                    ofs_z << "R : " << R[0] << " " << R[1] << " " << R[2] << std::endl;
                 }
-                else
+
+                const double* data_pointer = this->psialpha[0]->data(iat, ibt, R);
+                std::vector<double*> grad_pointer(3, nullptr);
+                if (cal_deri)
                 {
-                    ibt = ucell.itia2iat(T1, I1);
-                    rx = GridD.getBox(ad).x;
-                    ry = GridD.getBox(ad).y;
-                    rz = GridD.getBox(ad).z;
-                    ofs << "key : " << ibt << " " << rx << " " << ry << " " << rz << std::endl;
-                    ofs_x << "key : " << ibt << " " << rx << " " << ry << " " << rz << std::endl;
-                    ofs_y << "key : " << ibt << " " << rx << " " << ry << " " << rz << std::endl;
-                    ofs_z << "key : " << ibt << " " << rx << " " << ry << " " << rz << std::endl;
+                    grad_pointer[0] = this->psialpha[1]->data(iat, ibt, R);
+                    grad_pointer[1] = this->psialpha[2]->data(iat, ibt, R);
+                    grad_pointer[2] = this->psialpha[3]->data(iat, ibt, R);
                 }
 
                 for (int iw1 = 0; iw1 < nw1_tot; ++iw1)
@@ -229,40 +300,30 @@ void LCAO_Deepks::check_psialpha(const bool& calc_deri,
                     const int iw2_local = pv->global2local_col(iw1_all);
                     if (iw1_local < 0 && iw2_local < 0)
                         continue;
-                    const int iw1_0 = iw1 / PARAM.globalv.npol;
-
-                    std::vector<std::vector<double>> nlm;
-
-                    if (PARAM.globalv.gamma_only_local)
-                    {
-                        nlm = this->nlm_save[iat][ad][iw1];
-                    }
-                    else
-                    {
-                        key_tuple key_1(ibt, rx, ry, rz);
-                        nlm = this->nlm_save_k[iat][key_1][iw1];
-                    }
 
-                    for (int ind = 0; ind < nlm[0].size(); ind++)
+                    for (int ind = 0; ind < nw_alpha; ind++)
                     {
-                        ofs << nlm[0][ind] << " ";
+                        ofs << data_pointer[iw1 * nw_alpha + ind] << " ";
+                        if (cal_deri)
+                        {
+                            ofs_x << grad_pointer[0][iw1 * nw_alpha + ind] << " ";
+                            ofs_y << grad_pointer[1][iw1 * nw_alpha + ind] << " ";
+                            ofs_z << grad_pointer[2][iw1 * nw_alpha + ind] << " ";
+                        }
+                        // 6 numbers per line
                         if (ind % 6 == 5)
-                            ofs << "\n";
-                        if (calc_deri)
                         {
-                            ofs_x << nlm[1][ind] << " ";
-                            if (ind % 6 == 5)
+                            ofs << "\n";
+                            if (cal_deri)
+                            {
                                 ofs_x << "\n";
-                            ofs_y << nlm[2][ind] << " ";
-                            if (ind % 6 == 5)
                                 ofs_y << "\n";
-                            ofs_z << nlm[3][ind] << " ";
-                            if (ind % 6 == 5)
                                 ofs_z << "\n";
+                            }
                         }
                     }
                     ofs << std::endl;
-                    if (calc_deri)
+                    if (cal_deri)
                     {
                         ofs_x << std::endl;
                         ofs_y << std::endl;
@@ -276,4 +337,5 @@ void LCAO_Deepks::check_psialpha(const bool& calc_deri,
     ModuleBase::timer::tick("LCAO_Deepks", "check_psialpha");
     return;
 }
+
 #endif
diff --git a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_torch.cpp b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_torch.cpp
index 5dbec1d76d..2e55216091 100644
--- a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_torch.cpp
+++ b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_torch.cpp
@@ -1,17 +1,17 @@
-//This file contains interfaces with libtorch,
-//including loading of model and calculating gradients
-//as well as subroutines that prints the results for checking
-
-//The file contains 8 subroutines:
-// cal_gvepsl : gvepsl is used for training with stress label, which is derivative of
-//       descriptors wrt strain tensor, calculated by
-//       d(des)/d\epsilon_{ab} = d(pdm)/d\epsilon_{ab} * d(des)/d(pdm) = gdm_epsl * gvdm
-//       using einsum
-// cal_gvdm : d(des)/d(pdm)
-//       calculated using torch::autograd::grad
-// load_model : loads model for applying V_delta
-// prepare_psialpha : prepare psialpha for outputting npy file
-// prepare_gevdm : prepare gevdm for outputting npy file
+// This file contains interfaces with libtorch,
+// including loading of model and calculating gradients
+// as well as subroutines that prints the results for checking
+
+// The file contains 8 subroutines:
+//  cal_gvepsl : gvepsl is used for training with stress label, which is derivative of
+//        descriptors wrt strain tensor, calculated by
+//        d(des)/d\epsilon_{ab} = d(pdm)/d\epsilon_{ab} * d(des)/d(pdm) = gdm_epsl * gvdm
+//        using einsum
+//  cal_gvdm : d(des)/d(pdm)
+//        calculated using torch::autograd::grad
+//  load_model : loads model for applying V_delta
+//  prepare_psialpha : prepare psialpha for outputting npy file
+//  prepare_gevdm : prepare gevdm for outputting npy file
 
 #ifdef __DEEPKS
 
@@ -25,48 +25,52 @@
 #include "module_parameter/parameter.h"
 
 // calculates stress of descriptors from gradient of projected density matrices
-void LCAO_Deepks::cal_gvepsl(const int nat) {
+void LCAO_Deepks::cal_gvepsl(const int nat)
+{
     ModuleBase::TITLE("LCAO_Deepks", "cal_gvepsl");
     // preconditions
     this->cal_gvdm(nat);
-    if (!gdmepsl_vector.empty()) {
+    if (!gdmepsl_vector.empty())
+    {
         gdmepsl_vector.erase(gdmepsl_vector.begin(), gdmepsl_vector.end());
     }
     // gdmr_vector : nat(derivative) * 3 * inl(projector) * nm * nm
-    if (GlobalV::MY_RANK == 0) {
+    if (GlobalV::MY_RANK == 0)
+    {
         // make gdmx as tensor
         int nlmax = this->inlmax / nat;
-        for (int nl = 0; nl < nlmax; ++nl) {
+        for (int nl = 0; nl < nlmax; ++nl)
+        {
             std::vector<torch::Tensor> bmmv;
             // for (int ipol=0;ipol<6;++ipol)
             //{
             //     std::vector<torch::Tensor> xmmv;
-            for (int i = 0; i < 6; ++i) {
+            for (int i = 0; i < 6; ++i)
+            {
                 std::vector<torch::Tensor> ammv;
-                for (int iat = 0; iat < nat; ++iat) {
+                for (int iat = 0; iat < nat; ++iat)
+                {
                     int inl = iat * nlmax + nl;
                     int nm = 2 * this->inl_l[inl] + 1;
                     std::vector<double> mmv;
-                    for (int m1 = 0; m1 < nm; ++m1) {
-                        for (int m2 = 0; m2 < nm; ++m2) {
+                    for (int m1 = 0; m1 < nm; ++m1)
+                    {
+                        for (int m2 = 0; m2 < nm; ++m2)
+                        {
                             mmv.push_back(this->gdm_epsl[i][inl][m1 * nm + m2]);
                         }
                     } // nm^2
                     torch::Tensor mm
-                        = torch::tensor(
-                              mmv,
-                              torch::TensorOptions().dtype(torch::kFloat64))
-                              .reshape({nm, nm}); // nm*nm
+                        = torch::tensor(mmv, torch::TensorOptions().dtype(torch::kFloat64)).reshape({nm, nm}); // nm*nm
                     ammv.push_back(mm);
-				}
-				torch::Tensor bmm = torch::stack(ammv, 0); // nat*nm*nm
-				bmmv.push_back(bmm);
-			}
+                }
+                torch::Tensor bmm = torch::stack(ammv, 0); // nat*nm*nm
+                bmmv.push_back(bmm);
+            }
             // torch::Tensor bmm = torch::stack(xmmv, 0);  //3*nat*nm*nm
             // bmmv.push_back(bmm);
             //}
-            this->gdmepsl_vector.push_back(
-                torch::stack(bmmv, 0)); // nbt*3*nat*nm*nm
+            this->gdmepsl_vector.push_back(torch::stack(bmmv, 0)); // nbt*3*nat*nm*nm
         }
         assert(this->gdmepsl_vector.size() == nlmax);
 
@@ -75,46 +79,45 @@ void LCAO_Deepks::cal_gvepsl(const int nat) {
         // gevdm_vector : a:inl * v:nm (descriptor) * m:nm (pdm, dim1) * n:nm
         // (pdm, dim2) gvepsl_vector : b:npol * a:inl(projector) *
         // m:nm(descriptor)
-		std::vector<torch::Tensor> gvepsl_vector;
-		for (int nl = 0; nl < nlmax; ++nl) {
-			gvepsl_vector.push_back(
-					at::einsum("bamn, avmn->bav",
-						{this->gdmepsl_vector[nl], this->gevdm_vector[nl]}));
-		}
-
-		// cat nv-> \sum_nl(nv) = \sum_nl(nm_nl)=des_per_atom
-		// concatenate index a(inl) and m(nm)
-		this->gvepsl_tensor = torch::cat(gvepsl_vector, -1);
-		assert(this->gvepsl_tensor.size(0) == 6);
-		assert(this->gvepsl_tensor.size(1) == nat);
-		assert(this->gvepsl_tensor.size(2) == this->des_per_atom);
-	}
-
-	return;
-}
+        std::vector<torch::Tensor> gvepsl_vector;
+        for (int nl = 0; nl < nlmax; ++nl)
+        {
+            gvepsl_vector.push_back(at::einsum("bamn, avmn->bav", {this->gdmepsl_vector[nl], this->gevdm_vector[nl]}));
+        }
 
+        // cat nv-> \sum_nl(nv) = \sum_nl(nm_nl)=des_per_atom
+        // concatenate index a(inl) and m(nm)
+        this->gvepsl_tensor = torch::cat(gvepsl_vector, -1);
+        assert(this->gvepsl_tensor.size(0) == 6);
+        assert(this->gvepsl_tensor.size(1) == nat);
+        assert(this->gvepsl_tensor.size(2) == this->des_per_atom);
+    }
+
+    return;
+}
 
 // dDescriptor / dprojected density matrix
-void LCAO_Deepks::cal_gvdm(const int nat) {
+void LCAO_Deepks::cal_gvdm(const int nat)
+{
     ModuleBase::TITLE("LCAO_Deepks", "cal_gvdm");
-    if (!gevdm_vector.empty()) {
+    if (!gevdm_vector.empty())
+    {
         gevdm_vector.erase(gevdm_vector.begin(), gevdm_vector.end());
     }
     // cal gevdm(d(EigenValue(D))/dD)
     int nlmax = inlmax / nat;
-    for (int nl = 0; nl < nlmax; ++nl) {
+    for (int nl = 0; nl < nlmax; ++nl)
+    {
         std::vector<torch::Tensor> avmmv;
-        for (int iat = 0; iat < nat; ++iat) {
+        for (int iat = 0; iat < nat; ++iat)
+        {
             int inl = iat * nlmax + nl;
             int nm = 2 * this->inl_l[inl] + 1;
             // repeat each block for nm times in an additional dimension
-            torch::Tensor tmp_x
-                = this->pdm_tensor[inl].reshape({nm, nm}).unsqueeze(0).repeat(
-                    {nm, 1, 1});
+            torch::Tensor tmp_x = this->pdm_tensor[inl].reshape({nm, nm}).unsqueeze(0).repeat({nm, 1, 1});
             // torch::Tensor tmp_y = std::get<0>(torch::symeig(tmp_x, true));
             torch::Tensor tmp_y = std::get<0>(torch::linalg::eigh(tmp_x, "U"));
-            torch::Tensor tmp_yshell
-                = torch::eye(nm, torch::TensorOptions().dtype(torch::kFloat64));
+            torch::Tensor tmp_yshell = torch::eye(nm, torch::TensorOptions().dtype(torch::kFloat64));
             std::vector<torch::Tensor> tmp_rpt; // repeated-pdm-tensor (x)
             std::vector<torch::Tensor> tmp_rdt; // repeated-d-tensor (y)
             std::vector<torch::Tensor> tmp_gst; // gvx-shell
@@ -122,13 +125,12 @@ void LCAO_Deepks::cal_gvdm(const int nat) {
             tmp_rdt.push_back(tmp_y);
             tmp_gst.push_back(tmp_yshell);
             std::vector<torch::Tensor> tmp_res;
-            tmp_res
-                = torch::autograd::grad(tmp_rdt,
-                                        tmp_rpt,
-                                        tmp_gst,
-                                        false,
-                                        false,
-                                        /*allow_unused*/ true); // nm(v)**nm*nm
+            tmp_res = torch::autograd::grad(tmp_rdt,
+                                            tmp_rpt,
+                                            tmp_gst,
+                                            false,
+                                            false,
+                                            /*allow_unused*/ true); // nm(v)**nm*nm
             avmmv.push_back(tmp_res[0]);
         }
         torch::Tensor avmm = torch::stack(avmmv, 0); // nat*nv**nm*nm
@@ -138,12 +140,15 @@ void LCAO_Deepks::cal_gvdm(const int nat) {
     return;
 }
 
-void LCAO_Deepks::load_model(const std::string& deepks_model) {
+void LCAO_Deepks::load_model(const std::string& deepks_model)
+{
     ModuleBase::TITLE("LCAO_Deepks", "load_model");
 
-    try {
+    try
+    {
         this->module = torch::jit::load(deepks_model);
-    } catch (const c10::Error& e)
+    }
+    catch (const c10::Error& e)
 
     {
         std::cerr << "error loading the model" << std::endl;
@@ -163,136 +168,127 @@ void LCAO_Deepks::prepare_psialpha(const int nlocal,
                                    const Grid_Driver& GridD)
 {
     ModuleBase::TITLE("LCAO_Deepks", "prepare_psialpha");
-    int nlmax = this->inlmax/nat;
-    int mmax = 2*this->lmaxd+1;
+    int nlmax = this->inlmax / nat;
+    int mmax = 2 * this->lmaxd + 1;
     if constexpr (std::is_same<TK, double>::value)
     {
-        this->psialpha_tensor = torch::zeros({ nat, nlmax, nks, nlocal, mmax }, torch::kFloat64); // support gamma-only
+        this->psialpha_tensor = torch::zeros({nat, nlmax, nks, nlocal, mmax}, torch::kFloat64); // support gamma-only
     }
     else
     {
-        this->psialpha_tensor = torch::zeros({ nat, nlmax, nks, nlocal, mmax }, torch::kComplexDouble); // support multi-k
+        this->psialpha_tensor = torch::zeros({nat, nlmax, nks, nlocal, mmax}, torch::kComplexDouble); // support multi-k
     }
 
-    //cutoff for alpha is same for all types of atoms
+    // cutoff for alpha is same for all types of atoms
     const double Rcut_Alpha = orb.Alpha[0].getRcut();
 
     for (int T0 = 0; T0 < ucell.ntype; T0++)
     {
-		Atom* atom0 = &ucell.atoms[T0]; 
-        for (int I0 =0; I0< atom0->na; I0++)
+        Atom* atom0 = &ucell.atoms[T0];
+        for (int I0 = 0; I0 < atom0->na; I0++)
         {
-            //iat: atom index on which |alpha> is located
-            const int iat = ucell.itia2iat(T0,I0);
-			const ModuleBase::Vector3<double> tau0 = atom0->tau[I0];
-            GridD.Find_atom(ucell, atom0->tau[I0] ,T0, I0);
+            // iat: atom index on which |alpha> is located
+            const int iat = ucell.itia2iat(T0, I0);
+            const ModuleBase::Vector3<double> tau0 = atom0->tau[I0];
+            GridD.Find_atom(ucell, atom0->tau[I0], T0, I0);
 
-            //outermost loop : find all adjacent atoms
-            for (int ad=0; ad<GridD.getAdjacentNum()+1 ; ++ad)
+            // outermost loop : find all adjacent atoms
+            for (int ad = 0; ad < GridD.getAdjacentNum() + 1; ++ad)
             {
                 const int T1 = GridD.getType(ad);
                 const int I1 = GridD.getNatom(ad);
                 const int ibt = ucell.itia2iat(T1, I1);
                 const int start1 = ucell.itiaiw2iwt(T1, I1, 0);
-				const double Rcut_AO1 = orb.Phi[T1].getRcut();
+                const double Rcut_AO1 = orb.Phi[T1].getRcut();
 
                 const ModuleBase::Vector3<double> tau1 = GridD.getAdjacentTau(ad);
-				const Atom* atom1 = &ucell.atoms[T1];
-				const int nw1_tot = atom1->nw*PARAM.globalv.npol;
+                const Atom* atom1 = &ucell.atoms[T1];
+                const int nw1_tot = atom1->nw * PARAM.globalv.npol;
 
-				const double dist1 = (tau1-tau0).norm() * ucell.lat0;
+                const double dist1 = (tau1 - tau0).norm() * ucell.lat0;
 
-				if (dist1 > Rcut_Alpha + Rcut_AO1)
-				{
-					continue;
-				}
-
-                ModuleBase::Vector3<double> dR(GridD.getBox(ad).x,
-                                               GridD.getBox(ad).y,
-                                               GridD.getBox(ad).z);
+                if (dist1 > Rcut_Alpha + Rcut_AO1)
+                {
+                    continue;
+                }
 
-                key_tuple key(ibt, dR.x, dR.y, dR.z);
+                ModuleBase::Vector3<int> dR(GridD.getBox(ad).x, GridD.getBox(ad).y, GridD.getBox(ad).z);
 
                 if constexpr (std::is_same<TK, std::complex<double>>::value)
                 {
-                    if (this->nlm_save_k[iat].find(key)
-                        == this->nlm_save_k[iat].end()) 
+                    if (this->psialpha[0]->find_matrix(iat, ibt, dR.x, dR.y, dR.z) == nullptr)
                     {
                         continue;
                     }
                 }
 
-                //middle loop : all atomic basis on the adjacent atom ad
-				for (int iw1=0; iw1<nw1_tot; ++iw1)
-				{
-					const int iw1_all = start1 + iw1;
-					const int iw1_local = pv->global2local_row(iw1_all);
-					const int iw2_local = pv->global2local_col(iw1_all);
-					if(iw1_local < 0 || iw2_local < 0) {continue;}
-					const int iw1_0 = iw1/PARAM.globalv.npol;
-					std::vector<double> nlm;
-                    if constexpr (std::is_same<TK, double>::value)
-                    {
-                        nlm = this->nlm_save[iat][ad][iw1][0];
-                    }
-                    else
+                // middle loop : all atomic basis on the adjacent atom ad
+                for (int iw1 = 0; iw1 < nw1_tot; ++iw1)
+                {
+                    const int iw1_all = start1 + iw1;
+                    const int iw1_local = pv->global2local_row(iw1_all);
+                    const int iw2_local = pv->global2local_col(iw1_all);
+                    if (iw1_local < 0 || iw2_local < 0)
                     {
-                        nlm = this->nlm_save_k[iat][key][iw1][0];
+                        continue;
                     }
+                    hamilt::BaseMatrix<double>* overlap = psialpha[0]->find_matrix(iat, ibt, dR);
 
-                    for (int ik = 0; ik <nks; ik++)
+                    for (int ik = 0; ik < nks; ik++)
                     {
                         std::complex<double> kphase = std::complex<double>(1.0, 0.0);
                         if constexpr (std::is_same<TK, std::complex<double>>::value)
                         {
-                            const double arg = - (kvec_d[ik] * dR) * ModuleBase::TWO_PI;
+                            const double arg = -(kvec_d[ik] * ModuleBase::Vector3<double>(dR)) * ModuleBase::TWO_PI;
                             double sinp, cosp;
                             ModuleBase::libm::sincos(arg, &sinp, &cosp);
                             kphase = std::complex<double>(cosp, sinp);
                         }
-                        int ib=0;
-                        int nl=0;
-                        for (int L0 = 0; L0 <= orb.Alpha[0].getLmax();++L0)
+                        int ib = 0;
+                        int nl = 0;
+                        for (int L0 = 0; L0 <= orb.Alpha[0].getLmax(); ++L0)
                         {
-                            for (int N0 = 0;N0 < orb.Alpha[0].getNchi(L0);++N0)
+                            for (int N0 = 0; N0 < orb.Alpha[0].getNchi(L0); ++N0)
                             {
-                                const int nm = 2*L0+1;
-                                
-                                for (int m1=0; m1<nm; ++m1) // nm = 1 for s, 3 for p, 5 for d
-                                {   
+                                const int nm = 2 * L0 + 1;
+
+                                for (int m1 = 0; m1 < nm; ++m1) // nm = 1 for s, 3 for p, 5 for d
+                                {
                                     if constexpr (std::is_same<TK, double>::value)
                                     {
-                                        this->psialpha_tensor[iat][nl][ik][iw1_all][m1] = nlm[ib+m1];
+                                        this->psialpha_tensor[iat][nl][ik][iw1_all][m1]
+                                            = overlap->get_value(iw1, ib + m1);
                                     }
                                     else
                                     {
-                                        std::complex<double> nlm_phase = nlm[ib + m1] * kphase;
-                                        torch::Tensor nlm_tensor = torch::tensor({nlm_phase.real(), nlm_phase.imag()}, torch::kDouble);
+                                        std::complex<double> nlm_phase = overlap->get_value(iw1, ib + m1) * kphase;
+                                        torch::Tensor nlm_tensor
+                                            = torch::tensor({nlm_phase.real(), nlm_phase.imag()}, torch::kDouble);
                                         torch::Tensor complex_tensor = torch::complex(nlm_tensor[0], nlm_tensor[1]);
                                         this->psialpha_tensor[iat][nl][ik][iw1_all][m1] = complex_tensor;
                                     }
                                 }
-                                ib+=nm;
+                                ib += nm;
                                 nl++;
                             }
-                        }  
-                    }//end ik
-				}//end iw
-			}//end ad
-		}//end I0
-	}//end T0
+                        }
+                    } // end ik
+                }     // end iw
+            }         // end ad
+        }             // end I0
+    }                 // end T0
 
 #ifdef __MPI
     TK msg[mmax];
-    for(int iat=0; iat< nat ; iat++)
+    for (int iat = 0; iat < nat; iat++)
     {
-        for(int nl = 0; nl < nlmax; nl++)
+        for (int nl = 0; nl < nlmax; nl++)
         {
-            for(int ik = 0; ik < nks; ik++)
+            for (int ik = 0; ik < nks; ik++)
             {
-                for(int mu = 0; mu < nlocal ; mu++)
+                for (int mu = 0; mu < nlocal; mu++)
                 {
-                    for(int m=0;m<mmax;m++) 
+                    for (int m = 0; m < mmax; m++)
                     {
                         if constexpr (std::is_same<TK, double>::value)
                         {
@@ -301,11 +297,12 @@ void LCAO_Deepks::prepare_psialpha(const int nlocal,
                         else
                         {
                             auto tensor_value = this->psialpha_tensor.index({iat, nl, ik, mu, m});
-                            msg[m] = std::complex<double>(torch::real(tensor_value).item<double>(), torch::imag(tensor_value).item<double>());
+                            msg[m] = std::complex<double>(torch::real(tensor_value).item<double>(),
+                                                          torch::imag(tensor_value).item<double>());
                         }
                     }
-                    Parallel_Reduce::reduce_all(msg,mmax);
-                    for(int m=0;m<mmax;m++) 
+                    Parallel_Reduce::reduce_all(msg, mmax);
+                    for (int m = 0; m < mmax; m++)
                     {
                         if constexpr (std::is_same<TK, double>::value)
                         {
@@ -323,7 +320,7 @@ void LCAO_Deepks::prepare_psialpha(const int nlocal,
         }
     }
 
-#endif  
+#endif
 }
 
 template <typename TK>
@@ -331,30 +328,32 @@ void LCAO_Deepks::check_vdp_psialpha(const int nat, const int nks, const int nlo
 {
     std::ofstream ofs("vdp_psialpha.dat");
     ofs << std::setprecision(10);
-    
-    int nlmax = this->inlmax/nat;
-    int mmax = 2*this->lmaxd+1;
-    for(int iat=0; iat< nat ; iat++) 
+
+    int nlmax = this->inlmax / nat;
+    int mmax = 2 * this->lmaxd + 1;
+    for (int iat = 0; iat < nat; iat++)
     {
-        for(int nl = 0; nl < nlmax; nl++)
+        for (int nl = 0; nl < nlmax; nl++)
         {
-            for (int iks = 0; iks < nks ; iks++)
+            for (int iks = 0; iks < nks; iks++)
             {
-                for(int mu = 0; mu < nlocal ; mu++)
+                for (int mu = 0; mu < nlocal; mu++)
                 {
-                    for(int m=0; m< mmax; m++)
+                    for (int m = 0; m < mmax; m++)
                     {
                         if constexpr (std::is_same<TK, double>::value)
                         {
-                            ofs << this->psialpha_tensor.index({ iat, nl, iks, mu, m }).item().toDouble() << " ";
+                            ofs << this->psialpha_tensor.index({iat, nl, iks, mu, m}).item().toDouble() << " ";
                         }
                         else
                         {
                             auto tensor_value = this->psialpha_tensor.index({iat, nl, iks, mu, m});
-                            ofs << std::complex<double>(torch::real(tensor_value).item<double>(), torch::imag(tensor_value).item<double>()) << " ";
+                            ofs << std::complex<double>(torch::real(tensor_value).item<double>(),
+                                                        torch::imag(tensor_value).item<double>())
+                                << " ";
                         }
                     }
-                }                
+                }
             }
             ofs << std::endl;
         }
@@ -362,29 +361,27 @@ void LCAO_Deepks::check_vdp_psialpha(const int nat, const int nks, const int nlo
     ofs.close();
 }
 
-void LCAO_Deepks::prepare_gevdm(
-    const int nat,
-    const LCAO_Orbitals &orb)
+void LCAO_Deepks::prepare_gevdm(const int nat, const LCAO_Orbitals& orb)
 {
-    int nlmax = this->inlmax/nat;
-    int mmax = 2*this->lmaxd+1;
-    this->gevdm_tensor = torch::zeros({ nat, nlmax, mmax, mmax, mmax}, torch::TensorOptions().dtype(torch::kFloat64));
+    int nlmax = this->inlmax / nat;
+    int mmax = 2 * this->lmaxd + 1;
+    this->gevdm_tensor = torch::zeros({nat, nlmax, mmax, mmax, mmax}, torch::TensorOptions().dtype(torch::kFloat64));
 
     this->cal_gvdm(nat);
 
-    int nl=0;
-    for (int L0 = 0; L0 <= orb.Alpha[0].getLmax();++L0)
+    int nl = 0;
+    for (int L0 = 0; L0 <= orb.Alpha[0].getLmax(); ++L0)
     {
-        for (int N0 = 0;N0 < orb.Alpha[0].getNchi(L0);++N0)
+        for (int N0 = 0; N0 < orb.Alpha[0].getNchi(L0); ++N0)
         {
             for (int iat = 0; iat < nat; iat++)
             {
-                const int nm = 2*L0+1;            
-                for (int v=0; v<nm; ++v) // nm = 1 for s, 3 for p, 5 for d
+                const int nm = 2 * L0 + 1;
+                for (int v = 0; v < nm; ++v) // nm = 1 for s, 3 for p, 5 for d
                 {
-                    for (int m=0; m<nm; ++m)
+                    for (int m = 0; m < nm; ++m)
                     {
-                        for (int n=0; n<nm;++n)
+                        for (int n = 0; n < nm; ++n)
                         {
                             this->gevdm_tensor[iat][nl][v][m][n] = this->gevdm_vector[nl][iat][v][m][n];
                         }
@@ -402,19 +399,19 @@ void LCAO_Deepks::check_vdp_gevdm(const int nat)
     std::ofstream ofs("vdp_gevdm.dat");
     ofs << std::setprecision(10);
 
-    int nlmax = this->inlmax/nat;
-    int mmax = 2*this->lmaxd+1;
-    for(int iat=0; iat< nat ; iat++) 
+    int nlmax = this->inlmax / nat;
+    int mmax = 2 * this->lmaxd + 1;
+    for (int iat = 0; iat < nat; iat++)
     {
-        for(int nl = 0; nl < nlmax; nl++)
+        for (int nl = 0; nl < nlmax; nl++)
         {
-            for(int v=0; v<mmax; v++)
+            for (int v = 0; v < mmax; v++)
             {
-                for(int m=0; m<mmax; m++)
+                for (int m = 0; m < mmax; m++)
                 {
-                    for(int n=0; n<mmax; n++)
+                    for (int n = 0; n < mmax; n++)
                     {
-                        ofs << this->gevdm_tensor.index({ iat,nl, v, m, n }).item().toDouble() << " ";
+                        ofs << this->gevdm_tensor.index({iat, nl, v, m, n}).item().toDouble() << " ";
                     }
                 }
             }
diff --git a/source/module_hamilt_lcao/module_deepks/cal_gdmx.cpp b/source/module_hamilt_lcao/module_deepks/cal_gdmx.cpp
index b83e40f2ec..ba5f7528cf 100644
--- a/source/module_hamilt_lcao/module_deepks/cal_gdmx.cpp
+++ b/source/module_hamilt_lcao/module_deepks/cal_gdmx.cpp
@@ -1,23 +1,23 @@
 #ifdef __DEEPKS
 
-#include "module_parameter/parameter.h"
 #include "LCAO_deepks.h"
-#include "module_base/vector3.h"
-#include "module_base/timer.h"
 #include "module_base/constants.h"
-#include "module_hamilt_lcao/module_hcontainer/atom_pair.h"
 #include "module_base/libm/libm.h"
+#include "module_base/timer.h"
+#include "module_base/vector3.h"
+#include "module_hamilt_lcao/module_hcontainer/atom_pair.h"
+#include "module_parameter/parameter.h"
 
 /// this subroutine calculates the gradient of projected density matrices
 /// gdmx_m,m = d/dX sum_{mu,nu} rho_{mu,nu} <chi_mu|alpha_m><alpha_m'|chi_nu>
-/// if stress label is enabled, the gradient of PDM wrt strain tensor will 
-/// be calculated: 
-/// gdm_epsl = d/d\epsilon_{ab} * 
+/// if stress label is enabled, the gradient of PDM wrt strain tensor will
+/// be calculated:
+/// gdm_epsl = d/d\epsilon_{ab} *
 ///           sum_{mu,nu} rho_{mu,nu} <chi_mu|alpha_m><alpha_m'|chi_nu>
 
-//There are 2 subroutines in this file:
-//1. cal_gdmx, calculating gdmx (and optionally gdm_epsl for stress) for gamma point
-//2. check_gdmx, which prints gdmx to a series of .dat files
+// There are 2 subroutines in this file:
+// 1. cal_gdmx, calculating gdmx (and optionally gdm_epsl for stress) for gamma point
+// 2. check_gdmx, which prints gdmx to a series of .dat files
 
 template <typename TK>
 void LCAO_Deepks::cal_gdmx(const std::vector<std::vector<TK>>& dm,
@@ -26,17 +26,18 @@ void LCAO_Deepks::cal_gdmx(const std::vector<std::vector<TK>>& dm,
                            const Grid_Driver& GridD,
                            const int nks,
                            const std::vector<ModuleBase::Vector3<double>>& kvec_d,
+                           std::vector<hamilt::HContainer<double>*> psialpha,
                            const bool isstress)
 {
     ModuleBase::TITLE("LCAO_Deepks", "cal_gdmx");
-    ModuleBase::timer::tick("LCAO_Deepks","cal_gdmx");
-    //get DS_alpha_mu and S_nu_beta
+    ModuleBase::timer::tick("LCAO_Deepks", "cal_gdmx");
+    // get DS_alpha_mu and S_nu_beta
 
     int size = (2 * lmaxd + 1) * (2 * lmaxd + 1);
     int nrow = this->pv->nrow;
-    for (int iat = 0;iat < ucell.nat;iat++)
+    for (int iat = 0; iat < ucell.nat; iat++)
     {
-        for (int inl = 0;inl < inlmax;inl++)
+        for (int inl = 0; inl < inlmax; inl++)
         {
             ModuleBase::GlobalFunc::ZEROS(gdmx[iat][inl], size);
             ModuleBase::GlobalFunc::ZEROS(gdmy[iat][inl], size);
@@ -46,9 +47,9 @@ void LCAO_Deepks::cal_gdmx(const std::vector<std::vector<TK>>& dm,
 
     if (isstress)
     {
-        for (int ipol = 0;ipol < 6;ipol++)
+        for (int ipol = 0; ipol < 6; ipol++)
         {
-            for (int inl = 0;inl < inlmax;inl++)
+            for (int inl = 0; inl < inlmax; inl++)
             {
                 ModuleBase::GlobalFunc::ZEROS(gdm_epsl[ipol][inl], size);
             }
@@ -59,65 +60,69 @@ void LCAO_Deepks::cal_gdmx(const std::vector<std::vector<TK>>& dm,
 
     for (int T0 = 0; T0 < ucell.ntype; T0++)
     {
-		Atom* atom0 = &ucell.atoms[T0]; 
-        for (int I0 =0; I0< atom0->na; I0++)
+        Atom* atom0 = &ucell.atoms[T0];
+        for (int I0 = 0; I0 < atom0->na; I0++)
         {
-            const int iat = ucell.itia2iat(T0,I0);//on which alpha is located
+            const int iat = ucell.itia2iat(T0, I0); // on which alpha is located
             const ModuleBase::Vector3<double> tau0 = atom0->tau[I0];
-            GridD.Find_atom(ucell, atom0->tau[I0] ,T0, I0);
+            GridD.Find_atom(ucell, atom0->tau[I0], T0, I0);
 
-            for (int ad1=0; ad1<GridD.getAdjacentNum()+1 ; ++ad1)
+            for (int ad1 = 0; ad1 < GridD.getAdjacentNum() + 1; ++ad1)
             {
                 const int T1 = GridD.getType(ad1);
                 const int I1 = GridD.getNatom(ad1);
-                const int ibt1 = ucell.itia2iat(T1,I1); //on which chi_mu is located
+                const int ibt1 = ucell.itia2iat(T1, I1); // on which chi_mu is located
                 const int start1 = ucell.itiaiw2iwt(T1, I1, 0);
-                
+
                 const ModuleBase::Vector3<double> tau1 = GridD.getAdjacentTau(ad1);
-				const Atom* atom1 = &ucell.atoms[T1];
-				const int nw1_tot = atom1->nw*PARAM.globalv.npol;
-				const double Rcut_AO1 = orb.Phi[T1].getRcut(); 
-
-                ModuleBase::Vector3<double> dR1(GridD.getBox(ad1).x, GridD.getBox(ad1).y, GridD.getBox(ad1).z); 
-
-				for (int ad2=0; ad2 < GridD.getAdjacentNum()+1 ; ad2++)
-				{
-					const int T2 = GridD.getType(ad2);
-					const int I2 = GridD.getNatom(ad2);
-					const int start2 = ucell.itiaiw2iwt(T2, I2, 0);
-                    const int ibt2 = ucell.itia2iat(T2,I2);
-					const ModuleBase::Vector3<double> tau2 = GridD.getAdjacentTau(ad2);
-					const Atom* atom2 = &ucell.atoms[T2];
-					const int nw2_tot = atom2->nw*PARAM.globalv.npol;
-					ModuleBase::Vector3<double> dR2(GridD.getBox(ad2).x, GridD.getBox(ad2).y, GridD.getBox(ad2).z);
-					
-					const double Rcut_AO2 = orb.Phi[T2].getRcut();
-                	const double dist1 = (tau1-tau0).norm() * ucell.lat0;
-                	const double dist2 = (tau2-tau0).norm() * ucell.lat0;
-
-					if (dist1 > Rcut_Alpha + Rcut_AO1
-							|| dist2 > Rcut_Alpha + Rcut_AO2)
-					{
-						continue;
-					}
+                const Atom* atom1 = &ucell.atoms[T1];
+                const int nw1_tot = atom1->nw * PARAM.globalv.npol;
+                const double Rcut_AO1 = orb.Phi[T1].getRcut();
+
+                ModuleBase::Vector3<int> dR1(GridD.getBox(ad1).x, GridD.getBox(ad1).y, GridD.getBox(ad1).z);
+
+                for (int ad2 = 0; ad2 < GridD.getAdjacentNum() + 1; ad2++)
+                {
+                    const int T2 = GridD.getType(ad2);
+                    const int I2 = GridD.getNatom(ad2);
+                    const int start2 = ucell.itiaiw2iwt(T2, I2, 0);
+                    const int ibt2 = ucell.itia2iat(T2, I2);
+                    const ModuleBase::Vector3<double> tau2 = GridD.getAdjacentTau(ad2);
+                    const Atom* atom2 = &ucell.atoms[T2];
+                    const int nw2_tot = atom2->nw * PARAM.globalv.npol;
+                    ModuleBase::Vector3<int> dR2(GridD.getBox(ad2).x, GridD.getBox(ad2).y, GridD.getBox(ad2).z);
+
+                    const double Rcut_AO2 = orb.Phi[T2].getRcut();
+                    const double dist1 = (tau1 - tau0).norm() * ucell.lat0;
+                    const double dist2 = (tau2 - tau0).norm() * ucell.lat0;
+
+                    if (dist1 > Rcut_Alpha + Rcut_AO1 || dist2 > Rcut_Alpha + Rcut_AO2)
+                    {
+                        continue;
+                    }
 
                     double r0[3];
                     double r1[3];
-                    if(isstress)
+                    if (isstress)
                     {
-                        r1[0] = ( tau1.x - tau0.x) ;
-                        r1[1] = ( tau1.y - tau0.y) ;
-                        r1[2] = ( tau1.z - tau0.z) ;
-                        r0[0] = ( tau2.x - tau0.x) ;
-                        r0[1] = ( tau2.y - tau0.y) ;
-                        r0[2] = ( tau2.z - tau0.z) ;
+                        r1[0] = (tau1.x - tau0.x);
+                        r1[1] = (tau1.y - tau0.y);
+                        r1[2] = (tau1.z - tau0.z);
+                        r0[0] = (tau2.x - tau0.x);
+                        r0[1] = (tau2.y - tau0.y);
+                        r0[2] = (tau2.z - tau0.z);
                     }
                     auto row_indexes = pv->get_indexes_row(ibt1);
                     auto col_indexes = pv->get_indexes_col(ibt2);
-                    if(row_indexes.size() * col_indexes.size() == 0) continue;
-                    
+                    if (row_indexes.size() * col_indexes.size() == 0)
+                    {
+                        continue;
+                    }
+
                     double* dm_current;
-                    int dRx, dRy, dRz;
+                    int dRx;
+                    int dRy;
+                    int dRz;
                     if constexpr (std::is_same<TK, double>::value)
                     {
                         dRx = 0;
@@ -126,13 +131,15 @@ void LCAO_Deepks::cal_gdmx(const std::vector<std::vector<TK>>& dm,
                     }
                     else
                     {
-                        dRx = (dR2-dR1).x;
-                        dRy = (dR2-dR1).y;
-                        dRz = (dR2-dR1).z;
+                        dRx = (dR2 - dR1).x;
+                        dRy = (dR2 - dR1).y;
+                        dRz = (dR2 - dR1).z;
                     }
+                    ModuleBase::Vector3<double> dR(dRx, dRy, dRz);
+
                     hamilt::AtomPair<double> dm_pair(ibt1, ibt2, dRx, dRy, dRz, pv);
                     dm_pair.allocate(nullptr, 1);
-                    for(int ik=0;ik<nks;ik++)
+                    for (int ik = 0; ik < nks; ik++)
                     {
                         TK kphase;
                         if constexpr (std::is_same<TK, double>::value)
@@ -141,12 +148,12 @@ void LCAO_Deepks::cal_gdmx(const std::vector<std::vector<TK>>& dm,
                         }
                         else
                         {
-                            const double arg = - (kvec_d[ik] * (dR2-dR1) ) * ModuleBase::TWO_PI;
+                            const double arg = -(kvec_d[ik] * dR) * ModuleBase::TWO_PI;
                             double sinp, cosp;
                             ModuleBase::libm::sincos(arg, &sinp, &cosp);
                             kphase = TK(cosp, sinp);
                         }
-                        if(ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER(PARAM.inp.ks_solver))
+                        if (ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER(PARAM.inp.ks_solver))
                         {
                             dm_pair.add_from_matrix(dm[ik].data(), pv->get_row_size(), kphase, 1);
                         }
@@ -155,146 +162,163 @@ void LCAO_Deepks::cal_gdmx(const std::vector<std::vector<TK>>& dm,
                             dm_pair.add_from_matrix(dm[ik].data(), pv->get_col_size(), kphase, 0);
                         }
                     }
-                    
+
                     dm_current = dm_pair.get_pointer();
 
-                    key_tuple key_1(ibt1,dR1.x,dR1.y,dR1.z);
-                    key_tuple key_2(ibt2,dR2.x,dR2.y,dR2.z);
-					for (int iw1=0; iw1<row_indexes.size(); ++iw1)
+                    for (int iw1 = 0; iw1 < row_indexes.size(); ++iw1)
                     {
-                        for (int iw2=0; iw2<col_indexes.size(); ++iw2)
+                        for (int iw2 = 0; iw2 < col_indexes.size(); ++iw2)
                         {
-                            std::vector<double> nlm1;
-                            std::vector<std::vector<double>> nlm2;
+                            hamilt::BaseMatrix<double>* overlap_1 = psialpha[0]->find_matrix(iat, ibt1, dR1);
+                            hamilt::BaseMatrix<double>* overlap_2 = psialpha[0]->find_matrix(iat, ibt2, dR2);
+                            std::vector<hamilt::BaseMatrix<double>*> grad_overlap_1(3);
+                            std::vector<hamilt::BaseMatrix<double>*> grad_overlap_2(3);
 
-                            if constexpr (std::is_same<TK, double>::value)
-                            {
-                                nlm1 = this->nlm_save[iat][ad1][row_indexes[iw1]][0];
-                                nlm2 = this->nlm_save[iat][ad2][col_indexes[iw2]];
-                            }
-                            else
+                            assert(overlap_1->get_col_size() == overlap_2->get_col_size());
+
+                            for (int i = 0; i < 3; ++i)
                             {
-                                nlm1 = this->nlm_save_k[iat][key_1][row_indexes[iw1]][0];
-                                nlm2 = this->nlm_save_k[iat][key_2][col_indexes[iw2]];
+                                grad_overlap_1[i] = psialpha[i + 1]->find_matrix(iat, ibt1, dR1);
+                                grad_overlap_2[i] = psialpha[i + 1]->find_matrix(iat, ibt2, dR2);
                             }
 
-                            assert(nlm1.size()==nlm2[0].size());
-
-                            int ib=0;
-                            for (int L0 = 0; L0 <= orb.Alpha[0].getLmax();++L0)
+                            int ib = 0;
+                            for (int L0 = 0; L0 <= orb.Alpha[0].getLmax(); ++L0)
                             {
-                                for (int N0 = 0;N0 < orb.Alpha[0].getNchi(L0);++N0)
+                                for (int N0 = 0; N0 < orb.Alpha[0].getNchi(L0); ++N0)
                                 {
                                     const int inl = this->inl_index[T0](I0, L0, N0);
-                                    const int nm = 2*L0+1;
-                                    for (int m1 = 0;m1 < nm;++m1)
+                                    const int nm = 2 * L0 + 1;
+                                    for (int m1 = 0; m1 < nm; ++m1)
                                     {
-                                        for (int m2 = 0; m2 <nm; ++m2)
+                                        for (int m2 = 0; m2 < nm; ++m2)
                                         {
                                             //(<d/dX chi_mu|alpha_m>)<chi_nu|alpha_m'>
-                                            gdmx[iat][inl][m1*nm+m2] += nlm2[1][ib+m2] * nlm1[ib+m1] * *dm_current; 
-                                            gdmy[iat][inl][m1*nm+m2] += nlm2[2][ib+m2] * nlm1[ib+m1] * *dm_current;
-                                            gdmz[iat][inl][m1*nm+m2] += nlm2[3][ib+m2] * nlm1[ib+m1] * *dm_current;
+                                            gdmx[iat][inl][m1 * nm + m2]
+                                                += grad_overlap_2[0]->get_value(col_indexes[iw2], ib + m2)
+                                                   * overlap_1->get_value(row_indexes[iw1], ib + m1) * *dm_current;
+                                            gdmy[iat][inl][m1 * nm + m2]
+                                                += grad_overlap_2[1]->get_value(col_indexes[iw2], ib + m2)
+                                                   * overlap_1->get_value(row_indexes[iw1], ib + m1) * *dm_current;
+                                            gdmz[iat][inl][m1 * nm + m2]
+                                                += grad_overlap_2[2]->get_value(col_indexes[iw2], ib + m2)
+                                                   * overlap_1->get_value(row_indexes[iw1], ib + m1) * *dm_current;
 
                                             //(<d/dX chi_nu|alpha_m'>)<chi_mu|alpha_m>
-                                            gdmx[iat][inl][m2*nm+m1] += nlm2[1][ib+m2] * nlm1[ib+m1] * *dm_current;
-                                            gdmy[iat][inl][m2*nm+m1] += nlm2[2][ib+m2] * nlm1[ib+m1] * *dm_current;
-                                            gdmz[iat][inl][m2*nm+m1] += nlm2[3][ib+m2] * nlm1[ib+m1] * *dm_current;
+                                            gdmx[iat][inl][m2 * nm + m1]
+                                                += grad_overlap_2[0]->get_value(col_indexes[iw2], ib + m2)
+                                                   * overlap_1->get_value(row_indexes[iw1], ib + m1) * *dm_current;
+                                            gdmy[iat][inl][m2 * nm + m1]
+                                                += grad_overlap_2[1]->get_value(col_indexes[iw2], ib + m2)
+                                                   * overlap_1->get_value(row_indexes[iw1], ib + m1) * *dm_current;
+                                            gdmz[iat][inl][m2 * nm + m1]
+                                                += grad_overlap_2[2]->get_value(col_indexes[iw2], ib + m2)
+                                                   * overlap_1->get_value(row_indexes[iw1], ib + m1) * *dm_current;
 
-                                            //(<chi_mu|d/dX alpha_m>)<chi_nu|alpha_m'> = -(<d/dX chi_mu|alpha_m>)<chi_nu|alpha_m'>
-                                            gdmx[ibt2][inl][m1*nm+m2] -= nlm2[1][ib+m2] * nlm1[ib+m1] * *dm_current;
-                                            gdmy[ibt2][inl][m1*nm+m2] -= nlm2[2][ib+m2] * nlm1[ib+m1] * *dm_current;
-                                            gdmz[ibt2][inl][m1*nm+m2] -= nlm2[3][ib+m2] * nlm1[ib+m1] * *dm_current;
+                                            //(<chi_mu|d/dX alpha_m>)<chi_nu|alpha_m'> = -(<d/dX
+                                            //chi_mu|alpha_m>)<chi_nu|alpha_m'>
+                                            gdmx[ibt2][inl][m1 * nm + m2]
+                                                -= grad_overlap_2[0]->get_value(col_indexes[iw2], ib + m2)
+                                                   * overlap_1->get_value(row_indexes[iw1], ib + m1) * *dm_current;
+                                            gdmy[ibt2][inl][m1 * nm + m2]
+                                                -= grad_overlap_2[1]->get_value(col_indexes[iw2], ib + m2)
+                                                   * overlap_1->get_value(row_indexes[iw1], ib + m1) * *dm_current;
+                                            gdmz[ibt2][inl][m1 * nm + m2]
+                                                -= grad_overlap_2[2]->get_value(col_indexes[iw2], ib + m2)
+                                                   * overlap_1->get_value(row_indexes[iw1], ib + m1) * *dm_current;
 
-                                            //(<chi_nu|d/dX alpha_m'>)<chi_mu|alpha_m> = -(<d/dX chi_nu|alpha_m'>)<chi_mu|alpha_m>
-                                            gdmx[ibt2][inl][m2*nm+m1] -= nlm2[1][ib+m2] * nlm1[ib+m1] * *dm_current;
-                                            gdmy[ibt2][inl][m2*nm+m1] -= nlm2[2][ib+m2] * nlm1[ib+m1] * *dm_current;
-                                            gdmz[ibt2][inl][m2*nm+m1] -= nlm2[3][ib+m2] * nlm1[ib+m1] * *dm_current;
+                                            //(<chi_nu|d/dX alpha_m'>)<chi_mu|alpha_m> = -(<d/dX
+                                            //chi_nu|alpha_m'>)<chi_mu|alpha_m>
+                                            gdmx[ibt2][inl][m2 * nm + m1]
+                                                -= grad_overlap_2[0]->get_value(col_indexes[iw2], ib + m2)
+                                                   * overlap_1->get_value(row_indexes[iw1], ib + m1) * *dm_current;
+                                            gdmy[ibt2][inl][m2 * nm + m1]
+                                                -= grad_overlap_2[1]->get_value(col_indexes[iw2], ib + m2)
+                                                   * overlap_1->get_value(row_indexes[iw1], ib + m1) * *dm_current;
+                                            gdmz[ibt2][inl][m2 * nm + m1]
+                                                -= grad_overlap_2[2]->get_value(col_indexes[iw2], ib + m2)
+                                                   * overlap_1->get_value(row_indexes[iw1], ib + m1) * *dm_current;
 
                                             if (isstress)
                                             {
                                                 int mm = 0;
-                                                for(int ipol=0;ipol<3;ipol++)
+                                                for (int ipol = 0; ipol < 3; ipol++)
                                                 {
-                                                    for(int jpol=ipol;jpol<3;jpol++)
+                                                    for (int jpol = ipol; jpol < 3; jpol++)
                                                     {
-                                                        gdm_epsl[mm][inl][m2*nm+m1] += ucell.lat0 * 
-                                                        *dm_current * (nlm2[jpol+1][ib+m2] * nlm1[ib+m1] * r0[ipol]);
+                                                        gdm_epsl[mm][inl][m2 * nm + m1]
+                                                            += ucell.lat0 * *dm_current
+                                                               * (grad_overlap_2[jpol]->get_value(col_indexes[iw2],
+                                                                                                  ib + m2)
+                                                                  * overlap_1->get_value(row_indexes[iw1], ib + m1)
+                                                                  * r0[ipol]);
                                                         mm++;
                                                     }
                                                 }
                                             }
                                         }
                                     }
-                                    ib+=nm;
+                                    ib += nm;
                                 }
                             }
-                            assert(ib==nlm1.size());
-                            if  (isstress)
+                            assert(ib == overlap_1->get_col_size());
+                            if (isstress)
                             {
-                                if constexpr (std::is_same<TK, double>::value)
-                                {
-                                    nlm1 = this->nlm_save[iat][ad2][col_indexes[iw2]][0];
-                                    nlm2 = this->nlm_save[iat][ad1][row_indexes[iw1]];
-                                }
-                                else
-                                {
-                                    nlm1 = this->nlm_save_k[iat][key_2][col_indexes[iw2]][0];
-                                    nlm2 = this->nlm_save_k[iat][key_1][row_indexes[iw1]];
-                                }
-
-                                assert(nlm1.size()==nlm2[0].size());  
-                                int ib=0;
-                                for (int L0 = 0; L0 <= orb.Alpha[0].getLmax();++L0)
+                                int ib = 0;
+                                for (int L0 = 0; L0 <= orb.Alpha[0].getLmax(); ++L0)
                                 {
-                                    for (int N0 = 0;N0 < orb.Alpha[0].getNchi(L0);++N0)
+                                    for (int N0 = 0; N0 < orb.Alpha[0].getNchi(L0); ++N0)
                                     {
                                         const int inl = this->inl_index[T0](I0, L0, N0);
-                                        const int nm = 2*L0+1;
-                                        for (int m1 = 0;m1 < nm; ++m1)
+                                        const int nm = 2 * L0 + 1;
+                                        for (int m1 = 0; m1 < nm; ++m1)
                                         {
                                             for (int m2 = 0; m2 < nm; ++m2)
                                             {
                                                 int mm = 0;
-                                                for(int ipol=0;ipol<3;ipol++)
+                                                for (int ipol = 0; ipol < 3; ipol++)
                                                 {
-                                                    for(int jpol=ipol;jpol<3;jpol++)
+                                                    for (int jpol = ipol; jpol < 3; jpol++)
                                                     {
-                                                        gdm_epsl[mm][inl][m2*nm+m1]  += ucell.lat0 * 
-                                                        *dm_current * (nlm1[ib+m1] * nlm2[jpol+1][ib+m2] * r1[ipol]);
+                                                        gdm_epsl[mm][inl][m2 * nm + m1]
+                                                            += ucell.lat0 * *dm_current
+                                                               * (overlap_2->get_value(col_indexes[iw2], ib + m1)
+                                                                  * grad_overlap_1[jpol]->get_value(row_indexes[iw1],
+                                                                                                    ib + m2)
+                                                                  * r1[ipol]);
                                                         mm++;
                                                     }
                                                 }
                                             }
                                         }
-                                        ib+=nm;
+                                        ib += nm;
                                     }
                                 }
                             }
                             dm_current++;
-						}//iw2
-					}//iw1
-				}//ad2
-			}//ad1
-        }//I0
-    }//T0
+                        } // iw2
+                    }     // iw1
+                }         // ad2
+            }             // ad1
+        }                 // I0
+    }                     // T0
 
 #ifdef __MPI
-    for(int iat=0;iat<ucell.nat;iat++)
+    for (int iat = 0; iat < ucell.nat; iat++)
     {
-        allsum_deepks(this->inlmax,size,this->gdmx[iat]);
-        allsum_deepks(this->inlmax,size,this->gdmy[iat]);
-        allsum_deepks(this->inlmax,size,this->gdmz[iat]);
+        allsum_deepks(this->inlmax, size, this->gdmx[iat]);
+        allsum_deepks(this->inlmax, size, this->gdmy[iat]);
+        allsum_deepks(this->inlmax, size, this->gdmz[iat]);
     }
     if (isstress)
     {
-        for(int ipol=0;ipol<6;ipol++)
+        for (int ipol = 0; ipol < 6; ipol++)
         {
-            allsum_deepks(this->inlmax,size,this->gdm_epsl[ipol]);
+            allsum_deepks(this->inlmax, size, this->gdm_epsl[ipol]);
         }
     }
 #endif
-    ModuleBase::timer::tick("LCAO_Deepks","cal_gdmx");
+    ModuleBase::timer::tick("LCAO_Deepks", "cal_gdmx");
     return;
 }
 
@@ -305,26 +329,26 @@ void LCAO_Deepks::check_gdmx(const int nat)
     std::ofstream ofs_y;
     std::ofstream ofs_z;
 
-    ofs_x<<std::setprecision(10);
-    ofs_y<<std::setprecision(10);
-    ofs_z<<std::setprecision(10);
+    ofs_x << std::setprecision(10);
+    ofs_y << std::setprecision(10);
+    ofs_z << std::setprecision(10);
 
     const int pdm_size = (this->lmaxd * 2 + 1) * (this->lmaxd * 2 + 1);
-    for(int ia=0;ia<nat;ia++)
+    for (int ia = 0; ia < nat; ia++)
     {
         ss.str("");
-        ss<<"gdmx_"<<ia<<".dat";
+        ss << "gdmx_" << ia << ".dat";
         ofs_x.open(ss.str().c_str());
         ss.str("");
-        ss<<"gdmy_"<<ia<<".dat";
+        ss << "gdmy_" << ia << ".dat";
         ofs_y.open(ss.str().c_str());
         ss.str("");
-        ss<<"gdmz_"<<ia<<".dat";
+        ss << "gdmz_" << ia << ".dat";
         ofs_z.open(ss.str().c_str());
 
-        for(int inl=0;inl<inlmax;inl++)
+        for (int inl = 0; inl < inlmax; inl++)
         {
-            for(int ind=0;ind<pdm_size;ind++)
+            for (int ind = 0; ind < pdm_size; ind++)
             {
                 ofs_x << gdmx[ia][inl][ind] << " ";
                 ofs_y << gdmy[ia][inl][ind] << " ";
@@ -346,6 +370,7 @@ template void LCAO_Deepks::cal_gdmx<double>(const std::vector<std::vector<double
                                             const Grid_Driver& GridD,
                                             const int nks,
                                             const std::vector<ModuleBase::Vector3<double>>& kvec_d,
+                                            std::vector<hamilt::HContainer<double>*> psialpha,
                                             const bool isstress);
 
 template void LCAO_Deepks::cal_gdmx<std::complex<double>>(const std::vector<std::vector<std::complex<double>>>& dm,
@@ -354,6 +379,7 @@ template void LCAO_Deepks::cal_gdmx<std::complex<double>>(const std::vector<std:
                                                           const Grid_Driver& GridD,
                                                           const int nks,
                                                           const std::vector<ModuleBase::Vector3<double>>& kvec_d,
+                                                          std::vector<hamilt::HContainer<double>*> psialpha,
                                                           const bool isstress);
 
 #endif
diff --git a/source/module_hamilt_lcao/module_deepks/deepks_fgamma.cpp b/source/module_hamilt_lcao/module_deepks/deepks_fgamma.cpp
deleted file mode 100644
index 60077ebdac..0000000000
--- a/source/module_hamilt_lcao/module_deepks/deepks_fgamma.cpp
+++ /dev/null
@@ -1,381 +0,0 @@
-//wenfei 2022-1-11
-//This file contains subroutines for calculating F_delta,
-#include "module_parameter/parameter.h"
-//which is defind as sum_mu,nu rho_mu,nu d/dX (<chi_mu|alpha>V(D)<alpha|chi_nu>)
-
-//There are 2 subroutines in this file:
-//1. cal_f_delta_gamma, which is used for gamma point calculation
-//2. check_f_delta, which prints F_delta into F_delta.dat for checking
-
-#ifdef __DEEPKS
-
-#include "deepks_force.h"
-#include "module_base/vector3.h"
-#include "module_base/timer.h"
-#include "module_base/constants.h"
-#include "module_hamilt_lcao/module_hcontainer/atom_pair.h"
-#include "module_base/libm/libm.h"
-
-typedef std::tuple<int, int, int, int> key_tuple;
-
-//force for gamma only calculations
-//Pulay and HF terms are calculated together
-void DeePKS_domain::cal_f_delta_gamma(
-    const std::vector<std::vector<double>>& dm,
-    const UnitCell& ucell,
-    const LCAO_Orbitals& orb,
-    const Grid_Driver& GridD,
-    const Parallel_Orbitals& pv,
-    const int lmaxd,
-    const int nks,
-    const std::vector<ModuleBase::Vector3<double>> &kvec_d,
-    std::vector<std::vector<std::unordered_map<int, std::vector<std::vector<double>>>>>& nlm_save,
-    double** gedm,
-    ModuleBase::IntArray* inl_index,
-    ModuleBase::matrix& f_delta,
-    const bool isstress,
-    ModuleBase::matrix& svnl_dalpha)
-{
-    ModuleBase::TITLE("DeePKS_domain", "cal_f_delta_gamma");
-
-    using TK = double; // temporary used, will be replaced by the template parameter
-
-    f_delta.zero_out();
-
-    const double Rcut_Alpha = orb.Alpha[0].getRcut();
-
-    const int nrow = pv.nrow;
-
-    for (int T0 = 0; T0 < ucell.ntype; T0++)
-    {
-		Atom* atom0 = &ucell.atoms[T0]; 
-        for (int I0 =0; I0< atom0->na; I0++)
-        {
-            const int iat = ucell.itia2iat(T0,I0);
-            const ModuleBase::Vector3<double> tau0 = atom0->tau[I0];
-            GridD.Find_atom(ucell, atom0->tau[I0] ,T0, I0);
-
-            for (int ad1=0; ad1<GridD.getAdjacentNum()+1 ; ++ad1)
-            {
-                const int T1 = GridD.getType(ad1);
-                const int I1 = GridD.getNatom(ad1);
-                const int ibt1 = ucell.itia2iat(T1,I1);
-                const ModuleBase::Vector3<double> tau1 = GridD.getAdjacentTau(ad1);
-                const Atom* atom1 = &ucell.atoms[T1];
-                const int nw1_tot = atom1->nw*PARAM.globalv.npol;
-                const double Rcut_AO1 = orb.Phi[T1].getRcut();
-
-                ModuleBase::Vector3<double> dR1(GridD.getBox(ad1).x, GridD.getBox(ad1).y, GridD.getBox(ad1).z);
-
-                for (int ad2=0; ad2 < GridD.getAdjacentNum()+1 ; ad2++)
-                {
-                    const int T2 = GridD.getType(ad2);
-                    const int I2 = GridD.getNatom(ad2);
-                    const int ibt2 = ucell.itia2iat(T2,I2);
-                    const ModuleBase::Vector3<double> tau2 = GridD.getAdjacentTau(ad2);
-                    const Atom* atom2 = &ucell.atoms[T2];
-                    const int nw2_tot = atom2->nw*PARAM.globalv.npol;
-                    ModuleBase::Vector3<double> dR2(GridD.getBox(ad2).x, GridD.getBox(ad2).y, GridD.getBox(ad2).z);
-                    
-                    const double Rcut_AO2 = orb.Phi[T2].getRcut();
-                    const double dist1 = (tau1-tau0).norm() * ucell.lat0;
-                    const double dist2 = (tau2-tau0).norm() * ucell.lat0;
-
-                    if (dist1 > Rcut_Alpha + Rcut_AO1
-                            || dist2 > Rcut_Alpha + Rcut_AO2)
-                    {
-                        continue;
-                    }					
-
-                    double r0[3];
-                    double r1[3];
-                    if(isstress)
-                    {
-                        r1[0] = ( tau1.x - tau0.x) ;
-                        r1[1] = ( tau1.y - tau0.y) ;
-                        r1[2] = ( tau1.z - tau0.z) ;
-                        r0[0] = ( tau2.x - tau0.x) ;
-                        r0[1] = ( tau2.y - tau0.y) ;
-                        r0[2] = ( tau2.z - tau0.z) ;
-                    }
-
-                    auto row_indexes = pv.get_indexes_row(ibt1);
-                    auto col_indexes = pv.get_indexes_col(ibt2);
-
-					if(row_indexes.size() * col_indexes.size() == 0) 
-					{
-						continue;
-					}
-
-                    int dRx;
-                    int dRy;
-                    int dRz;
-                    if constexpr (std::is_same<TK, double>::value)
-                    {
-                        dRx = 0;
-                        dRy = 0;
-                        dRz = 0;
-                    }
-                    else
-                    {
-                        dRx = dR2.x - dR1.x;
-                        dRy = dR2.y - dR1.y;
-                        dRz = dR2.z - dR1.z;
-                    }
-
-                    hamilt::AtomPair<double> dm_pair(ibt1, ibt2, dRx, dRy, dRz, &pv);
-
-                    dm_pair.allocate(nullptr, 1);
-
-                    if constexpr (std::is_same<TK, double>::value)
-                    {
-                        for(int is=0;is<dm.size();is++)
-                        {
-                            if(ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER(PARAM.inp.ks_solver))
-                            {
-                                dm_pair.add_from_matrix(dm[is].data(), pv.get_row_size(), 1.0, 1);
-                            }
-                            else
-                            {
-                                dm_pair.add_from_matrix(dm[is].data(), pv.get_col_size(), 1.0, 0);
-                            }
-                        }
-                    }
-                    else
-                    {
-                        // for(int ik=0;ik<nks;ik++)
-                        // {
-                        //     const double arg = - (kvec_d[ik] * (dR2-dR1) ) * ModuleBase::TWO_PI;
-                        //     double sinp, cosp;
-                        //     ModuleBase::libm::sincos(arg, &sinp, &cosp);
-                        //     const std::complex<double> kphase = std::complex<double>(cosp, sinp);
-                        //     if(ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER(PARAM.inp.ks_solver))
-                        //     {
-                        //         dm_pair.add_from_matrix(dm[ik].data(), pv.get_row_size(), kphase, 1);
-                        //     }
-                        //     else
-                        //     {
-                        //         dm_pair.add_from_matrix(dm[ik].data(), pv.get_col_size(), kphase, 0);
-                        //     }
-                        // }
-                    }
-
-                    const double* dm_current = dm_pair.get_pointer();
-
-                    for (int iw1=0; iw1<row_indexes.size(); ++iw1)
-                    {
-                        for (int iw2=0; iw2<col_indexes.size(); ++iw2)
-                        {
-                            double nlm[3]={0,0,0};
-                            double nlm_t[3] = {0,0,0}; //for stress
-                            key_tuple key_1(ibt1,dR1.x,dR1.y,dR1.z);
-                            key_tuple key_2(ibt2,dR2.x,dR2.y,dR2.z);
-                            std::vector<double> nlm1;
-                            std::vector<std::vector<double>> nlm2;
-                            nlm2.resize(3);
-
-                            if constexpr (std::is_same<TK, double>::value)
-                            {
-                                nlm1 = nlm_save[iat][ad1][row_indexes[iw1]][0];
-                                for(int dim=0;dim<3;dim++)
-                                {
-                                    nlm2[dim] = nlm_save[iat][ad2][col_indexes[iw2]][dim+1];
-                                }
-                            }
-                            else
-                            {
-                                // nlm1 = nlm_save_k[iat][key_1][row_indexes[iw1]][0];
-                                // for(int dim=0;dim<3;dim++)
-                                // {
-                                //     nlm2[dim] = nlm_save_k[iat][key_2][col_indexes[iw2]][dim+1];
-                                // }
-                            }
-
-                            assert(nlm1.size()==nlm2[0].size());
-
-                            if(!PARAM.inp.deepks_equiv)
-                            {
-                                int ib=0;
-                                for (int L0 = 0; L0 <= orb.Alpha[0].getLmax();++L0)
-                                {
-                                    for (int N0 = 0;N0 < orb.Alpha[0].getNchi(L0);++N0)
-                                    {
-                                        const int inl = inl_index[T0](I0, L0, N0);
-                                        const int nm = 2*L0+1;
-                                        for (int m1 = 0;m1 < nm; ++m1)
-                                        {
-                                            for (int m2 = 0; m2 < nm; ++m2)
-                                            {
-                                                for(int dim=0;dim<3;dim++)
-                                                {                                            
-                                                    nlm[dim] += gedm[inl][m1*nm+m2]*nlm1[ib+m1]*nlm2[dim][ib+m2];
-                                                }
-                                            }
-                                        }
-                                        ib+=nm;
-                                    }
-                                }
-                                assert(ib==nlm1.size());
-                            }
-                            else
-                            {
-                                int nproj = 0;
-                                for(int il = 0; il < lmaxd + 1; il++)
-                                {
-                                    nproj += (2 * il + 1) * orb.Alpha[0].getNchi(il);
-                                }
-                                for(int iproj = 0; iproj < nproj; iproj ++)
-                                {
-                                    for(int jproj = 0; jproj < nproj; jproj ++)
-                                    {
-                                        for(int dim=0;dim<3;dim++)
-                                        {
-                                            nlm[dim] += gedm[iat][iproj*nproj+jproj] * nlm1[iproj] * nlm2[dim][jproj];
-                                        }
-                                    }
-                                }                    
-                            }
-
-                            // HF term is minus, only one projector for each atom force.
-                            f_delta(iat, 0) -= 2 * *dm_current * nlm[0];
-                            f_delta(iat, 1) -= 2 * *dm_current * nlm[1];
-                            f_delta(iat, 2) -= 2 * *dm_current * nlm[2];
-
-                            // Pulay term is plus, only one projector for each atom force.
-                            f_delta(ibt2, 0) += 2 * *dm_current * nlm[0];
-                            f_delta(ibt2, 1) += 2 * *dm_current * nlm[1];
-                            f_delta(ibt2, 2) += 2 * *dm_current * nlm[2];
-
-                            if(isstress)
-                            {
-                                if constexpr (std::is_same<TK, double>::value)
-                                {
-                                    nlm1 = nlm_save[iat][ad2][col_indexes[iw2]][0];
-                                    for(int i=0;i<3;i++)
-                                    {
-                                        nlm2[i] = nlm_save[iat][ad1][row_indexes[iw1]][i+1];
-                                    }
-                                }
-                                else
-                                {
-                                    // nlm1 = nlm_save_k[iat][key_2][col_indexes[iw2]][0];
-                                    // for(int i=0;i<3;i++)
-                                    // {
-                                    //     nlm2[i] = nlm_save_k[iat][key_1][row_indexes[iw1]][i+1];
-                                    // }
-                                }
-
-                                assert(nlm1.size()==nlm2[0].size());                                
-
-                                if(!PARAM.inp.deepks_equiv)
-                                {
-                                    int ib=0;
-                                    for (int L0 = 0; L0 <= orb.Alpha[0].getLmax();++L0)
-                                    {
-                                        for (int N0 = 0;N0 < orb.Alpha[0].getNchi(L0);++N0)
-                                        {
-                                            const int inl = inl_index[T0](I0, L0, N0);
-                                            const int nm = 2*L0+1;
-
-                                            for (int m1 = 0;m1 < nm; ++m1)
-                                            {
-                                                for (int m2 = 0; m2 < nm; ++m2)
-                                                {
-                                                    for(int dim=0;dim<3;++dim)
-                                                    {                                            
-                                                        nlm_t[dim] += gedm[inl][m1*nm+m2]*nlm1[ib+m1]*nlm2[dim][ib+m2];
-                                                    }
-                                                }
-                                            }
-                                            ib+=nm;
-                                        }
-                                    }
-                                    assert(ib==nlm1.size());
-                                }
-                                else
-                                {
-                                    int nproj = 0;
-                                    for(int il = 0; il < lmaxd + 1; il++)
-                                    {
-                                        nproj += (2 * il + 1) * orb.Alpha[0].getNchi(il);
-                                    }
-                                    for(int iproj = 0; iproj < nproj; iproj ++)
-                                    {
-                                        for(int jproj = 0; jproj < nproj; jproj ++)
-                                        {
-                                            for(int dim=0;dim<3;dim++)
-                                            {
-                                                nlm_t[dim] += gedm[iat][iproj*nproj+jproj] * nlm1[iproj] * nlm2[dim][jproj];
-                                            }
-                                        }
-                                    }
-                                }
-
-                                for(int ipol=0;ipol<3;ipol++)
-                                {
-                                    for(int jpol=ipol;jpol<3;jpol++)
-                                    {
-                                        svnl_dalpha(ipol, jpol) += *dm_current * 
-                                        (nlm[jpol] * r0[ipol] + nlm_t[jpol] * r1[ipol]);
-                                    }
-                                }
-                            }
-                            dm_current++;
-                        }//iw2
-                    }//iw1
-                }//ad2
-            }//ad1
-        }//end I0
-    }//end T0
-
-    if(isstress)
-	{
-		assert(ucell.omega>0.0);
-		const double weight = ucell.lat0 / ucell.omega ;
-		for(int i=0;i<3;++i)
-		{
-			for(int j=0;j<3;++j)
-			{
-				if(j>i) 
-                { 
-                    svnl_dalpha(j,i) = svnl_dalpha(i,j);
-                }
-				svnl_dalpha(i,j) *= weight;
-			}
-		}
-	}
-
-    return;
-}
-
-
-//prints forces and stress from DeePKS (LCAO) 
-void DeePKS_domain::check_f_delta(
-    const int nat, 
-    ModuleBase::matrix& f_delta,
-    ModuleBase::matrix& svnl_dalpha)
-{
-    ModuleBase::TITLE("LCAO_Deepks", "check_F_delta");
-
-    std::ofstream ofs("F_delta.dat");
-    ofs<<std::setprecision(10);
-
-    for (int iat=0; iat<nat; iat++)
-    {
-        ofs << f_delta(iat,0) << " " << f_delta(iat,1) << " " << f_delta(iat,2) << std::endl;
-    }
-
-    std::ofstream ofs1("stress_delta.dat");
-    ofs1<<std::setprecision(10);
-    for (int ipol=0; ipol<3; ipol++)
-    {
-        for (int jpol=0; jpol<3; jpol++)
-        {
-            ofs1 << svnl_dalpha(ipol,jpol) << " ";
-        }
-        ofs1 << std::endl;
-    }    
-    return;
-}
-
-#endif
diff --git a/source/module_hamilt_lcao/module_deepks/deepks_fk.cpp b/source/module_hamilt_lcao/module_deepks/deepks_fk.cpp
deleted file mode 100644
index 8a724bf098..0000000000
--- a/source/module_hamilt_lcao/module_deepks/deepks_fk.cpp
+++ /dev/null
@@ -1,277 +0,0 @@
-// cal_f_delta_k, which is used for multi-k calculation
-
-#include "module_parameter/parameter.h"
-#ifdef __DEEPKS
-
-#include "deepks_force.h"
-#include "module_base/vector3.h"
-#include "module_base/timer.h"
-#include "module_base/constants.h"
-#include "module_hamilt_lcao/module_hcontainer/atom_pair.h"
-#include "module_base/libm/libm.h"
-
-//force for multi-k calculations
-//Pulay and HF terms are calculated together
-
-typedef std::tuple<int, int, int, int> key_tuple; // used in nlm_save_k
-
-void DeePKS_domain::cal_f_delta_k(
-    const std::vector<std::vector<std::complex<double>>>& dm, /**<[in] density matrix*/
-    const UnitCell& ucell,
-    const LCAO_Orbitals& orb,
-    const Grid_Driver& GridD,
-    const Parallel_Orbitals& pv,
-    const int lmaxd,
-    const int nks,
-    const std::vector<ModuleBase::Vector3<double>>& kvec_d,
-    std::vector<std::map<key_tuple, std::unordered_map<int, std::vector<std::vector<double>>>>>& nlm_save_k,
-    double** gedm,
-    ModuleBase::IntArray* inl_index,
-    ModuleBase::matrix& f_delta,
-    const bool isstress,
-    ModuleBase::matrix& svnl_dalpha)
-{
-    ModuleBase::TITLE("LCAO_Deepks", "cal_f_delta_hf_k_new");
-    ModuleBase::timer::tick("LCAO_Deepks","cal_f_delta_hf_k_new");
-    f_delta.zero_out();
-
-    const double Rcut_Alpha = orb.Alpha[0].getRcut();
-    const int nrow = pv.nrow;
-
-    for (int T0 = 0; T0 < ucell.ntype; T0++)
-    {
-		Atom* atom0 = &ucell.atoms[T0]; 
-        for (int I0 =0; I0< atom0->na; I0++)
-        {
-            const int iat = ucell.itia2iat(T0,I0);
-            const ModuleBase::Vector3<double> tau0 = atom0->tau[I0];
-            GridD.Find_atom(ucell, atom0->tau[I0] ,T0, I0);
-	
-            for (int ad1=0; ad1<GridD.getAdjacentNum()+1 ; ++ad1)
-            {
-                const int T1 = GridD.getType(ad1);
-                const int I1 = GridD.getNatom(ad1);
-                const int ibt1 = ucell.itia2iat(T1,I1);
-                const ModuleBase::Vector3<double> tau1 = GridD.getAdjacentTau(ad1);
-                const Atom* atom1 = &ucell.atoms[T1];
-                const int nw1_tot = atom1->nw*PARAM.globalv.npol;
-                const double Rcut_AO1 = orb.Phi[T1].getRcut();
-
-                ModuleBase::Vector3<double> dR1(GridD.getBox(ad1).x, GridD.getBox(ad1).y, GridD.getBox(ad1).z);
-
-                for (int ad2=0; ad2 < GridD.getAdjacentNum()+1 ; ad2++)
-                {
-                    const int T2 = GridD.getType(ad2);
-                    const int I2 = GridD.getNatom(ad2);
-                    const int ibt2 = ucell.itia2iat(T2,I2);
-                    const ModuleBase::Vector3<double> tau2 = GridD.getAdjacentTau(ad2);
-                    const Atom* atom2 = &ucell.atoms[T2];
-                    const int nw2_tot = atom2->nw*PARAM.globalv.npol;
-                    ModuleBase::Vector3<double> dR2(GridD.getBox(ad2).x, GridD.getBox(ad2).y, GridD.getBox(ad2).z);
-                    
-                    const double Rcut_AO2 = orb.Phi[T2].getRcut();
-                    const double dist1 = (tau1-tau0).norm() * ucell.lat0;
-                    const double dist2 = (tau2-tau0).norm() * ucell.lat0;
-
-                    if (dist1 > Rcut_Alpha + Rcut_AO1
-                            || dist2 > Rcut_Alpha + Rcut_AO2)
-                    {
-                        continue;
-                    }
-
-                    double r0[3];
-                    double r1[3];
-                    if(isstress)
-                    {
-                        r1[0] = ( tau1.x - tau0.x) ;
-                        r1[1] = ( tau1.y - tau0.y) ;
-                        r1[2] = ( tau1.z - tau0.z) ;
-                        r0[0] = ( tau2.x - tau0.x) ;
-                        r0[1] = ( tau2.y - tau0.y) ;
-                        r0[2] = ( tau2.z - tau0.z) ;
-                    }
-
-                    auto row_indexes = pv.get_indexes_row(ibt1);
-                    auto col_indexes = pv.get_indexes_col(ibt2);
-                    if(row_indexes.size() * col_indexes.size() == 0) { continue;
-}
-
-                    hamilt::AtomPair<double> dm_pair(ibt1, ibt2, (dR2-dR1).x, (dR2-dR1).y, (dR2-dR1).z, &pv);
-                    dm_pair.allocate(nullptr, 1);
-                    for(int ik=0;ik<nks;ik++)
-                    {
-                        const double arg = - (kvec_d[ik] * (dR2-dR1) ) * ModuleBase::TWO_PI;
-                        double sinp, cosp;
-                        ModuleBase::libm::sincos(arg, &sinp, &cosp);
-                        const std::complex<double> kphase = std::complex<double>(cosp, sinp);
-                        if(ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER(PARAM.inp.ks_solver))
-                        {
-                            dm_pair.add_from_matrix(dm[ik].data(), pv.get_row_size(), kphase, 1);
-                        }
-                        else
-                        {
-                            dm_pair.add_from_matrix(dm[ik].data(), pv.get_col_size(), kphase, 0);
-                        }
-                    }
-
-                    const double* dm_current = dm_pair.get_pointer();
-
-                    for (int iw1=0; iw1<row_indexes.size(); ++iw1)
-                    {
-                        for (int iw2=0; iw2<col_indexes.size(); ++iw2)
-                        {
-                            double nlm[3]={0,0,0};
-                            double nlm_t[3] = {0,0,0}; //for stress
-                            key_tuple key_1(ibt1,dR1.x,dR1.y,dR1.z);
-                            key_tuple key_2(ibt2,dR2.x,dR2.y,dR2.z);
-                            std::vector<double> nlm1 = nlm_save_k[iat][key_1][row_indexes[iw1]][0];
-                            std::vector<std::vector<double>> nlm2;
-                            nlm2.resize(3);
-                            for(int dim=0;dim<3;dim++)
-                            {
-                                nlm2[dim] = nlm_save_k[iat][key_2][col_indexes[iw2]][dim+1];
-                            }
-
-                            assert(nlm1.size()==nlm2[0].size());
-
-                            if(!PARAM.inp.deepks_equiv)
-                            {
-                                int ib=0;
-                                for (int L0 = 0; L0 <= orb.Alpha[0].getLmax();++L0)
-                                {
-                                    for (int N0 = 0;N0 < orb.Alpha[0].getNchi(L0);++N0)
-                                    {
-                                        const int inl = inl_index[T0](I0, L0, N0);
-                                        const int nm = 2*L0+1;
-                                        for (int m1 = 0;m1 < nm; ++m1)
-                                        {
-                                            for (int m2 = 0; m2 < nm; ++m2)
-                                            {
-                                                for(int dim=0;dim<3;dim++)
-                                                {                                            
-                                                    nlm[dim] += gedm[inl][m1*nm+m2]*nlm1[ib+m1]*nlm2[dim][ib+m2];
-                                                }
-                                            }
-                                        }
-                                        ib+=nm;
-                                    }
-                                }
-                                assert(ib==nlm1.size());
-                            }
-                            else
-                            {
-                                int nproj = 0;
-                                for(int il = 0; il < lmaxd + 1; il++)
-                                {
-                                    nproj += (2 * il + 1) * orb.Alpha[0].getNchi(il);
-                                }
-                                for(int iproj = 0; iproj < nproj; iproj ++)
-                                {
-                                    for(int jproj = 0; jproj < nproj; jproj ++)
-                                    {
-                                        for(int dim=0;dim<3;dim++)
-                                        {
-                                            nlm[dim] += gedm[iat][iproj*nproj+jproj] * nlm1[iproj] * nlm2[dim][jproj];
-                                        }
-                                    }
-                                }
-                            }
-
-                            // Pulay term is plus
-                            f_delta(ibt2, 0) += 2.0 * *dm_current * nlm[0];
-                            f_delta(ibt2, 1) += 2.0 * *dm_current * nlm[1];
-                            f_delta(ibt2, 2) += 2.0 * *dm_current * nlm[2];
-
-                            // HF term is minus, only one projector for each atom force.
-                            f_delta(iat, 0) -= 2.0 * *dm_current * nlm[0];
-                            f_delta(iat, 1) -= 2.0 * *dm_current * nlm[1];
-                            f_delta(iat, 2) -= 2.0 * *dm_current * nlm[2];
-
-                            if(isstress)
-                            {
-                                nlm1 = nlm_save_k[iat][key_2][col_indexes[iw2]][0];
-                                for(int i=0;i<3;i++)
-                                {
-                                    nlm2[i] = nlm_save_k[iat][key_1][row_indexes[iw1]][i+1];
-                                }
-
-                                assert(nlm1.size()==nlm2[0].size());                                
-
-                                if(!PARAM.inp.deepks_equiv)
-                                {
-                                    int ib=0;
-                                    for (int L0 = 0; L0 <= orb.Alpha[0].getLmax();++L0)
-                                    {
-                                        for (int N0 = 0;N0 < orb.Alpha[0].getNchi(L0);++N0)
-                                        {
-                                            const int inl = inl_index[T0](I0, L0, N0);
-                                            const int nm = 2*L0+1;
-                                            for (int m1 = 0;m1 < nm; ++m1)
-                                            {
-                                                for (int m2 = 0; m2 < nm; ++m2)
-                                                {
-                                                    for(int dim=0;dim<3;dim++)
-                                                    {                                            
-                                                        nlm_t[dim] += gedm[inl][m1*nm+m2]*nlm1[ib+m1]*nlm2[dim][ib+m2];
-                                                    }
-                                                }
-                                            }
-                                            ib+=nm;
-                                        }
-                                    }
-                                    assert(ib==nlm1.size());
-                                }
-                                else
-                                {
-                                    int nproj = 0;
-                                    for(int il = 0; il < lmaxd + 1; il++)
-                                    {
-                                        nproj += (2 * il + 1) * orb.Alpha[0].getNchi(il);
-                                    }
-                                    for(int iproj = 0; iproj < nproj; iproj ++)
-                                    {
-                                        for(int jproj = 0; jproj < nproj; jproj ++)
-                                        {
-                                            for(int dim=0;dim<3;dim++)
-                                            {
-                                                nlm_t[dim] += gedm[iat][iproj*nproj+jproj] * nlm1[iproj] * nlm2[dim][jproj];
-                                            }
-                                        }
-                                    }
-                                }
-                                   
-                                for(int ipol=0;ipol<3;ipol++)
-                                {
-                                    svnl_dalpha(0,ipol) -= *dm_current * (nlm[0] * r0[ipol] + nlm_t[0] * r1[ipol])* -1.0;
-                                    svnl_dalpha(1,ipol) -= *dm_current * (nlm[1] * r0[ipol] + nlm_t[1] * r1[ipol])* -1.0;
-                                    svnl_dalpha(2,ipol) -= *dm_current * (nlm[2] * r0[ipol] + nlm_t[2] * r1[ipol])* -1.0;
-                                }
-
-                            }
-                            dm_current++;
-                        }//iw2
-                    }//iw1
-                }//ad2
-            }//ad1
-        }//end I0
-    }//end T0
-
-    if(isstress)
-    {
-		assert(ucell.omega>0.0);
-		const double weight = ucell.lat0 / ucell.omega ;
-		for(int i=0;i<3;++i)
-		{
-			for(int j=0;j<3;++j)
-			{
-				if(j>i) { svnl_dalpha(j,i) = svnl_dalpha(i,j);
-}
-				svnl_dalpha(i,j) *= weight ;
-			}
-		}
-    }
-    ModuleBase::timer::tick("LCAO_Deepks","cal_f_delta_hf_k_new");
-    return;
-}
-
-#endif
diff --git a/source/module_hamilt_lcao/module_deepks/deepks_force.cpp b/source/module_hamilt_lcao/module_deepks/deepks_force.cpp
new file mode 100644
index 0000000000..069109229f
--- /dev/null
+++ b/source/module_hamilt_lcao/module_deepks/deepks_force.cpp
@@ -0,0 +1,381 @@
+// wenfei 2022-1-11
+// This file contains subroutines for calculating F_delta,
+#include "module_parameter/parameter.h"
+// which is defind as sum_mu,nu rho_mu,nu d/dX (<chi_mu|alpha>V(D)<alpha|chi_nu>)
+
+// There are 2 subroutines in this file:
+// 1. cal_f_delta_gamma, which is used for gamma point calculation
+// 2. check_f_delta, which prints F_delta into F_delta.dat for checking
+
+#ifdef __DEEPKS
+
+#include "deepks_force.h"
+#include "module_base/constants.h"
+#include "module_base/libm/libm.h"
+#include "module_base/timer.h"
+#include "module_base/vector3.h"
+#include "module_hamilt_lcao/module_hcontainer/atom_pair.h"
+
+template <typename TK>
+void DeePKS_domain::cal_f_delta(const std::vector<std::vector<TK>>& dm,
+                                const UnitCell& ucell,
+                                const LCAO_Orbitals& orb,
+                                const Grid_Driver& GridD,
+                                const Parallel_Orbitals& pv,
+                                const int lmaxd,
+                                const int nks,
+                                const std::vector<ModuleBase::Vector3<double>>& kvec_d,
+                                std::vector<hamilt::HContainer<double>*> psialpha,
+                                double** gedm,
+                                ModuleBase::IntArray* inl_index,
+                                ModuleBase::matrix& f_delta,
+                                const bool isstress,
+                                ModuleBase::matrix& svnl_dalpha)
+{
+    ModuleBase::TITLE("DeePKS_domain", "cal_f_delta");
+
+    f_delta.zero_out();
+
+    const double Rcut_Alpha = orb.Alpha[0].getRcut();
+
+    const int nrow = pv.nrow;
+
+    for (int T0 = 0; T0 < ucell.ntype; T0++)
+    {
+        Atom* atom0 = &ucell.atoms[T0];
+        for (int I0 = 0; I0 < atom0->na; I0++)
+        {
+            const int iat = ucell.itia2iat(T0, I0);
+            const ModuleBase::Vector3<double> tau0 = atom0->tau[I0];
+            GridD.Find_atom(ucell, atom0->tau[I0], T0, I0);
+
+            for (int ad1 = 0; ad1 < GridD.getAdjacentNum() + 1; ++ad1)
+            {
+                const int T1 = GridD.getType(ad1);
+                const int I1 = GridD.getNatom(ad1);
+                const int ibt1 = ucell.itia2iat(T1, I1);
+                const ModuleBase::Vector3<double> tau1 = GridD.getAdjacentTau(ad1);
+                const Atom* atom1 = &ucell.atoms[T1];
+                const int nw1_tot = atom1->nw * PARAM.globalv.npol;
+                const double Rcut_AO1 = orb.Phi[T1].getRcut();
+
+                ModuleBase::Vector3<int> dR1(GridD.getBox(ad1).x, GridD.getBox(ad1).y, GridD.getBox(ad1).z);
+
+                for (int ad2 = 0; ad2 < GridD.getAdjacentNum() + 1; ad2++)
+                {
+                    const int T2 = GridD.getType(ad2);
+                    const int I2 = GridD.getNatom(ad2);
+                    const int ibt2 = ucell.itia2iat(T2, I2);
+                    const ModuleBase::Vector3<double> tau2 = GridD.getAdjacentTau(ad2);
+                    const Atom* atom2 = &ucell.atoms[T2];
+                    const int nw2_tot = atom2->nw * PARAM.globalv.npol;
+                    ModuleBase::Vector3<int> dR2(GridD.getBox(ad2).x, GridD.getBox(ad2).y, GridD.getBox(ad2).z);
+
+                    const double Rcut_AO2 = orb.Phi[T2].getRcut();
+                    const double dist1 = (tau1 - tau0).norm() * ucell.lat0;
+                    const double dist2 = (tau2 - tau0).norm() * ucell.lat0;
+
+                    if (dist1 > Rcut_Alpha + Rcut_AO1 || dist2 > Rcut_Alpha + Rcut_AO2)
+                    {
+                        continue;
+                    }
+
+                    double r0[3];
+                    double r1[3];
+                    if (isstress)
+                    {
+                        r1[0] = (tau1.x - tau0.x);
+                        r1[1] = (tau1.y - tau0.y);
+                        r1[2] = (tau1.z - tau0.z);
+                        r0[0] = (tau2.x - tau0.x);
+                        r0[1] = (tau2.y - tau0.y);
+                        r0[2] = (tau2.z - tau0.z);
+                    }
+
+                    auto row_indexes = pv.get_indexes_row(ibt1);
+                    auto col_indexes = pv.get_indexes_col(ibt2);
+
+                    if (row_indexes.size() * col_indexes.size() == 0)
+                    {
+                        continue;
+                    }
+
+                    int dRx;
+                    int dRy;
+                    int dRz;
+                    if constexpr (std::is_same<TK, double>::value) // for gamma-only
+                    {
+                        dRx = 0;
+                        dRy = 0;
+                        dRz = 0;
+                    }
+                    else // for multi-k
+                    {
+                        dRx = dR2.x - dR1.x;
+                        dRy = dR2.y - dR1.y;
+                        dRz = dR2.z - dR1.z;
+                    }
+                    ModuleBase::Vector3<double> dR(dRx, dRy, dRz);
+
+                    hamilt::AtomPair<double> dm_pair(ibt1, ibt2, dRx, dRy, dRz, &pv);
+
+                    dm_pair.allocate(nullptr, 1);
+
+                    if constexpr (std::is_same<TK, double>::value) // for gamma-only
+                    {
+                        for (int is = 0; is < dm.size(); is++)
+                        {
+                            if (ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER(PARAM.inp.ks_solver))
+                            {
+                                dm_pair.add_from_matrix(dm[is].data(), pv.get_row_size(), 1.0, 1);
+                            }
+                            else
+                            {
+                                dm_pair.add_from_matrix(dm[is].data(), pv.get_col_size(), 1.0, 0);
+                            }
+                        }
+                    }
+                    else // for multi-k
+                    {
+                        for (int ik = 0; ik < nks; ik++)
+                        {
+                            const double arg = -(kvec_d[ik] * dR) * ModuleBase::TWO_PI;
+                            double sinp, cosp;
+                            ModuleBase::libm::sincos(arg, &sinp, &cosp);
+                            const std::complex<double> kphase = std::complex<double>(cosp, sinp);
+                            if (ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER(PARAM.inp.ks_solver))
+                            {
+                                dm_pair.add_from_matrix(dm[ik].data(), pv.get_row_size(), kphase, 1);
+                            }
+                            else
+                            {
+                                dm_pair.add_from_matrix(dm[ik].data(), pv.get_col_size(), kphase, 0);
+                            }
+                        }
+                    }
+
+                    const double* dm_current = dm_pair.get_pointer();
+
+                    for (int iw1 = 0; iw1 < row_indexes.size(); ++iw1)
+                    {
+                        for (int iw2 = 0; iw2 < col_indexes.size(); ++iw2)
+                        {
+                            double nlm[3] = {0, 0, 0};
+                            double nlm_t[3] = {0, 0, 0}; // for stress
+
+                            hamilt::BaseMatrix<double>* overlap_1 = psialpha[0]->find_matrix(iat, ibt1, dR1);
+                            hamilt::BaseMatrix<double>* overlap_2 = psialpha[0]->find_matrix(iat, ibt2, dR2);
+                            std::vector<hamilt::BaseMatrix<double>*> grad_overlap_1(3);
+                            std::vector<hamilt::BaseMatrix<double>*> grad_overlap_2(3);
+                            for (int i = 0; i < 3; ++i)
+                            {
+                                grad_overlap_1[i] = psialpha[i + 1]->find_matrix(iat, ibt1, dR1);
+                                grad_overlap_2[i] = psialpha[i + 1]->find_matrix(iat, ibt2, dR2);
+                            }
+
+                            assert(overlap_1->get_col_size() == overlap_2->get_col_size());
+
+                            if (!PARAM.inp.deepks_equiv)
+                            {
+                                int ib = 0;
+                                for (int L0 = 0; L0 <= orb.Alpha[0].getLmax(); ++L0)
+                                {
+                                    for (int N0 = 0; N0 < orb.Alpha[0].getNchi(L0); ++N0)
+                                    {
+                                        const int inl = inl_index[T0](I0, L0, N0);
+                                        const int nm = 2 * L0 + 1;
+                                        for (int m1 = 0; m1 < nm; ++m1)
+                                        {
+                                            for (int m2 = 0; m2 < nm; ++m2)
+                                            {
+                                                for (int dim = 0; dim < 3; dim++)
+                                                {
+                                                    nlm[dim]
+                                                        += gedm[inl][m1 * nm + m2]
+                                                           * overlap_1->get_value(row_indexes[iw1], ib + m1)
+                                                           * grad_overlap_2[dim]->get_value(col_indexes[iw2], ib + m2);
+                                                }
+                                            }
+                                        }
+                                        ib += nm;
+                                    }
+                                }
+                                assert(ib == overlap_1->get_col_size());
+                            }
+                            else
+                            {
+                                int nproj = 0;
+                                for (int il = 0; il < lmaxd + 1; il++)
+                                {
+                                    nproj += (2 * il + 1) * orb.Alpha[0].getNchi(il);
+                                }
+                                for (int iproj = 0; iproj < nproj; iproj++)
+                                {
+                                    for (int jproj = 0; jproj < nproj; jproj++)
+                                    {
+                                        for (int dim = 0; dim < 3; dim++)
+                                        {
+                                            nlm[dim] += gedm[iat][iproj * nproj + jproj]
+                                                        * overlap_1->get_value(row_indexes[iw1], iproj)
+                                                        * grad_overlap_2[dim]->get_value(col_indexes[iw2], jproj);
+                                        }
+                                    }
+                                }
+                            }
+
+                            // HF term is minus, only one projector for each atom force.
+                            f_delta(iat, 0) -= 2.0 * *dm_current * nlm[0];
+                            f_delta(iat, 1) -= 2.0 * *dm_current * nlm[1];
+                            f_delta(iat, 2) -= 2.0 * *dm_current * nlm[2];
+
+                            // Pulay term is plus, only one projector for each atom force.
+                            f_delta(ibt2, 0) += 2.0 * *dm_current * nlm[0];
+                            f_delta(ibt2, 1) += 2.0 * *dm_current * nlm[1];
+                            f_delta(ibt2, 2) += 2.0 * *dm_current * nlm[2];
+
+                            if (isstress)
+                            {
+                                if (!PARAM.inp.deepks_equiv)
+                                {
+                                    int ib = 0;
+                                    for (int L0 = 0; L0 <= orb.Alpha[0].getLmax(); ++L0)
+                                    {
+                                        for (int N0 = 0; N0 < orb.Alpha[0].getNchi(L0); ++N0)
+                                        {
+                                            const int inl = inl_index[T0](I0, L0, N0);
+                                            const int nm = 2 * L0 + 1;
+
+                                            for (int m1 = 0; m1 < nm; ++m1)
+                                            {
+                                                for (int m2 = 0; m2 < nm; ++m2)
+                                                {
+                                                    for (int dim = 0; dim < 3; ++dim)
+                                                    {
+                                                        nlm_t[dim] += gedm[inl][m1 * nm + m2]
+                                                                      * overlap_2->get_value(col_indexes[iw2], ib + m1)
+                                                                      * grad_overlap_1[dim]->get_value(row_indexes[iw1],
+                                                                                                       ib + m2);
+                                                    }
+                                                }
+                                            }
+                                            ib += nm;
+                                        }
+                                    }
+                                    assert(ib == overlap_2->get_col_size());
+                                }
+                                else
+                                {
+                                    int nproj = 0;
+                                    for (int il = 0; il < lmaxd + 1; il++)
+                                    {
+                                        nproj += (2 * il + 1) * orb.Alpha[0].getNchi(il);
+                                    }
+                                    for (int iproj = 0; iproj < nproj; iproj++)
+                                    {
+                                        for (int jproj = 0; jproj < nproj; jproj++)
+                                        {
+                                            for (int dim = 0; dim < 3; dim++)
+                                            {
+                                                nlm_t[dim] += gedm[iat][iproj * nproj + jproj]
+                                                              * overlap_2->get_value(col_indexes[iw2], iproj)
+                                                              * grad_overlap_1[dim]->get_value(row_indexes[iw1], jproj);
+                                            }
+                                        }
+                                    }
+                                }
+
+                                for (int ipol = 0; ipol < 3; ipol++)
+                                {
+                                    for (int jpol = ipol; jpol < 3; jpol++)
+                                    {
+                                        svnl_dalpha(ipol, jpol)
+                                            += *dm_current * (nlm[ipol] * r0[jpol] + nlm_t[ipol] * r1[jpol]);
+                                    }
+                                }
+                            }
+                            dm_current++;
+                        } // iw2
+                    }     // iw1
+                }         // ad2
+            }             // ad1
+        }                 // end I0
+    }                     // end T0
+
+    if (isstress)
+    {
+        assert(ucell.omega > 0.0);
+        const double weight = ucell.lat0 / ucell.omega;
+        // use upper triangle to make symmetric stress tensor
+        for (int i = 0; i < 3; ++i)
+        {
+            for (int j = 0; j < 3; ++j)
+            {
+                if (j > i)
+                {
+                    svnl_dalpha(j, i) = svnl_dalpha(i, j);
+                }
+                svnl_dalpha(i, j) *= weight;
+            }
+        }
+    }
+
+    return;
+}
+
+// prints forces and stress from DeePKS (LCAO)
+void DeePKS_domain::check_f_delta(const int nat, ModuleBase::matrix& f_delta, ModuleBase::matrix& svnl_dalpha)
+{
+    ModuleBase::TITLE("LCAO_Deepks", "check_F_delta");
+
+    std::ofstream ofs("F_delta.dat");
+    ofs << std::setprecision(10);
+
+    for (int iat = 0; iat < nat; iat++)
+    {
+        ofs << f_delta(iat, 0) << " " << f_delta(iat, 1) << " " << f_delta(iat, 2) << std::endl;
+    }
+
+    std::ofstream ofs1("stress_delta.dat");
+    ofs1 << std::setprecision(10);
+    for (int ipol = 0; ipol < 3; ipol++)
+    {
+        for (int jpol = 0; jpol < 3; jpol++)
+        {
+            ofs1 << svnl_dalpha(ipol, jpol) << " ";
+        }
+        ofs1 << std::endl;
+    }
+    return;
+}
+
+template void DeePKS_domain::cal_f_delta<double>(const std::vector<std::vector<double>>& dm,
+                                                 const UnitCell& ucell,
+                                                 const LCAO_Orbitals& orb,
+                                                 const Grid_Driver& GridD,
+                                                 const Parallel_Orbitals& pv,
+                                                 const int lmaxd,
+                                                 const int nks,
+                                                 const std::vector<ModuleBase::Vector3<double>>& kvec_d,
+                                                 std::vector<hamilt::HContainer<double>*> psialpha,
+                                                 double** gedm,
+                                                 ModuleBase::IntArray* inl_index,
+                                                 ModuleBase::matrix& f_delta,
+                                                 const bool isstress,
+                                                 ModuleBase::matrix& svnl_dalpha);
+
+template void DeePKS_domain::cal_f_delta<std::complex<double>>(const std::vector<std::vector<std::complex<double>>>& dm,
+                                                               const UnitCell& ucell,
+                                                               const LCAO_Orbitals& orb,
+                                                               const Grid_Driver& GridD,
+                                                               const Parallel_Orbitals& pv,
+                                                               const int lmaxd,
+                                                               const int nks,
+                                                               const std::vector<ModuleBase::Vector3<double>>& kvec_d,
+                                                               std::vector<hamilt::HContainer<double>*> psialpha,
+                                                               double** gedm,
+                                                               ModuleBase::IntArray* inl_index,
+                                                               ModuleBase::matrix& f_delta,
+                                                               const bool isstress,
+                                                               ModuleBase::matrix& svnl_dalpha);
+
+#endif
diff --git a/source/module_hamilt_lcao/module_deepks/deepks_force.h b/source/module_hamilt_lcao/module_deepks/deepks_force.h
index 58474b9d56..e6f5b382d0 100644
--- a/source/module_hamilt_lcao/module_deepks/deepks_force.h
+++ b/source/module_hamilt_lcao/module_deepks/deepks_force.h
@@ -20,21 +20,19 @@
 namespace DeePKS_domain
 {
     //------------------------
-    // LCAO_deepks_fgamma.cpp
-    // LCAO_deepks_fk.cpp
+    // LCAO_deepks_force.cpp
     //------------------------
 
     // This file contains subroutines for calculating F_delta,
     // which is defind as sum_mu,nu rho_mu,nu d/dX (<chi_mu|alpha>V(D)<alpha|chi_nu>)
 
-    // There are 3 subroutines in this file:
-    // 1. cal_f_delta_gamma, which is used for gamma point calculation
-    // 2. cal_f_delta_k, which is used for multi-k calculation
-    // 3. check_f_delta, which prints F_delta into F_delta.dat for checking
+    // There are 2 subroutines in this file:
+    // 1. cal_f_delta, which is used for F_delta calculation
+    // 2. check_f_delta, which prints F_delta into F_delta.dat for checking
 
-// for gamma only, pulay and HF terms of force are calculated together
-void cal_f_delta_gamma(
-    const std::vector<std::vector<double>>& dm,
+template <typename TK>
+void cal_f_delta(
+    const std::vector<std::vector<TK>>& dm,
     const UnitCell& ucell,
     const LCAO_Orbitals& orb,
     const Grid_Driver& GridD,
@@ -42,27 +40,7 @@ void cal_f_delta_gamma(
     const int lmaxd,
     const int nks,
     const std::vector<ModuleBase::Vector3<double>>& kvec_d,
-    std::vector<std::vector<std::unordered_map<int, std::vector<std::vector<double>>>>>& nlm_save,
-    double** gedm,
-    ModuleBase::IntArray* inl_index,
-    ModuleBase::matrix& f_delta,
-    const bool isstress,
-    ModuleBase::matrix& svnl_dalpha);
-
-// for multi-k, pulay and HF terms of force are calculated together
-
-typedef std::tuple<int, int, int, int> key_tuple;
-
-void cal_f_delta_k(
-    const std::vector<std::vector<std::complex<double>>>& dm, /**<[in] density matrix*/
-    const UnitCell& ucell,
-    const LCAO_Orbitals& orb,
-    const Grid_Driver& GridD,
-    const Parallel_Orbitals& pv,
-    const int lmaxd,
-    const int nks,
-    const std::vector<ModuleBase::Vector3<double>>& kvec_d,
-    std::vector<std::map<key_tuple, std::unordered_map<int, std::vector<std::vector<double>>>>>& nlm_save_k,
+    std::vector<hamilt::HContainer<double>*> psialpha,
     double** gedm,
     ModuleBase::IntArray* inl_index,
     ModuleBase::matrix& f_delta,
diff --git a/source/module_hamilt_lcao/module_deepks/orbital_precalc.cpp b/source/module_hamilt_lcao/module_deepks/orbital_precalc.cpp
index da8dcdd539..a8fec8f9e9 100644
--- a/source/module_hamilt_lcao/module_deepks/orbital_precalc.cpp
+++ b/source/module_hamilt_lcao/module_deepks/orbital_precalc.cpp
@@ -33,11 +33,11 @@ void LCAO_Deepks::cal_orbital_precalc(const std::vector<std::vector<TH>>& dm_hl,
 
     this->init_orbital_pdm_shell(nks);
 
-    for (int T0 = 0; T0 < ucell.ntype; T0++) 
+    for (int T0 = 0; T0 < ucell.ntype; T0++)
     {
         Atom* atom0 = &ucell.atoms[T0];
 
-        for (int I0 = 0; I0 < atom0->na; I0++) 
+        for (int I0 = 0; I0 < atom0->na; I0++)
         {
             const int iat = ucell.itia2iat(T0, I0);
             const ModuleBase::Vector3<double> tau0 = atom0->tau[I0];
@@ -47,9 +47,9 @@ void LCAO_Deepks::cal_orbital_precalc(const std::vector<std::vector<TH>>& dm_hl,
             std::vector<int> trace_alpha_row;
             std::vector<int> trace_alpha_col;
             int ib = 0;
-            for (int L0 = 0; L0 <= orb.Alpha[0].getLmax(); ++L0) 
+            for (int L0 = 0; L0 <= orb.Alpha[0].getLmax(); ++L0)
             {
-                for (int N0 = 0; N0 < orb.Alpha[0].getNchi(L0); ++N0) 
+                for (int N0 = 0; N0 < orb.Alpha[0].getNchi(L0); ++N0)
                 {
                     const int inl = this->inl_index[T0](I0, L0, N0);
                     const int nm = 2 * L0 + 1;
@@ -67,33 +67,27 @@ void LCAO_Deepks::cal_orbital_precalc(const std::vector<std::vector<TH>>& dm_hl,
             }
             const int trace_alpha_size = trace_alpha_row.size();
 
-            for (int ad1 = 0; ad1 < GridD.getAdjacentNum() + 1; ++ad1) 
+            for (int ad1 = 0; ad1 < GridD.getAdjacentNum() + 1; ++ad1)
             {
                 const int T1 = GridD.getType(ad1);
                 const int I1 = GridD.getNatom(ad1);
                 const int ibt1 = ucell.itia2iat(T1, I1);
-                const ModuleBase::Vector3<double> tau1
-                    = GridD.getAdjacentTau(ad1);
+                const ModuleBase::Vector3<double> tau1 = GridD.getAdjacentTau(ad1);
                 const Atom* atom1 = &ucell.atoms[T1];
                 const int nw1_tot = atom1->nw * PARAM.globalv.npol;
                 const double Rcut_AO1 = orb.Phi[T1].getRcut();
 
                 const double dist1 = (tau1 - tau0).norm() * ucell.lat0;
-                if (dist1 >= Rcut_Alpha + Rcut_AO1) 
+                if (dist1 >= Rcut_Alpha + Rcut_AO1)
                 {
                     continue;
                 }
 
-                ModuleBase::Vector3<double> dR1(GridD.getBox(ad1).x,
-                                                GridD.getBox(ad1).y,
-                                                GridD.getBox(ad1).z);
-
-                key_tuple key_1(ibt1, dR1.x, dR1.y, dR1.z);
+                ModuleBase::Vector3<int> dR1(GridD.getBox(ad1).x, GridD.getBox(ad1).y, GridD.getBox(ad1).z);
 
                 if constexpr (std::is_same<TK, std::complex<double>>::value)
                 {
-                    if (this->nlm_save_k[iat].find(key_1)
-                        == this->nlm_save_k[iat].end()) 
+                    if (this->psialpha[0]->find_matrix(iat, ibt1, dR1.x, dR1.y, dR1.z) == nullptr)
                     {
                         continue;
                     }
@@ -103,67 +97,53 @@ void LCAO_Deepks::cal_orbital_precalc(const std::vector<std::vector<TH>>& dm_hl,
 
                 const int row_size = row_indexes.size();
 
-                if (row_size == 0) 
+                if (row_size == 0)
                 {
                     continue;
                 }
 
                 std::vector<double> s_1t(trace_alpha_size * row_size);
 
-				std::vector<double> g_1dmt(nks * trace_alpha_size * row_size, 0.0);
+                std::vector<double> g_1dmt(nks * trace_alpha_size * row_size, 0.0);
 
-				for (int irow = 0; irow < row_size; irow++) 
-				{
-                    double* row_ptr;
-                    if constexpr (std::is_same<TK, double>::value)
-                    {
-                        row_ptr = this->nlm_save[iat][ad1][row_indexes[irow]][0].data();
-                    }
-                    else
-                    {
-                        row_ptr = this->nlm_save_k[iat][key_1][row_indexes[irow]][0].data();
-                    }
-                    for (int i = 0; i < trace_alpha_size; i++) 
+                for (int irow = 0; irow < row_size; irow++)
+                {
+                    hamilt::BaseMatrix<double>* row_ptr = this->psialpha[0]->find_matrix(iat, ibt1, dR1);
+                    for (int i = 0; i < trace_alpha_size; i++)
                     {
-                        s_1t[i * row_size + irow] = row_ptr[trace_alpha_row[i]];
+                        s_1t[i * row_size + irow] = row_ptr->get_value(row_indexes[irow], trace_alpha_row[i]);
                     }
                 }
 
-                for (int ad2 = 0; ad2 < GridD.getAdjacentNum() + 1; ad2++) 
+                for (int ad2 = 0; ad2 < GridD.getAdjacentNum() + 1; ad2++)
                 {
                     const int T2 = GridD.getType(ad2);
                     const int I2 = GridD.getNatom(ad2);
                     const int ibt2 = ucell.itia2iat(T2, I2);
                     if constexpr (std::is_same<TK, std::complex<double>>::value) // Why only for multi-k?
                     {
-                        if (ibt1 > ibt2) 
+                        if (ibt1 > ibt2)
                         {
                             continue;
                         }
                     }
-                    const ModuleBase::Vector3<double> tau2
-                        = GridD.getAdjacentTau(ad2);
+                    const ModuleBase::Vector3<double> tau2 = GridD.getAdjacentTau(ad2);
                     const Atom* atom2 = &ucell.atoms[T2];
                     const int nw2_tot = atom2->nw * PARAM.globalv.npol;
 
                     const double Rcut_AO2 = orb.Phi[T2].getRcut();
                     const double dist2 = (tau2 - tau0).norm() * ucell.lat0;
 
-                    if (dist2 >= Rcut_Alpha + Rcut_AO2) 
+                    if (dist2 >= Rcut_Alpha + Rcut_AO2)
                     {
                         continue;
                     }
 
-                    ModuleBase::Vector3<double> dR2(GridD.getBox(ad2).x,
-                                                    GridD.getBox(ad2).y,
-                                                    GridD.getBox(ad2).z);
+                    ModuleBase::Vector3<int> dR2(GridD.getBox(ad2).x, GridD.getBox(ad2).y, GridD.getBox(ad2).z);
 
-                    key_tuple key_2(ibt2, dR2.x, dR2.y, dR2.z);
-                    
                     if constexpr (std::is_same<TK, std::complex<double>>::value)
                     {
-                        if (this->nlm_save_k[iat].find(key_2)
-                                == this->nlm_save_k[iat].end()) 
+                        if (this->psialpha[0]->find_matrix(iat, ibt2, dR2.x, dR2.y, dR2.z) == nullptr)
                         {
                             continue;
                         }
@@ -171,27 +151,18 @@ void LCAO_Deepks::cal_orbital_precalc(const std::vector<std::vector<TH>>& dm_hl,
 
                     auto col_indexes = pv->get_indexes_col(ibt2);
                     const int col_size = col_indexes.size();
-					if (col_size == 0) 
-					{
-						continue;
-					}
+                    if (col_size == 0)
+                    {
+                        continue;
+                    }
 
                     std::vector<double> s_2t(trace_alpha_size * col_size);
-                    for (int icol = 0; icol < col_size; icol++) 
+                    for (int icol = 0; icol < col_size; icol++)
                     {
-                        double* col_ptr;
-                        if constexpr (std::is_same<TK, double>::value)
-                        {
-                           col_ptr = this->nlm_save[iat][ad2][col_indexes[icol]][0].data();
-                        }
-                        else
-                        {
-                            col_ptr = this->nlm_save_k[iat][key_2][col_indexes[icol]][0].data();
-                        }
-                        for (int i = 0; i < trace_alpha_size; i++) 
+                        hamilt::BaseMatrix<double>* col_ptr = this->psialpha[0]->find_matrix(iat, ibt2, dR2);
+                        for (int i = 0; i < trace_alpha_size; i++)
                         {
-                            s_2t[i * col_size + icol]
-                                = col_ptr[trace_alpha_col[i]];
+                            s_2t[i * col_size + icol] = col_ptr->get_value(col_indexes[icol], trace_alpha_col[i]);
                         }
                     }
 
@@ -201,69 +172,51 @@ void LCAO_Deepks::cal_orbital_precalc(const std::vector<std::vector<TH>>& dm_hl,
 
                     if constexpr (std::is_same<TK, double>::value)
                     {
-                        for (int is = 0; is < PARAM.inp.nspin; is++) 
+                        for (int is = 0; is < PARAM.inp.nspin; is++)
                         {
-                            hamilt::AtomPair<double> dm_pair(ibt1,
-                                                            ibt2,
-                                                            0,
-                                                            0,
-                                                            0,
-                                                            pv);
+                            hamilt::AtomPair<double> dm_pair(ibt1, ibt2, 0, 0, 0, pv);
 
                             dm_pair.allocate(dm_array.data(), 0);
 
-                            if (ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER(PARAM.inp.ks_solver)) 
+                            if (ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER(PARAM.inp.ks_solver))
                             {
-                                dm_pair.add_from_matrix(dm_hl[0][is].c,
-                                                        pv->get_row_size(),
-                                                        1.0,
-                                                        1);
-                            } 
-                            else 
+                                dm_pair.add_from_matrix(dm_hl[0][is].c, pv->get_row_size(), 1.0, 1);
+                            }
+                            else
                             {
-                                dm_pair.add_from_matrix(dm_hl[0][is].c,
-                                                        pv->get_col_size(),
-                                                        1.0,
-                                                        0);
+                                dm_pair.add_from_matrix(dm_hl[0][is].c, pv->get_col_size(), 1.0, 0);
                             }
                         } // is
                     }
                     else
                     {
-                        for (int ik = 0; ik < nks; ik++) 
+                        for (int ik = 0; ik < nks; ik++)
                         {
                             hamilt::AtomPair<double> dm_pair(ibt1,
-                                                            ibt2,
-                                                            (dR2 - dR1).x,
-                                                            (dR2 - dR1).y,
-                                                            (dR2 - dR1).z,
-                                                            pv);
+                                                             ibt2,
+                                                             (dR2 - dR1).x,
+                                                             (dR2 - dR1).y,
+                                                             (dR2 - dR1).z,
+                                                             pv);
 
                             dm_pair.allocate(&dm_array[ik * row_size * col_size], 0);
 
                             const double arg
-                                = -(kvec_d[ik] * (dR2 - dR1)) * ModuleBase::TWO_PI;
+                                = -(kvec_d[ik] * ModuleBase::Vector3<double>(dR2 - dR1)) * ModuleBase::TWO_PI;
 
                             double sinp, cosp;
 
                             ModuleBase::libm::sincos(arg, &sinp, &cosp);
 
-                            const std::complex<double> kphase
-                                = std::complex<double>(cosp, sinp);
+                            const std::complex<double> kphase = std::complex<double>(cosp, sinp);
 
-                            if (ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER(PARAM.inp.ks_solver)) 
+                            if (ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER(PARAM.inp.ks_solver))
                             {
-                                dm_pair.add_from_matrix(dm_hl[0][ik].c,
-                                        pv->get_row_size(),
-                                        kphase,
-                                        1);
-                            } 
-                            else 
+                                dm_pair.add_from_matrix(dm_hl[0][ik].c, pv->get_row_size(), kphase, 1);
+                            }
+                            else
                             {
-                                dm_pair.add_from_matrix(dm_hl[0][ik].c,
-                                        pv->get_col_size(),
-                                        kphase,
-                                        0);
+                                dm_pair.add_from_matrix(dm_hl[0][ik].c, pv->get_col_size(), kphase, 0);
                             }
                         } // ik
                     }
@@ -272,9 +225,9 @@ void LCAO_Deepks::cal_orbital_precalc(const std::vector<std::vector<TH>>& dm_hl,
                     constexpr char transa = 'T', transb = 'N';
                     double gemm_alpha = 1.0, gemm_beta = 1.0;
 
-                    if constexpr (std::is_same<TK, std::complex<double>>::value) 
+                    if constexpr (std::is_same<TK, std::complex<double>>::value)
                     {
-                        if (ibt1 < ibt2) 
+                        if (ibt1 < ibt2)
                         {
                             gemm_alpha = 2.0;
                         }
@@ -302,9 +255,9 @@ void LCAO_Deepks::cal_orbital_precalc(const std::vector<std::vector<TH>>& dm_hl,
 
                     int ib = 0, index = 0, inc = 1;
 
-                    for (int L0 = 0; L0 <= orb.Alpha[0].getLmax(); ++L0) 
+                    for (int L0 = 0; L0 <= orb.Alpha[0].getLmax(); ++L0)
                     {
-                        for (int N0 = 0; N0 < orb.Alpha[0].getNchi(L0); ++N0) 
+                        for (int N0 = 0; N0 < orb.Alpha[0].getNchi(L0); ++N0)
                         {
                             const int inl = this->inl_index[T0](I0, L0, N0);
                             const int nm = 2 * L0 + 1;
@@ -315,10 +268,10 @@ void LCAO_Deepks::cal_orbital_precalc(const std::vector<std::vector<TH>>& dm_hl,
                                 {
                                     orbital_pdm_shell[ik][0][inl][m1 * nm + m2]
                                         += ddot_(&row_size,
-                                                p_g1dmt + index * row_size * nks,
-                                                &inc,
-                                                s_1t.data() + index * row_size,
-                                                &inc);
+                                                 p_g1dmt + index * row_size * nks,
+                                                 &inc,
+                                                 s_1t.data() + index * row_size,
+                                                 &inc);
                                     index++;
                                 }
                             }
@@ -330,11 +283,11 @@ void LCAO_Deepks::cal_orbital_precalc(const std::vector<std::vector<TH>>& dm_hl,
         }
     }
 #ifdef __MPI
-    for (int iks = 0; iks < nks; iks++) 
+    for (int iks = 0; iks < nks; iks++)
     {
         for (int hl = 0; hl < 1; hl++)
         {
-            for (int inl = 0; inl < this->inlmax; inl++) 
+            for (int inl = 0; inl < this->inlmax; inl++)
             {
                 Parallel_Reduce::reduce_all(this->orbital_pdm_shell[iks][hl][inl],
                                             (2 * this->lmaxd + 1) * (2 * this->lmaxd + 1));
@@ -348,16 +301,16 @@ void LCAO_Deepks::cal_orbital_precalc(const std::vector<std::vector<TH>>& dm_hl,
     int nlmax = this->inlmax / nat;
 
     std::vector<torch::Tensor> orbital_pdm_shell_vector;
-    for (int nl = 0; nl < nlmax; ++nl) 
+    for (int nl = 0; nl < nlmax; ++nl)
     {
         std::vector<torch::Tensor> kiammv;
-        for (int iks = 0; iks < nks; ++iks) 
+        for (int iks = 0; iks < nks; ++iks)
         {
             std::vector<torch::Tensor> iammv;
             for (int hl = 0; hl < 1; ++hl)
             {
                 std::vector<torch::Tensor> ammv;
-                for (int iat = 0; iat < nat; ++iat) 
+                for (int iat = 0; iat < nat; ++iat)
                 {
                     int inl = iat * nlmax + nl;
                     int nm = 2 * this->inl_l[inl] + 1;
@@ -367,12 +320,11 @@ void LCAO_Deepks::cal_orbital_precalc(const std::vector<std::vector<TH>>& dm_hl,
                     {
                         for (int m2 = 0; m2 < nm; ++m2) // m1 = 1 for s, 3 for p, 5 for d
                         {
-                            mmv.push_back(
-                                this->orbital_pdm_shell[iks][hl][inl][m1 * nm + m2]);
+                            mmv.push_back(this->orbital_pdm_shell[iks][hl][inl][m1 * nm + m2]);
                         }
                     }
-                    torch::Tensor mm = torch::tensor(mmv,
-                        torch::TensorOptions().dtype(torch::kFloat64)).reshape({nm, nm}); // nm*nm
+                    torch::Tensor mm
+                        = torch::tensor(mmv, torch::TensorOptions().dtype(torch::kFloat64)).reshape({nm, nm}); // nm*nm
 
                     ammv.push_back(mm);
                 }
@@ -390,11 +342,10 @@ void LCAO_Deepks::cal_orbital_precalc(const std::vector<std::vector<TH>>& dm_hl,
 
     // einsum for each nl:
     std::vector<torch::Tensor> orbital_precalc_vector;
-    for (int nl = 0; nl < nlmax; ++nl) 
+    for (int nl = 0; nl < nlmax; ++nl)
     {
         orbital_precalc_vector.push_back(
-            at::einsum("kiamn, avmn->kiav",
-                       {orbital_pdm_shell_vector[nl], this->gevdm_vector[nl]}));
+            at::einsum("kiamn, avmn->kiav", {orbital_pdm_shell_vector[nl], this->gevdm_vector[nl]}));
     }
 
     this->orbital_precalc_tensor = torch::cat(orbital_precalc_vector, -1);
diff --git a/source/module_hamilt_lcao/module_deepks/test/LCAO_deepks_test.cpp b/source/module_hamilt_lcao/module_deepks/test/LCAO_deepks_test.cpp
index ae8384057c..77bcc9a6cf 100644
--- a/source/module_hamilt_lcao/module_deepks/test/LCAO_deepks_test.cpp
+++ b/source/module_hamilt_lcao/module_deepks/test/LCAO_deepks_test.cpp
@@ -126,11 +126,11 @@ void test_deepks::check_gdmx()
     this->ld.init_gdmx(ucell.nat);
     if (PARAM.sys.gamma_only_local)
     {
-        this->ld.cal_gdmx(dm_new, ucell, ORB, Test_Deepks::GridD, kv.get_nkstot(), kv.kvec_d, 0);
+        this->ld.cal_gdmx(dm_new, ucell, ORB, Test_Deepks::GridD, kv.get_nkstot(), kv.kvec_d, this->ld.psialpha, 0);
     }
     else
     {
-        this->ld.cal_gdmx(dm_k_new, ucell, ORB, Test_Deepks::GridD, kv.get_nkstot(), kv.kvec_d, 0);
+        this->ld.cal_gdmx(dm_k_new, ucell, ORB, Test_Deepks::GridD, kv.get_nkstot(), kv.kvec_d, this->ld.psialpha, 0);
     }
     this->ld.check_gdmx(ucell.nat);
 
@@ -241,7 +241,7 @@ void test_deepks::check_f_delta()
     svnl_dalpha.create(3, 3);
     if (PARAM.sys.gamma_only_local)
     {
-        const int nks=1;
+        const int nks = 1;
         ld.cal_f_delta_gamma(dm_new, ucell, ORB, Test_Deepks::GridD, nks, kv.kvec_d, 1, svnl_dalpha);
     }
     else
diff --git a/source/module_hamilt_lcao/module_deepks/v_delta_precalc.cpp b/source/module_hamilt_lcao/module_deepks/v_delta_precalc.cpp
index 6bb4c0ac6a..5ffe41a22d 100644
--- a/source/module_hamilt_lcao/module_deepks/v_delta_precalc.cpp
+++ b/source/module_hamilt_lcao/module_deepks/v_delta_precalc.cpp
@@ -14,7 +14,6 @@
 #include "module_hamilt_lcao/module_hcontainer/atom_pair.h"
 #include "module_parameter/parameter.h"
 
-
 // calculates v_delta_precalc[nks,nlocal,nlocal,NAt,NDscrpt] = gvdm * v_delta_pdm_shell;
 // v_delta_pdm_shell[nks,nlocal,nlocal,Inl,nm*nm] = overlap * overlap;
 // for deepks_v_delta = 1
@@ -33,266 +32,263 @@ void LCAO_Deepks::cal_v_delta_precalc(const int nlocal,
 
     this->cal_gvdm(nat);
     const double Rcut_Alpha = orb.Alpha[0].getRcut();
-    this->init_v_delta_pdm_shell(nks,nlocal); 
-   
+    this->init_v_delta_pdm_shell(nks, nlocal);
+
     for (int T0 = 0; T0 < ucell.ntype; T0++)
     {
-		Atom* atom0 = &ucell.atoms[T0]; 
-        
-        for (int I0 =0; I0< atom0->na; I0++)
+        Atom* atom0 = &ucell.atoms[T0];
+
+        for (int I0 = 0; I0 < atom0->na; I0++)
         {
-            const int iat = ucell.itia2iat(T0,I0);
+            const int iat = ucell.itia2iat(T0, I0);
             const ModuleBase::Vector3<double> tau0 = atom0->tau[I0];
-            GridD.Find_atom(ucell, atom0->tau[I0] ,T0, I0);
+            GridD.Find_atom(ucell, atom0->tau[I0], T0, I0);
 
-            for (int ad1=0; ad1<GridD.getAdjacentNum()+1 ; ++ad1)
+            for (int ad1 = 0; ad1 < GridD.getAdjacentNum() + 1; ++ad1)
             {
                 const int T1 = GridD.getType(ad1);
                 const int I1 = GridD.getNatom(ad1);
-                const int ibt1 = ucell.itia2iat(T1, I1); 
+                const int ibt1 = ucell.itia2iat(T1, I1);
                 const int start1 = ucell.itiaiw2iwt(T1, I1, 0);
                 const ModuleBase::Vector3<double> tau1 = GridD.getAdjacentTau(ad1);
-				const Atom* atom1 = &ucell.atoms[T1];
-				const int nw1_tot = atom1->nw*PARAM.globalv.npol;
-				const double Rcut_AO1 = orb.Phi[T1].getRcut(); 
+                const Atom* atom1 = &ucell.atoms[T1];
+                const int nw1_tot = atom1->nw * PARAM.globalv.npol;
+                const double Rcut_AO1 = orb.Phi[T1].getRcut();
 
-                const double dist1 = (tau1-tau0).norm() * ucell.lat0;
+                const double dist1 = (tau1 - tau0).norm() * ucell.lat0;
                 if (dist1 >= Rcut_Alpha + Rcut_AO1)
                 {
                     continue;
                 }
-                
-                ModuleBase::Vector3<double> dR1(GridD.getBox(ad1).x,
-                                            GridD.getBox(ad1).y,
-                                            GridD.getBox(ad1).z);
 
-                key_tuple key_1(ibt1, dR1.x, dR1.y, dR1.z); 
+                ModuleBase::Vector3<int> dR1(GridD.getBox(ad1).x, GridD.getBox(ad1).y, GridD.getBox(ad1).z);
 
                 if constexpr (std::is_same<TK, std::complex<double>>::value)
                 {
-                    if (this->nlm_save_k[iat].find(key_1)
-                        == this->nlm_save_k[iat].end()) 
+                    if (this->psialpha[0]->find_matrix(iat, ibt1, dR1.x, dR1.y, dR1.z) == nullptr)
                     {
                         continue;
                     }
                 }
 
-				for (int ad2=0; ad2 < GridD.getAdjacentNum()+1 ; ad2++)
-				{
-					const int T2 = GridD.getType(ad2);
-					const int I2 = GridD.getNatom(ad2);
+                for (int ad2 = 0; ad2 < GridD.getAdjacentNum() + 1; ad2++)
+                {
+                    const int T2 = GridD.getType(ad2);
+                    const int I2 = GridD.getNatom(ad2);
                     const int ibt2 = ucell.itia2iat(T2, I2);
-					const int start2 = ucell.itiaiw2iwt(T2, I2, 0);
-					const ModuleBase::Vector3<double> tau2 = GridD.getAdjacentTau(ad2);
-					const Atom* atom2 = &ucell.atoms[T2];
-					const int nw2_tot = atom2->nw*PARAM.globalv.npol;
-					
-					const double Rcut_AO2 = orb.Phi[T2].getRcut();
-                	const double dist2 = (tau2-tau0).norm() * ucell.lat0;
-
-					if (dist2 >= Rcut_Alpha + Rcut_AO2)
-					{
-						continue;
-					}
-
-                    ModuleBase::Vector3<double> dR2(GridD.getBox(ad2).x,
-                                                        GridD.getBox(ad2).y,
-                                                        GridD.getBox(ad2).z);
-                    key_tuple key_2(ibt2, dR2.x, dR2.y, dR2.z);
+                    const int start2 = ucell.itiaiw2iwt(T2, I2, 0);
+                    const ModuleBase::Vector3<double> tau2 = GridD.getAdjacentTau(ad2);
+                    const Atom* atom2 = &ucell.atoms[T2];
+                    const int nw2_tot = atom2->nw * PARAM.globalv.npol;
+
+                    const double Rcut_AO2 = orb.Phi[T2].getRcut();
+                    const double dist2 = (tau2 - tau0).norm() * ucell.lat0;
+
+                    if (dist2 >= Rcut_Alpha + Rcut_AO2)
+                    {
+                        continue;
+                    }
+
+                    ModuleBase::Vector3<int> dR2(GridD.getBox(ad2).x, GridD.getBox(ad2).y, GridD.getBox(ad2).z);
 
                     if constexpr (std::is_same<TK, std::complex<double>>::value)
                     {
-                        if (this->nlm_save_k[iat].find(key_2)
-                            == this->nlm_save_k[iat].end()) 
+                        if (this->psialpha[0]->find_matrix(iat, ibt2, dR2.x, dR2.y, dR2.z) == nullptr)
                         {
                             continue;
                         }
                     }
 
-					for (int iw1=0; iw1<nw1_tot; ++iw1)
-					{
-						const int iw1_all = start1 + iw1; // this is \mu
-						const int iw1_local = pv->global2local_row(iw1_all);
-						if(iw1_local < 0) {continue;}
-						const int iw1_0 = iw1/PARAM.globalv.npol;
-						for (int iw2=0; iw2<nw2_tot; ++iw2)
-						{
-							const int iw2_all = start2 + iw2; // this is \nu
-							const int iw2_local = pv->global2local_col(iw2_all);
-							if(iw2_local < 0) {continue;}
-							const int iw2_0 = iw2/PARAM.globalv.npol;
-
-                            std::vector<double> nlm1;
-                            std::vector<double> nlm2;
-                            if constexpr (std::is_same<TK, double>::value)
-                            {
-                                nlm1 = this->nlm_save[iat][ad1][iw1][0];
-                                nlm2 = this->nlm_save[iat][ad2][iw2][0];
-                            }
-                            else
+                    for (int iw1 = 0; iw1 < nw1_tot; ++iw1)
+                    {
+                        const int iw1_all = start1 + iw1; // this is \mu
+                        const int iw1_local = pv->global2local_row(iw1_all);
+                        if (iw1_local < 0)
+                        {
+                            continue;
+                        }
+                        for (int iw2 = 0; iw2 < nw2_tot; ++iw2)
+                        {
+                            const int iw2_all = start2 + iw2; // this is \nu
+                            const int iw2_local = pv->global2local_col(iw2_all);
+                            if (iw2_local < 0)
                             {
-                                nlm1 = this->nlm_save_k[iat][key_1][iw1][0];
-                                nlm2 = this->nlm_save_k[iat][key_2][iw2][0];
+                                continue;
                             }
 
-                            assert(nlm1.size()==nlm2.size());
+                            hamilt::BaseMatrix<double>* overlap_1 = psialpha[0]->find_matrix(iat, ibt1, dR1);
+                            hamilt::BaseMatrix<double>* overlap_2 = psialpha[0]->find_matrix(iat, ibt2, dR2);
+                            assert(overlap_1->get_col_size() == overlap_2->get_col_size());
+
                             for (int ik = 0; ik < nks; ik++)
                             {
-                                int ib=0;
+                                int ib = 0;
                                 std::complex<double> kphase = std::complex<double>(1.0, 0.0);
                                 if constexpr (std::is_same<TK, std::complex<double>>::value)
                                 {
-                                    const double arg = - (kvec_d[ik] * (dR1-dR2) ) * ModuleBase::TWO_PI;
+                                    const double arg
+                                        = -(kvec_d[ik] * ModuleBase::Vector3<double>(dR1 - dR2)) * ModuleBase::TWO_PI;
                                     double sinp, cosp;
                                     ModuleBase::libm::sincos(arg, &sinp, &cosp);
                                     kphase = std::complex<double>(cosp, sinp);
                                 }
-                                for (int L0 = 0; L0 <= orb.Alpha[0].getLmax();++L0)
+                                for (int L0 = 0; L0 <= orb.Alpha[0].getLmax(); ++L0)
                                 {
-                                    for (int N0 = 0;N0 < orb.Alpha[0].getNchi(L0);++N0)
+                                    for (int N0 = 0; N0 < orb.Alpha[0].getNchi(L0); ++N0)
                                     {
                                         const int inl = this->inl_index[T0](I0, L0, N0);
-                                        const int nm = 2*L0+1;
-                                        
-                                        for (int m1=0; m1<nm; ++m1) // nm = 1 for s, 3 for p, 5 for d
+                                        const int nm = 2 * L0 + 1;
+
+                                        for (int m1 = 0; m1 < nm; ++m1) // nm = 1 for s, 3 for p, 5 for d
                                         {
-                                            for (int m2=0; m2<nm; ++m2) // nm = 1 for s, 3 for p, 5 for d
+                                            for (int m2 = 0; m2 < nm; ++m2) // nm = 1 for s, 3 for p, 5 for d
                                             {
                                                 if constexpr (std::is_same<TK, double>::value)
                                                 {
-                                                    this->v_delta_pdm_shell[ik][iw1_all][iw2_all][inl][m1*nm+m2] += nlm1[ib+m1]*nlm2[ib+m2];
+                                                    this->v_delta_pdm_shell[ik][iw1_all][iw2_all][inl][m1 * nm + m2]
+                                                        += overlap_1->get_value(iw1, ib + m1)
+                                                           * overlap_2->get_value(iw2, ib + m2);
                                                 }
                                                 else
                                                 {
-                                                    this->v_delta_pdm_shell_complex[ik][iw1_all][iw2_all][inl][m1*nm+m2] += nlm1[ib+m1]*nlm2[ib+m2]*kphase;
+                                                    this->v_delta_pdm_shell_complex[ik][iw1_all][iw2_all][inl]
+                                                                                   [m1 * nm + m2]
+                                                        += overlap_1->get_value(iw1, ib + m1)
+                                                           * overlap_2->get_value(iw2, ib + m2) * kphase;
                                                 }
                                             }
                                         }
-                                        ib+=nm;
+                                        ib += nm;
                                     }
-                                }  
-                            } //ik                         
-						}//iw2
-					}//iw1
-				}//ad2
-			}//ad1   
-            
+                                }
+                            } // ik
+                        }     // iw2
+                    }         // iw1
+                }             // ad2
+            }                 // ad1
         }
     }
 #ifdef __MPI
-    const int mn_size=(2 * this->lmaxd + 1) * (2 * this->lmaxd + 1);
+    const int mn_size = (2 * this->lmaxd + 1) * (2 * this->lmaxd + 1);
     for (int ik = 0; ik < nks; ik++)
     {
-        for(int inl = 0; inl < this->inlmax; inl++)
+        for (int inl = 0; inl < this->inlmax; inl++)
         {
-            for(int mu = 0; mu < nlocal ; mu++)
+            for (int mu = 0; mu < nlocal; mu++)
             {
-                for(int nu=0; nu< nlocal ; nu++)
+                for (int nu = 0; nu < nlocal; nu++)
                 {
                     if constexpr (std::is_same<TK, double>::value)
                     {
-                        Parallel_Reduce::reduce_all(this->v_delta_pdm_shell[ik][mu][nu][inl],mn_size);
+                        Parallel_Reduce::reduce_all(this->v_delta_pdm_shell[ik][mu][nu][inl], mn_size);
                     }
                     else
                     {
-                        Parallel_Reduce::reduce_all(this->v_delta_pdm_shell_complex[ik][mu][nu][inl],mn_size);
+                        Parallel_Reduce::reduce_all(this->v_delta_pdm_shell_complex[ik][mu][nu][inl], mn_size);
                     }
                 }
             }
         }
     }
-#endif    
-    
+#endif
+
     // transfer v_delta_pdm_shell to v_delta_pdm_shell_vector
-    
-    int nlmax = this->inlmax/nat;
-   
+
+    int nlmax = this->inlmax / nat;
+
     std::vector<torch::Tensor> v_delta_pdm_shell_vector;
-    for(int nl = 0; nl < nlmax; ++nl)
+    for (int nl = 0; nl < nlmax; ++nl)
     {
         std::vector<torch::Tensor> kuuammv;
-        for(int iks = 0; iks < nks; ++iks)
+        for (int iks = 0; iks < nks; ++iks)
         {
             std::vector<torch::Tensor> uuammv;
-            for(int mu = 0; mu < nlocal; ++mu)
+            for (int mu = 0; mu < nlocal; ++mu)
             {
                 std::vector<torch::Tensor> uammv;
-                for(int nu =0 ; nu < nlocal; ++nu)
+                for (int nu = 0; nu < nlocal; ++nu)
                 {
                     std::vector<torch::Tensor> ammv;
-                    for (int iat=0; iat<nat; ++iat)
+                    for (int iat = 0; iat < nat; ++iat)
                     {
-                        int inl = iat*nlmax+nl;
-                        int nm = 2*this->inl_l[inl]+1;
+                        int inl = iat * nlmax + nl;
+                        int nm = 2 * this->inl_l[inl] + 1;
                         std::vector<TK> mmv;
-                    
-                        for (int m1=0; m1<nm; ++m1) // m1 = 1 for s, 3 for p, 5 for d
+
+                        for (int m1 = 0; m1 < nm; ++m1) // m1 = 1 for s, 3 for p, 5 for d
                         {
-                            for (int m2=0; m2<nm; ++m2) // m1 = 1 for s, 3 for p, 5 for d
+                            for (int m2 = 0; m2 < nm; ++m2) // m1 = 1 for s, 3 for p, 5 for d
                             {
                                 if constexpr (std::is_same<TK, double>::value)
                                 {
-                                    mmv.push_back(this->v_delta_pdm_shell[iks][mu][nu][inl][m1*nm+m2]);
+                                    mmv.push_back(this->v_delta_pdm_shell[iks][mu][nu][inl][m1 * nm + m2]);
                                 }
                                 else
                                 {
-                                    mmv.push_back(this->v_delta_pdm_shell_complex[iks][mu][nu][inl][m1*nm+m2]);
+                                    mmv.push_back(this->v_delta_pdm_shell_complex[iks][mu][nu][inl][m1 * nm + m2]);
                                 }
                             }
                         }
                         if constexpr (std::is_same<TK, double>::value)
                         {
-                            torch::Tensor mm = torch::tensor(mmv, torch::TensorOptions().dtype(torch::kFloat64) ).reshape({nm, nm});    //nm*nm
+                            torch::Tensor mm = torch::tensor(mmv, torch::TensorOptions().dtype(torch::kFloat64))
+                                                   .reshape({nm, nm}); // nm*nm
                             ammv.push_back(mm);
                         }
                         else
                         {
-                            torch::Tensor mm = torch::from_blob(mmv.data(), {nm, nm}, torch::TensorOptions().dtype(torch::kComplexDouble)).clone();    //nm*nm
+                            torch::Tensor mm = torch::from_blob(mmv.data(),
+                                                                {nm, nm},
+                                                                torch::TensorOptions().dtype(torch::kComplexDouble))
+                                                   .clone(); // nm*nm
                             ammv.push_back(mm);
                         }
                     }
-                    torch::Tensor amm = torch::stack(ammv, 0); 
-                    uammv.push_back(amm);                    
+                    torch::Tensor amm = torch::stack(ammv, 0);
+                    uammv.push_back(amm);
                 }
-                torch::Tensor uamm = torch::stack(uammv, 0); 
+                torch::Tensor uamm = torch::stack(uammv, 0);
                 uuammv.push_back(uamm);
             }
-            torch::Tensor uuamm = torch::stack(uuammv, 0); 
+            torch::Tensor uuamm = torch::stack(uuammv, 0);
             kuuammv.push_back(uuamm);
         }
-        torch::Tensor kuuamm = torch::stack(kuuammv, 0);  
+        torch::Tensor kuuamm = torch::stack(kuuammv, 0);
         v_delta_pdm_shell_vector.push_back(kuuamm);
     }
-       
+
     assert(v_delta_pdm_shell_vector.size() == nlmax);
-    
-    //einsum for each nl: 
+
+    // einsum for each nl:
     std::vector<torch::Tensor> v_delta_precalc_vector;
-    for (int nl = 0; nl<nlmax; ++nl)
-    {   
+    for (int nl = 0; nl < nlmax; ++nl)
+    {
         if constexpr (std::is_same<TK, double>::value)
         {
-            v_delta_precalc_vector.push_back(at::einsum("kxyamn, avmn->kxyav", {v_delta_pdm_shell_vector[nl], this->gevdm_vector[nl]}));
+            v_delta_precalc_vector.push_back(
+                at::einsum("kxyamn, avmn->kxyav", {v_delta_pdm_shell_vector[nl], this->gevdm_vector[nl]}));
         }
         else
         {
             torch::Tensor gevdm_vector_complex = this->gevdm_vector[nl].to(torch::kComplexDouble);
-            v_delta_precalc_vector.push_back(at::einsum("kxyamn, avmn->kxyav", {v_delta_pdm_shell_vector[nl], gevdm_vector_complex}));
+            v_delta_precalc_vector.push_back(
+                at::einsum("kxyamn, avmn->kxyav", {v_delta_pdm_shell_vector[nl], gevdm_vector_complex}));
         }
     }
 
     this->v_delta_precalc_tensor = torch::cat(v_delta_precalc_vector, -1);
-    this->del_v_delta_pdm_shell(nks,nlocal);
+    this->del_v_delta_pdm_shell(nks, nlocal);
 
-    //check_v_delta_precalc(nlocal,nat);
-    // timeval t_end;
-    // gettimeofday(&t_end,NULL);
-    // std::cout<<"calculate v_delta_precalc time:\t"<<(double)(t_end.tv_sec-t_start.tv_sec) + (double)(t_end.tv_usec-t_start.tv_usec)/1000000.0<<std::endl;
+    // check_v_delta_precalc(nlocal,nat);
+    //  timeval t_end;
+    //  gettimeofday(&t_end,NULL);
+    //  std::cout<<"calculate v_delta_precalc time:\t"<<(double)(t_end.tv_sec-t_start.tv_sec) +
+    //  (double)(t_end.tv_usec-t_start.tv_usec)/1000000.0<<std::endl;
     return;
 }
 
 template <typename TK>
-void LCAO_Deepks::check_v_delta_precalc(const int nat, const int nks,const int nlocal)
+void LCAO_Deepks::check_v_delta_precalc(const int nat, const int nks, const int nlocal)
 {
     std::ofstream ofs("v_delta_precalc.dat");
     ofs << std::setprecision(10);
@@ -302,22 +298,24 @@ void LCAO_Deepks::check_v_delta_precalc(const int nat, const int nks,const int n
         {
             for (int nu = 0; nu < nlocal; ++nu)
             {
-                for (int iat = 0;iat < nat;++iat)
+                for (int iat = 0; iat < nat; ++iat)
                 {
-                    for(int p=0; p<this->des_per_atom; ++p)
+                    for (int p = 0; p < this->des_per_atom; ++p)
                     {
                         if constexpr (std::is_same<TK, double>::value)
                         {
-                            ofs << this->v_delta_precalc_tensor.index({iks, mu, nu, iat, p }).item().toDouble() << " " ;
+                            ofs << this->v_delta_precalc_tensor.index({iks, mu, nu, iat, p}).item().toDouble() << " ";
                         }
                         else
                         {
                             auto tensor_value = this->v_delta_precalc_tensor.index({iks, mu, nu, iat, p});
-                            ofs << std::complex<double>(torch::real(tensor_value).item<double>(), torch::imag(tensor_value).item<double>()) << " " ;
+                            ofs << std::complex<double>(torch::real(tensor_value).item<double>(),
+                                                        torch::imag(tensor_value).item<double>())
+                                << " ";
                         }
                     }
                 }
-                ofs << std::endl;                
+                ofs << std::endl;
             }
         }
     }
diff --git a/source/module_hamilt_lcao/module_hcontainer/atom_pair.cpp b/source/module_hamilt_lcao/module_hcontainer/atom_pair.cpp
index d35553a2f8..cef77a34ac 100644
--- a/source/module_hamilt_lcao/module_hcontainer/atom_pair.cpp
+++ b/source/module_hamilt_lcao/module_hcontainer/atom_pair.cpp
@@ -347,6 +347,10 @@ void AtomPair<T>::set_size(const int& col_size_in, const int& row_size_in)
 {
     this->col_size = col_size_in;
     this->row_size = row_size_in;
+    for (int i = 0; i < this->values.size(); i++)
+    {
+        this->values[i].set_size(row_size_in, col_size_in);
+    }
 }
 
 // get paraV for check
diff --git a/source/module_hamilt_lcao/module_hcontainer/base_matrix.cpp b/source/module_hamilt_lcao/module_hcontainer/base_matrix.cpp
index a7531a1245..4b1850801a 100644
--- a/source/module_hamilt_lcao/module_hcontainer/base_matrix.cpp
+++ b/source/module_hamilt_lcao/module_hcontainer/base_matrix.cpp
@@ -216,6 +216,14 @@ size_t BaseMatrix<T>::get_memory_size() const
     return memory_size;
 }
 
+// set size
+template <typename T>
+void BaseMatrix<T>::set_size(const int& nrow, const int& ncol)
+{
+    this->nrow_local = nrow;
+    this->ncol_local = ncol;
+}
+
 // T of BaseMatrix can be double or complex<double>
 template class BaseMatrix<double>;
 template class BaseMatrix<std::complex<double>>;
diff --git a/source/module_hamilt_lcao/module_hcontainer/base_matrix.h b/source/module_hamilt_lcao/module_hcontainer/base_matrix.h
index 227666ebc3..bd47d49d79 100644
--- a/source/module_hamilt_lcao/module_hcontainer/base_matrix.h
+++ b/source/module_hamilt_lcao/module_hcontainer/base_matrix.h
@@ -80,7 +80,7 @@ class BaseMatrix
     */
     size_t get_memory_size() const;
 
-        /**
+    /**
      * @brief get col_size for this matrix
     */
     int get_col_size() const {return ncol_local;};
@@ -88,6 +88,10 @@ class BaseMatrix
      * @brief get row_size for this matrix
     */
     int get_row_size() const {return nrow_local;};
+    /**
+     * @brief set col_size and row_size
+    */
+    void set_size(const int& col_size_in, const int& row_size_in);
 
   private:
     bool allocated = false;
diff --git a/source/module_io/read_input_item_deepks.cpp b/source/module_io/read_input_item_deepks.cpp
index a641812d4c..f2d53c94d7 100644
--- a/source/module_io/read_input_item_deepks.cpp
+++ b/source/module_io/read_input_item_deepks.cpp
@@ -67,9 +67,9 @@ void ReadInput::item_deepks()
         };
         item.check_value = [](const Input_Item& item, const Parameter& para) {
             if (para.input.deepks_out_unittest){
-                if (para.input.cal_force != 1)
+                if (para.input.cal_force != 1 || para.input.cal_stress != 1)
                 {
-                    ModuleBase::WARNING_QUIT("ReadInput", "force is required in generating deepks unittest");
+                    ModuleBase::WARNING_QUIT("ReadInput", "force and stress are required in generating deepks unittest");
                 }
             }
         };
diff --git a/tests/deepks/602_NO_deepks_d_H2O_md_lda2pbe/result.ref b/tests/deepks/602_NO_deepks_d_H2O_md_lda2pbe/result.ref
index c197e2c934..481746130e 100644
--- a/tests/deepks/602_NO_deepks_d_H2O_md_lda2pbe/result.ref
+++ b/tests/deepks/602_NO_deepks_d_H2O_md_lda2pbe/result.ref
@@ -1,9 +1,9 @@
-etotref -465.9986234579140
-etotperatomref -155.3328744860
+etotref -465.9986234576679
+etotperatomref -155.3328744859
 totalforceref 5.535106
-totalstressref 1.520003
+totalstressref 1.522353
 totaldes 2.163703
-deepks_e_dm -57.885718058035934
-deepks_f_label 19.095598453044364
-deepks_s_label 19.250415261605152
-totaltimeref 12.68
+deepks_e_dm -57.8857180593137
+deepks_f_label 19.09559844689178
+deepks_s_label 19.250590727951906
+totaltimeref 12.58
diff --git a/tests/deepks/602_NO_deepks_d_H2O_scf_lda2pbe/result.ref b/tests/deepks/602_NO_deepks_d_H2O_scf_lda2pbe/result.ref
index bfbc3f2cf0..7e71a478e6 100644
--- a/tests/deepks/602_NO_deepks_d_H2O_scf_lda2pbe/result.ref
+++ b/tests/deepks/602_NO_deepks_d_H2O_scf_lda2pbe/result.ref
@@ -1,9 +1,9 @@
-etotref -466.0342709734272262
+etotref -466.0342709734266577
 etotperatomref -155.3447569911
 totalforceref 3.194893
-totalstressref 1.190505
+totalstressref 1.190889
 totaldes 2.319513
-deepks_e_dm -57.521935314706305
-deepks_f_label 19.875352737513325
-deepks_s_label 19.58791026543513
-totaltimeref 5.81
+deepks_e_dm -57.521935314706134
+deepks_f_label 19.87535273614641
+deepks_s_label 19.587939082569576
+totaltimeref 5.75