diff --git a/source/Makefile.Objects b/source/Makefile.Objects
index f209a16513..f1b7ac2a8f 100644
--- a/source/Makefile.Objects
+++ b/source/Makefile.Objects
@@ -193,20 +193,20 @@ OBJS_CELL=atom_pseudo.o\
 OBJS_DEEPKS=LCAO_deepks.o\
         deepks_force.o\
         deepks_orbital.o\
+        deepks_orbpre.o\
+        deepks_vdpre.o\
+        deepks_hmat.o\
         LCAO_deepks_io.o\
-        LCAO_deepks_mpi.o\
         LCAO_deepks_pdm.o\
         LCAO_deepks_phialpha.o\
         LCAO_deepks_torch.o\
         LCAO_deepks_vdelta.o\
-        deepks_hmat.o\
         LCAO_deepks_interface.o\
-        deepks_orbpre.o\
         cal_gdmx.o\
+        cal_gdmepsl.o\
         cal_gedm.o\
         cal_gvx.o\
         cal_descriptor.o\
-        v_delta_precalc.o\
         
 
 OBJS_ELECSTAT=elecstate.o\
diff --git a/source/module_esolver/esolver_ks.h b/source/module_esolver/esolver_ks.h
index 16932bcc6a..00e1352c50 100644
--- a/source/module_esolver/esolver_ks.h
+++ b/source/module_esolver/esolver_ks.h
@@ -55,7 +55,7 @@ class ESolver_KS : public ESolver_FP
     virtual void after_scf(UnitCell& ucell, const int istep) override;
 
     //! <Temporary> It should be replaced by a function in Hamilt Class
-    virtual void update_pot(UnitCell& ucell, const int istep, const int iter) {};
+    virtual void update_pot(UnitCell& ucell, const int istep, const int iter){};
 
     //! Hamiltonian
     hamilt::Hamilt<T, Device>* p_hamilt = nullptr;
@@ -72,7 +72,7 @@ class ESolver_KS : public ESolver_FP
     //! Electronic wavefunctions
     psi::Psi<T>* psi = nullptr;
 
-    //! plane wave or LCAO 
+    //! plane wave or LCAO
     std::string basisname;
 
     //! number of electrons
@@ -83,18 +83,18 @@ class ESolver_KS : public ESolver_FP
 
     //! the start time of scf iteration
 #ifdef __MPI
-    double iter_time;               
+    double iter_time;
 #else
     std::chrono::system_clock::time_point iter_time;
 #endif
 
-    double diag_ethr;               //! the threshold for diagonalization
-    double scf_thr;                 //! scf density threshold
-    double scf_ene_thr;             //! scf energy threshold
-    double drho;                    //! the difference between rho_in (before HSolver) and rho_out (After HSolver)
-    double hsolver_error;           //! the error of HSolver
-    int maxniter;                   //! maximum iter steps for scf
-    int niter;                      //! iter steps actually used in scf
+    double diag_ethr;     //! the threshold for diagonalization
+    double scf_thr;       //! scf density threshold
+    double scf_ene_thr;   //! scf energy threshold
+    double drho;          //! the difference between rho_in (before HSolver) and rho_out (After HSolver)
+    double hsolver_error; //! the error of HSolver
+    int maxniter;         //! maximum iter steps for scf
+    int niter;            //! iter steps actually used in scf
 };
 } // namespace ModuleESolver
 #endif
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.cpp
index bc98636d3a..89f328e12f 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.cpp
@@ -513,18 +513,14 @@ void Force_Stress_LCAO<T>::getForceStress(UnitCell& ucell,
 
                 if (!PARAM.inp.deepks_equiv) // training with force label not supported by equivariant version now
                 {
+                    torch::Tensor gdmx;
                     if (PARAM.globalv.gamma_only_local)
                     {
                         const std::vector<std::vector<double>>& dm_gamma
                             = dynamic_cast<const elecstate::ElecStateLCAO<double>*>(pelec)->get_DM()->get_DMK_vector();
-                        GlobalC::ld.cal_gdmx(dm_gamma,
-                                             ucell,
-                                             orb,
-                                             gd,
-                                             kv.get_nks(),
-                                             kv.kvec_d,
-                                             GlobalC::ld.phialpha,
-                                             isstress);
+
+                        GlobalC::ld
+                            .cal_gdmx(dm_gamma, ucell, orb, gd, kv.get_nks(), kv.kvec_d, GlobalC::ld.phialpha, gdmx);
                     }
                     else
                     {
@@ -533,25 +529,25 @@ void Force_Stress_LCAO<T>::getForceStress(UnitCell& ucell,
                                   ->get_DM()
                                   ->get_DMK_vector();
 
-                        GlobalC::ld
-                            .cal_gdmx(dm_k, ucell, orb, gd, kv.get_nks(), kv.kvec_d, GlobalC::ld.phialpha, isstress);
+                        GlobalC::ld.cal_gdmx(dm_k, ucell, orb, gd, kv.get_nks(), kv.kvec_d, GlobalC::ld.phialpha, gdmx);
                     }
                     if (PARAM.inp.deepks_out_unittest)
                     {
-                        GlobalC::ld.check_gdmx(ucell.nat);
+                        GlobalC::ld.check_gdmx(ucell.nat, gdmx);
                     }
                     std::vector<torch::Tensor> gevdm;
                     GlobalC::ld.cal_gevdm(ucell.nat, gevdm);
-                    GlobalC::ld.cal_gvx(ucell.nat, gevdm);
+                    torch::Tensor gvx;
+                    GlobalC::ld.cal_gvx(ucell.nat, gevdm, gdmx, gvx);
 
                     if (PARAM.inp.deepks_out_unittest)
                     {
-                        GlobalC::ld.check_gvx(ucell.nat);
+                        GlobalC::ld.check_gvx(ucell.nat, gvx);
                     }
 
                     LCAO_deepks_io::save_npy_gvx(ucell.nat,
                                                  GlobalC::ld.des_per_atom,
-                                                 GlobalC::ld.gvx_tensor,
+                                                 gvx,
                                                  PARAM.globalv.global_out_dir,
                                                  GlobalV::MY_RANK);
                 }
@@ -715,6 +711,12 @@ void Force_Stress_LCAO<T>::getForceStress(UnitCell& ucell,
                 {
                     scs(i, j) += stress_exx(i, j);
                 }
+#endif
+#ifdef __DEEPKS
+                if (PARAM.inp.deepks_scf)
+                {
+                    scs(i, j) += svnl_dalpha(i, j);
+                }
 #endif
             }
         }
@@ -726,47 +728,61 @@ void Force_Stress_LCAO<T>::getForceStress(UnitCell& ucell,
 #ifdef __DEEPKS
         if (PARAM.inp.deepks_out_labels) // not parallelized yet
         {
-            const std::string file_s = PARAM.globalv.global_out_dir + "deepks_sbase.npy";
+            const std::string file_stot = PARAM.globalv.global_out_dir + "deepks_stot.npy";
             LCAO_deepks_io::save_npy_s(scs,
-                                       file_s,
-                                       ucell.omega,
-                                       GlobalV::MY_RANK); // change to energy unit Ry when printing, S_base;
-        }
-        if (PARAM.inp.deepks_scf)
-        {
-            if (ModuleSymmetry::Symmetry::symm_flag == 1)
-            {
-                symm->symmetrize_mat3(svnl_dalpha, ucell.lat);
-            } // end symmetry
-            for (int i = 0; i < 3; i++)
-            {
-                for (int j = 0; j < 3; j++)
-                {
-                    scs(i, j) += svnl_dalpha(i, j);
-                }
-            }
-        }
-        if (PARAM.inp.deepks_out_labels) // not parallelized yet
-        {
-            const std::string file_s = PARAM.globalv.global_out_dir + "deepks_stot.npy";
-            LCAO_deepks_io::save_npy_s(scs,
-                                       file_s,
+                                       file_stot,
                                        ucell.omega,
                                        GlobalV::MY_RANK); // change to energy unit Ry when printing, S_tot, w/ model
 
             // wenfei add 2021/11/2
             if (PARAM.inp.deepks_scf)
             {
+                const std::string file_sbase = PARAM.globalv.global_out_dir + "deepks_sbase.npy";
+                LCAO_deepks_io::save_npy_s(scs - svnl_dalpha,
+                                           file_sbase,
+                                           ucell.omega,
+                                           GlobalV::MY_RANK); // change to energy unit Ry when printing, S_base;
 
                 if (!PARAM.inp.deepks_equiv) // training with stress label not supported by equivariant version now
                 {
+                    torch::Tensor gdmepsl;
+                    if (PARAM.globalv.gamma_only_local)
+                    {
+                        const std::vector<std::vector<double>>& dm_gamma
+                            = dynamic_cast<const elecstate::ElecStateLCAO<double>*>(pelec)->get_DM()->get_DMK_vector();
+
+                        GlobalC::ld.cal_gdmepsl(dm_gamma,
+                                                ucell,
+                                                orb,
+                                                gd,
+                                                kv.get_nks(),
+                                                kv.kvec_d,
+                                                GlobalC::ld.phialpha,
+                                                gdmepsl);
+                    }
+                    else
+                    {
+                        const std::vector<std::vector<std::complex<double>>>& dm_k
+                            = dynamic_cast<const elecstate::ElecStateLCAO<std::complex<double>>*>(pelec)
+                                  ->get_DM()
+                                  ->get_DMK_vector();
+
+                        GlobalC::ld
+                            .cal_gdmepsl(dm_k, ucell, orb, gd, kv.get_nks(), kv.kvec_d, GlobalC::ld.phialpha, gdmepsl);
+                    }
+                    if (PARAM.inp.deepks_out_unittest)
+                    {
+                        GlobalC::ld.check_gdmepsl(gdmepsl);
+                    }
+
                     std::vector<torch::Tensor> gevdm;
                     GlobalC::ld.cal_gevdm(ucell.nat, gevdm);
-                    GlobalC::ld.cal_gvepsl(ucell.nat, gevdm);
+                    torch::Tensor gvepsl;
+                    GlobalC::ld.cal_gvepsl(ucell.nat, gevdm, gdmepsl, gvepsl);
 
                     LCAO_deepks_io::save_npy_gvepsl(ucell.nat,
                                                     GlobalC::ld.des_per_atom,
-                                                    GlobalC::ld.gvepsl_tensor,
+                                                    gvepsl,
                                                     PARAM.globalv.global_out_dir,
                                                     GlobalV::MY_RANK); //  unitless, grad_vepsl
                 }
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_gamma.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_gamma.cpp
index 83f7987dd7..d70e147a98 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_gamma.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_gamma.cpp
@@ -248,13 +248,14 @@ void Force_LCAO<double>::ftable(const bool isforce,
 
 #ifdef __DEEPKS
     const std::vector<std::vector<double>>& dm_gamma = dm->get_DMK_vector();
+    std::vector<torch::Tensor> descriptor;
     if (PARAM.inp.deepks_scf)
     {
         // when deepks_scf is on, the init pdm should be same as the out pdm, so we should not recalculate the pdm
         // GlobalC::ld.cal_projected_DM(dm, ucell, orb, gd);
 
-        GlobalC::ld.cal_descriptor(ucell.nat);
-        GlobalC::ld.cal_gedm(ucell.nat);
+        GlobalC::ld.cal_descriptor(ucell.nat, descriptor);
+        GlobalC::ld.cal_gedm(ucell.nat, descriptor);
 
         const int nks = 1;
         DeePKS_domain::cal_f_delta<double>(dm_gamma,
@@ -305,7 +306,7 @@ void Force_LCAO<double>::ftable(const bool isforce,
 
         GlobalC::ld.check_projected_dm();
 
-        GlobalC::ld.check_descriptor(ucell, PARAM.globalv.global_out_dir);
+        GlobalC::ld.check_descriptor(ucell, PARAM.globalv.global_out_dir, descriptor);
 
         GlobalC::ld.check_gedm();
 
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_k.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_k.cpp
index 7739a02338..ee1fde343c 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_k.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_k.cpp
@@ -349,9 +349,9 @@ void Force_LCAO<std::complex<double>>::ftable(const bool isforce,
         // when deepks_scf is on, the init pdm should be same as the out pdm, so we should not recalculate the pdm
         // GlobalC::ld.cal_projected_DM(dm, ucell, orb, gd);
 
-        GlobalC::ld.cal_descriptor(ucell.nat);
-
-        GlobalC::ld.cal_gedm(ucell.nat);
+        std::vector<torch::Tensor> descriptor;
+        GlobalC::ld.cal_descriptor(ucell.nat, descriptor);
+        GlobalC::ld.cal_gedm(ucell.nat, descriptor);
 
         DeePKS_domain::cal_f_delta<std::complex<double>>(dm_k,
                                                          ucell,
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/deepks_lcao.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/deepks_lcao.cpp
index 95559724ba..7dbb8d09ca 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/deepks_lcao.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/deepks_lcao.cpp
@@ -58,7 +58,7 @@ void hamilt::DeePKS<hamilt::OperatorLCAO<TK, TR>>::initialize_HR(const Grid_Driv
     this->H_V_delta = new HContainer<TR>(paraV);
     if (std::is_same<TK, double>::value)
     {
-        //this->H_V_delta = new HContainer<TR>(paraV);
+        // this->H_V_delta = new HContainer<TR>(paraV);
         this->H_V_delta->fix_gamma();
     }
 
@@ -138,8 +138,8 @@ void hamilt::DeePKS<hamilt::OperatorLCAO<TK, TR>>::initialize_HR(const Grid_Driv
     // if (std::is_same<TK, double>::value)
     // {
     this->H_V_delta->allocate(nullptr, true);
-        // expand hR with H_V_delta
-        // update : for computational rigor, gamma-only and multi-k cases both have full size of Hamiltonian of DeePKS now
+    // expand hR with H_V_delta
+    // update : for computational rigor, gamma-only and multi-k cases both have full size of Hamiltonian of DeePKS now
     this->hR->add(*this->H_V_delta);
     this->hR->allocate(nullptr, false);
     // }
@@ -161,8 +161,10 @@ void hamilt::DeePKS<hamilt::OperatorLCAO<TK, TR>>::contributeHR()
         ModuleBase::timer::tick("DeePKS", "contributeHR");
 
         GlobalC::ld.cal_projected_DM<TK>(this->DM, *this->ucell, *ptr_orb_, *(this->gd));
-        GlobalC::ld.cal_descriptor(this->ucell->nat);
-        GlobalC::ld.cal_gedm(this->ucell->nat);
+
+        std::vector<torch::Tensor> descriptor;
+        GlobalC::ld.cal_descriptor(this->ucell->nat, descriptor);
+        GlobalC::ld.cal_gedm(this->ucell->nat, descriptor);
 
         // // recalculate the H_V_delta
         // if (this->H_V_delta == nullptr)
diff --git a/source/module_hamilt_lcao/module_deepks/CMakeLists.txt b/source/module_hamilt_lcao/module_deepks/CMakeLists.txt
index bc6c2356e3..c3f71f7dd3 100644
--- a/source/module_hamilt_lcao/module_deepks/CMakeLists.txt
+++ b/source/module_hamilt_lcao/module_deepks/CMakeLists.txt
@@ -3,20 +3,21 @@ if(ENABLE_DEEPKS)
       LCAO_deepks.cpp
       deepks_force.cpp
       deepks_orbital.cpp
+      deepks_orbpre.cpp
+      deepks_vdpre.cpp
+      deepks_hmat.cpp
       LCAO_deepks_io.cpp
-      LCAO_deepks_mpi.cpp
       LCAO_deepks_pdm.cpp
       LCAO_deepks_phialpha.cpp
       LCAO_deepks_torch.cpp
       LCAO_deepks_vdelta.cpp
-      deepks_hmat.cpp
       LCAO_deepks_interface.cpp
-      deepks_orbpre.cpp
       cal_gdmx.cpp
+      cal_gdmepsl.cpp
       cal_gedm.cpp
       cal_gvx.cpp      
       cal_descriptor.cpp
-      v_delta_precalc.cpp
+      
   )
 
   add_library(
diff --git a/source/module_hamilt_lcao/module_deepks/LCAO_deepks.cpp b/source/module_hamilt_lcao/module_deepks/LCAO_deepks.cpp
index bffeb97efd..cbfb5eb4cb 100644
--- a/source/module_hamilt_lcao/module_deepks/LCAO_deepks.cpp
+++ b/source/module_hamilt_lcao/module_deepks/LCAO_deepks.cpp
@@ -7,15 +7,8 @@
 // 1. subroutines that are related to calculating descriptors:
 //   - init : allocates some arrays
 //   - init_index : records the index (inl)
-// 2. subroutines that are related to calculating force label:
-//   - init_gdmx : allocates gdmx; it is a private subroutine
-//   - del_gdmx : releases gdmx
-// 3. subroutines that are related to calculating force label:
-//   - init_gdmepsl : allocates gdm_epsl; it is a private subroutine
-//   - del_gdmepsl : releases gdm_epsl
-// 4. subroutines that are related to V_delta:
-//   - allocate_V_delta : allocates H_V_delta; if calculating force, it also calls
-//       init_gdmx, as well as allocating F_delta
+// 2. subroutines that are related to V_delta:
+//   - allocate_V_delta : allocates H_V_delta; if calculating force, it also allocates F_delta
 
 #ifdef __DEEPKS
 
@@ -57,8 +50,6 @@ LCAO_Deepks::~LCAO_Deepks()
         }
         delete[] gedm;
     }
-
-    del_gdmx();
 }
 
 void LCAO_Deepks::init(const LCAO_Orbitals& orb,
@@ -120,7 +111,6 @@ void LCAO_Deepks::init(const LCAO_Orbitals& orb,
             int nm = 2 * inl_l[inl] + 1;
             pdm_size += nm * nm;
             this->pdm[inl] = torch::zeros({nm, nm}, torch::kFloat64);
-            // this->pdm[inl].requires_grad_(true);
         }
     }
     else
@@ -196,101 +186,6 @@ void LCAO_Deepks::init_index(const int ntype,
     return;
 }
 
-void LCAO_Deepks::init_gdmx(const int nat)
-{
-    this->gdmx = new double**[nat];
-    this->gdmy = new double**[nat];
-    this->gdmz = new double**[nat];
-    int pdm_size = 0;
-    if (!PARAM.inp.deepks_equiv)
-    {
-        pdm_size = (this->lmaxd * 2 + 1) * (this->lmaxd * 2 + 1);
-    }
-    else
-    {
-        pdm_size = this->des_per_atom;
-    }
-
-    for (int iat = 0; iat < nat; iat++)
-    {
-        this->gdmx[iat] = new double*[inlmax];
-        this->gdmy[iat] = new double*[inlmax];
-        this->gdmz[iat] = new double*[inlmax];
-        for (int inl = 0; inl < inlmax; inl++)
-        {
-            this->gdmx[iat][inl] = new double[pdm_size];
-            this->gdmy[iat][inl] = new double[pdm_size];
-            this->gdmz[iat][inl] = new double[pdm_size];
-            ModuleBase::GlobalFunc::ZEROS(gdmx[iat][inl], pdm_size);
-            ModuleBase::GlobalFunc::ZEROS(gdmy[iat][inl], pdm_size);
-            ModuleBase::GlobalFunc::ZEROS(gdmz[iat][inl], pdm_size);
-        }
-    }
-    this->nat_gdm = nat;
-    return;
-}
-
-// void LCAO_Deepks::del_gdmx(const int nat)
-void LCAO_Deepks::del_gdmx()
-{
-    for (int iat = 0; iat < nat_gdm; iat++)
-    {
-        for (int inl = 0; inl < inlmax; inl++)
-        {
-            delete[] this->gdmx[iat][inl];
-            delete[] this->gdmy[iat][inl];
-            delete[] this->gdmz[iat][inl];
-        }
-        delete[] this->gdmx[iat];
-        delete[] this->gdmy[iat];
-        delete[] this->gdmz[iat];
-    }
-    delete[] this->gdmx;
-    delete[] this->gdmy;
-    delete[] this->gdmz;
-    return;
-}
-
-void LCAO_Deepks::init_gdmepsl()
-{
-    this->gdm_epsl = new double**[6];
-
-    int pdm_size = 0;
-    if (!PARAM.inp.deepks_equiv)
-    {
-        pdm_size = (this->lmaxd * 2 + 1) * (this->lmaxd * 2 + 1);
-    }
-    else
-    {
-        pdm_size = this->des_per_atom;
-    }
-
-    for (int ipol = 0; ipol < 6; ipol++)
-    {
-        this->gdm_epsl[ipol] = new double*[inlmax];
-        for (int inl = 0; inl < inlmax; inl++)
-        {
-            this->gdm_epsl[ipol][inl] = new double[pdm_size];
-            ModuleBase::GlobalFunc::ZEROS(gdm_epsl[ipol][inl], pdm_size);
-        }
-    }
-    return;
-}
-
-void LCAO_Deepks::del_gdmepsl()
-{
-    for (int ipol = 0; ipol < 6; ipol++)
-    {
-        for (int inl = 0; inl < inlmax; inl++)
-        {
-            delete[] this->gdm_epsl[ipol][inl];
-        }
-        delete[] this->gdm_epsl[ipol];
-    }
-    delete[] this->gdm_epsl;
-    return;
-}
-
 void LCAO_Deepks::allocate_V_delta(const int nat, const int nks)
 {
     ModuleBase::TITLE("LCAO_Deepks", "allocate_V_delta");
@@ -330,116 +225,6 @@ void LCAO_Deepks::allocate_V_delta(const int nat, const int nks)
         this->gedm[inl] = new double[pdm_size];
         ModuleBase::GlobalFunc::ZEROS(this->gedm[inl], pdm_size);
     }
-    if (PARAM.inp.cal_force)
-    {
-        if (PARAM.inp.deepks_out_labels)
-        {
-            this->init_gdmx(nat);
-            this->init_gdmepsl();
-        }
-        // gdmx is used only in calculating gvx
-    }
-
-    return;
-}
-
-void LCAO_Deepks::init_v_delta_pdm_shell(const int nks, const int nlocal)
-{
-    const int mn_size = (2 * this->lmaxd + 1) * (2 * this->lmaxd + 1);
-    if (nks == 1)
-    {
-        this->v_delta_pdm_shell = new double****[nks];
-        for (int iks = 0; iks < nks; iks++)
-        {
-            this->v_delta_pdm_shell[iks] = new double***[nlocal];
-
-            for (int mu = 0; mu < nlocal; mu++)
-            {
-                this->v_delta_pdm_shell[iks][mu] = new double**[nlocal];
-
-                for (int nu = 0; nu < nlocal; nu++)
-                {
-                    this->v_delta_pdm_shell[iks][mu][nu] = new double*[this->inlmax];
-
-                    for (int inl = 0; inl < this->inlmax; inl++)
-                    {
-                        this->v_delta_pdm_shell[iks][mu][nu][inl] = new double[mn_size];
-                        ModuleBase::GlobalFunc::ZEROS(v_delta_pdm_shell[iks][mu][nu][inl], mn_size);
-                    }
-                }
-            }
-        }
-    }
-    else
-    {
-        this->v_delta_pdm_shell_complex = new std::complex<double>****[nks];
-        for (int iks = 0; iks < nks; iks++)
-        {
-            this->v_delta_pdm_shell_complex[iks] = new std::complex<double>***[nlocal];
-
-            for (int mu = 0; mu < nlocal; mu++)
-            {
-                this->v_delta_pdm_shell_complex[iks][mu] = new std::complex<double>**[nlocal];
-
-                for (int nu = 0; nu < nlocal; nu++)
-                {
-                    this->v_delta_pdm_shell_complex[iks][mu][nu] = new std::complex<double>*[this->inlmax];
-
-                    for (int inl = 0; inl < this->inlmax; inl++)
-                    {
-                        this->v_delta_pdm_shell_complex[iks][mu][nu][inl] = new std::complex<double>[mn_size];
-                        ModuleBase::GlobalFunc::ZEROS(v_delta_pdm_shell_complex[iks][mu][nu][inl], mn_size);
-                    }
-                }
-            }
-        }
-    }
-
-    return;
-}
-
-void LCAO_Deepks::del_v_delta_pdm_shell(const int nks, const int nlocal)
-{
-    if (nks == 1)
-    {
-        for (int iks = 0; iks < nks; iks++)
-        {
-            for (int mu = 0; mu < nlocal; mu++)
-            {
-                for (int nu = 0; nu < nlocal; nu++)
-                {
-                    for (int inl = 0; inl < this->inlmax; inl++)
-                    {
-                        delete[] this->v_delta_pdm_shell[iks][mu][nu][inl];
-                    }
-                    delete[] this->v_delta_pdm_shell[iks][mu][nu];
-                }
-                delete[] this->v_delta_pdm_shell[iks][mu];
-            }
-            delete[] this->v_delta_pdm_shell[iks];
-        }
-        delete[] this->v_delta_pdm_shell;
-    }
-    else
-    {
-        for (int iks = 0; iks < nks; iks++)
-        {
-            for (int mu = 0; mu < nlocal; mu++)
-            {
-                for (int nu = 0; nu < nlocal; nu++)
-                {
-                    for (int inl = 0; inl < this->inlmax; inl++)
-                    {
-                        delete[] this->v_delta_pdm_shell_complex[iks][mu][nu][inl];
-                    }
-                    delete[] this->v_delta_pdm_shell_complex[iks][mu][nu];
-                }
-                delete[] this->v_delta_pdm_shell_complex[iks][mu];
-            }
-            delete[] this->v_delta_pdm_shell_complex[iks];
-        }
-        delete[] this->v_delta_pdm_shell_complex;
-    }
 
     return;
 }
diff --git a/source/module_hamilt_lcao/module_deepks/LCAO_deepks.h b/source/module_hamilt_lcao/module_deepks/LCAO_deepks.h
index 9238ed73fd..6755b62df0 100644
--- a/source/module_hamilt_lcao/module_deepks/LCAO_deepks.h
+++ b/source/module_hamilt_lcao/module_deepks/LCAO_deepks.h
@@ -7,6 +7,7 @@
 #include "deepks_hmat.h"
 #include "deepks_orbital.h"
 #include "deepks_orbpre.h"
+#include "deepks_vdpre.h"
 #include "module_base/complexmatrix.h"
 #include "module_base/intarray.h"
 #include "module_base/matrix.h"
@@ -108,44 +109,12 @@ class LCAO_Deepks
     // [nat][nlm*nlm] for equivariant version
     std::vector<torch::Tensor> pdm;
 
-    // descriptors
-    std::vector<torch::Tensor> d_tensor;
-
     // gedm:dE/dD, [tot_Inl][2l+1][2l+1]	(E: Hartree)
     std::vector<torch::Tensor> gedm_tensor;
 
-    // gdmx: dD/dX		\sum_{mu,nu} 2*c_mu*c_nu * <dphi_mu/dx|alpha_m><alpha_m'|phi_nu>
-    double*** gdmx; //[natom][tot_Inl][2l+1][2l+1]
-    double*** gdmy;
-    double*** gdmz;
-
-    // gdm_epsl: dD/d\epsilon_{\alpha\beta}
-    double*** gdm_epsl; //[6][tot_Inl][2l+1][2l+1]
-
-    // dD/d\epsilon_{\alpha\beta}, tensor form of gdm_epsl
-    std::vector<torch::Tensor> gdmepsl_vector;
-
-    // gv_epsl:d(d)/d\epsilon_{\alpha\beta}, [natom][6][des_per_atom]
-    torch::Tensor gvepsl_tensor;
-
     /// dE/dD, autograd from loaded model(E: Ry)
     double** gedm; //[tot_Inl][2l+1][2l+1]
 
-    // gvx:d(d)/dX, [natom][3][natom][des_per_atom]
-    torch::Tensor gvx_tensor;
-
-    // dD/dX, tensor form of gdmx
-    std::vector<torch::Tensor> gdmr_vector;
-
-    // v_delta_pdm_shell[nks,nlocal,nlocal,Inl,nm*nm] = overlap * overlap
-    double***** v_delta_pdm_shell;
-    std::complex<double>***** v_delta_pdm_shell_complex; // for multi-k
-    // v_delta_precalc[nks,nlocal,nlocal,NAt,NDscrpt] = gvdm * v_delta_pdm_shell;
-    torch::Tensor v_delta_precalc_tensor;
-    // for v_delta==2 , new v_delta_precalc storage method
-    torch::Tensor phialpha_tensor;
-    torch::Tensor gevdm_tensor;
-
     /// size of descriptor(projector) basis set
     int n_descriptor;
 
@@ -176,12 +145,8 @@ class LCAO_Deepks
     // 1. subroutines that are related to calculating descriptors:
     //   - init : allocates some arrays
     //   - init_index : records the index (inl)
-    // 2. subroutines that are related to calculating force label:
-    //   - init_gdmx : allocates gdmx; it is a private subroutine
-    //   - del_gdmx : releases gdmx
-    // 3. subroutines that are related to V_delta:
-    //   - allocate_V_delta : allocates H_V_delta; if calculating force, it also calls
-    //       init_gdmx, as well as allocating F_delta
+    // 2. subroutines that are related to V_delta:
+    //   - allocate_V_delta : allocates H_V_delta; if calculating force, it also allocates F_delta
 
   public:
     explicit LCAO_Deepks();
@@ -199,23 +164,10 @@ class LCAO_Deepks
     /// Allocate memory for correction to Hamiltonian
     void allocate_V_delta(const int nat, const int nks = 1);
 
-    // array for storing gdmx, used for calculating gvx
-    void init_gdmx(const int nat);
-    // void del_gdmx(const int nat);
-    void del_gdmx();
-
-    // array for storing gdm_epsl, used for calculating gvx
-    void init_gdmepsl();
-    void del_gdmepsl();
-
   private:
     // arrange index of descriptor in all atoms
     void init_index(const int ntype, const int nat, std::vector<int> na, const int tot_inl, const LCAO_Orbitals& orb);
 
-    // for v_delta label calculation; xinyuan added on 2023-2-22
-    void init_v_delta_pdm_shell(const int nks, const int nlocal);
-    void del_v_delta_pdm_shell(const int nks, const int nlocal);
-
     //-------------------
     // LCAO_deepks_phialpha.cpp
     //-------------------
@@ -263,7 +215,7 @@ class LCAO_Deepks
     // 1. cal_projected_DM, which is used for calculating pdm
     // 2. check_projected_dm, which prints pdm to descriptor.dat
 
-    // 3. cal_gdmx, calculating gdmx (and optionally gdm_epsl for stress)
+    // 3. cal_gdmx, calculating gdmx (and optionally gdmepsl for stress)
     // 4. check_gdmx, which prints gdmx to a series of .dat files
 
   public:
@@ -295,9 +247,22 @@ class LCAO_Deepks
         const int nks,
         const std::vector<ModuleBase::Vector3<double>>& kvec_d,
         std::vector<hamilt::HContainer<double>*> phialpha,
-        const bool isstress);
+        torch::Tensor& gdmx);
 
-    void check_gdmx(const int nat);
+    void check_gdmx(const int nat, const torch::Tensor& gdmx);
+
+    template <typename TK>
+    void cal_gdmepsl( // const ModuleBase::matrix& dm,
+        const std::vector<std::vector<TK>>& dm,
+        const UnitCell& ucell,
+        const LCAO_Orbitals& orb,
+        const Grid_Driver& GridD,
+        const int nks,
+        const std::vector<ModuleBase::Vector3<double>>& kvec_d,
+        std::vector<hamilt::HContainer<double>*> phialpha,
+        torch::Tensor& gdmepsl);
+
+    void check_gdmepsl(const torch::Tensor& gdmepsl);
 
     /**
      * @brief set init_pdm to skip the calculation of pdm in SCF iteration
@@ -353,7 +318,7 @@ class LCAO_Deepks
     // 4. check_gvx : prints gvx into gvx.dat for checking
     // 5. cal_gvepsl : gvepsl is used for training with stress label, which is derivative of
     //       descriptors wrt strain tensor, calculated by
-    //       d(des)/d\epsilon_{ab} = d(pdm)/d\epsilon_{ab} * d(des)/d(pdm) = gdm_epsl * gvdm
+    //       d(des)/d\epsilon_{ab} = d(pdm)/d\epsilon_{ab} * d(des)/d(pdm) = gdmepsl * gvdm
     //       using einsum
     // 6. cal_gevdm : d(des)/d(pdm)
     //       calculated using torch::autograd::grad
@@ -362,24 +327,17 @@ class LCAO_Deepks
     //       this is the term V(D) that enters the expression H_V_delta = |alpha>V(D)<alpha|
     //       caculated using torch::autograd::grad
     // 9. check_gedm : prints gedm for checking
-    // 10. cal_orbital_precalc : orbital_precalc is usted for training with orbital label,
-    //                          which equals gvdm * orbital_pdm,
-    //                          orbital_pdm[nks,Inl,nm,nm] = dm_hl * overlap * overlap
-    // 11. cal_v_delta_precalc : v_delta_precalc is used for training with v_delta label,
-    //                         which equals gvdm * v_delta_pdm_shell,
-    //                         v_delta_pdm_shell = overlap * overlap
-    // 12. check_v_delta_precalc : check v_delta_precalc
-    // 13. prepare_phialpha : prepare phialpha for outputting npy file
-    // 14. prepare_gevdm : prepare gevdm for outputting npy file
 
   public:
     /// Calculates descriptors
     /// which are eigenvalues of pdm in blocks of I_n_l
-    void cal_descriptor(const int nat);
+    void cal_descriptor(const int nat, std::vector<torch::Tensor>& descriptor);
     /// print descriptors based on LCAO basis
-    void check_descriptor(const UnitCell& ucell, const std::string& out_dir);
+    void check_descriptor(const UnitCell& ucell,
+                          const std::string& out_dir,
+                          const std::vector<torch::Tensor>& descriptor);
 
-    void cal_descriptor_equiv(const int nat);
+    void cal_descriptor_equiv(const int nat, std::vector<torch::Tensor>& descriptor);
 
     /// calculates gradient of descriptors w.r.t atomic positions
     ///----------------------------------------------------
@@ -390,80 +348,28 @@ class LCAO_Deepks
     ///  - b: the atoms whose force being calculated)
     /// gvdm*gdmx->gvx
     ///----------------------------------------------------
-    void cal_gvx(const int nat, const std::vector<torch::Tensor>& gevdm);
-    void check_gvx(const int nat);
+    void cal_gvx(const int nat, const std::vector<torch::Tensor>& gevdm, const torch::Tensor& gdmx, torch::Tensor& gvx);
+    void check_gvx(const int nat, const torch::Tensor& gvx);
 
     // for stress
-    void cal_gvepsl(const int nat, const std::vector<torch::Tensor>& gevdm);
+    void cal_gvepsl(const int nat,
+                    const std::vector<torch::Tensor>& gevdm,
+                    const torch::Tensor& gdmepsl,
+                    torch::Tensor& gvepsl);
 
     // load the trained neural network model
     void load_model(const std::string& model_file);
 
     /// calculate partial of energy correction to descriptors
-    void cal_gedm(const int nat);
+    void cal_gedm(const int nat, const std::vector<torch::Tensor>& descriptor);
     void check_gedm();
-    void cal_gedm_equiv(const int nat);
-
-    // calculates orbital_precalc
-    // template <typename TK, typename TH>
-    // void cal_orbital_precalc(const std::vector<TH>& dm_hl,
-    //                          const int lmaxd,
-    //                          const int inlmax,
-    //                          const int nat,
-    //                          const int nks,
-    //                          const int* inl_l,
-    //                          const std::vector<ModuleBase::Vector3<double>>& kvec_d,
-    //                          const std::vector<hamilt::HContainer<double>*> phialpha,
-    //                          const std::vector<torch::Tensor> gevdm,
-    //                          const ModuleBase::IntArray* inl_index,
-    //                          const UnitCell& ucell,
-    //                          const LCAO_Orbitals& orb,
-    //                          const Parallel_Orbitals& pv,
-    //                          const Grid_Driver& GridD,
-    //                          torch::Tensor& orbital_precalc);
-
-    // calculates v_delta_precalc
-    template <typename TK>
-    void cal_v_delta_precalc(const int nlocal,
-                             const int nat,
-                             const int nks,
-                             const std::vector<ModuleBase::Vector3<double>>& kvec_d,
-                             const UnitCell& ucell,
-                             const LCAO_Orbitals& orb,
-                             const Grid_Driver& GridD);
+    void cal_gedm_equiv(const int nat, const std::vector<torch::Tensor>& descriptor);
 
-    template <typename TK>
-    void check_v_delta_precalc(const int nat, const int nks, const int nlocal);
-
-    // prepare phialpha for outputting npy file
-    template <typename TK>
-    void prepare_phialpha(const int nlocal,
-                          const int nat,
-                          const int nks,
-                          const std::vector<ModuleBase::Vector3<double>>& kvec_d,
-                          const UnitCell& ucell,
-                          const LCAO_Orbitals& orb,
-                          const Grid_Driver& GridD);
-
-    template <typename TK>
-    void check_vdp_phialpha(const int nat, const int nks, const int nlocal);
-
-    // prepare gevdm for outputting npy file
-    void prepare_gevdm(const int nat, const LCAO_Orbitals& orb);
+    // calculate gevdm
     void cal_gevdm(const int nat, std::vector<torch::Tensor>& gevdm);
-    void check_vdp_gevdm(const int nat);
 
   private:
     const Parallel_Orbitals* pv;
-
-#ifdef __MPI
-
-  public:
-    // reduces a dim 2 array
-    void allsum_deepks(int inlmax,    // first dimension
-                       int ndim,      // second dimension
-                       double** mat); // the array being reduced
-#endif
 };
 
 namespace GlobalC
diff --git a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_interface.cpp b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_interface.cpp
index 8f2c4d8779..d0c5862fb9 100644
--- a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_interface.cpp
+++ b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_interface.cpp
@@ -69,7 +69,7 @@ void LCAO_Deepks_Interface<TK, TR>::out_deepks_labels(const double& etot,
 
         if (PARAM.inp.deepks_bandgap)
         {
-            const int nocc = (PARAM.inp.nelec+1) / 2;
+            const int nocc = (PARAM.inp.nelec + 1) / 2;
             std::vector<double> o_tot(nks);
             for (int iks = 0; iks < nks; ++iks)
             {
@@ -209,35 +209,68 @@ void LCAO_Deepks_Interface<TK, TR>::out_deepks_labels(const double& etot,
 
                 if (PARAM.inp.deepks_v_delta == 1) // v_delta_precalc storage method 1
                 {
-                    ld->cal_v_delta_precalc<TK>(nlocal, nat, nks, kvec_d, ucell, orb, GridD);
+                    std::vector<torch::Tensor> gevdm;
+                    ld->cal_gevdm(nat, gevdm);
+
+                    torch::Tensor v_delta_precalc;
+                    DeePKS_domain::cal_v_delta_precalc<TK>(nlocal,
+                                                           ld->lmaxd,
+                                                           ld->inlmax,
+                                                           nat,
+                                                           nks,
+                                                           ld->inl_l,
+                                                           kvec_d,
+                                                           ld->phialpha,
+                                                           gevdm,
+                                                           ld->inl_index,
+                                                           ucell,
+                                                           orb,
+                                                           *ParaV,
+                                                           GridD,
+                                                           v_delta_precalc);
 
                     LCAO_deepks_io::save_npy_v_delta_precalc<TK>(nat,
                                                                  nks,
                                                                  nlocal,
                                                                  ld->des_per_atom,
-                                                                 ld->v_delta_precalc_tensor,
+                                                                 v_delta_precalc,
                                                                  PARAM.globalv.global_out_dir,
                                                                  my_rank);
                 }
                 else if (PARAM.inp.deepks_v_delta == 2) // v_delta_precalc storage method 2
                 {
-                    ld->prepare_phialpha<TK>(nlocal, nat, nks, kvec_d, ucell, orb, GridD);
+                    torch::Tensor phialpha_out;
+                    DeePKS_domain::prepare_phialpha<TK>(nlocal,
+                                                        ld->lmaxd,
+                                                        ld->inlmax,
+                                                        nat,
+                                                        nks,
+                                                        kvec_d,
+                                                        ld->phialpha,
+                                                        ucell,
+                                                        orb,
+                                                        *ParaV,
+                                                        GridD,
+                                                        phialpha_out);
 
                     LCAO_deepks_io::save_npy_phialpha<TK>(nat,
                                                           nks,
                                                           nlocal,
                                                           ld->inlmax,
                                                           ld->lmaxd,
-                                                          ld->phialpha_tensor,
+                                                          phialpha_out,
                                                           PARAM.globalv.global_out_dir,
                                                           my_rank);
+                    std::vector<torch::Tensor> gevdm;
+                    ld->cal_gevdm(nat, gevdm);
 
-                    ld->prepare_gevdm(nat, orb);
+                    torch::Tensor gevdm_out;
+                    DeePKS_domain::prepare_gevdm(nat, ld->lmaxd, ld->inlmax, orb, gevdm, gevdm_out);
 
                     LCAO_deepks_io::save_npy_gevdm(nat,
                                                    ld->inlmax,
                                                    ld->lmaxd,
-                                                   ld->gevdm_tensor,
+                                                   gevdm_out,
                                                    PARAM.globalv.global_out_dir,
                                                    my_rank);
                 }
@@ -264,9 +297,9 @@ void LCAO_Deepks_Interface<TK, TR>::out_deepks_labels(const double& etot,
 
         ld->check_projected_dm(); // print out the projected dm for NSCF calculaiton
 
-        ld->cal_descriptor(nat); // final descriptor
-
-        ld->check_descriptor(ucell, PARAM.globalv.global_out_dir);
+        std::vector<torch::Tensor> descriptor;
+        ld->cal_descriptor(nat, descriptor); // final descriptor
+        ld->check_descriptor(ucell, PARAM.globalv.global_out_dir, descriptor);
 
         if (PARAM.inp.deepks_out_labels)
         {
@@ -275,7 +308,7 @@ void LCAO_Deepks_Interface<TK, TR>::out_deepks_labels(const double& etot,
                                        ld->inlmax,
                                        ld->inl_l,
                                        PARAM.inp.deepks_equiv,
-                                       ld->d_tensor,
+                                       descriptor,
                                        PARAM.globalv.global_out_dir,
                                        GlobalV::MY_RANK); // libnpy needed
         }
diff --git a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_io.cpp b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_io.cpp
index fdf3ba7e38..c0b1b3cd93 100644
--- a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_io.cpp
+++ b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_io.cpp
@@ -100,7 +100,7 @@ void LCAO_deepks_io::save_npy_d(const int nat,
                                 const int inlmax,
                                 const int* inl_l,
                                 const bool deepks_equiv,
-                                const std::vector<torch::Tensor>& d_tensor,
+                                const std::vector<torch::Tensor>& descriptor,
                                 const std::string& out_dir,
                                 const int rank)
 {
@@ -118,10 +118,11 @@ void LCAO_deepks_io::save_npy_d(const int nat,
         std::vector<double> npy_des;
         for (int inl = 0; inl < inlmax; ++inl)
         {
+            auto accessor = descriptor[inl].accessor<double, 1>();
             int nm = 2 * inl_l[inl] + 1;
             for (int im = 0; im < nm; im++)
             {
-                npy_des.push_back(d_tensor[inl].index({im}).item().toDouble());
+                npy_des.push_back(accessor[im]);
             }
         }
         const long unsigned dshape[] = {static_cast<unsigned long>(nat), static_cast<unsigned long>(des_per_atom)};
@@ -138,9 +139,10 @@ void LCAO_deepks_io::save_npy_d(const int nat,
         std::vector<double> npy_des;
         for (int iat = 0; iat < nat; iat++)
         {
+            auto accessor = descriptor[iat].accessor<double, 1>();
             for (int i = 0; i < des_per_atom; i++)
             {
-                npy_des.push_back(d_tensor[iat].index({i}).item().toDouble());
+                npy_des.push_back(accessor[i]);
             }
         }
         const long unsigned dshape[] = {static_cast<unsigned long>(nat), static_cast<unsigned long>(des_per_atom)};
@@ -157,7 +159,7 @@ void LCAO_deepks_io::save_npy_d(const int nat,
 // saves gvx into grad_vx.npy
 void LCAO_deepks_io::save_npy_gvx(const int nat,
                                   const int des_per_atom,
-                                  const torch::Tensor& gvx_tensor,
+                                  const torch::Tensor& gvx,
                                   const std::string& out_dir,
                                   const int rank)
 {
@@ -178,6 +180,7 @@ void LCAO_deepks_io::save_npy_gvx(const int nat,
                                     static_cast<unsigned long>(des_per_atom)};
 
     std::vector<double> npy_gvx;
+    auto accessor = gvx.accessor<double, 4>();
     for (int ibt = 0; ibt < nat; ++ibt)
     {
         for (int i = 0; i < 3; i++)
@@ -186,7 +189,7 @@ void LCAO_deepks_io::save_npy_gvx(const int nat,
             {
                 for (int p = 0; p < des_per_atom; ++p)
                 {
-                    npy_gvx.push_back(gvx_tensor.index({ibt, i, iat, p}).item().toDouble());
+                    npy_gvx.push_back(accessor[ibt][i][iat][p]);
                 }
             }
         }
@@ -200,7 +203,7 @@ void LCAO_deepks_io::save_npy_gvx(const int nat,
 // saves gvx into grad_vepsl.npy
 void LCAO_deepks_io::save_npy_gvepsl(const int nat,
                                      const int des_per_atom,
-                                     const torch::Tensor& gvepsl_tensor,
+                                     const torch::Tensor& gvepsl,
                                      const std::string& out_dir,
                                      const int rank)
 {
@@ -216,6 +219,7 @@ void LCAO_deepks_io::save_npy_gvepsl(const int nat,
     const long unsigned gshape[] = {6UL, static_cast<unsigned long>(nat), static_cast<unsigned long>(des_per_atom)};
 
     std::vector<double> npy_gvepsl;
+    auto accessor = gvepsl.accessor<double, 3>();
 
     for (int i = 0; i < 6; i++)
     {
@@ -224,7 +228,7 @@ void LCAO_deepks_io::save_npy_gvepsl(const int nat,
 
             for (int p = 0; p < des_per_atom; ++p)
             {
-                npy_gvepsl.push_back(gvepsl_tensor.index({i, ibt, p}).item().toDouble());
+                npy_gvepsl.push_back(accessor[i][ibt][p]);
             }
         }
     }
@@ -341,13 +345,14 @@ void LCAO_deepks_io::save_npy_orbital_precalc(const int nat,
         = {static_cast<unsigned long>(nks), static_cast<unsigned long>(nat), static_cast<unsigned long>(des_per_atom)};
 
     std::vector<double> npy_orbital_precalc;
+    auto accessor = orbital_precalc.accessor<double, 3>();
     for (int iks = 0; iks < nks; ++iks)
     {
         for (int iat = 0; iat < nat; ++iat)
         {
             for (int p = 0; p < des_per_atom; ++p)
             {
-                npy_orbital_precalc.push_back(orbital_precalc.index({iks, iat, p}).item().toDouble());
+                npy_orbital_precalc.push_back(accessor[iks][iat][p]);
             }
         }
     }
@@ -394,7 +399,7 @@ void LCAO_deepks_io::save_npy_v_delta_precalc(const int nat,
                                               const int nks,
                                               const int nlocal,
                                               const int des_per_atom,
-                                              const torch::Tensor& v_delta_precalc_tensor,
+                                              const torch::Tensor& v_delta_precalc,
                                               const std::string& out_dir,
                                               const int rank)
 {
@@ -415,6 +420,9 @@ void LCAO_deepks_io::save_npy_v_delta_precalc(const int nat,
                                     static_cast<unsigned long>(des_per_atom)};
 
     std::vector<TK> npy_v_delta_precalc;
+    auto accessor
+        = v_delta_precalc
+              .accessor<std::conditional_t<std::is_same<TK, double>::value, double, c10::complex<double>>, 5>();
     for (int iks = 0; iks < nks; ++iks)
     {
         for (int mu = 0; mu < nlocal; ++mu)
@@ -427,15 +435,13 @@ void LCAO_deepks_io::save_npy_v_delta_precalc(const int nat,
                     {
                         if constexpr (std::is_same<TK, double>::value)
                         {
-                            npy_v_delta_precalc.push_back(
-                                v_delta_precalc_tensor.index({iks, mu, nu, iat, p}).item().toDouble());
+                            npy_v_delta_precalc.push_back(accessor[iks][mu][nu][iat][p]);
                         }
                         else
                         {
-                            std::complex<double> value(
-                                torch::real(v_delta_precalc_tensor.index({iks, mu, nu, iat, p})).item<double>(),
-                                torch::imag(v_delta_precalc_tensor.index({iks, mu, nu, iat, p})).item<double>());
-                            npy_v_delta_precalc.push_back(value);
+                            c10::complex<double> tmp_c10 = accessor[iks][mu][nu][iat][p];
+                            std::complex<double> tmp = std::complex<double>(tmp_c10.real(), tmp_c10.imag());
+                            npy_v_delta_precalc.push_back(tmp);
                         }
                     }
                 }
@@ -473,6 +479,9 @@ void LCAO_deepks_io::save_npy_phialpha(const int nat,
                                     static_cast<unsigned long>(nlocal),
                                     static_cast<unsigned long>(mmax)};
     std::vector<TK> npy_phialpha;
+    auto accessor
+        = phialpha_tensor
+              .accessor<std::conditional_t<std::is_same<TK, double>::value, double, c10::complex<double>>, 5>();
     for (int iat = 0; iat < nat; iat++)
     {
         for (int nl = 0; nl < nlmax; nl++)
@@ -485,14 +494,13 @@ void LCAO_deepks_io::save_npy_phialpha(const int nat,
                     {
                         if constexpr (std::is_same<TK, double>::value)
                         {
-                            npy_phialpha.push_back(phialpha_tensor.index({iat, nl, iks, mu, m}).item().toDouble());
+                            npy_phialpha.push_back(accessor[iat][nl][iks][mu][m]);
                         }
                         else
                         {
-                            std::complex<double> value(
-                                torch::real(phialpha_tensor.index({iat, nl, iks, mu, m})).item<double>(),
-                                torch::imag(phialpha_tensor.index({iat, nl, iks, mu, m})).item<double>());
-                            npy_phialpha.push_back(value);
+                            c10::complex<double> tmp_c10 = accessor[iat][nl][iks][mu][m];
+                            std::complex<double> tmp = std::complex<double>(tmp_c10.real(), tmp_c10.imag());
+                            npy_phialpha.push_back(tmp);
                         }
                     }
                 }
@@ -507,7 +515,7 @@ void LCAO_deepks_io::save_npy_phialpha(const int nat,
 void LCAO_deepks_io::save_npy_gevdm(const int nat,
                                     const int inlmax,
                                     const int lmaxd,
-                                    const torch::Tensor& gevdm_tensor,
+                                    const torch::Tensor& gevdm,
                                     const std::string& out_dir,
                                     const int rank)
 {
@@ -529,6 +537,7 @@ void LCAO_deepks_io::save_npy_gevdm(const int nat,
                                     static_cast<unsigned long>(mmax),
                                     static_cast<unsigned long>(mmax)};
     std::vector<double> npy_gevdm;
+    auto accessor = gevdm.accessor<double, 5>();
     for (int iat = 0; iat < nat; iat++)
     {
         for (int nl = 0; nl < nlmax; nl++)
@@ -539,7 +548,7 @@ void LCAO_deepks_io::save_npy_gevdm(const int nat,
                 {
                     for (int n = 0; n < mmax; n++)
                     {
-                        npy_gevdm.push_back(gevdm_tensor.index({iat, nl, v, m, n}).item().toDouble());
+                        npy_gevdm.push_back(accessor[iat][nl][v][m][n]);
                     }
                 }
             }
diff --git a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_io.h b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_io.h
index 8c537f1e46..8bd3134007 100644
--- a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_io.h
+++ b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_io.h
@@ -51,7 +51,7 @@ void save_npy_d(const int nat,
                 const int inlmax,
                 const int* inl_l,
                 const bool deepks_equiv,
-                const std::vector<torch::Tensor>& d_tensor,
+                const std::vector<torch::Tensor>& descriptor,
                 const std::string& out_dir,
                 const int rank);
 
@@ -68,7 +68,7 @@ void save_npy_f(const ModuleBase::matrix& f, /**<[in] \f$F_{base}\f$ or \f$F_{to
 
 void save_npy_gvx(const int nat,
                   const int des_per_atom,
-                  const torch::Tensor& gvx_tensor,
+                  const torch::Tensor& gvx,
                   const std::string& out_dir,
                   const int rank);
 
@@ -80,7 +80,7 @@ void save_npy_s(const ModuleBase::matrix& stress, /**<[in] \f$S_{base}\f$ or \f$
 
 void save_npy_gvepsl(const int nat,
                      const int des_per_atom,
-                     const torch::Tensor& gvepsl_tensor,
+                     const torch::Tensor& gvepsl,
                      const std::string& out_dir,
                      const int rank);
 
@@ -110,7 +110,7 @@ void save_npy_v_delta_precalc(const int nat,
                               const int nks,
                               const int nlocal,
                               const int des_per_atom,
-                              const torch::Tensor& v_delta_precalc_tensor,
+                              const torch::Tensor& v_delta_precalc,
                               const std::string& out_dir,
                               const int rank);
 
@@ -128,7 +128,7 @@ void save_npy_phialpha(const int nat,
 void save_npy_gevdm(const int nat,
                     const int inlmax,
                     const int lmaxd,
-                    const torch::Tensor& gevdm_tensor,
+                    const torch::Tensor& gevdm,
                     const std::string& out_dir,
                     const int rank);
 }; // namespace LCAO_deepks_io
diff --git a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_mpi.cpp b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_mpi.cpp
deleted file mode 100644
index 3be7413ac4..0000000000
--- a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_mpi.cpp
+++ /dev/null
@@ -1,23 +0,0 @@
-//This file contains only one subroutine, allsum_deepks
-//which is used to perform allsum on a two-level pointer
-//It is used in a few places in the deepks code
-
-#ifdef __DEEPKS
-
-#include "LCAO_deepks.h"
-#include "module_base/parallel_reduce.h"
-
-#ifdef __MPI
-void LCAO_Deepks::allsum_deepks(
-    int inlmax, //first dimension
-    int ndim, //second dimension
-    double** mat) //the array being reduced 
-{
-    for(int inl=0;inl<inlmax;inl++)
-    {
-        Parallel_Reduce::reduce_all(mat[inl], ndim);
-    }
-}
-#endif
-
-#endif
\ No newline at end of file
diff --git a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_pdm.cpp b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_pdm.cpp
index 689a908f19..7ac566c85f 100644
--- a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_pdm.cpp
+++ b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_pdm.cpp
@@ -41,13 +41,14 @@ void LCAO_Deepks::read_projected_DM(bool read_pdm_file, bool is_equiv, const Num
             for (int inl = 0; inl < this->inlmax; inl++)
             {
                 int nm = this->inl_l[inl] * 2 + 1;
+                auto accessor = this->pdm[inl].accessor<double, 2>();
                 for (int m1 = 0; m1 < nm; m1++)
                 {
                     for (int m2 = 0; m2 < nm; m2++)
                     {
                         double c;
                         ifs >> c;
-                        this->pdm[inl][m1][m2] = c;
+                        accessor[m1][m2] = c;
                     }
                 }
             }
@@ -63,11 +64,12 @@ void LCAO_Deepks::read_projected_DM(bool read_pdm_file, bool is_equiv, const Num
             pdm_size = nproj * nproj;
             for (int inl = 0; inl < this->inlmax; inl++)
             {
+                auto accessor = this->pdm[inl].accessor<double, 1>();
                 for (int ind = 0; ind < pdm_size; ind++)
                 {
                     double c;
                     ifs >> c;
-                    this->pdm[inl][ind] = c;
+                    accessor[ind] = c;
                 }
             }
         }
@@ -324,12 +326,12 @@ void LCAO_Deepks::cal_projected_DM(const elecstate::DensityMatrix<TK, double>* d
                             const int inl = this->inl_index[T0](I0, L0, N0);
                             const int nm = 2 * L0 + 1;
 
+                            auto accessor = this->pdm[inl].accessor<double, 2>();
                             for (int m1 = 0; m1 < nm; ++m1) // m1 = 1 for s, 3 for p, 5 for d
                             {
                                 for (int m2 = 0; m2 < nm; ++m2) // m1 = 1 for s, 3 for p, 5 for d
                                 {
-                                    int ind = m1 * nm + m2;
-                                    pdm[inl][m1][m2] += ddot_(&row_size,
+                                    accessor[m1][m2] += ddot_(&row_size,
                                                               g_1dmt.data() + index * row_size,
                                                               &inc,
                                                               s_1t.data() + index * row_size,
@@ -343,6 +345,7 @@ void LCAO_Deepks::cal_projected_DM(const elecstate::DensityMatrix<TK, double>* d
                 }
                 else
                 {
+                    auto accessor = this->pdm[iat].accessor<double, 1>();
                     int index = 0, inc = 1;
                     int nproj = 0;
                     for (int il = 0; il < this->lmaxd + 1; il++)
@@ -353,7 +356,7 @@ void LCAO_Deepks::cal_projected_DM(const elecstate::DensityMatrix<TK, double>* d
                     {
                         for (int jproj = 0; jproj < nproj; jproj++)
                         {
-                            pdm[iat][iproj * nproj + jproj] += ddot_(&row_size,
+                            accessor[iproj * nproj + jproj] += ddot_(&row_size,
                                                                      g_1dmt.data() + index * row_size,
                                                                      &inc,
                                                                      s_1t.data() + index * row_size,
@@ -386,11 +389,12 @@ void LCAO_Deepks::check_projected_dm()
     for (int inl = 0; inl < inlmax; inl++)
     {
         const int nm = 2 * this->inl_l[inl] + 1;
+        auto accessor = pdm[inl].accessor<double, 2>();
         for (int m1 = 0; m1 < nm; m1++)
         {
             for (int m2 = 0; m2 < nm; m2++)
             {
-                ofs << pdm[inl][m1][m2].item<double>() << " ";
+                ofs << accessor[m1][m2] << " ";
             }
         }
         ofs << std::endl;
diff --git a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_torch.cpp b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_torch.cpp
index 5d881f82f9..7ade93876a 100644
--- a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_torch.cpp
+++ b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_torch.cpp
@@ -2,16 +2,14 @@
 // including loading of model and calculating gradients
 // as well as subroutines that prints the results for checking
 
-// The file contains 8 subroutines:
+// The file contains 3 subroutines:
 //  cal_gvepsl : gvepsl is used for training with stress label, which is derivative of
 //        descriptors wrt strain tensor, calculated by
-//        d(des)/d\epsilon_{ab} = d(pdm)/d\epsilon_{ab} * d(des)/d(pdm) = gdm_epsl * gvdm
+//        d(des)/d\epsilon_{ab} = d(pdm)/d\epsilon_{ab} * d(des)/d(pdm) = gdmepsl * gvdm
 //        using einsum
 //  cal_gevdm : d(des)/d(pdm)
 //        calculated using torch::autograd::grad
 //  load_model : loads model for applying V_delta
-//  prepare_phialpha : prepare phialpha for outputting npy file
-//  prepare_gevdm : prepare gevdm for outputting npy file
 
 #ifdef __DEEPKS
 
@@ -25,14 +23,16 @@
 #include "module_parameter/parameter.h"
 
 // calculates stress of descriptors from gradient of projected density matrices
-void LCAO_Deepks::cal_gvepsl(const int nat, const std::vector<torch::Tensor>& gevdm)
+// gv_epsl:d(d)/d\epsilon_{\alpha\beta}, [natom][6][des_per_atom]
+void LCAO_Deepks::cal_gvepsl(const int nat,
+                             const std::vector<torch::Tensor>& gevdm,
+                             const torch::Tensor& gdmepsl,
+                             torch::Tensor& gvepsl)
 {
     ModuleBase::TITLE("LCAO_Deepks", "cal_gvepsl");
-    if (!gdmepsl_vector.empty())
-    {
-        gdmepsl_vector.erase(gdmepsl_vector.begin(), gdmepsl_vector.end());
-    }
-    // gdmr_vector : nat(derivative) * 3 * inl(projector) * nm * nm
+    // dD/d\epsilon_{\alpha\beta}, tensor vector form of gdmepsl
+    std::vector<torch::Tensor> gdmepsl_vector;
+    auto accessor = gdmepsl.accessor<double, 4>();
     if (GlobalV::MY_RANK == 0)
     {
         // make gdmx as tensor
@@ -40,9 +40,6 @@ void LCAO_Deepks::cal_gvepsl(const int nat, const std::vector<torch::Tensor>& ge
         for (int nl = 0; nl < nlmax; ++nl)
         {
             std::vector<torch::Tensor> bmmv;
-            // for (int ipol=0;ipol<6;++ipol)
-            //{
-            //     std::vector<torch::Tensor> xmmv;
             for (int i = 0; i < 6; ++i)
             {
                 std::vector<torch::Tensor> ammv;
@@ -55,7 +52,7 @@ void LCAO_Deepks::cal_gvepsl(const int nat, const std::vector<torch::Tensor>& ge
                     {
                         for (int m2 = 0; m2 < nm; ++m2)
                         {
-                            mmv.push_back(this->gdm_epsl[i][inl][m1 * nm + m2]);
+                            mmv.push_back(accessor[i][inl][m1][m2]);
                         }
                     } // nm^2
                     torch::Tensor mm
@@ -65,12 +62,9 @@ void LCAO_Deepks::cal_gvepsl(const int nat, const std::vector<torch::Tensor>& ge
                 torch::Tensor bmm = torch::stack(ammv, 0); // nat*nm*nm
                 bmmv.push_back(bmm);
             }
-            // torch::Tensor bmm = torch::stack(xmmv, 0);  //3*nat*nm*nm
-            // bmmv.push_back(bmm);
-            //}
-            this->gdmepsl_vector.push_back(torch::stack(bmmv, 0)); // nbt*3*nat*nm*nm
+            gdmepsl_vector.push_back(torch::stack(bmmv, 0)); // nbt*3*nat*nm*nm
         }
-        assert(this->gdmepsl_vector.size() == nlmax);
+        assert(gdmepsl_vector.size() == nlmax);
 
         // einsum for each inl:
         // gdmepsl_vector : b:npol * a:inl(projector) * m:nm * n:nm
@@ -80,15 +74,15 @@ void LCAO_Deepks::cal_gvepsl(const int nat, const std::vector<torch::Tensor>& ge
         std::vector<torch::Tensor> gvepsl_vector;
         for (int nl = 0; nl < nlmax; ++nl)
         {
-            gvepsl_vector.push_back(at::einsum("bamn, avmn->bav", {this->gdmepsl_vector[nl], gevdm[nl]}));
+            gvepsl_vector.push_back(at::einsum("bamn, avmn->bav", {gdmepsl_vector[nl], gevdm[nl]}));
         }
 
         // cat nv-> \sum_nl(nv) = \sum_nl(nm_nl)=des_per_atom
         // concatenate index a(inl) and m(nm)
-        this->gvepsl_tensor = torch::cat(gvepsl_vector, -1);
-        assert(this->gvepsl_tensor.size(0) == 6);
-        assert(this->gvepsl_tensor.size(1) == nat);
-        assert(this->gvepsl_tensor.size(2) == this->des_per_atom);
+        gvepsl = torch::cat(gvepsl_vector, -1);
+        assert(gvepsl.size(0) == 6);
+        assert(gvepsl.size(1) == nat);
+        assert(gvepsl.size(2) == this->des_per_atom);
     }
 
     return;
@@ -156,289 +150,4 @@ void LCAO_Deepks::load_model(const std::string& deepks_model)
     return;
 }
 
-// prepare_phialpha and prepare_gevdm for deepks_v_delta = 2
-template <typename TK>
-void LCAO_Deepks::prepare_phialpha(const int nlocal,
-                                   const int nat,
-                                   const int nks,
-                                   const std::vector<ModuleBase::Vector3<double>>& kvec_d,
-                                   const UnitCell& ucell,
-                                   const LCAO_Orbitals& orb,
-                                   const Grid_Driver& GridD)
-{
-    ModuleBase::TITLE("LCAO_Deepks", "prepare_phialpha");
-    int nlmax = this->inlmax / nat;
-    int mmax = 2 * this->lmaxd + 1;
-    if constexpr (std::is_same<TK, double>::value)
-    {
-        this->phialpha_tensor = torch::zeros({nat, nlmax, nks, nlocal, mmax}, torch::kFloat64); // support gamma-only
-    }
-    else
-    {
-        this->phialpha_tensor = torch::zeros({nat, nlmax, nks, nlocal, mmax}, torch::kComplexDouble); // support multi-k
-    }
-
-    // cutoff for alpha is same for all types of atoms
-    const double Rcut_Alpha = orb.Alpha[0].getRcut();
-
-    for (int T0 = 0; T0 < ucell.ntype; T0++)
-    {
-        Atom* atom0 = &ucell.atoms[T0];
-        for (int I0 = 0; I0 < atom0->na; I0++)
-        {
-            // iat: atom index on which |alpha> is located
-            const int iat = ucell.itia2iat(T0, I0);
-            const ModuleBase::Vector3<double> tau0 = atom0->tau[I0];
-            GridD.Find_atom(ucell, atom0->tau[I0], T0, I0);
-
-            // outermost loop : find all adjacent atoms
-            for (int ad = 0; ad < GridD.getAdjacentNum() + 1; ++ad)
-            {
-                const int T1 = GridD.getType(ad);
-                const int I1 = GridD.getNatom(ad);
-                const int ibt = ucell.itia2iat(T1, I1);
-                const int start1 = ucell.itiaiw2iwt(T1, I1, 0);
-                const double Rcut_AO1 = orb.Phi[T1].getRcut();
-
-                const ModuleBase::Vector3<double> tau1 = GridD.getAdjacentTau(ad);
-                const Atom* atom1 = &ucell.atoms[T1];
-                const int nw1_tot = atom1->nw * PARAM.globalv.npol;
-
-                const double dist1 = (tau1 - tau0).norm() * ucell.lat0;
-
-                if (dist1 > Rcut_Alpha + Rcut_AO1)
-                {
-                    continue;
-                }
-
-                ModuleBase::Vector3<int> dR(GridD.getBox(ad).x, GridD.getBox(ad).y, GridD.getBox(ad).z);
-
-                if constexpr (std::is_same<TK, std::complex<double>>::value)
-                {
-                    if (this->phialpha[0]->find_matrix(iat, ibt, dR.x, dR.y, dR.z) == nullptr)
-                    {
-                        continue;
-                    }
-                }
-
-                // middle loop : all atomic basis on the adjacent atom ad
-                for (int iw1 = 0; iw1 < nw1_tot; ++iw1)
-                {
-                    const int iw1_all = start1 + iw1;
-                    const int iw1_local = pv->global2local_row(iw1_all);
-                    const int iw2_local = pv->global2local_col(iw1_all);
-                    if (iw1_local < 0 || iw2_local < 0)
-                    {
-                        continue;
-                    }
-                    hamilt::BaseMatrix<double>* overlap = phialpha[0]->find_matrix(iat, ibt, dR);
-
-                    for (int ik = 0; ik < nks; ik++)
-                    {
-                        std::complex<double> kphase = std::complex<double>(1.0, 0.0);
-                        if constexpr (std::is_same<TK, std::complex<double>>::value)
-                        {
-                            const double arg = -(kvec_d[ik] * ModuleBase::Vector3<double>(dR)) * ModuleBase::TWO_PI;
-                            double sinp, cosp;
-                            ModuleBase::libm::sincos(arg, &sinp, &cosp);
-                            kphase = std::complex<double>(cosp, sinp);
-                        }
-                        int ib = 0;
-                        int nl = 0;
-                        for (int L0 = 0; L0 <= orb.Alpha[0].getLmax(); ++L0)
-                        {
-                            for (int N0 = 0; N0 < orb.Alpha[0].getNchi(L0); ++N0)
-                            {
-                                const int nm = 2 * L0 + 1;
-
-                                for (int m1 = 0; m1 < nm; ++m1) // nm = 1 for s, 3 for p, 5 for d
-                                {
-                                    if constexpr (std::is_same<TK, double>::value)
-                                    {
-                                        this->phialpha_tensor[iat][nl][ik][iw1_all][m1]
-                                            = overlap->get_value(iw1, ib + m1);
-                                    }
-                                    else
-                                    {
-                                        std::complex<double> nlm_phase = overlap->get_value(iw1, ib + m1) * kphase;
-                                        torch::Tensor nlm_tensor
-                                            = torch::tensor({nlm_phase.real(), nlm_phase.imag()}, torch::kDouble);
-                                        torch::Tensor complex_tensor = torch::complex(nlm_tensor[0], nlm_tensor[1]);
-                                        this->phialpha_tensor[iat][nl][ik][iw1_all][m1] = complex_tensor;
-                                    }
-                                }
-                                ib += nm;
-                                nl++;
-                            }
-                        }
-                    } // end ik
-                }     // end iw
-            }         // end ad
-        }             // end I0
-    }                 // end T0
-
-#ifdef __MPI
-    TK msg[mmax];
-    for (int iat = 0; iat < nat; iat++)
-    {
-        for (int nl = 0; nl < nlmax; nl++)
-        {
-            for (int ik = 0; ik < nks; ik++)
-            {
-                for (int mu = 0; mu < nlocal; mu++)
-                {
-                    for (int m = 0; m < mmax; m++)
-                    {
-                        if constexpr (std::is_same<TK, double>::value)
-                        {
-                            msg[m] = this->phialpha_tensor[iat][nl][ik][mu][m].item().toDouble();
-                        }
-                        else
-                        {
-                            auto tensor_value = this->phialpha_tensor.index({iat, nl, ik, mu, m});
-                            msg[m] = std::complex<double>(torch::real(tensor_value).item<double>(),
-                                                          torch::imag(tensor_value).item<double>());
-                        }
-                    }
-                    Parallel_Reduce::reduce_all(msg, mmax);
-                    for (int m = 0; m < mmax; m++)
-                    {
-                        if constexpr (std::is_same<TK, double>::value)
-                        {
-                            this->phialpha_tensor[iat][nl][ik][mu][m] = msg[m];
-                        }
-                        else
-                        {
-                            torch::Tensor msg_tensor = torch::tensor({msg[m].real(), msg[m].imag()}, torch::kDouble);
-                            torch::Tensor complex_tensor = torch::complex(msg_tensor[0], msg_tensor[1]);
-                            this->phialpha_tensor[iat][nl][ik][mu][m] = complex_tensor;
-                        }
-                    }
-                }
-            }
-        }
-    }
-
-#endif
-}
-
-template <typename TK>
-void LCAO_Deepks::check_vdp_phialpha(const int nat, const int nks, const int nlocal)
-{
-    std::ofstream ofs("vdp_phialpha.dat");
-    ofs << std::setprecision(10);
-
-    int nlmax = this->inlmax / nat;
-    int mmax = 2 * this->lmaxd + 1;
-    for (int iat = 0; iat < nat; iat++)
-    {
-        for (int nl = 0; nl < nlmax; nl++)
-        {
-            for (int iks = 0; iks < nks; iks++)
-            {
-                for (int mu = 0; mu < nlocal; mu++)
-                {
-                    for (int m = 0; m < mmax; m++)
-                    {
-                        if constexpr (std::is_same<TK, double>::value)
-                        {
-                            ofs << this->phialpha_tensor.index({iat, nl, iks, mu, m}).item().toDouble() << " ";
-                        }
-                        else
-                        {
-                            auto tensor_value = this->phialpha_tensor.index({iat, nl, iks, mu, m});
-                            ofs << std::complex<double>(torch::real(tensor_value).item<double>(),
-                                                        torch::imag(tensor_value).item<double>())
-                                << " ";
-                        }
-                    }
-                }
-            }
-            ofs << std::endl;
-        }
-    }
-    ofs.close();
-}
-
-void LCAO_Deepks::prepare_gevdm(const int nat, const LCAO_Orbitals& orb)
-{
-    int nlmax = this->inlmax / nat;
-    int mmax = 2 * this->lmaxd + 1;
-    this->gevdm_tensor = torch::zeros({nat, nlmax, mmax, mmax, mmax}, torch::TensorOptions().dtype(torch::kFloat64));
-
-    std::vector<torch::Tensor> gevdm;
-    this->cal_gevdm(nat, gevdm);
-
-    int nl = 0;
-    for (int L0 = 0; L0 <= orb.Alpha[0].getLmax(); ++L0)
-    {
-        for (int N0 = 0; N0 < orb.Alpha[0].getNchi(L0); ++N0)
-        {
-            for (int iat = 0; iat < nat; iat++)
-            {
-                const int nm = 2 * L0 + 1;
-                for (int v = 0; v < nm; ++v) // nm = 1 for s, 3 for p, 5 for d
-                {
-                    for (int m = 0; m < nm; ++m)
-                    {
-                        for (int n = 0; n < nm; ++n)
-                        {
-                            this->gevdm_tensor[iat][nl][v][m][n] = gevdm[nl][iat][v][m][n];
-                        }
-                    }
-                }
-            }
-            nl++;
-        }
-    }
-    assert(nl == nlmax);
-}
-
-void LCAO_Deepks::check_vdp_gevdm(const int nat)
-{
-    std::ofstream ofs("vdp_gevdm.dat");
-    ofs << std::setprecision(10);
-
-    int nlmax = this->inlmax / nat;
-    int mmax = 2 * this->lmaxd + 1;
-    for (int iat = 0; iat < nat; iat++)
-    {
-        for (int nl = 0; nl < nlmax; nl++)
-        {
-            for (int v = 0; v < mmax; v++)
-            {
-                for (int m = 0; m < mmax; m++)
-                {
-                    for (int n = 0; n < mmax; n++)
-                    {
-                        ofs << this->gevdm_tensor.index({iat, nl, v, m, n}).item().toDouble() << " ";
-                    }
-                }
-            }
-            ofs << std::endl;
-        }
-    }
-    ofs.close();
-}
-
-template void LCAO_Deepks::prepare_phialpha<double>(const int nlocal,
-                                                    const int nat,
-                                                    const int nks,
-                                                    const std::vector<ModuleBase::Vector3<double>>& kvec_d,
-                                                    const UnitCell& ucell,
-                                                    const LCAO_Orbitals& orb,
-                                                    const Grid_Driver& GridD);
-
-template void LCAO_Deepks::prepare_phialpha<std::complex<double>>(
-    const int nlocal,
-    const int nat,
-    const int nks,
-    const std::vector<ModuleBase::Vector3<double>>& kvec_d,
-    const UnitCell& ucell,
-    const LCAO_Orbitals& orb,
-    const Grid_Driver& GridD);
-
-template void LCAO_Deepks::check_vdp_phialpha<double>(const int nat, const int nks, const int nlocal);
-template void LCAO_Deepks::check_vdp_phialpha<std::complex<double>>(const int nat, const int nks, const int nlocal);
-
 #endif
diff --git a/source/module_hamilt_lcao/module_deepks/cal_descriptor.cpp b/source/module_hamilt_lcao/module_deepks/cal_descriptor.cpp
index e58473ec24..6a0729f220 100644
--- a/source/module_hamilt_lcao/module_deepks/cal_descriptor.cpp
+++ b/source/module_hamilt_lcao/module_deepks/cal_descriptor.cpp
@@ -13,68 +13,64 @@
 #include "module_hamilt_lcao/module_hcontainer/atom_pair.h"
 #include "module_parameter/parameter.h"
 
-void LCAO_Deepks::cal_descriptor_equiv(const int nat)
+void LCAO_Deepks::cal_descriptor_equiv(const int nat, std::vector<torch::Tensor>& descriptor)
 {
     ModuleBase::TITLE("LCAO_Deepks", "cal_descriptor_equiv");
     ModuleBase::timer::tick("LCAO_Deepks", "cal_descriptor_equiv");
 
-    // a rather unnecessary way of writing this, but I'll do it for now
-    if (!this->d_tensor.empty())
-    {
-        this->d_tensor.erase(this->d_tensor.begin(), this->d_tensor.end());
-    }
-
     for (int iat = 0; iat < nat; iat++)
     {
         auto tmp = torch::zeros(des_per_atom, torch::kFloat64);
-        std::memcpy(tmp.data_ptr(), pdm[iat].data_ptr<double>(), sizeof(double) * tmp.numel());
-        this->d_tensor.push_back(tmp);
+        std::memcpy(tmp.data_ptr(), this->pdm[iat].data_ptr<double>(), sizeof(double) * tmp.numel());
+        descriptor.push_back(tmp);
     }
 
     ModuleBase::timer::tick("LCAO_Deepks", "cal_descriptor_equiv");
 }
 
 // calculates descriptors from projected density matrices
-void LCAO_Deepks::cal_descriptor(const int nat)
+void LCAO_Deepks::cal_descriptor(const int nat, std::vector<torch::Tensor>& descriptor)
 {
     ModuleBase::TITLE("LCAO_Deepks", "cal_descriptor");
     ModuleBase::timer::tick("LCAO_Deepks", "cal_descriptor");
 
-    if (PARAM.inp.deepks_equiv)
+    // init descriptor
+    // if descriptor is not empty, clear it !!
+    if (!descriptor.empty())
     {
-        this->cal_descriptor_equiv(nat);
-        return;
+        descriptor.erase(descriptor.begin(), descriptor.end());
     }
 
-    // init d_tensor
-    // if d_tensor is not empty, clear it !!
-    if (!this->d_tensor.empty())
+    if (PARAM.inp.deepks_equiv)
     {
-        this->d_tensor.erase(this->d_tensor.begin(), this->d_tensor.end());
+        this->cal_descriptor_equiv(nat, descriptor);
+        return;
     }
 
     for (int inl = 0; inl < this->inlmax; ++inl)
     {
         const int nm = 2 * inl_l[inl] + 1;
         this->pdm[inl].requires_grad_(true);
-        this->d_tensor.push_back(torch::ones({nm}, torch::requires_grad(true)));
+        descriptor.push_back(torch::ones({nm}, torch::requires_grad(true)));
     }
 
-    // cal d_tensor
+    // cal descriptor
     for (int inl = 0; inl < inlmax; ++inl)
     {
         torch::Tensor vd;
-        std::tuple<torch::Tensor, torch::Tensor> d_v(this->d_tensor[inl], vd);
+        std::tuple<torch::Tensor, torch::Tensor> d_v(descriptor[inl], vd);
         // d_v = torch::symeig(pdm[inl], /*eigenvalues=*/true,
         // /*upper=*/true);
         d_v = torch::linalg::eigh(pdm[inl], /*uplo*/ "U");
-        d_tensor[inl] = std::get<0>(d_v);
+        descriptor[inl] = std::get<0>(d_v);
     }
     ModuleBase::timer::tick("LCAO_Deepks", "cal_descriptor");
     return;
 }
 
-void LCAO_Deepks::check_descriptor(const UnitCell& ucell, const std::string& out_dir)
+void LCAO_Deepks::check_descriptor(const UnitCell& ucell,
+                                   const std::string& out_dir,
+                                   const std::vector<torch::Tensor>& descriptor)
 {
     ModuleBase::TITLE("LCAO_Deepks", "check_descriptor");
 
@@ -101,11 +97,11 @@ void LCAO_Deepks::check_descriptor(const UnitCell& ucell, const std::string& out
                 for (int inl = 0; inl < inlmax / ucell.nat; inl++)
                 {
                     int nm = 2 * inl_l[inl] + 1;
+                    const int ind = iat * inlmax / ucell.nat + inl;
+                    auto accessor = descriptor[ind].accessor<double, 1>();
                     for (int im = 0; im < nm; im++)
                     {
-                        const int ind = iat * inlmax / ucell.nat + inl;
-                        ofs << d_tensor[ind].index({im}).item().toDouble() << " ";
-
+                        ofs << accessor[im] << " ";
                         if (id % 8 == 7)
                         {
                             ofs << std::endl;
@@ -124,9 +120,10 @@ void LCAO_Deepks::check_descriptor(const UnitCell& ucell, const std::string& out
             const int it = ucell.iat2it[iat];
             ofs << ucell.atoms[it].label << " atom_index " << iat + 1 << " n_descriptor " << this->des_per_atom
                 << std::endl;
+            auto accessor = descriptor[iat].accessor<double, 1>();
             for (int i = 0; i < this->des_per_atom; i++)
             {
-                ofs << this->pdm[iat][i].item<double>() << " ";
+                ofs << accessor[i] << " ";
                 if (i % 8 == 7)
                 {
                     ofs << std::endl;
diff --git a/source/module_hamilt_lcao/module_deepks/cal_gdmepsl.cpp b/source/module_hamilt_lcao/module_deepks/cal_gdmepsl.cpp
new file mode 100644
index 0000000000..ecba99b1f3
--- /dev/null
+++ b/source/module_hamilt_lcao/module_deepks/cal_gdmepsl.cpp
@@ -0,0 +1,263 @@
+#ifdef __DEEPKS
+
+#include "LCAO_deepks.h"
+#include "module_base/constants.h"
+#include "module_base/libm/libm.h"
+#include "module_base/parallel_reduce.h"
+#include "module_base/timer.h"
+#include "module_base/vector3.h"
+#include "module_hamilt_lcao/module_hcontainer/atom_pair.h"
+#include "module_parameter/parameter.h"
+
+/// this subroutine calculates the gradient of PDM wrt strain tensor:
+/// gdmepsl = d/d\epsilon_{ab} *
+///           sum_{mu,nu} rho_{mu,nu} <chi_mu|alpha_m><alpha_m'|chi_nu>
+
+// There are 2 subroutines in this file:
+// 1. cal_gdmepsl, calculating gdmepsl
+// 2. check_gdmepsl, which prints gdmepsl to a series of .dat files
+
+template <typename TK>
+void LCAO_Deepks::cal_gdmepsl(const std::vector<std::vector<TK>>& dm,
+                              const UnitCell& ucell,
+                              const LCAO_Orbitals& orb,
+                              const Grid_Driver& GridD,
+                              const int nks,
+                              const std::vector<ModuleBase::Vector3<double>>& kvec_d,
+                              std::vector<hamilt::HContainer<double>*> phialpha,
+                              torch::Tensor& gdmepsl)
+{
+    ModuleBase::TITLE("LCAO_Deepks", "cal_gdmepsl");
+    ModuleBase::timer::tick("LCAO_Deepks", "cal_gdmepsl");
+    // get DS_alpha_mu and S_nu_beta
+
+    int nrow = this->pv->nrow;
+    const int nm = 2 * lmaxd + 1;
+    // gdmepsl: dD/d\epsilon_{\alpha\beta}
+    // size: [6][tot_Inl][2l+1][2l+1]
+    gdmepsl = torch::zeros({6, inlmax, nm, nm}, torch::dtype(torch::kFloat64));
+    auto accessor = gdmepsl.accessor<double, 4>();
+
+    const double Rcut_Alpha = orb.Alpha[0].getRcut();
+
+    for (int T0 = 0; T0 < ucell.ntype; T0++)
+    {
+        Atom* atom0 = &ucell.atoms[T0];
+        for (int I0 = 0; I0 < atom0->na; I0++)
+        {
+            const int iat = ucell.itia2iat(T0, I0); // on which alpha is located
+            const ModuleBase::Vector3<double> tau0 = atom0->tau[I0];
+            GridD.Find_atom(ucell, atom0->tau[I0], T0, I0);
+
+            for (int ad1 = 0; ad1 < GridD.getAdjacentNum() + 1; ++ad1)
+            {
+                const int T1 = GridD.getType(ad1);
+                const int I1 = GridD.getNatom(ad1);
+                const int ibt1 = ucell.itia2iat(T1, I1); // on which chi_mu is located
+                const int start1 = ucell.itiaiw2iwt(T1, I1, 0);
+
+                const ModuleBase::Vector3<double> tau1 = GridD.getAdjacentTau(ad1);
+                const Atom* atom1 = &ucell.atoms[T1];
+                const int nw1_tot = atom1->nw * PARAM.globalv.npol;
+                const double Rcut_AO1 = orb.Phi[T1].getRcut();
+
+                ModuleBase::Vector3<int> dR1(GridD.getBox(ad1).x, GridD.getBox(ad1).y, GridD.getBox(ad1).z);
+
+                for (int ad2 = 0; ad2 < GridD.getAdjacentNum() + 1; ad2++)
+                {
+                    const int T2 = GridD.getType(ad2);
+                    const int I2 = GridD.getNatom(ad2);
+                    const int start2 = ucell.itiaiw2iwt(T2, I2, 0);
+                    const int ibt2 = ucell.itia2iat(T2, I2);
+                    const ModuleBase::Vector3<double> tau2 = GridD.getAdjacentTau(ad2);
+                    const Atom* atom2 = &ucell.atoms[T2];
+                    const int nw2_tot = atom2->nw * PARAM.globalv.npol;
+                    ModuleBase::Vector3<int> dR2(GridD.getBox(ad2).x, GridD.getBox(ad2).y, GridD.getBox(ad2).z);
+
+                    const double Rcut_AO2 = orb.Phi[T2].getRcut();
+                    const double dist1 = (tau1 - tau0).norm() * ucell.lat0;
+                    const double dist2 = (tau2 - tau0).norm() * ucell.lat0;
+
+                    if (dist1 > Rcut_Alpha + Rcut_AO1 || dist2 > Rcut_Alpha + Rcut_AO2)
+                    {
+                        continue;
+                    }
+
+                    double r0[3];
+                    double r1[3];
+                    r1[0] = (tau1.x - tau0.x);
+                    r1[1] = (tau1.y - tau0.y);
+                    r1[2] = (tau1.z - tau0.z);
+                    r0[0] = (tau2.x - tau0.x);
+                    r0[1] = (tau2.y - tau0.y);
+                    r0[2] = (tau2.z - tau0.z);
+                    auto row_indexes = pv->get_indexes_row(ibt1);
+                    auto col_indexes = pv->get_indexes_col(ibt2);
+                    if (row_indexes.size() * col_indexes.size() == 0)
+                    {
+                        continue;
+                    }
+
+                    double* dm_current;
+                    int dRx;
+                    int dRy;
+                    int dRz;
+                    if constexpr (std::is_same<TK, double>::value)
+                    {
+                        dRx = 0;
+                        dRy = 0;
+                        dRz = 0;
+                    }
+                    else
+                    {
+                        dRx = (dR2 - dR1).x;
+                        dRy = (dR2 - dR1).y;
+                        dRz = (dR2 - dR1).z;
+                    }
+                    ModuleBase::Vector3<double> dR(dRx, dRy, dRz);
+
+                    hamilt::AtomPair<double> dm_pair(ibt1, ibt2, dRx, dRy, dRz, pv);
+                    dm_pair.allocate(nullptr, 1);
+                    for (int ik = 0; ik < nks; ik++)
+                    {
+                        TK kphase;
+                        if constexpr (std::is_same<TK, double>::value)
+                        {
+                            kphase = 1.0;
+                        }
+                        else
+                        {
+                            const double arg = -(kvec_d[ik] * dR) * ModuleBase::TWO_PI;
+                            double sinp, cosp;
+                            ModuleBase::libm::sincos(arg, &sinp, &cosp);
+                            kphase = TK(cosp, sinp);
+                        }
+                        if (ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER(PARAM.inp.ks_solver))
+                        {
+                            dm_pair.add_from_matrix(dm[ik].data(), pv->get_row_size(), kphase, 1);
+                        }
+                        else
+                        {
+                            dm_pair.add_from_matrix(dm[ik].data(), pv->get_col_size(), kphase, 0);
+                        }
+                    }
+
+                    dm_current = dm_pair.get_pointer();
+
+                    for (int iw1 = 0; iw1 < row_indexes.size(); ++iw1)
+                    {
+                        for (int iw2 = 0; iw2 < col_indexes.size(); ++iw2)
+                        {
+                            hamilt::BaseMatrix<double>* overlap_1 = phialpha[0]->find_matrix(iat, ibt1, dR1);
+                            hamilt::BaseMatrix<double>* overlap_2 = phialpha[0]->find_matrix(iat, ibt2, dR2);
+                            std::vector<hamilt::BaseMatrix<double>*> grad_overlap_1(3);
+                            std::vector<hamilt::BaseMatrix<double>*> grad_overlap_2(3);
+
+                            assert(overlap_1->get_col_size() == overlap_2->get_col_size());
+
+                            for (int i = 0; i < 3; ++i)
+                            {
+                                grad_overlap_1[i] = phialpha[i + 1]->find_matrix(iat, ibt1, dR1);
+                                grad_overlap_2[i] = phialpha[i + 1]->find_matrix(iat, ibt2, dR2);
+                            }
+
+                            int ib = 0;
+                            for (int L0 = 0; L0 <= orb.Alpha[0].getLmax(); ++L0)
+                            {
+                                for (int N0 = 0; N0 < orb.Alpha[0].getNchi(L0); ++N0)
+                                {
+                                    const int inl = this->inl_index[T0](I0, L0, N0);
+                                    const int nm = 2 * L0 + 1;
+                                    for (int m1 = 0; m1 < nm; ++m1)
+                                    {
+                                        for (int m2 = 0; m2 < nm; ++m2)
+                                        {
+                                            int mm = 0;
+                                            for (int ipol = 0; ipol < 3; ipol++)
+                                            {
+                                                for (int jpol = ipol; jpol < 3; jpol++)
+                                                {
+                                                    accessor[mm][inl][m2][m1]
+                                                        += ucell.lat0 * *dm_current
+                                                           * (grad_overlap_2[jpol]->get_value(col_indexes[iw2], ib + m2)
+                                                              * overlap_1->get_value(row_indexes[iw1], ib + m1)
+                                                              * r0[ipol]);
+                                                    accessor[mm][inl][m2][m1]
+                                                        += ucell.lat0 * *dm_current
+                                                           * (overlap_2->get_value(col_indexes[iw2], ib + m1)
+                                                              * grad_overlap_1[jpol]->get_value(row_indexes[iw1],
+                                                                                                ib + m2)
+                                                              * r1[ipol]);
+                                                    mm++;
+                                                }
+                                            }
+                                        }
+                                    }
+                                    ib += nm;
+                                }
+                            }
+                            assert(ib == overlap_1->get_col_size());
+                            dm_current++;
+                        } // iw2
+                    }     // iw1
+                }         // ad2
+            }             // ad1
+        }                 // I0
+    }                     // T0
+
+#ifdef __MPI
+    Parallel_Reduce::reduce_all(gdmepsl.data_ptr<double>(), 6 * inlmax * nm * nm);
+#endif
+    ModuleBase::timer::tick("LCAO_Deepks", "cal_gdmepsl");
+    return;
+}
+
+void LCAO_Deepks::check_gdmepsl(const torch::Tensor& gdmepsl)
+{
+    std::stringstream ss;
+    std::ofstream ofs;
+
+    ofs << std::setprecision(10);
+
+    const int nm = 2 * this->lmaxd + 1;
+    auto accessor = gdmepsl.accessor<double, 4>();
+    for (int i = 0; i < 6; i++)
+    {
+        ss.str("");
+        ss << "gdmepsl_" << i << ".dat";
+        ofs.open(ss.str().c_str());
+
+        for (int inl = 0; inl < inlmax; inl++)
+        {
+            for (int m1 = 0; m1 < nm; m1++)
+            {
+                for (int m2 = 0; m2 < nm; m2++)
+                {
+                    ofs << accessor[i][inl][m1][m2] << " ";
+                }
+            }
+            ofs << std::endl;
+        }
+        ofs.close();
+    }
+}
+
+template void LCAO_Deepks::cal_gdmepsl<double>(const std::vector<std::vector<double>>& dm,
+                                               const UnitCell& ucell,
+                                               const LCAO_Orbitals& orb,
+                                               const Grid_Driver& GridD,
+                                               const int nks,
+                                               const std::vector<ModuleBase::Vector3<double>>& kvec_d,
+                                               std::vector<hamilt::HContainer<double>*> phialpha,
+                                               torch::Tensor& gdmepsl);
+
+template void LCAO_Deepks::cal_gdmepsl<std::complex<double>>(const std::vector<std::vector<std::complex<double>>>& dm,
+                                                             const UnitCell& ucell,
+                                                             const LCAO_Orbitals& orb,
+                                                             const Grid_Driver& GridD,
+                                                             const int nks,
+                                                             const std::vector<ModuleBase::Vector3<double>>& kvec_d,
+                                                             std::vector<hamilt::HContainer<double>*> phialpha,
+                                                             torch::Tensor& gdmepsl);
+
+#endif
diff --git a/source/module_hamilt_lcao/module_deepks/cal_gdmx.cpp b/source/module_hamilt_lcao/module_deepks/cal_gdmx.cpp
index 6a3b47df45..4dc151af64 100644
--- a/source/module_hamilt_lcao/module_deepks/cal_gdmx.cpp
+++ b/source/module_hamilt_lcao/module_deepks/cal_gdmx.cpp
@@ -3,6 +3,7 @@
 #include "LCAO_deepks.h"
 #include "module_base/constants.h"
 #include "module_base/libm/libm.h"
+#include "module_base/parallel_reduce.h"
 #include "module_base/timer.h"
 #include "module_base/vector3.h"
 #include "module_hamilt_lcao/module_hcontainer/atom_pair.h"
@@ -10,13 +11,9 @@
 
 /// this subroutine calculates the gradient of projected density matrices
 /// gdmx_m,m = d/dX sum_{mu,nu} rho_{mu,nu} <chi_mu|alpha_m><alpha_m'|chi_nu>
-/// if stress label is enabled, the gradient of PDM wrt strain tensor will
-/// be calculated:
-/// gdm_epsl = d/d\epsilon_{ab} *
-///           sum_{mu,nu} rho_{mu,nu} <chi_mu|alpha_m><alpha_m'|chi_nu>
 
 // There are 2 subroutines in this file:
-// 1. cal_gdmx, calculating gdmx (and optionally gdm_epsl for stress) for gamma point
+// 1. cal_gdmx, calculating gdmx
 // 2. check_gdmx, which prints gdmx to a series of .dat files
 
 template <typename TK>
@@ -27,34 +24,19 @@ void LCAO_Deepks::cal_gdmx(const std::vector<std::vector<TK>>& dm,
                            const int nks,
                            const std::vector<ModuleBase::Vector3<double>>& kvec_d,
                            std::vector<hamilt::HContainer<double>*> phialpha,
-                           const bool isstress)
+                           torch::Tensor& gdmx)
 {
     ModuleBase::TITLE("LCAO_Deepks", "cal_gdmx");
     ModuleBase::timer::tick("LCAO_Deepks", "cal_gdmx");
     // get DS_alpha_mu and S_nu_beta
 
-    int size = (2 * lmaxd + 1) * (2 * lmaxd + 1);
     int nrow = this->pv->nrow;
-    for (int iat = 0; iat < ucell.nat; iat++)
-    {
-        for (int inl = 0; inl < inlmax; inl++)
-        {
-            ModuleBase::GlobalFunc::ZEROS(gdmx[iat][inl], size);
-            ModuleBase::GlobalFunc::ZEROS(gdmy[iat][inl], size);
-            ModuleBase::GlobalFunc::ZEROS(gdmz[iat][inl], size);
-        }
-    }
-
-    if (isstress)
-    {
-        for (int ipol = 0; ipol < 6; ipol++)
-        {
-            for (int inl = 0; inl < inlmax; inl++)
-            {
-                ModuleBase::GlobalFunc::ZEROS(gdm_epsl[ipol][inl], size);
-            }
-        }
-    }
+    const int nm = 2 * lmaxd + 1;
+    // gdmx: dD/dX
+    // \sum_{mu,nu} 2*c_mu*c_nu * <dphi_mu/dx|alpha_m><alpha_m'|phi_nu>
+    // size: [3][natom][tot_Inl][2l+1][2l+1]
+    gdmx = torch::zeros({3, ucell.nat, inlmax, nm, nm}, torch::dtype(torch::kFloat64));
+    auto accessor = gdmx.accessor<double, 5>();
 
     const double Rcut_Alpha = orb.Alpha[0].getRcut();
 
@@ -101,17 +83,6 @@ void LCAO_Deepks::cal_gdmx(const std::vector<std::vector<TK>>& dm,
                         continue;
                     }
 
-                    double r0[3];
-                    double r1[3];
-                    if (isstress)
-                    {
-                        r1[0] = (tau1.x - tau0.x);
-                        r1[1] = (tau1.y - tau0.y);
-                        r1[2] = (tau1.z - tau0.z);
-                        r0[0] = (tau2.x - tau0.x);
-                        r0[1] = (tau2.y - tau0.y);
-                        r0[2] = (tau2.z - tau0.z);
-                    }
                     auto row_indexes = pv->get_indexes_row(ibt1);
                     auto col_indexes = pv->get_indexes_col(ibt2);
                     if (row_indexes.size() * col_indexes.size() == 0)
@@ -193,68 +164,29 @@ void LCAO_Deepks::cal_gdmx(const std::vector<std::vector<TK>>& dm,
                                     {
                                         for (int m2 = 0; m2 < nm; ++m2)
                                         {
-                                            //(<d/dX chi_mu|alpha_m>)<chi_nu|alpha_m'>
-                                            gdmx[iat][inl][m1 * nm + m2]
-                                                += grad_overlap_2[0]->get_value(col_indexes[iw2], ib + m2)
-                                                   * overlap_1->get_value(row_indexes[iw1], ib + m1) * *dm_current;
-                                            gdmy[iat][inl][m1 * nm + m2]
-                                                += grad_overlap_2[1]->get_value(col_indexes[iw2], ib + m2)
-                                                   * overlap_1->get_value(row_indexes[iw1], ib + m1) * *dm_current;
-                                            gdmz[iat][inl][m1 * nm + m2]
-                                                += grad_overlap_2[2]->get_value(col_indexes[iw2], ib + m2)
-                                                   * overlap_1->get_value(row_indexes[iw1], ib + m1) * *dm_current;
-
-                                            //(<d/dX chi_nu|alpha_m'>)<chi_mu|alpha_m>
-                                            gdmx[iat][inl][m2 * nm + m1]
-                                                += grad_overlap_2[0]->get_value(col_indexes[iw2], ib + m2)
-                                                   * overlap_1->get_value(row_indexes[iw1], ib + m1) * *dm_current;
-                                            gdmy[iat][inl][m2 * nm + m1]
-                                                += grad_overlap_2[1]->get_value(col_indexes[iw2], ib + m2)
-                                                   * overlap_1->get_value(row_indexes[iw1], ib + m1) * *dm_current;
-                                            gdmz[iat][inl][m2 * nm + m1]
-                                                += grad_overlap_2[2]->get_value(col_indexes[iw2], ib + m2)
-                                                   * overlap_1->get_value(row_indexes[iw1], ib + m1) * *dm_current;
-
-                                            //(<chi_mu|d/dX alpha_m>)<chi_nu|alpha_m'> = -(<d/dX
-                                            //chi_mu|alpha_m>)<chi_nu|alpha_m'>
-                                            gdmx[ibt2][inl][m1 * nm + m2]
-                                                -= grad_overlap_2[0]->get_value(col_indexes[iw2], ib + m2)
-                                                   * overlap_1->get_value(row_indexes[iw1], ib + m1) * *dm_current;
-                                            gdmy[ibt2][inl][m1 * nm + m2]
-                                                -= grad_overlap_2[1]->get_value(col_indexes[iw2], ib + m2)
-                                                   * overlap_1->get_value(row_indexes[iw1], ib + m1) * *dm_current;
-                                            gdmz[ibt2][inl][m1 * nm + m2]
-                                                -= grad_overlap_2[2]->get_value(col_indexes[iw2], ib + m2)
-                                                   * overlap_1->get_value(row_indexes[iw1], ib + m1) * *dm_current;
-
-                                            //(<chi_nu|d/dX alpha_m'>)<chi_mu|alpha_m> = -(<d/dX
-                                            //chi_nu|alpha_m'>)<chi_mu|alpha_m>
-                                            gdmx[ibt2][inl][m2 * nm + m1]
-                                                -= grad_overlap_2[0]->get_value(col_indexes[iw2], ib + m2)
-                                                   * overlap_1->get_value(row_indexes[iw1], ib + m1) * *dm_current;
-                                            gdmy[ibt2][inl][m2 * nm + m1]
-                                                -= grad_overlap_2[1]->get_value(col_indexes[iw2], ib + m2)
-                                                   * overlap_1->get_value(row_indexes[iw1], ib + m1) * *dm_current;
-                                            gdmz[ibt2][inl][m2 * nm + m1]
-                                                -= grad_overlap_2[2]->get_value(col_indexes[iw2], ib + m2)
-                                                   * overlap_1->get_value(row_indexes[iw1], ib + m1) * *dm_current;
-
-                                            if (isstress)
+                                            for (int i = 0; i < 3; i++)
                                             {
-                                                int mm = 0;
-                                                for (int ipol = 0; ipol < 3; ipol++)
-                                                {
-                                                    for (int jpol = ipol; jpol < 3; jpol++)
-                                                    {
-                                                        gdm_epsl[mm][inl][m2 * nm + m1]
-                                                            += ucell.lat0 * *dm_current
-                                                               * (grad_overlap_2[jpol]->get_value(col_indexes[iw2],
-                                                                                                  ib + m2)
-                                                                  * overlap_1->get_value(row_indexes[iw1], ib + m1)
-                                                                  * r0[ipol]);
-                                                        mm++;
-                                                    }
-                                                }
+                                                //(<d/dX chi_mu|alpha_m>)<chi_nu|alpha_m'>
+                                                accessor[i][iat][inl][m1][m2]
+                                                    += grad_overlap_2[i]->get_value(col_indexes[iw2], ib + m2)
+                                                       * overlap_1->get_value(row_indexes[iw1], ib + m1) * *dm_current;
+
+                                                //(<d/dX chi_nu|alpha_m'>)<chi_mu|alpha_m>
+                                                accessor[i][iat][inl][m2][m1]
+                                                    += grad_overlap_2[i]->get_value(col_indexes[iw2], ib + m2)
+                                                       * overlap_1->get_value(row_indexes[iw1], ib + m1) * *dm_current;
+
+                                                // (<chi_mu|d/dX alpha_m>)<chi_nu|alpha_m'> = -(<d/dX
+                                                // chi_mu|alpha_m>)<chi_nu|alpha_m'>
+                                                accessor[i][ibt2][inl][m1][m2]
+                                                    -= grad_overlap_2[i]->get_value(col_indexes[iw2], ib + m2)
+                                                       * overlap_1->get_value(row_indexes[iw1], ib + m1) * *dm_current;
+
+                                                //(<chi_nu|d/dX alpha_m'>)<chi_mu|alpha_m> = -(<d/dX
+                                                //chi_nu|alpha_m'>)<chi_mu|alpha_m>
+                                                accessor[i][ibt2][inl][m2][m1]
+                                                    -= grad_overlap_2[i]->get_value(col_indexes[iw2], ib + m2)
+                                                       * overlap_1->get_value(row_indexes[iw1], ib + m1) * *dm_current;
                                             }
                                         }
                                     }
@@ -262,39 +194,6 @@ void LCAO_Deepks::cal_gdmx(const std::vector<std::vector<TK>>& dm,
                                 }
                             }
                             assert(ib == overlap_1->get_col_size());
-                            if (isstress)
-                            {
-                                int ib = 0;
-                                for (int L0 = 0; L0 <= orb.Alpha[0].getLmax(); ++L0)
-                                {
-                                    for (int N0 = 0; N0 < orb.Alpha[0].getNchi(L0); ++N0)
-                                    {
-                                        const int inl = this->inl_index[T0](I0, L0, N0);
-                                        const int nm = 2 * L0 + 1;
-                                        for (int m1 = 0; m1 < nm; ++m1)
-                                        {
-                                            for (int m2 = 0; m2 < nm; ++m2)
-                                            {
-                                                int mm = 0;
-                                                for (int ipol = 0; ipol < 3; ipol++)
-                                                {
-                                                    for (int jpol = ipol; jpol < 3; jpol++)
-                                                    {
-                                                        gdm_epsl[mm][inl][m2 * nm + m1]
-                                                            += ucell.lat0 * *dm_current
-                                                               * (overlap_2->get_value(col_indexes[iw2], ib + m1)
-                                                                  * grad_overlap_1[jpol]->get_value(row_indexes[iw1],
-                                                                                                    ib + m2)
-                                                                  * r1[ipol]);
-                                                        mm++;
-                                                    }
-                                                }
-                                            }
-                                        }
-                                        ib += nm;
-                                    }
-                                }
-                            }
                             dm_current++;
                         } // iw2
                     }     // iw1
@@ -304,25 +203,13 @@ void LCAO_Deepks::cal_gdmx(const std::vector<std::vector<TK>>& dm,
     }                     // T0
 
 #ifdef __MPI
-    for (int iat = 0; iat < ucell.nat; iat++)
-    {
-        allsum_deepks(this->inlmax, size, this->gdmx[iat]);
-        allsum_deepks(this->inlmax, size, this->gdmy[iat]);
-        allsum_deepks(this->inlmax, size, this->gdmz[iat]);
-    }
-    if (isstress)
-    {
-        for (int ipol = 0; ipol < 6; ipol++)
-        {
-            allsum_deepks(this->inlmax, size, this->gdm_epsl[ipol]);
-        }
-    }
+    Parallel_Reduce::reduce_all(gdmx.data_ptr<double>(), 3 * ucell.nat * inlmax * nm * nm);
 #endif
     ModuleBase::timer::tick("LCAO_Deepks", "cal_gdmx");
     return;
 }
 
-void LCAO_Deepks::check_gdmx(const int nat)
+void LCAO_Deepks::check_gdmx(const int nat, const torch::Tensor& gdmx)
 {
     std::stringstream ss;
     std::ofstream ofs_x;
@@ -333,7 +220,8 @@ void LCAO_Deepks::check_gdmx(const int nat)
     ofs_y << std::setprecision(10);
     ofs_z << std::setprecision(10);
 
-    const int pdm_size = (this->lmaxd * 2 + 1) * (this->lmaxd * 2 + 1);
+    const int nm = 2 * this->lmaxd + 1;
+    auto accessor = gdmx.accessor<double, 5>();
     for (int ia = 0; ia < nat; ia++)
     {
         ss.str("");
@@ -348,11 +236,14 @@ void LCAO_Deepks::check_gdmx(const int nat)
 
         for (int inl = 0; inl < inlmax; inl++)
         {
-            for (int ind = 0; ind < pdm_size; ind++)
+            for (int m1 = 0; m1 < nm; m1++)
             {
-                ofs_x << gdmx[ia][inl][ind] << " ";
-                ofs_y << gdmy[ia][inl][ind] << " ";
-                ofs_z << gdmz[ia][inl][ind] << " ";
+                for (int m2 = 0; m2 < nm; m2++)
+                {
+                    ofs_x << accessor[0][ia][inl][m1][m2] << " ";
+                    ofs_y << accessor[1][ia][inl][m1][m2] << " ";
+                    ofs_z << accessor[2][ia][inl][m1][m2] << " ";
+                }
             }
             ofs_x << std::endl;
             ofs_y << std::endl;
@@ -371,7 +262,7 @@ template void LCAO_Deepks::cal_gdmx<double>(const std::vector<std::vector<double
                                             const int nks,
                                             const std::vector<ModuleBase::Vector3<double>>& kvec_d,
                                             std::vector<hamilt::HContainer<double>*> phialpha,
-                                            const bool isstress);
+                                            torch::Tensor& gdmx);
 
 template void LCAO_Deepks::cal_gdmx<std::complex<double>>(const std::vector<std::vector<std::complex<double>>>& dm,
                                                           const UnitCell& ucell,
@@ -380,6 +271,6 @@ template void LCAO_Deepks::cal_gdmx<std::complex<double>>(const std::vector<std:
                                                           const int nks,
                                                           const std::vector<ModuleBase::Vector3<double>>& kvec_d,
                                                           std::vector<hamilt::HContainer<double>*> phialpha,
-                                                          const bool isstress);
+                                                          torch::Tensor& gdmx);
 
 #endif
diff --git a/source/module_hamilt_lcao/module_deepks/cal_gedm.cpp b/source/module_hamilt_lcao/module_deepks/cal_gedm.cpp
index d35972205b..3f0b2cf025 100644
--- a/source/module_hamilt_lcao/module_deepks/cal_gedm.cpp
+++ b/source/module_hamilt_lcao/module_deepks/cal_gedm.cpp
@@ -68,7 +68,7 @@ inline void generate_py_files(const int lmaxd, const int nmaxd, const std::strin
     }
 }
 
-void LCAO_Deepks::cal_gedm_equiv(const int nat)
+void LCAO_Deepks::cal_gedm_equiv(const int nat, const std::vector<torch::Tensor>& descriptor)
 {
     ModuleBase::TITLE("LCAO_Deepks", "cal_gedm_equiv");
 
@@ -77,7 +77,7 @@ void LCAO_Deepks::cal_gedm_equiv(const int nat)
                                this->inlmax,
                                this->inl_l,
                                PARAM.inp.deepks_equiv,
-                               this->d_tensor,
+                               descriptor,
                                PARAM.globalv.global_out_dir,
                                GlobalV::MY_RANK); // libnpy needed
 
@@ -99,12 +99,12 @@ void LCAO_Deepks::cal_gedm_equiv(const int nat)
 }
 
 // obtain from the machine learning model dE_delta/dDescriptor
-void LCAO_Deepks::cal_gedm(const int nat)
+void LCAO_Deepks::cal_gedm(const int nat, const std::vector<torch::Tensor>& descriptor)
 {
 
     if (PARAM.inp.deepks_equiv)
     {
-        this->cal_gedm_equiv(nat);
+        this->cal_gedm_equiv(nat, descriptor);
         return;
     }
 
@@ -114,10 +114,10 @@ void LCAO_Deepks::cal_gedm(const int nat)
     std::vector<torch::jit::IValue> inputs;
 
     // input_dim:(natom, des_per_atom)
-    inputs.push_back(torch::cat(this->d_tensor, 0).reshape({1, nat, this->des_per_atom}));
+    inputs.push_back(torch::cat(descriptor, 0).reshape({1, nat, this->des_per_atom}));
     std::vector<torch::Tensor> ec;
     ec.push_back(module.forward(inputs).toTensor()); // Hartree
-    this->E_delta = ec[0].item().toDouble() * 2;     // Ry; *2 is for Hartree to Ry
+    this->E_delta = ec[0].item<double>() * 2;        // Ry; *2 is for Hartree to Ry
 
     // cal gedm
     std::vector<torch::Tensor> gedm_shell;
@@ -133,13 +133,14 @@ void LCAO_Deepks::cal_gedm(const int nat)
     for (int inl = 0; inl < inlmax; ++inl)
     {
         int nm = 2 * inl_l[inl] + 1;
+        auto accessor = this->gedm_tensor[inl].accessor<double, 2>();
         for (int m1 = 0; m1 < nm; ++m1)
         {
             for (int m2 = 0; m2 < nm; ++m2)
             {
                 int index = m1 * nm + m2;
                 //*2 is for Hartree to Ry
-                this->gedm[inl][index] = this->gedm_tensor[inl].index({m1, m2}).item().toDouble() * 2;
+                this->gedm[inl][index] = accessor[m1][m2] * 2;
             }
         }
     }
diff --git a/source/module_hamilt_lcao/module_deepks/cal_gvx.cpp b/source/module_hamilt_lcao/module_deepks/cal_gvx.cpp
index cec0f1d84a..acd4044510 100644
--- a/source/module_hamilt_lcao/module_deepks/cal_gvx.cpp
+++ b/source/module_hamilt_lcao/module_deepks/cal_gvx.cpp
@@ -16,16 +16,17 @@
 
 // calculates gradient of descriptors from gradient of projected density
 // matrices
-void LCAO_Deepks::cal_gvx(const int nat, const std::vector<torch::Tensor>& gevdm)
+void LCAO_Deepks::cal_gvx(const int nat,
+                          const std::vector<torch::Tensor>& gevdm,
+                          const torch::Tensor& gdmx,
+                          torch::Tensor& gvx)
 {
     ModuleBase::TITLE("LCAO_Deepks", "cal_gvx");
 
-    if (!gdmr_vector.empty())
-    {
-        gdmr_vector.erase(gdmr_vector.begin(), gdmr_vector.end());
-    }
+    // gdmr : nat(derivative) * 3 * inl(projector) * nm * nm
+    std::vector<torch::Tensor> gdmr;
+    auto accessor = gdmx.accessor<double, 5>();
 
-    // gdmr_vector : nat(derivative) * 3 * inl(projector) * nm * nm
     if (GlobalV::MY_RANK == 0)
     {
         // make gdmx as tensor
@@ -48,18 +49,7 @@ void LCAO_Deepks::cal_gvx(const int nat, const std::vector<torch::Tensor>& gevdm
                         {
                             for (int m2 = 0; m2 < nm; ++m2)
                             {
-                                if (i == 0)
-                                {
-                                    mmv.push_back(this->gdmx[ibt][inl][m1 * nm + m2]);
-                                }
-                                if (i == 1)
-                                {
-                                    mmv.push_back(this->gdmy[ibt][inl][m1 * nm + m2]);
-                                }
-                                if (i == 2)
-                                {
-                                    mmv.push_back(this->gdmz[ibt][inl][m1 * nm + m2]);
-                                }
+                                mmv.push_back(accessor[i][ibt][inl][m1][m2]);
                             }
                         } // nm^2
                         torch::Tensor mm = torch::tensor(mmv, torch::TensorOptions().dtype(torch::kFloat64))
@@ -72,46 +62,54 @@ void LCAO_Deepks::cal_gvx(const int nat, const std::vector<torch::Tensor>& gevdm
                 torch::Tensor bmm = torch::stack(xmmv, 0); // 3*nat*nm*nm
                 bmmv.push_back(bmm);
             }
-            this->gdmr_vector.push_back(torch::stack(bmmv, 0)); // nbt*3*nat*nm*nm
+            gdmr.push_back(torch::stack(bmmv, 0)); // nbt*3*nat*nm*nm
         }
 
-        assert(this->gdmr_vector.size() == nlmax);
+        assert(gdmr.size() == nlmax);
 
         // einsum for each inl:
-        // gdmr_vector : b:nat(derivative) * x:3 * a:inl(projector) * m:nm *
+        // gdmr : b:nat(derivative) * x:3 * a:inl(projector) * m:nm *
         // n:nm gevdm : a:inl * v:nm (descriptor) * m:nm (pdm, dim1) *
         // n:nm (pdm, dim2) gvx_vector : b:nat(derivative) * x:3 *
         // a:inl(projector) * m:nm(descriptor)
         std::vector<torch::Tensor> gvx_vector;
         for (int nl = 0; nl < nlmax; ++nl)
         {
-            gvx_vector.push_back(at::einsum("bxamn, avmn->bxav", {this->gdmr_vector[nl], gevdm[nl]}));
+            gvx_vector.push_back(at::einsum("bxamn, avmn->bxav", {gdmr[nl], gevdm[nl]}));
         }
 
         // cat nv-> \sum_nl(nv) = \sum_nl(nm_nl)=des_per_atom
         // concatenate index a(inl) and m(nm)
-        this->gvx_tensor = torch::cat(gvx_vector, -1);
+        // gvx:d(d)/dX, size: [natom][3][natom][des_per_atom]
+        gvx = torch::cat(gvx_vector, -1);
 
-        assert(this->gvx_tensor.size(0) == nat);
-        assert(this->gvx_tensor.size(1) == 3);
-        assert(this->gvx_tensor.size(2) == nat);
-        assert(this->gvx_tensor.size(3) == this->des_per_atom);
+        assert(gvx.size(0) == nat);
+        assert(gvx.size(1) == 3);
+        assert(gvx.size(2) == nat);
+        assert(gvx.size(3) == this->des_per_atom);
     }
 
     return;
 }
 
-void LCAO_Deepks::check_gvx(const int nat)
+void LCAO_Deepks::check_gvx(const int nat, const torch::Tensor& gvx)
 {
     std::stringstream ss;
     std::ofstream ofs_x;
     std::ofstream ofs_y;
     std::ofstream ofs_z;
+    
+    if (GlobalV::MY_RANK != 0)
+    {
+        return;
+    }
 
     ofs_x << std::setprecision(12);
     ofs_y << std::setprecision(12);
     ofs_z << std::setprecision(12);
 
+    auto accessor = gvx.accessor<double, 4>();
+
     for (int ia = 0; ia < nat; ia++)
     {
         ss.str("");
@@ -134,10 +132,9 @@ void LCAO_Deepks::check_gvx(const int nat)
             {
                 int nm = 2 * inl_l[inl] + 1;
                 {
-                    const int ind = ib * inlmax / nat + inl;
-                    ofs_x << gvx_tensor.index({ia, 0, ib, inl}).item().toDouble() << " ";
-                    ofs_y << gvx_tensor.index({ia, 1, ib, inl}).item().toDouble() << " ";
-                    ofs_z << gvx_tensor.index({ia, 2, ib, inl}).item().toDouble() << " ";
+                    ofs_x << accessor[ia][0][ib][inl] << " ";
+                    ofs_y << accessor[ia][1][ib][inl] << " ";
+                    ofs_z << accessor[ia][2][ib][inl] << " ";
                 }
             }
             ofs_x << std::endl;
diff --git a/source/module_hamilt_lcao/module_deepks/deepks_orbpre.cpp b/source/module_hamilt_lcao/module_deepks/deepks_orbpre.cpp
index 8d48b981c0..8c912bece8 100644
--- a/source/module_hamilt_lcao/module_deepks/deepks_orbpre.cpp
+++ b/source/module_hamilt_lcao/module_deepks/deepks_orbpre.cpp
@@ -40,6 +40,7 @@ void DeePKS_domain::cal_orbital_precalc(const std::vector<TH>& dm_hl,
 
     torch::Tensor orbital_pdm
         = torch::zeros({nks, inlmax, (2 * lmaxd + 1), (2 * lmaxd + 1)}, torch::dtype(torch::kFloat64));
+    auto accessor = orbital_pdm.accessor<double, 4>();
 
     for (int T0 = 0; T0 < ucell.ntype; T0++)
     {
@@ -275,11 +276,11 @@ void DeePKS_domain::cal_orbital_precalc(const std::vector<TH>& dm_hl,
                             {
                                 for (int m2 = 0; m2 < nm; ++m2) // m1 = 1 for s, 3 for p, 5 for d
                                 {
-                                    orbital_pdm[ik][inl][m1][m2] += ddot_(&row_size,
-                                                                          p_g1dmt + index * row_size * nks,
-                                                                          &inc,
-                                                                          s_1t.data() + index * row_size,
-                                                                          &inc);
+                                    accessor[ik][inl][m1][m2] += ddot_(&row_size,
+                                                                       p_g1dmt + index * row_size * nks,
+                                                                       &inc,
+                                                                       s_1t.data() + index * row_size,
+                                                                       &inc);
                                     index++;
                                 }
                             }
@@ -291,14 +292,8 @@ void DeePKS_domain::cal_orbital_precalc(const std::vector<TH>& dm_hl,
         }
     }
 #ifdef __MPI
-    for (int iks = 0; iks < nks; iks++)
-    {
-        for (int inl = 0; inl < inlmax; inl++)
-        {
-            auto tensor_slice = orbital_pdm[iks][inl];
-            Parallel_Reduce::reduce_all(tensor_slice.data_ptr<double>(), (2 * lmaxd + 1) * (2 * lmaxd + 1));
-        }
-    }
+    const int size = nks * inlmax * (2 * lmaxd + 1) * (2 * lmaxd + 1);
+    Parallel_Reduce::reduce_all(orbital_pdm.data_ptr<double>(), size);
 #endif
 
     // transfer orbital_pdm [nks,inl,nm,nm] to orbital_pdm_vector [nl,[nks,nat,nm,nm]]
@@ -321,7 +316,7 @@ void DeePKS_domain::cal_orbital_precalc(const std::vector<TH>& dm_hl,
                 {
                     for (int m2 = 0; m2 < nm; ++m2) // m1 = 1 for s, 3 for p, 5 for d
                     {
-                        mmv.push_back(orbital_pdm[iks][inl][m1][m2].item<double>());
+                        mmv.push_back(accessor[iks][inl][m1][m2]);
                     }
                 }
                 torch::Tensor mm
diff --git a/source/module_hamilt_lcao/module_deepks/deepks_vdpre.cpp b/source/module_hamilt_lcao/module_deepks/deepks_vdpre.cpp
new file mode 100644
index 0000000000..e0c9a0be8e
--- /dev/null
+++ b/source/module_hamilt_lcao/module_deepks/deepks_vdpre.cpp
@@ -0,0 +1,645 @@
+/// cal_v_delta_precalc : v_delta_precalc is used for training with v_delta label,
+//                         which equals gvdm * v_delta_pdm,
+//                         v_delta_pdm = overlap * overlap
+/// check_v_delta_precalc : check v_delta_precalc
+//  prepare_phialpha : prepare phialpha for outputting npy file
+//  prepare_gevdm : prepare gevdm for outputting npy file
+
+#ifdef __DEEPKS
+
+#include "deepks_vdpre.h"
+
+#include "LCAO_deepks_io.h" // mohan add 2024-07-22
+#include "module_base/blas_connector.h"
+#include "module_base/constants.h"
+#include "module_base/libm/libm.h"
+#include "module_base/parallel_reduce.h"
+#include "module_hamilt_lcao/module_hcontainer/atom_pair.h"
+#include "module_parameter/parameter.h"
+
+// calculates v_delta_precalc[nks,nlocal,nlocal,NAt,NDscrpt] = gvdm * v_delta_pdm;
+// v_delta_pdm[nks,nlocal,nlocal,Inl,nm,nm] = overlap * overlap;
+// for deepks_v_delta = 1
+template <typename TK>
+void DeePKS_domain::cal_v_delta_precalc(const int nlocal,
+                                        const int lmaxd,
+                                        const int inlmax,
+                                        const int nat,
+                                        const int nks,
+                                        const int* inl_l,
+                                        const std::vector<ModuleBase::Vector3<double>>& kvec_d,
+                                        const std::vector<hamilt::HContainer<double>*> phialpha,
+                                        const std::vector<torch::Tensor> gevdm,
+                                        const ModuleBase::IntArray* inl_index,
+                                        const UnitCell& ucell,
+                                        const LCAO_Orbitals& orb,
+                                        const Parallel_Orbitals& pv,
+                                        const Grid_Driver& GridD,
+                                        torch::Tensor& v_delta_precalc)
+{
+    ModuleBase::TITLE("DeePKS_domain", "calc_v_delta_precalc");
+    // timeval t_start;
+    // gettimeofday(&t_start,NULL);
+
+    constexpr torch::Dtype dtype = std::is_same<TK, double>::value ? torch::kFloat64 : torch::kComplexDouble;
+
+    const double Rcut_Alpha = orb.Alpha[0].getRcut();
+
+    torch::Tensor v_delta_pdm
+        = torch::zeros({nks, nlocal, nlocal, inlmax, (2 * lmaxd + 1), (2 * lmaxd + 1)}, torch::dtype(dtype));
+    auto accessor
+        = v_delta_pdm.accessor<std::conditional_t<std::is_same<TK, double>::value, double, c10::complex<double>>, 6>();
+
+    for (int T0 = 0; T0 < ucell.ntype; T0++)
+    {
+        Atom* atom0 = &ucell.atoms[T0];
+
+        for (int I0 = 0; I0 < atom0->na; I0++)
+        {
+            const int iat = ucell.itia2iat(T0, I0);
+            const ModuleBase::Vector3<double> tau0 = atom0->tau[I0];
+            GridD.Find_atom(ucell, atom0->tau[I0], T0, I0);
+
+            for (int ad1 = 0; ad1 < GridD.getAdjacentNum() + 1; ++ad1)
+            {
+                const int T1 = GridD.getType(ad1);
+                const int I1 = GridD.getNatom(ad1);
+                const int ibt1 = ucell.itia2iat(T1, I1);
+                const int start1 = ucell.itiaiw2iwt(T1, I1, 0);
+                const ModuleBase::Vector3<double> tau1 = GridD.getAdjacentTau(ad1);
+                const Atom* atom1 = &ucell.atoms[T1];
+                const int nw1_tot = atom1->nw * PARAM.globalv.npol;
+                const double Rcut_AO1 = orb.Phi[T1].getRcut();
+
+                const double dist1 = (tau1 - tau0).norm() * ucell.lat0;
+                if (dist1 >= Rcut_Alpha + Rcut_AO1)
+                {
+                    continue;
+                }
+
+                ModuleBase::Vector3<int> dR1(GridD.getBox(ad1).x, GridD.getBox(ad1).y, GridD.getBox(ad1).z);
+
+                if constexpr (std::is_same<TK, std::complex<double>>::value)
+                {
+                    if (phialpha[0]->find_matrix(iat, ibt1, dR1.x, dR1.y, dR1.z) == nullptr)
+                    {
+                        continue;
+                    }
+                }
+
+                for (int ad2 = 0; ad2 < GridD.getAdjacentNum() + 1; ad2++)
+                {
+                    const int T2 = GridD.getType(ad2);
+                    const int I2 = GridD.getNatom(ad2);
+                    const int ibt2 = ucell.itia2iat(T2, I2);
+                    const int start2 = ucell.itiaiw2iwt(T2, I2, 0);
+                    const ModuleBase::Vector3<double> tau2 = GridD.getAdjacentTau(ad2);
+                    const Atom* atom2 = &ucell.atoms[T2];
+                    const int nw2_tot = atom2->nw * PARAM.globalv.npol;
+
+                    const double Rcut_AO2 = orb.Phi[T2].getRcut();
+                    const double dist2 = (tau2 - tau0).norm() * ucell.lat0;
+
+                    if (dist2 >= Rcut_Alpha + Rcut_AO2)
+                    {
+                        continue;
+                    }
+
+                    ModuleBase::Vector3<int> dR2(GridD.getBox(ad2).x, GridD.getBox(ad2).y, GridD.getBox(ad2).z);
+
+                    if constexpr (std::is_same<TK, std::complex<double>>::value)
+                    {
+                        if (phialpha[0]->find_matrix(iat, ibt2, dR2.x, dR2.y, dR2.z) == nullptr)
+                        {
+                            continue;
+                        }
+                    }
+
+                    for (int iw1 = 0; iw1 < nw1_tot; ++iw1)
+                    {
+                        const int iw1_all = start1 + iw1; // this is \mu
+                        const int iw1_local = pv.global2local_row(iw1_all);
+                        if (iw1_local < 0)
+                        {
+                            continue;
+                        }
+                        for (int iw2 = 0; iw2 < nw2_tot; ++iw2)
+                        {
+                            const int iw2_all = start2 + iw2; // this is \nu
+                            const int iw2_local = pv.global2local_col(iw2_all);
+                            if (iw2_local < 0)
+                            {
+                                continue;
+                            }
+
+                            hamilt::BaseMatrix<double>* overlap_1 = phialpha[0]->find_matrix(iat, ibt1, dR1);
+                            hamilt::BaseMatrix<double>* overlap_2 = phialpha[0]->find_matrix(iat, ibt2, dR2);
+                            assert(overlap_1->get_col_size() == overlap_2->get_col_size());
+
+                            for (int ik = 0; ik < nks; ik++)
+                            {
+                                int ib = 0;
+                                std::complex<double> kphase = std::complex<double>(1.0, 0.0);
+                                if constexpr (std::is_same<TK, std::complex<double>>::value)
+                                {
+                                    const double arg
+                                        = -(kvec_d[ik] * ModuleBase::Vector3<double>(dR1 - dR2)) * ModuleBase::TWO_PI;
+                                    double sinp, cosp;
+                                    ModuleBase::libm::sincos(arg, &sinp, &cosp);
+                                    kphase = std::complex<double>(cosp, sinp);
+                                }
+                                for (int L0 = 0; L0 <= orb.Alpha[0].getLmax(); ++L0)
+                                {
+                                    for (int N0 = 0; N0 < orb.Alpha[0].getNchi(L0); ++N0)
+                                    {
+                                        const int inl = inl_index[T0](I0, L0, N0);
+                                        const int nm = 2 * L0 + 1;
+
+                                        for (int m1 = 0; m1 < nm; ++m1) // nm = 1 for s, 3 for p, 5 for d
+                                        {
+                                            for (int m2 = 0; m2 < nm; ++m2) // nm = 1 for s, 3 for p, 5 for d
+                                            {
+                                                if constexpr (std::is_same<TK, double>::value)
+                                                {
+                                                    accessor[ik][iw1][iw2][inl][m1][m2]
+                                                        += overlap_1->get_value(iw1, ib + m1)
+                                                           * overlap_2->get_value(iw2, ib + m2);
+                                                }
+                                                else
+                                                {
+                                                    c10::complex<double> tmp;
+                                                    tmp = overlap_1->get_value(iw1, ib + m1)
+                                                          * overlap_2->get_value(iw2, ib + m2)
+                                                          * kphase; // from std::complex to c10::complex
+                                                    accessor[ik][iw1][iw2][inl][m1][m2] += tmp;
+                                                }
+                                            }
+                                        }
+                                        ib += nm;
+                                    }
+                                }
+                            } // ik
+                        }     // iw2
+                    }         // iw1
+                }             // ad2
+            }                 // ad1
+        }
+    }
+#ifdef __MPI
+    const int size = nks * nlocal * nlocal * inlmax * (2 * lmaxd + 1) * (2 * lmaxd + 1);
+    TK* data_ptr;
+    if constexpr (std::is_same<TK, double>::value)
+    {
+        data_ptr = v_delta_pdm.data_ptr<double>();
+    }
+    else
+    {
+        c10::complex<double>* c10_ptr = v_delta_pdm.data_ptr<c10::complex<double>>();
+        data_ptr = reinterpret_cast<TK*>(c10_ptr);
+    }
+    Parallel_Reduce::reduce_all(data_ptr, size);
+#endif
+
+    // transfer v_delta_pdm to v_delta_pdm_vector
+    int nlmax = inlmax / nat;
+
+    std::vector<torch::Tensor> v_delta_pdm_vector;
+    for (int nl = 0; nl < nlmax; ++nl)
+    {
+        std::vector<torch::Tensor> kuuammv;
+        for (int iks = 0; iks < nks; ++iks)
+        {
+            std::vector<torch::Tensor> uuammv;
+            for (int mu = 0; mu < nlocal; ++mu)
+            {
+                std::vector<torch::Tensor> uammv;
+                for (int nu = 0; nu < nlocal; ++nu)
+                {
+                    std::vector<torch::Tensor> ammv;
+                    for (int iat = 0; iat < nat; ++iat)
+                    {
+                        int inl = iat * nlmax + nl;
+                        int nm = 2 * inl_l[inl] + 1;
+                        std::vector<TK> mmv;
+
+                        for (int m1 = 0; m1 < nm; ++m1) // m1 = 1 for s, 3 for p, 5 for d
+                        {
+                            for (int m2 = 0; m2 < nm; ++m2) // m1 = 1 for s, 3 for p, 5 for d
+                            {
+                                if constexpr (std::is_same<TK, double>::value)
+                                {
+                                    mmv.push_back(accessor[iks][mu][nu][inl][m1][m2]);
+                                }
+                                else
+                                {
+                                    c10::complex<double> tmp_c10 = accessor[iks][mu][nu][inl][m1][m2];
+                                    std::complex<double> tmp = std::complex<double>(tmp_c10.real(), tmp_c10.imag());
+                                    mmv.push_back(tmp);
+                                }
+                            }
+                        }
+                        torch::Tensor mm = torch::from_blob(mmv.data(),
+                                                            {nm, nm},
+                                                            torch::TensorOptions().dtype(dtype))
+                                               .clone(); // nm*nm
+                        ammv.push_back(mm);
+                    }
+                    torch::Tensor amm = torch::stack(ammv, 0);
+                    uammv.push_back(amm);
+                }
+                torch::Tensor uamm = torch::stack(uammv, 0);
+                uuammv.push_back(uamm);
+            }
+            torch::Tensor uuamm = torch::stack(uuammv, 0);
+            kuuammv.push_back(uuamm);
+        }
+        torch::Tensor kuuamm = torch::stack(kuuammv, 0);
+        v_delta_pdm_vector.push_back(kuuamm);
+    }
+
+    assert(v_delta_pdm_vector.size() == nlmax);
+
+    // einsum for each nl:
+    std::vector<torch::Tensor> v_delta_precalc_vector;
+    for (int nl = 0; nl < nlmax; ++nl)
+    {
+        torch::Tensor gevdm_complex = gevdm[nl].to(dtype);
+        v_delta_precalc_vector.push_back(at::einsum("kxyamn, avmn->kxyav", {v_delta_pdm_vector[nl], gevdm[nl]}));
+    }
+
+    v_delta_precalc = torch::cat(v_delta_precalc_vector, -1);
+    //  timeval t_end;
+    //  gettimeofday(&t_end,NULL);
+    //  std::cout<<"calculate v_delta_precalc time:\t"<<(double)(t_end.tv_sec-t_start.tv_sec) +
+    //  (double)(t_end.tv_usec-t_start.tv_usec)/1000000.0<<std::endl;
+    return;
+}
+
+template <typename TK>
+void DeePKS_domain::check_v_delta_precalc(const int nat,
+                                          const int nks,
+                                          const int nlocal,
+                                          const int des_per_atom,
+                                          const torch::Tensor& v_delta_precalc)
+{
+    std::ofstream ofs("v_delta_precalc.dat");
+    ofs << std::setprecision(10);
+    auto accessor
+        = v_delta_precalc
+              .accessor<std::conditional_t<std::is_same<TK, double>::value, double, c10::complex<double>>, 5>();
+    for (int iks = 0; iks < nks; ++iks)
+    {
+        for (int mu = 0; mu < nlocal; ++mu)
+        {
+            for (int nu = 0; nu < nlocal; ++nu)
+            {
+                for (int iat = 0; iat < nat; ++iat)
+                {
+                    for (int p = 0; p < des_per_atom; ++p)
+                    {
+                        if constexpr (std::is_same<TK, double>::value)
+                        {
+                            ofs << accessor[iks][mu][nu][iat][p] << " ";
+                        }
+                        else
+                        {
+                            c10::complex<double> tmp_c10 = accessor[iks][mu][nu][iat][p];
+                            std::complex<double> tmp = std::complex<double>(tmp_c10.real(), tmp_c10.imag());
+                            ofs << tmp << " ";
+                        }
+                    }
+                }
+                ofs << std::endl;
+            }
+        }
+    }
+    ofs.close();
+}
+
+// prepare_phialpha and prepare_gevdm for deepks_v_delta = 2
+template <typename TK>
+void DeePKS_domain::prepare_phialpha(const int nlocal,
+                                     const int lmaxd,
+                                     const int inlmax,
+                                     const int nat,
+                                     const int nks,
+                                     const std::vector<ModuleBase::Vector3<double>>& kvec_d,
+                                     const std::vector<hamilt::HContainer<double>*> phialpha,
+                                     const UnitCell& ucell,
+                                     const LCAO_Orbitals& orb,
+                                     const Parallel_Orbitals& pv,
+                                     const Grid_Driver& GridD,
+                                     torch::Tensor& phialpha_out)
+{
+    ModuleBase::TITLE("DeePKS_domain", "prepare_phialpha");
+    constexpr torch::Dtype dtype = std::is_same<TK, double>::value ? torch::kFloat64 : torch::kComplexDouble;
+    int nlmax = inlmax / nat;
+    int mmax = 2 * lmaxd + 1;
+    phialpha_out = torch::zeros({nat, nlmax, nks, nlocal, mmax}, dtype);
+
+    // cutoff for alpha is same for all types of atoms
+    const double Rcut_Alpha = orb.Alpha[0].getRcut();
+
+    for (int T0 = 0; T0 < ucell.ntype; T0++)
+    {
+        Atom* atom0 = &ucell.atoms[T0];
+        for (int I0 = 0; I0 < atom0->na; I0++)
+        {
+            // iat: atom index on which |alpha> is located
+            const int iat = ucell.itia2iat(T0, I0);
+            const ModuleBase::Vector3<double> tau0 = atom0->tau[I0];
+            GridD.Find_atom(ucell, atom0->tau[I0], T0, I0);
+
+            // outermost loop : find all adjacent atoms
+            for (int ad = 0; ad < GridD.getAdjacentNum() + 1; ++ad)
+            {
+                const int T1 = GridD.getType(ad);
+                const int I1 = GridD.getNatom(ad);
+                const int ibt = ucell.itia2iat(T1, I1);
+                const int start1 = ucell.itiaiw2iwt(T1, I1, 0);
+                const double Rcut_AO1 = orb.Phi[T1].getRcut();
+
+                const ModuleBase::Vector3<double> tau1 = GridD.getAdjacentTau(ad);
+                const Atom* atom1 = &ucell.atoms[T1];
+                const int nw1_tot = atom1->nw * PARAM.globalv.npol;
+
+                const double dist1 = (tau1 - tau0).norm() * ucell.lat0;
+
+                if (dist1 > Rcut_Alpha + Rcut_AO1)
+                {
+                    continue;
+                }
+
+                ModuleBase::Vector3<int> dR(GridD.getBox(ad).x, GridD.getBox(ad).y, GridD.getBox(ad).z);
+
+                if constexpr (std::is_same<TK, std::complex<double>>::value)
+                {
+                    if (phialpha[0]->find_matrix(iat, ibt, dR.x, dR.y, dR.z) == nullptr)
+                    {
+                        continue;
+                    }
+                }
+
+                // middle loop : all atomic basis on the adjacent atom ad
+                for (int iw1 = 0; iw1 < nw1_tot; ++iw1)
+                {
+                    const int iw1_all = start1 + iw1;
+                    const int iw1_local = pv.global2local_row(iw1_all);
+                    const int iw2_local = pv.global2local_col(iw1_all);
+                    if (iw1_local < 0 || iw2_local < 0)
+                    {
+                        continue;
+                    }
+                    hamilt::BaseMatrix<double>* overlap = phialpha[0]->find_matrix(iat, ibt, dR);
+
+                    for (int ik = 0; ik < nks; ik++)
+                    {
+                        std::complex<double> kphase = std::complex<double>(1.0, 0.0);
+                        if constexpr (std::is_same<TK, std::complex<double>>::value)
+                        {
+                            const double arg = -(kvec_d[ik] * ModuleBase::Vector3<double>(dR)) * ModuleBase::TWO_PI;
+                            double sinp, cosp;
+                            ModuleBase::libm::sincos(arg, &sinp, &cosp);
+                            kphase = std::complex<double>(cosp, sinp);
+                        }
+                        int ib = 0;
+                        int nl = 0;
+                        for (int L0 = 0; L0 <= orb.Alpha[0].getLmax(); ++L0)
+                        {
+                            for (int N0 = 0; N0 < orb.Alpha[0].getNchi(L0); ++N0)
+                            {
+                                const int nm = 2 * L0 + 1;
+
+                                for (int m1 = 0; m1 < nm; ++m1) // nm = 1 for s, 3 for p, 5 for d
+                                {
+                                    if constexpr (std::is_same<TK, double>::value)
+                                    {
+                                        phialpha_out[iat][nl][ik][iw1_all][m1] = overlap->get_value(iw1, ib + m1);
+                                    }
+                                    else
+                                    {
+                                        c10::complex<double> tmp;
+                                        tmp = overlap->get_value(iw1, ib + m1) * kphase;
+                                        phialpha_out.index_put_({iat, nl, ik, iw1_all, m1}, tmp);
+                                    }
+                                }
+                                ib += nm;
+                                nl++;
+                            }
+                        }
+                    } // end ik
+                }     // end iw
+            }         // end ad
+        }             // end I0
+    }                 // end T0
+
+#ifdef __MPI
+    int size = nat * nlmax * nks * nlocal * mmax;
+    TK* data_ptr;
+    if constexpr (std::is_same<TK, double>::value)
+    {
+        data_ptr = phialpha_out.data_ptr<double>();
+    }
+    else
+    {
+        c10::complex<double>* c10_ptr = phialpha_out.data_ptr<c10::complex<double>>();
+        data_ptr = reinterpret_cast<TK*>(c10_ptr);
+    }
+    Parallel_Reduce::reduce_all(data_ptr, size);
+
+#endif
+}
+
+template <typename TK>
+void DeePKS_domain::check_vdp_phialpha(const int nat,
+                                       const int nks,
+                                       const int nlocal,
+                                       const int inlmax,
+                                       const int lmaxd,
+                                       const torch::Tensor& phialpha_out)
+{
+    std::ofstream ofs("vdp_phialpha.dat");
+    ofs << std::setprecision(10);
+    auto accessor
+        = phialpha_out.accessor<std::conditional_t<std::is_same<TK, double>::value, double, c10::complex<double>>, 5>();
+
+    int nlmax = inlmax / nat;
+    int mmax = 2 * lmaxd + 1;
+    for (int iat = 0; iat < nat; iat++)
+    {
+        for (int nl = 0; nl < nlmax; nl++)
+        {
+            for (int iks = 0; iks < nks; iks++)
+            {
+                for (int mu = 0; mu < nlocal; mu++)
+                {
+                    for (int m = 0; m < mmax; m++)
+                    {
+                        if constexpr (std::is_same<TK, double>::value)
+                        {
+                            ofs << accessor[iat][nl][iks][mu][m] << " ";
+                        }
+                        else
+                        {
+                            c10::complex<double> tmp_c10 = accessor[iat][nl][iks][mu][m];
+                            std::complex<double> tmp = std::complex<double>(tmp_c10.real(), tmp_c10.imag());
+                            ofs << tmp << " ";
+                        }
+                    }
+                }
+            }
+            ofs << std::endl;
+        }
+    }
+    ofs.close();
+}
+
+void DeePKS_domain::prepare_gevdm(const int nat,
+                                  const int lmaxd,
+                                  const int inlmax,
+                                  const LCAO_Orbitals& orb,
+                                  const std::vector<torch::Tensor>& gevdm_in,
+                                  torch::Tensor& gevdm_out)
+{
+    int nlmax = inlmax / nat;
+    int mmax = 2 * lmaxd + 1;
+    gevdm_out = torch::zeros({nat, nlmax, mmax, mmax, mmax}, torch::TensorOptions().dtype(torch::kFloat64));
+
+    int nl = 0;
+    for (int L0 = 0; L0 <= orb.Alpha[0].getLmax(); ++L0)
+    {
+        for (int N0 = 0; N0 < orb.Alpha[0].getNchi(L0); ++N0)
+        {
+            for (int iat = 0; iat < nat; iat++)
+            {
+                const int nm = 2 * L0 + 1;
+                for (int v = 0; v < nm; ++v) // nm = 1 for s, 3 for p, 5 for d
+                {
+                    for (int m = 0; m < nm; ++m)
+                    {
+                        for (int n = 0; n < nm; ++n)
+                        {
+                            gevdm_out[iat][nl][v][m][n] = gevdm_in[nl][iat][v][m][n];
+                        }
+                    }
+                }
+            }
+            nl++;
+        }
+    }
+    assert(nl == nlmax);
+}
+
+void DeePKS_domain::check_vdp_gevdm(const int nat, const int lmaxd, const int inlmax, const torch::Tensor& gevdm)
+{
+    std::ofstream ofs("vdp_gevdm.dat");
+    ofs << std::setprecision(10);
+
+    auto accessor = gevdm.accessor<double, 5>();
+
+    int nlmax = inlmax / nat;
+    int mmax = 2 * lmaxd + 1;
+    for (int iat = 0; iat < nat; iat++)
+    {
+        for (int nl = 0; nl < nlmax; nl++)
+        {
+            for (int v = 0; v < mmax; v++)
+            {
+                for (int m = 0; m < mmax; m++)
+                {
+                    for (int n = 0; n < mmax; n++)
+                    {
+                        ofs << accessor[iat][nl][v][m][n] << " ";
+                    }
+                }
+            }
+            ofs << std::endl;
+        }
+    }
+    ofs.close();
+}
+
+template void DeePKS_domain::cal_v_delta_precalc<double>(const int nlocal,
+                                                         const int lmaxd,
+                                                         const int inlmax,
+                                                         const int nat,
+                                                         const int nks,
+                                                         const int* inl_l,
+                                                         const std::vector<ModuleBase::Vector3<double>>& kvec_d,
+                                                         const std::vector<hamilt::HContainer<double>*> phialpha,
+                                                         const std::vector<torch::Tensor> gevdm,
+                                                         const ModuleBase::IntArray* inl_index,
+                                                         const UnitCell& ucell,
+                                                         const LCAO_Orbitals& orb,
+                                                         const Parallel_Orbitals& pv,
+                                                         const Grid_Driver& GridD,
+                                                         torch::Tensor& v_delta_precalc);
+template void DeePKS_domain::cal_v_delta_precalc<std::complex<double>>(
+    const int nlocal,
+    const int lmaxd,
+    const int inlmax,
+    const int nat,
+    const int nks,
+    const int* inl_l,
+    const std::vector<ModuleBase::Vector3<double>>& kvec_d,
+    const std::vector<hamilt::HContainer<double>*> phialpha,
+    const std::vector<torch::Tensor> gevdm,
+    const ModuleBase::IntArray* inl_index,
+    const UnitCell& ucell,
+    const LCAO_Orbitals& orb,
+    const Parallel_Orbitals& pv,
+    const Grid_Driver& GridD,
+    torch::Tensor& v_delta_precalc);
+
+template void DeePKS_domain::check_v_delta_precalc<double>(const int nat,
+                                                           const int nks,
+                                                           const int nlocal,
+                                                           const int des_per_atom,
+                                                           const torch::Tensor& v_delta_precalc);
+template void DeePKS_domain::check_v_delta_precalc<std::complex<double>>(const int nat,
+                                                                         const int nks,
+                                                                         const int nlocal,
+                                                                         const int des_per_atom,
+                                                                         const torch::Tensor& v_delta_precalc);
+
+template void DeePKS_domain::prepare_phialpha<double>(const int nlocal,
+                                                      const int lmaxd,
+                                                      const int inlmax,
+                                                      const int nat,
+                                                      const int nks,
+                                                      const std::vector<ModuleBase::Vector3<double>>& kvec_d,
+                                                      const std::vector<hamilt::HContainer<double>*> phialpha,
+                                                      const UnitCell& ucell,
+                                                      const LCAO_Orbitals& orb,
+                                                      const Parallel_Orbitals& pv,
+                                                      const Grid_Driver& GridD,
+                                                      torch::Tensor& phialpha_out);
+
+template void DeePKS_domain::prepare_phialpha<std::complex<double>>(
+    const int nlocal,
+    const int lmaxd,
+    const int inlmax,
+    const int nat,
+    const int nks,
+    const std::vector<ModuleBase::Vector3<double>>& kvec_d,
+    const std::vector<hamilt::HContainer<double>*> phialpha,
+    const UnitCell& ucell,
+    const LCAO_Orbitals& orb,
+    const Parallel_Orbitals& pv,
+    const Grid_Driver& GridD,
+    torch::Tensor& phialpha_out);
+
+template void DeePKS_domain::check_vdp_phialpha<double>(const int nat,
+                                                        const int nks,
+                                                        const int nlocal,
+                                                        const int inlmax,
+                                                        const int lmaxd,
+                                                        const torch::Tensor& phialpha_out);
+template void DeePKS_domain::check_vdp_phialpha<std::complex<double>>(const int nat,
+                                                                      const int nks,
+                                                                      const int nlocal,
+                                                                      const int inlmax,
+                                                                      const int lmaxd,
+                                                                      const torch::Tensor& phialpha_out);
+
+#endif
diff --git a/source/module_hamilt_lcao/module_deepks/deepks_vdpre.h b/source/module_hamilt_lcao/module_deepks/deepks_vdpre.h
new file mode 100644
index 0000000000..41a6e1be80
--- /dev/null
+++ b/source/module_hamilt_lcao/module_deepks/deepks_vdpre.h
@@ -0,0 +1,95 @@
+#ifndef DEEPKS_VDPRE_H
+#define DEEPKS_VDPRE_H
+
+#ifdef __DEEPKS
+
+#include "module_base/complexmatrix.h"
+#include "module_base/intarray.h"
+#include "module_base/matrix.h"
+#include "module_base/timer.h"
+#include "module_basis/module_ao/parallel_orbitals.h"
+#include "module_basis/module_nao/two_center_integrator.h"
+#include "module_cell/module_neighbor/sltk_grid_driver.h"
+#include "module_hamilt_lcao/module_hcontainer/hcontainer.h"
+
+#include <torch/script.h>
+#include <torch/torch.h>
+
+namespace DeePKS_domain
+{
+//------------------------
+// deepks_vdpre.cpp
+//------------------------
+
+// This file contains 6 subroutines for calculating v_delta,
+// 1. cal_v_delta_precalc : v_delta_precalc is used for training with v_delta label,
+//                         which equals gvdm * v_delta_pdm,
+//                         v_delta_pdm = overlap * overlap
+// 2. check_v_delta_precalc : check v_delta_precalc
+// 3. prepare_phialpha : prepare phialpha for outputting npy file
+// 4. check_vdp_phialpha : check phialpha
+// 5. prepare_gevdm : prepare gevdm for outputting npy file
+// 6. check_vdp_gevdm : check gevdm
+
+// for deepks_v_delta = 1
+// calculates v_delta_precalc
+template <typename TK>
+void cal_v_delta_precalc(const int nlocal,
+                         const int lmaxd,
+                         const int inlmax,
+                         const int nat,
+                         const int nks,
+                         const int* inl_l,
+                         const std::vector<ModuleBase::Vector3<double>>& kvec_d,
+                         const std::vector<hamilt::HContainer<double>*> phialpha,
+                         const std::vector<torch::Tensor> gevdm,
+                         const ModuleBase::IntArray* inl_index,
+                         const UnitCell& ucell,
+                         const LCAO_Orbitals& orb,
+                         const Parallel_Orbitals& pv,
+                         const Grid_Driver& GridD,
+                         torch::Tensor& v_delta_precalc);
+
+template <typename TK>
+void check_v_delta_precalc(const int nat,
+                           const int nks,
+                           const int nlocal,
+                           const int des_per_atom,
+                           const torch::Tensor& v_delta_precalc);
+
+// for deepks_v_delta = 2
+// prepare phialpha for outputting npy file
+template <typename TK>
+void prepare_phialpha(const int nlocal,
+                      const int lmaxd,
+                      const int inlmax,
+                      const int nat,
+                      const int nks,
+                      const std::vector<ModuleBase::Vector3<double>>& kvec_d,
+                      const std::vector<hamilt::HContainer<double>*> phialpha,
+                      const UnitCell& ucell,
+                      const LCAO_Orbitals& orb,
+                      const Parallel_Orbitals& pv,
+                      const Grid_Driver& GridD,
+                      torch::Tensor& phialpha_out);
+
+template <typename TK>
+void check_vdp_phialpha(const int nat,
+                        const int nks,
+                        const int nlocal,
+                        const int inlmax,
+                        const int lmaxd,
+                        const torch::Tensor& phialpha_out);
+
+// prepare gevdm for outputting npy file
+void prepare_gevdm(const int nat,
+                   const int lmaxd,
+                   const int inlmax,
+                   const LCAO_Orbitals& orb,
+                   const std::vector<torch::Tensor>& gevdm_in,
+                   torch::Tensor& gevdm_out);
+
+void check_vdp_gevdm(const int nat, const int lmaxd, const int inlmax, const torch::Tensor& gevdm);
+} // namespace DeePKS_domain
+#endif
+#endif
\ No newline at end of file
diff --git a/source/module_hamilt_lcao/module_deepks/test/LCAO_deepks_test.cpp b/source/module_hamilt_lcao/module_deepks/test/LCAO_deepks_test.cpp
index 76a8eb2d71..067bd42625 100644
--- a/source/module_hamilt_lcao/module_deepks/test/LCAO_deepks_test.cpp
+++ b/source/module_hamilt_lcao/module_deepks/test/LCAO_deepks_test.cpp
@@ -113,8 +113,8 @@ void test_deepks::set_dm_k_new()
 void test_deepks::set_p_elec_DM()
 {
     // gamma
-    int nspin=PARAM.inp.nspin;
-    this->p_elec_DM=new elecstate::DensityMatrix<double, double>(&ParaO,nspin);
+    int nspin = PARAM.inp.nspin;
+    this->p_elec_DM = new elecstate::DensityMatrix<double, double>(&ParaO, nspin);
     p_elec_DM->init_DMR(&Test_Deepks::GridD, &ucell);
     for (int ik = 0; ik < nspin; ik++)
     {
@@ -126,7 +126,10 @@ void test_deepks::set_p_elec_DM()
 void test_deepks::set_p_elec_DM_k()
 {
     // multi k
-    this->p_elec_DM_k=new elecstate::DensityMatrix<std::complex<double>, double>(&ParaO, PARAM.inp.nspin, kv.kvec_d, kv.nkstot / PARAM.inp.nspin);
+    this->p_elec_DM_k = new elecstate::DensityMatrix<std::complex<double>, double>(&ParaO,
+                                                                                   PARAM.inp.nspin,
+                                                                                   kv.kvec_d,
+                                                                                   kv.nkstot / PARAM.inp.nspin);
     p_elec_DM_k->init_DMR(&Test_Deepks::GridD, &ucell);
     for (int ik = 0; ik < kv.nkstot; ++ik)
     {
@@ -155,18 +158,18 @@ void test_deepks::check_pdm()
     this->compare_with_ref("pdm.dat", "pdm_ref.dat");
 }
 
-void test_deepks::check_gdmx()
+void test_deepks::check_gdmx(torch::Tensor& gdmx)
 {
-    GlobalC::ld.init_gdmx(ucell.nat);
     if (PARAM.sys.gamma_only_local)
     {
-        GlobalC::ld.cal_gdmx(dm_new, ucell, ORB, Test_Deepks::GridD, kv.nkstot, kv.kvec_d, GlobalC::ld.phialpha, 0);
+        GlobalC::ld.cal_gdmx(dm_new, ucell, ORB, Test_Deepks::GridD, kv.nkstot, kv.kvec_d, GlobalC::ld.phialpha, gdmx);
     }
     else
     {
-        GlobalC::ld.cal_gdmx(dm_k_new, ucell, ORB, Test_Deepks::GridD, kv.nkstot, kv.kvec_d, GlobalC::ld.phialpha, 0);
+        GlobalC::ld
+            .cal_gdmx(dm_k_new, ucell, ORB, Test_Deepks::GridD, kv.nkstot, kv.kvec_d, GlobalC::ld.phialpha, gdmx);
     }
-    GlobalC::ld.check_gdmx(ucell.nat);
+    GlobalC::ld.check_gdmx(ucell.nat, gdmx);
 
     for (int ia = 0; ia < ucell.nat; ia++)
     {
@@ -193,19 +196,20 @@ void test_deepks::check_gdmx()
     }
 }
 
-void test_deepks::check_descriptor()
+void test_deepks::check_descriptor(std::vector<torch::Tensor>& descriptor)
 {
-    GlobalC::ld.cal_descriptor(ucell.nat);
-    GlobalC::ld.check_descriptor(ucell,"./");
+    GlobalC::ld.cal_descriptor(ucell.nat, descriptor);
+    GlobalC::ld.check_descriptor(ucell, "./", descriptor);
     this->compare_with_ref("deepks_desc.dat", "descriptor_ref.dat");
 }
 
-void test_deepks::check_gvx()
+void test_deepks::check_gvx(torch::Tensor& gdmx)
 {
     std::vector<torch::Tensor> gevdm;
     GlobalC::ld.cal_gevdm(ucell.nat, gevdm);
-    GlobalC::ld.cal_gvx(ucell.nat, gevdm);
-    GlobalC::ld.check_gvx(ucell.nat);
+    torch::Tensor gvx;
+    GlobalC::ld.cal_gvx(ucell.nat, gevdm, gdmx, gvx);
+    GlobalC::ld.check_gvx(ucell.nat, gvx);
 
     for (int ia = 0; ia < ucell.nat; ia++)
     {
@@ -231,7 +235,7 @@ void test_deepks::check_gvx()
     }
 }
 
-void test_deepks::check_edelta()
+void test_deepks::check_edelta(std::vector<torch::Tensor>& descriptor)
 {
     GlobalC::ld.load_model("model.ptg");
     if (PARAM.sys.gamma_only_local)
@@ -242,7 +246,7 @@ void test_deepks::check_edelta()
     {
         GlobalC::ld.allocate_V_delta(ucell.nat, kv.nkstot);
     }
-    GlobalC::ld.cal_gedm(ucell.nat);
+    GlobalC::ld.cal_gedm(ucell.nat, descriptor);
 
     std::ofstream ofs("E_delta.dat");
     ofs << std::setprecision(10) << GlobalC::ld.E_delta << std::endl;
@@ -256,16 +260,17 @@ void test_deepks::check_edelta()
 void test_deepks::cal_H_V_delta()
 {
     hamilt::HS_Matrix_K<double>* hsk = new hamilt::HS_Matrix_K<double>(&ParaO);
-    hamilt::HContainer<double>* hR = new hamilt::HContainer<double>(ucell,&ParaO);
-    hamilt::Operator<double>* op_deepks = new hamilt::DeePKS<hamilt::OperatorLCAO<double, double>>(hsk,
-                                                            kv.kvec_d,
-                                                            hR, // no explicit call yet
-                                                            &ucell,
-                                                            &Test_Deepks::GridD,
-                                                            &overlap_orb_alpha_,
-                                                            &ORB,
-                                                            kv.nkstot,
-                                                            p_elec_DM);
+    hamilt::HContainer<double>* hR = new hamilt::HContainer<double>(ucell, &ParaO);
+    hamilt::Operator<double>* op_deepks
+        = new hamilt::DeePKS<hamilt::OperatorLCAO<double, double>>(hsk,
+                                                                   kv.kvec_d,
+                                                                   hR, // no explicit call yet
+                                                                   &ucell,
+                                                                   &Test_Deepks::GridD,
+                                                                   &overlap_orb_alpha_,
+                                                                   &ORB,
+                                                                   kv.nkstot,
+                                                                   p_elec_DM);
     for (int ik = 0; ik < kv.nkstot; ++ik)
     {
         op_deepks->init(ik);
@@ -277,15 +282,16 @@ void test_deepks::cal_H_V_delta_k()
     hamilt::HS_Matrix_K<std::complex<double>>* hsk = new hamilt::HS_Matrix_K<std::complex<double>>(&ParaO);
     hamilt::HContainer<double>* hR = new hamilt::HContainer<double>(&ParaO);
 
-    hamilt::Operator<std::complex<double>>* op_deepks = new hamilt::DeePKS<hamilt::OperatorLCAO<std::complex<double>, double>>(hsk,
-                                                            kv.kvec_d,
-                                                            hR, // no explicit call yet
-                                                            &ucell,
-                                                            &Test_Deepks::GridD,
-                                                            &overlap_orb_alpha_,
-                                                            &ORB,
-                                                            kv.nkstot,
-                                                            p_elec_DM_k);
+    hamilt::Operator<std::complex<double>>* op_deepks
+        = new hamilt::DeePKS<hamilt::OperatorLCAO<std::complex<double>, double>>(hsk,
+                                                                                 kv.kvec_d,
+                                                                                 hR, // no explicit call yet
+                                                                                 &ucell,
+                                                                                 &Test_Deepks::GridD,
+                                                                                 &overlap_orb_alpha_,
+                                                                                 &ORB,
+                                                                                 kv.nkstot,
+                                                                                 p_elec_DM_k);
     for (int ik = 0; ik < kv.nkstot; ++ik)
     {
         op_deepks->init(ik);
@@ -297,7 +303,7 @@ void test_deepks::check_e_deltabands()
     if (PARAM.sys.gamma_only_local)
     {
         this->cal_H_V_delta();
-        GlobalC::ld.cal_e_delta_band(dm_new,1);
+        GlobalC::ld.cal_e_delta_band(dm_new, 1);
     }
     else
     {
@@ -341,19 +347,19 @@ void test_deepks::check_f_delta_and_stress_delta()
     {
         const int nks = kv.nkstot;
         DeePKS_domain::cal_f_delta<std::complex<double>>(dm_k_new,
-                                           ucell,
-                                           ORB,
-                                           Test_Deepks::GridD,
-                                           ParaO,
-                                           GlobalC::ld.lmaxd,
-                                           nks,
-                                           kv.kvec_d,
-                                           GlobalC::ld.phialpha,
-                                           GlobalC::ld.gedm,
-                                           GlobalC::ld.inl_index,
-                                           fvnl_dalpha,
-                                           cal_stress,
-                                           svnl_dalpha);
+                                                         ucell,
+                                                         ORB,
+                                                         Test_Deepks::GridD,
+                                                         ParaO,
+                                                         GlobalC::ld.lmaxd,
+                                                         nks,
+                                                         kv.kvec_d,
+                                                         GlobalC::ld.phialpha,
+                                                         GlobalC::ld.gedm,
+                                                         GlobalC::ld.inl_index,
+                                                         fvnl_dalpha,
+                                                         cal_stress,
+                                                         svnl_dalpha);
     }
     DeePKS_domain::check_f_delta(ucell.nat, fvnl_dalpha, svnl_dalpha);
 
diff --git a/source/module_hamilt_lcao/module_deepks/test/LCAO_deepks_test.h b/source/module_hamilt_lcao/module_deepks/test/LCAO_deepks_test.h
index 5dca7f1261..84105b91d9 100644
--- a/source/module_hamilt_lcao/module_deepks/test/LCAO_deepks_test.h
+++ b/source/module_hamilt_lcao/module_deepks/test/LCAO_deepks_test.h
@@ -13,6 +13,8 @@
 #include <iomanip>
 #include <iostream>
 #include <sstream>
+#include <torch/script.h>
+#include <torch/torch.h>
 
 namespace Test_Deepks
 {
@@ -21,7 +23,7 @@ extern Grid_Driver GridD;
 
 namespace GlobalC
 {
-  extern LCAO_Deepks ld;
+extern LCAO_Deepks ld;
 }
 
 class test_deepks
@@ -95,12 +97,12 @@ class test_deepks
     void read_dm_k(const int nks);
 
     void check_pdm();
-    void check_gdmx();
+    void check_gdmx(torch::Tensor& gdmx);
 
-    void check_descriptor();
-    void check_gvx();
+    void check_descriptor(std::vector<torch::Tensor>& descriptor);
+    void check_gvx(torch::Tensor& gdmx);
 
-    void check_edelta();
+    void check_edelta(std::vector<torch::Tensor>& descriptor);
 
     // calculate H_V_delta
     void cal_H_V_delta();
diff --git a/source/module_hamilt_lcao/module_deepks/test/main_deepks.cpp b/source/module_hamilt_lcao/module_deepks/test/main_deepks.cpp
index da8b64e4e6..6f7f636d10 100644
--- a/source/module_hamilt_lcao/module_deepks/test/main_deepks.cpp
+++ b/source/module_hamilt_lcao/module_deepks/test/main_deepks.cpp
@@ -5,54 +5,57 @@
 
 int calculate();
 
-int main(int argc, char **argv)
+int main(int argc, char** argv)
 {
 #ifdef __MPI
-	MPI_Init(&argc,&argv);
+    MPI_Init(&argc, &argv);
 #endif
     int status = calculate();
 #ifdef __MPI
-	MPI_Finalize();
+    MPI_Finalize();
 #endif
 
-	if(status>0)
-	{
-		return 1;
-	}
-	else
-	{
-    	return 0;
-	}
+    if (status > 0)
+    {
+        return 1;
+    }
+    else
+    {
+        return 0;
+    }
 }
 
 int calculate()
 {
-	test_deepks test;
+    test_deepks test;
 
-	test.preparation();
+    test.preparation();
 
-	test.check_dstable();
-	test.check_phialpha();
+    test.check_dstable();
+    test.check_phialpha();
 
-	test.check_pdm();
-	test.check_gdmx();
+    test.check_pdm();
 
-	test.check_descriptor();
-	test.check_gvx();
+    torch::Tensor gdmx;
+    test.check_gdmx(gdmx);
 
-	test.check_edelta();
-	test.check_e_deltabands();
-	test.check_f_delta_and_stress_delta();
+    std::vector<torch::Tensor> descriptor;
+    test.check_descriptor(descriptor);
+    test.check_gvx(gdmx);
 
-	std::cout << " [  ------  ] Total checks : " << test.total_check <<std::endl;
-	if(test.failed_check>0)
-	{
-		std::cout << "\e[1;31m [  FAILED  ]\e[0m Failed checks : " << test.failed_check <<std::endl;
-	}
-	else
-	{
-		std::cout << "\e[1;32m [  PASS    ]\e[0m All checks passed!" << std::endl;
-	}
+    test.check_edelta(descriptor);
+    test.check_e_deltabands();
+    test.check_f_delta_and_stress_delta();
+
+    std::cout << " [  ------  ] Total checks : " << test.total_check << std::endl;
+    if (test.failed_check > 0)
+    {
+        std::cout << "\e[1;31m [  FAILED  ]\e[0m Failed checks : " << test.failed_check << std::endl;
+    }
+    else
+    {
+        std::cout << "\e[1;32m [  PASS    ]\e[0m All checks passed!" << std::endl;
+    }
 
     return test.failed_check;
 }
\ No newline at end of file
diff --git a/source/module_hamilt_lcao/module_deepks/v_delta_precalc.cpp b/source/module_hamilt_lcao/module_deepks/v_delta_precalc.cpp
deleted file mode 100644
index c3cea5eb76..0000000000
--- a/source/module_hamilt_lcao/module_deepks/v_delta_precalc.cpp
+++ /dev/null
@@ -1,346 +0,0 @@
-/// cal_v_delta_precalc : v_delta_precalc is used for training with v_delta label,
-//                         which equals gvdm * v_delta_pdm_shell,
-//                         v_delta_pdm_shell = overlap * overlap
-/// check_v_delta_precalc : check v_delta_precalc
-
-#ifdef __DEEPKS
-
-#include "LCAO_deepks.h"
-#include "LCAO_deepks_io.h" // mohan add 2024-07-22
-#include "module_base/blas_connector.h"
-#include "module_base/constants.h"
-#include "module_base/libm/libm.h"
-#include "module_base/parallel_reduce.h"
-#include "module_hamilt_lcao/module_hcontainer/atom_pair.h"
-#include "module_parameter/parameter.h"
-
-// calculates v_delta_precalc[nks,nlocal,nlocal,NAt,NDscrpt] = gvdm * v_delta_pdm_shell;
-// v_delta_pdm_shell[nks,nlocal,nlocal,Inl,nm*nm] = overlap * overlap;
-// for deepks_v_delta = 1
-template <typename TK>
-void LCAO_Deepks::cal_v_delta_precalc(const int nlocal,
-                                      const int nat,
-                                      const int nks,
-                                      const std::vector<ModuleBase::Vector3<double>>& kvec_d,
-                                      const UnitCell& ucell,
-                                      const LCAO_Orbitals& orb,
-                                      const Grid_Driver& GridD)
-{
-    ModuleBase::TITLE("LCAO_Deepks", "calc_v_delta_precalc");
-    // timeval t_start;
-    // gettimeofday(&t_start,NULL);
-
-    std::vector<torch::Tensor> gevdm;
-    this->cal_gevdm(nat, gevdm);
-    const double Rcut_Alpha = orb.Alpha[0].getRcut();
-    this->init_v_delta_pdm_shell(nks, nlocal);
-
-    for (int T0 = 0; T0 < ucell.ntype; T0++)
-    {
-        Atom* atom0 = &ucell.atoms[T0];
-
-        for (int I0 = 0; I0 < atom0->na; I0++)
-        {
-            const int iat = ucell.itia2iat(T0, I0);
-            const ModuleBase::Vector3<double> tau0 = atom0->tau[I0];
-            GridD.Find_atom(ucell, atom0->tau[I0], T0, I0);
-
-            for (int ad1 = 0; ad1 < GridD.getAdjacentNum() + 1; ++ad1)
-            {
-                const int T1 = GridD.getType(ad1);
-                const int I1 = GridD.getNatom(ad1);
-                const int ibt1 = ucell.itia2iat(T1, I1);
-                const int start1 = ucell.itiaiw2iwt(T1, I1, 0);
-                const ModuleBase::Vector3<double> tau1 = GridD.getAdjacentTau(ad1);
-                const Atom* atom1 = &ucell.atoms[T1];
-                const int nw1_tot = atom1->nw * PARAM.globalv.npol;
-                const double Rcut_AO1 = orb.Phi[T1].getRcut();
-
-                const double dist1 = (tau1 - tau0).norm() * ucell.lat0;
-                if (dist1 >= Rcut_Alpha + Rcut_AO1)
-                {
-                    continue;
-                }
-
-                ModuleBase::Vector3<int> dR1(GridD.getBox(ad1).x, GridD.getBox(ad1).y, GridD.getBox(ad1).z);
-
-                if constexpr (std::is_same<TK, std::complex<double>>::value)
-                {
-                    if (this->phialpha[0]->find_matrix(iat, ibt1, dR1.x, dR1.y, dR1.z) == nullptr)
-                    {
-                        continue;
-                    }
-                }
-
-                for (int ad2 = 0; ad2 < GridD.getAdjacentNum() + 1; ad2++)
-                {
-                    const int T2 = GridD.getType(ad2);
-                    const int I2 = GridD.getNatom(ad2);
-                    const int ibt2 = ucell.itia2iat(T2, I2);
-                    const int start2 = ucell.itiaiw2iwt(T2, I2, 0);
-                    const ModuleBase::Vector3<double> tau2 = GridD.getAdjacentTau(ad2);
-                    const Atom* atom2 = &ucell.atoms[T2];
-                    const int nw2_tot = atom2->nw * PARAM.globalv.npol;
-
-                    const double Rcut_AO2 = orb.Phi[T2].getRcut();
-                    const double dist2 = (tau2 - tau0).norm() * ucell.lat0;
-
-                    if (dist2 >= Rcut_Alpha + Rcut_AO2)
-                    {
-                        continue;
-                    }
-
-                    ModuleBase::Vector3<int> dR2(GridD.getBox(ad2).x, GridD.getBox(ad2).y, GridD.getBox(ad2).z);
-
-                    if constexpr (std::is_same<TK, std::complex<double>>::value)
-                    {
-                        if (this->phialpha[0]->find_matrix(iat, ibt2, dR2.x, dR2.y, dR2.z) == nullptr)
-                        {
-                            continue;
-                        }
-                    }
-
-                    for (int iw1 = 0; iw1 < nw1_tot; ++iw1)
-                    {
-                        const int iw1_all = start1 + iw1; // this is \mu
-                        const int iw1_local = pv->global2local_row(iw1_all);
-                        if (iw1_local < 0)
-                        {
-                            continue;
-                        }
-                        for (int iw2 = 0; iw2 < nw2_tot; ++iw2)
-                        {
-                            const int iw2_all = start2 + iw2; // this is \nu
-                            const int iw2_local = pv->global2local_col(iw2_all);
-                            if (iw2_local < 0)
-                            {
-                                continue;
-                            }
-
-                            hamilt::BaseMatrix<double>* overlap_1 = phialpha[0]->find_matrix(iat, ibt1, dR1);
-                            hamilt::BaseMatrix<double>* overlap_2 = phialpha[0]->find_matrix(iat, ibt2, dR2);
-                            assert(overlap_1->get_col_size() == overlap_2->get_col_size());
-
-                            for (int ik = 0; ik < nks; ik++)
-                            {
-                                int ib = 0;
-                                std::complex<double> kphase = std::complex<double>(1.0, 0.0);
-                                if constexpr (std::is_same<TK, std::complex<double>>::value)
-                                {
-                                    const double arg
-                                        = -(kvec_d[ik] * ModuleBase::Vector3<double>(dR1 - dR2)) * ModuleBase::TWO_PI;
-                                    double sinp, cosp;
-                                    ModuleBase::libm::sincos(arg, &sinp, &cosp);
-                                    kphase = std::complex<double>(cosp, sinp);
-                                }
-                                for (int L0 = 0; L0 <= orb.Alpha[0].getLmax(); ++L0)
-                                {
-                                    for (int N0 = 0; N0 < orb.Alpha[0].getNchi(L0); ++N0)
-                                    {
-                                        const int inl = this->inl_index[T0](I0, L0, N0);
-                                        const int nm = 2 * L0 + 1;
-
-                                        for (int m1 = 0; m1 < nm; ++m1) // nm = 1 for s, 3 for p, 5 for d
-                                        {
-                                            for (int m2 = 0; m2 < nm; ++m2) // nm = 1 for s, 3 for p, 5 for d
-                                            {
-                                                if constexpr (std::is_same<TK, double>::value)
-                                                {
-                                                    this->v_delta_pdm_shell[ik][iw1_all][iw2_all][inl][m1 * nm + m2]
-                                                        += overlap_1->get_value(iw1, ib + m1)
-                                                           * overlap_2->get_value(iw2, ib + m2);
-                                                }
-                                                else
-                                                {
-                                                    this->v_delta_pdm_shell_complex[ik][iw1_all][iw2_all][inl]
-                                                                                   [m1 * nm + m2]
-                                                        += overlap_1->get_value(iw1, ib + m1)
-                                                           * overlap_2->get_value(iw2, ib + m2) * kphase;
-                                                }
-                                            }
-                                        }
-                                        ib += nm;
-                                    }
-                                }
-                            } // ik
-                        }     // iw2
-                    }         // iw1
-                }             // ad2
-            }                 // ad1
-        }
-    }
-#ifdef __MPI
-    const int mn_size = (2 * this->lmaxd + 1) * (2 * this->lmaxd + 1);
-    for (int ik = 0; ik < nks; ik++)
-    {
-        for (int inl = 0; inl < this->inlmax; inl++)
-        {
-            for (int mu = 0; mu < nlocal; mu++)
-            {
-                for (int nu = 0; nu < nlocal; nu++)
-                {
-                    if constexpr (std::is_same<TK, double>::value)
-                    {
-                        Parallel_Reduce::reduce_all(this->v_delta_pdm_shell[ik][mu][nu][inl], mn_size);
-                    }
-                    else
-                    {
-                        Parallel_Reduce::reduce_all(this->v_delta_pdm_shell_complex[ik][mu][nu][inl], mn_size);
-                    }
-                }
-            }
-        }
-    }
-#endif
-
-    // transfer v_delta_pdm_shell to v_delta_pdm_shell_vector
-
-    int nlmax = this->inlmax / nat;
-
-    std::vector<torch::Tensor> v_delta_pdm_shell_vector;
-    for (int nl = 0; nl < nlmax; ++nl)
-    {
-        std::vector<torch::Tensor> kuuammv;
-        for (int iks = 0; iks < nks; ++iks)
-        {
-            std::vector<torch::Tensor> uuammv;
-            for (int mu = 0; mu < nlocal; ++mu)
-            {
-                std::vector<torch::Tensor> uammv;
-                for (int nu = 0; nu < nlocal; ++nu)
-                {
-                    std::vector<torch::Tensor> ammv;
-                    for (int iat = 0; iat < nat; ++iat)
-                    {
-                        int inl = iat * nlmax + nl;
-                        int nm = 2 * this->inl_l[inl] + 1;
-                        std::vector<TK> mmv;
-
-                        for (int m1 = 0; m1 < nm; ++m1) // m1 = 1 for s, 3 for p, 5 for d
-                        {
-                            for (int m2 = 0; m2 < nm; ++m2) // m1 = 1 for s, 3 for p, 5 for d
-                            {
-                                if constexpr (std::is_same<TK, double>::value)
-                                {
-                                    mmv.push_back(this->v_delta_pdm_shell[iks][mu][nu][inl][m1 * nm + m2]);
-                                }
-                                else
-                                {
-                                    mmv.push_back(this->v_delta_pdm_shell_complex[iks][mu][nu][inl][m1 * nm + m2]);
-                                }
-                            }
-                        }
-                        if constexpr (std::is_same<TK, double>::value)
-                        {
-                            torch::Tensor mm = torch::tensor(mmv, torch::TensorOptions().dtype(torch::kFloat64))
-                                                   .reshape({nm, nm}); // nm*nm
-                            ammv.push_back(mm);
-                        }
-                        else
-                        {
-                            torch::Tensor mm = torch::from_blob(mmv.data(),
-                                                                {nm, nm},
-                                                                torch::TensorOptions().dtype(torch::kComplexDouble))
-                                                   .clone(); // nm*nm
-                            ammv.push_back(mm);
-                        }
-                    }
-                    torch::Tensor amm = torch::stack(ammv, 0);
-                    uammv.push_back(amm);
-                }
-                torch::Tensor uamm = torch::stack(uammv, 0);
-                uuammv.push_back(uamm);
-            }
-            torch::Tensor uuamm = torch::stack(uuammv, 0);
-            kuuammv.push_back(uuamm);
-        }
-        torch::Tensor kuuamm = torch::stack(kuuammv, 0);
-        v_delta_pdm_shell_vector.push_back(kuuamm);
-    }
-
-    assert(v_delta_pdm_shell_vector.size() == nlmax);
-
-    // einsum for each nl:
-    std::vector<torch::Tensor> v_delta_precalc_vector;
-    for (int nl = 0; nl < nlmax; ++nl)
-    {
-        if constexpr (std::is_same<TK, double>::value)
-        {
-            v_delta_precalc_vector.push_back(
-                at::einsum("kxyamn, avmn->kxyav", {v_delta_pdm_shell_vector[nl], gevdm[nl]}));
-        }
-        else
-        {
-            torch::Tensor gevdm_complex = gevdm[nl].to(torch::kComplexDouble);
-            v_delta_precalc_vector.push_back(
-                at::einsum("kxyamn, avmn->kxyav", {v_delta_pdm_shell_vector[nl], gevdm_complex}));
-        }
-    }
-
-    this->v_delta_precalc_tensor = torch::cat(v_delta_precalc_vector, -1);
-    this->del_v_delta_pdm_shell(nks, nlocal);
-
-    // check_v_delta_precalc(nlocal,nat);
-    //  timeval t_end;
-    //  gettimeofday(&t_end,NULL);
-    //  std::cout<<"calculate v_delta_precalc time:\t"<<(double)(t_end.tv_sec-t_start.tv_sec) +
-    //  (double)(t_end.tv_usec-t_start.tv_usec)/1000000.0<<std::endl;
-    return;
-}
-
-template <typename TK>
-void LCAO_Deepks::check_v_delta_precalc(const int nat, const int nks, const int nlocal)
-{
-    std::ofstream ofs("v_delta_precalc.dat");
-    ofs << std::setprecision(10);
-    for (int iks = 0; iks < nks; ++iks)
-    {
-        for (int mu = 0; mu < nlocal; ++mu)
-        {
-            for (int nu = 0; nu < nlocal; ++nu)
-            {
-                for (int iat = 0; iat < nat; ++iat)
-                {
-                    for (int p = 0; p < this->des_per_atom; ++p)
-                    {
-                        if constexpr (std::is_same<TK, double>::value)
-                        {
-                            ofs << this->v_delta_precalc_tensor.index({iks, mu, nu, iat, p}).item().toDouble() << " ";
-                        }
-                        else
-                        {
-                            auto tensor_value = this->v_delta_precalc_tensor.index({iks, mu, nu, iat, p});
-                            ofs << std::complex<double>(torch::real(tensor_value).item<double>(),
-                                                        torch::imag(tensor_value).item<double>())
-                                << " ";
-                        }
-                    }
-                }
-                ofs << std::endl;
-            }
-        }
-    }
-    ofs.close();
-}
-
-template void LCAO_Deepks::cal_v_delta_precalc<double>(const int nlocal,
-                                                       const int nat,
-                                                       const int nks,
-                                                       const std::vector<ModuleBase::Vector3<double>>& kvec_d,
-                                                       const UnitCell& ucell,
-                                                       const LCAO_Orbitals& orb,
-                                                       const Grid_Driver& GridD);
-
-template void LCAO_Deepks::cal_v_delta_precalc<std::complex<double>>(
-    const int nlocal,
-    const int nat,
-    const int nks,
-    const std::vector<ModuleBase::Vector3<double>>& kvec_d,
-    const UnitCell& ucell,
-    const LCAO_Orbitals& orb,
-    const Grid_Driver& GridD);
-
-template void LCAO_Deepks::check_v_delta_precalc<double>(const int nat, const int nks, const int nlocal);
-template void LCAO_Deepks::check_v_delta_precalc<std::complex<double>>(const int nat, const int nks, const int nlocal);
-
-#endif