diff --git a/source/Makefile.Objects b/source/Makefile.Objects
index a04e737697..79c24632a7 100644
--- a/source/Makefile.Objects
+++ b/source/Makefile.Objects
@@ -208,6 +208,7 @@ OBJS_DEEPKS=LCAO_deepks.o\
         cal_gvx.o\
         cal_descriptor.o\
         v_delta_precalc.o\
+        v_delta_precalc_k.o\
         
 
 OBJS_ELECSTAT=elecstate.o\
diff --git a/source/module_esolver/esolver_ks_lcao.cpp b/source/module_esolver/esolver_ks_lcao.cpp
index e6a4ab6632..c7f73e23e8 100644
--- a/source/module_esolver/esolver_ks_lcao.cpp
+++ b/source/module_esolver/esolver_ks_lcao.cpp
@@ -1000,7 +1000,7 @@ void ESolver_KS_LCAO<TK, TR>::after_scf(UnitCell& ucell, const int istep)
 #ifdef __DEEPKS
             if (PARAM.inp.deepks_out_labels && PARAM.inp.deepks_v_delta)
             {
-                DeePKS_domain::save_h_mat(h_mat.p, this->pv.nloc);
+                DeePKS_domain::save_h_mat(h_mat.p, this->pv.nloc, ik);
             }
 #endif
         }
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_allocate.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_allocate.cpp
index 2b9969554e..c7b7e6ee10 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_allocate.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_allocate.cpp
@@ -27,6 +27,7 @@ void divide_HS_in_frag(const bool isGamma, const UnitCell& ucell, Parallel_Orbit
         GlobalC::ld.init(orb,
                          ucell.nat,
                          ucell.ntype,
+                         nks,
                          pv,
                          na);
 
diff --git a/source/module_hamilt_lcao/module_deepks/CMakeLists.txt b/source/module_hamilt_lcao/module_deepks/CMakeLists.txt
index fcebebdaeb..e7e5110c6f 100644
--- a/source/module_hamilt_lcao/module_deepks/CMakeLists.txt
+++ b/source/module_hamilt_lcao/module_deepks/CMakeLists.txt
@@ -20,6 +20,7 @@ if(ENABLE_DEEPKS)
       cal_gvx.cpp      
       cal_descriptor.cpp
       v_delta_precalc.cpp
+      v_delta_precalc_k.cpp
   )
 
   add_library(
diff --git a/source/module_hamilt_lcao/module_deepks/LCAO_deepks.cpp b/source/module_hamilt_lcao/module_deepks/LCAO_deepks.cpp
index 0c6701e83d..84eb2d6e6b 100644
--- a/source/module_hamilt_lcao/module_deepks/LCAO_deepks.cpp
+++ b/source/module_hamilt_lcao/module_deepks/LCAO_deepks.cpp
@@ -74,6 +74,7 @@ void LCAO_Deepks::init(
     const LCAO_Orbitals& orb,
     const int nat,
     const int ntype,
+    const int nks,
     const Parallel_Orbitals& pv_in,
     std::vector<int> na)
 {
@@ -97,13 +98,13 @@ void LCAO_Deepks::init(
     this->nmaxd = nm;
     
     GlobalV::ofs_running << " lmax of descriptor = " << this->lmaxd << std::endl;
-    GlobalV::ofs_running << " nmax of descriptor= " << nmaxd << std::endl;
+    GlobalV::ofs_running << " nmax of descriptor = " << nmaxd << std::endl;
 
     int pdm_size = 0;
     this->inlmax = tot_inl;
     if(!PARAM.inp.deepks_equiv)
     {
-        GlobalV::ofs_running << " total basis (all atoms) for descriptor= " << std::endl;
+        GlobalV::ofs_running << " total basis (all atoms) for descriptor = " << std::endl;
 
         //init pdm**
         pdm_size = (this->lmaxd * 2 + 1) * (this->lmaxd * 2 + 1);
@@ -150,6 +151,15 @@ void LCAO_Deepks::init(
             int nloc=this->pv->nloc;
             this->h_mat.resize(nloc,0.0);
         }
+        else
+        {
+            int nloc=this->pv->nloc;
+            this->h_mat_k.resize(nks);
+            for (int ik = 0; ik < nks; ik++)
+            {
+                this->h_mat_k[ik].resize(nloc,std::complex<double>(0.0,0.0));
+            }
+        }
     }
 
     return;
@@ -431,27 +441,51 @@ void LCAO_Deepks::del_orbital_pdm_shell(const int nks)
 
 void LCAO_Deepks::init_v_delta_pdm_shell(const int nks,const int nlocal)
 {
-    
-    this->v_delta_pdm_shell = new double**** [nks];
-
     const int mn_size=(2 * this->lmaxd + 1) * (2 * this->lmaxd + 1);
-    for (int iks=0; iks<nks; iks++)
-    {
-        this->v_delta_pdm_shell[iks] = new double*** [nlocal];
+    if (nks==1){
+        this->v_delta_pdm_shell = new double**** [nks];
+        for (int iks=0; iks<nks; iks++)
+        {
+            this->v_delta_pdm_shell[iks] = new double*** [nlocal];
 
-        for (int mu=0; mu<nlocal; mu++)
+            for (int mu=0; mu<nlocal; mu++)
+            {
+                this->v_delta_pdm_shell[iks][mu] = new double** [nlocal];
+
+                for (int nu=0; nu<nlocal; nu++)
+                {
+                    this->v_delta_pdm_shell[iks][mu][nu] = new double* [this->inlmax];
+
+                    for(int inl = 0; inl < this->inlmax; inl++)
+                    {
+                        this->v_delta_pdm_shell[iks][mu][nu][inl] = new double [mn_size];
+                        ModuleBase::GlobalFunc::ZEROS(v_delta_pdm_shell[iks][mu][nu][inl], mn_size);
+                    }                
+                }
+            }
+        }
+    }
+    else
+    {
+        this->v_delta_pdm_shell_complex = new std::complex<double>**** [nks];
+        for (int iks=0; iks<nks; iks++)
         {
-            this->v_delta_pdm_shell[iks][mu] = new double** [nlocal];
+            this->v_delta_pdm_shell_complex[iks] = new std::complex<double>*** [nlocal];
 
-            for (int nu=0; nu<nlocal; nu++)
+            for (int mu=0; mu<nlocal; mu++)
             {
-                this->v_delta_pdm_shell[iks][mu][nu] = new double* [this->inlmax];
+                this->v_delta_pdm_shell_complex[iks][mu] = new std::complex<double>** [nlocal];
 
-                for(int inl = 0; inl < this->inlmax; inl++)
+                for (int nu=0; nu<nlocal; nu++)
                 {
-                    this->v_delta_pdm_shell[iks][mu][nu][inl] = new double [mn_size];
-                    ModuleBase::GlobalFunc::ZEROS(v_delta_pdm_shell[iks][mu][nu][inl], mn_size);
-                }                
+                    this->v_delta_pdm_shell_complex[iks][mu][nu] = new std::complex<double>* [this->inlmax];
+
+                    for(int inl = 0; inl < this->inlmax; inl++)
+                    {
+                        this->v_delta_pdm_shell_complex[iks][mu][nu][inl] = new std::complex<double> [mn_size];
+                        ModuleBase::GlobalFunc::ZEROS(v_delta_pdm_shell_complex[iks][mu][nu][inl], mn_size);
+                    }                
+                }
             }
         }
     }
@@ -461,23 +495,46 @@ void LCAO_Deepks::init_v_delta_pdm_shell(const int nks,const int nlocal)
 
 void LCAO_Deepks::del_v_delta_pdm_shell(const int nks,const int nlocal)
 {
-    for (int iks=0; iks<nks; iks++)
+    if (nks==1)
     {
-        for (int mu=0; mu<nlocal; mu++)
+        for (int iks=0; iks<nks; iks++)
         {
-            for (int nu=0; nu<nlocal; nu++)
+            for (int mu=0; mu<nlocal; mu++)
             {
-                for (int inl = 0;inl < this->inlmax; inl++)
+                for (int nu=0; nu<nlocal; nu++)
                 {
-                    delete[] this->v_delta_pdm_shell[iks][mu][nu][inl];
+                    for (int inl = 0;inl < this->inlmax; inl++)
+                    {
+                        delete[] this->v_delta_pdm_shell[iks][mu][nu][inl];
+                    }
+                    delete[] this->v_delta_pdm_shell[iks][mu][nu];                
+                }
+                delete[] this->v_delta_pdm_shell[iks][mu]; 
+            }
+            delete[] this->v_delta_pdm_shell[iks]; 
+        }
+        delete[] this->v_delta_pdm_shell;    
+    }
+    else
+    {
+        for (int iks=0; iks<nks; iks++)
+        {
+            for (int mu=0; mu<nlocal; mu++)
+            {
+                for (int nu=0; nu<nlocal; nu++)
+                {
+                    for (int inl = 0;inl < this->inlmax; inl++)
+                    {
+                        delete[] this->v_delta_pdm_shell_complex[iks][mu][nu][inl];
+                    }
+                    delete[] this->v_delta_pdm_shell_complex[iks][mu][nu];                
                 }
-                delete[] this->v_delta_pdm_shell[iks][mu][nu];                
+                delete[] this->v_delta_pdm_shell_complex[iks][mu]; 
             }
-            delete[] this->v_delta_pdm_shell[iks][mu]; 
+            delete[] this->v_delta_pdm_shell_complex[iks]; 
         }
-        delete[] this->v_delta_pdm_shell[iks]; 
+        delete[] this->v_delta_pdm_shell_complex;    
     }
-    delete[] this->v_delta_pdm_shell;    
 
     return;
 }
diff --git a/source/module_hamilt_lcao/module_deepks/LCAO_deepks.h b/source/module_hamilt_lcao/module_deepks/LCAO_deepks.h
index cd8d899a65..57b3609059 100644
--- a/source/module_hamilt_lcao/module_deepks/LCAO_deepks.h
+++ b/source/module_hamilt_lcao/module_deepks/LCAO_deepks.h
@@ -54,8 +54,11 @@ class LCAO_Deepks
     ///\rho_{HL} = c_{L, \mu}c_{L,\nu} - c_{H, \mu}c_{H,\nu} \f$ (for gamma_only)
     ModuleBase::matrix o_delta;
 
-    ///(Unit: Ry) Hamiltonian matrix
+    ///(Unit: Ry) Hamiltonian matrix in k space
+    /// for gamma only
     std::vector<double> h_mat;    
+    /// for multi-k
+    std::vector<std::vector<std::complex<double>>> h_mat_k;
 
     /// Correction term to the Hamiltonian matrix: \f$\langle\psi|V_\delta|\psi\rangle\f$ (for gamma only)
     std::vector<double> H_V_delta;
@@ -159,13 +162,14 @@ class LCAO_Deepks
     // dD/dX, tensor form of gdmx
     std::vector<torch::Tensor> gdmr_vector;
 
-    // orbital_pdm_shell:[1,Inl,nm*nm]; \langle \phi_\mu|\alpha\rangle\langle\alpha|\phi_\nu\rnalge
+    // orbital_pdm_shell:[1,Inl,nm*nm]; \langle \phi_\mu|\alpha\rangle\langle\alpha|\phi_\nu\ranlge
     double**** orbital_pdm_shell;
     // orbital_precalc:[1,NAt,NDscrpt]; gvdm*orbital_pdm_shell
     torch::Tensor orbital_precalc_tensor;
 
     // v_delta_pdm_shell[nks,nlocal,nlocal,Inl,nm*nm] = overlap * overlap
     double***** v_delta_pdm_shell;
+    std::complex<double>***** v_delta_pdm_shell_complex; // for multi-k
     // v_delta_precalc[nks,nlocal,nlocal,NAt,NDscrpt] = gvdm * v_delta_pdm_shell;
     torch::Tensor v_delta_precalc_tensor;
     //for v_delta==2 , new v_delta_precalc storage method
@@ -220,6 +224,7 @@ class LCAO_Deepks
     void init(const LCAO_Orbitals& orb,
               const int nat,
               const int ntype,
+              const int nks,
               const Parallel_Orbitals& pv_in,
               std::vector<int> na);
 
@@ -437,12 +442,15 @@ class LCAO_Deepks
     // 11. cal_orbital_precalc_k : orbital_precalc is usted for training with orbital label,
     //                          for multi-k case, which equals gvdm * orbital_pdm_shell,
     //                          orbital_pdm_shell[1,Inl,nm*nm] = dm_hl_k * overlap * overlap
-    //12. cal_v_delta_precalc : v_delta_precalc is used for training with v_delta label,
+    // 12. cal_v_delta_precalc : v_delta_precalc is used for training with v_delta label,
     //                         which equals gvdm * v_delta_pdm_shell,
     //                         v_delta_pdm_shell = overlap * overlap
-    //13. check_v_delta_precalc : check v_delta_precalc
-    //14. prepare_psialpha : prepare psialpha for outputting npy file
-    //15. prepare_gevdm : prepare gevdm for outputting npy file
+    // 13. cal_v_delta_precalc_k : v_delta_precalc is used for training with v_delta label,
+    //                         for multi-k case, which equals ???
+    //                         ???
+    // 14. check_v_delta_precalc : check v_delta_precalc
+    // 15. prepare_psialpha : prepare psialpha for outputting npy file
+    // 16. prepare_gevdm : prepare gevdm for outputting npy file
 
   public:
     /// Calculates descriptors
@@ -500,6 +508,14 @@ class LCAO_Deepks
         const LCAO_Orbitals &orb,
         Grid_Driver &GridD);
 
+    void cal_v_delta_precalc_k(const int nlocal,
+        const int nat,
+        const int nks,
+        const std::vector<ModuleBase::Vector3<double>> &kvec_d,
+        const UnitCell &ucell,
+        const LCAO_Orbitals &orb,
+        Grid_Driver &GridD);
+
     void check_v_delta_precalc(const int nat, const int nks,const int nlocal);
 
     // prepare psialpha for outputting npy file
@@ -508,6 +524,13 @@ class LCAO_Deepks
         const UnitCell &ucell,
         const LCAO_Orbitals &orb,
         Grid_Driver &GridD);
+    void prepare_psialpha_k(const int nlocal,
+        const int nat,
+        const int nks,
+        const std::vector<ModuleBase::Vector3<double>> &kvec_d,
+        const UnitCell &ucell,
+        const LCAO_Orbitals &orb,
+        Grid_Driver &GridD);
     void check_vdp_psialpha(const int nat, const int nks, const int nlocal);
     
     // prepare gevdm for outputting npy file
diff --git a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_interface.cpp b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_interface.cpp
index 8b10d55d41..52d3811692 100644
--- a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_interface.cpp
+++ b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_interface.cpp
@@ -291,7 +291,7 @@ void LCAO_Deepks_Interface::out_deepks_labels(const double& etot,
 
             if (PARAM.inp.deepks_scf)
             {
-                int nocc = PARAM.inp.nelec / 2;
+                int nocc = PARAM.inp.nelec / 2; // redundant!
                 ModuleBase::matrix wg_hl;
                 wg_hl.create(nks, PARAM.inp.nbands);
                 std::vector<std::vector<ModuleBase::ComplexMatrix>> dm_bandgap_k;
@@ -333,7 +333,94 @@ void LCAO_Deepks_Interface::out_deepks_labels(const double& etot,
         } // end bandgap label
         if(deepks_v_delta)
         {
-            ModuleBase::WARNING_QUIT("ESolver_KS_LCAO", "V_delta label has not been developed for multi-k now!");
+            std::vector<ModuleBase::ComplexMatrix> h_tot(nks);
+            for (int ik = 0; ik < nks; ik++)
+            {
+                h_tot[ik].create(nlocal, nlocal);
+            }
+
+            DeePKS_domain::collect_h_mat(*ParaV, ld->h_mat_k,h_tot,nlocal,nks);
+
+            const std::string file_htot = PARAM.globalv.global_out_dir + "deepks_htot.npy";
+            LCAO_deepks_io::save_npy_h(h_tot, file_htot, nlocal, nks, my_rank);
+
+            if(PARAM.inp.deepks_scf)
+            {
+                std::vector<ModuleBase::ComplexMatrix> v_delta(nks);
+                std::vector<ModuleBase::ComplexMatrix> hbase(nks);
+                for (int ik = 0; ik < nks; ik++)
+                {
+                    v_delta[ik].create(nlocal, nlocal);
+                    hbase[ik].create(nlocal, nlocal);
+                }
+                DeePKS_domain::collect_h_mat(*ParaV, ld->H_V_delta_k,v_delta,nlocal,nks); 
+
+                const std::string file_hbase = PARAM.globalv.global_out_dir + "deepks_hbase.npy";
+                for (int ik = 0; ik < nks; ik++)
+                {
+                    hbase[ik] = h_tot[ik] - v_delta[ik];
+                }
+                LCAO_deepks_io::save_npy_h(hbase, file_hbase, nlocal, nks, my_rank);
+
+                const std::string file_vdelta = PARAM.globalv.global_out_dir + "deepks_vdelta.npy";
+                LCAO_deepks_io::save_npy_h(v_delta, file_vdelta, nlocal, nks, my_rank);
+
+                if(deepks_v_delta==1)//v_delta_precalc storage method 1
+                {
+                    ld->cal_v_delta_precalc_k(nlocal,
+                            nat,
+                            nks,
+                            kvec_d,
+                            ucell,
+                            orb,
+                            GridD);
+                
+                    LCAO_deepks_io::save_npy_v_delta_precalc(
+                      nat, 
+                      nks, 
+                      nlocal,
+                      ld->des_per_atom,
+                      ld->v_delta_precalc_tensor,
+                      PARAM.globalv.global_out_dir,
+                      my_rank);
+
+                }
+                else if(deepks_v_delta==2)//v_delta_precalc storage method 2
+                {
+                    ld->prepare_psialpha_k(nlocal,
+                                nat,
+                                nks,
+                                kvec_d,
+                                ucell,
+                                orb,
+                                GridD);
+                    
+                    LCAO_deepks_io::save_npy_psialpha(nat, 
+                                nks, 
+                                nlocal,
+                                ld->inlmax,
+                                ld->lmaxd,
+                                ld->psialpha_tensor,
+                                PARAM.globalv.global_out_dir,
+                                my_rank);
+
+                    ld->prepare_gevdm(
+                                nat,
+                                orb);
+
+                    LCAO_deepks_io::save_npy_gevdm(nat,
+                      ld->inlmax,
+                      ld->lmaxd,
+                      ld->gevdm_tensor,
+                      PARAM.globalv.global_out_dir,
+                      my_rank);
+                }
+            }
+            else //deepks_scf == 0
+            {
+                const std::string file_hbase = PARAM.globalv.global_out_dir + "deepks_hbase.npy";
+                LCAO_deepks_io::save_npy_h(h_tot, file_hbase, nlocal, nks, my_rank);
+            }
         }
     }  // end deepks_out_labels
 
diff --git a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_io.cpp b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_io.cpp
index 05ab39b555..dd7d39227f 100644
--- a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_io.cpp
+++ b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_io.cpp
@@ -444,6 +444,38 @@ void LCAO_deepks_io::save_npy_h(const ModuleBase::matrix &hamilt,
     return;    
 }
 
+// for multi-k, should be combined with gamma-only version in future
+void LCAO_deepks_io::save_npy_h(const std::vector<ModuleBase::ComplexMatrix> &hamilt,
+                                const std::string &h_file,
+                                const int nlocal,
+                                const int nks,
+                                const int rank)
+{
+    ModuleBase::TITLE("LCAO_deepks_io", "save_npy_h");
+	if(rank!=0)
+	{
+		return;
+	}
+
+    const long unsigned hshape[] = {static_cast<unsigned long>(nks),
+                                    static_cast<unsigned long>(nlocal), 
+                                    static_cast<unsigned long>(nlocal) };
+
+    std::vector<std::complex<double>> npy_h;
+    for(int k=0; k<nks; k++)
+    {
+        for (int i=0; i<nlocal; i++)
+        {
+            for (int j=0; j<nlocal; j++)
+            {
+                npy_h.push_back(hamilt[k](i,j));
+            }
+        }         
+    }
+
+    npy::SaveArrayAsNumpy(h_file, false, 3, hshape, npy_h);
+    return;    
+}
 
 void LCAO_deepks_io::save_npy_v_delta_precalc(const int nat, 
                                               const int nks,
@@ -468,28 +500,57 @@ void LCAO_deepks_io::save_npy_v_delta_precalc(const int nat,
                                     static_cast<unsigned long>(nlocal),
                                     static_cast<unsigned long>(nat),
                                     static_cast<unsigned long>(des_per_atom)};
-
-    std::vector<double> npy_v_delta_precalc;
-
-    for (int iks = 0; iks < nks; ++iks)
+    if (nks==1)
     {
-        for (int mu = 0; mu < nlocal; ++mu)
+        std::vector<double> npy_v_delta_precalc;    
+        for (int iks = 0; iks < nks; ++iks)
         {
-            for (int nu = 0; nu < nlocal; ++nu)
+            for (int mu = 0; mu < nlocal; ++mu)
             {
-                for (int iat = 0;iat < nat;++iat)
+                for (int nu = 0; nu < nlocal; ++nu)
                 {
-                    for(int p=0; p<des_per_atom; ++p)
+                    for (int iat = 0;iat < nat;++iat)
                     {
-                        npy_v_delta_precalc.push_back(v_delta_precalc_tensor.index({iks, mu, nu, iat, p }).item().toDouble());
-                    }
-                }                
+                        for(int p=0; p<des_per_atom; ++p)
+                        {
+                            npy_v_delta_precalc.push_back(v_delta_precalc_tensor.index({iks, mu, nu, iat, p }).item().toDouble());
+                        }
+                    }                
+                }
             }
         }
+        const std::string file_vdpre = out_dir + "deepks_vdpre.npy";
+        npy::SaveArrayAsNumpy(file_vdpre, false, 5, gshape, npy_v_delta_precalc);
+        return;
     }
-    const std::string file_vdpre = out_dir + "deepks_vdpre.npy";
-    npy::SaveArrayAsNumpy(file_vdpre, false, 5, gshape, npy_v_delta_precalc);
-    return;
+    else
+    {
+        std::vector<std::complex<double>> npy_v_delta_precalc;
+        for (int iks = 0; iks < nks; ++iks)
+        {
+            for (int mu = 0; mu < nlocal; ++mu)
+            {
+                for (int nu = 0; nu < nlocal; ++nu)
+                {
+                    for (int iat = 0;iat < nat;++iat)
+                    {
+                        for(int p=0; p<des_per_atom; ++p)
+                        {
+                            auto real_part = torch::real(v_delta_precalc_tensor.index({iks, mu, nu, iat, p})).item<double>();
+                            auto imag_part = torch::imag(v_delta_precalc_tensor.index({iks, mu, nu, iat, p})).item<double>();
+                            std::complex<double> value(real_part, imag_part);
+                            npy_v_delta_precalc.push_back(value);
+                        }
+                    }                
+                }
+            }
+        }
+        const std::string file_vdpre = out_dir + "deepks_vdpre.npy";
+        npy::SaveArrayAsNumpy(file_vdpre, false, 5, gshape, npy_v_delta_precalc);
+        return;
+    }
+
+    
 }
 
 
@@ -517,26 +578,55 @@ void LCAO_deepks_io::save_npy_psialpha(const int nat,
                                     static_cast<unsigned long>(nks),
                                     static_cast<unsigned long>(nlocal),
                                     static_cast<unsigned long>(mmax)};
-    std::vector<double> npy_psialpha;
-    for(int iat=0; iat< nat ; iat++) 
+    if(nks==1)
     {
-        for(int nl = 0; nl < nlmax; nl++)
+        std::vector<double> npy_psialpha;
+        for(int iat=0; iat< nat ; iat++) 
         {
-            for (int iks = 0; iks < nks ; iks++)
+            for(int nl = 0; nl < nlmax; nl++)
             {
-                for(int mu = 0; mu < nlocal ; mu++)
+                for (int iks = 0; iks < nks ; iks++)
                 {
-                    for(int m=0; m< mmax; m++)
+                    for(int mu = 0; mu < nlocal ; mu++)
                     {
-                        npy_psialpha.push_back(psialpha_tensor.index({ iat,nl, iks, mu, m }).item().toDouble());
-                    }
-                }                
+                        for(int m=0; m< mmax; m++)
+                        {
+                            npy_psialpha.push_back(psialpha_tensor.index({ iat,nl, iks, mu, m }).item().toDouble());
+                        }
+                    }                
+                }
             }
         }
+        const std::string file_psialpha = out_dir + "deepks_psialpha.npy";
+        npy::SaveArrayAsNumpy(file_psialpha, false, 5, gshape, npy_psialpha);
+        return;
     }
-    const std::string file_psialpha = out_dir + "deepks_psialpha.npy";
-    npy::SaveArrayAsNumpy(file_psialpha, false, 5, gshape, npy_psialpha);
-    return;
+    else
+    {
+        std::vector<std::complex<double>> npy_psialpha;
+        for(int iat=0; iat< nat ; iat++) 
+        {
+            for(int nl = 0; nl < nlmax; nl++)
+            {
+                for (int iks = 0; iks < nks ; iks++)
+                {
+                    for(int mu = 0; mu < nlocal ; mu++)
+                    {
+                        for(int m=0; m< mmax; m++)
+                        {
+                            std::complex<double> value(torch::real(psialpha_tensor.index({ iat,nl, iks, mu, m })).item<double>(), 
+                                                       torch::imag(psialpha_tensor.index({ iat,nl, iks, mu, m })).item<double>());
+                            npy_psialpha.push_back(value);
+                        }
+                    }                
+                }
+            }
+        }
+        const std::string file_psialpha = out_dir + "deepks_psialpha.npy";
+        npy::SaveArrayAsNumpy(file_psialpha, false, 5, gshape, npy_psialpha);
+        return;
+    }
+
 }
 
 
diff --git a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_io.h b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_io.h
index d20a7bf113..8f88fa20d4 100644
--- a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_io.h
+++ b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_io.h
@@ -7,6 +7,7 @@
 #include <vector>
 #include "module_base/tool_title.h"
 #include "module_base/matrix.h"
+#include "module_base/complexmatrix.h"
 
 #include <torch/script.h>
 #include <torch/torch.h>
@@ -120,6 +121,13 @@ void save_npy_h(const ModuleBase::matrix &hamilt,
 		const int nlocal,
 		const int rank);
 
+/// for multi-k
+void save_npy_h(const std::vector<ModuleBase::ComplexMatrix> &hamilt,
+        const std::string &h_file,
+        const int nlocal,
+        const int nks,
+        const int rank);
+
 void save_npy_v_delta_precalc(const int nat,
 		const int nks,
 		const int nlocal,
diff --git a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_torch.cpp b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_torch.cpp
index cadc3a79f1..97aef62322 100644
--- a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_torch.cpp
+++ b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_torch.cpp
@@ -162,7 +162,7 @@ void LCAO_Deepks::prepare_psialpha(const int nlocal,
     ModuleBase::TITLE("LCAO_Deepks", "prepare_psialpha");
     int nlmax = this->inlmax/nat;
     int mmax = 2*this->lmaxd+1;
-    this->psialpha_tensor = torch::zeros({ nat, nlmax, 1, nlocal, mmax }, torch::TensorOptions().dtype(torch::kFloat64));
+    this->psialpha_tensor = torch::zeros({ nat, nlmax, 1, nlocal, mmax }, torch::TensorOptions().dtype(torch::kFloat64)); // support gamma-only
 
     //cutoff for alpha is same for all types of atoms
     const double Rcut_Alpha = orb.Alpha[0].getRcut();
@@ -252,6 +252,135 @@ void LCAO_Deepks::prepare_psialpha(const int nlocal,
 #endif  
 }
 
+void LCAO_Deepks::prepare_psialpha_k(const int nlocal,
+    const int nat,
+    const int nks,
+    const std::vector<ModuleBase::Vector3<double>> &kvec_d,
+    const UnitCell &ucell,
+    const LCAO_Orbitals &orb,
+    Grid_Driver &GridD)
+{
+    ModuleBase::TITLE("LCAO_Deepks", "prepare_psialpha");
+    int nlmax = this->inlmax/nat;
+    int mmax = 2*this->lmaxd+1;
+    this->psialpha_tensor = torch::zeros({ nat, nlmax, nks, nlocal, mmax }, torch::kComplexDouble); // support multi-k
+
+    //cutoff for alpha is same for all types of atoms
+    const double Rcut_Alpha = orb.Alpha[0].getRcut();
+
+    for (int T0 = 0; T0 < ucell.ntype; T0++)
+    {
+		Atom* atom0 = &ucell.atoms[T0]; 
+        for (int I0 =0; I0< atom0->na; I0++)
+        {
+            //iat: atom index on which |alpha> is located
+            const int iat = ucell.itia2iat(T0,I0);
+			const ModuleBase::Vector3<double> tau0 = atom0->tau[I0];
+            GridD.Find_atom(ucell, atom0->tau[I0] ,T0, I0);
+
+            //outermost loop : find all adjacent atoms
+            for (int ad=0; ad<GridD.getAdjacentNum()+1 ; ++ad)
+            {
+                const int T1 = GridD.getType(ad);
+                const int I1 = GridD.getNatom(ad);
+                const int ibt = ucell.itia2iat(T1, I1);
+                const int start1 = ucell.itiaiw2iwt(T1, I1, 0);
+				const double Rcut_AO1 = orb.Phi[T1].getRcut();
+
+                const ModuleBase::Vector3<double> tau1 = GridD.getAdjacentTau(ad);
+				const Atom* atom1 = &ucell.atoms[T1];
+				const int nw1_tot = atom1->nw*PARAM.globalv.npol;
+
+				const double dist1 = (tau1-tau0).norm() * ucell.lat0;
+
+				if (dist1 > Rcut_Alpha + Rcut_AO1)
+				{
+					continue;
+				}
+
+                ModuleBase::Vector3<double> dR(GridD.getBox(ad).x,
+                                               GridD.getBox(ad).y,
+                                               GridD.getBox(ad).z);
+
+                key_tuple key(ibt, dR.x, dR.y, dR.z);
+
+                if (this->nlm_save_k[iat].find(key)
+                    == this->nlm_save_k[iat].end()) 
+                {
+                    continue;
+                }
+
+                //middle loop : all atomic basis on the adjacent atom ad
+				for (int iw1=0; iw1<nw1_tot; ++iw1)
+				{
+					const int iw1_all = start1 + iw1;
+					const int iw1_local = pv->global2local_row(iw1_all);
+					const int iw2_local = pv->global2local_col(iw1_all);
+					if(iw1_local < 0 || iw2_local < 0) {continue;}
+					const int iw1_0 = iw1/PARAM.globalv.npol;
+					std::vector<double> nlm = this->nlm_save_k[iat][key][iw1][0];
+                    
+                    for (int ik = 0; ik <nks; ik++)
+                    {
+                        const double arg = kvec_d[ik] * dR * ModuleBase::TWO_PI;
+                        double sinp, cosp;
+                        ModuleBase::libm::sincos(arg, &sinp, &cosp);
+                        const std::complex<double> kphase = std::complex<double>(cosp, sinp);
+                        int ib=0;
+                        int nl=0;
+                        for (int L0 = 0; L0 <= orb.Alpha[0].getLmax();++L0)
+                        {
+                            for (int N0 = 0;N0 < orb.Alpha[0].getNchi(L0);++N0)
+                            {
+                                const int nm = 2*L0+1;
+                                
+                                for (int m1=0; m1<nm; ++m1) // nm = 1 for s, 3 for p, 5 for d
+                                {
+                                    std::complex<double> nlm_phase = nlm[ib + m1] * kphase;
+                                    torch::Tensor nlm_tensor = torch::tensor({nlm_phase.real(), nlm_phase.imag()}, torch::kDouble);
+                                    torch::Tensor complex_tensor = torch::complex(nlm_tensor[0], nlm_tensor[1]);
+                                    this->psialpha_tensor[iat][nl][ik][iw1_all][m1] = complex_tensor;
+                                }
+                                ib+=nm;
+                                nl++;
+                            }
+                        }  
+                    }//end ik
+				}//end iw
+			}//end ad
+		}//end I0
+	}//end T0
+
+#ifdef __MPI
+    std::complex<double> msg[mmax];
+    for(int iat=0; iat< nat ; iat++)
+    {
+        for(int nl = 0; nl < nlmax; nl++)
+        {
+            for(int ik = 0; ik < nks; ik++)
+            {
+                for(int mu = 0; mu < nlocal ; mu++)
+                {
+                    for(int m=0;m<mmax;m++) 
+                    {
+                        auto tensor_value = this->psialpha_tensor.index({iat, nl, ik, mu, m});
+                        msg[m] = std::complex<double>(torch::real(tensor_value).item<double>(), torch::imag(tensor_value).item<double>());
+                    }
+                    Parallel_Reduce::reduce_all(msg,mmax);
+                    for(int m=0;m<mmax;m++) 
+                    {
+                        torch::Tensor msg_tensor = torch::tensor({msg[m].real(), msg[m].imag()}, torch::kDouble);
+                        torch::Tensor complex_tensor = torch::complex(msg_tensor[0], msg_tensor[1]);
+                        this->psialpha_tensor[iat][nl][ik][mu][m] = complex_tensor;
+                    }
+                }
+            }
+        }
+    }
+
+#endif  
+}
+
 void LCAO_Deepks::check_vdp_psialpha(const int nat, const int nks, const int nlocal)
 {
     std::ofstream ofs("vdp_psialpha.dat");
diff --git a/source/module_hamilt_lcao/module_deepks/deepks_hmat.cpp b/source/module_hamilt_lcao/module_deepks/deepks_hmat.cpp
index 85d3b443da..66125bf042 100644
--- a/source/module_hamilt_lcao/module_deepks/deepks_hmat.cpp
+++ b/source/module_hamilt_lcao/module_deepks/deepks_hmat.cpp
@@ -7,7 +7,8 @@
 
 void DeePKS_domain::save_h_mat(
 		const double *h_mat_in,
-		const int nloc)
+		const int nloc,
+        const int ik)
 {
 	for(int i=0;i<nloc;i++)
 	{
@@ -17,9 +18,13 @@ void DeePKS_domain::save_h_mat(
 
 void DeePKS_domain::save_h_mat(
 		const std::complex<double> *h_mat_in,
-		const int nloc)
+		const int nloc,
+        const int ik)
 {
-
+    for(int i=0;i<nloc;i++)
+    {
+        GlobalC::ld.h_mat_k[ik][i]=h_mat_in[i];
+    }  
 }
 
 
@@ -86,6 +91,73 @@ void DeePKS_domain::collect_h_mat(
 #endif
 }
 
+// for multi-k
+void DeePKS_domain::collect_h_mat(
+        const Parallel_Orbitals &pv,
+		const std::vector<std::vector<std::complex<double>>>& h_in,
+		std::vector<ModuleBase::ComplexMatrix> &h_out,
+		const int nlocal,
+        const int nks)
+{
+    ModuleBase::TITLE("DeePKS_domain", "collect_h_tot");
+
+    //construct the total H matrix
+    for (int k=0; k<nks; k++) 
+    {
+#ifdef __MPI
+        int ir=0;
+        int ic=0;
+        for (int i=0; i<nlocal; i++)
+        {
+            std::vector<std::complex<double>> lineH(nlocal-i, std::complex<double>(0.0, 0.0));
+
+            ir = pv.global2local_row(i);
+            if (ir>=0)
+            {
+                // data collection
+                for (int j=i; j<nlocal; j++)
+                {
+                    ic = pv.global2local_col(j);
+                    if (ic>=0)
+                    {
+                        int iic=0;
+                        if (ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER(PARAM.inp.ks_solver))
+                        {
+                            iic=ir+ic*pv.nrow;
+                        }
+                        else
+                        {
+                            iic=ir*pv.ncol+ic;
+                        }
+                        lineH[j-i] = h_in[k][iic];
+                    }
+                }
+            }
+            else
+            {
+                //do nothing
+            }
+
+            Parallel_Reduce::reduce_all(lineH.data(),nlocal-i);
+
+            for (int j=i; j<nlocal; j++)
+            {
+                h_out[k](i,j)=lineH[j-i];
+                h_out[k](j,i)=h_out[k](i,j);//H is a symmetric matrix
+            }
+        }
+#else
+        for (int i=0; i<nlocal; i++)
+        {
+            for (int j=i; j<nlocal; j++)
+            {
+                h_out[k](i,j)=h_in[k][i*nlocal+j];
+                h_out[k](j,i)=h_out[k](i,j);//H is a symmetric matrix
+            }
+        }
+#endif
+    }
+}
 
 //just for gamma-only now
 void DeePKS_domain::check_h_mat(
@@ -106,4 +178,28 @@ void DeePKS_domain::check_h_mat(
     ofs.close();
 }
 
+// for multi-k
+void DeePKS_domain::check_h_mat(
+		const std::vector<ModuleBase::ComplexMatrix> &H,
+		const std::string &h_file,
+		const int nlocal,
+        const int nks)
+{
+    std::ofstream ofs(h_file.c_str());
+    ofs << std::setprecision(10);
+    for (int k=0; k<nks; k++)
+    {
+        for (int i=0; i<nlocal; i++)
+        {
+            for (int j=0; j<nlocal; j++)
+            {
+                ofs << H[k](i,j) << " ";
+            }
+            ofs << std::endl;
+        }
+        ofs << std::endl;
+    }
+    ofs.close();
+}
+
 #endif
diff --git a/source/module_hamilt_lcao/module_deepks/deepks_hmat.h b/source/module_hamilt_lcao/module_deepks/deepks_hmat.h
index e62140c738..fe4d51dea5 100644
--- a/source/module_hamilt_lcao/module_deepks/deepks_hmat.h
+++ b/source/module_hamilt_lcao/module_deepks/deepks_hmat.h
@@ -4,6 +4,7 @@
 #ifdef __DEEPKS
 
 #include "module_base/matrix.h"
+#include "module_base/complexmatrix.h"
 #include "module_base/timer.h"
 #include "module_basis/module_ao/parallel_orbitals.h"
 
@@ -15,11 +16,13 @@ namespace DeePKS_domain
 {
 	void save_h_mat(
 			const double *h_mat_in,
-			const int nloc);
+			const int nloc,
+            const int ik);
 
 	void save_h_mat(
 			const std::complex<double> *h_mat_in,
-			const int nloc);
+			const int nloc,
+            const int ik);
 
     //Collect data in h_in to matrix h_out. Note that left lower trianger in h_out is filled
     void collect_h_mat(
@@ -28,10 +31,23 @@ namespace DeePKS_domain
         ModuleBase::matrix &h_out,
         const int nlocal);
 
+    void collect_h_mat(
+        const Parallel_Orbitals &pv,
+		const std::vector<std::vector<std::complex<double>>>& h_in,
+		std::vector<ModuleBase::ComplexMatrix> &h_out,
+		const int nlocal,
+        const int nks);
+
     void check_h_mat(
         const ModuleBase::matrix &H,
         const std::string &h_file,
         const int nlocal);
+
+    void check_h_mat(
+		const std::vector<ModuleBase::ComplexMatrix> &H,
+		const std::string &h_file,
+		const int nlocal,
+        const int nks);
 }
 
 #endif
diff --git a/source/module_hamilt_lcao/module_deepks/v_delta_precalc.cpp b/source/module_hamilt_lcao/module_deepks/v_delta_precalc.cpp
index 0174e4137c..0c91180d9a 100644
--- a/source/module_hamilt_lcao/module_deepks/v_delta_precalc.cpp
+++ b/source/module_hamilt_lcao/module_deepks/v_delta_precalc.cpp
@@ -30,7 +30,7 @@ void LCAO_Deepks::cal_v_delta_precalc(const int nlocal,
 
     this->cal_gvdm(nat);
     const double Rcut_Alpha = orb.Alpha[0].getRcut();
-    this->init_v_delta_pdm_shell(1,nlocal);
+    this->init_v_delta_pdm_shell(1,nlocal); // 1 for gamma-only
    
     for (int T0 = 0; T0 < ucell.ntype; T0++)
     {
diff --git a/source/module_hamilt_lcao/module_deepks/v_delta_precalc_k.cpp b/source/module_hamilt_lcao/module_deepks/v_delta_precalc_k.cpp
new file mode 100644
index 0000000000..1a1d3b4006
--- /dev/null
+++ b/source/module_hamilt_lcao/module_deepks/v_delta_precalc_k.cpp
@@ -0,0 +1,234 @@
+#ifdef __DEEPKS
+
+#include "LCAO_deepks.h"
+#include "LCAO_deepks_io.h" 
+#include "module_base/blas_connector.h"
+#include "module_base/constants.h"
+#include "module_base/libm/libm.h"
+#include "module_base/parallel_reduce.h"
+#include "module_hamilt_lcao/module_hcontainer/atom_pair.h"
+#include "module_parameter/parameter.h"
+
+
+// calculates v_delta_precalc[nks,nlocal,nlocal,NAt,NDscrpt] = gvdm * v_delta_pdm_shell;
+// v_delta_pdm_shell[nks,nlocal,nlocal,Inl,nm*nm] = overlap * overlap;
+// for deepks_v_delta = 1
+void LCAO_Deepks::cal_v_delta_precalc_k(const int nlocal,
+    const int nat,
+    const int nks,
+    const std::vector<ModuleBase::Vector3<double>> &kvec_d,
+    const UnitCell &ucell,
+    const LCAO_Orbitals &orb,
+    Grid_Driver &GridD)
+{
+    ModuleBase::TITLE("LCAO_Deepks", "calc_v_delta_precalc");
+    // timeval t_start;
+    // gettimeofday(&t_start,NULL);
+
+    this->cal_gvdm(nat);
+    const double Rcut_Alpha = orb.Alpha[0].getRcut();
+    this->init_v_delta_pdm_shell(nks,nlocal); // multi-k
+   
+    for (int T0 = 0; T0 < ucell.ntype; T0++)
+    {
+		Atom* atom0 = &ucell.atoms[T0]; 
+        
+        for (int I0 =0; I0< atom0->na; I0++)
+        {
+            const int iat = ucell.itia2iat(T0,I0);
+            const ModuleBase::Vector3<double> tau0 = atom0->tau[I0];
+            GridD.Find_atom(ucell, atom0->tau[I0] ,T0, I0);
+
+            for (int ad1=0; ad1<GridD.getAdjacentNum()+1 ; ++ad1)
+            {
+                const int T1 = GridD.getType(ad1);
+                const int I1 = GridD.getNatom(ad1);
+                const int ibt1 = ucell.itia2iat(T1, I1); 
+                const int start1 = ucell.itiaiw2iwt(T1, I1, 0);
+                const ModuleBase::Vector3<double> tau1 = GridD.getAdjacentTau(ad1);
+				const Atom* atom1 = &ucell.atoms[T1];
+				const int nw1_tot = atom1->nw*PARAM.globalv.npol;
+				const double Rcut_AO1 = orb.Phi[T1].getRcut(); 
+
+                const double dist1 = (tau1-tau0).norm() * ucell.lat0;
+                if (dist1 >= Rcut_Alpha + Rcut_AO1)
+                {
+                    continue;
+                }
+
+                ModuleBase::Vector3<double> dR1(GridD.getBox(ad1).x,
+                                                GridD.getBox(ad1).y,
+                                                GridD.getBox(ad1).z);
+                
+                key_tuple key_1(ibt1, dR1.x, dR1.y, dR1.z);
+                
+                if (this->nlm_save_k[iat].find(key_1)
+                    == this->nlm_save_k[iat].end()) 
+                {
+                    continue;
+                }
+
+				for (int ad2=0; ad2 < GridD.getAdjacentNum()+1 ; ad2++)
+				{
+					const int T2 = GridD.getType(ad2);
+					const int I2 = GridD.getNatom(ad2);
+                    const int ibt2 = ucell.itia2iat(T2, I2);
+					const int start2 = ucell.itiaiw2iwt(T2, I2, 0);
+					const ModuleBase::Vector3<double> tau2 = GridD.getAdjacentTau(ad2);
+					const Atom* atom2 = &ucell.atoms[T2];
+					const int nw2_tot = atom2->nw*PARAM.globalv.npol;
+					
+					const double Rcut_AO2 = orb.Phi[T2].getRcut();
+                	const double dist2 = (tau2-tau0).norm() * ucell.lat0;
+
+					if (dist2 >= Rcut_Alpha + Rcut_AO2)
+					{
+						continue;
+					}
+
+                    ModuleBase::Vector3<double> dR2(GridD.getBox(ad2).x,
+                                                    GridD.getBox(ad2).y,
+                                                    GridD.getBox(ad2).z);
+
+                    key_tuple key_2(ibt2, dR2.x, dR2.y, dR2.z);
+
+                    if (this->nlm_save_k[iat].find(key_2)
+                        == this->nlm_save_k[iat].end()) 
+                    {
+                        continue;
+                    }
+
+					for (int iw1=0; iw1<nw1_tot; ++iw1)
+					{
+						const int iw1_all = start1 + iw1; // this is \mu
+						const int iw1_local = pv->global2local_row(iw1_all);
+						if(iw1_local < 0) {continue;}
+						const int iw1_0 = iw1/PARAM.globalv.npol;
+						for (int iw2=0; iw2<nw2_tot; ++iw2)
+						{
+							const int iw2_all = start2 + iw2; // this is \nu
+							const int iw2_local = pv->global2local_col(iw2_all);
+							if(iw2_local < 0) {continue;}
+							const int iw2_0 = iw2/PARAM.globalv.npol;
+                            // Should use nlm_save_k, to be modified here!!!
+                            std::vector<double> nlm1 = this->nlm_save_k[iat][key_1][iw1][0];
+                            std::vector<double> nlm2 = this->nlm_save_k[iat][key_2][iw2][0];
+                            assert(nlm1.size()==nlm2.size());
+                            for (int ik = 0; ik < nks; ik++)
+                            {
+                                int ib=0;
+                                const double arg = - (kvec_d[ik] * (dR1-dR2) ) * ModuleBase::TWO_PI;
+                                double sinp, cosp;
+                                ModuleBase::libm::sincos(arg, &sinp, &cosp);
+                                const std::complex<double> kphase = std::complex<double>(cosp, sinp);
+                                for (int L0 = 0; L0 <= orb.Alpha[0].getLmax();++L0)
+                                {
+                                    for (int N0 = 0;N0 < orb.Alpha[0].getNchi(L0);++N0)
+                                    {
+                                        const int inl = this->inl_index[T0](I0, L0, N0);
+                                        const int nm = 2*L0+1;
+                                        
+                                        for (int m1=0; m1<nm; ++m1) // nm = 1 for s, 3 for p, 5 for d
+                                        {
+                                            for (int m2=0; m2<nm; ++m2) // nm = 1 for s, 3 for p, 5 for d
+                                            {
+                                                v_delta_pdm_shell_complex[ik][iw1_all][iw2_all][inl][m1*nm+m2] += nlm1[ib+m1]*nlm2[ib+m2]*kphase;
+                                            }
+                                        }
+
+                                        ib+=nm;
+                                    }
+                                }
+                            }
+                                                        
+						}//iw2
+					}//iw1
+				}//ad2
+			}//ad1   
+            
+        }
+    }
+
+#ifdef __MPI
+    const int mn_size=(2 * this->lmaxd + 1) * (2 * this->lmaxd + 1);
+    for(int ik = 0; ik < nks; ik++)
+    {
+        for(int inl = 0; inl < this->inlmax; inl++)
+        {
+            for(int mu = 0; mu < nlocal ; mu++)
+            {
+                for(int nu=0; nu< nlocal ; nu++)
+                {
+                    Parallel_Reduce::reduce_all(this->v_delta_pdm_shell_complex[ik][mu][nu][inl],mn_size);
+                }
+            }
+        }
+    }
+#endif    
+    // transfer v_delta_pdm_shell to v_delta_pdm_shell_vector
+    
+    int nlmax = this->inlmax/nat;
+   
+    std::vector<torch::Tensor> v_delta_pdm_shell_vector;
+    for(int nl = 0; nl < nlmax; ++nl)
+    {
+        std::vector<torch::Tensor> kuuammv;
+        for(int iks = 0; iks < nks; ++iks)
+        {
+            std::vector<torch::Tensor> uuammv;
+            for(int mu = 0; mu < nlocal; ++mu)
+            {
+                std::vector<torch::Tensor> uammv;
+                for(int nu =0 ; nu < nlocal; ++nu)
+                {
+                    std::vector<torch::Tensor> ammv;
+                    for (int iat=0; iat<nat; ++iat)
+                    {
+                        int inl = iat*nlmax+nl;
+                        int nm = 2*this->inl_l[inl]+1;
+                        std::vector<std::complex<double>> mmv;
+                    
+                        for (int m1=0; m1<nm; ++m1) // m1 = 1 for s, 3 for p, 5 for d
+                        {
+                            for (int m2=0; m2<nm; ++m2) // m1 = 1 for s, 3 for p, 5 for d
+                            {
+                                mmv.push_back(this->v_delta_pdm_shell_complex[iks][mu][nu][inl][m1*nm+m2]);
+                            }
+                        }
+                        torch::Tensor mm = torch::from_blob(mmv.data(), {nm, nm}, torch::TensorOptions().dtype(torch::kComplexDouble)).clone();    //nm*nm
+                        ammv.push_back(mm);
+                    }
+                    torch::Tensor amm = torch::stack(ammv, 0); 
+                    uammv.push_back(amm);                    
+                }
+                torch::Tensor uamm = torch::stack(uammv, 0); 
+                uuammv.push_back(uamm);
+            }
+            torch::Tensor uuamm = torch::stack(uuammv, 0); 
+            kuuammv.push_back(uuamm);
+        }
+        torch::Tensor kuuamm = torch::stack(kuuammv, 0);  
+        v_delta_pdm_shell_vector.push_back(kuuamm);
+    }
+
+    assert(v_delta_pdm_shell_vector.size() == nlmax);
+    
+    //einsum for each nl: 
+    std::vector<torch::Tensor> v_delta_precalc_vector;
+    for (int nl = 0; nl<nlmax; ++nl)
+    {
+        torch::Tensor gevdm_vector_complex = this->gevdm_vector[nl].to(torch::kComplexDouble);
+        v_delta_precalc_vector.push_back(at::einsum("kxyamn, avmn->kxyav", {v_delta_pdm_shell_vector[nl], gevdm_vector_complex}));
+    }
+
+    this->v_delta_precalc_tensor = torch::cat(v_delta_precalc_vector, -1);
+    this->del_v_delta_pdm_shell(nks,nlocal);
+
+    //check_v_delta_precalc(nlocal,nat);
+    // timeval t_end;
+    // gettimeofday(&t_end,NULL);
+    // std::cout<<"calculate v_delta_precalc time:\t"<<(double)(t_end.tv_sec-t_start.tv_sec) + (double)(t_end.tv_usec-t_start.tv_usec)/1000000.0<<std::endl;
+    return;
+}
+
+#endif
\ No newline at end of file