Commit 4f24415

MPI multi-process compatibility

1 parent 38ad956 commit 4f24415
File tree: 10 files changed, +370 −196 lines

source/module_esolver/esolver_ks_lcao_tddft.cpp

Lines changed: 41 additions & 6 deletions
```diff
@@ -43,6 +43,14 @@ ESolver_KS_LCAO_TDDFT<Device>::ESolver_KS_LCAO_TDDFT()
 {
     classname = "ESolver_KS_LCAO_TDDFT";
     basisname = "LCAO";
+
+    // If the device is GPU, we must enable use_tensor and use_lapack
+    ct::DeviceType ct_device_type = ct::DeviceTypeToEnum<Device>::value;
+    if (ct_device_type == ct::DeviceType::GpuDevice)
+    {
+        use_tensor = true;
+        use_lapack = true;
+    }
 }
 
 template <typename Device>
```
```diff
@@ -217,22 +225,26 @@ void ESolver_KS_LCAO_TDDFT<Device>::update_pot(UnitCell& ucell, const int istep,
 
     if (td_htype == 1)
     {
+        const int len_HS = use_tensor && use_lapack ? nlocal * nlocal : nloc;
+
         if (this->Hk_laststep == nullptr)
         {
             this->Hk_laststep = new std::complex<double>*[kv.get_nks()];
             for (int ik = 0; ik < kv.get_nks(); ++ik)
             {
-                this->Hk_laststep[ik] = new std::complex<double>[nloc];
-                ModuleBase::GlobalFunc::ZEROS(Hk_laststep[ik], nloc);
+                // Allocate memory for Hk_laststep; under (use_tensor && use_lapack) it holds the global matrix
+                this->Hk_laststep[ik] = new std::complex<double>[len_HS];
+                ModuleBase::GlobalFunc::ZEROS(Hk_laststep[ik], len_HS);
             }
         }
         if (this->Sk_laststep == nullptr)
         {
             this->Sk_laststep = new std::complex<double>*[kv.get_nks()];
             for (int ik = 0; ik < kv.get_nks(); ++ik)
             {
-                this->Sk_laststep[ik] = new std::complex<double>[nloc];
-                ModuleBase::GlobalFunc::ZEROS(Sk_laststep[ik], nloc);
+                // Allocate memory for Sk_laststep; under (use_tensor && use_lapack) it holds the global matrix
+                this->Sk_laststep[ik] = new std::complex<double>[len_HS];
+                ModuleBase::GlobalFunc::ZEROS(Sk_laststep[ik], len_HS);
            }
         }
     }
```
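The only change in these two allocation loops is the buffer length: `nloc` is the size of this rank's 2D block-cyclic panel, while the gathered copy must hold the full `nlocal * nlocal` global matrix on every rank. A toy illustration of the size switch (a minimal sketch; the concrete numbers are assumed for the example):

```cpp
#include <complex>
#include <iostream>
#include <vector>

int main()
{
    // Example values only: 8 global orbitals distributed over a BLACS grid
    const int nlocal = 8;  // global basis size
    const int nloc = 16;   // elements in this rank's local block-cyclic panel
    const bool use_tensor = true, use_lapack = true;

    // Same selection as in update_pot(): full global matrix vs. local panel
    const int len_HS = (use_tensor && use_lapack) ? nlocal * nlocal : nloc;

    // std::vector zero-initializes, mirroring ModuleBase::GlobalFunc::ZEROS
    std::vector<std::complex<double>> Hk_laststep(len_HS);
    std::cout << "len_HS = " << Hk_laststep.size() << '\n'; // prints 64
    return 0;
}
```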
```diff
@@ -253,8 +265,31 @@ void ESolver_KS_LCAO_TDDFT<Device>::update_pot(UnitCell& ucell, const int istep,
             this->p_hamilt->updateHk(ik);
             hamilt::MatrixBlock<complex<double>> h_mat, s_mat;
             this->p_hamilt->matrix(h_mat, s_mat);
-            BlasConnector::copy(nloc, h_mat.p, 1, Hk_laststep[ik], 1);
-            BlasConnector::copy(nloc, s_mat.p, 1, Sk_laststep[ik], 1);
+
+            if (use_tensor && use_lapack)
+            {
+                // Gather H and S matrices to the root process
+#ifdef __MPI
+                int myid, num_procs;
+                MPI_Comm_rank(MPI_COMM_WORLD, &myid);
+                MPI_Comm_size(MPI_COMM_WORLD, &num_procs);
+
+                Matrix_g<std::complex<double>> h_mat_g, s_mat_g; // Global matrix structures
+
+                // Collect H matrix
+                gatherMatrix(myid, 0, h_mat, h_mat_g);
+                BlasConnector::copy(nlocal * nlocal, h_mat_g.p.get(), 1, Hk_laststep[ik], 1);
+
+                // Collect S matrix
+                gatherMatrix(myid, 0, s_mat, s_mat_g);
+                BlasConnector::copy(nlocal * nlocal, s_mat_g.p.get(), 1, Sk_laststep[ik], 1);
+#endif
+            }
+            else
+            {
+                BlasConnector::copy(nloc, h_mat.p, 1, Hk_laststep[ik], 1);
+                BlasConnector::copy(nloc, s_mat.p, 1, Sk_laststep[ik], 1);
+            }
         }
     }
 
```
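A point worth keeping in mind when reading this hunk: `Cpxgemr2d` (and hence `gatherMatrix`) is collective over the BLACS context, so every rank must make the call, even though only the root's copy is meaningful afterwards. A minimal sketch of the pattern, assuming the `gatherMatrix` helper declared in `esolver_ks_lcao_tddft.h`:

```cpp
#ifdef __MPI
// Sketch only: gathers the distributed H and S blocks into full
// nlocal x nlocal buffers. Hk_buf/Sk_buf play the role of
// Hk_laststep[ik]/Sk_laststep[ik] above.
void gather_hs_to_root(const hamilt::MatrixBlock<std::complex<double>>& h_mat,
                       const hamilt::MatrixBlock<std::complex<double>>& s_mat,
                       std::complex<double>* Hk_buf,
                       std::complex<double>* Sk_buf,
                       const int nlocal)
{
    int myid;
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);

    ModuleESolver::Matrix_g<std::complex<double>> h_g, s_g;
    ModuleESolver::gatherMatrix(myid, 0, h_mat, h_g); // collective call
    ModuleESolver::gatherMatrix(myid, 0, s_mat, s_g); // collective call

    // The gathered data is only meaningful on rank 0, but gatherMatrix()
    // allocates the buffer on every rank, so these copies are safe
    // (if redundant) elsewhere.
    BlasConnector::copy(nlocal * nlocal, h_g.p.get(), 1, Hk_buf, 1);
    BlasConnector::copy(nlocal * nlocal, s_g.p.get(), 1, Sk_buf, 1);
}
#endif
```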
source/module_esolver/esolver_ks_lcao_tddft.h

Lines changed: 43 additions & 4 deletions
```diff
@@ -2,11 +2,51 @@
 #define ESOLVER_KS_LCAO_TDDFT_H
 #include "esolver_ks.h"
 #include "esolver_ks_lcao.h"
+#include "module_base/scalapack_connector.h" // Cpxgemr2d
 #include "module_hamilt_lcao/hamilt_lcaodft/record_adj.h"
 #include "module_psi/psi.h"
 
 namespace ModuleESolver
 {
+//------------------------ MPI gathering and distributing functions ------------------------//
+// This struct is used for collecting matrices from all processes to the root process
+template <typename T>
+struct Matrix_g
+{
+    std::shared_ptr<T> p;
+    size_t row;
+    size_t col;
+    std::shared_ptr<int> desc;
+};
+
+// Collect matrices from all processes to the root process
+template <typename T>
+void gatherMatrix(const int myid, const int root_proc, const hamilt::MatrixBlock<T>& mat_l, Matrix_g<T>& mat_g)
+{
+    const int* desca = mat_l.desc; // Obtain the descriptor of the local matrix
+    int ctxt = desca[1];           // BLACS context
+    int nrows = desca[2];          // Global matrix row number
+    int ncols = desca[3];          // Global matrix column number
+
+    if (myid == root_proc)
+    {
+        mat_g.p.reset(new T[nrows * ncols]); // No need to delete[] since it is a shared_ptr
+    }
+    else
+    {
+        mat_g.p.reset(new T[nrows * ncols]); // Placeholder for non-root processes
+    }
+
+    // Set the descriptor of the global matrix
+    mat_g.desc.reset(new int[9]{1, ctxt, nrows, ncols, nrows, ncols, 0, 0, nrows});
+    mat_g.row = nrows;
+    mat_g.col = ncols;
+
+    // Call the Cpxgemr2d function in ScaLAPACK to collect the matrix data
+    Cpxgemr2d(nrows, ncols, mat_l.p, 1, 1, const_cast<int*>(desca), mat_g.p.get(), 1, 1, mat_g.desc.get(), ctxt);
+}
+//------------------------ MPI gathering and distributing functions ------------------------//
 
 template <typename Device = base_device::DEVICE_CPU>
 class ESolver_KS_LCAO_TDDFT : public ESolver_KS_LCAO<std::complex<double>, double>
```
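`gatherMatrix` reads the BLACS context and the global dimensions out of the standard nine-integer ScaLAPACK array descriptor (`dtype, ctxt, M, N, MB, NB, rsrc, csrc, LLD`), then builds a target descriptor whose block size equals the whole matrix, so `Cpxgemr2d` lands everything in a single block on the process at grid coordinates (0, 0). A hypothetical helper that spells out that target descriptor:

```cpp
#include <array>

// Illustration only: the descriptor gatherMatrix() constructs inline.
// With MB == M and NB == N the matrix is one single block, owned by the
// process at (rsrc, csrc) = (0, 0) -- i.e. the root of the gather.
std::array<int, 9> make_gathered_desc(const int ctxt, const int M, const int N)
{
    return {1,    // dtype: dense matrix
            ctxt, // BLACS context shared with the source descriptor
            M,    // global row count
            N,    // global column count
            M,    // row block size == M    -> one block row
            N,    // column block size == N -> one block column
            0,    // rsrc: process row owning the first (only) block
            0,    // csrc: process column owning the first (only) block
            M};   // LLD: local leading dimension on the owning process
}
```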
```diff
@@ -38,10 +78,9 @@ class ESolver_KS_LCAO_TDDFT : public ESolver_KS_LCAO<std::complex<double>, double>
 
     const int td_htype = 1;
 
-    // const bool use_tensor = true;
-    const bool use_tensor = false;
-    const bool use_lapack = true;
-    // const bool use_lapack = false;
+    //! Control heterogeneous computing of the TDDFT solver
+    bool use_tensor = false;
+    bool use_lapack = false;
 
   private:
     void weight_dm_rho();
```

source/module_hamilt_lcao/module_tddft/bandenergy.cpp

Lines changed: 8 additions & 12 deletions
```diff
@@ -290,10 +290,6 @@ void compute_ekb_tensor_lapack(const Parallel_Orbitals* pv,
                                const ct::Tensor& psi_k,
                                ct::Tensor& ekb)
 {
-    /// ctx is nothing but the devices used in op (Device* ctx = nullptr;),
-    /// it controls the ops to use the corresponding device to calculate results
-    Device* ctx = {};
-    base_device::DEVICE_CPU* cpu_ctx = {};
     // ct_device_type = ct::DeviceType::CpuDevice or ct::DeviceType::GpuDevice
     ct::DeviceType ct_device_type = ct::DeviceTypeToEnum<Device>::value;
     // ct_Device = ct::DEVICE_CPU or ct::DEVICE_GPU
@@ -302,12 +298,12 @@ void compute_ekb_tensor_lapack(const Parallel_Orbitals* pv,
     // Create Tensor objects for temporary data
     ct::Tensor tmp1(ct::DataType::DT_COMPLEX_DOUBLE,
                     ct_device_type,
-                    ct::TensorShape({pv->nloc_wfc})); // tmp1 shape: nlocal * nband
+                    ct::TensorShape({nlocal * nband})); // tmp1 shape: nlocal * nband
     tmp1.zero();
 
     ct::Tensor Eij(ct::DataType::DT_COMPLEX_DOUBLE,
                    ct_device_type,
-                   ct::TensorShape({pv->nloc})); // Eij shape: nlocal * nlocal
+                   ct::TensorShape({nlocal * nlocal})); // Eij shape: nlocal * nlocal
     // Why not use nband * nband ?????
     Eij.zero();
 
@@ -346,17 +342,19 @@ void compute_ekb_tensor_lapack(const Parallel_Orbitals* pv,
 
     if (PARAM.inp.td_print_eij >= 0.0)
     {
+        ct::Tensor Eij_cpu = Eij.to_device<ct::DEVICE_CPU>();
+
         GlobalV::ofs_running
             << "------------------------------------------------------------------------------------------------"
             << std::endl;
         GlobalV::ofs_running << " Eij:" << std::endl;
-        for (int i = 0; i < pv->nrow_bands; i++)
+        for (int i = 0; i < nband; i++)
         {
-            for (int j = 0; j < pv->ncol_bands; j++)
+            for (int j = 0; j < nband; j++)
             {
                 double aa = 0.0, bb = 0.0;
-                aa = Eij.data<std::complex<double>>()[i * pv->ncol + j].real();
-                bb = Eij.data<std::complex<double>>()[i * pv->ncol + j].imag();
+                aa = Eij_cpu.data<std::complex<double>>()[i * nlocal + j].real();
+                bb = Eij_cpu.data<std::complex<double>>()[i * nlocal + j].imag();
                 if (std::abs(aa) < PARAM.inp.td_print_eij)
                 {
                     aa = 0.0;
@@ -384,8 +382,6 @@ void compute_ekb_tensor_lapack(const Parallel_Orbitals* pv,
     for (int i = 0; i < nband; ++i)
     {
         base_device::memory::synchronize_memory_op<double, Device, Device>()(
-            ctx,
-            ctx,
             ekb.data<double>() + i,
             reinterpret_cast<const double*>(Eij.data<std::complex<double>>() + i * nlocal + i),
             1);
```
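The final loop copies one `double` per band, starting at the diagonal element `Eij(i, i)` reinterpreted as a pair of doubles, which picks out exactly its real part: the band energy. A plain-CPU sketch of the same extraction (the row-major `nlocal * nlocal` layout matches the gathered `Eij` above):

```cpp
#include <complex>
#include <vector>

// Sketch: band i's energy is Re(Eij(i, i)). std::complex<double> is
// layout-compatible with double[2] (real part first), which is what the
// reinterpret_cast in the diff relies on when copying a single double.
std::vector<double> diagonal_real_parts(const std::vector<std::complex<double>>& Eij,
                                        const int nlocal,
                                        const int nband)
{
    std::vector<double> ekb(nband);
    for (int i = 0; i < nband; ++i)
    {
        ekb[i] = Eij[static_cast<size_t>(i) * nlocal + i].real();
    }
    return ekb;
}
```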

source/module_hamilt_lcao/module_tddft/evolve_elec.cpp

Lines changed: 81 additions & 57 deletions
```diff
@@ -15,10 +15,6 @@ Evolve_elec<Device>::Evolve_elec(){};
 template <typename Device>
 Evolve_elec<Device>::~Evolve_elec(){};
 
-template <typename Device>
-Device* Evolve_elec<Device>::ctx = {};
-template <typename Device>
-base_device::DEVICE_CPU* Evolve_elec<Device>::cpu_ctx = {};
 template <typename Device>
 ct::DeviceType Evolve_elec<Device>::ct_device_type = ct::DeviceTypeToEnum<Device>::value;
 
```
```diff
@@ -89,53 +85,69 @@ void Evolve_elec<Device>::solve_psi(const int& istep,
     }
     else
     {
-        // std::cout << "nband = " << nband << std::endl;
-        // std::cout << "psi->get_nbands() = " << psi->get_nbands() << std::endl;
-        // std::cout << "nlocal = " << nlocal << std::endl;
-        // std::cout << "psi->get_nbasis() = " << psi->get_nbasis() << std::endl;
-        // std::cout << "para_orb.nloc = " << para_orb.nloc << std::endl;
-        // std::cout << "para_orb.nrow = " << para_orb.nrow << std::endl;
-        // std::cout << "para_orb.ncol = " << para_orb.ncol << std::endl;
-        // std::cout << "ekb.nr = " << ekb.nr << std::endl;
-        // std::cout << "ekb.nc = " << ekb.nc << std::endl;
+        const int len_psi_k_1 = use_lapack ? nband : psi->get_nbands();
+        const int len_psi_k_2 = use_lapack ? nlocal : psi->get_nbasis();
+        const int len_HS_laststep = use_lapack ? nlocal * nlocal : para_orb.nloc;
 
         // Create Tensor for psi_k, psi_k_laststep, H_laststep, S_laststep, ekb
         ct::Tensor psi_k_tensor(ct::DataType::DT_COMPLEX_DOUBLE,
                                 ct_device_type,
-                                ct::TensorShape({psi->get_nbands(), psi->get_nbasis()}));
+                                ct::TensorShape({len_psi_k_1, len_psi_k_2}));
         ct::Tensor psi_k_laststep_tensor(ct::DataType::DT_COMPLEX_DOUBLE,
                                          ct_device_type,
-                                         ct::TensorShape({psi->get_nbands(), psi->get_nbasis()}));
+                                         ct::TensorShape({len_psi_k_1, len_psi_k_2}));
         ct::Tensor H_laststep_tensor(ct::DataType::DT_COMPLEX_DOUBLE,
                                      ct_device_type,
-                                     ct::TensorShape({para_orb.nloc}));
+                                     ct::TensorShape({len_HS_laststep}));
         ct::Tensor S_laststep_tensor(ct::DataType::DT_COMPLEX_DOUBLE,
                                      ct_device_type,
-                                     ct::TensorShape({para_orb.nloc}));
+                                     ct::TensorShape({len_HS_laststep}));
         ct::Tensor ekb_tensor(ct::DataType::DT_DOUBLE, ct_device_type, ct::TensorShape({nband}));
 
-        // Syncronize data from CPU to Device
-        syncmem_complex_h2d_op()(ctx,
-                                 cpu_ctx,
-                                 psi_k_tensor.data<std::complex<double>>(),
-                                 psi[0].get_pointer(),
-                                 psi->get_nbands() * psi->get_nbasis());
-        syncmem_complex_h2d_op()(ctx,
-                                 cpu_ctx,
-                                 psi_k_laststep_tensor.data<std::complex<double>>(),
-                                 psi_laststep[0].get_pointer(),
-                                 psi->get_nbands() * psi->get_nbasis());
-        syncmem_complex_h2d_op()(ctx,
-                                 cpu_ctx,
-                                 H_laststep_tensor.data<std::complex<double>>(),
+        // Global psi
+        ModuleESolver::Matrix_g<std::complex<double>> psi_g;
+        ModuleESolver::Matrix_g<std::complex<double>> psi_laststep_g;
+
+        if (use_lapack)
+        {
+            // Need to gather the psi to the root process on CPU
+            // H_laststep and S_laststep are already gathered in esolver_ks_lcao_tddft.cpp
+#ifdef __MPI
+            // Access the rank of the calling process in the communicator
+            int myid, root_proc = 0;
+            MPI_Comm_rank(MPI_COMM_WORLD, &myid);
+
+            // Gather psi to the root process
+            gatherPsi(myid, root_proc, psi[0].get_pointer(), para_orb, psi_g);
+            gatherPsi(myid, root_proc, psi_laststep[0].get_pointer(), para_orb, psi_laststep_g);
+
+            // Synchronize data from CPU to Device
+            syncmem_complex_h2d_op()(psi_k_tensor.data<std::complex<double>>(),
+                                     psi_g.p.get(),
+                                     len_psi_k_1 * len_psi_k_2);
+            syncmem_complex_h2d_op()(psi_k_laststep_tensor.data<std::complex<double>>(),
+                                     psi_laststep_g.p.get(),
+                                     len_psi_k_1 * len_psi_k_2);
+#endif
+        }
+        else
+        {
+            // Synchronize data from CPU to Device
+            syncmem_complex_h2d_op()(psi_k_tensor.data<std::complex<double>>(),
+                                     psi[0].get_pointer(),
+                                     len_psi_k_1 * len_psi_k_2);
+            syncmem_complex_h2d_op()(psi_k_laststep_tensor.data<std::complex<double>>(),
+                                     psi_laststep[0].get_pointer(),
+                                     len_psi_k_1 * len_psi_k_2);
+        }
+
+        syncmem_complex_h2d_op()(H_laststep_tensor.data<std::complex<double>>(),
                                  Hk_laststep[ik],
-                                 para_orb.nloc);
-        syncmem_complex_h2d_op()(ctx,
-                                 cpu_ctx,
-                                 S_laststep_tensor.data<std::complex<double>>(),
+                                 len_HS_laststep);
+        syncmem_complex_h2d_op()(S_laststep_tensor.data<std::complex<double>>(),
                                  Sk_laststep[ik],
-                                 para_orb.nloc);
-        syncmem_double_h2d_op()(ctx, cpu_ctx, ekb_tensor.data<double>(), &(ekb(ik, 0)), nband);
+                                 len_HS_laststep);
+        syncmem_double_h2d_op()(ekb_tensor.data<double>(), &(ekb(ik, 0)), nband);
 
         evolve_psi_tensor<Device>(nband,
                                   nlocal,
```
```diff
@@ -151,28 +163,40 @@ void Evolve_elec<Device>::solve_psi(const int& istep,
                                   print_matrix,
                                   use_lapack);
 
-        // Syncronize data from Device to CPU
-        syncmem_complex_d2h_op()(cpu_ctx,
-                                 ctx,
-                                 psi[0].get_pointer(),
-                                 psi_k_tensor.data<std::complex<double>>(),
-                                 psi->get_nbands() * psi->get_nbasis());
-        syncmem_complex_d2h_op()(cpu_ctx,
-                                 ctx,
-                                 psi_laststep[0].get_pointer(),
-                                 psi_k_laststep_tensor.data<std::complex<double>>(),
-                                 psi->get_nbands() * psi->get_nbasis());
-        syncmem_complex_d2h_op()(cpu_ctx,
-                                 ctx,
-                                 Hk_laststep[ik],
+        // Need to distribute global psi back to all processes
+        if (use_lapack)
+        {
+#ifdef __MPI
+            // Synchronize data from Device to CPU
+            syncmem_complex_d2h_op()(psi_g.p.get(),
+                                     psi_k_tensor.data<std::complex<double>>(),
+                                     len_psi_k_1 * len_psi_k_2);
+            syncmem_complex_d2h_op()(psi_laststep_g.p.get(),
+                                     psi_k_laststep_tensor.data<std::complex<double>>(),
+                                     len_psi_k_1 * len_psi_k_2);
+
+            // Distribute psi to all processes
+            distributePsi(para_orb, psi[0].get_pointer(), psi_g);
+            distributePsi(para_orb, psi_laststep[0].get_pointer(), psi_laststep_g);
+#endif
+        }
+        else
+        {
+            // Synchronize data from Device to CPU
+            syncmem_complex_d2h_op()(psi[0].get_pointer(),
+                                     psi_k_tensor.data<std::complex<double>>(),
+                                     len_psi_k_1 * len_psi_k_2);
+            syncmem_complex_d2h_op()(psi_laststep[0].get_pointer(),
+                                     psi_k_laststep_tensor.data<std::complex<double>>(),
+                                     len_psi_k_1 * len_psi_k_2);
+        }
+        syncmem_complex_d2h_op()(Hk_laststep[ik],
                                  H_laststep_tensor.data<std::complex<double>>(),
-                                 para_orb.nloc);
-        syncmem_complex_d2h_op()(cpu_ctx,
-                                 ctx,
-                                 Sk_laststep[ik],
+                                 len_HS_laststep);
+        syncmem_complex_d2h_op()(Sk_laststep[ik],
                                  S_laststep_tensor.data<std::complex<double>>(),
-                                 para_orb.nloc);
-        syncmem_double_d2h_op()(cpu_ctx, ctx, &(ekb(ik, 0)), ekb_tensor.data<double>(), nband);
+                                 len_HS_laststep);
+        syncmem_double_d2h_op()(&(ekb(ik, 0)), ekb_tensor.data<double>(), nband);
 
         // std::cout << "Print ekb tensor: " << std::endl;
         // ekb.print(std::cout);
```
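`gatherPsi` and `distributePsi` are not shown in this hunk (they live in another file of this commit); presumably `distributePsi` is the inverse of `gatherMatrix`, i.e. a `Cpxgemr2d` redistribution from the one-block global descriptor back to the block-cyclic layout in `para_orb`. A hypothetical sketch under that assumption (`desc_wfc` as the wavefunction descriptor is also an assumption):

```cpp
#ifdef __MPI
// Sketch only -- not the committed implementation. Scatters the gathered
// global psi back into each rank's block-cyclic panel by swapping the
// source/target descriptors of the gather.
template <typename T>
void distributePsi_sketch(const Parallel_Orbitals& para_orb,
                          T* psi_local, // this rank's panel, written in place
                          const ModuleESolver::Matrix_g<T>& psi_g)
{
    const int* desc_local = para_orb.desc_wfc; // assumed descriptor field
    const int ctxt = desc_local[1];            // shared BLACS context

    Cpxgemr2d(psi_g.row, psi_g.col,
              psi_g.p.get(), 1, 1, psi_g.desc.get(),         // source: global copy
              psi_local, 1, 1, const_cast<int*>(desc_local), // target: 2D panel
              ctxt);
}
#endif
```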
