deepmodeling
diff --git a/‎source/module_base/lapack_connector.h‎
Lines changed: 2 additions & 2 deletions b/‎source/module_base/lapack_connector.h‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎source/module_base/module_container/ATen/kernels/lapack.cpp‎
Lines changed: 45 additions & 0 deletions b/‎source/module_base/module_container/ATen/kernels/lapack.cpp‎
Lines changed: 45 additions & 0 deletions
diff --git a/‎source/module_base/module_container/ATen/kernels/lapack.h‎
Lines changed: 26 additions & 0 deletions b/‎source/module_base/module_container/ATen/kernels/lapack.h‎
Lines changed: 26 additions & 0 deletions
diff --git a/‎source/module_base/module_container/base/third_party/lapack.h‎
Lines changed: 51 additions & 0 deletions b/‎source/module_base/module_container/base/third_party/lapack.h‎
Lines changed: 51 additions & 0 deletions
diff --git a/‎source/module_hamilt_lcao/module_tddft/bandenergy.cpp‎
Lines changed: 89 additions & 0 deletions b/‎source/module_hamilt_lcao/module_tddft/bandenergy.cpp‎
Lines changed: 89 additions & 0 deletions
diff --git a/‎source/module_hamilt_lcao/module_tddft/bandenergy.h‎
Lines changed: 7 additions & 0 deletions b/‎source/module_hamilt_lcao/module_tddft/bandenergy.h‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎source/module_hamilt_lcao/module_tddft/evolve_elec.cpp‎
Lines changed: 13 additions & 16 deletions b/‎source/module_hamilt_lcao/module_tddft/evolve_elec.cpp‎
Lines changed: 13 additions & 16 deletions
@@ -133,8 +133,8 @@ extern "C"
 
     // zgetrf computes the LU factorization of a general matrix
     // while zgetri takes its output to perform matrix inversion
-    void zgetrf_(const int* m, const int *n, const std::complex<double> *A, const int *lda, int *ipiv, const int* info);
-    void zgetri_(const int* n, std::complex<double> *A, const int *lda, int *ipiv, std::complex<double> *work, int *lwork, const int *info);
+    void zgetrf_(const int* m, const int *n, std::complex<double> *A, const int *lda, int *ipiv, int* info);
+    void zgetri_(const int* n, std::complex<double>* A, const int* lda, const int* ipiv, std::complex<double>* work, const int* lwork, int* info);
 
     // if trans=='N':	C = alpha * A * A.H + beta * C
 	// if trans=='C':	C = alpha * A.H * A + beta * C
 
@@ -124,6 +124,41 @@ struct lapack_dngvd<T, DEVICE_CPU> {
     }
 };
 
+template <typename T> // CPU specialization of the getrf functor (LU factorization, per LAPACK ?GETRF)
+struct lapack_getrf<T, DEVICE_CPU> {
+    void operator()(
+        const int& m, // forwarded unchanged to lapackConnector::getrf (row count in LAPACK terms)
+        const int& n, // forwarded unchanged to lapackConnector::getrf (column count in LAPACK terms)
+        T* Mat, // matrix buffer handed to getrf; LAPACK overwrites it in place with the factors
+        const int& lda, // leading dimension of Mat, forwarded unchanged
+        int* ipiv, // pivot-index output buffer; LAPACK documents min(m, n) entries as the required size
+        int& info) // receives the status code written by getrf
+    {
+        lapackConnector::getrf(m, n, Mat, lda, ipiv, info);
+        if (info != 0) { // any non-zero status is escalated to an exception carrying the code
+            throw std::runtime_error("getrf failed with info = " + std::to_string(info));
+        }
+    }
+};
+
+template <typename T> // CPU specialization of the getri functor (inverse from an LU factorization, per LAPACK ?GETRI)
+struct lapack_getri<T, DEVICE_CPU> {
+    void operator()(
+        const int& n, // matrix order, forwarded unchanged to lapackConnector::getri
+        T* Mat, // buffer handed to getri; expected to hold the output of a previous getrf call
+        const int& lda, // leading dimension of Mat, forwarded unchanged
+        const int* ipiv, // pivot indices, read-only here; expected to come from the matching getrf call
+        T* work, // caller-provided workspace forwarded to getri
+        const int& lwork, // length of work; NOTE(review): LAPACK treats lwork == -1 as a size query - confirm callers
+        int& info) // receives the status code written by getri
+    {
+        lapackConnector::getri(n, Mat, lda, ipiv, work, lwork, info);
+        if (info != 0) { // any non-zero status is escalated to an exception carrying the code
+            throw std::runtime_error("getri failed with info = " + std::to_string(info));
+        }
+    }
+};
+
 template struct set_matrix<float,  DEVICE_CPU>;
 template struct set_matrix<double, DEVICE_CPU>;
 template struct set_matrix<std::complex<float>,  DEVICE_CPU>;
@@ -149,5 +184,15 @@ template struct lapack_dngvd<double, DEVICE_CPU>;
 template struct lapack_dngvd<std::complex<float>,  DEVICE_CPU>;
 template struct lapack_dngvd<std::complex<double>, DEVICE_CPU>;
 
+template struct lapack_getrf<float,  DEVICE_CPU>; // explicit CPU instantiations of lapack_getrf for the four scalar types
+template struct lapack_getrf<double, DEVICE_CPU>;
+template struct lapack_getrf<std::complex<float>,  DEVICE_CPU>;
+template struct lapack_getrf<std::complex<double>, DEVICE_CPU>;
+
+template struct lapack_getri<float, DEVICE_CPU>; // explicit CPU instantiations of lapack_getri for the same four types
+template struct lapack_getri<double, DEVICE_CPU>;
+template struct lapack_getri<std::complex<float>, DEVICE_CPU>;
+template struct lapack_getri<std::complex<double>, DEVICE_CPU>;
+
 } // namespace kernels
 } // namespace container
@@ -65,6 +65,32 @@ struct lapack_dngvd {
         Real* eigen_val);
 };
 
+
+template <typename T, typename Device> // primary template: declares the getrf functor interface only
+struct lapack_getrf {
+    void operator()( // declared without a body here; the DEVICE_CPU specialization provides the definition
+        const int& m, // first matrix dimension passed to getrf
+        const int& n, // second matrix dimension passed to getrf
+        T* Mat, // matrix buffer, modified by the call
+        const int& lda, // leading dimension of Mat
+        int* ipiv, // pivot-index output buffer
+        int& info); // status code output
+};
+
+
+template <typename T, typename Device> // primary template: declares the getri functor interface only
+struct lapack_getri {
+    void operator()( // declared without a body here; the DEVICE_CPU specialization provides the definition
+        const int& n, // matrix order passed to getri
+        T* Mat, // matrix buffer, modified by the call
+        const int& lda, // leading dimension of Mat
+        const int* ipiv, // pivot indices, read-only
+        T* work, // caller-provided workspace
+        const int& lwork, // length of work
+        int& info); // status code output
+};
+
+
 #if defined(__CUDA) || defined(__ROCM)
 // TODO: Use C++ singleton to manage the GPU handles
 void createGpuSolverHandle();  // create cusolver handle
 
@@ -105,6 +105,15 @@ void dtrtri_(const char* uplo, const char* diag, const int* n, double* a, const
 void ctrtri_(const char* uplo, const char* diag, const int* n, std::complex<float>* a, const int* lda, int* info);
 void ztrtri_(const char* uplo, const char* diag, const int* n, std::complex<double>* a, const int* lda, int* info);
 
+void sgetrf_(const int* m, const int* n, float* a, const int* lda, int* ipiv, int* info); // Fortran LAPACK ?getrf_ prototypes (s/d/c/z); every argument is passed by pointer
+void dgetrf_(const int* m, const int* n, double* a, const int* lda, int* ipiv, int* info);
+void cgetrf_(const int* m, const int* n, std::complex<float>* a, const int* lda, int* ipiv, int* info);
+void zgetrf_(const int* m, const int* n, std::complex<double>* a, const int* lda, int* ipiv, int* info);
+
+void sgetri_(const int* n, float* A, const int* lda, const int* ipiv, float* work, const int* lwork, int* info); // Fortran LAPACK ?getri_ prototypes (s/d/c/z); ipiv and lwork are inputs, A/work/info are written
+void dgetri_(const int* n, double* A, const int* lda, const int* ipiv, double* work, const int* lwork, int* info);
+void cgetri_(const int* n, std::complex<float>* A, const int* lda, const int* ipiv, std::complex<float>* work, const int* lwork, int* info);
+void zgetri_(const int* n, std::complex<double>* A, const int* lda, const int* ipiv, std::complex<double>* work, const int* lwork, int* info);
 }
 
 // Class LapackConnector provide the connector to fortran lapack routine.
@@ -321,6 +330,48 @@ void trtri( const char &uplo, const char &diag, const int &n, std::complex<doubl
     ztrtri_( &uplo, &diag, &n, A, &lda, &info);
 }
 
+static inline // getrf overload set: picks the LAPACK routine from the element type of A
+void getrf(const int &m, const int &n, float* A, const int &lda, int* ipiv, int &info)
+{
+    sgetrf_(&m, &n, A, &lda, ipiv, &info); // float -> sgetrf_; scalars are passed by address as the Fortran ABI expects
+}
+static inline
+void getrf(const int &m, const int &n, double* A, const int &lda, int* ipiv, int &info)
+{
+    dgetrf_(&m, &n, A, &lda, ipiv, &info); // double -> dgetrf_
+}
+static inline
+void getrf(const int &m, const int &n, std::complex<float>* A, const int &lda, int* ipiv, int &info)
+{
+    cgetrf_(&m, &n, A, &lda, ipiv, &info); // std::complex<float> -> cgetrf_
+}
+static inline
+void getrf(const int &m, const int &n, std::complex<double>* A, const int &lda, int* ipiv, int &info)
+{
+    zgetrf_(&m, &n, A, &lda, ipiv, &info); // std::complex<double> -> zgetrf_
+}
+
+static inline // getri overload set: picks the LAPACK routine from the element type of A and work
+void getri(const int& n, float* A, const int& lda, const int* ipiv, float* work, const int& lwork, int& info)
+{
+    sgetri_(&n, A, &lda, ipiv, work, &lwork, &info); // float -> sgetri_; scalars are passed by address as the Fortran ABI expects
+}
+static inline
+void getri(const int& n, double* A, const int& lda, const int* ipiv, double* work, const int& lwork, int& info)
+{
+    dgetri_(&n, A, &lda, ipiv, work, &lwork, &info); // double -> dgetri_
+}
+static inline
+void getri(const int& n, std::complex<float>* A, const int& lda, const int* ipiv, std::complex<float>* work, const int& lwork, int& info)
+{
+    cgetri_(&n, A, &lda, ipiv, work, &lwork, &info); // std::complex<float> -> cgetri_
+}
+static inline
+void getri(const int& n, std::complex<double>* A, const int& lda, const int* ipiv, std::complex<double>* work, const int& lwork, int& info)
+{
+    zgetri_(&n, A, &lda, ipiv, work, &lwork, &info); // std::complex<double> -> zgetri_
+}
+
 } // namespace lapackConnector
 } // namespace container
 
 
@@ -2,6 +2,7 @@
 
 #include "evolve_elec.h"
 #include "module_base/lapack_connector.h"
+#include "module_base/module_container/ATen/kernels/blas.h"
 #include "module_base/scalapack_connector.h"
 
 #include <complex>
@@ -271,6 +272,94 @@ void compute_ekb_tensor(const Parallel_Orbitals* pv,
     info = MPI_Allreduce(Eii.data<double>(), ekb.data<double>(), nband, MPI_DOUBLE, MPI_SUM, pv->comm());
 }
 
+/**
+ * @brief Compute the band energies ekb[i] = Re( (psi_k^dagger * Htmp * psi_k)_{ii} ) with
+ *        process-local BLAS calls instead of the ScaLAPACK path used by compute_ekb_tensor.
+ *
+ * Two complex gemm calls form tmp1 = Htmp * psi_k and Eij = psi_k^dagger * tmp1; the real part of
+ * the diagonal of Eij is then written to ekb. Every matrix is addressed with leading dimension
+ * nlocal.
+ *
+ * @param pv      Supplies the scratch-buffer sizes (nloc_wfc, nloc) and the bounds/stride of the
+ *                optional Eij dump (nrow_bands, ncol_bands, ncol).
+ * @param nband   Number of bands, i.e. columns of psi_k and entries written to ekb.
+ * @param nlocal  Number of basis functions; order of Htmp and leading dimension of all matrices.
+ * @param Htmp    Complex-double Hamiltonian, read as nlocal x nlocal.
+ * @param psi_k   Complex-double wavefunction coefficients, read as nlocal x nband.
+ * @param ekb     Output; its first nband doubles are overwritten.
+ *
+ * NOTE(review): the scratch buffers are sized from pv, while the gemm calls write nlocal * nband
+ * entries into tmp1 and address Eij with stride nlocal. This presumably relies on pv describing an
+ * undistributed (single-process) layout, so that nloc_wfc == nlocal * nband and
+ * nloc == nlocal * nlocal - confirm against the caller. Only an nband x nband block of Eij is
+ * actually filled.
+ */
+void compute_ekb_tensor_lapack(const Parallel_Orbitals* pv,
+                               const int nband,
+                               const int nlocal,
+                               const container::Tensor& Htmp,
+                               const container::Tensor& psi_k,
+                               container::Tensor& ekb)
+{
+    using Complex = std::complex<double>;
+    using Gemm = container::kernels::blas_gemm<Complex, container::DEVICE_CPU>;
+
+    // Scratch for tmp1 = Htmp * psi_k (nlocal x nband entries are written by the first gemm).
+    container::Tensor tmp1(container::DataType::DT_COMPLEX_DOUBLE,
+                           container::DeviceType::CpuDevice,
+                           container::TensorShape({pv->nloc_wfc}));
+    tmp1.zero();
+
+    // Scratch for Eij = psi_k^dagger * tmp1. It is allocated with pv->nloc elements because the
+    // second gemm stores its nband x nband result with leading dimension nlocal.
+    container::Tensor Eij(container::DataType::DT_COMPLEX_DOUBLE,
+                          container::DeviceType::CpuDevice,
+                          container::TensorShape({pv->nloc}));
+    Eij.zero();
+
+    std::complex<double> alpha = {1.0, 0.0};
+    std::complex<double> beta = {0.0, 0.0};
+
+    // tmp1 = Htmp * psi_k
+    Gemm()('N',
+           'N',
+           nlocal,
+           nband,
+           nlocal,
+           &alpha,
+           Htmp.data<Complex>(),
+           nlocal, // Leading dimension of Htmp
+           psi_k.data<Complex>(),
+           nlocal, // Leading dimension of psi_k
+           &beta,
+           tmp1.data<Complex>(),
+           nlocal); // Leading dimension of tmp1
+
+    // Eij = psi_k^dagger * tmp1
+    Gemm()('C',
+           'N',
+           nband,
+           nband,
+           nlocal,
+           &alpha,
+           psi_k.data<Complex>(),
+           nlocal, // Leading dimension of psi_k
+           tmp1.data<Complex>(),
+           nlocal, // Leading dimension of tmp1
+           &beta,
+           Eij.data<Complex>(),
+           nlocal); // Leading dimension of Eij
+
+    // Fetch the raw pointer once instead of on every element access.
+    const Complex* const eij = Eij.data<Complex>();
+
+    // A non-negative td_print_eij enables the dump and doubles as the magnitude threshold.
+    if (Evolve_elec::td_print_eij >= 0.0)
+    {
+        GlobalV::ofs_running
+            << "------------------------------------------------------------------------------------------------"
+            << std::endl;
+        GlobalV::ofs_running << " Eij:" << std::endl;
+        for (int i = 0; i < pv->nrow_bands; i++)
+        {
+            for (int j = 0; j < pv->ncol_bands; j++)
+            {
+                // NOTE(review): the stride here is pv->ncol whereas the gemm above used nlocal;
+                // the two presumably coincide for the layouts this function is called with.
+                const Complex& element = eij[i * pv->ncol + j];
+                double aa = element.real();
+                double bb = element.imag();
+                // Parts whose magnitude is below the threshold are reported as exactly zero.
+                if (std::abs(aa) < Evolve_elec::td_print_eij)
+                    aa = 0.0;
+                if (std::abs(bb) < Evolve_elec::td_print_eij)
+                    bb = 0.0;
+                // Print every element with a surviving part. The previous test (aa > 0.0 ||
+                // bb > 0.0) silently skipped elements whose surviving parts were negative.
+                if (aa != 0.0 || bb != 0.0)
+                {
+                    GlobalV::ofs_running << i << " " << j << " " << aa << "+" << bb << "i " << std::endl;
+                }
+            }
+        }
+        GlobalV::ofs_running << std::endl;
+        GlobalV::ofs_running
+            << "------------------------------------------------------------------------------------------------"
+            << std::endl;
+    }
+
+    // Band energies are the real parts of the diagonal of Eij (stride nlocal + 1).
+    double* const ekb_out = ekb.data<double>();
+    for (int i = 0; i < nband; ++i)
+    {
+        ekb_out[i] = eij[i * nlocal + i].real();
+    }
+}
+
 #endif
 
 } // namespace module_tddft
@@ -37,6 +37,13 @@ void compute_ekb_tensor(const Parallel_Orbitals* pv,
                         const container::Tensor& Htmp,
                         const container::Tensor& psi_k,
                         container::Tensor& ekb);
+
+void compute_ekb_tensor_lapack(const Parallel_Orbitals* pv, // declaration; same parameter list as compute_ekb_tensor above
+                               const int nband,
+                               const int nlocal,
+                               const container::Tensor& Htmp,
+                               const container::Tensor& psi_k,
+                               container::Tensor& ekb); // output tensor receiving the band energies
 #endif
 } // namespace module_tddft
 #endif
@@ -39,6 +39,11 @@ void Evolve_elec::solve_psi(const int& istep,
     ModuleBase::TITLE("Evolve_elec", "solve_psi");
     ModuleBase::timer::tick("Evolve_elec", "solve_psi");
 
+    const int print_matrix = 0;
+    // const bool use_tensor = true;
+    const bool use_tensor = false;
+    const bool use_lapack = true;
+
     for (int ik = 0; ik < nks; ik++)
     {
         phm->updateHk(ik);
@@ -58,12 +63,11 @@ void Evolve_elec::solve_psi(const int& istep,
                        nullptr,
                        &(ekb(ik, 0)),
                        htype,
-                       propagator);
+                       propagator,
+                       print_matrix);
         }
         else if (htype == 1)
         {
-            // const bool use_tensor = true;
-            const bool use_tensor = false;
             if (!use_tensor)
             {
                 evolve_psi(nband,
@@ -76,7 +80,8 @@ void Evolve_elec::solve_psi(const int& istep,
                            Sk_laststep[ik],
                            &(ekb(ik, 0)),
                            htype,
-                           propagator);
+                           propagator,
+                           print_matrix);
                 // std::cout << "Print ekb: " << std::endl;
                 // ekb.print(std::cout);
             }
@@ -122,18 +127,10 @@ void Evolve_elec::solve_psi(const int& istep,
                                   S_laststep_tensor,
                                   ekb_tensor,
                                   htype,
-                                  propagator);
-                // evolve_psi_tensor(nband,
-                //                   nlocal,
-                //                   &(para_orb),
-                //                   phm,
-                //                   psi[0].get_pointer(),
-                //                   psi_laststep[0].get_pointer(),
-                //                   Hk_laststep[ik],
-                //                   Sk_laststep[ik],
-                //                   &(ekb(ik, 0)),
-                //                   htype,
-                //                   propagator);
+                                  propagator,
+                                  print_matrix,
+                                  use_lapack);
+
                 // std::cout << "Print ekb tensor: " << std::endl;
                 // ekb.print(std::cout);