diff --git a/source/Makefile.Objects b/source/Makefile.Objects
index ad13d75976..9e3c736b89 100644
--- a/source/Makefile.Objects
+++ b/source/Makefile.Objects
@@ -557,10 +557,13 @@ OBJS_IO_LCAO=cal_r_overlap_R.o\
 
 OBJS_LCAO=evolve_elec.o\
       evolve_psi.o\
-      bandenergy.o\
+      band_energy.o\
       middle_hamilt.o\
       norm_psi.o\
       propagator.o\
+      propagator_cn2.o\
+      propagator_taylor.o\
+      propagator_etrs.o\
       td_velocity.o\
       td_current.o\
       snap_psibeta_half_tddft.o\
diff --git a/source/module_base/lapack_connector.h b/source/module_base/lapack_connector.h
index d4a4b646ff..29018d4db4 100644
--- a/source/module_base/lapack_connector.h
+++ b/source/module_base/lapack_connector.h
@@ -133,8 +133,8 @@ extern "C"
 
     // zgetrf computes the LU factorization of a general matrix
     // while zgetri takes its output to perform matrix inversion
-    void zgetrf_(const int* m, const int *n, const std::complex<double> *A, const int *lda, int *ipiv, const int* info);
-    void zgetri_(const int* n, std::complex<double> *A, const int *lda, int *ipiv, std::complex<double> *work, int *lwork, const int *info);
+    void zgetrf_(const int* m, const int *n, std::complex<double> *A, const int *lda, int *ipiv, int* info);
+    void zgetri_(const int* n, std::complex<double>* A, const int* lda, const int* ipiv, std::complex<double>* work, const int* lwork, int* info);
 
     // if trans=='N':	C = alpha * A * A.H + beta * C
 	// if trans=='C':	C = alpha * A.H * A + beta * C
diff --git a/source/module_base/module_container/ATen/kernels/cuda/lapack.cu b/source/module_base/module_container/ATen/kernels/cuda/lapack.cu
index 48caa4be7e..6300df953f 100644
--- a/source/module_base/module_container/ATen/kernels/cuda/lapack.cu
+++ b/source/module_base/module_container/ATen/kernels/cuda/lapack.cu
@@ -117,6 +117,49 @@ struct lapack_dngvd<T, DEVICE_GPU> {
     }
 };
 
+template <typename T>
+struct lapack_getrf<T, DEVICE_GPU> {
+    void operator()(
+        const int& m,
+        const int& n,
+        T* Mat,
+        const int& lda,
+        int* ipiv)
+    {
+        cuSolverConnector::getrf(cusolver_handle, m, n, Mat, lda, ipiv);
+    }
+};
+
+template <typename T>
+struct lapack_getri<T, DEVICE_GPU> {
+    void operator()(
+        const int& n,
+        T* Mat,
+        const int& lda,
+        const int* ipiv,
+        T* work,
+        const int& lwork)
+    {
+        throw std::runtime_error("cuSOLVER does not provide LU-based matrix inversion interface (getri). To compute the inverse on GPU, use getrs instead.");
+    }
+};
+
+template <typename T>
+struct lapack_getrs<T, DEVICE_GPU> {
+    void operator()(
+        const char& trans,
+        const int& n,
+        const int& nrhs,
+        T* A,
+        const int& lda,
+        const int* ipiv,
+        T* B,
+        const int& ldb)
+    {
+        cuSolverConnector::getrs(cusolver_handle, trans, n, nrhs, A, lda, ipiv, B, ldb);
+    }
+};
+
 template struct set_matrix<float,  DEVICE_GPU>;
 template struct set_matrix<double, DEVICE_GPU>;
 template struct set_matrix<std::complex<float>,  DEVICE_GPU>;
@@ -142,5 +185,20 @@ template struct lapack_dngvd<double, DEVICE_GPU>;
 template struct lapack_dngvd<std::complex<float>,  DEVICE_GPU>;
 template struct lapack_dngvd<std::complex<double>, DEVICE_GPU>;
 
+template struct lapack_getrf<float,  DEVICE_GPU>;
+template struct lapack_getrf<double, DEVICE_GPU>;
+template struct lapack_getrf<std::complex<float>,  DEVICE_GPU>;
+template struct lapack_getrf<std::complex<double>, DEVICE_GPU>;
+
+template struct lapack_getri<float,  DEVICE_GPU>;
+template struct lapack_getri<double, DEVICE_GPU>;
+template struct lapack_getri<std::complex<float>,  DEVICE_GPU>;
+template struct lapack_getri<std::complex<double>, DEVICE_GPU>;
+
+template struct lapack_getrs<float,  DEVICE_GPU>;
+template struct lapack_getrs<double, DEVICE_GPU>;
+template struct lapack_getrs<std::complex<float>,  DEVICE_GPU>;
+template struct lapack_getrs<std::complex<double>, DEVICE_GPU>;
+
 } // namespace kernels
 } // namespace container
\ No newline at end of file
diff --git a/source/module_base/module_container/ATen/kernels/lapack.cpp b/source/module_base/module_container/ATen/kernels/lapack.cpp
index 436b552e1c..2369306309 100644
--- a/source/module_base/module_container/ATen/kernels/lapack.cpp
+++ b/source/module_base/module_container/ATen/kernels/lapack.cpp
@@ -124,6 +124,61 @@ struct lapack_dngvd<T, DEVICE_CPU> {
     }
 };
 
+template <typename T>
+struct lapack_getrf<T, DEVICE_CPU> {
+    void operator()(
+        const int& m,
+        const int& n,
+        T* Mat,
+        const int& lda,
+        int* ipiv)
+    {
+        int info = 0;
+        lapackConnector::getrf(m, n, Mat, lda, ipiv, info);
+        if (info != 0) {
+            throw std::runtime_error("getrf failed with info = " + std::to_string(info));
+        }
+    }
+};
+
+template <typename T>
+struct lapack_getri<T, DEVICE_CPU> {
+    void operator()(
+        const int& n,
+        T* Mat,
+        const int& lda,
+        const int* ipiv,
+        T* work,
+        const int& lwork)
+    {
+        int info = 0;
+        lapackConnector::getri(n, Mat, lda, ipiv, work, lwork, info);
+        if (info != 0) {
+            throw std::runtime_error("getri failed with info = " + std::to_string(info));
+        }
+    }
+};
+
+template <typename T>
+struct lapack_getrs<T, DEVICE_CPU> {
+    void operator()(
+        const char& trans,
+        const int& n,
+        const int& nrhs,
+        T* A,
+        const int& lda,
+        const int* ipiv,
+        T* B,
+        const int& ldb)
+    {
+        int info = 0;
+        lapackConnector::getrs(trans, n, nrhs, A, lda, ipiv, B, ldb, info);
+        if (info != 0) {
+            throw std::runtime_error("getrs failed with info = " + std::to_string(info));
+        }
+    }
+};
+
 template struct set_matrix<float,  DEVICE_CPU>;
 template struct set_matrix<double, DEVICE_CPU>;
 template struct set_matrix<std::complex<float>,  DEVICE_CPU>;
@@ -149,5 +204,20 @@ template struct lapack_dngvd<double, DEVICE_CPU>;
 template struct lapack_dngvd<std::complex<float>,  DEVICE_CPU>;
 template struct lapack_dngvd<std::complex<double>, DEVICE_CPU>;
 
+template struct lapack_getrf<float,  DEVICE_CPU>;
+template struct lapack_getrf<double, DEVICE_CPU>;
+template struct lapack_getrf<std::complex<float>,  DEVICE_CPU>;
+template struct lapack_getrf<std::complex<double>, DEVICE_CPU>;
+
+template struct lapack_getri<float, DEVICE_CPU>;
+template struct lapack_getri<double, DEVICE_CPU>;
+template struct lapack_getri<std::complex<float>, DEVICE_CPU>;
+template struct lapack_getri<std::complex<double>, DEVICE_CPU>;
+
+template struct lapack_getrs<float, DEVICE_CPU>;
+template struct lapack_getrs<double, DEVICE_CPU>;
+template struct lapack_getrs<std::complex<float>, DEVICE_CPU>;
+template struct lapack_getrs<std::complex<double>, DEVICE_CPU>;
+
 } // namespace kernels
 } // namespace container
\ No newline at end of file
diff --git a/source/module_base/module_container/ATen/kernels/lapack.h b/source/module_base/module_container/ATen/kernels/lapack.h
index 561eac2fb1..cf164dec10 100644
--- a/source/module_base/module_container/ATen/kernels/lapack.h
+++ b/source/module_base/module_container/ATen/kernels/lapack.h
@@ -65,6 +65,42 @@ struct lapack_dngvd {
         Real* eigen_val);
 };
 
+
+template <typename T, typename Device>
+struct lapack_getrf {
+    void operator()(
+        const int& m,
+        const int& n,
+        T* Mat,
+        const int& lda,
+        int* ipiv);
+};
+
+
+template <typename T, typename Device>
+struct lapack_getri {
+    void operator()(
+        const int& n,
+        T* Mat,
+        const int& lda,
+        const int* ipiv,
+        T* work,
+        const int& lwork);
+};
+
+template <typename T, typename Device>
+struct lapack_getrs {
+    void operator()(
+        const char& trans,
+        const int& n,
+        const int& nrhs,
+        T* A,
+        const int& lda,
+        const int* ipiv,
+        T* B,
+        const int& ldb);
+};
+
 #if defined(__CUDA) || defined(__ROCM)
 // TODO: Use C++ singleton to manage the GPU handles
 void createGpuSolverHandle();  // create cusolver handle
diff --git a/source/module_base/module_container/base/third_party/cusolver.h b/source/module_base/module_container/base/third_party/cusolver.h
index 2ad1e2a427..defd60a121 100644
--- a/source/module_base/module_container/base/third_party/cusolver.h
+++ b/source/module_base/module_container/base/third_party/cusolver.h
@@ -132,7 +132,7 @@ void potrf (cusolverDnHandle_t& cusolver_handle, const char& uplo, const int& n,
 static inline
 void dnevd (cusolverDnHandle_t& cusolver_handle, const char& jobz, const char& uplo, const int& n, float* A, const int& lda, float * W)
 {
-    // prepare some values for cusolverDnZhegvd_bufferSize
+    // prepare some values for cusolverDnSsyevd_bufferSize
     int lwork  = 0; 
     int h_info = 0; 
     int*   d_info = nullptr;
@@ -142,7 +142,7 @@ void dnevd (cusolverDnHandle_t& cusolver_handle, const char& jobz, const char& u
     // calculate the sizes needed for pre-allocated buffer.
     cusolverErrcheck(cusolverDnSsyevd_bufferSize(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo), 
                                 n, A, lda, W, &lwork));
-    // allocate memery
+    // allocate memory
     cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(float) * lwork));
     // compute eigenvalues and eigenvectors.
     cusolverErrcheck(cusolverDnSsyevd(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
@@ -158,7 +158,7 @@ void dnevd (cusolverDnHandle_t& cusolver_handle, const char& jobz, const char& u
 static inline
 void dnevd (cusolverDnHandle_t& cusolver_handle, const char& jobz, const char& uplo, const int& n, double* A, const int& lda, double * W)
 {
-    // prepare some values for cusolverDnZhegvd_bufferSize
+    // prepare some values for cusolverDnDsyevd_bufferSize
     int lwork  = 0; 
     int h_info = 0; 
     int*    d_info = nullptr;
@@ -168,7 +168,7 @@ void dnevd (cusolverDnHandle_t& cusolver_handle, const char& jobz, const char& u
     // calculate the sizes needed for pre-allocated buffer.
     cusolverErrcheck(cusolverDnDsyevd_bufferSize(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo), 
                                 n, A, lda, W, &lwork));
-    // allocate memery
+    // allocate memory
     cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(double) * lwork));
     // compute eigenvalues and eigenvectors.
     cusolverErrcheck(cusolverDnDsyevd(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
@@ -184,7 +184,7 @@ void dnevd (cusolverDnHandle_t& cusolver_handle, const char& jobz, const char& u
 static inline
 void dnevd (cusolverDnHandle_t& cusolver_handle, const char& jobz, const char& uplo, const int& n, std::complex<float>* A, const int& lda, float * W)
 {
-    // prepare some values for cusolverDnZhegvd_bufferSize
+    // prepare some values for cusolverDnCheevd_bufferSize
     int lwork  = 0; 
     int h_info = 0; 
     int*    d_info = nullptr;
@@ -194,7 +194,7 @@ void dnevd (cusolverDnHandle_t& cusolver_handle, const char& jobz, const char& u
     // calculate the sizes needed for pre-allocated buffer.
     cusolverErrcheck(cusolverDnCheevd_bufferSize(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo), 
                                 n, reinterpret_cast<cuComplex*>(A), lda, W, &lwork));
-    // allocate memery
+    // allocate memory
     cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuComplex) * lwork));
     // compute eigenvalues and eigenvectors.
     cusolverErrcheck(cusolverDnCheevd(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
@@ -210,7 +210,7 @@ void dnevd (cusolverDnHandle_t& cusolver_handle, const char& jobz, const char& u
 static inline
 void dnevd (cusolverDnHandle_t& cusolver_handle, const char& jobz, const char& uplo, const int& n, std::complex<double>* A, const int& lda, double* W)
 {
-    // prepare some values for cusolverDnZhegvd_bufferSize
+    // prepare some values for cusolverDnZheevd_bufferSize
     int lwork  = 0; 
     int h_info = 0; 
     int*    d_info = nullptr;
@@ -220,7 +220,7 @@ void dnevd (cusolverDnHandle_t& cusolver_handle, const char& jobz, const char& u
     // calculate the sizes needed for pre-allocated buffer.
     cusolverErrcheck(cusolverDnZheevd_bufferSize(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo), 
                                 n, reinterpret_cast<cuDoubleComplex*>(A), lda, W, &lwork));
-    // allocate memery
+    // allocate memory
     cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuDoubleComplex) * lwork));
     // compute eigenvalues and eigenvectors.
     cusolverErrcheck(cusolverDnZheevd(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
@@ -237,7 +237,7 @@ void dnevd (cusolverDnHandle_t& cusolver_handle, const char& jobz, const char& u
 static inline
 void dngvd (cusolverDnHandle_t& cusolver_handle, const int& itype, const char& jobz, const char& uplo, const int& n, float* A, const int& lda, float* B, const int& ldb, float * W)
 {
-    // prepare some values for cusolverDnZhegvd_bufferSize
+    // prepare some values for cusolverDnSsygvd_bufferSize
     int lwork  = 0; 
     int h_info = 0; 
     int*   d_info = nullptr;
@@ -247,7 +247,7 @@ void dngvd (cusolverDnHandle_t& cusolver_handle, const int& itype, const char& j
     // calculate the sizes needed for pre-allocated buffer.
     cusolverErrcheck(cusolverDnSsygvd_bufferSize(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo), 
                                 n, A, lda, B, ldb, W, &lwork));
-    // allocate memery
+    // allocate memory
     cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(float) * lwork));
     // compute eigenvalues and eigenvectors.
     cusolverErrcheck(cusolverDnSsygvd(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo), 
@@ -263,7 +263,7 @@ void dngvd (cusolverDnHandle_t& cusolver_handle, const int& itype, const char& j
 static inline
 void dngvd (cusolverDnHandle_t& cusolver_handle, const int& itype, const char& jobz, const char& uplo, const int& n, double* A, const int& lda, double* B, const int& ldb, double * W)
 {
-    // prepare some values for cusolverDnZhegvd_bufferSize
+    // prepare some values for cusolverDnDsygvd_bufferSize
     int lwork  = 0; 
     int h_info = 0; 
     int*   d_info = nullptr;
@@ -273,7 +273,7 @@ void dngvd (cusolverDnHandle_t& cusolver_handle, const int& itype, const char& j
     // calculate the sizes needed for pre-allocated buffer.
     cusolverErrcheck(cusolverDnDsygvd_bufferSize(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo), 
                                 n, A, lda, B, ldb, W, &lwork));
-    // allocate memery
+    // allocate memory
     cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(double) * lwork));
     // compute eigenvalues and eigenvectors.
     cusolverErrcheck(cusolverDnDsygvd(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo), 
@@ -289,7 +289,7 @@ void dngvd (cusolverDnHandle_t& cusolver_handle, const int& itype, const char& j
 static inline
 void dngvd (cusolverDnHandle_t& cusolver_handle, const int& itype, const char& jobz, const char& uplo, const int& n, std::complex<float>* A, const int& lda, std::complex<float>* B, const int& ldb, float* W)
 {
-    // prepare some values for cusolverDnZhegvd_bufferSize
+    // prepare some values for cusolverDnChegvd_bufferSize
     int lwork  = 0; 
     int h_info = 0; 
     int*   d_info = nullptr;
@@ -299,7 +299,7 @@ void dngvd (cusolverDnHandle_t& cusolver_handle, const int& itype, const char& j
     // calculate the sizes needed for pre-allocated buffer.
     cusolverErrcheck(cusolverDnChegvd_bufferSize(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo), 
                                 n, reinterpret_cast<cuComplex*>(A), lda, reinterpret_cast<cuComplex*>(B), ldb, W, &lwork));
-    // allocate memery
+    // allocate memory
     cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuComplex) * lwork));
     // compute eigenvalues and eigenvectors.
     cusolverErrcheck(cusolverDnChegvd(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo), 
@@ -325,7 +325,7 @@ void dngvd (cusolverDnHandle_t& cusolver_handle, const int& itype, const char& j
     // calculate the sizes needed for pre-allocated buffer.
     cusolverErrcheck(cusolverDnZhegvd_bufferSize(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo), 
                                 n, reinterpret_cast<cuDoubleComplex*>(A), lda, reinterpret_cast<cuDoubleComplex*>(B), ldb, W, &lwork));
-    // allocate memery
+    // allocate memory
     cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuDoubleComplex) * lwork));
     // compute eigenvalues and eigenvectors.
     cusolverErrcheck(cusolverDnZhegvd(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo), 
@@ -339,6 +339,180 @@ void dngvd (cusolverDnHandle_t& cusolver_handle, const int& itype, const char& j
     cudaErrcheck(cudaFree(d_work));
 }
 
+static inline
+void getrf(cusolverDnHandle_t& cusolver_handle, const int& m, const int& n, float* A, const int& lda, int* ipiv)
+{
+    // prepare some values for cusolverDnSgetrf_bufferSize
+    int lwork = 0;
+    int h_info = 0;
+    int* d_info = nullptr;
+    float* d_work = nullptr;
+    cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
+
+    // calculate the sizes needed for pre-allocated buffer.
+    cusolverErrcheck(cusolverDnSgetrf_bufferSize(cusolver_handle, m, n, A, lda, &lwork));
+
+    // allocate memory
+    cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(float) * lwork));
+
+    // Perform LU decomposition
+    cusolverErrcheck(cusolverDnSgetrf(cusolver_handle, m, n, A, lda, d_work, ipiv, d_info));
+
+    cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
+    if (h_info != 0) {
+        throw std::runtime_error("getrf: failed to compute LU factorization");
+    }
+
+    cudaErrcheck(cudaFree(d_work));
+    cudaErrcheck(cudaFree(d_info));
+}
+static inline
+void getrf(cusolverDnHandle_t& cusolver_handle, const int& m, const int& n, double* A, const int& lda, int* ipiv)
+{
+    // prepare some values for cusolverDnDgetrf_bufferSize
+    int lwork = 0;
+    int h_info = 0;
+    int* d_info = nullptr;
+    double* d_work = nullptr;
+    cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
+
+    // calculate the sizes needed for pre-allocated buffer.
+    cusolverErrcheck(cusolverDnDgetrf_bufferSize(cusolver_handle, m, n, A, lda, &lwork));
+
+    // allocate memory
+    cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(double) * lwork));
+
+    // Perform LU decomposition
+    cusolverErrcheck(cusolverDnDgetrf(cusolver_handle, m, n, A, lda, d_work, ipiv, d_info));
+
+    cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
+    if (h_info != 0) {
+        throw std::runtime_error("getrf: failed to compute LU factorization");
+    }
+
+    cudaErrcheck(cudaFree(d_work));
+    cudaErrcheck(cudaFree(d_info));
+}
+static inline
+void getrf(cusolverDnHandle_t& cusolver_handle, const int& m, const int& n, std::complex<float>* A, const int& lda, int* ipiv)
+{
+    // prepare some values for cusolverDnCgetrf_bufferSize
+    int lwork = 0;
+    int h_info = 0;
+    int* d_info = nullptr;
+    cuComplex* d_work = nullptr;
+    cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
+
+    // calculate the sizes needed for pre-allocated buffer.
+    cusolverErrcheck(cusolverDnCgetrf_bufferSize(cusolver_handle, m, n, reinterpret_cast<cuComplex*>(A), lda, &lwork));
+
+    // allocate memory
+    cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuComplex) * lwork));
+
+    // Perform LU decomposition
+    cusolverErrcheck(cusolverDnCgetrf(cusolver_handle, m, n, reinterpret_cast<cuComplex*>(A), lda, d_work, ipiv, d_info));
+
+    cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
+    if (h_info != 0) {
+        throw std::runtime_error("getrf: failed to compute LU factorization");
+    }
+
+    cudaErrcheck(cudaFree(d_work));
+    cudaErrcheck(cudaFree(d_info));
+}
+static inline
+void getrf(cusolverDnHandle_t& cusolver_handle, const int& m, const int& n, std::complex<double>* A, const int& lda, int* ipiv)
+{
+    // prepare some values for cusolverDnZgetrf_bufferSize
+    int lwork = 0;
+    int h_info = 0;
+    int* d_info = nullptr;
+    cuDoubleComplex* d_work = nullptr;
+    cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
+
+    // calculate the sizes needed for pre-allocated buffer.
+    cusolverErrcheck(cusolverDnZgetrf_bufferSize(cusolver_handle, m, n, reinterpret_cast<cuDoubleComplex*>(A), lda, &lwork));
+
+    // allocate memory
+    cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuDoubleComplex) * lwork));
+
+    // Perform LU decomposition
+    cusolverErrcheck(cusolverDnZgetrf(cusolver_handle, m, n, reinterpret_cast<cuDoubleComplex*>(A), lda, d_work, ipiv, d_info));
+
+    cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
+    if (h_info != 0) {
+        throw std::runtime_error("getrf: failed to compute LU factorization");
+    }
+
+    cudaErrcheck(cudaFree(d_work));
+    cudaErrcheck(cudaFree(d_info));
+}
+
+static inline
+void getrs(cusolverDnHandle_t& cusolver_handle, const char& trans, const int& n, const int& nrhs, float* A, const int& lda, const int* ipiv, float* B, const int& ldb)
+{
+    int h_info = 0;
+    int* d_info = nullptr;
+    cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
+
+    cusolverErrcheck(cusolverDnSgetrs(cusolver_handle, GetCublasOperation(trans), n, nrhs, A, lda, ipiv, B, ldb, d_info));
+
+    cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
+    if (h_info != 0) {
+        throw std::runtime_error("getrs: failed to solve the linear system");
+    }
+
+    cudaErrcheck(cudaFree(d_info));
+}
+static inline
+void getrs(cusolverDnHandle_t& cusolver_handle, const char& trans, const int& n, const int& nrhs, double* A, const int& lda, const int* ipiv, double* B, const int& ldb)
+{
+    int h_info = 0;
+    int* d_info = nullptr;
+    cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
+
+    cusolverErrcheck(cusolverDnDgetrs(cusolver_handle, GetCublasOperation(trans), n, nrhs, A, lda, ipiv, B, ldb, d_info));
+
+    cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
+    if (h_info != 0) {
+        throw std::runtime_error("getrs: failed to solve the linear system");
+    }
+
+    cudaErrcheck(cudaFree(d_info));
+}
+static inline
+void getrs(cusolverDnHandle_t& cusolver_handle, const char& trans, const int& n, const int& nrhs, std::complex<float>* A, const int& lda, const int* ipiv, std::complex<float>* B, const int& ldb)
+{
+    int h_info = 0;
+    int* d_info = nullptr;
+    cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
+
+    cusolverErrcheck(cusolverDnCgetrs(cusolver_handle, GetCublasOperation(trans), n, nrhs, reinterpret_cast<cuComplex*>(A), lda, ipiv, reinterpret_cast<cuComplex*>(B), ldb, d_info));
+
+    cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
+    if (h_info != 0) {
+        throw std::runtime_error("getrs: failed to solve the linear system");
+    }
+
+    cudaErrcheck(cudaFree(d_info));
+}
+static inline
+void getrs(cusolverDnHandle_t& cusolver_handle, const char& trans, const int& n, const int& nrhs, std::complex<double>* A, const int& lda, const int* ipiv, std::complex<double>* B, const int& ldb)
+{
+    int h_info = 0;
+    int* d_info = nullptr;
+    cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
+
+    cusolverErrcheck(cusolverDnZgetrs(cusolver_handle, GetCublasOperation(trans), n, nrhs, reinterpret_cast<cuDoubleComplex*>(A), lda, ipiv, reinterpret_cast<cuDoubleComplex*>(B), ldb, d_info));
+
+    cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
+    if (h_info != 0) {
+        throw std::runtime_error("getrs: failed to solve the linear system");
+    }
+
+    cudaErrcheck(cudaFree(d_info));
+}
+
 } // namespace cuSolverConnector
 } // namespace container
 
diff --git a/source/module_base/module_container/base/third_party/lapack.h b/source/module_base/module_container/base/third_party/lapack.h
index 9b5ac7f409..7452dc1835 100644
--- a/source/module_base/module_container/base/third_party/lapack.h
+++ b/source/module_base/module_container/base/third_party/lapack.h
@@ -105,6 +105,20 @@ void dtrtri_(const char* uplo, const char* diag, const int* n, double* a, const
 void ctrtri_(const char* uplo, const char* diag, const int* n, std::complex<float>* a, const int* lda, int* info);
 void ztrtri_(const char* uplo, const char* diag, const int* n, std::complex<double>* a, const int* lda, int* info);
 
+void sgetrf_(const int* m, const int* n, float* a, const int* lda, int* ipiv, int* info);
+void dgetrf_(const int* m, const int* n, double* a, const int* lda, int* ipiv, int* info);
+void cgetrf_(const int* m, const int* n, std::complex<float>* a, const int* lda, int* ipiv, int* info);
+void zgetrf_(const int* m, const int* n, std::complex<double>* a, const int* lda, int* ipiv, int* info);
+
+void sgetri_(const int* n, float* A, const int* lda, const int* ipiv, float* work, const int* lwork, int* info);
+void dgetri_(const int* n, double* A, const int* lda, const int* ipiv, double* work, const int* lwork, int* info);
+void cgetri_(const int* n, std::complex<float>* A, const int* lda, const int* ipiv, std::complex<float>* work, const int* lwork, int* info);
+void zgetri_(const int* n, std::complex<double>* A, const int* lda, const int* ipiv, std::complex<double>* work, const int* lwork, int* info);
+
+void sgetrs_(const char* trans, const int* n, const int* nrhs, const float* A, const int* lda, const int* ipiv, float* B, const int* ldb, int* info);
+void dgetrs_(const char* trans, const int* n, const int* nrhs, const double* A, const int* lda, const int* ipiv, double* B, const int* ldb, int* info);
+void cgetrs_(const char* trans, const int* n, const int* nrhs, const std::complex<float>* A, const int* lda, const int* ipiv, std::complex<float>* B, const int* ldb, int* info);
+void zgetrs_(const char* trans, const int* n, const int* nrhs, const std::complex<double>* A, const int* lda, const int* ipiv, std::complex<double>* B, const int* ldb, int* info);
 }
 
 // Class LapackConnector provide the connector to fortran lapack routine.
@@ -321,6 +335,69 @@ void trtri( const char &uplo, const char &diag, const int &n, std::complex<doubl
     ztrtri_( &uplo, &diag, &n, A, &lda, &info);
 }
 
+static inline
+void getrf(const int m, const int n, float* A, const int lda, int* ipiv, int &info)
+{
+    sgetrf_(&m, &n, A, &lda, ipiv, &info);
+}
+static inline
+void getrf(const int m, const int n, double* A, const int lda, int* ipiv, int &info)
+{
+    dgetrf_(&m, &n, A, &lda, ipiv, &info);
+}
+static inline
+void getrf(const int m, const int n, std::complex<float>* A, const int lda, int* ipiv, int &info)
+{
+    cgetrf_(&m, &n, A, &lda, ipiv, &info);
+}
+static inline
+void getrf(const int m, const int n, std::complex<double>* A, const int lda, int* ipiv, int &info)
+{
+    zgetrf_(&m, &n, A, &lda, ipiv, &info);
+}
+
+static inline
+void getri(const int n, float* A, const int lda, const int* ipiv, float* work, const int lwork, int& info)
+{
+    sgetri_(&n, A, &lda, ipiv, work, &lwork, &info);
+}
+static inline
+void getri(const int n, double* A, const int lda, const int* ipiv, double* work, const int lwork, int& info)
+{
+    dgetri_(&n, A, &lda, ipiv, work, &lwork, &info);
+}
+static inline
+void getri(const int n, std::complex<float>* A, const int lda, const int* ipiv, std::complex<float>* work, const int lwork, int& info)
+{
+    cgetri_(&n, A, &lda, ipiv, work, &lwork, &info);
+}
+static inline
+void getri(const int n, std::complex<double>* A, const int lda, const int* ipiv, std::complex<double>* work, const int lwork, int& info)
+{
+    zgetri_(&n, A, &lda, ipiv, work, &lwork, &info);
+}
+
+static inline
+void getrs(const char& trans, const int n, const int nrhs, float* A, const int lda, const int* ipiv, float* B, const int ldb, int& info)
+{
+    sgetrs_(&trans, &n, &nrhs, A, &lda, ipiv, B, &ldb, &info);
+}
+static inline
+void getrs(const char& trans, const int n, const int nrhs, double* A, const int lda, const int* ipiv, double* B, const int ldb, int& info)
+{
+    dgetrs_(&trans, &n, &nrhs, A, &lda, ipiv, B, &ldb, &info);
+}
+static inline
+void getrs(const char& trans, const int n, const int nrhs, std::complex<float>* A, const int lda, const int* ipiv, std::complex<float>* B, const int ldb, int& info)
+{
+    cgetrs_(&trans, &n, &nrhs, A, &lda, ipiv, B, &ldb, &info);
+}
+static inline
+void getrs(const char& trans, const int n, const int nrhs, std::complex<double>* A, const int lda, const int* ipiv, std::complex<double>* B, const int ldb, int& info)
+{
+    zgetrs_(&trans, &n, &nrhs, A, &lda, ipiv, B, &ldb, &info);
+}
+
 } // namespace lapackConnector
 } // namespace container
 
diff --git a/source/module_elecstate/potentials/H_TDDFT_pw.cpp b/source/module_elecstate/potentials/H_TDDFT_pw.cpp
index a0533fda51..cf0dbd2d40 100644
--- a/source/module_elecstate/potentials/H_TDDFT_pw.cpp
+++ b/source/module_elecstate/potentials/H_TDDFT_pw.cpp
@@ -1,12 +1,12 @@
 #include "H_TDDFT_pw.h"
 
-#include "module_parameter/parameter.h"
 #include "module_base/constants.h"
 #include "module_base/math_integral.h"
 #include "module_base/timer.h"
 #include "module_hamilt_lcao/module_tddft/evolve_elec.h"
 #include "module_hamilt_pw/hamilt_pwdft/global.h"
 #include "module_io/input_conv.h"
+#include "module_parameter/parameter.h"
 
 namespace elecstate
 {
@@ -91,7 +91,7 @@ void H_TDDFT_pw::cal_fixed_v(double* vl_pseudo)
     H_TDDFT_pw::istep_int = istep;
 
     // judgement to skip vext
-    if (!module_tddft::Evolve_elec::td_vext || istep > tend || istep < tstart)
+    if (!PARAM.inp.td_vext || istep > tend || istep < tstart)
     {
         return;
     }
@@ -105,12 +105,12 @@ void H_TDDFT_pw::cal_fixed_v(double* vl_pseudo)
     trigo_count = 0;
     heavi_count = 0;
 
-    for (auto direc: module_tddft::Evolve_elec::td_vext_dire_case)
+    for (auto direc: PARAM.inp.td_vext_dire)
     {
         std::vector<double> vext_space(this->rho_basis_->nrxx, 0.0);
         double vext_time = cal_v_time(ttype[count], true);
 
-        if (module_tddft::Evolve_elec::out_efield && GlobalV::MY_RANK == 0)
+        if (PARAM.inp.out_efield && GlobalV::MY_RANK == 0)
         {
             std::stringstream as;
             as << PARAM.globalv.global_out_dir << "efield_" << count << ".dat";
@@ -248,7 +248,7 @@ void H_TDDFT_pw::update_At()
     H_TDDFT_pw::istep++;
 
     // judgement to skip vext
-    if (!module_tddft::Evolve_elec::td_vext || istep > tend || istep < tstart)
+    if (!PARAM.inp.td_vext || istep > tend || istep < tstart)
     {
         return;
     }
@@ -262,7 +262,7 @@ void H_TDDFT_pw::update_At()
     bool last = false;
     double out = 0.0;
 
-    for (auto direc: module_tddft::Evolve_elec::td_vext_dire_case)
+    for (auto direc: PARAM.inp.td_vext_dire)
     {
         last = false;
         // cut the integral space and initialize relevant parameters
@@ -297,7 +297,7 @@ void H_TDDFT_pw::update_At()
         }
 
         // output Efield
-        if (module_tddft::Evolve_elec::out_efield && GlobalV::MY_RANK == 0)
+        if (PARAM.inp.out_efield && GlobalV::MY_RANK == 0)
         {
             std::stringstream as;
             as << PARAM.globalv.global_out_dir << "efield_" << count << ".dat";
diff --git a/source/module_esolver/esolver.cpp b/source/module_esolver/esolver.cpp
index 0352492a3a..a727a7a095 100644
--- a/source/module_esolver/esolver.cpp
+++ b/source/module_esolver/esolver.cpp
@@ -19,6 +19,7 @@ extern "C"
 #include "esolver_lj.h"
 #include "esolver_of.h"
 #include "module_parameter/md_parameter.h"
+
 #include <stdexcept>
 
 namespace ModuleESolver
@@ -45,16 +46,16 @@ std::string determine_type()
     else if (PARAM.inp.basis_type == "lcao_in_pw")
     {
 #ifdef __LCAO
-		if(PARAM.inp.esolver_type == "sdft")
-		{
-			esolver_type = "sdft_pw";
-		}
-		else if(PARAM.inp.esolver_type == "ksdft")
-		{
+        if (PARAM.inp.esolver_type == "sdft")
+        {
+            esolver_type = "sdft_pw";
+        }
+        else if (PARAM.inp.esolver_type == "ksdft")
+        {
             esolver_type = "ksdft_lip";
-		}
+        }
 #else
-		ModuleBase::WARNING_QUIT("ESolver", "Calculation involving numerical orbitals must be compiled with __LCAO");
+        ModuleBase::WARNING_QUIT("ESolver", "Calculation involving numerical orbitals must be compiled with __LCAO");
 #endif
     }
     else if (PARAM.inp.basis_type == "lcao")
@@ -67,7 +68,7 @@ std::string determine_type()
         else if (PARAM.inp.esolver_type == "ksdft")
         {
             esolver_type = "ksdft_lcao";
-		}
+        }
         else if (PARAM.inp.esolver_type == "ks-lr")
         {
             esolver_type = "ksdft_lr_lcao";
@@ -77,7 +78,7 @@ std::string determine_type()
             esolver_type = "lr_lcao";
         }
 #else
-		ModuleBase::WARNING_QUIT("ESolver", "Calculation involving numerical orbitals must be compiled with __LCAO");
+        ModuleBase::WARNING_QUIT("ESolver", "Calculation involving numerical orbitals must be compiled with __LCAO");
 #endif
     }
 
@@ -98,18 +99,18 @@ std::string determine_type()
 
     auto device_info = PARAM.inp.device;
 
-	for (char &c : device_info)
-	{
-		if (std::islower(c))
-		{
-			c = std::toupper(c);
-		}
-	}
-	if (GlobalV::MY_RANK == 0)
-	{
-		std::cout << " RUNNING WITH DEVICE  : " << device_info << " / "
-			<< base_device::information::get_device_info(PARAM.inp.device) << std::endl;
-	}
+    for (char& c: device_info)
+    {
+        if (std::islower(c))
+        {
+            c = std::toupper(c);
+        }
+    }
+    if (GlobalV::MY_RANK == 0)
+    {
+        std::cout << " RUNNING WITH DEVICE  : " << device_info << " / "
+                  << base_device::information::get_device_info(PARAM.inp.device) << std::endl;
+    }
 
     GlobalV::ofs_running << "\n RUNNING WITH DEVICE  : " << device_info << " / "
                          << base_device::information::get_device_info(PARAM.inp.device) << std::endl;
@@ -117,40 +118,39 @@ std::string determine_type()
     return esolver_type;
 }
 
-
-//Some API to operate E_Solver
+// Some API to operate E_Solver
 ESolver* init_esolver(const Input_para& inp, UnitCell& ucell)
 {
-	//determine type of esolver based on INPUT information
-	const std::string esolver_type = determine_type();
+    // determine type of esolver based on INPUT information
+    const std::string esolver_type = determine_type();
 
     // initialize the corresponding Esolver child class
     if (esolver_type == "ksdft_pw")
     {
 #if ((defined __CUDA) || (defined __ROCM))
-		if (PARAM.inp.device == "gpu")
-		{
-			if (PARAM.inp.precision == "single")
-			{
-				return new ESolver_KS_PW<std::complex<float>, base_device::DEVICE_GPU>();
-			}
-			else
-			{
-				return new ESolver_KS_PW<std::complex<double>, base_device::DEVICE_GPU>();
-			}
-		}
+        if (PARAM.inp.device == "gpu")
+        {
+            if (PARAM.inp.precision == "single")
+            {
+                return new ESolver_KS_PW<std::complex<float>, base_device::DEVICE_GPU>();
+            }
+            else
+            {
+                return new ESolver_KS_PW<std::complex<double>, base_device::DEVICE_GPU>();
+            }
+        }
 #endif
-		if (PARAM.inp.precision == "single")
-		{
-			return new ESolver_KS_PW<std::complex<float>, base_device::DEVICE_CPU>();
-		}
-		else
-		{
-			return new ESolver_KS_PW<std::complex<double>, base_device::DEVICE_CPU>();
-		}
-	}
+        if (PARAM.inp.precision == "single")
+        {
+            return new ESolver_KS_PW<std::complex<float>, base_device::DEVICE_CPU>();
+        }
+        else
+        {
+            return new ESolver_KS_PW<std::complex<double>, base_device::DEVICE_CPU>();
+        }
+    }
     else if (esolver_type == "sdft_pw")
-	{
+    {
 #if ((defined __CUDA) || (defined __ROCM))
         if (PARAM.inp.device == "gpu")
         {
@@ -160,19 +160,19 @@ ESolver* init_esolver(const Input_para& inp, UnitCell& ucell)
             // }
             // else
             // {
-                return new ESolver_SDFT_PW<std::complex<double>, base_device::DEVICE_GPU>();
+            return new ESolver_SDFT_PW<std::complex<double>, base_device::DEVICE_GPU>();
             // }
         }
 #endif
         // if (PARAM.inp.precision == "single")
-		// {
-		// 	return new ESolver_SDFT_PW<std::complex<float>, base_device::DEVICE_CPU>();
-		// }
-		// else
-		// {
-			return new ESolver_SDFT_PW<std::complex<double>, base_device::DEVICE_CPU>();
-		// }
-	}
+        // {
+        // 	return new ESolver_SDFT_PW<std::complex<float>, base_device::DEVICE_CPU>();
+        // }
+        // else
+        // {
+        return new ESolver_SDFT_PW<std::complex<double>, base_device::DEVICE_CPU>();
+        // }
+    }
 #ifdef __LCAO
     else if (esolver_type == "ksdft_lip")
     {
@@ -212,14 +212,26 @@ ESolver* init_esolver(const Input_para& inp, UnitCell& ucell)
         }
     }
     else if (esolver_type == "ksdft_lcao_tddft")
-	{
-		return new ESolver_KS_LCAO_TDDFT();
+    {
+#if ((defined __CUDA) /* || (defined __ROCM) */)
+        if (PARAM.inp.device == "gpu")
+        {
+            return new ESolver_KS_LCAO_TDDFT<base_device::DEVICE_GPU>();
+        }
+#endif
+        return new ESolver_KS_LCAO_TDDFT<base_device::DEVICE_CPU>();
     }
     else if (esolver_type == "lr_lcao")
     {
         // use constructor rather than Init function to initialize reference (instead of pointers) to ucell
-        if (PARAM.globalv.gamma_only_local) { return new LR::ESolver_LR<double, double>(inp, ucell); }
-        else { return new LR::ESolver_LR<std::complex<double>, double>(inp, ucell); }
+        if (PARAM.globalv.gamma_only_local)
+        {
+            return new LR::ESolver_LR<double, double>(inp, ucell);
+        }
+        else
+        {
+            return new LR::ESolver_LR<std::complex<double>, double>(inp, ucell);
+        }
     }
     else if (esolver_type == "ksdft_lr_lcao")
     {
@@ -261,34 +273,36 @@ ESolver* init_esolver(const Input_para& inp, UnitCell& ucell)
         return p_esolver_lr;
     }
 #endif
-	else if(esolver_type == "ofdft")
-	{
-		return new ESolver_OF();
-	}
-	else if (esolver_type == "lj_pot")
-	{
-		return new ESolver_LJ();
-	}
-	else if (esolver_type == "dp_pot")
-	{
-		return new ESolver_DP(PARAM.mdp.pot_file);
-	}
-	throw std::invalid_argument("esolver_type = "+std::string(esolver_type)+". Wrong in "+std::string(__FILE__)+" line "+std::to_string(__LINE__));
+    else if (esolver_type == "ofdft")
+    {
+        return new ESolver_OF();
+    }
+    else if (esolver_type == "lj_pot")
+    {
+        return new ESolver_LJ();
+    }
+    else if (esolver_type == "dp_pot")
+    {
+        return new ESolver_DP(PARAM.mdp.pot_file);
+    }
+    throw std::invalid_argument("esolver_type = " + std::string(esolver_type) + ". Wrong in " + std::string(__FILE__)
+                                + " line " + std::to_string(__LINE__));
 }
 
-void clean_esolver(ESolver * &pesolver, const bool lcao_cblacs_exit)
+void clean_esolver(ESolver*& pesolver, const bool lcao_cblacs_exit)
 {
 // Zhang Xiaoyang modified in 2024/7/6:
 // Note: because of the init method of serial lcao hsolver
 // it needs no release step for it, or this [delete] will cause Segmentation Fault
 // Probably it will be modified later.
 #ifdef __MPI
-	delete pesolver;
+    delete pesolver;
 #ifdef __LCAO
-    if (lcao_cblacs_exit) {
+    if (lcao_cblacs_exit)
+    {
         Cblacs_exit(1);
-}
+    }
 #endif
 #endif
 }
-}
+} // namespace ModuleESolver
diff --git a/source/module_esolver/esolver_ks_lcao_tddft.cpp b/source/module_esolver/esolver_ks_lcao_tddft.cpp
index acff4f7bf1..11792f6a2b 100644
--- a/source/module_esolver/esolver_ks_lcao_tddft.cpp
+++ b/source/module_esolver/esolver_ks_lcao_tddft.cpp
@@ -38,13 +38,23 @@
 namespace ModuleESolver
 {
 
-ESolver_KS_LCAO_TDDFT::ESolver_KS_LCAO_TDDFT()
+template <typename Device>
+ESolver_KS_LCAO_TDDFT<Device>::ESolver_KS_LCAO_TDDFT()
 {
     classname = "ESolver_KS_LCAO_TDDFT";
     basisname = "LCAO";
+
+    // If the device is GPU, we must open use_tensor and use_lapack
+    ct::DeviceType ct_device_type = ct::DeviceTypeToEnum<Device>::value;
+    if (ct_device_type == ct::DeviceType::GpuDevice)
+    {
+        use_tensor = true;
+        use_lapack = true;
+    }
 }
 
-ESolver_KS_LCAO_TDDFT::~ESolver_KS_LCAO_TDDFT()
+template <typename Device>
+ESolver_KS_LCAO_TDDFT<Device>::~ESolver_KS_LCAO_TDDFT()
 {
     delete psi_laststep;
     if (Hk_laststep != nullptr)
@@ -65,7 +75,8 @@ ESolver_KS_LCAO_TDDFT::~ESolver_KS_LCAO_TDDFT()
     }
 }
 
-void ESolver_KS_LCAO_TDDFT::before_all_runners(UnitCell& ucell, const Input_para& inp)
+template <typename Device>
+void ESolver_KS_LCAO_TDDFT<Device>::before_all_runners(UnitCell& ucell, const Input_para& inp)
 {
     // 1) run before_all_runners in ESolver_KS_LCAO
     ESolver_KS_LCAO<std::complex<double>, double>::before_all_runners(ucell, inp);
@@ -74,44 +85,54 @@ void ESolver_KS_LCAO_TDDFT::before_all_runners(UnitCell& ucell, const Input_para
     // this->pelec = dynamic_cast<elecstate::ElecStateLCAO_TDDFT*>(this->pelec);
 }
 
-void ESolver_KS_LCAO_TDDFT::hamilt2density_single(UnitCell& ucell, const int istep, const int iter, const double ethr)
+template <typename Device>
+void ESolver_KS_LCAO_TDDFT<Device>::hamilt2density_single(UnitCell& ucell,
+                                                          const int istep,
+                                                          const int iter,
+                                                          const double ethr)
 {
     if (PARAM.inp.init_wfc == "file")
     {
         if (istep >= 1)
         {
-            module_tddft::Evolve_elec::solve_psi(istep,
-                                                 PARAM.inp.nbands,
-                                                 PARAM.globalv.nlocal,
-                                                 this->p_hamilt,
-                                                 this->pv,
-                                                 this->psi,
-                                                 this->psi_laststep,
-                                                 this->Hk_laststep,
-                                                 this->Sk_laststep,
-                                                 this->pelec->ekb,
-                                                 td_htype,
-                                                 PARAM.inp.propagator,
-                                                 kv.get_nks());
+            module_tddft::Evolve_elec<Device>::solve_psi(istep,
+                                                         PARAM.inp.nbands,
+                                                         PARAM.globalv.nlocal,
+                                                         kv.get_nks(),
+                                                         this->p_hamilt,
+                                                         this->pv,
+                                                         this->psi,
+                                                         this->psi_laststep,
+                                                         this->Hk_laststep,
+                                                         this->Sk_laststep,
+                                                         this->pelec->ekb,
+                                                         GlobalV::ofs_running,
+                                                         td_htype,
+                                                         PARAM.inp.propagator,
+                                                         use_tensor,
+                                                         use_lapack);
             this->weight_dm_rho();
         }
         this->weight_dm_rho();
     }
     else if (istep >= 2)
     {
-        module_tddft::Evolve_elec::solve_psi(istep,
-                                             PARAM.inp.nbands,
-                                             PARAM.globalv.nlocal,
-                                             this->p_hamilt,
-                                             this->pv,
-                                             this->psi,
-                                             this->psi_laststep,
-                                             this->Hk_laststep,
-                                             this->Sk_laststep,
-                                             this->pelec->ekb,
-                                             td_htype,
-                                             PARAM.inp.propagator,
-                                             kv.get_nks());
+        module_tddft::Evolve_elec<Device>::solve_psi(istep,
+                                                     PARAM.inp.nbands,
+                                                     PARAM.globalv.nlocal,
+                                                     kv.get_nks(),
+                                                     this->p_hamilt,
+                                                     this->pv,
+                                                     this->psi,
+                                                     this->psi_laststep,
+                                                     this->Hk_laststep,
+                                                     this->Sk_laststep,
+                                                     this->pelec->ekb,
+                                                     GlobalV::ofs_running,
+                                                     td_htype,
+                                                     PARAM.inp.propagator,
+                                                     use_tensor,
+                                                     use_lapack);
         this->weight_dm_rho();
     }
     else
@@ -141,7 +162,8 @@ void ESolver_KS_LCAO_TDDFT::hamilt2density_single(UnitCell& ucell, const int ist
     this->pelec->f_en.deband = this->pelec->cal_delta_eband(ucell);
 }
 
-void ESolver_KS_LCAO_TDDFT::iter_finish(UnitCell& ucell, const int istep, int& iter)
+template <typename Device>
+void ESolver_KS_LCAO_TDDFT<Device>::iter_finish(UnitCell& ucell, const int istep, int& iter)
 {
     // print occupation of each band
     if (iter == 1 && istep <= 2)
@@ -170,7 +192,8 @@ void ESolver_KS_LCAO_TDDFT::iter_finish(UnitCell& ucell, const int istep, int& i
     ESolver_KS_LCAO<std::complex<double>, double>::iter_finish(ucell, istep, iter);
 }
 
-void ESolver_KS_LCAO_TDDFT::update_pot(UnitCell& ucell, const int istep, const int iter)
+template <typename Device>
+void ESolver_KS_LCAO_TDDFT<Device>::update_pot(UnitCell& ucell, const int istep, const int iter)
 {
     // Calculate new potential according to new Charge Density
     if (!this->conv_esolver)
@@ -204,13 +227,17 @@ void ESolver_KS_LCAO_TDDFT::update_pot(UnitCell& ucell, const int istep, const i
 
         if (td_htype == 1)
         {
+            // Length of Hk_laststep and Sk_laststep, nlocal * nlocal for global, nloc for local
+            const int len_HS = use_tensor && use_lapack ? nlocal * nlocal : nloc;
+
             if (this->Hk_laststep == nullptr)
             {
                 this->Hk_laststep = new std::complex<double>*[kv.get_nks()];
                 for (int ik = 0; ik < kv.get_nks(); ++ik)
                 {
-                    this->Hk_laststep[ik] = new std::complex<double>[nloc];
-                    ModuleBase::GlobalFunc::ZEROS(Hk_laststep[ik], nloc);
+                    // Allocate memory for Hk_laststep, if (use_tensor && use_lapack), should be global
+                    this->Hk_laststep[ik] = new std::complex<double>[len_HS];
+                    ModuleBase::GlobalFunc::ZEROS(Hk_laststep[ik], len_HS);
                 }
             }
             if (this->Sk_laststep == nullptr)
@@ -218,8 +245,9 @@ void ESolver_KS_LCAO_TDDFT::update_pot(UnitCell& ucell, const int istep, const i
                 this->Sk_laststep = new std::complex<double>*[kv.get_nks()];
                 for (int ik = 0; ik < kv.get_nks(); ++ik)
                 {
-                    this->Sk_laststep[ik] = new std::complex<double>[nloc];
-                    ModuleBase::GlobalFunc::ZEROS(Sk_laststep[ik], nloc);
+                    // Allocate memory for Sk_laststep, if (use_tensor && use_lapack), should be global
+                    this->Sk_laststep[ik] = new std::complex<double>[len_HS];
+                    ModuleBase::GlobalFunc::ZEROS(Sk_laststep[ik], len_HS);
                 }
             }
         }
@@ -240,13 +268,37 @@ void ESolver_KS_LCAO_TDDFT::update_pot(UnitCell& ucell, const int istep, const i
                 this->p_hamilt->updateHk(ik);
                 hamilt::MatrixBlock<complex<double>> h_mat, s_mat;
                 this->p_hamilt->matrix(h_mat, s_mat);
-                BlasConnector::copy(nloc, h_mat.p, 1, Hk_laststep[ik], 1);
-                BlasConnector::copy(nloc, s_mat.p, 1, Sk_laststep[ik], 1);
+
+                if (use_tensor && use_lapack)
+                {
+                    // Gather H and S matrices to root process
+#ifdef __MPI
+                    int myid = 0;
+                    int num_procs = 1;
+                    MPI_Comm_rank(MPI_COMM_WORLD, &myid);
+                    MPI_Comm_size(MPI_COMM_WORLD, &num_procs);
+
+                    Matrix_g<std::complex<double>> h_mat_g, s_mat_g; // Global matrix structure
+
+                    // Collect H matrix
+                    gatherMatrix(myid, 0, h_mat, h_mat_g);
+                    BlasConnector::copy(nlocal * nlocal, h_mat_g.p.get(), 1, Hk_laststep[ik], 1);
+
+                    // Collect S matrix
+                    gatherMatrix(myid, 0, s_mat, s_mat_g);
+                    BlasConnector::copy(nlocal * nlocal, s_mat_g.p.get(), 1, Sk_laststep[ik], 1);
+#endif
+                }
+                else
+                {
+                    BlasConnector::copy(nloc, h_mat.p, 1, Hk_laststep[ik], 1);
+                    BlasConnector::copy(nloc, s_mat.p, 1, Sk_laststep[ik], 1);
+                }
             }
         }
 
         // calculate energy density matrix for tddft
-        if (istep >= (PARAM.inp.init_wfc == "file" ? 0 : 2) && module_tddft::Evolve_elec::td_edm == 0)
+        if (istep >= (PARAM.inp.init_wfc == "file" ? 0 : 2) && PARAM.inp.td_edm == 0)
         {
             elecstate::cal_edm_tddft(this->pv, this->pelec, this->kv, this->p_hamilt);
         }
@@ -278,14 +330,15 @@ void ESolver_KS_LCAO_TDDFT::update_pot(UnitCell& ucell, const int istep, const i
     }
 }
 
-void ESolver_KS_LCAO_TDDFT::after_scf(UnitCell& ucell, const int istep)
+template <typename Device>
+void ESolver_KS_LCAO_TDDFT<Device>::after_scf(UnitCell& ucell, const int istep)
 {
     ModuleBase::TITLE("ESolver_KS_LCAO_TDDFT", "after_scf");
     ModuleBase::timer::tick("ESolver_KS_LCAO_TDDFT", "after_scf");
 
     for (int is = 0; is < PARAM.inp.nspin; is++)
     {
-        if (module_tddft::Evolve_elec::out_dipole == 1)
+        if (PARAM.inp.out_dipole == 1)
         {
             std::stringstream ss_dipole;
             ss_dipole << PARAM.globalv.global_out_dir << "SPIN" << is + 1 << "_DIPOLE";
@@ -318,7 +371,8 @@ void ESolver_KS_LCAO_TDDFT::after_scf(UnitCell& ucell, const int istep)
     ModuleBase::timer::tick("ESolver_KS_LCAO_TDDFT", "after_scf");
 }
 
-void ESolver_KS_LCAO_TDDFT::weight_dm_rho()
+template <typename Device>
+void ESolver_KS_LCAO_TDDFT<Device>::weight_dm_rho()
 {
     if (PARAM.inp.ocp == 1)
     {
@@ -335,4 +389,9 @@ void ESolver_KS_LCAO_TDDFT::weight_dm_rho()
     this->pelec->psiToRho(this->psi[0]);
 }
 
+template class ESolver_KS_LCAO_TDDFT<base_device::DEVICE_CPU>;
+#if ((defined __CUDA) /* || (defined __ROCM) */)
+template class ESolver_KS_LCAO_TDDFT<base_device::DEVICE_GPU>;
+#endif
+
 } // namespace ModuleESolver
diff --git a/source/module_esolver/esolver_ks_lcao_tddft.h b/source/module_esolver/esolver_ks_lcao_tddft.h
index 40f4486475..c5fdcbc497 100644
--- a/source/module_esolver/esolver_ks_lcao_tddft.h
+++ b/source/module_esolver/esolver_ks_lcao_tddft.h
@@ -2,12 +2,52 @@
 #define ESOLVER_KS_LCAO_TDDFT_H
 #include "esolver_ks.h"
 #include "esolver_ks_lcao.h"
+#include "module_base/scalapack_connector.h" // Cpxgemr2d
 #include "module_hamilt_lcao/hamilt_lcaodft/record_adj.h"
 #include "module_psi/psi.h"
 
 namespace ModuleESolver
 {
+//------------------------ MPI gathering and distributing functions ------------------------//
+// This struct is used for collecting matrices from all processes to root process
+template <typename T>
+struct Matrix_g
+{
+    std::shared_ptr<T> p;
+    size_t row;
+    size_t col;
+    std::shared_ptr<int> desc;
+};
+
+// Collect matrices from all processes to root process
+template <typename T>
+void gatherMatrix(const int myid, const int root_proc, const hamilt::MatrixBlock<T>& mat_l, Matrix_g<T>& mat_g)
+{
+    const int* desca = mat_l.desc; // Obtain the descriptor of the local matrix
+    int ctxt = desca[1];           // BLACS context
+    int nrows = desca[2];          // Global matrix row number
+    int ncols = desca[3];          // Global matrix column number
 
+    if (myid == root_proc)
+    {
+        mat_g.p.reset(new T[nrows * ncols]); // No need to delete[] since it is a shared_ptr
+    }
+    else
+    {
+        mat_g.p.reset(new T[nrows * ncols]); // Placeholder for non-root processes
+    }
+
+    // Set the descriptor of the global matrix
+    mat_g.desc.reset(new int[9]{1, ctxt, nrows, ncols, nrows, ncols, 0, 0, nrows});
+    mat_g.row = nrows;
+    mat_g.col = ncols;
+
+    // Call the Cpxgemr2d function in ScaLAPACK to collect the matrix data
+    Cpxgemr2d(nrows, ncols, mat_l.p, 1, 1, const_cast<int*>(desca), mat_g.p.get(), 1, 1, mat_g.desc.get(), ctxt);
+}
+//------------------------ MPI gathering and distributing functions ------------------------//
+
+template <typename Device = base_device::DEVICE_CPU>
 class ESolver_KS_LCAO_TDDFT : public ESolver_KS_LCAO<std::complex<double>, double>
 {
   public:
@@ -35,7 +75,11 @@ class ESolver_KS_LCAO_TDDFT : public ESolver_KS_LCAO<std::complex<double>, doubl
     //! Overlap matrix of last time step
     std::complex<double>** Sk_laststep = nullptr;
 
-    int td_htype = 1;
+    const int td_htype = 1;
+
+    //! Control heterogeneous computing of the TDDFT solver
+    bool use_tensor = false;
+    bool use_lapack = false;
 
   private:
     void weight_dm_rho();
diff --git a/source/module_hamilt_lcao/module_tddft/CMakeLists.txt b/source/module_hamilt_lcao/module_tddft/CMakeLists.txt
index 0f6068054b..6c5028d29f 100644
--- a/source/module_hamilt_lcao/module_tddft/CMakeLists.txt
+++ b/source/module_hamilt_lcao/module_tddft/CMakeLists.txt
@@ -2,10 +2,13 @@ if(ENABLE_LCAO)
     list(APPEND objects
         evolve_elec.cpp
         evolve_psi.cpp
-        bandenergy.cpp
+        band_energy.cpp
         middle_hamilt.cpp
         norm_psi.cpp
         propagator.cpp
+        propagator_cn2.cpp
+        propagator_taylor.cpp
+        propagator_etrs.cpp
         upsi.cpp
         td_velocity.cpp
         td_current.cpp
diff --git a/source/module_hamilt_lcao/module_tddft/band_energy.cpp b/source/module_hamilt_lcao/module_tddft/band_energy.cpp
new file mode 100644
index 0000000000..c148042be3
--- /dev/null
+++ b/source/module_hamilt_lcao/module_tddft/band_energy.cpp
@@ -0,0 +1,429 @@
+#include "band_energy.h"
+
+#include "evolve_elec.h"
+#include "module_base/lapack_connector.h"
+#include "module_base/module_container/ATen/kernels/blas.h"
+#include "module_base/scalapack_connector.h"
+
+#include <complex>
+#include <iostream>
+
+namespace module_tddft
+{
+#ifdef __MPI
+
+inline int globalIndex(int localindex, int nblk, int nprocs, int myproc)
+{
+    int iblock, gIndex;
+    iblock = localindex / nblk;
+    gIndex = (iblock * nprocs + myproc) * nblk + localindex % nblk;
+    return gIndex;
+}
+
+void compute_ekb(const Parallel_Orbitals* pv,
+                 const int nband,
+                 const int nlocal,
+                 const std::complex<double>* Htmp,
+                 const std::complex<double>* psi_k,
+                 double* ekb,
+                 std::ofstream& ofs_running)
+{
+    assert(pv->nloc_wfc > 0 && pv->nloc > 0);
+
+    std::complex<double>* tmp1 = new std::complex<double>[pv->nloc_wfc];
+    ModuleBase::GlobalFunc::ZEROS(tmp1, pv->nloc_wfc);
+
+    std::complex<double>* eij = new std::complex<double>[pv->nloc];
+    ModuleBase::GlobalFunc::ZEROS(eij, pv->nloc);
+
+    ScalapackConnector::gemm('N',
+                             'N',
+                             nlocal,
+                             nband,
+                             nlocal,
+                             1.0,
+                             Htmp,
+                             1,
+                             1,
+                             pv->desc,
+                             psi_k,
+                             1,
+                             1,
+                             pv->desc_wfc,
+                             0.0,
+                             tmp1,
+                             1,
+                             1,
+                             pv->desc_wfc);
+
+    ScalapackConnector::gemm('C',
+                             'N',
+                             nband,
+                             nband,
+                             nlocal,
+                             1.0,
+                             psi_k,
+                             1,
+                             1,
+                             pv->desc_wfc,
+                             tmp1,
+                             1,
+                             1,
+                             pv->desc_wfc,
+                             0.0,
+                             eij,
+                             1,
+                             1,
+                             pv->desc_Eij);
+
+    if (PARAM.inp.td_print_eij > 0.0)
+    {
+        ofs_running
+            << "------------------------------------------------------------------------------------------------"
+            << std::endl;
+        ofs_running << " Eij:" << std::endl;
+        for (int i = 0; i < pv->nrow_bands; i++)
+        {
+            const int in = i * pv->ncol;
+            for (int j = 0; j < pv->ncol_bands; j++)
+            {
+                double aa = eij[in + j].real();
+                double bb = eij[in + j].imag();
+                if (std::abs(aa) < PARAM.inp.td_print_eij)
+                {
+                    aa = 0.0;
+                }
+                if (std::abs(bb) < PARAM.inp.td_print_eij)
+                {
+                    bb = 0.0;
+                }
+                if (aa > 0.0 || bb > 0.0)
+                {
+                    ofs_running << i << " " << j << " " << aa << "+" << bb << "i " << std::endl;
+                }
+            }
+        }
+        ofs_running << std::endl;
+        ofs_running
+            << "------------------------------------------------------------------------------------------------"
+            << std::endl;
+    }
+
+    int info = 0;
+    int naroc[2] = {0, 0};
+
+    assert(nband > 0);
+    double* eii = new double[nband];
+    ModuleBase::GlobalFunc::ZEROS(eii, nband);
+
+    for (int iprow = 0; iprow < pv->dim0; ++iprow)
+    {
+        for (int ipcol = 0; ipcol < pv->dim1; ++ipcol)
+        {
+            if (iprow == pv->coord[0] && ipcol == pv->coord[1])
+            {
+                naroc[0] = pv->nrow;
+                naroc[1] = pv->ncol;
+                for (int j = 0; j < naroc[1]; ++j)
+                {
+                    int igcol = globalIndex(j, pv->nb, pv->dim1, ipcol);
+                    if (igcol >= nband)
+                    {
+                        continue;
+                    }
+                    for (int i = 0; i < naroc[0]; ++i)
+                    {
+                        int igrow = globalIndex(i, pv->nb, pv->dim0, iprow);
+                        if (igrow >= nband)
+                        {
+                            continue;
+                        }
+                        if (igcol == igrow)
+                        {
+                            eii[igcol] = eij[j * naroc[0] + i].real();
+                        }
+                    }
+                }
+            }
+        } // loop ipcol
+    } // loop iprow
+    info = MPI_Allreduce(eii, ekb, nband, MPI_DOUBLE, MPI_SUM, pv->comm());
+
+    delete[] tmp1;
+    delete[] eij;
+    delete[] eii;
+}
+
+void compute_ekb_tensor(const Parallel_Orbitals* pv,
+                        const int nband,
+                        const int nlocal,
+                        const ct::Tensor& Htmp,
+                        const ct::Tensor& psi_k,
+                        ct::Tensor& ekb,
+                        std::ofstream& ofs_running)
+{
+    assert(pv->nloc_wfc > 0 && pv->nloc > 0);
+
+    // Create Tensor objects for temporary data
+    ct::Tensor tmp1(ct::DataType::DT_COMPLEX_DOUBLE, ct::DeviceType::CpuDevice, ct::TensorShape({pv->nloc_wfc}));
+    tmp1.zero();
+
+    ct::Tensor eij(ct::DataType::DT_COMPLEX_DOUBLE, ct::DeviceType::CpuDevice, ct::TensorShape({pv->nloc}));
+    eij.zero();
+
+    // Perform matrix multiplication: tmp1 = Htmp * psi_k
+    ScalapackConnector::gemm('N',
+                             'N',
+                             nlocal,
+                             nband,
+                             nlocal,
+                             1.0,
+                             Htmp.data<std::complex<double>>(),
+                             1,
+                             1,
+                             pv->desc,
+                             psi_k.data<std::complex<double>>(),
+                             1,
+                             1,
+                             pv->desc_wfc,
+                             0.0,
+                             tmp1.data<std::complex<double>>(),
+                             1,
+                             1,
+                             pv->desc_wfc);
+
+    // Perform matrix multiplication: eij = psi_k^dagger * tmp1
+    ScalapackConnector::gemm('C',
+                             'N',
+                             nband,
+                             nband,
+                             nlocal,
+                             1.0,
+                             psi_k.data<std::complex<double>>(),
+                             1,
+                             1,
+                             pv->desc_wfc,
+                             tmp1.data<std::complex<double>>(),
+                             1,
+                             1,
+                             pv->desc_wfc,
+                             0.0,
+                             eij.data<std::complex<double>>(),
+                             1,
+                             1,
+                             pv->desc_Eij);
+
+    if (PARAM.inp.td_print_eij >= 0.0)
+    {
+        ofs_running
+            << "------------------------------------------------------------------------------------------------"
+            << std::endl;
+        ofs_running << " Eij:" << std::endl;
+        for (int i = 0; i < pv->nrow_bands; i++)
+        {
+            const int in = i * pv->ncol;
+            for (int j = 0; j < pv->ncol_bands; j++)
+            {
+                double aa = eij.data<std::complex<double>>()[in + j].real();
+                double bb = eij.data<std::complex<double>>()[in + j].imag();
+                if (std::abs(aa) < PARAM.inp.td_print_eij)
+                {
+                    aa = 0.0;
+                }
+                if (std::abs(bb) < PARAM.inp.td_print_eij)
+                {
+                    bb = 0.0;
+                }
+                if (aa > 0.0 || bb > 0.0)
+                {
+                    ofs_running << i << " " << j << " " << aa << "+" << bb << "i " << std::endl;
+                }
+            }
+        }
+        ofs_running << std::endl;
+        ofs_running
+            << "------------------------------------------------------------------------------------------------"
+            << std::endl;
+    }
+
+    int info = 0;
+    int naroc[2] = {0, 0};
+
+    // Create a Tensor for eii
+    assert(nband > 0);
+    ct::Tensor eii(ct::DataType::DT_DOUBLE, ct::DeviceType::CpuDevice, ct::TensorShape({nband}));
+    eii.zero();
+
+    for (int iprow = 0; iprow < pv->dim0; ++iprow)
+    {
+        for (int ipcol = 0; ipcol < pv->dim1; ++ipcol)
+        {
+            if (iprow == pv->coord[0] && ipcol == pv->coord[1])
+            {
+                naroc[0] = pv->nrow;
+                naroc[1] = pv->ncol;
+                for (int j = 0; j < naroc[1]; ++j)
+                {
+                    int igcol = globalIndex(j, pv->nb, pv->dim1, ipcol);
+                    if (igcol >= nband)
+                    {
+                        continue;
+                    }
+                    for (int i = 0; i < naroc[0]; ++i)
+                    {
+                        int igrow = globalIndex(i, pv->nb, pv->dim0, iprow);
+                        if (igrow >= nband)
+                        {
+                            continue;
+                        }
+                        if (igcol == igrow)
+                        {
+                            eii.data<double>()[igcol] = eij.data<std::complex<double>>()[j * naroc[0] + i].real();
+                        }
+                    }
+                }
+            }
+        } // loop ipcol
+    } // loop iprow
+
+    // Perform MPI reduction to compute ekb
+    info = MPI_Allreduce(eii.data<double>(), ekb.data<double>(), nband, MPI_DOUBLE, MPI_SUM, pv->comm());
+}
+
+template <typename Device>
+void compute_ekb_tensor_lapack(const Parallel_Orbitals* pv,
+                               const int nband,
+                               const int nlocal,
+                               const ct::Tensor& Htmp,
+                               const ct::Tensor& psi_k,
+                               ct::Tensor& ekb,
+                               std::ofstream& ofs_running)
+{
+    // ct_device_type = ct::DeviceType::CpuDevice or ct::DeviceType::GpuDevice
+    ct::DeviceType ct_device_type = ct::DeviceTypeToEnum<Device>::value;
+    // ct_Device = ct::DEVICE_CPU or ct::DEVICE_GPU
+    using ct_Device = typename ct::PsiToContainer<Device>::type;
+
+    // Create Tensor objects for temporary data
+    ct::Tensor tmp1(ct::DataType::DT_COMPLEX_DOUBLE,
+                    ct_device_type,
+                    ct::TensorShape({nlocal * nband})); // tmp1 shape: nlocal * nband
+    tmp1.zero();
+
+    ct::Tensor eij(ct::DataType::DT_COMPLEX_DOUBLE,
+                   ct_device_type,
+                   ct::TensorShape({nlocal * nlocal})); // eij shape: nlocal * nlocal
+    // Why not use nband * nband ?????
+    eij.zero();
+
+    std::complex<double> alpha = {1.0, 0.0};
+    std::complex<double> beta = {0.0, 0.0};
+
+    // Perform matrix multiplication: tmp1 = Htmp * psi_k
+    ct::kernels::blas_gemm<std::complex<double>, ct_Device>()('N',
+                                                              'N',
+                                                              nlocal,
+                                                              nband,
+                                                              nlocal,
+                                                              &alpha,
+                                                              Htmp.data<std::complex<double>>(),
+                                                              nlocal, // Leading dimension of Htmp
+                                                              psi_k.data<std::complex<double>>(),
+                                                              nlocal, // Leading dimension of psi_k
+                                                              &beta,
+                                                              tmp1.data<std::complex<double>>(),
+                                                              nlocal); // Leading dimension of tmp1
+
+    // Perform matrix multiplication: eij = psi_k^dagger * tmp1
+    ct::kernels::blas_gemm<std::complex<double>, ct_Device>()('C',
+                                                              'N',
+                                                              nband,
+                                                              nband,
+                                                              nlocal,
+                                                              &alpha,
+                                                              psi_k.data<std::complex<double>>(),
+                                                              nlocal, // Leading dimension of psi_k
+                                                              tmp1.data<std::complex<double>>(),
+                                                              nlocal, // Leading dimension of tmp1
+                                                              &beta,
+                                                              eij.data<std::complex<double>>(),
+                                                              nlocal); // Leading dimension of eij
+
+    if (PARAM.inp.td_print_eij >= 0.0)
+    {
+        ct::Tensor eij_cpu = eij.to_device<ct::DEVICE_CPU>();
+
+        ofs_running
+            << "------------------------------------------------------------------------------------------------"
+            << std::endl;
+        ofs_running << " Eij:" << std::endl;
+        for (int i = 0; i < nband; i++)
+        {
+            const int in = i * nlocal;
+            for (int j = 0; j < nband; j++)
+            {
+                double aa = eij_cpu.data<std::complex<double>>()[in + j].real();
+                double bb = eij_cpu.data<std::complex<double>>()[in + j].imag();
+                if (std::abs(aa) < PARAM.inp.td_print_eij)
+                {
+                    aa = 0.0;
+                }
+                if (std::abs(bb) < PARAM.inp.td_print_eij)
+                {
+                    bb = 0.0;
+                }
+                if (aa > 0.0 || bb > 0.0)
+                {
+                    ofs_running << i << " " << j << " " << aa << "+" << bb << "i " << std::endl;
+                }
+            }
+        }
+        ofs_running << std::endl;
+        ofs_running
+            << "------------------------------------------------------------------------------------------------"
+            << std::endl;
+    }
+
+    // Extract diagonal elements of eij into ekb
+    if (ct_device_type == ct::DeviceType::GpuDevice)
+    {
+        // GPU implementation
+        for (int i = 0; i < nband; ++i)
+        {
+            base_device::memory::synchronize_memory_op<double, Device, Device>()(
+                ekb.data<double>() + i,
+                reinterpret_cast<const double*>(eij.data<std::complex<double>>() + i * nlocal + i),
+                1);
+        }
+    }
+    else
+    {
+        // CPU implementation
+        for (int i = 0; i < nband; ++i)
+        {
+            ekb.data<double>()[i] = eij.data<std::complex<double>>()[i * nlocal + i].real();
+        }
+    }
+}
+
+// Explicit instantiation of template functions
+template void compute_ekb_tensor_lapack<base_device::DEVICE_CPU>(const Parallel_Orbitals* pv,
+                                                                 const int nband,
+                                                                 const int nlocal,
+                                                                 const ct::Tensor& Htmp,
+                                                                 const ct::Tensor& psi_k,
+                                                                 ct::Tensor& ekb,
+                                                                 std::ofstream& ofs_running);
+
+#if ((defined __CUDA) /* || (defined __ROCM) */)
+template void compute_ekb_tensor_lapack<base_device::DEVICE_GPU>(const Parallel_Orbitals* pv,
+                                                                 const int nband,
+                                                                 const int nlocal,
+                                                                 const ct::Tensor& Htmp,
+                                                                 const ct::Tensor& psi_k,
+                                                                 ct::Tensor& ekb,
+                                                                 std::ofstream& ofs_running);
+#endif // __CUDA
+#endif // __MPI
+
+} // namespace module_tddft
diff --git a/source/module_hamilt_lcao/module_tddft/band_energy.h b/source/module_hamilt_lcao/module_tddft/band_energy.h
new file mode 100644
index 0000000000..4e1ab9aa5b
--- /dev/null
+++ b/source/module_hamilt_lcao/module_tddft/band_energy.h
@@ -0,0 +1,53 @@
+/**
+ * @file bandenegy.h
+ * @brief compute band energy ekb
+ *  This file originally belonged to file LCAO_evolve.cpp
+ */
+#ifndef BANDENERGY_H
+#define BANDENERGY_H
+
+#include "module_base/module_container/ATen/core/tensor.h" // ct::Tensor
+#include "module_basis/module_ao/parallel_orbitals.h"
+
+#include <complex>
+
+namespace module_tddft
+{
+#ifdef __MPI
+/**
+ *  @brief compute band energy ekb <psi_i|H|psi_i>
+ *
+ * @param[in] pv information of parallel
+ * @param[in] nband number of bands
+ * @param[in] nlocal number of orbitals
+ * @param[in] Htmp Hamiltonian
+ * @param[in] psi_k psi of this step
+ * @param[out] ekb band energy
+ */
+void compute_ekb(const Parallel_Orbitals* pv,
+                 const int nband,
+                 const int nlocal,
+                 const std::complex<double>* Htmp,
+                 const std::complex<double>* psi_k,
+                 double* ekb,
+                 std::ofstream& ofs_running);
+
+void compute_ekb_tensor(const Parallel_Orbitals* pv,
+                        const int nband,
+                        const int nlocal,
+                        const ct::Tensor& Htmp,
+                        const ct::Tensor& psi_k,
+                        ct::Tensor& ekb,
+                        std::ofstream& ofs_running);
+
+template <typename Device>
+void compute_ekb_tensor_lapack(const Parallel_Orbitals* pv,
+                               const int nband,
+                               const int nlocal,
+                               const ct::Tensor& Htmp,
+                               const ct::Tensor& psi_k,
+                               ct::Tensor& ekb,
+                               std::ofstream& ofs_running);
+#endif // __MPI
+} // namespace module_tddft
+#endif
diff --git a/source/module_hamilt_lcao/module_tddft/bandenergy.cpp b/source/module_hamilt_lcao/module_tddft/bandenergy.cpp
deleted file mode 100644
index 37212a6606..0000000000
--- a/source/module_hamilt_lcao/module_tddft/bandenergy.cpp
+++ /dev/null
@@ -1,146 +0,0 @@
-#include "bandenergy.h"
-
-#include <complex>
-#include <iostream>
-
-#include "evolve_elec.h"
-#include "module_base/lapack_connector.h"
-#include "module_base/scalapack_connector.h"
-
-namespace module_tddft
-{
-#ifdef __MPI
-
-inline int globalIndex(int localindex, int nblk, int nprocs, int myproc)
-{
-    int iblock, gIndex;
-    iblock = localindex / nblk;
-    gIndex = (iblock * nprocs + myproc) * nblk + localindex % nblk;
-    return gIndex;
-}
-
-void compute_ekb(const Parallel_Orbitals* pv,
-                 const int nband,
-                 const int nlocal,
-                 const std::complex<double>* Htmp,
-                 const std::complex<double>* psi_k,
-                 double* ekb)
-{
-
-    std::complex<double>* tmp1 = new std::complex<double>[pv->nloc_wfc];
-    ModuleBase::GlobalFunc::ZEROS(tmp1, pv->nloc_wfc);
-
-    std::complex<double>* Eij = new std::complex<double>[pv->nloc];
-    ModuleBase::GlobalFunc::ZEROS(Eij, pv->nloc);
-
-    ScalapackConnector::gemm('N',
-                             'N',
-                             nlocal,
-                             nband,
-                             nlocal,
-                             1.0,
-                             Htmp,
-                             1,
-                             1,
-                             pv->desc,
-                             psi_k,
-                             1,
-                             1,
-                             pv->desc_wfc,
-                             0.0,
-                             tmp1,
-                             1,
-                             1,
-                             pv->desc_wfc);
-
-    ScalapackConnector::gemm('C',
-                             'N',
-                             nband,
-                             nband,
-                             nlocal,
-                             1.0,
-                             psi_k,
-                             1,
-                             1,
-                             pv->desc_wfc,
-                             tmp1,
-                             1,
-                             1,
-                             pv->desc_wfc,
-                             0.0,
-                             Eij,
-                             1,
-                             1,
-                             pv->desc_Eij);
-
-    if (Evolve_elec::td_print_eij > 0.0)
-    {
-        GlobalV::ofs_running
-            << "------------------------------------------------------------------------------------------------"
-            << std::endl;
-        GlobalV::ofs_running << " Eij:" << std::endl;
-        for (int i = 0; i < pv->nrow_bands; i++)
-        {
-            for (int j = 0; j < pv->ncol_bands; j++)
-            {
-                double aa, bb;
-                aa = Eij[i * pv->ncol + j].real();
-                bb = Eij[i * pv->ncol + j].imag();
-                if (std::abs(aa) < Evolve_elec::td_print_eij)
-                    aa = 0.0;
-                if (std::abs(bb) < Evolve_elec::td_print_eij)
-                    bb = 0.0;
-                if (aa > 0.0 || bb > 0.0)
-                {
-                    GlobalV::ofs_running << i << " " << j << " " << aa << "+" << bb << "i " << std::endl;
-                }
-            }
-        }
-        GlobalV::ofs_running << std::endl;
-        GlobalV::ofs_running
-            << "------------------------------------------------------------------------------------------------"
-            << std::endl;
-    }
-
-    int info;
-    int naroc[2];
-
-    double* Eii = new double[nband];
-    ModuleBase::GlobalFunc::ZEROS(Eii, nband);
-    for (int iprow = 0; iprow < pv->dim0; ++iprow)
-    {
-        for (int ipcol = 0; ipcol < pv->dim1; ++ipcol)
-        {
-            if (iprow == pv->coord[0] && ipcol == pv->coord[1])
-            {
-                naroc[0] = pv->nrow;
-                naroc[1] = pv->ncol;
-                for (int j = 0; j < naroc[1]; ++j)
-                {
-                    int igcol = globalIndex(j, pv->nb, pv->dim1, ipcol);
-                    if (igcol >= nband)
-                        continue;
-                    for (int i = 0; i < naroc[0]; ++i)
-                    {
-                        int igrow = globalIndex(i, pv->nb, pv->dim0, iprow);
-                        if (igrow >= nband)
-                            continue;
-                        if (igcol == igrow)
-                        {
-                            Eii[igcol] = Eij[j * naroc[0] + i].real();
-                        }
-                    }
-                }
-            }
-        } // loop ipcol
-    }     // loop iprow
-    info = MPI_Allreduce(Eii, ekb, nband, MPI_DOUBLE, MPI_SUM, pv->comm());
-
-    delete[] tmp1;
-    delete[] Eij;
-    delete[] Eii;
-}
-
-#endif
-
-} // namespace module_tddft
diff --git a/source/module_hamilt_lcao/module_tddft/bandenergy.h b/source/module_hamilt_lcao/module_tddft/bandenergy.h
deleted file mode 100644
index fb8c989d8b..0000000000
--- a/source/module_hamilt_lcao/module_tddft/bandenergy.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/**
- * @file bandenegy.h
- * @brief compute band energy ekb
- *  This file originally belonged to file LCAO_evolve.cpp
- */
-#ifndef BANDENERGY_H
-#define BANDENERGY_H
-
-#include "module_basis/module_ao/parallel_orbitals.h"
-
-#include <complex>
-
-namespace module_tddft
-{
-#ifdef __MPI
-/**
- *  @brief compute band energy ekb <psi_i|H|psi_i>
- *
- * @param[in] pv information of parallel
- * @param[in] nband number of bands
- * @param[in] nlocal number of orbitals
- * @param[in] Htmp Hamiltonian
- * @param[in] psi_k psi of this step
- * @param[out] ekb band energy
- */
-void compute_ekb(const Parallel_Orbitals* pv,
-                 const int nband,
-                 const int nlocal,
-                 const std::complex<double>* Htmp,
-                 const std::complex<double>* psi_k,
-                 double* ekb);
-#endif
-} // namespace module_tddft
-#endif
diff --git a/source/module_hamilt_lcao/module_tddft/evolve_elec.cpp b/source/module_hamilt_lcao/module_tddft/evolve_elec.cpp
index 7be680cfff..95b05872d2 100644
--- a/source/module_hamilt_lcao/module_tddft/evolve_elec.cpp
+++ b/source/module_hamilt_lcao/module_tddft/evolve_elec.cpp
@@ -10,40 +10,44 @@
 
 namespace module_tddft
 {
-Evolve_elec::Evolve_elec(){};
-Evolve_elec::~Evolve_elec(){};
+template <typename Device>
+Evolve_elec<Device>::Evolve_elec(){};
+template <typename Device>
+Evolve_elec<Device>::~Evolve_elec(){};
 
-double Evolve_elec::td_force_dt;
-bool Evolve_elec::td_vext;
-std::vector<int> Evolve_elec::td_vext_dire_case;
-bool Evolve_elec::out_dipole;
-bool Evolve_elec::out_efield;
-double Evolve_elec::td_print_eij; // the threshold to output Eij elements
-int Evolve_elec::td_edm;          // 0: new edm method   1: old edm method
+template <typename Device>
+ct::DeviceType Evolve_elec<Device>::ct_device_type = ct::DeviceTypeToEnum<Device>::value;
 
 // this routine only serves for TDDFT using LCAO basis set
-void Evolve_elec::solve_psi(const int& istep,
-                            const int nband,
-                            const int nlocal,
-                            hamilt::Hamilt<std::complex<double>>* phm,
-                            Parallel_Orbitals& para_orb,
-                            psi::Psi<std::complex<double>>* psi,
-                            psi::Psi<std::complex<double>>* psi_laststep,
-                            std::complex<double>** Hk_laststep,
-                            std::complex<double>** Sk_laststep,
-                            ModuleBase::matrix& ekb,
-                            int htype,
-                            int propagator,
-                            const int& nks)
+template <typename Device>
+void Evolve_elec<Device>::solve_psi(const int& istep,
+                                    const int nband,
+                                    const int nlocal,
+                                    const int& nks,
+                                    hamilt::Hamilt<std::complex<double>>* phm,
+                                    Parallel_Orbitals& para_orb,
+                                    psi::Psi<std::complex<double>>* psi,
+                                    psi::Psi<std::complex<double>>* psi_laststep,
+                                    std::complex<double>** Hk_laststep,
+                                    std::complex<double>** Sk_laststep,
+                                    ModuleBase::matrix& ekb,
+                                    std::ofstream& ofs_running,
+                                    const int htype,
+                                    const int propagator,
+                                    const bool use_tensor,
+                                    const bool use_lapack)
 {
     ModuleBase::TITLE("Evolve_elec", "solve_psi");
     ModuleBase::timer::tick("Evolve_elec", "solve_psi");
 
+    // Control the print of matrix to running_md.log
+    const int print_matrix = 0;
+
     for (int ik = 0; ik < nks; ik++)
     {
         phm->updateHk(ik);
 
-        ModuleBase::timer::tick("Efficience", "evolve_k");
+        ModuleBase::timer::tick("Efficiency", "evolve_k");
         psi->fix_k(ik);
         psi_laststep->fix_k(ik);
         if (htype == 0)
@@ -58,31 +62,165 @@ void Evolve_elec::solve_psi(const int& istep,
                        nullptr,
                        &(ekb(ik, 0)),
                        htype,
-                       propagator);
+                       propagator,
+                       ofs_running,
+                       print_matrix);
         }
         else if (htype == 1)
         {
-            evolve_psi(nband,
-                       nlocal,
-                       &(para_orb),
-                       phm,
-                       psi[0].get_pointer(),
-                       psi_laststep[0].get_pointer(),
-                       Hk_laststep[ik],
-                       Sk_laststep[ik],
-                       &(ekb(ik, 0)),
-                       htype,
-                       propagator);
+            if (!use_tensor)
+            {
+                evolve_psi(nband,
+                           nlocal,
+                           &(para_orb),
+                           phm,
+                           psi[0].get_pointer(),
+                           psi_laststep[0].get_pointer(),
+                           Hk_laststep[ik],
+                           Sk_laststep[ik],
+                           &(ekb(ik, 0)),
+                           htype,
+                           propagator,
+                           ofs_running,
+                           print_matrix);
+                // std::cout << "Print ekb: " << std::endl;
+                // ekb.print(std::cout);
+            }
+            else
+            {
+                const int len_psi_k_1 = use_lapack ? nband : psi->get_nbands();
+                const int len_psi_k_2 = use_lapack ? nlocal : psi->get_nbasis();
+                const int len_HS_laststep = use_lapack ? nlocal * nlocal : para_orb.nloc;
+
+                // Create Tensor for psi_k, psi_k_laststep, H_laststep, S_laststep, ekb
+                ct::Tensor psi_k_tensor(ct::DataType::DT_COMPLEX_DOUBLE,
+                                        ct_device_type,
+                                        ct::TensorShape({len_psi_k_1, len_psi_k_2}));
+                ct::Tensor psi_k_laststep_tensor(ct::DataType::DT_COMPLEX_DOUBLE,
+                                                 ct_device_type,
+                                                 ct::TensorShape({len_psi_k_1, len_psi_k_2}));
+                ct::Tensor H_laststep_tensor(ct::DataType::DT_COMPLEX_DOUBLE,
+                                             ct_device_type,
+                                             ct::TensorShape({len_HS_laststep}));
+                ct::Tensor S_laststep_tensor(ct::DataType::DT_COMPLEX_DOUBLE,
+                                             ct_device_type,
+                                             ct::TensorShape({len_HS_laststep}));
+                ct::Tensor ekb_tensor(ct::DataType::DT_DOUBLE, ct_device_type, ct::TensorShape({nband}));
+
+                // Global psi
+                ModuleESolver::Matrix_g<std::complex<double>> psi_g;
+                ModuleESolver::Matrix_g<std::complex<double>> psi_laststep_g;
+
+                if (use_lapack)
+                {
+                    // Need to gather the psi to the root process on CPU
+                    // H_laststep and S_laststep are already gathered in esolver_ks_lcao_tddft.cpp
+#ifdef __MPI
+                    // Access the rank of the calling process in the communicator
+                    int myid = 0;
+                    int root_proc = 0;
+                    MPI_Comm_rank(MPI_COMM_WORLD, &myid);
+
+                    // Gather psi to the root process
+                    gatherPsi(myid, root_proc, psi[0].get_pointer(), para_orb, psi_g);
+                    gatherPsi(myid, root_proc, psi_laststep[0].get_pointer(), para_orb, psi_laststep_g);
+
+                    // Syncronize data from CPU to Device
+                    syncmem_complex_h2d_op()(psi_k_tensor.data<std::complex<double>>(),
+                                             psi_g.p.get(),
+                                             len_psi_k_1 * len_psi_k_2);
+                    syncmem_complex_h2d_op()(psi_k_laststep_tensor.data<std::complex<double>>(),
+                                             psi_laststep_g.p.get(),
+                                             len_psi_k_1 * len_psi_k_2);
+#endif
+                }
+                else
+                {
+                    // Syncronize data from CPU to Device
+                    syncmem_complex_h2d_op()(psi_k_tensor.data<std::complex<double>>(),
+                                             psi[0].get_pointer(),
+                                             len_psi_k_1 * len_psi_k_2);
+                    syncmem_complex_h2d_op()(psi_k_laststep_tensor.data<std::complex<double>>(),
+                                             psi_laststep[0].get_pointer(),
+                                             len_psi_k_1 * len_psi_k_2);
+                }
+
+                syncmem_complex_h2d_op()(H_laststep_tensor.data<std::complex<double>>(),
+                                         Hk_laststep[ik],
+                                         len_HS_laststep);
+                syncmem_complex_h2d_op()(S_laststep_tensor.data<std::complex<double>>(),
+                                         Sk_laststep[ik],
+                                         len_HS_laststep);
+                syncmem_double_h2d_op()(ekb_tensor.data<double>(), &(ekb(ik, 0)), nband);
+
+                evolve_psi_tensor<Device>(nband,
+                                          nlocal,
+                                          &(para_orb),
+                                          phm,
+                                          psi_k_tensor,
+                                          psi_k_laststep_tensor,
+                                          H_laststep_tensor,
+                                          S_laststep_tensor,
+                                          ekb_tensor,
+                                          htype,
+                                          propagator,
+                                          ofs_running,
+                                          print_matrix,
+                                          use_lapack);
+
+                // Need to distribute global psi back to all processes
+                if (use_lapack)
+                {
+#ifdef __MPI
+                    // Syncronize data from Device to CPU
+                    syncmem_complex_d2h_op()(psi_g.p.get(),
+                                             psi_k_tensor.data<std::complex<double>>(),
+                                             len_psi_k_1 * len_psi_k_2);
+                    syncmem_complex_d2h_op()(psi_laststep_g.p.get(),
+                                             psi_k_laststep_tensor.data<std::complex<double>>(),
+                                             len_psi_k_1 * len_psi_k_2);
+
+                    // Distribute psi to all processes
+                    distributePsi(para_orb, psi[0].get_pointer(), psi_g);
+                    distributePsi(para_orb, psi_laststep[0].get_pointer(), psi_laststep_g);
+#endif
+                }
+                else
+                {
+                    // Syncronize data from Device to CPU
+                    syncmem_complex_d2h_op()(psi[0].get_pointer(),
+                                             psi_k_tensor.data<std::complex<double>>(),
+                                             len_psi_k_1 * len_psi_k_2);
+                    syncmem_complex_d2h_op()(psi_laststep[0].get_pointer(),
+                                             psi_k_laststep_tensor.data<std::complex<double>>(),
+                                             len_psi_k_1 * len_psi_k_2);
+                }
+                syncmem_complex_d2h_op()(Hk_laststep[ik],
+                                         H_laststep_tensor.data<std::complex<double>>(),
+                                         len_HS_laststep);
+                syncmem_complex_d2h_op()(Sk_laststep[ik],
+                                         S_laststep_tensor.data<std::complex<double>>(),
+                                         len_HS_laststep);
+                syncmem_double_d2h_op()(&(ekb(ik, 0)), ekb_tensor.data<double>(), nband);
+
+                // std::cout << "Print ekb tensor: " << std::endl;
+                // ekb.print(std::cout);
+            }
         }
         else
         {
             std::cout << "method of htype is wrong" << std::endl;
         }
 
-        ModuleBase::timer::tick("Efficience", "evolve_k");
+        ModuleBase::timer::tick("Efficiency", "evolve_k");
     } // end k
 
     ModuleBase::timer::tick("Evolve_elec", "solve_psi");
     return;
 }
+
+template class Evolve_elec<base_device::DEVICE_CPU>;
+#if ((defined __CUDA) /* || (defined __ROCM) */)
+template class Evolve_elec<base_device::DEVICE_GPU>;
+#endif
 } // namespace module_tddft
\ No newline at end of file
diff --git a/source/module_hamilt_lcao/module_tddft/evolve_elec.h b/source/module_hamilt_lcao/module_tddft/evolve_elec.h
index 71c4aebe43..9da86c9d90 100644
--- a/source/module_hamilt_lcao/module_tddft/evolve_elec.h
+++ b/source/module_hamilt_lcao/module_tddft/evolve_elec.h
@@ -3,6 +3,11 @@
 
 #include "module_base/global_function.h"
 #include "module_base/global_variable.h"
+#include "module_base/module_container/ATen/core/tensor.h"     // ct::Tensor
+#include "module_base/module_container/ATen/core/tensor_map.h" // TensorMap
+#include "module_base/module_device/device.h"                  // base_device
+#include "module_base/module_device/memory_op.h"               // memory operations
+#include "module_base/scalapack_connector.h"                   // Cpxgemr2d
 #include "module_esolver/esolver_ks_lcao.h"
 #include "module_esolver/esolver_ks_lcao_tddft.h"
 #include "module_hamilt_lcao/hamilt_lcaodft/hamilt_lcao.h"
@@ -15,33 +20,140 @@
 // k is the index for the points in the first Brillouin zone
 //-----------------------------------------------------------
 
+//------------------------ Debugging utility function ------------------------//
+
+// Print the shape of a Tensor
+inline void print_tensor_shape(const ct::Tensor& tensor, const std::string& name)
+{
+    std::cout << "Shape of " << name << ": [";
+    for (int i = 0; i < tensor.shape().ndim(); ++i)
+    {
+        std::cout << tensor.shape().dim_size(i);
+        if (i < tensor.shape().ndim() - 1)
+        {
+            std::cout << ", ";
+        }
+    }
+    std::cout << "]" << std::endl;
+}
+
+// Recursive print function
+template <typename T>
+inline void print_tensor_data_recursive(const T* data,
+                                        const std::vector<int64_t>& shape,
+                                        const std::vector<int64_t>& strides,
+                                        int dim,
+                                        std::vector<int64_t>& indices,
+                                        const std::string& name)
+{
+    if (dim == shape.size())
+    {
+        // Recursion base case: print data when reaching the innermost dimension
+        std::cout << name;
+        for (size_t i = 0; i < indices.size(); ++i)
+        {
+            std::cout << "[" << indices[i] << "]";
+        }
+        std::cout << " = " << *data << std::endl;
+        return;
+    }
+    // Recursively process the current dimension
+    for (int64_t i = 0; i < shape[dim]; ++i)
+    {
+        indices[dim] = i;
+        print_tensor_data_recursive(data + i * strides[dim], shape, strides, dim + 1, indices, name);
+    }
+}
+
+// Generic print function
+template <typename T>
+inline void print_tensor_data(const ct::Tensor& tensor, const std::string& name)
+{
+    const std::vector<int64_t>& shape = tensor.shape().dims();
+    const std::vector<int64_t>& strides = tensor.shape().strides();
+    const T* data = tensor.data<T>();
+    std::vector<int64_t> indices(shape.size(), 0);
+    print_tensor_data_recursive(data, shape, strides, 0, indices, name);
+}
+
+// Specialization for std::complex<double>
+template <>
+inline void print_tensor_data<std::complex<double>>(const ct::Tensor& tensor, const std::string& name)
+{
+    const std::vector<int64_t>& shape = tensor.shape().dims();
+    const std::vector<int64_t>& strides = tensor.shape().strides();
+    const std::complex<double>* data = tensor.data<std::complex<double>>();
+    std::vector<int64_t> indices(shape.size(), 0);
+    print_tensor_data_recursive(data, shape, strides, 0, indices, name);
+}
+
+//------------------------ Debugging utility function ------------------------//
+
 namespace module_tddft
 {
-class Evolve_elec
+#ifdef __MPI
+//------------------------ MPI gathering and distributing functions ------------------------//
+template <typename T>
+void gatherPsi(const int myid,
+               const int root_proc,
+               T* psi_l,
+               const Parallel_Orbitals& para_orb,
+               ModuleESolver::Matrix_g<T>& psi_g)
+{
+    const int* desc_psi = para_orb.desc_wfc; // Obtain the descriptor from Parallel_Orbitals
+    int ctxt = desc_psi[1];                  // BLACS context
+    int nrows = desc_psi[2];                 // Global matrix row number
+    int ncols = desc_psi[3];                 // Global matrix column number
+
+    if (myid == root_proc)
+    {
+        psi_g.p.reset(new T[nrows * ncols]); // No need to delete[] since it is a shared_ptr
+    }
+    else
+    {
+        psi_g.p.reset(new T[nrows * ncols]); // Placeholder for non-root processes
+    }
+
+    // Set the descriptor of the global psi
+    psi_g.desc.reset(new int[9]{1, ctxt, nrows, ncols, nrows, ncols, 0, 0, nrows});
+    psi_g.row = nrows;
+    psi_g.col = ncols;
+
+    // Call the Cpxgemr2d function in ScaLAPACK to collect the matrix data
+    Cpxgemr2d(nrows, ncols, psi_l, 1, 1, const_cast<int*>(desc_psi), psi_g.p.get(), 1, 1, psi_g.desc.get(), ctxt);
+}
+
+template <typename T>
+void distributePsi(const Parallel_Orbitals& para_orb, T* psi_l, const ModuleESolver::Matrix_g<T>& psi_g)
 {
+    const int* desc_psi = para_orb.desc_wfc; // Obtain the descriptor from Parallel_Orbitals
+    int ctxt = desc_psi[1];                  // BLACS context
+    int nrows = desc_psi[2];                 // Global matrix row number
+    int ncols = desc_psi[3];                 // Global matrix column number
+
+    // Call the Cpxgemr2d function in ScaLAPACK to distribute the matrix data
+    Cpxgemr2d(nrows, ncols, psi_g.p.get(), 1, 1, psi_g.desc.get(), psi_l, 1, 1, const_cast<int*>(desc_psi), ctxt);
+}
+//------------------------ MPI gathering and distributing functions ------------------------//
+#endif // __MPI
 
-    friend class ELEC_scf;
+template <typename Device = base_device::DEVICE_CPU>
+class Evolve_elec
+{
     friend class ModuleESolver::ESolver_KS_LCAO<std::complex<double>, double>;
-    friend class ModuleESolver::ESolver_KS_LCAO_TDDFT;
+
+    // Template parameter is needed for the friend class declaration
+    friend class ModuleESolver::ESolver_KS_LCAO_TDDFT<Device>;
 
   public:
     Evolve_elec();
     ~Evolve_elec();
 
-    static double td_force_dt;
-    static bool td_vext;
-    static std::vector<int> td_vext_dire_case;
-    // Output dipole, efield, current or not
-    static bool out_dipole;
-    static bool out_efield;
-
-    static double td_print_eij; // the threshold to output Eij elements
-    static int td_edm;          // 0: new edm method   1: old edm method
-
   private:
     static void solve_psi(const int& istep,
                           const int nband,
                           const int nlocal,
+                          const int& nks,
                           hamilt::Hamilt<std::complex<double>>* phm,
                           Parallel_Orbitals& para_orb,
                           psi::Psi<std::complex<double>>* psi,
@@ -49,9 +161,24 @@ class Evolve_elec
                           std::complex<double>** Hk_laststep,
                           std::complex<double>** Sk_laststep,
                           ModuleBase::matrix& ekb,
-                          int htype,
-                          int propagator,
-                          const int& nks);
+                          std::ofstream& ofs_running,
+                          const int htype,
+                          const int propagator,
+                          const bool use_tensor,
+                          const bool use_lapack);
+
+    // ct_device_type = ct::DeviceType::CpuDevice or ct::DeviceType::GpuDevice
+    static ct::DeviceType ct_device_type;
+    // ct_Device = ct::DEVICE_CPU or ct::DEVICE_GPU
+    using ct_Device = typename ct::PsiToContainer<Device>::type;
+
+    // Memory operations
+    using syncmem_double_h2d_op = base_device::memory::synchronize_memory_op<double, Device, base_device::DEVICE_CPU>;
+    using syncmem_double_d2h_op = base_device::memory::synchronize_memory_op<double, base_device::DEVICE_CPU, Device>;
+    using syncmem_complex_h2d_op
+        = base_device::memory::synchronize_memory_op<std::complex<double>, Device, base_device::DEVICE_CPU>;
+    using syncmem_complex_d2h_op
+        = base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_CPU, Device>;
 };
 } // namespace module_tddft
 #endif
diff --git a/source/module_hamilt_lcao/module_tddft/evolve_psi.cpp b/source/module_hamilt_lcao/module_tddft/evolve_psi.cpp
index e7ca154e9a..240ab5b273 100644
--- a/source/module_hamilt_lcao/module_tddft/evolve_psi.cpp
+++ b/source/module_hamilt_lcao/module_tddft/evolve_psi.cpp
@@ -1,9 +1,12 @@
 #include "evolve_psi.h"
 
-#include "bandenergy.h"
+#include "band_energy.h"
 #include "middle_hamilt.h"
 #include "module_base/lapack_connector.h"
+#include "module_base/module_container/ATen/kernels/blas.h"   // cuBLAS handle
+#include "module_base/module_container/ATen/kernels/lapack.h" // cuSOLVER handle
 #include "module_base/scalapack_connector.h"
+#include "module_esolver/esolver_ks_lcao_tddft.h" // use gatherMatrix
 #include "module_hamilt_lcao/hamilt_lcaodft/hamilt_lcao.h"
 #include "module_hamilt_pw/hamilt_pwdft/global.h"
 #include "module_parameter/parameter.h"
@@ -25,15 +28,18 @@ void evolve_psi(const int nband,
                 std::complex<double>* S_laststep,
                 double* ekb,
                 int htype,
-                int propagator)
+                int propagator,
+                std::ofstream& ofs_running,
+                const int print_matrix)
 {
+    ofs_running << " evolve_psi::start " << std::endl;
+
     ModuleBase::TITLE("Evolve_psi", "evolve_psi");
     time_t time_start = time(nullptr);
-    GlobalV::ofs_running << " Start Time : " << ctime(&time_start);
+    ofs_running << " Start Time : " << ctime(&time_start);
 
 #ifdef __MPI
 
-    int print_matrix = 0;
     hamilt::MatrixBlock<std::complex<double>> h_mat, s_mat;
     p_hamilt->matrix(h_mat, s_mat);
 
@@ -59,7 +65,7 @@ void evolve_psi(const int nband,
     /// @output Htmp
     if (htype == 1 && propagator != 2)
     {
-        half_Hmatrix(pv, nband, nlocal, Htmp, Stmp, H_laststep, S_laststep, print_matrix);
+        half_Hmatrix(pv, nband, nlocal, Htmp, Stmp, H_laststep, S_laststep, ofs_running, print_matrix);
     }
 
     // (2)->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
@@ -68,28 +74,28 @@ void evolve_psi(const int nband,
     /// @input Stmp, Htmp, print_matrix
     /// @output U_operator
     Propagator prop(propagator, pv, PARAM.mdp.md_dt);
-    prop.compute_propagator(nlocal, Stmp, Htmp, H_laststep, U_operator, print_matrix);
+    prop.compute_propagator(nlocal, Stmp, Htmp, H_laststep, U_operator, ofs_running, print_matrix);
 
     // (3)->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
 
     /// @brief apply U_operator to the wave function of the previous step for new wave function
     /// @input U_operator, psi_k_laststep, print_matrix
     /// @output psi_k
-    upsi(pv, nband, nlocal, U_operator, psi_k_laststep, psi_k, print_matrix);
+    upsi(pv, nband, nlocal, U_operator, psi_k_laststep, psi_k, ofs_running, print_matrix);
 
     // (4)->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
 
     /// @brief normalize psi_k
     /// @input Stmp, psi_not_norm, psi_k, print_matrix
     /// @output psi_k
-    norm_psi(pv, nband, nlocal, Stmp, psi_k, print_matrix);
+    norm_psi(pv, nband, nlocal, Stmp, psi_k, ofs_running, print_matrix);
 
     // (5)->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
 
     /// @brief compute ekb
     /// @input Htmp, psi_k
     /// @output ekb
-    compute_ekb(pv, nband, nlocal, Hold, psi_k, ekb);
+    compute_ekb(pv, nband, nlocal, Hold, psi_k, ekb, ofs_running);
 
     delete[] Stmp;
     delete[] Htmp;
@@ -101,6 +107,233 @@ void evolve_psi(const int nband,
     time_t time_end = time(nullptr);
     ModuleBase::GlobalFunc::OUT_TIME("evolve(std::complex)", time_start, time_end);
 
+    ofs_running << " evolve_psi::end " << std::endl;
+
     return;
 }
+
+template <typename Device>
+void evolve_psi_tensor(const int nband,
+                       const int nlocal,
+                       const Parallel_Orbitals* pv,
+                       hamilt::Hamilt<std::complex<double>>* p_hamilt,
+                       ct::Tensor& psi_k,
+                       ct::Tensor& psi_k_laststep,
+                       ct::Tensor& H_laststep,
+                       ct::Tensor& S_laststep,
+                       ct::Tensor& ekb,
+                       int htype,
+                       int propagator,
+                       std::ofstream& ofs_running,
+                       const int print_matrix,
+                       const bool use_lapack)
+{
+    // ct_device_type = ct::DeviceType::CpuDevice or ct::DeviceType::GpuDevice
+    ct::DeviceType ct_device_type = ct::DeviceTypeToEnum<Device>::value;
+    // ct_Device = ct::DEVICE_CPU or ct::DEVICE_GPU
+    using ct_Device = typename ct::PsiToContainer<Device>::type;
+    // Memory operations
+    using syncmem_complex_h2d_op
+        = base_device::memory::synchronize_memory_op<std::complex<double>, Device, base_device::DEVICE_CPU>;
+
+#if ((defined __CUDA) /* || (defined __ROCM) */)
+    // Initialize cuBLAS & cuSOLVER handle
+    ct::kernels::createGpuSolverHandle();
+    ct::kernels::createGpuBlasHandle();
+#endif // __CUDA
+
+    ofs_running << " evolve_psi_tensor::start " << std::endl;
+
+    ModuleBase::TITLE("Evolve_psi", "evolve_psi");
+    time_t time_start = time(nullptr);
+    ofs_running << " Start Time : " << ctime(&time_start);
+
+#ifdef __MPI
+
+    hamilt::MatrixBlock<std::complex<double>> h_mat, s_mat;
+    p_hamilt->matrix(h_mat, s_mat);
+
+    // Create Tensor objects for temporary data and sync from host to device
+    const int len_HS = use_lapack ? nlocal * nlocal : pv->nloc;
+    ct::Tensor Stmp(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({len_HS}));
+    ct::Tensor Htmp(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({len_HS}));
+    ct::Tensor Hold(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({len_HS}));
+
+    if (use_lapack)
+    {
+        // Need to gather H and S matrix to root process here
+        int myid = 0;
+        int num_procs = 1;
+        MPI_Comm_rank(MPI_COMM_WORLD, &myid);
+        MPI_Comm_size(MPI_COMM_WORLD, &num_procs);
+
+        ModuleESolver::Matrix_g<std::complex<double>> h_mat_g, s_mat_g; // Global matrix structure
+
+        // Collect H matrix
+        ModuleESolver::gatherMatrix(myid, 0, h_mat, h_mat_g);
+        syncmem_complex_h2d_op()(Htmp.data<std::complex<double>>(), h_mat_g.p.get(), len_HS);
+        syncmem_complex_h2d_op()(Hold.data<std::complex<double>>(), h_mat_g.p.get(), len_HS);
+
+        // Collect S matrix
+        ModuleESolver::gatherMatrix(myid, 0, s_mat, s_mat_g);
+        syncmem_complex_h2d_op()(Stmp.data<std::complex<double>>(), s_mat_g.p.get(), len_HS);
+    }
+    else
+    {
+        // Original code
+        syncmem_complex_h2d_op()(Stmp.data<std::complex<double>>(), s_mat.p, len_HS);
+        syncmem_complex_h2d_op()(Htmp.data<std::complex<double>>(), h_mat.p, len_HS);
+        syncmem_complex_h2d_op()(Hold.data<std::complex<double>>(), h_mat.p, len_HS);
+    }
+
+    ct::Tensor U_operator(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({len_HS}));
+    U_operator.zero();
+
+    int myid = 0;
+    int root_proc = 0;
+    MPI_Comm_rank(MPI_COMM_WORLD, &myid);
+
+    // (1)->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+
+    /// @brief compute H(t+dt/2)
+    /// @input H_laststep, Htmp, print_matrix
+    /// @output Htmp
+    if (htype == 1 && propagator != 2)
+    {
+        if (!use_lapack)
+        {
+            half_Hmatrix_tensor(pv, nband, nlocal, Htmp, Stmp, H_laststep, S_laststep, ofs_running, print_matrix);
+        }
+        else
+        {
+            if (myid == root_proc)
+            {
+                half_Hmatrix_tensor_lapack<Device>(pv,
+                                                   nband,
+                                                   nlocal,
+                                                   Htmp,
+                                                   Stmp,
+                                                   H_laststep,
+                                                   S_laststep,
+                                                   ofs_running,
+                                                   print_matrix);
+            }
+        }
+    }
+
+    // (2)->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+
+    /// @brief compute U_operator
+    /// @input Stmp, Htmp, print_matrix
+    /// @output U_operator
+    Propagator prop(propagator, pv, PARAM.mdp.md_dt);
+    prop.compute_propagator_tensor<Device>(nlocal,
+                                           Stmp,
+                                           Htmp,
+                                           H_laststep,
+                                           U_operator,
+                                           ofs_running,
+                                           print_matrix,
+                                           use_lapack);
+
+    // (3)->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+
+    /// @brief apply U_operator to the wave function of the previous step for new wave function
+    /// @input U_operator, psi_k_laststep, print_matrix
+    /// @output psi_k
+    if (!use_lapack)
+    {
+        upsi_tensor(pv, nband, nlocal, U_operator, psi_k_laststep, psi_k, ofs_running, print_matrix);
+    }
+    else
+    {
+        if (myid == root_proc)
+        {
+            upsi_tensor_lapack<Device>(pv, nband, nlocal, U_operator, psi_k_laststep, psi_k, ofs_running, print_matrix);
+        }
+    }
+
+    // (4)->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+
+    /// @brief normalize psi_k
+    /// @input Stmp, psi_not_norm, psi_k, print_matrix
+    /// @output psi_k
+    if (!use_lapack)
+    {
+        norm_psi_tensor(pv, nband, nlocal, Stmp, psi_k, ofs_running, print_matrix);
+    }
+    else
+    {
+        if (myid == root_proc)
+        {
+            norm_psi_tensor_lapack<Device>(pv, nband, nlocal, Stmp, psi_k, ofs_running, print_matrix);
+        }
+    }
+
+    // (5)->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+
+    /// @brief compute ekb
+    /// @input Htmp, psi_k
+    /// @output ekb
+    if (!use_lapack)
+    {
+        compute_ekb_tensor(pv, nband, nlocal, Hold, psi_k, ekb, ofs_running);
+    }
+    else
+    {
+        if (myid == root_proc)
+        {
+            compute_ekb_tensor_lapack<Device>(pv, nband, nlocal, Hold, psi_k, ekb, ofs_running);
+        }
+    }
+
+#endif // __MPI
+
+    time_t time_end = time(nullptr);
+    ModuleBase::GlobalFunc::OUT_TIME("evolve(std::complex)", time_start, time_end);
+
+    ofs_running << " evolve_psi_tensor::end " << std::endl;
+
+#if ((defined __CUDA) /* || (defined __ROCM) */)
+    // Destroy cuBLAS & cuSOLVER handle
+    ct::kernels::destroyGpuSolverHandle();
+    ct::kernels::destroyGpuBlasHandle();
+#endif // __CUDA
+
+    return;
+}
+
+// Explicit instantiation of template functions
+template void evolve_psi_tensor<base_device::DEVICE_CPU>(const int nband,
+                                                         const int nlocal,
+                                                         const Parallel_Orbitals* pv,
+                                                         hamilt::Hamilt<std::complex<double>>* p_hamilt,
+                                                         ct::Tensor& psi_k,
+                                                         ct::Tensor& psi_k_laststep,
+                                                         ct::Tensor& H_laststep,
+                                                         ct::Tensor& S_laststep,
+                                                         ct::Tensor& ekb,
+                                                         int htype,
+                                                         int propagator,
+                                                         std::ofstream& ofs_running,
+                                                         const int print_matrix,
+                                                         const bool use_lapack);
+
+#if ((defined __CUDA) /* || (defined __ROCM) */)
+template void evolve_psi_tensor<base_device::DEVICE_GPU>(const int nband,
+                                                         const int nlocal,
+                                                         const Parallel_Orbitals* pv,
+                                                         hamilt::Hamilt<std::complex<double>>* p_hamilt,
+                                                         ct::Tensor& psi_k,
+                                                         ct::Tensor& psi_k_laststep,
+                                                         ct::Tensor& H_laststep,
+                                                         ct::Tensor& S_laststep,
+                                                         ct::Tensor& ekb,
+                                                         int htype,
+                                                         int propagator,
+                                                         std::ofstream& ofs_running,
+                                                         const int print_matrix,
+                                                         const bool use_lapack);
+#endif // __CUDA
+
 } // namespace module_tddft
\ No newline at end of file
diff --git a/source/module_hamilt_lcao/module_tddft/evolve_psi.h b/source/module_hamilt_lcao/module_tddft/evolve_psi.h
index 3746b81f53..44f0db888a 100644
--- a/source/module_hamilt_lcao/module_tddft/evolve_psi.h
+++ b/source/module_hamilt_lcao/module_tddft/evolve_psi.h
@@ -6,6 +6,8 @@
 #ifndef ELEC_PSI_H
 #define ELEC_PSI_H
 
+#include "module_base/module_container/ATen/core/tensor.h"     // ct::Tensor
+#include "module_base/module_container/ATen/core/tensor_map.h" // TensorMap
 #include "module_basis/module_ao/parallel_orbitals.h"
 #include "module_hamilt_lcao/hamilt_lcaodft/hamilt_lcao.h"
 
@@ -21,7 +23,25 @@ void evolve_psi(const int nband,
                 std::complex<double>* S_laststep,
                 double* ekb,
                 int htype,
-                int propagator);
-}
+                int propagator,
+                std::ofstream& ofs_running,
+                const int print_matrix);
+
+template <typename Device>
+void evolve_psi_tensor(const int nband,
+                       const int nlocal,
+                       const Parallel_Orbitals* pv,
+                       hamilt::Hamilt<std::complex<double>>* p_hamilt,
+                       ct::Tensor& psi_k,
+                       ct::Tensor& psi_k_laststep,
+                       ct::Tensor& H_laststep,
+                       ct::Tensor& S_laststep,
+                       ct::Tensor& ekb,
+                       int htype,
+                       int propagator,
+                       std::ofstream& ofs_running,
+                       const int print_matrix,
+                       const bool use_lapack);
+} // namespace module_tddft
 
 #endif
\ No newline at end of file
diff --git a/source/module_hamilt_lcao/module_tddft/middle_hamilt.cpp b/source/module_hamilt_lcao/module_tddft/middle_hamilt.cpp
index b53ae51bdc..94b3dc3f5d 100644
--- a/source/module_hamilt_lcao/module_tddft/middle_hamilt.cpp
+++ b/source/module_hamilt_lcao/module_tddft/middle_hamilt.cpp
@@ -1,11 +1,13 @@
 #include "middle_hamilt.h"
 
-#include <complex>
-#include <iostream>
-
 #include "module_base/lapack_connector.h"
+#include "module_base/module_container/ATen/kernels/blas.h"
+#include "module_base/module_device/memory_op.h" // memory operations
 #include "module_base/scalapack_connector.h"
 
+#include <complex>
+#include <iostream>
+
 namespace module_tddft
 {
 #ifdef __MPI
@@ -17,34 +19,36 @@ void half_Hmatrix(const Parallel_Orbitals* pv,
                   std::complex<double>* Stmp,
                   const std::complex<double>* H_laststep,
                   const std::complex<double>* S_laststep,
+                  std::ofstream& ofs_running,
                   const int print_matrix)
 {
     if (print_matrix)
     {
-        GlobalV::ofs_running << std::setprecision(10);
-        GlobalV::ofs_running << std::endl;
-        GlobalV::ofs_running << " H(t+dt) :" << std::endl;
+        ofs_running << std::setprecision(10);
+        ofs_running << std::endl;
+        ofs_running << " H(t+dt) :" << std::endl;
         for (int i = 0; i < pv->nrow; i++)
         {
+            const int in = i * pv->ncol;
             for (int j = 0; j < pv->ncol; j++)
             {
-                GlobalV::ofs_running << Htmp[i * pv->ncol + j].real() << "+" << Htmp[i * pv->ncol + j].imag() << "i ";
+                ofs_running << Htmp[in + j].real() << "+" << Htmp[in + j].imag() << "i ";
             }
-            GlobalV::ofs_running << std::endl;
+            ofs_running << std::endl;
         }
-        GlobalV::ofs_running << std::endl;
-        GlobalV::ofs_running << std::endl;
-        GlobalV::ofs_running << " H(t):" << std::endl;
+        ofs_running << std::endl;
+        ofs_running << std::endl;
+        ofs_running << " H(t):" << std::endl;
         for (int i = 0; i < pv->nrow; i++)
         {
+            const int in = i * pv->ncol;
             for (int j = 0; j < pv->ncol; j++)
             {
-                GlobalV::ofs_running << H_laststep[i * pv->ncol + j].real() << "+"
-                                     << H_laststep[i * pv->ncol + j].imag() << "i ";
+                ofs_running << H_laststep[in + j].real() << "+" << H_laststep[in + j].imag() << "i ";
             }
-            GlobalV::ofs_running << std::endl;
+            ofs_running << std::endl;
         }
-        GlobalV::ofs_running << std::endl;
+        ofs_running << std::endl;
     }
 
     std::complex<double> alpha = {0.5, 0.0};
@@ -54,18 +58,233 @@ void half_Hmatrix(const Parallel_Orbitals* pv,
 
     if (print_matrix)
     {
-        GlobalV::ofs_running << std::endl;
-        GlobalV::ofs_running << " H (t+dt/2) :" << std::endl;
+        ofs_running << std::endl;
+        ofs_running << " H (t+dt/2) :" << std::endl;
+        for (int i = 0; i < pv->nrow; i++)
+        {
+            const int in = i * pv->ncol;
+            for (int j = 0; j < pv->ncol; j++)
+            {
+                ofs_running << Htmp[in + j].real() << "+" << Htmp[in + j].imag() << "i ";
+            }
+            ofs_running << std::endl;
+        }
+        ofs_running << std::endl;
+    }
+}
+
+void half_Hmatrix_tensor(const Parallel_Orbitals* pv,
+                         const int nband,
+                         const int nlocal,
+                         ct::Tensor& Htmp,
+                         ct::Tensor& Stmp,
+                         const ct::Tensor& H_laststep,
+                         const ct::Tensor& S_laststep,
+                         std::ofstream& ofs_running,
+                         const int print_matrix)
+{
+    if (print_matrix)
+    {
+        ofs_running << std::setprecision(10);
+        ofs_running << std::endl;
+        ofs_running << " H(t+dt) :" << std::endl;
+        for (int i = 0; i < pv->nrow; i++)
+        {
+            const int in = i * pv->ncol;
+            for (int j = 0; j < pv->ncol; j++)
+            {
+                ofs_running << Htmp.data<std::complex<double>>()[in + j].real() << "+"
+                            << Htmp.data<std::complex<double>>()[in + j].imag() << "i ";
+            }
+            ofs_running << std::endl;
+        }
+        ofs_running << std::endl;
+        ofs_running << std::endl;
+        ofs_running << " H(t):" << std::endl;
+        for (int i = 0; i < pv->nrow; i++)
+        {
+            const int in = i * pv->ncol;
+            for (int j = 0; j < pv->ncol; j++)
+            {
+                ofs_running << H_laststep.data<std::complex<double>>()[in + j].real() << "+"
+                            << H_laststep.data<std::complex<double>>()[in + j].imag() << "i ";
+            }
+            ofs_running << std::endl;
+        }
+        ofs_running << std::endl;
+    }
+
+    std::complex<double> alpha = {0.5, 0.0};
+    std::complex<double> beta = {0.5, 0.0};
+
+    // Perform the operation Htmp = alpha * H_laststep + beta * Htmp
+    ScalapackConnector::geadd('N',
+                              nlocal,
+                              nlocal,
+                              alpha,
+                              H_laststep.data<std::complex<double>>(),
+                              1,
+                              1,
+                              pv->desc,
+                              beta,
+                              Htmp.data<std::complex<double>>(),
+                              1,
+                              1,
+                              pv->desc);
+
+    // Perform the operation Stmp = alpha * S_laststep + beta * Stmp
+    ScalapackConnector::geadd('N',
+                              nlocal,
+                              nlocal,
+                              alpha,
+                              S_laststep.data<std::complex<double>>(),
+                              1,
+                              1,
+                              pv->desc,
+                              beta,
+                              Stmp.data<std::complex<double>>(),
+                              1,
+                              1,
+                              pv->desc);
+
+    if (print_matrix)
+    {
+        ofs_running << std::endl;
+        ofs_running << " H (t+dt/2) :" << std::endl;
         for (int i = 0; i < pv->nrow; i++)
         {
+            const int in = i * pv->ncol;
             for (int j = 0; j < pv->ncol; j++)
             {
-                GlobalV::ofs_running << Htmp[i * pv->ncol + j].real() << "+" << Htmp[i * pv->ncol + j].imag() << "i ";
+                ofs_running << Htmp.data<std::complex<double>>()[in + j].real() << "+"
+                            << Htmp.data<std::complex<double>>()[in + j].imag() << "i ";
+            }
+            ofs_running << std::endl;
+        }
+        ofs_running << std::endl;
+    }
+}
+
+template <typename Device>
+void half_Hmatrix_tensor_lapack(const Parallel_Orbitals* pv,
+                                const int nband,
+                                const int nlocal,
+                                ct::Tensor& Htmp,
+                                ct::Tensor& Stmp,
+                                const ct::Tensor& H_laststep,
+                                const ct::Tensor& S_laststep,
+                                std::ofstream& ofs_running,
+                                const int print_matrix)
+{
+    // ct_device_type = ct::DeviceType::CpuDevice or ct::DeviceType::GpuDevice
+    ct::DeviceType ct_device_type = ct::DeviceTypeToEnum<Device>::value;
+    // ct_Device = ct::DEVICE_CPU or ct::DEVICE_GPU
+    using ct_Device = typename ct::PsiToContainer<Device>::type;
+
+    if (print_matrix)
+    {
+        ct::Tensor Htmp_cpu = Htmp.to_device<ct::DEVICE_CPU>();
+        ct::Tensor H_laststep_cpu = H_laststep.to_device<ct::DEVICE_CPU>();
+
+        ofs_running << std::setprecision(10);
+        ofs_running << std::endl;
+        ofs_running << " H(t+dt) :" << std::endl;
+        for (int i = 0; i < nlocal; i++)
+        {
+            const int in = i * nlocal;
+            for (int j = 0; j < nlocal; j++)
+            {
+                ofs_running << Htmp_cpu.data<std::complex<double>>()[in + j].real() << "+"
+                            << Htmp_cpu.data<std::complex<double>>()[in + j].imag() << "i ";
             }
-            GlobalV::ofs_running << std::endl;
+            ofs_running << std::endl;
         }
-        GlobalV::ofs_running << std::endl;
+        ofs_running << std::endl;
+        ofs_running << std::endl;
+        ofs_running << " H(t):" << std::endl;
+        for (int i = 0; i < nlocal; i++)
+        {
+            const int in = i * nlocal;
+            for (int j = 0; j < nlocal; j++)
+            {
+                ofs_running << H_laststep_cpu.data<std::complex<double>>()[in + j].real() << "+"
+                            << H_laststep_cpu.data<std::complex<double>>()[in + j].imag() << "i ";
+            }
+            ofs_running << std::endl;
+        }
+        ofs_running << std::endl;
+    }
+
+    std::complex<double> one_half = {0.5, 0.0};
+
+    // Perform the operation Htmp = one_half * H_laststep + one_half * Htmp
+    // Scale Htmp by one_half
+    ct::kernels::blas_scal<std::complex<double>, ct_Device>()(nlocal * nlocal,
+                                                              &one_half,
+                                                              Htmp.data<std::complex<double>>(),
+                                                              1);
+    // Htmp = one_half * H_laststep + Htmp
+    ct::kernels::blas_axpy<std::complex<double>, ct_Device>()(nlocal * nlocal,
+                                                              &one_half,
+                                                              H_laststep.data<std::complex<double>>(),
+                                                              1,
+                                                              Htmp.data<std::complex<double>>(),
+                                                              1);
+
+    // Perform the operation Stmp = one_half * S_laststep + one_half * Stmp
+    // Scale Stmp by one_half
+    ct::kernels::blas_scal<std::complex<double>, ct_Device>()(nlocal * nlocal,
+                                                              &one_half,
+                                                              Stmp.data<std::complex<double>>(),
+                                                              1);
+    // Stmp = one_half * S_laststep + Stmp
+    ct::kernels::blas_axpy<std::complex<double>, ct_Device>()(nlocal * nlocal,
+                                                              &one_half,
+                                                              S_laststep.data<std::complex<double>>(),
+                                                              1,
+                                                              Stmp.data<std::complex<double>>(),
+                                                              1);
+
+    if (print_matrix)
+    {
+        ct::Tensor Htmp_cpu = Htmp.to_device<ct::DEVICE_CPU>();
+
+        ofs_running << std::endl;
+        ofs_running << " H (t+dt/2) :" << std::endl;
+        for (int i = 0; i < nlocal; i++)
+        {
+            const int in = i * nlocal;
+            for (int j = 0; j < nlocal; j++)
+            {
+                ofs_running << Htmp_cpu.data<std::complex<double>>()[in + j].real() << "+"
+                            << Htmp_cpu.data<std::complex<double>>()[in + j].imag() << "i ";
+            }
+            ofs_running << std::endl;
+        }
+        ofs_running << std::endl;
     }
 }
-#endif
+
+// Explicit instantiation of template functions
+template void half_Hmatrix_tensor_lapack<base_device::DEVICE_CPU>(const Parallel_Orbitals* pv,
+                                                                  const int nband,
+                                                                  const int nlocal,
+                                                                  ct::Tensor& Htmp,
+                                                                  ct::Tensor& Stmp,
+                                                                  const ct::Tensor& H_laststep,
+                                                                  const ct::Tensor& S_laststep,
+                                                                  std::ofstream& ofs_running,
+                                                                  const int print_matrix);
+#if ((defined __CUDA) /* || (defined __ROCM) */)
+template void half_Hmatrix_tensor_lapack<base_device::DEVICE_GPU>(const Parallel_Orbitals* pv,
+                                                                  const int nband,
+                                                                  const int nlocal,
+                                                                  ct::Tensor& Htmp,
+                                                                  ct::Tensor& Stmp,
+                                                                  const ct::Tensor& H_laststep,
+                                                                  const ct::Tensor& S_laststep,
+                                                                  std::ofstream& ofs_running,
+                                                                  const int print_matrix);
+#endif // __CUDA
+#endif // __MPI
 } // namespace module_tddft
\ No newline at end of file
diff --git a/source/module_hamilt_lcao/module_tddft/middle_hamilt.h b/source/module_hamilt_lcao/module_tddft/middle_hamilt.h
index ac230b6444..4b153e3166 100644
--- a/source/module_hamilt_lcao/module_tddft/middle_hamilt.h
+++ b/source/module_hamilt_lcao/module_tddft/middle_hamilt.h
@@ -6,7 +6,9 @@
 #ifndef MIDDLE_HAMILT_H
 #define MIDDLE_HAMILT_H
 
+#include "module_base/module_container/ATen/core/tensor.h" // ct::Tensor
 #include "module_basis/module_ao/parallel_orbitals.h"
+
 #include <complex>
 
 namespace module_tddft
@@ -30,8 +32,31 @@ void half_Hmatrix(const Parallel_Orbitals* pv,
                   std::complex<double>* Stmp,
                   const std::complex<double>* H_laststep,
                   const std::complex<double>* S_laststep,
+                  std::ofstream& ofs_running,
                   const int print_matrix);
-#endif
+
+void half_Hmatrix_tensor(const Parallel_Orbitals* pv,
+                         const int nband,
+                         const int nlocal,
+                         ct::Tensor& Htmp,
+                         ct::Tensor& Stmp,
+                         const ct::Tensor& H_laststep,
+                         const ct::Tensor& S_laststep,
+                         std::ofstream& ofs_running,
+                         const int print_matrix);
+
+template <typename Device>
+void half_Hmatrix_tensor_lapack(const Parallel_Orbitals* pv,
+                                const int nband,
+                                const int nlocal,
+                                ct::Tensor& Htmp,
+                                ct::Tensor& Stmp,
+                                const ct::Tensor& H_laststep,
+                                const ct::Tensor& S_laststep,
+                                std::ofstream& ofs_running,
+                                const int print_matrix);
+
+#endif // __MPI
 } // namespace module_tddft
 
 #endif
diff --git a/source/module_hamilt_lcao/module_tddft/norm_psi.cpp b/source/module_hamilt_lcao/module_tddft/norm_psi.cpp
index cf3698b3ee..bbd7ea8956 100644
--- a/source/module_hamilt_lcao/module_tddft/norm_psi.cpp
+++ b/source/module_hamilt_lcao/module_tddft/norm_psi.cpp
@@ -1,11 +1,12 @@
 #include "norm_psi.h"
 
-#include <complex>
-#include <iostream>
-
 #include "module_base/lapack_connector.h"
+#include "module_base/module_container/ATen/kernels/blas.h"
 #include "module_base/scalapack_connector.h"
 
+#include <complex>
+#include <iostream>
+
 namespace module_tddft
 {
 #ifdef __MPI
@@ -23,8 +24,11 @@ void norm_psi(const Parallel_Orbitals* pv,
               const int nlocal,
               const std::complex<double>* Stmp,
               std::complex<double>* psi_k,
+              std::ofstream& ofs_running,
               const int print_matrix)
 {
+    assert(pv->nloc_wfc > 0 && pv->nloc > 0);
+
     std::complex<double>* tmp1 = new std::complex<double>[pv->nloc_wfc];
     ModuleBase::GlobalFunc::ZEROS(tmp1, pv->nloc_wfc);
 
@@ -73,28 +77,30 @@ void norm_psi(const Parallel_Orbitals* pv,
 
     if (print_matrix)
     {
-        GlobalV::ofs_running << "original Cij :" << std::endl;
+        ofs_running << "original Cij :" << std::endl;
         for (int i = 0; i < pv->ncol; i++)
         {
+            const int in = i * pv->ncol;
             for (int j = 0; j < pv->nrow; j++)
             {
-                double aa, bb;
-                aa = Cij[i * pv->ncol + j].real();
-                bb = Cij[i * pv->ncol + j].imag();
-                if (std::abs(aa) < 1e-8) {
+                double aa = Cij[in + j].real();
+                double bb = Cij[in + j].imag();
+                if (std::abs(aa) < 1e-8)
+                {
                     aa = 0.0;
-}
-                if (std::abs(bb) < 1e-8) {
+                }
+                if (std::abs(bb) < 1e-8)
+                {
                     bb = 0.0;
-}
-                GlobalV::ofs_running << aa << "+" << bb << "i ";
+                }
+                ofs_running << aa << "+" << bb << "i ";
             }
-            GlobalV::ofs_running << std::endl;
+            ofs_running << std::endl;
         }
-        GlobalV::ofs_running << std::endl;
+        ofs_running << std::endl;
     }
 
-    int naroc[2]; // maximum number of row or column
+    int naroc[2] = {0, 0}; // maximum number of row or column
 
     for (int iprow = 0; iprow < pv->dim0; ++iprow)
     {
@@ -107,15 +113,17 @@ void norm_psi(const Parallel_Orbitals* pv,
                 for (int j = 0; j < naroc[1]; ++j)
                 {
                     int igcol = globalIndex(j, pv->nb, pv->dim1, ipcol);
-                    if (igcol >= nband) {
+                    if (igcol >= nband)
+                    {
                         continue;
-}
+                    }
                     for (int i = 0; i < naroc[0]; ++i)
                     {
                         int igrow = globalIndex(i, pv->nb, pv->dim0, iprow);
-                        if (igrow >= nband) {
+                        if (igrow >= nband)
+                        {
                             continue;
-}
+                        }
                         if (igcol == igrow)
                         {
                             Cij[j * naroc[0] + i] = {1.0 / sqrt(Cij[j * naroc[0] + i].real()), 0.0};
@@ -128,7 +136,7 @@ void norm_psi(const Parallel_Orbitals* pv,
                 }
             }
         } // loop ipcol
-    }     // loop iprow
+    } // loop iprow
 
     BlasConnector::copy(pv->nloc_wfc, psi_k, 1, tmp1, 1);
 
@@ -154,60 +162,510 @@ void norm_psi(const Parallel_Orbitals* pv,
 
     if (print_matrix)
     {
-        GlobalV::ofs_running << " Cij:" << std::endl;
+        ofs_running << " Cij:" << std::endl;
         for (int i = 0; i < pv->ncol; i++)
         {
+            const int in = i * pv->ncol;
             for (int j = 0; j < pv->nrow; j++)
             {
-                GlobalV::ofs_running << Cij[i * pv->ncol + j].real() << "+" << Cij[i * pv->ncol + j].imag() << "i ";
+                ofs_running << Cij[in + j].real() << "+" << Cij[in + j].imag() << "i ";
             }
-            GlobalV::ofs_running << std::endl;
+            ofs_running << std::endl;
         }
-        GlobalV::ofs_running << std::endl;
-        GlobalV::ofs_running << std::endl;
-        GlobalV::ofs_running << " psi_k:" << std::endl;
+        ofs_running << std::endl;
+        ofs_running << std::endl;
+        ofs_running << " psi_k:" << std::endl;
         for (int i = 0; i < pv->ncol_bands; i++)
         {
+            const int in = i * pv->ncol;
             for (int j = 0; j < pv->ncol; j++)
             {
-                double aa, bb;
-                aa = psi_k[i * pv->ncol + j].real();
-                bb = psi_k[i * pv->ncol + j].imag();
-                if (std::abs(aa) < 1e-8) {
+                double aa = psi_k[in + j].real();
+                double bb = psi_k[in + j].imag();
+                if (std::abs(aa) < 1e-8)
+                {
                     aa = 0.0;
-}
-                if (std::abs(bb) < 1e-8) {
+                }
+                if (std::abs(bb) < 1e-8)
+                {
+                    bb = 0.0;
+                }
+                ofs_running << aa << "+" << bb << "i ";
+            }
+            ofs_running << std::endl;
+        }
+        ofs_running << std::endl;
+        ofs_running << " psi_k before normalization:" << std::endl;
+        for (int i = 0; i < pv->ncol_bands; i++)
+        {
+            const int in = i * pv->ncol;
+            for (int j = 0; j < pv->ncol; j++)
+            {
+                double aa = tmp1[in + j].real();
+                double bb = tmp1[in + j].imag();
+                if (std::abs(aa) < 1e-8)
+                {
+                    aa = 0.0;
+                }
+                if (std::abs(bb) < 1e-8)
+                {
                     bb = 0.0;
+                }
+                ofs_running << aa << "+" << bb << "i ";
+            }
+            ofs_running << std::endl;
+        }
+        ofs_running << std::endl;
+        ofs_running << std::endl;
+    }
+
+    delete[] tmp1;
+    delete[] Cij;
 }
-                GlobalV::ofs_running << aa << "+" << bb << "i ";
+
+void norm_psi_tensor(const Parallel_Orbitals* pv,
+                     const int nband,
+                     const int nlocal,
+                     const ct::Tensor& Stmp,
+                     ct::Tensor& psi_k,
+                     std::ofstream& ofs_running,
+                     const int print_matrix)
+{
+    assert(pv->nloc_wfc > 0 && pv->nloc > 0);
+
+    // Create Tensor objects for temporary data
+    ct::Tensor tmp1(ct::DataType::DT_COMPLEX_DOUBLE, ct::DeviceType::CpuDevice, ct::TensorShape({pv->nloc_wfc}));
+    tmp1.zero();
+
+    ct::Tensor Cij(ct::DataType::DT_COMPLEX_DOUBLE, ct::DeviceType::CpuDevice, ct::TensorShape({pv->nloc}));
+    Cij.zero();
+
+    // Perform matrix multiplication: tmp1 = Stmp * psi_k
+    ScalapackConnector::gemm('N',
+                             'N',
+                             nlocal,
+                             nband,
+                             nlocal,
+                             1.0,
+                             Stmp.data<std::complex<double>>(),
+                             1,
+                             1,
+                             pv->desc,
+                             psi_k.data<std::complex<double>>(),
+                             1,
+                             1,
+                             pv->desc_wfc,
+                             0.0,
+                             tmp1.data<std::complex<double>>(),
+                             1,
+                             1,
+                             pv->desc_wfc);
+
+    // Perform matrix multiplication: Cij = psi_k^dagger * tmp1
+    ScalapackConnector::gemm('C',
+                             'N',
+                             nband,
+                             nband,
+                             nlocal,
+                             1.0,
+                             psi_k.data<std::complex<double>>(),
+                             1,
+                             1,
+                             pv->desc_wfc,
+                             tmp1.data<std::complex<double>>(),
+                             1,
+                             1,
+                             pv->desc_wfc,
+                             0.0,
+                             Cij.data<std::complex<double>>(),
+                             1,
+                             1,
+                             pv->desc_Eij);
+
+    if (print_matrix)
+    {
+        ofs_running << "original Cij :" << std::endl;
+        for (int i = 0; i < pv->ncol; i++)
+        {
+            const int in = i * pv->ncol;
+            for (int j = 0; j < pv->nrow; j++)
+            {
+                double aa = Cij.data<std::complex<double>>()[in + j].real();
+                double bb = Cij.data<std::complex<double>>()[in + j].imag();
+                if (std::abs(aa) < 1e-8)
+                {
+                    aa = 0.0;
+                }
+                if (std::abs(bb) < 1e-8)
+                {
+                    bb = 0.0;
+                }
+                ofs_running << aa << "+" << bb << "i ";
             }
-            GlobalV::ofs_running << std::endl;
+            ofs_running << std::endl;
         }
-        GlobalV::ofs_running << std::endl;
-        GlobalV::ofs_running << " psi_k before normalization:" << std::endl;
+        ofs_running << std::endl;
+    }
+
+    int naroc[2] = {0, 0}; // maximum number of row or column
+
+    for (int iprow = 0; iprow < pv->dim0; ++iprow)
+    {
+        for (int ipcol = 0; ipcol < pv->dim1; ++ipcol)
+        {
+            if (iprow == pv->coord[0] && ipcol == pv->coord[1])
+            {
+                naroc[0] = pv->nrow;
+                naroc[1] = pv->ncol;
+                for (int j = 0; j < naroc[1]; ++j)
+                {
+                    int igcol = globalIndex(j, pv->nb, pv->dim1, ipcol);
+                    if (igcol >= nband)
+                    {
+                        continue;
+                    }
+                    for (int i = 0; i < naroc[0]; ++i)
+                    {
+                        int igrow = globalIndex(i, pv->nb, pv->dim0, iprow);
+                        if (igrow >= nband)
+                        {
+                            continue;
+                        }
+                        if (igcol == igrow)
+                        {
+                            Cij.data<std::complex<double>>()[j * naroc[0] + i]
+                                = {1.0 / sqrt(Cij.data<std::complex<double>>()[j * naroc[0] + i].real()), 0.0};
+                        }
+                        else
+                        {
+                            Cij.data<std::complex<double>>()[j * naroc[0] + i] = {0.0, 0.0};
+                        }
+                    }
+                }
+            }
+        } // loop ipcol
+    } // loop iprow
+
+    // Copy psi_k to tmp1 (using deep copy)
+    // tmp1.CopyFrom(psi_k); // Does not work because this will cause tmp1 and psi_k to share the same data
+    tmp1 = psi_k; // operator= overload for Tensor class
+
+    // Perform matrix multiplication: psi_k = tmp1 * Cij
+    ScalapackConnector::gemm('N',
+                             'N',
+                             nlocal,
+                             nband,
+                             nband,
+                             1.0,
+                             tmp1.data<std::complex<double>>(),
+                             1,
+                             1,
+                             pv->desc_wfc,
+                             Cij.data<std::complex<double>>(),
+                             1,
+                             1,
+                             pv->desc_Eij,
+                             0.0,
+                             psi_k.data<std::complex<double>>(),
+                             1,
+                             1,
+                             pv->desc_wfc);
+
+    if (print_matrix)
+    {
+        ofs_running << " Cij:" << std::endl;
         for (int i = 0; i < pv->ncol; i++)
         {
+            const int in = i * pv->ncol;
+            for (int j = 0; j < pv->nrow; j++)
+            {
+                ofs_running << Cij.data<std::complex<double>>()[in + j].real() << "+"
+                            << Cij.data<std::complex<double>>()[in + j].imag() << "i ";
+            }
+            ofs_running << std::endl;
+        }
+        ofs_running << std::endl;
+        ofs_running << std::endl;
+        ofs_running << " psi_k:" << std::endl;
+        for (int i = 0; i < pv->ncol_bands; i++)
+        {
+            const int in = i * pv->ncol;
             for (int j = 0; j < pv->ncol; j++)
             {
-                double aa, bb;
-                aa = tmp1[i * pv->ncol + j].real();
-                bb = tmp1[i * pv->ncol + j].imag();
-                if (std::abs(aa) < 1e-8) {
+                double aa = psi_k.data<std::complex<double>>()[in + j].real();
+                double bb = psi_k.data<std::complex<double>>()[in + j].imag();
+                if (std::abs(aa) < 1e-8)
+                {
                     aa = 0.0;
-}
-                if (std::abs(bb) < 1e-8) {
+                }
+                if (std::abs(bb) < 1e-8)
+                {
+                    bb = 0.0;
+                }
+                ofs_running << aa << "+" << bb << "i ";
+            }
+            ofs_running << std::endl;
+        }
+        ofs_running << std::endl;
+        ofs_running << " psi_k before normalization:" << std::endl;
+        for (int i = 0; i < pv->ncol_bands; i++)
+        {
+            const int in = i * pv->ncol;
+            for (int j = 0; j < pv->ncol; j++)
+            {
+                double aa = tmp1.data<std::complex<double>>()[in + j].real();
+                double bb = tmp1.data<std::complex<double>>()[in + j].imag();
+                if (std::abs(aa) < 1e-8)
+                {
+                    aa = 0.0;
+                }
+                if (std::abs(bb) < 1e-8)
+                {
                     bb = 0.0;
+                }
+                ofs_running << aa << "+" << bb << "i ";
+            }
+            ofs_running << std::endl;
+        }
+        ofs_running << std::endl;
+        ofs_running << std::endl;
+    }
 }
-                GlobalV::ofs_running << aa << "+" << bb << "i ";
+
+template <typename Device>
+void norm_psi_tensor_lapack(const Parallel_Orbitals* pv,
+                            const int nband,
+                            const int nlocal,
+                            const ct::Tensor& Stmp,
+                            ct::Tensor& psi_k,
+                            std::ofstream& ofs_running,
+                            const int print_matrix)
+{
+    // ct_device_type = ct::DeviceType::CpuDevice or ct::DeviceType::GpuDevice
+    ct::DeviceType ct_device_type = ct::DeviceTypeToEnum<Device>::value;
+    // ct_Device = ct::DEVICE_CPU or ct::DEVICE_GPU
+    using ct_Device = typename ct::PsiToContainer<Device>::type;
+
+    // Create Tensor objects for temporary data
+    ct::Tensor tmp1(
+        ct::DataType::DT_COMPLEX_DOUBLE,
+        ct_device_type,
+        ct::TensorShape({nlocal * nband})); // tmp1 shape: nlocal * nband (under 2D block cyclic is pv->nloc_wfc)
+    tmp1.zero();
+
+    ct::Tensor Cij(ct::DataType::DT_COMPLEX_DOUBLE,
+                   ct_device_type,
+                   ct::TensorShape({nlocal * nlocal})); // Cij shape: nlocal * nlocal
+    Cij.zero();
+
+    std::complex<double> alpha = {1.0, 0.0};
+    std::complex<double> beta = {0.0, 0.0};
+
+    // Perform matrix multiplication: tmp1 = Stmp * psi_k
+    ct::kernels::blas_gemm<std::complex<double>, ct_Device>()('N',
+                                                              'N',
+                                                              nlocal,
+                                                              nband,
+                                                              nlocal,
+                                                              &alpha,
+                                                              Stmp.data<std::complex<double>>(),
+                                                              nlocal, // Leading dimension of Stmp
+                                                              psi_k.data<std::complex<double>>(),
+                                                              nlocal, // Leading dimension of psi_k
+                                                              &beta,
+                                                              tmp1.data<std::complex<double>>(),
+                                                              nlocal); // Leading dimension of tmp1
+
+    // Perform matrix multiplication: Cij = psi_k^dagger * tmp1
+    ct::kernels::blas_gemm<std::complex<double>, ct_Device>()('C',
+                                                              'N',
+                                                              nband,
+                                                              nband,
+                                                              nlocal,
+                                                              &alpha,
+                                                              psi_k.data<std::complex<double>>(),
+                                                              nlocal, // Leading dimension of psi_k
+                                                              tmp1.data<std::complex<double>>(),
+                                                              nlocal, // Leading dimension of tmp1
+                                                              &beta,
+                                                              Cij.data<std::complex<double>>(),
+                                                              nlocal); // Leading dimension of Cij
+
+    if (print_matrix)
+    {
+        ct::Tensor Cij_print_cpu = Cij.to_device<ct::DEVICE_CPU>();
+
+        ofs_running << "original Cij :" << std::endl;
+        for (int i = 0; i < nlocal; i++)
+        {
+            const int in = i * nlocal;
+            for (int j = 0; j < nlocal; j++)
+            {
+                double aa = Cij_print_cpu.data<std::complex<double>>()[in + j].real();
+                double bb = Cij_print_cpu.data<std::complex<double>>()[in + j].imag();
+                if (std::abs(aa) < 1e-8)
+                {
+                    aa = 0.0;
+                }
+                if (std::abs(bb) < 1e-8)
+                {
+                    bb = 0.0;
+                }
+                ofs_running << aa << "+" << bb << "i ";
             }
-            GlobalV::ofs_running << std::endl;
+            ofs_running << std::endl;
         }
-        GlobalV::ofs_running << std::endl;
-        GlobalV::ofs_running << std::endl;
+        ofs_running << std::endl;
     }
 
-    delete[] tmp1;
-    delete[] Cij;
+    // Normalize Cij: set diagonal elements to 1/sqrt(Cij[i][i]), off-diagonal elements to 0
+    if (ct_device_type == ct::DeviceType::GpuDevice)
+    {
+        // Step 1: Copy Cij from GPU to CPU
+        ct::Tensor Cij_cpu = Cij.to_device<ct::DEVICE_CPU>();
+
+        // Step 2: Perform normalization on CPU
+        for (int i = 0; i < nband; ++i)
+        {
+            const int in = i * nlocal;
+            for (int j = 0; j < nband; ++j)
+            {
+                if (i == j)
+                {
+                    Cij_cpu.data<std::complex<double>>()[in + j]
+                        = {1.0 / sqrt(Cij_cpu.data<std::complex<double>>()[in + j].real()), 0.0};
+                }
+                else
+                {
+                    Cij_cpu.data<std::complex<double>>()[in + j] = {0.0, 0.0};
+                }
+            }
+        }
+
+        // Step 3: Copy normalized Cij back to GPU
+        Cij = Cij_cpu.to_device<ct_Device>();
+    }
+    else
+    {
+        // CPU implementation
+        for (int i = 0; i < nband; ++i)
+        {
+            const int in = i * nlocal;
+            for (int j = 0; j < nband; ++j)
+            {
+                if (i == j)
+                {
+                    Cij.data<std::complex<double>>()[in + j]
+                        = {1.0 / sqrt(Cij.data<std::complex<double>>()[in + j].real()), 0.0};
+                }
+                else
+                {
+                    Cij.data<std::complex<double>>()[in + j] = {0.0, 0.0};
+                }
+            }
+        }
+    }
+
+    // Copy psi_k to tmp1 (using deep copy)
+    // tmp1.CopyFrom(psi_k); // Does not work because this will cause tmp1 and psi_k to share the same data
+    tmp1 = psi_k; // operator= overload for Tensor class
+
+    // Perform matrix multiplication: psi_k = tmp1 * Cij
+    ct::kernels::blas_gemm<std::complex<double>, ct_Device>()('N',
+                                                              'N',
+                                                              nlocal,
+                                                              nband,
+                                                              nband,
+                                                              &alpha,
+                                                              tmp1.data<std::complex<double>>(),
+                                                              nlocal, // Leading dimension of tmp1
+                                                              Cij.data<std::complex<double>>(),
+                                                              nlocal, // Leading dimension of Cij
+                                                              &beta,
+                                                              psi_k.data<std::complex<double>>(),
+                                                              nlocal); // Leading dimension of psi_k
+
+    if (print_matrix)
+    {
+        ct::Tensor Cij_print_cpu = Cij.to_device<ct::DEVICE_CPU>();
+        ct::Tensor psi_k_cpu = psi_k.to_device<ct::DEVICE_CPU>();
+        ct::Tensor tmp1_cpu = tmp1.to_device<ct::DEVICE_CPU>();
+
+        ofs_running << " Cij:" << std::endl;
+        for (int i = 0; i < nlocal; i++)
+        {
+            const int in = i * nlocal;
+            for (int j = 0; j < nlocal; j++)
+            {
+                ofs_running << Cij_print_cpu.data<std::complex<double>>()[in + j].real() << "+"
+                            << Cij_print_cpu.data<std::complex<double>>()[in + j].imag() << "i ";
+            }
+            ofs_running << std::endl;
+        }
+        ofs_running << std::endl;
+        ofs_running << std::endl;
+        ofs_running << " psi_k:" << std::endl;
+        for (int i = 0; i < nband; i++)
+        {
+            const int in = i * nlocal;
+            for (int j = 0; j < nlocal; j++)
+            {
+                double aa = psi_k_cpu.data<std::complex<double>>()[in + j].real();
+                double bb = psi_k_cpu.data<std::complex<double>>()[in + j].imag();
+                if (std::abs(aa) < 1e-8)
+                {
+                    aa = 0.0;
+                }
+                if (std::abs(bb) < 1e-8)
+                {
+                    bb = 0.0;
+                }
+                ofs_running << aa << "+" << bb << "i ";
+            }
+            ofs_running << std::endl;
+        }
+        ofs_running << std::endl;
+        ofs_running << " psi_k before normalization:" << std::endl;
+        for (int i = 0; i < nband; i++)
+        {
+            const int in = i * nlocal;
+            for (int j = 0; j < nlocal; j++)
+            {
+                double aa = tmp1_cpu.data<std::complex<double>>()[in + j].real();
+                double bb = tmp1_cpu.data<std::complex<double>>()[in + j].imag();
+                if (std::abs(aa) < 1e-8)
+                {
+                    aa = 0.0;
+                }
+                if (std::abs(bb) < 1e-8)
+                {
+                    bb = 0.0;
+                }
+                ofs_running << aa << "+" << bb << "i ";
+            }
+            ofs_running << std::endl;
+        }
+        ofs_running << std::endl;
+        ofs_running << std::endl;
+    }
 }
-#endif
+
+// Explicit instantiation of template functions
+template void norm_psi_tensor_lapack<base_device::DEVICE_CPU>(const Parallel_Orbitals* pv,
+                                                              const int nband,
+                                                              const int nlocal,
+                                                              const ct::Tensor& Stmp,
+                                                              ct::Tensor& psi_k,
+                                                              std::ofstream& ofs_running,
+                                                              const int print_matrix);
+#if ((defined __CUDA) /* || (defined __ROCM) */)
+template void norm_psi_tensor_lapack<base_device::DEVICE_GPU>(const Parallel_Orbitals* pv,
+                                                              const int nband,
+                                                              const int nlocal,
+                                                              const ct::Tensor& Stmp,
+                                                              ct::Tensor& psi_k,
+                                                              std::ofstream& ofs_running,
+                                                              const int print_matrix);
+#endif // __CUDA
+#endif // __MPI
 } // namespace module_tddft
diff --git a/source/module_hamilt_lcao/module_tddft/norm_psi.h b/source/module_hamilt_lcao/module_tddft/norm_psi.h
index d3c41978bb..6d3e455a3a 100644
--- a/source/module_hamilt_lcao/module_tddft/norm_psi.h
+++ b/source/module_hamilt_lcao/module_tddft/norm_psi.h
@@ -6,7 +6,9 @@
 #ifndef NORM_PSI_H
 #define NORM_PSI_H
 
+#include "module_base/module_container/ATen/core/tensor.h" // ct::Tensor
 #include "module_basis/module_ao/parallel_orbitals.h"
+
 #include <complex>
 
 namespace module_tddft
@@ -28,9 +30,27 @@ void norm_psi(const Parallel_Orbitals* pv,
               const int nlocal,
               const std::complex<double>* Stmp,
               std::complex<double>* psi_k,
+              std::ofstream& ofs_running,
               const int print_matrix);
 
-#endif
+void norm_psi_tensor(const Parallel_Orbitals* pv,
+                     const int nband,
+                     const int nlocal,
+                     const ct::Tensor& Stmp,
+                     ct::Tensor& psi_k,
+                     std::ofstream& ofs_running,
+                     const int print_matrix);
+
+template <typename Device>
+void norm_psi_tensor_lapack(const Parallel_Orbitals* pv,
+                            const int nband,
+                            const int nlocal,
+                            const ct::Tensor& Stmp,
+                            ct::Tensor& psi_k,
+                            std::ofstream& ofs_running,
+                            const int print_matrix);
+
+#endif // __MPI
 } // namespace module_tddft
 
 #endif
diff --git a/source/module_hamilt_lcao/module_tddft/propagator.cpp b/source/module_hamilt_lcao/module_tddft/propagator.cpp
index 38299d5f08..154671176e 100644
--- a/source/module_hamilt_lcao/module_tddft/propagator.cpp
+++ b/source/module_hamilt_lcao/module_tddft/propagator.cpp
@@ -1,6 +1,10 @@
 #include "propagator.h"
 
 #include "module_base/lapack_connector.h"
+#include "module_base/module_container/ATen/kernels/blas.h"
+#include "module_base/module_container/ATen/kernels/lapack.h"
+#include "module_base/module_container/ATen/kernels/memory.h" // memory operations (Tensor)
+#include "module_base/module_device/memory_op.h"              // memory operations
 #include "module_base/scalapack_connector.h"
 #include "module_parameter/parameter.h"
 
@@ -13,603 +17,91 @@ Propagator::~Propagator()
 {
 }
 #ifdef __MPI
-
-inline int globalIndex(int localindex, int nblk, int nprocs, int myproc)
-{
-    int iblock, gIndex;
-    iblock = localindex / nblk;
-    gIndex = (iblock * nprocs + myproc) * nblk + localindex % nblk;
-    return gIndex;
-}
-
 void Propagator::compute_propagator(const int nlocal,
                                     const std::complex<double>* Stmp,
                                     const std::complex<double>* Htmp,
                                     const std::complex<double>* H_laststep,
                                     std::complex<double>* U_operator,
+                                    std::ofstream& ofs_running,
                                     const int print_matrix) const
 {
     int tag;
     switch (ptype)
     {
     case 0:
-        compute_propagator_cn2(nlocal, Stmp, Htmp, U_operator, print_matrix);
+        compute_propagator_cn2(nlocal, Stmp, Htmp, U_operator, ofs_running, print_matrix);
         break;
 
     case 1:
         tag = 1;
-        compute_propagator_taylor(nlocal, Stmp, Htmp, U_operator, print_matrix, tag);
+        compute_propagator_taylor(nlocal, Stmp, Htmp, U_operator, ofs_running, print_matrix, tag);
         break;
 
     case 2:
-        compute_propagator_etrs(nlocal, Stmp, Htmp, H_laststep, U_operator, print_matrix);
-
+        compute_propagator_etrs(nlocal, Stmp, Htmp, H_laststep, U_operator, ofs_running, print_matrix);
         break;
 
     default:
-        std::cout << "method of propagator is wrong" << std::endl;
+        ModuleBase::WARNING_QUIT("Propagator::compute_propagator", "Method of RT-TDDFT propagator is wrong!");
         break;
     }
 }
 
-void Propagator::compute_propagator_cn2(const int nlocal,
-                                        const std::complex<double>* Stmp,
-                                        const std::complex<double>* Htmp,
-                                        std::complex<double>* U_operator,
-                                        const int print_matrix) const
-{
-    // (1) copy Htmp to Numerator & Denominator
-    std::complex<double>* Numerator = new std::complex<double>[this->ParaV->nloc];
-    ModuleBase::GlobalFunc::ZEROS(Numerator, this->ParaV->nloc);
-    BlasConnector::copy(this->ParaV->nloc, Htmp, 1, Numerator, 1);
-
-    std::complex<double>* Denominator = new std::complex<double>[this->ParaV->nloc];
-    ModuleBase::GlobalFunc::ZEROS(Denominator, this->ParaV->nloc);
-    BlasConnector::copy(this->ParaV->nloc, Htmp, 1, Denominator, 1);
-
-    if (print_matrix)
-    {
-        GlobalV::ofs_running << std::endl;
-        GlobalV::ofs_running << " S matrix :" << std::endl;
-        for (int i = 0; i < this->ParaV->nrow; i++)
-        {
-            for (int j = 0; j < this->ParaV->ncol; j++)
-            {
-                GlobalV::ofs_running << Stmp[i * this->ParaV->ncol + j].real() << "+"
-                                     << Stmp[i * this->ParaV->ncol + j].imag() << "i ";
-            }
-            GlobalV::ofs_running << std::endl;
-        }
-        GlobalV::ofs_running << std::endl;
-        GlobalV::ofs_running << std::endl;
-        GlobalV::ofs_running << " H matrix :" << std::endl;
-        for (int i = 0; i < this->ParaV->nrow; i++)
-        {
-            for (int j = 0; j < this->ParaV->ncol; j++)
-            {
-                GlobalV::ofs_running << Numerator[i * this->ParaV->ncol + j].real() << "+"
-                                     << Numerator[i * this->ParaV->ncol + j].imag() << "i ";
-            }
-            GlobalV::ofs_running << std::endl;
-        }
-        GlobalV::ofs_running << std::endl;
-    }
-
-    // ->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
-    // (2) compute Numerator & Denominator by GEADD
-    // Numerator = Stmp - i*para * Htmp;     beta1 = - para = -0.25 * this->dt
-    // Denominator = Stmp + i*para * Htmp;   beta2 = para = 0.25 * this->dt
-    std::complex<double> alpha = {1.0, 0.0};
-    std::complex<double> beta1 = {0.0, -0.25 * this->dt};
-    std::complex<double> beta2 = {0.0, 0.25 * this->dt};
-
-    ScalapackConnector::geadd('N',
-                              nlocal,
-                              nlocal,
-                              alpha,
-                              Stmp,
-                              1,
-                              1,
-                              this->ParaV->desc,
-                              beta1,
-                              Numerator,
-                              1,
-                              1,
-                              this->ParaV->desc);
-    ScalapackConnector::geadd('N',
-                              nlocal,
-                              nlocal,
-                              alpha,
-                              Stmp,
-                              1,
-                              1,
-                              this->ParaV->desc,
-                              beta2,
-                              Denominator,
-                              1,
-                              1,
-                              this->ParaV->desc);
-
-    if (print_matrix)
-    {
-        GlobalV::ofs_running << " beta=" << beta1 << std::endl;
-        GlobalV::ofs_running << " fenmu:" << std::endl;
-        for (int i = 0; i < this->ParaV->nrow; i++)
-        {
-            for (int j = 0; j < this->ParaV->ncol; j++)
-            {
-                GlobalV::ofs_running << Denominator[i * this->ParaV->ncol + j].real() << "+"
-                                     << Denominator[i * this->ParaV->ncol + j].imag() << "i ";
-            }
-            GlobalV::ofs_running << std::endl;
-        }
-        GlobalV::ofs_running << std::endl;
-    }
-
-    //->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
-    // (3) Next, invert Denominator
-    int* ipiv = new int[this->ParaV->nloc];
-    int info = 0;
-    // (3.1) compute ipiv
-    ScalapackConnector::getrf(nlocal, nlocal, Denominator, 1, 1, this->ParaV->desc, ipiv, &info);
-    int lwork = -1;
-    int liwotk = -1;
-    std::vector<std::complex<double>> work(1, 0);
-    std::vector<int> iwork(1, 0);
-    // (3.2) compute work
-    ScalapackConnector::getri(nlocal,
-                              Denominator,
-                              1,
-                              1,
-                              this->ParaV->desc,
-                              ipiv,
-                              work.data(),
-                              &lwork,
-                              iwork.data(),
-                              &liwotk,
-                              &info);
-    lwork = work[0].real();
-    work.resize(lwork, 0);
-    liwotk = iwork[0];
-    iwork.resize(liwotk, 0);
-    // (3.3) compute inverse matrix of Denominator
-    ScalapackConnector::getri(nlocal,
-                              Denominator,
-                              1,
-                              1,
-                              this->ParaV->desc,
-                              ipiv,
-                              work.data(),
-                              &lwork,
-                              iwork.data(),
-                              &liwotk,
-                              &info);
-    assert(0 == info);
-
-    //->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
-
-    // (4) U_operator = Denominator * Numerator;
-    ScalapackConnector::gemm('N',
-                             'N',
-                             nlocal,
-                             nlocal,
-                             nlocal,
-                             1.0,
-                             Denominator,
-                             1,
-                             1,
-                             this->ParaV->desc,
-                             Numerator,
-                             1,
-                             1,
-                             this->ParaV->desc,
-                             0.0,
-                             U_operator,
-                             1,
-                             1,
-                             this->ParaV->desc);
-
-    if (print_matrix)
-    {
-        GlobalV::ofs_running << " fenmu^-1:" << std::endl;
-        for (int i = 0; i < this->ParaV->nrow; i++)
-        {
-            for (int j = 0; j < this->ParaV->ncol; j++)
-            {
-                GlobalV::ofs_running << Denominator[i * this->ParaV->ncol + j].real() << "+"
-                                     << Denominator[i * this->ParaV->ncol + j].imag() << "i ";
-            }
-            GlobalV::ofs_running << std::endl;
-        }
-        GlobalV::ofs_running << std::endl;
-        GlobalV::ofs_running << " fenzi:" << std::endl;
-        for (int i = 0; i < this->ParaV->nrow; i++)
-        {
-            for (int j = 0; j < this->ParaV->ncol; j++)
-            {
-                GlobalV::ofs_running << Numerator[i * this->ParaV->ncol + j].real() << "+"
-                                     << Numerator[i * this->ParaV->ncol + j].imag() << "i ";
-            }
-            GlobalV::ofs_running << std::endl;
-        }
-        GlobalV::ofs_running << std::endl;
-        GlobalV::ofs_running << " U operator:" << std::endl;
-        for (int i = 0; i < this->ParaV->nrow; i++)
-        {
-            for (int j = 0; j < this->ParaV->ncol; j++)
-            {
-                double aa, bb;
-                aa = U_operator[i * this->ParaV->ncol + j].real();
-                bb = U_operator[i * this->ParaV->ncol + j].imag();
-                if (std::abs(aa) < 1e-8) {
-                    aa = 0.0;
-}
-                if (std::abs(bb) < 1e-8) {
-                    bb = 0.0;
-}
-                GlobalV::ofs_running << aa << "+" << bb << "i ";
-            }
-            GlobalV::ofs_running << std::endl;
-        }
-    }
-
-    delete[] Numerator;
-    delete[] Denominator;
-    delete[] ipiv;
-}
-
-void Propagator::compute_propagator_taylor(const int nlocal,
-                                           const std::complex<double>* Stmp,
-                                           const std::complex<double>* Htmp,
-                                           std::complex<double>* U_operator,
+template <typename Device>
+void Propagator::compute_propagator_tensor(const int nlocal,
+                                           const ct::Tensor& Stmp,
+                                           const ct::Tensor& Htmp,
+                                           const ct::Tensor& H_laststep,
+                                           ct::Tensor& U_operator,
+                                           std::ofstream& ofs_running,
                                            const int print_matrix,
-                                           const int tag) const
+                                           const bool use_lapack) const
 {
-    ModuleBase::GlobalFunc::ZEROS(U_operator, this->ParaV->nloc);
-    std::complex<double>* A_matrix = new std::complex<double>[this->ParaV->nloc];
-    ModuleBase::GlobalFunc::ZEROS(A_matrix, this->ParaV->nloc);
-    std::complex<double>* rank0 = new std::complex<double>[this->ParaV->nloc];
-    ModuleBase::GlobalFunc::ZEROS(rank0, this->ParaV->nloc);
-    std::complex<double>* rank2 = new std::complex<double>[this->ParaV->nloc];
-    ModuleBase::GlobalFunc::ZEROS(rank2, this->ParaV->nloc);
-    std::complex<double>* rank3 = new std::complex<double>[this->ParaV->nloc];
-    ModuleBase::GlobalFunc::ZEROS(rank3, this->ParaV->nloc);
-    std::complex<double>* rank4 = new std::complex<double>[this->ParaV->nloc];
-    ModuleBase::GlobalFunc::ZEROS(rank4, this->ParaV->nloc);
-    std::complex<double>* tmp1 = new std::complex<double>[this->ParaV->nloc];
-    ModuleBase::GlobalFunc::ZEROS(tmp1, this->ParaV->nloc);
-    std::complex<double>* tmp2 = new std::complex<double>[this->ParaV->nloc];
-    ModuleBase::GlobalFunc::ZEROS(tmp2, this->ParaV->nloc);
-    std::complex<double>* Sinv = new std::complex<double>[this->ParaV->nloc];
-    ModuleBase::GlobalFunc::ZEROS(Sinv, this->ParaV->nloc);
-    BlasConnector::copy(this->ParaV->nloc, Stmp, 1, Sinv, 1);
-
-    if (print_matrix)
+    int tag;
+    switch (ptype)
     {
-        GlobalV::ofs_running << std::endl;
-        GlobalV::ofs_running << " S matrix :" << std::endl;
-        for (int i = 0; i < this->ParaV->nrow; i++)
+    case 0:
+        if (!use_lapack)
         {
-            for (int j = 0; j < this->ParaV->ncol; j++)
-            {
-                GlobalV::ofs_running << Stmp[i * this->ParaV->ncol + j].real() << "+"
-                                     << Stmp[i * this->ParaV->ncol + j].imag() << "i ";
-            }
-            GlobalV::ofs_running << std::endl;
+            compute_propagator_cn2_tensor(nlocal, Stmp, Htmp, U_operator, ofs_running, print_matrix);
         }
-        GlobalV::ofs_running << std::endl;
-        GlobalV::ofs_running << std::endl;
-        GlobalV::ofs_running << " H matrix :" << std::endl;
-        for (int i = 0; i < this->ParaV->nrow; i++)
+        else
         {
-            for (int j = 0; j < this->ParaV->ncol; j++)
+            int myid = 0;
+            int root_proc = 0;
+            MPI_Comm_rank(MPI_COMM_WORLD, &myid);
+            if (myid == root_proc)
             {
-                GlobalV::ofs_running << Htmp[i * this->ParaV->ncol + j].real() << "+"
-                                     << Htmp[i * this->ParaV->ncol + j].imag() << "i ";
+                compute_propagator_cn2_tensor_lapack<Device>(nlocal, Stmp, Htmp, U_operator, ofs_running, print_matrix);
             }
-            GlobalV::ofs_running << std::endl;
         }
-        GlobalV::ofs_running << std::endl;
-    }
-
-    // set rank0
-    int info;
-    int naroc[2]; // maximum number of row or column
-
-    for (int iprow = 0; iprow < this->ParaV->dim0; ++iprow)
-    {
-        for (int ipcol = 0; ipcol < this->ParaV->dim1; ++ipcol)
-        {
-            if (iprow == ParaV->coord[0] && ipcol == ParaV->coord[1])
-            {
-                naroc[0] = this->ParaV->nrow;
-                naroc[1] = this->ParaV->ncol;
-                for (int j = 0; j < naroc[1]; ++j)
-                {
-                    int igcol = globalIndex(j, this->ParaV->nb, this->ParaV->dim1, ipcol);
-                    if (igcol >= nlocal) {
-                        continue;
-}
-                    for (int i = 0; i < naroc[0]; ++i)
-                    {
-                        int igrow = globalIndex(i, this->ParaV->nb, this->ParaV->dim0, iprow);
-                        if (igrow >= nlocal) {
-                            continue;
-}
-                        if (igcol == igrow)
-                        {
-                            rank0[j * naroc[0] + i] = {1.0, 0.0};
-                        }
-                        else
-                        {
-                            rank0[j * naroc[0] + i] = {0.0, 0.0};
-                        }
-                    }
-                }
-            }
-        } // loop ipcol
-    }     // loop iprow
-
-    std::complex<double> beta = {0.0, -0.5 * this->dt / tag}; // for ETRS tag=2 , for taylor tag=1
-
-    //->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
-    // invert Stmp
-    int* ipiv = new int[this->ParaV->nloc];
-    // (3.1) compute ipiv
-    ScalapackConnector::getrf(nlocal, nlocal, Sinv, 1, 1, this->ParaV->desc, ipiv, &info);
-    int lwork = -1;
-    int liwotk = -1;
-    std::vector<std::complex<double>> work(1, 0);
-    std::vector<int> iwork(1, 0);
-    // (3.2) compute work
-    ScalapackConnector::getri(nlocal,
-                              Sinv,
-                              1,
-                              1,
-                              this->ParaV->desc,
-                              ipiv,
-                              work.data(),
-                              &lwork,
-                              iwork.data(),
-                              &liwotk,
-                              &info);
-    lwork = work[0].real();
-    work.resize(lwork, 0);
-    liwotk = iwork[0];
-    iwork.resize(liwotk, 0);
-    ScalapackConnector::getri(nlocal,
-                              Sinv,
-                              1,
-                              1,
-                              this->ParaV->desc,
-                              ipiv,
-                              work.data(),
-                              &lwork,
-                              iwork.data(),
-                              &liwotk,
-                              &info);
-    assert(0 == info);
-
-    //  A_matrix = - idt S^-1 H ;
-    ScalapackConnector::gemm('N',
-                             'N',
-                             nlocal,
-                             nlocal,
-                             nlocal,
-                             beta,
-                             Sinv,
-                             1,
-                             1,
-                             this->ParaV->desc,
-                             Htmp,
-                             1,
-                             1,
-                             this->ParaV->desc,
-                             0.0,
-                             U_operator,
-                             1,
-                             1,
-                             this->ParaV->desc);
-
-    //  rank2 = A^2 ;
-    ScalapackConnector::gemm('N',
-                             'N',
-                             nlocal,
-                             nlocal,
-                             nlocal,
-                             1.0,
-                             U_operator,
-                             1,
-                             1,
-                             this->ParaV->desc,
-                             U_operator,
-                             1,
-                             1,
-                             this->ParaV->desc,
-                             0.0,
-                             rank2,
-                             1,
-                             1,
-                             this->ParaV->desc);
-
-    //  rank3 = A^3 ;
-    ScalapackConnector::gemm('N',
-                             'N',
-                             nlocal,
-                             nlocal,
-                             nlocal,
-                             1.0,
-                             U_operator,
-                             1,
-                             1,
-                             this->ParaV->desc,
-                             rank2,
-                             1,
-                             1,
-                             this->ParaV->desc,
-                             0.0,
-                             rank3,
-                             1,
-                             1,
-                             this->ParaV->desc);
-
-    //  rank4 = A^4 ;
-    ScalapackConnector::gemm('N',
-                             'N',
-                             nlocal,
-                             nlocal,
-                             nlocal,
-                             1.0,
-                             U_operator,
-                             1,
-                             1,
-                             this->ParaV->desc,
-                             rank3,
-                             1,
-                             1,
-                             this->ParaV->desc,
-                             0.0,
-                             rank4,
-                             1,
-                             1,
-                             this->ParaV->desc);
-
-    std::complex<double> p1 = {1.0, 0.0};
-    std::complex<double> p2 = {1.0 / 2.0, 0.0};
-    std::complex<double> p3 = {1.0 / 6.0, 0.0};
-    std::complex<double> p4 = {1.0 / 24.0, 0.0};
-
-    ScalapackConnector::geadd('N',
-                              nlocal,
-                              nlocal,
-                              p1,
-                              rank0,
-                              1,
-                              1,
-                              this->ParaV->desc,
-                              p1,
-                              U_operator,
-                              1,
-                              1,
-                              this->ParaV->desc);
-
-    ScalapackConnector::geadd('N',
-                              nlocal,
-                              nlocal,
-                              p2,
-                              rank2,
-                              1,
-                              1,
-                              this->ParaV->desc,
-                              p1,
-                              U_operator,
-                              1,
-                              1,
-                              this->ParaV->desc);
-
-    ScalapackConnector::geadd('N',
-                              nlocal,
-                              nlocal,
-                              p3,
-                              rank3,
-                              1,
-                              1,
-                              this->ParaV->desc,
-                              p1,
-                              U_operator,
-                              1,
-                              1,
-                              this->ParaV->desc);
-
-    ScalapackConnector::geadd('N',
-                              nlocal,
-                              nlocal,
-                              p4,
-                              rank4,
-                              1,
-                              1,
-                              this->ParaV->desc,
-                              p1,
-                              U_operator,
-                              1,
-                              1,
-                              this->ParaV->desc);
+        break;
 
-    if (print_matrix)
-    {
-        GlobalV::ofs_running << " A_matrix:" << std::endl;
-        for (int i = 0; i < this->ParaV->nrow; i++)
-        {
-            for (int j = 0; j < this->ParaV->ncol; j++)
-            {
-                GlobalV::ofs_running << A_matrix[i * this->ParaV->ncol + j].real() << "+"
-                                     << A_matrix[i * this->ParaV->ncol + j].imag() << "i ";
-            }
-            GlobalV::ofs_running << std::endl;
-        }
-        GlobalV::ofs_running << std::endl;
-        GlobalV::ofs_running << " U operator:" << std::endl;
-        for (int i = 0; i < this->ParaV->nrow; i++)
-        {
-            for (int j = 0; j < this->ParaV->ncol; j++)
-            {
-                double aa, bb;
-                aa = U_operator[i * this->ParaV->ncol + j].real();
-                bb = U_operator[i * this->ParaV->ncol + j].imag();
-                if (std::abs(aa) < 1e-8) {
-                    aa = 0.0;
-}
-                if (std::abs(bb) < 1e-8) {
-                    bb = 0.0;
-}
-                GlobalV::ofs_running << aa << "+" << bb << "i ";
-            }
-            GlobalV::ofs_running << std::endl;
-        }
+    default:
+        ModuleBase::WARNING_QUIT("Propagator::compute_propagator_tensor",
+                                 "The Tensor-based RT-TDDFT propagator currently supports Crank–Nicolson method only!");
+        break;
     }
-    delete[] A_matrix;
-    delete[] rank0;
-    delete[] rank2;
-    delete[] rank3;
-    delete[] rank4;
-    delete[] tmp1;
-    delete[] tmp2;
-    delete[] Sinv;
-    delete[] ipiv;
-}
-
-void Propagator::compute_propagator_etrs(const int nlocal,
-                                         const std::complex<double>* Stmp,
-                                         const std::complex<double>* Htmp,
-                                         const std::complex<double>* H_laststep,
-                                         std::complex<double>* U_operator,
-                                         const int print_matrix) const
-{
-    std::vector<std::complex<double>> U1(this->ParaV->nloc);
-    std::vector<std::complex<double>> U2(this->ParaV->nloc);
-    int tag = 2;
-    compute_propagator_taylor(nlocal, Stmp, Htmp, U1.data(), print_matrix, tag);
-    compute_propagator_taylor(nlocal, Stmp, H_laststep, U2.data(), print_matrix, tag);
-    ScalapackConnector::gemm('N',
-                             'N',
-                             nlocal,
-                             nlocal,
-                             nlocal,
-                             1.0,
-                             U1.data(),
-                             1,
-                             1,
-                             this->ParaV->desc,
-                             U2.data(),
-                             1,
-                             1,
-                             this->ParaV->desc,
-                             0.0,
-                             U_operator,
-                             1,
-                             1,
-                             this->ParaV->desc);
 }
 
-#endif
+// Explicit instantiation of template functions
+template void Propagator::compute_propagator_tensor<base_device::DEVICE_CPU>(const int nlocal,
+                                                                             const ct::Tensor& Stmp,
+                                                                             const ct::Tensor& Htmp,
+                                                                             const ct::Tensor& H_laststep,
+                                                                             ct::Tensor& U_operator,
+                                                                             std::ofstream& ofs_running,
+                                                                             const int print_matrix,
+                                                                             const bool use_lapack) const;
+#if ((defined __CUDA) /* || (defined __ROCM) */)
+template void Propagator::compute_propagator_tensor<base_device::DEVICE_GPU>(const int nlocal,
+                                                                             const ct::Tensor& Stmp,
+                                                                             const ct::Tensor& Htmp,
+                                                                             const ct::Tensor& H_laststep,
+                                                                             ct::Tensor& U_operator,
+                                                                             std::ofstream& ofs_running,
+                                                                             const int print_matrix,
+                                                                             const bool use_lapack) const;
+#endif // __CUDA
+#endif // __MPI
 } // namespace module_tddft
diff --git a/source/module_hamilt_lcao/module_tddft/propagator.h b/source/module_hamilt_lcao/module_tddft/propagator.h
index c827fc96b2..ca4bec2e66 100644
--- a/source/module_hamilt_lcao/module_tddft/propagator.h
+++ b/source/module_hamilt_lcao/module_tddft/propagator.h
@@ -7,12 +7,98 @@
 #define PROPAGATOR_H
 
 #include "module_base/constants.h"
+#include "module_base/module_container/ATen/core/tensor.h" // ct::Tensor
 #include "module_basis/module_ao/parallel_orbitals.h"
 
 #include <complex>
 
 namespace module_tddft
 {
+//--------------------------------- Utility function ---------------------------------//
+#ifdef __MPI
+inline int globalIndex(int localindex, int nblk, int nprocs, int myproc)
+{
+    int iblock, gIndex;
+    iblock = localindex / nblk;
+    gIndex = (iblock * nprocs + myproc) * nblk + localindex % nblk;
+    return gIndex;
+}
+#endif // __MPI
+
+// Auxiliary function: process non-complex types, return value 1.0
+template <typename T>
+inline T init_value(typename std::enable_if<!std::is_same<T, std::complex<float>>::value
+                                            && !std::is_same<T, std::complex<double>>::value>::type* = nullptr)
+{
+    return T(1.0);
+}
+
+// Auxiliary function: process complex types, return value 1.0 + 0.0i
+template <typename T>
+inline T init_value(typename std::enable_if<std::is_same<T, std::complex<float>>::value
+                                            || std::is_same<T, std::complex<double>>::value>::type* = nullptr)
+{
+    return T(1.0, 0.0);
+}
+
+// Create an identity matrix of size n×n
+template <typename T>
+ct::Tensor create_identity_matrix(const int n, ct::DeviceType device = ct::DeviceType::CpuDevice)
+{
+    // Choose the data type of the Tensor
+    ct::DataType data_type;
+    if (std::is_same<T, float>::value)
+    {
+        data_type = ct::DataType::DT_FLOAT;
+    }
+    else if (std::is_same<T, double>::value)
+    {
+        data_type = ct::DataType::DT_DOUBLE;
+    }
+    else if (std::is_same<T, std::complex<float>>::value)
+    {
+        data_type = ct::DataType::DT_COMPLEX;
+    }
+    else if (std::is_same<T, std::complex<double>>::value)
+    {
+        data_type = ct::DataType::DT_COMPLEX_DOUBLE;
+    }
+    else
+    {
+        static_assert(std::is_same<T, float>::value || std::is_same<T, double>::value
+                          || std::is_same<T, std::complex<float>>::value
+                          || std::is_same<T, std::complex<double>>::value,
+                      "Unsupported data type!");
+    }
+
+    ct::Tensor tensor(data_type, device, ct::TensorShape({n, n}));
+    tensor.zero();
+
+    // Set the diagonal elements to 1
+    if (device == ct::DeviceType::CpuDevice)
+    {
+        // For CPU, we can directly access the data
+        T* data_ptr = tensor.data<T>();
+        for (int i = 0; i < n; ++i)
+        {
+            data_ptr[i * n + i] = init_value<T>();
+        }
+    }
+    else if (device == ct::DeviceType::GpuDevice)
+    {
+        // For GPU, we need to use a kernel to set the diagonal elements
+        T* data_ptr = tensor.data<T>();
+        for (int i = 0; i < n; ++i)
+        {
+            T value = init_value<T>();
+            ct::kernels::set_memory<T, ct::DEVICE_GPU>()(data_ptr + i * n + i, value, 1);
+        }
+    }
+
+    return tensor;
+}
+//--------------------------------- Utility function ---------------------------------//
+
 class Propagator
 {
   public:
@@ -40,8 +126,19 @@ class Propagator
                             const std::complex<double>* Htmp,
                             const std::complex<double>* H_laststep,
                             std::complex<double>* U_operator,
+                            std::ofstream& ofs_running,
                             const int print_matrix) const;
-#endif
+
+    template <typename Device>
+    void compute_propagator_tensor(const int nlocal,
+                                   const ct::Tensor& Stmp,
+                                   const ct::Tensor& Htmp,
+                                   const ct::Tensor& H_laststep,
+                                   ct::Tensor& U_operator,
+                                   std::ofstream& ofs_running,
+                                   const int print_matrix,
+                                   const bool use_lapack) const;
+#endif // __MPI
 
   private:
     int ptype; // type of propagator
@@ -63,8 +160,24 @@ class Propagator
                                 const std::complex<double>* Stmp,
                                 const std::complex<double>* Htmp,
                                 std::complex<double>* U_operator,
+                                std::ofstream& ofs_running,
                                 const int print_matrix) const;
 
+    void compute_propagator_cn2_tensor(const int nlocal,
+                                       const ct::Tensor& Stmp,
+                                       const ct::Tensor& Htmp,
+                                       ct::Tensor& U_operator,
+                                       std::ofstream& ofs_running,
+                                       const int print_matrix) const;
+
+    template <typename Device>
+    void compute_propagator_cn2_tensor_lapack(const int nlocal,
+                                              const ct::Tensor& Stmp,
+                                              const ct::Tensor& Htmp,
+                                              ct::Tensor& U_operator,
+                                              std::ofstream& ofs_running,
+                                              const int print_matrix) const;
+
     /**
      *  @brief compute propagator of method 4th Taylor
      *
@@ -79,6 +192,7 @@ class Propagator
                                    const std::complex<double>* Stmp,
                                    const std::complex<double>* Htmp,
                                    std::complex<double>* U_operator,
+                                   std::ofstream& ofs_running,
                                    const int print_matrix,
                                    const int tag) const;
 
@@ -97,8 +211,9 @@ class Propagator
                                  const std::complex<double>* Htmp,
                                  const std::complex<double>* H_laststep,
                                  std::complex<double>* U_operator,
+                                 std::ofstream& ofs_running,
                                  const int print_matrix) const;
-#endif
+#endif // __MPI
 };
 } // namespace module_tddft
 
diff --git a/source/module_hamilt_lcao/module_tddft/propagator_cn2.cpp b/source/module_hamilt_lcao/module_tddft/propagator_cn2.cpp
new file mode 100644
index 0000000000..d87cf3c593
--- /dev/null
+++ b/source/module_hamilt_lcao/module_tddft/propagator_cn2.cpp
@@ -0,0 +1,734 @@
+#include "module_base/lapack_connector.h"
+#include "module_base/module_container/ATen/kernels/blas.h"
+#include "module_base/module_container/ATen/kernels/lapack.h"
+#include "module_base/module_container/ATen/kernels/memory.h" // memory operations (Tensor)
+#include "module_base/module_device/memory_op.h"              // memory operations
+#include "module_base/scalapack_connector.h"
+#include "module_parameter/parameter.h"
+#include "propagator.h"
+
+#include <complex>
+#include <iostream>
+
+namespace module_tddft
+{
+#ifdef __MPI
+void Propagator::compute_propagator_cn2(const int nlocal,
+                                        const std::complex<double>* Stmp,
+                                        const std::complex<double>* Htmp,
+                                        std::complex<double>* U_operator,
+                                        std::ofstream& ofs_running,
+                                        const int print_matrix) const
+{
+    assert(this->ParaV->nloc > 0);
+
+    // (1) copy Htmp to Numerator & Denominator
+    std::complex<double>* Numerator = new std::complex<double>[this->ParaV->nloc];
+    ModuleBase::GlobalFunc::ZEROS(Numerator, this->ParaV->nloc);
+    BlasConnector::copy(this->ParaV->nloc, Htmp, 1, Numerator, 1);
+
+    std::complex<double>* Denominator = new std::complex<double>[this->ParaV->nloc];
+    ModuleBase::GlobalFunc::ZEROS(Denominator, this->ParaV->nloc);
+    BlasConnector::copy(this->ParaV->nloc, Htmp, 1, Denominator, 1);
+
+    if (print_matrix)
+    {
+        ofs_running << std::endl;
+        ofs_running << " S matrix :" << std::endl;
+        for (int i = 0; i < this->ParaV->nrow; i++)
+        {
+            const int in = i * this->ParaV->ncol;
+            for (int j = 0; j < this->ParaV->ncol; j++)
+            {
+                ofs_running << Stmp[in + j].real() << "+" << Stmp[in + j].imag() << "i ";
+            }
+            ofs_running << std::endl;
+        }
+        ofs_running << std::endl;
+        ofs_running << std::endl;
+        ofs_running << " H matrix :" << std::endl;
+        for (int i = 0; i < this->ParaV->nrow; i++)
+        {
+            const int in = i * this->ParaV->ncol;
+            for (int j = 0; j < this->ParaV->ncol; j++)
+            {
+                ofs_running << Numerator[in + j].real() << "+" << Numerator[in + j].imag() << "i ";
+            }
+            ofs_running << std::endl;
+        }
+        ofs_running << std::endl;
+    }
+
+    // ->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+    // (2) compute Numerator & Denominator by GEADD
+    // Numerator = Stmp - i*para * Htmp;     beta1 = - para = -0.25 * this->dt
+    // Denominator = Stmp + i*para * Htmp;   beta2 = para = 0.25 * this->dt
+    std::complex<double> alpha = {1.0, 0.0};
+    std::complex<double> beta1 = {0.0, -0.25 * this->dt};
+    std::complex<double> beta2 = {0.0, 0.25 * this->dt};
+
+    ScalapackConnector::geadd('N',
+                              nlocal,
+                              nlocal,
+                              alpha,
+                              Stmp,
+                              1,
+                              1,
+                              this->ParaV->desc,
+                              beta1,
+                              Numerator,
+                              1,
+                              1,
+                              this->ParaV->desc);
+    ScalapackConnector::geadd('N',
+                              nlocal,
+                              nlocal,
+                              alpha,
+                              Stmp,
+                              1,
+                              1,
+                              this->ParaV->desc,
+                              beta2,
+                              Denominator,
+                              1,
+                              1,
+                              this->ParaV->desc);
+
+    if (print_matrix)
+    {
+        ofs_running << " beta=" << beta1 << std::endl;
+        ofs_running << " fenmu:" << std::endl;
+        for (int i = 0; i < this->ParaV->nrow; i++)
+        {
+            const int in = i * this->ParaV->ncol;
+            for (int j = 0; j < this->ParaV->ncol; j++)
+            {
+                ofs_running << Denominator[in + j].real() << "+" << Denominator[in + j].imag() << "i ";
+            }
+            ofs_running << std::endl;
+        }
+        ofs_running << std::endl;
+    }
+
+    //->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+    // (3) Next, invert Denominator
+    // What is the size of ipiv exactly? Need to check ScaLAPACK documentation!
+    // But anyway, not this->ParaV->nloc
+    int* ipiv = new int[this->ParaV->nrow + this->ParaV->nb];
+    ModuleBase::GlobalFunc::ZEROS(ipiv, this->ParaV->nrow + this->ParaV->nb);
+    int info = 0;
+    // (3.1) compute ipiv
+    ScalapackConnector::getrf(nlocal, nlocal, Denominator, 1, 1, this->ParaV->desc, ipiv, &info);
+
+    // Print ipiv
+    if (print_matrix)
+    {
+        ofs_running << " this->ParaV->nloc = " << this->ParaV->nloc << std::endl;
+        ofs_running << " this->ParaV->nrow = " << this->ParaV->nrow << std::endl;
+        ofs_running << " this->ParaV->ncol = " << this->ParaV->ncol << std::endl;
+        ofs_running << " this->ParaV->nb = " << this->ParaV->nb << std::endl;
+        ofs_running << " this->ParaV->get_block_size() = " << this->ParaV->get_block_size() << std::endl;
+        ofs_running << " nlocal = " << nlocal << std::endl;
+        ofs_running << " ipiv:" << std::endl;
+        for (int i = 0; i < this->ParaV->nloc; i++)
+        {
+            ofs_running << ipiv[i] << " ";
+        }
+        ofs_running << std::endl;
+    }
+
+    int lwork = -1;
+    int liwotk = -1;
+    std::vector<std::complex<double>> work(1, 0);
+    std::vector<int> iwork(1, 0);
+    // (3.2) compute work
+    ScalapackConnector::getri(nlocal,
+                              Denominator,
+                              1,
+                              1,
+                              this->ParaV->desc,
+                              ipiv,
+                              work.data(),
+                              &lwork,
+                              iwork.data(),
+                              &liwotk,
+                              &info);
+    lwork = work[0].real();
+    work.resize(lwork, 0);
+    liwotk = iwork[0];
+    iwork.resize(liwotk, 0);
+    // (3.3) compute inverse matrix of Denominator
+    ScalapackConnector::getri(nlocal,
+                              Denominator,
+                              1,
+                              1,
+                              this->ParaV->desc,
+                              ipiv,
+                              work.data(),
+                              &lwork,
+                              iwork.data(),
+                              &liwotk,
+                              &info);
+    assert(0 == info);
+
+    //->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+
+    // (4) U_operator = Denominator * Numerator;
+    ScalapackConnector::gemm('N',
+                             'N',
+                             nlocal,
+                             nlocal,
+                             nlocal,
+                             1.0,
+                             Denominator,
+                             1,
+                             1,
+                             this->ParaV->desc,
+                             Numerator,
+                             1,
+                             1,
+                             this->ParaV->desc,
+                             0.0,
+                             U_operator,
+                             1,
+                             1,
+                             this->ParaV->desc);
+
+    if (print_matrix)
+    {
+        ofs_running << " fenmu^-1:" << std::endl;
+        for (int i = 0; i < this->ParaV->nrow; i++)
+        {
+            const int in = i * this->ParaV->ncol;
+            for (int j = 0; j < this->ParaV->ncol; j++)
+            {
+                ofs_running << Denominator[in + j].real() << "+" << Denominator[in + j].imag() << "i ";
+            }
+            ofs_running << std::endl;
+        }
+        ofs_running << std::endl;
+        ofs_running << " fenzi:" << std::endl;
+        for (int i = 0; i < this->ParaV->nrow; i++)
+        {
+            const int in = i * this->ParaV->ncol;
+            for (int j = 0; j < this->ParaV->ncol; j++)
+            {
+                ofs_running << Numerator[in + j].real() << "+" << Numerator[in + j].imag() << "i ";
+            }
+            ofs_running << std::endl;
+        }
+        ofs_running << std::endl;
+        ofs_running << " U operator:" << std::endl;
+        for (int i = 0; i < this->ParaV->nrow; i++)
+        {
+            const int in = i * this->ParaV->ncol;
+            for (int j = 0; j < this->ParaV->ncol; j++)
+            {
+                double aa = U_operator[in + j].real();
+                double bb = U_operator[in + j].imag();
+                if (std::abs(aa) < 1e-8)
+                {
+                    aa = 0.0;
+                }
+                if (std::abs(bb) < 1e-8)
+                {
+                    bb = 0.0;
+                }
+                ofs_running << aa << "+" << bb << "i ";
+            }
+            ofs_running << std::endl;
+        }
+    }
+
+    delete[] Numerator;
+    delete[] Denominator;
+    delete[] ipiv;
+}
+
+void Propagator::compute_propagator_cn2_tensor(const int nlocal,
+                                               const ct::Tensor& Stmp,
+                                               const ct::Tensor& Htmp,
+                                               ct::Tensor& U_operator,
+                                               std::ofstream& ofs_running,
+                                               const int print_matrix) const
+{
+    // (1) copy Htmp to Numerator & Denominator
+    ct::Tensor Numerator(ct::DataType::DT_COMPLEX_DOUBLE,
+                         ct::DeviceType::CpuDevice,
+                         ct::TensorShape({this->ParaV->nloc}));
+    Numerator.zero();
+    BlasConnector::copy(this->ParaV->nloc,
+                        Htmp.data<std::complex<double>>(),
+                        1,
+                        Numerator.data<std::complex<double>>(),
+                        1);
+
+    ct::Tensor Denominator(ct::DataType::DT_COMPLEX_DOUBLE,
+                           ct::DeviceType::CpuDevice,
+                           ct::TensorShape({this->ParaV->nloc}));
+    Denominator.zero();
+    BlasConnector::copy(this->ParaV->nloc,
+                        Htmp.data<std::complex<double>>(),
+                        1,
+                        Denominator.data<std::complex<double>>(),
+                        1);
+
+    if (print_matrix)
+    {
+        ofs_running << std::endl;
+        ofs_running << " S matrix :" << std::endl;
+        for (int i = 0; i < this->ParaV->nrow; i++)
+        {
+            const int in = i * this->ParaV->ncol;
+            for (int j = 0; j < this->ParaV->ncol; j++)
+            {
+                ofs_running << Stmp.data<std::complex<double>>()[in + j].real() << "+"
+                            << Stmp.data<std::complex<double>>()[in + j].imag() << "i ";
+            }
+            ofs_running << std::endl;
+        }
+        ofs_running << std::endl;
+        ofs_running << std::endl;
+        ofs_running << " H matrix :" << std::endl;
+        for (int i = 0; i < this->ParaV->nrow; i++)
+        {
+            const int in = i * this->ParaV->ncol;
+            for (int j = 0; j < this->ParaV->ncol; j++)
+            {
+                ofs_running << Numerator.data<std::complex<double>>()[in + j].real() << "+"
+                            << Numerator.data<std::complex<double>>()[in + j].imag() << "i ";
+            }
+            ofs_running << std::endl;
+        }
+        ofs_running << std::endl;
+    }
+
+    // ->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+    // (2) compute Numerator & Denominator by GEADD
+    // Numerator = Stmp - i*para * Htmp;     beta1 = - para = -0.25 * this->dt
+    // Denominator = Stmp + i*para * Htmp;   beta2 = para = 0.25 * this->dt
+    std::complex<double> alpha = {1.0, 0.0};
+    std::complex<double> beta1 = {0.0, -0.25 * this->dt};
+    std::complex<double> beta2 = {0.0, 0.25 * this->dt};
+
+    ScalapackConnector::geadd('N',
+                              nlocal,
+                              nlocal,
+                              alpha,
+                              Stmp.data<std::complex<double>>(),
+                              1,
+                              1,
+                              this->ParaV->desc,
+                              beta1,
+                              Numerator.data<std::complex<double>>(),
+                              1,
+                              1,
+                              this->ParaV->desc);
+    ScalapackConnector::geadd('N',
+                              nlocal,
+                              nlocal,
+                              alpha,
+                              Stmp.data<std::complex<double>>(),
+                              1,
+                              1,
+                              this->ParaV->desc,
+                              beta2,
+                              Denominator.data<std::complex<double>>(),
+                              1,
+                              1,
+                              this->ParaV->desc);
+
+    if (print_matrix)
+    {
+        ofs_running << " beta=" << beta1 << std::endl;
+        ofs_running << " fenmu:" << std::endl;
+        for (int i = 0; i < this->ParaV->nrow; i++)
+        {
+            const int in = i * this->ParaV->ncol;
+            for (int j = 0; j < this->ParaV->ncol; j++)
+            {
+                ofs_running << Denominator.data<std::complex<double>>()[in + j].real() << "+"
+                            << Denominator.data<std::complex<double>>()[in + j].imag() << "i ";
+            }
+            ofs_running << std::endl;
+        }
+        ofs_running << std::endl;
+    }
+
+    //->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+    // (3) Next, invert Denominator
+    ct::Tensor ipiv(ct::DataType::DT_INT,
+                    ct::DeviceType::CpuDevice,
+                    ct::TensorShape({this->ParaV->nrow + this->ParaV->nb}));
+    ipiv.zero();
+    int info = 0;
+    // (3.1) compute ipiv
+    ScalapackConnector::getrf(nlocal,
+                              nlocal,
+                              Denominator.data<std::complex<double>>(),
+                              1,
+                              1,
+                              this->ParaV->desc,
+                              ipiv.data<int>(),
+                              &info);
+
+    // Print ipiv
+    if (print_matrix)
+    {
+        ofs_running << " this->ParaV->nloc = " << this->ParaV->nloc << std::endl;
+        ofs_running << " this->ParaV->nrow = " << this->ParaV->nrow << std::endl;
+        ofs_running << " this->ParaV->ncol = " << this->ParaV->ncol << std::endl;
+        ofs_running << " this->ParaV->nb = " << this->ParaV->nb << std::endl;
+        ofs_running << " this->ParaV->get_block_size() = " << this->ParaV->get_block_size() << std::endl;
+        ofs_running << " nlocal = " << nlocal << std::endl;
+        ofs_running << " ipiv:" << std::endl;
+        for (int i = 0; i < this->ParaV->nloc; i++)
+        {
+            ofs_running << ipiv.data<int>()[i] << " ";
+        }
+        ofs_running << std::endl;
+    }
+
+    int lwork = -1;
+    int liwotk = -1;
+    ct::Tensor work(ct::DataType::DT_COMPLEX_DOUBLE, ct::DeviceType::CpuDevice, ct::TensorShape({1}));
+    ct::Tensor iwork(ct::DataType::DT_INT, ct::DeviceType::CpuDevice, ct::TensorShape({1}));
+    // (3.2) compute work
+    ScalapackConnector::getri(nlocal,
+                              Denominator.data<std::complex<double>>(),
+                              1,
+                              1,
+                              this->ParaV->desc,
+                              ipiv.data<int>(),
+                              work.data<std::complex<double>>(),
+                              &lwork,
+                              iwork.data<int>(),
+                              &liwotk,
+                              &info);
+    lwork = work.data<std::complex<double>>()[0].real();
+    work.resize(ct::TensorShape({lwork}));
+    liwotk = iwork.data<int>()[0];
+    iwork.resize(ct::TensorShape({liwotk}));
+    // (3.3) compute inverse matrix of Denominator
+    ScalapackConnector::getri(nlocal,
+                              Denominator.data<std::complex<double>>(),
+                              1,
+                              1,
+                              this->ParaV->desc,
+                              ipiv.data<int>(),
+                              work.data<std::complex<double>>(),
+                              &lwork,
+                              iwork.data<int>(),
+                              &liwotk,
+                              &info);
+    assert(0 == info);
+
+    //->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+
+    // (4) U_operator = Denominator * Numerator;
+    ScalapackConnector::gemm('N',
+                             'N',
+                             nlocal,
+                             nlocal,
+                             nlocal,
+                             1.0,
+                             Denominator.data<std::complex<double>>(),
+                             1,
+                             1,
+                             this->ParaV->desc,
+                             Numerator.data<std::complex<double>>(),
+                             1,
+                             1,
+                             this->ParaV->desc,
+                             0.0,
+                             U_operator.data<std::complex<double>>(),
+                             1,
+                             1,
+                             this->ParaV->desc);
+
+    if (print_matrix)
+    {
+        ofs_running << " fenmu^-1:" << std::endl;
+        for (int i = 0; i < this->ParaV->nrow; i++)
+        {
+            const int in = i * this->ParaV->ncol;
+            for (int j = 0; j < this->ParaV->ncol; j++)
+            {
+                ofs_running << Denominator.data<std::complex<double>>()[in + j].real() << "+"
+                            << Denominator.data<std::complex<double>>()[in + j].imag() << "i ";
+            }
+            ofs_running << std::endl;
+        }
+        ofs_running << std::endl;
+        ofs_running << " fenzi:" << std::endl;
+        for (int i = 0; i < this->ParaV->nrow; i++)
+        {
+            const int in = i * this->ParaV->ncol;
+            for (int j = 0; j < this->ParaV->ncol; j++)
+            {
+                ofs_running << Numerator.data<std::complex<double>>()[in + j].real() << "+"
+                            << Numerator.data<std::complex<double>>()[in + j].imag() << "i ";
+            }
+            ofs_running << std::endl;
+        }
+        ofs_running << std::endl;
+        ofs_running << " U operator:" << std::endl;
+        for (int i = 0; i < this->ParaV->nrow; i++)
+        {
+            const int in = i * this->ParaV->ncol;
+            for (int j = 0; j < this->ParaV->ncol; j++)
+            {
+                double aa = U_operator.data<std::complex<double>>()[in + j].real();
+                double bb = U_operator.data<std::complex<double>>()[in + j].imag();
+                if (std::abs(aa) < 1e-8)
+                {
+                    aa = 0.0;
+                }
+                if (std::abs(bb) < 1e-8)
+                {
+                    bb = 0.0;
+                }
+                ofs_running << aa << "+" << bb << "i ";
+            }
+            ofs_running << std::endl;
+        }
+    }
+}
+
+template <typename Device>
+void Propagator::compute_propagator_cn2_tensor_lapack(const int nlocal,
+                                                      const ct::Tensor& Stmp,
+                                                      const ct::Tensor& Htmp,
+                                                      ct::Tensor& U_operator,
+                                                      std::ofstream& ofs_running,
+                                                      const int print_matrix) const
+{
+    // ct_device_type = ct::DeviceType::CpuDevice or ct::DeviceType::GpuDevice
+    ct::DeviceType ct_device_type = ct::DeviceTypeToEnum<Device>::value;
+    // ct_Device = ct::DEVICE_CPU or ct::DEVICE_GPU
+    using ct_Device = typename ct::PsiToContainer<Device>::type;
+
+    // (1) copy Htmp to Numerator & Denominator
+    ct::Tensor Numerator(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({nlocal * nlocal}));
+    Numerator.zero();
+    base_device::memory::synchronize_memory_op<std::complex<double>, Device, Device>()(
+        Numerator.data<std::complex<double>>(),
+        Htmp.data<std::complex<double>>(),
+        nlocal * nlocal);
+
+    ct::Tensor Denominator(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({nlocal * nlocal}));
+    Denominator.zero();
+    base_device::memory::synchronize_memory_op<std::complex<double>, Device, Device>()(
+        Denominator.data<std::complex<double>>(),
+        Htmp.data<std::complex<double>>(),
+        nlocal * nlocal);
+
+    if (print_matrix)
+    {
+        ct::Tensor Stmp_cpu = Stmp.to_device<ct::DEVICE_CPU>();
+        ct::Tensor Numerator_cpu = Numerator.to_device<ct::DEVICE_CPU>();
+
+        ofs_running << std::endl;
+        ofs_running << " S matrix :" << std::endl;
+        for (int i = 0; i < nlocal; i++)
+        {
+            const int in = i * nlocal;
+            for (int j = 0; j < nlocal; j++)
+            {
+                ofs_running << Stmp_cpu.data<std::complex<double>>()[in + j].real() << "+"
+                            << Stmp_cpu.data<std::complex<double>>()[in + j].imag() << "i ";
+            }
+            ofs_running << std::endl;
+        }
+        ofs_running << std::endl;
+        ofs_running << std::endl;
+        ofs_running << " H matrix :" << std::endl;
+        for (int i = 0; i < nlocal; i++)
+        {
+            const int in = i * nlocal;
+            for (int j = 0; j < nlocal; j++)
+            {
+                ofs_running << Numerator_cpu.data<std::complex<double>>()[in + j].real() << "+"
+                            << Numerator_cpu.data<std::complex<double>>()[in + j].imag() << "i ";
+            }
+            ofs_running << std::endl;
+        }
+        ofs_running << std::endl;
+    }
+
+    // ->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+    // (2) compute Numerator & Denominator by GEADD
+    // Numerator = Stmp - i*para * Htmp;     beta1 = - para = -0.25 * this->dt
+    // Denominator = Stmp + i*para * Htmp;   beta2 = para = 0.25 * this->dt
+    std::complex<double> one = {1.0, 0.0};
+    std::complex<double> beta1 = {0.0, -0.25 * this->dt};
+    std::complex<double> beta2 = {0.0, 0.25 * this->dt};
+
+    // Numerator = -i*para * Htmp
+    ct::kernels::blas_scal<std::complex<double>, ct_Device>()(nlocal * nlocal,
+                                                              &beta1,
+                                                              Numerator.data<std::complex<double>>(),
+                                                              1);
+    // Numerator = Stmp + (-i*para * Htmp)
+    ct::kernels::blas_axpy<std::complex<double>, ct_Device>()(nlocal * nlocal,
+                                                              &one,
+                                                              Stmp.data<std::complex<double>>(),
+                                                              1,
+                                                              Numerator.data<std::complex<double>>(),
+                                                              1);
+    // Denominator = i*para * Htmp
+    ct::kernels::blas_scal<std::complex<double>, ct_Device>()(nlocal * nlocal,
+                                                              &beta2,
+                                                              Denominator.data<std::complex<double>>(),
+                                                              1);
+    // Denominator = Stmp + (i*para * Htmp)
+    ct::kernels::blas_axpy<std::complex<double>, ct_Device>()(nlocal * nlocal,
+                                                              &one,
+                                                              Stmp.data<std::complex<double>>(),
+                                                              1,
+                                                              Denominator.data<std::complex<double>>(),
+                                                              1);
+
+    if (print_matrix)
+    {
+        ct::Tensor Denominator_cpu = Denominator.to_device<ct::DEVICE_CPU>();
+
+        ofs_running << " beta=" << beta1 << std::endl;
+        ofs_running << " fenmu:" << std::endl;
+        for (int i = 0; i < nlocal; i++)
+        {
+            const int in = i * nlocal;
+            for (int j = 0; j < nlocal; j++)
+            {
+                ofs_running << Denominator_cpu.data<std::complex<double>>()[in + j].real() << "+"
+                            << Denominator_cpu.data<std::complex<double>>()[in + j].imag() << "i ";
+            }
+            ofs_running << std::endl;
+        }
+        ofs_running << std::endl;
+    }
+
+    //->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+    // (3) Next, invert Denominator
+    ct::Tensor ipiv(ct::DataType::DT_INT, ct_device_type, ct::TensorShape({nlocal}));
+    ipiv.zero();
+    // (3.1) compute ipiv
+    ct::kernels::lapack_getrf<std::complex<double>, ct_Device>()(nlocal,
+                                                                 nlocal,
+                                                                 Denominator.data<std::complex<double>>(),
+                                                                 nlocal,
+                                                                 ipiv.data<int>());
+
+    // Print ipiv
+    if (print_matrix)
+    {
+        ct::Tensor ipiv_cpu = ipiv.to_device<ct::DEVICE_CPU>();
+
+        ofs_running << " ipiv:" << std::endl;
+        for (int i = 0; i < nlocal; i++)
+        {
+            ofs_running << ipiv_cpu.data<int>()[i] << " ";
+        }
+        ofs_running << std::endl;
+    }
+
+    // (3.2) compute inverse matrix of Denominator
+    ct::Tensor Denominator_inv = create_identity_matrix<std::complex<double>>(nlocal, ct_device_type);
+    ct::kernels::lapack_getrs<std::complex<double>, ct_Device>()('N',
+                                                                 nlocal,
+                                                                 nlocal,
+                                                                 Denominator.data<std::complex<double>>(),
+                                                                 nlocal,
+                                                                 ipiv.data<int>(),
+                                                                 Denominator_inv.data<std::complex<double>>(),
+                                                                 nlocal);
+
+    //->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+
+    // (4) U_operator = Denominator_inv * Numerator;
+    std::complex<double> one_gemm = {1.0, 0.0};
+    std::complex<double> zero_gemm = {0.0, 0.0};
+    ct::kernels::blas_gemm<std::complex<double>, ct_Device>()('N',
+                                                              'N',
+                                                              nlocal,
+                                                              nlocal,
+                                                              nlocal,
+                                                              &one_gemm,
+                                                              Denominator_inv.data<std::complex<double>>(),
+                                                              nlocal,
+                                                              Numerator.data<std::complex<double>>(),
+                                                              nlocal,
+                                                              &zero_gemm,
+                                                              U_operator.data<std::complex<double>>(),
+                                                              nlocal);
+
+    if (print_matrix)
+    {
+        ct::Tensor Denominator_inv_cpu = Denominator_inv.to_device<ct::DEVICE_CPU>();
+        ct::Tensor Numerator_cpu = Numerator.to_device<ct::DEVICE_CPU>();
+        ct::Tensor U_operator_cpu = U_operator.to_device<ct::DEVICE_CPU>();
+
+        ofs_running << " fenmu^-1:" << std::endl;
+        for (int i = 0; i < nlocal; i++)
+        {
+            const int in = i * nlocal;
+            for (int j = 0; j < nlocal; j++)
+            {
+                ofs_running << Denominator_inv_cpu.data<std::complex<double>>()[in + j].real() << "+"
+                            << Denominator_inv_cpu.data<std::complex<double>>()[in + j].imag() << "i ";
+            }
+            ofs_running << std::endl;
+        }
+        ofs_running << std::endl;
+        ofs_running << " fenzi:" << std::endl;
+        for (int i = 0; i < nlocal; i++)
+        {
+            const int in = i * nlocal;
+            for (int j = 0; j < nlocal; j++)
+            {
+                ofs_running << Numerator_cpu.data<std::complex<double>>()[in + j].real() << "+"
+                            << Numerator_cpu.data<std::complex<double>>()[in + j].imag() << "i ";
+            }
+            ofs_running << std::endl;
+        }
+        ofs_running << std::endl;
+        ofs_running << " U operator:" << std::endl;
+        for (int i = 0; i < nlocal; i++)
+        {
+            const int in = i * nlocal;
+            for (int j = 0; j < nlocal; j++)
+            {
+                double aa = U_operator_cpu.data<std::complex<double>>()[in + j].real();
+                double bb = U_operator_cpu.data<std::complex<double>>()[in + j].imag();
+                if (std::abs(aa) < 1e-8)
+                {
+                    aa = 0.0;
+                }
+                if (std::abs(bb) < 1e-8)
+                {
+                    bb = 0.0;
+                }
+                ofs_running << aa << "+" << bb << "i ";
+            }
+            ofs_running << std::endl;
+        }
+    }
+}
+
+// Explicit instantiation of template functions
+template void Propagator::compute_propagator_cn2_tensor_lapack<base_device::DEVICE_CPU>(const int nlocal,
+                                                                                        const ct::Tensor& Stmp,
+                                                                                        const ct::Tensor& Htmp,
+                                                                                        ct::Tensor& U_operator,
+                                                                                        std::ofstream& ofs_running,
+                                                                                        const int print_matrix) const;
+#if ((defined __CUDA) /* || (defined __ROCM) */)
+template void Propagator::compute_propagator_cn2_tensor_lapack<base_device::DEVICE_GPU>(const int nlocal,
+                                                                                        const ct::Tensor& Stmp,
+                                                                                        const ct::Tensor& Htmp,
+                                                                                        ct::Tensor& U_operator,
+                                                                                        std::ofstream& ofs_running,
+                                                                                        const int print_matrix) const;
+#endif // __CUDA
+#endif // __MPI
+} // namespace module_tddft
diff --git a/source/module_hamilt_lcao/module_tddft/propagator_etrs.cpp b/source/module_hamilt_lcao/module_tddft/propagator_etrs.cpp
new file mode 100644
index 0000000000..ad37264d59
--- /dev/null
+++ b/source/module_hamilt_lcao/module_tddft/propagator_etrs.cpp
@@ -0,0 +1,50 @@
+#include "module_base/lapack_connector.h"
+#include "module_base/module_container/ATen/kernels/blas.h"
+#include "module_base/module_container/ATen/kernels/lapack.h"
+#include "module_base/module_container/ATen/kernels/memory.h" // memory operations (Tensor)
+#include "module_base/module_device/memory_op.h"              // memory operations
+#include "module_base/scalapack_connector.h"
+#include "module_parameter/parameter.h"
+#include "propagator.h"
+
+#include <complex>
+#include <iostream>
+
+namespace module_tddft
+{
+#ifdef __MPI
+void Propagator::compute_propagator_etrs(const int nlocal,
+                                         const std::complex<double>* Stmp,
+                                         const std::complex<double>* Htmp,
+                                         const std::complex<double>* H_laststep,
+                                         std::complex<double>* U_operator,
+                                         std::ofstream& ofs_running,
+                                         const int print_matrix) const
+{
+    std::vector<std::complex<double>> U1(this->ParaV->nloc);
+    std::vector<std::complex<double>> U2(this->ParaV->nloc);
+    int tag = 2;
+    compute_propagator_taylor(nlocal, Stmp, Htmp, U1.data(), ofs_running, print_matrix, tag);
+    compute_propagator_taylor(nlocal, Stmp, H_laststep, U2.data(), ofs_running, print_matrix, tag);
+    ScalapackConnector::gemm('N',
+                             'N',
+                             nlocal,
+                             nlocal,
+                             nlocal,
+                             1.0,
+                             U1.data(),
+                             1,
+                             1,
+                             this->ParaV->desc,
+                             U2.data(),
+                             1,
+                             1,
+                             this->ParaV->desc,
+                             0.0,
+                             U_operator,
+                             1,
+                             1,
+                             this->ParaV->desc);
+}
+#endif // __MPI
+} // namespace module_tddft
diff --git a/source/module_hamilt_lcao/module_tddft/propagator_taylor.cpp b/source/module_hamilt_lcao/module_tddft/propagator_taylor.cpp
new file mode 100644
index 0000000000..58567a900a
--- /dev/null
+++ b/source/module_hamilt_lcao/module_tddft/propagator_taylor.cpp
@@ -0,0 +1,343 @@
+#include "module_base/lapack_connector.h"
+#include "module_base/module_container/ATen/kernels/blas.h"
+#include "module_base/module_container/ATen/kernels/lapack.h"
+#include "module_base/module_container/ATen/kernels/memory.h" // memory operations (Tensor)
+#include "module_base/module_device/memory_op.h"              // memory operations
+#include "module_base/scalapack_connector.h"
+#include "module_parameter/parameter.h"
+#include "propagator.h"
+
+#include <complex>
+#include <iostream>
+
+namespace module_tddft
+{
+#ifdef __MPI
+void Propagator::compute_propagator_taylor(const int nlocal,
+                                           const std::complex<double>* Stmp,
+                                           const std::complex<double>* Htmp,
+                                           std::complex<double>* U_operator,
+                                           std::ofstream& ofs_running,
+                                           const int print_matrix,
+                                           const int tag) const
+{
+    assert(this->ParaV->nloc > 0);
+
+    ModuleBase::GlobalFunc::ZEROS(U_operator, this->ParaV->nloc);
+    std::complex<double>* A_matrix = new std::complex<double>[this->ParaV->nloc];
+    ModuleBase::GlobalFunc::ZEROS(A_matrix, this->ParaV->nloc);
+    std::complex<double>* rank0 = new std::complex<double>[this->ParaV->nloc];
+    ModuleBase::GlobalFunc::ZEROS(rank0, this->ParaV->nloc);
+    std::complex<double>* rank2 = new std::complex<double>[this->ParaV->nloc];
+    ModuleBase::GlobalFunc::ZEROS(rank2, this->ParaV->nloc);
+    std::complex<double>* rank3 = new std::complex<double>[this->ParaV->nloc];
+    ModuleBase::GlobalFunc::ZEROS(rank3, this->ParaV->nloc);
+    std::complex<double>* rank4 = new std::complex<double>[this->ParaV->nloc];
+    ModuleBase::GlobalFunc::ZEROS(rank4, this->ParaV->nloc);
+    std::complex<double>* tmp1 = new std::complex<double>[this->ParaV->nloc];
+    ModuleBase::GlobalFunc::ZEROS(tmp1, this->ParaV->nloc);
+    std::complex<double>* tmp2 = new std::complex<double>[this->ParaV->nloc];
+    ModuleBase::GlobalFunc::ZEROS(tmp2, this->ParaV->nloc);
+    std::complex<double>* Sinv = new std::complex<double>[this->ParaV->nloc];
+    ModuleBase::GlobalFunc::ZEROS(Sinv, this->ParaV->nloc);
+    BlasConnector::copy(this->ParaV->nloc, Stmp, 1, Sinv, 1);
+
+    if (print_matrix)
+    {
+        ofs_running << std::endl;
+        ofs_running << " S matrix :" << std::endl;
+        for (int i = 0; i < this->ParaV->nrow; i++)
+        {
+            const int in = i * this->ParaV->ncol;
+            for (int j = 0; j < this->ParaV->ncol; j++)
+            {
+                ofs_running << Stmp[in + j].real() << "+" << Stmp[in + j].imag() << "i ";
+            }
+            ofs_running << std::endl;
+        }
+        ofs_running << std::endl;
+        ofs_running << std::endl;
+        ofs_running << " H matrix :" << std::endl;
+        for (int i = 0; i < this->ParaV->nrow; i++)
+        {
+            const int in = i * this->ParaV->ncol;
+            for (int j = 0; j < this->ParaV->ncol; j++)
+            {
+                ofs_running << Htmp[in + j].real() << "+" << Htmp[in + j].imag() << "i ";
+            }
+            ofs_running << std::endl;
+        }
+        ofs_running << std::endl;
+    }
+
+    // set rank0
+    int info = 0;
+    int naroc[2] = {0, 0}; // maximum number of row or column
+
+    for (int iprow = 0; iprow < this->ParaV->dim0; ++iprow)
+    {
+        for (int ipcol = 0; ipcol < this->ParaV->dim1; ++ipcol)
+        {
+            if (iprow == ParaV->coord[0] && ipcol == ParaV->coord[1])
+            {
+                naroc[0] = this->ParaV->nrow;
+                naroc[1] = this->ParaV->ncol;
+                for (int j = 0; j < naroc[1]; ++j)
+                {
+                    int igcol = globalIndex(j, this->ParaV->nb, this->ParaV->dim1, ipcol);
+                    if (igcol >= nlocal)
+                    {
+                        continue;
+                    }
+                    for (int i = 0; i < naroc[0]; ++i)
+                    {
+                        int igrow = globalIndex(i, this->ParaV->nb, this->ParaV->dim0, iprow);
+                        if (igrow >= nlocal)
+                        {
+                            continue;
+                        }
+                        if (igcol == igrow)
+                        {
+                            rank0[j * naroc[0] + i] = {1.0, 0.0};
+                        }
+                        else
+                        {
+                            rank0[j * naroc[0] + i] = {0.0, 0.0};
+                        }
+                    }
+                }
+            }
+        } // loop ipcol
+    } // loop iprow
+
+    std::complex<double> beta = {0.0, -0.5 * this->dt / tag}; // for ETRS tag=2 , for taylor tag=1
+
+    //->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+    // invert Stmp
+    int* ipiv = new int[this->ParaV->nloc];
+    // (3.1) compute ipiv
+    ScalapackConnector::getrf(nlocal, nlocal, Sinv, 1, 1, this->ParaV->desc, ipiv, &info);
+    int lwork = -1;
+    int liwotk = -1;
+    std::vector<std::complex<double>> work(1, 0);
+    std::vector<int> iwork(1, 0);
+    // (3.2) compute work
+    ScalapackConnector::getri(nlocal,
+                              Sinv,
+                              1,
+                              1,
+                              this->ParaV->desc,
+                              ipiv,
+                              work.data(),
+                              &lwork,
+                              iwork.data(),
+                              &liwotk,
+                              &info);
+    lwork = work[0].real();
+    work.resize(lwork, 0);
+    liwotk = iwork[0];
+    iwork.resize(liwotk, 0);
+    ScalapackConnector::getri(nlocal,
+                              Sinv,
+                              1,
+                              1,
+                              this->ParaV->desc,
+                              ipiv,
+                              work.data(),
+                              &lwork,
+                              iwork.data(),
+                              &liwotk,
+                              &info);
+    assert(0 == info);
+
+    //  A_matrix = - idt S^-1 H ;
+    ScalapackConnector::gemm('N',
+                             'N',
+                             nlocal,
+                             nlocal,
+                             nlocal,
+                             beta,
+                             Sinv,
+                             1,
+                             1,
+                             this->ParaV->desc,
+                             Htmp,
+                             1,
+                             1,
+                             this->ParaV->desc,
+                             0.0,
+                             U_operator,
+                             1,
+                             1,
+                             this->ParaV->desc);
+
+    //  rank2 = A^2 ;
+    ScalapackConnector::gemm('N',
+                             'N',
+                             nlocal,
+                             nlocal,
+                             nlocal,
+                             1.0,
+                             U_operator,
+                             1,
+                             1,
+                             this->ParaV->desc,
+                             U_operator,
+                             1,
+                             1,
+                             this->ParaV->desc,
+                             0.0,
+                             rank2,
+                             1,
+                             1,
+                             this->ParaV->desc);
+
+    //  rank3 = A^3 ;
+    ScalapackConnector::gemm('N',
+                             'N',
+                             nlocal,
+                             nlocal,
+                             nlocal,
+                             1.0,
+                             U_operator,
+                             1,
+                             1,
+                             this->ParaV->desc,
+                             rank2,
+                             1,
+                             1,
+                             this->ParaV->desc,
+                             0.0,
+                             rank3,
+                             1,
+                             1,
+                             this->ParaV->desc);
+
+    //  rank4 = A^4 ;
+    ScalapackConnector::gemm('N',
+                             'N',
+                             nlocal,
+                             nlocal,
+                             nlocal,
+                             1.0,
+                             U_operator,
+                             1,
+                             1,
+                             this->ParaV->desc,
+                             rank3,
+                             1,
+                             1,
+                             this->ParaV->desc,
+                             0.0,
+                             rank4,
+                             1,
+                             1,
+                             this->ParaV->desc);
+
+    std::complex<double> p1 = {1.0, 0.0};
+    std::complex<double> p2 = {1.0 / 2.0, 0.0};
+    std::complex<double> p3 = {1.0 / 6.0, 0.0};
+    std::complex<double> p4 = {1.0 / 24.0, 0.0};
+
+    ScalapackConnector::geadd('N',
+                              nlocal,
+                              nlocal,
+                              p1,
+                              rank0,
+                              1,
+                              1,
+                              this->ParaV->desc,
+                              p1,
+                              U_operator,
+                              1,
+                              1,
+                              this->ParaV->desc);
+
+    ScalapackConnector::geadd('N',
+                              nlocal,
+                              nlocal,
+                              p2,
+                              rank2,
+                              1,
+                              1,
+                              this->ParaV->desc,
+                              p1,
+                              U_operator,
+                              1,
+                              1,
+                              this->ParaV->desc);
+
+    ScalapackConnector::geadd('N',
+                              nlocal,
+                              nlocal,
+                              p3,
+                              rank3,
+                              1,
+                              1,
+                              this->ParaV->desc,
+                              p1,
+                              U_operator,
+                              1,
+                              1,
+                              this->ParaV->desc);
+
+    ScalapackConnector::geadd('N',
+                              nlocal,
+                              nlocal,
+                              p4,
+                              rank4,
+                              1,
+                              1,
+                              this->ParaV->desc,
+                              p1,
+                              U_operator,
+                              1,
+                              1,
+                              this->ParaV->desc);
+
+    if (print_matrix)
+    {
+        ofs_running << " A_matrix:" << std::endl;
+        for (int i = 0; i < this->ParaV->nrow; i++)
+        {
+            const int in = i * this->ParaV->ncol;
+            for (int j = 0; j < this->ParaV->ncol; j++)
+            {
+                ofs_running << A_matrix[in + j].real() << "+" << A_matrix[in + j].imag() << "i ";
+            }
+            ofs_running << std::endl;
+        }
+        ofs_running << std::endl;
+        ofs_running << " U operator:" << std::endl;
+        for (int i = 0; i < this->ParaV->nrow; i++)
+        {
+            const int in = i * this->ParaV->ncol;
+            for (int j = 0; j < this->ParaV->ncol; j++)
+            {
+                double aa = U_operator[in + j].real();
+                double bb = U_operator[in + j].imag();
+                if (std::abs(aa) < 1e-8)
+                {
+                    aa = 0.0;
+                }
+                if (std::abs(bb) < 1e-8)
+                {
+                    bb = 0.0;
+                }
+                ofs_running << aa << "+" << bb << "i ";
+            }
+            ofs_running << std::endl;
+        }
+    }
+    delete[] A_matrix;
+    delete[] rank0;
+    delete[] rank2;
+    delete[] rank3;
+    delete[] rank4;
+    delete[] tmp1;
+    delete[] tmp2;
+    delete[] Sinv;
+    delete[] ipiv;
+}
+#endif // __MPI
+} // namespace module_tddft
diff --git a/source/module_hamilt_lcao/module_tddft/test/CMakeLists.txt b/source/module_hamilt_lcao/module_tddft/test/CMakeLists.txt
index 3c54ffc9b2..07e64a5977 100644
--- a/source/module_hamilt_lcao/module_tddft/test/CMakeLists.txt
+++ b/source/module_hamilt_lcao/module_tddft/test/CMakeLists.txt
@@ -11,9 +11,9 @@ AddTest(
 )
 
 AddTest(
-  TARGET tddft_bandenergy_test
+  TARGET tddft_band_energy_test
   LIBS parameter ${math_libs} base device tddft_test_lib  
-  SOURCES bandenergy_test.cpp ../bandenergy.cpp ../../../module_basis/module_ao/parallel_orbitals.cpp
+  SOURCES band_energy_test.cpp ../band_energy.cpp ../../../module_basis/module_ao/parallel_orbitals.cpp
 )
 
 AddTest(
@@ -31,6 +31,6 @@ AddTest(
 AddTest(
   TARGET tddft_propagator_test
   LIBS parameter ${math_libs} base device tddft_test_lib  
-  SOURCES propagator_test1.cpp propagator_test2.cpp propagator_test3.cpp ../propagator.cpp 
+  SOURCES propagator_test1.cpp propagator_test2.cpp propagator_test3.cpp ../propagator.cpp ../propagator_cn2.cpp ../propagator_taylor.cpp ../propagator_etrs.cpp
 )
 
diff --git a/source/module_hamilt_lcao/module_tddft/test/bandenergy_test.cpp b/source/module_hamilt_lcao/module_tddft/test/band_energy_test.cpp
similarity index 94%
rename from source/module_hamilt_lcao/module_tddft/test/bandenergy_test.cpp
rename to source/module_hamilt_lcao/module_tddft/test/band_energy_test.cpp
index 853b75be76..706b50a603 100644
--- a/source/module_hamilt_lcao/module_tddft/test/bandenergy_test.cpp
+++ b/source/module_hamilt_lcao/module_tddft/test/band_energy_test.cpp
@@ -1,4 +1,4 @@
-#include "module_hamilt_lcao/module_tddft/bandenergy.h"
+#include "module_hamilt_lcao/module_tddft/band_energy.h"
 
 #include <gtest/gtest.h>
 #include <module_base/scalapack_connector.h>
@@ -9,7 +9,7 @@
 #include "tddft_test.h"
 
 /************************************************
- *  unit test of functions in bandenergy.h
+ *  unit test of functions in band_energy.h
  ***********************************************/
 
 /**
@@ -19,7 +19,6 @@
  */
 
 #define doublethreshold 1e-8
-double module_tddft::Evolve_elec::td_print_eij = -1;
 
 TEST(BandEnergyTest, testBandEnergy)
 {
@@ -87,7 +86,7 @@ TEST(BandEnergyTest, testBandEnergy)
     psi_k[11] = 1.0;
 
     // Call the function
-    module_tddft::compute_ekb(pv, nband, nlocal, Htmp, psi_k, ekb);
+    module_tddft::compute_ekb(pv, nband, nlocal, Htmp, psi_k, ekb, GlobalV::ofs_running);
 
     // Check the results
     EXPECT_NEAR(ekb[0], 3.0, doublethreshold);
diff --git a/source/module_hamilt_lcao/module_tddft/test/middle_hamilt_test.cpp b/source/module_hamilt_lcao/module_tddft/test/middle_hamilt_test.cpp
index 5cda7e4aba..6ac920400a 100644
--- a/source/module_hamilt_lcao/module_tddft/test/middle_hamilt_test.cpp
+++ b/source/module_hamilt_lcao/module_tddft/test/middle_hamilt_test.cpp
@@ -1,12 +1,13 @@
 #include "module_hamilt_lcao/module_tddft/middle_hamilt.h"
 
+#include "module_base/global_variable.h" // GlobalV::ofs_running
+#include "module_basis/module_ao/parallel_orbitals.h"
+#include "tddft_test.h"
+
 #include <gtest/gtest.h>
 #include <module_base/scalapack_connector.h>
 #include <mpi.h>
 
-#include "module_basis/module_ao/parallel_orbitals.h"
-#include "tddft_test.h"
-
 /************************************************
  *  unit test of functions in middle_hamilt.h
  ***********************************************/
@@ -72,7 +73,7 @@ TEST(MiddleHamiltTest, testMiddleHamilt)
     }
 
     // Call the function
-    module_tddft::half_Hmatrix(pv, nband, nlocal, Htmp, Stmp, Hlaststep, Slaststep, print_matrix);
+    module_tddft::half_Hmatrix(pv, nband, nlocal, Htmp, Stmp, Hlaststep, Slaststep, GlobalV::ofs_running, print_matrix);
 
     // Check the results
     EXPECT_NEAR(Htmp[0].real(), 1.0, doublethreshold);
diff --git a/source/module_hamilt_lcao/module_tddft/test/norm_psi_test.cpp b/source/module_hamilt_lcao/module_tddft/test/norm_psi_test.cpp
index 6c78937803..e4b8852c95 100644
--- a/source/module_hamilt_lcao/module_tddft/test/norm_psi_test.cpp
+++ b/source/module_hamilt_lcao/module_tddft/test/norm_psi_test.cpp
@@ -1,12 +1,13 @@
 #include "module_hamilt_lcao/module_tddft/norm_psi.h"
 
+#include "module_base/global_variable.h" // GlobalV::ofs_running
+#include "module_basis/module_ao/parallel_orbitals.h"
+#include "tddft_test.h"
+
 #include <gtest/gtest.h>
 #include <module_base/scalapack_connector.h>
 #include <mpi.h>
 
-#include "module_basis/module_ao/parallel_orbitals.h"
-#include "tddft_test.h"
-
 /************************************************
  *  unit test of functions in norm_psi.h
  ***********************************************/
@@ -89,7 +90,7 @@ TEST(NormPsiTest, testNormPsi)
     psi_k[11] = 1.0;
 
     // Call the function
-    module_tddft::norm_psi(pv, nband, nlocal, Stmp, psi_k, print_matrix);
+    module_tddft::norm_psi(pv, nband, nlocal, Stmp, psi_k, GlobalV::ofs_running, print_matrix);
 
     // Check the results
     EXPECT_NEAR(psi_k[0].real(), 0.577350269189626, doublethreshold);
diff --git a/source/module_hamilt_lcao/module_tddft/test/propagator_test1.cpp b/source/module_hamilt_lcao/module_tddft/test/propagator_test1.cpp
index 882bee90a6..0002ee4b44 100644
--- a/source/module_hamilt_lcao/module_tddft/test/propagator_test1.cpp
+++ b/source/module_hamilt_lcao/module_tddft/test/propagator_test1.cpp
@@ -72,7 +72,7 @@ TEST(PropagatorTest, testPropagatorCN)
     // Call the function
     int propagator = 0;
     module_tddft::Propagator prop(propagator, pv, PARAM.mdp.md_dt);
-    prop.compute_propagator(nlocal, Stmp, Htmp, nullptr, U_operator, print_matrix);
+    prop.compute_propagator(nlocal, Stmp, Htmp, nullptr, U_operator, GlobalV::ofs_running, print_matrix);
 
     // Check the results
     EXPECT_NEAR(U_operator[0].real(), -0.107692307692308, doublethreshold);
diff --git a/source/module_hamilt_lcao/module_tddft/test/propagator_test2.cpp b/source/module_hamilt_lcao/module_tddft/test/propagator_test2.cpp
index 16f57c18a1..7ff2416a6a 100644
--- a/source/module_hamilt_lcao/module_tddft/test/propagator_test2.cpp
+++ b/source/module_hamilt_lcao/module_tddft/test/propagator_test2.cpp
@@ -77,7 +77,7 @@ TEST(PropagatorTest, testPropagatorTaylor)
     // Call the function
     int propagator = 1;
     module_tddft::Propagator prop(propagator, pv, PARAM.mdp.md_dt);
-    prop.compute_propagator(nlocal, Stmp, Htmp, nullptr, U_operator, print_matrix);
+    prop.compute_propagator(nlocal, Stmp, Htmp, nullptr, U_operator, GlobalV::ofs_running, print_matrix);
 
     // Check the results
     EXPECT_NEAR(U_operator[0].real(), 1.95473251028807, doublethreshold);
diff --git a/source/module_hamilt_lcao/module_tddft/test/propagator_test3.cpp b/source/module_hamilt_lcao/module_tddft/test/propagator_test3.cpp
index ba0b84b4a7..faabe0019d 100644
--- a/source/module_hamilt_lcao/module_tddft/test/propagator_test3.cpp
+++ b/source/module_hamilt_lcao/module_tddft/test/propagator_test3.cpp
@@ -81,7 +81,7 @@ TEST(PropagatorTest, testPropagatorETRS)
     // Call the function
     int propagator = 2;
     module_tddft::Propagator prop(propagator, pv, PARAM.mdp.md_dt);
-    prop.compute_propagator(nlocal, Stmp, Htmp, Hlaststep, U_operator, print_matrix);
+    prop.compute_propagator(nlocal, Stmp, Htmp, Hlaststep, U_operator, GlobalV::ofs_running, print_matrix);
 
     // Check the results
     EXPECT_NEAR(U_operator[0].real(), -0.0105865569272976, doublethreshold);
diff --git a/source/module_hamilt_lcao/module_tddft/test/upsi_test1.cpp b/source/module_hamilt_lcao/module_tddft/test/upsi_test1.cpp
index 2782db5376..6ffe45a3bd 100644
--- a/source/module_hamilt_lcao/module_tddft/test/upsi_test1.cpp
+++ b/source/module_hamilt_lcao/module_tddft/test/upsi_test1.cpp
@@ -1,11 +1,12 @@
-#include <gtest/gtest.h>
-#include <module_base/scalapack_connector.h>
-#include <mpi.h>
-
+#include "module_base/global_variable.h" // GlobalV::ofs_running
 #include "module_basis/module_ao/parallel_orbitals.h"
 #include "module_hamilt_lcao/module_tddft/upsi.h"
 #include "tddft_test.h"
 
+#include <gtest/gtest.h>
+#include <module_base/scalapack_connector.h>
+#include <mpi.h>
+
 #define doublethreshold 1e-8
 
 /************************************************
@@ -61,7 +62,7 @@ TEST(UpsiTest, testUpsi1)
     }
 
     // Call the function
-    module_tddft::upsi(pv, nband, nlocal, U_operator, psi_k_laststep, psi_k, print_matrix);
+    module_tddft::upsi(pv, nband, nlocal, U_operator, psi_k_laststep, psi_k, GlobalV::ofs_running, print_matrix);
 
     // Check the results
     for (int i = 0; i < nlocal * nband; ++i)
diff --git a/source/module_hamilt_lcao/module_tddft/test/upsi_test2.cpp b/source/module_hamilt_lcao/module_tddft/test/upsi_test2.cpp
index 33ca60f7a6..915b2a248b 100644
--- a/source/module_hamilt_lcao/module_tddft/test/upsi_test2.cpp
+++ b/source/module_hamilt_lcao/module_tddft/test/upsi_test2.cpp
@@ -1,11 +1,12 @@
-#include <gtest/gtest.h>
-#include <module_base/scalapack_connector.h>
-#include <mpi.h>
-
+#include "module_base/global_variable.h" // GlobalV::ofs_running
 #include "module_basis/module_ao/parallel_orbitals.h"
 #include "module_hamilt_lcao/module_tddft/upsi.h"
 #include "tddft_test.h"
 
+#include <gtest/gtest.h>
+#include <module_base/scalapack_connector.h>
+#include <mpi.h>
+
 #define doublethreshold 1e-8
 
 /************************************************
@@ -55,7 +56,7 @@ TEST(UpsiTest, testUpsi2)
     }
 
     // Call the function
-    module_tddft::upsi(pv, nband, nlocal, U_operator, psi_k_laststep, psi_k, print_matrix);
+    module_tddft::upsi(pv, nband, nlocal, U_operator, psi_k_laststep, psi_k, GlobalV::ofs_running, print_matrix);
 
     // Check the results
     for (int i = 0; i < nlocal * nband; ++i)
diff --git a/source/module_hamilt_lcao/module_tddft/test/upsi_test3.cpp b/source/module_hamilt_lcao/module_tddft/test/upsi_test3.cpp
index 576adf60ee..cf5e177b13 100644
--- a/source/module_hamilt_lcao/module_tddft/test/upsi_test3.cpp
+++ b/source/module_hamilt_lcao/module_tddft/test/upsi_test3.cpp
@@ -1,11 +1,12 @@
-#include <gtest/gtest.h>
-#include <module_base/scalapack_connector.h>
-#include <mpi.h>
-
+#include "module_base/global_variable.h" // GlobalV::ofs_running
 #include "module_basis/module_ao/parallel_orbitals.h"
 #include "module_hamilt_lcao/module_tddft/upsi.h"
 #include "tddft_test.h"
 
+#include <gtest/gtest.h>
+#include <module_base/scalapack_connector.h>
+#include <mpi.h>
+
 #define doublethreshold 1e-8
 
 /************************************************
@@ -69,7 +70,7 @@ TEST(UpsiTest, testUpsi3)
     }
 
     // Call the function
-    module_tddft::upsi(pv, nband, nlocal, U_operator, psi_k_laststep, psi_k, print_matrix);
+    module_tddft::upsi(pv, nband, nlocal, U_operator, psi_k_laststep, psi_k, GlobalV::ofs_running, print_matrix);
 
     // Check the results
     for (int i = 0; i < nlocal; ++i)
diff --git a/source/module_hamilt_lcao/module_tddft/upsi.cpp b/source/module_hamilt_lcao/module_tddft/upsi.cpp
index 9ccc9ac347..cc9a2448a8 100644
--- a/source/module_hamilt_lcao/module_tddft/upsi.cpp
+++ b/source/module_hamilt_lcao/module_tddft/upsi.cpp
@@ -1,11 +1,12 @@
 #include "upsi.h"
 
-#include <complex>
-#include <iostream>
-
 #include "module_base/lapack_connector.h"
+#include "module_base/module_container/ATen/kernels/blas.h"
 #include "module_base/scalapack_connector.h"
 
+#include <complex>
+#include <iostream>
+
 namespace module_tddft
 {
 #ifdef __MPI
@@ -15,9 +16,9 @@ void upsi(const Parallel_Orbitals* pv,
           const std::complex<double>* U_operator,
           const std::complex<double>* psi_k_laststep,
           std::complex<double>* psi_k,
+          std::ofstream& ofs_running,
           const int print_matrix)
 {
-
     ScalapackConnector::gemm('N',
                              'N',
                              nlocal,
@@ -40,43 +41,231 @@ void upsi(const Parallel_Orbitals* pv,
 
     if (print_matrix)
     {
-        GlobalV::ofs_running << std::endl;
-        GlobalV::ofs_running << " psi_k:" << std::endl;
+        ofs_running << std::endl;
+        ofs_running << " psi_k:" << std::endl;
+        for (int i = 0; i < pv->ncol_bands; i++)
+        {
+            const int in = i * pv->ncol;
+            for (int j = 0; j < pv->ncol; j++)
+            {
+                double aa = psi_k[in + j].real();
+                double bb = psi_k[in + j].imag();
+                if (std::abs(aa) < 1e-8)
+                {
+                    aa = 0.0;
+                }
+                if (std::abs(bb) < 1e-8)
+                {
+                    bb = 0.0;
+                }
+                ofs_running << aa << "+" << bb << "i ";
+            }
+            ofs_running << std::endl;
+        }
+        ofs_running << std::endl;
+        ofs_running << " psi_k_laststep:" << std::endl;
+        for (int i = 0; i < pv->ncol_bands; i++)
+        {
+            const int in = i * pv->ncol;
+            for (int j = 0; j < pv->ncol; j++)
+            {
+                double aa = psi_k_laststep[in + j].real();
+                double bb = psi_k_laststep[in + j].imag();
+                if (std::abs(aa) < 1e-8)
+                {
+                    aa = 0.0;
+                }
+                if (std::abs(bb) < 1e-8)
+                {
+                    bb = 0.0;
+                }
+                ofs_running << aa << "+" << bb << "i ";
+            }
+            ofs_running << std::endl;
+        }
+        ofs_running << std::endl;
+    }
+}
+
+void upsi_tensor(const Parallel_Orbitals* pv,
+                 const int nband,
+                 const int nlocal,
+                 const ct::Tensor& U_operator,
+                 const ct::Tensor& psi_k_laststep,
+                 ct::Tensor& psi_k,
+                 std::ofstream& ofs_running,
+                 const int print_matrix)
+{
+    ScalapackConnector::gemm('N',
+                             'N',
+                             nlocal,
+                             nband,
+                             nlocal,
+                             1.0,
+                             U_operator.data<std::complex<double>>(),
+                             1,
+                             1,
+                             pv->desc,
+                             psi_k_laststep.data<std::complex<double>>(),
+                             1,
+                             1,
+                             pv->desc_wfc,
+                             0.0,
+                             psi_k.data<std::complex<double>>(),
+                             1,
+                             1,
+                             pv->desc_wfc);
+
+    if (print_matrix)
+    {
+        ofs_running << std::endl;
+        ofs_running << " psi_k:" << std::endl;
         for (int i = 0; i < pv->ncol_bands; i++)
         {
+            const int in = i * pv->ncol;
             for (int j = 0; j < pv->ncol; j++)
             {
-                double aa, bb;
-                aa = psi_k[i * pv->ncol + j].real();
-                bb = psi_k[i * pv->ncol + j].imag();
+                double aa = psi_k.data<std::complex<double>>()[in + j].real();
+                double bb = psi_k.data<std::complex<double>>()[in + j].imag();
                 if (std::abs(aa) < 1e-8)
+                {
                     aa = 0.0;
+                }
                 if (std::abs(bb) < 1e-8)
+                {
                     bb = 0.0;
-                GlobalV::ofs_running << aa << "+" << bb << "i ";
+                }
+                ofs_running << aa << "+" << bb << "i ";
             }
-            GlobalV::ofs_running << std::endl;
+            ofs_running << std::endl;
         }
-        GlobalV::ofs_running << std::endl;
-        GlobalV::ofs_running << " psi_k_laststep:" << std::endl;
+        ofs_running << std::endl;
+        ofs_running << " psi_k_laststep:" << std::endl;
         for (int i = 0; i < pv->ncol_bands; i++)
         {
+            const int in = i * pv->ncol;
             for (int j = 0; j < pv->ncol; j++)
             {
-                double aa, bb;
-                aa = psi_k_laststep[i * pv->ncol + j].real();
-                bb = psi_k_laststep[i * pv->ncol + j].imag();
+                double aa = psi_k_laststep.data<std::complex<double>>()[in + j].real();
+                double bb = psi_k_laststep.data<std::complex<double>>()[in + j].imag();
+                if (std::abs(aa) < 1e-8)
+                {
+                    aa = 0.0;
+                }
+                if (std::abs(bb) < 1e-8)
+                {
+                    bb = 0.0;
+                }
+                ofs_running << aa << "+" << bb << "i ";
+            }
+            ofs_running << std::endl;
+        }
+        ofs_running << std::endl;
+    }
+}
+
+template <typename Device>
+void upsi_tensor_lapack(const Parallel_Orbitals* pv,
+                        const int nband,
+                        const int nlocal,
+                        const ct::Tensor& U_operator,
+                        const ct::Tensor& psi_k_laststep,
+                        ct::Tensor& psi_k,
+                        std::ofstream& ofs_running,
+                        const int print_matrix)
+{
+    // ct_device_type = ct::DeviceType::CpuDevice or ct::DeviceType::GpuDevice
+    ct::DeviceType ct_device_type = ct::DeviceTypeToEnum<Device>::value;
+    // ct_Device = ct::DEVICE_CPU or ct::DEVICE_GPU
+    using ct_Device = typename ct::PsiToContainer<Device>::type;
+
+    // Perform the matrix multiplication: psi_k = U_operator * psi_k_laststep
+    std::complex<double> alpha = {1.0, 0.0};
+    std::complex<double> beta = {0.0, 0.0};
+
+    ct::kernels::blas_gemm<std::complex<double>, ct_Device>()('N',
+                                                              'N',
+                                                              nlocal,
+                                                              nband,
+                                                              nlocal,
+                                                              &alpha,
+                                                              U_operator.data<std::complex<double>>(),
+                                                              nlocal,
+                                                              psi_k_laststep.data<std::complex<double>>(),
+                                                              nlocal,
+                                                              &beta,
+                                                              psi_k.data<std::complex<double>>(),
+                                                              nlocal);
+
+    if (print_matrix)
+    {
+        ct::Tensor psi_k_cpu = psi_k.to_device<ct::DEVICE_CPU>();
+        ct::Tensor psi_k_laststep_cpu = psi_k_laststep.to_device<ct::DEVICE_CPU>();
+
+        ofs_running << std::endl;
+        ofs_running << " psi_k:" << std::endl;
+        for (int i = 0; i < nband; i++)
+        {
+            const int in = i * nlocal;
+            for (int j = 0; j < nlocal; j++)
+            {
+                double aa = psi_k_cpu.data<std::complex<double>>()[in + j].real();
+                double bb = psi_k_cpu.data<std::complex<double>>()[in + j].imag();
+                if (std::abs(aa) < 1e-8)
+                {
+                    aa = 0.0;
+                }
+                if (std::abs(bb) < 1e-8)
+                {
+                    bb = 0.0;
+                }
+                ofs_running << aa << "+" << bb << "i ";
+            }
+            ofs_running << std::endl;
+        }
+        ofs_running << std::endl;
+        ofs_running << " psi_k_laststep:" << std::endl;
+        for (int i = 0; i < nband; i++)
+        {
+            const int in = i * nlocal;
+            for (int j = 0; j < nlocal; j++)
+            {
+                double aa = psi_k_laststep_cpu.data<std::complex<double>>()[in + j].real();
+                double bb = psi_k_laststep_cpu.data<std::complex<double>>()[in + j].imag();
                 if (std::abs(aa) < 1e-8)
+                {
                     aa = 0.0;
+                }
                 if (std::abs(bb) < 1e-8)
+                {
                     bb = 0.0;
-                GlobalV::ofs_running << aa << "+" << bb << "i ";
+                }
+                ofs_running << aa << "+" << bb << "i ";
             }
-            GlobalV::ofs_running << std::endl;
+            ofs_running << std::endl;
         }
-        GlobalV::ofs_running << std::endl;
+        ofs_running << std::endl;
     }
 }
 
-#endif
+// Explicit instantiation of template functions
+template void upsi_tensor_lapack<base_device::DEVICE_CPU>(const Parallel_Orbitals* pv,
+                                                          const int nband,
+                                                          const int nlocal,
+                                                          const ct::Tensor& U_operator,
+                                                          const ct::Tensor& psi_k_laststep,
+                                                          ct::Tensor& psi_k,
+                                                          std::ofstream& ofs_running,
+                                                          const int print_matrix);
+#if ((defined __CUDA) /* || (defined __ROCM) */)
+template void upsi_tensor_lapack<base_device::DEVICE_GPU>(const Parallel_Orbitals* pv,
+                                                          const int nband,
+                                                          const int nlocal,
+                                                          const ct::Tensor& U_operator,
+                                                          const ct::Tensor& psi_k_laststep,
+                                                          ct::Tensor& psi_k,
+                                                          std::ofstream& ofs_running,
+                                                          const int print_matrix);
+#endif // __CUDA
+#endif // __MPI
 } // namespace module_tddft
diff --git a/source/module_hamilt_lcao/module_tddft/upsi.h b/source/module_hamilt_lcao/module_tddft/upsi.h
index 3d703de5f8..c687a18fed 100644
--- a/source/module_hamilt_lcao/module_tddft/upsi.h
+++ b/source/module_hamilt_lcao/module_tddft/upsi.h
@@ -7,7 +7,9 @@
 #ifndef UPSI_H
 #define UPSI_H
 
+#include "module_base/module_container/ATen/core/tensor.h" // ct::Tensor
 #include "module_basis/module_ao/parallel_orbitals.h"
+
 #include <complex>
 
 namespace module_tddft
@@ -30,9 +32,29 @@ void upsi(const Parallel_Orbitals* pv,
           const std::complex<double>* U_operator,
           const std::complex<double>* psi_k_laststep,
           std::complex<double>* psi_k,
+          std::ofstream& ofs_running,
           const int print_matrix);
 
-#endif
+void upsi_tensor(const Parallel_Orbitals* pv,
+                 const int nband,
+                 const int nlocal,
+                 const ct::Tensor& U_operator,
+                 const ct::Tensor& psi_k_laststep,
+                 ct::Tensor& psi_k,
+                 std::ofstream& ofs_running,
+                 const int print_matrix);
+
+template <typename Device>
+void upsi_tensor_lapack(const Parallel_Orbitals* pv,
+                        const int nband,
+                        const int nlocal,
+                        const ct::Tensor& U_operator,
+                        const ct::Tensor& psi_k_laststep,
+                        ct::Tensor& psi_k,
+                        std::ofstream& ofs_running,
+                        const int print_matrix);
+
+#endif // __MPI
 } // namespace module_tddft
 
 #endif
diff --git a/source/module_hsolver/diago_cusolver.cpp b/source/module_hsolver/diago_cusolver.cpp
index 5511ec941c..efaf20a28a 100644
--- a/source/module_hsolver/diago_cusolver.cpp
+++ b/source/module_hsolver/diago_cusolver.cpp
@@ -1,10 +1,10 @@
 #include "diago_cusolver.h"
 
-#include "module_parameter/parameter.h"
 #include "module_base/blacs_connector.h"
 #include "module_base/global_variable.h"
 #include "module_base/scalapack_connector.h"
 #include "module_base/timer.h"
+#include "module_parameter/parameter.h"
 
 #include <memory>
 #include <type_traits>
diff --git a/source/module_io/input_conv.cpp b/source/module_io/input_conv.cpp
index 65ff27dbc1..54e549366d 100644
--- a/source/module_io/input_conv.cpp
+++ b/source/module_io/input_conv.cpp
@@ -275,22 +275,12 @@ void Input_Conv::Convert()
 // Fuxiang He add 2016-10-26
 //----------------------------------------------------------
 #ifdef __LCAO
-    module_tddft::Evolve_elec::td_force_dt = PARAM.inp.td_force_dt;
-    module_tddft::Evolve_elec::td_vext = PARAM.inp.td_vext;
-    if (module_tddft::Evolve_elec::td_vext)
-    {
-        parse_expression(PARAM.inp.td_vext_dire, module_tddft::Evolve_elec::td_vext_dire_case);
-    }
-    module_tddft::Evolve_elec::out_dipole = PARAM.inp.out_dipole;
-    module_tddft::Evolve_elec::out_efield = PARAM.inp.out_efield;
-    module_tddft::Evolve_elec::td_print_eij = PARAM.inp.td_print_eij;
-    module_tddft::Evolve_elec::td_edm = PARAM.inp.td_edm;
     TD_Velocity::out_current = PARAM.inp.out_current;
     TD_Velocity::out_current_k = PARAM.inp.out_current_k;
     TD_Velocity::out_vecpot = PARAM.inp.out_vecpot;
     TD_Velocity::init_vecpot_file = PARAM.inp.init_vecpot_file;
     read_td_efield();
-#endif
+#endif // __LCAO
 
    
 
diff --git a/source/module_io/read_input_item_tddft.cpp b/source/module_io/read_input_item_tddft.cpp
index 236b15545b..9687af495d 100644
--- a/source/module_io/read_input_item_tddft.cpp
+++ b/source/module_io/read_input_item_tddft.cpp
@@ -20,13 +20,28 @@ void ReadInput::item_rt_tddft()
         read_sync_bool(input.td_vext);
         this->add_item(item);
     }
+    // {
+    //     Input_Item item("td_vext_dire");
+    //     item.annotation = "extern potential direction";
+    //     item.read_value = [](const Input_Item& item, Parameter& para) {
+    //         para.input.td_vext_dire = longstring(item.str_values);
+    //     };
+    //     sync_string(input.td_vext_dire);
+    //     this->add_item(item);
+    // }
     {
         Input_Item item("td_vext_dire");
         item.annotation = "extern potential direction";
         item.read_value = [](const Input_Item& item, Parameter& para) {
-            para.input.td_vext_dire = longstring(item.str_values);
+            parse_expression(item.str_values, para.input.td_vext_dire);
         };
-        sync_string(input.td_vext_dire);
+        item.get_final_value = [](Input_Item& item, const Parameter& para) {
+            if (item.is_read())
+            {
+                item.final_value.str(longstring(item.str_values));
+            }
+        };
+        add_intvec_bcast(input.td_vext_dire, para.input.td_vext_dire.size(), 0);
         this->add_item(item);
     }
     {
diff --git a/source/module_io/test/for_testing_input_conv.h b/source/module_io/test/for_testing_input_conv.h
index 88b1f304e2..7abea67bf7 100644
--- a/source/module_io/test/for_testing_input_conv.h
+++ b/source/module_io/test/for_testing_input_conv.h
@@ -30,17 +30,10 @@
 #undef private
 bool berryphase::berry_phase_flag = false;
 
-double module_tddft::Evolve_elec::td_force_dt;
-bool module_tddft::Evolve_elec::td_vext;
-std::vector<int> module_tddft::Evolve_elec::td_vext_dire_case;
-bool module_tddft::Evolve_elec::out_dipole;
-bool module_tddft::Evolve_elec::out_efield;
 bool TD_Velocity::out_current;
 bool TD_Velocity::out_current_k;
 bool TD_Velocity::out_vecpot;
 bool TD_Velocity::init_vecpot_file;
-double module_tddft::Evolve_elec::td_print_eij;
-int module_tddft::Evolve_elec::td_edm;
 double elecstate::Gatefield::zgate = 0.5;
 bool elecstate::Gatefield::relax = false;
 bool elecstate::Gatefield::block = false;
diff --git a/source/module_io/test/read_input_ptest.cpp b/source/module_io/test/read_input_ptest.cpp
index b29cdcfc55..90f985c307 100644
--- a/source/module_io/test/read_input_ptest.cpp
+++ b/source/module_io/test/read_input_ptest.cpp
@@ -295,7 +295,6 @@ TEST_F(InputParaTest, ParaRead)
     EXPECT_DOUBLE_EQ(param.inp.soc_lambda, 1.0);
     EXPECT_DOUBLE_EQ(param.inp.td_force_dt, 0.02);
     EXPECT_EQ(param.inp.td_vext, 0);
-    EXPECT_EQ(param.inp.td_vext_dire, "1");
     EXPECT_EQ(param.inp.propagator, 0);
     EXPECT_EQ(param.inp.td_stype, 0);
     EXPECT_EQ(param.inp.td_ttype, "0");
diff --git a/source/module_parameter/input_parameter.h b/source/module_parameter/input_parameter.h
index d2c27f78e1..4e42017ebd 100644
--- a/source/module_parameter/input_parameter.h
+++ b/source/module_parameter/input_parameter.h
@@ -280,9 +280,10 @@ struct Input_para
     double bessel_descriptor_sigma = 0.1;           ///< spherical bessel smearing_sigma
 
     // ==============   #Parameters (9.rt-tddft) ===========================
-    double td_force_dt = 0.02;      ///<"fs"
-    bool td_vext = false;           ///< add extern potential or not
-    std::string td_vext_dire = "1"; ///< vext direction
+    double td_force_dt = 0.02;           ///<"fs"
+    bool td_vext = false;                ///< add extern potential or not
+    // std::string td_vext_dire = "1";   ///< vext direction
+    std::vector<int> td_vext_dire = {1}; ///< vector of vext direction
 
     bool init_vecpot_file = false; ///< initialize the vector potential, though file or integral
     double td_print_eij = -1.0;    ///< threshold to output Eij elements