diff --git a/source/Makefile.Objects b/source/Makefile.Objects index ad13d75976..9e3c736b89 100644 --- a/source/Makefile.Objects +++ b/source/Makefile.Objects @@ -557,10 +557,13 @@ OBJS_IO_LCAO=cal_r_overlap_R.o\ OBJS_LCAO=evolve_elec.o\ evolve_psi.o\ - bandenergy.o\ + band_energy.o\ middle_hamilt.o\ norm_psi.o\ propagator.o\ + propagator_cn2.o\ + propagator_taylor.o\ + propagator_etrs.o\ td_velocity.o\ td_current.o\ snap_psibeta_half_tddft.o\ diff --git a/source/module_base/lapack_connector.h b/source/module_base/lapack_connector.h index d4a4b646ff..29018d4db4 100644 --- a/source/module_base/lapack_connector.h +++ b/source/module_base/lapack_connector.h @@ -133,8 +133,8 @@ extern "C" // zgetrf computes the LU factorization of a general matrix // while zgetri takes its output to perform matrix inversion - void zgetrf_(const int* m, const int *n, const std::complex *A, const int *lda, int *ipiv, const int* info); - void zgetri_(const int* n, std::complex *A, const int *lda, int *ipiv, std::complex *work, int *lwork, const int *info); + void zgetrf_(const int* m, const int *n, std::complex *A, const int *lda, int *ipiv, int* info); + void zgetri_(const int* n, std::complex* A, const int* lda, const int* ipiv, std::complex* work, const int* lwork, int* info); // if trans=='N': C = alpha * A * A.H + beta * C // if trans=='C': C = alpha * A.H * A + beta * C diff --git a/source/module_base/module_container/ATen/kernels/cuda/lapack.cu b/source/module_base/module_container/ATen/kernels/cuda/lapack.cu index 48caa4be7e..6300df953f 100644 --- a/source/module_base/module_container/ATen/kernels/cuda/lapack.cu +++ b/source/module_base/module_container/ATen/kernels/cuda/lapack.cu @@ -117,6 +117,49 @@ struct lapack_dngvd { } }; +template +struct lapack_getrf { + void operator()( + const int& m, + const int& n, + T* Mat, + const int& lda, + int* ipiv) + { + cuSolverConnector::getrf(cusolver_handle, m, n, Mat, lda, ipiv); + } +}; + +template +struct lapack_getri { + void operator()( + const int& n, + T* Mat, + const int& lda, + const int* ipiv, + T* work, + const int& lwork) + { + throw std::runtime_error("cuSOLVER does not provide LU-based matrix inversion interface (getri). To compute the inverse on GPU, use getrs instead."); + } +}; + +template +struct lapack_getrs { + void operator()( + const char& trans, + const int& n, + const int& nrhs, + T* A, + const int& lda, + const int* ipiv, + T* B, + const int& ldb) + { + cuSolverConnector::getrs(cusolver_handle, trans, n, nrhs, A, lda, ipiv, B, ldb); + } +}; + template struct set_matrix; template struct set_matrix; template struct set_matrix, DEVICE_GPU>; @@ -142,5 +185,20 @@ template struct lapack_dngvd; template struct lapack_dngvd, DEVICE_GPU>; template struct lapack_dngvd, DEVICE_GPU>; +template struct lapack_getrf; +template struct lapack_getrf; +template struct lapack_getrf, DEVICE_GPU>; +template struct lapack_getrf, DEVICE_GPU>; + +template struct lapack_getri; +template struct lapack_getri; +template struct lapack_getri, DEVICE_GPU>; +template struct lapack_getri, DEVICE_GPU>; + +template struct lapack_getrs; +template struct lapack_getrs; +template struct lapack_getrs, DEVICE_GPU>; +template struct lapack_getrs, DEVICE_GPU>; + } // namespace kernels } // namespace container \ No newline at end of file diff --git a/source/module_base/module_container/ATen/kernels/lapack.cpp b/source/module_base/module_container/ATen/kernels/lapack.cpp index 436b552e1c..2369306309 100644 --- a/source/module_base/module_container/ATen/kernels/lapack.cpp +++ b/source/module_base/module_container/ATen/kernels/lapack.cpp @@ -124,6 +124,61 @@ struct lapack_dngvd { } }; +template +struct lapack_getrf { + void operator()( + const int& m, + const int& n, + T* Mat, + const int& lda, + int* ipiv) + { + int info = 0; + lapackConnector::getrf(m, n, Mat, lda, ipiv, info); + if (info != 0) { + throw std::runtime_error("getrf failed with info = " + std::to_string(info)); + } + } +}; + +template +struct lapack_getri { + void operator()( + const int& n, + T* Mat, + const int& lda, + const int* ipiv, + T* work, + const int& lwork) + { + int info = 0; + lapackConnector::getri(n, Mat, lda, ipiv, work, lwork, info); + if (info != 0) { + throw std::runtime_error("getri failed with info = " + std::to_string(info)); + } + } +}; + +template +struct lapack_getrs { + void operator()( + const char& trans, + const int& n, + const int& nrhs, + T* A, + const int& lda, + const int* ipiv, + T* B, + const int& ldb) + { + int info = 0; + lapackConnector::getrs(trans, n, nrhs, A, lda, ipiv, B, ldb, info); + if (info != 0) { + throw std::runtime_error("getrs failed with info = " + std::to_string(info)); + } + } +}; + template struct set_matrix; template struct set_matrix; template struct set_matrix, DEVICE_CPU>; @@ -149,5 +204,20 @@ template struct lapack_dngvd; template struct lapack_dngvd, DEVICE_CPU>; template struct lapack_dngvd, DEVICE_CPU>; +template struct lapack_getrf; +template struct lapack_getrf; +template struct lapack_getrf, DEVICE_CPU>; +template struct lapack_getrf, DEVICE_CPU>; + +template struct lapack_getri; +template struct lapack_getri; +template struct lapack_getri, DEVICE_CPU>; +template struct lapack_getri, DEVICE_CPU>; + +template struct lapack_getrs; +template struct lapack_getrs; +template struct lapack_getrs, DEVICE_CPU>; +template struct lapack_getrs, DEVICE_CPU>; + } // namespace kernels } // namespace container \ No newline at end of file diff --git a/source/module_base/module_container/ATen/kernels/lapack.h b/source/module_base/module_container/ATen/kernels/lapack.h index 561eac2fb1..cf164dec10 100644 --- a/source/module_base/module_container/ATen/kernels/lapack.h +++ b/source/module_base/module_container/ATen/kernels/lapack.h @@ -65,6 +65,42 @@ struct lapack_dngvd { Real* eigen_val); }; + +template +struct lapack_getrf { + void operator()( + const int& m, + const int& n, + T* Mat, + const int& lda, + int* ipiv); +}; + + +template +struct lapack_getri { + void operator()( + const int& n, + T* Mat, + const int& lda, + const int* ipiv, + T* work, + const int& lwork); +}; + +template +struct lapack_getrs { + void operator()( + const char& trans, + const int& n, + const int& nrhs, + T* A, + const int& lda, + const int* ipiv, + T* B, + const int& ldb); +}; + #if defined(__CUDA) || defined(__ROCM) // TODO: Use C++ singleton to manage the GPU handles void createGpuSolverHandle(); // create cusolver handle diff --git a/source/module_base/module_container/base/third_party/cusolver.h b/source/module_base/module_container/base/third_party/cusolver.h index 2ad1e2a427..defd60a121 100644 --- a/source/module_base/module_container/base/third_party/cusolver.h +++ b/source/module_base/module_container/base/third_party/cusolver.h @@ -132,7 +132,7 @@ void potrf (cusolverDnHandle_t& cusolver_handle, const char& uplo, const int& n, static inline void dnevd (cusolverDnHandle_t& cusolver_handle, const char& jobz, const char& uplo, const int& n, float* A, const int& lda, float * W) { - // prepare some values for cusolverDnZhegvd_bufferSize + // prepare some values for cusolverDnSsyevd_bufferSize int lwork = 0; int h_info = 0; int* d_info = nullptr; @@ -142,7 +142,7 @@ void dnevd (cusolverDnHandle_t& cusolver_handle, const char& jobz, const char& u // calculate the sizes needed for pre-allocated buffer. cusolverErrcheck(cusolverDnSsyevd_bufferSize(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo), n, A, lda, W, &lwork)); - // allocate memery + // allocate memory cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(float) * lwork)); // compute eigenvalues and eigenvectors. cusolverErrcheck(cusolverDnSsyevd(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo), @@ -158,7 +158,7 @@ void dnevd (cusolverDnHandle_t& cusolver_handle, const char& jobz, const char& u static inline void dnevd (cusolverDnHandle_t& cusolver_handle, const char& jobz, const char& uplo, const int& n, double* A, const int& lda, double * W) { - // prepare some values for cusolverDnZhegvd_bufferSize + // prepare some values for cusolverDnDsyevd_bufferSize int lwork = 0; int h_info = 0; int* d_info = nullptr; @@ -168,7 +168,7 @@ void dnevd (cusolverDnHandle_t& cusolver_handle, const char& jobz, const char& u // calculate the sizes needed for pre-allocated buffer. cusolverErrcheck(cusolverDnDsyevd_bufferSize(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo), n, A, lda, W, &lwork)); - // allocate memery + // allocate memory cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(double) * lwork)); // compute eigenvalues and eigenvectors. cusolverErrcheck(cusolverDnDsyevd(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo), @@ -184,7 +184,7 @@ void dnevd (cusolverDnHandle_t& cusolver_handle, const char& jobz, const char& u static inline void dnevd (cusolverDnHandle_t& cusolver_handle, const char& jobz, const char& uplo, const int& n, std::complex* A, const int& lda, float * W) { - // prepare some values for cusolverDnZhegvd_bufferSize + // prepare some values for cusolverDnCheevd_bufferSize int lwork = 0; int h_info = 0; int* d_info = nullptr; @@ -194,7 +194,7 @@ void dnevd (cusolverDnHandle_t& cusolver_handle, const char& jobz, const char& u // calculate the sizes needed for pre-allocated buffer. cusolverErrcheck(cusolverDnCheevd_bufferSize(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo), n, reinterpret_cast(A), lda, W, &lwork)); - // allocate memery + // allocate memory cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuComplex) * lwork)); // compute eigenvalues and eigenvectors. cusolverErrcheck(cusolverDnCheevd(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo), @@ -210,7 +210,7 @@ void dnevd (cusolverDnHandle_t& cusolver_handle, const char& jobz, const char& u static inline void dnevd (cusolverDnHandle_t& cusolver_handle, const char& jobz, const char& uplo, const int& n, std::complex* A, const int& lda, double* W) { - // prepare some values for cusolverDnZhegvd_bufferSize + // prepare some values for cusolverDnZheevd_bufferSize int lwork = 0; int h_info = 0; int* d_info = nullptr; @@ -220,7 +220,7 @@ void dnevd (cusolverDnHandle_t& cusolver_handle, const char& jobz, const char& u // calculate the sizes needed for pre-allocated buffer. cusolverErrcheck(cusolverDnZheevd_bufferSize(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo), n, reinterpret_cast(A), lda, W, &lwork)); - // allocate memery + // allocate memory cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuDoubleComplex) * lwork)); // compute eigenvalues and eigenvectors. cusolverErrcheck(cusolverDnZheevd(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo), @@ -237,7 +237,7 @@ void dnevd (cusolverDnHandle_t& cusolver_handle, const char& jobz, const char& u static inline void dngvd (cusolverDnHandle_t& cusolver_handle, const int& itype, const char& jobz, const char& uplo, const int& n, float* A, const int& lda, float* B, const int& ldb, float * W) { - // prepare some values for cusolverDnZhegvd_bufferSize + // prepare some values for cusolverDnSsygvd_bufferSize int lwork = 0; int h_info = 0; int* d_info = nullptr; @@ -247,7 +247,7 @@ void dngvd (cusolverDnHandle_t& cusolver_handle, const int& itype, const char& j // calculate the sizes needed for pre-allocated buffer. cusolverErrcheck(cusolverDnSsygvd_bufferSize(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo), n, A, lda, B, ldb, W, &lwork)); - // allocate memery + // allocate memory cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(float) * lwork)); // compute eigenvalues and eigenvectors. cusolverErrcheck(cusolverDnSsygvd(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo), @@ -263,7 +263,7 @@ void dngvd (cusolverDnHandle_t& cusolver_handle, const int& itype, const char& j static inline void dngvd (cusolverDnHandle_t& cusolver_handle, const int& itype, const char& jobz, const char& uplo, const int& n, double* A, const int& lda, double* B, const int& ldb, double * W) { - // prepare some values for cusolverDnZhegvd_bufferSize + // prepare some values for cusolverDnDsygvd_bufferSize int lwork = 0; int h_info = 0; int* d_info = nullptr; @@ -273,7 +273,7 @@ void dngvd (cusolverDnHandle_t& cusolver_handle, const int& itype, const char& j // calculate the sizes needed for pre-allocated buffer. cusolverErrcheck(cusolverDnDsygvd_bufferSize(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo), n, A, lda, B, ldb, W, &lwork)); - // allocate memery + // allocate memory cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(double) * lwork)); // compute eigenvalues and eigenvectors. cusolverErrcheck(cusolverDnDsygvd(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo), @@ -289,7 +289,7 @@ void dngvd (cusolverDnHandle_t& cusolver_handle, const int& itype, const char& j static inline void dngvd (cusolverDnHandle_t& cusolver_handle, const int& itype, const char& jobz, const char& uplo, const int& n, std::complex* A, const int& lda, std::complex* B, const int& ldb, float* W) { - // prepare some values for cusolverDnZhegvd_bufferSize + // prepare some values for cusolverDnChegvd_bufferSize int lwork = 0; int h_info = 0; int* d_info = nullptr; @@ -299,7 +299,7 @@ void dngvd (cusolverDnHandle_t& cusolver_handle, const int& itype, const char& j // calculate the sizes needed for pre-allocated buffer. cusolverErrcheck(cusolverDnChegvd_bufferSize(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo), n, reinterpret_cast(A), lda, reinterpret_cast(B), ldb, W, &lwork)); - // allocate memery + // allocate memory cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuComplex) * lwork)); // compute eigenvalues and eigenvectors. cusolverErrcheck(cusolverDnChegvd(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo), @@ -325,7 +325,7 @@ void dngvd (cusolverDnHandle_t& cusolver_handle, const int& itype, const char& j // calculate the sizes needed for pre-allocated buffer. cusolverErrcheck(cusolverDnZhegvd_bufferSize(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo), n, reinterpret_cast(A), lda, reinterpret_cast(B), ldb, W, &lwork)); - // allocate memery + // allocate memory cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuDoubleComplex) * lwork)); // compute eigenvalues and eigenvectors. cusolverErrcheck(cusolverDnZhegvd(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo), @@ -339,6 +339,180 @@ void dngvd (cusolverDnHandle_t& cusolver_handle, const int& itype, const char& j cudaErrcheck(cudaFree(d_work)); } +static inline +void getrf(cusolverDnHandle_t& cusolver_handle, const int& m, const int& n, float* A, const int& lda, int* ipiv) +{ + // prepare some values for cusolverDnSgetrf_bufferSize + int lwork = 0; + int h_info = 0; + int* d_info = nullptr; + float* d_work = nullptr; + cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int))); + + // calculate the sizes needed for pre-allocated buffer. + cusolverErrcheck(cusolverDnSgetrf_bufferSize(cusolver_handle, m, n, A, lda, &lwork)); + + // allocate memory + cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(float) * lwork)); + + // Perform LU decomposition + cusolverErrcheck(cusolverDnSgetrf(cusolver_handle, m, n, A, lda, d_work, ipiv, d_info)); + + cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost)); + if (h_info != 0) { + throw std::runtime_error("getrf: failed to compute LU factorization"); + } + + cudaErrcheck(cudaFree(d_work)); + cudaErrcheck(cudaFree(d_info)); +} +static inline +void getrf(cusolverDnHandle_t& cusolver_handle, const int& m, const int& n, double* A, const int& lda, int* ipiv) +{ + // prepare some values for cusolverDnDgetrf_bufferSize + int lwork = 0; + int h_info = 0; + int* d_info = nullptr; + double* d_work = nullptr; + cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int))); + + // calculate the sizes needed for pre-allocated buffer. + cusolverErrcheck(cusolverDnDgetrf_bufferSize(cusolver_handle, m, n, A, lda, &lwork)); + + // allocate memory + cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(double) * lwork)); + + // Perform LU decomposition + cusolverErrcheck(cusolverDnDgetrf(cusolver_handle, m, n, A, lda, d_work, ipiv, d_info)); + + cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost)); + if (h_info != 0) { + throw std::runtime_error("getrf: failed to compute LU factorization"); + } + + cudaErrcheck(cudaFree(d_work)); + cudaErrcheck(cudaFree(d_info)); +} +static inline +void getrf(cusolverDnHandle_t& cusolver_handle, const int& m, const int& n, std::complex* A, const int& lda, int* ipiv) +{ + // prepare some values for cusolverDnCgetrf_bufferSize + int lwork = 0; + int h_info = 0; + int* d_info = nullptr; + cuComplex* d_work = nullptr; + cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int))); + + // calculate the sizes needed for pre-allocated buffer. + cusolverErrcheck(cusolverDnCgetrf_bufferSize(cusolver_handle, m, n, reinterpret_cast(A), lda, &lwork)); + + // allocate memory + cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuComplex) * lwork)); + + // Perform LU decomposition + cusolverErrcheck(cusolverDnCgetrf(cusolver_handle, m, n, reinterpret_cast(A), lda, d_work, ipiv, d_info)); + + cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost)); + if (h_info != 0) { + throw std::runtime_error("getrf: failed to compute LU factorization"); + } + + cudaErrcheck(cudaFree(d_work)); + cudaErrcheck(cudaFree(d_info)); +} +static inline +void getrf(cusolverDnHandle_t& cusolver_handle, const int& m, const int& n, std::complex* A, const int& lda, int* ipiv) +{ + // prepare some values for cusolverDnZgetrf_bufferSize + int lwork = 0; + int h_info = 0; + int* d_info = nullptr; + cuDoubleComplex* d_work = nullptr; + cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int))); + + // calculate the sizes needed for pre-allocated buffer. + cusolverErrcheck(cusolverDnZgetrf_bufferSize(cusolver_handle, m, n, reinterpret_cast(A), lda, &lwork)); + + // allocate memory + cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuDoubleComplex) * lwork)); + + // Perform LU decomposition + cusolverErrcheck(cusolverDnZgetrf(cusolver_handle, m, n, reinterpret_cast(A), lda, d_work, ipiv, d_info)); + + cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost)); + if (h_info != 0) { + throw std::runtime_error("getrf: failed to compute LU factorization"); + } + + cudaErrcheck(cudaFree(d_work)); + cudaErrcheck(cudaFree(d_info)); +} + +static inline +void getrs(cusolverDnHandle_t& cusolver_handle, const char& trans, const int& n, const int& nrhs, float* A, const int& lda, const int* ipiv, float* B, const int& ldb) +{ + int h_info = 0; + int* d_info = nullptr; + cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int))); + + cusolverErrcheck(cusolverDnSgetrs(cusolver_handle, GetCublasOperation(trans), n, nrhs, A, lda, ipiv, B, ldb, d_info)); + + cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost)); + if (h_info != 0) { + throw std::runtime_error("getrs: failed to solve the linear system"); + } + + cudaErrcheck(cudaFree(d_info)); +} +static inline +void getrs(cusolverDnHandle_t& cusolver_handle, const char& trans, const int& n, const int& nrhs, double* A, const int& lda, const int* ipiv, double* B, const int& ldb) +{ + int h_info = 0; + int* d_info = nullptr; + cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int))); + + cusolverErrcheck(cusolverDnDgetrs(cusolver_handle, GetCublasOperation(trans), n, nrhs, A, lda, ipiv, B, ldb, d_info)); + + cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost)); + if (h_info != 0) { + throw std::runtime_error("getrs: failed to solve the linear system"); + } + + cudaErrcheck(cudaFree(d_info)); +} +static inline +void getrs(cusolverDnHandle_t& cusolver_handle, const char& trans, const int& n, const int& nrhs, std::complex* A, const int& lda, const int* ipiv, std::complex* B, const int& ldb) +{ + int h_info = 0; + int* d_info = nullptr; + cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int))); + + cusolverErrcheck(cusolverDnCgetrs(cusolver_handle, GetCublasOperation(trans), n, nrhs, reinterpret_cast(A), lda, ipiv, reinterpret_cast(B), ldb, d_info)); + + cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost)); + if (h_info != 0) { + throw std::runtime_error("getrs: failed to solve the linear system"); + } + + cudaErrcheck(cudaFree(d_info)); +} +static inline +void getrs(cusolverDnHandle_t& cusolver_handle, const char& trans, const int& n, const int& nrhs, std::complex* A, const int& lda, const int* ipiv, std::complex* B, const int& ldb) +{ + int h_info = 0; + int* d_info = nullptr; + cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int))); + + cusolverErrcheck(cusolverDnZgetrs(cusolver_handle, GetCublasOperation(trans), n, nrhs, reinterpret_cast(A), lda, ipiv, reinterpret_cast(B), ldb, d_info)); + + cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost)); + if (h_info != 0) { + throw std::runtime_error("getrs: failed to solve the linear system"); + } + + cudaErrcheck(cudaFree(d_info)); +} + } // namespace cuSolverConnector } // namespace container diff --git a/source/module_base/module_container/base/third_party/lapack.h b/source/module_base/module_container/base/third_party/lapack.h index 9b5ac7f409..7452dc1835 100644 --- a/source/module_base/module_container/base/third_party/lapack.h +++ b/source/module_base/module_container/base/third_party/lapack.h @@ -105,6 +105,20 @@ void dtrtri_(const char* uplo, const char* diag, const int* n, double* a, const void ctrtri_(const char* uplo, const char* diag, const int* n, std::complex* a, const int* lda, int* info); void ztrtri_(const char* uplo, const char* diag, const int* n, std::complex* a, const int* lda, int* info); +void sgetrf_(const int* m, const int* n, float* a, const int* lda, int* ipiv, int* info); +void dgetrf_(const int* m, const int* n, double* a, const int* lda, int* ipiv, int* info); +void cgetrf_(const int* m, const int* n, std::complex* a, const int* lda, int* ipiv, int* info); +void zgetrf_(const int* m, const int* n, std::complex* a, const int* lda, int* ipiv, int* info); + +void sgetri_(const int* n, float* A, const int* lda, const int* ipiv, float* work, const int* lwork, int* info); +void dgetri_(const int* n, double* A, const int* lda, const int* ipiv, double* work, const int* lwork, int* info); +void cgetri_(const int* n, std::complex* A, const int* lda, const int* ipiv, std::complex* work, const int* lwork, int* info); +void zgetri_(const int* n, std::complex* A, const int* lda, const int* ipiv, std::complex* work, const int* lwork, int* info); + +void sgetrs_(const char* trans, const int* n, const int* nrhs, const float* A, const int* lda, const int* ipiv, float* B, const int* ldb, int* info); +void dgetrs_(const char* trans, const int* n, const int* nrhs, const double* A, const int* lda, const int* ipiv, double* B, const int* ldb, int* info); +void cgetrs_(const char* trans, const int* n, const int* nrhs, const std::complex* A, const int* lda, const int* ipiv, std::complex* B, const int* ldb, int* info); +void zgetrs_(const char* trans, const int* n, const int* nrhs, const std::complex* A, const int* lda, const int* ipiv, std::complex* B, const int* ldb, int* info); } // Class LapackConnector provide the connector to fortran lapack routine. @@ -321,6 +335,69 @@ void trtri( const char &uplo, const char &diag, const int &n, std::complex* A, const int lda, int* ipiv, int &info) +{ + cgetrf_(&m, &n, A, &lda, ipiv, &info); +} +static inline +void getrf(const int m, const int n, std::complex* A, const int lda, int* ipiv, int &info) +{ + zgetrf_(&m, &n, A, &lda, ipiv, &info); +} + +static inline +void getri(const int n, float* A, const int lda, const int* ipiv, float* work, const int lwork, int& info) +{ + sgetri_(&n, A, &lda, ipiv, work, &lwork, &info); +} +static inline +void getri(const int n, double* A, const int lda, const int* ipiv, double* work, const int lwork, int& info) +{ + dgetri_(&n, A, &lda, ipiv, work, &lwork, &info); +} +static inline +void getri(const int n, std::complex* A, const int lda, const int* ipiv, std::complex* work, const int lwork, int& info) +{ + cgetri_(&n, A, &lda, ipiv, work, &lwork, &info); +} +static inline +void getri(const int n, std::complex* A, const int lda, const int* ipiv, std::complex* work, const int lwork, int& info) +{ + zgetri_(&n, A, &lda, ipiv, work, &lwork, &info); +} + +static inline +void getrs(const char& trans, const int n, const int nrhs, float* A, const int lda, const int* ipiv, float* B, const int ldb, int& info) +{ + sgetrs_(&trans, &n, &nrhs, A, &lda, ipiv, B, &ldb, &info); +} +static inline +void getrs(const char& trans, const int n, const int nrhs, double* A, const int lda, const int* ipiv, double* B, const int ldb, int& info) +{ + dgetrs_(&trans, &n, &nrhs, A, &lda, ipiv, B, &ldb, &info); +} +static inline +void getrs(const char& trans, const int n, const int nrhs, std::complex* A, const int lda, const int* ipiv, std::complex* B, const int ldb, int& info) +{ + cgetrs_(&trans, &n, &nrhs, A, &lda, ipiv, B, &ldb, &info); +} +static inline +void getrs(const char& trans, const int n, const int nrhs, std::complex* A, const int lda, const int* ipiv, std::complex* B, const int ldb, int& info) +{ + zgetrs_(&trans, &n, &nrhs, A, &lda, ipiv, B, &ldb, &info); +} + } // namespace lapackConnector } // namespace container diff --git a/source/module_elecstate/potentials/H_TDDFT_pw.cpp b/source/module_elecstate/potentials/H_TDDFT_pw.cpp index a0533fda51..cf0dbd2d40 100644 --- a/source/module_elecstate/potentials/H_TDDFT_pw.cpp +++ b/source/module_elecstate/potentials/H_TDDFT_pw.cpp @@ -1,12 +1,12 @@ #include "H_TDDFT_pw.h" -#include "module_parameter/parameter.h" #include "module_base/constants.h" #include "module_base/math_integral.h" #include "module_base/timer.h" #include "module_hamilt_lcao/module_tddft/evolve_elec.h" #include "module_hamilt_pw/hamilt_pwdft/global.h" #include "module_io/input_conv.h" +#include "module_parameter/parameter.h" namespace elecstate { @@ -91,7 +91,7 @@ void H_TDDFT_pw::cal_fixed_v(double* vl_pseudo) H_TDDFT_pw::istep_int = istep; // judgement to skip vext - if (!module_tddft::Evolve_elec::td_vext || istep > tend || istep < tstart) + if (!PARAM.inp.td_vext || istep > tend || istep < tstart) { return; } @@ -105,12 +105,12 @@ void H_TDDFT_pw::cal_fixed_v(double* vl_pseudo) trigo_count = 0; heavi_count = 0; - for (auto direc: module_tddft::Evolve_elec::td_vext_dire_case) + for (auto direc: PARAM.inp.td_vext_dire) { std::vector vext_space(this->rho_basis_->nrxx, 0.0); double vext_time = cal_v_time(ttype[count], true); - if (module_tddft::Evolve_elec::out_efield && GlobalV::MY_RANK == 0) + if (PARAM.inp.out_efield && GlobalV::MY_RANK == 0) { std::stringstream as; as << PARAM.globalv.global_out_dir << "efield_" << count << ".dat"; @@ -248,7 +248,7 @@ void H_TDDFT_pw::update_At() H_TDDFT_pw::istep++; // judgement to skip vext - if (!module_tddft::Evolve_elec::td_vext || istep > tend || istep < tstart) + if (!PARAM.inp.td_vext || istep > tend || istep < tstart) { return; } @@ -262,7 +262,7 @@ void H_TDDFT_pw::update_At() bool last = false; double out = 0.0; - for (auto direc: module_tddft::Evolve_elec::td_vext_dire_case) + for (auto direc: PARAM.inp.td_vext_dire) { last = false; // cut the integral space and initialize relevant parameters @@ -297,7 +297,7 @@ void H_TDDFT_pw::update_At() } // output Efield - if (module_tddft::Evolve_elec::out_efield && GlobalV::MY_RANK == 0) + if (PARAM.inp.out_efield && GlobalV::MY_RANK == 0) { std::stringstream as; as << PARAM.globalv.global_out_dir << "efield_" << count << ".dat"; diff --git a/source/module_esolver/esolver.cpp b/source/module_esolver/esolver.cpp index 0352492a3a..a727a7a095 100644 --- a/source/module_esolver/esolver.cpp +++ b/source/module_esolver/esolver.cpp @@ -19,6 +19,7 @@ extern "C" #include "esolver_lj.h" #include "esolver_of.h" #include "module_parameter/md_parameter.h" + #include namespace ModuleESolver @@ -45,16 +46,16 @@ std::string determine_type() else if (PARAM.inp.basis_type == "lcao_in_pw") { #ifdef __LCAO - if(PARAM.inp.esolver_type == "sdft") - { - esolver_type = "sdft_pw"; - } - else if(PARAM.inp.esolver_type == "ksdft") - { + if (PARAM.inp.esolver_type == "sdft") + { + esolver_type = "sdft_pw"; + } + else if (PARAM.inp.esolver_type == "ksdft") + { esolver_type = "ksdft_lip"; - } + } #else - ModuleBase::WARNING_QUIT("ESolver", "Calculation involving numerical orbitals must be compiled with __LCAO"); + ModuleBase::WARNING_QUIT("ESolver", "Calculation involving numerical orbitals must be compiled with __LCAO"); #endif } else if (PARAM.inp.basis_type == "lcao") @@ -67,7 +68,7 @@ std::string determine_type() else if (PARAM.inp.esolver_type == "ksdft") { esolver_type = "ksdft_lcao"; - } + } else if (PARAM.inp.esolver_type == "ks-lr") { esolver_type = "ksdft_lr_lcao"; @@ -77,7 +78,7 @@ std::string determine_type() esolver_type = "lr_lcao"; } #else - ModuleBase::WARNING_QUIT("ESolver", "Calculation involving numerical orbitals must be compiled with __LCAO"); + ModuleBase::WARNING_QUIT("ESolver", "Calculation involving numerical orbitals must be compiled with __LCAO"); #endif } @@ -98,18 +99,18 @@ std::string determine_type() auto device_info = PARAM.inp.device; - for (char &c : device_info) - { - if (std::islower(c)) - { - c = std::toupper(c); - } - } - if (GlobalV::MY_RANK == 0) - { - std::cout << " RUNNING WITH DEVICE : " << device_info << " / " - << base_device::information::get_device_info(PARAM.inp.device) << std::endl; - } + for (char& c: device_info) + { + if (std::islower(c)) + { + c = std::toupper(c); + } + } + if (GlobalV::MY_RANK == 0) + { + std::cout << " RUNNING WITH DEVICE : " << device_info << " / " + << base_device::information::get_device_info(PARAM.inp.device) << std::endl; + } GlobalV::ofs_running << "\n RUNNING WITH DEVICE : " << device_info << " / " << base_device::information::get_device_info(PARAM.inp.device) << std::endl; @@ -117,40 +118,39 @@ std::string determine_type() return esolver_type; } - -//Some API to operate E_Solver +// Some API to operate E_Solver ESolver* init_esolver(const Input_para& inp, UnitCell& ucell) { - //determine type of esolver based on INPUT information - const std::string esolver_type = determine_type(); + // determine type of esolver based on INPUT information + const std::string esolver_type = determine_type(); // initialize the corresponding Esolver child class if (esolver_type == "ksdft_pw") { #if ((defined __CUDA) || (defined __ROCM)) - if (PARAM.inp.device == "gpu") - { - if (PARAM.inp.precision == "single") - { - return new ESolver_KS_PW, base_device::DEVICE_GPU>(); - } - else - { - return new ESolver_KS_PW, base_device::DEVICE_GPU>(); - } - } + if (PARAM.inp.device == "gpu") + { + if (PARAM.inp.precision == "single") + { + return new ESolver_KS_PW, base_device::DEVICE_GPU>(); + } + else + { + return new ESolver_KS_PW, base_device::DEVICE_GPU>(); + } + } #endif - if (PARAM.inp.precision == "single") - { - return new ESolver_KS_PW, base_device::DEVICE_CPU>(); - } - else - { - return new ESolver_KS_PW, base_device::DEVICE_CPU>(); - } - } + if (PARAM.inp.precision == "single") + { + return new ESolver_KS_PW, base_device::DEVICE_CPU>(); + } + else + { + return new ESolver_KS_PW, base_device::DEVICE_CPU>(); + } + } else if (esolver_type == "sdft_pw") - { + { #if ((defined __CUDA) || (defined __ROCM)) if (PARAM.inp.device == "gpu") { @@ -160,19 +160,19 @@ ESolver* init_esolver(const Input_para& inp, UnitCell& ucell) // } // else // { - return new ESolver_SDFT_PW, base_device::DEVICE_GPU>(); + return new ESolver_SDFT_PW, base_device::DEVICE_GPU>(); // } } #endif // if (PARAM.inp.precision == "single") - // { - // return new ESolver_SDFT_PW, base_device::DEVICE_CPU>(); - // } - // else - // { - return new ESolver_SDFT_PW, base_device::DEVICE_CPU>(); - // } - } + // { + // return new ESolver_SDFT_PW, base_device::DEVICE_CPU>(); + // } + // else + // { + return new ESolver_SDFT_PW, base_device::DEVICE_CPU>(); + // } + } #ifdef __LCAO else if (esolver_type == "ksdft_lip") { @@ -212,14 +212,26 @@ ESolver* init_esolver(const Input_para& inp, UnitCell& ucell) } } else if (esolver_type == "ksdft_lcao_tddft") - { - return new ESolver_KS_LCAO_TDDFT(); + { +#if ((defined __CUDA) /* || (defined __ROCM) */) + if (PARAM.inp.device == "gpu") + { + return new ESolver_KS_LCAO_TDDFT(); + } +#endif + return new ESolver_KS_LCAO_TDDFT(); } else if (esolver_type == "lr_lcao") { // use constructor rather than Init function to initialize reference (instead of pointers) to ucell - if (PARAM.globalv.gamma_only_local) { return new LR::ESolver_LR(inp, ucell); } - else { return new LR::ESolver_LR, double>(inp, ucell); } + if (PARAM.globalv.gamma_only_local) + { + return new LR::ESolver_LR(inp, ucell); + } + else + { + return new LR::ESolver_LR, double>(inp, ucell); + } } else if (esolver_type == "ksdft_lr_lcao") { @@ -261,34 +273,36 @@ ESolver* init_esolver(const Input_para& inp, UnitCell& ucell) return p_esolver_lr; } #endif - else if(esolver_type == "ofdft") - { - return new ESolver_OF(); - } - else if (esolver_type == "lj_pot") - { - return new ESolver_LJ(); - } - else if (esolver_type == "dp_pot") - { - return new ESolver_DP(PARAM.mdp.pot_file); - } - throw std::invalid_argument("esolver_type = "+std::string(esolver_type)+". Wrong in "+std::string(__FILE__)+" line "+std::to_string(__LINE__)); + else if (esolver_type == "ofdft") + { + return new ESolver_OF(); + } + else if (esolver_type == "lj_pot") + { + return new ESolver_LJ(); + } + else if (esolver_type == "dp_pot") + { + return new ESolver_DP(PARAM.mdp.pot_file); + } + throw std::invalid_argument("esolver_type = " + std::string(esolver_type) + ". Wrong in " + std::string(__FILE__) + + " line " + std::to_string(__LINE__)); } -void clean_esolver(ESolver * &pesolver, const bool lcao_cblacs_exit) +void clean_esolver(ESolver*& pesolver, const bool lcao_cblacs_exit) { // Zhang Xiaoyang modified in 2024/7/6: // Note: because of the init method of serial lcao hsolver // it needs no release step for it, or this [delete] will cause Segmentation Fault // Probably it will be modified later. #ifdef __MPI - delete pesolver; + delete pesolver; #ifdef __LCAO - if (lcao_cblacs_exit) { + if (lcao_cblacs_exit) + { Cblacs_exit(1); -} + } #endif #endif } -} +} // namespace ModuleESolver diff --git a/source/module_esolver/esolver_ks_lcao_tddft.cpp b/source/module_esolver/esolver_ks_lcao_tddft.cpp index acff4f7bf1..11792f6a2b 100644 --- a/source/module_esolver/esolver_ks_lcao_tddft.cpp +++ b/source/module_esolver/esolver_ks_lcao_tddft.cpp @@ -38,13 +38,23 @@ namespace ModuleESolver { -ESolver_KS_LCAO_TDDFT::ESolver_KS_LCAO_TDDFT() +template +ESolver_KS_LCAO_TDDFT::ESolver_KS_LCAO_TDDFT() { classname = "ESolver_KS_LCAO_TDDFT"; basisname = "LCAO"; + + // If the device is GPU, we must open use_tensor and use_lapack + ct::DeviceType ct_device_type = ct::DeviceTypeToEnum::value; + if (ct_device_type == ct::DeviceType::GpuDevice) + { + use_tensor = true; + use_lapack = true; + } } -ESolver_KS_LCAO_TDDFT::~ESolver_KS_LCAO_TDDFT() +template +ESolver_KS_LCAO_TDDFT::~ESolver_KS_LCAO_TDDFT() { delete psi_laststep; if (Hk_laststep != nullptr) @@ -65,7 +75,8 @@ ESolver_KS_LCAO_TDDFT::~ESolver_KS_LCAO_TDDFT() } } -void ESolver_KS_LCAO_TDDFT::before_all_runners(UnitCell& ucell, const Input_para& inp) +template +void ESolver_KS_LCAO_TDDFT::before_all_runners(UnitCell& ucell, const Input_para& inp) { // 1) run before_all_runners in ESolver_KS_LCAO ESolver_KS_LCAO, double>::before_all_runners(ucell, inp); @@ -74,44 +85,54 @@ void ESolver_KS_LCAO_TDDFT::before_all_runners(UnitCell& ucell, const Input_para // this->pelec = dynamic_cast(this->pelec); } -void ESolver_KS_LCAO_TDDFT::hamilt2density_single(UnitCell& ucell, const int istep, const int iter, const double ethr) +template +void ESolver_KS_LCAO_TDDFT::hamilt2density_single(UnitCell& ucell, + const int istep, + const int iter, + const double ethr) { if (PARAM.inp.init_wfc == "file") { if (istep >= 1) { - module_tddft::Evolve_elec::solve_psi(istep, - PARAM.inp.nbands, - PARAM.globalv.nlocal, - this->p_hamilt, - this->pv, - this->psi, - this->psi_laststep, - this->Hk_laststep, - this->Sk_laststep, - this->pelec->ekb, - td_htype, - PARAM.inp.propagator, - kv.get_nks()); + module_tddft::Evolve_elec::solve_psi(istep, + PARAM.inp.nbands, + PARAM.globalv.nlocal, + kv.get_nks(), + this->p_hamilt, + this->pv, + this->psi, + this->psi_laststep, + this->Hk_laststep, + this->Sk_laststep, + this->pelec->ekb, + GlobalV::ofs_running, + td_htype, + PARAM.inp.propagator, + use_tensor, + use_lapack); this->weight_dm_rho(); } this->weight_dm_rho(); } else if (istep >= 2) { - module_tddft::Evolve_elec::solve_psi(istep, - PARAM.inp.nbands, - PARAM.globalv.nlocal, - this->p_hamilt, - this->pv, - this->psi, - this->psi_laststep, - this->Hk_laststep, - this->Sk_laststep, - this->pelec->ekb, - td_htype, - PARAM.inp.propagator, - kv.get_nks()); + module_tddft::Evolve_elec::solve_psi(istep, + PARAM.inp.nbands, + PARAM.globalv.nlocal, + kv.get_nks(), + this->p_hamilt, + this->pv, + this->psi, + this->psi_laststep, + this->Hk_laststep, + this->Sk_laststep, + this->pelec->ekb, + GlobalV::ofs_running, + td_htype, + PARAM.inp.propagator, + use_tensor, + use_lapack); this->weight_dm_rho(); } else @@ -141,7 +162,8 @@ void ESolver_KS_LCAO_TDDFT::hamilt2density_single(UnitCell& ucell, const int ist this->pelec->f_en.deband = this->pelec->cal_delta_eband(ucell); } -void ESolver_KS_LCAO_TDDFT::iter_finish(UnitCell& ucell, const int istep, int& iter) +template +void ESolver_KS_LCAO_TDDFT::iter_finish(UnitCell& ucell, const int istep, int& iter) { // print occupation of each band if (iter == 1 && istep <= 2) @@ -170,7 +192,8 @@ void ESolver_KS_LCAO_TDDFT::iter_finish(UnitCell& ucell, const int istep, int& i ESolver_KS_LCAO, double>::iter_finish(ucell, istep, iter); } -void ESolver_KS_LCAO_TDDFT::update_pot(UnitCell& ucell, const int istep, const int iter) +template +void ESolver_KS_LCAO_TDDFT::update_pot(UnitCell& ucell, const int istep, const int iter) { // Calculate new potential according to new Charge Density if (!this->conv_esolver) @@ -204,13 +227,17 @@ void ESolver_KS_LCAO_TDDFT::update_pot(UnitCell& ucell, const int istep, const i if (td_htype == 1) { + // Length of Hk_laststep and Sk_laststep, nlocal * nlocal for global, nloc for local + const int len_HS = use_tensor && use_lapack ? nlocal * nlocal : nloc; + if (this->Hk_laststep == nullptr) { this->Hk_laststep = new std::complex*[kv.get_nks()]; for (int ik = 0; ik < kv.get_nks(); ++ik) { - this->Hk_laststep[ik] = new std::complex[nloc]; - ModuleBase::GlobalFunc::ZEROS(Hk_laststep[ik], nloc); + // Allocate memory for Hk_laststep, if (use_tensor && use_lapack), should be global + this->Hk_laststep[ik] = new std::complex[len_HS]; + ModuleBase::GlobalFunc::ZEROS(Hk_laststep[ik], len_HS); } } if (this->Sk_laststep == nullptr) @@ -218,8 +245,9 @@ void ESolver_KS_LCAO_TDDFT::update_pot(UnitCell& ucell, const int istep, const i this->Sk_laststep = new std::complex*[kv.get_nks()]; for (int ik = 0; ik < kv.get_nks(); ++ik) { - this->Sk_laststep[ik] = new std::complex[nloc]; - ModuleBase::GlobalFunc::ZEROS(Sk_laststep[ik], nloc); + // Allocate memory for Sk_laststep, if (use_tensor && use_lapack), should be global + this->Sk_laststep[ik] = new std::complex[len_HS]; + ModuleBase::GlobalFunc::ZEROS(Sk_laststep[ik], len_HS); } } } @@ -240,13 +268,37 @@ void ESolver_KS_LCAO_TDDFT::update_pot(UnitCell& ucell, const int istep, const i this->p_hamilt->updateHk(ik); hamilt::MatrixBlock> h_mat, s_mat; this->p_hamilt->matrix(h_mat, s_mat); - BlasConnector::copy(nloc, h_mat.p, 1, Hk_laststep[ik], 1); - BlasConnector::copy(nloc, s_mat.p, 1, Sk_laststep[ik], 1); + + if (use_tensor && use_lapack) + { + // Gather H and S matrices to root process +#ifdef __MPI + int myid = 0; + int num_procs = 1; + MPI_Comm_rank(MPI_COMM_WORLD, &myid); + MPI_Comm_size(MPI_COMM_WORLD, &num_procs); + + Matrix_g> h_mat_g, s_mat_g; // Global matrix structure + + // Collect H matrix + gatherMatrix(myid, 0, h_mat, h_mat_g); + BlasConnector::copy(nlocal * nlocal, h_mat_g.p.get(), 1, Hk_laststep[ik], 1); + + // Collect S matrix + gatherMatrix(myid, 0, s_mat, s_mat_g); + BlasConnector::copy(nlocal * nlocal, s_mat_g.p.get(), 1, Sk_laststep[ik], 1); +#endif + } + else + { + BlasConnector::copy(nloc, h_mat.p, 1, Hk_laststep[ik], 1); + BlasConnector::copy(nloc, s_mat.p, 1, Sk_laststep[ik], 1); + } } } // calculate energy density matrix for tddft - if (istep >= (PARAM.inp.init_wfc == "file" ? 0 : 2) && module_tddft::Evolve_elec::td_edm == 0) + if (istep >= (PARAM.inp.init_wfc == "file" ? 0 : 2) && PARAM.inp.td_edm == 0) { elecstate::cal_edm_tddft(this->pv, this->pelec, this->kv, this->p_hamilt); } @@ -278,14 +330,15 @@ void ESolver_KS_LCAO_TDDFT::update_pot(UnitCell& ucell, const int istep, const i } } -void ESolver_KS_LCAO_TDDFT::after_scf(UnitCell& ucell, const int istep) +template +void ESolver_KS_LCAO_TDDFT::after_scf(UnitCell& ucell, const int istep) { ModuleBase::TITLE("ESolver_KS_LCAO_TDDFT", "after_scf"); ModuleBase::timer::tick("ESolver_KS_LCAO_TDDFT", "after_scf"); for (int is = 0; is < PARAM.inp.nspin; is++) { - if (module_tddft::Evolve_elec::out_dipole == 1) + if (PARAM.inp.out_dipole == 1) { std::stringstream ss_dipole; ss_dipole << PARAM.globalv.global_out_dir << "SPIN" << is + 1 << "_DIPOLE"; @@ -318,7 +371,8 @@ void ESolver_KS_LCAO_TDDFT::after_scf(UnitCell& ucell, const int istep) ModuleBase::timer::tick("ESolver_KS_LCAO_TDDFT", "after_scf"); } -void ESolver_KS_LCAO_TDDFT::weight_dm_rho() +template +void ESolver_KS_LCAO_TDDFT::weight_dm_rho() { if (PARAM.inp.ocp == 1) { @@ -335,4 +389,9 @@ void ESolver_KS_LCAO_TDDFT::weight_dm_rho() this->pelec->psiToRho(this->psi[0]); } +template class ESolver_KS_LCAO_TDDFT; +#if ((defined __CUDA) /* || (defined __ROCM) */) +template class ESolver_KS_LCAO_TDDFT; +#endif + } // namespace ModuleESolver diff --git a/source/module_esolver/esolver_ks_lcao_tddft.h b/source/module_esolver/esolver_ks_lcao_tddft.h index 40f4486475..c5fdcbc497 100644 --- a/source/module_esolver/esolver_ks_lcao_tddft.h +++ b/source/module_esolver/esolver_ks_lcao_tddft.h @@ -2,12 +2,52 @@ #define ESOLVER_KS_LCAO_TDDFT_H #include "esolver_ks.h" #include "esolver_ks_lcao.h" +#include "module_base/scalapack_connector.h" // Cpxgemr2d #include "module_hamilt_lcao/hamilt_lcaodft/record_adj.h" #include "module_psi/psi.h" namespace ModuleESolver { +//------------------------ MPI gathering and distributing functions ------------------------// +// This struct is used for collecting matrices from all processes to root process +template +struct Matrix_g +{ + std::shared_ptr p; + size_t row; + size_t col; + std::shared_ptr desc; +}; + +// Collect matrices from all processes to root process +template +void gatherMatrix(const int myid, const int root_proc, const hamilt::MatrixBlock& mat_l, Matrix_g& mat_g) +{ + const int* desca = mat_l.desc; // Obtain the descriptor of the local matrix + int ctxt = desca[1]; // BLACS context + int nrows = desca[2]; // Global matrix row number + int ncols = desca[3]; // Global matrix column number + if (myid == root_proc) + { + mat_g.p.reset(new T[nrows * ncols]); // No need to delete[] since it is a shared_ptr + } + else + { + mat_g.p.reset(new T[nrows * ncols]); // Placeholder for non-root processes + } + + // Set the descriptor of the global matrix + mat_g.desc.reset(new int[9]{1, ctxt, nrows, ncols, nrows, ncols, 0, 0, nrows}); + mat_g.row = nrows; + mat_g.col = ncols; + + // Call the Cpxgemr2d function in ScaLAPACK to collect the matrix data + Cpxgemr2d(nrows, ncols, mat_l.p, 1, 1, const_cast(desca), mat_g.p.get(), 1, 1, mat_g.desc.get(), ctxt); +} +//------------------------ MPI gathering and distributing functions ------------------------// + +template class ESolver_KS_LCAO_TDDFT : public ESolver_KS_LCAO, double> { public: @@ -35,7 +75,11 @@ class ESolver_KS_LCAO_TDDFT : public ESolver_KS_LCAO, doubl //! Overlap matrix of last time step std::complex** Sk_laststep = nullptr; - int td_htype = 1; + const int td_htype = 1; + + //! Control heterogeneous computing of the TDDFT solver + bool use_tensor = false; + bool use_lapack = false; private: void weight_dm_rho(); diff --git a/source/module_hamilt_lcao/module_tddft/CMakeLists.txt b/source/module_hamilt_lcao/module_tddft/CMakeLists.txt index 0f6068054b..6c5028d29f 100644 --- a/source/module_hamilt_lcao/module_tddft/CMakeLists.txt +++ b/source/module_hamilt_lcao/module_tddft/CMakeLists.txt @@ -2,10 +2,13 @@ if(ENABLE_LCAO) list(APPEND objects evolve_elec.cpp evolve_psi.cpp - bandenergy.cpp + band_energy.cpp middle_hamilt.cpp norm_psi.cpp propagator.cpp + propagator_cn2.cpp + propagator_taylor.cpp + propagator_etrs.cpp upsi.cpp td_velocity.cpp td_current.cpp diff --git a/source/module_hamilt_lcao/module_tddft/band_energy.cpp b/source/module_hamilt_lcao/module_tddft/band_energy.cpp new file mode 100644 index 0000000000..c148042be3 --- /dev/null +++ b/source/module_hamilt_lcao/module_tddft/band_energy.cpp @@ -0,0 +1,429 @@ +#include "band_energy.h" + +#include "evolve_elec.h" +#include "module_base/lapack_connector.h" +#include "module_base/module_container/ATen/kernels/blas.h" +#include "module_base/scalapack_connector.h" + +#include +#include + +namespace module_tddft +{ +#ifdef __MPI + +inline int globalIndex(int localindex, int nblk, int nprocs, int myproc) +{ + int iblock, gIndex; + iblock = localindex / nblk; + gIndex = (iblock * nprocs + myproc) * nblk + localindex % nblk; + return gIndex; +} + +void compute_ekb(const Parallel_Orbitals* pv, + const int nband, + const int nlocal, + const std::complex* Htmp, + const std::complex* psi_k, + double* ekb, + std::ofstream& ofs_running) +{ + assert(pv->nloc_wfc > 0 && pv->nloc > 0); + + std::complex* tmp1 = new std::complex[pv->nloc_wfc]; + ModuleBase::GlobalFunc::ZEROS(tmp1, pv->nloc_wfc); + + std::complex* eij = new std::complex[pv->nloc]; + ModuleBase::GlobalFunc::ZEROS(eij, pv->nloc); + + ScalapackConnector::gemm('N', + 'N', + nlocal, + nband, + nlocal, + 1.0, + Htmp, + 1, + 1, + pv->desc, + psi_k, + 1, + 1, + pv->desc_wfc, + 0.0, + tmp1, + 1, + 1, + pv->desc_wfc); + + ScalapackConnector::gemm('C', + 'N', + nband, + nband, + nlocal, + 1.0, + psi_k, + 1, + 1, + pv->desc_wfc, + tmp1, + 1, + 1, + pv->desc_wfc, + 0.0, + eij, + 1, + 1, + pv->desc_Eij); + + if (PARAM.inp.td_print_eij > 0.0) + { + ofs_running + << "------------------------------------------------------------------------------------------------" + << std::endl; + ofs_running << " Eij:" << std::endl; + for (int i = 0; i < pv->nrow_bands; i++) + { + const int in = i * pv->ncol; + for (int j = 0; j < pv->ncol_bands; j++) + { + double aa = eij[in + j].real(); + double bb = eij[in + j].imag(); + if (std::abs(aa) < PARAM.inp.td_print_eij) + { + aa = 0.0; + } + if (std::abs(bb) < PARAM.inp.td_print_eij) + { + bb = 0.0; + } + if (aa > 0.0 || bb > 0.0) + { + ofs_running << i << " " << j << " " << aa << "+" << bb << "i " << std::endl; + } + } + } + ofs_running << std::endl; + ofs_running + << "------------------------------------------------------------------------------------------------" + << std::endl; + } + + int info = 0; + int naroc[2] = {0, 0}; + + assert(nband > 0); + double* eii = new double[nband]; + ModuleBase::GlobalFunc::ZEROS(eii, nband); + + for (int iprow = 0; iprow < pv->dim0; ++iprow) + { + for (int ipcol = 0; ipcol < pv->dim1; ++ipcol) + { + if (iprow == pv->coord[0] && ipcol == pv->coord[1]) + { + naroc[0] = pv->nrow; + naroc[1] = pv->ncol; + for (int j = 0; j < naroc[1]; ++j) + { + int igcol = globalIndex(j, pv->nb, pv->dim1, ipcol); + if (igcol >= nband) + { + continue; + } + for (int i = 0; i < naroc[0]; ++i) + { + int igrow = globalIndex(i, pv->nb, pv->dim0, iprow); + if (igrow >= nband) + { + continue; + } + if (igcol == igrow) + { + eii[igcol] = eij[j * naroc[0] + i].real(); + } + } + } + } + } // loop ipcol + } // loop iprow + info = MPI_Allreduce(eii, ekb, nband, MPI_DOUBLE, MPI_SUM, pv->comm()); + + delete[] tmp1; + delete[] eij; + delete[] eii; +} + +void compute_ekb_tensor(const Parallel_Orbitals* pv, + const int nband, + const int nlocal, + const ct::Tensor& Htmp, + const ct::Tensor& psi_k, + ct::Tensor& ekb, + std::ofstream& ofs_running) +{ + assert(pv->nloc_wfc > 0 && pv->nloc > 0); + + // Create Tensor objects for temporary data + ct::Tensor tmp1(ct::DataType::DT_COMPLEX_DOUBLE, ct::DeviceType::CpuDevice, ct::TensorShape({pv->nloc_wfc})); + tmp1.zero(); + + ct::Tensor eij(ct::DataType::DT_COMPLEX_DOUBLE, ct::DeviceType::CpuDevice, ct::TensorShape({pv->nloc})); + eij.zero(); + + // Perform matrix multiplication: tmp1 = Htmp * psi_k + ScalapackConnector::gemm('N', + 'N', + nlocal, + nband, + nlocal, + 1.0, + Htmp.data>(), + 1, + 1, + pv->desc, + psi_k.data>(), + 1, + 1, + pv->desc_wfc, + 0.0, + tmp1.data>(), + 1, + 1, + pv->desc_wfc); + + // Perform matrix multiplication: eij = psi_k^dagger * tmp1 + ScalapackConnector::gemm('C', + 'N', + nband, + nband, + nlocal, + 1.0, + psi_k.data>(), + 1, + 1, + pv->desc_wfc, + tmp1.data>(), + 1, + 1, + pv->desc_wfc, + 0.0, + eij.data>(), + 1, + 1, + pv->desc_Eij); + + if (PARAM.inp.td_print_eij >= 0.0) + { + ofs_running + << "------------------------------------------------------------------------------------------------" + << std::endl; + ofs_running << " Eij:" << std::endl; + for (int i = 0; i < pv->nrow_bands; i++) + { + const int in = i * pv->ncol; + for (int j = 0; j < pv->ncol_bands; j++) + { + double aa = eij.data>()[in + j].real(); + double bb = eij.data>()[in + j].imag(); + if (std::abs(aa) < PARAM.inp.td_print_eij) + { + aa = 0.0; + } + if (std::abs(bb) < PARAM.inp.td_print_eij) + { + bb = 0.0; + } + if (aa > 0.0 || bb > 0.0) + { + ofs_running << i << " " << j << " " << aa << "+" << bb << "i " << std::endl; + } + } + } + ofs_running << std::endl; + ofs_running + << "------------------------------------------------------------------------------------------------" + << std::endl; + } + + int info = 0; + int naroc[2] = {0, 0}; + + // Create a Tensor for eii + assert(nband > 0); + ct::Tensor eii(ct::DataType::DT_DOUBLE, ct::DeviceType::CpuDevice, ct::TensorShape({nband})); + eii.zero(); + + for (int iprow = 0; iprow < pv->dim0; ++iprow) + { + for (int ipcol = 0; ipcol < pv->dim1; ++ipcol) + { + if (iprow == pv->coord[0] && ipcol == pv->coord[1]) + { + naroc[0] = pv->nrow; + naroc[1] = pv->ncol; + for (int j = 0; j < naroc[1]; ++j) + { + int igcol = globalIndex(j, pv->nb, pv->dim1, ipcol); + if (igcol >= nband) + { + continue; + } + for (int i = 0; i < naroc[0]; ++i) + { + int igrow = globalIndex(i, pv->nb, pv->dim0, iprow); + if (igrow >= nband) + { + continue; + } + if (igcol == igrow) + { + eii.data()[igcol] = eij.data>()[j * naroc[0] + i].real(); + } + } + } + } + } // loop ipcol + } // loop iprow + + // Perform MPI reduction to compute ekb + info = MPI_Allreduce(eii.data(), ekb.data(), nband, MPI_DOUBLE, MPI_SUM, pv->comm()); +} + +template +void compute_ekb_tensor_lapack(const Parallel_Orbitals* pv, + const int nband, + const int nlocal, + const ct::Tensor& Htmp, + const ct::Tensor& psi_k, + ct::Tensor& ekb, + std::ofstream& ofs_running) +{ + // ct_device_type = ct::DeviceType::CpuDevice or ct::DeviceType::GpuDevice + ct::DeviceType ct_device_type = ct::DeviceTypeToEnum::value; + // ct_Device = ct::DEVICE_CPU or ct::DEVICE_GPU + using ct_Device = typename ct::PsiToContainer::type; + + // Create Tensor objects for temporary data + ct::Tensor tmp1(ct::DataType::DT_COMPLEX_DOUBLE, + ct_device_type, + ct::TensorShape({nlocal * nband})); // tmp1 shape: nlocal * nband + tmp1.zero(); + + ct::Tensor eij(ct::DataType::DT_COMPLEX_DOUBLE, + ct_device_type, + ct::TensorShape({nlocal * nlocal})); // eij shape: nlocal * nlocal + // Why not use nband * nband ????? + eij.zero(); + + std::complex alpha = {1.0, 0.0}; + std::complex beta = {0.0, 0.0}; + + // Perform matrix multiplication: tmp1 = Htmp * psi_k + ct::kernels::blas_gemm, ct_Device>()('N', + 'N', + nlocal, + nband, + nlocal, + &alpha, + Htmp.data>(), + nlocal, // Leading dimension of Htmp + psi_k.data>(), + nlocal, // Leading dimension of psi_k + &beta, + tmp1.data>(), + nlocal); // Leading dimension of tmp1 + + // Perform matrix multiplication: eij = psi_k^dagger * tmp1 + ct::kernels::blas_gemm, ct_Device>()('C', + 'N', + nband, + nband, + nlocal, + &alpha, + psi_k.data>(), + nlocal, // Leading dimension of psi_k + tmp1.data>(), + nlocal, // Leading dimension of tmp1 + &beta, + eij.data>(), + nlocal); // Leading dimension of eij + + if (PARAM.inp.td_print_eij >= 0.0) + { + ct::Tensor eij_cpu = eij.to_device(); + + ofs_running + << "------------------------------------------------------------------------------------------------" + << std::endl; + ofs_running << " Eij:" << std::endl; + for (int i = 0; i < nband; i++) + { + const int in = i * nlocal; + for (int j = 0; j < nband; j++) + { + double aa = eij_cpu.data>()[in + j].real(); + double bb = eij_cpu.data>()[in + j].imag(); + if (std::abs(aa) < PARAM.inp.td_print_eij) + { + aa = 0.0; + } + if (std::abs(bb) < PARAM.inp.td_print_eij) + { + bb = 0.0; + } + if (aa > 0.0 || bb > 0.0) + { + ofs_running << i << " " << j << " " << aa << "+" << bb << "i " << std::endl; + } + } + } + ofs_running << std::endl; + ofs_running + << "------------------------------------------------------------------------------------------------" + << std::endl; + } + + // Extract diagonal elements of eij into ekb + if (ct_device_type == ct::DeviceType::GpuDevice) + { + // GPU implementation + for (int i = 0; i < nband; ++i) + { + base_device::memory::synchronize_memory_op()( + ekb.data() + i, + reinterpret_cast(eij.data>() + i * nlocal + i), + 1); + } + } + else + { + // CPU implementation + for (int i = 0; i < nband; ++i) + { + ekb.data()[i] = eij.data>()[i * nlocal + i].real(); + } + } +} + +// Explicit instantiation of template functions +template void compute_ekb_tensor_lapack(const Parallel_Orbitals* pv, + const int nband, + const int nlocal, + const ct::Tensor& Htmp, + const ct::Tensor& psi_k, + ct::Tensor& ekb, + std::ofstream& ofs_running); + +#if ((defined __CUDA) /* || (defined __ROCM) */) +template void compute_ekb_tensor_lapack(const Parallel_Orbitals* pv, + const int nband, + const int nlocal, + const ct::Tensor& Htmp, + const ct::Tensor& psi_k, + ct::Tensor& ekb, + std::ofstream& ofs_running); +#endif // __CUDA +#endif // __MPI + +} // namespace module_tddft diff --git a/source/module_hamilt_lcao/module_tddft/band_energy.h b/source/module_hamilt_lcao/module_tddft/band_energy.h new file mode 100644 index 0000000000..4e1ab9aa5b --- /dev/null +++ b/source/module_hamilt_lcao/module_tddft/band_energy.h @@ -0,0 +1,53 @@ +/** + * @file bandenegy.h + * @brief compute band energy ekb + * This file originally belonged to file LCAO_evolve.cpp + */ +#ifndef BANDENERGY_H +#define BANDENERGY_H + +#include "module_base/module_container/ATen/core/tensor.h" // ct::Tensor +#include "module_basis/module_ao/parallel_orbitals.h" + +#include + +namespace module_tddft +{ +#ifdef __MPI +/** + * @brief compute band energy ekb + * + * @param[in] pv information of parallel + * @param[in] nband number of bands + * @param[in] nlocal number of orbitals + * @param[in] Htmp Hamiltonian + * @param[in] psi_k psi of this step + * @param[out] ekb band energy + */ +void compute_ekb(const Parallel_Orbitals* pv, + const int nband, + const int nlocal, + const std::complex* Htmp, + const std::complex* psi_k, + double* ekb, + std::ofstream& ofs_running); + +void compute_ekb_tensor(const Parallel_Orbitals* pv, + const int nband, + const int nlocal, + const ct::Tensor& Htmp, + const ct::Tensor& psi_k, + ct::Tensor& ekb, + std::ofstream& ofs_running); + +template +void compute_ekb_tensor_lapack(const Parallel_Orbitals* pv, + const int nband, + const int nlocal, + const ct::Tensor& Htmp, + const ct::Tensor& psi_k, + ct::Tensor& ekb, + std::ofstream& ofs_running); +#endif // __MPI +} // namespace module_tddft +#endif diff --git a/source/module_hamilt_lcao/module_tddft/bandenergy.cpp b/source/module_hamilt_lcao/module_tddft/bandenergy.cpp deleted file mode 100644 index 37212a6606..0000000000 --- a/source/module_hamilt_lcao/module_tddft/bandenergy.cpp +++ /dev/null @@ -1,146 +0,0 @@ -#include "bandenergy.h" - -#include -#include - -#include "evolve_elec.h" -#include "module_base/lapack_connector.h" -#include "module_base/scalapack_connector.h" - -namespace module_tddft -{ -#ifdef __MPI - -inline int globalIndex(int localindex, int nblk, int nprocs, int myproc) -{ - int iblock, gIndex; - iblock = localindex / nblk; - gIndex = (iblock * nprocs + myproc) * nblk + localindex % nblk; - return gIndex; -} - -void compute_ekb(const Parallel_Orbitals* pv, - const int nband, - const int nlocal, - const std::complex* Htmp, - const std::complex* psi_k, - double* ekb) -{ - - std::complex* tmp1 = new std::complex[pv->nloc_wfc]; - ModuleBase::GlobalFunc::ZEROS(tmp1, pv->nloc_wfc); - - std::complex* Eij = new std::complex[pv->nloc]; - ModuleBase::GlobalFunc::ZEROS(Eij, pv->nloc); - - ScalapackConnector::gemm('N', - 'N', - nlocal, - nband, - nlocal, - 1.0, - Htmp, - 1, - 1, - pv->desc, - psi_k, - 1, - 1, - pv->desc_wfc, - 0.0, - tmp1, - 1, - 1, - pv->desc_wfc); - - ScalapackConnector::gemm('C', - 'N', - nband, - nband, - nlocal, - 1.0, - psi_k, - 1, - 1, - pv->desc_wfc, - tmp1, - 1, - 1, - pv->desc_wfc, - 0.0, - Eij, - 1, - 1, - pv->desc_Eij); - - if (Evolve_elec::td_print_eij > 0.0) - { - GlobalV::ofs_running - << "------------------------------------------------------------------------------------------------" - << std::endl; - GlobalV::ofs_running << " Eij:" << std::endl; - for (int i = 0; i < pv->nrow_bands; i++) - { - for (int j = 0; j < pv->ncol_bands; j++) - { - double aa, bb; - aa = Eij[i * pv->ncol + j].real(); - bb = Eij[i * pv->ncol + j].imag(); - if (std::abs(aa) < Evolve_elec::td_print_eij) - aa = 0.0; - if (std::abs(bb) < Evolve_elec::td_print_eij) - bb = 0.0; - if (aa > 0.0 || bb > 0.0) - { - GlobalV::ofs_running << i << " " << j << " " << aa << "+" << bb << "i " << std::endl; - } - } - } - GlobalV::ofs_running << std::endl; - GlobalV::ofs_running - << "------------------------------------------------------------------------------------------------" - << std::endl; - } - - int info; - int naroc[2]; - - double* Eii = new double[nband]; - ModuleBase::GlobalFunc::ZEROS(Eii, nband); - for (int iprow = 0; iprow < pv->dim0; ++iprow) - { - for (int ipcol = 0; ipcol < pv->dim1; ++ipcol) - { - if (iprow == pv->coord[0] && ipcol == pv->coord[1]) - { - naroc[0] = pv->nrow; - naroc[1] = pv->ncol; - for (int j = 0; j < naroc[1]; ++j) - { - int igcol = globalIndex(j, pv->nb, pv->dim1, ipcol); - if (igcol >= nband) - continue; - for (int i = 0; i < naroc[0]; ++i) - { - int igrow = globalIndex(i, pv->nb, pv->dim0, iprow); - if (igrow >= nband) - continue; - if (igcol == igrow) - { - Eii[igcol] = Eij[j * naroc[0] + i].real(); - } - } - } - } - } // loop ipcol - } // loop iprow - info = MPI_Allreduce(Eii, ekb, nband, MPI_DOUBLE, MPI_SUM, pv->comm()); - - delete[] tmp1; - delete[] Eij; - delete[] Eii; -} - -#endif - -} // namespace module_tddft diff --git a/source/module_hamilt_lcao/module_tddft/bandenergy.h b/source/module_hamilt_lcao/module_tddft/bandenergy.h deleted file mode 100644 index fb8c989d8b..0000000000 --- a/source/module_hamilt_lcao/module_tddft/bandenergy.h +++ /dev/null @@ -1,34 +0,0 @@ -/** - * @file bandenegy.h - * @brief compute band energy ekb - * This file originally belonged to file LCAO_evolve.cpp - */ -#ifndef BANDENERGY_H -#define BANDENERGY_H - -#include "module_basis/module_ao/parallel_orbitals.h" - -#include - -namespace module_tddft -{ -#ifdef __MPI -/** - * @brief compute band energy ekb - * - * @param[in] pv information of parallel - * @param[in] nband number of bands - * @param[in] nlocal number of orbitals - * @param[in] Htmp Hamiltonian - * @param[in] psi_k psi of this step - * @param[out] ekb band energy - */ -void compute_ekb(const Parallel_Orbitals* pv, - const int nband, - const int nlocal, - const std::complex* Htmp, - const std::complex* psi_k, - double* ekb); -#endif -} // namespace module_tddft -#endif diff --git a/source/module_hamilt_lcao/module_tddft/evolve_elec.cpp b/source/module_hamilt_lcao/module_tddft/evolve_elec.cpp index 7be680cfff..95b05872d2 100644 --- a/source/module_hamilt_lcao/module_tddft/evolve_elec.cpp +++ b/source/module_hamilt_lcao/module_tddft/evolve_elec.cpp @@ -10,40 +10,44 @@ namespace module_tddft { -Evolve_elec::Evolve_elec(){}; -Evolve_elec::~Evolve_elec(){}; +template +Evolve_elec::Evolve_elec(){}; +template +Evolve_elec::~Evolve_elec(){}; -double Evolve_elec::td_force_dt; -bool Evolve_elec::td_vext; -std::vector Evolve_elec::td_vext_dire_case; -bool Evolve_elec::out_dipole; -bool Evolve_elec::out_efield; -double Evolve_elec::td_print_eij; // the threshold to output Eij elements -int Evolve_elec::td_edm; // 0: new edm method 1: old edm method +template +ct::DeviceType Evolve_elec::ct_device_type = ct::DeviceTypeToEnum::value; // this routine only serves for TDDFT using LCAO basis set -void Evolve_elec::solve_psi(const int& istep, - const int nband, - const int nlocal, - hamilt::Hamilt>* phm, - Parallel_Orbitals& para_orb, - psi::Psi>* psi, - psi::Psi>* psi_laststep, - std::complex** Hk_laststep, - std::complex** Sk_laststep, - ModuleBase::matrix& ekb, - int htype, - int propagator, - const int& nks) +template +void Evolve_elec::solve_psi(const int& istep, + const int nband, + const int nlocal, + const int& nks, + hamilt::Hamilt>* phm, + Parallel_Orbitals& para_orb, + psi::Psi>* psi, + psi::Psi>* psi_laststep, + std::complex** Hk_laststep, + std::complex** Sk_laststep, + ModuleBase::matrix& ekb, + std::ofstream& ofs_running, + const int htype, + const int propagator, + const bool use_tensor, + const bool use_lapack) { ModuleBase::TITLE("Evolve_elec", "solve_psi"); ModuleBase::timer::tick("Evolve_elec", "solve_psi"); + // Control the print of matrix to running_md.log + const int print_matrix = 0; + for (int ik = 0; ik < nks; ik++) { phm->updateHk(ik); - ModuleBase::timer::tick("Efficience", "evolve_k"); + ModuleBase::timer::tick("Efficiency", "evolve_k"); psi->fix_k(ik); psi_laststep->fix_k(ik); if (htype == 0) @@ -58,31 +62,165 @@ void Evolve_elec::solve_psi(const int& istep, nullptr, &(ekb(ik, 0)), htype, - propagator); + propagator, + ofs_running, + print_matrix); } else if (htype == 1) { - evolve_psi(nband, - nlocal, - &(para_orb), - phm, - psi[0].get_pointer(), - psi_laststep[0].get_pointer(), - Hk_laststep[ik], - Sk_laststep[ik], - &(ekb(ik, 0)), - htype, - propagator); + if (!use_tensor) + { + evolve_psi(nband, + nlocal, + &(para_orb), + phm, + psi[0].get_pointer(), + psi_laststep[0].get_pointer(), + Hk_laststep[ik], + Sk_laststep[ik], + &(ekb(ik, 0)), + htype, + propagator, + ofs_running, + print_matrix); + // std::cout << "Print ekb: " << std::endl; + // ekb.print(std::cout); + } + else + { + const int len_psi_k_1 = use_lapack ? nband : psi->get_nbands(); + const int len_psi_k_2 = use_lapack ? nlocal : psi->get_nbasis(); + const int len_HS_laststep = use_lapack ? nlocal * nlocal : para_orb.nloc; + + // Create Tensor for psi_k, psi_k_laststep, H_laststep, S_laststep, ekb + ct::Tensor psi_k_tensor(ct::DataType::DT_COMPLEX_DOUBLE, + ct_device_type, + ct::TensorShape({len_psi_k_1, len_psi_k_2})); + ct::Tensor psi_k_laststep_tensor(ct::DataType::DT_COMPLEX_DOUBLE, + ct_device_type, + ct::TensorShape({len_psi_k_1, len_psi_k_2})); + ct::Tensor H_laststep_tensor(ct::DataType::DT_COMPLEX_DOUBLE, + ct_device_type, + ct::TensorShape({len_HS_laststep})); + ct::Tensor S_laststep_tensor(ct::DataType::DT_COMPLEX_DOUBLE, + ct_device_type, + ct::TensorShape({len_HS_laststep})); + ct::Tensor ekb_tensor(ct::DataType::DT_DOUBLE, ct_device_type, ct::TensorShape({nband})); + + // Global psi + ModuleESolver::Matrix_g> psi_g; + ModuleESolver::Matrix_g> psi_laststep_g; + + if (use_lapack) + { + // Need to gather the psi to the root process on CPU + // H_laststep and S_laststep are already gathered in esolver_ks_lcao_tddft.cpp +#ifdef __MPI + // Access the rank of the calling process in the communicator + int myid = 0; + int root_proc = 0; + MPI_Comm_rank(MPI_COMM_WORLD, &myid); + + // Gather psi to the root process + gatherPsi(myid, root_proc, psi[0].get_pointer(), para_orb, psi_g); + gatherPsi(myid, root_proc, psi_laststep[0].get_pointer(), para_orb, psi_laststep_g); + + // Syncronize data from CPU to Device + syncmem_complex_h2d_op()(psi_k_tensor.data>(), + psi_g.p.get(), + len_psi_k_1 * len_psi_k_2); + syncmem_complex_h2d_op()(psi_k_laststep_tensor.data>(), + psi_laststep_g.p.get(), + len_psi_k_1 * len_psi_k_2); +#endif + } + else + { + // Syncronize data from CPU to Device + syncmem_complex_h2d_op()(psi_k_tensor.data>(), + psi[0].get_pointer(), + len_psi_k_1 * len_psi_k_2); + syncmem_complex_h2d_op()(psi_k_laststep_tensor.data>(), + psi_laststep[0].get_pointer(), + len_psi_k_1 * len_psi_k_2); + } + + syncmem_complex_h2d_op()(H_laststep_tensor.data>(), + Hk_laststep[ik], + len_HS_laststep); + syncmem_complex_h2d_op()(S_laststep_tensor.data>(), + Sk_laststep[ik], + len_HS_laststep); + syncmem_double_h2d_op()(ekb_tensor.data(), &(ekb(ik, 0)), nband); + + evolve_psi_tensor(nband, + nlocal, + &(para_orb), + phm, + psi_k_tensor, + psi_k_laststep_tensor, + H_laststep_tensor, + S_laststep_tensor, + ekb_tensor, + htype, + propagator, + ofs_running, + print_matrix, + use_lapack); + + // Need to distribute global psi back to all processes + if (use_lapack) + { +#ifdef __MPI + // Syncronize data from Device to CPU + syncmem_complex_d2h_op()(psi_g.p.get(), + psi_k_tensor.data>(), + len_psi_k_1 * len_psi_k_2); + syncmem_complex_d2h_op()(psi_laststep_g.p.get(), + psi_k_laststep_tensor.data>(), + len_psi_k_1 * len_psi_k_2); + + // Distribute psi to all processes + distributePsi(para_orb, psi[0].get_pointer(), psi_g); + distributePsi(para_orb, psi_laststep[0].get_pointer(), psi_laststep_g); +#endif + } + else + { + // Syncronize data from Device to CPU + syncmem_complex_d2h_op()(psi[0].get_pointer(), + psi_k_tensor.data>(), + len_psi_k_1 * len_psi_k_2); + syncmem_complex_d2h_op()(psi_laststep[0].get_pointer(), + psi_k_laststep_tensor.data>(), + len_psi_k_1 * len_psi_k_2); + } + syncmem_complex_d2h_op()(Hk_laststep[ik], + H_laststep_tensor.data>(), + len_HS_laststep); + syncmem_complex_d2h_op()(Sk_laststep[ik], + S_laststep_tensor.data>(), + len_HS_laststep); + syncmem_double_d2h_op()(&(ekb(ik, 0)), ekb_tensor.data(), nband); + + // std::cout << "Print ekb tensor: " << std::endl; + // ekb.print(std::cout); + } } else { std::cout << "method of htype is wrong" << std::endl; } - ModuleBase::timer::tick("Efficience", "evolve_k"); + ModuleBase::timer::tick("Efficiency", "evolve_k"); } // end k ModuleBase::timer::tick("Evolve_elec", "solve_psi"); return; } + +template class Evolve_elec; +#if ((defined __CUDA) /* || (defined __ROCM) */) +template class Evolve_elec; +#endif } // namespace module_tddft \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_tddft/evolve_elec.h b/source/module_hamilt_lcao/module_tddft/evolve_elec.h index 71c4aebe43..9da86c9d90 100644 --- a/source/module_hamilt_lcao/module_tddft/evolve_elec.h +++ b/source/module_hamilt_lcao/module_tddft/evolve_elec.h @@ -3,6 +3,11 @@ #include "module_base/global_function.h" #include "module_base/global_variable.h" +#include "module_base/module_container/ATen/core/tensor.h" // ct::Tensor +#include "module_base/module_container/ATen/core/tensor_map.h" // TensorMap +#include "module_base/module_device/device.h" // base_device +#include "module_base/module_device/memory_op.h" // memory operations +#include "module_base/scalapack_connector.h" // Cpxgemr2d #include "module_esolver/esolver_ks_lcao.h" #include "module_esolver/esolver_ks_lcao_tddft.h" #include "module_hamilt_lcao/hamilt_lcaodft/hamilt_lcao.h" @@ -15,33 +20,140 @@ // k is the index for the points in the first Brillouin zone //----------------------------------------------------------- +//------------------------ Debugging utility function ------------------------// + +// Print the shape of a Tensor +inline void print_tensor_shape(const ct::Tensor& tensor, const std::string& name) +{ + std::cout << "Shape of " << name << ": ["; + for (int i = 0; i < tensor.shape().ndim(); ++i) + { + std::cout << tensor.shape().dim_size(i); + if (i < tensor.shape().ndim() - 1) + { + std::cout << ", "; + } + } + std::cout << "]" << std::endl; +} + +// Recursive print function +template +inline void print_tensor_data_recursive(const T* data, + const std::vector& shape, + const std::vector& strides, + int dim, + std::vector& indices, + const std::string& name) +{ + if (dim == shape.size()) + { + // Recursion base case: print data when reaching the innermost dimension + std::cout << name; + for (size_t i = 0; i < indices.size(); ++i) + { + std::cout << "[" << indices[i] << "]"; + } + std::cout << " = " << *data << std::endl; + return; + } + // Recursively process the current dimension + for (int64_t i = 0; i < shape[dim]; ++i) + { + indices[dim] = i; + print_tensor_data_recursive(data + i * strides[dim], shape, strides, dim + 1, indices, name); + } +} + +// Generic print function +template +inline void print_tensor_data(const ct::Tensor& tensor, const std::string& name) +{ + const std::vector& shape = tensor.shape().dims(); + const std::vector& strides = tensor.shape().strides(); + const T* data = tensor.data(); + std::vector indices(shape.size(), 0); + print_tensor_data_recursive(data, shape, strides, 0, indices, name); +} + +// Specialization for std::complex +template <> +inline void print_tensor_data>(const ct::Tensor& tensor, const std::string& name) +{ + const std::vector& shape = tensor.shape().dims(); + const std::vector& strides = tensor.shape().strides(); + const std::complex* data = tensor.data>(); + std::vector indices(shape.size(), 0); + print_tensor_data_recursive(data, shape, strides, 0, indices, name); +} + +//------------------------ Debugging utility function ------------------------// + namespace module_tddft { -class Evolve_elec +#ifdef __MPI +//------------------------ MPI gathering and distributing functions ------------------------// +template +void gatherPsi(const int myid, + const int root_proc, + T* psi_l, + const Parallel_Orbitals& para_orb, + ModuleESolver::Matrix_g& psi_g) +{ + const int* desc_psi = para_orb.desc_wfc; // Obtain the descriptor from Parallel_Orbitals + int ctxt = desc_psi[1]; // BLACS context + int nrows = desc_psi[2]; // Global matrix row number + int ncols = desc_psi[3]; // Global matrix column number + + if (myid == root_proc) + { + psi_g.p.reset(new T[nrows * ncols]); // No need to delete[] since it is a shared_ptr + } + else + { + psi_g.p.reset(new T[nrows * ncols]); // Placeholder for non-root processes + } + + // Set the descriptor of the global psi + psi_g.desc.reset(new int[9]{1, ctxt, nrows, ncols, nrows, ncols, 0, 0, nrows}); + psi_g.row = nrows; + psi_g.col = ncols; + + // Call the Cpxgemr2d function in ScaLAPACK to collect the matrix data + Cpxgemr2d(nrows, ncols, psi_l, 1, 1, const_cast(desc_psi), psi_g.p.get(), 1, 1, psi_g.desc.get(), ctxt); +} + +template +void distributePsi(const Parallel_Orbitals& para_orb, T* psi_l, const ModuleESolver::Matrix_g& psi_g) { + const int* desc_psi = para_orb.desc_wfc; // Obtain the descriptor from Parallel_Orbitals + int ctxt = desc_psi[1]; // BLACS context + int nrows = desc_psi[2]; // Global matrix row number + int ncols = desc_psi[3]; // Global matrix column number + + // Call the Cpxgemr2d function in ScaLAPACK to distribute the matrix data + Cpxgemr2d(nrows, ncols, psi_g.p.get(), 1, 1, psi_g.desc.get(), psi_l, 1, 1, const_cast(desc_psi), ctxt); +} +//------------------------ MPI gathering and distributing functions ------------------------// +#endif // __MPI - friend class ELEC_scf; +template +class Evolve_elec +{ friend class ModuleESolver::ESolver_KS_LCAO, double>; - friend class ModuleESolver::ESolver_KS_LCAO_TDDFT; + + // Template parameter is needed for the friend class declaration + friend class ModuleESolver::ESolver_KS_LCAO_TDDFT; public: Evolve_elec(); ~Evolve_elec(); - static double td_force_dt; - static bool td_vext; - static std::vector td_vext_dire_case; - // Output dipole, efield, current or not - static bool out_dipole; - static bool out_efield; - - static double td_print_eij; // the threshold to output Eij elements - static int td_edm; // 0: new edm method 1: old edm method - private: static void solve_psi(const int& istep, const int nband, const int nlocal, + const int& nks, hamilt::Hamilt>* phm, Parallel_Orbitals& para_orb, psi::Psi>* psi, @@ -49,9 +161,24 @@ class Evolve_elec std::complex** Hk_laststep, std::complex** Sk_laststep, ModuleBase::matrix& ekb, - int htype, - int propagator, - const int& nks); + std::ofstream& ofs_running, + const int htype, + const int propagator, + const bool use_tensor, + const bool use_lapack); + + // ct_device_type = ct::DeviceType::CpuDevice or ct::DeviceType::GpuDevice + static ct::DeviceType ct_device_type; + // ct_Device = ct::DEVICE_CPU or ct::DEVICE_GPU + using ct_Device = typename ct::PsiToContainer::type; + + // Memory operations + using syncmem_double_h2d_op = base_device::memory::synchronize_memory_op; + using syncmem_double_d2h_op = base_device::memory::synchronize_memory_op; + using syncmem_complex_h2d_op + = base_device::memory::synchronize_memory_op, Device, base_device::DEVICE_CPU>; + using syncmem_complex_d2h_op + = base_device::memory::synchronize_memory_op, base_device::DEVICE_CPU, Device>; }; } // namespace module_tddft #endif diff --git a/source/module_hamilt_lcao/module_tddft/evolve_psi.cpp b/source/module_hamilt_lcao/module_tddft/evolve_psi.cpp index e7ca154e9a..240ab5b273 100644 --- a/source/module_hamilt_lcao/module_tddft/evolve_psi.cpp +++ b/source/module_hamilt_lcao/module_tddft/evolve_psi.cpp @@ -1,9 +1,12 @@ #include "evolve_psi.h" -#include "bandenergy.h" +#include "band_energy.h" #include "middle_hamilt.h" #include "module_base/lapack_connector.h" +#include "module_base/module_container/ATen/kernels/blas.h" // cuBLAS handle +#include "module_base/module_container/ATen/kernels/lapack.h" // cuSOLVER handle #include "module_base/scalapack_connector.h" +#include "module_esolver/esolver_ks_lcao_tddft.h" // use gatherMatrix #include "module_hamilt_lcao/hamilt_lcaodft/hamilt_lcao.h" #include "module_hamilt_pw/hamilt_pwdft/global.h" #include "module_parameter/parameter.h" @@ -25,15 +28,18 @@ void evolve_psi(const int nband, std::complex* S_laststep, double* ekb, int htype, - int propagator) + int propagator, + std::ofstream& ofs_running, + const int print_matrix) { + ofs_running << " evolve_psi::start " << std::endl; + ModuleBase::TITLE("Evolve_psi", "evolve_psi"); time_t time_start = time(nullptr); - GlobalV::ofs_running << " Start Time : " << ctime(&time_start); + ofs_running << " Start Time : " << ctime(&time_start); #ifdef __MPI - int print_matrix = 0; hamilt::MatrixBlock> h_mat, s_mat; p_hamilt->matrix(h_mat, s_mat); @@ -59,7 +65,7 @@ void evolve_psi(const int nband, /// @output Htmp if (htype == 1 && propagator != 2) { - half_Hmatrix(pv, nband, nlocal, Htmp, Stmp, H_laststep, S_laststep, print_matrix); + half_Hmatrix(pv, nband, nlocal, Htmp, Stmp, H_laststep, S_laststep, ofs_running, print_matrix); } // (2)->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> @@ -68,28 +74,28 @@ void evolve_psi(const int nband, /// @input Stmp, Htmp, print_matrix /// @output U_operator Propagator prop(propagator, pv, PARAM.mdp.md_dt); - prop.compute_propagator(nlocal, Stmp, Htmp, H_laststep, U_operator, print_matrix); + prop.compute_propagator(nlocal, Stmp, Htmp, H_laststep, U_operator, ofs_running, print_matrix); // (3)->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> /// @brief apply U_operator to the wave function of the previous step for new wave function /// @input U_operator, psi_k_laststep, print_matrix /// @output psi_k - upsi(pv, nband, nlocal, U_operator, psi_k_laststep, psi_k, print_matrix); + upsi(pv, nband, nlocal, U_operator, psi_k_laststep, psi_k, ofs_running, print_matrix); // (4)->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> /// @brief normalize psi_k /// @input Stmp, psi_not_norm, psi_k, print_matrix /// @output psi_k - norm_psi(pv, nband, nlocal, Stmp, psi_k, print_matrix); + norm_psi(pv, nband, nlocal, Stmp, psi_k, ofs_running, print_matrix); // (5)->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> /// @brief compute ekb /// @input Htmp, psi_k /// @output ekb - compute_ekb(pv, nband, nlocal, Hold, psi_k, ekb); + compute_ekb(pv, nband, nlocal, Hold, psi_k, ekb, ofs_running); delete[] Stmp; delete[] Htmp; @@ -101,6 +107,233 @@ void evolve_psi(const int nband, time_t time_end = time(nullptr); ModuleBase::GlobalFunc::OUT_TIME("evolve(std::complex)", time_start, time_end); + ofs_running << " evolve_psi::end " << std::endl; + return; } + +template +void evolve_psi_tensor(const int nband, + const int nlocal, + const Parallel_Orbitals* pv, + hamilt::Hamilt>* p_hamilt, + ct::Tensor& psi_k, + ct::Tensor& psi_k_laststep, + ct::Tensor& H_laststep, + ct::Tensor& S_laststep, + ct::Tensor& ekb, + int htype, + int propagator, + std::ofstream& ofs_running, + const int print_matrix, + const bool use_lapack) +{ + // ct_device_type = ct::DeviceType::CpuDevice or ct::DeviceType::GpuDevice + ct::DeviceType ct_device_type = ct::DeviceTypeToEnum::value; + // ct_Device = ct::DEVICE_CPU or ct::DEVICE_GPU + using ct_Device = typename ct::PsiToContainer::type; + // Memory operations + using syncmem_complex_h2d_op + = base_device::memory::synchronize_memory_op, Device, base_device::DEVICE_CPU>; + +#if ((defined __CUDA) /* || (defined __ROCM) */) + // Initialize cuBLAS & cuSOLVER handle + ct::kernels::createGpuSolverHandle(); + ct::kernels::createGpuBlasHandle(); +#endif // __CUDA + + ofs_running << " evolve_psi_tensor::start " << std::endl; + + ModuleBase::TITLE("Evolve_psi", "evolve_psi"); + time_t time_start = time(nullptr); + ofs_running << " Start Time : " << ctime(&time_start); + +#ifdef __MPI + + hamilt::MatrixBlock> h_mat, s_mat; + p_hamilt->matrix(h_mat, s_mat); + + // Create Tensor objects for temporary data and sync from host to device + const int len_HS = use_lapack ? nlocal * nlocal : pv->nloc; + ct::Tensor Stmp(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({len_HS})); + ct::Tensor Htmp(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({len_HS})); + ct::Tensor Hold(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({len_HS})); + + if (use_lapack) + { + // Need to gather H and S matrix to root process here + int myid = 0; + int num_procs = 1; + MPI_Comm_rank(MPI_COMM_WORLD, &myid); + MPI_Comm_size(MPI_COMM_WORLD, &num_procs); + + ModuleESolver::Matrix_g> h_mat_g, s_mat_g; // Global matrix structure + + // Collect H matrix + ModuleESolver::gatherMatrix(myid, 0, h_mat, h_mat_g); + syncmem_complex_h2d_op()(Htmp.data>(), h_mat_g.p.get(), len_HS); + syncmem_complex_h2d_op()(Hold.data>(), h_mat_g.p.get(), len_HS); + + // Collect S matrix + ModuleESolver::gatherMatrix(myid, 0, s_mat, s_mat_g); + syncmem_complex_h2d_op()(Stmp.data>(), s_mat_g.p.get(), len_HS); + } + else + { + // Original code + syncmem_complex_h2d_op()(Stmp.data>(), s_mat.p, len_HS); + syncmem_complex_h2d_op()(Htmp.data>(), h_mat.p, len_HS); + syncmem_complex_h2d_op()(Hold.data>(), h_mat.p, len_HS); + } + + ct::Tensor U_operator(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({len_HS})); + U_operator.zero(); + + int myid = 0; + int root_proc = 0; + MPI_Comm_rank(MPI_COMM_WORLD, &myid); + + // (1)->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + + /// @brief compute H(t+dt/2) + /// @input H_laststep, Htmp, print_matrix + /// @output Htmp + if (htype == 1 && propagator != 2) + { + if (!use_lapack) + { + half_Hmatrix_tensor(pv, nband, nlocal, Htmp, Stmp, H_laststep, S_laststep, ofs_running, print_matrix); + } + else + { + if (myid == root_proc) + { + half_Hmatrix_tensor_lapack(pv, + nband, + nlocal, + Htmp, + Stmp, + H_laststep, + S_laststep, + ofs_running, + print_matrix); + } + } + } + + // (2)->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + + /// @brief compute U_operator + /// @input Stmp, Htmp, print_matrix + /// @output U_operator + Propagator prop(propagator, pv, PARAM.mdp.md_dt); + prop.compute_propagator_tensor(nlocal, + Stmp, + Htmp, + H_laststep, + U_operator, + ofs_running, + print_matrix, + use_lapack); + + // (3)->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + + /// @brief apply U_operator to the wave function of the previous step for new wave function + /// @input U_operator, psi_k_laststep, print_matrix + /// @output psi_k + if (!use_lapack) + { + upsi_tensor(pv, nband, nlocal, U_operator, psi_k_laststep, psi_k, ofs_running, print_matrix); + } + else + { + if (myid == root_proc) + { + upsi_tensor_lapack(pv, nband, nlocal, U_operator, psi_k_laststep, psi_k, ofs_running, print_matrix); + } + } + + // (4)->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + + /// @brief normalize psi_k + /// @input Stmp, psi_not_norm, psi_k, print_matrix + /// @output psi_k + if (!use_lapack) + { + norm_psi_tensor(pv, nband, nlocal, Stmp, psi_k, ofs_running, print_matrix); + } + else + { + if (myid == root_proc) + { + norm_psi_tensor_lapack(pv, nband, nlocal, Stmp, psi_k, ofs_running, print_matrix); + } + } + + // (5)->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + + /// @brief compute ekb + /// @input Htmp, psi_k + /// @output ekb + if (!use_lapack) + { + compute_ekb_tensor(pv, nband, nlocal, Hold, psi_k, ekb, ofs_running); + } + else + { + if (myid == root_proc) + { + compute_ekb_tensor_lapack(pv, nband, nlocal, Hold, psi_k, ekb, ofs_running); + } + } + +#endif // __MPI + + time_t time_end = time(nullptr); + ModuleBase::GlobalFunc::OUT_TIME("evolve(std::complex)", time_start, time_end); + + ofs_running << " evolve_psi_tensor::end " << std::endl; + +#if ((defined __CUDA) /* || (defined __ROCM) */) + // Destroy cuBLAS & cuSOLVER handle + ct::kernels::destroyGpuSolverHandle(); + ct::kernels::destroyGpuBlasHandle(); +#endif // __CUDA + + return; +} + +// Explicit instantiation of template functions +template void evolve_psi_tensor(const int nband, + const int nlocal, + const Parallel_Orbitals* pv, + hamilt::Hamilt>* p_hamilt, + ct::Tensor& psi_k, + ct::Tensor& psi_k_laststep, + ct::Tensor& H_laststep, + ct::Tensor& S_laststep, + ct::Tensor& ekb, + int htype, + int propagator, + std::ofstream& ofs_running, + const int print_matrix, + const bool use_lapack); + +#if ((defined __CUDA) /* || (defined __ROCM) */) +template void evolve_psi_tensor(const int nband, + const int nlocal, + const Parallel_Orbitals* pv, + hamilt::Hamilt>* p_hamilt, + ct::Tensor& psi_k, + ct::Tensor& psi_k_laststep, + ct::Tensor& H_laststep, + ct::Tensor& S_laststep, + ct::Tensor& ekb, + int htype, + int propagator, + std::ofstream& ofs_running, + const int print_matrix, + const bool use_lapack); +#endif // __CUDA + } // namespace module_tddft \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_tddft/evolve_psi.h b/source/module_hamilt_lcao/module_tddft/evolve_psi.h index 3746b81f53..44f0db888a 100644 --- a/source/module_hamilt_lcao/module_tddft/evolve_psi.h +++ b/source/module_hamilt_lcao/module_tddft/evolve_psi.h @@ -6,6 +6,8 @@ #ifndef ELEC_PSI_H #define ELEC_PSI_H +#include "module_base/module_container/ATen/core/tensor.h" // ct::Tensor +#include "module_base/module_container/ATen/core/tensor_map.h" // TensorMap #include "module_basis/module_ao/parallel_orbitals.h" #include "module_hamilt_lcao/hamilt_lcaodft/hamilt_lcao.h" @@ -21,7 +23,25 @@ void evolve_psi(const int nband, std::complex* S_laststep, double* ekb, int htype, - int propagator); -} + int propagator, + std::ofstream& ofs_running, + const int print_matrix); + +template +void evolve_psi_tensor(const int nband, + const int nlocal, + const Parallel_Orbitals* pv, + hamilt::Hamilt>* p_hamilt, + ct::Tensor& psi_k, + ct::Tensor& psi_k_laststep, + ct::Tensor& H_laststep, + ct::Tensor& S_laststep, + ct::Tensor& ekb, + int htype, + int propagator, + std::ofstream& ofs_running, + const int print_matrix, + const bool use_lapack); +} // namespace module_tddft #endif \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_tddft/middle_hamilt.cpp b/source/module_hamilt_lcao/module_tddft/middle_hamilt.cpp index b53ae51bdc..94b3dc3f5d 100644 --- a/source/module_hamilt_lcao/module_tddft/middle_hamilt.cpp +++ b/source/module_hamilt_lcao/module_tddft/middle_hamilt.cpp @@ -1,11 +1,13 @@ #include "middle_hamilt.h" -#include -#include - #include "module_base/lapack_connector.h" +#include "module_base/module_container/ATen/kernels/blas.h" +#include "module_base/module_device/memory_op.h" // memory operations #include "module_base/scalapack_connector.h" +#include +#include + namespace module_tddft { #ifdef __MPI @@ -17,34 +19,36 @@ void half_Hmatrix(const Parallel_Orbitals* pv, std::complex* Stmp, const std::complex* H_laststep, const std::complex* S_laststep, + std::ofstream& ofs_running, const int print_matrix) { if (print_matrix) { - GlobalV::ofs_running << std::setprecision(10); - GlobalV::ofs_running << std::endl; - GlobalV::ofs_running << " H(t+dt) :" << std::endl; + ofs_running << std::setprecision(10); + ofs_running << std::endl; + ofs_running << " H(t+dt) :" << std::endl; for (int i = 0; i < pv->nrow; i++) { + const int in = i * pv->ncol; for (int j = 0; j < pv->ncol; j++) { - GlobalV::ofs_running << Htmp[i * pv->ncol + j].real() << "+" << Htmp[i * pv->ncol + j].imag() << "i "; + ofs_running << Htmp[in + j].real() << "+" << Htmp[in + j].imag() << "i "; } - GlobalV::ofs_running << std::endl; + ofs_running << std::endl; } - GlobalV::ofs_running << std::endl; - GlobalV::ofs_running << std::endl; - GlobalV::ofs_running << " H(t):" << std::endl; + ofs_running << std::endl; + ofs_running << std::endl; + ofs_running << " H(t):" << std::endl; for (int i = 0; i < pv->nrow; i++) { + const int in = i * pv->ncol; for (int j = 0; j < pv->ncol; j++) { - GlobalV::ofs_running << H_laststep[i * pv->ncol + j].real() << "+" - << H_laststep[i * pv->ncol + j].imag() << "i "; + ofs_running << H_laststep[in + j].real() << "+" << H_laststep[in + j].imag() << "i "; } - GlobalV::ofs_running << std::endl; + ofs_running << std::endl; } - GlobalV::ofs_running << std::endl; + ofs_running << std::endl; } std::complex alpha = {0.5, 0.0}; @@ -54,18 +58,233 @@ void half_Hmatrix(const Parallel_Orbitals* pv, if (print_matrix) { - GlobalV::ofs_running << std::endl; - GlobalV::ofs_running << " H (t+dt/2) :" << std::endl; + ofs_running << std::endl; + ofs_running << " H (t+dt/2) :" << std::endl; + for (int i = 0; i < pv->nrow; i++) + { + const int in = i * pv->ncol; + for (int j = 0; j < pv->ncol; j++) + { + ofs_running << Htmp[in + j].real() << "+" << Htmp[in + j].imag() << "i "; + } + ofs_running << std::endl; + } + ofs_running << std::endl; + } +} + +void half_Hmatrix_tensor(const Parallel_Orbitals* pv, + const int nband, + const int nlocal, + ct::Tensor& Htmp, + ct::Tensor& Stmp, + const ct::Tensor& H_laststep, + const ct::Tensor& S_laststep, + std::ofstream& ofs_running, + const int print_matrix) +{ + if (print_matrix) + { + ofs_running << std::setprecision(10); + ofs_running << std::endl; + ofs_running << " H(t+dt) :" << std::endl; + for (int i = 0; i < pv->nrow; i++) + { + const int in = i * pv->ncol; + for (int j = 0; j < pv->ncol; j++) + { + ofs_running << Htmp.data>()[in + j].real() << "+" + << Htmp.data>()[in + j].imag() << "i "; + } + ofs_running << std::endl; + } + ofs_running << std::endl; + ofs_running << std::endl; + ofs_running << " H(t):" << std::endl; + for (int i = 0; i < pv->nrow; i++) + { + const int in = i * pv->ncol; + for (int j = 0; j < pv->ncol; j++) + { + ofs_running << H_laststep.data>()[in + j].real() << "+" + << H_laststep.data>()[in + j].imag() << "i "; + } + ofs_running << std::endl; + } + ofs_running << std::endl; + } + + std::complex alpha = {0.5, 0.0}; + std::complex beta = {0.5, 0.0}; + + // Perform the operation Htmp = alpha * H_laststep + beta * Htmp + ScalapackConnector::geadd('N', + nlocal, + nlocal, + alpha, + H_laststep.data>(), + 1, + 1, + pv->desc, + beta, + Htmp.data>(), + 1, + 1, + pv->desc); + + // Perform the operation Stmp = alpha * S_laststep + beta * Stmp + ScalapackConnector::geadd('N', + nlocal, + nlocal, + alpha, + S_laststep.data>(), + 1, + 1, + pv->desc, + beta, + Stmp.data>(), + 1, + 1, + pv->desc); + + if (print_matrix) + { + ofs_running << std::endl; + ofs_running << " H (t+dt/2) :" << std::endl; for (int i = 0; i < pv->nrow; i++) { + const int in = i * pv->ncol; for (int j = 0; j < pv->ncol; j++) { - GlobalV::ofs_running << Htmp[i * pv->ncol + j].real() << "+" << Htmp[i * pv->ncol + j].imag() << "i "; + ofs_running << Htmp.data>()[in + j].real() << "+" + << Htmp.data>()[in + j].imag() << "i "; + } + ofs_running << std::endl; + } + ofs_running << std::endl; + } +} + +template +void half_Hmatrix_tensor_lapack(const Parallel_Orbitals* pv, + const int nband, + const int nlocal, + ct::Tensor& Htmp, + ct::Tensor& Stmp, + const ct::Tensor& H_laststep, + const ct::Tensor& S_laststep, + std::ofstream& ofs_running, + const int print_matrix) +{ + // ct_device_type = ct::DeviceType::CpuDevice or ct::DeviceType::GpuDevice + ct::DeviceType ct_device_type = ct::DeviceTypeToEnum::value; + // ct_Device = ct::DEVICE_CPU or ct::DEVICE_GPU + using ct_Device = typename ct::PsiToContainer::type; + + if (print_matrix) + { + ct::Tensor Htmp_cpu = Htmp.to_device(); + ct::Tensor H_laststep_cpu = H_laststep.to_device(); + + ofs_running << std::setprecision(10); + ofs_running << std::endl; + ofs_running << " H(t+dt) :" << std::endl; + for (int i = 0; i < nlocal; i++) + { + const int in = i * nlocal; + for (int j = 0; j < nlocal; j++) + { + ofs_running << Htmp_cpu.data>()[in + j].real() << "+" + << Htmp_cpu.data>()[in + j].imag() << "i "; } - GlobalV::ofs_running << std::endl; + ofs_running << std::endl; } - GlobalV::ofs_running << std::endl; + ofs_running << std::endl; + ofs_running << std::endl; + ofs_running << " H(t):" << std::endl; + for (int i = 0; i < nlocal; i++) + { + const int in = i * nlocal; + for (int j = 0; j < nlocal; j++) + { + ofs_running << H_laststep_cpu.data>()[in + j].real() << "+" + << H_laststep_cpu.data>()[in + j].imag() << "i "; + } + ofs_running << std::endl; + } + ofs_running << std::endl; + } + + std::complex one_half = {0.5, 0.0}; + + // Perform the operation Htmp = one_half * H_laststep + one_half * Htmp + // Scale Htmp by one_half + ct::kernels::blas_scal, ct_Device>()(nlocal * nlocal, + &one_half, + Htmp.data>(), + 1); + // Htmp = one_half * H_laststep + Htmp + ct::kernels::blas_axpy, ct_Device>()(nlocal * nlocal, + &one_half, + H_laststep.data>(), + 1, + Htmp.data>(), + 1); + + // Perform the operation Stmp = one_half * S_laststep + one_half * Stmp + // Scale Stmp by one_half + ct::kernels::blas_scal, ct_Device>()(nlocal * nlocal, + &one_half, + Stmp.data>(), + 1); + // Stmp = one_half * S_laststep + Stmp + ct::kernels::blas_axpy, ct_Device>()(nlocal * nlocal, + &one_half, + S_laststep.data>(), + 1, + Stmp.data>(), + 1); + + if (print_matrix) + { + ct::Tensor Htmp_cpu = Htmp.to_device(); + + ofs_running << std::endl; + ofs_running << " H (t+dt/2) :" << std::endl; + for (int i = 0; i < nlocal; i++) + { + const int in = i * nlocal; + for (int j = 0; j < nlocal; j++) + { + ofs_running << Htmp_cpu.data>()[in + j].real() << "+" + << Htmp_cpu.data>()[in + j].imag() << "i "; + } + ofs_running << std::endl; + } + ofs_running << std::endl; } } -#endif + +// Explicit instantiation of template functions +template void half_Hmatrix_tensor_lapack(const Parallel_Orbitals* pv, + const int nband, + const int nlocal, + ct::Tensor& Htmp, + ct::Tensor& Stmp, + const ct::Tensor& H_laststep, + const ct::Tensor& S_laststep, + std::ofstream& ofs_running, + const int print_matrix); +#if ((defined __CUDA) /* || (defined __ROCM) */) +template void half_Hmatrix_tensor_lapack(const Parallel_Orbitals* pv, + const int nband, + const int nlocal, + ct::Tensor& Htmp, + ct::Tensor& Stmp, + const ct::Tensor& H_laststep, + const ct::Tensor& S_laststep, + std::ofstream& ofs_running, + const int print_matrix); +#endif // __CUDA +#endif // __MPI } // namespace module_tddft \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_tddft/middle_hamilt.h b/source/module_hamilt_lcao/module_tddft/middle_hamilt.h index ac230b6444..4b153e3166 100644 --- a/source/module_hamilt_lcao/module_tddft/middle_hamilt.h +++ b/source/module_hamilt_lcao/module_tddft/middle_hamilt.h @@ -6,7 +6,9 @@ #ifndef MIDDLE_HAMILT_H #define MIDDLE_HAMILT_H +#include "module_base/module_container/ATen/core/tensor.h" // ct::Tensor #include "module_basis/module_ao/parallel_orbitals.h" + #include namespace module_tddft @@ -30,8 +32,31 @@ void half_Hmatrix(const Parallel_Orbitals* pv, std::complex* Stmp, const std::complex* H_laststep, const std::complex* S_laststep, + std::ofstream& ofs_running, const int print_matrix); -#endif + +void half_Hmatrix_tensor(const Parallel_Orbitals* pv, + const int nband, + const int nlocal, + ct::Tensor& Htmp, + ct::Tensor& Stmp, + const ct::Tensor& H_laststep, + const ct::Tensor& S_laststep, + std::ofstream& ofs_running, + const int print_matrix); + +template +void half_Hmatrix_tensor_lapack(const Parallel_Orbitals* pv, + const int nband, + const int nlocal, + ct::Tensor& Htmp, + ct::Tensor& Stmp, + const ct::Tensor& H_laststep, + const ct::Tensor& S_laststep, + std::ofstream& ofs_running, + const int print_matrix); + +#endif // __MPI } // namespace module_tddft #endif diff --git a/source/module_hamilt_lcao/module_tddft/norm_psi.cpp b/source/module_hamilt_lcao/module_tddft/norm_psi.cpp index cf3698b3ee..bbd7ea8956 100644 --- a/source/module_hamilt_lcao/module_tddft/norm_psi.cpp +++ b/source/module_hamilt_lcao/module_tddft/norm_psi.cpp @@ -1,11 +1,12 @@ #include "norm_psi.h" -#include -#include - #include "module_base/lapack_connector.h" +#include "module_base/module_container/ATen/kernels/blas.h" #include "module_base/scalapack_connector.h" +#include +#include + namespace module_tddft { #ifdef __MPI @@ -23,8 +24,11 @@ void norm_psi(const Parallel_Orbitals* pv, const int nlocal, const std::complex* Stmp, std::complex* psi_k, + std::ofstream& ofs_running, const int print_matrix) { + assert(pv->nloc_wfc > 0 && pv->nloc > 0); + std::complex* tmp1 = new std::complex[pv->nloc_wfc]; ModuleBase::GlobalFunc::ZEROS(tmp1, pv->nloc_wfc); @@ -73,28 +77,30 @@ void norm_psi(const Parallel_Orbitals* pv, if (print_matrix) { - GlobalV::ofs_running << "original Cij :" << std::endl; + ofs_running << "original Cij :" << std::endl; for (int i = 0; i < pv->ncol; i++) { + const int in = i * pv->ncol; for (int j = 0; j < pv->nrow; j++) { - double aa, bb; - aa = Cij[i * pv->ncol + j].real(); - bb = Cij[i * pv->ncol + j].imag(); - if (std::abs(aa) < 1e-8) { + double aa = Cij[in + j].real(); + double bb = Cij[in + j].imag(); + if (std::abs(aa) < 1e-8) + { aa = 0.0; -} - if (std::abs(bb) < 1e-8) { + } + if (std::abs(bb) < 1e-8) + { bb = 0.0; -} - GlobalV::ofs_running << aa << "+" << bb << "i "; + } + ofs_running << aa << "+" << bb << "i "; } - GlobalV::ofs_running << std::endl; + ofs_running << std::endl; } - GlobalV::ofs_running << std::endl; + ofs_running << std::endl; } - int naroc[2]; // maximum number of row or column + int naroc[2] = {0, 0}; // maximum number of row or column for (int iprow = 0; iprow < pv->dim0; ++iprow) { @@ -107,15 +113,17 @@ void norm_psi(const Parallel_Orbitals* pv, for (int j = 0; j < naroc[1]; ++j) { int igcol = globalIndex(j, pv->nb, pv->dim1, ipcol); - if (igcol >= nband) { + if (igcol >= nband) + { continue; -} + } for (int i = 0; i < naroc[0]; ++i) { int igrow = globalIndex(i, pv->nb, pv->dim0, iprow); - if (igrow >= nband) { + if (igrow >= nband) + { continue; -} + } if (igcol == igrow) { Cij[j * naroc[0] + i] = {1.0 / sqrt(Cij[j * naroc[0] + i].real()), 0.0}; @@ -128,7 +136,7 @@ void norm_psi(const Parallel_Orbitals* pv, } } } // loop ipcol - } // loop iprow + } // loop iprow BlasConnector::copy(pv->nloc_wfc, psi_k, 1, tmp1, 1); @@ -154,60 +162,510 @@ void norm_psi(const Parallel_Orbitals* pv, if (print_matrix) { - GlobalV::ofs_running << " Cij:" << std::endl; + ofs_running << " Cij:" << std::endl; for (int i = 0; i < pv->ncol; i++) { + const int in = i * pv->ncol; for (int j = 0; j < pv->nrow; j++) { - GlobalV::ofs_running << Cij[i * pv->ncol + j].real() << "+" << Cij[i * pv->ncol + j].imag() << "i "; + ofs_running << Cij[in + j].real() << "+" << Cij[in + j].imag() << "i "; } - GlobalV::ofs_running << std::endl; + ofs_running << std::endl; } - GlobalV::ofs_running << std::endl; - GlobalV::ofs_running << std::endl; - GlobalV::ofs_running << " psi_k:" << std::endl; + ofs_running << std::endl; + ofs_running << std::endl; + ofs_running << " psi_k:" << std::endl; for (int i = 0; i < pv->ncol_bands; i++) { + const int in = i * pv->ncol; for (int j = 0; j < pv->ncol; j++) { - double aa, bb; - aa = psi_k[i * pv->ncol + j].real(); - bb = psi_k[i * pv->ncol + j].imag(); - if (std::abs(aa) < 1e-8) { + double aa = psi_k[in + j].real(); + double bb = psi_k[in + j].imag(); + if (std::abs(aa) < 1e-8) + { aa = 0.0; -} - if (std::abs(bb) < 1e-8) { + } + if (std::abs(bb) < 1e-8) + { + bb = 0.0; + } + ofs_running << aa << "+" << bb << "i "; + } + ofs_running << std::endl; + } + ofs_running << std::endl; + ofs_running << " psi_k before normalization:" << std::endl; + for (int i = 0; i < pv->ncol_bands; i++) + { + const int in = i * pv->ncol; + for (int j = 0; j < pv->ncol; j++) + { + double aa = tmp1[in + j].real(); + double bb = tmp1[in + j].imag(); + if (std::abs(aa) < 1e-8) + { + aa = 0.0; + } + if (std::abs(bb) < 1e-8) + { bb = 0.0; + } + ofs_running << aa << "+" << bb << "i "; + } + ofs_running << std::endl; + } + ofs_running << std::endl; + ofs_running << std::endl; + } + + delete[] tmp1; + delete[] Cij; } - GlobalV::ofs_running << aa << "+" << bb << "i "; + +void norm_psi_tensor(const Parallel_Orbitals* pv, + const int nband, + const int nlocal, + const ct::Tensor& Stmp, + ct::Tensor& psi_k, + std::ofstream& ofs_running, + const int print_matrix) +{ + assert(pv->nloc_wfc > 0 && pv->nloc > 0); + + // Create Tensor objects for temporary data + ct::Tensor tmp1(ct::DataType::DT_COMPLEX_DOUBLE, ct::DeviceType::CpuDevice, ct::TensorShape({pv->nloc_wfc})); + tmp1.zero(); + + ct::Tensor Cij(ct::DataType::DT_COMPLEX_DOUBLE, ct::DeviceType::CpuDevice, ct::TensorShape({pv->nloc})); + Cij.zero(); + + // Perform matrix multiplication: tmp1 = Stmp * psi_k + ScalapackConnector::gemm('N', + 'N', + nlocal, + nband, + nlocal, + 1.0, + Stmp.data>(), + 1, + 1, + pv->desc, + psi_k.data>(), + 1, + 1, + pv->desc_wfc, + 0.0, + tmp1.data>(), + 1, + 1, + pv->desc_wfc); + + // Perform matrix multiplication: Cij = psi_k^dagger * tmp1 + ScalapackConnector::gemm('C', + 'N', + nband, + nband, + nlocal, + 1.0, + psi_k.data>(), + 1, + 1, + pv->desc_wfc, + tmp1.data>(), + 1, + 1, + pv->desc_wfc, + 0.0, + Cij.data>(), + 1, + 1, + pv->desc_Eij); + + if (print_matrix) + { + ofs_running << "original Cij :" << std::endl; + for (int i = 0; i < pv->ncol; i++) + { + const int in = i * pv->ncol; + for (int j = 0; j < pv->nrow; j++) + { + double aa = Cij.data>()[in + j].real(); + double bb = Cij.data>()[in + j].imag(); + if (std::abs(aa) < 1e-8) + { + aa = 0.0; + } + if (std::abs(bb) < 1e-8) + { + bb = 0.0; + } + ofs_running << aa << "+" << bb << "i "; } - GlobalV::ofs_running << std::endl; + ofs_running << std::endl; } - GlobalV::ofs_running << std::endl; - GlobalV::ofs_running << " psi_k before normalization:" << std::endl; + ofs_running << std::endl; + } + + int naroc[2] = {0, 0}; // maximum number of row or column + + for (int iprow = 0; iprow < pv->dim0; ++iprow) + { + for (int ipcol = 0; ipcol < pv->dim1; ++ipcol) + { + if (iprow == pv->coord[0] && ipcol == pv->coord[1]) + { + naroc[0] = pv->nrow; + naroc[1] = pv->ncol; + for (int j = 0; j < naroc[1]; ++j) + { + int igcol = globalIndex(j, pv->nb, pv->dim1, ipcol); + if (igcol >= nband) + { + continue; + } + for (int i = 0; i < naroc[0]; ++i) + { + int igrow = globalIndex(i, pv->nb, pv->dim0, iprow); + if (igrow >= nband) + { + continue; + } + if (igcol == igrow) + { + Cij.data>()[j * naroc[0] + i] + = {1.0 / sqrt(Cij.data>()[j * naroc[0] + i].real()), 0.0}; + } + else + { + Cij.data>()[j * naroc[0] + i] = {0.0, 0.0}; + } + } + } + } + } // loop ipcol + } // loop iprow + + // Copy psi_k to tmp1 (using deep copy) + // tmp1.CopyFrom(psi_k); // Does not work because this will cause tmp1 and psi_k to share the same data + tmp1 = psi_k; // operator= overload for Tensor class + + // Perform matrix multiplication: psi_k = tmp1 * Cij + ScalapackConnector::gemm('N', + 'N', + nlocal, + nband, + nband, + 1.0, + tmp1.data>(), + 1, + 1, + pv->desc_wfc, + Cij.data>(), + 1, + 1, + pv->desc_Eij, + 0.0, + psi_k.data>(), + 1, + 1, + pv->desc_wfc); + + if (print_matrix) + { + ofs_running << " Cij:" << std::endl; for (int i = 0; i < pv->ncol; i++) { + const int in = i * pv->ncol; + for (int j = 0; j < pv->nrow; j++) + { + ofs_running << Cij.data>()[in + j].real() << "+" + << Cij.data>()[in + j].imag() << "i "; + } + ofs_running << std::endl; + } + ofs_running << std::endl; + ofs_running << std::endl; + ofs_running << " psi_k:" << std::endl; + for (int i = 0; i < pv->ncol_bands; i++) + { + const int in = i * pv->ncol; for (int j = 0; j < pv->ncol; j++) { - double aa, bb; - aa = tmp1[i * pv->ncol + j].real(); - bb = tmp1[i * pv->ncol + j].imag(); - if (std::abs(aa) < 1e-8) { + double aa = psi_k.data>()[in + j].real(); + double bb = psi_k.data>()[in + j].imag(); + if (std::abs(aa) < 1e-8) + { aa = 0.0; -} - if (std::abs(bb) < 1e-8) { + } + if (std::abs(bb) < 1e-8) + { + bb = 0.0; + } + ofs_running << aa << "+" << bb << "i "; + } + ofs_running << std::endl; + } + ofs_running << std::endl; + ofs_running << " psi_k before normalization:" << std::endl; + for (int i = 0; i < pv->ncol_bands; i++) + { + const int in = i * pv->ncol; + for (int j = 0; j < pv->ncol; j++) + { + double aa = tmp1.data>()[in + j].real(); + double bb = tmp1.data>()[in + j].imag(); + if (std::abs(aa) < 1e-8) + { + aa = 0.0; + } + if (std::abs(bb) < 1e-8) + { bb = 0.0; + } + ofs_running << aa << "+" << bb << "i "; + } + ofs_running << std::endl; + } + ofs_running << std::endl; + ofs_running << std::endl; + } } - GlobalV::ofs_running << aa << "+" << bb << "i "; + +template +void norm_psi_tensor_lapack(const Parallel_Orbitals* pv, + const int nband, + const int nlocal, + const ct::Tensor& Stmp, + ct::Tensor& psi_k, + std::ofstream& ofs_running, + const int print_matrix) +{ + // ct_device_type = ct::DeviceType::CpuDevice or ct::DeviceType::GpuDevice + ct::DeviceType ct_device_type = ct::DeviceTypeToEnum::value; + // ct_Device = ct::DEVICE_CPU or ct::DEVICE_GPU + using ct_Device = typename ct::PsiToContainer::type; + + // Create Tensor objects for temporary data + ct::Tensor tmp1( + ct::DataType::DT_COMPLEX_DOUBLE, + ct_device_type, + ct::TensorShape({nlocal * nband})); // tmp1 shape: nlocal * nband (under 2D block cyclic is pv->nloc_wfc) + tmp1.zero(); + + ct::Tensor Cij(ct::DataType::DT_COMPLEX_DOUBLE, + ct_device_type, + ct::TensorShape({nlocal * nlocal})); // Cij shape: nlocal * nlocal + Cij.zero(); + + std::complex alpha = {1.0, 0.0}; + std::complex beta = {0.0, 0.0}; + + // Perform matrix multiplication: tmp1 = Stmp * psi_k + ct::kernels::blas_gemm, ct_Device>()('N', + 'N', + nlocal, + nband, + nlocal, + &alpha, + Stmp.data>(), + nlocal, // Leading dimension of Stmp + psi_k.data>(), + nlocal, // Leading dimension of psi_k + &beta, + tmp1.data>(), + nlocal); // Leading dimension of tmp1 + + // Perform matrix multiplication: Cij = psi_k^dagger * tmp1 + ct::kernels::blas_gemm, ct_Device>()('C', + 'N', + nband, + nband, + nlocal, + &alpha, + psi_k.data>(), + nlocal, // Leading dimension of psi_k + tmp1.data>(), + nlocal, // Leading dimension of tmp1 + &beta, + Cij.data>(), + nlocal); // Leading dimension of Cij + + if (print_matrix) + { + ct::Tensor Cij_print_cpu = Cij.to_device(); + + ofs_running << "original Cij :" << std::endl; + for (int i = 0; i < nlocal; i++) + { + const int in = i * nlocal; + for (int j = 0; j < nlocal; j++) + { + double aa = Cij_print_cpu.data>()[in + j].real(); + double bb = Cij_print_cpu.data>()[in + j].imag(); + if (std::abs(aa) < 1e-8) + { + aa = 0.0; + } + if (std::abs(bb) < 1e-8) + { + bb = 0.0; + } + ofs_running << aa << "+" << bb << "i "; } - GlobalV::ofs_running << std::endl; + ofs_running << std::endl; } - GlobalV::ofs_running << std::endl; - GlobalV::ofs_running << std::endl; + ofs_running << std::endl; } - delete[] tmp1; - delete[] Cij; + // Normalize Cij: set diagonal elements to 1/sqrt(Cij[i][i]), off-diagonal elements to 0 + if (ct_device_type == ct::DeviceType::GpuDevice) + { + // Step 1: Copy Cij from GPU to CPU + ct::Tensor Cij_cpu = Cij.to_device(); + + // Step 2: Perform normalization on CPU + for (int i = 0; i < nband; ++i) + { + const int in = i * nlocal; + for (int j = 0; j < nband; ++j) + { + if (i == j) + { + Cij_cpu.data>()[in + j] + = {1.0 / sqrt(Cij_cpu.data>()[in + j].real()), 0.0}; + } + else + { + Cij_cpu.data>()[in + j] = {0.0, 0.0}; + } + } + } + + // Step 3: Copy normalized Cij back to GPU + Cij = Cij_cpu.to_device(); + } + else + { + // CPU implementation + for (int i = 0; i < nband; ++i) + { + const int in = i * nlocal; + for (int j = 0; j < nband; ++j) + { + if (i == j) + { + Cij.data>()[in + j] + = {1.0 / sqrt(Cij.data>()[in + j].real()), 0.0}; + } + else + { + Cij.data>()[in + j] = {0.0, 0.0}; + } + } + } + } + + // Copy psi_k to tmp1 (using deep copy) + // tmp1.CopyFrom(psi_k); // Does not work because this will cause tmp1 and psi_k to share the same data + tmp1 = psi_k; // operator= overload for Tensor class + + // Perform matrix multiplication: psi_k = tmp1 * Cij + ct::kernels::blas_gemm, ct_Device>()('N', + 'N', + nlocal, + nband, + nband, + &alpha, + tmp1.data>(), + nlocal, // Leading dimension of tmp1 + Cij.data>(), + nlocal, // Leading dimension of Cij + &beta, + psi_k.data>(), + nlocal); // Leading dimension of psi_k + + if (print_matrix) + { + ct::Tensor Cij_print_cpu = Cij.to_device(); + ct::Tensor psi_k_cpu = psi_k.to_device(); + ct::Tensor tmp1_cpu = tmp1.to_device(); + + ofs_running << " Cij:" << std::endl; + for (int i = 0; i < nlocal; i++) + { + const int in = i * nlocal; + for (int j = 0; j < nlocal; j++) + { + ofs_running << Cij_print_cpu.data>()[in + j].real() << "+" + << Cij_print_cpu.data>()[in + j].imag() << "i "; + } + ofs_running << std::endl; + } + ofs_running << std::endl; + ofs_running << std::endl; + ofs_running << " psi_k:" << std::endl; + for (int i = 0; i < nband; i++) + { + const int in = i * nlocal; + for (int j = 0; j < nlocal; j++) + { + double aa = psi_k_cpu.data>()[in + j].real(); + double bb = psi_k_cpu.data>()[in + j].imag(); + if (std::abs(aa) < 1e-8) + { + aa = 0.0; + } + if (std::abs(bb) < 1e-8) + { + bb = 0.0; + } + ofs_running << aa << "+" << bb << "i "; + } + ofs_running << std::endl; + } + ofs_running << std::endl; + ofs_running << " psi_k before normalization:" << std::endl; + for (int i = 0; i < nband; i++) + { + const int in = i * nlocal; + for (int j = 0; j < nlocal; j++) + { + double aa = tmp1_cpu.data>()[in + j].real(); + double bb = tmp1_cpu.data>()[in + j].imag(); + if (std::abs(aa) < 1e-8) + { + aa = 0.0; + } + if (std::abs(bb) < 1e-8) + { + bb = 0.0; + } + ofs_running << aa << "+" << bb << "i "; + } + ofs_running << std::endl; + } + ofs_running << std::endl; + ofs_running << std::endl; + } } -#endif + +// Explicit instantiation of template functions +template void norm_psi_tensor_lapack(const Parallel_Orbitals* pv, + const int nband, + const int nlocal, + const ct::Tensor& Stmp, + ct::Tensor& psi_k, + std::ofstream& ofs_running, + const int print_matrix); +#if ((defined __CUDA) /* || (defined __ROCM) */) +template void norm_psi_tensor_lapack(const Parallel_Orbitals* pv, + const int nband, + const int nlocal, + const ct::Tensor& Stmp, + ct::Tensor& psi_k, + std::ofstream& ofs_running, + const int print_matrix); +#endif // __CUDA +#endif // __MPI } // namespace module_tddft diff --git a/source/module_hamilt_lcao/module_tddft/norm_psi.h b/source/module_hamilt_lcao/module_tddft/norm_psi.h index d3c41978bb..6d3e455a3a 100644 --- a/source/module_hamilt_lcao/module_tddft/norm_psi.h +++ b/source/module_hamilt_lcao/module_tddft/norm_psi.h @@ -6,7 +6,9 @@ #ifndef NORM_PSI_H #define NORM_PSI_H +#include "module_base/module_container/ATen/core/tensor.h" // ct::Tensor #include "module_basis/module_ao/parallel_orbitals.h" + #include namespace module_tddft @@ -28,9 +30,27 @@ void norm_psi(const Parallel_Orbitals* pv, const int nlocal, const std::complex* Stmp, std::complex* psi_k, + std::ofstream& ofs_running, const int print_matrix); -#endif +void norm_psi_tensor(const Parallel_Orbitals* pv, + const int nband, + const int nlocal, + const ct::Tensor& Stmp, + ct::Tensor& psi_k, + std::ofstream& ofs_running, + const int print_matrix); + +template +void norm_psi_tensor_lapack(const Parallel_Orbitals* pv, + const int nband, + const int nlocal, + const ct::Tensor& Stmp, + ct::Tensor& psi_k, + std::ofstream& ofs_running, + const int print_matrix); + +#endif // __MPI } // namespace module_tddft #endif diff --git a/source/module_hamilt_lcao/module_tddft/propagator.cpp b/source/module_hamilt_lcao/module_tddft/propagator.cpp index 38299d5f08..154671176e 100644 --- a/source/module_hamilt_lcao/module_tddft/propagator.cpp +++ b/source/module_hamilt_lcao/module_tddft/propagator.cpp @@ -1,6 +1,10 @@ #include "propagator.h" #include "module_base/lapack_connector.h" +#include "module_base/module_container/ATen/kernels/blas.h" +#include "module_base/module_container/ATen/kernels/lapack.h" +#include "module_base/module_container/ATen/kernels/memory.h" // memory operations (Tensor) +#include "module_base/module_device/memory_op.h" // memory operations #include "module_base/scalapack_connector.h" #include "module_parameter/parameter.h" @@ -13,603 +17,91 @@ Propagator::~Propagator() { } #ifdef __MPI - -inline int globalIndex(int localindex, int nblk, int nprocs, int myproc) -{ - int iblock, gIndex; - iblock = localindex / nblk; - gIndex = (iblock * nprocs + myproc) * nblk + localindex % nblk; - return gIndex; -} - void Propagator::compute_propagator(const int nlocal, const std::complex* Stmp, const std::complex* Htmp, const std::complex* H_laststep, std::complex* U_operator, + std::ofstream& ofs_running, const int print_matrix) const { int tag; switch (ptype) { case 0: - compute_propagator_cn2(nlocal, Stmp, Htmp, U_operator, print_matrix); + compute_propagator_cn2(nlocal, Stmp, Htmp, U_operator, ofs_running, print_matrix); break; case 1: tag = 1; - compute_propagator_taylor(nlocal, Stmp, Htmp, U_operator, print_matrix, tag); + compute_propagator_taylor(nlocal, Stmp, Htmp, U_operator, ofs_running, print_matrix, tag); break; case 2: - compute_propagator_etrs(nlocal, Stmp, Htmp, H_laststep, U_operator, print_matrix); - + compute_propagator_etrs(nlocal, Stmp, Htmp, H_laststep, U_operator, ofs_running, print_matrix); break; default: - std::cout << "method of propagator is wrong" << std::endl; + ModuleBase::WARNING_QUIT("Propagator::compute_propagator", "Method of RT-TDDFT propagator is wrong!"); break; } } -void Propagator::compute_propagator_cn2(const int nlocal, - const std::complex* Stmp, - const std::complex* Htmp, - std::complex* U_operator, - const int print_matrix) const -{ - // (1) copy Htmp to Numerator & Denominator - std::complex* Numerator = new std::complex[this->ParaV->nloc]; - ModuleBase::GlobalFunc::ZEROS(Numerator, this->ParaV->nloc); - BlasConnector::copy(this->ParaV->nloc, Htmp, 1, Numerator, 1); - - std::complex* Denominator = new std::complex[this->ParaV->nloc]; - ModuleBase::GlobalFunc::ZEROS(Denominator, this->ParaV->nloc); - BlasConnector::copy(this->ParaV->nloc, Htmp, 1, Denominator, 1); - - if (print_matrix) - { - GlobalV::ofs_running << std::endl; - GlobalV::ofs_running << " S matrix :" << std::endl; - for (int i = 0; i < this->ParaV->nrow; i++) - { - for (int j = 0; j < this->ParaV->ncol; j++) - { - GlobalV::ofs_running << Stmp[i * this->ParaV->ncol + j].real() << "+" - << Stmp[i * this->ParaV->ncol + j].imag() << "i "; - } - GlobalV::ofs_running << std::endl; - } - GlobalV::ofs_running << std::endl; - GlobalV::ofs_running << std::endl; - GlobalV::ofs_running << " H matrix :" << std::endl; - for (int i = 0; i < this->ParaV->nrow; i++) - { - for (int j = 0; j < this->ParaV->ncol; j++) - { - GlobalV::ofs_running << Numerator[i * this->ParaV->ncol + j].real() << "+" - << Numerator[i * this->ParaV->ncol + j].imag() << "i "; - } - GlobalV::ofs_running << std::endl; - } - GlobalV::ofs_running << std::endl; - } - - // ->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> - // (2) compute Numerator & Denominator by GEADD - // Numerator = Stmp - i*para * Htmp; beta1 = - para = -0.25 * this->dt - // Denominator = Stmp + i*para * Htmp; beta2 = para = 0.25 * this->dt - std::complex alpha = {1.0, 0.0}; - std::complex beta1 = {0.0, -0.25 * this->dt}; - std::complex beta2 = {0.0, 0.25 * this->dt}; - - ScalapackConnector::geadd('N', - nlocal, - nlocal, - alpha, - Stmp, - 1, - 1, - this->ParaV->desc, - beta1, - Numerator, - 1, - 1, - this->ParaV->desc); - ScalapackConnector::geadd('N', - nlocal, - nlocal, - alpha, - Stmp, - 1, - 1, - this->ParaV->desc, - beta2, - Denominator, - 1, - 1, - this->ParaV->desc); - - if (print_matrix) - { - GlobalV::ofs_running << " beta=" << beta1 << std::endl; - GlobalV::ofs_running << " fenmu:" << std::endl; - for (int i = 0; i < this->ParaV->nrow; i++) - { - for (int j = 0; j < this->ParaV->ncol; j++) - { - GlobalV::ofs_running << Denominator[i * this->ParaV->ncol + j].real() << "+" - << Denominator[i * this->ParaV->ncol + j].imag() << "i "; - } - GlobalV::ofs_running << std::endl; - } - GlobalV::ofs_running << std::endl; - } - - //->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> - // (3) Next, invert Denominator - int* ipiv = new int[this->ParaV->nloc]; - int info = 0; - // (3.1) compute ipiv - ScalapackConnector::getrf(nlocal, nlocal, Denominator, 1, 1, this->ParaV->desc, ipiv, &info); - int lwork = -1; - int liwotk = -1; - std::vector> work(1, 0); - std::vector iwork(1, 0); - // (3.2) compute work - ScalapackConnector::getri(nlocal, - Denominator, - 1, - 1, - this->ParaV->desc, - ipiv, - work.data(), - &lwork, - iwork.data(), - &liwotk, - &info); - lwork = work[0].real(); - work.resize(lwork, 0); - liwotk = iwork[0]; - iwork.resize(liwotk, 0); - // (3.3) compute inverse matrix of Denominator - ScalapackConnector::getri(nlocal, - Denominator, - 1, - 1, - this->ParaV->desc, - ipiv, - work.data(), - &lwork, - iwork.data(), - &liwotk, - &info); - assert(0 == info); - - //->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> - - // (4) U_operator = Denominator * Numerator; - ScalapackConnector::gemm('N', - 'N', - nlocal, - nlocal, - nlocal, - 1.0, - Denominator, - 1, - 1, - this->ParaV->desc, - Numerator, - 1, - 1, - this->ParaV->desc, - 0.0, - U_operator, - 1, - 1, - this->ParaV->desc); - - if (print_matrix) - { - GlobalV::ofs_running << " fenmu^-1:" << std::endl; - for (int i = 0; i < this->ParaV->nrow; i++) - { - for (int j = 0; j < this->ParaV->ncol; j++) - { - GlobalV::ofs_running << Denominator[i * this->ParaV->ncol + j].real() << "+" - << Denominator[i * this->ParaV->ncol + j].imag() << "i "; - } - GlobalV::ofs_running << std::endl; - } - GlobalV::ofs_running << std::endl; - GlobalV::ofs_running << " fenzi:" << std::endl; - for (int i = 0; i < this->ParaV->nrow; i++) - { - for (int j = 0; j < this->ParaV->ncol; j++) - { - GlobalV::ofs_running << Numerator[i * this->ParaV->ncol + j].real() << "+" - << Numerator[i * this->ParaV->ncol + j].imag() << "i "; - } - GlobalV::ofs_running << std::endl; - } - GlobalV::ofs_running << std::endl; - GlobalV::ofs_running << " U operator:" << std::endl; - for (int i = 0; i < this->ParaV->nrow; i++) - { - for (int j = 0; j < this->ParaV->ncol; j++) - { - double aa, bb; - aa = U_operator[i * this->ParaV->ncol + j].real(); - bb = U_operator[i * this->ParaV->ncol + j].imag(); - if (std::abs(aa) < 1e-8) { - aa = 0.0; -} - if (std::abs(bb) < 1e-8) { - bb = 0.0; -} - GlobalV::ofs_running << aa << "+" << bb << "i "; - } - GlobalV::ofs_running << std::endl; - } - } - - delete[] Numerator; - delete[] Denominator; - delete[] ipiv; -} - -void Propagator::compute_propagator_taylor(const int nlocal, - const std::complex* Stmp, - const std::complex* Htmp, - std::complex* U_operator, +template +void Propagator::compute_propagator_tensor(const int nlocal, + const ct::Tensor& Stmp, + const ct::Tensor& Htmp, + const ct::Tensor& H_laststep, + ct::Tensor& U_operator, + std::ofstream& ofs_running, const int print_matrix, - const int tag) const + const bool use_lapack) const { - ModuleBase::GlobalFunc::ZEROS(U_operator, this->ParaV->nloc); - std::complex* A_matrix = new std::complex[this->ParaV->nloc]; - ModuleBase::GlobalFunc::ZEROS(A_matrix, this->ParaV->nloc); - std::complex* rank0 = new std::complex[this->ParaV->nloc]; - ModuleBase::GlobalFunc::ZEROS(rank0, this->ParaV->nloc); - std::complex* rank2 = new std::complex[this->ParaV->nloc]; - ModuleBase::GlobalFunc::ZEROS(rank2, this->ParaV->nloc); - std::complex* rank3 = new std::complex[this->ParaV->nloc]; - ModuleBase::GlobalFunc::ZEROS(rank3, this->ParaV->nloc); - std::complex* rank4 = new std::complex[this->ParaV->nloc]; - ModuleBase::GlobalFunc::ZEROS(rank4, this->ParaV->nloc); - std::complex* tmp1 = new std::complex[this->ParaV->nloc]; - ModuleBase::GlobalFunc::ZEROS(tmp1, this->ParaV->nloc); - std::complex* tmp2 = new std::complex[this->ParaV->nloc]; - ModuleBase::GlobalFunc::ZEROS(tmp2, this->ParaV->nloc); - std::complex* Sinv = new std::complex[this->ParaV->nloc]; - ModuleBase::GlobalFunc::ZEROS(Sinv, this->ParaV->nloc); - BlasConnector::copy(this->ParaV->nloc, Stmp, 1, Sinv, 1); - - if (print_matrix) + int tag; + switch (ptype) { - GlobalV::ofs_running << std::endl; - GlobalV::ofs_running << " S matrix :" << std::endl; - for (int i = 0; i < this->ParaV->nrow; i++) + case 0: + if (!use_lapack) { - for (int j = 0; j < this->ParaV->ncol; j++) - { - GlobalV::ofs_running << Stmp[i * this->ParaV->ncol + j].real() << "+" - << Stmp[i * this->ParaV->ncol + j].imag() << "i "; - } - GlobalV::ofs_running << std::endl; + compute_propagator_cn2_tensor(nlocal, Stmp, Htmp, U_operator, ofs_running, print_matrix); } - GlobalV::ofs_running << std::endl; - GlobalV::ofs_running << std::endl; - GlobalV::ofs_running << " H matrix :" << std::endl; - for (int i = 0; i < this->ParaV->nrow; i++) + else { - for (int j = 0; j < this->ParaV->ncol; j++) + int myid = 0; + int root_proc = 0; + MPI_Comm_rank(MPI_COMM_WORLD, &myid); + if (myid == root_proc) { - GlobalV::ofs_running << Htmp[i * this->ParaV->ncol + j].real() << "+" - << Htmp[i * this->ParaV->ncol + j].imag() << "i "; + compute_propagator_cn2_tensor_lapack(nlocal, Stmp, Htmp, U_operator, ofs_running, print_matrix); } - GlobalV::ofs_running << std::endl; } - GlobalV::ofs_running << std::endl; - } - - // set rank0 - int info; - int naroc[2]; // maximum number of row or column - - for (int iprow = 0; iprow < this->ParaV->dim0; ++iprow) - { - for (int ipcol = 0; ipcol < this->ParaV->dim1; ++ipcol) - { - if (iprow == ParaV->coord[0] && ipcol == ParaV->coord[1]) - { - naroc[0] = this->ParaV->nrow; - naroc[1] = this->ParaV->ncol; - for (int j = 0; j < naroc[1]; ++j) - { - int igcol = globalIndex(j, this->ParaV->nb, this->ParaV->dim1, ipcol); - if (igcol >= nlocal) { - continue; -} - for (int i = 0; i < naroc[0]; ++i) - { - int igrow = globalIndex(i, this->ParaV->nb, this->ParaV->dim0, iprow); - if (igrow >= nlocal) { - continue; -} - if (igcol == igrow) - { - rank0[j * naroc[0] + i] = {1.0, 0.0}; - } - else - { - rank0[j * naroc[0] + i] = {0.0, 0.0}; - } - } - } - } - } // loop ipcol - } // loop iprow - - std::complex beta = {0.0, -0.5 * this->dt / tag}; // for ETRS tag=2 , for taylor tag=1 - - //->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> - // invert Stmp - int* ipiv = new int[this->ParaV->nloc]; - // (3.1) compute ipiv - ScalapackConnector::getrf(nlocal, nlocal, Sinv, 1, 1, this->ParaV->desc, ipiv, &info); - int lwork = -1; - int liwotk = -1; - std::vector> work(1, 0); - std::vector iwork(1, 0); - // (3.2) compute work - ScalapackConnector::getri(nlocal, - Sinv, - 1, - 1, - this->ParaV->desc, - ipiv, - work.data(), - &lwork, - iwork.data(), - &liwotk, - &info); - lwork = work[0].real(); - work.resize(lwork, 0); - liwotk = iwork[0]; - iwork.resize(liwotk, 0); - ScalapackConnector::getri(nlocal, - Sinv, - 1, - 1, - this->ParaV->desc, - ipiv, - work.data(), - &lwork, - iwork.data(), - &liwotk, - &info); - assert(0 == info); - - // A_matrix = - idt S^-1 H ; - ScalapackConnector::gemm('N', - 'N', - nlocal, - nlocal, - nlocal, - beta, - Sinv, - 1, - 1, - this->ParaV->desc, - Htmp, - 1, - 1, - this->ParaV->desc, - 0.0, - U_operator, - 1, - 1, - this->ParaV->desc); - - // rank2 = A^2 ; - ScalapackConnector::gemm('N', - 'N', - nlocal, - nlocal, - nlocal, - 1.0, - U_operator, - 1, - 1, - this->ParaV->desc, - U_operator, - 1, - 1, - this->ParaV->desc, - 0.0, - rank2, - 1, - 1, - this->ParaV->desc); - - // rank3 = A^3 ; - ScalapackConnector::gemm('N', - 'N', - nlocal, - nlocal, - nlocal, - 1.0, - U_operator, - 1, - 1, - this->ParaV->desc, - rank2, - 1, - 1, - this->ParaV->desc, - 0.0, - rank3, - 1, - 1, - this->ParaV->desc); - - // rank4 = A^4 ; - ScalapackConnector::gemm('N', - 'N', - nlocal, - nlocal, - nlocal, - 1.0, - U_operator, - 1, - 1, - this->ParaV->desc, - rank3, - 1, - 1, - this->ParaV->desc, - 0.0, - rank4, - 1, - 1, - this->ParaV->desc); - - std::complex p1 = {1.0, 0.0}; - std::complex p2 = {1.0 / 2.0, 0.0}; - std::complex p3 = {1.0 / 6.0, 0.0}; - std::complex p4 = {1.0 / 24.0, 0.0}; - - ScalapackConnector::geadd('N', - nlocal, - nlocal, - p1, - rank0, - 1, - 1, - this->ParaV->desc, - p1, - U_operator, - 1, - 1, - this->ParaV->desc); - - ScalapackConnector::geadd('N', - nlocal, - nlocal, - p2, - rank2, - 1, - 1, - this->ParaV->desc, - p1, - U_operator, - 1, - 1, - this->ParaV->desc); - - ScalapackConnector::geadd('N', - nlocal, - nlocal, - p3, - rank3, - 1, - 1, - this->ParaV->desc, - p1, - U_operator, - 1, - 1, - this->ParaV->desc); - - ScalapackConnector::geadd('N', - nlocal, - nlocal, - p4, - rank4, - 1, - 1, - this->ParaV->desc, - p1, - U_operator, - 1, - 1, - this->ParaV->desc); + break; - if (print_matrix) - { - GlobalV::ofs_running << " A_matrix:" << std::endl; - for (int i = 0; i < this->ParaV->nrow; i++) - { - for (int j = 0; j < this->ParaV->ncol; j++) - { - GlobalV::ofs_running << A_matrix[i * this->ParaV->ncol + j].real() << "+" - << A_matrix[i * this->ParaV->ncol + j].imag() << "i "; - } - GlobalV::ofs_running << std::endl; - } - GlobalV::ofs_running << std::endl; - GlobalV::ofs_running << " U operator:" << std::endl; - for (int i = 0; i < this->ParaV->nrow; i++) - { - for (int j = 0; j < this->ParaV->ncol; j++) - { - double aa, bb; - aa = U_operator[i * this->ParaV->ncol + j].real(); - bb = U_operator[i * this->ParaV->ncol + j].imag(); - if (std::abs(aa) < 1e-8) { - aa = 0.0; -} - if (std::abs(bb) < 1e-8) { - bb = 0.0; -} - GlobalV::ofs_running << aa << "+" << bb << "i "; - } - GlobalV::ofs_running << std::endl; - } + default: + ModuleBase::WARNING_QUIT("Propagator::compute_propagator_tensor", + "The Tensor-based RT-TDDFT propagator currently supports Crank–Nicolson method only!"); + break; } - delete[] A_matrix; - delete[] rank0; - delete[] rank2; - delete[] rank3; - delete[] rank4; - delete[] tmp1; - delete[] tmp2; - delete[] Sinv; - delete[] ipiv; -} - -void Propagator::compute_propagator_etrs(const int nlocal, - const std::complex* Stmp, - const std::complex* Htmp, - const std::complex* H_laststep, - std::complex* U_operator, - const int print_matrix) const -{ - std::vector> U1(this->ParaV->nloc); - std::vector> U2(this->ParaV->nloc); - int tag = 2; - compute_propagator_taylor(nlocal, Stmp, Htmp, U1.data(), print_matrix, tag); - compute_propagator_taylor(nlocal, Stmp, H_laststep, U2.data(), print_matrix, tag); - ScalapackConnector::gemm('N', - 'N', - nlocal, - nlocal, - nlocal, - 1.0, - U1.data(), - 1, - 1, - this->ParaV->desc, - U2.data(), - 1, - 1, - this->ParaV->desc, - 0.0, - U_operator, - 1, - 1, - this->ParaV->desc); } -#endif +// Explicit instantiation of template functions +template void Propagator::compute_propagator_tensor(const int nlocal, + const ct::Tensor& Stmp, + const ct::Tensor& Htmp, + const ct::Tensor& H_laststep, + ct::Tensor& U_operator, + std::ofstream& ofs_running, + const int print_matrix, + const bool use_lapack) const; +#if ((defined __CUDA) /* || (defined __ROCM) */) +template void Propagator::compute_propagator_tensor(const int nlocal, + const ct::Tensor& Stmp, + const ct::Tensor& Htmp, + const ct::Tensor& H_laststep, + ct::Tensor& U_operator, + std::ofstream& ofs_running, + const int print_matrix, + const bool use_lapack) const; +#endif // __CUDA +#endif // __MPI } // namespace module_tddft diff --git a/source/module_hamilt_lcao/module_tddft/propagator.h b/source/module_hamilt_lcao/module_tddft/propagator.h index c827fc96b2..ca4bec2e66 100644 --- a/source/module_hamilt_lcao/module_tddft/propagator.h +++ b/source/module_hamilt_lcao/module_tddft/propagator.h @@ -7,12 +7,98 @@ #define PROPAGATOR_H #include "module_base/constants.h" +#include "module_base/module_container/ATen/core/tensor.h" // ct::Tensor #include "module_basis/module_ao/parallel_orbitals.h" #include namespace module_tddft { +//--------------------------------- Utility function ---------------------------------// +#ifdef __MPI +inline int globalIndex(int localindex, int nblk, int nprocs, int myproc) +{ + int iblock, gIndex; + iblock = localindex / nblk; + gIndex = (iblock * nprocs + myproc) * nblk + localindex % nblk; + return gIndex; +} +#endif // __MPI + +// Auxiliary function: process non-complex types, return value 1.0 +template +inline T init_value(typename std::enable_if>::value + && !std::is_same>::value>::type* = nullptr) +{ + return T(1.0); +} + +// Auxiliary function: process complex types, return value 1.0 + 0.0i +template +inline T init_value(typename std::enable_if>::value + || std::is_same>::value>::type* = nullptr) +{ + return T(1.0, 0.0); +} + +// Create an identity matrix of size n×n +template +ct::Tensor create_identity_matrix(const int n, ct::DeviceType device = ct::DeviceType::CpuDevice) +{ + // Choose the data type of the Tensor + ct::DataType data_type; + if (std::is_same::value) + { + data_type = ct::DataType::DT_FLOAT; + } + else if (std::is_same::value) + { + data_type = ct::DataType::DT_DOUBLE; + } + else if (std::is_same>::value) + { + data_type = ct::DataType::DT_COMPLEX; + } + else if (std::is_same>::value) + { + data_type = ct::DataType::DT_COMPLEX_DOUBLE; + } + else + { + static_assert(std::is_same::value || std::is_same::value + || std::is_same>::value + || std::is_same>::value, + "Unsupported data type!"); + } + + ct::Tensor tensor(data_type, device, ct::TensorShape({n, n})); + tensor.zero(); + + // Set the diagonal elements to 1 + if (device == ct::DeviceType::CpuDevice) + { + // For CPU, we can directly access the data + T* data_ptr = tensor.data(); + for (int i = 0; i < n; ++i) + { + data_ptr[i * n + i] = init_value(); + } + } + else if (device == ct::DeviceType::GpuDevice) + { + // For GPU, we need to use a kernel to set the diagonal elements + T* data_ptr = tensor.data(); + for (int i = 0; i < n; ++i) + { + T value = init_value(); + ct::kernels::set_memory()(data_ptr + i * n + i, value, 1); + } + } + + return tensor; +} +//--------------------------------- Utility function ---------------------------------// + class Propagator { public: @@ -40,8 +126,19 @@ class Propagator const std::complex* Htmp, const std::complex* H_laststep, std::complex* U_operator, + std::ofstream& ofs_running, const int print_matrix) const; -#endif + + template + void compute_propagator_tensor(const int nlocal, + const ct::Tensor& Stmp, + const ct::Tensor& Htmp, + const ct::Tensor& H_laststep, + ct::Tensor& U_operator, + std::ofstream& ofs_running, + const int print_matrix, + const bool use_lapack) const; +#endif // __MPI private: int ptype; // type of propagator @@ -63,8 +160,24 @@ class Propagator const std::complex* Stmp, const std::complex* Htmp, std::complex* U_operator, + std::ofstream& ofs_running, const int print_matrix) const; + void compute_propagator_cn2_tensor(const int nlocal, + const ct::Tensor& Stmp, + const ct::Tensor& Htmp, + ct::Tensor& U_operator, + std::ofstream& ofs_running, + const int print_matrix) const; + + template + void compute_propagator_cn2_tensor_lapack(const int nlocal, + const ct::Tensor& Stmp, + const ct::Tensor& Htmp, + ct::Tensor& U_operator, + std::ofstream& ofs_running, + const int print_matrix) const; + /** * @brief compute propagator of method 4th Taylor * @@ -79,6 +192,7 @@ class Propagator const std::complex* Stmp, const std::complex* Htmp, std::complex* U_operator, + std::ofstream& ofs_running, const int print_matrix, const int tag) const; @@ -97,8 +211,9 @@ class Propagator const std::complex* Htmp, const std::complex* H_laststep, std::complex* U_operator, + std::ofstream& ofs_running, const int print_matrix) const; -#endif +#endif // __MPI }; } // namespace module_tddft diff --git a/source/module_hamilt_lcao/module_tddft/propagator_cn2.cpp b/source/module_hamilt_lcao/module_tddft/propagator_cn2.cpp new file mode 100644 index 0000000000..d87cf3c593 --- /dev/null +++ b/source/module_hamilt_lcao/module_tddft/propagator_cn2.cpp @@ -0,0 +1,734 @@ +#include "module_base/lapack_connector.h" +#include "module_base/module_container/ATen/kernels/blas.h" +#include "module_base/module_container/ATen/kernels/lapack.h" +#include "module_base/module_container/ATen/kernels/memory.h" // memory operations (Tensor) +#include "module_base/module_device/memory_op.h" // memory operations +#include "module_base/scalapack_connector.h" +#include "module_parameter/parameter.h" +#include "propagator.h" + +#include +#include + +namespace module_tddft +{ +#ifdef __MPI +void Propagator::compute_propagator_cn2(const int nlocal, + const std::complex* Stmp, + const std::complex* Htmp, + std::complex* U_operator, + std::ofstream& ofs_running, + const int print_matrix) const +{ + assert(this->ParaV->nloc > 0); + + // (1) copy Htmp to Numerator & Denominator + std::complex* Numerator = new std::complex[this->ParaV->nloc]; + ModuleBase::GlobalFunc::ZEROS(Numerator, this->ParaV->nloc); + BlasConnector::copy(this->ParaV->nloc, Htmp, 1, Numerator, 1); + + std::complex* Denominator = new std::complex[this->ParaV->nloc]; + ModuleBase::GlobalFunc::ZEROS(Denominator, this->ParaV->nloc); + BlasConnector::copy(this->ParaV->nloc, Htmp, 1, Denominator, 1); + + if (print_matrix) + { + ofs_running << std::endl; + ofs_running << " S matrix :" << std::endl; + for (int i = 0; i < this->ParaV->nrow; i++) + { + const int in = i * this->ParaV->ncol; + for (int j = 0; j < this->ParaV->ncol; j++) + { + ofs_running << Stmp[in + j].real() << "+" << Stmp[in + j].imag() << "i "; + } + ofs_running << std::endl; + } + ofs_running << std::endl; + ofs_running << std::endl; + ofs_running << " H matrix :" << std::endl; + for (int i = 0; i < this->ParaV->nrow; i++) + { + const int in = i * this->ParaV->ncol; + for (int j = 0; j < this->ParaV->ncol; j++) + { + ofs_running << Numerator[in + j].real() << "+" << Numerator[in + j].imag() << "i "; + } + ofs_running << std::endl; + } + ofs_running << std::endl; + } + + // ->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + // (2) compute Numerator & Denominator by GEADD + // Numerator = Stmp - i*para * Htmp; beta1 = - para = -0.25 * this->dt + // Denominator = Stmp + i*para * Htmp; beta2 = para = 0.25 * this->dt + std::complex alpha = {1.0, 0.0}; + std::complex beta1 = {0.0, -0.25 * this->dt}; + std::complex beta2 = {0.0, 0.25 * this->dt}; + + ScalapackConnector::geadd('N', + nlocal, + nlocal, + alpha, + Stmp, + 1, + 1, + this->ParaV->desc, + beta1, + Numerator, + 1, + 1, + this->ParaV->desc); + ScalapackConnector::geadd('N', + nlocal, + nlocal, + alpha, + Stmp, + 1, + 1, + this->ParaV->desc, + beta2, + Denominator, + 1, + 1, + this->ParaV->desc); + + if (print_matrix) + { + ofs_running << " beta=" << beta1 << std::endl; + ofs_running << " fenmu:" << std::endl; + for (int i = 0; i < this->ParaV->nrow; i++) + { + const int in = i * this->ParaV->ncol; + for (int j = 0; j < this->ParaV->ncol; j++) + { + ofs_running << Denominator[in + j].real() << "+" << Denominator[in + j].imag() << "i "; + } + ofs_running << std::endl; + } + ofs_running << std::endl; + } + + //->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + // (3) Next, invert Denominator + // What is the size of ipiv exactly? Need to check ScaLAPACK documentation! + // But anyway, not this->ParaV->nloc + int* ipiv = new int[this->ParaV->nrow + this->ParaV->nb]; + ModuleBase::GlobalFunc::ZEROS(ipiv, this->ParaV->nrow + this->ParaV->nb); + int info = 0; + // (3.1) compute ipiv + ScalapackConnector::getrf(nlocal, nlocal, Denominator, 1, 1, this->ParaV->desc, ipiv, &info); + + // Print ipiv + if (print_matrix) + { + ofs_running << " this->ParaV->nloc = " << this->ParaV->nloc << std::endl; + ofs_running << " this->ParaV->nrow = " << this->ParaV->nrow << std::endl; + ofs_running << " this->ParaV->ncol = " << this->ParaV->ncol << std::endl; + ofs_running << " this->ParaV->nb = " << this->ParaV->nb << std::endl; + ofs_running << " this->ParaV->get_block_size() = " << this->ParaV->get_block_size() << std::endl; + ofs_running << " nlocal = " << nlocal << std::endl; + ofs_running << " ipiv:" << std::endl; + for (int i = 0; i < this->ParaV->nloc; i++) + { + ofs_running << ipiv[i] << " "; + } + ofs_running << std::endl; + } + + int lwork = -1; + int liwotk = -1; + std::vector> work(1, 0); + std::vector iwork(1, 0); + // (3.2) compute work + ScalapackConnector::getri(nlocal, + Denominator, + 1, + 1, + this->ParaV->desc, + ipiv, + work.data(), + &lwork, + iwork.data(), + &liwotk, + &info); + lwork = work[0].real(); + work.resize(lwork, 0); + liwotk = iwork[0]; + iwork.resize(liwotk, 0); + // (3.3) compute inverse matrix of Denominator + ScalapackConnector::getri(nlocal, + Denominator, + 1, + 1, + this->ParaV->desc, + ipiv, + work.data(), + &lwork, + iwork.data(), + &liwotk, + &info); + assert(0 == info); + + //->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + + // (4) U_operator = Denominator * Numerator; + ScalapackConnector::gemm('N', + 'N', + nlocal, + nlocal, + nlocal, + 1.0, + Denominator, + 1, + 1, + this->ParaV->desc, + Numerator, + 1, + 1, + this->ParaV->desc, + 0.0, + U_operator, + 1, + 1, + this->ParaV->desc); + + if (print_matrix) + { + ofs_running << " fenmu^-1:" << std::endl; + for (int i = 0; i < this->ParaV->nrow; i++) + { + const int in = i * this->ParaV->ncol; + for (int j = 0; j < this->ParaV->ncol; j++) + { + ofs_running << Denominator[in + j].real() << "+" << Denominator[in + j].imag() << "i "; + } + ofs_running << std::endl; + } + ofs_running << std::endl; + ofs_running << " fenzi:" << std::endl; + for (int i = 0; i < this->ParaV->nrow; i++) + { + const int in = i * this->ParaV->ncol; + for (int j = 0; j < this->ParaV->ncol; j++) + { + ofs_running << Numerator[in + j].real() << "+" << Numerator[in + j].imag() << "i "; + } + ofs_running << std::endl; + } + ofs_running << std::endl; + ofs_running << " U operator:" << std::endl; + for (int i = 0; i < this->ParaV->nrow; i++) + { + const int in = i * this->ParaV->ncol; + for (int j = 0; j < this->ParaV->ncol; j++) + { + double aa = U_operator[in + j].real(); + double bb = U_operator[in + j].imag(); + if (std::abs(aa) < 1e-8) + { + aa = 0.0; + } + if (std::abs(bb) < 1e-8) + { + bb = 0.0; + } + ofs_running << aa << "+" << bb << "i "; + } + ofs_running << std::endl; + } + } + + delete[] Numerator; + delete[] Denominator; + delete[] ipiv; +} + +void Propagator::compute_propagator_cn2_tensor(const int nlocal, + const ct::Tensor& Stmp, + const ct::Tensor& Htmp, + ct::Tensor& U_operator, + std::ofstream& ofs_running, + const int print_matrix) const +{ + // (1) copy Htmp to Numerator & Denominator + ct::Tensor Numerator(ct::DataType::DT_COMPLEX_DOUBLE, + ct::DeviceType::CpuDevice, + ct::TensorShape({this->ParaV->nloc})); + Numerator.zero(); + BlasConnector::copy(this->ParaV->nloc, + Htmp.data>(), + 1, + Numerator.data>(), + 1); + + ct::Tensor Denominator(ct::DataType::DT_COMPLEX_DOUBLE, + ct::DeviceType::CpuDevice, + ct::TensorShape({this->ParaV->nloc})); + Denominator.zero(); + BlasConnector::copy(this->ParaV->nloc, + Htmp.data>(), + 1, + Denominator.data>(), + 1); + + if (print_matrix) + { + ofs_running << std::endl; + ofs_running << " S matrix :" << std::endl; + for (int i = 0; i < this->ParaV->nrow; i++) + { + const int in = i * this->ParaV->ncol; + for (int j = 0; j < this->ParaV->ncol; j++) + { + ofs_running << Stmp.data>()[in + j].real() << "+" + << Stmp.data>()[in + j].imag() << "i "; + } + ofs_running << std::endl; + } + ofs_running << std::endl; + ofs_running << std::endl; + ofs_running << " H matrix :" << std::endl; + for (int i = 0; i < this->ParaV->nrow; i++) + { + const int in = i * this->ParaV->ncol; + for (int j = 0; j < this->ParaV->ncol; j++) + { + ofs_running << Numerator.data>()[in + j].real() << "+" + << Numerator.data>()[in + j].imag() << "i "; + } + ofs_running << std::endl; + } + ofs_running << std::endl; + } + + // ->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + // (2) compute Numerator & Denominator by GEADD + // Numerator = Stmp - i*para * Htmp; beta1 = - para = -0.25 * this->dt + // Denominator = Stmp + i*para * Htmp; beta2 = para = 0.25 * this->dt + std::complex alpha = {1.0, 0.0}; + std::complex beta1 = {0.0, -0.25 * this->dt}; + std::complex beta2 = {0.0, 0.25 * this->dt}; + + ScalapackConnector::geadd('N', + nlocal, + nlocal, + alpha, + Stmp.data>(), + 1, + 1, + this->ParaV->desc, + beta1, + Numerator.data>(), + 1, + 1, + this->ParaV->desc); + ScalapackConnector::geadd('N', + nlocal, + nlocal, + alpha, + Stmp.data>(), + 1, + 1, + this->ParaV->desc, + beta2, + Denominator.data>(), + 1, + 1, + this->ParaV->desc); + + if (print_matrix) + { + ofs_running << " beta=" << beta1 << std::endl; + ofs_running << " fenmu:" << std::endl; + for (int i = 0; i < this->ParaV->nrow; i++) + { + const int in = i * this->ParaV->ncol; + for (int j = 0; j < this->ParaV->ncol; j++) + { + ofs_running << Denominator.data>()[in + j].real() << "+" + << Denominator.data>()[in + j].imag() << "i "; + } + ofs_running << std::endl; + } + ofs_running << std::endl; + } + + //->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + // (3) Next, invert Denominator + ct::Tensor ipiv(ct::DataType::DT_INT, + ct::DeviceType::CpuDevice, + ct::TensorShape({this->ParaV->nrow + this->ParaV->nb})); + ipiv.zero(); + int info = 0; + // (3.1) compute ipiv + ScalapackConnector::getrf(nlocal, + nlocal, + Denominator.data>(), + 1, + 1, + this->ParaV->desc, + ipiv.data(), + &info); + + // Print ipiv + if (print_matrix) + { + ofs_running << " this->ParaV->nloc = " << this->ParaV->nloc << std::endl; + ofs_running << " this->ParaV->nrow = " << this->ParaV->nrow << std::endl; + ofs_running << " this->ParaV->ncol = " << this->ParaV->ncol << std::endl; + ofs_running << " this->ParaV->nb = " << this->ParaV->nb << std::endl; + ofs_running << " this->ParaV->get_block_size() = " << this->ParaV->get_block_size() << std::endl; + ofs_running << " nlocal = " << nlocal << std::endl; + ofs_running << " ipiv:" << std::endl; + for (int i = 0; i < this->ParaV->nloc; i++) + { + ofs_running << ipiv.data()[i] << " "; + } + ofs_running << std::endl; + } + + int lwork = -1; + int liwotk = -1; + ct::Tensor work(ct::DataType::DT_COMPLEX_DOUBLE, ct::DeviceType::CpuDevice, ct::TensorShape({1})); + ct::Tensor iwork(ct::DataType::DT_INT, ct::DeviceType::CpuDevice, ct::TensorShape({1})); + // (3.2) compute work + ScalapackConnector::getri(nlocal, + Denominator.data>(), + 1, + 1, + this->ParaV->desc, + ipiv.data(), + work.data>(), + &lwork, + iwork.data(), + &liwotk, + &info); + lwork = work.data>()[0].real(); + work.resize(ct::TensorShape({lwork})); + liwotk = iwork.data()[0]; + iwork.resize(ct::TensorShape({liwotk})); + // (3.3) compute inverse matrix of Denominator + ScalapackConnector::getri(nlocal, + Denominator.data>(), + 1, + 1, + this->ParaV->desc, + ipiv.data(), + work.data>(), + &lwork, + iwork.data(), + &liwotk, + &info); + assert(0 == info); + + //->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + + // (4) U_operator = Denominator * Numerator; + ScalapackConnector::gemm('N', + 'N', + nlocal, + nlocal, + nlocal, + 1.0, + Denominator.data>(), + 1, + 1, + this->ParaV->desc, + Numerator.data>(), + 1, + 1, + this->ParaV->desc, + 0.0, + U_operator.data>(), + 1, + 1, + this->ParaV->desc); + + if (print_matrix) + { + ofs_running << " fenmu^-1:" << std::endl; + for (int i = 0; i < this->ParaV->nrow; i++) + { + const int in = i * this->ParaV->ncol; + for (int j = 0; j < this->ParaV->ncol; j++) + { + ofs_running << Denominator.data>()[in + j].real() << "+" + << Denominator.data>()[in + j].imag() << "i "; + } + ofs_running << std::endl; + } + ofs_running << std::endl; + ofs_running << " fenzi:" << std::endl; + for (int i = 0; i < this->ParaV->nrow; i++) + { + const int in = i * this->ParaV->ncol; + for (int j = 0; j < this->ParaV->ncol; j++) + { + ofs_running << Numerator.data>()[in + j].real() << "+" + << Numerator.data>()[in + j].imag() << "i "; + } + ofs_running << std::endl; + } + ofs_running << std::endl; + ofs_running << " U operator:" << std::endl; + for (int i = 0; i < this->ParaV->nrow; i++) + { + const int in = i * this->ParaV->ncol; + for (int j = 0; j < this->ParaV->ncol; j++) + { + double aa = U_operator.data>()[in + j].real(); + double bb = U_operator.data>()[in + j].imag(); + if (std::abs(aa) < 1e-8) + { + aa = 0.0; + } + if (std::abs(bb) < 1e-8) + { + bb = 0.0; + } + ofs_running << aa << "+" << bb << "i "; + } + ofs_running << std::endl; + } + } +} + +template +void Propagator::compute_propagator_cn2_tensor_lapack(const int nlocal, + const ct::Tensor& Stmp, + const ct::Tensor& Htmp, + ct::Tensor& U_operator, + std::ofstream& ofs_running, + const int print_matrix) const +{ + // ct_device_type = ct::DeviceType::CpuDevice or ct::DeviceType::GpuDevice + ct::DeviceType ct_device_type = ct::DeviceTypeToEnum::value; + // ct_Device = ct::DEVICE_CPU or ct::DEVICE_GPU + using ct_Device = typename ct::PsiToContainer::type; + + // (1) copy Htmp to Numerator & Denominator + ct::Tensor Numerator(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({nlocal * nlocal})); + Numerator.zero(); + base_device::memory::synchronize_memory_op, Device, Device>()( + Numerator.data>(), + Htmp.data>(), + nlocal * nlocal); + + ct::Tensor Denominator(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({nlocal * nlocal})); + Denominator.zero(); + base_device::memory::synchronize_memory_op, Device, Device>()( + Denominator.data>(), + Htmp.data>(), + nlocal * nlocal); + + if (print_matrix) + { + ct::Tensor Stmp_cpu = Stmp.to_device(); + ct::Tensor Numerator_cpu = Numerator.to_device(); + + ofs_running << std::endl; + ofs_running << " S matrix :" << std::endl; + for (int i = 0; i < nlocal; i++) + { + const int in = i * nlocal; + for (int j = 0; j < nlocal; j++) + { + ofs_running << Stmp_cpu.data>()[in + j].real() << "+" + << Stmp_cpu.data>()[in + j].imag() << "i "; + } + ofs_running << std::endl; + } + ofs_running << std::endl; + ofs_running << std::endl; + ofs_running << " H matrix :" << std::endl; + for (int i = 0; i < nlocal; i++) + { + const int in = i * nlocal; + for (int j = 0; j < nlocal; j++) + { + ofs_running << Numerator_cpu.data>()[in + j].real() << "+" + << Numerator_cpu.data>()[in + j].imag() << "i "; + } + ofs_running << std::endl; + } + ofs_running << std::endl; + } + + // ->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + // (2) compute Numerator & Denominator by GEADD + // Numerator = Stmp - i*para * Htmp; beta1 = - para = -0.25 * this->dt + // Denominator = Stmp + i*para * Htmp; beta2 = para = 0.25 * this->dt + std::complex one = {1.0, 0.0}; + std::complex beta1 = {0.0, -0.25 * this->dt}; + std::complex beta2 = {0.0, 0.25 * this->dt}; + + // Numerator = -i*para * Htmp + ct::kernels::blas_scal, ct_Device>()(nlocal * nlocal, + &beta1, + Numerator.data>(), + 1); + // Numerator = Stmp + (-i*para * Htmp) + ct::kernels::blas_axpy, ct_Device>()(nlocal * nlocal, + &one, + Stmp.data>(), + 1, + Numerator.data>(), + 1); + // Denominator = i*para * Htmp + ct::kernels::blas_scal, ct_Device>()(nlocal * nlocal, + &beta2, + Denominator.data>(), + 1); + // Denominator = Stmp + (i*para * Htmp) + ct::kernels::blas_axpy, ct_Device>()(nlocal * nlocal, + &one, + Stmp.data>(), + 1, + Denominator.data>(), + 1); + + if (print_matrix) + { + ct::Tensor Denominator_cpu = Denominator.to_device(); + + ofs_running << " beta=" << beta1 << std::endl; + ofs_running << " fenmu:" << std::endl; + for (int i = 0; i < nlocal; i++) + { + const int in = i * nlocal; + for (int j = 0; j < nlocal; j++) + { + ofs_running << Denominator_cpu.data>()[in + j].real() << "+" + << Denominator_cpu.data>()[in + j].imag() << "i "; + } + ofs_running << std::endl; + } + ofs_running << std::endl; + } + + //->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + // (3) Next, invert Denominator + ct::Tensor ipiv(ct::DataType::DT_INT, ct_device_type, ct::TensorShape({nlocal})); + ipiv.zero(); + // (3.1) compute ipiv + ct::kernels::lapack_getrf, ct_Device>()(nlocal, + nlocal, + Denominator.data>(), + nlocal, + ipiv.data()); + + // Print ipiv + if (print_matrix) + { + ct::Tensor ipiv_cpu = ipiv.to_device(); + + ofs_running << " ipiv:" << std::endl; + for (int i = 0; i < nlocal; i++) + { + ofs_running << ipiv_cpu.data()[i] << " "; + } + ofs_running << std::endl; + } + + // (3.2) compute inverse matrix of Denominator + ct::Tensor Denominator_inv = create_identity_matrix>(nlocal, ct_device_type); + ct::kernels::lapack_getrs, ct_Device>()('N', + nlocal, + nlocal, + Denominator.data>(), + nlocal, + ipiv.data(), + Denominator_inv.data>(), + nlocal); + + //->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + + // (4) U_operator = Denominator_inv * Numerator; + std::complex one_gemm = {1.0, 0.0}; + std::complex zero_gemm = {0.0, 0.0}; + ct::kernels::blas_gemm, ct_Device>()('N', + 'N', + nlocal, + nlocal, + nlocal, + &one_gemm, + Denominator_inv.data>(), + nlocal, + Numerator.data>(), + nlocal, + &zero_gemm, + U_operator.data>(), + nlocal); + + if (print_matrix) + { + ct::Tensor Denominator_inv_cpu = Denominator_inv.to_device(); + ct::Tensor Numerator_cpu = Numerator.to_device(); + ct::Tensor U_operator_cpu = U_operator.to_device(); + + ofs_running << " fenmu^-1:" << std::endl; + for (int i = 0; i < nlocal; i++) + { + const int in = i * nlocal; + for (int j = 0; j < nlocal; j++) + { + ofs_running << Denominator_inv_cpu.data>()[in + j].real() << "+" + << Denominator_inv_cpu.data>()[in + j].imag() << "i "; + } + ofs_running << std::endl; + } + ofs_running << std::endl; + ofs_running << " fenzi:" << std::endl; + for (int i = 0; i < nlocal; i++) + { + const int in = i * nlocal; + for (int j = 0; j < nlocal; j++) + { + ofs_running << Numerator_cpu.data>()[in + j].real() << "+" + << Numerator_cpu.data>()[in + j].imag() << "i "; + } + ofs_running << std::endl; + } + ofs_running << std::endl; + ofs_running << " U operator:" << std::endl; + for (int i = 0; i < nlocal; i++) + { + const int in = i * nlocal; + for (int j = 0; j < nlocal; j++) + { + double aa = U_operator_cpu.data>()[in + j].real(); + double bb = U_operator_cpu.data>()[in + j].imag(); + if (std::abs(aa) < 1e-8) + { + aa = 0.0; + } + if (std::abs(bb) < 1e-8) + { + bb = 0.0; + } + ofs_running << aa << "+" << bb << "i "; + } + ofs_running << std::endl; + } + } +} + +// Explicit instantiation of template functions +template void Propagator::compute_propagator_cn2_tensor_lapack(const int nlocal, + const ct::Tensor& Stmp, + const ct::Tensor& Htmp, + ct::Tensor& U_operator, + std::ofstream& ofs_running, + const int print_matrix) const; +#if ((defined __CUDA) /* || (defined __ROCM) */) +template void Propagator::compute_propagator_cn2_tensor_lapack(const int nlocal, + const ct::Tensor& Stmp, + const ct::Tensor& Htmp, + ct::Tensor& U_operator, + std::ofstream& ofs_running, + const int print_matrix) const; +#endif // __CUDA +#endif // __MPI +} // namespace module_tddft diff --git a/source/module_hamilt_lcao/module_tddft/propagator_etrs.cpp b/source/module_hamilt_lcao/module_tddft/propagator_etrs.cpp new file mode 100644 index 0000000000..ad37264d59 --- /dev/null +++ b/source/module_hamilt_lcao/module_tddft/propagator_etrs.cpp @@ -0,0 +1,50 @@ +#include "module_base/lapack_connector.h" +#include "module_base/module_container/ATen/kernels/blas.h" +#include "module_base/module_container/ATen/kernels/lapack.h" +#include "module_base/module_container/ATen/kernels/memory.h" // memory operations (Tensor) +#include "module_base/module_device/memory_op.h" // memory operations +#include "module_base/scalapack_connector.h" +#include "module_parameter/parameter.h" +#include "propagator.h" + +#include +#include + +namespace module_tddft +{ +#ifdef __MPI +void Propagator::compute_propagator_etrs(const int nlocal, + const std::complex* Stmp, + const std::complex* Htmp, + const std::complex* H_laststep, + std::complex* U_operator, + std::ofstream& ofs_running, + const int print_matrix) const +{ + std::vector> U1(this->ParaV->nloc); + std::vector> U2(this->ParaV->nloc); + int tag = 2; + compute_propagator_taylor(nlocal, Stmp, Htmp, U1.data(), ofs_running, print_matrix, tag); + compute_propagator_taylor(nlocal, Stmp, H_laststep, U2.data(), ofs_running, print_matrix, tag); + ScalapackConnector::gemm('N', + 'N', + nlocal, + nlocal, + nlocal, + 1.0, + U1.data(), + 1, + 1, + this->ParaV->desc, + U2.data(), + 1, + 1, + this->ParaV->desc, + 0.0, + U_operator, + 1, + 1, + this->ParaV->desc); +} +#endif // __MPI +} // namespace module_tddft diff --git a/source/module_hamilt_lcao/module_tddft/propagator_taylor.cpp b/source/module_hamilt_lcao/module_tddft/propagator_taylor.cpp new file mode 100644 index 0000000000..58567a900a --- /dev/null +++ b/source/module_hamilt_lcao/module_tddft/propagator_taylor.cpp @@ -0,0 +1,343 @@ +#include "module_base/lapack_connector.h" +#include "module_base/module_container/ATen/kernels/blas.h" +#include "module_base/module_container/ATen/kernels/lapack.h" +#include "module_base/module_container/ATen/kernels/memory.h" // memory operations (Tensor) +#include "module_base/module_device/memory_op.h" // memory operations +#include "module_base/scalapack_connector.h" +#include "module_parameter/parameter.h" +#include "propagator.h" + +#include +#include + +namespace module_tddft +{ +#ifdef __MPI +void Propagator::compute_propagator_taylor(const int nlocal, + const std::complex* Stmp, + const std::complex* Htmp, + std::complex* U_operator, + std::ofstream& ofs_running, + const int print_matrix, + const int tag) const +{ + assert(this->ParaV->nloc > 0); + + ModuleBase::GlobalFunc::ZEROS(U_operator, this->ParaV->nloc); + std::complex* A_matrix = new std::complex[this->ParaV->nloc]; + ModuleBase::GlobalFunc::ZEROS(A_matrix, this->ParaV->nloc); + std::complex* rank0 = new std::complex[this->ParaV->nloc]; + ModuleBase::GlobalFunc::ZEROS(rank0, this->ParaV->nloc); + std::complex* rank2 = new std::complex[this->ParaV->nloc]; + ModuleBase::GlobalFunc::ZEROS(rank2, this->ParaV->nloc); + std::complex* rank3 = new std::complex[this->ParaV->nloc]; + ModuleBase::GlobalFunc::ZEROS(rank3, this->ParaV->nloc); + std::complex* rank4 = new std::complex[this->ParaV->nloc]; + ModuleBase::GlobalFunc::ZEROS(rank4, this->ParaV->nloc); + std::complex* tmp1 = new std::complex[this->ParaV->nloc]; + ModuleBase::GlobalFunc::ZEROS(tmp1, this->ParaV->nloc); + std::complex* tmp2 = new std::complex[this->ParaV->nloc]; + ModuleBase::GlobalFunc::ZEROS(tmp2, this->ParaV->nloc); + std::complex* Sinv = new std::complex[this->ParaV->nloc]; + ModuleBase::GlobalFunc::ZEROS(Sinv, this->ParaV->nloc); + BlasConnector::copy(this->ParaV->nloc, Stmp, 1, Sinv, 1); + + if (print_matrix) + { + ofs_running << std::endl; + ofs_running << " S matrix :" << std::endl; + for (int i = 0; i < this->ParaV->nrow; i++) + { + const int in = i * this->ParaV->ncol; + for (int j = 0; j < this->ParaV->ncol; j++) + { + ofs_running << Stmp[in + j].real() << "+" << Stmp[in + j].imag() << "i "; + } + ofs_running << std::endl; + } + ofs_running << std::endl; + ofs_running << std::endl; + ofs_running << " H matrix :" << std::endl; + for (int i = 0; i < this->ParaV->nrow; i++) + { + const int in = i * this->ParaV->ncol; + for (int j = 0; j < this->ParaV->ncol; j++) + { + ofs_running << Htmp[in + j].real() << "+" << Htmp[in + j].imag() << "i "; + } + ofs_running << std::endl; + } + ofs_running << std::endl; + } + + // set rank0 + int info = 0; + int naroc[2] = {0, 0}; // maximum number of row or column + + for (int iprow = 0; iprow < this->ParaV->dim0; ++iprow) + { + for (int ipcol = 0; ipcol < this->ParaV->dim1; ++ipcol) + { + if (iprow == ParaV->coord[0] && ipcol == ParaV->coord[1]) + { + naroc[0] = this->ParaV->nrow; + naroc[1] = this->ParaV->ncol; + for (int j = 0; j < naroc[1]; ++j) + { + int igcol = globalIndex(j, this->ParaV->nb, this->ParaV->dim1, ipcol); + if (igcol >= nlocal) + { + continue; + } + for (int i = 0; i < naroc[0]; ++i) + { + int igrow = globalIndex(i, this->ParaV->nb, this->ParaV->dim0, iprow); + if (igrow >= nlocal) + { + continue; + } + if (igcol == igrow) + { + rank0[j * naroc[0] + i] = {1.0, 0.0}; + } + else + { + rank0[j * naroc[0] + i] = {0.0, 0.0}; + } + } + } + } + } // loop ipcol + } // loop iprow + + std::complex beta = {0.0, -0.5 * this->dt / tag}; // for ETRS tag=2 , for taylor tag=1 + + //->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + // invert Stmp + int* ipiv = new int[this->ParaV->nloc]; + // (3.1) compute ipiv + ScalapackConnector::getrf(nlocal, nlocal, Sinv, 1, 1, this->ParaV->desc, ipiv, &info); + int lwork = -1; + int liwotk = -1; + std::vector> work(1, 0); + std::vector iwork(1, 0); + // (3.2) compute work + ScalapackConnector::getri(nlocal, + Sinv, + 1, + 1, + this->ParaV->desc, + ipiv, + work.data(), + &lwork, + iwork.data(), + &liwotk, + &info); + lwork = work[0].real(); + work.resize(lwork, 0); + liwotk = iwork[0]; + iwork.resize(liwotk, 0); + ScalapackConnector::getri(nlocal, + Sinv, + 1, + 1, + this->ParaV->desc, + ipiv, + work.data(), + &lwork, + iwork.data(), + &liwotk, + &info); + assert(0 == info); + + // A_matrix = - idt S^-1 H ; + ScalapackConnector::gemm('N', + 'N', + nlocal, + nlocal, + nlocal, + beta, + Sinv, + 1, + 1, + this->ParaV->desc, + Htmp, + 1, + 1, + this->ParaV->desc, + 0.0, + U_operator, + 1, + 1, + this->ParaV->desc); + + // rank2 = A^2 ; + ScalapackConnector::gemm('N', + 'N', + nlocal, + nlocal, + nlocal, + 1.0, + U_operator, + 1, + 1, + this->ParaV->desc, + U_operator, + 1, + 1, + this->ParaV->desc, + 0.0, + rank2, + 1, + 1, + this->ParaV->desc); + + // rank3 = A^3 ; + ScalapackConnector::gemm('N', + 'N', + nlocal, + nlocal, + nlocal, + 1.0, + U_operator, + 1, + 1, + this->ParaV->desc, + rank2, + 1, + 1, + this->ParaV->desc, + 0.0, + rank3, + 1, + 1, + this->ParaV->desc); + + // rank4 = A^4 ; + ScalapackConnector::gemm('N', + 'N', + nlocal, + nlocal, + nlocal, + 1.0, + U_operator, + 1, + 1, + this->ParaV->desc, + rank3, + 1, + 1, + this->ParaV->desc, + 0.0, + rank4, + 1, + 1, + this->ParaV->desc); + + std::complex p1 = {1.0, 0.0}; + std::complex p2 = {1.0 / 2.0, 0.0}; + std::complex p3 = {1.0 / 6.0, 0.0}; + std::complex p4 = {1.0 / 24.0, 0.0}; + + ScalapackConnector::geadd('N', + nlocal, + nlocal, + p1, + rank0, + 1, + 1, + this->ParaV->desc, + p1, + U_operator, + 1, + 1, + this->ParaV->desc); + + ScalapackConnector::geadd('N', + nlocal, + nlocal, + p2, + rank2, + 1, + 1, + this->ParaV->desc, + p1, + U_operator, + 1, + 1, + this->ParaV->desc); + + ScalapackConnector::geadd('N', + nlocal, + nlocal, + p3, + rank3, + 1, + 1, + this->ParaV->desc, + p1, + U_operator, + 1, + 1, + this->ParaV->desc); + + ScalapackConnector::geadd('N', + nlocal, + nlocal, + p4, + rank4, + 1, + 1, + this->ParaV->desc, + p1, + U_operator, + 1, + 1, + this->ParaV->desc); + + if (print_matrix) + { + ofs_running << " A_matrix:" << std::endl; + for (int i = 0; i < this->ParaV->nrow; i++) + { + const int in = i * this->ParaV->ncol; + for (int j = 0; j < this->ParaV->ncol; j++) + { + ofs_running << A_matrix[in + j].real() << "+" << A_matrix[in + j].imag() << "i "; + } + ofs_running << std::endl; + } + ofs_running << std::endl; + ofs_running << " U operator:" << std::endl; + for (int i = 0; i < this->ParaV->nrow; i++) + { + const int in = i * this->ParaV->ncol; + for (int j = 0; j < this->ParaV->ncol; j++) + { + double aa = U_operator[in + j].real(); + double bb = U_operator[in + j].imag(); + if (std::abs(aa) < 1e-8) + { + aa = 0.0; + } + if (std::abs(bb) < 1e-8) + { + bb = 0.0; + } + ofs_running << aa << "+" << bb << "i "; + } + ofs_running << std::endl; + } + } + delete[] A_matrix; + delete[] rank0; + delete[] rank2; + delete[] rank3; + delete[] rank4; + delete[] tmp1; + delete[] tmp2; + delete[] Sinv; + delete[] ipiv; +} +#endif // __MPI +} // namespace module_tddft diff --git a/source/module_hamilt_lcao/module_tddft/test/CMakeLists.txt b/source/module_hamilt_lcao/module_tddft/test/CMakeLists.txt index 3c54ffc9b2..07e64a5977 100644 --- a/source/module_hamilt_lcao/module_tddft/test/CMakeLists.txt +++ b/source/module_hamilt_lcao/module_tddft/test/CMakeLists.txt @@ -11,9 +11,9 @@ AddTest( ) AddTest( - TARGET tddft_bandenergy_test + TARGET tddft_band_energy_test LIBS parameter ${math_libs} base device tddft_test_lib - SOURCES bandenergy_test.cpp ../bandenergy.cpp ../../../module_basis/module_ao/parallel_orbitals.cpp + SOURCES band_energy_test.cpp ../band_energy.cpp ../../../module_basis/module_ao/parallel_orbitals.cpp ) AddTest( @@ -31,6 +31,6 @@ AddTest( AddTest( TARGET tddft_propagator_test LIBS parameter ${math_libs} base device tddft_test_lib - SOURCES propagator_test1.cpp propagator_test2.cpp propagator_test3.cpp ../propagator.cpp + SOURCES propagator_test1.cpp propagator_test2.cpp propagator_test3.cpp ../propagator.cpp ../propagator_cn2.cpp ../propagator_taylor.cpp ../propagator_etrs.cpp ) diff --git a/source/module_hamilt_lcao/module_tddft/test/bandenergy_test.cpp b/source/module_hamilt_lcao/module_tddft/test/band_energy_test.cpp similarity index 94% rename from source/module_hamilt_lcao/module_tddft/test/bandenergy_test.cpp rename to source/module_hamilt_lcao/module_tddft/test/band_energy_test.cpp index 853b75be76..706b50a603 100644 --- a/source/module_hamilt_lcao/module_tddft/test/bandenergy_test.cpp +++ b/source/module_hamilt_lcao/module_tddft/test/band_energy_test.cpp @@ -1,4 +1,4 @@ -#include "module_hamilt_lcao/module_tddft/bandenergy.h" +#include "module_hamilt_lcao/module_tddft/band_energy.h" #include #include @@ -9,7 +9,7 @@ #include "tddft_test.h" /************************************************ - * unit test of functions in bandenergy.h + * unit test of functions in band_energy.h ***********************************************/ /** @@ -19,7 +19,6 @@ */ #define doublethreshold 1e-8 -double module_tddft::Evolve_elec::td_print_eij = -1; TEST(BandEnergyTest, testBandEnergy) { @@ -87,7 +86,7 @@ TEST(BandEnergyTest, testBandEnergy) psi_k[11] = 1.0; // Call the function - module_tddft::compute_ekb(pv, nband, nlocal, Htmp, psi_k, ekb); + module_tddft::compute_ekb(pv, nband, nlocal, Htmp, psi_k, ekb, GlobalV::ofs_running); // Check the results EXPECT_NEAR(ekb[0], 3.0, doublethreshold); diff --git a/source/module_hamilt_lcao/module_tddft/test/middle_hamilt_test.cpp b/source/module_hamilt_lcao/module_tddft/test/middle_hamilt_test.cpp index 5cda7e4aba..6ac920400a 100644 --- a/source/module_hamilt_lcao/module_tddft/test/middle_hamilt_test.cpp +++ b/source/module_hamilt_lcao/module_tddft/test/middle_hamilt_test.cpp @@ -1,12 +1,13 @@ #include "module_hamilt_lcao/module_tddft/middle_hamilt.h" +#include "module_base/global_variable.h" // GlobalV::ofs_running +#include "module_basis/module_ao/parallel_orbitals.h" +#include "tddft_test.h" + #include #include #include -#include "module_basis/module_ao/parallel_orbitals.h" -#include "tddft_test.h" - /************************************************ * unit test of functions in middle_hamilt.h ***********************************************/ @@ -72,7 +73,7 @@ TEST(MiddleHamiltTest, testMiddleHamilt) } // Call the function - module_tddft::half_Hmatrix(pv, nband, nlocal, Htmp, Stmp, Hlaststep, Slaststep, print_matrix); + module_tddft::half_Hmatrix(pv, nband, nlocal, Htmp, Stmp, Hlaststep, Slaststep, GlobalV::ofs_running, print_matrix); // Check the results EXPECT_NEAR(Htmp[0].real(), 1.0, doublethreshold); diff --git a/source/module_hamilt_lcao/module_tddft/test/norm_psi_test.cpp b/source/module_hamilt_lcao/module_tddft/test/norm_psi_test.cpp index 6c78937803..e4b8852c95 100644 --- a/source/module_hamilt_lcao/module_tddft/test/norm_psi_test.cpp +++ b/source/module_hamilt_lcao/module_tddft/test/norm_psi_test.cpp @@ -1,12 +1,13 @@ #include "module_hamilt_lcao/module_tddft/norm_psi.h" +#include "module_base/global_variable.h" // GlobalV::ofs_running +#include "module_basis/module_ao/parallel_orbitals.h" +#include "tddft_test.h" + #include #include #include -#include "module_basis/module_ao/parallel_orbitals.h" -#include "tddft_test.h" - /************************************************ * unit test of functions in norm_psi.h ***********************************************/ @@ -89,7 +90,7 @@ TEST(NormPsiTest, testNormPsi) psi_k[11] = 1.0; // Call the function - module_tddft::norm_psi(pv, nband, nlocal, Stmp, psi_k, print_matrix); + module_tddft::norm_psi(pv, nband, nlocal, Stmp, psi_k, GlobalV::ofs_running, print_matrix); // Check the results EXPECT_NEAR(psi_k[0].real(), 0.577350269189626, doublethreshold); diff --git a/source/module_hamilt_lcao/module_tddft/test/propagator_test1.cpp b/source/module_hamilt_lcao/module_tddft/test/propagator_test1.cpp index 882bee90a6..0002ee4b44 100644 --- a/source/module_hamilt_lcao/module_tddft/test/propagator_test1.cpp +++ b/source/module_hamilt_lcao/module_tddft/test/propagator_test1.cpp @@ -72,7 +72,7 @@ TEST(PropagatorTest, testPropagatorCN) // Call the function int propagator = 0; module_tddft::Propagator prop(propagator, pv, PARAM.mdp.md_dt); - prop.compute_propagator(nlocal, Stmp, Htmp, nullptr, U_operator, print_matrix); + prop.compute_propagator(nlocal, Stmp, Htmp, nullptr, U_operator, GlobalV::ofs_running, print_matrix); // Check the results EXPECT_NEAR(U_operator[0].real(), -0.107692307692308, doublethreshold); diff --git a/source/module_hamilt_lcao/module_tddft/test/propagator_test2.cpp b/source/module_hamilt_lcao/module_tddft/test/propagator_test2.cpp index 16f57c18a1..7ff2416a6a 100644 --- a/source/module_hamilt_lcao/module_tddft/test/propagator_test2.cpp +++ b/source/module_hamilt_lcao/module_tddft/test/propagator_test2.cpp @@ -77,7 +77,7 @@ TEST(PropagatorTest, testPropagatorTaylor) // Call the function int propagator = 1; module_tddft::Propagator prop(propagator, pv, PARAM.mdp.md_dt); - prop.compute_propagator(nlocal, Stmp, Htmp, nullptr, U_operator, print_matrix); + prop.compute_propagator(nlocal, Stmp, Htmp, nullptr, U_operator, GlobalV::ofs_running, print_matrix); // Check the results EXPECT_NEAR(U_operator[0].real(), 1.95473251028807, doublethreshold); diff --git a/source/module_hamilt_lcao/module_tddft/test/propagator_test3.cpp b/source/module_hamilt_lcao/module_tddft/test/propagator_test3.cpp index ba0b84b4a7..faabe0019d 100644 --- a/source/module_hamilt_lcao/module_tddft/test/propagator_test3.cpp +++ b/source/module_hamilt_lcao/module_tddft/test/propagator_test3.cpp @@ -81,7 +81,7 @@ TEST(PropagatorTest, testPropagatorETRS) // Call the function int propagator = 2; module_tddft::Propagator prop(propagator, pv, PARAM.mdp.md_dt); - prop.compute_propagator(nlocal, Stmp, Htmp, Hlaststep, U_operator, print_matrix); + prop.compute_propagator(nlocal, Stmp, Htmp, Hlaststep, U_operator, GlobalV::ofs_running, print_matrix); // Check the results EXPECT_NEAR(U_operator[0].real(), -0.0105865569272976, doublethreshold); diff --git a/source/module_hamilt_lcao/module_tddft/test/upsi_test1.cpp b/source/module_hamilt_lcao/module_tddft/test/upsi_test1.cpp index 2782db5376..6ffe45a3bd 100644 --- a/source/module_hamilt_lcao/module_tddft/test/upsi_test1.cpp +++ b/source/module_hamilt_lcao/module_tddft/test/upsi_test1.cpp @@ -1,11 +1,12 @@ -#include -#include -#include - +#include "module_base/global_variable.h" // GlobalV::ofs_running #include "module_basis/module_ao/parallel_orbitals.h" #include "module_hamilt_lcao/module_tddft/upsi.h" #include "tddft_test.h" +#include +#include +#include + #define doublethreshold 1e-8 /************************************************ @@ -61,7 +62,7 @@ TEST(UpsiTest, testUpsi1) } // Call the function - module_tddft::upsi(pv, nband, nlocal, U_operator, psi_k_laststep, psi_k, print_matrix); + module_tddft::upsi(pv, nband, nlocal, U_operator, psi_k_laststep, psi_k, GlobalV::ofs_running, print_matrix); // Check the results for (int i = 0; i < nlocal * nband; ++i) diff --git a/source/module_hamilt_lcao/module_tddft/test/upsi_test2.cpp b/source/module_hamilt_lcao/module_tddft/test/upsi_test2.cpp index 33ca60f7a6..915b2a248b 100644 --- a/source/module_hamilt_lcao/module_tddft/test/upsi_test2.cpp +++ b/source/module_hamilt_lcao/module_tddft/test/upsi_test2.cpp @@ -1,11 +1,12 @@ -#include -#include -#include - +#include "module_base/global_variable.h" // GlobalV::ofs_running #include "module_basis/module_ao/parallel_orbitals.h" #include "module_hamilt_lcao/module_tddft/upsi.h" #include "tddft_test.h" +#include +#include +#include + #define doublethreshold 1e-8 /************************************************ @@ -55,7 +56,7 @@ TEST(UpsiTest, testUpsi2) } // Call the function - module_tddft::upsi(pv, nband, nlocal, U_operator, psi_k_laststep, psi_k, print_matrix); + module_tddft::upsi(pv, nband, nlocal, U_operator, psi_k_laststep, psi_k, GlobalV::ofs_running, print_matrix); // Check the results for (int i = 0; i < nlocal * nband; ++i) diff --git a/source/module_hamilt_lcao/module_tddft/test/upsi_test3.cpp b/source/module_hamilt_lcao/module_tddft/test/upsi_test3.cpp index 576adf60ee..cf5e177b13 100644 --- a/source/module_hamilt_lcao/module_tddft/test/upsi_test3.cpp +++ b/source/module_hamilt_lcao/module_tddft/test/upsi_test3.cpp @@ -1,11 +1,12 @@ -#include -#include -#include - +#include "module_base/global_variable.h" // GlobalV::ofs_running #include "module_basis/module_ao/parallel_orbitals.h" #include "module_hamilt_lcao/module_tddft/upsi.h" #include "tddft_test.h" +#include +#include +#include + #define doublethreshold 1e-8 /************************************************ @@ -69,7 +70,7 @@ TEST(UpsiTest, testUpsi3) } // Call the function - module_tddft::upsi(pv, nband, nlocal, U_operator, psi_k_laststep, psi_k, print_matrix); + module_tddft::upsi(pv, nband, nlocal, U_operator, psi_k_laststep, psi_k, GlobalV::ofs_running, print_matrix); // Check the results for (int i = 0; i < nlocal; ++i) diff --git a/source/module_hamilt_lcao/module_tddft/upsi.cpp b/source/module_hamilt_lcao/module_tddft/upsi.cpp index 9ccc9ac347..cc9a2448a8 100644 --- a/source/module_hamilt_lcao/module_tddft/upsi.cpp +++ b/source/module_hamilt_lcao/module_tddft/upsi.cpp @@ -1,11 +1,12 @@ #include "upsi.h" -#include -#include - #include "module_base/lapack_connector.h" +#include "module_base/module_container/ATen/kernels/blas.h" #include "module_base/scalapack_connector.h" +#include +#include + namespace module_tddft { #ifdef __MPI @@ -15,9 +16,9 @@ void upsi(const Parallel_Orbitals* pv, const std::complex* U_operator, const std::complex* psi_k_laststep, std::complex* psi_k, + std::ofstream& ofs_running, const int print_matrix) { - ScalapackConnector::gemm('N', 'N', nlocal, @@ -40,43 +41,231 @@ void upsi(const Parallel_Orbitals* pv, if (print_matrix) { - GlobalV::ofs_running << std::endl; - GlobalV::ofs_running << " psi_k:" << std::endl; + ofs_running << std::endl; + ofs_running << " psi_k:" << std::endl; + for (int i = 0; i < pv->ncol_bands; i++) + { + const int in = i * pv->ncol; + for (int j = 0; j < pv->ncol; j++) + { + double aa = psi_k[in + j].real(); + double bb = psi_k[in + j].imag(); + if (std::abs(aa) < 1e-8) + { + aa = 0.0; + } + if (std::abs(bb) < 1e-8) + { + bb = 0.0; + } + ofs_running << aa << "+" << bb << "i "; + } + ofs_running << std::endl; + } + ofs_running << std::endl; + ofs_running << " psi_k_laststep:" << std::endl; + for (int i = 0; i < pv->ncol_bands; i++) + { + const int in = i * pv->ncol; + for (int j = 0; j < pv->ncol; j++) + { + double aa = psi_k_laststep[in + j].real(); + double bb = psi_k_laststep[in + j].imag(); + if (std::abs(aa) < 1e-8) + { + aa = 0.0; + } + if (std::abs(bb) < 1e-8) + { + bb = 0.0; + } + ofs_running << aa << "+" << bb << "i "; + } + ofs_running << std::endl; + } + ofs_running << std::endl; + } +} + +void upsi_tensor(const Parallel_Orbitals* pv, + const int nband, + const int nlocal, + const ct::Tensor& U_operator, + const ct::Tensor& psi_k_laststep, + ct::Tensor& psi_k, + std::ofstream& ofs_running, + const int print_matrix) +{ + ScalapackConnector::gemm('N', + 'N', + nlocal, + nband, + nlocal, + 1.0, + U_operator.data>(), + 1, + 1, + pv->desc, + psi_k_laststep.data>(), + 1, + 1, + pv->desc_wfc, + 0.0, + psi_k.data>(), + 1, + 1, + pv->desc_wfc); + + if (print_matrix) + { + ofs_running << std::endl; + ofs_running << " psi_k:" << std::endl; for (int i = 0; i < pv->ncol_bands; i++) { + const int in = i * pv->ncol; for (int j = 0; j < pv->ncol; j++) { - double aa, bb; - aa = psi_k[i * pv->ncol + j].real(); - bb = psi_k[i * pv->ncol + j].imag(); + double aa = psi_k.data>()[in + j].real(); + double bb = psi_k.data>()[in + j].imag(); if (std::abs(aa) < 1e-8) + { aa = 0.0; + } if (std::abs(bb) < 1e-8) + { bb = 0.0; - GlobalV::ofs_running << aa << "+" << bb << "i "; + } + ofs_running << aa << "+" << bb << "i "; } - GlobalV::ofs_running << std::endl; + ofs_running << std::endl; } - GlobalV::ofs_running << std::endl; - GlobalV::ofs_running << " psi_k_laststep:" << std::endl; + ofs_running << std::endl; + ofs_running << " psi_k_laststep:" << std::endl; for (int i = 0; i < pv->ncol_bands; i++) { + const int in = i * pv->ncol; for (int j = 0; j < pv->ncol; j++) { - double aa, bb; - aa = psi_k_laststep[i * pv->ncol + j].real(); - bb = psi_k_laststep[i * pv->ncol + j].imag(); + double aa = psi_k_laststep.data>()[in + j].real(); + double bb = psi_k_laststep.data>()[in + j].imag(); + if (std::abs(aa) < 1e-8) + { + aa = 0.0; + } + if (std::abs(bb) < 1e-8) + { + bb = 0.0; + } + ofs_running << aa << "+" << bb << "i "; + } + ofs_running << std::endl; + } + ofs_running << std::endl; + } +} + +template +void upsi_tensor_lapack(const Parallel_Orbitals* pv, + const int nband, + const int nlocal, + const ct::Tensor& U_operator, + const ct::Tensor& psi_k_laststep, + ct::Tensor& psi_k, + std::ofstream& ofs_running, + const int print_matrix) +{ + // ct_device_type = ct::DeviceType::CpuDevice or ct::DeviceType::GpuDevice + ct::DeviceType ct_device_type = ct::DeviceTypeToEnum::value; + // ct_Device = ct::DEVICE_CPU or ct::DEVICE_GPU + using ct_Device = typename ct::PsiToContainer::type; + + // Perform the matrix multiplication: psi_k = U_operator * psi_k_laststep + std::complex alpha = {1.0, 0.0}; + std::complex beta = {0.0, 0.0}; + + ct::kernels::blas_gemm, ct_Device>()('N', + 'N', + nlocal, + nband, + nlocal, + &alpha, + U_operator.data>(), + nlocal, + psi_k_laststep.data>(), + nlocal, + &beta, + psi_k.data>(), + nlocal); + + if (print_matrix) + { + ct::Tensor psi_k_cpu = psi_k.to_device(); + ct::Tensor psi_k_laststep_cpu = psi_k_laststep.to_device(); + + ofs_running << std::endl; + ofs_running << " psi_k:" << std::endl; + for (int i = 0; i < nband; i++) + { + const int in = i * nlocal; + for (int j = 0; j < nlocal; j++) + { + double aa = psi_k_cpu.data>()[in + j].real(); + double bb = psi_k_cpu.data>()[in + j].imag(); + if (std::abs(aa) < 1e-8) + { + aa = 0.0; + } + if (std::abs(bb) < 1e-8) + { + bb = 0.0; + } + ofs_running << aa << "+" << bb << "i "; + } + ofs_running << std::endl; + } + ofs_running << std::endl; + ofs_running << " psi_k_laststep:" << std::endl; + for (int i = 0; i < nband; i++) + { + const int in = i * nlocal; + for (int j = 0; j < nlocal; j++) + { + double aa = psi_k_laststep_cpu.data>()[in + j].real(); + double bb = psi_k_laststep_cpu.data>()[in + j].imag(); if (std::abs(aa) < 1e-8) + { aa = 0.0; + } if (std::abs(bb) < 1e-8) + { bb = 0.0; - GlobalV::ofs_running << aa << "+" << bb << "i "; + } + ofs_running << aa << "+" << bb << "i "; } - GlobalV::ofs_running << std::endl; + ofs_running << std::endl; } - GlobalV::ofs_running << std::endl; + ofs_running << std::endl; } } -#endif +// Explicit instantiation of template functions +template void upsi_tensor_lapack(const Parallel_Orbitals* pv, + const int nband, + const int nlocal, + const ct::Tensor& U_operator, + const ct::Tensor& psi_k_laststep, + ct::Tensor& psi_k, + std::ofstream& ofs_running, + const int print_matrix); +#if ((defined __CUDA) /* || (defined __ROCM) */) +template void upsi_tensor_lapack(const Parallel_Orbitals* pv, + const int nband, + const int nlocal, + const ct::Tensor& U_operator, + const ct::Tensor& psi_k_laststep, + ct::Tensor& psi_k, + std::ofstream& ofs_running, + const int print_matrix); +#endif // __CUDA +#endif // __MPI } // namespace module_tddft diff --git a/source/module_hamilt_lcao/module_tddft/upsi.h b/source/module_hamilt_lcao/module_tddft/upsi.h index 3d703de5f8..c687a18fed 100644 --- a/source/module_hamilt_lcao/module_tddft/upsi.h +++ b/source/module_hamilt_lcao/module_tddft/upsi.h @@ -7,7 +7,9 @@ #ifndef UPSI_H #define UPSI_H +#include "module_base/module_container/ATen/core/tensor.h" // ct::Tensor #include "module_basis/module_ao/parallel_orbitals.h" + #include namespace module_tddft @@ -30,9 +32,29 @@ void upsi(const Parallel_Orbitals* pv, const std::complex* U_operator, const std::complex* psi_k_laststep, std::complex* psi_k, + std::ofstream& ofs_running, const int print_matrix); -#endif +void upsi_tensor(const Parallel_Orbitals* pv, + const int nband, + const int nlocal, + const ct::Tensor& U_operator, + const ct::Tensor& psi_k_laststep, + ct::Tensor& psi_k, + std::ofstream& ofs_running, + const int print_matrix); + +template +void upsi_tensor_lapack(const Parallel_Orbitals* pv, + const int nband, + const int nlocal, + const ct::Tensor& U_operator, + const ct::Tensor& psi_k_laststep, + ct::Tensor& psi_k, + std::ofstream& ofs_running, + const int print_matrix); + +#endif // __MPI } // namespace module_tddft #endif diff --git a/source/module_hsolver/diago_cusolver.cpp b/source/module_hsolver/diago_cusolver.cpp index 5511ec941c..efaf20a28a 100644 --- a/source/module_hsolver/diago_cusolver.cpp +++ b/source/module_hsolver/diago_cusolver.cpp @@ -1,10 +1,10 @@ #include "diago_cusolver.h" -#include "module_parameter/parameter.h" #include "module_base/blacs_connector.h" #include "module_base/global_variable.h" #include "module_base/scalapack_connector.h" #include "module_base/timer.h" +#include "module_parameter/parameter.h" #include #include diff --git a/source/module_io/input_conv.cpp b/source/module_io/input_conv.cpp index 65ff27dbc1..54e549366d 100644 --- a/source/module_io/input_conv.cpp +++ b/source/module_io/input_conv.cpp @@ -275,22 +275,12 @@ void Input_Conv::Convert() // Fuxiang He add 2016-10-26 //---------------------------------------------------------- #ifdef __LCAO - module_tddft::Evolve_elec::td_force_dt = PARAM.inp.td_force_dt; - module_tddft::Evolve_elec::td_vext = PARAM.inp.td_vext; - if (module_tddft::Evolve_elec::td_vext) - { - parse_expression(PARAM.inp.td_vext_dire, module_tddft::Evolve_elec::td_vext_dire_case); - } - module_tddft::Evolve_elec::out_dipole = PARAM.inp.out_dipole; - module_tddft::Evolve_elec::out_efield = PARAM.inp.out_efield; - module_tddft::Evolve_elec::td_print_eij = PARAM.inp.td_print_eij; - module_tddft::Evolve_elec::td_edm = PARAM.inp.td_edm; TD_Velocity::out_current = PARAM.inp.out_current; TD_Velocity::out_current_k = PARAM.inp.out_current_k; TD_Velocity::out_vecpot = PARAM.inp.out_vecpot; TD_Velocity::init_vecpot_file = PARAM.inp.init_vecpot_file; read_td_efield(); -#endif +#endif // __LCAO diff --git a/source/module_io/read_input_item_tddft.cpp b/source/module_io/read_input_item_tddft.cpp index 236b15545b..9687af495d 100644 --- a/source/module_io/read_input_item_tddft.cpp +++ b/source/module_io/read_input_item_tddft.cpp @@ -20,13 +20,28 @@ void ReadInput::item_rt_tddft() read_sync_bool(input.td_vext); this->add_item(item); } + // { + // Input_Item item("td_vext_dire"); + // item.annotation = "extern potential direction"; + // item.read_value = [](const Input_Item& item, Parameter& para) { + // para.input.td_vext_dire = longstring(item.str_values); + // }; + // sync_string(input.td_vext_dire); + // this->add_item(item); + // } { Input_Item item("td_vext_dire"); item.annotation = "extern potential direction"; item.read_value = [](const Input_Item& item, Parameter& para) { - para.input.td_vext_dire = longstring(item.str_values); + parse_expression(item.str_values, para.input.td_vext_dire); }; - sync_string(input.td_vext_dire); + item.get_final_value = [](Input_Item& item, const Parameter& para) { + if (item.is_read()) + { + item.final_value.str(longstring(item.str_values)); + } + }; + add_intvec_bcast(input.td_vext_dire, para.input.td_vext_dire.size(), 0); this->add_item(item); } { diff --git a/source/module_io/test/for_testing_input_conv.h b/source/module_io/test/for_testing_input_conv.h index 88b1f304e2..7abea67bf7 100644 --- a/source/module_io/test/for_testing_input_conv.h +++ b/source/module_io/test/for_testing_input_conv.h @@ -30,17 +30,10 @@ #undef private bool berryphase::berry_phase_flag = false; -double module_tddft::Evolve_elec::td_force_dt; -bool module_tddft::Evolve_elec::td_vext; -std::vector module_tddft::Evolve_elec::td_vext_dire_case; -bool module_tddft::Evolve_elec::out_dipole; -bool module_tddft::Evolve_elec::out_efield; bool TD_Velocity::out_current; bool TD_Velocity::out_current_k; bool TD_Velocity::out_vecpot; bool TD_Velocity::init_vecpot_file; -double module_tddft::Evolve_elec::td_print_eij; -int module_tddft::Evolve_elec::td_edm; double elecstate::Gatefield::zgate = 0.5; bool elecstate::Gatefield::relax = false; bool elecstate::Gatefield::block = false; diff --git a/source/module_io/test/read_input_ptest.cpp b/source/module_io/test/read_input_ptest.cpp index b29cdcfc55..90f985c307 100644 --- a/source/module_io/test/read_input_ptest.cpp +++ b/source/module_io/test/read_input_ptest.cpp @@ -295,7 +295,6 @@ TEST_F(InputParaTest, ParaRead) EXPECT_DOUBLE_EQ(param.inp.soc_lambda, 1.0); EXPECT_DOUBLE_EQ(param.inp.td_force_dt, 0.02); EXPECT_EQ(param.inp.td_vext, 0); - EXPECT_EQ(param.inp.td_vext_dire, "1"); EXPECT_EQ(param.inp.propagator, 0); EXPECT_EQ(param.inp.td_stype, 0); EXPECT_EQ(param.inp.td_ttype, "0"); diff --git a/source/module_parameter/input_parameter.h b/source/module_parameter/input_parameter.h index d2c27f78e1..4e42017ebd 100644 --- a/source/module_parameter/input_parameter.h +++ b/source/module_parameter/input_parameter.h @@ -280,9 +280,10 @@ struct Input_para double bessel_descriptor_sigma = 0.1; ///< spherical bessel smearing_sigma // ============== #Parameters (9.rt-tddft) =========================== - double td_force_dt = 0.02; ///<"fs" - bool td_vext = false; ///< add extern potential or not - std::string td_vext_dire = "1"; ///< vext direction + double td_force_dt = 0.02; ///<"fs" + bool td_vext = false; ///< add extern potential or not + // std::string td_vext_dire = "1"; ///< vext direction + std::vector td_vext_dire = {1}; ///< vector of vext direction bool init_vecpot_file = false; ///< initialize the vector potential, though file or integral double td_print_eij = -1.0; ///< threshold to output Eij elements