Replace the APIs for more efficient solving eigenpairs in davidson and CG-subspace (#1703)

haozhihan · dyzheng · commit b0c36c37d599 · 2022-12-30T12:23:24.000+08:00
* Add dngvd operation for LAPACK_subspace

* Change gvx_op to evx_op in davidson method

* Add dnevx_op & dngvd_op CUDA version
diff --git a/source/module_base/lapack_connector.h b/source/module_base/lapack_connector.h
@@ -22,18 +22,42 @@
 extern "C"
 {
     int ilaenv_(int* ispec,const char* name,const char* opts,
-    const int* n1,const int* n2,const int* n3,const int* n4);
+                const int* n1,const int* n2,const int* n3,const int* n4);
+
+
     // solve the generalized eigenproblem Ax=eBx, where A is Hermitian and complex couble
-    // zhegv_ returns all eigenvalues while zhegvx_ returns selected ones
+    // zhegv_ and zhegvd_ return all eigenvalues, while zhegvx_ returns selected ones
+
+    void zhegvd_(const int* itype, const char* jobz, const char* uplo, const int* n,
+                 std::complex<double>* a, const int* lda, std::complex<double>* b, const int* ldb, 
+                 double* w, std::complex<double>* work, int* lwork, double* rwork, int* lrwork,
+                 int* iwork, int* liwork, int* info);
+
     void zhegv_(const int* itype,const char* jobz,const char* uplo,const int* n,
                 std::complex<double>* a,const int* lda,std::complex<double>* b,const int* ldb,
                 double* w,std::complex<double>* work,int* lwork,double* rwork,int* info);
+
     void zhegvx_(const int* itype,const char* jobz,const char* range,const char* uplo,
                  const int* n,std::complex<double> *a,const int* lda,std::complex<double> *b,
                  const int* ldb,const double* vl,const double* vu,const int* il,
                  const int* iu,const double* abstol,const int* m,double* w,
                  std::complex<double> *z,const int *ldz,std::complex<double> *work,const int* lwork,
                  double* rwork,int* iwork,int* ifail,int* info);
+
+    // solve the eigenproblem Ax=ex, where A is Hermitian and complex double
+    // zheev_ returns all eigenvalues while zheevx_ returns selected ones
+    void zheev_(const char* jobz,const char* uplo,const int* n,std::complex<double> *a,
+                const int* lda,double* w,std::complex<double >* work,const int* lwork,
+                double* rwork,int* info);
+
+    void zheevx_(const char* jobz,const char* range,const char* uplo,
+                 const int* n,std::complex<double> *a,const int* lda,
+                 const double* vl,const double* vu,const int* il,
+                 const int* iu,const double* abstol,const int* m,double* w,
+                 std::complex<double> *z,const int *ldz,std::complex<double> *work,const int* lwork,
+                 double* rwork,int* iwork,int* ifail,int* info);
+
+
 	// solve the generalized eigenproblem Ax=eBx, where A is Symmetric and real couble
     // dsygv_ returns all eigenvalues while dsygvx_ returns selected ones
 	void dsygv_(const int* itype, const char* jobz,const char* uplo, const int* n,
@@ -47,10 +71,8 @@ extern "C"
     // solve the eigenproblem Ax=ex, where A is Symmetric and real double
 	void dsyev_(const char* jobz,const char* uplo,const int* n,double *a,
                 const int* lda,double* w,double* work,const int* lwork, int* info);
-    // solve the eigenproblem Ax=ex, where A is Hermitian and complex couble
-    void zheev_(const char* jobz,const char* uplo,const int* n,std::complex<double> *a,
-                const int* lda,double* w,std::complex<double >* work,const int* lwork,
-                double* rwork,int* info);
+    
+
     // dsytrf_ computes the Bunch-Kaufman factorization of a double precision
     // symmetric matrix, while dsytri takes its output to perform martrix inversion
     void dsytrf_(const char* uplo, const int* n, double * a, const int* lda,
@@ -226,7 +248,34 @@ class LapackConnector
         return nb;
     }
 
-    // wrap function of fortran lapack routine zhegv.
+    // wrap function of fortran lapack routine zhegvd.
+    static inline
+    void zhegvd(const int itype, const char jobz, const char uplo, const int n, 
+                ModuleBase::ComplexMatrix& a, const int lda, 
+                ModuleBase::ComplexMatrix& b, const int ldb, double* w, 
+                std::complex<double>* work, int lwork, double* rwork, int lrwork,
+                int* iwork, int liwork, int info)
+    {	
+        // Transpose the std::complex matrix to the fortran-form real-std::complex array.
+        std::complex<double>* aux = LapackConnector::transpose(a, n, lda);
+        std::complex<double>* bux = LapackConnector::transpose(b, n, ldb);
+
+        // call the fortran routine
+        zhegvd_(&itype, &jobz, &uplo, &n, 
+                aux, &lda, bux, &ldb, w,
+                work, &lwork, rwork, &lrwork,
+                iwork, &liwork, &info);
+        
+        // Transpose the fortran-form real-std::complex array to the std::complex matrix.
+        LapackConnector::transpose(aux, a, n, lda);
+        LapackConnector::transpose(bux, b, n, ldb);
+        
+        // free the memory.
+        delete[] aux;
+        delete[] bux;
+    }
+
+    // wrap function of fortran lapack routine zhegv ( ModuleBase::ComplexMatrix version ).
     static inline
     void zhegv(	const int itype,const char jobz,const char uplo,const int n,ModuleBase::ComplexMatrix& a,
                 const int lda,ModuleBase::ComplexMatrix& b,const int ldb,double* w,std::complex<double>* work,
@@ -244,20 +293,21 @@ class LapackConnector
         delete[] aux;
         delete[] bux;
     }
-    // wrap function of fortran lapack routine zhegv.
+
+    // wrap function of fortran lapack routine zhegv ( pointer version ) .
     static inline
     void zhegv(	const int itype, const char jobz, const char uplo, const int n, std::complex<double>* a,
                 const int lda, const std::complex<double>* b, const int ldb, double* w, std::complex<double>* work,
                 int lwork, double* rwork, int info, int ld_real)
-    {	// Transpose the std::complex matrix to the fortran-form real-std::complex array.
+    {	
+        // Transpose the std::complex matrix to the fortran-form real-std::complex array.
         std::complex<double>* aux = LapackConnector::transpose(a, n, lda, ld_real);
         std::complex<double>* bux = LapackConnector::transpose(b, n, ldb, ld_real);
 
         // call the fortran routine
         zhegv_(&itype, &jobz, &uplo, &n, aux, &lda, bux, &ldb, w, work, &lwork, rwork, &info);
+        
         // Transpose the fortran-form real-std::complex array to the std::complex matrix.
-        // LapackConnector::transpose(aux, a, n, lda);
-        // LapackConnector::transpose(bux, b, n, ldb);
         for (int i = 0; i < n; ++i)
         {
             for (int j = 0; j < lda; ++j)
@@ -270,7 +320,7 @@ class LapackConnector
         delete[] bux;
     }
 
-    // wrap function of fortran lapack routine zheev.
+    // wrap function of fortran lapack routine zhegvx ( ModuleBase::ComplexMatrix version ).
     static inline
     void zhegvx( const int itype, const char jobz, const char range, const char uplo,
                  const int n, const ModuleBase::ComplexMatrix& a, const int lda, const ModuleBase::ComplexMatrix& b,
@@ -298,7 +348,8 @@ class LapackConnector
         delete[] zux;
 
     }
-    // wrap function of fortran lapack routine zheev.
+
+    // wrap function of fortran lapack routine zhegvx ( pointer version ).
     static inline
     void zhegvx( const int itype, const char jobz, const char range, const char uplo,
                  const int n, const std::complex<double>* a, const int lda, const std::complex<double>* b,
@@ -331,6 +382,38 @@ class LapackConnector
         delete[] zux;
     }
 
+    static inline
+    void zheevx( const int itype, const char jobz, const char range, const char uplo, const int n, 
+                 const std::complex<double>* a, const int lda, const double vl, const double vu, const int il, const int iu,
+                 const double abstol, const int m, double* w, std::complex<double>* z, const int ldz,
+                 std::complex<double>* work, const int lwork, double* rwork, int* iwork, int* ifail, int info, int nbase_x)
+    {
+        // Transpose the std::complex matrix to the fortran-form real-std::complex array.
+        std::complex<double>* aux = LapackConnector::transpose(a, n, lda, nbase_x);
+        std::complex<double>* zux = new std::complex<double>[n*iu];// mohan modify 2009-08-02
+
+        // call the fortran routine
+        zheevx_(&jobz, &range, &uplo, &n, 
+                aux, &lda, &vl, &vu, &il, &iu, 
+                &abstol, &m, w, zux, &ldz, 
+                work, &lwork, rwork, iwork, ifail, &info);
+
+        // Transpose the fortran-form real-std::complex array to the std::complex matrix
+        for (int i = 0; i < iu; ++i)
+        {
+            for (int j = 0; j < n; ++j)
+            {
+                z[j * nbase_x + i] = zux[i*n+j];
+            }
+        }
+
+        // free the memory.
+        delete[] aux;
+        delete[] zux;
+    }
+
+
+
 	// calculate the eigenvalues and eigenfunctions of a real symmetric matrix.
     static inline
     void dsygv(	const int itype,const char jobz,const char uplo,const int n,ModuleBase::matrix& a,
diff --git a/source/module_hsolver/diago_david.cpp b/source/module_hsolver/diago_david.cpp
@@ -625,16 +625,17 @@ void DiagoDavid<FPTYPE, Device>::diag_zhegvx(const int& n, // nbase
             resmem_var_op()(this->ctx, eigenvalue_gpu, this->nbase_x);
             syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, eigenvalue_gpu, eigenvalue, this->nbase_x);
 
-            dngvx_op<FPTYPE, Device>()(this->ctx, n, this->nbase_x, this->hcc, this->scc, m, eigenvalue_gpu, this->vcc);
+            dnevx_op<FPTYPE, Device>()(this->ctx, n, this->nbase_x, this->hcc, m, eigenvalue_gpu, this->vcc);
 
             syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, eigenvalue, eigenvalue_gpu, this->nbase_x);
             delmem_var_op()(this->ctx, eigenvalue_gpu);
 #endif
         }
         else
         {
-            dngvx_op<FPTYPE,
-                     Device>()(this->ctx, n, this->nbase_x, this->hcc, this->scc, m, this->eigenvalue, this->vcc);
+            // dngvx_op<FPTYPE,
+            //          Device>()(this->ctx, n, this->nbase_x, this->hcc, this->scc, m, this->eigenvalue, this->vcc);
+            dnevx_op<FPTYPE, Device>()(this->ctx, n, this->nbase_x, this->hcc, m, this->eigenvalue, this->vcc);
         }
     }
 
diff --git a/source/module_hsolver/diago_iter_assist.cpp b/source/module_hsolver/diago_iter_assist.cpp
@@ -432,7 +432,8 @@ void DiagoIterAssist<FPTYPE, Device>::diagH_LAPACK(
         //===========================
         // calculate all eigenvalues
         //===========================
-        dngv_op<FPTYPE, Device>()(ctx, nstart, ldh, hcc, scc, res, vcc);
+        // dngv_op<FPTYPE, Device>()(ctx, nstart, ldh, hcc, scc, res, vcc);
+        dngvd_op<FPTYPE, Device>()(ctx, nstart, ldh, hcc, scc, res, vcc);
     }
     else {
         //=====================================
diff --git a/source/module_hsolver/include/dngvd_op.h b/source/module_hsolver/include/dngvd_op.h
@@ -16,6 +16,12 @@ template <typename FPTYPE, typename Device> struct dngvx_op
     /// @brief DNGVX computes the first m eigenvalues ​​and their corresponding eigenvectors of
     /// a complex generalized Hermitian-definite eigenproblem
     ///
+    /// In this op, the CPU version is implemented through the `gvx` interface, and the CUDA version
+    /// is implemented through the `gvd` interface and acquires the first m eigenpairs.
+    /// API doc:
+    /// 1. zhegvx: https://netlib.org/lapack/explore-html/df/d9a/group__complex16_h_eeigen_ga8ea76cbbb14edb5a22069e203fc8e8b2.html
+    /// 2. cusolverDnZhegvd: https://docs.nvidia.com/cuda/cusolver/index.html#cusolverdn-t-sygvd
+    ///
     /// Input Parameters
     ///     @param d : the type of device
     ///     @param nstart : the number of cols of the matrix
@@ -41,6 +47,41 @@ template <typename FPTYPE, typename Device> struct dngv_op
     /// @brief DNGV computes all the eigenvalues and eigenvectors of a complex generalized
     /// Hermitian-definite eigenproblem
     ///
+    /// In this op, the CPU version is implemented through the `gv` interface, and the CUDA version
+    /// is implemented through the `gvd` interface.
+    /// API doc:
+    /// 1. zhegv: https://netlib.org/lapack/explore-html/df/d9a/group__complex16_h_eeigen_gaf7b790b3b89de432a423c9006c1cc1ac.html
+    /// 2. cusolverDnZhegvd: https://docs.nvidia.com/cuda/cusolver/index.html#cusolverdn-t-sygvd
+    ///
+    /// Input Parameters
+    ///     @param d : the type of device
+    ///     @param nstart : the number of cols of the matrix
+    ///     @param ldh : the number of rows of the matrix
+    ///     @param A : the hermitian matrix A in A x=lambda B x (row major)
+    ///     @param B : the overlap matrix B in A x=lambda B x (row major)
+    /// Output Parameter
+    ///     @param W : calculated eigenvalues
+    ///     @param V : calculated eigenvectors (row major)
+    void operator()(const Device* d,
+                    const int nstart,
+                    const int ldh,
+                    const std::complex<FPTYPE>* A,
+                    const std::complex<FPTYPE>* B,
+                    double* W,
+                    std::complex<FPTYPE>* V);
+};
+
+template <typename FPTYPE, typename Device> struct dngvd_op
+{
+    /// @brief DNGVD computes all the eigenvalues and eigenvectors of a complex generalized
+    /// Hermitian-definite eigenproblem. If eigenvectors are desired, it uses a divide and conquer algorithm.
+    ///
+    /// In this op, both the CPU version and the CUDA version are implemented through
+    /// the `gvd` interface.
+    /// API doc:
+    /// 1. zhegvd: https://netlib.org/lapack/explore-html/df/d9a/group__complex16_h_eeigen_ga74fdf9b5a16c90d8b7a589dec5ca058a.html
+    /// 2. cusolverDnZhegvd: https://docs.nvidia.com/cuda/cusolver/index.html#cusolverdn-t-sygvd
+    ///
     /// Input Parameters
     ///     @param d : the type of device
     ///     @param nstart : the number of cols of the matrix
@@ -59,6 +100,36 @@ template <typename FPTYPE, typename Device> struct dngv_op
                     std::complex<FPTYPE>* V);
 };
 
+
+template <typename FPTYPE, typename Device> struct dnevx_op
+{
+    /// @brief DNEVX computes the first m eigenvalues and their corresponding eigenvectors of
+    /// a complex Hermitian standard eigenproblem A x = lambda x
+    ///
+    /// In this op, the CPU version is implemented through the `evx` interface, and the CUDA version
+    /// is implemented through the `evd` interface and acquires the first m eigenpairs.
+    /// API doc:
+    /// 1. zheevx: https://netlib.org/lapack/explore-html/df/d9a/group__complex16_h_eeigen_gaabef68a9c7b10df7aef8f4fec89fddbe.html
+    /// 2. cusolverDnZheevd: https://docs.nvidia.com/cuda/cusolver/index.html#cusolverdn-t-syevd
+    ///
+    /// Input Parameters
+    ///     @param d : the type of device
+    ///     @param nstart : the number of cols of the matrix
+    ///     @param ldh : the number of rows of the matrix
+    ///     @param A : the hermitian matrix A in A x=lambda x (row major)
+    /// Output Parameter
+    ///     @param W : calculated eigenvalues
+    ///     @param V : calculated eigenvectors (row major)
+    void operator()(const Device* d,
+                    const int nstart,
+                    const int ldh,
+                    const std::complex<FPTYPE>* A,
+                    const int m,
+                    double* W,
+                    std::complex<FPTYPE>* V);
+};
+
+
 #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
 
 void createCUSOLVERhandle();
diff --git a/source/module_hsolver/src/cuda/dngvd_op.cu b/source/module_hsolver/src/cuda/dngvd_op.cu
diff --git a/source/module_hsolver/src/dngvd_op.cpp b/source/module_hsolver/src/dngvd_op.cpp

Original file line number	Diff line number	Diff line change
`@@ -625,16 +625,17 @@ void DiagoDavid<FPTYPE, Device>::diag_zhegvx(const int& n, // nbase`
`625`	`625`	`resmem_var_op()(this->ctx, eigenvalue_gpu, this->nbase_x);`
`626`	`626`	`syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, eigenvalue_gpu, eigenvalue, this->nbase_x);`
`627`	`627`
`628`		`- dngvx_op<FPTYPE, Device>()(this->ctx, n, this->nbase_x, this->hcc, this->scc, m, eigenvalue_gpu, this->vcc);`
	`628`	`+ dnevx_op<FPTYPE, Device>()(this->ctx, n, this->nbase_x, this->hcc, m, eigenvalue_gpu, this->vcc);`
`629`	`629`
`630`	`630`	`syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, eigenvalue, eigenvalue_gpu, this->nbase_x);`
`631`	`631`	`delmem_var_op()(this->ctx, eigenvalue_gpu);`
`632`	`632`	`#endif`
`633`	`633`	`}`
`634`	`634`	`else`
`635`	`635`	`{`
`636`		`- dngvx_op<FPTYPE,`
`637`		`- Device>()(this->ctx, n, this->nbase_x, this->hcc, this->scc, m, this->eigenvalue, this->vcc);`
	`636`	`+ // dngvx_op<FPTYPE,`
	`637`	`+ // Device>()(this->ctx, n, this->nbase_x, this->hcc, this->scc, m, this->eigenvalue, this->vcc);`
	`638`	`+ dnevx_op<FPTYPE, Device>()(this->ctx, n, this->nbase_x, this->hcc, m, this->eigenvalue, this->vcc);`
`638`	`639`	`}`
`639`	`640`	`}`
`640`	`641`