Merged · Changes from 17 commits
14 changes: 13 additions & 1 deletion docs/advanced/input_files/input-main.md
@@ -39,6 +39,7 @@
- [pw\_diag\_thr](#pw_diag_thr)
- [pw\_diag\_nmax](#pw_diag_nmax)
- [pw\_diag\_ndim](#pw_diag_ndim)
- [diag\_subspace](#diag_subspace)
- [erf\_ecut](#erf_ecut)
- [fft\_mode](#fft_mode)
- [erf\_height](#erf_height)
@@ -787,7 +788,18 @@ These variables are used to control the plane wave related parameters.

- **Type**: Integer
- **Description**: Only used when `ks_solver = dav` or `ks_solver = dav_subspace`. It sets the dimension of the workspace (the number of wavefunction packets; at least 2 are needed) for the Davidson method. A larger value may reduce the number of iterations, but it uses more memory and more CPU time in the subspace diagonalization.
- **Default**: 4

### diag_subspace

- **Type**: Integer
- **Description**: The method used to diagonalize the subspace in the `dav_subspace` method (see the example `INPUT` fragment below). The available options are:
  - 0: LAPACK
  - 1: GenELPA
  - 2: ScaLAPACK

  LAPACK solves on a single core only, while GenELPA and ScaLAPACK solve in parallel. For small systems (e.g., fewer than 100 bands), LAPACK is recommended. For large systems run with MPI parallelism, GenELPA or ScaLAPACK is recommended, and GenELPA usually performs better. For GenELPA and ScaLAPACK, the block size can be set via [nb2d](#nb2d).

- **Default**: 0
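
For example, a minimal `INPUT` fragment that selects GenELPA for the subspace diagonalization could look like the following (the values are illustrative, not recommendations):

```
INPUT_PARAMETERS
ks_solver       dav_subspace
diag_subspace   1    # 0: LAPACK, 1: GenELPA, 2: ScaLAPACK
nb2d            16   # 2D block size for GenELPA/ScaLAPACK; 0 = automatic
```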

### erf_ecut

3 changes: 3 additions & 0 deletions python/pyabacus/src/hsolver/CMakeLists.txt
@@ -5,6 +5,9 @@ list(APPEND _diago
${HSOLVER_PATH}/diago_cg.cpp
${HSOLVER_PATH}/diag_const_nums.cpp
${HSOLVER_PATH}/diago_iter_assist.cpp
${HSOLVER_PATH}/diag_hs_para.cpp
${HSOLVER_PATH}/diago_pxxxgvx.cpp


${HSOLVER_PATH}/kernels/dngvd_op.cpp
${HSOLVER_PATH}/kernels/math_kernel_op.cpp
8 changes: 6 additions & 2 deletions python/pyabacus/src/hsolver/py_diago_dav_subspace.hpp
@@ -108,7 +108,9 @@ class PyDiagoDavSubspace
bool need_subspace,
std::vector<double>& diag_ethr,
bool scf_type,
hsolver::diag_comm_info comm_info
hsolver::diag_comm_info comm_info,
int diag_subspace,
int nb2d
) {
auto hpsi_func = [mm_op] (
std::complex<double> *psi_in,
@@ -138,7 +140,9 @@
tol,
max_iter,
need_subspace,
comm_info
comm_info,
diag_subspace,
nb2d
);

return obj->diag(hpsi_func, psi, nbasis, eigenvalue, diag_ethr, scf_type);
11 changes: 10 additions & 1 deletion python/pyabacus/src/hsolver/py_hsolver.cpp
@@ -67,6 +67,13 @@ void bind_hsolver(py::module& m)
where the initial precision of eigenvalue calculation can be coarse.
If false, it indicates a non-self-consistent field (non-SCF) calculation,
where high precision in eigenvalue calculation is required from the start.
comm_info : diag_comm_info
The communicator information.
diago_subspace : int
The method used to solve the generalized eigenvalue problem in the subspace.
0: LAPACK, 1: GenELPA, 2: ScaLAPACK.
nb2d : int
The block size of the 2D block-cyclic distribution when GenELPA or ScaLAPACK is used.
)pbdoc",
"mm_op"_a,
"precond_vec"_a,
@@ -76,7 +83,9 @@
"need_subspace"_a,
"diag_ethr"_a,
"scf_type"_a,
"comm_info"_a)
"comm_info"_a,
"diago_subspace"_a,
"nb2d"_a)
.def("set_psi", &py_hsolver::PyDiagoDavSubspace::set_psi, R"pbdoc(
Set the initial guess of the eigenvectors, i.e. the wave functions.
)pbdoc", "psi_in"_a)
13 changes: 11 additions & 2 deletions python/pyabacus/src/pyabacus/hsolver/_hsolver.py
@@ -34,7 +34,9 @@ def dav_subspace(
max_iter: int = 1000,
need_subspace: bool = False,
diag_ethr: Union[List[float], None] = None,
scf_type: bool = False
scf_type: bool = False,
diag_subspace: int = 0,
nb2d: int = 0
) -> Tuple[NDArray[np.float64], NDArray[np.complex128]]:
""" A function to diagonalize a matrix using the Davidson-Subspace method.

@@ -67,6 +69,11 @@
If True, the initial precision of eigenvalue calculation can be coarse.
If False, it indicates a non-self-consistent field (non-SCF) calculation,
where high precision in eigenvalue calculation is required from the start.
diag_subspace : int, optional
The method used to diagonalize the subspace, by default 0.
0: LAPACK, 1: GenELPA, 2: ScaLAPACK.
nb2d : int, optional
The block size of the 2D block-cyclic distribution, by default 0, in which case it is set automatically.

Returns
-------
@@ -101,7 +108,9 @@
need_subspace,
diag_ethr,
scf_type,
comm_info
comm_info,
diag_subspace,
nb2d
)

e = _diago_obj_dav_subspace.get_eigenvalue()
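A minimal usage sketch of the new keywords follows; the leading positional arguments (operator, initial guess, dimension, number of bands, preconditioner) and the import path are assumptions inferred from this diff, not a verified signature:

```python
import numpy as np
from pyabacus import hsolver  # assumed import path

n, nband = 200, 8
rng = np.random.default_rng(0)
A = rng.standard_normal((n, n)) + 1j * rng.standard_normal((n, n))
H = (A + A.conj().T) / 2  # Hermitian test matrix

def mm_op(x):
    # block matrix-vector product expected by dav_subspace
    return H @ x

v0 = np.eye(n, nband, dtype=np.complex128)   # initial guess vectors
precond = np.abs(np.diag(H).real) + 1.0      # crude diagonal preconditioner

e, v = hsolver.dav_subspace(
    mm_op, v0, n, nband, precond,
    diag_subspace=2,  # 2: solve the subspace problem with ScaLAPACK
    nb2d=0,           # 0: the 2D block size is chosen automatically
)
print(e)
```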
2 changes: 2 additions & 0 deletions source/Makefile.Objects
@@ -338,6 +338,8 @@ OBJS_HSOLVER=diago_cg.o\
math_kernel_op.o\
dngvd_op.o\
diag_const_nums.o\
diag_hs_para.o\
diago_pxxxgvx.o\

OBJS_HSOLVER_LCAO=hsolver_lcao.o\
diago_scalapack.o\
2 changes: 1 addition & 1 deletion source/module_base/blacs_connector.h
@@ -39,7 +39,7 @@ extern "C"
// Informational and Miscellaneous
void Cblacs_gridinfo(int icontxt, int* nprow, int *npcol, int *myprow, int *mypcol);
void Cblacs_gridinit(int* icontxt, char* layout, int nprow, int npcol);
void Cblacs_gridexit(int* icontxt);
void Cblacs_gridexit(int icontxt);
int Cblacs_pnum(int icontxt, int prow, int pcol);
void Cblacs_pcoord(int icontxt, int pnum, int *prow, int *pcol);
void Cblacs_exit(int icontxt);
14 changes: 14 additions & 0 deletions source/module_base/scalapack_connector.h
@@ -80,12 +80,26 @@ extern "C"
const double* vl, const double* vu, const int* il, const int* iu,
const double* abstol, int* m, int* nz, double* w, const double*orfac, double* Z, const int* iz, const int* jz, const int*descz,
double* work, int* lwork, int*iwork, int*liwork, int* ifail, int*iclustr, double*gap, int* info);

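// Generalized eigensolver expert drivers for the remaining type/precision
// combinations, mirroring pdsygvx_ above: pzhegvx_ (complex double),
// pssygvx_ (real single), pchegvx_ (complex single).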
void pzhegvx_(const int* itype, const char* jobz, const char* range, const char* uplo,
const int* n, std::complex<double>* A, const int* ia, const int* ja, const int*desca, std::complex<double>* B, const int* ib, const int* jb, const int*descb,
const double* vl, const double* vu, const int* il, const int* iu,
const double* abstol, int* m, int* nz, double* w, const double*orfac, std::complex<double>* Z, const int* iz, const int* jz, const int*descz,
std::complex<double>* work, int* lwork, double* rwork, int* lrwork, int*iwork, int*liwork, int* ifail, int*iclustr, double*gap, int* info);

void pssygvx_(const int* itype, const char* jobz, const char* range, const char* uplo,
const int* n, float* A, const int* ia, const int* ja, const int*desca, float* B, const int* ib, const int* jb, const int*descb,
const float* vl, const float* vu, const int* il, const int* iu,
const float* abstol, int* m, int* nz, float* w, const float*orfac, float* Z, const int* iz, const int* jz, const int*descz,
float* work, int* lwork, int*iwork, int*liwork, int* ifail, int*iclustr, float*gap, int* info);

void pchegvx_(const int* itype, const char* jobz, const char* range, const char* uplo,
const int* n, std::complex<float>* A, const int* ia, const int* ja, const int*desca, std::complex<float>* B, const int* ib, const int* jb, const int*descb,
const float* vl, const float* vu, const int* il, const int* iu,
const float* abstol, int* m, int* nz, float* w, const float*orfac, std::complex<float>* Z, const int* iz, const int* jz, const int*descz,
std::complex<float>* work, int* lwork, float* rwork, int* lrwork, int*iwork, int*liwork, int* ifail, int*iclustr, float*gap, int* info);


void pzgetri_(
const int *n,
const std::complex<double> *A, const int *ia, const int *ja, const int *desca,
3 changes: 3 additions & 0 deletions source/module_hsolver/CMakeLists.txt
@@ -9,6 +9,9 @@ list(APPEND objects
hsolver_pw_sdft.cpp
diago_iter_assist.cpp
hsolver.cpp
diago_pxxxgvx.cpp
diag_hs_para.cpp

)

if(ENABLE_LCAO)
205 changes: 205 additions & 0 deletions source/module_hsolver/diag_hs_para.cpp
@@ -0,0 +1,205 @@
#include "module_hsolver/diag_hs_para.h"

#include "module_base/scalapack_connector.h"
#include "module_base/parallel_2d.h"
#include "module_hsolver/diago_pxxxgvx.h"

#ifdef __ELPA
#include "module_hsolver/genelpa/elpa_solver.h"
#endif

#include <iostream>

namespace hsolver
{

#ifdef __ELPA
void elpa_diag(MPI_Comm comm,
const int nband,
std::complex<double>* h_local,
std::complex<double>* s_local,
double* ekb,
std::complex<double>* wfc_2d,
Parallel_2D& para2d_local)
{
int DecomposedState = 0;
ELPA_Solver es(false, comm, nband, para2d_local.get_row_size(), para2d_local.get_col_size(), para2d_local.desc);
es.generalized_eigenvector(h_local, s_local, DecomposedState, ekb, wfc_2d);
es.exit();
}

void elpa_diag(MPI_Comm comm,
const int nband,
double* h_local,
double* s_local,
double* ekb,
double* wfc_2d,
Parallel_2D& para2d_local)
{
int DecomposedState = 0;
ELPA_Solver es(true, comm, nband, para2d_local.get_row_size(), para2d_local.get_col_size(), para2d_local.desc);
es.generalized_eigenvector(h_local, s_local, DecomposedState, ekb, wfc_2d);
es.exit();
}

void elpa_diag(MPI_Comm comm,
const int nband,
std::complex<float>* h_local,
std::complex<float>* s_local,
float* ekb,
std::complex<float>* wfc_2d,
Parallel_2D& para2d_local)
{
std::cout << "Error: ELPA do not support single precision. " << std::endl;
exit(1);
}

void elpa_diag(MPI_Comm comm,
const int nband,
float* h_local,
float* s_local,
float* ekb,
float* wfc_2d,
Parallel_2D& para2d_local)
{
std::cout << "Error: ELPA do not support single precision. " << std::endl;
exit(1);
}

#endif

#ifdef __MPI

template <typename T>
void Diago_HS_para(T* h,
T* s,
const int lda,
const int nband,
typename GetTypeReal<T>::type* const ekb,
T* const wfc,
const MPI_Comm& comm,
const int diag_subspace,
const int block_size)
{
int myrank = 0;
MPI_Comm_rank(comm, &myrank);
Parallel_2D para2d_global;
Parallel_2D para2d_local;
para2d_global.init(lda, lda, lda, comm);
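// With block size = lda, this global layout stores the entire lda x lda
// matrix in a single block, owned by the process at grid coordinate (0, 0).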

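// Choose the 2D block-cyclic block size: take the caller's block_size if it
// is positive, otherwise fall back to a heuristic based on the problem size.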
int max_nb = block_size;
if (block_size == 0)
{
if (nband > 500)
{
max_nb = 32;
}
else
{
max_nb = 16;
}
}
else if (block_size < 0)
{
std::cout << "Error: block_size in diago_subspace should be a positive integer. " << std::endl;
exit(1);
}

// For GenELPA, a block size so large that some cores hold no local data causes an error, so cap it here.
if (diag_subspace == 1)
{
if (max_nb * (std::max(para2d_global.dim0, para2d_global.dim1) - 1) >= lda)
{
max_nb = lda / std::max(para2d_global.dim0, para2d_global.dim1);
}
}

para2d_local.init(lda, lda, max_nb, comm);
std::vector<T> h_local(para2d_local.get_col_size() * para2d_local.get_row_size());
std::vector<T> s_local(para2d_local.get_col_size() * para2d_local.get_row_size());
std::vector<T> wfc_2d(para2d_local.get_col_size() * para2d_local.get_row_size());

// redistribute h and s from the global single-block layout to the 2D block-cyclic layout
Cpxgemr2d(lda, lda, h, 1, 1, para2d_global.desc, h_local.data(), 1, 1, para2d_local.desc, para2d_local.blacs_ctxt);
Cpxgemr2d(lda, lda, s, 1, 1, para2d_global.desc, s_local.data(), 1, 1, para2d_local.desc, para2d_local.blacs_ctxt);

if (diag_subspace == 1)
{
#ifdef __ELPA
elpa_diag(comm, nband, h_local.data(), s_local.data(), ekb, wfc_2d.data(), para2d_local);
#else
std::cout << "ERROR: try to use ELPA to solve the generalized eigenvalue problem, but ELPA is not compiled. "
<< std::endl;
exit(1);
#endif
}
else if (diag_subspace == 2)
{
hsolver::pxxxgvx_diag(para2d_local.desc,
para2d_local.get_row_size(),
para2d_local.get_col_size(),
nband,
h_local.data(),
s_local.data(),
ekb,
wfc_2d.data());
}
else
{
std::cout << "Error: parallel diagonalization method is not supported. " << "diag_subspace = " << diag_subspace
<< std::endl;
exit(1);
}

// gather the 2D-distributed eigenvectors back into the global layout
Cpxgemr2d(lda, lda, wfc_2d.data(), 1, 1, para2d_local.desc, wfc, 1, 1, para2d_global.desc, para2d_local.blacs_ctxt);

// free the context
Cblacs_gridexit(para2d_local.blacs_ctxt);
Cblacs_gridexit(para2d_global.blacs_ctxt);
}

// template instantiation
template void Diago_HS_para<double>(double* h,
double* s,
const int lda,
const int nband,
typename GetTypeReal<double>::type* const ekb,
double* const wfc,
const MPI_Comm& comm,
const int diag_subspace,
const int block_size);

template void Diago_HS_para<std::complex<double>>(std::complex<double>* h,
std::complex<double>* s,
const int lda,
const int nband,
typename GetTypeReal<std::complex<double>>::type* const ekb,
std::complex<double>* const wfc,
const MPI_Comm& comm,
const int diag_subspace,
const int block_size);

template void Diago_HS_para<float>(float* h,
float* s,
const int lda,
const int nband,
typename GetTypeReal<float>::type* const ekb,
float* const wfc,
const MPI_Comm& comm,
const int diag_subspace,
const int block_size);

template void Diago_HS_para<std::complex<float>>(std::complex<float>* h,
std::complex<float>* s,
const int lda,
const int nband,
typename GetTypeReal<std::complex<float>>::type* const ekb,
std::complex<float>* const wfc,
const MPI_Comm& comm,
const int diag_subspace,
const int block_size);

#endif

} // namespace hsolver