diff --git a/docs/advanced/input_files/input-main.md b/docs/advanced/input_files/input-main.md
index 7eb1b382a0..96ce3a3864 100644
--- a/docs/advanced/input_files/input-main.md
+++ b/docs/advanced/input_files/input-main.md
@@ -21,6 +21,7 @@
   - [kspacing](#kspacing)
   - [min\_dist\_coef](#min_dist_coef)
   - [device](#device)
+  - [nb2d](#nb2d)
   - [precision](#precision)
 - [Variables related to input files](#variables-related-to-input-files)
   - [stru\_file](#stru_file)
@@ -40,12 +41,12 @@
   - [diago\_smooth\_ethr](#diago_smooth_ethr)
   - [pw\_diag\_nmax](#pw_diag_nmax)
   - [pw\_diag\_ndim](#pw_diag_ndim)
+  - [diag\_subspace](#diag_subspace)
   - [erf\_ecut](#erf_ecut)
   - [fft\_mode](#fft_mode)
   - [erf\_height](#erf_height)
   - [erf\_sigma](#erf_sigma)
 - [Numerical atomic orbitals related variables](#numerical-atomic-orbitals-related-variables)
-  - [nb2d](#nb2d)
   - [lmaxmax](#lmaxmax)
   - [lcao\_ecut](#lcao_ecut)
   - [lcao\_dk](#lcao_dk)
@@ -667,6 +668,19 @@ If only one value is set (such as `kspacing 0.5`), then kspacing values of a/b/c
   - cg/bpcg/dav ks_solver: required by the `single` precision options
 - **Default**: double
 
+### nb2d
+
+- **Type**: Integer
+- **Description**: When using ELPA or ScaLAPACK to solve the eigenvalue problem, the data are distributed in a two-dimensional block-cyclic manner. This parameter specifies the block size. It takes effect when:
+  - [ks_solver](#ks_solver) is genelpa or scalapack_gvx. If nb2d is set to 0, it will be set automatically according to the size of the atomic orbital basis:
+    - if size <= 500: nb2d = 1
+    - if 500 < size <= 1000: nb2d = 32
+    - if size > 1000: nb2d = 64
+  - [ks_solver](#ks_solver) is dav_subspace and [diag_subspace](#diag_subspace) is 1 or 2. In this case nb2d is the block size for the subspace diagonalization. If it is set to 0, it will be set automatically according to the number of bands:
+    - if the number of bands > 500: nb2d = 32
+    - if the number of bands <= 500: nb2d = 16
+- **Default**: 0
+
 [back to top](#full-list-of-input-keywords)
 
 ## Variables related to input files
@@ -794,7 +808,18 @@ These variables are used to control the plane wave related parameters.
 
 - **Type**: Integer
 - **Description**: Only useful when you use `ks_solver = dav` or `ks_solver = dav_subspace`. It indicates dimension of workspace(number of wavefunction packets, at least 2 needed) for the Davidson method. A larger value may yield a smaller number of iterations in the algorithm but uses more memory and more CPU time in subspace diagonalization.
-- **Default**: 4
+- **Default**: 4
+
+### diag_subspace
+
+- **Type**: Integer
+- **Description**: The method used to diagonalize the subspace in the dav_subspace method. The available options are:
+  - 0: by LAPACK
+  - 1: by GenELPA
+  - 2: by ScaLAPACK
+  LAPACK solves on a single core, while GenELPA and ScaLAPACK solve in parallel. For small systems (e.g., fewer than 100 bands), LAPACK is recommended. For large systems run with MPI parallelization, GenELPA or ScaLAPACK is recommended, and GenELPA usually has better performance. For GenELPA and ScaLAPACK, the block size can be set by [nb2d](#nb2d).
+
+- **Default**: 0
 
 ### erf_ecut
 
@@ -837,15 +862,6 @@ These variables are used to control the plane wave related parameters.
 
 These variables are used to control the numerical atomic orbitals related parameters.
 
-### nb2d - -- **Type**: Integer -- **Description**: In LCAO calculations, we arrange the total number of processors in an 2D array, so that we can partition the wavefunction matrix (number of bands*total size of atomic orbital basis) and distribute them in this 2D array. When the system is large, we group processors into sizes of nb2d, so that multiple processors take care of one row block (a group of atomic orbitals) in the wavefunction matrix. If set to 0, nb2d will be automatically set in the program according to the size of atomic orbital basis: - - if size <= 500 : nb2d = 1 - - if 500 < size <= 1000 : nb2d = 32 - - if size > 1000 : nb2d = 64; -- **Default**: 0 - ### lmaxmax - **Type**: Integer diff --git a/python/pyabacus/src/hsolver/CMakeLists.txt b/python/pyabacus/src/hsolver/CMakeLists.txt index 853973e653..f0f04f97a7 100644 --- a/python/pyabacus/src/hsolver/CMakeLists.txt +++ b/python/pyabacus/src/hsolver/CMakeLists.txt @@ -5,6 +5,9 @@ list(APPEND _diago ${HSOLVER_PATH}/diago_cg.cpp ${HSOLVER_PATH}/diag_const_nums.cpp ${HSOLVER_PATH}/diago_iter_assist.cpp + ${HSOLVER_PATH}/diag_hs_para.cpp + ${HSOLVER_PATH}/diago_pxxxgvx.cpp + ${HSOLVER_PATH}/kernels/dngvd_op.cpp ${HSOLVER_PATH}/kernels/math_kernel_op.cpp diff --git a/python/pyabacus/src/hsolver/py_diago_dav_subspace.hpp b/python/pyabacus/src/hsolver/py_diago_dav_subspace.hpp index 112eb836e2..897d8d668e 100644 --- a/python/pyabacus/src/hsolver/py_diago_dav_subspace.hpp +++ b/python/pyabacus/src/hsolver/py_diago_dav_subspace.hpp @@ -108,7 +108,9 @@ class PyDiagoDavSubspace bool need_subspace, std::vector& diag_ethr, bool scf_type, - hsolver::diag_comm_info comm_info + hsolver::diag_comm_info comm_info, + int diag_subspace, + int nb2d ) { auto hpsi_func = [mm_op] ( std::complex *psi_in, @@ -138,7 +140,9 @@ class PyDiagoDavSubspace tol, max_iter, need_subspace, - comm_info + comm_info, + diag_subspace, + nb2d ); return obj->diag(hpsi_func, psi, nbasis, eigenvalue, diag_ethr, scf_type); diff --git a/python/pyabacus/src/hsolver/py_hsolver.cpp b/python/pyabacus/src/hsolver/py_hsolver.cpp index dfe7063ea4..e791fe9f09 100644 --- a/python/pyabacus/src/hsolver/py_hsolver.cpp +++ b/python/pyabacus/src/hsolver/py_hsolver.cpp @@ -67,6 +67,13 @@ void bind_hsolver(py::module& m) where the initial precision of eigenvalue calculation can be coarse. If false, it indicates a non-self-consistent field (non-SCF) calculation, where high precision in eigenvalue calculation is required from the start. + comm_info : diag_comm_info + The communicator information. + diago_subspace : int + The method to solve the generalized eigenvalue problem. + 0: LAPACK, 1: Gen-ELPA, 2: ScaLAPACK + nb2d : int + The block size in 2d block cyclic distribution if use elpa or scalapack. )pbdoc", "mm_op"_a, "precond_vec"_a, @@ -76,7 +83,9 @@ void bind_hsolver(py::module& m) "need_subspace"_a, "diag_ethr"_a, "scf_type"_a, - "comm_info"_a) + "comm_info"_a, + "diago_subspace"_a, + "nb2d"_a) .def("set_psi", &py_hsolver::PyDiagoDavSubspace::set_psi, R"pbdoc( Set the initial guess of the eigenvectors, i.e. the wave functions. 
)pbdoc", "psi_in"_a) diff --git a/python/pyabacus/src/pyabacus/hsolver/_hsolver.py b/python/pyabacus/src/pyabacus/hsolver/_hsolver.py index 5d2b11d99a..7c60ce538c 100644 --- a/python/pyabacus/src/pyabacus/hsolver/_hsolver.py +++ b/python/pyabacus/src/pyabacus/hsolver/_hsolver.py @@ -34,7 +34,9 @@ def dav_subspace( max_iter: int = 1000, need_subspace: bool = False, diag_ethr: Union[List[float], None] = None, - scf_type: bool = False + scf_type: bool = False, + diag_subspace: int = 0, + nb2d: int = 0 ) -> Tuple[NDArray[np.float64], NDArray[np.complex128]]: """ A function to diagonalize a matrix using the Davidson-Subspace method. @@ -67,6 +69,11 @@ def dav_subspace( If True, the initial precision of eigenvalue calculation can be coarse. If False, it indicates a non-self-consistent field (non-SCF) calculation, where high precision in eigenvalue calculation is required from the start. + diag_subspace : int, optional + The method to do the diagonalization, by default 0. + 0: LAPACK, 1: Gen-elpa, 2: Scalapack + nb2d : int, optional + The block size for 2D decomposition, by default 0, which will be automatically set. Returns ------- @@ -101,7 +108,9 @@ def dav_subspace( need_subspace, diag_ethr, scf_type, - comm_info + comm_info, + diag_subspace, + nb2d ) e = _diago_obj_dav_subspace.get_eigenvalue() diff --git a/source/Makefile.Objects b/source/Makefile.Objects index 76e51cfbce..fcc69cc64e 100644 --- a/source/Makefile.Objects +++ b/source/Makefile.Objects @@ -339,6 +339,8 @@ OBJS_HSOLVER=diago_cg.o\ math_kernel_op.o\ dngvd_op.o\ diag_const_nums.o\ + diag_hs_para.o\ + diago_pxxxgvx.o\ OBJS_HSOLVER_LCAO=hsolver_lcao.o\ diago_scalapack.o\ diff --git a/source/module_base/blacs_connector.h b/source/module_base/blacs_connector.h index 3bcc43811a..61c67324e8 100644 --- a/source/module_base/blacs_connector.h +++ b/source/module_base/blacs_connector.h @@ -39,7 +39,7 @@ extern "C" // Informational and Miscellaneous void Cblacs_gridinfo(int icontxt, int* nprow, int *npcol, int *myprow, int *mypcol); void Cblacs_gridinit(int* icontxt, char* layout, int nprow, int npcol); - void Cblacs_gridexit(int* icontxt); + void Cblacs_gridexit(int icontxt); int Cblacs_pnum(int icontxt, int prow, int pcol); void Cblacs_pcoord(int icontxt, int pnum, int *prow, int *pcol); void Cblacs_exit(int icontxt); diff --git a/source/module_base/scalapack_connector.h b/source/module_base/scalapack_connector.h index 8e0798b0d8..ebd1946aac 100644 --- a/source/module_base/scalapack_connector.h +++ b/source/module_base/scalapack_connector.h @@ -80,12 +80,26 @@ extern "C" const double* vl, const double* vu, const int* il, const int* iu, const double* abstol, int* m, int* nz, double* w, const double*orfac, double* Z, const int* iz, const int* jz, const int*descz, double* work, int* lwork, int*iwork, int*liwork, int* ifail, int*iclustr, double*gap, int* info); + void pzhegvx_(const int* itype, const char* jobz, const char* range, const char* uplo, const int* n, std::complex* A, const int* ia, const int* ja, const int*desca, std::complex* B, const int* ib, const int* jb, const int*descb, const double* vl, const double* vu, const int* il, const int* iu, const double* abstol, int* m, int* nz, double* w, const double*orfac, std::complex* Z, const int* iz, const int* jz, const int*descz, std::complex* work, int* lwork, double* rwork, int* lrwork, int*iwork, int*liwork, int* ifail, int*iclustr, double*gap, int* info); + void pssygvx_(const int* itype, const char* jobz, const char* range, const char* uplo, + const int* n, float* A, const int* ia, 
const int* ja, const int*desca, float* B, const int* ib, const int* jb, const int*descb, + const float* vl, const float* vu, const int* il, const int* iu, + const float* abstol, int* m, int* nz, float* w, const float*orfac, float* Z, const int* iz, const int* jz, const int*descz, + float* work, int* lwork, int*iwork, int*liwork, int* ifail, int*iclustr, float*gap, int* info); + + void pchegvx_(const int* itype, const char* jobz, const char* range, const char* uplo, + const int* n, std::complex* A, const int* ia, const int* ja, const int*desca, std::complex* B, const int* ib, const int* jb, const int*descb, + const float* vl, const float* vu, const int* il, const int* iu, + const float* abstol, int* m, int* nz, float* w, const float*orfac, std::complex* Z, const int* iz, const int* jz, const int*descz, + std::complex* work, int* lwork, float* rwork, int* lrwork, int*iwork, int*liwork, int* ifail, int*iclustr, float*gap, int* info); + + void pzgetri_( const int *n, const std::complex *A, const int *ia, const int *ja, const int *desca, diff --git a/source/module_hsolver/CMakeLists.txt b/source/module_hsolver/CMakeLists.txt index 8e1ff69905..93a708f21d 100644 --- a/source/module_hsolver/CMakeLists.txt +++ b/source/module_hsolver/CMakeLists.txt @@ -9,6 +9,9 @@ list(APPEND objects hsolver_pw_sdft.cpp diago_iter_assist.cpp hsolver.cpp + diago_pxxxgvx.cpp + diag_hs_para.cpp + ) if(ENABLE_LCAO) diff --git a/source/module_hsolver/diag_hs_para.cpp b/source/module_hsolver/diag_hs_para.cpp new file mode 100644 index 0000000000..43d25a3112 --- /dev/null +++ b/source/module_hsolver/diag_hs_para.cpp @@ -0,0 +1,205 @@ +#include "module_hsolver/diag_hs_para.h" + +#include "module_base/scalapack_connector.h" +#include "module_base/parallel_2d.h" +#include "module_hsolver/diago_pxxxgvx.h" + +#ifdef __ELPA +#include "module_hsolver/genelpa/elpa_solver.h" +#endif + +#include + +namespace hsolver +{ + +#ifdef __ELPA +void elpa_diag(MPI_Comm comm, + const int nband, + std::complex* h_local, + std::complex* s_local, + double* ekb, + std::complex* wfc_2d, + Parallel_2D& para2d_local) +{ + int DecomposedState = 0; + ELPA_Solver es(false, comm, nband, para2d_local.get_row_size(), para2d_local.get_col_size(), para2d_local.desc); + es.generalized_eigenvector(h_local, s_local, DecomposedState, ekb, wfc_2d); + es.exit(); +} + +void elpa_diag(MPI_Comm comm, + const int nband, + double* h_local, + double* s_local, + double* ekb, + double* wfc_2d, + Parallel_2D& para2d_local) +{ + int DecomposedState = 0; + ELPA_Solver es(true, comm, nband, para2d_local.get_row_size(), para2d_local.get_col_size(), para2d_local.desc); + es.generalized_eigenvector(h_local, s_local, DecomposedState, ekb, wfc_2d); + es.exit(); +} + +void elpa_diag(MPI_Comm comm, + const int nband, + std::complex* h_local, + std::complex* s_local, + float* ekb, + std::complex* wfc_2d, + Parallel_2D& para2d_local) +{ + std::cout << "Error: ELPA do not support single precision. " << std::endl; + exit(1); +} + +void elpa_diag(MPI_Comm comm, + const int nband, + float* h_local, + float* s_local, + float* ekb, + float* wfc_2d, + Parallel_2D& para2d_local) +{ + std::cout << "Error: ELPA do not support single precision. 
" << std::endl; + exit(1); +} + +#endif + +#ifdef __MPI + +template +void diago_hs_para(T* h, + T* s, + const int lda, + const int nband, + typename GetTypeReal::type* const ekb, + T* const wfc, + const MPI_Comm& comm, + const int diag_subspace, + const int block_size) +{ + int myrank = 0; + MPI_Comm_rank(comm, &myrank); + Parallel_2D para2d_global; + Parallel_2D para2d_local; + para2d_global.init(lda, lda, lda, comm); + + int max_nb = block_size; + if (block_size == 0) + { + if (nband > 500) + { + max_nb = 32; + } + else + { + max_nb = 16; + } + } + else if (block_size < 0) + { + std::cout << "Error: block_size in diago_subspace should be a positive integer. " << std::endl; + exit(1); + } + + // for genelpa, if the block size is too large that some cores have no data, then it will cause error. + if (diag_subspace == 1) + { + if (max_nb * (std::max(para2d_global.dim0, para2d_global.dim1) - 1) >= lda) + { + max_nb = lda / std::max(para2d_global.dim0, para2d_global.dim1); + } + } + + para2d_local.init(lda, lda, max_nb, comm); + std::vector h_local(para2d_local.get_col_size() * para2d_local.get_row_size()); + std::vector s_local(para2d_local.get_col_size() * para2d_local.get_row_size()); + std::vector wfc_2d(para2d_local.get_col_size() * para2d_local.get_row_size()); + + // distribute h and s to 2D + Cpxgemr2d(lda, lda, h, 1, 1, para2d_global.desc, h_local.data(), 1, 1, para2d_local.desc, para2d_local.blacs_ctxt); + Cpxgemr2d(lda, lda, s, 1, 1, para2d_global.desc, s_local.data(), 1, 1, para2d_local.desc, para2d_local.blacs_ctxt); + + if (diag_subspace == 1) + { +#ifdef __ELPA + elpa_diag(comm, nband, h_local.data(), s_local.data(), ekb, wfc_2d.data(), para2d_local); +#else + std::cout << "ERROR: try to use ELPA to solve the generalized eigenvalue problem, but ELPA is not compiled. " + << std::endl; + exit(1); +#endif + } + else if (diag_subspace == 2) + { + hsolver::pxxxgvx_diag(para2d_local.desc, + para2d_local.get_row_size(), + para2d_local.get_col_size(), + nband, + h_local.data(), + s_local.data(), + ekb, + wfc_2d.data()); + } + else + { + std::cout << "Error: parallel diagonalization method is not supported. 
" << "diag_subspace = " << diag_subspace + << std::endl; + exit(1); + } + + // gather wfc + Cpxgemr2d(lda, lda, wfc_2d.data(), 1, 1, para2d_local.desc, wfc, 1, 1, para2d_global.desc, para2d_local.blacs_ctxt); + + // free the context + Cblacs_gridexit(para2d_local.blacs_ctxt); + Cblacs_gridexit(para2d_global.blacs_ctxt); +} + +// template instantiation +template void diago_hs_para(double* h, + double* s, + const int lda, + const int nband, + typename GetTypeReal::type* const ekb, + double* const wfc, + const MPI_Comm& comm, + const int diag_subspace, + const int block_size); + +template void diago_hs_para>(std::complex* h, + std::complex* s, + const int lda, + const int nband, + typename GetTypeReal>::type* const ekb, + std::complex* const wfc, + const MPI_Comm& comm, + const int diag_subspace, + const int block_size); + +template void diago_hs_para(float* h, + float* s, + const int lda, + const int nband, + typename GetTypeReal::type* const ekb, + float* const wfc, + const MPI_Comm& comm, + const int diag_subspace, + const int block_size); + +template void diago_hs_para>(std::complex* h, + std::complex* s, + const int lda, + const int nband, + typename GetTypeReal>::type* const ekb, + std::complex* const wfc, + const MPI_Comm& comm, + const int diag_subspace, + const int block_size); + +#endif + +} // namespace hsolver \ No newline at end of file diff --git a/source/module_hsolver/diag_hs_para.h b/source/module_hsolver/diag_hs_para.h new file mode 100644 index 0000000000..659f3b0781 --- /dev/null +++ b/source/module_hsolver/diag_hs_para.h @@ -0,0 +1,46 @@ +#include "module_base/parallel_2d.h" +#include "module_base/macros.h" + +#ifdef __MPI +#include +#endif + +namespace hsolver +{ + + +#ifdef __MPI + +/** + * @brief Parallel do the generalized eigenvalue problem + * + * @tparam T double or complex or float or complex + * @param H the hermitian matrix H. + * @param S the overlap matrix S. + * @param lda the leading dimension of H and S + * @param nband the number of bands to be calculated + * @param ekb to store the eigenvalues. + * @param wfc to store the eigenvectors + * @param comm the communicator + * @param diag_subspace the method to solve the generalized eigenvalue problem + * @param block_size the block size in 2d block cyclic distribution if use elpa or scalapack. + * + * @note 1. h and s should be full matrix in rank 0 of the communicator, and the other ranks is not concerned. + * @note 2. wfc is complete in rank 0, and not store in other ranks. + * @note 3. diag_subspace should be 1: by elpa, 2: by scalapack + * @note 4. block_size should be 0 or a positive integer. 
If it is 0, then will use a value as large as possible that is allowed + */ +template +void diago_hs_para(T* h, + T* s, + const int lda, + const int nband, + typename GetTypeReal::type* const ekb, + T* const wfc, + const MPI_Comm& comm, + const int diag_subspace, + const int block_size = 0); +#endif + +} // namespace hsolver + \ No newline at end of file diff --git a/source/module_hsolver/diago_dav_subspace.cpp b/source/module_hsolver/diago_dav_subspace.cpp index b180d72c13..82dadcb0d0 100644 --- a/source/module_hsolver/diago_dav_subspace.cpp +++ b/source/module_hsolver/diago_dav_subspace.cpp @@ -8,8 +8,14 @@ #include "module_hsolver/kernels/math_kernel_op.h" #include "module_base/kernels/dsp/dsp_connector.h" +#include "module_hsolver/diag_hs_para.h" + #include +#ifdef __MPI +#include +#endif + using namespace hsolver; template @@ -20,9 +26,12 @@ Diago_DavSubspace::Diago_DavSubspace(const std::vector& precond const double& diag_thr_in, const int& diag_nmax_in, const bool& need_subspace_in, - const diag_comm_info& diag_comm_in) + const diag_comm_info& diag_comm_in, + const int diag_subspace_in, + const int diago_subspace_bs_in) : precondition(precondition_in), n_band(nband_in), dim(nbasis_in), nbase_x(nband_in * david_ndim_in), - diag_thr(diag_thr_in), iter_nmax(diag_nmax_in), is_subspace(need_subspace_in), diag_comm(diag_comm_in) + diag_thr(diag_thr_in), iter_nmax(diag_nmax_in), is_subspace(need_subspace_in), diag_comm(diag_comm_in), + diag_subspace(diag_subspace_in), diago_subspace_bs(diago_subspace_bs_in) { this->device = base_device::get_device_type(this->ctx); @@ -32,6 +41,7 @@ Diago_DavSubspace::Diago_DavSubspace(const std::vector& precond assert(david_ndim_in > 1); assert(david_ndim_in * nband_in < nbasis_in * this->diag_comm.nproc); + assert(diag_subspace >= 0 && diag_subspace < 3); // TODO: Added memory usage statistics @@ -526,14 +536,13 @@ void Diago_DavSubspace::diag_zhegvx(const int& nbase, T* vcc) { ModuleBase::timer::tick("Diago_DavSubspace", "diag_zhegvx"); + assert(nbase_x >= std::max(1, nbase)); - if (this->diag_comm.rank == 0) + if (this->device == base_device::GpuDevice) { - assert(nbase_x >= std::max(1, nbase)); - - if (this->device == base_device::GpuDevice) - { #if defined(__CUDA) || defined(__ROCM) + if (this->diag_comm.rank == 0) + { Real* eigenvalue_gpu = nullptr; resmem_real_op()(this->ctx, eigenvalue_gpu, this->nbase_x); @@ -562,46 +571,96 @@ void Diago_DavSubspace::diag_zhegvx(const int& nbase, syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, (*eigenvalue_iter).data(), eigenvalue_gpu, this->nbase_x); delmem_real_op()(this->ctx, eigenvalue_gpu); -#endif } - else +#endif + } + else + { + if (this->diag_subspace == 0) { - std::vector> h_diag(nbase, std::vector(nbase, *this->zero)); - std::vector> s_diag(nbase, std::vector(nbase, *this->zero)); - - for (size_t i = 0; i < nbase; i++) + if (this->diag_comm.rank == 0) { - for (size_t j = 0; j < nbase; j++) + std::vector> h_diag(nbase, std::vector(nbase, *this->zero)); + std::vector> s_diag(nbase, std::vector(nbase, *this->zero)); + + for (size_t i = 0; i < nbase; i++) { - h_diag[i][j] = hcc[i * this->nbase_x + j]; - s_diag[i][j] = scc[i * this->nbase_x + j]; + for (size_t j = 0; j < nbase; j++) + { + h_diag[i][j] = hcc[i * this->nbase_x + j]; + s_diag[i][j] = scc[i * this->nbase_x + j]; + } + } + dngvx_op()(this->ctx, + nbase, + this->nbase_x, + this->hcc, + this->scc, + nband, + (*eigenvalue_iter).data(), + this->vcc); + // reset: + for (size_t i = 0; i < nbase; i++) + { + for (size_t j = 0; j < nbase; j++) + { + hcc[i 
* this->nbase_x + j] = h_diag[i][j]; + scc[i * this->nbase_x + j] = s_diag[i][j]; + } + + for (size_t j = nbase; j < this->nbase_x; j++) + { + hcc[i * this->nbase_x + j] = *this->zero; + hcc[j * this->nbase_x + i] = *this->zero; + scc[i * this->nbase_x + j] = *this->zero; + scc[j * this->nbase_x + i] = *this->zero; + } } } - dngvx_op()(this->ctx, - nbase, - this->nbase_x, - this->hcc, - this->scc, - nband, - (*eigenvalue_iter).data(), - this->vcc); - // reset: - for (size_t i = 0; i < nbase; i++) + } + else + { +#ifdef __MPI + std::vector h_diag; + std::vector s_diag; + std::vector vcc_tmp; + if (this->diag_comm.rank == 0) { - for (size_t j = 0; j < nbase; j++) + h_diag.resize(nbase * nbase, *this->zero); + s_diag.resize(nbase * nbase, *this->zero); + vcc_tmp.resize(nbase * nbase, *this->zero); + for (size_t i = 0; i < nbase; i++) { - hcc[i * this->nbase_x + j] = h_diag[i][j]; - scc[i * this->nbase_x + j] = s_diag[i][j]; + for (size_t j = 0; j < nbase; j++) + { + h_diag[i * nbase + j] = hcc[i * this->nbase_x + j]; + s_diag[i * nbase + j] = scc[i * this->nbase_x + j]; + } } - - for (size_t j = nbase; j < this->nbase_x; j++) + } + diago_hs_para(h_diag.data(), + s_diag.data(), + nbase, + nband, + (*eigenvalue_iter).data(), + vcc_tmp.data(), + this->diag_comm.comm, + this->diag_subspace, + this->diago_subspace_bs); + if (this->diag_comm.rank == 0) + { + for (size_t i = 0; i < nband; i++) { - hcc[i * this->nbase_x + j] = *this->zero; - hcc[j * this->nbase_x + i] = *this->zero; - scc[i * this->nbase_x + j] = *this->zero; - scc[j * this->nbase_x + i] = *this->zero; + for (size_t j = 0; j < nbase; j++) + { + vcc[i * this->nbase_x + j] = vcc_tmp[i * nbase + j]; + } } } +#else + std::cout << "Error: parallel diagonalization is not supported in serial mode." << std::endl; + exit(1); +#endif } } diff --git a/source/module_hsolver/diago_dav_subspace.h b/source/module_hsolver/diago_dav_subspace.h index 622b73638c..3abbf5d9aa 100644 --- a/source/module_hsolver/diago_dav_subspace.h +++ b/source/module_hsolver/diago_dav_subspace.h @@ -31,7 +31,9 @@ class Diago_DavSubspace const double& diag_thr_in, const int& diag_nmax_in, const bool& need_subspace_in, - const diag_comm_info& diag_comm_in); + const diag_comm_info& diag_comm_in, + const int diago_dav_method_in, + const int block_size_in); ~Diago_DavSubspace(); @@ -139,6 +141,9 @@ class Diago_DavSubspace bool test_exit_cond(const int& ntry, const int& notconv, const bool& scf); + int diag_subspace; // 0: LAPACK, 1: Gen-ELPA, 2: ScaLAPACK + int diago_subspace_bs = 0; // the block size in 2d block cyclic distribution if use elpa or scalapack + #ifdef __DSP using resmem_complex_op = base_device::memory::resize_memory_op_mt; using delmem_complex_op = base_device::memory::delete_memory_op_mt; diff --git a/source/module_hsolver/diago_pxxxgvx.cpp b/source/module_hsolver/diago_pxxxgvx.cpp new file mode 100644 index 0000000000..51e29f27ff --- /dev/null +++ b/source/module_hsolver/diago_pxxxgvx.cpp @@ -0,0 +1,670 @@ +#include "diago_pxxxgvx.h" + +#include "module_base/blacs_connector.h" +#include "module_base/scalapack_connector.h" + +#include +#include +#include +#include + +namespace hsolver +{ + +#ifdef __MPI + +/** + * @file diago_pxxxgvx.cpp + * @brief This file contains functions for performing parallel diagonalization using Scalapack. + * + * The functions in this file are designed to handle different data types (double, float, std::complex, + * std::complex) and perform diagonalization using Scalapack routines (pdsygvx_, pzhegvx_, pssygvx_, pchegvx_). 
+ * + * The main functions are: + * - pxxxgvx_: Wrapper functions for Scalapack diagonalization routines. + * - pxxxgvx_post_processing: Post-processing function to handle errors and adjust parameters based on Scalapack's info + * output. + * - get_lwork: Helper functions to retrieve the optimal workspace size. + * - pxxxgvx_diag: Template function to perform diagonalization for different data types. + * + * Template instantiations for pxxxgvx_diag are provided for double, float, std::complex, and + * std::complex. + */ + +/** + * @brief Wrapper function for Scalapack's generalized eigensolver routines. + * + * @param itype Specifies the problem type to be solved. + * @param jobz Specifies whether to compute eigenvectors. + * @param range Specifies the range of eigenvalues to be found. + * @param uplo Specifies whether the upper or lower triangular part of the matrices is referenced. + * @param n The order of the matrices A and B. + * @param A The array containing the matrix A. + * @param ia The row index in the global array A. + * @param ja The column index in the global array A. + * @param desca The array descriptor for the distributed matrix A. + * @param B The array containing the matrix B. + * @param ib The row index in the global array B. + * @param jb The column index in the global array B. + * @param descb The array descriptor for the distributed matrix B. + * @param vl Lower bound of the interval to be searched for eigenvalues. + * @param vu Upper bound of the interval to be searched for eigenvalues. + * @param il Index of the smallest eigenvalue to be returned. + * @param iu Index of the largest eigenvalue to be returned. + * @param abstol The absolute error tolerance for the eigenvalues. + * @param m The total number of eigenvalues found. + * @param nz The total number of eigenvalues found in the interval (vl, vu]. + * @param w The array to store the eigenvalues. + * @param orfac The orthogonality factor. + * @param Z The array to store the eigenvectors. + * @param iz The row index in the global array Z. + * @param jz The column index in the global array Z. + * @param descz The array descriptor for the distributed matrix Z. + * @param work Workspace array. + * @param lwork The dimension of the array work. + * @param rwork Workspace array (not used in this function). + * @param lrwork The dimension of the array rwork (not used in this function). + * @param iwork Workspace array. + * @param liwork The dimension of the array iwork. + * @param ifail The array to store the indices of the eigenvectors that failed to converge. + * @param iclustr The array to store the indices of the eigenvalue clusters. + * @param gap The array to store the gaps between eigenvalue clusters. + * @param info Output status of the computation. 
+ * + * @note for a uniform interface, rwork and lrwork are input arguments, but not used in pdsygvx_/pssygvx_ + */ +void pxxxgvx_(const int* itype, + const char* jobz, + const char* range, + const char* uplo, + const int* n, + double* A, + const int* ia, + const int* ja, + const int* desca, + double* B, + const int* ib, + const int* jb, + const int* descb, + const double* vl, + const double* vu, + const int* il, + const int* iu, + const double* abstol, + int* m, + int* nz, + double* w, + const double* orfac, + double* Z, + const int* iz, + const int* jz, + const int* descz, + double* work, + int* lwork, + double* rwork, + int* lrwork, + int* iwork, + int* liwork, + int* ifail, + int* iclustr, + double* gap, + int* info) +{ + // double + pdsygvx_(itype, + jobz, + range, + uplo, + n, + A, + ia, + ja, + desca, + B, + ib, + jb, + descb, + vl, + vu, + il, + iu, + abstol, + m, + nz, + w, + orfac, + Z, + iz, + jz, + descz, + work, + lwork, + iwork, + liwork, + ifail, + iclustr, + gap, + info); +} + +void pxxxgvx_(const int* itype, + const char* jobz, + const char* range, + const char* uplo, + const int* n, + std::complex* A, + const int* ia, + const int* ja, + const int* desca, + std::complex* B, + const int* ib, + const int* jb, + const int* descb, + const double* vl, + const double* vu, + const int* il, + const int* iu, + const double* abstol, + int* m, + int* nz, + double* w, + const double* orfac, + std::complex* Z, + const int* iz, + const int* jz, + const int* descz, + std::complex* work, + int* lwork, + double* rwork, + int* lrwork, + int* iwork, + int* liwork, + int* ifail, + int* iclustr, + double* gap, + int* info) +{ + // std::complex + pzhegvx_(itype, + jobz, + range, + uplo, + n, + A, + ia, + ja, + desca, + B, + ib, + jb, + descb, + vl, + vu, + il, + iu, + abstol, + m, + nz, + w, + orfac, + Z, + iz, + jz, + descz, + work, + lwork, + rwork, + lrwork, + iwork, + liwork, + ifail, + iclustr, + gap, + info); +} + +void pxxxgvx_(const int* itype, + const char* jobz, + const char* range, + const char* uplo, + const int* n, + float* A, + const int* ia, + const int* ja, + const int* desca, + float* B, + const int* ib, + const int* jb, + const int* descb, + const float* vl, + const float* vu, + const int* il, + const int* iu, + const float* abstol, + int* m, + int* nz, + float* w, + const float* orfac, + float* Z, + const int* iz, + const int* jz, + const int* descz, + float* work, + int* lwork, + float* rwork, + int* lrwork, + int* iwork, + int* liwork, + int* ifail, + int* iclustr, + float* gap, + int* info) +{ + // float + pssygvx_(itype, + jobz, + range, + uplo, + n, + A, + ia, + ja, + desca, + B, + ib, + jb, + descb, + vl, + vu, + il, + iu, + abstol, + m, + nz, + w, + orfac, + Z, + iz, + jz, + descz, + work, + lwork, + iwork, + liwork, + ifail, + iclustr, + gap, + info); +} + +void pxxxgvx_(const int* itype, + const char* jobz, + const char* range, + const char* uplo, + const int* n, + std::complex* A, + const int* ia, + const int* ja, + const int* desca, + std::complex* B, + const int* ib, + const int* jb, + const int* descb, + const float* vl, + const float* vu, + const int* il, + const int* iu, + const float* abstol, + int* m, + int* nz, + float* w, + const float* orfac, + std::complex* Z, + const int* iz, + const int* jz, + const int* descz, + std::complex* work, + int* lwork, + float* rwork, + int* lrwork, + int* iwork, + int* liwork, + int* ifail, + int* iclustr, + float* gap, + int* info) +{ + // std::complex + pchegvx_(itype, + jobz, + range, + uplo, + n, + A, + ia, + ja, + 
desca, + B, + ib, + jb, + descb, + vl, + vu, + il, + iu, + abstol, + m, + nz, + w, + orfac, + Z, + iz, + jz, + descz, + work, + lwork, + rwork, + lrwork, + iwork, + liwork, + ifail, + iclustr, + gap, + info); +} + +void pxxxgvx_post_processing(const int info, + const std::vector& ifail, + const std::vector& iclustr, + const int M, + const int NZ, + const int nbands, + int& degeneracy_max) +{ + const std::string str_info = "Scalapack diagonalization: \n info = " + std::to_string(info) + ".\n"; + + if (info == 0) + { + return; + } + else if (info < 0) + { + const int info_negative = -info; + const std::string str_index = (info_negative > 100) + ? std::to_string(info_negative / 100) + "-th argument " + + std::to_string(info_negative % 100) + "-entry is illegal.\n" + : std::to_string(info_negative) + "-th argument is illegal.\n"; + throw std::runtime_error(str_info + str_index); + } + else if (info % 2) + { + std::string str_ifail = "ifail = "; + for (const int i: ifail) + { + str_ifail += std::to_string(i) + " "; + } + throw std::runtime_error(str_info + str_ifail); + } + else if (info / 2 % 2) + { + int degeneracy_need = 0; + for (int irank = 0; irank < iclustr.size() / 2; ++irank) + { + degeneracy_need = std::max(degeneracy_need, iclustr[2 * irank + 1] - iclustr[2 * irank]); + } + const std::string str_need = "degeneracy_need = " + std::to_string(degeneracy_need) + ".\n"; + const std::string str_saved = "degeneracy_saved = " + std::to_string(degeneracy_max) + ".\n"; + if (degeneracy_need <= degeneracy_max) + { + throw std::runtime_error(str_info + str_need + str_saved); + } + else + { + std::cout << str_need << str_saved; + degeneracy_max = degeneracy_need; + return; + } + } + else if (info / 4 % 2) + { + const std::string str_M = "M = " + std::to_string(M) + ".\n"; + const std::string str_NZ = "NZ = " + std::to_string(NZ) + ".\n"; + const std::string str_NBANDS = "Number of eigenvalues solved = " + std::to_string(nbands) + ".\n"; + throw std::runtime_error(str_info + str_M + str_NZ + str_NBANDS); + } + else if (info / 16 % 2) + { + const std::string str_npos = "Not positive definite = " + std::to_string(ifail[0]) + ".\n"; + throw std::runtime_error(str_info + str_npos); + } + else + { + throw std::runtime_error(str_info); + } +} + +void get_lwork(int& lwork, std::vector& work) +{ + lwork = work[0]; +} + +void get_lwork(int& lwork, std::vector& work) +{ + lwork = work[0]; +} + +void get_lwork(int& lwork, std::vector>& work) +{ + lwork = work[0].real(); +} + +void get_lwork(int& lwork, std::vector>& work) +{ + lwork = work[0].real(); +} + +template +void pxxxgvx_diag(const int* const desc, + const int ncol, + const int nrow, + const int nbands, + const T* const h_mat, + const T* const s_mat, + typename GetTypeReal::type* const ekb, + T* const wfc_2d) +{ + int nprow = 1; + int npcol = 1; + int myprow = 0; + int mypcol = 0; + Cblacs_gridinfo(desc[1], &nprow, &npcol, &myprow, &mypcol); + int dsize = nprow * npcol; + + int degeneracy_max = 12; // only used for complex and complex + while (true) + { + std::vector h_tmp(ncol * nrow, 0); + std::vector s_tmp(ncol * nrow, 0); + memcpy(h_tmp.data(), h_mat, sizeof(T) * ncol * nrow); + memcpy(s_tmp.data(), s_mat, sizeof(T) * ncol * nrow); + + int ndim_global = desc[2]; + const char jobz = 'V'; + const char range = 'I'; + const char uplo = 'U'; + const int itype = 1; + const int il = 1; + const int iu = nbands; + const int one = 1; + int M = 0; + int NZ = 0; + int lwork = -1; + int lrwork = -1; + int liwork = -1; + int info = 0; + const typename 
GetTypeReal::type abstol = 0; + const typename GetTypeReal::type orfac = -1; + const typename GetTypeReal::type vl = 0; + const typename GetTypeReal::type vu = 0; + std::vector work(1, 0); + std::vector::type> rwork(3, 0); // only used for complex and complex + std::vector iwork(1, 0); + std::vector ifail(ndim_global, 0); + std::vector iclustr(2 * dsize); + std::vector::type> gap(dsize); + + pxxxgvx_(&itype, + &jobz, + &range, + &uplo, + &ndim_global, + h_tmp.data(), + &one, + &one, + desc, + s_tmp.data(), + &one, + &one, + desc, + &vl, + &vu, + &il, + &iu, + &abstol, + &M, + &NZ, + ekb, + &orfac, + wfc_2d, + &one, + &one, + desc, + work.data(), + &lwork, // is not used for real data type + rwork.data(), // is not used for real data type + &lrwork, + iwork.data(), + &liwork, + ifail.data(), + iclustr.data(), + gap.data(), + &info); + + if (info) + { + throw std::runtime_error("Scalapack diagonalization: \n info = " + std::to_string(info) + ".\n"); + } + + if (std::is_same>::value || std::is_same>::value) + { + get_lwork(lwork, work); + work.resize(lwork, 0); + liwork = iwork[0]; + iwork.resize(liwork, 0); + lrwork = rwork[0] + degeneracy_max * ndim_global; + int maxlrwork = std::max(lrwork, 3); + rwork.resize(maxlrwork, 0); + } + else + { + get_lwork(lwork, work); + work.resize(std::max(lwork, 3), 0); + liwork = iwork[0]; + iwork.resize(liwork, 0); + } + + pxxxgvx_(&itype, + &jobz, + &range, + &uplo, + &ndim_global, + h_tmp.data(), + &one, + &one, + desc, + s_tmp.data(), + &one, + &one, + desc, + &vl, + &vu, + &il, + &iu, + &abstol, + &M, + &NZ, + ekb, + &orfac, + wfc_2d, + &one, + &one, + desc, + work.data(), + &lwork, + rwork.data(), // is not used for real data type + &lrwork, // is not used for real data type + iwork.data(), + &liwork, + ifail.data(), + iclustr.data(), + gap.data(), + &info); + + if (info == 0) + { + return; + } + pxxxgvx_post_processing(info, ifail, iclustr, M, NZ, nbands, degeneracy_max); + + // break the loop for real data type + if (std::is_same::value || std::is_same::value) + { + return; + } + } +} + +// template instantiation +// double +template void pxxxgvx_diag(const int* const desc, + const int ncol, + const int nrow, + const int nbands, + const double* const h_mat, + const double* const s_mat, + double* const ekb, + double* const wfc_2d); + +// std::complex +template void pxxxgvx_diag(const int* const desc, + const int ncol, + const int nrow, + const int nbands, + const std::complex* const h_mat, + const std::complex* const s_mat, + double* const ekb, + std::complex* const wfc_2d); + +// float +template void pxxxgvx_diag(const int* const desc, + const int ncol, + const int nrow, + const int nbands, + const float* const h_mat, + const float* const s_mat, + float* const ekb, + float* const wfc_2d); + +// std::complex +template void pxxxgvx_diag(const int* const desc, + const int ncol, + const int nrow, + const int nbands, + const std::complex* const h_mat, + const std::complex* const s_mat, + float* const ekb, + std::complex* const wfc_2d); + +#endif + +} // namespace hsolver \ No newline at end of file diff --git a/source/module_hsolver/diago_pxxxgvx.h b/source/module_hsolver/diago_pxxxgvx.h new file mode 100644 index 0000000000..e4f9ba7896 --- /dev/null +++ b/source/module_hsolver/diago_pxxxgvx.h @@ -0,0 +1,37 @@ +#ifndef HSOLVER_DIAGO_PXXXGVX_H +#define HSOLVER_DIAGO_PXXXGVX_H +#include +#include "module_base/macros.h" + +namespace hsolver +{ + +#ifdef __MPI +/** + * @brief Wrapper function for Scalapack's generalized eigensolver routines: pdsygvx_, 
pzhegvx_, pssygvx_, pchegvx_. + * + * @param desc the descriptor of scalapack descriptor + * @param ncol the number of columns of the H/S matrix in current processor + * @param nrow the number of rows of the H/S matrix in current processor + * @param nbands the number of bands to be solved + * @param h_mat the Hamiltonian matrix + * @param s_mat the overlap matrix + * @param ekb the eigenvalues + * @param wfc_2d the eigenvectors in 2D block cyclic distribution + * + */ + +template +void pxxxgvx_diag(const int* const desc, + const int ncol, + const int nrow, + const int nbands, + const T* const h_mat, + const T* const s_mat, + typename GetTypeReal::type* const ekb, + T* const wfc_2d); +#endif + +} // namespace hsolver + +#endif // HSOLVER_DIAGO_PXXXGVX_H \ No newline at end of file diff --git a/source/module_hsolver/hsolver_pw.cpp b/source/module_hsolver/hsolver_pw.cpp index dbfca81061..8cc2b754dc 100644 --- a/source/module_hsolver/hsolver_pw.cpp +++ b/source/module_hsolver/hsolver_pw.cpp @@ -529,7 +529,9 @@ void HSolverPW::hamiltSolvePsiK(hamilt::Hamilt* hm, this->diag_thr, this->diag_iter_max, this->need_subspace, - comm_info); + comm_info, + PARAM.inp.diag_subspace, + PARAM.inp.nb2d); DiagoIterAssist::avg_iter += static_cast( dav_subspace.diag(hpsi_func, psi.get_pointer(), psi.get_nbasis(), eigenvalue, this->ethr_band, scf)); diff --git a/source/module_hsolver/test/CMakeLists.txt b/source/module_hsolver/test/CMakeLists.txt index 0d938acc15..fdb447a09d 100644 --- a/source/module_hsolver/test/CMakeLists.txt +++ b/source/module_hsolver/test/CMakeLists.txt @@ -154,6 +154,12 @@ install(FILES parallel_k2d_test.sh DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) +AddTest( + TARGET hsolver_diago_hs_parallel + LIBS parameter ${math_libs} ELPA::ELPA base device MPI::MPI_CXX genelpa psi + SOURCES test_diago_hs_para.cpp ../diag_hs_para.cpp ../diago_pxxxgvx.cpp ../diago_elpa.cpp ../diago_scalapack.cpp +) + find_program(BASH bash) if (ENABLE_MPI) add_test(NAME HSolver_cg_parallel @@ -173,11 +179,11 @@ if (ENABLE_MPI) COMMAND ${BASH} diago_lcao_parallel_test.sh WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} ) - if(ENABLE_PEXSI) - add_test(NAME HSolver_LCAO_PEXSI_parallel - COMMAND ${BASH} diago_pexsi_parallel_test.sh - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} - ) -endif() -endif() + if(ENABLE_PEXSI) + add_test(NAME HSolver_LCAO_PEXSI_parallel + COMMAND ${BASH} diago_pexsi_parallel_test.sh + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + ) + endif() + endif() endif() \ No newline at end of file diff --git a/source/module_hsolver/test/test_diago_hs_para.cpp b/source/module_hsolver/test/test_diago_hs_para.cpp new file mode 100644 index 0000000000..51efda5e05 --- /dev/null +++ b/source/module_hsolver/test/test_diago_hs_para.cpp @@ -0,0 +1,353 @@ +#ifndef TEST_DIAGO_PXXXGVX_H +#define TEST_DIAGO_PXXXGVX_H + +#include +#include +#include +#include +#include +#include +#include + +#include "../diag_hs_para.h" +#include "module_hsolver/kernels/dngvd_op.h" + +template +typename std::enable_if::value || std::is_same::value>::type +generate_random_hs_impl(int d, std::mt19937& gen, std::uniform_real_distribution::type>& dis, std::vector& h_mat, std::vector& s_mat) { + // For S matrix, firstly we generate a random symmetric matrix s_tmp, then we set S = s_tmp * s_tmp^T + n * I + std::vector s_tmp(d*d,0); + for (int i = 0; i < d; ++i) { + for (int j = i; j < d; ++j) { + typename GetTypeReal::type value1 = static_cast::type>(dis(gen)); + h_mat[i * d + j] = value1; + h_mat[j * d + i] = value1; + + // construct a random 
overlap matrix + typename GetTypeReal::type value2 = static_cast::type>(dis(gen)); + s_tmp[i * d + j] = value2; + } + } + + // set S = s_tmp * s_tmp^T + n * I + for (int i = 0; i < d; ++i) { + for (int j = 0; j < d; ++j) { + s_mat[i * d + j] = 0; + for (int k = 0; k < d; ++k) { + s_mat[i * d + j] += s_tmp[i * d + k] * s_tmp[j * d + k]; + } + if (i == j) { + s_mat[i * d + j] += 2.0; + } + } + } +} + +template +typename std::enable_if>::value || std::is_same>::value>::type +generate_random_hs_impl(int d, std::mt19937& gen, std::uniform_real_distribution::type>& dis, std::vector& h_mat, std::vector& s_mat) { + std::vector s_tmp(d*d,0); + for (int i = 0; i < d; ++i) { + for (int j = i; j < d; ++j) { + typename GetTypeReal::type value1 = static_cast::type>(dis(gen)); + typename GetTypeReal::type value2 = static_cast::type>(dis(gen)); + h_mat[i * d + j] = T(value1, value2); + if (i != j) + { + h_mat[j * d + i] = T(value1, -value2); + } + else{ + h_mat[j * d + i] = T(value1, 0); + } + + // construct a random overlap matrix + value1 = static_cast::type>(dis(gen)); + value2 = static_cast::type>(dis(gen)); + s_tmp[i * d + j] = T(value1, value2); + } + } + + // set S = s_tmp * s_tmp^T + n * I + for (int i = 0; i < d; ++i) { + for (int j = 0; j < d; ++j) { + s_mat[i * d + j] = T(0, 0); + for (int k = 0; k < d; ++k) { + s_mat[i * d + j] += s_tmp[i * d + k] * std::conj(s_tmp[j * d + k]); + } + if (i == j) { + s_mat[i * d + j] += T(2.0, 0); + } + } + } +} + +template +void generate_random_hs(int d, int random_seed ,std::vector& h_mat, std::vector& s_mat) { + std::mt19937 gen(random_seed); + std::uniform_real_distribution::type> dis(-1.0,1.0); + + h_mat.resize(d * d); + s_mat.resize(d * d); + generate_random_hs_impl(d, gen, dis, h_mat, s_mat); +} + + +template +typename std::enable_if::value || std::is_same::value>::type +verify_results(const std::vector& h_psi, const std::vector& s_psi, const std::vector::type>& ekb, int lda, int nbands, double threshold) { + for (int i = 0; i < lda; ++i) { + for (int j = 0; j < nbands; ++j) { + ASSERT_NEAR(h_psi[j * lda + i], ekb[j] * s_psi[j * lda + i], threshold); + } + } +} + +template +typename std::enable_if>::value || std::is_same>::value>::type +verify_results(const std::vector& h_psi, const std::vector& s_psi, const std::vector::type>& ekb, int lda, int nbands, double threshold) { + for (int i = 0; i < lda; ++i) { + for (int j = 0; j < nbands; ++j) { + ASSERT_NEAR(h_psi[j * lda + i].real(), ekb[j] * s_psi[j * lda + i].real(), threshold); + ASSERT_NEAR(h_psi[j * lda + i].imag(), ekb[j] * s_psi[j * lda + i].imag(), threshold); + } + } +} + +template +void test_diago_hs(int lda, int nb, int random_seed, int nbands, int diag_type, MPI_Comm comm) { + // diag_type should be 1 (for elpa) or 2 (for scalapack) + int my_rank; + MPI_Comm_rank(comm, &my_rank); + + std::vector h_mat, s_mat, wfc, h_psi, s_psi; + std::vector::type> ekb(lda); + if (my_rank==0) + { + h_mat.resize(lda * lda); + s_mat.resize(lda * lda); + wfc.resize(lda * lda); + generate_random_hs(lda, random_seed, h_mat, s_mat); + } + hsolver::diago_hs_para(h_mat.data(), s_mat.data(), lda, nbands,ekb.data(), wfc.data(), comm, diag_type, nb); + + // Verify results + if (my_rank == 0){ + double threshold = 1e-6; + if (std::is_same>::value || std::is_same::value) { + threshold = 1e-12; + } + + h_psi.resize(lda * nbands, 0); + s_psi.resize(lda * nbands, 0); + + for (int i = 0; i < lda; ++i) { + for (int j = 0; j < nbands; ++j) { + for (int k = 0; k < lda; ++k) { + h_psi[j * lda + i] += h_mat[k * lda + i] * wfc[j * 
lda + k]; + s_psi[j * lda + i] += s_mat[k * lda + i] * wfc[j * lda + k]; + } + } + } + verify_results(h_psi, s_psi, ekb, lda, nbands, threshold); + } +} + +template +void test_performance(int lda, int nb, int nbands, MPI_Comm comm,int case_numb, int loop_numb) { + // generate 10 random H/S, and do the diagonalization 100 times by using elpa/scalapack and lapack. + int my_rank, nproc; + MPI_Comm_rank(comm, &my_rank); + MPI_Comm_size(comm, &nproc); + + std::vector h_mat, s_mat, wfc, h_psi, s_psi; + std::vector::type> ekb_elpa(lda); + std::vector::type> ekb_scalap(lda); + std::vector::type> ekb_lapack(lda); + + if (my_rank==0) + { + std::cout << "\nMatrix size: " << lda << " x " << lda << std::endl; + std::cout << "Number of bands: " << nbands << std::endl; + std::cout << "Number of processors: " << nproc << std::endl; + std::cout << "Block size of 2D distribution: " << nb << std::endl; + h_mat.resize(lda * lda); + s_mat.resize(lda * lda); + wfc.resize(lda * lda); + } + + // store all the times in a vector + std::vector time_elpa(case_numb, 0); + std::vector time_scalap(case_numb, 0); + std::vector time_lapack(case_numb, 0); + + if (my_rank == 0) { std::cout << "Random matrix "; +} + for (int randomi = 0; randomi < case_numb; ++randomi) + { + + if (my_rank == 0) { + std::cout << randomi << " "; + generate_random_hs(lda, randomi, h_mat, s_mat); + } + + // ELPA + MPI_Barrier(comm); + auto start = std::chrono::high_resolution_clock::now(); + for (int j=0;j(h_mat.data(), s_mat.data(), lda, nbands,ekb_elpa.data(), wfc.data(), comm, 1, nb); + MPI_Barrier(comm); + } + MPI_Barrier(comm); + auto end = std::chrono::high_resolution_clock::now(); + time_elpa[randomi] = std::chrono::duration_cast(end - start).count(); + + + // scalapack + start = std::chrono::high_resolution_clock::now(); + for (int j=0;j(h_mat.data(), s_mat.data(), lda, nbands,ekb_scalap.data(), wfc.data(), comm, 2, nb); + MPI_Barrier(comm); + } + MPI_Barrier(comm); + end = std::chrono::high_resolution_clock::now(); + time_scalap[randomi] = std::chrono::duration_cast(end - start).count(); + + //LApack + if (my_rank == 0) + { + std::vector h_tmp, s_tmp; + start = std::chrono::high_resolution_clock::now(); + base_device::DEVICE_CPU* ctx = {}; + + for (int j=0;j()(ctx, + lda, + lda, + h_tmp.data(), + s_tmp.data(), + nbands, + ekb_lapack.data(), + wfc.data()); + } + end = std::chrono::high_resolution_clock::now(); + time_lapack[randomi] = std::chrono::duration_cast(end - start).count(); + + //COMPARE EKB + for (int i = 0; i < nbands; ++i) { + typename GetTypeReal::type diff_elpa_lapack = std::abs(ekb_elpa[i] - ekb_lapack[i]); + typename GetTypeReal::type diff_scalap_lapack = std::abs(ekb_scalap[i] - ekb_lapack[i]); + if (diff_elpa_lapack > 1e-6 || diff_scalap_lapack > 1e-6) + { + std::cout << "eigenvalue " << i << " by ELPA: " << ekb_elpa[i] << std::endl; + std::cout << "eigenvalue " << i << " by Scalapack: " << ekb_scalap[i] << std::endl; + std::cout << "eigenvalue " << i << " by Lapack: " << ekb_lapack[i] << std::endl; + } + } + } + MPI_Barrier(comm); + + } + + if (my_rank == 0) + { + std::cout << "\nELPA Time : "; + for (int i=0; i < case_numb;i++) + {std::cout << time_elpa[i] << " ";} + std::cout << std::endl; + + std::cout << "scalapack Time: "; + for (int i=0; i < case_numb;i++) + {std::cout << time_scalap[i] << " ";} + std::cout << std::endl; + + std::cout << "lapack Time : "; + for (int i=0; i < case_numb;i++) + {std::cout << time_lapack[i] << " ";} + std::cout << std::endl; + + // print out the average time and speedup + double 
avg_time_elpa = 0; + double avg_time_scalap = 0; + double avg_time_lapack = 0; + for (int i=0; i < case_numb;i++) + { + avg_time_elpa += time_elpa[i]; + avg_time_scalap += time_scalap[i]; + avg_time_lapack += time_lapack[i]; + } + + avg_time_elpa /= case_numb; + avg_time_scalap /= case_numb; + avg_time_lapack /= case_numb; + std::cout << "Average Lapack Time : " << avg_time_lapack << " ms" << std::endl; + std::cout << "Average ELPA Time : " << avg_time_elpa << " ms, Speedup: " << avg_time_lapack / avg_time_elpa << std::endl; + std::cout << "Average Scalapack Time: " << avg_time_scalap << " ms, Speedup: " << avg_time_lapack / avg_time_scalap << std::endl; + } +} + +//test_diago_hs(int lda, int nb, int random_seed, int nbands, int diag_type, MPI_Comm comm) +TEST(DiagoPxxxgvxElpaTest, Double) { + test_diago_hs(16, 4, 0, 10, 1, MPI_COMM_WORLD); + test_diago_hs(20, 6, 0, 18, 1, MPI_COMM_WORLD); +} + +TEST(DiagoPxxxgvxElpaTest, ComplexDouble) { + test_diago_hs>(16, 6, 0, 10, 1, MPI_COMM_WORLD); + test_diago_hs>(20, 8, 0, 18, 1, MPI_COMM_WORLD); +} + +TEST(DiagoPxxxgvxScalapackTest, Double) { + test_diago_hs(16, 4, 0, 10, 2,MPI_COMM_WORLD); + test_diago_hs(20, 6, 0, 18, 2,MPI_COMM_WORLD); +} + +TEST(DiagoPxxxgvxScalapackTest, ComplexDouble) { + test_diago_hs>(16, 4, 0, 10, 2, MPI_COMM_WORLD); +} +TEST(DiagoPxxxgvxScalapackTest, Float) { + test_diago_hs(16, 4, 0, 10,2,MPI_COMM_WORLD); +} + +TEST(DiagoPxxxgvxScalapackTest, ComplexFloat) { + test_diago_hs>(16, 4, 0, 10,2,MPI_COMM_WORLD); +} + +//TEST(DiagoPxxxgvxPerformanceTest, Double) { +// int ndim = 200; +// int nband = 180; +// int case_numb = 10; +// int loop_numb = 10; +// test_performance>(ndim, 32, nband, MPI_COMM_WORLD, case_numb, loop_numb); +//} + +int main(int argc, char** argv) { + MPI_Init(&argc, &argv); + int myrank; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + + testing::InitGoogleTest(&argc, argv); + ::testing::TestEventListeners& listeners = ::testing::UnitTest::GetInstance()->listeners(); + + if (myrank != 0) { + delete listeners.Release(listeners.default_result_printer()); + } + + int result = RUN_ALL_TESTS(); + if (myrank == 0 && result != 0) + { + std::cout << "ERROR:some tests are not passed" << std::endl; + MPI_Finalize(); + return result; + } + + MPI_Finalize(); + + return 0; +} + +#endif // TEST_DIAGO_PXXXGVX_H \ No newline at end of file diff --git a/source/module_hsolver/test/test_hsolver_pw.cpp b/source/module_hsolver/test/test_hsolver_pw.cpp index 5763f1b7d8..530491a669 100644 --- a/source/module_hsolver/test/test_hsolver_pw.cpp +++ b/source/module_hsolver/test/test_hsolver_pw.cpp @@ -32,6 +32,62 @@ * - 8. 
solve() * - lcao_in_pw specific implementation */ + +// mock diago_hs_para +namespace hsolver { +template +void diago_hs_para(T* h, + T* s, + const int lda, + const int nband, + typename GetTypeReal::type* const ekb, + T* const wfc, + const MPI_Comm& comm, + const int diag_subspace, + const int block_size = 0) +{} +template void diago_hs_para(double* h, + double* s, + const int lda, + const int nband, + typename GetTypeReal::type* const ekb, + double* const wfc, + const MPI_Comm& comm, + const int diag_subspace, + const int block_size); + +template void diago_hs_para>(std::complex* h, + std::complex* s, + const int lda, + const int nband, + typename GetTypeReal>::type* const ekb, + std::complex* const wfc, + const MPI_Comm& comm, + const int diag_subspace, + const int block_size); + +template void diago_hs_para(float* h, + float* s, + const int lda, + const int nband, + typename GetTypeReal::type* const ekb, + float* const wfc, + const MPI_Comm& comm, + const int diag_subspace, + const int block_size); + +template void diago_hs_para>(std::complex* h, + std::complex* s, + const int lda, + const int nband, + typename GetTypeReal>::type* const ekb, + std::complex* const wfc, + const MPI_Comm& comm, + const int diag_subspace, + const int block_size); + +} + class TestHSolverPW : public ::testing::Test { public: ModulePW::PW_Basis_K pwbk; diff --git a/source/module_hsolver/test/test_hsolver_sdft.cpp b/source/module_hsolver/test/test_hsolver_sdft.cpp index ae1e8d76fd..76f24af4ff 100644 --- a/source/module_hsolver/test/test_hsolver_sdft.cpp +++ b/source/module_hsolver/test/test_hsolver_sdft.cpp @@ -23,6 +23,59 @@ Sto_Func::Sto_Func() } template class Sto_Func; +// mock diago_hs_para +namespace hsolver { +template +void diago_hs_para(T* h, + T* s, + const int lda, + const int nband, + typename GetTypeReal::type* const ekb, + T* const wfc, + const MPI_Comm& comm, + const int diag_subspace, + const int block_size = 0) +{} +template void diago_hs_para(double* h, + double* s, + const int lda, + const int nband, + typename GetTypeReal::type* const ekb, + double* const wfc, + const MPI_Comm& comm, + const int diag_subspace, + const int block_size); +template void diago_hs_para>(std::complex* h, + std::complex* s, + const int lda, + const int nband, + typename GetTypeReal>::type* const ekb, + std::complex* const wfc, + const MPI_Comm& comm, + const int diag_subspace, + const int block_size); +template void diago_hs_para(float* h, + float* s, + const int lda, + const int nband, + typename GetTypeReal::type* const ekb, + float* const wfc, + const MPI_Comm& comm, + const int diag_subspace, + const int block_size); +template void diago_hs_para>(std::complex* h, + std::complex* s, + const int lda, + const int nband, + typename GetTypeReal>::type* const ekb, + std::complex* const wfc, + const MPI_Comm& comm, + const int diag_subspace, + const int block_size); + +} + + template<> void elecstate::ElecStatePW, base_device::DEVICE_CPU>::init_rho_data() { diff --git a/source/module_io/read_input_item_system.cpp b/source/module_io/read_input_item_system.cpp index 58839c7b09..ca80e04412 100644 --- a/source/module_io/read_input_item_system.cpp +++ b/source/module_io/read_input_item_system.cpp @@ -477,6 +477,12 @@ void ReadInput::item_system() read_sync_int(input.fft_mode); this->add_item(item); } + { + Input_Item item("diag_subspace"); + item.annotation = "method of subspace diagonalization in dav_subspace. 
0:LaPack; 1:genelpa, 2:scalapack"; + read_sync_int(input.diag_subspace); + this->add_item(item); + } { Input_Item item("init_wfc"); item.annotation = "start wave functions are from 'atomic', " diff --git a/source/module_lr/hsolver_lrtd.hpp b/source/module_lr/hsolver_lrtd.hpp index 82e11b6a9e..d1c926ec3a 100644 --- a/source/module_lr/hsolver_lrtd.hpp +++ b/source/module_lr/hsolver_lrtd.hpp @@ -101,7 +101,9 @@ namespace LR diag_ethr, maxiter, false, //always do the subspace diag (check the implementation) - comm_info); + comm_info, + PARAM.inp.diag_subspace, + PARAM.inp.nb2d); std::vector ethr_band(nband, diag_ethr); hsolver::DiagoIterAssist::avg_iter += static_cast(dav_subspace.diag( diff --git a/source/module_parameter/input_parameter.h b/source/module_parameter/input_parameter.h index 7d50b3db1e..7b160a2e0b 100644 --- a/source/module_parameter/input_parameter.h +++ b/source/module_parameter/input_parameter.h @@ -89,6 +89,7 @@ struct Input_para bool diago_smooth_ethr = false; ///< smooth ethr for iter methods int pw_diag_ndim = 4; ///< dimension of workspace for Davidson diagonalization int diago_cg_prec = 1; ///< mohan add 2012-03-31 + int diag_subspace = 0; // 0: Lapack, 1: elpa, 2: scalapack std::string smearing_method = "gauss"; ///< "gauss", ///< "mp","methfessel-paxton" diff --git a/tests/integrate/102_PW_DS_davsubspace_sca/INPUT b/tests/integrate/102_PW_DS_davsubspace_sca/INPUT new file mode 100644 index 0000000000..b060c92bdd --- /dev/null +++ b/tests/integrate/102_PW_DS_davsubspace_sca/INPUT @@ -0,0 +1,29 @@ +INPUT_PARAMETERS +#Parameters (1.General) +suffix autotest +calculation scf + +nbands 6 +symmetry 1 +pseudo_dir ../../PP_ORB + +#Parameters (2.Iteration) +ecutwfc 20 +scf_thr 1e-8 +scf_nmax 100 + + +#Parameters (3.Basis) +basis_type pw + +#Parameters (4.Smearing) +smearing_method gauss +smearing_sigma 0.002 + +#Parameters (5.Mixing) +mixing_type plain +mixing_beta 0.5 + +ks_solver dav_subspace +diag_subspace 2 +nb2d 4 diff --git a/tests/integrate/102_PW_DS_davsubspace_sca/KPT b/tests/integrate/102_PW_DS_davsubspace_sca/KPT new file mode 100644 index 0000000000..c289c0158a --- /dev/null +++ b/tests/integrate/102_PW_DS_davsubspace_sca/KPT @@ -0,0 +1,4 @@ +K_POINTS +0 +Gamma +1 1 1 0 0 0 diff --git a/tests/integrate/102_PW_DS_davsubspace_sca/STRU b/tests/integrate/102_PW_DS_davsubspace_sca/STRU new file mode 100644 index 0000000000..cdb8157cce --- /dev/null +++ b/tests/integrate/102_PW_DS_davsubspace_sca/STRU @@ -0,0 +1,19 @@ +ATOMIC_SPECIES +Si 14 Si.pz-vbc.UPF + +LATTICE_CONSTANT +10.2 // add lattice constant + +LATTICE_VECTORS +0.5 0.5 0.0 +0.5 0.0 0.5 +0.0 0.5 0.5 + +ATOMIC_POSITIONS +Direct + +Si // Element type +0.0 // magnetism +2 +0.00 0.00 0.00 1 1 1 +0.25 0.25 0.25 1 1 1 diff --git a/tests/integrate/102_PW_DS_davsubspace_sca/jd b/tests/integrate/102_PW_DS_davsubspace_sca/jd new file mode 100644 index 0000000000..70376ea55e --- /dev/null +++ b/tests/integrate/102_PW_DS_davsubspace_sca/jd @@ -0,0 +1 @@ +test subspace diagonalization with scalapack \ No newline at end of file diff --git a/tests/integrate/102_PW_DS_davsubspace_sca/result.ref b/tests/integrate/102_PW_DS_davsubspace_sca/result.ref new file mode 100644 index 0000000000..46352c76f5 --- /dev/null +++ b/tests/integrate/102_PW_DS_davsubspace_sca/result.ref @@ -0,0 +1,6 @@ +etotref -198.2238296207179 +etotperatomref -99.1119148104 +pointgroupref T_d +spacegroupref O_h +nksibzref 1 +totaltimeref diff --git a/tests/integrate/CASES_CPU.txt b/tests/integrate/CASES_CPU.txt index 6e6093523a..935bcf2089 100644 --- 
a/tests/integrate/CASES_CPU.txt
+++ b/tests/integrate/CASES_CPU.txt
@@ -26,6 +26,7 @@
 102_PW_BPCG
 102_PW_CG
 102_PW_DS_davsubspace
+102_PW_DS_davsubspace_sca
 102_PW_PINT_RKS
 102_PW_PINT_UKS
 103_PW_15_CF_CS_S1_smallg
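
A minimal usage sketch of the new hsolver::diago_hs_para interface introduced by this patch, mirroring the call pattern of test_diago_hs_para.cpp above. The 4x4 toy matrices, the choice of ScaLAPACK (diag_subspace = 2), and the automatic block size are illustrative assumptions only; an MPI-enabled build with __MPI defined is assumed, and only rank 0 needs to hold the full H, S, and wavefunction arrays.

// sketch_diago_hs_para.cpp -- illustrative only, not part of the patch
#include <mpi.h>
#include <vector>
#include "module_hsolver/diag_hs_para.h"

int main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);
    int rank = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    const int lda = 4;    // dimension of the full H/S matrices (toy value)
    const int nbands = 2; // number of eigenpairs requested
    std::vector<double> h, s, wfc;
    std::vector<double> ekb(lda, 0.0); // eigenvalues, allocated on every rank as in the unit test

    if (rank == 0)
    {
        // Full H and S (and the output wavefunctions) are only required on rank 0.
        h = {2, 0, 0, 0,  0, 3, 0, 0,  0, 0, 4, 0,  0, 0, 0, 5};
        s = {1, 0, 0, 0,  0, 1, 0, 0,  0, 0, 1, 0,  0, 0, 0, 1};
        wfc.resize(lda * lda, 0.0);
    }

    // diag_subspace = 2 selects ScaLAPACK (1 would select GenELPA);
    // block_size = 0 lets the routine pick the 2D block size automatically.
    hsolver::diago_hs_para<double>(h.data(), s.data(), lda, nbands,
                                   ekb.data(), wfc.data(), MPI_COMM_WORLD,
                                   /*diag_subspace=*/2, /*block_size=*/0);

    // On return, ekb holds the lowest nbands eigenvalues and wfc the
    // corresponding eigenvectors, gathered back on rank 0.
    MPI_Finalize();
    return 0;
}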