From 200a9cf5a17bb08df8458e3716f19e1c47d5151a Mon Sep 17 00:00:00 2001
From: root <pxlxingliang>
Date: Thu, 21 Nov 2024 10:33:44 +0800
Subject: [PATCH 01/12] feature: parallel solve subspace diagonalization in
 dav_subspace

---
 docs/advanced/input_files/input-main.md       |  14 +-
 .../src/hsolver/py_diago_dav_subspace.hpp     |   4 +-
 source/module_base/blacs_connector.h          |   2 +-
 source/module_base/scalapack_connector.h      |  11 +
 source/module_hsolver/CMakeLists.txt          |   3 +
 source/module_hsolver/diag_hs_para.cpp        | 201 ++++++++++
 source/module_hsolver/diag_hs_para.h          |  47 +++
 source/module_hsolver/diago_dav_subspace.cpp  | 126 ++++--
 source/module_hsolver/diago_dav_subspace.h    |   7 +-
 source/module_hsolver/diago_pxxxgvx.cpp       | 344 +++++++++++++++++
 source/module_hsolver/diago_pxxxgvx.h         |  37 ++
 source/module_hsolver/hsolver_pw.cpp          |   4 +-
 source/module_hsolver/test/CMakeLists.txt     |  20 +-
 .../test/test_diago_hs_para.cpp               | 359 ++++++++++++++++++
 source/module_io/read_input_item_system.cpp   |   6 +
 source/module_lr/hsolver_lrtd.hpp             |   4 +-
 source/module_parameter/input_parameter.h     |   1 +
 .../integrate/102_PW_DS_davsubspace_sca/INPUT |  29 ++
 tests/integrate/102_PW_DS_davsubspace_sca/KPT |   4 +
 .../integrate/102_PW_DS_davsubspace_sca/STRU  |  19 +
 tests/integrate/102_PW_DS_davsubspace_sca/jd  |   1 +
 .../102_PW_DS_davsubspace_sca/result.ref      |   6 +
 tests/integrate/CASES_CPU.txt                 |   1 +
 23 files changed, 1202 insertions(+), 48 deletions(-)
 create mode 100644 source/module_hsolver/diag_hs_para.cpp
 create mode 100644 source/module_hsolver/diag_hs_para.h
 create mode 100644 source/module_hsolver/diago_pxxxgvx.cpp
 create mode 100644 source/module_hsolver/diago_pxxxgvx.h
 create mode 100644 source/module_hsolver/test/test_diago_hs_para.cpp
 create mode 100644 tests/integrate/102_PW_DS_davsubspace_sca/INPUT
 create mode 100644 tests/integrate/102_PW_DS_davsubspace_sca/KPT
 create mode 100644 tests/integrate/102_PW_DS_davsubspace_sca/STRU
 create mode 100644 tests/integrate/102_PW_DS_davsubspace_sca/jd
 create mode 100644 tests/integrate/102_PW_DS_davsubspace_sca/result.ref
diff --git a/docs/advanced/input_files/input-main.md b/docs/advanced/input_files/input-main.md
index d8491bbf71..789e7428b5 100644
--- a/docs/advanced/input_files/input-main.md
+++ b/docs/advanced/input_files/input-main.md
@@ -39,6 +39,7 @@
     - [pw\_diag\_thr](#pw_diag_thr)
     - [pw\_diag\_nmax](#pw_diag_nmax)
     - [pw\_diag\_ndim](#pw_diag_ndim)
+    - [diag\_subspace\_method](#diag_subspace_method)
     - [erf\_ecut](#erf_ecut)
     - [fft\_mode](#fft_mode)
     - [erf\_height](#erf_height)
@@ -783,7 +784,18 @@ These variables are used to control the plane wave related parameters.
 
 - **Type**: Integer
 - **Description**: Only useful when you use `ks_solver = dav` or `ks_solver = dav_subspace`. It indicates dimension of workspace(number of wavefunction packets, at least 2 needed) for the Davidson method. A larger value may yield a smaller number of iterations in the algorithm but uses more memory and more CPU time in subspace diagonalization.
-- **Default**: 4
+- **Default**: 4 
+
+### diag_subspace_method
+
+- **Type**: Integer
+- **Description**: The method to diagonalize subspace in dav_subspace method. The available options are:
+  - 0: by LAPACK
+  - 1: by GenELPA
+  - 2: by ScaLAPACK
+  LAPACK only solve in one core, GenELPA and ScaLAPACK can solve in parallel. If the system is small (such as the band number is less than 100), LAPACK is recommended. If the system is large and MPI parallel is used, then GenELPA or ScaLAPACK is recommended, and GenELPA usually has better performance. For GenELPA and ScaLAPACK, the block size can be set by [nb2d](#nb2d).
+
+- **Default**: 0
 
 ### erf_ecut
 
diff --git a/python/pyabacus/src/hsolver/py_diago_dav_subspace.hpp b/python/pyabacus/src/hsolver/py_diago_dav_subspace.hpp
index 8ef539902e..0eff2467fe 100644
--- a/python/pyabacus/src/hsolver/py_diago_dav_subspace.hpp
+++ b/python/pyabacus/src/hsolver/py_diago_dav_subspace.hpp
@@ -138,7 +138,9 @@ class PyDiagoDavSubspace
             tol, 
             max_iter, 
             need_subspace, 
-            comm_info
+            comm_info,
+            PARAM.inp.diag_subspace_method,
+            PARAM.inp.nb2d
         );
 
         return obj->diag(hpsi_func, psi, nbasis, eigenvalue, diag_ethr.data(), scf_type);
diff --git a/source/module_base/blacs_connector.h b/source/module_base/blacs_connector.h
index 3bcc43811a..61c67324e8 100644
--- a/source/module_base/blacs_connector.h
+++ b/source/module_base/blacs_connector.h
@@ -39,7 +39,7 @@ extern "C"
 		// Informational and Miscellaneous
 	void Cblacs_gridinfo(int icontxt, int* nprow, int *npcol, int *myprow, int *mypcol);
     void Cblacs_gridinit(int* icontxt, char* layout, int nprow, int npcol);
-    void Cblacs_gridexit(int* icontxt);
+    void Cblacs_gridexit(int icontxt);
     int Cblacs_pnum(int icontxt, int prow, int pcol);
     void Cblacs_pcoord(int icontxt, int pnum, int *prow, int *pcol);
 	void Cblacs_exit(int icontxt);
diff --git a/source/module_base/scalapack_connector.h b/source/module_base/scalapack_connector.h
index 8e0798b0d8..210cef58a1 100644
--- a/source/module_base/scalapack_connector.h
+++ b/source/module_base/scalapack_connector.h
@@ -85,6 +85,17 @@ extern "C"
 		const double* vl, const double* vu, const int* il, const int* iu,
 		const double* abstol, int* m, int* nz, double* w, const double*orfac, std::complex<double>* Z, const int* iz, const int* jz, const int*descz,
 		std::complex<double>* work, int* lwork, double* rwork, int* lrwork, int*iwork, int*liwork, int* ifail, int*iclustr, double*gap, int* info);
+	void pssygvx_(const int* itype, const char* jobz, const char* range, const char* uplo,
+		const int* n, float* A, const int* ia, const int* ja, const int*desca, float* B, const int* ib, const int* jb, const int*descb,
+		const float* vl, const float* vu, const int* il, const int* iu,
+		const float* abstol, int* m, int* nz, float* w, const float*orfac, float* Z, const int* iz, const int* jz, const int*descz,
+		float* work, int* lwork, int*iwork, int*liwork, int* ifail, int*iclustr, float*gap, int* info);
+	void pchegvx_(const int* itype, const char* jobz, const char* range, const char* uplo,
+		const int* n, std::complex<float>* A, const int* ia, const int* ja, const int*desca, std::complex<float>* B, const int* ib, const int* jb, const int*descb,
+		const float* vl, const float* vu, const int* il, const int* iu,
+		const float* abstol, int* m, int* nz, float* w, const float*orfac, std::complex<float>* Z, const int* iz, const int* jz, const int*descz,
+		std::complex<float>* work, int* lwork, float* rwork, int* lrwork, int*iwork, int*liwork, int* ifail, int*iclustr, float*gap, int* info);
+
 
 	void pzgetri_(
 		const int *n, 
diff --git a/source/module_hsolver/CMakeLists.txt b/source/module_hsolver/CMakeLists.txt
index 8e1ff69905..93a708f21d 100644
--- a/source/module_hsolver/CMakeLists.txt
+++ b/source/module_hsolver/CMakeLists.txt
@@ -9,6 +9,9 @@ list(APPEND objects
     hsolver_pw_sdft.cpp
     diago_iter_assist.cpp
     hsolver.cpp
+    diago_pxxxgvx.cpp
+    diag_hs_para.cpp
+
 )
 
 if(ENABLE_LCAO)
diff --git a/source/module_hsolver/diag_hs_para.cpp b/source/module_hsolver/diag_hs_para.cpp
new file mode 100644
index 0000000000..d227c9c53b
--- /dev/null
+++ b/source/module_hsolver/diag_hs_para.cpp
@@ -0,0 +1,201 @@
+#include <iostream>
+#include "module_hsolver/diag_hs_para.h"
+#include "module_basis/module_ao/parallel_2d.h"
+#include "module_hsolver/diago_pxxxgvx.h"
+#include "module_base/scalapack_connector.h"
+#include "module_hsolver/genelpa/elpa_solver.h"
+
+namespace hsolver
+{
+
+#ifdef __ELPA 
+    void elpa_diag(MPI_Comm comm,
+                   const int nband, 
+                   std::complex<double>* h_local, 
+                   std::complex<double>* s_local, 
+                   double* ekb, 
+                   std::complex<double>* wfc_2d,
+                   Parallel_2D& para2d_local)
+    {
+        int DecomposedState = 0;
+        ELPA_Solver es(false,
+                   comm,
+                   nband,
+                   para2d_local.get_row_size(),
+                   para2d_local.get_col_size(),
+                   para2d_local.desc);
+        es.generalized_eigenvector(h_local, s_local, DecomposedState, ekb, wfc_2d);
+        es.exit();
+    }
+
+    void elpa_diag(MPI_Comm comm,
+                   const int nband, 
+                   double* h_local, 
+                   double* s_local, 
+                   double* ekb, 
+                   double* wfc_2d,
+                   Parallel_2D& para2d_local)
+    {
+        int DecomposedState = 0;
+        ELPA_Solver es(true,
+                   comm,
+                   nband,
+                   para2d_local.get_row_size(),
+                   para2d_local.get_col_size(),
+                   para2d_local.desc);
+        es.generalized_eigenvector(h_local, s_local, DecomposedState, ekb, wfc_2d);
+        es.exit();
+    }
+
+    void elpa_diag(MPI_Comm comm,
+                   const int nband, 
+                   std::complex<float>* h_local, 
+                   std::complex<float>* s_local, 
+                   float* ekb, 
+                   std::complex<float>* wfc_2d,
+                   Parallel_2D& para2d_local)
+    {
+        std::cout << "Error: ELPA do not support single precision. " << std::endl;
+        exit(1);
+    }
+
+    void elpa_diag(MPI_Comm comm,
+                   const int nband, 
+                   float* h_local, 
+                   float* s_local, 
+                   float* ekb, 
+                   float* wfc_2d,
+                   Parallel_2D& para2d_local)
+    {
+        std::cout << "Error: ELPA do not support single precision. " << std::endl;
+        exit(1);
+    }
+
+#endif
+
+
+#ifdef __MPI
+
+template <typename T>
+void Diago_HS_para(
+                T* h, 
+                T* s, 
+                 const int lda, 
+                 const int nband,
+                 typename GetTypeReal<T>::type *const ekb,
+                 T *const wfc,
+                 const MPI_Comm& comm,
+                 const int diag_subspace_method,
+                 const int block_size)
+{
+    int myrank;
+    MPI_Comm_rank(comm, &myrank);
+    Parallel_2D para2d_global;
+    Parallel_2D para2d_local;
+    para2d_global.init(lda,lda,lda,comm);
+
+    int max_nb = block_size;
+    if (block_size == 0)
+    {
+        if (nband > 500)
+        {
+            max_nb = 32;
+        }
+        else
+        {
+            max_nb = 16;
+        }
+    }
+    else if (block_size < 0)
+    {
+        std::cout << "Error: block_size in diago_subspace should be a positive integer. " << std::endl;
+        exit(1);
+    }
+
+    // for genelpa, if the block size is too large that some cores have no data, then it will cause error.
+    if (diag_subspace_method == 1)
+    {
+        if (max_nb * (std::max(para2d_global.dim0, para2d_global.dim1) - 1) >= lda)
+        {
+            max_nb = lda / std::max(para2d_global.dim0, para2d_global.dim1);
+        }
+    }
+    
+    para2d_local.init(lda,lda,max_nb,comm);
+    std::vector<T> h_local(para2d_local.get_col_size() * para2d_local.get_row_size());
+    std::vector<T> s_local(para2d_local.get_col_size() * para2d_local.get_row_size());
+    std::vector<T> wfc_2d(para2d_local.get_col_size() * para2d_local.get_row_size());
+    
+    // distribute h and s to 2D
+    Cpxgemr2d(lda,lda,h,1,1,para2d_global.desc,h_local.data(),1,1,para2d_local.desc,para2d_local.blacs_ctxt);
+    Cpxgemr2d(lda,lda,s,1,1,para2d_global.desc,s_local.data(),1,1,para2d_local.desc,para2d_local.blacs_ctxt);
+
+    if (diag_subspace_method == 1)
+    {
+#ifdef __ELPA 
+        elpa_diag(comm, nband, h_local.data(), s_local.data(), ekb, wfc_2d.data(), para2d_local);
+#else
+        std::cout << "ERROR: try to use ELPA to solve the generalized eigenvalue problem, but ELPA is not compiled. " << std::endl;
+        exit(1);
+#endif        
+    }
+    else if (diag_subspace_method == 2)
+    {
+        hsolver::pxxxgvx_diag(para2d_local.desc, para2d_local.get_row_size(), para2d_local.get_col_size(),nband, h_local.data(), s_local.data(), ekb, wfc_2d.data());
+    }
+    else{
+        std::cout << "Error: parallel diagonalization method is not supported. " << "diag_subspace_method = " << diag_subspace_method << std::endl;
+        exit(1);
+    }
+
+    // gather wfc
+    Cpxgemr2d(lda,lda,wfc_2d.data(),1,1,para2d_local.desc,wfc,1,1,para2d_global.desc,para2d_local.blacs_ctxt);
+
+    // free the context
+    Cblacs_gridexit(para2d_local.blacs_ctxt);
+    Cblacs_gridexit(para2d_global.blacs_ctxt);
+}
+
+// template instantiation
+template void Diago_HS_para<double>(double* h, 
+                                    double* s, 
+                                    const int lda, 
+                                    const int nband, 
+                                    typename GetTypeReal<double>::type *const ekb, 
+                                    double *const wfc, 
+                                    const MPI_Comm& comm, 
+                                    const int diag_subspace_method,
+                                    const int block_size);
+template void Diago_HS_para<std::complex<double>>(std::complex<double>* h, 
+                                    std::complex<double>* s, 
+                                    const int lda, 
+                                    const int nband, 
+                                    typename GetTypeReal<std::complex<double>>::type *const ekb, 
+                                    std::complex<double> *const wfc, 
+                                    const MPI_Comm& comm, 
+                                    const int diag_subspace_method,
+                                    const int block_size);      
+template void Diago_HS_para<float>(float* h, 
+                                    float* s, 
+                                    const int lda, 
+                                    const int nband, 
+                                    typename GetTypeReal<float>::type *const ekb, 
+                                    float *const wfc, 
+                                    const MPI_Comm& comm, 
+                                    const int diag_subspace_method,
+                                    const int block_size);
+template void Diago_HS_para<std::complex<float>>(std::complex<float>* h, 
+                                    std::complex<float>* s, 
+                                    const int lda, 
+                                    const int nband, 
+                                    typename GetTypeReal<std::complex<float>>::type *const ekb, 
+                                    std::complex<float> *const wfc, 
+                                    const MPI_Comm& comm, 
+                                    const int diag_subspace_method,
+                                    const int block_size);                                                                                                      
+
+
+
+#endif
+
+} // namespace hsolver
\ No newline at end of file
diff --git a/source/module_hsolver/diag_hs_para.h b/source/module_hsolver/diag_hs_para.h
new file mode 100644
index 0000000000..ce73199a00
--- /dev/null
+++ b/source/module_hsolver/diag_hs_para.h
@@ -0,0 +1,47 @@
+#include "module_basis/module_ao/parallel_2d.h"
+#include "module_base/macros.h"
+
+#ifdef __MPI
+#include <mpi.h>
+#endif
+
+namespace hsolver
+{
+
+
+#ifdef __MPI
+
+/**
+ * @brief Parallel do the generalized eigenvalue problem
+ * 
+ * @tparam T double or complex<double> or float or complex<float>
+ * @param H the hermitian matrix H.
+ * @param S the overlap matrix S.
+ * @param lda the leading dimension of H and S
+ * @param nband the number of bands to be calculated
+ * @param ekb to store the eigenvalues.
+ * @param wfc to store the eigenvectors
+ * @param comm the communicator
+ * @param diag_subspace_method the method to solve the generalized eigenvalue problem
+ * @param block_size the block size in 2d block cyclic distribution if use elpa or scalapack. 
+ * 
+ * @note 1. h and s should be full matrix in rank 0 of the communicator, and the other ranks is not concerned.
+ * @note 2. wfc is complete in rank 0, and not store in other ranks.
+ * @note 3. diag_subspace_method should be 1: by elpa, 2: by scalapack
+ * @note 4. block_size should be 0 or a positive integer. If it is 0, then will use a value as large as possible that is allowed
+ */
+template <typename T>
+void Diago_HS_para(
+                T* h, 
+                T* s, 
+                 const int lda, 
+                 const int nband,
+                 typename GetTypeReal<T>::type *const ekb,
+                 T *const wfc,
+                 const MPI_Comm& comm,
+                 const int diag_subspace_method,
+                 const int block_size=0);
+#endif
+
+} // namespace hsolver                 
+              
\ No newline at end of file
diff --git a/source/module_hsolver/diago_dav_subspace.cpp b/source/module_hsolver/diago_dav_subspace.cpp
index d11d7093f1..3aa4b8c45b 100644
--- a/source/module_hsolver/diago_dav_subspace.cpp
+++ b/source/module_hsolver/diago_dav_subspace.cpp
@@ -8,7 +8,10 @@
 #include "module_hsolver/kernels/math_kernel_op.h"
 #include "module_base/kernels/dsp/dsp_connector.h"
 
+#include "module_hsolver/diag_hs_para.h"
+
 #include <vector>
+#include <mpi.h>
 
 using namespace hsolver;
 
@@ -20,9 +23,12 @@ Diago_DavSubspace<T, Device>::Diago_DavSubspace(const std::vector<Real>& precond
                                                 const double& diag_thr_in,
                                                 const int& diag_nmax_in,
                                                 const bool& need_subspace_in,
-                                                const diag_comm_info& diag_comm_in)
+                                                const diag_comm_info& diag_comm_in,
+                                                const int diag_subspace_method_in,
+                                                const int diago_subspace_bs_in)
     : precondition(precondition_in), n_band(nband_in), dim(nbasis_in), nbase_x(nband_in * david_ndim_in),
-      diag_thr(diag_thr_in), iter_nmax(diag_nmax_in), is_subspace(need_subspace_in), diag_comm(diag_comm_in)
+      diag_thr(diag_thr_in), iter_nmax(diag_nmax_in), is_subspace(need_subspace_in), diag_comm(diag_comm_in),
+        diag_subspace_method(diag_subspace_method_in), diago_subspace_bs(diago_subspace_bs_in)
 {
     this->device = base_device::get_device_type<Device>(this->ctx);
 
@@ -32,6 +38,7 @@ Diago_DavSubspace<T, Device>::Diago_DavSubspace(const std::vector<Real>& precond
 
     assert(david_ndim_in > 1);
     assert(david_ndim_in * nband_in < nbasis_in * this->diag_comm.nproc);
+    assert(diag_subspace_method >= 0 && diag_subspace_method < 3);
 
     // TODO: Added memory usage statistics
 
@@ -526,14 +533,13 @@ void Diago_DavSubspace<T, Device>::diag_zhegvx(const int& nbase,
                                                T* vcc)
 {
     ModuleBase::timer::tick("Diago_DavSubspace", "diag_zhegvx");
+    assert(nbase_x >= std::max(1, nbase));
 
-    if (this->diag_comm.rank == 0)
+    if (this->device == base_device::GpuDevice)
     {
-        assert(nbase_x >= std::max(1, nbase));
-
-        if (this->device == base_device::GpuDevice)
-        {
 #if defined(__CUDA) || defined(__ROCM)
+        if (this->diag_comm.rank == 0)
+        {
             Real* eigenvalue_gpu = nullptr;
             resmem_real_op()(this->ctx, eigenvalue_gpu, this->nbase_x);
 
@@ -562,46 +568,96 @@ void Diago_DavSubspace<T, Device>::diag_zhegvx(const int& nbase,
             syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, (*eigenvalue_iter).data(), eigenvalue_gpu, this->nbase_x);
 
             delmem_real_op()(this->ctx, eigenvalue_gpu);
-#endif
         }
-        else
+#endif
+    }
+    else
+    {
+        if (this->diag_subspace_method == 0)
         {
-            std::vector<std::vector<T>> h_diag(nbase, std::vector<T>(nbase, *this->zero));
-            std::vector<std::vector<T>> s_diag(nbase, std::vector<T>(nbase, *this->zero));
-
-            for (size_t i = 0; i < nbase; i++)
+            if (this->diag_comm.rank == 0)
             {
-                for (size_t j = 0; j < nbase; j++)
+                std::vector<std::vector<T>> h_diag(nbase, std::vector<T>(nbase, *this->zero));
+                std::vector<std::vector<T>> s_diag(nbase, std::vector<T>(nbase, *this->zero));
+
+                for (size_t i = 0; i < nbase; i++)
+                {
+                    for (size_t j = 0; j < nbase; j++)
+                    {
+                        h_diag[i][j] = hcc[i * this->nbase_x + j];
+                        s_diag[i][j] = scc[i * this->nbase_x + j];
+                    }
+                }
+                dngvx_op<T, Device>()(this->ctx,
+                                      nbase,
+                                      this->nbase_x,
+                                      this->hcc,
+                                      this->scc,
+                                      nband,
+                                      (*eigenvalue_iter).data(),
+                                      this->vcc);
+                // reset:
+                for (size_t i = 0; i < nbase; i++)
                 {
-                    h_diag[i][j] = hcc[i * this->nbase_x + j];
-                    s_diag[i][j] = scc[i * this->nbase_x + j];
+                    for (size_t j = 0; j < nbase; j++)
+                    {
+                        hcc[i * this->nbase_x + j] = h_diag[i][j];
+                        scc[i * this->nbase_x + j] = s_diag[i][j];
+                    }
+
+                    for (size_t j = nbase; j < this->nbase_x; j++)
+                    {
+                        hcc[i * this->nbase_x + j] = *this->zero;
+                        hcc[j * this->nbase_x + i] = *this->zero;
+                        scc[i * this->nbase_x + j] = *this->zero;
+                        scc[j * this->nbase_x + i] = *this->zero;
+                    }
                 }
             }
-            dngvx_op<T, Device>()(this->ctx,
-                                  nbase,
-                                  this->nbase_x,
-                                  this->hcc,
-                                  this->scc,
-                                  nband,
-                                  (*eigenvalue_iter).data(),
-                                  this->vcc);
-            // reset:
-            for (size_t i = 0; i < nbase; i++)
+        }
+        else
+        {
+#ifdef __MPI  
+            std::vector<T> h_diag;
+            std::vector<T> s_diag;
+            std::vector<T> vcc_tmp;
+            if (this->diag_comm.rank == 0)
             {
-                for (size_t j = 0; j < nbase; j++)
+                h_diag.resize(nbase * nbase, *this->zero);
+                s_diag.resize(nbase * nbase, *this->zero);
+                vcc_tmp.resize(nbase * nbase, *this->zero);
+                for (size_t i = 0; i < nbase; i++)
                 {
-                    hcc[i * this->nbase_x + j] = h_diag[i][j];
-                    scc[i * this->nbase_x + j] = s_diag[i][j];
+                    for (size_t j = 0; j < nbase; j++)
+                    {
+                        h_diag[i * nbase + j] = hcc[i * this->nbase_x + j];
+                        s_diag[i * nbase + j] = scc[i * this->nbase_x + j];
+                    }
                 }
-
-                for (size_t j = nbase; j < this->nbase_x; j++)
+            }
+            Diago_HS_para(h_diag.data(),
+                        s_diag.data(),
+                        nbase,
+                        nband,
+                        (*eigenvalue_iter).data(),
+                        vcc_tmp.data(),
+                        this->diag_comm.comm,
+                        this->diag_subspace_method,
+                        this->diago_subspace_bs);
+            if (this->diag_comm.rank == 0)
+            {
+                for (size_t i = 0; i < nband; i++)
                 {
-                    hcc[i * this->nbase_x + j] = *this->zero;
-                    hcc[j * this->nbase_x + i] = *this->zero;
-                    scc[i * this->nbase_x + j] = *this->zero;
-                    scc[j * this->nbase_x + i] = *this->zero;
+                    for (size_t j = 0; j < nbase; j++)
+                    {
+                        vcc[i * this->nbase_x + j] = vcc_tmp[i * nbase + j];
+                    }
                 }
             }
+#else
+            std::cout << "Error: parallel diagonalization is not supported in serial mode." << std::endl;
+            exit(1);    
+#endif
         }
     }
 
diff --git a/source/module_hsolver/diago_dav_subspace.h b/source/module_hsolver/diago_dav_subspace.h
index d93c252602..baf984d999 100644
--- a/source/module_hsolver/diago_dav_subspace.h
+++ b/source/module_hsolver/diago_dav_subspace.h
@@ -31,7 +31,9 @@ class Diago_DavSubspace
                       const double& diag_thr_in,
                       const int& diag_nmax_in,
                       const bool& need_subspace_in,
-                      const diag_comm_info& diag_comm_in);
+                      const diag_comm_info& diag_comm_in,
+                      const int diago_dav_method_in,
+                      const int block_size_in);
 
     ~Diago_DavSubspace();
 
@@ -139,6 +141,9 @@ class Diago_DavSubspace
 
     bool test_exit_cond(const int& ntry, const int& notconv, const bool& scf);
 
+    int diag_subspace_method; // 0: LAPACK, 1: Gen-ELPA, 2: ScaLAPACK
+    int diago_subspace_bs = 0; // the block size in 2d block cyclic distribution if use elpa or scalapack
+
 #ifdef __DSP
     using resmem_complex_op = base_device::memory::resize_memory_op_mt<T, Device>;
     using delmem_complex_op = base_device::memory::delete_memory_op_mt<T, Device>;
diff --git a/source/module_hsolver/diago_pxxxgvx.cpp b/source/module_hsolver/diago_pxxxgvx.cpp
new file mode 100644
index 0000000000..ac639d7996
--- /dev/null
+++ b/source/module_hsolver/diago_pxxxgvx.cpp
@@ -0,0 +1,344 @@
+#include <complex>
+#include <vector>
+#include "string.h"
+#include <iostream>
+
+#include "diago_pxxxgvx.h"
+#include "module_base/scalapack_connector.h"
+#include "module_base/blacs_connector.h"
+
+namespace hsolver
+{
+
+#ifdef __MPI
+
+void pxxxgvx_(const int* itype, const char* jobz, const char* range, const char* uplo,
+		const int* n, double* A, const int* ia, const int* ja, const int*desca, double* B, const int* ib, const int* jb, const int*descb,
+		const double* vl, const double* vu, const int* il, const int* iu,
+		const double* abstol, int* m, int* nz, double* w, const double*orfac, double* Z, const int* iz, const int* jz, const int*descz,
+		double* work, int* lwork, double* rwork, int* lrwork, int*iwork, int*liwork, int* ifail, int*iclustr, double*gap, int* info)
+{
+    // for a uniform interface, rwork and lrwork are input arguments, but not used in pdsygvx_/pssygvx_
+    pdsygvx_(itype, jobz, range, uplo,
+		n, A, ia, ja, desca, B, ib, jb, descb,
+		vl, vu, il,  iu,
+		abstol, m, nz, w, orfac, Z, iz, jz, descz,
+		work, lwork, iwork, liwork, ifail, iclustr, gap, info);
+}
+
+void pxxxgvx_(const int* itype, const char* jobz, const char* range, const char* uplo,
+		const int* n, std::complex<double>* A, const int* ia, const int* ja, const int*desca, std::complex<double>* B, const int* ib, const int* jb, const int*descb,
+		const double* vl, const double* vu, const int* il, const int* iu,
+		const double* abstol, int* m, int* nz, double* w, const double*orfac, std::complex<double>* Z, const int* iz, const int* jz, const int*descz,
+		std::complex<double>* work, int* lwork, double* rwork, int* lrwork, int*iwork, int*liwork, int* ifail, int*iclustr, double*gap, int* info)
+{
+    pzhegvx_(itype, jobz, range, uplo,
+		n, A, ia, ja, desca, B, ib, jb, descb,
+		vl, vu, il,  iu,
+		abstol, m, nz, w, orfac, Z, iz, jz, descz,
+		work, lwork,rwork, lrwork, iwork, liwork, ifail, iclustr, gap, info);
+}
+
+void pxxxgvx_(const int* itype, const char* jobz, const char* range, const char* uplo,
+		const int* n, float* A, const int* ia, const int* ja, const int*desca, float* B, const int* ib, const int* jb, const int*descb,
+		const float* vl, const float* vu, const int* il, const int* iu,
+		const float* abstol, int* m, int* nz, float* w, const float*orfac, float* Z, const int* iz, const int* jz, const int*descz,
+		float* work, int* lwork, float* rwork, int* lrwork, int*iwork, int*liwork, int* ifail, int*iclustr, float*gap, int* info)
+{
+    pssygvx_(itype, jobz, range, uplo,
+		n, A, ia, ja, desca, B, ib, jb, descb,
+		vl, vu, il,  iu,
+		abstol, m, nz, w, orfac, Z, iz, jz, descz,
+		work, lwork, iwork, liwork, ifail, iclustr, gap, info);
+}
+
+void pxxxgvx_(const int* itype, const char* jobz, const char* range, const char* uplo,
+		const int* n, std::complex<float>* A, const int* ia, const int* ja, const int*desca, std::complex<float>* B, const int* ib, const int* jb, const int*descb,
+		const float* vl, const float* vu, const int* il, const int* iu,
+		const float* abstol, int* m, int* nz, float* w, const float*orfac, std::complex<float>* Z, const int* iz, const int* jz, const int*descz,
+		std::complex<float>* work, int* lwork, float* rwork, int* lrwork, int*iwork, int*liwork, int* ifail, int*iclustr, float*gap, int* info)
+{
+    pchegvx_(itype, jobz, range, uplo,
+		n, A, ia, ja, desca, B, ib, jb, descb,
+		vl, vu, il,  iu,
+		abstol, m, nz, w, orfac, Z, iz, jz, descz,
+		work, lwork,rwork, lrwork, iwork, liwork, ifail, iclustr, gap, info);
+}
+
+// post processing for pdsygvx/pzhegvx/pdsygvx/pzhegvx
+void pxxxgvx_post_processing(const int info, 
+                    const std::vector<int>& ifail,
+                    const std::vector<int>& iclustr,
+                    const int M,
+                    const int NZ,
+                    const int nbands, 
+                    int& degeneracy_max)
+{
+    const std::string str_info = "Scalapack diagonalization: \n    info = " + std::to_string(info) + ".\n";
+
+    if (info == 0)
+    {
+        return;
+    }
+    else if (info < 0)
+    {
+        const int info_negative = -info;
+        const std::string str_index
+            = (info_negative > 100)
+                  ? std::to_string(info_negative / 100) + "-th argument "
+                        + std::to_string(info_negative % 100) + "-entry is illegal.\n"
+                  : std::to_string(info_negative) + "-th argument is illegal.\n";
+        throw std::runtime_error(str_info + str_index);
+    }
+    else if (info % 2)
+    {
+        std::string str_ifail = "ifail = ";
+        for (const int i: ifail)
+        {
+            str_ifail += std::to_string(i) + " ";
+        }
+        throw std::runtime_error(str_info + str_ifail);
+    }
+    else if (info / 2 % 2)
+    {
+        int degeneracy_need = 0;
+        for (int irank = 0; irank < iclustr.size()/2; ++irank)
+        {
+            degeneracy_need = std::max(degeneracy_need, iclustr[2 * irank + 1] - iclustr[2 * irank]);
+        }
+        const std::string str_need = "degeneracy_need = " + std::to_string(degeneracy_need) + ".\n";
+        const std::string str_saved
+            = "degeneracy_saved = " + std::to_string(degeneracy_max) + ".\n";
+        if (degeneracy_need <= degeneracy_max)
+        {
+            throw std::runtime_error(str_info + str_need + str_saved);
+        }
+        else
+        {
+            std::cout << str_need << str_saved;
+            degeneracy_max = degeneracy_need;
+            return;
+        }
+    }
+    else if (info / 4 % 2)
+    {
+        const std::string str_M = "M = " + std::to_string(M) + ".\n";
+        const std::string str_NZ = "NZ = " + std::to_string(NZ) + ".\n";
+        const std::string str_NBANDS
+            = "Number of eigenvalues solved = " + std::to_string(nbands) + ".\n";
+        throw std::runtime_error(str_info + str_M + str_NZ + str_NBANDS);
+    }
+    else if (info / 16 % 2)
+    {
+        const std::string str_npos = "Not positive definite = " + std::to_string(ifail[0]) + ".\n";
+        throw std::runtime_error(str_info + str_npos);
+    }
+    else
+    {
+        throw std::runtime_error(str_info);
+    }
+}
+
+void get_lwork(int& lwork, std::vector<double>& work)
+{
+    lwork = work[0];
+}
+
+void get_lwork(int& lwork, std::vector<float>& work)
+{
+    lwork = work[0];
+}
+
+void get_lwork(int& lwork, std::vector<std::complex<double>>& work)
+{
+    lwork = work[0].real();
+}
+
+void get_lwork(int& lwork, std::vector<std::complex<float>>& work)
+{
+    lwork = work[0].real();
+}
+
+template <typename T>
+void pxxxgvx_diag(const int *const desc,
+                      const int ncol,
+                      const int nrow,
+                      const int nbands,
+                      const T *const h_mat,
+                      const T *const s_mat,
+                      typename GetTypeReal<T>::type *const ekb,
+                      T *const wfc_2d)
+{
+    int nprow, npcol, myprow, mypcol;
+    Cblacs_gridinfo(desc[1], &nprow, &npcol, &myprow, &mypcol);
+    int dsize = nprow * npcol;
+
+    int degeneracy_max = 12; // only used for complex<float> and complex<double>
+    while (true)
+    {
+        std::vector<T> h_tmp(ncol * nrow);
+        std::vector<T> s_tmp(ncol * nrow);
+        memcpy(h_tmp.data(), h_mat, sizeof(T) * ncol * nrow);
+        memcpy(s_tmp.data(), s_mat, sizeof(T) * ncol * nrow);
+
+        int ndim_global = desc[2];
+        const char jobz = 'V', range = 'I', uplo = 'U';
+        const int itype = 1, il = 1, iu = nbands, one = 1;
+        int M = 0, NZ = 0, lwork = -1, lrwork = -1, liwork = -1, info = 0;
+        const typename GetTypeReal<T>::type abstol = 0, orfac = -1;
+        const typename GetTypeReal<T>::type vl = 0, vu = 0;
+        std::vector<T> work(1, 0);
+        std::vector<typename GetTypeReal<T>::type> rwork(3, 0); // only used for complex<float> and complex<double>
+        std::vector<int> iwork(1, 0);
+        std::vector<int> ifail(ndim_global, 0);
+        std::vector<int> iclustr(2 * dsize);
+        std::vector<typename GetTypeReal<T>::type> gap(dsize);
+
+        pxxxgvx_(&itype,
+                 &jobz,
+                 &range,
+                 &uplo,
+                 &ndim_global,
+                 h_tmp.data(),
+                 &one,
+                 &one,
+                 desc,
+                 s_tmp.data(),
+                 &one,
+                 &one,
+                 desc,
+                 &vl,
+                 &vu,
+                 &il,
+                 &iu,
+                 &abstol,
+                 &M,
+                 &NZ,
+                 ekb,
+                 &orfac,
+                 wfc_2d,
+                 &one,
+                 &one,
+                 desc,
+                 work.data(),
+                 &lwork, // is not used for real data type
+                 rwork.data(), // is not used for real data type
+                 &lrwork,
+                 iwork.data(),
+                 &liwork,
+                 ifail.data(),
+                 iclustr.data(),
+                 gap.data(),
+                 &info);
+
+        if (info)
+        {
+            throw std::runtime_error("Scalapack diagonalization: \n    info = " + std::to_string(info) + ".\n");
+        }
+
+        if (std::is_same<T, std::complex<float>>::value || std::is_same<T, std::complex<double>>::value)
+        {
+            get_lwork(lwork, work);
+            work.resize(lwork, 0);
+            liwork = iwork[0];
+            iwork.resize(liwork, 0);
+            lrwork = rwork[0] + degeneracy_max * ndim_global;
+            int maxlrwork = std::max(lrwork, 3);
+            rwork.resize(maxlrwork, 0);
+        }
+        else
+        {
+            get_lwork(lwork, work);
+            work.resize(std::max(lwork, 3), 0);
+            liwork = iwork[0];
+            iwork.resize(liwork, 0);
+        }
+
+        pxxxgvx_(&itype,
+                 &jobz,
+                 &range,
+                 &uplo,
+                 &ndim_global,
+                 h_tmp.data(),
+                 &one,
+                 &one,
+                 desc,
+                 s_tmp.data(),
+                 &one,
+                 &one,
+                 desc,
+                 &vl,
+                 &vu,
+                 &il,
+                 &iu,
+                 &abstol,
+                 &M,
+                 &NZ,
+                 ekb,
+                 &orfac,
+                 wfc_2d,
+                 &one,
+                 &one,
+                 desc,
+                 work.data(),
+                 &lwork,
+                 rwork.data(), // is not used for real data type
+                 &lrwork,      // is not used for real data type
+                 iwork.data(),
+                 &liwork,
+                 ifail.data(),
+                 iclustr.data(),
+                 gap.data(),
+                 &info);
+
+        if (info == 0)
+        {
+            return;
+        }
+        pxxxgvx_post_processing(info, ifail,iclustr, M,NZ,nbands,degeneracy_max);
+
+        // break the loop for real data type
+        if (std::is_same<T, float>::value || std::is_same<T,double>::value)
+        {
+            return;
+        }
+    }
+}
+
+// template instantiation
+template void pxxxgvx_diag(const int *const desc,
+                           const int ncol,
+                           const int nrow,
+                           const int nbands,
+                           const double *const h_mat,
+                           const double *const s_mat,
+                           double *const ekb,
+                           double *const wfc_2d);
+template void pxxxgvx_diag(const int *const desc,
+                           const int ncol,
+                           const int nrow,
+                           const int nbands,
+                           const std::complex<double> *const h_mat,
+                           const std::complex<double> *const s_mat,
+                           double *const ekb,
+                           std::complex<double> *const wfc_2d);
+template void pxxxgvx_diag(const int *const desc,
+                            const int ncol,
+                            const int nrow,
+                            const int nbands,
+                            const float *const h_mat,
+                            const float *const s_mat,
+                            float *const ekb,
+                            float *const wfc_2d);
+template void pxxxgvx_diag(const int *const desc,
+                            const int ncol,
+                            const int nrow,
+                            const int nbands,
+                            const std::complex<float> *const h_mat,
+                            const std::complex<float> *const s_mat,
+                            float *const ekb,
+                            std::complex<float> *const wfc_2d);                           
+
+#endif
+
+} // namespace
\ No newline at end of file
diff --git a/source/module_hsolver/diago_pxxxgvx.h b/source/module_hsolver/diago_pxxxgvx.h
new file mode 100644
index 0000000000..6684a5ea6f
--- /dev/null
+++ b/source/module_hsolver/diago_pxxxgvx.h
@@ -0,0 +1,37 @@
+#ifndef HSOLVER_DIAGO_PXXXGVX_H
+#define HSOLVER_DIAGO_PXXXGVX_H
+#include <complex>
+#include "module_base/macros.h"
+
+namespace hsolver
+{
+
+#ifdef __MPI
+/**
+ * @brief generalized eigenvalue problem using pdsygvx/pzhegvx/pdsygvx/pzhegvx
+ * 
+ * @param desc the descriptor of scalapack descriptor
+ * @param ncol the number of columns of the H/S matrix in current processor
+ * @param nrow the number of rows of the H/S matrix in current processor
+ * @param nbands the number of bands to be solved
+ * @param h_mat the Hamiltonian matrix
+ * @param s_mat the overlap matrix
+ * @param ekb the eigenvalues
+ * @param wfc_2d the eigenvectors in 2D block cyclic distribution
+ * 
+ */
+
+template <typename T>
+void pxxxgvx_diag(const int* const desc,
+                  const int ncol,
+                  const int nrow,
+                  const int nbands,
+                  const T* const h_mat,
+                  const T* const s_mat,
+                  typename GetTypeReal<T>::type* const ekb,
+                  T* const wfc_2d);
+#endif 
+
+} // namespace hsolver
+
+#endif // HSOLVER_DIAGO_PXXXGVX_H
\ No newline at end of file
diff --git a/source/module_hsolver/hsolver_pw.cpp b/source/module_hsolver/hsolver_pw.cpp
index 05a4a893eb..83bac220f8 100644
--- a/source/module_hsolver/hsolver_pw.cpp
+++ b/source/module_hsolver/hsolver_pw.cpp
@@ -500,7 +500,9 @@ void HSolverPW<T, Device>::hamiltSolvePsiK(hamilt::Hamilt<T, Device>* hm,
                                                   this->diag_thr,
                                                   this->diag_iter_max,
                                                   this->need_subspace,
-                                                  comm_info);
+                                                  comm_info,
+                                                  PARAM.inp.diag_subspace_method,
+                                                  PARAM.inp.nb2d);
 
         DiagoIterAssist<T, Device>::avg_iter += static_cast<double>(
             dav_subspace.diag(hpsi_func, psi.get_pointer(), psi.get_nbasis(), eigenvalue, this->ethr_band.data(), scf));
diff --git a/source/module_hsolver/test/CMakeLists.txt b/source/module_hsolver/test/CMakeLists.txt
index 53698c37cb..7ed27cb6c2 100644
--- a/source/module_hsolver/test/CMakeLists.txt
+++ b/source/module_hsolver/test/CMakeLists.txt
@@ -153,6 +153,12 @@ AddTest(
   SOURCES parallel_k2d_test.cpp ../parallel_k2d.cpp ../../module_cell/parallel_kpoints.cpp  ../../module_basis/module_ao/parallel_2d.cpp
 )
 
+AddTest(
+  TARGET hsolver_diago_hs_parallel
+  LIBS parameter  ${math_libs} ELPA::ELPA base device MPI::MPI_CXX genelpa psi
+  SOURCES test_diago_hs_para.cpp ../diag_hs_para.cpp ../diago_pxxxgvx.cpp  ../../module_basis/module_ao/parallel_2d.cpp ../diago_elpa.cpp ../diago_scalapack.cpp 
+)
+
 find_program(BASH bash)
 if (ENABLE_MPI)
   add_test(NAME HSolver_cg_parallel
@@ -172,11 +178,11 @@ if (ENABLE_MPI)
           COMMAND ${BASH} diago_lcao_parallel_test.sh
           WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
         )
-  if(ENABLE_PEXSI)
-    add_test(NAME HSolver_LCAO_PEXSI_parallel
-          COMMAND ${BASH} diago_pexsi_parallel_test.sh
-          WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
-    )
-endif()
-endif()
+    if(ENABLE_PEXSI)
+      add_test(NAME HSolver_LCAO_PEXSI_parallel
+            COMMAND ${BASH} diago_pexsi_parallel_test.sh
+            WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+      )
+    endif()
+  endif()
 endif()
\ No newline at end of file
diff --git a/source/module_hsolver/test/test_diago_hs_para.cpp b/source/module_hsolver/test/test_diago_hs_para.cpp
new file mode 100644
index 0000000000..29ed7e30a1
--- /dev/null
+++ b/source/module_hsolver/test/test_diago_hs_para.cpp
@@ -0,0 +1,359 @@
+#ifndef TEST_DIAGO_PXXXGVX_H
+#define TEST_DIAGO_PXXXGVX_H
+
+#include <gtest/gtest.h>
+#include <vector>
+#include <complex>
+#include <random>
+#include <mpi.h>
+#include <chrono>
+#include <fstream>
+
+#include "../diag_hs_para.h"
+#include "module_hsolver/kernels/dngvd_op.h"
+
+template <typename T>
+typename std::enable_if<std::is_same<T, double>::value || std::is_same<T, float>::value>::type
+generate_random_hs_impl(int d, std::mt19937& gen, std::uniform_real_distribution<typename GetTypeReal<T>::type>& dis, std::vector<T>& h_mat, std::vector<T>& s_mat) {
+    // For S matrix, firstly we generate a random symmetric matrix s_tmp, then we set S = s_tmp * s_tmp^T + n * I
+    std::vector<T> s_tmp(d*d,0);
+    for (int i = 0; i < d; ++i) {
+        for (int j = i; j < d; ++j) {
+            typename GetTypeReal<T>::type value1 = static_cast<typename GetTypeReal<T>::type>(dis(gen));
+            h_mat[i * d + j] = value1;
+            h_mat[j * d + i] = value1;
+
+            // construct a random overlap matrix
+            typename GetTypeReal<T>::type value2 = static_cast<typename GetTypeReal<T>::type>(dis(gen));
+            s_tmp[i * d + j] = value2;
+        }
+    }
+
+    // set S = s_tmp * s_tmp^T + n * I
+    for (int i = 0; i < d; ++i) {
+        for (int j = 0; j < d; ++j) {
+            s_mat[i * d + j] = 0;
+            for (int k = 0; k < d; ++k) {
+                s_mat[i * d + j] += s_tmp[i * d + k] * s_tmp[j * d + k];
+            }
+            if (i == j) {
+                s_mat[i * d + j] += 2.0;
+            }
+        }
+    }
+}
+
+template <typename T>
+typename std::enable_if<std::is_same<T, std::complex<double>>::value || std::is_same<T, std::complex<float>>::value>::type
+generate_random_hs_impl(int d, std::mt19937& gen, std::uniform_real_distribution<typename GetTypeReal<T>::type>& dis, std::vector<T>& h_mat, std::vector<T>& s_mat) {
+    std::vector<T> s_tmp(d*d,0);
+    for (int i = 0; i < d; ++i) {
+        for (int j = i; j < d; ++j) {
+            typename GetTypeReal<T>::type value1 = static_cast<typename GetTypeReal<T>::type>(dis(gen));
+            typename GetTypeReal<T>::type value2 = static_cast<typename GetTypeReal<T>::type>(dis(gen));
+            h_mat[i * d + j] = T(value1, value2);
+            if (i != j)
+            {
+                h_mat[j * d + i] = T(value1, -value2);
+            }
+            else{
+                h_mat[j * d + i] = T(value1, 0);
+            }
+
+            // construct a random overlap matrix
+            value1 = static_cast<typename GetTypeReal<T>::type>(dis(gen));
+            value2 = static_cast<typename GetTypeReal<T>::type>(dis(gen));
+            s_tmp[i * d + j] = T(value1, value2);
+        }
+    }
+
+    // set S = s_tmp * s_tmp^T + n * I
+    for (int i = 0; i < d; ++i) {
+        for (int j = 0; j < d; ++j) {
+            s_mat[i * d + j] = T(0, 0);
+            for (int k = 0; k < d; ++k) {
+                s_mat[i * d + j] += s_tmp[i * d + k] * std::conj(s_tmp[j * d + k]);
+            }
+            if (i == j) {
+                s_mat[i * d + j] += T(2.0, 0);
+            }
+        }
+    }
+}
+
+template <typename T>
+void generate_random_hs(int d, int random_seed ,std::vector<T>& h_mat, std::vector<T>& s_mat) {
+    std::mt19937 gen(random_seed);
+    std::uniform_real_distribution<typename GetTypeReal<T>::type> dis(-1.0,1.0);
+
+    h_mat.resize(d * d);
+    s_mat.resize(d * d);
+    generate_random_hs_impl(d, gen, dis, h_mat, s_mat);
+}
+
+
+template <typename T>
+typename std::enable_if<std::is_same<T, double>::value || std::is_same<T, float>::value>::type
+verify_results(const std::vector<T>& h_psi, const std::vector<T>& s_psi, const std::vector<typename GetTypeReal<T>::type>& ekb, int lda, int nbands, double threshold) {
+    for (int i = 0; i < lda; ++i) {
+        for (int j = 0; j < nbands; ++j) {
+            ASSERT_NEAR(h_psi[j * lda + i], ekb[j] * s_psi[j * lda + i], threshold);
+        }
+    }
+}
+
+template <typename T>
+typename std::enable_if<std::is_same<T, std::complex<double>>::value || std::is_same<T, std::complex<float>>::value>::type
+verify_results(const std::vector<T>& h_psi, const std::vector<T>& s_psi, const std::vector<typename GetTypeReal<T>::type>& ekb, int lda, int nbands, double threshold) {
+    for (int i = 0; i < lda; ++i) {
+        for (int j = 0; j < nbands; ++j) {
+            ASSERT_NEAR(h_psi[j * lda + i].real(), ekb[j] * s_psi[j * lda + i].real(), threshold);
+            ASSERT_NEAR(h_psi[j * lda + i].imag(), ekb[j] * s_psi[j * lda + i].imag(), threshold);
+        }
+    }
+}
+
+template <typename T>
+void test_diago_hs(int lda, int nb, int random_seed, int nbands, int diag_type, MPI_Comm comm) {
+    // diag_type should be 1 (for elpa) or 2 (for scalapack)
+    int my_rank;
+    MPI_Comm_rank(comm, &my_rank);
+
+    std::vector<T> h_mat, s_mat, wfc, h_psi, s_psi;
+    std::vector<typename GetTypeReal<T>::type> ekb(lda);
+    if (my_rank==0)
+    {
+        h_mat.resize(lda * lda);
+        s_mat.resize(lda * lda);
+        wfc.resize(lda * lda);
+        //generate_random_hs(lda, random_seed, h_mat, s_mat);
+        //read h from h.dat and s from s.dat,
+        std::ifstream hfile("h.dat");
+        hfile.read((char*)h_mat.data(), lda * lda * sizeof(T));
+        hfile.close();
+        std::ifstream sfile("s.dat");
+        sfile.read((char*)s_mat.data(), lda * lda * sizeof(T));
+        sfile.close();
+    }
+    hsolver::Diago_HS_para<T>(h_mat.data(), s_mat.data(), lda, nbands,ekb.data(), wfc.data(), comm, diag_type, nb);
+
+    // Verify results
+    if (my_rank == 0){
+        double threshold = 1e-6;
+        if (std::is_same<T, std::complex<double>>::value || std::is_same<T, double>::value) {
+            threshold = 1e-12;
+        }
+
+        h_psi.resize(lda * nbands, 0);
+        s_psi.resize(lda * nbands, 0);
+
+        for (int i = 0; i < lda; ++i) {
+            for (int j = 0; j < nbands; ++j) {
+                for (int k = 0; k < lda; ++k) {
+                    h_psi[j * lda + i] += h_mat[k * lda + i] * wfc[j * lda + k];
+                    s_psi[j * lda + i] += s_mat[k * lda + i] * wfc[j * lda + k];
+                }
+            }
+        }
+        verify_results<T>(h_psi, s_psi, ekb, lda, nbands, threshold);
+    }
+}
+
+template <typename T>
+void test_performance(int lda, int nb, int nbands, MPI_Comm comm,int case_numb, int loop_numb) {
+    // generate 10 random H/S, and do the diagonalization 100 times by using elpa/scalapack and lapack.
+    int my_rank, nproc;
+    MPI_Comm_rank(comm, &my_rank);
+    MPI_Comm_size(comm, &nproc);
+
+    std::vector<T> h_mat, s_mat, wfc, h_psi, s_psi;
+    std::vector<typename GetTypeReal<T>::type> ekb_elpa(lda);
+    std::vector<typename GetTypeReal<T>::type> ekb_scalap(lda);
+    std::vector<typename GetTypeReal<T>::type> ekb_lapack(lda);
+
+    if (my_rank==0)
+    {
+        std::cout << "\nMatrix size: " << lda << " x " << lda << std::endl;
+        std::cout << "Number of bands: " << nbands << std::endl;
+        std::cout << "Number of processors: " << nproc << std::endl;
+        std::cout << "Block size of 2D distribution: " << nb << std::endl;
+        h_mat.resize(lda * lda);
+        s_mat.resize(lda * lda);
+        wfc.resize(lda * lda);
+    }
+
+    // store all the times in a vector
+    std::vector<double> time_elpa(case_numb, 0);
+    std::vector<double> time_scalap(case_numb, 0);
+    std::vector<double> time_lapack(case_numb, 0);
+
+    if (my_rank == 0) std::cout << "Random matrix ";
+    for (int randomi = 0; randomi < case_numb; ++randomi) 
+    {
+        
+        if (my_rank == 0) {
+            std::cout << randomi << " ";
+            generate_random_hs(lda, randomi, h_mat, s_mat);
+        }
+
+        // ELPA
+        MPI_Barrier(comm);
+        auto start = std::chrono::high_resolution_clock::now();
+        for (int j=0;j<loop_numb;j++)
+        {
+            hsolver::Diago_HS_para<T>(h_mat.data(), s_mat.data(), lda, nbands,ekb_elpa.data(), wfc.data(), comm, 1, nb);
+            MPI_Barrier(comm);
+        }
+        MPI_Barrier(comm);
+        auto end = std::chrono::high_resolution_clock::now();
+        time_elpa[randomi] = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
+
+
+        // scalapack
+        start = std::chrono::high_resolution_clock::now();
+        for (int j=0;j<loop_numb;j++)
+        {
+            hsolver::Diago_HS_para<T>(h_mat.data(), s_mat.data(), lda, nbands,ekb_scalap.data(), wfc.data(), comm, 2, nb);
+            MPI_Barrier(comm);
+        }
+        MPI_Barrier(comm);
+        end = std::chrono::high_resolution_clock::now();
+        time_scalap[randomi] = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
+
+        //LApack
+        if (my_rank == 0) 
+        { 
+            std::vector<T> h_tmp, s_tmp;
+            start = std::chrono::high_resolution_clock::now();
+            base_device::DEVICE_CPU* ctx = {};
+
+            for (int j=0;j<loop_numb;j++)
+            {
+                h_tmp = h_mat;
+                s_tmp = s_mat;
+                hsolver::dngvx_op<T,base_device::DEVICE_CPU>()(ctx,
+                                      lda,
+                                      lda,
+                                      h_tmp.data(),
+                                      s_tmp.data(),
+                                      nbands,
+                                      ekb_lapack.data(),
+                                      wfc.data());
+            }
+            end = std::chrono::high_resolution_clock::now();
+            time_lapack[randomi] = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
+
+            //COMPARE EKB
+            for (int i = 0; i < nbands; ++i) {
+                typename GetTypeReal<T>::type diff_elpa_lapack = std::abs(ekb_elpa[i] - ekb_lapack[i]);
+                typename GetTypeReal<T>::type diff_scalap_lapack = std::abs(ekb_scalap[i] - ekb_lapack[i]);
+                if (diff_elpa_lapack > 1e-6 || diff_scalap_lapack > 1e-6)
+                {
+                    std::cout << "eigenvalue " << i << " by ELPA: " << ekb_elpa[i] << std::endl;
+                    std::cout << "eigenvalue " << i << " by Scalapack: " << ekb_scalap[i] << std::endl;
+                    std::cout << "eigenvalue " << i << " by Lapack: " << ekb_lapack[i] << std::endl;
+                }
+            }
+        }
+        MPI_Barrier(comm);   
+
+    }
+
+    if (my_rank == 0)
+    {
+        std::cout << "\nELPA Time     : ";
+        for (int i=0; i < case_numb;i++)
+        {std::cout << time_elpa[i] << " ";}
+        std::cout << std::endl;
+
+        std::cout << "scalapack Time: ";
+        for (int i=0; i < case_numb;i++)
+        {std::cout << time_scalap[i] << " ";}
+        std::cout << std::endl;
+
+        std::cout << "lapack Time   : ";
+        for (int i=0; i < case_numb;i++)
+        {std::cout << time_lapack[i] << " ";}
+        std::cout << std::endl;
+
+        // print out the average time and speedup
+        double avg_time_elpa = 0;
+        double avg_time_scalap = 0;
+        double avg_time_lapack = 0;
+        for (int i=0; i < case_numb;i++)
+        {
+            avg_time_elpa += time_elpa[i];
+            avg_time_scalap += time_scalap[i];
+            avg_time_lapack += time_lapack[i];
+        }
+
+        avg_time_elpa /= case_numb;
+        avg_time_scalap /= case_numb;
+        avg_time_lapack /= case_numb;
+        std::cout << "Average Lapack Time   : " << avg_time_lapack << " ms" << std::endl;
+        std::cout << "Average ELPA Time     : " << avg_time_elpa << " ms, Speedup: " << avg_time_lapack / avg_time_elpa << std::endl;
+        std::cout << "Average Scalapack Time: " << avg_time_scalap << " ms, Speedup: " << avg_time_lapack / avg_time_scalap << std::endl;
+    }
+}
+
+//test_diago_hs(int lda, int nb, int random_seed, int nbands, int diag_type, MPI_Comm comm)
+TEST(DiagoPxxxgvxElpaTest, Double) {
+    test_diago_hs<double>(16, 4, 0, 10, 1, MPI_COMM_WORLD);
+    test_diago_hs<double>(20, 6, 0, 18, 1, MPI_COMM_WORLD);
+}
+
+TEST(DiagoPxxxgvxElpaTest, ComplexDouble) {
+    test_diago_hs<std::complex<double>>(16, 6, 0, 10, 1, MPI_COMM_WORLD);
+    test_diago_hs<std::complex<double>>(20, 8, 0, 18, 1, MPI_COMM_WORLD);
+}
+
+TEST(DiagoPxxxgvxScalapackTest, Double) {
+    test_diago_hs<double>(16, 4, 0, 10, 2,MPI_COMM_WORLD);
+    test_diago_hs<double>(20, 6, 0, 18, 2,MPI_COMM_WORLD);
+}
+
+TEST(DiagoPxxxgvxScalapackTest, ComplexDouble) {
+    test_diago_hs<std::complex<double>>(16, 4, 0, 10, 2, MPI_COMM_WORLD);
+}
+TEST(DiagoPxxxgvxScalapackTest, Float) {
+    test_diago_hs<float>(16, 4, 0, 10,2,MPI_COMM_WORLD);
+}
+
+TEST(DiagoPxxxgvxScalapackTest, ComplexFloat) {
+    test_diago_hs<std::complex<float>>(16, 4, 0, 10,2,MPI_COMM_WORLD);
+}
+
+//TEST(DiagoPxxxgvxPerformanceTest, Double) {
+//    int ndim = 200;
+//    int nband = 180;
+//    int case_numb = 10;
+//    int loop_numb = 10;
+//    test_performance<std::complex<double>>(ndim, 32,  nband, MPI_COMM_WORLD, case_numb, loop_numb);
+//}
+
+int main(int argc, char** argv) {
+    MPI_Init(&argc, &argv);
+    int myrank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+
+    testing::InitGoogleTest(&argc, argv);
+    ::testing::TestEventListeners& listeners = ::testing::UnitTest::GetInstance()->listeners();
+
+    if (myrank != 0) {
+        delete listeners.Release(listeners.default_result_printer());
+    }
+
+    int result = RUN_ALL_TESTS();
+    if (myrank == 0 && result != 0)
+    {
+        std::cout << "ERROR:some tests are not passed" << std::endl;
+        MPI_Finalize();
+        return result;
+	}
+
+    MPI_Finalize();
+
+	return 0;
+}
+
+#endif // TEST_DIAGO_PXXXGVX_H
\ No newline at end of file
diff --git a/source/module_io/read_input_item_system.cpp b/source/module_io/read_input_item_system.cpp
index 7398dc2e6a..3fd2e48bd5 100644
--- a/source/module_io/read_input_item_system.cpp
+++ b/source/module_io/read_input_item_system.cpp
@@ -477,6 +477,12 @@ void ReadInput::item_system()
         read_sync_int(input.fft_mode);
         this->add_item(item);
     }
+    {
+        Input_Item item("diag_subspace_method");
+        item.annotation = "method of subspace diagonalization in dav_subspace. 0:LaPack; 1:genelpa, 2:scalapack";
+        read_sync_int(input.diag_subspace_method);
+        this->add_item(item);
+    }
     {
         Input_Item item("init_wfc");
         item.annotation = "start wave functions are from 'atomic', "
diff --git a/source/module_lr/hsolver_lrtd.hpp b/source/module_lr/hsolver_lrtd.hpp
index 6c8be619eb..3694631cfc 100644
--- a/source/module_lr/hsolver_lrtd.hpp
+++ b/source/module_lr/hsolver_lrtd.hpp
@@ -95,7 +95,9 @@ namespace LR
                         diag_ethr,
                         maxiter,
                         false, //always do the subspace diag (check the implementation)
-                        comm_info);
+                        comm_info,
+                        PARAM.inp.diag_subspace_method,
+                        PARAM.inp.nb2d);
                     std::vector<double> ethr_band(nband, diag_ethr);
                     hsolver::DiagoIterAssist<T>::avg_iter
                         += static_cast<double>(dav_subspace.diag(
diff --git a/source/module_parameter/input_parameter.h b/source/module_parameter/input_parameter.h
index db71d32b63..dfdf33b704 100644
--- a/source/module_parameter/input_parameter.h
+++ b/source/module_parameter/input_parameter.h
@@ -88,6 +88,7 @@ struct Input_para
     double pw_diag_thr = 0.01; ///< used in cg method
     int pw_diag_ndim = 4;      ///< dimension of workspace for Davidson diagonalization
     int diago_cg_prec = 1;     ///< mohan add 2012-03-31
+    int diag_subspace_method = 0; // 0: Lapack, 1: elpa, 2: scalapack
 
     std::string smearing_method = "gauss"; ///< "gauss",
                                            ///< "mp","methfessel-paxton"
diff --git a/tests/integrate/102_PW_DS_davsubspace_sca/INPUT b/tests/integrate/102_PW_DS_davsubspace_sca/INPUT
new file mode 100644
index 0000000000..0cbffad0f8
--- /dev/null
+++ b/tests/integrate/102_PW_DS_davsubspace_sca/INPUT
@@ -0,0 +1,29 @@
+INPUT_PARAMETERS
+#Parameters (1.General)
+suffix			autotest
+calculation     scf
+
+nbands			6
+symmetry		1
+pseudo_dir	../../PP_ORB
+
+#Parameters (2.Iteration)
+ecutwfc			20
+scf_thr				1e-8
+scf_nmax			100
+
+
+#Parameters (3.Basis)
+basis_type		pw
+
+#Parameters (4.Smearing)
+smearing_method		gauss
+smearing_sigma			0.002
+
+#Parameters (5.Mixing)
+mixing_type		plain
+mixing_beta		0.5
+
+ks_solver       dav_subspace
+diag_subspace_method 2
+nb2d 4
diff --git a/tests/integrate/102_PW_DS_davsubspace_sca/KPT b/tests/integrate/102_PW_DS_davsubspace_sca/KPT
new file mode 100644
index 0000000000..c289c0158a
--- /dev/null
+++ b/tests/integrate/102_PW_DS_davsubspace_sca/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Gamma
+1 1 1 0 0 0
diff --git a/tests/integrate/102_PW_DS_davsubspace_sca/STRU b/tests/integrate/102_PW_DS_davsubspace_sca/STRU
new file mode 100644
index 0000000000..cdb8157cce
--- /dev/null
+++ b/tests/integrate/102_PW_DS_davsubspace_sca/STRU
@@ -0,0 +1,19 @@
+ATOMIC_SPECIES
+Si 14 Si.pz-vbc.UPF
+
+LATTICE_CONSTANT
+10.2  // add lattice constant
+
+LATTICE_VECTORS
+0.5 0.5 0.0
+0.5 0.0 0.5
+0.0 0.5 0.5
+
+ATOMIC_POSITIONS
+Direct 
+
+Si	// Element type	
+0.0	// magnetism
+2	
+0.00 0.00 0.00 1 1 1
+0.25 0.25 0.25 1 1 1
diff --git a/tests/integrate/102_PW_DS_davsubspace_sca/jd b/tests/integrate/102_PW_DS_davsubspace_sca/jd
new file mode 100644
index 0000000000..70376ea55e
--- /dev/null
+++ b/tests/integrate/102_PW_DS_davsubspace_sca/jd
@@ -0,0 +1 @@
+test subspace diagonalization with scalapack
\ No newline at end of file
diff --git a/tests/integrate/102_PW_DS_davsubspace_sca/result.ref b/tests/integrate/102_PW_DS_davsubspace_sca/result.ref
new file mode 100644
index 0000000000..46352c76f5
--- /dev/null
+++ b/tests/integrate/102_PW_DS_davsubspace_sca/result.ref
@@ -0,0 +1,6 @@
+etotref -198.2238296207179
+etotperatomref -99.1119148104
+pointgroupref T_d
+spacegroupref O_h
+nksibzref 1
+totaltimeref 
diff --git a/tests/integrate/CASES_CPU.txt b/tests/integrate/CASES_CPU.txt
index 822bc4b126..07de643103 100644
--- a/tests/integrate/CASES_CPU.txt
+++ b/tests/integrate/CASES_CPU.txt
@@ -25,6 +25,7 @@
 102_PW_BPCG
 102_PW_CG
 102_PW_DS_davsubspace
+102_PW_DS_davsubspace_sca
 102_PW_PINT_RKS
 102_PW_PINT_UKS
 103_PW_15_CF_CS_S1_smallg

From 2d528b1319f510b2cacadc93649dfd59289291ef Mon Sep 17 00:00:00 2001
From: "pre-commit-ci-lite[bot]"
 <117423508+pre-commit-ci-lite[bot]@users.noreply.github.com>
Date: Thu, 21 Nov 2024 03:23:43 +0000
Subject: [PATCH 02/12] [pre-commit.ci lite] apply automatic fixes

---
 source/module_hsolver/diago_pxxxgvx.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/source/module_hsolver/diago_pxxxgvx.cpp b/source/module_hsolver/diago_pxxxgvx.cpp
index ac639d7996..24168894dd 100644
--- a/source/module_hsolver/diago_pxxxgvx.cpp
+++ b/source/module_hsolver/diago_pxxxgvx.cpp
@@ -1,6 +1,6 @@
 #include <complex>
 #include <vector>
-#include "string.h"
+#include <cstring>
 #include <iostream>
 
 #include "diago_pxxxgvx.h"

From 2c3bb1b64a4bfbbfbd40eee0265945d78d76ccb3 Mon Sep 17 00:00:00 2001
From: root <pxlxingliang>
Date: Thu, 21 Nov 2024 15:33:18 +0800
Subject: [PATCH 03/12] fix Makefile

---
 source/Makefile.Objects | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/source/Makefile.Objects b/source/Makefile.Objects
index 3f240d9b52..29b6df8073 100644
--- a/source/Makefile.Objects
+++ b/source/Makefile.Objects
@@ -338,6 +338,8 @@ OBJS_HSOLVER=diago_cg.o\
     math_kernel_op.o\
     dngvd_op.o\
     diag_const_nums.o\
+    diag_hs_para.o\
+    diago_pxxxgvx.o\
     
 OBJS_HSOLVER_LCAO=hsolver_lcao.o\
       diago_scalapack.o\

From 2cf60e2677869bee58f55267ec8be19dfa3323fe Mon Sep 17 00:00:00 2001
From: root <pxlxingliang>
Date: Fri, 22 Nov 2024 18:29:02 +0800
Subject: [PATCH 04/12] fix ut

---
 docs/advanced/input_files/input-main.md       |   4 +-
 python/pyabacus/src/hsolver/CMakeLists.txt    |   3 +
 .../src/hsolver/py_diago_dav_subspace.hpp     |   2 +-
 source/module_base/scalapack_connector.h      |   3 +
 source/module_hsolver/diag_hs_para.cpp        | 251 ++++----
 source/module_hsolver/diag_hs_para.h          |  23 +-
 source/module_hsolver/diago_dav_subspace.cpp  |  24 +-
 source/module_hsolver/diago_dav_subspace.h    |   2 +-
 source/module_hsolver/diago_pxxxgvx.cpp       | 546 ++++++++++++++----
 source/module_hsolver/diago_pxxxgvx.h         |   2 +-
 source/module_hsolver/hsolver_pw.cpp          |   2 +-
 .../module_hsolver/test/test_hsolver_pw.cpp   |  56 ++
 .../module_hsolver/test/test_hsolver_sdft.cpp |  52 ++
 source/module_io/read_input_item_system.cpp   |   4 +-
 source/module_lr/hsolver_lrtd.hpp             |   2 +-
 source/module_parameter/input_parameter.h     |   2 +-
 16 files changed, 709 insertions(+), 269 deletions(-)

diff --git a/docs/advanced/input_files/input-main.md b/docs/advanced/input_files/input-main.md
index 019e28829a..798da7c9e8 100644
--- a/docs/advanced/input_files/input-main.md
+++ b/docs/advanced/input_files/input-main.md
@@ -39,7 +39,7 @@
     - [pw\_diag\_thr](#pw_diag_thr)
     - [pw\_diag\_nmax](#pw_diag_nmax)
     - [pw\_diag\_ndim](#pw_diag_ndim)
-    - [diag\_subspace\_method](#diag_subspace_method)
+    - [diag\_subspace\_method](#diag_subspace)
     - [erf\_ecut](#erf_ecut)
     - [fft\_mode](#fft_mode)
     - [erf\_height](#erf_height)
@@ -787,7 +787,7 @@ These variables are used to control the plane wave related parameters.
 - **Description**: Only useful when you use `ks_solver = dav` or `ks_solver = dav_subspace`. It indicates dimension of workspace(number of wavefunction packets, at least 2 needed) for the Davidson method. A larger value may yield a smaller number of iterations in the algorithm but uses more memory and more CPU time in subspace diagonalization.
 - **Default**: 4 
 
-### diag_subspace_method
+### diag_subspace
 
 - **Type**: Integer
 - **Description**: The method to diagonalize subspace in dav_subspace method. The available options are:
diff --git a/python/pyabacus/src/hsolver/CMakeLists.txt b/python/pyabacus/src/hsolver/CMakeLists.txt
index 853973e653..f0f04f97a7 100644
--- a/python/pyabacus/src/hsolver/CMakeLists.txt
+++ b/python/pyabacus/src/hsolver/CMakeLists.txt
@@ -5,6 +5,9 @@ list(APPEND _diago
     ${HSOLVER_PATH}/diago_cg.cpp
     ${HSOLVER_PATH}/diag_const_nums.cpp
     ${HSOLVER_PATH}/diago_iter_assist.cpp
+    ${HSOLVER_PATH}/diag_hs_para.cpp
+    ${HSOLVER_PATH}/diago_pxxxgvx.cpp
+
 
     ${HSOLVER_PATH}/kernels/dngvd_op.cpp
     ${HSOLVER_PATH}/kernels/math_kernel_op.cpp
diff --git a/python/pyabacus/src/hsolver/py_diago_dav_subspace.hpp b/python/pyabacus/src/hsolver/py_diago_dav_subspace.hpp
index 0eff2467fe..5374ac29a8 100644
--- a/python/pyabacus/src/hsolver/py_diago_dav_subspace.hpp
+++ b/python/pyabacus/src/hsolver/py_diago_dav_subspace.hpp
@@ -139,7 +139,7 @@ class PyDiagoDavSubspace
             max_iter, 
             need_subspace, 
             comm_info,
-            PARAM.inp.diag_subspace_method,
+            PARAM.inp.diag_subspace,
             PARAM.inp.nb2d
         );
 
diff --git a/source/module_base/scalapack_connector.h b/source/module_base/scalapack_connector.h
index 210cef58a1..ebd1946aac 100644
--- a/source/module_base/scalapack_connector.h
+++ b/source/module_base/scalapack_connector.h
@@ -80,16 +80,19 @@ extern "C"
 		const double* vl, const double* vu, const int* il, const int* iu,
 		const double* abstol, int* m, int* nz, double* w, const double*orfac, double* Z, const int* iz, const int* jz, const int*descz,
 		double* work, int* lwork, int*iwork, int*liwork, int* ifail, int*iclustr, double*gap, int* info);
+
 	void pzhegvx_(const int* itype, const char* jobz, const char* range, const char* uplo,
 		const int* n, std::complex<double>* A, const int* ia, const int* ja, const int*desca, std::complex<double>* B, const int* ib, const int* jb, const int*descb,
 		const double* vl, const double* vu, const int* il, const int* iu,
 		const double* abstol, int* m, int* nz, double* w, const double*orfac, std::complex<double>* Z, const int* iz, const int* jz, const int*descz,
 		std::complex<double>* work, int* lwork, double* rwork, int* lrwork, int*iwork, int*liwork, int* ifail, int*iclustr, double*gap, int* info);
+
 	void pssygvx_(const int* itype, const char* jobz, const char* range, const char* uplo,
 		const int* n, float* A, const int* ia, const int* ja, const int*desca, float* B, const int* ib, const int* jb, const int*descb,
 		const float* vl, const float* vu, const int* il, const int* iu,
 		const float* abstol, int* m, int* nz, float* w, const float*orfac, float* Z, const int* iz, const int* jz, const int*descz,
 		float* work, int* lwork, int*iwork, int*liwork, int* ifail, int*iclustr, float*gap, int* info);
+		
 	void pchegvx_(const int* itype, const char* jobz, const char* range, const char* uplo,
 		const int* n, std::complex<float>* A, const int* ia, const int* ja, const int*desca, std::complex<float>* B, const int* ib, const int* jb, const int*descb,
 		const float* vl, const float* vu, const int* il, const int* iu,
diff --git a/source/module_hsolver/diag_hs_para.cpp b/source/module_hsolver/diag_hs_para.cpp
index d227c9c53b..ed558e492d 100644
--- a/source/module_hsolver/diag_hs_para.cpp
+++ b/source/module_hsolver/diag_hs_para.cpp
@@ -1,98 +1,88 @@
-#include <iostream>
 #include "module_hsolver/diag_hs_para.h"
+
+#include "module_base/scalapack_connector.h"
 #include "module_basis/module_ao/parallel_2d.h"
 #include "module_hsolver/diago_pxxxgvx.h"
-#include "module_base/scalapack_connector.h"
 #include "module_hsolver/genelpa/elpa_solver.h"
 
+#include <iostream>
+
 namespace hsolver
 {
 
-#ifdef __ELPA 
-    void elpa_diag(MPI_Comm comm,
-                   const int nband, 
-                   std::complex<double>* h_local, 
-                   std::complex<double>* s_local, 
-                   double* ekb, 
-                   std::complex<double>* wfc_2d,
-                   Parallel_2D& para2d_local)
-    {
-        int DecomposedState = 0;
-        ELPA_Solver es(false,
-                   comm,
-                   nband,
-                   para2d_local.get_row_size(),
-                   para2d_local.get_col_size(),
-                   para2d_local.desc);
-        es.generalized_eigenvector(h_local, s_local, DecomposedState, ekb, wfc_2d);
-        es.exit();
-    }
+#ifdef __ELPA
+void elpa_diag(MPI_Comm comm,
+               const int nband,
+               std::complex<double>* h_local,
+               std::complex<double>* s_local,
+               double* ekb,
+               std::complex<double>* wfc_2d,
+               Parallel_2D& para2d_local)
+{
+    int DecomposedState = 0;
+    ELPA_Solver es(false, comm, nband, para2d_local.get_row_size(), para2d_local.get_col_size(), para2d_local.desc);
+    es.generalized_eigenvector(h_local, s_local, DecomposedState, ekb, wfc_2d);
+    es.exit();
+}
 
-    void elpa_diag(MPI_Comm comm,
-                   const int nband, 
-                   double* h_local, 
-                   double* s_local, 
-                   double* ekb, 
-                   double* wfc_2d,
-                   Parallel_2D& para2d_local)
-    {
-        int DecomposedState = 0;
-        ELPA_Solver es(true,
-                   comm,
-                   nband,
-                   para2d_local.get_row_size(),
-                   para2d_local.get_col_size(),
-                   para2d_local.desc);
-        es.generalized_eigenvector(h_local, s_local, DecomposedState, ekb, wfc_2d);
-        es.exit();
-    }
+void elpa_diag(MPI_Comm comm,
+               const int nband,
+               double* h_local,
+               double* s_local,
+               double* ekb,
+               double* wfc_2d,
+               Parallel_2D& para2d_local)
+{
+    int DecomposedState = 0;
+    ELPA_Solver es(true, comm, nband, para2d_local.get_row_size(), para2d_local.get_col_size(), para2d_local.desc);
+    es.generalized_eigenvector(h_local, s_local, DecomposedState, ekb, wfc_2d);
+    es.exit();
+}
 
-    void elpa_diag(MPI_Comm comm,
-                   const int nband, 
-                   std::complex<float>* h_local, 
-                   std::complex<float>* s_local, 
-                   float* ekb, 
-                   std::complex<float>* wfc_2d,
-                   Parallel_2D& para2d_local)
-    {
-        std::cout << "Error: ELPA do not support single precision. " << std::endl;
-        exit(1);
-    }
+void elpa_diag(MPI_Comm comm,
+               const int nband,
+               std::complex<float>* h_local,
+               std::complex<float>* s_local,
+               float* ekb,
+               std::complex<float>* wfc_2d,
+               Parallel_2D& para2d_local)
+{
+    std::cout << "Error: ELPA do not support single precision. " << std::endl;
+    exit(1);
+}
 
-    void elpa_diag(MPI_Comm comm,
-                   const int nband, 
-                   float* h_local, 
-                   float* s_local, 
-                   float* ekb, 
-                   float* wfc_2d,
-                   Parallel_2D& para2d_local)
-    {
-        std::cout << "Error: ELPA do not support single precision. " << std::endl;
-        exit(1);
-    }
+void elpa_diag(MPI_Comm comm,
+               const int nband,
+               float* h_local,
+               float* s_local,
+               float* ekb,
+               float* wfc_2d,
+               Parallel_2D& para2d_local)
+{
+    std::cout << "Error: ELPA do not support single precision. " << std::endl;
+    exit(1);
+}
 
 #endif
 
-
 #ifdef __MPI
 
 template <typename T>
-void Diago_HS_para(
-                T* h, 
-                T* s, 
-                 const int lda, 
-                 const int nband,
-                 typename GetTypeReal<T>::type *const ekb,
-                 T *const wfc,
-                 const MPI_Comm& comm,
-                 const int diag_subspace_method,
-                 const int block_size)
+void Diago_HS_para(T* h,
+                   T* s,
+                   const int lda,
+                   const int nband,
+                   typename GetTypeReal<T>::type* const ekb,
+                   T* const wfc,
+                   const MPI_Comm& comm,
+                   const int diag_subspace,
+                   const int block_size)
 {
-    int myrank;
+    int myrank = 0;
     MPI_Comm_rank(comm, &myrank);
     Parallel_2D para2d_global;
     Parallel_2D para2d_local;
-    para2d_global.init(lda,lda,lda,comm);
+    para2d_global.init(lda, lda, lda, comm);
 
     int max_nb = block_size;
     if (block_size == 0)
@@ -113,43 +103,53 @@ void Diago_HS_para(
     }
 
     // for genelpa, if the block size is too large that some cores have no data, then it will cause error.
-    if (diag_subspace_method == 1)
+    if (diag_subspace == 1)
     {
         if (max_nb * (std::max(para2d_global.dim0, para2d_global.dim1) - 1) >= lda)
         {
             max_nb = lda / std::max(para2d_global.dim0, para2d_global.dim1);
         }
     }
-    
-    para2d_local.init(lda,lda,max_nb,comm);
+
+    para2d_local.init(lda, lda, max_nb, comm);
     std::vector<T> h_local(para2d_local.get_col_size() * para2d_local.get_row_size());
     std::vector<T> s_local(para2d_local.get_col_size() * para2d_local.get_row_size());
     std::vector<T> wfc_2d(para2d_local.get_col_size() * para2d_local.get_row_size());
-    
+
     // distribute h and s to 2D
-    Cpxgemr2d(lda,lda,h,1,1,para2d_global.desc,h_local.data(),1,1,para2d_local.desc,para2d_local.blacs_ctxt);
-    Cpxgemr2d(lda,lda,s,1,1,para2d_global.desc,s_local.data(),1,1,para2d_local.desc,para2d_local.blacs_ctxt);
+    Cpxgemr2d(lda, lda, h, 1, 1, para2d_global.desc, h_local.data(), 1, 1, para2d_local.desc, para2d_local.blacs_ctxt);
+    Cpxgemr2d(lda, lda, s, 1, 1, para2d_global.desc, s_local.data(), 1, 1, para2d_local.desc, para2d_local.blacs_ctxt);
 
-    if (diag_subspace_method == 1)
+    if (diag_subspace == 1)
     {
-#ifdef __ELPA 
+#ifdef __ELPA
         elpa_diag(comm, nband, h_local.data(), s_local.data(), ekb, wfc_2d.data(), para2d_local);
 #else
-        std::cout << "ERROR: try to use ELPA to solve the generalized eigenvalue problem, but ELPA is not compiled. " << std::endl;
+        std::cout << "ERROR: try to use ELPA to solve the generalized eigenvalue problem, but ELPA is not compiled. "
+                  << std::endl;
         exit(1);
-#endif        
+#endif
     }
-    else if (diag_subspace_method == 2)
+    else if (diag_subspace == 2)
     {
-        hsolver::pxxxgvx_diag(para2d_local.desc, para2d_local.get_row_size(), para2d_local.get_col_size(),nband, h_local.data(), s_local.data(), ekb, wfc_2d.data());
+        hsolver::pxxxgvx_diag(para2d_local.desc,
+                              para2d_local.get_row_size(),
+                              para2d_local.get_col_size(),
+                              nband,
+                              h_local.data(),
+                              s_local.data(),
+                              ekb,
+                              wfc_2d.data());
     }
-    else{
-        std::cout << "Error: parallel diagonalization method is not supported. " << "diag_subspace_method = " << diag_subspace_method << std::endl;
+    else
+    {
+        std::cout << "Error: parallel diagonalization method is not supported. " << "diag_subspace = " << diag_subspace
+                  << std::endl;
         exit(1);
     }
 
     // gather wfc
-    Cpxgemr2d(lda,lda,wfc_2d.data(),1,1,para2d_local.desc,wfc,1,1,para2d_global.desc,para2d_local.blacs_ctxt);
+    Cpxgemr2d(lda, lda, wfc_2d.data(), 1, 1, para2d_local.desc, wfc, 1, 1, para2d_global.desc, para2d_local.blacs_ctxt);
 
     // free the context
     Cblacs_gridexit(para2d_local.blacs_ctxt);
@@ -157,44 +157,45 @@ void Diago_HS_para(
 }
 
 // template instantiation
-template void Diago_HS_para<double>(double* h, 
-                                    double* s, 
-                                    const int lda, 
-                                    const int nband, 
-                                    typename GetTypeReal<double>::type *const ekb, 
-                                    double *const wfc, 
-                                    const MPI_Comm& comm, 
-                                    const int diag_subspace_method,
-                                    const int block_size);
-template void Diago_HS_para<std::complex<double>>(std::complex<double>* h, 
-                                    std::complex<double>* s, 
-                                    const int lda, 
-                                    const int nband, 
-                                    typename GetTypeReal<std::complex<double>>::type *const ekb, 
-                                    std::complex<double> *const wfc, 
-                                    const MPI_Comm& comm, 
-                                    const int diag_subspace_method,
-                                    const int block_size);      
-template void Diago_HS_para<float>(float* h, 
-                                    float* s, 
-                                    const int lda, 
-                                    const int nband, 
-                                    typename GetTypeReal<float>::type *const ekb, 
-                                    float *const wfc, 
-                                    const MPI_Comm& comm, 
-                                    const int diag_subspace_method,
+template void Diago_HS_para<double>(double* h,
+                                    double* s,
+                                    const int lda,
+                                    const int nband,
+                                    typename GetTypeReal<double>::type* const ekb,
+                                    double* const wfc,
+                                    const MPI_Comm& comm,
+                                    const int diag_subspace,
                                     const int block_size);
-template void Diago_HS_para<std::complex<float>>(std::complex<float>* h, 
-                                    std::complex<float>* s, 
-                                    const int lda, 
-                                    const int nband, 
-                                    typename GetTypeReal<std::complex<float>>::type *const ekb, 
-                                    std::complex<float> *const wfc, 
-                                    const MPI_Comm& comm, 
-                                    const int diag_subspace_method,
-                                    const int block_size);                                                                                                      
-
 
+template void Diago_HS_para<std::complex<double>>(std::complex<double>* h,
+                                                  std::complex<double>* s,
+                                                  const int lda,
+                                                  const int nband,
+                                                  typename GetTypeReal<std::complex<double>>::type* const ekb,
+                                                  std::complex<double>* const wfc,
+                                                  const MPI_Comm& comm,
+                                                  const int diag_subspace,
+                                                  const int block_size);
+
+template void Diago_HS_para<float>(float* h,
+                                   float* s,
+                                   const int lda,
+                                   const int nband,
+                                   typename GetTypeReal<float>::type* const ekb,
+                                   float* const wfc,
+                                   const MPI_Comm& comm,
+                                   const int diag_subspace,
+                                   const int block_size);
+                                   
+template void Diago_HS_para<std::complex<float>>(std::complex<float>* h,
+                                                 std::complex<float>* s,
+                                                 const int lda,
+                                                 const int nband,
+                                                 typename GetTypeReal<std::complex<float>>::type* const ekb,
+                                                 std::complex<float>* const wfc,
+                                                 const MPI_Comm& comm,
+                                                 const int diag_subspace,
+                                                 const int block_size);
 
 #endif
 
diff --git a/source/module_hsolver/diag_hs_para.h b/source/module_hsolver/diag_hs_para.h
index ce73199a00..fd54ebcae1 100644
--- a/source/module_hsolver/diag_hs_para.h
+++ b/source/module_hsolver/diag_hs_para.h
@@ -22,25 +22,24 @@ namespace hsolver
  * @param ekb to store the eigenvalues.
  * @param wfc to store the eigenvectors
  * @param comm the communicator
- * @param diag_subspace_method the method to solve the generalized eigenvalue problem
+ * @param diag_subspace the method to solve the generalized eigenvalue problem
  * @param block_size the block size in 2d block cyclic distribution if use elpa or scalapack. 
  * 
  * @note 1. h and s should be full matrix in rank 0 of the communicator, and the other ranks is not concerned.
  * @note 2. wfc is complete in rank 0, and not store in other ranks.
- * @note 3. diag_subspace_method should be 1: by elpa, 2: by scalapack
+ * @note 3. diag_subspace should be 1: by elpa, 2: by scalapack
  * @note 4. block_size should be 0 or a positive integer. If it is 0, then will use a value as large as possible that is allowed
  */
 template <typename T>
-void Diago_HS_para(
-                T* h, 
-                T* s, 
-                 const int lda, 
-                 const int nband,
-                 typename GetTypeReal<T>::type *const ekb,
-                 T *const wfc,
-                 const MPI_Comm& comm,
-                 const int diag_subspace_method,
-                 const int block_size=0);
+void Diago_HS_para(T* h,
+                   T* s,
+                   const int lda,
+                   const int nband,
+                   typename GetTypeReal<T>::type* const ekb,
+                   T* const wfc,
+                   const MPI_Comm& comm,
+                   const int diag_subspace,
+                   const int block_size = 0);
 #endif
 
 } // namespace hsolver                 
diff --git a/source/module_hsolver/diago_dav_subspace.cpp b/source/module_hsolver/diago_dav_subspace.cpp
index 3aa4b8c45b..83d9b50db3 100644
--- a/source/module_hsolver/diago_dav_subspace.cpp
+++ b/source/module_hsolver/diago_dav_subspace.cpp
@@ -24,11 +24,11 @@ Diago_DavSubspace<T, Device>::Diago_DavSubspace(const std::vector<Real>& precond
                                                 const int& diag_nmax_in,
                                                 const bool& need_subspace_in,
                                                 const diag_comm_info& diag_comm_in,
-                                                const int diag_subspace_method_in,
+                                                const int diag_subspace_in,
                                                 const int diago_subspace_bs_in)
     : precondition(precondition_in), n_band(nband_in), dim(nbasis_in), nbase_x(nband_in * david_ndim_in),
       diag_thr(diag_thr_in), iter_nmax(diag_nmax_in), is_subspace(need_subspace_in), diag_comm(diag_comm_in),
-        diag_subspace_method(diag_subspace_method_in), diago_subspace_bs(diago_subspace_bs_in)
+        diag_subspace(diag_subspace_in), diago_subspace_bs(diago_subspace_bs_in)
 {
     this->device = base_device::get_device_type<Device>(this->ctx);
 
@@ -38,7 +38,7 @@ Diago_DavSubspace<T, Device>::Diago_DavSubspace(const std::vector<Real>& precond
 
     assert(david_ndim_in > 1);
     assert(david_ndim_in * nband_in < nbasis_in * this->diag_comm.nproc);
-    assert(diag_subspace_method >= 0 && diag_subspace_method < 3);
+    assert(diag_subspace >= 0 && diag_subspace < 3);
 
     // TODO: Added memory usage statistics
 
@@ -573,7 +573,7 @@ void Diago_DavSubspace<T, Device>::diag_zhegvx(const int& nbase,
     }
     else
     {
-        if (this->diag_subspace_method == 0)
+        if (this->diag_subspace == 0)
         {
             if (this->diag_comm.rank == 0)
             {
@@ -636,14 +636,14 @@ void Diago_DavSubspace<T, Device>::diag_zhegvx(const int& nbase,
                 }
             }
             Diago_HS_para(h_diag.data(),
-                        s_diag.data(),
-                        nbase,
-                        nband,
-                        (*eigenvalue_iter).data(),
-                        vcc_tmp.data(),
-                        this->diag_comm.comm,
-                        this->diag_subspace_method,
-                        this->diago_subspace_bs);
+                          s_diag.data(),
+                          nbase,
+                          nband,
+                          (*eigenvalue_iter).data(),
+                          vcc_tmp.data(),
+                          this->diag_comm.comm,
+                          this->diag_subspace,
+                          this->diago_subspace_bs);
             if (this->diag_comm.rank == 0)
             {
                 for (size_t i = 0; i < nband; i++)
diff --git a/source/module_hsolver/diago_dav_subspace.h b/source/module_hsolver/diago_dav_subspace.h
index baf984d999..7d9fb185fc 100644
--- a/source/module_hsolver/diago_dav_subspace.h
+++ b/source/module_hsolver/diago_dav_subspace.h
@@ -141,7 +141,7 @@ class Diago_DavSubspace
 
     bool test_exit_cond(const int& ntry, const int& notconv, const bool& scf);
 
-    int diag_subspace_method; // 0: LAPACK, 1: Gen-ELPA, 2: ScaLAPACK
+    int diag_subspace; // 0: LAPACK, 1: Gen-ELPA, 2: ScaLAPACK
     int diago_subspace_bs = 0; // the block size in 2d block cyclic distribution if use elpa or scalapack
 
 #ifdef __DSP
diff --git a/source/module_hsolver/diago_pxxxgvx.cpp b/source/module_hsolver/diago_pxxxgvx.cpp
index 24168894dd..51e29f27ff 100644
--- a/source/module_hsolver/diago_pxxxgvx.cpp
+++ b/source/module_hsolver/diago_pxxxgvx.cpp
@@ -1,78 +1,385 @@
+#include "diago_pxxxgvx.h"
+
+#include "module_base/blacs_connector.h"
+#include "module_base/scalapack_connector.h"
+
 #include <complex>
-#include <vector>
 #include <cstring>
 #include <iostream>
-
-#include "diago_pxxxgvx.h"
-#include "module_base/scalapack_connector.h"
-#include "module_base/blacs_connector.h"
+#include <vector>
 
 namespace hsolver
 {
 
 #ifdef __MPI
 
-void pxxxgvx_(const int* itype, const char* jobz, const char* range, const char* uplo,
-		const int* n, double* A, const int* ia, const int* ja, const int*desca, double* B, const int* ib, const int* jb, const int*descb,
-		const double* vl, const double* vu, const int* il, const int* iu,
-		const double* abstol, int* m, int* nz, double* w, const double*orfac, double* Z, const int* iz, const int* jz, const int*descz,
-		double* work, int* lwork, double* rwork, int* lrwork, int*iwork, int*liwork, int* ifail, int*iclustr, double*gap, int* info)
+/**
+ * @file diago_pxxxgvx.cpp
+ * @brief This file contains functions for performing parallel diagonalization using Scalapack.
+ *
+ * The functions in this file are designed to handle different data types (double, float, std::complex<double>,
+ * std::complex<float>) and perform diagonalization using Scalapack routines (pdsygvx_, pzhegvx_, pssygvx_, pchegvx_).
+ *
+ * The main functions are:
+ * - pxxxgvx_: Wrapper functions for Scalapack diagonalization routines.
+ * - pxxxgvx_post_processing: Post-processing function to handle errors and adjust parameters based on Scalapack's info
+ * output.
+ * - get_lwork: Helper functions to retrieve the optimal workspace size.
+ * - pxxxgvx_diag: Template function to perform diagonalization for different data types.
+ *
+ * Template instantiations for pxxxgvx_diag are provided for double, float, std::complex<double>, and
+ * std::complex<float>.
+ */
+
+/**
+ * @brief Wrapper function for Scalapack's generalized eigensolver routines.
+ * 
+ * @param itype Specifies the problem type to be solved.
+ * @param jobz Specifies whether to compute eigenvectors.
+ * @param range Specifies the range of eigenvalues to be found.
+ * @param uplo Specifies whether the upper or lower triangular part of the matrices is referenced.
+ * @param n The order of the matrices A and B.
+ * @param A The array containing the matrix A.
+ * @param ia The row index in the global array A.
+ * @param ja The column index in the global array A.
+ * @param desca The array descriptor for the distributed matrix A.
+ * @param B The array containing the matrix B.
+ * @param ib The row index in the global array B.
+ * @param jb The column index in the global array B.
+ * @param descb The array descriptor for the distributed matrix B.
+ * @param vl Lower bound of the interval to be searched for eigenvalues.
+ * @param vu Upper bound of the interval to be searched for eigenvalues.
+ * @param il Index of the smallest eigenvalue to be returned.
+ * @param iu Index of the largest eigenvalue to be returned.
+ * @param abstol The absolute error tolerance for the eigenvalues.
+ * @param m The total number of eigenvalues found.
+ * @param nz The total number of eigenvalues found in the interval (vl, vu].
+ * @param w The array to store the eigenvalues.
+ * @param orfac The orthogonality factor.
+ * @param Z The array to store the eigenvectors.
+ * @param iz The row index in the global array Z.
+ * @param jz The column index in the global array Z.
+ * @param descz The array descriptor for the distributed matrix Z.
+ * @param work Workspace array.
+ * @param lwork The dimension of the array work.
+ * @param rwork Workspace array (not used in this function).
+ * @param lrwork The dimension of the array rwork (not used in this function).
+ * @param iwork Workspace array.
+ * @param liwork The dimension of the array iwork.
+ * @param ifail The array to store the indices of the eigenvectors that failed to converge.
+ * @param iclustr The array to store the indices of the eigenvalue clusters.
+ * @param gap The array to store the gaps between eigenvalue clusters.
+ * @param info Output status of the computation.
+ * 
+ * @note for a uniform interface, rwork and lrwork are input arguments, but not used in pdsygvx_/pssygvx_
+ */
+void pxxxgvx_(const int* itype,
+              const char* jobz,
+              const char* range,
+              const char* uplo,
+              const int* n,
+              double* A,
+              const int* ia,
+              const int* ja,
+              const int* desca,
+              double* B,
+              const int* ib,
+              const int* jb,
+              const int* descb,
+              const double* vl,
+              const double* vu,
+              const int* il,
+              const int* iu,
+              const double* abstol,
+              int* m,
+              int* nz,
+              double* w,
+              const double* orfac,
+              double* Z,
+              const int* iz,
+              const int* jz,
+              const int* descz,
+              double* work,
+              int* lwork,
+              double* rwork, 
+              int* lrwork,
+              int* iwork,
+              int* liwork,
+              int* ifail,
+              int* iclustr,
+              double* gap,
+              int* info)
 {
-    // for a uniform interface, rwork and lrwork are input arguments, but not used in pdsygvx_/pssygvx_
-    pdsygvx_(itype, jobz, range, uplo,
-		n, A, ia, ja, desca, B, ib, jb, descb,
-		vl, vu, il,  iu,
-		abstol, m, nz, w, orfac, Z, iz, jz, descz,
-		work, lwork, iwork, liwork, ifail, iclustr, gap, info);
+    // double
+    pdsygvx_(itype,
+             jobz,
+             range,
+             uplo,
+             n,
+             A,
+             ia,
+             ja,
+             desca,
+             B,
+             ib,
+             jb,
+             descb,
+             vl,
+             vu,
+             il,
+             iu,
+             abstol,
+             m,
+             nz,
+             w,
+             orfac,
+             Z,
+             iz,
+             jz,
+             descz,
+             work,
+             lwork,
+             iwork,
+             liwork,
+             ifail,
+             iclustr,
+             gap,
+             info);
 }
 
-void pxxxgvx_(const int* itype, const char* jobz, const char* range, const char* uplo,
-		const int* n, std::complex<double>* A, const int* ia, const int* ja, const int*desca, std::complex<double>* B, const int* ib, const int* jb, const int*descb,
-		const double* vl, const double* vu, const int* il, const int* iu,
-		const double* abstol, int* m, int* nz, double* w, const double*orfac, std::complex<double>* Z, const int* iz, const int* jz, const int*descz,
-		std::complex<double>* work, int* lwork, double* rwork, int* lrwork, int*iwork, int*liwork, int* ifail, int*iclustr, double*gap, int* info)
+void pxxxgvx_(const int* itype,
+              const char* jobz,
+              const char* range,
+              const char* uplo,
+              const int* n,
+              std::complex<double>* A,
+              const int* ia,
+              const int* ja,
+              const int* desca,
+              std::complex<double>* B,
+              const int* ib,
+              const int* jb,
+              const int* descb,
+              const double* vl,
+              const double* vu,
+              const int* il,
+              const int* iu,
+              const double* abstol,
+              int* m,
+              int* nz,
+              double* w,
+              const double* orfac,
+              std::complex<double>* Z,
+              const int* iz,
+              const int* jz,
+              const int* descz,
+              std::complex<double>* work,
+              int* lwork,
+              double* rwork,
+              int* lrwork,
+              int* iwork,
+              int* liwork,
+              int* ifail,
+              int* iclustr,
+              double* gap,
+              int* info)
 {
-    pzhegvx_(itype, jobz, range, uplo,
-		n, A, ia, ja, desca, B, ib, jb, descb,
-		vl, vu, il,  iu,
-		abstol, m, nz, w, orfac, Z, iz, jz, descz,
-		work, lwork,rwork, lrwork, iwork, liwork, ifail, iclustr, gap, info);
+    // std::complex<double>
+    pzhegvx_(itype,
+             jobz,
+             range,
+             uplo,
+             n,
+             A,
+             ia,
+             ja,
+             desca,
+             B,
+             ib,
+             jb,
+             descb,
+             vl,
+             vu,
+             il,
+             iu,
+             abstol,
+             m,
+             nz,
+             w,
+             orfac,
+             Z,
+             iz,
+             jz,
+             descz,
+             work,
+             lwork,
+             rwork,
+             lrwork,
+             iwork,
+             liwork,
+             ifail,
+             iclustr,
+             gap,
+             info);
 }
 
-void pxxxgvx_(const int* itype, const char* jobz, const char* range, const char* uplo,
-		const int* n, float* A, const int* ia, const int* ja, const int*desca, float* B, const int* ib, const int* jb, const int*descb,
-		const float* vl, const float* vu, const int* il, const int* iu,
-		const float* abstol, int* m, int* nz, float* w, const float*orfac, float* Z, const int* iz, const int* jz, const int*descz,
-		float* work, int* lwork, float* rwork, int* lrwork, int*iwork, int*liwork, int* ifail, int*iclustr, float*gap, int* info)
+void pxxxgvx_(const int* itype,
+              const char* jobz,
+              const char* range,
+              const char* uplo,
+              const int* n,
+              float* A,
+              const int* ia,
+              const int* ja,
+              const int* desca,
+              float* B,
+              const int* ib,
+              const int* jb,
+              const int* descb,
+              const float* vl,
+              const float* vu,
+              const int* il,
+              const int* iu,
+              const float* abstol,
+              int* m,
+              int* nz,
+              float* w,
+              const float* orfac,
+              float* Z,
+              const int* iz,
+              const int* jz,
+              const int* descz,
+              float* work,
+              int* lwork,
+              float* rwork,
+              int* lrwork,
+              int* iwork,
+              int* liwork,
+              int* ifail,
+              int* iclustr,
+              float* gap,
+              int* info)
 {
-    pssygvx_(itype, jobz, range, uplo,
-		n, A, ia, ja, desca, B, ib, jb, descb,
-		vl, vu, il,  iu,
-		abstol, m, nz, w, orfac, Z, iz, jz, descz,
-		work, lwork, iwork, liwork, ifail, iclustr, gap, info);
+    // float
+    pssygvx_(itype,
+             jobz,
+             range,
+             uplo,
+             n,
+             A,
+             ia,
+             ja,
+             desca,
+             B,
+             ib,
+             jb,
+             descb,
+             vl,
+             vu,
+             il,
+             iu,
+             abstol,
+             m,
+             nz,
+             w,
+             orfac,
+             Z,
+             iz,
+             jz,
+             descz,
+             work,
+             lwork,
+             iwork,
+             liwork,
+             ifail,
+             iclustr,
+             gap,
+             info);
 }
 
-void pxxxgvx_(const int* itype, const char* jobz, const char* range, const char* uplo,
-		const int* n, std::complex<float>* A, const int* ia, const int* ja, const int*desca, std::complex<float>* B, const int* ib, const int* jb, const int*descb,
-		const float* vl, const float* vu, const int* il, const int* iu,
-		const float* abstol, int* m, int* nz, float* w, const float*orfac, std::complex<float>* Z, const int* iz, const int* jz, const int*descz,
-		std::complex<float>* work, int* lwork, float* rwork, int* lrwork, int*iwork, int*liwork, int* ifail, int*iclustr, float*gap, int* info)
+void pxxxgvx_(const int* itype,
+              const char* jobz,
+              const char* range,
+              const char* uplo,
+              const int* n,
+              std::complex<float>* A,
+              const int* ia,
+              const int* ja,
+              const int* desca,
+              std::complex<float>* B,
+              const int* ib,
+              const int* jb,
+              const int* descb,
+              const float* vl,
+              const float* vu,
+              const int* il,
+              const int* iu,
+              const float* abstol,
+              int* m,
+              int* nz,
+              float* w,
+              const float* orfac,
+              std::complex<float>* Z,
+              const int* iz,
+              const int* jz,
+              const int* descz,
+              std::complex<float>* work,
+              int* lwork,
+              float* rwork,
+              int* lrwork,
+              int* iwork,
+              int* liwork,
+              int* ifail,
+              int* iclustr,
+              float* gap,
+              int* info)
 {
-    pchegvx_(itype, jobz, range, uplo,
-		n, A, ia, ja, desca, B, ib, jb, descb,
-		vl, vu, il,  iu,
-		abstol, m, nz, w, orfac, Z, iz, jz, descz,
-		work, lwork,rwork, lrwork, iwork, liwork, ifail, iclustr, gap, info);
+    // std::complex<float>
+    pchegvx_(itype,
+             jobz,
+             range,
+             uplo,
+             n,
+             A,
+             ia,
+             ja,
+             desca,
+             B,
+             ib,
+             jb,
+             descb,
+             vl,
+             vu,
+             il,
+             iu,
+             abstol,
+             m,
+             nz,
+             w,
+             orfac,
+             Z,
+             iz,
+             jz,
+             descz,
+             work,
+             lwork,
+             rwork,
+             lrwork,
+             iwork,
+             liwork,
+             ifail,
+             iclustr,
+             gap,
+             info);
 }
 
-// post processing for pdsygvx/pzhegvx/pdsygvx/pzhegvx
-void pxxxgvx_post_processing(const int info, 
-                    const std::vector<int>& ifail,
-                    const std::vector<int>& iclustr,
-                    const int M,
-                    const int NZ,
-                    const int nbands, 
-                    int& degeneracy_max)
+void pxxxgvx_post_processing(const int info,
+                             const std::vector<int>& ifail,
+                             const std::vector<int>& iclustr,
+                             const int M,
+                             const int NZ,
+                             const int nbands,
+                             int& degeneracy_max)
 {
     const std::string str_info = "Scalapack diagonalization: \n    info = " + std::to_string(info) + ".\n";
 
@@ -83,11 +390,10 @@ void pxxxgvx_post_processing(const int info,
     else if (info < 0)
     {
         const int info_negative = -info;
-        const std::string str_index
-            = (info_negative > 100)
-                  ? std::to_string(info_negative / 100) + "-th argument "
-                        + std::to_string(info_negative % 100) + "-entry is illegal.\n"
-                  : std::to_string(info_negative) + "-th argument is illegal.\n";
+        const std::string str_index = (info_negative > 100)
+                                          ? std::to_string(info_negative / 100) + "-th argument "
+                                                + std::to_string(info_negative % 100) + "-entry is illegal.\n"
+                                          : std::to_string(info_negative) + "-th argument is illegal.\n";
         throw std::runtime_error(str_info + str_index);
     }
     else if (info % 2)
@@ -102,13 +408,12 @@ void pxxxgvx_post_processing(const int info,
     else if (info / 2 % 2)
     {
         int degeneracy_need = 0;
-        for (int irank = 0; irank < iclustr.size()/2; ++irank)
+        for (int irank = 0; irank < iclustr.size() / 2; ++irank)
         {
             degeneracy_need = std::max(degeneracy_need, iclustr[2 * irank + 1] - iclustr[2 * irank]);
         }
         const std::string str_need = "degeneracy_need = " + std::to_string(degeneracy_need) + ".\n";
-        const std::string str_saved
-            = "degeneracy_saved = " + std::to_string(degeneracy_max) + ".\n";
+        const std::string str_saved = "degeneracy_saved = " + std::to_string(degeneracy_max) + ".\n";
         if (degeneracy_need <= degeneracy_max)
         {
             throw std::runtime_error(str_info + str_need + str_saved);
@@ -124,8 +429,7 @@ void pxxxgvx_post_processing(const int info,
     {
         const std::string str_M = "M = " + std::to_string(M) + ".\n";
         const std::string str_NZ = "NZ = " + std::to_string(NZ) + ".\n";
-        const std::string str_NBANDS
-            = "Number of eigenvalues solved = " + std::to_string(nbands) + ".\n";
+        const std::string str_NBANDS = "Number of eigenvalues solved = " + std::to_string(nbands) + ".\n";
         throw std::runtime_error(str_info + str_M + str_NZ + str_NBANDS);
     }
     else if (info / 16 % 2)
@@ -160,33 +464,48 @@ void get_lwork(int& lwork, std::vector<std::complex<float>>& work)
 }
 
 template <typename T>
-void pxxxgvx_diag(const int *const desc,
-                      const int ncol,
-                      const int nrow,
-                      const int nbands,
-                      const T *const h_mat,
-                      const T *const s_mat,
-                      typename GetTypeReal<T>::type *const ekb,
-                      T *const wfc_2d)
+void pxxxgvx_diag(const int* const desc,
+                  const int ncol,
+                  const int nrow,
+                  const int nbands,
+                  const T* const h_mat,
+                  const T* const s_mat,
+                  typename GetTypeReal<T>::type* const ekb,
+                  T* const wfc_2d)
 {
-    int nprow, npcol, myprow, mypcol;
+    int nprow = 1;
+    int npcol = 1;
+    int myprow = 0;
+    int mypcol = 0;
     Cblacs_gridinfo(desc[1], &nprow, &npcol, &myprow, &mypcol);
     int dsize = nprow * npcol;
 
     int degeneracy_max = 12; // only used for complex<float> and complex<double>
     while (true)
     {
-        std::vector<T> h_tmp(ncol * nrow);
-        std::vector<T> s_tmp(ncol * nrow);
+        std::vector<T> h_tmp(ncol * nrow, 0);
+        std::vector<T> s_tmp(ncol * nrow, 0);
         memcpy(h_tmp.data(), h_mat, sizeof(T) * ncol * nrow);
         memcpy(s_tmp.data(), s_mat, sizeof(T) * ncol * nrow);
 
         int ndim_global = desc[2];
-        const char jobz = 'V', range = 'I', uplo = 'U';
-        const int itype = 1, il = 1, iu = nbands, one = 1;
-        int M = 0, NZ = 0, lwork = -1, lrwork = -1, liwork = -1, info = 0;
-        const typename GetTypeReal<T>::type abstol = 0, orfac = -1;
-        const typename GetTypeReal<T>::type vl = 0, vu = 0;
+        const char jobz = 'V';
+        const char range = 'I';
+        const char uplo = 'U';
+        const int itype = 1;
+        const int il = 1;
+        const int iu = nbands;
+        const int one = 1;
+        int M = 0;
+        int NZ = 0;
+        int lwork = -1;
+        int lrwork = -1;
+        int liwork = -1;
+        int info = 0;
+        const typename GetTypeReal<T>::type abstol = 0;
+        const typename GetTypeReal<T>::type orfac = -1;
+        const typename GetTypeReal<T>::type vl = 0;
+        const typename GetTypeReal<T>::type vu = 0;
         std::vector<T> work(1, 0);
         std::vector<typename GetTypeReal<T>::type> rwork(3, 0); // only used for complex<float> and complex<double>
         std::vector<int> iwork(1, 0);
@@ -221,7 +540,7 @@ void pxxxgvx_diag(const int *const desc,
                  &one,
                  desc,
                  work.data(),
-                 &lwork, // is not used for real data type
+                 &lwork,       // is not used for real data type
                  rwork.data(), // is not used for real data type
                  &lrwork,
                  iwork.data(),
@@ -295,10 +614,10 @@ void pxxxgvx_diag(const int *const desc,
         {
             return;
         }
-        pxxxgvx_post_processing(info, ifail,iclustr, M,NZ,nbands,degeneracy_max);
+        pxxxgvx_post_processing(info, ifail, iclustr, M, NZ, nbands, degeneracy_max);
 
         // break the loop for real data type
-        if (std::is_same<T, float>::value || std::is_same<T,double>::value)
+        if (std::is_same<T, float>::value || std::is_same<T, double>::value)
         {
             return;
         }
@@ -306,39 +625,46 @@ void pxxxgvx_diag(const int *const desc,
 }
 
 // template instantiation
-template void pxxxgvx_diag(const int *const desc,
+// double
+template void pxxxgvx_diag(const int* const desc,
+                           const int ncol,
+                           const int nrow,
+                           const int nbands,
+                           const double* const h_mat,
+                           const double* const s_mat,
+                           double* const ekb,
+                           double* const wfc_2d);
+
+// std::complex<double>                           
+template void pxxxgvx_diag(const int* const desc,
                            const int ncol,
                            const int nrow,
                            const int nbands,
-                           const double *const h_mat,
-                           const double *const s_mat,
-                           double *const ekb,
-                           double *const wfc_2d);
-template void pxxxgvx_diag(const int *const desc,
+                           const std::complex<double>* const h_mat,
+                           const std::complex<double>* const s_mat,
+                           double* const ekb,
+                           std::complex<double>* const wfc_2d);
+
+// float
+template void pxxxgvx_diag(const int* const desc,
+                           const int ncol,
+                           const int nrow,
+                           const int nbands,
+                           const float* const h_mat,
+                           const float* const s_mat,
+                           float* const ekb,
+                           float* const wfc_2d);
+
+// std::complex<float>
+template void pxxxgvx_diag(const int* const desc,
                            const int ncol,
                            const int nrow,
                            const int nbands,
-                           const std::complex<double> *const h_mat,
-                           const std::complex<double> *const s_mat,
-                           double *const ekb,
-                           std::complex<double> *const wfc_2d);
-template void pxxxgvx_diag(const int *const desc,
-                            const int ncol,
-                            const int nrow,
-                            const int nbands,
-                            const float *const h_mat,
-                            const float *const s_mat,
-                            float *const ekb,
-                            float *const wfc_2d);
-template void pxxxgvx_diag(const int *const desc,
-                            const int ncol,
-                            const int nrow,
-                            const int nbands,
-                            const std::complex<float> *const h_mat,
-                            const std::complex<float> *const s_mat,
-                            float *const ekb,
-                            std::complex<float> *const wfc_2d);                           
+                           const std::complex<float>* const h_mat,
+                           const std::complex<float>* const s_mat,
+                           float* const ekb,
+                           std::complex<float>* const wfc_2d);
 
 #endif
 
-} // namespace
\ No newline at end of file
+} // namespace hsolver
\ No newline at end of file
diff --git a/source/module_hsolver/diago_pxxxgvx.h b/source/module_hsolver/diago_pxxxgvx.h
index 6684a5ea6f..e4f9ba7896 100644
--- a/source/module_hsolver/diago_pxxxgvx.h
+++ b/source/module_hsolver/diago_pxxxgvx.h
@@ -8,7 +8,7 @@ namespace hsolver
 
 #ifdef __MPI
 /**
- * @brief generalized eigenvalue problem using pdsygvx/pzhegvx/pdsygvx/pzhegvx
+ * @brief Wrapper function for Scalapack's generalized eigensolver routines: pdsygvx_, pzhegvx_, pssygvx_, pchegvx_.
  * 
  * @param desc the descriptor of scalapack descriptor
  * @param ncol the number of columns of the H/S matrix in current processor
diff --git a/source/module_hsolver/hsolver_pw.cpp b/source/module_hsolver/hsolver_pw.cpp
index 83bac220f8..eb71f90b5a 100644
--- a/source/module_hsolver/hsolver_pw.cpp
+++ b/source/module_hsolver/hsolver_pw.cpp
@@ -501,7 +501,7 @@ void HSolverPW<T, Device>::hamiltSolvePsiK(hamilt::Hamilt<T, Device>* hm,
                                                   this->diag_iter_max,
                                                   this->need_subspace,
                                                   comm_info,
-                                                  PARAM.inp.diag_subspace_method,
+                                                  PARAM.inp.diag_subspace,
                                                   PARAM.inp.nb2d);
 
         DiagoIterAssist<T, Device>::avg_iter += static_cast<double>(
diff --git a/source/module_hsolver/test/test_hsolver_pw.cpp b/source/module_hsolver/test/test_hsolver_pw.cpp
index 8c5da0d1c5..505f00a9df 100644
--- a/source/module_hsolver/test/test_hsolver_pw.cpp
+++ b/source/module_hsolver/test/test_hsolver_pw.cpp
@@ -32,6 +32,62 @@
  *  - 8. solve()
  *      - lcao_in_pw specific implementation
  */
+
+// mock Diago_HS_para
+namespace hsolver {
+template <typename T>
+void Diago_HS_para(T* h,
+                   T* s,
+                   const int lda,
+                   const int nband,
+                   typename GetTypeReal<T>::type* const ekb,
+                   T* const wfc,
+                   const MPI_Comm& comm,
+                   const int diag_subspace,
+                   const int block_size = 0)
+{}
+template void Diago_HS_para<double>(double* h,
+                                    double* s,
+                                    const int lda,
+                                    const int nband,
+                                    typename GetTypeReal<double>::type* const ekb,
+                                    double* const wfc,
+                                    const MPI_Comm& comm,
+                                    const int diag_subspace,
+                                    const int block_size);
+
+template void Diago_HS_para<std::complex<double>>(std::complex<double>* h,
+                                                  std::complex<double>* s,
+                                                  const int lda,
+                                                  const int nband,
+                                                  typename GetTypeReal<std::complex<double>>::type* const ekb,
+                                                  std::complex<double>* const wfc,
+                                                  const MPI_Comm& comm,
+                                                  const int diag_subspace,
+                                                  const int block_size);
+
+template void Diago_HS_para<float>(float* h,
+                                   float* s,
+                                   const int lda,
+                                   const int nband,
+                                   typename GetTypeReal<float>::type* const ekb,
+                                   float* const wfc,
+                                   const MPI_Comm& comm,
+                                   const int diag_subspace,
+                                   const int block_size);
+                                   
+template void Diago_HS_para<std::complex<float>>(std::complex<float>* h,
+                                                 std::complex<float>* s,
+                                                 const int lda,
+                                                 const int nband,
+                                                 typename GetTypeReal<std::complex<float>>::type* const ekb,
+                                                 std::complex<float>* const wfc,
+                                                 const MPI_Comm& comm,
+                                                 const int diag_subspace,
+                                                 const int block_size);
+
+}
+
 class TestHSolverPW : public ::testing::Test {
   public:
     ModulePW::PW_Basis_K pwbk;
diff --git a/source/module_hsolver/test/test_hsolver_sdft.cpp b/source/module_hsolver/test/test_hsolver_sdft.cpp
index 7b234e953e..5bf2c650d4 100644
--- a/source/module_hsolver/test/test_hsolver_sdft.cpp
+++ b/source/module_hsolver/test/test_hsolver_sdft.cpp
@@ -23,6 +23,58 @@ Sto_Func<REAL>::Sto_Func()
 }
 template class Sto_Func<double>;
 
+// mock Diago_HS_para
+namespace hsolver {
+template <typename T>
+void Diago_HS_para(T* h,
+                   T* s,
+                   const int lda,
+                   const int nband,
+                   typename GetTypeReal<T>::type* const ekb,
+                   T* const wfc,
+                   const MPI_Comm& comm,
+                   const int diag_subspace,
+                   const int block_size = 0)
+{}
+template void Diago_HS_para<double>(double* h,
+                                    double* s,
+                                    const int lda,
+                                    const int nband,
+                                    typename GetTypeReal<double>::type* const ekb,
+                                    double* const wfc,
+                                    const MPI_Comm& comm,
+                                    const int diag_subspace,
+                                    const int block_size);
+template void Diago_HS_para<std::complex<double>>(std::complex<double>* h,
+                                                  std::complex<double>* s,
+                                                  const int lda,
+                                                  const int nband,
+                                                  typename GetTypeReal<std::complex<double>>::type* const ekb,
+                                                  std::complex<double>* const wfc,
+                                                  const MPI_Comm& comm,
+                                                  const int diag_subspace,
+                                                  const int block_size);
+template void Diago_HS_para<float>(float* h,
+                                   float* s,
+                                   const int lda,
+                                   const int nband,
+                                   typename GetTypeReal<float>::type* const ekb,
+                                   float* const wfc,
+                                   const MPI_Comm& comm,
+                                   const int diag_subspace,
+                                   const int block_size);          
+template void Diago_HS_para<std::complex<float>>(std::complex<float>* h,
+                                                 std::complex<float>* s,
+                                                 const int lda,
+                                                 const int nband,
+                                                 typename GetTypeReal<std::complex<float>>::type* const ekb,
+                                                 std::complex<float>* const wfc,
+                                                 const MPI_Comm& comm,
+                                                 const int diag_subspace,
+                                                 const int block_size);
+
+}
+
 
 template <>
 elecstate::ElecStatePW<std::complex<double>, base_device::DEVICE_CPU>::ElecStatePW(ModulePW::PW_Basis_K* wfc_basis_in,
diff --git a/source/module_io/read_input_item_system.cpp b/source/module_io/read_input_item_system.cpp
index 3fd2e48bd5..184bafd363 100644
--- a/source/module_io/read_input_item_system.cpp
+++ b/source/module_io/read_input_item_system.cpp
@@ -478,9 +478,9 @@ void ReadInput::item_system()
         this->add_item(item);
     }
     {
-        Input_Item item("diag_subspace_method");
+        Input_Item item("diag_subspace");
         item.annotation = "method of subspace diagonalization in dav_subspace. 0:LaPack; 1:genelpa, 2:scalapack";
-        read_sync_int(input.diag_subspace_method);
+        read_sync_int(input.diag_subspace);
         this->add_item(item);
     }
     {
diff --git a/source/module_lr/hsolver_lrtd.hpp b/source/module_lr/hsolver_lrtd.hpp
index 3694631cfc..1a96991f47 100644
--- a/source/module_lr/hsolver_lrtd.hpp
+++ b/source/module_lr/hsolver_lrtd.hpp
@@ -96,7 +96,7 @@ namespace LR
                         maxiter,
                         false, //always do the subspace diag (check the implementation)
                         comm_info,
-                        PARAM.inp.diag_subspace_method,
+                        PARAM.inp.diag_subspace,
                         PARAM.inp.nb2d);
                     std::vector<double> ethr_band(nband, diag_ethr);
                     hsolver::DiagoIterAssist<T>::avg_iter
diff --git a/source/module_parameter/input_parameter.h b/source/module_parameter/input_parameter.h
index a593223f43..6d897888fd 100644
--- a/source/module_parameter/input_parameter.h
+++ b/source/module_parameter/input_parameter.h
@@ -88,7 +88,7 @@ struct Input_para
     double pw_diag_thr = 0.01; ///< used in cg method
     int pw_diag_ndim = 4;      ///< dimension of workspace for Davidson diagonalization
     int diago_cg_prec = 1;     ///< mohan add 2012-03-31
-    int diag_subspace_method = 0; // 0: Lapack, 1: elpa, 2: scalapack
+    int diag_subspace = 0; // 0: Lapack, 1: elpa, 2: scalapack
 
     std::string smearing_method = "gauss"; ///< "gauss",
                                            ///< "mp","methfessel-paxton"

From d729dd55c012056ddc417b2574eb5dbe6617ba82 Mon Sep 17 00:00:00 2001
From: root <pxlxingliang>
Date: Fri, 22 Nov 2024 18:37:36 +0800
Subject: [PATCH 05/12] fix

---
 source/module_hsolver/diago_dav_subspace.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/source/module_hsolver/diago_dav_subspace.cpp b/source/module_hsolver/diago_dav_subspace.cpp
index 83d9b50db3..d4947e7715 100644
--- a/source/module_hsolver/diago_dav_subspace.cpp
+++ b/source/module_hsolver/diago_dav_subspace.cpp
@@ -11,7 +11,10 @@
 #include "module_hsolver/diag_hs_para.h"
 
 #include <vector>
+
+#ifdef __MPI
 #include <mpi.h>
+#endif
 
 using namespace hsolver;
 

From 7bb7414560fa3c5292910b39a30b1d9ad11bc6f1 Mon Sep 17 00:00:00 2001
From: root <pxlxingliang>
Date: Fri, 22 Nov 2024 19:09:30 +0800
Subject: [PATCH 06/12] fix

---
 source/module_hsolver/diag_hs_para.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/source/module_hsolver/diag_hs_para.cpp b/source/module_hsolver/diag_hs_para.cpp
index ed558e492d..0ac87b9eaf 100644
--- a/source/module_hsolver/diag_hs_para.cpp
+++ b/source/module_hsolver/diag_hs_para.cpp
@@ -3,7 +3,10 @@
 #include "module_base/scalapack_connector.h"
 #include "module_basis/module_ao/parallel_2d.h"
 #include "module_hsolver/diago_pxxxgvx.h"
+
+#ifdef __ELPA
 #include "module_hsolver/genelpa/elpa_solver.h"
+#endif
 
 #include <iostream>
 
@@ -186,7 +189,7 @@ template void Diago_HS_para<float>(float* h,
                                    const MPI_Comm& comm,
                                    const int diag_subspace,
                                    const int block_size);
-                                   
+
 template void Diago_HS_para<std::complex<float>>(std::complex<float>* h,
                                                  std::complex<float>* s,
                                                  const int lda,

From 68cb71c27b3f9087fb7353b22efaf4e882f2691c Mon Sep 17 00:00:00 2001
From: root <pxlxingliang>
Date: Mon, 25 Nov 2024 10:24:45 +0800
Subject: [PATCH 07/12] fix test

---
 source/module_hsolver/test/test_diago_hs_para.cpp | 9 +--------
 tests/integrate/102_PW_DS_davsubspace_sca/INPUT   | 2 +-
 2 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/source/module_hsolver/test/test_diago_hs_para.cpp b/source/module_hsolver/test/test_diago_hs_para.cpp
index 29ed7e30a1..6115d930c2 100644
--- a/source/module_hsolver/test/test_diago_hs_para.cpp
+++ b/source/module_hsolver/test/test_diago_hs_para.cpp
@@ -126,14 +126,7 @@ void test_diago_hs(int lda, int nb, int random_seed, int nbands, int diag_type,
         h_mat.resize(lda * lda);
         s_mat.resize(lda * lda);
         wfc.resize(lda * lda);
-        //generate_random_hs(lda, random_seed, h_mat, s_mat);
-        //read h from h.dat and s from s.dat,
-        std::ifstream hfile("h.dat");
-        hfile.read((char*)h_mat.data(), lda * lda * sizeof(T));
-        hfile.close();
-        std::ifstream sfile("s.dat");
-        sfile.read((char*)s_mat.data(), lda * lda * sizeof(T));
-        sfile.close();
+        generate_random_hs(lda, random_seed, h_mat, s_mat);
     }
     hsolver::Diago_HS_para<T>(h_mat.data(), s_mat.data(), lda, nbands,ekb.data(), wfc.data(), comm, diag_type, nb);
 
diff --git a/tests/integrate/102_PW_DS_davsubspace_sca/INPUT b/tests/integrate/102_PW_DS_davsubspace_sca/INPUT
index 0cbffad0f8..b060c92bdd 100644
--- a/tests/integrate/102_PW_DS_davsubspace_sca/INPUT
+++ b/tests/integrate/102_PW_DS_davsubspace_sca/INPUT
@@ -25,5 +25,5 @@ mixing_type		plain
 mixing_beta		0.5
 
 ks_solver       dav_subspace
-diag_subspace_method 2
+diag_subspace 2
 nb2d 4

From ed488e7a2176fb033ada236b6daeed78dae5c454 Mon Sep 17 00:00:00 2001
From: root <pxlxingliang>
Date: Fri, 13 Dec 2024 16:35:37 +0800
Subject: [PATCH 08/12] fix

---
 source/module_hsolver/diag_hs_para.cpp | 2 +-
 source/module_hsolver/diag_hs_para.h   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/source/module_hsolver/diag_hs_para.cpp b/source/module_hsolver/diag_hs_para.cpp
index 0ac87b9eaf..2722a02c07 100644
--- a/source/module_hsolver/diag_hs_para.cpp
+++ b/source/module_hsolver/diag_hs_para.cpp
@@ -1,7 +1,7 @@
 #include "module_hsolver/diag_hs_para.h"
 
 #include "module_base/scalapack_connector.h"
-#include "module_basis/module_ao/parallel_2d.h"
+#include "module_base/parallel_2d.h"
 #include "module_hsolver/diago_pxxxgvx.h"
 
 #ifdef __ELPA
diff --git a/source/module_hsolver/diag_hs_para.h b/source/module_hsolver/diag_hs_para.h
index fd54ebcae1..222adca0f5 100644
--- a/source/module_hsolver/diag_hs_para.h
+++ b/source/module_hsolver/diag_hs_para.h
@@ -1,4 +1,4 @@
-#include "module_basis/module_ao/parallel_2d.h"
+#include "module_base/parallel_2d.h"
 #include "module_base/macros.h"
 
 #ifdef __MPI

From 32066231f6ae129787674fa6cc63ebb2d510fb8c Mon Sep 17 00:00:00 2001
From: root <pxlxingliang>
Date: Fri, 13 Dec 2024 17:26:20 +0800
Subject: [PATCH 09/12] fix pyabacus

---
 .../pyabacus/src/hsolver/py_diago_dav_subspace.hpp  |  8 +++++---
 python/pyabacus/src/hsolver/py_hsolver.cpp          | 11 ++++++++++-
 python/pyabacus/src/pyabacus/hsolver/_hsolver.py    | 13 +++++++++++--
 source/module_hsolver/test/CMakeLists.txt           |  2 +-
 4 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/python/pyabacus/src/hsolver/py_diago_dav_subspace.hpp b/python/pyabacus/src/hsolver/py_diago_dav_subspace.hpp
index 5af16e11d4..897d8d668e 100644
--- a/python/pyabacus/src/hsolver/py_diago_dav_subspace.hpp
+++ b/python/pyabacus/src/hsolver/py_diago_dav_subspace.hpp
@@ -108,7 +108,9 @@ class PyDiagoDavSubspace
         bool need_subspace,
         std::vector<double>& diag_ethr,
         bool scf_type,
-        hsolver::diag_comm_info comm_info
+        hsolver::diag_comm_info comm_info,
+        int diag_subspace,
+        int nb2d
     ) {
         auto hpsi_func = [mm_op] (
             std::complex<double> *psi_in,
@@ -139,8 +141,8 @@ class PyDiagoDavSubspace
             max_iter, 
             need_subspace, 
             comm_info,
-            PARAM.inp.diag_subspace,
-            PARAM.inp.nb2d
+            diag_subspace,
+            nb2d
         );
 
         return obj->diag(hpsi_func, psi, nbasis, eigenvalue, diag_ethr, scf_type);
diff --git a/python/pyabacus/src/hsolver/py_hsolver.cpp b/python/pyabacus/src/hsolver/py_hsolver.cpp
index dfe7063ea4..e791fe9f09 100644
--- a/python/pyabacus/src/hsolver/py_hsolver.cpp
+++ b/python/pyabacus/src/hsolver/py_hsolver.cpp
@@ -67,6 +67,13 @@ void bind_hsolver(py::module& m)
                 where the initial precision of eigenvalue calculation can be coarse.
                 If false, it indicates a non-self-consistent field (non-SCF) calculation,
                 where high precision in eigenvalue calculation is required from the start.
+            comm_info : diag_comm_info
+                The communicator information.
+            diago_subspace : int
+                The method to solve the generalized eigenvalue problem.
+                0: LAPACK, 1: Gen-ELPA, 2: ScaLAPACK
+            nb2d : int
+                The block size in 2d block cyclic distribution if use elpa or scalapack.
         )pbdoc", 
         "mm_op"_a, 
         "precond_vec"_a, 
@@ -76,7 +83,9 @@ void bind_hsolver(py::module& m)
         "need_subspace"_a, 
         "diag_ethr"_a, 
         "scf_type"_a, 
-        "comm_info"_a)
+        "comm_info"_a,
+        "diago_subspace"_a,
+        "nb2d"_a)
         .def("set_psi", &py_hsolver::PyDiagoDavSubspace::set_psi, R"pbdoc(
             Set the initial guess of the eigenvectors, i.e. the wave functions.
         )pbdoc", "psi_in"_a)
diff --git a/python/pyabacus/src/pyabacus/hsolver/_hsolver.py b/python/pyabacus/src/pyabacus/hsolver/_hsolver.py
index 5d2b11d99a..7c60ce538c 100644
--- a/python/pyabacus/src/pyabacus/hsolver/_hsolver.py
+++ b/python/pyabacus/src/pyabacus/hsolver/_hsolver.py
@@ -34,7 +34,9 @@ def dav_subspace(
     max_iter: int = 1000,
     need_subspace: bool = False,
     diag_ethr: Union[List[float], None] = None,
-    scf_type: bool = False
+    scf_type: bool = False,
+    diag_subspace: int = 0,
+    nb2d: int = 0
 ) -> Tuple[NDArray[np.float64], NDArray[np.complex128]]:
     """ A function to diagonalize a matrix using the Davidson-Subspace method.
 
@@ -67,6 +69,11 @@ def dav_subspace(
         If True, the initial precision of eigenvalue calculation can be coarse. 
         If False, it indicates a non-self-consistent field (non-SCF) calculation, 
         where high precision in eigenvalue calculation is required from the start.  
+    diag_subspace : int, optional
+        The method to do the diagonalization, by default 0.
+        0: LAPACK, 1: Gen-elpa, 2: Scalapack
+    nb2d : int, optional
+        The block size for 2D decomposition, by default 0, which will be automatically set.
     
     Returns
     -------
@@ -101,7 +108,9 @@ def dav_subspace(
         need_subspace,
         diag_ethr,
         scf_type,
-        comm_info
+        comm_info,
+        diag_subspace,
+        nb2d
     )
     
     e = _diago_obj_dav_subspace.get_eigenvalue()
diff --git a/source/module_hsolver/test/CMakeLists.txt b/source/module_hsolver/test/CMakeLists.txt
index fd93894e3e..fdb447a09d 100644
--- a/source/module_hsolver/test/CMakeLists.txt
+++ b/source/module_hsolver/test/CMakeLists.txt
@@ -157,7 +157,7 @@ install(FILES parallel_k2d_test.sh DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
 AddTest(
   TARGET hsolver_diago_hs_parallel
   LIBS parameter  ${math_libs} ELPA::ELPA base device MPI::MPI_CXX genelpa psi
-  SOURCES test_diago_hs_para.cpp ../diag_hs_para.cpp ../diago_pxxxgvx.cpp  ../../module_basis/module_ao/parallel_2d.cpp ../diago_elpa.cpp ../diago_scalapack.cpp 
+  SOURCES test_diago_hs_para.cpp ../diag_hs_para.cpp ../diago_pxxxgvx.cpp ../diago_elpa.cpp ../diago_scalapack.cpp 
 )
 
 find_program(BASH bash)

From f4d1ea869f009ce63eccf46cf46c7c904e417ae6 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci-lite[bot]"
 <117423508+pre-commit-ci-lite[bot]@users.noreply.github.com>
Date: Fri, 13 Dec 2024 10:14:19 +0000
Subject: [PATCH 10/12] [pre-commit.ci lite] apply automatic fixes

---
 source/module_hsolver/test/test_diago_hs_para.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/source/module_hsolver/test/test_diago_hs_para.cpp b/source/module_hsolver/test/test_diago_hs_para.cpp
index 6115d930c2..0196622049 100644
--- a/source/module_hsolver/test/test_diago_hs_para.cpp
+++ b/source/module_hsolver/test/test_diago_hs_para.cpp
@@ -180,7 +180,8 @@ void test_performance(int lda, int nb, int nbands, MPI_Comm comm,int case_numb,
     std::vector<double> time_scalap(case_numb, 0);
     std::vector<double> time_lapack(case_numb, 0);
 
-    if (my_rank == 0) std::cout << "Random matrix ";
+    if (my_rank == 0) { std::cout << "Random matrix ";
+}
     for (int randomi = 0; randomi < case_numb; ++randomi) 
     {
         

From 124cdf35b9fd34d7d514824bf0016ab698c1be8c Mon Sep 17 00:00:00 2001
From: root <pxlxingliang>
Date: Sat, 14 Dec 2024 10:00:19 +0800
Subject: [PATCH 11/12] fix doc

---
 docs/advanced/input_files/input-main.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/advanced/input_files/input-main.md b/docs/advanced/input_files/input-main.md
index 2e1ebea719..bfa5879899 100644
--- a/docs/advanced/input_files/input-main.md
+++ b/docs/advanced/input_files/input-main.md
@@ -39,7 +39,7 @@
     - [pw\_diag\_thr](#pw_diag_thr)
     - [pw\_diag\_nmax](#pw_diag_nmax)
     - [pw\_diag\_ndim](#pw_diag_ndim)
-    - [diag\_subspace\_method](#diag_subspace)
+    - [diag\_subspace](#diag_subspace)
     - [erf\_ecut](#erf_ecut)
     - [fft\_mode](#fft_mode)
     - [erf\_height](#erf_height)

From 9825563c76beb5aa4eb7ab7359059c9730a48f5c Mon Sep 17 00:00:00 2001
From: root <pxlxingliang>
Date: Wed, 18 Dec 2024 23:09:07 +0800
Subject: [PATCH 12/12] update the doc

---
 docs/advanced/input_files/input-main.md       | 24 +++++++++++--------
 source/module_hsolver/diag_hs_para.cpp        | 10 ++++----
 source/module_hsolver/diag_hs_para.h          |  2 +-
 source/module_hsolver/diago_dav_subspace.cpp  |  2 +-
 .../test/test_diago_hs_para.cpp               |  6 ++---
 .../module_hsolver/test/test_hsolver_pw.cpp   | 12 +++++-----
 .../module_hsolver/test/test_hsolver_sdft.cpp | 12 +++++-----
 7 files changed, 36 insertions(+), 32 deletions(-)

diff --git a/docs/advanced/input_files/input-main.md b/docs/advanced/input_files/input-main.md
index bfa5879899..3d8f82f98d 100644
--- a/docs/advanced/input_files/input-main.md
+++ b/docs/advanced/input_files/input-main.md
@@ -21,6 +21,7 @@
     - [kspacing](#kspacing)
     - [min\_dist\_coef](#min_dist_coef)
     - [device](#device)
+    - [nb2d](#nb2d)
     - [precision](#precision)
   - [Variables related to input files](#variables-related-to-input-files)
     - [stru\_file](#stru_file)
@@ -45,7 +46,6 @@
     - [erf\_height](#erf_height)
     - [erf\_sigma](#erf_sigma)
   - [Numerical atomic orbitals related variables](#numerical-atomic-orbitals-related-variables)
-    - [nb2d](#nb2d)
     - [lmaxmax](#lmaxmax)
     - [lcao\_ecut](#lcao_ecut)
     - [lcao\_dk](#lcao_dk)
@@ -667,6 +667,19 @@ If only one value is set (such as `kspacing 0.5`), then kspacing values of a/b/c
   - cg/bpcg/dav ks_solver: required by the `single` precision options
 - **Default**: double
 
+### nb2d
+
+- **Type**: Integer
+- **Description**: When using elpa or scalapack to solver the eigenvalue problem, the data should be distributed by the two-dimensional block-cyclic distribution. This paramter specifies the size of the block. It is valid for:
+  - [ks_solver](#ks_solver) is genelpa or scalapack_gvx. If nb2d is set to 0, then it will be automatically set in the program according to the size of atomic orbital basis:
+    - if size <= 500: nb2d = 1
+    - if 500 < size <= 1000: nb2d = 32
+    - if size > 1000: nb2d = 64;
+  - [ks_solver](#ks_solver) is dav_subspace, and [diag_subspace](#diag_subspace) is 1 or 2. It is the block size for the diagonization of subspace. If it is set to 0, then it will be automatically set in the program according to the number of band:
+    - if number of band > 500: nb2d = 32
+    - if number of band < 500: nb2d = 16
+- **Default**: 0
+
 [back to top](#full-list-of-input-keywords)
 
 ## Variables related to input files
@@ -842,15 +855,6 @@ These variables are used to control the plane wave related parameters.
 
 These variables are used to control the numerical atomic orbitals related parameters.
 
-### nb2d
-
-- **Type**: Integer
-- **Description**: In LCAO calculations, we arrange the total number of processors in an 2D array, so that we can partition the wavefunction matrix (number of bands*total size of atomic orbital basis) and distribute them in this 2D array. When the system is large, we group processors into sizes of nb2d, so that multiple processors take care of one row block (a group of atomic orbitals) in the wavefunction matrix. If set to 0, nb2d will be automatically set in the program according to the size of atomic orbital basis:
-  - if size <= 500 : nb2d = 1
-  - if 500 < size <= 1000 : nb2d = 32
-  - if size > 1000 : nb2d = 64;
-- **Default**: 0
-
 ### lmaxmax
 
 - **Type**: Integer
diff --git a/source/module_hsolver/diag_hs_para.cpp b/source/module_hsolver/diag_hs_para.cpp
index 2722a02c07..43d25a3112 100644
--- a/source/module_hsolver/diag_hs_para.cpp
+++ b/source/module_hsolver/diag_hs_para.cpp
@@ -71,7 +71,7 @@ void elpa_diag(MPI_Comm comm,
 #ifdef __MPI
 
 template <typename T>
-void Diago_HS_para(T* h,
+void diago_hs_para(T* h,
                    T* s,
                    const int lda,
                    const int nband,
@@ -160,7 +160,7 @@ void Diago_HS_para(T* h,
 }
 
 // template instantiation
-template void Diago_HS_para<double>(double* h,
+template void diago_hs_para<double>(double* h,
                                     double* s,
                                     const int lda,
                                     const int nband,
@@ -170,7 +170,7 @@ template void Diago_HS_para<double>(double* h,
                                     const int diag_subspace,
                                     const int block_size);
 
-template void Diago_HS_para<std::complex<double>>(std::complex<double>* h,
+template void diago_hs_para<std::complex<double>>(std::complex<double>* h,
                                                   std::complex<double>* s,
                                                   const int lda,
                                                   const int nband,
@@ -180,7 +180,7 @@ template void Diago_HS_para<std::complex<double>>(std::complex<double>* h,
                                                   const int diag_subspace,
                                                   const int block_size);
 
-template void Diago_HS_para<float>(float* h,
+template void diago_hs_para<float>(float* h,
                                    float* s,
                                    const int lda,
                                    const int nband,
@@ -190,7 +190,7 @@ template void Diago_HS_para<float>(float* h,
                                    const int diag_subspace,
                                    const int block_size);
 
-template void Diago_HS_para<std::complex<float>>(std::complex<float>* h,
+template void diago_hs_para<std::complex<float>>(std::complex<float>* h,
                                                  std::complex<float>* s,
                                                  const int lda,
                                                  const int nband,
diff --git a/source/module_hsolver/diag_hs_para.h b/source/module_hsolver/diag_hs_para.h
index 222adca0f5..659f3b0781 100644
--- a/source/module_hsolver/diag_hs_para.h
+++ b/source/module_hsolver/diag_hs_para.h
@@ -31,7 +31,7 @@ namespace hsolver
  * @note 4. block_size should be 0 or a positive integer. If it is 0, then will use a value as large as possible that is allowed
  */
 template <typename T>
-void Diago_HS_para(T* h,
+void diago_hs_para(T* h,
                    T* s,
                    const int lda,
                    const int nband,
diff --git a/source/module_hsolver/diago_dav_subspace.cpp b/source/module_hsolver/diago_dav_subspace.cpp
index b398b52676..82dadcb0d0 100644
--- a/source/module_hsolver/diago_dav_subspace.cpp
+++ b/source/module_hsolver/diago_dav_subspace.cpp
@@ -638,7 +638,7 @@ void Diago_DavSubspace<T, Device>::diag_zhegvx(const int& nbase,
                     }
                 }
             }
-            Diago_HS_para(h_diag.data(),
+            diago_hs_para(h_diag.data(),
                           s_diag.data(),
                           nbase,
                           nband,
diff --git a/source/module_hsolver/test/test_diago_hs_para.cpp b/source/module_hsolver/test/test_diago_hs_para.cpp
index 0196622049..51efda5e05 100644
--- a/source/module_hsolver/test/test_diago_hs_para.cpp
+++ b/source/module_hsolver/test/test_diago_hs_para.cpp
@@ -128,7 +128,7 @@ void test_diago_hs(int lda, int nb, int random_seed, int nbands, int diag_type,
         wfc.resize(lda * lda);
         generate_random_hs(lda, random_seed, h_mat, s_mat);
     }
-    hsolver::Diago_HS_para<T>(h_mat.data(), s_mat.data(), lda, nbands,ekb.data(), wfc.data(), comm, diag_type, nb);
+    hsolver::diago_hs_para<T>(h_mat.data(), s_mat.data(), lda, nbands,ekb.data(), wfc.data(), comm, diag_type, nb);
 
     // Verify results
     if (my_rank == 0){
@@ -195,7 +195,7 @@ void test_performance(int lda, int nb, int nbands, MPI_Comm comm,int case_numb,
         auto start = std::chrono::high_resolution_clock::now();
         for (int j=0;j<loop_numb;j++)
         {
-            hsolver::Diago_HS_para<T>(h_mat.data(), s_mat.data(), lda, nbands,ekb_elpa.data(), wfc.data(), comm, 1, nb);
+            hsolver::diago_hs_para<T>(h_mat.data(), s_mat.data(), lda, nbands,ekb_elpa.data(), wfc.data(), comm, 1, nb);
             MPI_Barrier(comm);
         }
         MPI_Barrier(comm);
@@ -207,7 +207,7 @@ void test_performance(int lda, int nb, int nbands, MPI_Comm comm,int case_numb,
         start = std::chrono::high_resolution_clock::now();
         for (int j=0;j<loop_numb;j++)
         {
-            hsolver::Diago_HS_para<T>(h_mat.data(), s_mat.data(), lda, nbands,ekb_scalap.data(), wfc.data(), comm, 2, nb);
+            hsolver::diago_hs_para<T>(h_mat.data(), s_mat.data(), lda, nbands,ekb_scalap.data(), wfc.data(), comm, 2, nb);
             MPI_Barrier(comm);
         }
         MPI_Barrier(comm);
diff --git a/source/module_hsolver/test/test_hsolver_pw.cpp b/source/module_hsolver/test/test_hsolver_pw.cpp
index 87849b3e32..530491a669 100644
--- a/source/module_hsolver/test/test_hsolver_pw.cpp
+++ b/source/module_hsolver/test/test_hsolver_pw.cpp
@@ -33,10 +33,10 @@
  *      - lcao_in_pw specific implementation
  */
 
-// mock Diago_HS_para
+// mock diago_hs_para
 namespace hsolver {
 template <typename T>
-void Diago_HS_para(T* h,
+void diago_hs_para(T* h,
                    T* s,
                    const int lda,
                    const int nband,
@@ -46,7 +46,7 @@ void Diago_HS_para(T* h,
                    const int diag_subspace,
                    const int block_size = 0)
 {}
-template void Diago_HS_para<double>(double* h,
+template void diago_hs_para<double>(double* h,
                                     double* s,
                                     const int lda,
                                     const int nband,
@@ -56,7 +56,7 @@ template void Diago_HS_para<double>(double* h,
                                     const int diag_subspace,
                                     const int block_size);
 
-template void Diago_HS_para<std::complex<double>>(std::complex<double>* h,
+template void diago_hs_para<std::complex<double>>(std::complex<double>* h,
                                                   std::complex<double>* s,
                                                   const int lda,
                                                   const int nband,
@@ -66,7 +66,7 @@ template void Diago_HS_para<std::complex<double>>(std::complex<double>* h,
                                                   const int diag_subspace,
                                                   const int block_size);
 
-template void Diago_HS_para<float>(float* h,
+template void diago_hs_para<float>(float* h,
                                    float* s,
                                    const int lda,
                                    const int nband,
@@ -76,7 +76,7 @@ template void Diago_HS_para<float>(float* h,
                                    const int diag_subspace,
                                    const int block_size);
                                    
-template void Diago_HS_para<std::complex<float>>(std::complex<float>* h,
+template void diago_hs_para<std::complex<float>>(std::complex<float>* h,
                                                  std::complex<float>* s,
                                                  const int lda,
                                                  const int nband,
diff --git a/source/module_hsolver/test/test_hsolver_sdft.cpp b/source/module_hsolver/test/test_hsolver_sdft.cpp
index 3d60f21c4d..274a619993 100644
--- a/source/module_hsolver/test/test_hsolver_sdft.cpp
+++ b/source/module_hsolver/test/test_hsolver_sdft.cpp
@@ -23,10 +23,10 @@ Sto_Func<REAL>::Sto_Func()
 }
 template class Sto_Func<double>;
 
-// mock Diago_HS_para
+// mock diago_hs_para
 namespace hsolver {
 template <typename T>
-void Diago_HS_para(T* h,
+void diago_hs_para(T* h,
                    T* s,
                    const int lda,
                    const int nband,
@@ -36,7 +36,7 @@ void Diago_HS_para(T* h,
                    const int diag_subspace,
                    const int block_size = 0)
 {}
-template void Diago_HS_para<double>(double* h,
+template void diago_hs_para<double>(double* h,
                                     double* s,
                                     const int lda,
                                     const int nband,
@@ -45,7 +45,7 @@ template void Diago_HS_para<double>(double* h,
                                     const MPI_Comm& comm,
                                     const int diag_subspace,
                                     const int block_size);
-template void Diago_HS_para<std::complex<double>>(std::complex<double>* h,
+template void diago_hs_para<std::complex<double>>(std::complex<double>* h,
                                                   std::complex<double>* s,
                                                   const int lda,
                                                   const int nband,
@@ -54,7 +54,7 @@ template void Diago_HS_para<std::complex<double>>(std::complex<double>* h,
                                                   const MPI_Comm& comm,
                                                   const int diag_subspace,
                                                   const int block_size);
-template void Diago_HS_para<float>(float* h,
+template void diago_hs_para<float>(float* h,
                                    float* s,
                                    const int lda,
                                    const int nband,
@@ -63,7 +63,7 @@ template void Diago_HS_para<float>(float* h,
                                    const MPI_Comm& comm,
                                    const int diag_subspace,
                                    const int block_size);          
-template void Diago_HS_para<std::complex<float>>(std::complex<float>* h,
+template void diago_hs_para<std::complex<float>>(std::complex<float>* h,
                                                  std::complex<float>* s,
                                                  const int lda,
                                                  const int nband,