diff --git a/source/source_hsolver/diago_cusolver.cpp b/source/source_hsolver/diago_cusolver.cpp
index a42c7844ba..960c4ec5d7 100644
--- a/source/source_hsolver/diago_cusolver.cpp
+++ b/source/source_hsolver/diago_cusolver.cpp
@@ -15,24 +15,13 @@ using complex = std::complex<double>;
 // Namespace for the diagonalization solver
 namespace hsolver
 {
-// this struct is used for collecting matrices from all processes to root process
-template <typename T>
-struct Matrix_g
-{
-    std::shared_ptr<T> p;
-    size_t row;
-    size_t col;
-    std::shared_ptr<int> desc;
-};
-
 // Initialize the DecomposedState variable for real and complex numbers
 template <typename T>
 int DiagoCusolver<T>::DecomposedState = 0;
 
 template <typename T>
-DiagoCusolver<T>::DiagoCusolver(const Parallel_Orbitals* ParaV)
+DiagoCusolver<T>::DiagoCusolver()
 {
-    this->ParaV = ParaV;
 }
 
 template <typename T>
@@ -40,162 +29,25 @@ DiagoCusolver<T>::~DiagoCusolver()
 {
 }
 
-// Wrapper for pdgemr2d and pzgemr2d
-// static inline void Cpxgemr2d(
-//     const int M, const int N,
-//     double *a, const int ia, const int ja, const int *desca,
-//     double *b, const int ib, const int jb, const int *descb,
-//     const int blacs_ctxt)
-//{
-//    pdgemr2d_(&M, &N,
-//              a, &ia, &ja, desca,
-//              b, &ib, &jb, descb,
-//              &blacs_ctxt);
-//}
-//
-// static inline void Cpxgemr2d(
-//     const int M, const int N,
-//     complex *a, const int ia, const int ja, const int *desca,
-//     complex *b, const int ib, const int jb, const int *descb,
-//     const int blacs_ctxt)
-//{
-//    pzgemr2d_(&M, &N,
-//              a, &ia, &ja, desca,
-//              b, &ib, &jb, descb,
-//              &blacs_ctxt);
-//}
-
-// Use Cpxgemr2d to collect matrices from all processes to root process
-template <typename mat, typename matg>
-static void gatherMatrix(const int myid, const int root_proc, const mat& mat_l, matg& mat_g)
-{
-    auto a = mat_l.p;
-    const int* desca = mat_l.desc;
-    int ctxt = desca[1];
-    int nrows = desca[2];
-    int ncols = desca[3];
-
-    if (myid == root_proc)
-    {
-        mat_g.p.reset(new typename std::remove_reference<decltype(*a)>::type[nrows * ncols]);
-    }
-    else
-    {
-        mat_g.p.reset(new typename std::remove_reference<decltype(*a)>::type[1]);
-    }
-
-    // Set descb, which has all elements in the only block in the root process
-    mat_g.desc.reset(new int[9]{1, ctxt, nrows, ncols, nrows, ncols, 0, 0, nrows});
-
-    mat_g.row = nrows;
-    mat_g.col = ncols;
-
-    Cpxgemr2d(nrows, ncols, a, 1, 1, const_cast<int*>(desca), mat_g.p.get(), 1, 1, mat_g.desc.get(), ctxt);
-}
-
-// Convert the Psi to a 2D block storage format
-template <typename T>
-static void distributePsi(const int* desc_psi, T* psi, T* psi_g)
-{
-    int ctxt = desc_psi[1];
-    int nrows = desc_psi[2];
-    int ncols = desc_psi[3];
-    int rsrc = desc_psi[6];
-    int csrc = desc_psi[7];
-
-    int descg[9] = {1, ctxt, nrows, ncols, nrows, ncols, rsrc, csrc, nrows};
-    int descl[9];
-
-    std::copy(desc_psi, desc_psi + 9, descl);
-
-    Cpxgemr2d(nrows, ncols, psi_g, 1, 1, descg, psi, 1, 1, descl, ctxt);
-}
-
 // Diagonalization function
 template <typename T>
-void DiagoCusolver<T>::diag(hamilt::Hamilt<T>* phm_in, psi::Psi<T>& psi, Real* eigenvalue_in)
+void DiagoCusolver<T>::diag(
+    hamilt::MatrixBlock<T>& h_mat,
+    hamilt::MatrixBlock<T>& s_mat,
+    psi::Psi<T>& psi,
+    Real* eigenvalue_in)
 {
-    // Output the title for the current operation
     ModuleBase::TITLE("DiagoCusolver", "diag");
-
-    // Create matrices for the Hamiltonian and overlap
-    hamilt::MatrixBlock<T> h_mat;
-    hamilt::MatrixBlock<T> s_mat;
-    phm_in->matrix(h_mat, s_mat);
-
-#ifdef __MPI
-    // global matrix
-    Matrix_g<T> h_mat_g;
-    Matrix_g<T> s_mat_g;
-
-    // get the context and process information
-    int ctxt = ParaV->blacs_ctxt;
-    int nprows = 0;
-    int npcols = 0;
-    int myprow = 0;
-    int mypcol = 0;
-    Cblacs_gridinfo(ctxt, &nprows, &npcols, &myprow, &mypcol);
-    const int num_procs = nprows * npcols;
-    const int myid = Cblacs_pnum(ctxt, myprow, mypcol);
-    const int root_proc = Cblacs_pnum(ctxt, ParaV->desc[6], ParaV->desc[7]);
-#endif
-
+    ModuleBase::timer::tick("DiagoCusolver", "cusolver");
     // Allocate memory for eigenvalues
     std::vector<double> eigen(PARAM.globalv.nlocal, 0.0);
-
-    // Start the timer for the cusolver operation
-    ModuleBase::timer::tick("DiagoCusolver", "cusolver");
-
-#ifdef __MPI
-    if (num_procs > 1)
-    {
-        // gather matrices from processes to root process
-        gatherMatrix(myid, root_proc, h_mat, h_mat_g);
-        gatherMatrix(myid, root_proc, s_mat, s_mat_g);
-    }
-#endif
-
-    // Call the dense diagonalization routine
-#ifdef __MPI
-    if (num_procs > 1)
-    {
-        MPI_Barrier(MPI_COMM_WORLD);
-        // global psi for distribute
-        int psi_len = myid == root_proc ? h_mat_g.row * h_mat_g.col : 1;
-        std::vector<T> psi_g(psi_len);
-        if (myid == root_proc)
-        {
-            this->dc.Dngvd(h_mat_g.col, h_mat_g.row, h_mat_g.p.get(), s_mat_g.p.get(), eigen.data(), psi_g.data());
-        }
-
-        MPI_Barrier(MPI_COMM_WORLD);
-
-        // broadcast eigenvalues to all processes
-        MPI_Bcast(eigen.data(), PARAM.inp.nbands, MPI_DOUBLE, root_proc, MPI_COMM_WORLD);
-
-        // distribute psi to all processes
-        distributePsi(this->ParaV->desc_wfc, psi.get_pointer(), psi_g.data());
-    }
-    else
-    {
-        // Be careful that h_mat.row * h_mat.col != psi.get_nbands() * psi.get_nbasis() under multi-k situation
-        std::vector<T> eigenvectors(h_mat.row * h_mat.col);
-        this->dc.Dngvd(h_mat.row, h_mat.col, h_mat.p, s_mat.p, eigen.data(), eigenvectors.data());
-        const int size = psi.get_nbands() * psi.get_nbasis();
-        BlasConnector::copy(size, eigenvectors.data(), 1, psi.get_pointer(), 1);
-    }
-#else
     std::vector<T> eigenvectors(h_mat.row * h_mat.col);
     this->dc.Dngvd(h_mat.row, h_mat.col, h_mat.p, s_mat.p, eigen.data(), eigenvectors.data());
     const int size = psi.get_nbands() * psi.get_nbasis();
     BlasConnector::copy(size, eigenvectors.data(), 1, psi.get_pointer(), 1);
-#endif
-    // Stop the timer for the cusolver operation
-    ModuleBase::timer::tick("DiagoCusolver", "cusolver");
-
-    // Copy the eigenvalues to the output arrays
     const int inc = 1;
     BlasConnector::copy(PARAM.inp.nbands, eigen.data(), inc, eigenvalue_in, inc);
+    ModuleBase::timer::tick("DiagoCusolver", "cusolver");
 }
 
 // Explicit instantiation of the DiagoCusolver class for real and complex numbers
diff --git a/source/source_hsolver/diago_cusolver.h b/source/source_hsolver/diago_cusolver.h
index bdfdaa86ee..5f895ea21d 100644
--- a/source/source_hsolver/diago_cusolver.h
+++ b/source/source_hsolver/diago_cusolver.h
@@ -17,15 +17,18 @@ class DiagoCusolver
 {
   private:
     // Real is the real part of the complex type T
    using Real = typename GetTypeReal<T>::type;
-    Parallel_Orbitals const * ParaV;
 
   public:
-    DiagoCusolver(const Parallel_Orbitals* ParaV = nullptr);
+    DiagoCusolver();
     ~DiagoCusolver();
 
     // Override the diag function for CUSOLVER diagonalization
-    void diag(hamilt::Hamilt<T>* phm_in, psi::Psi<T>& psi, Real* eigenvalue_in);
+    void diag(
+        hamilt::MatrixBlock<T>& h_mat,
+        hamilt::MatrixBlock<T>& s_mat,
+        psi::Psi<T>& psi,
+        Real* eigenvalue_in);
 
     // Static variable to keep track of the decomposition state
     static int DecomposedState;
diff --git a/source/source_hsolver/hsolver_lcao.cpp b/source/source_hsolver/hsolver_lcao.cpp
index 2a168f8b24..a9d940216b 100644
--- a/source/source_hsolver/hsolver_lcao.cpp
+++ b/source/source_hsolver/hsolver_lcao.cpp
@@ -48,14 +48,20 @@ void HSolverLCAO<T>::solve(hamilt::Hamilt<T>* pHamilt,
 
     if (this->method != "pexsi")
     {
+        #ifdef __MPI
+        #ifdef __CUDA
+        if (this->method == "cusolver" && GlobalV::NPROC > 1)
+        {
+            this->parakSolve_cusolver(pHamilt, psi, pes);
+        }else
+        #endif
         if (PARAM.globalv.kpar_lcao > 1
             && (this->method == "genelpa" || this->method == "elpa" || this->method == "scalapack_gvx"))
         {
-#ifdef __MPI
             this->parakSolve(pHamilt, psi, pes, PARAM.globalv.kpar_lcao);
-#endif
-        }
-        else if (PARAM.globalv.kpar_lcao == 1)
+        } else
+        #endif
+        if (PARAM.globalv.kpar_lcao == 1)
         {
             /// Loop over k points for solve Hamiltonian to eigenpairs(eigenvalues and eigenvectors).
             for (int ik = 0; ik < psi.get_nk(); ++ik)
@@ -151,8 +157,11 @@ void HSolverLCAO<T>::hamiltSolvePsiK(hamilt::Hamilt<T>* hm, psi::Psi<T>&
 #ifdef __CUDA
     else if (this->method == "cusolver")
     {
-        DiagoCusolver<T> cs(this->ParaV);
-        cs.diag(hm, psi, eigenvalue);
+        // Note: This branch will only be executed in the single-process case
+        DiagoCusolver<T> cu;
+        hamilt::MatrixBlock<T> hk, sk;
+        hm->matrix(hk, sk);
+        cu.diag(hk, sk, psi, eigenvalue);
     }
 #ifdef __CUSOLVERMP
     else if (this->method == "cusolvermp")
@@ -295,6 +304,191 @@ void HSolverLCAO<T>::parakSolve(hamilt::Hamilt<T>* pHamilt,
 #endif
 }
 
+#if defined (__MPI) && defined (__CUDA)
+template <typename T>
+void HSolverLCAO<T>::parakSolve_cusolver(hamilt::Hamilt<T>* pHamilt,
+                                         psi::Psi<T>& psi,
+                                         elecstate::ElecState* pes)
+{
+    ModuleBase::timer::tick("HSolverLCAO", "parakSolve");
+    const int dev_id = base_device::information::set_device_by_rank();
+
+    int world_rank, world_size;
+    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
+
+    // Split communicator by shared memory node
+    MPI_Comm nodeComm;
+    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, world_rank, MPI_INFO_NULL, &nodeComm);
+
+    int local_rank, local_size;
+    MPI_Comm_rank(nodeComm, &local_rank);
+    MPI_Comm_size(nodeComm, &local_size);
+
+    // Get number of CUDA devices on this node
+    int device_count = 0;
+    cudaError_t cuda_err = cudaGetDeviceCount(&device_count);
+    if (cuda_err != cudaSuccess) {
+        device_count = 0; // Treat as no GPU available
+    }
+
+    if(local_rank >= device_count) {
+        local_rank = -1; // Mark as inactive for GPU work
+    }
+
+    // Determine the number of MPI processes on this node that can actively use a GPU.
+    // This is the minimum of:
+    //   - The number of available MPI processes on the node (local_size)
+    //   - The number of available CUDA-capable GPUs on the node (device_count)
+    // Each GPU is assumed to be used by one dedicated MPI process.
+    // Thus, only the first 'min(local_size, device_count)' ranks on this node
+    // will be assigned GPU work; the rest will be inactive or used for communication-only roles.
+    int active_procs_per_node = std::min(local_size, device_count);
+
+    std::vector<int> all_active_procs(world_size);
+    std::vector<int> all_local_ranks(world_size);
+
+    MPI_Allgather(&active_procs_per_node, 1, MPI_INT,
+                  all_active_procs.data(), 1, MPI_INT, MPI_COMM_WORLD);
+    MPI_Allgather(&local_rank, 1, MPI_INT,
+                  all_local_ranks.data(), 1, MPI_INT, MPI_COMM_WORLD);
+
+    int total_active_ranks = 0;
+    int total_nodes = 0;
+    int highest_active_rank = 0;
+
+    for (int i = 0; i < world_size; ++i) {
+        if (all_local_ranks[i] == 0) { // new node
+            total_nodes++;
+            total_active_ranks += all_active_procs[i];
+            highest_active_rank = std::max(highest_active_rank, all_active_procs[i] - 1);
+        }
+    }
+
+    // active_ranks will store the global ranks of all active processes across all nodes
+    // The order of global ranks stored here determines the order in which they will be assigned to k-points.
+    // The k-points will be distributed among these ranks in a round-robin fashion.
+    // The purpose of setting the order is to ensure load balancing among nodes as much as possible
+    std::vector<int> active_ranks;
+    for(int i = 0; i <= highest_active_rank; i++)
+    {
+        for(int j = 0; j < world_size; j++)
+        {
+            if(all_local_ranks[j] == i)
+            {
+                active_ranks.push_back(j);
+            }
+        }
+    }
+
+    const int nks = psi.get_nk(); // total number of k points
+    const int nbands = this->ParaV->get_nbands();
+    // Set the parallel storage scheme for the matrix and psi
+    Parallel_2D mat_para_global; // store the info about how the origin matrix is distributed in parallel
+    Parallel_2D mat_para_local; // store the info about how the matrix is distributed after collected from all processes
+    Parallel_2D psi_para_global; // store the info about how the psi is distributed in parallel
+    Parallel_2D psi_para_local; // store the info about how the psi is distributed before distributing to all processes
+
+    MPI_Comm self_comm; // the communicator that only contains the current process itself
+    MPI_Comm_split(MPI_COMM_WORLD, world_rank, 0, &self_comm);
+    int nrow = this->ParaV->get_global_row_size(); // number of rows in the global matrix
+    int ncol = nrow;
+    int nb2d = this->ParaV->get_block_size(); // block size for the 2D matrix distribution
+    mat_para_global.init(nrow, ncol, nb2d, MPI_COMM_WORLD);
+    psi_para_global.init(nrow, nbands, nb2d, MPI_COMM_WORLD);
+    mat_para_local.init(nrow, ncol, nb2d, self_comm);
+    psi_para_local.init(nrow, ncol, nb2d, self_comm);
+
+    std::vector<T> hk_mat; // temporary storage for H(k) matrix collected from all processes
+    std::vector<T> sk_mat; // temporary storage for S(k) matrix collected from all processes
+    // In each iteration, we process total_active_ranks k-points.
+    for(int ik_start = 0; ik_start < nks; ik_start += total_active_ranks)
+    {
+        int kpt_assigned = -1; // the k-point assigned to the current MPI process in this iteration
+        // Compute and gather the hk and sk matrices distributed across different processes in parallel,
+        // preparing for subsequent transfer to the GPU for computation.
+        for(int ik = ik_start; ik < ik_start + total_active_ranks && ik < nks; ik++)
+        {
+            // `is_active` indicates whether this MPI process is assigned to compute the current k-point
+            bool is_active = world_rank == active_ranks[ik % total_active_ranks];
+            if (is_active)
+            {
+                kpt_assigned = ik;
+                hk_mat.resize(nrow * ncol);
+                sk_mat.resize(nrow * ncol);
+            }
+            pHamilt->updateHk(ik);
+            hamilt::MatrixBlock<T> hk_2D, sk_2D;
+            pHamilt->matrix(hk_2D, sk_2D);
+            int desc_tmp[9];
+            T* hk_local_ptr = hk_mat.data();
+            T* sk_local_ptr = sk_mat.data();
+            std::copy(mat_para_local.desc, mat_para_local.desc + 9, desc_tmp);
+            if( !is_active)
+            {
+                desc_tmp[1] = -1;
+            }
+
+            Cpxgemr2d(nrow, ncol, hk_2D.p, 1, 1, mat_para_global.desc,
+                      hk_local_ptr, 1, 1, desc_tmp,
+                      mat_para_global.blacs_ctxt);
+            Cpxgemr2d(nrow, ncol, sk_2D.p, 1, 1, mat_para_global.desc,
+                      sk_local_ptr, 1, 1, desc_tmp,
+                      mat_para_global.blacs_ctxt);
+        }
+
+        // diagonalize the Hamiltonian matrix using cusolver
+        psi::Psi<T> psi_local{};
+        if(kpt_assigned != -1)
+        {
+            psi_local.resize(1, ncol, nrow);
+            DiagoCusolver<T> cu{};
+            hamilt::MatrixBlock<T> hk_local = hamilt::MatrixBlock<T>{
+                hk_mat.data(), (size_t)nrow, (size_t)ncol,
+                mat_para_local.desc};
+            hamilt::MatrixBlock<T> sk_local = hamilt::MatrixBlock<T>{
+                sk_mat.data(), (size_t)nrow, (size_t)ncol,
+                mat_para_local.desc};
+            cu.diag(hk_local, sk_local, psi_local, &(pes->ekb(kpt_assigned, 0)));
+        }
+
+        // transfer the eigenvectors and eigenvalues to all processes
+        for(int ik = ik_start; ik < ik_start + total_active_ranks && ik < nks; ik++)
+        {
+            int root = active_ranks[ik % total_active_ranks];
+            MPI_Bcast(&(pes->ekb(ik, 0)), nbands, MPI_DOUBLE, root, MPI_COMM_WORLD);
+            int desc_pool[9];
+            std::copy(psi_para_local.desc, psi_para_local.desc + 9, desc_pool);
+            T* psi_local_ptr = nullptr;
+            if (world_rank != root)
+            {
+                desc_pool[1] = -1;
+            }else
+            {
+                psi_local_ptr = psi_local.get_pointer();
+            }
+            psi.fix_k(ik);
+            Cpxgemr2d(nrow,
+                      nbands,
+                      psi_local_ptr,
+                      1,
+                      1,
+                      desc_pool,
+                      psi.get_pointer(),
+                      1,
+                      1,
+                      psi_para_global.desc,
+                      psi_para_global.blacs_ctxt);
+        }
+    }
+
+    MPI_Comm_free(&self_comm);
+    MPI_Comm_free(&nodeComm);
+    ModuleBase::timer::tick("HSolverLCAO", "parakSolve");
+}
+#endif
+
+
 template class HSolverLCAO<double>;
 template class HSolverLCAO<std::complex<double>>;
diff --git a/source/source_hsolver/hsolver_lcao.h b/source/source_hsolver/hsolver_lcao.h
index 8ee0a15576..a1905fbcac 100644
--- a/source/source_hsolver/hsolver_lcao.h
+++ b/source/source_hsolver/hsolver_lcao.h
@@ -24,6 +24,11 @@ class HSolverLCAO
     void parakSolve(hamilt::Hamilt<T>* pHamilt, psi::Psi<T>& psi, elecstate::ElecState* pes, int kpar); // for kpar_lcao > 1
 
+    // The solving algorithm using cusolver is different from others, so a separate function is needed
+    void parakSolve_cusolver(hamilt::Hamilt<T>* pHamilt,
+                             psi::Psi<T>& psi,
+                             elecstate::ElecState* pes);
+
     const Parallel_Orbitals* ParaV;
 
     const std::string method;