diff --git a/source/source_hsolver/diago_cusolver.cpp b/source/source_hsolver/diago_cusolver.cpp
index a42c7844ba..960c4ec5d7 100644
--- a/source/source_hsolver/diago_cusolver.cpp
+++ b/source/source_hsolver/diago_cusolver.cpp
@@ -15,24 +15,13 @@ using complex = std::complex<double>;
 // Namespace for the diagonalization solver
 namespace hsolver
 {
-// this struct is used for collecting matrices from all processes to root process
-template <typename T>
-struct Matrix_g
-{
-    std::shared_ptr<T> p;
-    size_t row;
-    size_t col;
-    std::shared_ptr<int> desc;
-};
-
 // Initialize the DecomposedState variable for real and complex numbers
 template <typename T>
 int DiagoCusolver<T>::DecomposedState = 0;
 
 template <typename T>
-DiagoCusolver<T>::DiagoCusolver(const Parallel_Orbitals* ParaV)
+DiagoCusolver<T>::DiagoCusolver()
 {
-    this->ParaV = ParaV;
 }
 
 template <typename T>
@@ -40,162 +29,25 @@ DiagoCusolver<T>::~DiagoCusolver()
 {
 }
 
-// Wrapper for pdgemr2d and pzgemr2d
-// static inline void Cpxgemr2d(
-//     const int M, const int N,
-//     double *a, const int ia, const int ja, const int *desca,
-//     double *b, const int ib, const int jb, const int *descb,
-//     const int blacs_ctxt)
-//{
-//    pdgemr2d_(&M, &N,
-//              a, &ia, &ja, desca,
-//              b, &ib, &jb, descb,
-//              &blacs_ctxt);
-//}
-//
-// static inline void Cpxgemr2d(
-//     const int M, const int N,
-//     complex *a, const int ia, const int ja, const int *desca,
-//     complex *b, const int ib, const int jb, const int *descb,
-//     const int blacs_ctxt)
-//{
-//    pzgemr2d_(&M, &N,
-//              a, &ia, &ja, desca,
-//              b, &ib, &jb, descb,
-//              &blacs_ctxt);
-//}
-
-// Use Cpxgemr2d to collect matrices from all processes to root process
-template <typename mat, typename matg>
-static void gatherMatrix(const int myid, const int root_proc, const mat& mat_l, matg& mat_g)
-{
-    auto a = mat_l.p;
-    const int* desca = mat_l.desc;
-    int ctxt = desca[1];
-    int nrows = desca[2];
-    int ncols = desca[3];
-
-    if (myid == root_proc)
-    {
-        mat_g.p.reset(new typename std::remove_reference<decltype(*a)>::type[nrows * ncols]);
-    }
-    else
-    {
-        mat_g.p.reset(new typename std::remove_reference<decltype(*a)>::type[1]);
-    }
-
-    // Set descb, which has all elements in the only block in the root process
-    mat_g.desc.reset(new int[9]{1, ctxt, nrows, ncols, nrows, ncols, 0, 0, nrows});
-
-    mat_g.row = nrows;
-    mat_g.col = ncols;
-
-    Cpxgemr2d(nrows, ncols, a, 1, 1, const_cast<int*>(desca), mat_g.p.get(), 1, 1, mat_g.desc.get(), ctxt);
-}
-
-// Convert the Psi to a 2D block storage format
-template <typename T>
-static void distributePsi(const int* desc_psi, T* psi, T* psi_g)
-{
-    int ctxt = desc_psi[1];
-    int nrows = desc_psi[2];
-    int ncols = desc_psi[3];
-    int rsrc = desc_psi[6];
-    int csrc = desc_psi[7];
-
-    int descg[9] = {1, ctxt, nrows, ncols, nrows, ncols, rsrc, csrc, nrows};
-    int descl[9];
-
-    std::copy(desc_psi, desc_psi + 9, descl);
-
-    Cpxgemr2d(nrows, ncols, psi_g, 1, 1, descg, psi, 1, 1, descl, ctxt);
-}
-
 // Diagonalization function
 template <typename T>
-void DiagoCusolver<T>::diag(hamilt::Hamilt<T>* phm_in, psi::Psi<T>& psi, Real* eigenvalue_in)
+void DiagoCusolver<T>::diag(
+    hamilt::MatrixBlock<T>& h_mat,
+    hamilt::MatrixBlock<T>& s_mat,
+    psi::Psi<T>& psi,
+    Real* eigenvalue_in)
 {
-    // Output the title for the current operation
     ModuleBase::TITLE("DiagoCusolver", "diag");
-
-    // Create matrices for the Hamiltonian and overlap
-    hamilt::MatrixBlock<T> h_mat;
-    hamilt::MatrixBlock<T> s_mat;
-    phm_in->matrix(h_mat, s_mat);
-
-#ifdef __MPI
-    // global matrix
-    Matrix_g<T> h_mat_g;
-    Matrix_g<T> s_mat_g;
-
-    // get the context and process information
-    int ctxt = ParaV->blacs_ctxt;
-    int nprows = 0;
-    int npcols = 0;
-    int myprow = 0;
-    int mypcol = 0;
-    Cblacs_gridinfo(ctxt, &nprows, &npcols, &myprow, &mypcol);
-    const int num_procs = nprows * npcols;
-    const int myid = Cblacs_pnum(ctxt, myprow, mypcol);
-    const int root_proc = Cblacs_pnum(ctxt, ParaV->desc[6], ParaV->desc[7]);
-#endif
-
+    ModuleBase::timer::tick("DiagoCusolver", "cusolver");
     // Allocate memory for eigenvalues
     std::vector<double> eigen(PARAM.globalv.nlocal, 0.0);
-
-    // Start the timer for the cusolver operation
-    ModuleBase::timer::tick("DiagoCusolver", "cusolver");
-
-#ifdef __MPI
-    if (num_procs > 1)
-    {
-        // gather matrices from processes to root process
-        gatherMatrix(myid, root_proc, h_mat, h_mat_g);
-        gatherMatrix(myid, root_proc, s_mat, s_mat_g);
-    }
-#endif
-
-    // Call the dense diagonalization routine
-#ifdef __MPI
-    if (num_procs > 1)
-    {
-        MPI_Barrier(MPI_COMM_WORLD);
-        // global psi for distribute
-        int psi_len = myid == root_proc ? h_mat_g.row * h_mat_g.col : 1;
-        std::vector<T> psi_g(psi_len);
-        if (myid == root_proc)
-        {
-            this->dc.Dngvd(h_mat_g.col, h_mat_g.row, h_mat_g.p.get(), s_mat_g.p.get(), eigen.data(), psi_g.data());
-        }
-
-        MPI_Barrier(MPI_COMM_WORLD);
-
-        // broadcast eigenvalues to all processes
-        MPI_Bcast(eigen.data(), PARAM.inp.nbands, MPI_DOUBLE, root_proc, MPI_COMM_WORLD);
-
-        // distribute psi to all processes
-        distributePsi(this->ParaV->desc_wfc, psi.get_pointer(), psi_g.data());
-    }
-    else
-    {
-        // Be careful that h_mat.row * h_mat.col != psi.get_nbands() * psi.get_nbasis() under multi-k situation
-        std::vector<T> eigenvectors(h_mat.row * h_mat.col);
-        this->dc.Dngvd(h_mat.row, h_mat.col, h_mat.p, s_mat.p, eigen.data(), eigenvectors.data());
-        const int size = psi.get_nbands() * psi.get_nbasis();
-        BlasConnector::copy(size, eigenvectors.data(), 1, psi.get_pointer(), 1);
-    }
-#else
     std::vector<T> eigenvectors(h_mat.row * h_mat.col);
     this->dc.Dngvd(h_mat.row, h_mat.col, h_mat.p, s_mat.p, eigen.data(), eigenvectors.data());
     const int size = psi.get_nbands() * psi.get_nbasis();
     BlasConnector::copy(size, eigenvectors.data(), 1, psi.get_pointer(), 1);
-#endif
-    // Stop the timer for the cusolver operation
-    ModuleBase::timer::tick("DiagoCusolver", "cusolver");
-
-    // Copy the eigenvalues to the output arrays
     const int inc = 1;
     BlasConnector::copy(PARAM.inp.nbands, eigen.data(), inc, eigenvalue_in, inc);
+    ModuleBase::timer::tick("DiagoCusolver", "cusolver");
 }
 
 // Explicit instantiation of the DiagoCusolver class for real and complex numbers
diff --git a/source/source_hsolver/diago_cusolver.h b/source/source_hsolver/diago_cusolver.h
index bdfdaa86ee..5f895ea21d 100644
--- a/source/source_hsolver/diago_cusolver.h
+++ b/source/source_hsolver/diago_cusolver.h
@@ -17,15 +17,18 @@ class DiagoCusolver
 {
   private:
     // Real is the real part of the complex type T
    using Real = typename GetTypeReal<T>::type;
-    Parallel_Orbitals const * ParaV;
 
   public:
-    DiagoCusolver(const Parallel_Orbitals* ParaV = nullptr);
+    DiagoCusolver();
     ~DiagoCusolver();
 
     // Override the diag function for CUSOLVER diagonalization
-    void diag(hamilt::Hamilt<T>* phm_in, psi::Psi<T>& psi, Real* eigenvalue_in);
+    void diag(
+        hamilt::MatrixBlock<T>& h_mat,
+        hamilt::MatrixBlock<T>& s_mat,
+        psi::Psi<T>& psi,
+        Real* eigenvalue_in);
 
     // Static variable to keep track of the decomposition state
     static int DecomposedState;
diff --git a/source/source_hsolver/hsolver_lcao.cpp b/source/source_hsolver/hsolver_lcao.cpp
index 2a168f8b24..a9d940216b 100644
--- a/source/source_hsolver/hsolver_lcao.cpp
+++ b/source/source_hsolver/hsolver_lcao.cpp
@@ -48,14 +48,20 @@ void HSolverLCAO<T>::solve(hamilt::Hamilt<T>* pHamilt,
 
     if (this->method != "pexsi")
     {
+        #ifdef __MPI
+        #ifdef __CUDA
+        if (this->method == "cusolver" && GlobalV::NPROC > 1)
+        {
+            this->parakSolve_cusolver(pHamilt, psi, pes);
+        }else
+        #endif
         if (PARAM.globalv.kpar_lcao > 1
             && (this->method == "genelpa" || this->method == "elpa" || this->method == "scalapack_gvx"))
         {
-#ifdef __MPI
             this->parakSolve(pHamilt, psi, pes, PARAM.globalv.kpar_lcao);
-#endif
-        }
-        else if (PARAM.globalv.kpar_lcao == 1)
+        } else
+        #endif
+        if (PARAM.globalv.kpar_lcao == 1)
         {
             /// Loop over k points for solve Hamiltonian to eigenpairs(eigenvalues and eigenvectors).
             for (int ik = 0; ik < psi.get_nk(); ++ik)
@@ -151,8 +157,11 @@ void HSolverLCAO<T>::hamiltSolvePsiK(hamilt::Hamilt<T>* hm, psi::Psi<T>&
 #ifdef __CUDA
     else if (this->method == "cusolver")
     {
-        DiagoCusolver<T> cs(this->ParaV);
-        cs.diag(hm, psi, eigenvalue);
+        // Note: This branch will only be executed in the single-process case
+        DiagoCusolver<T> cu;
+        hamilt::MatrixBlock<T> hk, sk;
+        hm->matrix(hk, sk);
+        cu.diag(hk, sk, psi, eigenvalue);
     }
 #ifdef __CUSOLVERMP
     else if (this->method == "cusolvermp")
@@ -295,6 +304,191 @@ void HSolverLCAO<T>::parakSolve(hamilt::Hamilt<T>* pHamilt,
 #endif
 }
 
+#if defined (__MPI) && defined (__CUDA)
+template <typename T>
+void HSolverLCAO<T>::parakSolve_cusolver(hamilt::Hamilt<T>* pHamilt,
+                                         psi::Psi<T>& psi,
+                                         elecstate::ElecState* pes)
+{
+    ModuleBase::timer::tick("HSolverLCAO", "parakSolve");
+    const int dev_id = base_device::information::set_device_by_rank();
+
+    int world_rank, world_size;
+    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
+
+    // Split communicator by shared memory node
+    MPI_Comm nodeComm;
+    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, world_rank, MPI_INFO_NULL, &nodeComm);
+
+    int local_rank, local_size;
+    MPI_Comm_rank(nodeComm, &local_rank);
+    MPI_Comm_size(nodeComm, &local_size);
+
+    // Get number of CUDA devices on this node
+    int device_count = 0;
+    cudaError_t cuda_err = cudaGetDeviceCount(&device_count);
+    if (cuda_err != cudaSuccess) {
+        device_count = 0; // Treat as no GPU available
+    }
+
+    if(local_rank >= device_count) {
+        local_rank = -1; // Mark as inactive for GPU work
+    }
+
+    // Determine the number of MPI processes on this node that can actively use a GPU.
+    // This is the minimum of:
+    //   - The number of available MPI processes on the node (local_size)
+    //   - The number of available CUDA-capable GPUs on the node (device_count)
+    // Each GPU is assumed to be used by one dedicated MPI process.
+    // Thus, only the first 'min(local_size, device_count)' ranks on this node
+    // will be assigned GPU work; the rest will be inactive or used for communication-only roles.
+    int active_procs_per_node = std::min(local_size, device_count);
+
+    std::vector<int> all_active_procs(world_size);
+    std::vector<int> all_local_ranks(world_size);
+
+    MPI_Allgather(&active_procs_per_node, 1, MPI_INT,
+                  all_active_procs.data(), 1, MPI_INT, MPI_COMM_WORLD);
+    MPI_Allgather(&local_rank, 1, MPI_INT,
+                  all_local_ranks.data(), 1, MPI_INT, MPI_COMM_WORLD);
+
+    int total_active_ranks = 0;
+    int total_nodes = 0;
+    int highest_active_rank = 0;
+
+    for (int i = 0; i < world_size; ++i) {
+        if (all_local_ranks[i] == 0) { // new node
+            total_nodes++;
+            total_active_ranks += all_active_procs[i];
+            highest_active_rank = std::max(highest_active_rank, all_active_procs[i] - 1);
+        }
+    }
+
+    // active_ranks will store the global ranks of all active processes across all nodes
+    // The order of global ranks stored here determines the order in which they will be assigned to k-points.
+    // The k-points will be distributed among these ranks in a round-robin fashion.
+    // The purpose of setting the order is to ensure load balancing among nodes as much as possible
+    std::vector<int> active_ranks;
+    for(int i = 0; i <= highest_active_rank; i++)
+    {
+        for(int j = 0; j < world_size; j++)
+        {
+            if(all_local_ranks[j] == i)
+            {
+                active_ranks.push_back(j);
+            }
+        }
+    }
+
+    const int nks = psi.get_nk(); // total number of k points
+    const int nbands = this->ParaV->get_nbands();
+    // Set the parallel storage scheme for the matrix and psi
+    Parallel_2D mat_para_global; // store the info about how the origin matrix is distributed in parallel
+    Parallel_2D mat_para_local; // store the info about how the matrix is distributed after collected from all processes
+    Parallel_2D psi_para_global; // store the info about how the psi is distributed in parallel
+    Parallel_2D psi_para_local; // store the info about how the psi is distributed before distributing to all processes
+
+    MPI_Comm self_comm; // the communicator that only contains the current process itself
+    MPI_Comm_split(MPI_COMM_WORLD, world_rank, 0, &self_comm);
+    int nrow = this->ParaV->get_global_row_size(); // number of rows in the global matrix
+    int ncol = nrow;
+    int nb2d = this->ParaV->get_block_size(); // block size for the 2D matrix distribution
+    mat_para_global.init(nrow, ncol, nb2d, MPI_COMM_WORLD);
+    psi_para_global.init(nrow, nbands, nb2d, MPI_COMM_WORLD);
+    mat_para_local.init(nrow, ncol, nb2d, self_comm);
+    psi_para_local.init(nrow, ncol, nb2d, self_comm);
+
+    std::vector<T> hk_mat; // temporary storage for H(k) matrix collected from all processes
+    std::vector<T> sk_mat; // temporary storage for S(k) matrix collected from all processes
+    // In each iteration, we process total_active_ranks k-points.
+    for(int ik_start = 0; ik_start < nks; ik_start += total_active_ranks)
+    {
+        int kpt_assigned = -1; // the k-point assigned to the current MPI process in this iteration
+        // Compute and gather the hk and sk matrices distributed across different processes in parallel,
+        // preparing for subsequent transfer to the GPU for computation.
+        for(int ik = ik_start; ik < ik_start + total_active_ranks && ik < nks; ik++)
+        {
+            // `is_active` indicates whether this MPI process is assigned to compute the current k-point
+            bool is_active = world_rank == active_ranks[ik % total_active_ranks];
+            if (is_active)
+            {
+                kpt_assigned = ik;
+                hk_mat.resize(nrow * ncol);
+                sk_mat.resize(nrow * ncol);
+            }
+            pHamilt->updateHk(ik);
+            hamilt::MatrixBlock<T> hk_2D, sk_2D;
+            pHamilt->matrix(hk_2D, sk_2D);
+            int desc_tmp[9];
+            T* hk_local_ptr = hk_mat.data();
+            T* sk_local_ptr = sk_mat.data();
+            std::copy(mat_para_local.desc, mat_para_local.desc + 9, desc_tmp);
+            if( !is_active)
+            {
+                desc_tmp[1] = -1;
+            }
+
+            Cpxgemr2d(nrow, ncol, hk_2D.p, 1, 1, mat_para_global.desc,
+                      hk_local_ptr, 1, 1, desc_tmp,
+                      mat_para_global.blacs_ctxt);
+            Cpxgemr2d(nrow, ncol, sk_2D.p, 1, 1, mat_para_global.desc,
+                      sk_local_ptr, 1, 1, desc_tmp,
+                      mat_para_global.blacs_ctxt);
+        }
+
+        // diagonalize the Hamiltonian matrix using cusolver
+        psi::Psi<T> psi_local{};
+        if(kpt_assigned != -1)
+        {
+            psi_local.resize(1, ncol, nrow);
+            DiagoCusolver<T> cu{};
+            hamilt::MatrixBlock<T> hk_local = hamilt::MatrixBlock<T>{
+                hk_mat.data(), (size_t)nrow, (size_t)ncol,
+                mat_para_local.desc};
+            hamilt::MatrixBlock<T> sk_local = hamilt::MatrixBlock<T>{
+                sk_mat.data(), (size_t)nrow, (size_t)ncol,
+                mat_para_local.desc};
+            cu.diag(hk_local, sk_local, psi_local, &(pes->ekb(kpt_assigned, 0)));
+        }
+
+        // transfer the eigenvectors and eigenvalues to all processes
+        for(int ik = ik_start; ik < ik_start + total_active_ranks && ik < nks; ik++)
+        {
+            int root = active_ranks[ik % total_active_ranks];
+            MPI_Bcast(&(pes->ekb(ik, 0)), nbands, MPI_DOUBLE, root, MPI_COMM_WORLD);
+            int desc_pool[9];
+            std::copy(psi_para_local.desc, psi_para_local.desc + 9, desc_pool);
+            T* psi_local_ptr = nullptr;
+            if (world_rank != root)
+            {
+                desc_pool[1] = -1;
+            }else
+            {
+                psi_local_ptr = psi_local.get_pointer();
+            }
+            psi.fix_k(ik);
+            Cpxgemr2d(nrow,
+                      nbands,
+                      psi_local_ptr,
+                      1,
+                      1,
+                      desc_pool,
+                      psi.get_pointer(),
+                      1,
+                      1,
+                      psi_para_global.desc,
+                      psi_para_global.blacs_ctxt);
+        }
+    }
+
+    MPI_Comm_free(&self_comm);
+    MPI_Comm_free(&nodeComm);
+    ModuleBase::timer::tick("HSolverLCAO", "parakSolve");
+}
+#endif
+
+
 template class HSolverLCAO<double>;
 template class HSolverLCAO<std::complex<double>>;
diff --git a/source/source_hsolver/hsolver_lcao.h b/source/source_hsolver/hsolver_lcao.h
index 8ee0a15576..a1905fbcac 100644
--- a/source/source_hsolver/hsolver_lcao.h
+++ b/source/source_hsolver/hsolver_lcao.h
@@ -24,6 +24,11 @@ class HSolverLCAO
     void parakSolve(hamilt::Hamilt<T>* pHamilt, psi::Psi<T>& psi, elecstate::ElecState* pes, int kpar); // for kpar_lcao > 1
 
+    // The solving algorithm using cusolver is different from others, so a separate function is needed
+    void parakSolve_cusolver(hamilt::Hamilt<T>* pHamilt,
+                             psi::Psi<T>& psi,
+                             elecstate::ElecState* pes);
+
     const Parallel_Orbitals* ParaV;
 
     const std::string method;