From 31baa6b15dae061ae6263d5742182ec79e172a9a Mon Sep 17 00:00:00 2001 From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com> Date: Sun, 12 Jan 2025 19:04:55 +0800 Subject: [PATCH 1/8] Subsitute gemm for einsum in rotate_wf --- source/module_hsolver/diago_bpcg.cpp | 40 +++++++++++++++++----------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/source/module_hsolver/diago_bpcg.cpp b/source/module_hsolver/diago_bpcg.cpp index 7830af2f67..5aab4f5d0e 100644 --- a/source/module_hsolver/diago_bpcg.cpp +++ b/source/module_hsolver/diago_bpcg.cpp @@ -115,6 +115,8 @@ void DiagoBPCG::orth_cholesky( hsub_out.data(), this->n_band); //ldc + // Parallel_Reduce::reduce_pool(hsub_out.data(), this->n_band * this->n_band); + // set hsub matrix to lower format; ct::kernels::set_matrix()( 'L', hsub_out.data(), this->n_band); @@ -184,6 +186,8 @@ void DiagoBPCG::orth_projection( hsub_in.data(), this->n_band); //ldc + // Parallel_Reduce::reduce_pool(hsub_in.data(), this->n_band * this->n_band); + // set_matrix_op()('L', hsub_in->data(), this->n_band); option = ct::EinsumOption( /*conj_x=*/false, /*conj_y=*/false, /*alpha=*/-1.0, /*beta=*/1.0, /*Tensor out=*/&grad_out); @@ -205,6 +209,8 @@ void DiagoBPCG::orth_projection( grad_out.data(), this->n_basis); //ldc + // Parallel_Reduce::reduce_pool(grad_out.data(), this->n_basis * this->n_band); + return; } @@ -216,25 +222,27 @@ void DiagoBPCG::rotate_wf( { ct::EinsumOption option( /*conj_x=*/false, /*conj_y=*/false, /*alpha=*/1.0, /*beta=*/0.0, /*Tensor out=*/&workspace_in); - workspace_in = ct::op::einsum("ij,jk->ik", hsub_in, psi_out, option); + // workspace_in = ct::op::einsum("ij,jk->ik", hsub_in, psi_out, option); // this->rotate_wf(hsub_out, psi_out, workspace_in); // this->orth_cholesky(this->work, this->psi, this->hpsi, this->hsub); // gemm: workspace_in(n_basis x n_band) = psi_out(n_basis x n_band) * hsub_in(n_band x n_band) - // gemm_op()(this->ctx, - // 'N', - // 'N', - // this->n_basis, //m - // this->n_band, //n - // this->n_band, //k - // this->one, //1.0 - // psi_out.data(), - // this->n_basis, //lda - // hsub_in.data(), - // this->n_band, //ldb - // this->zero, //0.0 - // workspace_in.data(), - // this->n_basis); //ldc + gemm_op()(this->ctx, + 'N', + 'N', + this->n_basis, //m + this->n_band, //n + this->n_band, //k + this->one, //1.0 + psi_out.data(), + this->n_basis, //lda + hsub_in.data(), + this->n_band, //ldb + this->zero, //0.0 + workspace_in.data(), + this->n_basis); //ldc + + // Parallel_Reduce::reduce_pool(workspace_in.data(), this->n_basis * this->n_band); syncmem_complex_op()(psi_out.template data(), workspace_in.template data(), this->n_band * this->n_basis); @@ -281,6 +289,8 @@ void DiagoBPCG::diag_hsub( hsub_out.data(), this->n_band); //ldc + // Parallel_Reduce::reduce_pool(hsub_out.data(), this->n_band * this->n_band); + ct::kernels::lapack_dnevd()('V', 'U', hsub_out.data(), this->n_band, eigenvalue_out.data()); return; From 6114817e310a948bece2499016f1f45f20afba54 Mon Sep 17 00:00:00 2001 From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com> Date: Sun, 12 Jan 2025 19:54:42 +0800 Subject: [PATCH 2/8] Add planewave parallel support for inner-produce like gemm_op in bpcg --- source/module_hsolver/diago_bpcg.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/source/module_hsolver/diago_bpcg.cpp b/source/module_hsolver/diago_bpcg.cpp index 5aab4f5d0e..c1d5d5afb3 100644 --- a/source/module_hsolver/diago_bpcg.cpp +++ b/source/module_hsolver/diago_bpcg.cpp @@ -115,7 +115,7 @@ void DiagoBPCG::orth_cholesky( hsub_out.data(), this->n_band); //ldc - // Parallel_Reduce::reduce_pool(hsub_out.data(), this->n_band * this->n_band); + Parallel_Reduce::reduce_pool(hsub_out.data(), this->n_band * this->n_band); // set hsub matrix to lower format; ct::kernels::set_matrix()( @@ -186,7 +186,7 @@ void DiagoBPCG::orth_projection( hsub_in.data(), this->n_band); //ldc - // Parallel_Reduce::reduce_pool(hsub_in.data(), this->n_band * this->n_band); + Parallel_Reduce::reduce_pool(hsub_in.data(), this->n_band * this->n_band); // set_matrix_op()('L', hsub_in->data(), this->n_band); option = ct::EinsumOption( @@ -209,6 +209,7 @@ void DiagoBPCG::orth_projection( grad_out.data(), this->n_basis); //ldc + // * This type of non inner produce like operation does not need reduce! // Parallel_Reduce::reduce_pool(grad_out.data(), this->n_basis * this->n_band); return; @@ -242,6 +243,7 @@ void DiagoBPCG::rotate_wf( workspace_in.data(), this->n_basis); //ldc + // * This type of non inner produce like operation does not need reduce! // Parallel_Reduce::reduce_pool(workspace_in.data(), this->n_basis * this->n_band); syncmem_complex_op()(psi_out.template data(), workspace_in.template data(), this->n_band * this->n_basis); @@ -289,7 +291,7 @@ void DiagoBPCG::diag_hsub( hsub_out.data(), this->n_band); //ldc - // Parallel_Reduce::reduce_pool(hsub_out.data(), this->n_band * this->n_band); + Parallel_Reduce::reduce_pool(hsub_out.data(), this->n_band * this->n_band); ct::kernels::lapack_dnevd()('V', 'U', hsub_out.data(), this->n_band, eigenvalue_out.data()); From 91f62f6157b93e6ed8ab0a156da703227f73c77e Mon Sep 17 00:00:00 2001 From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com> Date: Sun, 12 Jan 2025 20:24:41 +0800 Subject: [PATCH 3/8] Add reduce for dot ops used in bpcg --- source/module_hsolver/kernels/math_kernel_op.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/source/module_hsolver/kernels/math_kernel_op.cpp b/source/module_hsolver/kernels/math_kernel_op.cpp index c3784949ab..a8ada66cc5 100644 --- a/source/module_hsolver/kernels/math_kernel_op.cpp +++ b/source/module_hsolver/kernels/math_kernel_op.cpp @@ -24,6 +24,7 @@ struct line_minimize_with_block_op Real theta = 0.0, cos_theta = 0.0, sin_theta = 0.0; auto A = reinterpret_cast(grad_out + band_idx * n_basis_max); Real norm = BlasConnector::dot(2 * n_basis, A, 1, A, 1); + Parallel_Reduce::reduce_pool(norm); norm = 1.0 / sqrt(norm); for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) { @@ -71,6 +72,7 @@ struct calc_grad_with_block_op T grad_1 = {0.0, 0.0}; auto A = reinterpret_cast(psi_out + band_idx * n_basis_max); Real norm = BlasConnector::dot(2 * n_basis, A, 1, A, 1); + Parallel_Reduce::reduce_pool(norm); norm = 1.0 / sqrt(norm); for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) { From b025b62c23eeca70e90792af322c74249057b8f2 Mon Sep 17 00:00:00 2001 From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com> Date: Sun, 12 Jan 2025 20:54:54 +0800 Subject: [PATCH 4/8] Add reduce for manual inner product(for loop) ops used in bpcg --- source/module_hsolver/kernels/math_kernel_op.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/source/module_hsolver/kernels/math_kernel_op.cpp b/source/module_hsolver/kernels/math_kernel_op.cpp index a8ada66cc5..3a752c3659 100644 --- a/source/module_hsolver/kernels/math_kernel_op.cpp +++ b/source/module_hsolver/kernels/math_kernel_op.cpp @@ -35,6 +35,9 @@ struct line_minimize_with_block_op epsilo_1 += std::real(grad_out[item] * std::conj(hpsi_out[item])); epsilo_2 += std::real(grad_out[item] * std::conj(hgrad_out[item])); } + Parallel_Reduce::reduce_pool(epsilo_0); + Parallel_Reduce::reduce_pool(epsilo_1); + Parallel_Reduce::reduce_pool(epsilo_2); theta = 0.5 * std::abs(std::atan(2 * epsilo_1 / (epsilo_0 - epsilo_2))); cos_theta = std::cos(theta); sin_theta = std::sin(theta); @@ -81,6 +84,7 @@ struct calc_grad_with_block_op hpsi_out[item] *= norm; epsilo += std::real(hpsi_out[item] * std::conj(psi_out[item])); } + Parallel_Reduce::reduce_pool(epsilo); for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) { auto item = band_idx * n_basis_max + basis_idx; @@ -89,6 +93,8 @@ struct calc_grad_with_block_op err += grad_2; beta += grad_2 / prec_in[basis_idx]; /// Mark here as we should div the prec? } + Parallel_Reduce::reduce_pool(err); + Parallel_Reduce::reduce_pool(beta); for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) { auto item = band_idx * n_basis_max + basis_idx; From d975f41fddca2b4b1f6f189a19ee5e20c2dca31e Mon Sep 17 00:00:00 2001 From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com> Date: Mon, 13 Jan 2025 16:59:49 +0800 Subject: [PATCH 5/8] Update docs now that BPCG supports plane wave parallelization. --- docs/advanced/scf/hsolver.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/advanced/scf/hsolver.md b/docs/advanced/scf/hsolver.md index f002c2bfda..0cb02ce9dd 100644 --- a/docs/advanced/scf/hsolver.md +++ b/docs/advanced/scf/hsolver.md @@ -4,7 +4,7 @@ Method of explicit solving KS-equation can be chosen by variable "ks_solver" in INPUT file. -When "basis_type = pw", `ks_solver` can be `cg`, `bpcg` or `dav`. The `bpcg` method only supports K-point parallelism currently. The default setting `cg` is recommended, which is band-by-band conjugate gradient diagonalization method. There is a large probability that the use of setting of `dav` , which is block Davidson diagonalization method, can be tried to improve performance. +When "basis_type = pw", `ks_solver` can be `cg`, `bpcg` or `dav`. The default setting `cg` is recommended, which is band-by-band conjugate gradient diagonalization method. There is a large probability that the use of setting of `dav` , which is block Davidson diagonalization method, can be tried to improve performance. When "basis_type = lcao", `ks_solver` can be `genelpa` or `scalapack_gvx`. The default setting `genelpa` is recommended, which is based on ELPA (EIGENVALUE SOLVERS FOR PETAFLOP APPLICATIONS) (https://elpa.mpcdf.mpg.de/) and the kernel is auto choosed by GENELPA(https://github.com/pplab/GenELPA), usually faster than the setting of "scalapack_gvx", which is based on ScaLAPACK(Scalable Linear Algebra PACKage) From f3c3d75e24675134e1e6a048c9b91bb470f39ffa Mon Sep 17 00:00:00 2001 From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com> Date: Mon, 13 Jan 2025 21:20:20 +0800 Subject: [PATCH 6/8] Update Autotest.sh to run BPCG test with MPI np=4 --- tests/integrate/Autotest.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integrate/Autotest.sh b/tests/integrate/Autotest.sh index 674199f96a..fcdc88a0d8 100755 --- a/tests/integrate/Autotest.sh +++ b/tests/integrate/Autotest.sh @@ -250,7 +250,7 @@ for dir in $testdir; do TIMEFORMAT='[----------] Time elapsed: %R seconds' #parallel test time { - if [ "$case" = "282_NO_RPA" -o "$dir" = "102_PW_BPCG" ]; then + if [ "$case" = "282_NO_RPA" ]; then mpirun -np 1 $abacus > log.txt else mpirun -np $np $abacus > log.txt From 4cfa47da2c8286448ebd42db3d7dffc764bbc383 Mon Sep 17 00:00:00 2001 From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com> Date: Mon, 13 Jan 2025 21:26:11 +0800 Subject: [PATCH 7/8] remove unused code and redundancies --- source/module_hsolver/diago_bpcg.cpp | 9 ++------- source/module_hsolver/diago_bpcg.h | 3 +++ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/source/module_hsolver/diago_bpcg.cpp b/source/module_hsolver/diago_bpcg.cpp index c1d5d5afb3..846bef9ff8 100644 --- a/source/module_hsolver/diago_bpcg.cpp +++ b/source/module_hsolver/diago_bpcg.cpp @@ -169,7 +169,6 @@ void DiagoBPCG::orth_projection( /*conj_x=*/false, /*conj_y=*/true, /*alpha=*/1.0, /*beta=*/0.0, /*Tensor out=*/&hsub_in); // hsub_in = ct::op::einsum("ij,kj->ik", grad_out, psi_in, option); - // this->orth_projection(this->psi, this->hsub, this->grad); // gemm: hsub_in(n_band x n_band) = psi_in^T(n_band x n_basis) * grad_out(n_basis x n_band) gemm_op()(this->ctx, 'C', @@ -209,8 +208,7 @@ void DiagoBPCG::orth_projection( grad_out.data(), this->n_basis); //ldc - // * This type of non inner produce like operation does not need reduce! - // Parallel_Reduce::reduce_pool(grad_out.data(), this->n_basis * this->n_band); + // * This type of non inner product like operation does not need reduce! return; } @@ -225,8 +223,6 @@ void DiagoBPCG::rotate_wf( /*conj_x=*/false, /*conj_y=*/false, /*alpha=*/1.0, /*beta=*/0.0, /*Tensor out=*/&workspace_in); // workspace_in = ct::op::einsum("ij,jk->ik", hsub_in, psi_out, option); - // this->rotate_wf(hsub_out, psi_out, workspace_in); - // this->orth_cholesky(this->work, this->psi, this->hpsi, this->hsub); // gemm: workspace_in(n_basis x n_band) = psi_out(n_basis x n_band) * hsub_in(n_band x n_band) gemm_op()(this->ctx, 'N', @@ -243,8 +239,7 @@ void DiagoBPCG::rotate_wf( workspace_in.data(), this->n_basis); //ldc - // * This type of non inner produce like operation does not need reduce! - // Parallel_Reduce::reduce_pool(workspace_in.data(), this->n_basis * this->n_band); + // * This type of non inner product like operation does not need reduce! syncmem_complex_op()(psi_out.template data(), workspace_in.template data(), this->n_band * this->n_basis); diff --git a/source/module_hsolver/diago_bpcg.h b/source/module_hsolver/diago_bpcg.h index 44ddd9736f..a80c1406b6 100644 --- a/source/module_hsolver/diago_bpcg.h +++ b/source/module_hsolver/diago_bpcg.h @@ -340,6 +340,9 @@ class DiagoBPCG using resmem_complex_op = ct::kernels::resize_memory; using syncmem_complex_op = ct::kernels::synchronize_memory; + // note: these operators use template parameter base_device::Device_* + // defined in module_base/module_device/types.h + // different from ct_Device! using calc_grad_with_block_op = hsolver::calc_grad_with_block_op; using line_minimize_with_block_op = hsolver::line_minimize_with_block_op; using gemm_op = hsolver::gemm_op; From 0d59d1ba86b793982fc2f36f74b1a0f08b22e8ea Mon Sep 17 00:00:00 2001 From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com> Date: Mon, 13 Jan 2025 21:49:09 +0800 Subject: [PATCH 8/8] Update result.ref for BPCG multicore test --- tests/integrate/102_PW_BPCG/result.ref | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integrate/102_PW_BPCG/result.ref b/tests/integrate/102_PW_BPCG/result.ref index 2972395a15..4815e05cb4 100644 --- a/tests/integrate/102_PW_BPCG/result.ref +++ b/tests/integrate/102_PW_BPCG/result.ref @@ -1,7 +1,7 @@ etotref -4869.74705201 etotperatomref -2434.87352600 -totalforceref 5.19483000 -totalstressref 37241.45334600 +totalforceref 5.19522000 +totalstressref 37241.49490600 pointgroupref C_1 spacegroupref C_1 nksibzref 8