diff --git a/docs/advanced/scf/hsolver.md b/docs/advanced/scf/hsolver.md index f002c2bfda..0cb02ce9dd 100644 --- a/docs/advanced/scf/hsolver.md +++ b/docs/advanced/scf/hsolver.md @@ -4,7 +4,7 @@ Method of explicit solving KS-equation can be chosen by variable "ks_solver" in INPUT file. -When "basis_type = pw", `ks_solver` can be `cg`, `bpcg` or `dav`. The `bpcg` method only supports K-point parallelism currently. The default setting `cg` is recommended, which is band-by-band conjugate gradient diagonalization method. There is a large probability that the use of setting of `dav` , which is block Davidson diagonalization method, can be tried to improve performance. +When "basis_type = pw", `ks_solver` can be `cg`, `bpcg` or `dav`. The default setting `cg` is recommended, which is band-by-band conjugate gradient diagonalization method. There is a large probability that the use of setting of `dav` , which is block Davidson diagonalization method, can be tried to improve performance. When "basis_type = lcao", `ks_solver` can be `genelpa` or `scalapack_gvx`. The default setting `genelpa` is recommended, which is based on ELPA (EIGENVALUE SOLVERS FOR PETAFLOP APPLICATIONS) (https://elpa.mpcdf.mpg.de/) and the kernel is auto choosed by GENELPA(https://github.com/pplab/GenELPA), usually faster than the setting of "scalapack_gvx", which is based on ScaLAPACK(Scalable Linear Algebra PACKage) diff --git a/source/module_hsolver/diago_bpcg.cpp b/source/module_hsolver/diago_bpcg.cpp index 7830af2f67..846bef9ff8 100644 --- a/source/module_hsolver/diago_bpcg.cpp +++ b/source/module_hsolver/diago_bpcg.cpp @@ -115,6 +115,8 @@ void DiagoBPCG::orth_cholesky( hsub_out.data(), this->n_band); //ldc + Parallel_Reduce::reduce_pool(hsub_out.data(), this->n_band * this->n_band); + // set hsub matrix to lower format; ct::kernels::set_matrix()( 'L', hsub_out.data(), this->n_band); @@ -167,7 +169,6 @@ void DiagoBPCG::orth_projection( /*conj_x=*/false, /*conj_y=*/true, /*alpha=*/1.0, /*beta=*/0.0, /*Tensor out=*/&hsub_in); // hsub_in = ct::op::einsum("ij,kj->ik", grad_out, psi_in, option); - // this->orth_projection(this->psi, this->hsub, this->grad); // gemm: hsub_in(n_band x n_band) = psi_in^T(n_band x n_basis) * grad_out(n_basis x n_band) gemm_op()(this->ctx, 'C', @@ -184,6 +185,8 @@ void DiagoBPCG::orth_projection( hsub_in.data(), this->n_band); //ldc + Parallel_Reduce::reduce_pool(hsub_in.data(), this->n_band * this->n_band); + // set_matrix_op()('L', hsub_in->data(), this->n_band); option = ct::EinsumOption( /*conj_x=*/false, /*conj_y=*/false, /*alpha=*/-1.0, /*beta=*/1.0, /*Tensor out=*/&grad_out); @@ -205,6 +208,8 @@ void DiagoBPCG::orth_projection( grad_out.data(), this->n_basis); //ldc + // * This type of non inner product like operation does not need reduce! + return; } @@ -216,25 +221,25 @@ void DiagoBPCG::rotate_wf( { ct::EinsumOption option( /*conj_x=*/false, /*conj_y=*/false, /*alpha=*/1.0, /*beta=*/0.0, /*Tensor out=*/&workspace_in); - workspace_in = ct::op::einsum("ij,jk->ik", hsub_in, psi_out, option); + // workspace_in = ct::op::einsum("ij,jk->ik", hsub_in, psi_out, option); - // this->rotate_wf(hsub_out, psi_out, workspace_in); - // this->orth_cholesky(this->work, this->psi, this->hpsi, this->hsub); // gemm: workspace_in(n_basis x n_band) = psi_out(n_basis x n_band) * hsub_in(n_band x n_band) - // gemm_op()(this->ctx, - // 'N', - // 'N', - // this->n_basis, //m - // this->n_band, //n - // this->n_band, //k - // this->one, //1.0 - // psi_out.data(), - // this->n_basis, //lda - // hsub_in.data(), - // this->n_band, //ldb - // this->zero, //0.0 - // workspace_in.data(), - // this->n_basis); //ldc + gemm_op()(this->ctx, + 'N', + 'N', + this->n_basis, //m + this->n_band, //n + this->n_band, //k + this->one, //1.0 + psi_out.data(), + this->n_basis, //lda + hsub_in.data(), + this->n_band, //ldb + this->zero, //0.0 + workspace_in.data(), + this->n_basis); //ldc + + // * This type of non inner product like operation does not need reduce! syncmem_complex_op()(psi_out.template data(), workspace_in.template data(), this->n_band * this->n_basis); @@ -281,6 +286,8 @@ void DiagoBPCG::diag_hsub( hsub_out.data(), this->n_band); //ldc + Parallel_Reduce::reduce_pool(hsub_out.data(), this->n_band * this->n_band); + ct::kernels::lapack_dnevd()('V', 'U', hsub_out.data(), this->n_band, eigenvalue_out.data()); return; diff --git a/source/module_hsolver/diago_bpcg.h b/source/module_hsolver/diago_bpcg.h index 44ddd9736f..a80c1406b6 100644 --- a/source/module_hsolver/diago_bpcg.h +++ b/source/module_hsolver/diago_bpcg.h @@ -340,6 +340,9 @@ class DiagoBPCG using resmem_complex_op = ct::kernels::resize_memory; using syncmem_complex_op = ct::kernels::synchronize_memory; + // note: these operators use template parameter base_device::Device_* + // defined in module_base/module_device/types.h + // different from ct_Device! using calc_grad_with_block_op = hsolver::calc_grad_with_block_op; using line_minimize_with_block_op = hsolver::line_minimize_with_block_op; using gemm_op = hsolver::gemm_op; diff --git a/source/module_hsolver/kernels/math_kernel_op.cpp b/source/module_hsolver/kernels/math_kernel_op.cpp index c3784949ab..3a752c3659 100644 --- a/source/module_hsolver/kernels/math_kernel_op.cpp +++ b/source/module_hsolver/kernels/math_kernel_op.cpp @@ -24,6 +24,7 @@ struct line_minimize_with_block_op Real theta = 0.0, cos_theta = 0.0, sin_theta = 0.0; auto A = reinterpret_cast(grad_out + band_idx * n_basis_max); Real norm = BlasConnector::dot(2 * n_basis, A, 1, A, 1); + Parallel_Reduce::reduce_pool(norm); norm = 1.0 / sqrt(norm); for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) { @@ -34,6 +35,9 @@ struct line_minimize_with_block_op epsilo_1 += std::real(grad_out[item] * std::conj(hpsi_out[item])); epsilo_2 += std::real(grad_out[item] * std::conj(hgrad_out[item])); } + Parallel_Reduce::reduce_pool(epsilo_0); + Parallel_Reduce::reduce_pool(epsilo_1); + Parallel_Reduce::reduce_pool(epsilo_2); theta = 0.5 * std::abs(std::atan(2 * epsilo_1 / (epsilo_0 - epsilo_2))); cos_theta = std::cos(theta); sin_theta = std::sin(theta); @@ -71,6 +75,7 @@ struct calc_grad_with_block_op T grad_1 = {0.0, 0.0}; auto A = reinterpret_cast(psi_out + band_idx * n_basis_max); Real norm = BlasConnector::dot(2 * n_basis, A, 1, A, 1); + Parallel_Reduce::reduce_pool(norm); norm = 1.0 / sqrt(norm); for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) { @@ -79,6 +84,7 @@ struct calc_grad_with_block_op hpsi_out[item] *= norm; epsilo += std::real(hpsi_out[item] * std::conj(psi_out[item])); } + Parallel_Reduce::reduce_pool(epsilo); for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) { auto item = band_idx * n_basis_max + basis_idx; @@ -87,6 +93,8 @@ struct calc_grad_with_block_op err += grad_2; beta += grad_2 / prec_in[basis_idx]; /// Mark here as we should div the prec? } + Parallel_Reduce::reduce_pool(err); + Parallel_Reduce::reduce_pool(beta); for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) { auto item = band_idx * n_basis_max + basis_idx; diff --git a/tests/integrate/102_PW_BPCG/result.ref b/tests/integrate/102_PW_BPCG/result.ref index 2972395a15..4815e05cb4 100644 --- a/tests/integrate/102_PW_BPCG/result.ref +++ b/tests/integrate/102_PW_BPCG/result.ref @@ -1,7 +1,7 @@ etotref -4869.74705201 etotperatomref -2434.87352600 -totalforceref 5.19483000 -totalstressref 37241.45334600 +totalforceref 5.19522000 +totalstressref 37241.49490600 pointgroupref C_1 spacegroupref C_1 nksibzref 8 diff --git a/tests/integrate/Autotest.sh b/tests/integrate/Autotest.sh index 674199f96a..fcdc88a0d8 100755 --- a/tests/integrate/Autotest.sh +++ b/tests/integrate/Autotest.sh @@ -250,7 +250,7 @@ for dir in $testdir; do TIMEFORMAT='[----------] Time elapsed: %R seconds' #parallel test time { - if [ "$case" = "282_NO_RPA" -o "$dir" = "102_PW_BPCG" ]; then + if [ "$case" = "282_NO_RPA" ]; then mpirun -np 1 $abacus > log.txt else mpirun -np $np $abacus > log.txt