2 changes: 1 addition & 1 deletion docs/advanced/scf/hsolver.md
@@ -4,7 +4,7 @@

The method for explicitly solving the Kohn–Sham (KS) equation can be chosen via the variable `ks_solver` in the INPUT file.

When "basis_type = pw", `ks_solver` can be `cg`, `bpcg` or `dav`. The `bpcg` method only supports K-point parallelism currently. The default setting `cg` is recommended, which is band-by-band conjugate gradient diagonalization method. There is a large probability that the use of setting of `dav` , which is block Davidson diagonalization method, can be tried to improve performance.
When "basis_type = pw", `ks_solver` can be `cg`, `bpcg` or `dav`. The default setting `cg` is recommended, which is band-by-band conjugate gradient diagonalization method. There is a large probability that the use of setting of `dav` , which is block Davidson diagonalization method, can be tried to improve performance.

When "basis_type = lcao", `ks_solver` can be `genelpa` or `scalapack_gvx`. The default setting `genelpa` is recommended, which is based on ELPA (EIGENVALUE SOLVERS FOR PETAFLOP APPLICATIONS) (https://elpa.mpcdf.mpg.de/) and the kernel is auto choosed by GENELPA(https://github.com/pplab/GenELPA), usually faster than the setting of "scalapack_gvx", which is based on ScaLAPACK(Scalable Linear Algebra PACKage)

43 changes: 25 additions & 18 deletions source/module_hsolver/diago_bpcg.cpp
@@ -115,6 +115,8 @@ void DiagoBPCG<T, Device>::orth_cholesky(
hsub_out.data<T>(),
this->n_band); //ldc

+ Parallel_Reduce::reduce_pool(hsub_out.data<T>(), this->n_band * this->n_band);

// set hsub matrix to lower format;
ct::kernels::set_matrix<T, ct_Device>()(
'L', hsub_out.data<T>(), this->n_band);
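The reason for the added `Parallel_Reduce::reduce_pool` call: with the plane-wave coefficients distributed over the ranks of a pool, the preceding gemm contracts only over each rank's local slice of the basis, so every rank holds just a partial subspace matrix. A standalone sketch of the pattern (plain MPI, not the ABACUS API; names, sizes, and values are illustrative):

```cpp
// Each rank stores a slice of the basis dimension, computes a partial Gram
// matrix S = Psi^H * Psi from its local rows, and an in-place Allreduce
// supplies the missing sum over ranks -- the role played here by
// Parallel_Reduce::reduce_pool over the pool communicator.
#include <mpi.h>
#include <complex>
#include <cstdio>
#include <vector>

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);
    int rank = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    const int n_band = 2;
    const int n_basis_local = 4; // this rank's slice of the full basis
    // psi(local basis x band), column-major; rank-dependent fill for the demo
    std::vector<std::complex<double>> psi(n_basis_local * n_band);
    for (int j = 0; j < n_band; ++j)
        for (int i = 0; i < n_basis_local; ++i)
            psi[j * n_basis_local + i] = {1.0 + rank, 0.1 * (j + 1)};

    // Partial Gram matrix from local rows only: S(j,k) = sum_i conj(psi(i,j)) * psi(i,k)
    std::vector<std::complex<double>> S(n_band * n_band, {0.0, 0.0});
    for (int k = 0; k < n_band; ++k)
        for (int j = 0; j < n_band; ++j)
            for (int i = 0; i < n_basis_local; ++i)
                S[k * n_band + j] += std::conj(psi[j * n_basis_local + i])
                                   * psi[k * n_basis_local + i];

    // Complete the contraction over the distributed basis index
    // (reduced as pairs of doubles, which works for any MPI implementation)
    MPI_Allreduce(MPI_IN_PLACE, reinterpret_cast<double*>(S.data()),
                  2 * n_band * n_band, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);

    if (rank == 0) std::printf("S(0,0) = %.6f\n", S[0].real());
    MPI_Finalize();
    return 0;
}
```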
@@ -167,7 +169,6 @@ void DiagoBPCG<T, Device>::orth_projection(
/*conj_x=*/false, /*conj_y=*/true, /*alpha=*/1.0, /*beta=*/0.0, /*Tensor out=*/&hsub_in);
// hsub_in = ct::op::einsum("ij,kj->ik", grad_out, psi_in, option);

- // this->orth_projection(this->psi, this->hsub, this->grad);
// gemm: hsub_in(n_band x n_band) = psi_in^T(n_band x n_basis) * grad_out(n_basis x n_band)
gemm_op()(this->ctx,
'C',
@@ -184,6 +185,8 @@
hsub_in.data<T>(),
this->n_band); //ldc

+ Parallel_Reduce::reduce_pool(hsub_in.data<T>(), this->n_band * this->n_band);

// set_matrix_op()('L', hsub_in->data<T>(), this->n_band);
option = ct::EinsumOption(
/*conj_x=*/false, /*conj_y=*/false, /*alpha=*/-1.0, /*beta=*/1.0, /*Tensor out=*/&grad_out);
@@ -205,6 +208,8 @@
grad_out.data<T>(),
this->n_basis); //ldc

+ // * This non-inner-product-like operation does not need a reduce!

return;
}
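The pattern behind the added reduces and the "does not need a reduce" comments, assuming (as the surrounding code suggests) the wavefunction arrays are distributed across the pool along the basis (plane-wave) index only, can be stated compactly:

$$
(\Psi^{\dagger}\Phi)_{jk}=\sum_{i}\overline{\psi_{ij}}\,\phi_{ik}=\sum_{p}\Big(\underbrace{\sum_{i\in I_p}\overline{\psi_{ij}}\,\phi_{ik}}_{\text{local gemm on rank }p}\Big),
\qquad
(\Phi H)_{ik}=\sum_{j}\phi_{ij}\,h_{jk}.
$$

The first contraction runs over the distributed index $i$, so each rank produces only a partial sum and `reduce_pool` must supply the outer sum over ranks $p$. The second contracts over the band index $j$, which every rank stores in full for its local rows, so the result is complete without communication.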

Expand All @@ -216,25 +221,25 @@ void DiagoBPCG<T, Device>::rotate_wf(
{
ct::EinsumOption option(
/*conj_x=*/false, /*conj_y=*/false, /*alpha=*/1.0, /*beta=*/0.0, /*Tensor out=*/&workspace_in);
- workspace_in = ct::op::einsum("ij,jk->ik", hsub_in, psi_out, option);
+ // workspace_in = ct::op::einsum("ij,jk->ik", hsub_in, psi_out, option);

- // this->rotate_wf(hsub_out, psi_out, workspace_in);
// this->orth_cholesky(this->work, this->psi, this->hpsi, this->hsub);
// gemm: workspace_in(n_basis x n_band) = psi_out(n_basis x n_band) * hsub_in(n_band x n_band)
- // gemm_op()(this->ctx,
- //           'N',
- //           'N',
- //           this->n_basis, //m
- //           this->n_band, //n
- //           this->n_band, //k
- //           this->one, //1.0
- //           psi_out.data<T>(),
- //           this->n_basis, //lda
- //           hsub_in.data<T>(),
- //           this->n_band, //ldb
- //           this->zero, //0.0
- //           workspace_in.data<T>(),
- //           this->n_basis); //ldc
+ gemm_op()(this->ctx,
+           'N',
+           'N',
+           this->n_basis, //m
+           this->n_band, //n
+           this->n_band, //k
+           this->one, //1.0
+           psi_out.data<T>(),
+           this->n_basis, //lda
+           hsub_in.data<T>(),
+           this->n_band, //ldb
+           this->zero, //0.0
+           workspace_in.data<T>(),
+           this->n_basis); //ldc

+ // * This non-inner-product-like operation does not need a reduce!

syncmem_complex_op()(psi_out.template data<T>(), workspace_in.template data<T>(), this->n_band * this->n_basis);

@@ -281,6 +286,8 @@ void DiagoBPCG<T, Device>::diag_hsub(
hsub_out.data<T>(),
this->n_band); //ldc

+ Parallel_Reduce::reduce_pool(hsub_out.data<T>(), this->n_band * this->n_band);

ct::kernels::lapack_dnevd<T, ct_Device>()('V', 'U', hsub_out.data<T>(), this->n_band, eigenvalue_out.data<Real>());

return;
3 changes: 3 additions & 0 deletions source/module_hsolver/diago_bpcg.h
@@ -340,6 +340,9 @@ class DiagoBPCG
using resmem_complex_op = ct::kernels::resize_memory<T, ct_Device>;
using syncmem_complex_op = ct::kernels::synchronize_memory<T, ct_Device, ct_Device>;

+ // note: these operators use the template parameter base_device::Device_*,
+ // defined in module_base/module_device/types.h,
+ // which is different from ct_Device!
using calc_grad_with_block_op = hsolver::calc_grad_with_block_op<T, Device>;
using line_minimize_with_block_op = hsolver::line_minimize_with_block_op<T, Device>;
using gemm_op = hsolver::gemm_op<T, Device>;
8 changes: 8 additions & 0 deletions source/module_hsolver/kernels/math_kernel_op.cpp
@@ -24,6 +24,7 @@ struct line_minimize_with_block_op<T, base_device::DEVICE_CPU>
Real theta = 0.0, cos_theta = 0.0, sin_theta = 0.0;
auto A = reinterpret_cast<const Real*>(grad_out + band_idx * n_basis_max);
Real norm = BlasConnector::dot(2 * n_basis, A, 1, A, 1);
+ Parallel_Reduce::reduce_pool(norm);
norm = 1.0 / sqrt(norm);
for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
{
@@ -34,6 +35,9 @@
epsilo_1 += std::real(grad_out[item] * std::conj(hpsi_out[item]));
epsilo_2 += std::real(grad_out[item] * std::conj(hgrad_out[item]));
}
+ Parallel_Reduce::reduce_pool(epsilo_0);
+ Parallel_Reduce::reduce_pool(epsilo_1);
+ Parallel_Reduce::reduce_pool(epsilo_2);
theta = 0.5 * std::abs(std::atan(2 * epsilo_1 / (epsilo_0 - epsilo_2)));
cos_theta = std::cos(theta);
sin_theta = std::sin(theta);
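For context, the three reduced scalars feed the textbook line-minimization step of band-by-band CG. A derivation sketch, assuming the current band $\psi$ and its search direction $g$ are normalized and mutually orthogonal, with $\varepsilon_0=\langle\psi|H|\psi\rangle$, $\varepsilon_1=\mathrm{Re}\,\langle g|H|\psi\rangle$, and $\varepsilon_2=\langle g|H|g\rangle$ as accumulated above:

$$
E(\theta)=\langle\psi(\theta)|H|\psi(\theta)\rangle=\cos^{2}\!\theta\,\varepsilon_0+2\sin\theta\cos\theta\,\varepsilon_1+\sin^{2}\!\theta\,\varepsilon_2,
\qquad
\psi(\theta)=\cos\theta\,\psi+\sin\theta\,g,
$$

$$
\frac{dE}{d\theta}=0
\;\Longrightarrow\;
\tan 2\theta=\frac{2\varepsilon_1}{\varepsilon_0-\varepsilon_2},
$$

which is exactly the `theta` computed above. Because each $\varepsilon$ is accumulated over the local slice of the basis, all three must be pool-reduced before the formula is evaluated; otherwise every rank would minimize along a different, inconsistent angle.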
@@ -71,6 +75,7 @@ struct calc_grad_with_block_op<T, base_device::DEVICE_CPU>
T grad_1 = {0.0, 0.0};
auto A = reinterpret_cast<const Real*>(psi_out + band_idx * n_basis_max);
Real norm = BlasConnector::dot(2 * n_basis, A, 1, A, 1);
+ Parallel_Reduce::reduce_pool(norm);
norm = 1.0 / sqrt(norm);
for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
{
@@ -79,6 +84,7 @@
hpsi_out[item] *= norm;
epsilo += std::real(hpsi_out[item] * std::conj(psi_out[item]));
}
+ Parallel_Reduce::reduce_pool(epsilo);
for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
{
auto item = band_idx * n_basis_max + basis_idx;
@@ -87,6 +93,8 @@
err += grad_2;
beta += grad_2 / prec_in[basis_idx]; /// Mark here as we should div the prec?
}
+ Parallel_Reduce::reduce_pool(err);
+ Parallel_Reduce::reduce_pool(beta);
for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
{
auto item = band_idx * n_basis_max + basis_idx;
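The same completion step applies to every scalar accumulator in this file (`norm`, `epsilo`, `err`, `beta`): each is a sum over the local basis slice, so a one-element reduction finishes it. A minimal standalone illustration (plain MPI over `MPI_COMM_WORLD`; the scalar overload of `Parallel_Reduce::reduce_pool` is assumed to do the equivalent over the pool communicator):

```cpp
// Without the reduction, each rank would normalize with a different,
// partial norm and the band-by-band iteration would diverge across ranks.
#include <mpi.h>
#include <cmath>
#include <complex>
#include <cstdio>
#include <vector>

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);
    int rank = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    // This rank's slice of one band's plane-wave coefficients (demo values)
    std::vector<std::complex<double>> psi_local(4, {0.5, 0.25 * rank});

    double norm2 = 0.0;
    for (const auto& c : psi_local)
        norm2 += std::norm(c); // |c|^2 accumulated over local coefficients only

    // One-element in-place Allreduce completes the partial sum
    MPI_Allreduce(MPI_IN_PLACE, &norm2, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);

    if (rank == 0)
        std::printf("global <psi|psi> = %.6f, scale = %.6f\n",
                    norm2, 1.0 / std::sqrt(norm2));
    MPI_Finalize();
    return 0;
}
```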
4 changes: 2 additions & 2 deletions tests/integrate/102_PW_BPCG/result.ref
@@ -1,7 +1,7 @@
etotref -4869.74705201
etotperatomref -2434.87352600
- totalforceref 5.19483000
- totalstressref 37241.45334600
+ totalforceref 5.19522000
+ totalstressref 37241.49490600
pointgroupref C_1
spacegroupref C_1
nksibzref 8
2 changes: 1 addition & 1 deletion tests/integrate/Autotest.sh
@@ -250,7 +250,7 @@ for dir in $testdir; do
TIMEFORMAT='[----------] Time elapsed: %R seconds'
#parallel test
time {
if [ "$case" = "282_NO_RPA" -o "$dir" = "102_PW_BPCG" ]; then
if [ "$case" = "282_NO_RPA" ]; then
mpirun -np 1 $abacus > log.txt
else
mpirun -np $np $abacus > log.txt