2 changes: 1 addition & 1 deletion docs/advanced/scf/hsolver.md
@@ -4,7 +4,7 @@

The method for explicitly solving the Kohn-Sham (KS) equation is chosen by the variable "ks_solver" in the INPUT file.

When "basis_type = pw", `ks_solver` can be `cg`, `bpcg` or `dav`. The `bpcg` method only supports K-point parallelism currently. The default setting `cg` is recommended, which is band-by-band conjugate gradient diagonalization method. There is a large probability that the use of setting of `dav` , which is block Davidson diagonalization method, can be tried to improve performance.
When "basis_type = pw", `ks_solver` can be `cg`, `bpcg` or `dav`. The default setting `cg` is recommended, which is band-by-band conjugate gradient diagonalization method. There is a large probability that the use of setting of `dav` , which is block Davidson diagonalization method, can be tried to improve performance.

When "basis_type = lcao", `ks_solver` can be `genelpa` or `scalapack_gvx`. The default setting `genelpa` is recommended, which is based on ELPA (EIGENVALUE SOLVERS FOR PETAFLOP APPLICATIONS) (https://elpa.mpcdf.mpg.de/) and the kernel is auto choosed by GENELPA(https://github.com/pplab/GenELPA), usually faster than the setting of "scalapack_gvx", which is based on ScaLAPACK(Scalable Linear Algebra PACKage)

42 changes: 27 additions & 15 deletions source/module_hsolver/diago_bpcg.cpp
@@ -115,6 +115,8 @@ void DiagoBPCG<T, Device>::orth_cholesky(
hsub_out.data<T>(),
this->n_band); //ldc

Parallel_Reduce::reduce_pool(hsub_out.data<T>(), this->n_band * this->n_band);

// set hsub matrix to lower format;
ct::kernels::set_matrix<T, ct_Device>()(
'L', hsub_out.data<T>(), this->n_band);
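
In the plane-wave path the G-vector (basis) index is distributed over the MPI ranks of a k-point pool, so the gemm above accumulates only each rank's local contribution to the band-overlap matrix `hsub = psi^H * psi`; the added `Parallel_Reduce::reduce_pool` call sums the partial `n_band x n_band` matrices over the pool before the Cholesky-based orthonormalization. A minimal sketch of the idea, not ABACUS code: the contiguous per-band layout and the `pool_comm` communicator name are assumptions standing in for what `reduce_pool` does internally.

```cpp
#include <mpi.h>
#include <complex>

// Sketch: each pool rank stores a slice of the plane-wave coefficients
// (each band's local slice contiguous), computes a partial band-overlap
// matrix, and the partial matrices are summed over the k-point pool.
void overlap_with_pool_reduce(const std::complex<double>* psi_local, // n_basis_local x n_band
                              int n_basis_local,
                              int n_band,
                              std::complex<double>* hsub,            // n_band x n_band, complete after the reduce
                              MPI_Comm pool_comm)
{
    for (int i = 0; i < n_band; ++i)
    {
        for (int j = 0; j < n_band; ++j)
        {
            std::complex<double> s(0.0, 0.0);
            for (int g = 0; g < n_basis_local; ++g)
            {
                s += std::conj(psi_local[i * n_basis_local + g]) * psi_local[j * n_basis_local + g];
            }
            hsub[i * n_band + j] = s; // partial: only this rank's G-vectors
        }
    }
    // Sum the partial matrices over all ranks of the pool (what reduce_pool does).
    MPI_Allreduce(MPI_IN_PLACE, hsub, n_band * n_band, MPI_CXX_DOUBLE_COMPLEX, MPI_SUM, pool_comm);
}
```

Without this reduction each rank would Cholesky-factorize a different, incomplete overlap matrix, which is presumably why `bpcg` was previously documented as supporting k-point parallelism only.
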
@@ -184,6 +186,8 @@ void DiagoBPCG<T, Device>::orth_projection(
hsub_in.data<T>(),
this->n_band); //ldc

Parallel_Reduce::reduce_pool(hsub_in.data<T>(), this->n_band * this->n_band);

// set_matrix_op()('L', hsub_in->data<T>(), this->n_band);
option = ct::EinsumOption(
/*conj_x=*/false, /*conj_y=*/false, /*alpha=*/-1.0, /*beta=*/1.0, /*Tensor out=*/&grad_out);
@@ -205,6 +209,9 @@ void DiagoBPCG<T, Device>::orth_projection(
grad_out.data<T>(),
this->n_basis); //ldc

// * Not an inner product over the distributed basis, so no reduce is needed here.
// Parallel_Reduce::reduce_pool(grad_out.data<T>(), this->n_basis * this->n_band);

return;
}
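
The second gemm in `orth_projection` is the opposite case, as the disabled reduce above records: it subtracts the projection `psi * hsub_in` from the gradient, so the contraction runs over the band index, which every rank holds in full, while the distributed basis index only labels output rows; each rank finishes its own rows without communication. A small sketch under the same layout assumptions as before:

```cpp
#include <complex>

// Sketch of the rule this patch follows: a contraction over the *distributed*
// basis index gives a partial result and needs reduce_pool; a contraction over
// the *replicated* band index, as here, is already complete on every rank.
void project_out_local(const std::complex<double>* psi_local,  // n_basis_local x n_band
                       const std::complex<double>* hsub,       // n_band x n_band, already reduced
                       std::complex<double>* grad_local,       // n_basis_local x n_band, updated in place
                       int n_basis_local,
                       int n_band)
{
    // grad <- grad - psi * hsub : the sum runs over bands, which every rank
    // holds in full, so the local rows of grad are final without any MPI call.
    for (int j = 0; j < n_band; ++j)
    {
        for (int g = 0; g < n_basis_local; ++g)
        {
            std::complex<double> acc(0.0, 0.0);
            for (int b = 0; b < n_band; ++b)
            {
                acc += psi_local[b * n_basis_local + g] * hsub[b * n_band + j];
            }
            grad_local[j * n_basis_local + g] -= acc;
        }
    }
}
```
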

@@ -216,25 +223,28 @@ void DiagoBPCG<T, Device>::rotate_wf(
{
ct::EinsumOption option(
/*conj_x=*/false, /*conj_y=*/false, /*alpha=*/1.0, /*beta=*/0.0, /*Tensor out=*/&workspace_in);
workspace_in = ct::op::einsum("ij,jk->ik", hsub_in, psi_out, option);
// workspace_in = ct::op::einsum("ij,jk->ik", hsub_in, psi_out, option);

// this->rotate_wf(hsub_out, psi_out, workspace_in);
// this->orth_cholesky(this->work, this->psi, this->hpsi, this->hsub);
// gemm: workspace_in(n_basis x n_band) = psi_out(n_basis x n_band) * hsub_in(n_band x n_band)
// gemm_op()(this->ctx,
// 'N',
// 'N',
// this->n_basis, //m
// this->n_band, //n
// this->n_band, //k
// this->one, //1.0
// psi_out.data<T>(),
// this->n_basis, //lda
// hsub_in.data<T>(),
// this->n_band, //ldb
// this->zero, //0.0
// workspace_in.data<T>(),
// this->n_basis); //ldc
gemm_op()(this->ctx,
'N',
'N',
this->n_basis, //m
this->n_band, //n
this->n_band, //k
this->one, //1.0
psi_out.data<T>(),
this->n_basis, //lda
hsub_in.data<T>(),
this->n_band, //ldb
this->zero, //0.0
workspace_in.data<T>(),
this->n_basis); //ldc

// * Not an inner product over the distributed basis, so no reduce is needed here.
// Parallel_Reduce::reduce_pool(workspace_in.data<T>(), this->n_basis * this->n_band);

syncmem_complex_op()(psi_out.template data<T>(), workspace_in.template data<T>(), this->n_band * this->n_basis);
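
In `rotate_wf` the tensor-library `einsum` call is replaced by an explicit `gemm_op` of the same shape, `workspace(n_basis x n_band) = psi(n_basis x n_band) * hsub(n_band x n_band)`, i.e. a band rotation of the wavefunctions. The contraction again runs over the replicated band index, hence the commented-out reduce, and the result is staged in a workspace only because a gemm cannot write over its own input. A minimal sketch with plain BLAS, assuming a column-major layout with each band's local coefficients stored contiguously:

```cpp
#include <algorithm>
#include <complex>
#include <cstddef>
#include <vector>

// Fortran BLAS interface: all arguments are passed by pointer.
extern "C" void zgemm_(const char* transa, const char* transb,
                       const int* m, const int* n, const int* k,
                       const std::complex<double>* alpha,
                       const std::complex<double>* a, const int* lda,
                       const std::complex<double>* b, const int* ldb,
                       const std::complex<double>* beta,
                       std::complex<double>* c, const int* ldc);

void rotate_bands_local(std::complex<double>* psi_local,   // n_basis_local x n_band, rotated in place
                        const std::complex<double>* hsub,  // n_band x n_band rotation matrix
                        int n_basis_local, int n_band)
{
    const std::complex<double> one(1.0, 0.0), zero(0.0, 0.0);
    std::vector<std::complex<double>> work(static_cast<std::size_t>(n_basis_local) * n_band);
    // work = psi * hsub : mixes the bands, leaves the local basis rows untouched.
    zgemm_("N", "N", &n_basis_local, &n_band, &n_band,
           &one, psi_local, &n_basis_local,
           hsub, &n_band,
           &zero, work.data(), &n_basis_local);
    // Copy the rotated block back over psi (the role played by syncmem_complex_op above).
    std::copy(work.begin(), work.end(), psi_local);
}
```
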

@@ -281,6 +291,8 @@ void DiagoBPCG<T, Device>::diag_hsub(
hsub_out.data<T>(),
this->n_band); //ldc

Parallel_Reduce::reduce_pool(hsub_out.data<T>(), this->n_band * this->n_band);

ct::kernels::lapack_dnevd<T, ct_Device>()('V', 'U', hsub_out.data<T>(), this->n_band, eigenvalue_out.data<Real>());

return;
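
`diag_hsub` builds the subspace matrix from the locally stored G-components, so its gemm result is likewise only a partial sum until the pool reduction, and only then is the dense Hermitian eigenproblem handed to `lapack_dnevd`. Schematically, with $N_G^{(p)}$ denoting the G-vectors held by pool rank $p$:

$$
H^{\mathrm{sub}}_{ij} = \langle \psi_i | \hat{H} | \psi_j \rangle
= \sum_{G} \psi_i^{*}(G)\,\bigl(\hat{H}\psi_j\bigr)(G)
= \sum_{p \in \mathrm{pool}} \sum_{G \in N_G^{(p)}} \psi_i^{*}(G)\,\bigl(\hat{H}\psi_j\bigr)(G),
\qquad
H^{\mathrm{sub}} U = U\,\mathrm{diag}(\varepsilon).
$$

The resulting eigenvector matrix $U$ is the kind of band rotation that `rotate_wf` above applies to the wavefunctions.
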
8 changes: 8 additions & 0 deletions source/module_hsolver/kernels/math_kernel_op.cpp
@@ -24,6 +24,7 @@ struct line_minimize_with_block_op<T, base_device::DEVICE_CPU>
Real theta = 0.0, cos_theta = 0.0, sin_theta = 0.0;
auto A = reinterpret_cast<const Real*>(grad_out + band_idx * n_basis_max);
Real norm = BlasConnector::dot(2 * n_basis, A, 1, A, 1);
Parallel_Reduce::reduce_pool(norm);
norm = 1.0 / sqrt(norm);
for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
{
@@ -34,6 +35,9 @@ struct line_minimize_with_block_op<T, base_device::DEVICE_CPU>
epsilo_1 += std::real(grad_out[item] * std::conj(hpsi_out[item]));
epsilo_2 += std::real(grad_out[item] * std::conj(hgrad_out[item]));
}
Parallel_Reduce::reduce_pool(epsilo_0);
Parallel_Reduce::reduce_pool(epsilo_1);
Parallel_Reduce::reduce_pool(epsilo_2);
theta = 0.5 * std::abs(std::atan(2 * epsilo_1 / (epsilo_0 - epsilo_2)));
cos_theta = std::cos(theta);
sin_theta = std::sin(theta);
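
All four per-band scalars accumulated in this kernel, the squared gradient norm and the matrix elements `epsilo_0`, `epsilo_1`, `epsilo_2`, are sums over the locally stored G-components, so each must be summed over the pool before the rotation angle is formed; otherwise every rank would line-minimize with a different `theta`. For reference, the angle follows from minimizing the band energy along the search direction, assuming $\psi$ and the gradient $g$ are normalized and mutually orthogonal (which the preceding normalization and orthogonalization provide):

$$
\psi(\theta) = \psi\cos\theta + g\sin\theta, \qquad
E(\theta) = \frac{\varepsilon_0 + \varepsilon_2}{2}
          + \frac{\varepsilon_0 - \varepsilon_2}{2}\cos 2\theta
          + \varepsilon_1 \sin 2\theta,
$$

with $\varepsilon_0 = \mathrm{Re}\langle\psi|\hat{H}\psi\rangle$, $\varepsilon_1 = \mathrm{Re}\langle g|\hat{H}\psi\rangle$, $\varepsilon_2 = \mathrm{Re}\langle g|\hat{H}g\rangle$. Setting $dE/d\theta = 0$ gives

$$
\tan 2\theta = \frac{2\varepsilon_1}{\varepsilon_0 - \varepsilon_2}
\quad\Longrightarrow\quad
\theta = \tfrac{1}{2}\arctan\!\frac{2\varepsilon_1}{\varepsilon_0 - \varepsilon_2},
$$

which is the `theta` computed above (the code additionally takes its absolute value), and each $\varepsilon$ needs the pool sum to be the true inner product.
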
Expand Down Expand Up @@ -71,6 +75,7 @@ struct calc_grad_with_block_op<T, base_device::DEVICE_CPU>
T grad_1 = {0.0, 0.0};
auto A = reinterpret_cast<const Real*>(psi_out + band_idx * n_basis_max);
Real norm = BlasConnector::dot(2 * n_basis, A, 1, A, 1);
Parallel_Reduce::reduce_pool(norm);
norm = 1.0 / sqrt(norm);
for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
{
@@ -79,6 +84,7 @@ struct calc_grad_with_block_op<T, base_device::DEVICE_CPU>
hpsi_out[item] *= norm;
epsilo += std::real(hpsi_out[item] * std::conj(psi_out[item]));
}
Parallel_Reduce::reduce_pool(epsilo);
for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
{
auto item = band_idx * n_basis_max + basis_idx;
@@ -87,6 +93,8 @@ struct calc_grad_with_block_op<T, base_device::DEVICE_CPU>
err += grad_2;
beta += grad_2 / prec_in[basis_idx]; /// Note: should this be divided by the preconditioner here?
}
Parallel_Reduce::reduce_pool(err);
Parallel_Reduce::reduce_pool(beta);
for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
{
auto item = band_idx * n_basis_max + basis_idx;
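
The gradient kernel follows the same pattern: the band norm, the band energy `epsilo`, the residual accumulator `err` and the preconditioned sum `beta` are all partial sums over the local basis slice, and each new call reduces one scalar in place over the pool; without them the convergence measure and the CG weight would differ between ranks. What such a scalar reduce amounts to, again with a stand-in `pool_comm` communicator assumed for illustration:

```cpp
#include <mpi.h>

// In-place sum of one per-band scalar over the k-point pool: the effect of
// Parallel_Reduce::reduce_pool on a single value (sketch, not ABACUS code).
inline void reduce_scalar_over_pool(double& x, MPI_Comm pool_comm)
{
    MPI_Allreduce(MPI_IN_PLACE, &x, 1, MPI_DOUBLE, MPI_SUM, pool_comm);
}
```

Mirroring the kernel above, the partial `norm` is reduced before it is inverted, and `epsilo`, `err` and `beta` are reduced right after their accumulation loops, so every rank of the pool works with the same residual and mixing weight.
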