From 31baa6b15dae061ae6263d5742182ec79e172a9a Mon Sep 17 00:00:00 2001
From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com>
Date: Sun, 12 Jan 2025 19:04:55 +0800
Subject: [PATCH 1/8] Subsitute gemm for einsum in rotate_wf

---
 source/module_hsolver/diago_bpcg.cpp | 40 +++++++++++++++++-----------
 1 file changed, 25 insertions(+), 15 deletions(-)
diff --git a/source/module_hsolver/diago_bpcg.cpp b/source/module_hsolver/diago_bpcg.cpp
index 7830af2f67..5aab4f5d0e 100644
--- a/source/module_hsolver/diago_bpcg.cpp
+++ b/source/module_hsolver/diago_bpcg.cpp
@@ -115,6 +115,8 @@ void DiagoBPCG<T, Device>::orth_cholesky(
                 hsub_out.data<T>(),
                 this->n_band);     //ldc
 
+    // Parallel_Reduce::reduce_pool(hsub_out.data<T>(), this->n_band * this->n_band);
+    
     // set hsub matrix to lower format;
     ct::kernels::set_matrix<T, ct_Device>()(
         'L', hsub_out.data<T>(), this->n_band);
@@ -184,6 +186,8 @@ void DiagoBPCG<T, Device>::orth_projection(
                 hsub_in.data<T>(),
                 this->n_band);     //ldc
 
+    // Parallel_Reduce::reduce_pool(hsub_in.data<T>(), this->n_band * this->n_band);
+    
     // set_matrix_op()('L', hsub_in->data<T>(), this->n_band);
     option = ct::EinsumOption(
         /*conj_x=*/false, /*conj_y=*/false, /*alpha=*/-1.0, /*beta=*/1.0, /*Tensor out=*/&grad_out);
@@ -205,6 +209,8 @@ void DiagoBPCG<T, Device>::orth_projection(
                 grad_out.data<T>(),
                 this->n_basis);   //ldc
 
+    // Parallel_Reduce::reduce_pool(grad_out.data<T>(), this->n_basis * this->n_band);
+    
     return;
 }
 
@@ -216,25 +222,27 @@ void DiagoBPCG<T, Device>::rotate_wf(
 {
     ct::EinsumOption option(
         /*conj_x=*/false, /*conj_y=*/false, /*alpha=*/1.0, /*beta=*/0.0, /*Tensor out=*/&workspace_in);
-    workspace_in = ct::op::einsum("ij,jk->ik", hsub_in, psi_out, option);
+    // workspace_in = ct::op::einsum("ij,jk->ik", hsub_in, psi_out, option);
 
     // this->rotate_wf(hsub_out, psi_out, workspace_in);
     // this->orth_cholesky(this->work, this->psi, this->hpsi, this->hsub);
     // gemm: workspace_in(n_basis x n_band) = psi_out(n_basis x n_band) * hsub_in(n_band x n_band)
-    // gemm_op()(this->ctx,
-    //             'N',
-    //             'N',
-    //             this->n_basis,        //m
-    //             this->n_band,       //n
-    //             this->n_band,       //k
-    //             this->one,          //1.0
-    //             psi_out.data<T>(),
-    //             this->n_basis,      //lda
-    //             hsub_in.data<T>(),
-    //             this->n_band,       //ldb
-    //             this->zero,         //0.0
-    //             workspace_in.data<T>(),
-    //             this->n_basis);     //ldc
+    gemm_op()(this->ctx,
+                'N',
+                'N',
+                this->n_basis,        //m
+                this->n_band,       //n
+                this->n_band,       //k
+                this->one,          //1.0
+                psi_out.data<T>(),
+                this->n_basis,      //lda
+                hsub_in.data<T>(),
+                this->n_band,       //ldb
+                this->zero,         //0.0
+                workspace_in.data<T>(),
+                this->n_basis);     //ldc
+    
+    // Parallel_Reduce::reduce_pool(workspace_in.data<T>(), this->n_basis * this->n_band);
 
     syncmem_complex_op()(psi_out.template data<T>(), workspace_in.template data<T>(), this->n_band * this->n_basis);
 
@@ -281,6 +289,8 @@ void DiagoBPCG<T, Device>::diag_hsub(
                 hsub_out.data<T>(),
                 this->n_band);      //ldc
 
+    // Parallel_Reduce::reduce_pool(hsub_out.data<T>(), this->n_band * this->n_band);
+
     ct::kernels::lapack_dnevd<T, ct_Device>()('V', 'U', hsub_out.data<T>(), this->n_band, eigenvalue_out.data<Real>());
 
     return;

From 6114817e310a948bece2499016f1f45f20afba54 Mon Sep 17 00:00:00 2001
From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com>
Date: Sun, 12 Jan 2025 19:54:42 +0800
Subject: [PATCH 2/8] Add planewave parallel support for inner-produce like
 gemm_op in bpcg

---
 source/module_hsolver/diago_bpcg.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/source/module_hsolver/diago_bpcg.cpp b/source/module_hsolver/diago_bpcg.cpp
index 5aab4f5d0e..c1d5d5afb3 100644
--- a/source/module_hsolver/diago_bpcg.cpp
+++ b/source/module_hsolver/diago_bpcg.cpp
@@ -115,7 +115,7 @@ void DiagoBPCG<T, Device>::orth_cholesky(
                 hsub_out.data<T>(),
                 this->n_band);     //ldc
 
-    // Parallel_Reduce::reduce_pool(hsub_out.data<T>(), this->n_band * this->n_band);
+    Parallel_Reduce::reduce_pool(hsub_out.data<T>(), this->n_band * this->n_band);
     
     // set hsub matrix to lower format;
     ct::kernels::set_matrix<T, ct_Device>()(
@@ -186,7 +186,7 @@ void DiagoBPCG<T, Device>::orth_projection(
                 hsub_in.data<T>(),
                 this->n_band);     //ldc
 
-    // Parallel_Reduce::reduce_pool(hsub_in.data<T>(), this->n_band * this->n_band);
+    Parallel_Reduce::reduce_pool(hsub_in.data<T>(), this->n_band * this->n_band);
     
     // set_matrix_op()('L', hsub_in->data<T>(), this->n_band);
     option = ct::EinsumOption(
@@ -209,6 +209,7 @@ void DiagoBPCG<T, Device>::orth_projection(
                 grad_out.data<T>(),
                 this->n_basis);   //ldc
 
+    // * This type of non inner produce like operation does not need reduce!
     // Parallel_Reduce::reduce_pool(grad_out.data<T>(), this->n_basis * this->n_band);
     
     return;
@@ -242,6 +243,7 @@ void DiagoBPCG<T, Device>::rotate_wf(
                 workspace_in.data<T>(),
                 this->n_basis);     //ldc
     
+    // * This type of non inner produce like operation does not need reduce!
     // Parallel_Reduce::reduce_pool(workspace_in.data<T>(), this->n_basis * this->n_band);
 
     syncmem_complex_op()(psi_out.template data<T>(), workspace_in.template data<T>(), this->n_band * this->n_basis);
@@ -289,7 +291,7 @@ void DiagoBPCG<T, Device>::diag_hsub(
                 hsub_out.data<T>(),
                 this->n_band);      //ldc
 
-    // Parallel_Reduce::reduce_pool(hsub_out.data<T>(), this->n_band * this->n_band);
+    Parallel_Reduce::reduce_pool(hsub_out.data<T>(), this->n_band * this->n_band);
 
     ct::kernels::lapack_dnevd<T, ct_Device>()('V', 'U', hsub_out.data<T>(), this->n_band, eigenvalue_out.data<Real>());
 

From 91f62f6157b93e6ed8ab0a156da703227f73c77e Mon Sep 17 00:00:00 2001
From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com>
Date: Sun, 12 Jan 2025 20:24:41 +0800
Subject: [PATCH 3/8] Add reduce for dot ops used in bpcg

---
 source/module_hsolver/kernels/math_kernel_op.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/source/module_hsolver/kernels/math_kernel_op.cpp b/source/module_hsolver/kernels/math_kernel_op.cpp
index c3784949ab..a8ada66cc5 100644
--- a/source/module_hsolver/kernels/math_kernel_op.cpp
+++ b/source/module_hsolver/kernels/math_kernel_op.cpp
@@ -24,6 +24,7 @@ struct line_minimize_with_block_op<T, base_device::DEVICE_CPU>
             Real theta = 0.0, cos_theta = 0.0, sin_theta = 0.0;
             auto A = reinterpret_cast<const Real*>(grad_out + band_idx * n_basis_max);
             Real norm = BlasConnector::dot(2 * n_basis, A, 1, A, 1);
+            Parallel_Reduce::reduce_pool(norm);
             norm = 1.0 / sqrt(norm);
             for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
             {
@@ -71,6 +72,7 @@ struct calc_grad_with_block_op<T, base_device::DEVICE_CPU>
             T grad_1 = {0.0, 0.0};
             auto A = reinterpret_cast<const Real*>(psi_out + band_idx * n_basis_max);
             Real norm = BlasConnector::dot(2 * n_basis, A, 1, A, 1);
+            Parallel_Reduce::reduce_pool(norm);
             norm = 1.0 / sqrt(norm);
             for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
             {

From b025b62c23eeca70e90792af322c74249057b8f2 Mon Sep 17 00:00:00 2001
From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com>
Date: Sun, 12 Jan 2025 20:54:54 +0800
Subject: [PATCH 4/8] Add reduce for manual inner product(for loop) ops used in
 bpcg

---
 source/module_hsolver/kernels/math_kernel_op.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/source/module_hsolver/kernels/math_kernel_op.cpp b/source/module_hsolver/kernels/math_kernel_op.cpp
index a8ada66cc5..3a752c3659 100644
--- a/source/module_hsolver/kernels/math_kernel_op.cpp
+++ b/source/module_hsolver/kernels/math_kernel_op.cpp
@@ -35,6 +35,9 @@ struct line_minimize_with_block_op<T, base_device::DEVICE_CPU>
                 epsilo_1 += std::real(grad_out[item] * std::conj(hpsi_out[item]));
                 epsilo_2 += std::real(grad_out[item] * std::conj(hgrad_out[item]));
             }
+            Parallel_Reduce::reduce_pool(epsilo_0);
+            Parallel_Reduce::reduce_pool(epsilo_1);
+            Parallel_Reduce::reduce_pool(epsilo_2);
             theta = 0.5 * std::abs(std::atan(2 * epsilo_1 / (epsilo_0 - epsilo_2)));
             cos_theta = std::cos(theta);
             sin_theta = std::sin(theta);
@@ -81,6 +84,7 @@ struct calc_grad_with_block_op<T, base_device::DEVICE_CPU>
                 hpsi_out[item] *= norm;
                 epsilo += std::real(hpsi_out[item] * std::conj(psi_out[item]));
             }
+            Parallel_Reduce::reduce_pool(epsilo);
             for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
             {
                 auto item = band_idx * n_basis_max + basis_idx;
@@ -89,6 +93,8 @@ struct calc_grad_with_block_op<T, base_device::DEVICE_CPU>
                 err += grad_2;
                 beta += grad_2 / prec_in[basis_idx]; /// Mark here as we should div the prec?
             }
+            Parallel_Reduce::reduce_pool(err);
+            Parallel_Reduce::reduce_pool(beta);
             for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
             {
                 auto item = band_idx * n_basis_max + basis_idx;

From d975f41fddca2b4b1f6f189a19ee5e20c2dca31e Mon Sep 17 00:00:00 2001
From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com>
Date: Mon, 13 Jan 2025 16:59:49 +0800
Subject: [PATCH 5/8] Update docs now that BPCG supports plane wave
 parallelization.

---
 docs/advanced/scf/hsolver.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/advanced/scf/hsolver.md b/docs/advanced/scf/hsolver.md
index f002c2bfda..0cb02ce9dd 100644
--- a/docs/advanced/scf/hsolver.md
+++ b/docs/advanced/scf/hsolver.md
@@ -4,7 +4,7 @@
 
 Method of explicit solving KS-equation can be chosen by variable "ks_solver" in INPUT file.
 
-When "basis_type = pw", `ks_solver` can be `cg`, `bpcg` or `dav`. The `bpcg` method only supports K-point parallelism currently. The default setting `cg` is recommended, which is band-by-band conjugate gradient diagonalization method. There is a large probability that the use of setting of `dav` , which is block Davidson diagonalization method, can be tried to improve performance.  
+When "basis_type = pw", `ks_solver` can be `cg`, `bpcg` or `dav`. The default setting `cg` is recommended, which is band-by-band conjugate gradient diagonalization method. There is a large probability that the use of setting of `dav` , which is block Davidson diagonalization method, can be tried to improve performance.  
 
 When "basis_type = lcao", `ks_solver` can be `genelpa` or `scalapack_gvx`. The default setting `genelpa` is recommended, which is based on ELPA (EIGENVALUE SOLVERS FOR PETAFLOP APPLICATIONS) (https://elpa.mpcdf.mpg.de/) and the kernel is auto choosed by GENELPA(https://github.com/pplab/GenELPA), usually faster than the setting of "scalapack_gvx", which is based on ScaLAPACK(Scalable Linear Algebra PACKage)  
 

From f3c3d75e24675134e1e6a048c9b91bb470f39ffa Mon Sep 17 00:00:00 2001
From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com>
Date: Mon, 13 Jan 2025 21:20:20 +0800
Subject: [PATCH 6/8] Update Autotest.sh to run BPCG test with MPI np=4

---
 tests/integrate/Autotest.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/integrate/Autotest.sh b/tests/integrate/Autotest.sh
index 674199f96a..fcdc88a0d8 100755
--- a/tests/integrate/Autotest.sh
+++ b/tests/integrate/Autotest.sh
@@ -250,7 +250,7 @@ for dir in $testdir; do
     TIMEFORMAT='[----------] Time elapsed: %R seconds'
     #parallel test
     time {
-        if [ "$case" = "282_NO_RPA" -o "$dir" = "102_PW_BPCG" ]; then
+        if [ "$case" = "282_NO_RPA" ]; then
             mpirun -np 1 $abacus > log.txt
         else
             mpirun -np $np $abacus > log.txt

From 4cfa47da2c8286448ebd42db3d7dffc764bbc383 Mon Sep 17 00:00:00 2001
From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com>
Date: Mon, 13 Jan 2025 21:26:11 +0800
Subject: [PATCH 7/8] remove unused code and redundancies

---
 source/module_hsolver/diago_bpcg.cpp | 9 ++-------
 source/module_hsolver/diago_bpcg.h   | 3 +++
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/source/module_hsolver/diago_bpcg.cpp b/source/module_hsolver/diago_bpcg.cpp
index c1d5d5afb3..846bef9ff8 100644
--- a/source/module_hsolver/diago_bpcg.cpp
+++ b/source/module_hsolver/diago_bpcg.cpp
@@ -169,7 +169,6 @@ void DiagoBPCG<T, Device>::orth_projection(
         /*conj_x=*/false, /*conj_y=*/true, /*alpha=*/1.0, /*beta=*/0.0, /*Tensor out=*/&hsub_in);
     // hsub_in = ct::op::einsum("ij,kj->ik", grad_out, psi_in, option);
 
-    // this->orth_projection(this->psi, this->hsub, this->grad);
     // gemm: hsub_in(n_band x n_band) = psi_in^T(n_band x n_basis) * grad_out(n_basis x n_band)
     gemm_op()(this->ctx,
                 'C',
@@ -209,8 +208,7 @@ void DiagoBPCG<T, Device>::orth_projection(
                 grad_out.data<T>(),
                 this->n_basis);   //ldc
 
-    // * This type of non inner produce like operation does not need reduce!
-    // Parallel_Reduce::reduce_pool(grad_out.data<T>(), this->n_basis * this->n_band);
+    // * This type of non inner product like operation does not need reduce!
     
     return;
 }
@@ -225,8 +223,6 @@ void DiagoBPCG<T, Device>::rotate_wf(
         /*conj_x=*/false, /*conj_y=*/false, /*alpha=*/1.0, /*beta=*/0.0, /*Tensor out=*/&workspace_in);
     // workspace_in = ct::op::einsum("ij,jk->ik", hsub_in, psi_out, option);
 
-    // this->rotate_wf(hsub_out, psi_out, workspace_in);
-    // this->orth_cholesky(this->work, this->psi, this->hpsi, this->hsub);
     // gemm: workspace_in(n_basis x n_band) = psi_out(n_basis x n_band) * hsub_in(n_band x n_band)
     gemm_op()(this->ctx,
                 'N',
@@ -243,8 +239,7 @@ void DiagoBPCG<T, Device>::rotate_wf(
                 workspace_in.data<T>(),
                 this->n_basis);     //ldc
     
-    // * This type of non inner produce like operation does not need reduce!
-    // Parallel_Reduce::reduce_pool(workspace_in.data<T>(), this->n_basis * this->n_band);
+    // * This type of non inner product like operation does not need reduce!
 
     syncmem_complex_op()(psi_out.template data<T>(), workspace_in.template data<T>(), this->n_band * this->n_basis);
 
diff --git a/source/module_hsolver/diago_bpcg.h b/source/module_hsolver/diago_bpcg.h
index 44ddd9736f..a80c1406b6 100644
--- a/source/module_hsolver/diago_bpcg.h
+++ b/source/module_hsolver/diago_bpcg.h
@@ -340,6 +340,9 @@ class DiagoBPCG
     using resmem_complex_op = ct::kernels::resize_memory<T, ct_Device>;
     using syncmem_complex_op = ct::kernels::synchronize_memory<T, ct_Device, ct_Device>;
 
+    // note: these operators use template parameter base_device::Device_*
+    // defined in module_base/module_device/types.h
+    // different from ct_Device!
     using calc_grad_with_block_op = hsolver::calc_grad_with_block_op<T, Device>;
     using line_minimize_with_block_op = hsolver::line_minimize_with_block_op<T, Device>;
     using gemm_op = hsolver::gemm_op<T, Device>;

From 0d59d1ba86b793982fc2f36f74b1a0f08b22e8ea Mon Sep 17 00:00:00 2001
From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com>
Date: Mon, 13 Jan 2025 21:49:09 +0800
Subject: [PATCH 8/8] Update result.ref for BPCG multicore test

---
 tests/integrate/102_PW_BPCG/result.ref | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/integrate/102_PW_BPCG/result.ref b/tests/integrate/102_PW_BPCG/result.ref
index 2972395a15..4815e05cb4 100644
--- a/tests/integrate/102_PW_BPCG/result.ref
+++ b/tests/integrate/102_PW_BPCG/result.ref
@@ -1,7 +1,7 @@
 etotref -4869.74705201
 etotperatomref -2434.87352600
-totalforceref 5.19483000
-totalstressref 37241.45334600
+totalforceref 5.19522000
+totalstressref 37241.49490600
 pointgroupref C_1
 spacegroupref C_1
 nksibzref 8