fix BPCG

Qianruipku · Qianruipku · commit 7ddd28fa6a52 · 2025-01-27T15:07:33.000+08:00
diff --git a/source/module_cell/cal_atoms_info.h b/source/module_cell/cal_atoms_info.h
@@ -73,11 +73,16 @@ class CalAtomsInfo
         if (para.inp.ks_solver == "bpcg") // only bpcg support band parallel
         {
             para.sys.nbands_l = para.inp.nbands / para.inp.bndpar;
-            if (GlobalV::RANK_IN_BPGROUP < para.inp.nbands % para.inp.bndpar)
+            if (GlobalV::MY_BNDGROUP < para.inp.nbands % para.inp.bndpar)
             {
                 para.sys.nbands_l++;
             }
         }
+        // temporary code
+        if (GlobalV::MY_BNDGROUP == 0 || para.inp.ks_solver == "bpcg")
+        {
+            para.sys.ks_run = true;
+        }
         return;
     }
 };
diff --git a/source/module_elecstate/elecstate_print.cpp b/source/module_elecstate/elecstate_print.cpp
@@ -247,7 +247,7 @@ void ElecState::print_band(const int& ik, const int& printe, const int& iter)
 {
     // check the band energy.
     bool wrong = false;
-    for (int ib = 0; ib < PARAM.inp.nbands; ++ib)
+    for (int ib = 0; ib < PARAM.globalv.nbands_l; ++ib)
     {
         if (std::abs(this->ekb(ik, ib)) > 1.0e10)
         {
@@ -269,7 +269,7 @@ void ElecState::print_band(const int& ik, const int& printe, const int& iter)
             GlobalV::ofs_running << " Energy (eV) & Occupations  for spin=" << this->klist->isk[ik] + 1
                                  << " K-point=" << ik + 1 << std::endl;
             GlobalV::ofs_running << std::setiosflags(std::ios::showpoint);
-            for (int ib = 0; ib < PARAM.inp.nbands; ib++)
+            for (int ib = 0; ib < PARAM.globalv.nbands_l; ib++)
             {
                 GlobalV::ofs_running << " " << std::setw(6) << ib + 1 << std::setw(15)
                                      << this->ekb(ik, ib) * ModuleBase::Ry_to_eV;
diff --git a/source/module_hamilt_pw/hamilt_stodft/sto_iter.cpp b/source/module_hamilt_pw/hamilt_stodft/sto_iter.cpp
@@ -58,7 +58,7 @@ void Stochastic_Iter<T, Device>::orthog(const int& ik, psi::Psi<T, Device>& psi,
 {
     ModuleBase::TITLE("Stochastic_Iter", "orthog");
     ModuleBase::timer::tick("Stochastic_Iter", "orthog");
-    const int nbands_l = psi.get_nbands();
+    int nbands_l = psi.get_nbands();
     const int nbands = PARAM.inp.nbands;
     // orthogonal part
     if (nbands > 0)
@@ -74,24 +74,63 @@ void Stochastic_Iter<T, Device>::orthog(const int& ik, psi::Psi<T, Device>& psi,
         // orthogonal part
         T* sum = nullptr;
         resmem_complex_op()(sum, nbands * nchipk);
-        // sum(b<NBANDS, a<nchi) = < psi_b | chi_a >
-        ModuleBase::PGemmCN<T, Device> pmmcn;
+
+        if(PARAM.globalv.all_ks_run)
+        {
+            // sum(b<NBANDS, a<nchi) = < psi_b | chi_a >
+            ModuleBase::PGemmCN<T, Device> pmmcn;
 #ifdef __MPI
-        pmmcn.set_dimension(BP_WORLD, POOL_WORLD, nbands_l, npwx, nchipk, npwx, npw, nbands, 2);
+            pmmcn.set_dimension(BP_WORLD, POOL_WORLD, nbands_l, npwx, nchipk, npwx, npw, nbands, 2);
 #else
-        pmmcn.set_dimension(nbands_l, npwx, nchipk, npwx, npw, nbands, 2);
+            pmmcn.set_dimension(nbands_l, npwx, nchipk, npwx, npw, nbands, 2);
 #endif
-        pmmcn.multiply(1.0, &psi(ik, 0, 0), wfgout, 0.0, sum);
-        
-        // psi -= psi * sum
-        hsolver::PLinearTransform<T, Device> pltrans;
+            pmmcn.multiply(1.0, &psi(ik, 0, 0), wfgout, 0.0, sum);
+
+            // psi -= psi * sum
+            hsolver::PLinearTransform<T, Device> pltrans;
 #ifdef __MPI
-        pltrans.set_dimension(npw, nbands_l, nchipk, npwx, BP_WORLD, true);
+            pltrans.set_dimension(npw, nbands_l, nchipk, npwx, BP_WORLD, true);
 #else
-        pltrans.set_dimension(npw, nbands_l, nchipk, npwx, true);
+            pltrans.set_dimension(npw, nbands_l, nchipk, npwx, true);
 #endif
-        pltrans.act(-1.0, &psi(ik, 0, 0), sum, 1.0, wfgout);
-        
+            pltrans.act(-1.0, &psi(ik, 0, 0), sum, 1.0, wfgout);
+        }
+        else
+        {
+            // sum(b<NBANDS, a<nchi) = < psi_b | chi_a >
+            ModuleBase::gemm_op<T, Device>()(ctx,
+                                             'C',
+                                             'N',
+                                             nbands,
+                                             nchipk,
+                                             npw,
+                                             &ModuleBase::ONE,
+                                             &psi(ik, 0, 0),
+                                             npwx,
+                                             wfgout,
+                                             npwx,
+                                             &ModuleBase::ZERO,
+                                             sum,
+                                             nbands);
+            Parallel_Reduce::reduce_pool(sum, nbands * nchipk);
+
+            // psi -= psi * sum
+            ModuleBase::gemm_op<T, Device>()(ctx,
+                                             'N',
+                                             'N',
+                                             npw,
+                                             nchipk,
+                                             nbands,
+                                             &ModuleBase::NEG_ONE,
+                                             &psi(ik, 0, 0),
+                                             npwx,
+                                             sum,
+                                             nbands,
+                                             &ModuleBase::ONE,
+                                             wfgout,
+                                             npwx);
+        }
+
         delmem_complex_op()(sum);
     }
     ModuleBase::timer::tick("Stochastic_Iter", "orthog");
diff --git a/source/module_hsolver/diago_bpcg.cpp b/source/module_hsolver/diago_bpcg.cpp
@@ -12,6 +12,7 @@
 #include "module_base/global_function.h"
 #include "module_base/kernels/math_kernel_op.h"
 #include "para_linear_transform.h"
+#include "module_parameter/parameter.h"
 
 namespace hsolver {
 
@@ -44,9 +45,9 @@ void DiagoBPCG<T, Device>::init_iter(const int nband, const int nband_l, const i
 
     // All column major tensors
 
-    this->beta          = std::move(ct::Tensor(r_type, device_type, {this->n_band}));
+    this->beta          = std::move(ct::Tensor(r_type, device_type, {this->n_band_l}));
     this->eigen         = std::move(ct::Tensor(r_type, device_type, {this->n_band}));
-    this->err_st        = std::move(ct::Tensor(r_type, device_type, {this->n_band}));
+    this->err_st        = std::move(ct::Tensor(r_type, device_type, {this->n_band_l}));
 
     this->hsub          = std::move(ct::Tensor(t_type, device_type, {this->n_band, this->n_band}));
 
@@ -175,7 +176,7 @@ void DiagoBPCG<T, Device>::rotate_wf(
 {
     // gemm: workspace_in(n_basis x n_band) = psi_out(n_basis x n_band) * hsub_in(n_band x n_band)
     this->plintrans.act(1.0, psi_out.data<T>(), hsub_in.data<T>(), 0.0, workspace_in.data<T>());
-    syncmem_complex_op()(psi_out.template data<T>(), workspace_in.template data<T>(), this->n_band * this->n_basis);
+    syncmem_complex_op()(psi_out.template data<T>(), workspace_in.template data<T>(), this->n_band_l * this->n_basis);
 
     return;
 }
@@ -187,7 +188,7 @@ void DiagoBPCG<T, Device>::calc_hpsi_with_block(
         ct::Tensor& hpsi_out)
 {
     // calculate all-band hpsi
-    hpsi_func(psi_in, hpsi_out.data<T>(), this->n_basis, this->n_band);
+    hpsi_func(psi_in, hpsi_out.data<T>(), this->n_basis, this->n_band_l);
 }
 
 template<typename T, typename Device>
@@ -256,17 +257,17 @@ void DiagoBPCG<T, Device>::diag(const HPsiFunc& hpsi_func,
 {
     const int current_scf_iter = hsolver::DiagoIterAssist<T, Device>::SCF_ITER;
     // Get the pointer of the input psi
-    this->psi = std::move(ct::TensorMap(psi_in /*psi_in.get_pointer()*/, t_type, device_type, {this->n_band, this->n_basis}));
+    this->psi = std::move(ct::TensorMap(psi_in /*psi_in.get_pointer()*/, t_type, device_type, {this->n_band_l, this->n_basis}));
 
     // Update the precondition array
     this->calc_prec();
 
     // Improving the initial guess of the wave function psi through a subspace diagonalization.
     this->calc_hsub_with_block(hpsi_func, psi_in, this->psi, this->hpsi, this->hsub, this->work, this->eigen);
 
-    setmem_complex_op()(this->grad_old.template data<T>(), 0, this->n_basis * this->n_band);
+    setmem_complex_op()(this->grad_old.template data<T>(), 0, this->n_basis * this->n_band_l);
 
-    setmem_var_op()(this->beta.template data<Real>(), std::numeric_limits<Real>::infinity(), this->n_band);
+    setmem_var_op()(this->beta.template data<Real>(), std::numeric_limits<Real>::infinity(), this->n_band_l);
 
     int ntry = 0;
     int max_iter = current_scf_iter > 1 ?
@@ -290,7 +291,7 @@ void DiagoBPCG<T, Device>::diag(const HPsiFunc& hpsi_func,
         this->orth_projection(this->psi, this->hsub, this->grad);
 
         // this->grad_old = this->grad;
-        syncmem_complex_op()(this->grad_old.template data<T>(), this->grad.template data<T>(), n_basis * n_band);
+        syncmem_complex_op()(this->grad_old.template data<T>(), this->grad.template data<T>(), n_basis * n_band_l);
 
         // Calculate H|grad> matrix
         this->calc_hpsi_with_block(hpsi_func, this->grad.template data<T>(), /*this->grad_wrapper[0],*/ this->hgrad);
@@ -311,7 +312,14 @@ void DiagoBPCG<T, Device>::diag(const HPsiFunc& hpsi_func,
 
     this->calc_hsub_with_block_exit(this->psi, this->hpsi, this->hsub, this->work, this->eigen);
 
-    syncmem_var_d2h_op()(eigenvalue_in, this->eigen.template data<Real>(), this->n_band);
+    int start_nband = 0;
+#ifdef __MPI
+    if (PARAM.inp.bndpar > 1)
+    {
+        start_nband = this->plintrans.start_colB[GlobalV::MY_BNDGROUP];
+    }
+#endif
+    syncmem_var_d2h_op()(eigenvalue_in, this->eigen.template data<Real>() + start_nband, this->n_band_l);
 
     return;
 }
diff --git a/source/module_hsolver/para_linear_transform.cpp b/source/module_hsolver/para_linear_transform.cpp
@@ -6,13 +6,13 @@ namespace hsolver
 {
 template <typename T, typename Device>
 void PLinearTransform<T, Device>::set_dimension(const int nrowA,
-                                                        const int ncolA,
-                                                        const int ncolB,
-                                                        const int LDA,
+                                                const int ncolA,
+                                                const int ncolB,
+                                                const int LDA,
 #ifdef __MPI
-                                                        MPI_Comm col_world,
+                                                MPI_Comm col_world,
 #endif
-                                                        const bool localU)
+                                                const bool localU)
 {
     this->nrowA = nrowA;
     this->ncolA = ncolA;
@@ -91,13 +91,13 @@ void PLinearTransform<T, Device>::act(const T alpha, const T* A, const T* U, con
             T real_beta = ip == 0 ? beta : 0;
             const int ncolA_ip = colA_loc[ip];
             // get U_tmp
-            
-                const int start_row = start_colA[ip];
-                for (int i = 0; i < ncolB; ++i)
-                {
-                    const T* U_part = U + start_row + (i + start) * ncolA_glo;
-                    syncmem_dev_op()(U_tmp + i * ncolA_ip, U_part, ncolA_ip);
-                }
+
+            const int start_row = start_colA[ip];
+            for (int i = 0; i < ncolB; ++i)
+            {
+                const T* U_part = U + start_row + (i + start) * ncolA_glo;
+                syncmem_dev_op()(U_tmp + i * ncolA_ip, U_part, ncolA_ip);
+            }
 
             if (ip == rank_col)
             {
diff --git a/source/module_io/read_set_globalv.cpp b/source/module_io/read_set_globalv.cpp
@@ -61,15 +61,10 @@ void ReadInput::set_globalv(const Input_para& inp, System_para& sys)
     Parallel_Common::bcast_bool(sys.double_grid);
 #endif
     /// set ks_run
-    if (GlobalV::MY_BNDGROUP == 0 || inp.ks_solver == "bpcg")
-    {
-        sys.ks_run = true;
-    }
     if (inp.ks_solver != "bpcg" && inp.bndpar > 1)
     {
         sys.all_ks_run = false;
     }
-
 }
 
 /// @note Here para.inp has been synchronized of all ranks. 
diff --git a/source/module_io/write_istate_info.cpp b/source/module_io/write_istate_info.cpp
@@ -41,7 +41,7 @@ void ModuleIO::write_istate_info(const ModuleBase::matrix &ekb,const ModuleBase:
                           << std::setw(25) << "Kpoint = " << ik_global
                           << std::setw(25) << "(" << kv.kvec_d[ik].x << " " << kv.kvec_d[ik].y
                           << " " << kv.kvec_d[ik].z << ")" << std::endl;
-                    for (int ib = 0; ib < PARAM.inp.nbands; ib++)
+                    for (int ib = 0; ib < PARAM.globalv.nbands_l; ib++)
                     {
                         ofsi2.precision(16);
                         ofsi2 << std::setw(6) << ib + 1 << std::setw(25)
diff --git a/tests/integrate/102_PW_BPCG_BP/INPUT b/tests/integrate/102_PW_BPCG_BP/INPUT
@@ -0,0 +1,33 @@
+INPUT_PARAMETERS
+#Parameters	(General)
+suffix              autotest
+pseudo_dir          ../../PP_ORB
+pw_seed             1
+
+gamma_only          0
+calculation         scf
+symmetry            1
+out_level           ie
+smearing_method     gaussian
+smearing_sigma      0.02
+
+#Parameters (3.PW)
+ecutwfc             40
+scf_thr             1e-7
+scf_nmax            20
+bndpar              2
+
+#Parameters (LCAO)
+basis_type          pw
+ks_solver           bpcg
+device              cpu
+chg_extrap          second-order
+out_dm              0
+pw_diag_thr         0.00001
+
+cal_force           1
+cal_stress          1
+
+mixing_type         broyden
+mixing_beta         0.4
+mixing_gg0          1.5
diff --git a/tests/integrate/102_PW_BPCG_BP/KPT b/tests/integrate/102_PW_BPCG_BP/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Gamma
+2 2 2  0 0 0
diff --git a/tests/integrate/102_PW_BPCG_BP/README b/tests/integrate/102_PW_BPCG_BP/README
@@ -0,0 +1,10 @@
+This test is for:
+*GaAs-deformation
+*PW
+*bndpar 2
+*kpoints 2*2*2
+*dojo pseudopotential (As_dojo.upf, Ga_dojo.upf)
+*smearing_method gaussian
+*ks_solver bpcg
+*mixing_type broyden-kerker
+*mixing_beta 0.4
diff --git a/tests/integrate/102_PW_BPCG_BP/STRU b/tests/integrate/102_PW_BPCG_BP/STRU
@@ -0,0 +1,23 @@
+ATOMIC_SPECIES
+As 1   As_dojo.upf upf201
+Ga 1   Ga_dojo.upf upf201
+
+LATTICE_CONSTANT
+1  // add lattice constant, 10.58 ang
+
+LATTICE_VECTORS
+5.33 5.33  0.0
+0.0  5.33 5.33
+5.33  0.0  5.33
+ATOMIC_POSITIONS
+Direct //Cartesian or Direct coordinate.
+
+As
+0
+1
+0.300000          0.3300000          0.27000000     0 0 0
+
+Ga              //Element Label
+0
+1              //number of atom
+0.00000          0.00000          0.000000     0 0 0
diff --git a/tests/integrate/102_PW_BPCG_BP/result.ref b/tests/integrate/102_PW_BPCG_BP/result.ref
@@ -0,0 +1,8 @@
+etotref -4869.74705201
+etotperatomref -2434.87352600
+totalforceref 5.19522000
+totalstressref 37241.49490600
+pointgroupref C_1
+spacegroupref C_1
+nksibzref 8
+totaltimeref 10.37
diff --git a/tests/integrate/184_PW_BPCG_SDFT_5D10S/INPUT b/tests/integrate/184_PW_BPCG_SDFT_5D10S/INPUT
@@ -0,0 +1,38 @@
+INPUT_PARAMETERS
+#Parameters (1.General)
+suffix			autotest
+calculation     scf
+esolver_type    sdft
+ks_solver       bpcg
+method_sto      1
+
+symmetry		0
+pseudo_dir	../../PP_ORB
+
+bndpar          2
+
+nbands			5
+nbands_sto		11
+
+nche_sto		120
+seed_sto        20000
+kpar            1
+cal_force       1
+cal_stress      1
+
+#Parameters (2.Iteration)
+ecutwfc			20
+scf_thr			1e-4
+scf_nmax		20
+
+
+#Parameters (3.Basis)
+basis_type		pw
+
+#Parameters (4.Smearing)
+smearing_method		fd
+smearing_sigma		0.6
+
+#Parameters (5.Mixing)
+mixing_type		broyden
+mixing_beta		0.4
diff --git a/tests/integrate/184_PW_BPCG_SDFT_5D10S/KPT b/tests/integrate/184_PW_BPCG_SDFT_5D10S/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Gamma
+1 1 1 0 0 0
diff --git a/tests/integrate/184_PW_BPCG_SDFT_5D10S/README b/tests/integrate/184_PW_BPCG_SDFT_5D10S/README
diff --git a/tests/integrate/184_PW_BPCG_SDFT_5D10S/STRU b/tests/integrate/184_PW_BPCG_SDFT_5D10S/STRU
diff --git a/tests/integrate/184_PW_BPCG_SDFT_5D10S/jd b/tests/integrate/184_PW_BPCG_SDFT_5D10S/jd
diff --git a/tests/integrate/184_PW_BPCG_SDFT_5D10S/result.ref b/tests/integrate/184_PW_BPCG_SDFT_5D10S/result.ref

Original file line number	Diff line number	Diff line change
`@@ -73,11 +73,16 @@ class CalAtomsInfo`
`73`	`73`	`if (para.inp.ks_solver == "bpcg") // only bpcg support band parallel`
`74`	`74`	`{`
`75`	`75`	`para.sys.nbands_l = para.inp.nbands / para.inp.bndpar;`
`76`		`- if (GlobalV::RANK_IN_BPGROUP < para.inp.nbands % para.inp.bndpar)`
	`76`	`+ if (GlobalV::MY_BNDGROUP < para.inp.nbands % para.inp.bndpar)`
`77`	`77`	`{`
`78`	`78`	`para.sys.nbands_l++;`
`79`	`79`	`}`
`80`	`80`	`}`
	`81`	`+ // temporary code`
	`82`	`+ if (GlobalV::MY_BNDGROUP == 0 \|\| para.inp.ks_solver == "bpcg")`
	`83`	`+ {`
	`84`	`+ para.sys.ks_run = true;`
	`85`	`+ }`
`81`	`86`	`return;`
`82`	`87`	`}`
`83`	`88`	`};`
Original file line number	Diff line number	Diff line change
`@@ -247,7 +247,7 @@ void ElecState::print_band(const int& ik, const int& printe, const int& iter)`
`247`	`247`	`{`
`248`	`248`	`// check the band energy.`
`249`	`249`	`bool wrong = false;`
`250`		`- for (int ib = 0; ib < PARAM.inp.nbands; ++ib)`
	`250`	`+ for (int ib = 0; ib < PARAM.globalv.nbands_l; ++ib)`
`251`	`251`	`{`
`252`	`252`	`if (std::abs(this->ekb(ik, ib)) > 1.0e10)`
`253`	`253`	`{`
`@@ -269,7 +269,7 @@ void ElecState::print_band(const int& ik, const int& printe, const int& iter)`
`269`	`269`	`GlobalV::ofs_running << " Energy (eV) & Occupations for spin=" << this->klist->isk[ik] + 1`
`270`	`270`	`<< " K-point=" << ik + 1 << std::endl;`
`271`	`271`	`GlobalV::ofs_running << std::setiosflags(std::ios::showpoint);`
`272`		`- for (int ib = 0; ib < PARAM.inp.nbands; ib++)`
	`272`	`+ for (int ib = 0; ib < PARAM.globalv.nbands_l; ib++)`
`273`	`273`	`{`
`274`	`274`	`GlobalV::ofs_running << " " << std::setw(6) << ib + 1 << std::setw(15)`
`275`	`275`	`<< this->ekb(ik, ib) * ModuleBase::Ry_to_eV;`
Original file line number	Diff line number	Diff line change
`@@ -61,15 +61,10 @@ void ReadInput::set_globalv(const Input_para& inp, System_para& sys)`
`61`	`61`	`Parallel_Common::bcast_bool(sys.double_grid);`
`62`	`62`	`#endif`
`63`	`63`	`/// set ks_run`
`64`		`- if (GlobalV::MY_BNDGROUP == 0 \|\| inp.ks_solver == "bpcg")`
`65`		`- {`
`66`		`- sys.ks_run = true;`
`67`		`- }`
`68`	`64`	`if (inp.ks_solver != "bpcg" && inp.bndpar > 1)`
`69`	`65`	`{`
`70`	`66`	`sys.all_ks_run = false;`
`71`	`67`	`}`
`72`		`-`
`73`	`68`	`}`
`74`	`69`
`75`	`70`	`/// @note Here para.inp has been synchronized of all ranks.`
Original file line number	Diff line number	Diff line change
`@@ -41,7 +41,7 @@ void ModuleIO::write_istate_info(const ModuleBase::matrix &ekb,const ModuleBase:`
`41`	`41`	`<< std::setw(25) << "Kpoint = " << ik_global`
`42`	`42`	`<< std::setw(25) << "(" << kv.kvec_d[ik].x << " " << kv.kvec_d[ik].y`
`43`	`43`	`<< " " << kv.kvec_d[ik].z << ")" << std::endl;`
`44`		`- for (int ib = 0; ib < PARAM.inp.nbands; ib++)`
	`44`	`+ for (int ib = 0; ib < PARAM.globalv.nbands_l; ib++)`
`45`	`45`	`{`
`46`	`46`	`ofsi2.precision(16);`
`47`	`47`	`ofsi2 << std::setw(6) << ib + 1 << std::setw(25)`
-Original file line number
+Diff line change
@@ @@ -0,0 +1,4 @@ @@
 +K_POINTS
 +0
 +Gamma
 +2 2 2  0 0 0