diff --git a/source/source_base/parallel_reduce.cpp b/source/source_base/parallel_reduce.cpp index 03535573b7..5de53cbbdd 100644 --- a/source/source_base/parallel_reduce.cpp +++ b/source/source_base/parallel_reduce.cpp @@ -99,6 +99,141 @@ void Parallel_Reduce::reduce_double_diag(double* object, const int n) return; } +template <> +void Parallel_Reduce::reduce_pool(int* object, const int n) +{ +#ifdef __MPI + MPI_Allreduce(MPI_IN_PLACE, object, n, MPI_INT, MPI_SUM, POOL_WORLD); +#endif + return; +} + +void Parallel_Reduce::reduce_or_all(bool& object) +{ +#ifdef __MPI + MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD); +#endif + return; +} + +template <> +void Parallel_Reduce::reduce_max_all(double& object) +{ +#ifdef __MPI + MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); +#endif + return; +} + +template <> +void Parallel_Reduce::reduce_max_all(float& object) +{ +#ifdef __MPI + MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_FLOAT, MPI_MAX, MPI_COMM_WORLD); +#endif + return; +} + +template <> +void Parallel_Reduce::reduce_max_all(int& object) +{ +#ifdef __MPI + MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD); +#endif + return; +} + +template <> +void Parallel_Reduce::reduce_min_all(double& object) +{ +#ifdef __MPI + MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD); +#endif + return; +} + +template <> +void Parallel_Reduce::reduce_min_all(float& object) +{ +#ifdef __MPI + MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD); +#endif + return; +} + +template <> +void Parallel_Reduce::reduce_min_all(int& object) +{ +#ifdef __MPI + MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD); +#endif + return; +} + +void Parallel_Reduce::reduce_max_pool(int* object, const int n) +{ +#ifdef __MPI + MPI_Allreduce(MPI_IN_PLACE, object, 
n, MPI_INT, MPI_MAX, POOL_WORLD); +#endif + return; +} + +void Parallel_Reduce::reduce_min_pool(double& object) +{ +#ifdef __MPI + MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_DOUBLE, MPI_MIN, POOL_WORLD); +#endif + return; +} + +void Parallel_Reduce::reduce_or_bp(bool& object) +{ +#ifdef __MPI + MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_C_BOOL, MPI_LOR, BP_WORLD); +#endif + return; +} + +void Parallel_Reduce::reduce_double_bgroup(double& object) +{ +#ifdef __MPI + MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_DOUBLE, MPI_SUM, INT_BGROUP); +#endif + return; +} + +void Parallel_Reduce::reduce_double_bgroup(double* object, const int n) +{ +#ifdef __MPI + MPI_Allreduce(MPI_IN_PLACE, object, n, MPI_DOUBLE, MPI_SUM, INT_BGROUP); +#endif + return; +} + +void Parallel_Reduce::reduce_double_kp(double* object, const int n) +{ +#ifdef __MPI + MPI_Allreduce(MPI_IN_PLACE, object, n, MPI_DOUBLE, MPI_SUM, KP_WORLD); +#endif + return; +} + +void Parallel_Reduce::reduce_double_bp(double& object) +{ +#ifdef __MPI + MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_DOUBLE, MPI_SUM, BP_WORLD); +#endif + return; +} + +void Parallel_Reduce::reduce_double_bp(double* object, const int n) +{ +#ifdef __MPI + MPI_Allreduce(MPI_IN_PLACE, object, n, MPI_DOUBLE, MPI_SUM, BP_WORLD); +#endif + return; +} + template <> void Parallel_Reduce::reduce_pool(float& object) { diff --git a/source/source_base/parallel_reduce.h b/source/source_base/parallel_reduce.h index 7ab85be1cb..a36911d2c5 100644 --- a/source/source_base/parallel_reduce.h +++ b/source/source_base/parallel_reduce.h @@ -31,6 +31,25 @@ void reduce_int_grid(int* object, const int n); // mohan add 2012-01-12 void reduce_double_grid(double* object, const int n); void reduce_double_diag(double* object, const int n); +void reduce_or_all(bool& object); +template +void reduce_max_all(T& object); +template +void reduce_min_all(T& object); + +void reduce_max_pool(int* object, const int n); +void reduce_min_pool(double& object); + +void 
reduce_or_bp(bool& object); + +void reduce_double_bgroup(double& object); +void reduce_double_bgroup(double* object, const int n); + +void reduce_double_bp(double& object); +void reduce_double_bp(double* object, const int n); + +void reduce_double_kp(double* object, const int n); + void reduce_double_allpool(const int& npool, const int& nproc_in_pool, double& object); void reduce_double_allpool(const int& npool, const int& nproc_in_pool, double* object, const int n); 
diff --git a/source/source_basis/module_pw/test/depend_mock.cpp b/source/source_basis/module_pw/test/depend_mock.cpp index 4fdcfd5f4a..99518ca7c3 100644 --- a/source/source_basis/module_pw/test/depend_mock.cpp +++ b/source/source_basis/module_pw/test/depend_mock.cpp @@ -2,6 +2,7 @@ #include "mpi.h" #endif #include "depend_mock.h" +#include namespace GlobalV { @@ -11,14 +12,83 @@ namespace GlobalV MPI_Comm POOL_WORLD; namespace Parallel_Reduce { - template void reduce_all(T& object) { return; }; - template void reduce_pool(T& object) { return; }; + template void reduce_all(T& object); + template void reduce_all(T* object, const int n); + template void reduce_pool(T& object); + template void reduce_pool(T* object, const int n); template<> - void reduce_all(double& object) { return; }; + void reduce_all(int& 
object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); } template<> - void reduce_pool(double& object) { return; }; + void reduce_all(long long& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); } template<> - void reduce_pool(float& object) { return; }; + void reduce_all(double& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); } + template<> + void reduce_all(float& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); } + template<> + void reduce_all>(std::complex& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD); } + template<> + void reduce_all>(std::complex& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_C_FLOAT_COMPLEX, MPI_SUM, MPI_COMM_WORLD); } + + template<> + void reduce_all(int* object, const int n) { MPI_Allreduce(MPI_IN_PLACE, object, n, MPI_INT, MPI_SUM, MPI_COMM_WORLD); } + template<> + void reduce_all(long long* object, const int n) { MPI_Allreduce(MPI_IN_PLACE, object, n, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); } + template<> + void reduce_all(double* object, const int n) { MPI_Allreduce(MPI_IN_PLACE, object, n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); } + template<> + void reduce_all>(std::complex* object, const int n) { MPI_Allreduce(MPI_IN_PLACE, object, n, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD); } + template<> + void reduce_all>(std::complex* object, const int n) { MPI_Allreduce(MPI_IN_PLACE, object, n, MPI_C_FLOAT_COMPLEX, MPI_SUM, MPI_COMM_WORLD); } + + template<> + void reduce_pool(float& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_FLOAT, MPI_SUM, POOL_WORLD); } + template<> + void reduce_pool(double& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_DOUBLE, MPI_SUM, POOL_WORLD); } + template<> + void reduce_pool>(std::complex& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, POOL_WORLD); } + + 
template<> + void reduce_pool(int* object, const int n) { MPI_Allreduce(MPI_IN_PLACE, object, n, MPI_INT, MPI_SUM, POOL_WORLD); } + template<> + void reduce_pool(double* object, const int n) { MPI_Allreduce(MPI_IN_PLACE, object, n, MPI_DOUBLE, MPI_SUM, POOL_WORLD); } + + void reduce_max_pool(int* object, const int n) { MPI_Allreduce(MPI_IN_PLACE, object, n, MPI_INT, MPI_MAX, POOL_WORLD); } + void reduce_min_pool(double& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_DOUBLE, MPI_MIN, POOL_WORLD); } + + // Other stubs can remain as is if not used or if they don't break logic + void reduce_or_all(bool& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD); } + + template + void reduce_max_all(T& object); + template<> void reduce_max_all(double& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); } + template<> void reduce_max_all(float& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_FLOAT, MPI_MAX, MPI_COMM_WORLD); } + template<> void reduce_max_all(int& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD); } + + template + void reduce_min_all(T& object); + template<> void reduce_min_all(double& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD); } + template<> void reduce_min_all(float& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD); } + template<> void reduce_min_all(int& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD); } + + void reduce_or_bp(bool& object) { return; }; + + void reduce_double_bgroup(double& object) { return; }; + void reduce_double_bgroup(double* object, const int n) { return; }; + + void reduce_double_bp(double& object) { return; }; + void reduce_double_bp(double* object, const int n) { return; }; + + void reduce_double_kp(double* object, const int n) { return; }; + + void reduce_double_allpool(const int& npool, const int& 
nproc_in_pool, double& object) { return; }; + void reduce_double_allpool(const int& npool, const int& nproc_in_pool, double* object, const int n) { return; }; + + void gather_min_int_all(const int& nproc, int& v) { return; }; + void gather_max_double_all(const int& nproc, double& v) { return; }; + void gather_min_double_all(const int& nproc, double& v) { return; }; + void gather_max_double_pool(const int& nproc_in_pool, double& v) { return; }; + void gather_min_double_pool(const int& nproc_in_pool, double& v) { return; }; + void gather_int_all(int& v, int* all) { return; }; } #endif \ No newline at end of file diff --git a/source/source_cell/parallel_kpoints.cpp b/source/source_cell/parallel_kpoints.cpp index 2ca14090fb..dad4aa8637 100644 --- a/source/source_cell/parallel_kpoints.cpp +++ b/source/source_cell/parallel_kpoints.cpp @@ -2,6 +2,7 @@ #include "source_base/parallel_common.h" #include "source_base/parallel_global.h" +#include "source_base/parallel_reduce.h" // the kpoints here are reduced after symmetry applied. 
void Parallel_Kpoints::kinfo(int& nkstot_in, @@ -123,8 +124,7 @@ void Parallel_Kpoints::gatherkvec(const std::vector> vec_global[i + startk_pool[this->my_pool]] = vec_local[i]; } } - - MPI_Allreduce(MPI_IN_PLACE, &vec_global[0], 3 * this->nkstot_np, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + Parallel_Reduce::reduce_all(reinterpret_cast(vec_global.data()), 3 * this->nkstot_np); return; } #endif diff --git a/source/source_estate/module_charge/charge_mpi.cpp b/source/source_estate/module_charge/charge_mpi.cpp index 32fc8bc195..b13ed5ab0b 100644 --- a/source/source_estate/module_charge/charge_mpi.cpp +++ b/source/source_estate/module_charge/charge_mpi.cpp @@ -30,7 +30,7 @@ void Charge::reduce_diff_pools(double* array_rho) const ModuleBase::timer::tick("Charge", "reduce_diff_pools"); if (KP_WORLD != MPI_COMM_NULL) { - MPI_Allreduce(MPI_IN_PLACE, array_rho, this->nrxx, MPI_DOUBLE, MPI_SUM, KP_WORLD); + Parallel_Reduce::reduce_double_kp(array_rho, this->nrxx); } else { @@ -111,7 +111,7 @@ void Charge::reduce_diff_pools(double* array_rho) const } if(PARAM.globalv.all_ks_run && PARAM.inp.bndpar > 1) { - MPI_Allreduce(MPI_IN_PLACE, array_rho, this->nrxx, MPI_DOUBLE, MPI_SUM, BP_WORLD); + Parallel_Reduce::reduce_double_bp(array_rho, this->nrxx); } ModuleBase::timer::tick("Charge", "reduce_diff_pools"); } diff --git a/source/source_estate/module_charge/symmetry_rhog.cpp b/source/source_estate/module_charge/symmetry_rhog.cpp index aae2103b75..8292c93acf 100644 --- a/source/source_estate/module_charge/symmetry_rhog.cpp +++ b/source/source_estate/module_charge/symmetry_rhog.cpp @@ -1,6 +1,7 @@ #include "symmetry_rho.h" #include "source_pw/module_pwdft/global.h" #include "source_base/parallel_global.h" +#include "source_base/parallel_reduce.h" #include "source_hamilt/module_xc/xc_functional.h" @@ -10,7 +11,7 @@ void Symmetry_rho::psymmg(std::complex* rhog_part, const ModulePW::PW_Ba int * fftixy2is = new int [rho_basis->fftnxy]; rho_basis->getfftixy2is(fftixy2is); //current proc 
#ifdef __MPI - MPI_Allreduce(MPI_IN_PLACE, fftixy2is, rho_basis->fftnxy, MPI_INT, MPI_SUM, POOL_WORLD); + Parallel_Reduce::reduce_pool(fftixy2is, rho_basis->fftnxy); if(rho_basis->poolnproc>1) for (int i=0;ifftnxy;++i) fftixy2is[i]+=rho_basis->poolnproc-1; diff --git a/source/source_hsolver/diago_bpcg.cpp b/source/source_hsolver/diago_bpcg.cpp index d4db3d790b..71667c1f3f 100644 --- a/source/source_hsolver/diago_bpcg.cpp +++ b/source/source_hsolver/diago_bpcg.cpp @@ -4,6 +4,7 @@ #include "source_base/global_function.h" #include "source_base/kernels/math_kernel_op.h" #include "source_base/parallel_comm.h" // different MPI worlds +#include "source_base/parallel_reduce.h" #include "source_hsolver/kernels/bpcg_kernel_op.h" #include "para_linear_transform.h" @@ -86,7 +87,7 @@ bool DiagoBPCG::test_error(const ct::Tensor& err_in, const std::vecto } } #ifdef __MPI - MPI_Allreduce(MPI_IN_PLACE, ¬_conv, 1, MPI_C_BOOL, MPI_LOR, BP_WORLD); + Parallel_Reduce::reduce_or_bp(not_conv); #endif return not_conv; } diff --git a/source/source_io/output_log.cpp b/source/source_io/output_log.cpp index 7a4471b0a6..94ac774ea4 100644 --- a/source/source_io/output_log.cpp +++ b/source/source_io/output_log.cpp @@ -6,6 +6,7 @@ #include "source_base/global_variable.h" #include "source_base/parallel_comm.h" +#include "source_base/parallel_reduce.h" #ifdef __MPI #include @@ -154,7 +155,7 @@ void output_vacuum_level(const UnitCell* ucell, } #ifdef __MPI - MPI_Allreduce(MPI_IN_PLACE, ave, length, MPI_DOUBLE, MPI_SUM, POOL_WORLD); + Parallel_Reduce::reduce_pool(ave, length); #endif int surface = nxyz / length; diff --git a/source/source_io/write_eig_occ.cpp b/source/source_io/write_eig_occ.cpp index 4b13d885b9..e0271535f8 100644 --- a/source/source_io/write_eig_occ.cpp +++ b/source/source_io/write_eig_occ.cpp @@ -5,6 +5,7 @@ #include "source_base/global_variable.h" #include "source_base/timer.h" #include "source_base/parallel_comm.h" // use POOL_WORLD +#include "source_base/parallel_reduce.h" #ifdef 
__MPI #include // use MPI_Barrier @@ -192,7 +193,7 @@ void ModuleIO::write_eig_file(const ModuleBase::matrix &ekb, } #ifdef __MPI - MPI_Allreduce(MPI_IN_PLACE, &wrong, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD); + Parallel_Reduce::reduce_or_all(wrong); #endif if (wrong) { diff --git a/source/source_lcao/module_lr/utils/lr_util.hpp b/source/source_lcao/module_lr/utils/lr_util.hpp index 0ef1280c99..f8414027b3 100644 --- a/source/source_lcao/module_lr/utils/lr_util.hpp +++ b/source/source_lcao/module_lr/utils/lr_util.hpp @@ -4,6 +4,8 @@ #include #include "source_cell/unitcell.h" #include "source_base/constants.h" +#include "source_base/parallel_reduce.h" +#include "source_base/parallel_device.h" #include "source_hamilt/module_xc/xc_functional.h" namespace LR_Util { @@ -172,7 +174,7 @@ namespace LR_Util } //reduce to root - MPI_Allreduce(MPI_IN_PLACE, fullmat, global_nrow * global_ncol, get_mpi_datatype(), MPI_SUM, pv.comm()); + Parallel_Common::reduce_dev(fullmat, global_nrow * global_ncol, pv.comm()); }; #endif diff --git a/source/source_lcao/module_operator_lcao/op_exx_lcao.hpp b/source/source_lcao/module_operator_lcao/op_exx_lcao.hpp index 80f0c422a2..48065e80c8 100644 --- a/source/source_lcao/module_operator_lcao/op_exx_lcao.hpp +++ b/source/source_lcao/module_operator_lcao/op_exx_lcao.hpp @@ -6,6 +6,7 @@ #include "source_io/module_parameter/parameter.h" #include "source_lcao/module_ri/RI_2D_Comm.h" #include "source_pw/module_pwdft/global.h" +#include "source_base/parallel_reduce.h" #include "source_hamilt/module_xc/xc_functional.h" #include "source_io/restart_exx_csr.h" @@ -245,7 +246,7 @@ OperatorEXX>::OperatorEXX(HS_Matrix_K* hsk_in, // Add MPI communication to synchronize all_exist across processes #ifdef __MPI // don't read in any files if one of the processes doesn't have it - MPI_Allreduce(MPI_IN_PLACE, &all_exist, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD); + Parallel_Reduce::reduce_min_all(all_exist); #endif if (all_exist) { @@ -264,7 +265,7 @@ 
OperatorEXX>::OperatorEXX(HS_Matrix_K* hsk_in, std::ifstream ifs(restart_HR_path_cereal, std::ios::binary); int all_exist_cereal = ifs ? 1 : 0; #ifdef __MPI - MPI_Allreduce(MPI_IN_PLACE, &all_exist_cereal, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD); + Parallel_Reduce::reduce_min_all(all_exist_cereal); #endif if (!all_exist_cereal) { diff --git a/source/source_pw/module_pwdft/VNL_in_pw.cpp b/source/source_pw/module_pwdft/VNL_in_pw.cpp index 438358c4ae..c007a67c71 100644 --- a/source/source_pw/module_pwdft/VNL_in_pw.cpp +++ b/source/source_pw/module_pwdft/VNL_in_pw.cpp @@ -10,6 +10,7 @@ #include "source_base/math_ylmreal.h" #include "source_base/memory.h" #include "source_base/module_device/device.h" +#include "source_base/parallel_reduce.h" #include "source_base/timer.h" #include "source_pw/module_pwdft/global.h" #include "source_pw/module_pwdft/kernels/vnl_op.h" @@ -683,8 +684,8 @@ void pseudopot_cell_vnl::init_vnl(UnitCell& cell, const ModulePW::PW_Basis* rho_ } #ifdef __MPI - MPI_Allreduce(MPI_IN_PLACE, this->qq_nt.ptr, this->qq_nt.getSize(), MPI_DOUBLE, MPI_SUM, POOL_WORLD); - MPI_Allreduce(MPI_IN_PLACE, this->qq_so.ptr, this->qq_so.getSize(), MPI_DOUBLE_COMPLEX, MPI_SUM, POOL_WORLD); + Parallel_Reduce::reduce_pool(this->qq_nt.ptr, this->qq_nt.getSize()); + Parallel_Reduce::reduce_pool(this->qq_so.ptr, this->qq_so.getSize()); #endif // set the atomic specific qq_at matrices @@ -1510,7 +1511,7 @@ void pseudopot_cell_vnl::newq(const ModuleBase::matrix& veff, const ModulePW::PW } #ifdef __MPI - MPI_Allreduce(MPI_IN_PLACE, deeq.ptr, deeq.getSize(), MPI_DOUBLE, MPI_SUM, POOL_WORLD); + Parallel_Reduce::reduce_pool(deeq.ptr, deeq.getSize()); #endif delete[] qnorm; diff --git a/source/source_pw/module_pwdft/elecond.cpp b/source/source_pw/module_pwdft/elecond.cpp index 068ca01067..757259b31c 100644 --- a/source/source_pw/module_pwdft/elecond.cpp +++ b/source/source_pw/module_pwdft/elecond.cpp @@ -4,6 +4,7 @@ #include "source_base/global_variable.h" #include 
"source_base/kernels/math_kernel_op.h" #include "source_base/parallel_device.h" +#include "source_base/parallel_reduce.h" #include "source_estate/occupy.h" #include "source_io/binstream.h" #include "source_io/module_parameter/parameter.h" @@ -93,9 +94,9 @@ void EleCond::KG(const int& smear_type, jjresponse_ks(ik, nt, dt, decut, wg, velop, ct11.data(), ct12.data(), ct22.data()); } #ifdef __MPI - MPI_Allreduce(MPI_IN_PLACE, ct11.data(), nt, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); - MPI_Allreduce(MPI_IN_PLACE, ct12.data(), nt, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); - MPI_Allreduce(MPI_IN_PLACE, ct22.data(), nt, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + Parallel_Reduce::reduce_all(ct11.data(), nt); + Parallel_Reduce::reduce_all(ct12.data(), nt); + Parallel_Reduce::reduce_all(ct22.data(), nt); #endif //------------------------------------------------------------------ // Output diff --git a/source/source_pw/module_pwdft/setup_pwwfc.cpp b/source/source_pw/module_pwdft/setup_pwwfc.cpp index 759178638c..d2f7cfba9c 100644 --- a/source/source_pw/module_pwdft/setup_pwwfc.cpp +++ b/source/source_pw/module_pwdft/setup_pwwfc.cpp @@ -1,5 +1,6 @@ #include "source_pw/module_pwdft/setup_pwwfc.h" // pw_wfc #include "source_base/parallel_comm.h" // POOL_WORLD +#include "source_base/parallel_reduce.h" #include "source_io/print_info.h" // print information void pw::teardown_pwwfc(ModulePW::PW_Basis_K* &pw_wfc) @@ -56,7 +57,7 @@ void pw::setup_pwwfc(const Input_para& inp, #ifdef __MPI if (inp.pw_seed > 0) { - MPI_Allreduce(MPI_IN_PLACE, &pw_wfc->ggecut, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); + Parallel_Reduce::reduce_max_all(pw_wfc->ggecut); } // qianrui add 2021-8-13 to make different kpar parameters can get the same // results diff --git a/source/source_pw/module_stodft/sto_dos.cpp b/source/source_pw/module_stodft/sto_dos.cpp index dd90224e15..0326924d41 100644 --- a/source/source_pw/module_stodft/sto_dos.cpp +++ b/source/source_pw/module_stodft/sto_dos.cpp @@ -1,5 +1,6 @@ #include 
"sto_dos.h" +#include "source_base/parallel_reduce.h" #include "source_base/timer.h" #include "source_base/tool_title.h" #include "source_io/module_parameter/parameter.h" @@ -234,9 +235,9 @@ void Sto_DOS::caldos(const double sigmain, const double de, cons error[ie] = tmperror; } #ifdef __MPI - MPI_Allreduce(MPI_IN_PLACE, ks_dos.data(), ndos, MPI_DOUBLE, MPI_SUM, INT_BGROUP); - MPI_Allreduce(MPI_IN_PLACE, sto_dos.data(), ndos, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); - MPI_Allreduce(MPI_IN_PLACE, error.data(), ndos, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + Parallel_Reduce::reduce_double_bgroup(ks_dos.data(), ndos); + Parallel_Reduce::reduce_all(sto_dos.data(), ndos); + Parallel_Reduce::reduce_all(error.data(), ndos); #endif if (GlobalV::MY_RANK == 0) { diff --git a/source/source_pw/module_stodft/sto_elecond.cpp b/source/source_pw/module_stodft/sto_elecond.cpp index b0fe4d71d7..9f9bdb1de7 100644 --- a/source/source_pw/module_stodft/sto_elecond.cpp +++ b/source/source_pw/module_stodft/sto_elecond.cpp @@ -5,6 +5,7 @@ #include "source_base/memory.h" #include "source_base/module_container/ATen/tensor.h" #include "source_base/parallel_device.h" +#include "source_base/parallel_reduce.h" #include "source_base/timer.h" #include "source_base/vector3.h" #include "source_io/module_parameter/parameter.h" @@ -1059,9 +1060,9 @@ void Sto_EleCond::sKG(const int& smear_type, } // ik loop ModuleBase::timer::tick("Sto_EleCond", "kloop"); #ifdef __MPI - MPI_Allreduce(MPI_IN_PLACE, ct11.data(), nt, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); - MPI_Allreduce(MPI_IN_PLACE, ct12.data(), nt, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); - MPI_Allreduce(MPI_IN_PLACE, ct22.data(), nt, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + Parallel_Reduce::reduce_all(ct11.data(), nt); + Parallel_Reduce::reduce_all(ct12.data(), nt); + Parallel_Reduce::reduce_all(ct22.data(), nt); #endif //------------------------------------------------------------------ diff --git a/source/source_pw/module_stodft/sto_iter.cpp 
b/source/source_pw/module_stodft/sto_iter.cpp index aa9990a415..3d35b7d458 100644 --- a/source/source_pw/module_stodft/sto_iter.cpp +++ b/source/source_pw/module_stodft/sto_iter.cpp @@ -204,9 +204,9 @@ void Stochastic_Iter::checkemm(const int& ik, if (ik == nks - 1) { #ifdef __MPI - MPI_Allreduce(MPI_IN_PLACE, p_hamilt_sto->emax, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); - MPI_Allreduce(MPI_IN_PLACE, p_hamilt_sto->emin, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD); + Parallel_Reduce::reduce_max_all(*p_hamilt_sto->emax); + Parallel_Reduce::reduce_min_all(*p_hamilt_sto->emin); MPI_Allreduce(MPI_IN_PLACE, &change, 1, MPI_CHAR, MPI_LOR, MPI_COMM_WORLD); #endif if (change) { @@ -249,7 +249,7 @@ void Stochastic_Iter::check_precision(const double ref, const double } #ifdef __MPI - MPI_Allreduce(MPI_IN_PLACE, &error, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + Parallel_Reduce::reduce_all(error); #endif double relative_error = std::abs(error / ref); GlobalV::ofs_running << info << "Relative Chebyshev Precision: " << relative_error * 1e9 << "E-09" << std::endl; @@ -468,12 +468,12 @@ double Stochastic_Iter::calne(elecstate::ElecState* pes) } KS_ne /= GlobalV::NPROC_IN_POOL; #ifdef __MPI - MPI_Allreduce(MPI_IN_PLACE, &KS_ne, 1, MPI_DOUBLE, MPI_SUM, INT_BGROUP); + Parallel_Reduce::reduce_double_bgroup(KS_ne); if(PARAM.globalv.all_ks_run) { - MPI_Allreduce(MPI_IN_PLACE, &KS_ne, 1, MPI_DOUBLE, MPI_SUM, BP_WORLD); + Parallel_Reduce::reduce_double_bp(KS_ne); } - MPI_Allreduce(MPI_IN_PLACE, &sto_ne, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + Parallel_Reduce::reduce_all(sto_ne); #endif totne = KS_ne + sto_ne; @@ -535,12 +535,12 @@ void Stochastic_Iter::sum_stoeband(Stochastic_WF& stowf, } pes->f_en.demet /= GlobalV::NPROC_IN_POOL; #ifdef __MPI - MPI_Allreduce(MPI_IN_PLACE, &pes->f_en.demet, 1, MPI_DOUBLE, MPI_SUM, INT_BGROUP); + Parallel_Reduce::reduce_double_bgroup(pes->f_en.demet); if(PARAM.globalv.all_ks_run) { - MPI_Allreduce(MPI_IN_PLACE, &pes->f_en.demet, 1, MPI_DOUBLE, MPI_SUM, 
BP_WORLD); + Parallel_Reduce::reduce_double_bp(pes->f_en.demet); } - MPI_Allreduce(MPI_IN_PLACE, &stodemet, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + Parallel_Reduce::reduce_all(stodemet); #endif pes->f_en.demet += stodemet; this->check_precision(pes->f_en.demet, 1e-4, "TS"); @@ -581,7 +581,7 @@ void Stochastic_Iter::sum_stoeband(Stochastic_WF& stowf, } } #ifdef __MPI - MPI_Allreduce(MPI_IN_PLACE, &sto_eband, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + Parallel_Reduce::reduce_all(sto_eband); #endif pes->f_en.eband += sto_eband; ModuleBase::timer::tick("Stochastic_Iter", "sum_stoeband"); @@ -673,7 +673,7 @@ void Stochastic_Iter::cal_storho(const UnitCell& ucell, pes->charge->reduce_diff_pools(sto_rho[is]); if (!PARAM.globalv.all_ks_run && PARAM.inp.bndpar > 1) { - MPI_Allreduce(MPI_IN_PLACE, sto_rho[is], nrxx, MPI_DOUBLE, MPI_SUM, BP_WORLD); + Parallel_Reduce::reduce_double_bp(sto_rho[is], nrxx); } } } @@ -695,7 +695,7 @@ void Stochastic_Iter::cal_storho(const UnitCell& ucell, sto_ne *= ucell.omega / wfc_basis->nxyz; #ifdef __MPI - MPI_Allreduce(MPI_IN_PLACE, &sto_ne, 1, MPI_DOUBLE, MPI_SUM, POOL_WORLD); + Parallel_Reduce::reduce_pool(sto_ne); #endif double factor = targetne / (KS_ne + sto_ne); if (std::abs(factor - 1) > 1e-10) diff --git a/source/source_pw/module_stodft/sto_tool.cpp b/source/source_pw/module_stodft/sto_tool.cpp index de1e72e3f1..95391611b2 100644 --- a/source/source_pw/module_stodft/sto_tool.cpp +++ b/source/source_pw/module_stodft/sto_tool.cpp @@ -2,6 +2,7 @@ #include "source_base/math_chebyshev.h" #include "source_base/parallel_device.h" +#include "source_base/parallel_reduce.h" #include "source_base/timer.h" #include "source_io/module_parameter/parameter.h" #ifdef __MPI @@ -103,8 +104,8 @@ void check_che_op::operator()(const int& nche_in, if (ik == nk - 1) { #ifdef __MPI - MPI_Allreduce(MPI_IN_PLACE, p_hamilt_sto->emax, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); - MPI_Allreduce(MPI_IN_PLACE, p_hamilt_sto->emin, 1, MPI_DOUBLE, MPI_MIN, 
MPI_COMM_WORLD); + Parallel_Reduce::reduce_max_all(*p_hamilt_sto->emax); + Parallel_Reduce::reduce_min_all(*p_hamilt_sto->emin); #endif GlobalV::ofs_running << "New Emax " << *p_hamilt_sto->emax << " Ry; new Emin " << *p_hamilt_sto->emin << " Ry" << std::endl;