diff --git a/source/source_base/parallel_reduce.cpp b/source/source_base/parallel_reduce.cpp
index 03535573b7..5de53cbbdd 100644
--- a/source/source_base/parallel_reduce.cpp
+++ b/source/source_base/parallel_reduce.cpp
@@ -1,3 +1,4 @@
+// Definitions of the Parallel_Reduce MPI reduction helpers declared in parallel_reduce.h.
 #include "parallel_reduce.h"
 
 #include "parallel_comm.h"
@@ -99,6 +100,141 @@ void Parallel_Reduce::reduce_double_diag(double* object, const int n)
     return;
 }
 
+template <>
+void Parallel_Reduce::reduce_pool<int>(int* object, const int n) // in-place element-wise sum of n ints across POOL_WORLD
+{
+#ifdef __MPI // without __MPI nothing is done and object keeps its local values
+    MPI_Allreduce(MPI_IN_PLACE, object, n, MPI_INT, MPI_SUM, POOL_WORLD);
+#endif
+    return;
+}
+
+void Parallel_Reduce::reduce_or_all(bool& object) // in-place logical OR of object across MPI_COMM_WORLD
+{
+#ifdef __MPI // without __MPI nothing is done and object keeps its local value
+    MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD); // NOTE(review): MPI_C_BOOL is the handle for C _Bool; MPI_CXX_BOOL is the one for C++ bool -- confirm this is intended
+#endif
+    return;
+}
+
+template <>
+void Parallel_Reduce::reduce_max_all<double>(double& object) // in-place maximum of object across MPI_COMM_WORLD
+{
+#ifdef __MPI // without __MPI nothing is done and object keeps its local value
+    MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+#endif
+    return;
+}
+
+template <>
+void Parallel_Reduce::reduce_max_all<float>(float& object) // in-place maximum of object across MPI_COMM_WORLD
+{
+#ifdef __MPI // without __MPI nothing is done and object keeps its local value
+    MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_FLOAT, MPI_MAX, MPI_COMM_WORLD);
+#endif
+    return;
+}
+
+template <>
+void Parallel_Reduce::reduce_max_all<int>(int& object) // in-place maximum of object across MPI_COMM_WORLD
+{
+#ifdef __MPI // without __MPI nothing is done and object keeps its local value
+    MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);
+#endif
+    return;
+}
+
+template <>
+void Parallel_Reduce::reduce_min_all<double>(double& object) // in-place minimum of object across MPI_COMM_WORLD
+{
+#ifdef __MPI // without __MPI nothing is done and object keeps its local value
+    MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
+#endif
+    return;
+}
+
+template <>
+void Parallel_Reduce::reduce_min_all<float>(float& object) // in-place minimum of object across MPI_COMM_WORLD
+{
+#ifdef __MPI // without __MPI nothing is done and object keeps its local value
+    MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);
+#endif
+    return;
+}
+
+template <>
+void Parallel_Reduce::reduce_min_all<int>(int& object) // in-place minimum of object across MPI_COMM_WORLD
+{
+#ifdef __MPI // without __MPI nothing is done and object keeps its local value
+    MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
+#endif
+    return;
+}
+
+void Parallel_Reduce::reduce_max_pool(int* object, const int n) // in-place element-wise maximum of n ints across POOL_WORLD
+{
+#ifdef __MPI // without __MPI nothing is done and object keeps its local values
+    MPI_Allreduce(MPI_IN_PLACE, object, n, MPI_INT, MPI_MAX, POOL_WORLD);
+#endif
+    return;
+}
+
+void Parallel_Reduce::reduce_min_pool(double& object) // in-place minimum of object across POOL_WORLD
+{
+#ifdef __MPI // without __MPI nothing is done and object keeps its local value
+    MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_DOUBLE, MPI_MIN, POOL_WORLD);
+#endif
+    return;
+}
+
+void Parallel_Reduce::reduce_or_bp(bool& object) // in-place logical OR of object across BP_WORLD
+{
+#ifdef __MPI // without __MPI nothing is done and object keeps its local value
+    MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_C_BOOL, MPI_LOR, BP_WORLD); // NOTE(review): MPI_C_BOOL is the handle for C _Bool; MPI_CXX_BOOL is the one for C++ bool -- confirm this is intended
+#endif
+    return;
+}
+
+void Parallel_Reduce::reduce_double_bgroup(double& object) // in-place sum of object across INT_BGROUP
+{
+#ifdef __MPI // without __MPI nothing is done and object keeps its local value
+    MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_DOUBLE, MPI_SUM, INT_BGROUP);
+#endif
+    return;
+}
+
+void Parallel_Reduce::reduce_double_bgroup(double* object, const int n) // in-place element-wise sum of n doubles across INT_BGROUP
+{
+#ifdef __MPI // without __MPI nothing is done and object keeps its local values
+    MPI_Allreduce(MPI_IN_PLACE, object, n, MPI_DOUBLE, MPI_SUM, INT_BGROUP);
+#endif
+    return;
+}
+
+void Parallel_Reduce::reduce_double_kp(double* object, const int n) // in-place element-wise sum of n doubles across KP_WORLD
+{
+#ifdef __MPI // without __MPI nothing is done and object keeps its local values
+    MPI_Allreduce(MPI_IN_PLACE, object, n, MPI_DOUBLE, MPI_SUM, KP_WORLD); // KP_WORLD is not checked against MPI_COMM_NULL here
+#endif
+    return;
+}
+
+void Parallel_Reduce::reduce_double_bp(double& object) // in-place sum of object across BP_WORLD
+{
+#ifdef __MPI // without __MPI nothing is done and object keeps its local value
+    MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_DOUBLE, MPI_SUM, BP_WORLD);
+#endif
+    return;
+}
+
+void Parallel_Reduce::reduce_double_bp(double* object, const int n) // in-place element-wise sum of n doubles across BP_WORLD
+{
+#ifdef __MPI // without __MPI nothing is done and object keeps its local values
+    MPI_Allreduce(MPI_IN_PLACE, object, n, MPI_DOUBLE, MPI_SUM, BP_WORLD);
+#endif
+    return;
+}
+
 template <>
 void Parallel_Reduce::reduce_pool<float>(float& object)
 {
diff --git a/source/source_base/parallel_reduce.h b/source/source_base/parallel_reduce.h
index 7ab85be1cb..a36911d2c5 100644
--- a/source/source_base/parallel_reduce.h
+++ b/source/source_base/parallel_reduce.h
@@ -31,6 +31,25 @@ void reduce_int_grid(int* object, const int n); // mohan add 2012-01-12
 void reduce_double_grid(double* object, const int n);
 void reduce_double_diag(double* object, const int n);
 
+void reduce_or_all(bool& object); // in-place logical OR across MPI_COMM_WORLD
+template <typename T>
+void reduce_max_all(T& object); // in-place maximum across MPI_COMM_WORLD; specialized for double, float and int
+template <typename T>
+void reduce_min_all(T& object); // in-place minimum across MPI_COMM_WORLD; specialized for double, float and int
+
+void reduce_max_pool(int* object, const int n); // in-place element-wise maximum of n ints across POOL_WORLD
+void reduce_min_pool(double& object); // in-place minimum across POOL_WORLD
+
+void reduce_or_bp(bool& object); // in-place logical OR across BP_WORLD
+
+void reduce_double_bgroup(double& object); // in-place sum across INT_BGROUP
+void reduce_double_bgroup(double* object, const int n); // in-place element-wise sum of n doubles across INT_BGROUP
+
+void reduce_double_bp(double& object); // in-place sum across BP_WORLD
+void reduce_double_bp(double* object, const int n); // in-place element-wise sum of n doubles across BP_WORLD
+
+void reduce_double_kp(double* object, const int n); // in-place element-wise sum of n doubles across KP_WORLD
+
 void reduce_double_allpool(const int& npool, const int& nproc_in_pool, double& object);
 void reduce_double_allpool(const int& npool, const int& nproc_in_pool, double* object, const int n);
 
diff --git a/source/source_basis/module_pw/pw_basis_big.h b/source/source_basis/module_pw/pw_basis_big.h
index 2a04720877..b3e48cf698 100644
--- a/source/source_basis/module_pw/pw_basis_big.h
+++ b/source/source_basis/module_pw/pw_basis_big.h
@@ -2,6 +2,7 @@
 #define PW_BASIS_BIG_H
 #include "source_base/constants.h"
 #include "source_base/global_function.h"
+#include "source_base/parallel_reduce.h"
 #ifdef __MPI
 #include "mpi.h"
 #endif
@@ -167,7 +168,7 @@ class PW_Basis_Big : public PW_Basis_Sup
     ibox[1] = 2*n2+1;
     ibox[2] = 2*n3+1;
 #ifdef __MPI
-    MPI_Allreduce(MPI_IN_PLACE, ibox, 3, MPI_INT, MPI_MAX , this->pool_world);
+    Parallel_Reduce::reduce_max_pool(ibox, 3); // NOTE(review): now reduces over the global POOL_WORLD instead of this->pool_world -- confirm the two are always the same communicator
 #endif
 
     // Find the minimal FFT box size the factors into the primes (2,3,5,7).
@@ -350,7 +351,7 @@ class PW_Basis_Big : public PW_Basis_Sup
             }
         }
 #ifdef __MPI
-        MPI_Allreduce(MPI_IN_PLACE, &this->gridecut_lat, 1, MPI_DOUBLE, MPI_MIN , this->pool_world);
+        Parallel_Reduce::reduce_min_pool(this->gridecut_lat); // NOTE(review): now reduces over the global POOL_WORLD instead of this->pool_world -- confirm the two are always the same communicator
 #endif
         this->gridecut_lat -= 1e-6;
 
diff --git a/source/source_basis/module_pw/pw_init.cpp b/source/source_basis/module_pw/pw_init.cpp
index 08c676d39f..f9cfafa5f1 100644
--- a/source/source_basis/module_pw/pw_init.cpp
+++ b/source/source_basis/module_pw/pw_init.cpp
@@ -1,5 +1,6 @@
 #include "pw_basis.h"
 #include "source_base/constants.h"
+#include "source_base/parallel_reduce.h"
 
 namespace ModulePW
 {
@@ -86,7 +87,7 @@ void PW_Basis:: initgrids(
     ibox[1] = 2*n2+1;
     ibox[2] = 2*n3+1;
 #ifdef __MPI
-    MPI_Allreduce(MPI_IN_PLACE, ibox, 3, MPI_INT, MPI_MAX , this->pool_world);
+    Parallel_Reduce::reduce_max_pool(ibox, 3); // NOTE(review): now reduces over the global POOL_WORLD instead of this->pool_world -- confirm the two are always the same communicator
 #endif
 
     // Find the minimal FFT box size the factors into the primes (2,3,5,7).
@@ -200,7 +201,7 @@ void PW_Basis:: initgrids(
         }
     }
 #ifdef __MPI
-    MPI_Allreduce(MPI_IN_PLACE, &this->gridecut_lat, 1, MPI_DOUBLE, MPI_MIN , this->pool_world);
+    Parallel_Reduce::reduce_min_pool(this->gridecut_lat); // NOTE(review): now reduces over the global POOL_WORLD instead of this->pool_world -- confirm the two are always the same communicator
 #endif
     this->gridecut_lat -= 1e-6;
 
diff --git a/source/source_basis/module_pw/test/depend_mock.cpp b/source/source_basis/module_pw/test/depend_mock.cpp
index 4fdcfd5f4a..99518ca7c3 100644
--- a/source/source_basis/module_pw/test/depend_mock.cpp
+++ b/source/source_basis/module_pw/test/depend_mock.cpp
@@ -2,6 +2,7 @@
 #include "mpi.h"
 #endif
 #include "depend_mock.h"
+#include <complex>
 
 namespace GlobalV
 { 
@@ -11,14 +12,83 @@ namespace GlobalV
 MPI_Comm POOL_WORLD;
 namespace Parallel_Reduce
 {
-    template<typename T> void reduce_all(T& object) { return; };
-    template<typename T> void reduce_pool(T& object) { return; };
+    template<typename T> void reduce_all(T& object);
+    template<typename T> void reduce_all(T* object, const int n);
+    template<typename T> void reduce_pool(T& object);
+    template<typename T> void reduce_pool(T* object, const int n);
 
     template<>
-    void reduce_all<double>(double& object) { return; };
+    void reduce_all<int>(int& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); }
     template<>
-    void reduce_pool<double>(double& object) { return; };
+    void reduce_all<long long>(long long& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); }
     template<>
-    void reduce_pool<float>(float& object) { return; };
+    void reduce_all<double>(double& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); }
+    template<>
+    void reduce_all<float>(float& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); }
+    template<>
+    void reduce_all<std::complex<double>>(std::complex<double>& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD); }
+    template<>
+    void reduce_all<std::complex<float>>(std::complex<float>& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_C_FLOAT_COMPLEX, MPI_SUM, MPI_COMM_WORLD); }
+
+    template<>
+    void reduce_all<int>(int* object, const int n) { MPI_Allreduce(MPI_IN_PLACE, object, n, MPI_INT, MPI_SUM, MPI_COMM_WORLD); }
+    template<>
+    void reduce_all<long long>(long long* object, const int n) { MPI_Allreduce(MPI_IN_PLACE, object, n, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); }
+    template<>
+    void reduce_all<double>(double* object, const int n) { MPI_Allreduce(MPI_IN_PLACE, object, n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); }
+    template<>
+    void reduce_all<std::complex<double>>(std::complex<double>* object, const int n) { MPI_Allreduce(MPI_IN_PLACE, object, n, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD); }
+    template<>
+    void reduce_all<std::complex<float>>(std::complex<float>* object, const int n) { MPI_Allreduce(MPI_IN_PLACE, object, n, MPI_C_FLOAT_COMPLEX, MPI_SUM, MPI_COMM_WORLD); }
+
+    template<>
+    void reduce_pool<float>(float& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_FLOAT, MPI_SUM, POOL_WORLD); }
+    template<>
+    void reduce_pool<double>(double& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_DOUBLE, MPI_SUM, POOL_WORLD); }
+    template<>
+    void reduce_pool<std::complex<double>>(std::complex<double>& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, POOL_WORLD); }
+
+    template<>
+    void reduce_pool<int>(int* object, const int n) { MPI_Allreduce(MPI_IN_PLACE, object, n, MPI_INT, MPI_SUM, POOL_WORLD); }
+    template<>
+    void reduce_pool<double>(double* object, const int n) { MPI_Allreduce(MPI_IN_PLACE, object, n, MPI_DOUBLE, MPI_SUM, POOL_WORLD); }
+
+    void reduce_max_pool(int* object, const int n) { MPI_Allreduce(MPI_IN_PLACE, object, n, MPI_INT, MPI_MAX, POOL_WORLD); }
+    void reduce_min_pool(double& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_DOUBLE, MPI_MIN, POOL_WORLD); }
+
+    // Logical-OR, max and min reductions over MPI_COMM_WORLD; the remaining helpers further down are no-op stubs.
+    void reduce_or_all(bool& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD); }
+    
+    template <typename T>
+    void reduce_max_all(T& object);
+    template<> void reduce_max_all<double>(double& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); }
+    template<> void reduce_max_all<float>(float& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_FLOAT, MPI_MAX, MPI_COMM_WORLD); }
+    template<> void reduce_max_all<int>(int& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD); }
+
+    template <typename T>
+    void reduce_min_all(T& object);
+    template<> void reduce_min_all<double>(double& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD); }
+    template<> void reduce_min_all<float>(float& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD); }
+    template<> void reduce_min_all<int>(int& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD); }
+
+    void reduce_or_bp(bool& object) { return; };
+
+    void reduce_double_bgroup(double& object) { return; };
+    void reduce_double_bgroup(double* object, const int n) { return; };
+
+    void reduce_double_bp(double& object) { return; };
+    void reduce_double_bp(double* object, const int n) { return; };
+
+    void reduce_double_kp(double* object, const int n) { return; };
+
+    void reduce_double_allpool(const int& npool, const int& nproc_in_pool, double& object) { return; };
+    void reduce_double_allpool(const int& npool, const int& nproc_in_pool, double* object, const int n) { return; };
+
+    void gather_min_int_all(const int& nproc, int& v) { return; };
+    void gather_max_double_all(const int& nproc, double& v) { return; };
+    void gather_min_double_all(const int& nproc, double& v) { return; };
+    void gather_max_double_pool(const int& nproc_in_pool, double& v) { return; };
+    void gather_min_double_pool(const int& nproc_in_pool, double& v) { return; };
+    void gather_int_all(int& v, int* all) { return; };
 }
 #endif
\ No newline at end of file
diff --git a/source/source_cell/parallel_kpoints.cpp b/source/source_cell/parallel_kpoints.cpp
index 2ca14090fb..dad4aa8637 100644
--- a/source/source_cell/parallel_kpoints.cpp
+++ b/source/source_cell/parallel_kpoints.cpp
@@ -2,6 +2,7 @@
 
 #include "source_base/parallel_common.h"
 #include "source_base/parallel_global.h"
+#include "source_base/parallel_reduce.h"
 
 // the kpoints here are reduced after symmetry applied.
 void Parallel_Kpoints::kinfo(int& nkstot_in,
@@ -123,8 +124,7 @@ void Parallel_Kpoints::gatherkvec(const std::vector<ModuleBase::Vector3<double>>
             vec_global[i + startk_pool[this->my_pool]] = vec_local[i];
         }
     }
-
-    MPI_Allreduce(MPI_IN_PLACE, &vec_global[0], 3 * this->nkstot_np, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    Parallel_Reduce::reduce_all(reinterpret_cast<double*>(vec_global.data()), 3 * this->nkstot_np); // NOTE(review): treats each element of vec_global as 3 packed doubles (same assumption as the removed raw call) -- confirm the element type has no padding
     return;
 }
 #endif
diff --git a/source/source_estate/module_charge/charge_mpi.cpp b/source/source_estate/module_charge/charge_mpi.cpp
index 32fc8bc195..b13ed5ab0b 100644
--- a/source/source_estate/module_charge/charge_mpi.cpp
+++ b/source/source_estate/module_charge/charge_mpi.cpp
@@ -30,7 +30,7 @@ void Charge::reduce_diff_pools(double* array_rho) const
     ModuleBase::timer::tick("Charge", "reduce_diff_pools");
     if (KP_WORLD != MPI_COMM_NULL)
     {
-        MPI_Allreduce(MPI_IN_PLACE, array_rho, this->nrxx, MPI_DOUBLE, MPI_SUM, KP_WORLD);
+        Parallel_Reduce::reduce_double_kp(array_rho, this->nrxx);
     }
     else
     {
@@ -111,7 +111,7 @@ void Charge::reduce_diff_pools(double* array_rho) const
     }
     if(PARAM.globalv.all_ks_run && PARAM.inp.bndpar > 1)
     {
-        MPI_Allreduce(MPI_IN_PLACE, array_rho, this->nrxx, MPI_DOUBLE, MPI_SUM, BP_WORLD);
+        Parallel_Reduce::reduce_double_bp(array_rho, this->nrxx);
     }
     ModuleBase::timer::tick("Charge", "reduce_diff_pools");
 }
diff --git a/source/source_estate/module_charge/symmetry_rhog.cpp b/source/source_estate/module_charge/symmetry_rhog.cpp
index aae2103b75..8292c93acf 100644
--- a/source/source_estate/module_charge/symmetry_rhog.cpp
+++ b/source/source_estate/module_charge/symmetry_rhog.cpp
@@ -1,6 +1,7 @@
 #include "symmetry_rho.h"
 #include "source_pw/module_pwdft/global.h"
 #include "source_base/parallel_global.h"
+#include "source_base/parallel_reduce.h"
 #include "source_hamilt/module_xc/xc_functional.h"
 
 
@@ -10,7 +11,7 @@ void Symmetry_rho::psymmg(std::complex<double>* rhog_part, const ModulePW::PW_Ba
 	int * fftixy2is = new int [rho_basis->fftnxy];
 	rho_basis->getfftixy2is(fftixy2is);		//current proc
 #ifdef __MPI
-	MPI_Allreduce(MPI_IN_PLACE, fftixy2is, rho_basis->fftnxy, MPI_INT, MPI_SUM, POOL_WORLD);
+	Parallel_Reduce::reduce_pool(fftixy2is, rho_basis->fftnxy);
 	if(rho_basis->poolnproc>1)
 		for (int i=0;i<rho_basis->fftnxy;++i)
 			fftixy2is[i]+=rho_basis->poolnproc-1;
diff --git a/source/source_hsolver/diago_bpcg.cpp b/source/source_hsolver/diago_bpcg.cpp
index d4db3d790b..71667c1f3f 100644
--- a/source/source_hsolver/diago_bpcg.cpp
+++ b/source/source_hsolver/diago_bpcg.cpp
@@ -4,6 +4,7 @@
 #include "source_base/global_function.h"
 #include "source_base/kernels/math_kernel_op.h"
 #include "source_base/parallel_comm.h" // different MPI worlds
+#include "source_base/parallel_reduce.h"
 #include "source_hsolver/kernels/bpcg_kernel_op.h"
 #include "para_linear_transform.h"
 
@@ -86,7 +87,7 @@ bool DiagoBPCG<T, Device>::test_error(const ct::Tensor& err_in, const std::vecto
         }
     }
 #ifdef __MPI
-    MPI_Allreduce(MPI_IN_PLACE, &not_conv, 1, MPI_C_BOOL, MPI_LOR, BP_WORLD);
+    Parallel_Reduce::reduce_or_bp(not_conv);
 #endif
     return not_conv;
 }
diff --git a/source/source_io/output_log.cpp b/source/source_io/output_log.cpp
index 7a4471b0a6..94ac774ea4 100644
--- a/source/source_io/output_log.cpp
+++ b/source/source_io/output_log.cpp
@@ -6,6 +6,7 @@
 #include "source_base/global_variable.h"
 
 #include "source_base/parallel_comm.h"
+#include "source_base/parallel_reduce.h"
 
 #ifdef __MPI
 #include <mpi.h>
@@ -154,7 +155,7 @@ void output_vacuum_level(const UnitCell* ucell,
         }
 
 #ifdef __MPI
-        MPI_Allreduce(MPI_IN_PLACE, ave, length, MPI_DOUBLE, MPI_SUM, POOL_WORLD);
+        Parallel_Reduce::reduce_pool(ave, length);
 #endif
 
         int surface = nxyz / length;
diff --git a/source/source_io/write_eig_occ.cpp b/source/source_io/write_eig_occ.cpp
index 4b13d885b9..e0271535f8 100644
--- a/source/source_io/write_eig_occ.cpp
+++ b/source/source_io/write_eig_occ.cpp
@@ -5,6 +5,7 @@
 #include "source_base/global_variable.h"
 #include "source_base/timer.h"
 #include "source_base/parallel_comm.h" // use POOL_WORLD
+#include "source_base/parallel_reduce.h"
 
 #ifdef __MPI
 #include <mpi.h> // use MPI_Barrier
@@ -192,7 +193,7 @@ void ModuleIO::write_eig_file(const ModuleBase::matrix &ekb,
 	}
 
 #ifdef __MPI
-    MPI_Allreduce(MPI_IN_PLACE, &wrong, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);
+    Parallel_Reduce::reduce_or_all(wrong);
 #endif
     if (wrong)
     {
diff --git a/source/source_lcao/module_lr/utils/lr_util.hpp b/source/source_lcao/module_lr/utils/lr_util.hpp
index 0ef1280c99..f8414027b3 100644
--- a/source/source_lcao/module_lr/utils/lr_util.hpp
+++ b/source/source_lcao/module_lr/utils/lr_util.hpp
@@ -4,6 +4,8 @@
 #include <algorithm>
 #include "source_cell/unitcell.h"
 #include "source_base/constants.h"
+#include "source_base/parallel_reduce.h"
+#include "source_base/parallel_device.h"
 #include "source_hamilt/module_xc/xc_functional.h"
 namespace LR_Util
 {
@@ -172,7 +174,7 @@ namespace LR_Util
 }
 
         //reduce to root
-        MPI_Allreduce(MPI_IN_PLACE, fullmat, global_nrow * global_ncol, get_mpi_datatype(), MPI_SUM, pv.comm());
+        Parallel_Common::reduce_dev<T, base_device::DEVICE_CPU>(fullmat, global_nrow * global_ncol, pv.comm()); // NOTE(review): must remain an in-place MPI_SUM all-reduce over pv.comm() like the removed call -- confirm reduce_dev does this for every T used here
     };
 #endif
 
diff --git a/source/source_lcao/module_operator_lcao/op_exx_lcao.hpp b/source/source_lcao/module_operator_lcao/op_exx_lcao.hpp
index 80f0c422a2..48065e80c8 100644
--- a/source/source_lcao/module_operator_lcao/op_exx_lcao.hpp
+++ b/source/source_lcao/module_operator_lcao/op_exx_lcao.hpp
@@ -6,6 +6,7 @@
 #include "source_io/module_parameter/parameter.h"
 #include "source_lcao/module_ri/RI_2D_Comm.h"
 #include "source_pw/module_pwdft/global.h"
+#include "source_base/parallel_reduce.h"
 #include "source_hamilt/module_xc/xc_functional.h"
 #include "source_io/restart_exx_csr.h"
 
@@ -245,7 +246,7 @@ OperatorEXX<OperatorLCAO<TK, TR>>::OperatorEXX(HS_Matrix_K<TK>* hsk_in,
 // Add MPI communication to synchronize all_exist across processes
 #ifdef __MPI
                 // don't read in any files if one of the processes doesn't have it
-                MPI_Allreduce(MPI_IN_PLACE, &all_exist, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
+                Parallel_Reduce::reduce_min_all(all_exist);
 #endif                
                 if (all_exist)
                 {
@@ -264,7 +265,7 @@ OperatorEXX<OperatorLCAO<TK, TR>>::OperatorEXX(HS_Matrix_K<TK>* hsk_in,
                     std::ifstream ifs(restart_HR_path_cereal, std::ios::binary);
                     int all_exist_cereal = ifs ? 1 : 0;
 #ifdef __MPI                    
-                    MPI_Allreduce(MPI_IN_PLACE, &all_exist_cereal, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
+                    Parallel_Reduce::reduce_min_all(all_exist_cereal);
 #endif                     
                     if (!all_exist_cereal)
                     {
diff --git a/source/source_pw/module_pwdft/VNL_in_pw.cpp b/source/source_pw/module_pwdft/VNL_in_pw.cpp
index 438358c4ae..c007a67c71 100644
--- a/source/source_pw/module_pwdft/VNL_in_pw.cpp
+++ b/source/source_pw/module_pwdft/VNL_in_pw.cpp
@@ -10,6 +10,7 @@
 #include "source_base/math_ylmreal.h"
 #include "source_base/memory.h"
 #include "source_base/module_device/device.h"
+#include "source_base/parallel_reduce.h"
 #include "source_base/timer.h"
 #include "source_pw/module_pwdft/global.h"
 #include "source_pw/module_pwdft/kernels/vnl_op.h"
@@ -683,8 +684,8 @@ void pseudopot_cell_vnl::init_vnl(UnitCell& cell, const ModulePW::PW_Basis* rho_
     }
 
 #ifdef __MPI
-    MPI_Allreduce(MPI_IN_PLACE, this->qq_nt.ptr, this->qq_nt.getSize(), MPI_DOUBLE, MPI_SUM, POOL_WORLD);
-    MPI_Allreduce(MPI_IN_PLACE, this->qq_so.ptr, this->qq_so.getSize(), MPI_DOUBLE_COMPLEX, MPI_SUM, POOL_WORLD);
+    Parallel_Reduce::reduce_pool(this->qq_nt.ptr, this->qq_nt.getSize());
+    Parallel_Reduce::reduce_pool(this->qq_so.ptr, this->qq_so.getSize());
 #endif
 
     // set the atomic specific qq_at matrices
@@ -1510,7 +1511,7 @@ void pseudopot_cell_vnl::newq(const ModuleBase::matrix& veff, const ModulePW::PW
     }
 
 #ifdef __MPI
-    MPI_Allreduce(MPI_IN_PLACE, deeq.ptr, deeq.getSize(), MPI_DOUBLE, MPI_SUM, POOL_WORLD);
+    Parallel_Reduce::reduce_pool(deeq.ptr, deeq.getSize());
 #endif
 
     delete[] qnorm;
diff --git a/source/source_pw/module_pwdft/elecond.cpp b/source/source_pw/module_pwdft/elecond.cpp
index 068ca01067..757259b31c 100644
--- a/source/source_pw/module_pwdft/elecond.cpp
+++ b/source/source_pw/module_pwdft/elecond.cpp
@@ -4,6 +4,7 @@
 #include "source_base/global_variable.h"
 #include "source_base/kernels/math_kernel_op.h"
 #include "source_base/parallel_device.h"
+#include "source_base/parallel_reduce.h"
 #include "source_estate/occupy.h"
 #include "source_io/binstream.h"
 #include "source_io/module_parameter/parameter.h"
@@ -93,9 +94,9 @@ void EleCond<FPTYPE, Device>::KG(const int& smear_type,
         jjresponse_ks(ik, nt, dt, decut, wg, velop, ct11.data(), ct12.data(), ct22.data());
     }
 #ifdef __MPI
-    MPI_Allreduce(MPI_IN_PLACE, ct11.data(), nt, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    MPI_Allreduce(MPI_IN_PLACE, ct12.data(), nt, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    MPI_Allreduce(MPI_IN_PLACE, ct22.data(), nt, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    Parallel_Reduce::reduce_all(ct11.data(), nt);
+    Parallel_Reduce::reduce_all(ct12.data(), nt);
+    Parallel_Reduce::reduce_all(ct22.data(), nt);
 #endif
     //------------------------------------------------------------------
     //                    Output
diff --git a/source/source_pw/module_pwdft/setup_pwwfc.cpp b/source/source_pw/module_pwdft/setup_pwwfc.cpp
index 759178638c..d2f7cfba9c 100644
--- a/source/source_pw/module_pwdft/setup_pwwfc.cpp
+++ b/source/source_pw/module_pwdft/setup_pwwfc.cpp
@@ -1,5 +1,6 @@
 #include "source_pw/module_pwdft/setup_pwwfc.h" // pw_wfc
 #include "source_base/parallel_comm.h" // POOL_WORLD
+#include "source_base/parallel_reduce.h"
 #include "source_io/print_info.h" // print information
 
 void pw::teardown_pwwfc(ModulePW::PW_Basis_K* &pw_wfc)
@@ -56,7 +57,7 @@ void pw::setup_pwwfc(const Input_para& inp,
 #ifdef __MPI
     if (inp.pw_seed > 0)
     {
-        MPI_Allreduce(MPI_IN_PLACE, &pw_wfc->ggecut, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+        Parallel_Reduce::reduce_max_all(pw_wfc->ggecut);
     }
     // qianrui add 2021-8-13 to make different kpar parameters can get the same
     // results
diff --git a/source/source_pw/module_stodft/sto_dos.cpp b/source/source_pw/module_stodft/sto_dos.cpp
index dd90224e15..0326924d41 100644
--- a/source/source_pw/module_stodft/sto_dos.cpp
+++ b/source/source_pw/module_stodft/sto_dos.cpp
@@ -1,5 +1,6 @@
 #include "sto_dos.h"
 
+#include "source_base/parallel_reduce.h"
 #include "source_base/timer.h"
 #include "source_base/tool_title.h"
 #include "source_io/module_parameter/parameter.h"
@@ -234,9 +235,9 @@ void Sto_DOS<FPTYPE, Device>::caldos(const double sigmain, const double de, cons
         error[ie] = tmperror;
     }
 #ifdef __MPI
-    MPI_Allreduce(MPI_IN_PLACE, ks_dos.data(), ndos, MPI_DOUBLE, MPI_SUM, INT_BGROUP);
-    MPI_Allreduce(MPI_IN_PLACE, sto_dos.data(), ndos, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    MPI_Allreduce(MPI_IN_PLACE, error.data(), ndos, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    Parallel_Reduce::reduce_double_bgroup(ks_dos.data(), ndos);
+    Parallel_Reduce::reduce_all(sto_dos.data(), ndos);
+    Parallel_Reduce::reduce_all(error.data(), ndos);
 #endif
     if (GlobalV::MY_RANK == 0)
     {
diff --git a/source/source_pw/module_stodft/sto_elecond.cpp b/source/source_pw/module_stodft/sto_elecond.cpp
index b0fe4d71d7..9f9bdb1de7 100644
--- a/source/source_pw/module_stodft/sto_elecond.cpp
+++ b/source/source_pw/module_stodft/sto_elecond.cpp
@@ -5,6 +5,7 @@
 #include "source_base/memory.h"
 #include "source_base/module_container/ATen/tensor.h"
 #include "source_base/parallel_device.h"
+#include "source_base/parallel_reduce.h"
 #include "source_base/timer.h"
 #include "source_base/vector3.h"
 #include "source_io/module_parameter/parameter.h"
@@ -1059,9 +1060,9 @@ void Sto_EleCond<FPTYPE, Device>::sKG(const int& smear_type,
     } // ik loop
     ModuleBase::timer::tick("Sto_EleCond", "kloop");
 #ifdef __MPI
-    MPI_Allreduce(MPI_IN_PLACE, ct11.data(), nt, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    MPI_Allreduce(MPI_IN_PLACE, ct12.data(), nt, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    MPI_Allreduce(MPI_IN_PLACE, ct22.data(), nt, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    Parallel_Reduce::reduce_all(ct11.data(), nt);
+    Parallel_Reduce::reduce_all(ct12.data(), nt);
+    Parallel_Reduce::reduce_all(ct22.data(), nt);
 #endif
 
     //------------------------------------------------------------------
diff --git a/source/source_pw/module_stodft/sto_iter.cpp b/source/source_pw/module_stodft/sto_iter.cpp
index aa9990a415..3d35b7d458 100644
--- a/source/source_pw/module_stodft/sto_iter.cpp
+++ b/source/source_pw/module_stodft/sto_iter.cpp
@@ -204,9 +204,10 @@ void Stochastic_Iter<T, Device>::checkemm(const int& ik,
     if (ik == nks - 1)
     {
 #ifdef __MPI
-        MPI_Allreduce(MPI_IN_PLACE, p_hamilt_sto->emax, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
-        MPI_Allreduce(MPI_IN_PLACE, p_hamilt_sto->emin, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
+        Parallel_Reduce::reduce_max_all(*p_hamilt_sto->emax);
+        Parallel_Reduce::reduce_min_all(*p_hamilt_sto->emin);
+        // Keep change consistent on every rank for the branch below; NOTE(review): prefer reduce_or_all(change) if change is a bool.
         MPI_Allreduce(MPI_IN_PLACE, &change, 1, MPI_CHAR, MPI_LOR, MPI_COMM_WORLD);
 #endif
         if (change)
         {
@@ -249,7 +250,7 @@ void Stochastic_Iter<T, Device>::check_precision(const double ref, const double
     }
 
 #ifdef __MPI
-    MPI_Allreduce(MPI_IN_PLACE, &error, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    Parallel_Reduce::reduce_all(error);
 #endif
     double relative_error = std::abs(error / ref);
     GlobalV::ofs_running << info << "Relative Chebyshev Precision: " << relative_error * 1e9 << "E-09" << std::endl;
@@ -468,12 +469,12 @@ double Stochastic_Iter<T, Device>::calne(elecstate::ElecState* pes)
     }
     KS_ne /= GlobalV::NPROC_IN_POOL;
 #ifdef __MPI
-    MPI_Allreduce(MPI_IN_PLACE, &KS_ne, 1, MPI_DOUBLE, MPI_SUM, INT_BGROUP);
+    Parallel_Reduce::reduce_double_bgroup(KS_ne);
     if(PARAM.globalv.all_ks_run)
     {
-        MPI_Allreduce(MPI_IN_PLACE, &KS_ne, 1, MPI_DOUBLE, MPI_SUM, BP_WORLD);
+        Parallel_Reduce::reduce_double_bp(KS_ne);
     }
-    MPI_Allreduce(MPI_IN_PLACE, &sto_ne, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    Parallel_Reduce::reduce_all(sto_ne);
 #endif
 
     totne = KS_ne + sto_ne;
@@ -535,12 +536,12 @@ void Stochastic_Iter<T, Device>::sum_stoeband(Stochastic_WF<T, Device>& stowf,
     }
     pes->f_en.demet /= GlobalV::NPROC_IN_POOL;
 #ifdef __MPI
-    MPI_Allreduce(MPI_IN_PLACE, &pes->f_en.demet, 1, MPI_DOUBLE, MPI_SUM, INT_BGROUP);
+    Parallel_Reduce::reduce_double_bgroup(pes->f_en.demet);
     if(PARAM.globalv.all_ks_run)
     {
-        MPI_Allreduce(MPI_IN_PLACE, &pes->f_en.demet, 1, MPI_DOUBLE, MPI_SUM, BP_WORLD);
+        Parallel_Reduce::reduce_double_bp(pes->f_en.demet);
     }
-    MPI_Allreduce(MPI_IN_PLACE, &stodemet, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    Parallel_Reduce::reduce_all(stodemet);
 #endif
     pes->f_en.demet += stodemet;
     this->check_precision(pes->f_en.demet, 1e-4, "TS");
@@ -581,7 +582,7 @@ void Stochastic_Iter<T, Device>::sum_stoeband(Stochastic_WF<T, Device>& stowf,
         }
     }
 #ifdef __MPI
-    MPI_Allreduce(MPI_IN_PLACE, &sto_eband, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    Parallel_Reduce::reduce_all(sto_eband);
 #endif
     pes->f_en.eband += sto_eband;
     ModuleBase::timer::tick("Stochastic_Iter", "sum_stoeband");
@@ -673,7 +674,7 @@ void Stochastic_Iter<T, Device>::cal_storho(const UnitCell& ucell,
             pes->charge->reduce_diff_pools(sto_rho[is]);
             if (!PARAM.globalv.all_ks_run && PARAM.inp.bndpar > 1)
             {
-                MPI_Allreduce(MPI_IN_PLACE, sto_rho[is], nrxx, MPI_DOUBLE, MPI_SUM, BP_WORLD);
+                Parallel_Reduce::reduce_double_bp(sto_rho[is], nrxx);
             }
         }
     }
@@ -695,7 +696,7 @@ void Stochastic_Iter<T, Device>::cal_storho(const UnitCell& ucell,
     sto_ne *= ucell.omega / wfc_basis->nxyz;
 
 #ifdef __MPI
-    MPI_Allreduce(MPI_IN_PLACE, &sto_ne, 1, MPI_DOUBLE, MPI_SUM, POOL_WORLD);
+    Parallel_Reduce::reduce_pool(sto_ne);
 #endif
     double factor = targetne / (KS_ne + sto_ne);
     if (std::abs(factor - 1) > 1e-10)
diff --git a/source/source_pw/module_stodft/sto_tool.cpp b/source/source_pw/module_stodft/sto_tool.cpp
index de1e72e3f1..95391611b2 100644
--- a/source/source_pw/module_stodft/sto_tool.cpp
+++ b/source/source_pw/module_stodft/sto_tool.cpp
@@ -2,6 +2,7 @@
 
 #include "source_base/math_chebyshev.h"
 #include "source_base/parallel_device.h"
+#include "source_base/parallel_reduce.h"
 #include "source_base/timer.h"
 #include "source_io/module_parameter/parameter.h"
 #ifdef __MPI
@@ -103,8 +104,8 @@ void check_che_op<FPTYPE, Device>::operator()(const int& nche_in,
         if (ik == nk - 1)
         {
 #ifdef __MPI
-            MPI_Allreduce(MPI_IN_PLACE, p_hamilt_sto->emax, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
-            MPI_Allreduce(MPI_IN_PLACE, p_hamilt_sto->emin, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
+            Parallel_Reduce::reduce_max_all(*p_hamilt_sto->emax);
+            Parallel_Reduce::reduce_min_all(*p_hamilt_sto->emin);
 #endif
             GlobalV::ofs_running << "New Emax " << *p_hamilt_sto->emax << " Ry; new Emin " << *p_hamilt_sto->emin
                                  << " Ry" << std::endl;