deepmodeling · A-006 · Jan 14, 2026 · Jan 14, 2026 · Jan 14, 2026 · Jan 14, 2026
diff --git a/source/source_base/parallel_reduce.cpp b/source/source_base/parallel_reduce.cpp
@@ -1,3 +1,4 @@
+// Parallel_Reduce: thin wrappers around MPI reductions over the communicators from parallel_comm.h.
 #include "parallel_reduce.h"
 
 #include "parallel_comm.h"
@@ -99,6 +100,141 @@ void Parallel_Reduce::reduce_double_diag(double* object, const int n)
     return;
 }
 
+// Sum the n ints in object element-wise over POOL_WORLD, in place; does nothing without __MPI.
+template <>
+void Parallel_Reduce::reduce_pool<int>(int* object, const int n)
+{
+#ifdef __MPI
+    MPI_Allreduce(MPI_IN_PLACE, object, n, MPI_INT, MPI_SUM, POOL_WORLD);
+#endif
+}
+
+// Logical OR of object over MPI_COMM_WORLD, in place; does nothing without __MPI.
+void Parallel_Reduce::reduce_or_all(bool& object)
+{
+#ifdef __MPI
+    MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_CXX_BOOL, MPI_LOR, MPI_COMM_WORLD); // MPI_CXX_BOOL pairs with C++ bool; MPI_C_BOOL is C's _Bool
+#endif
+}
+
+// Replace object with its maximum over MPI_COMM_WORLD (double); does nothing without __MPI.
+template <>
+void Parallel_Reduce::reduce_max_all<double>(double& object)
+{
+#ifdef __MPI
+    MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+#endif
+}
+
+// Replace object with its maximum over MPI_COMM_WORLD (float); does nothing without __MPI.
+template <>
+void Parallel_Reduce::reduce_max_all<float>(float& object)
+{
+#ifdef __MPI
+    MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_FLOAT, MPI_MAX, MPI_COMM_WORLD);
+#endif
+}
+
+// Replace object with its maximum over MPI_COMM_WORLD (int); does nothing without __MPI.
+template <>
+void Parallel_Reduce::reduce_max_all<int>(int& object)
+{
+#ifdef __MPI
+    MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);
+#endif
+}
+
+// Replace object with its minimum over MPI_COMM_WORLD (double); does nothing without __MPI.
+template <>
+void Parallel_Reduce::reduce_min_all<double>(double& object)
+{
+#ifdef __MPI
+    MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
+#endif
+}
+
+// Replace object with its minimum over MPI_COMM_WORLD (float); does nothing without __MPI.
+template <>
+void Parallel_Reduce::reduce_min_all<float>(float& object)
+{
+#ifdef __MPI
+    MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);
+#endif
+}
+
+// Replace object with its minimum over MPI_COMM_WORLD (int); does nothing without __MPI.
+template <>
+void Parallel_Reduce::reduce_min_all<int>(int& object)
+{
+#ifdef __MPI
+    MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
+#endif
+}
+
+// Element-wise maximum of the n ints in object over POOL_WORLD, in place; does nothing without __MPI.
+void Parallel_Reduce::reduce_max_pool(int* object, const int n)
+{
+#ifdef __MPI
+    MPI_Allreduce(MPI_IN_PLACE, object, n, MPI_INT, MPI_MAX, POOL_WORLD);
+#endif
+}
+
+// Replace object with its minimum over POOL_WORLD; does nothing without __MPI.
+void Parallel_Reduce::reduce_min_pool(double& object)
+{
+#ifdef __MPI
+    MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_DOUBLE, MPI_MIN, POOL_WORLD);
+#endif
+}
+
+// Logical OR of object over BP_WORLD, in place; does nothing without __MPI.
+void Parallel_Reduce::reduce_or_bp(bool& object)
+{
+#ifdef __MPI
+    MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_CXX_BOOL, MPI_LOR, BP_WORLD); // MPI_CXX_BOOL pairs with C++ bool; MPI_C_BOOL is C's _Bool
+#endif
+}
+
+// Replace object with its sum over INT_BGROUP; does nothing without __MPI.
+void Parallel_Reduce::reduce_double_bgroup(double& object)
+{
+#ifdef __MPI
+    MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_DOUBLE, MPI_SUM, INT_BGROUP);
+#endif
+}
+
+// Sum the n doubles in object element-wise over INT_BGROUP, in place; does nothing without __MPI.
+void Parallel_Reduce::reduce_double_bgroup(double* object, const int n)
+{
+#ifdef __MPI
+    MPI_Allreduce(MPI_IN_PLACE, object, n, MPI_DOUBLE, MPI_SUM, INT_BGROUP);
+#endif
+}
+
+// Sum the n doubles in object element-wise over KP_WORLD, in place; does nothing without __MPI.
+void Parallel_Reduce::reduce_double_kp(double* object, const int n)
+{
+#ifdef __MPI
+    MPI_Allreduce(MPI_IN_PLACE, object, n, MPI_DOUBLE, MPI_SUM, KP_WORLD);
+#endif
+}
+
+// Replace object with its sum over BP_WORLD; does nothing without __MPI.
+void Parallel_Reduce::reduce_double_bp(double& object)
+{
+#ifdef __MPI
+    MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_DOUBLE, MPI_SUM, BP_WORLD);
+#endif
+}
+
+// Sum the n doubles in object element-wise over BP_WORLD, in place; does nothing without __MPI.
+void Parallel_Reduce::reduce_double_bp(double* object, const int n)
+{
+#ifdef __MPI
+    MPI_Allreduce(MPI_IN_PLACE, object, n, MPI_DOUBLE, MPI_SUM, BP_WORLD);
+#endif
+}
+
 template <>
 void Parallel_Reduce::reduce_pool<float>(float& object)
 {

diff --git a/source/source_base/parallel_reduce.h b/source/source_base/parallel_reduce.h
@@ -31,6 +31,25 @@ void reduce_int_grid(int* object, const int n); // mohan add 2012-01-12
 void reduce_double_grid(double* object, const int n);
 void reduce_double_diag(double* object, const int n);
 
+void reduce_or_all(bool& object); // logical OR over MPI_COMM_WORLD
+template <typename T>
+void reduce_max_all(T& object); // maximum over MPI_COMM_WORLD; specialized for double, float, int
+template <typename T>
+void reduce_min_all(T& object); // minimum over MPI_COMM_WORLD; specialized for double, float, int
+
+void reduce_max_pool(int* object, const int n); // element-wise maximum of n ints over POOL_WORLD
+void reduce_min_pool(double& object); // minimum over POOL_WORLD
+
+void reduce_or_bp(bool& object); // logical OR over BP_WORLD
+
+void reduce_double_bgroup(double& object); // sum over INT_BGROUP
+void reduce_double_bgroup(double* object, const int n); // element-wise sum of n doubles over INT_BGROUP
+
+void reduce_double_bp(double& object); // sum over BP_WORLD
+void reduce_double_bp(double* object, const int n); // element-wise sum of n doubles over BP_WORLD
+
+void reduce_double_kp(double* object, const int n); // element-wise sum of n doubles over KP_WORLD
+
 void reduce_double_allpool(const int& npool, const int& nproc_in_pool, double& object);
 void reduce_double_allpool(const int& npool, const int& nproc_in_pool, double* object, const int n);
 

diff --git a/source/source_basis/module_pw/pw_basis_big.h b/source/source_basis/module_pw/pw_basis_big.h
@@ -2,6 +2,7 @@
 #define PW_BASIS_BIG_H
 #include "source_base/constants.h"
 #include "source_base/global_function.h"
+#include "source_base/parallel_reduce.h"
 #ifdef __MPI
 #include "mpi.h"
 #endif
@@ -167,7 +168,7 @@ class PW_Basis_Big : public PW_Basis_Sup
     ibox[1] = 2*n2+1;
     ibox[2] = 2*n3+1;
 #ifdef __MPI
-    MPI_Allreduce(MPI_IN_PLACE, ibox, 3, MPI_INT, MPI_MAX , this->pool_world);
+    MPI_Allreduce(MPI_IN_PLACE, ibox, 3, MPI_INT, MPI_MAX, this->pool_world); // keep this object's pool_world; reduce_max_pool would use the global POOL_WORLD
 #endif
 
     // Find the minimal FFT box size the factors into the primes (2,3,5,7).
@@ -350,7 +351,7 @@ class PW_Basis_Big : public PW_Basis_Sup
             }
         }
 #ifdef __MPI
-        MPI_Allreduce(MPI_IN_PLACE, &this->gridecut_lat, 1, MPI_DOUBLE, MPI_MIN , this->pool_world);
+        MPI_Allreduce(MPI_IN_PLACE, &this->gridecut_lat, 1, MPI_DOUBLE, MPI_MIN, this->pool_world); // keep this object's pool_world; reduce_min_pool would use the global POOL_WORLD
 #endif
         this->gridecut_lat -= 1e-6;
 

diff --git a/source/source_basis/module_pw/pw_init.cpp b/source/source_basis/module_pw/pw_init.cpp
@@ -1,5 +1,6 @@
 #include "pw_basis.h"
 #include "source_base/constants.h"
+#include "source_base/parallel_reduce.h"
 
 namespace ModulePW
 {
@@ -86,7 +87,7 @@ void PW_Basis:: initgrids(
     ibox[1] = 2*n2+1;
     ibox[2] = 2*n3+1;
 #ifdef __MPI
-    MPI_Allreduce(MPI_IN_PLACE, ibox, 3, MPI_INT, MPI_MAX , this->pool_world);
+    MPI_Allreduce(MPI_IN_PLACE, ibox, 3, MPI_INT, MPI_MAX, this->pool_world); // keep this object's pool_world; reduce_max_pool would use the global POOL_WORLD
 #endif
 
     // Find the minimal FFT box size the factors into the primes (2,3,5,7).
@@ -200,7 +201,7 @@ void PW_Basis:: initgrids(
         }
     }
 #ifdef __MPI
-    MPI_Allreduce(MPI_IN_PLACE, &this->gridecut_lat, 1, MPI_DOUBLE, MPI_MIN , this->pool_world);
+    MPI_Allreduce(MPI_IN_PLACE, &this->gridecut_lat, 1, MPI_DOUBLE, MPI_MIN, this->pool_world); // keep this object's pool_world; reduce_min_pool would use the global POOL_WORLD
 #endif
     this->gridecut_lat -= 1e-6;
 

diff --git a/source/source_basis/module_pw/test/depend_mock.cpp b/source/source_basis/module_pw/test/depend_mock.cpp
@@ -2,6 +2,7 @@
 #include "mpi.h"
 #endif
 #include "depend_mock.h"
+#include <complex>
 
 namespace GlobalV
 { 
@@ -11,14 +12,83 @@ namespace GlobalV
 MPI_Comm POOL_WORLD;
 namespace Parallel_Reduce
 {
-    template<typename T> void reduce_all(T& object) { return; };
-    template<typename T> void reduce_pool(T& object) { return; };
+    template<typename T> void reduce_all(T& object); // declared only; defined by the explicit specializations below
+    template<typename T> void reduce_all(T* object, const int n);
+    template<typename T> void reduce_pool(T& object);
+    template<typename T> void reduce_pool(T* object, const int n);
 
     template<>
-    void reduce_all<double>(double& object) { return; };
+    void reduce_all<int>(int& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); }
     template<>
-    void reduce_pool<double>(double& object) { return; };
+    void reduce_all<long long>(long long& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); }
     template<>
-    void reduce_pool<float>(float& object) { return; };
+    void reduce_all<double>(double& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); }
+    template<>
+    void reduce_all<float>(float& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); }
+    template<>
+    void reduce_all<std::complex<double>>(std::complex<double>& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD); }
+    template<>
+    void reduce_all<std::complex<float>>(std::complex<float>& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_C_FLOAT_COMPLEX, MPI_SUM, MPI_COMM_WORLD); }
+
+    template<>
+    void reduce_all<int>(int* object, const int n) { MPI_Allreduce(MPI_IN_PLACE, object, n, MPI_INT, MPI_SUM, MPI_COMM_WORLD); }
+    template<>
+    void reduce_all<long long>(long long* object, const int n) { MPI_Allreduce(MPI_IN_PLACE, object, n, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); }
+    template<>
+    void reduce_all<double>(double* object, const int n) { MPI_Allreduce(MPI_IN_PLACE, object, n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); }
+    template<>
+    void reduce_all<std::complex<double>>(std::complex<double>* object, const int n) { MPI_Allreduce(MPI_IN_PLACE, object, n, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD); }
+    template<>
+    void reduce_all<std::complex<float>>(std::complex<float>* object, const int n) { MPI_Allreduce(MPI_IN_PLACE, object, n, MPI_C_FLOAT_COMPLEX, MPI_SUM, MPI_COMM_WORLD); }
+
+    template<>
+    void reduce_pool<float>(float& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_FLOAT, MPI_SUM, POOL_WORLD); }
+    template<>
+    void reduce_pool<double>(double& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_DOUBLE, MPI_SUM, POOL_WORLD); }
+    template<>
+    void reduce_pool<std::complex<double>>(std::complex<double>& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, POOL_WORLD); }
+
+    template<>
+    void reduce_pool<int>(int* object, const int n) { MPI_Allreduce(MPI_IN_PLACE, object, n, MPI_INT, MPI_SUM, POOL_WORLD); }
+    template<>
+    void reduce_pool<double>(double* object, const int n) { MPI_Allreduce(MPI_IN_PLACE, object, n, MPI_DOUBLE, MPI_SUM, POOL_WORLD); }
+
+    void reduce_max_pool(int* object, const int n) { MPI_Allreduce(MPI_IN_PLACE, object, n, MPI_INT, MPI_MAX, POOL_WORLD); }
+    void reduce_min_pool(double& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_DOUBLE, MPI_MIN, POOL_WORLD); }
+
+    // The OR/max/min reductions over MPI_COMM_WORLD below call MPI directly; the helpers after them are no-op stubs.
+    void reduce_or_all(bool& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD); }
+
+    template <typename T>
+    void reduce_max_all(T& object);
+    template<> void reduce_max_all<double>(double& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); }
+    template<> void reduce_max_all<float>(float& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_FLOAT, MPI_MAX, MPI_COMM_WORLD); }
+    template<> void reduce_max_all<int>(int& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD); }
+
+    template <typename T>
+    void reduce_min_all(T& object);
+    template<> void reduce_min_all<double>(double& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD); }
+    template<> void reduce_min_all<float>(float& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD); }
+    template<> void reduce_min_all<int>(int& object) { MPI_Allreduce(MPI_IN_PLACE, &object, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD); }
+
+    void reduce_or_bp(bool& object) { return; }; // no-op stub: object is left unchanged
+
+    void reduce_double_bgroup(double& object) { return; }; // no-op stub
+    void reduce_double_bgroup(double* object, const int n) { return; }; // no-op stub
+
+    void reduce_double_bp(double& object) { return; }; // no-op stub
+    void reduce_double_bp(double* object, const int n) { return; }; // no-op stub
+
+    void reduce_double_kp(double* object, const int n) { return; }; // no-op stub
+
+    void reduce_double_allpool(const int& npool, const int& nproc_in_pool, double& object) { return; }; // no-op stub
+    void reduce_double_allpool(const int& npool, const int& nproc_in_pool, double* object, const int n) { return; }; // no-op stub
+
+    void gather_min_int_all(const int& nproc, int& v) { return; }; // no-op stub: v is left unchanged
+    void gather_max_double_all(const int& nproc, double& v) { return; }; // no-op stub
+    void gather_min_double_all(const int& nproc, double& v) { return; }; // no-op stub
+    void gather_max_double_pool(const int& nproc_in_pool, double& v) { return; }; // no-op stub
+    void gather_min_double_pool(const int& nproc_in_pool, double& v) { return; }; // no-op stub
+    void gather_int_all(int& v, int* all) { return; }; // no-op stub: neither v nor all is written
 }
 #endif
diff --git a/source/source_cell/parallel_kpoints.cpp b/source/source_cell/parallel_kpoints.cpp
@@ -2,6 +2,7 @@
 
 #include "source_base/parallel_common.h"
 #include "source_base/parallel_global.h"
+#include "source_base/parallel_reduce.h"
 
 // the kpoints here are reduced after symmetry applied.
 void Parallel_Kpoints::kinfo(int& nkstot_in,
@@ -123,8 +124,7 @@ void Parallel_Kpoints::gatherkvec(const std::vector<ModuleBase::Vector3<double>>
             vec_global[i + startk_pool[this->my_pool]] = vec_local[i];
         }
     }
-
-    MPI_Allreduce(MPI_IN_PLACE, &vec_global[0], 3 * this->nkstot_np, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    Parallel_Reduce::reduce_all(reinterpret_cast<double*>(vec_global.data()), 3 * this->nkstot_np);
     return;
 }
 #endif

diff --git a/source/source_estate/module_charge/charge_mpi.cpp b/source/source_estate/module_charge/charge_mpi.cpp
@@ -30,7 +30,7 @@ void Charge::reduce_diff_pools(double* array_rho) const
     ModuleBase::timer::tick("Charge", "reduce_diff_pools");
     if (KP_WORLD != MPI_COMM_NULL)
     {
-        MPI_Allreduce(MPI_IN_PLACE, array_rho, this->nrxx, MPI_DOUBLE, MPI_SUM, KP_WORLD);
+        Parallel_Reduce::reduce_double_kp(array_rho, this->nrxx);
     }
     else
     {
@@ -111,7 +111,7 @@ void Charge::reduce_diff_pools(double* array_rho) const
     }
     if(PARAM.globalv.all_ks_run && PARAM.inp.bndpar > 1)
     {
-        MPI_Allreduce(MPI_IN_PLACE, array_rho, this->nrxx, MPI_DOUBLE, MPI_SUM, BP_WORLD);
+        Parallel_Reduce::reduce_double_bp(array_rho, this->nrxx);
     }
     ModuleBase::timer::tick("Charge", "reduce_diff_pools");
 }

diff --git a/source/source_estate/module_charge/symmetry_rhog.cpp b/source/source_estate/module_charge/symmetry_rhog.cpp
@@ -1,6 +1,7 @@
 #include "symmetry_rho.h"
 #include "source_pw/module_pwdft/global.h"
 #include "source_base/parallel_global.h"
+#include "source_base/parallel_reduce.h"
 #include "source_hamilt/module_xc/xc_functional.h"
 
 
@@ -10,7 +11,7 @@ void Symmetry_rho::psymmg(std::complex<double>* rhog_part, const ModulePW::PW_Ba
 	int * fftixy2is = new int [rho_basis->fftnxy];
 	rho_basis->getfftixy2is(fftixy2is);		//current proc
 #ifdef __MPI
-	MPI_Allreduce(MPI_IN_PLACE, fftixy2is, rho_basis->fftnxy, MPI_INT, MPI_SUM, POOL_WORLD);
+	Parallel_Reduce::reduce_pool(fftixy2is, rho_basis->fftnxy);
 	if(rho_basis->poolnproc>1)
 		for (int i=0;i<rho_basis->fftnxy;++i)
 			fftixy2is[i]+=rho_basis->poolnproc-1;

diff --git a/source/source_hsolver/diago_bpcg.cpp b/source/source_hsolver/diago_bpcg.cpp
@@ -4,6 +4,7 @@
 #include "source_base/global_function.h"
 #include "source_base/kernels/math_kernel_op.h"
 #include "source_base/parallel_comm.h" // different MPI worlds
+#include "source_base/parallel_reduce.h"
 #include "source_hsolver/kernels/bpcg_kernel_op.h"
 #include "para_linear_transform.h"
 
@@ -86,7 +87,7 @@ bool DiagoBPCG<T, Device>::test_error(const ct::Tensor& err_in, const std::vecto
         }
     }
 #ifdef __MPI
-    MPI_Allreduce(MPI_IN_PLACE, &not_conv, 1, MPI_C_BOOL, MPI_LOR, BP_WORLD);
+    Parallel_Reduce::reduce_or_bp(not_conv);
 #endif
     return not_conv;
 }