
Commit bc7b92c

add gatherC for para_gemm

1 parent 1f26646

File tree

3 files changed: +154 −108 lines changed

source/module_base/para_gemm.cpp

Lines changed: 82 additions & 71 deletions
@@ -24,55 +24,59 @@ void PGemmCN<T, Device>::set_dimension(
     const int ncolB_in,
     const int LDB_in,
     const int nrow_in,
-    const int LDC_global_in)
+    const int LDC_in,
+    const bool gatherC_in)
 {
 #ifdef __MPI
     MPI_Comm_rank(comm_col, &col_rank);
     MPI_Comm_size(comm_col, &col_nproc);
     if (comm_row != MPI_COMM_NULL)
     {
         MPI_Comm_rank(comm_row, &row_rank);
+        MPI_Comm_size(comm_row, &row_nproc);
     }
     col_world = comm_col;
     row_world = comm_row;
 #endif
     this->LDA = LDA_in;
     this->LDB = LDB_in;
-    this->LDC_global = LDC_global_in;
+    this->LDC = LDC_in;
     this->ncolA = ncolA_in;
     this->ncolB = ncolB_in;
     this->nrow = nrow_in;
 #ifdef __MPI
+    this->gatherC = gatherC_in;
     colA_loc.resize(col_nproc);
-    colB_loc.resize(col_nproc);
-    row_loc.resize(col_nproc);
-    recv_counts.resize(col_nproc);
-    displs.resize(col_nproc);
-    requests.resize(col_nproc);
     MPI_Allgather(&ncolA, 1, MPI_INT, colA_loc.data(), 1, MPI_INT, col_world);
-    MPI_Allgather(&ncolB, 1, MPI_INT, colB_loc.data(), 1, MPI_INT, col_world);
-    MPI_Allgather(&nrow, 1, MPI_INT, row_loc.data(), 1, MPI_INT, col_world);
     for (int ip = 0; ip < col_nproc; ip++)
     {
         max_colA = std::max(max_colA, colA_loc[ip]);
     }

-    for (int ip = 0; ip < col_nproc; ip++)
-    {
-        recv_counts[ip] = LDC_global * colB_loc[ip];
-    }
-    displs[0] = 0;
-    for (int ip = 1; ip < col_nproc; ip++)
+    if (this->gatherC)
     {
-        displs[ip] = displs[ip - 1] + recv_counts[ip - 1];
+        colB_loc.resize(col_nproc);
+        recv_counts.resize(col_nproc);
+        displs.resize(col_nproc);
+        requests.resize(col_nproc);
+        MPI_Allgather(&ncolB, 1, MPI_INT, colB_loc.data(), 1, MPI_INT, col_world);
+        for (int ip = 0; ip < col_nproc; ip++)
+        {
+            recv_counts[ip] = LDC * colB_loc[ip];
+        }
+        displs[0] = 0;
+        for (int ip = 1; ip < col_nproc; ip++)
+        {
+            displs[ip] = displs[ip - 1] + recv_counts[ip - 1];
+        }
+        size_C_global = displs[col_nproc - 1] + recv_counts[col_nproc - 1];
     }
-    size_C_global = displs[col_nproc - 1] + recv_counts[col_nproc - 1];
-    send_counts = ncolB * LDC_global;
+    size_C_local = ncolB * LDC;
 #endif
 }

 template <typename T, typename Device>
-void PGemmCN<T, Device>::multiply(const T alpha, const T* A, const T* B, const T beta, T* C_global)
+void PGemmCN<T, Device>::multiply(const T alpha, const T* A, const T* B, const T beta, T* C)
 {
     const Device* ctx = {};
 #ifdef __MPI
@@ -88,20 +92,23 @@ void PGemmCN<T, Device>::multiply(const T alpha, const T* A, const T* B, const T
             }
         }

-        std::vector<T> C_tmp(send_counts);
-        T* Ctmp_device = nullptr;
-        if (std::is_same<Device, base_device::DEVICE_GPU>::value)
-        {
-            resmem_dev_op()(Ctmp_device, send_counts);
-        }
-        else
+        T* C_local = C;
+        std::vector<T> C_tmp;
+        if (this->gatherC)
         {
-            Ctmp_device = C_tmp.data();
+            C_tmp.resize(size_C_local);
+            if (std::is_same<Device, base_device::DEVICE_GPU>::value)
+            {
+                C_local = nullptr;
+                resmem_dev_op()(C_local, size_C_local);
+            }
+            else
+            {
+                C_local = C_tmp.data();
+            }
+            syncmem_dev_op()(C_local, C + displs[col_rank], size_C_local);
         }

-        T* C_local = C_global + displs[col_rank];
-        syncmem_dev_op()(Ctmp_device, C_local, send_counts);
-
         T* Atmp_device = nullptr;
         if (std::is_same<Device, base_device::DEVICE_GPU>::value)
         {
@@ -116,7 +123,7 @@ void PGemmCN<T, Device>::multiply(const T alpha, const T* A, const T* B, const T
         T real_beta = row_rank == 0 ? beta : 0;
         for (int ip = 0; ip < col_nproc; ip++)
         {
-            T* C_start = Ctmp_device + shift;
+            T* C_start = C_local + shift;
             if (col_rank == ip)
             {
                 ModuleBase::gemm_op<T, Device>()(ctx,
@@ -132,7 +139,7 @@ void PGemmCN<T, Device>::multiply(const T alpha, const T* A, const T* B, const T
                                                  LDB,
                                                  &real_beta,
                                                  C_start,
-                                                 LDC_global);
+                                                 LDC);
                 shift += ncolA;
             }
             else
@@ -155,61 +162,65 @@ void PGemmCN<T, Device>::multiply(const T alpha, const T* A, const T* B, const T
                                                  LDB,
                                                  &real_beta,
                                                  C_start,
-                                                 LDC_global);
+                                                 LDC);
                 shift += m;
             }
         }

-        T* Cglobal_cpu = nullptr;
-        if (std::is_same<Device, base_device::DEVICE_GPU>::value)
+        if (this->gatherC)
         {
-            delmem_dev_op()(Ctmp_device);
-            delmem_dev_op()(Atmp_device);
-            syncmem_dev_op()(C_tmp.data(), Ctmp_device, send_counts);
-            resmem_dev_op()(Cglobal_cpu, size_C_global);
+            T* Cglobal_cpu = nullptr;
+            T* Clocal_cpu = C_tmp.data();;
+            if (std::is_same<Device, base_device::DEVICE_GPU>::value)
+            {
+                delmem_dev_op()(Atmp_device);
+
+                syncmem_d2h_op()(Clocal_cpu, C_local, size_C_local);
+                delmem_dev_op()(C_local);
+
+                resmem_dev_op()(Cglobal_cpu, size_C_global);
+            }
+            else
+            {
+                Cglobal_cpu = C;
+            }
+            if (this->row_nproc > 1)
+            {
+                Parallel_Common::reduce_data(Clocal_cpu, size_C_local, row_world);
+            }
+            Parallel_Common::gatherv_data(Clocal_cpu,
+                                          size_C_local,
+                                          Cglobal_cpu,
+                                          recv_counts.data(),
+                                          displs.data(),
+                                          col_world);
+
+            if (std::is_same<Device, base_device::DEVICE_GPU>::value)
+            {
+                syncmem_h2d_op()(C, Cglobal_cpu, size_C_global);
+                delmem_dev_op()(Cglobal_cpu);
+            }
         }
         else
         {
-            Cglobal_cpu = C_global;
-        }
-        Parallel_Common::gatherv_data(C_tmp.data(),
-                                      send_counts,
-                                      Cglobal_cpu,
-                                      recv_counts.data(),
-                                      displs.data(),
-                                      col_world);
-        if (row_world != MPI_COMM_NULL)
-        {
-            Parallel_Common::reduce_data(Cglobal_cpu, size_C_global, row_world);
-        }
-        if (std::is_same<Device, base_device::DEVICE_GPU>::value)
-        {
-            syncmem_dev_op()(C_global, Cglobal_cpu, size_C_global);
-            delmem_dev_op()(Cglobal_cpu);
+            if (this->row_nproc > 1)
+            {
+                Parallel_Common::reduce_dev<T, Device>(C, size_C_local, row_world);
+            }
         }
     }
     else
     {
         T real_beta = row_rank == 0 ? beta : 0;
 #else
-    T real_beta = beta;
+        T real_beta = beta;
 #endif
-    ModuleBase::gemm_op<T, Device>()(ctx,
-                                     'C',
-                                     'N',
-                                     ncolA,
-                                     ncolB,
-                                     nrow,
-                                     &alpha,
-                                     A,
-                                     LDA,
-                                     B,
-                                     LDB,
-                                     &real_beta,
-                                     C_global,
-                                     LDC_global);
+        ModuleBase::gemm_op<T, Device>()(ctx, 'C', 'N', ncolA, ncolB, nrow, &alpha, A, LDA, B, LDB, &real_beta, C, LDC);
 #ifdef __MPI
-    Parallel_Common::reduce_dev<T, Device>(C_global, size_C_global, row_world);
+        if (this->row_nproc > 1)
+        {
+            Parallel_Common::reduce_dev<T, Device>(C, size_C_local, row_world);
+        }
     }
 #endif
 }
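
For reference, the gather pattern behind the new gatherC path can be reproduced standalone. The sketch below is illustrative only: raw MPI_Allgather/MPI_Allgatherv stand in for the Parallel_Common wrappers, and the communicator and sizes are made up. It mirrors the count/displacement bookkeeping that set_dimension now performs only when gatherC is set, followed by the column-block gather done at the end of multiply:

```cpp
// Minimal sketch of the gatherC bookkeeping and gather, with raw MPI
// collectives standing in for Parallel_Common::gatherv_data. All sizes
// and data are hypothetical.
#include <mpi.h>
#include <cstdio>
#include <vector>

int main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);
    MPI_Comm col_world = MPI_COMM_WORLD; // stand-in for the column communicator
    int col_rank = 0, col_nproc = 1;
    MPI_Comm_rank(col_world, &col_rank);
    MPI_Comm_size(col_world, &col_nproc);

    const int LDC = 4;              // leading dimension shared by C_local and C_global
    const int ncolB = col_rank + 1; // each rank owns a different number of B columns

    // Same bookkeeping as set_dimension: gather every rank's column count,
    // then build recv_counts and displs for the column-block gather.
    std::vector<int> colB_loc(col_nproc), recv_counts(col_nproc), displs(col_nproc);
    MPI_Allgather(&ncolB, 1, MPI_INT, colB_loc.data(), 1, MPI_INT, col_world);
    for (int ip = 0; ip < col_nproc; ip++)
    {
        recv_counts[ip] = LDC * colB_loc[ip];
    }
    displs[0] = 0;
    for (int ip = 1; ip < col_nproc; ip++)
    {
        displs[ip] = displs[ip - 1] + recv_counts[ip - 1];
    }
    const int size_C_local = ncolB * LDC;
    const int size_C_global = displs[col_nproc - 1] + recv_counts[col_nproc - 1];

    // Gather the local column block so every rank ends up with the same
    // C_global (column-major, contiguous column blocks per rank).
    std::vector<double> C_local(size_C_local, double(col_rank));
    std::vector<double> C_global(size_C_global);
    MPI_Allgatherv(C_local.data(), size_C_local, MPI_DOUBLE,
                   C_global.data(), recv_counts.data(), displs.data(), MPI_DOUBLE,
                   col_world);

    if (col_rank == 0)
    {
        std::printf("gathered %d elements\n", size_C_global);
    }
    MPI_Finalize();
    return 0;
}
```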

source/module_base/para_gemm.h

Lines changed: 27 additions & 23 deletions
@@ -12,9 +12,13 @@ namespace ModuleBase
 {
 /**
  * @brief this class is used to perform parallel matrix multiplication
- * C_global = alpha * A^+ * B + beta * C_global
- * Here, A and B are local matrices in each proc, and C_global is a global matrix gathered from all procs
- * All procs have their own C_global matrix with the same values.
+ * C = alpha * A^H * B + beta * C
+ * Here, A and B are local matrices in each proc,
+ * C can be C_local or C_global, depending on the value of gatherC
+ * C_local is a local matrix in each proc
+ * C_global is a global matrix gathered from all procs, and all procs have their own C_global matrix with the same
+ * values.
+ * C_global and C_local have the same LDC, but different column numbers
 */
 template <typename T, typename Device = base_device::DEVICE_CPU>
 class PGemmCN
@@ -24,14 +28,15 @@ class PGemmCN
     ~PGemmCN();

     /**
-     * @brief set the dimension of A, B, and C_global
+     * @brief set the dimension of A, B, and C
      *
      * @param ncolA number of columns of A, which is a local matrix in each proc
      * @param LDA leading dimension of A in each proc
      * @param ncolB number of columns of B, which is a local matrix in each proc
      * @param LDB leading dimension of B in each proc
      * @param nrow number of rows of A or B
-     * @param LDC_global leading dimension of C_global, which is the global C matrix gathered from all procs
+     * @param LDC leading dimension of C. C can be C_local or C_global
+     * @param gatherC whether to gather C_local to C_global
      */
     void set_dimension(
 #ifdef __MPI
@@ -43,47 +48,46 @@ class PGemmCN
         const int ncolB,
         const int LDB,
         const int nrow,
-        const int LDC_global);
+        const int LDC,
+        const bool gatherC = true);
+
     /**
-     * @brief calculate C_global = alpha * A^+ * B + beta * C_global
+     * @brief calculate C = alpha * A^H * B + beta * C
      *
-     * @param alpha
-     * @param A
-     * @param B
-     * @param beta
-     * @param C_global
      */
-    void multiply(const T alpha, const T* A, const T* B, const T beta, T* C_global);
+    void multiply(const T alpha, const T* A, const T* B, const T beta, T* C);
 #ifdef __MPI
     MPI_Comm col_world = MPI_COMM_NULL; ///< column communicator world
     MPI_Comm row_world = MPI_COMM_NULL; ///< row communicator world

     int col_rank = 0;  ///< rank in col_world
     int col_nproc = 1; ///< number of procs in col_world
     int row_rank = 0;  ///< rank in row_world
+    int row_nproc = 1; ///< number of procs in row_world

     std::vector<int> colA_loc; ///< [col_nproc] number of columns of A matrix in each proc
     int max_colA = 0;          ///< maximum number of columns of A matrix in all procs
     std::vector<int> colB_loc; ///<[col_nproc] number of columns of B matrix in each proc
-    std::vector<int> row_loc;  ///<[col_nproc] number of rows of C matrix in each proc

     std::vector<MPI_Request> requests; ///< MPI request
     std::vector<int> recv_counts;      ///< receive counts for gathering C_local to C_global
     std::vector<int> displs;           ///< displacements for gathering C_local to C_global
-    int send_counts = 0;   ///< send counts for gathering C_local to C_global
+    int size_C_local = 0;  ///< size of C_local, which is a local matrix in each proc
     int size_C_global = 0; ///< size of C_global, which is the global C matrix gathered from all procs
+    bool gatherC = true;   ///< whether to gather C_local to C_global
 #endif
-    int ncolA = 0;      ///< number of columns of A, which is a local matrix in each proc
-    int ncolB = 0;      ///< number of columns of B, which is a local matrix in each proc
-    int nrow = 0;       ///< number of rows of A or B
-    int LDA = 0;        ///< leading dimension of A in each proc
-    int LDB = 0;        ///< leading dimension of B in each proc
-    int LDC_global = 0; ///< leading dimension of C_global, which is the global C matrix gathered from all procs
+    int ncolA = 0; ///< number of columns of A, which is a local matrix in each proc
+    int ncolB = 0; ///< number of columns of B, which is a local matrix in each proc
+    int nrow = 0;  ///< number of rows of A or B
+    int LDA = 0;   ///< leading dimension of A in each proc
+    int LDB = 0;   ///< leading dimension of B in each proc
+    int LDC = 0;   ///< leading dimension of C, which can be C_local or C_global
 private:
-    using resmem_dev_op  = base_device::memory::resize_memory_op<T, Device>;
+    using resmem_dev_op = base_device::memory::resize_memory_op<T, Device>;
     using delmem_dev_op = base_device::memory::delete_memory_op<T, Device>;
     using syncmem_dev_op = base_device::memory::synchronize_memory_op<T, Device, Device>;
-
+    using syncmem_d2h_op = base_device::memory::synchronize_memory_op<T, base_device::DEVICE_CPU, Device>;
+    using syncmem_h2d_op = base_device::memory::synchronize_memory_op<T, Device, base_device::DEVICE_CPU>;
 };
 } // namespace ModuleBase
 #endif
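
For completeness, a hypothetical usage sketch of the updated interface follows. The leading arguments of set_dimension (the two communicators, ncolA, LDA) fall outside the hunks shown above, so their order is assumed from the .cpp body; the include path and all sizes are likewise illustrative, not taken from the commit.

```cpp
// Hypothetical usage of PGemmCN with the new gatherC flag. The argument order
// of set_dimension before ncolB (comm_col, comm_row, ncolA, LDA) is assumed,
// since those parameter lines are outside the hunks shown above.
#include "module_base/para_gemm.h"
#include <mpi.h>
#include <vector>

void example(MPI_Comm comm_col, MPI_Comm comm_row)
{
    using T = double;
    const int nrow = 100, ncolA = 8, ncolB = 6;

    // C = alpha * A^H * B + beta * C is (sum of ncolA over column ranks) x ncolB
    // locally, so LDC must cover the total number of A columns in comm_col.
    int LDC = 0;
    MPI_Allreduce(&ncolA, &LDC, 1, MPI_INT, MPI_SUM, comm_col);

    ModuleBase::PGemmCN<T> pgemm; // Device defaults to base_device::DEVICE_CPU
    // gatherC = false: C keeps only the local ncolB columns and is reduced
    // over comm_row; no gather over comm_col takes place.
    pgemm.set_dimension(comm_col, comm_row, ncolA, nrow, ncolB, nrow, nrow, LDC, false);

    std::vector<T> A(nrow * ncolA, T(1)), B(nrow * ncolB, T(1)), C(LDC * ncolB, T(0));
    pgemm.multiply(T(1), A.data(), B.data(), T(0), C.data());
    // With gatherC = true (the default), C must instead hold LDC * (total ncolB)
    // elements, and every rank receives the same gathered C_global.
}
```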
