diff --git a/source/module_base/kernels/cuda/math_kernel_op.cu b/source/module_base/kernels/cuda/math_kernel_op.cu
index c17f9cd579..39043daff1 100644
--- a/source/module_base/kernels/cuda/math_kernel_op.cu
+++ b/source/module_base/kernels/cuda/math_kernel_op.cu
@@ -385,22 +385,15 @@ __global__ void matrix_transpose_kernel(
     }
 }
 
-
 template <typename T>
-__global__ void matrix_setTo_another_kernel(
-    const int n,
-    const int LDA,
-    const int LDB,
-    const T* matrix_A,
-    T* matrix_B)
+__global__ void matrix_copy_kernel(const int n1, const int n2, const T* A, const int LDA, T* B, const int LDB)
 {
-    int j = blockIdx.x * blockDim.x + threadIdx.x;
-    if (j < LDA && j < LDB)
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    const int j = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (i < n1 && j < n2)
     {
-        for (int i = 0; i < n; i++)
-        {
-            matrix_B[i * LDB + j] = matrix_A[i * LDA + j];
-        }
+        B[i * LDB + j] = A[i * LDA + j];
     }
 }
@@ -980,40 +973,43 @@ void matrixTranspose_op<std::complex<double>, base_device::DEVICE_GPU>::operator
 }
 
 template <>
-void matrixSetToAnother<double, base_device::DEVICE_GPU>::operator()(const int& n,
-                                                                     const double* A,
-                                                                     const int& LDA,
-                                                                     double* B,
-                                                                     const int& LDB)
+void matrixCopy<double, base_device::DEVICE_GPU>::operator()(const int& n1,
+                                                             const int& n2,
+                                                             const double* A,
+                                                             const int& LDA,
+                                                             double* B,
+                                                             const int& LDB)
 {
-    int thread = 1024;
-    int block = (LDA + thread - 1) / thread;
-    matrix_setTo_another_kernel<double> <<<block, thread>>> (n, LDA, LDB, A, B);
+    const dim3 blockSize(16, 16);
+    const dim3 gridSize((n1 + blockSize.x - 1) / blockSize.x, (n2 + blockSize.y - 1) / blockSize.y);
+    matrix_copy_kernel<double> <<<gridSize, blockSize>>> (n1, n2, A, LDA, B, LDB);
     cudaCheckOnDebug();
 }
 
 template <>
-void matrixSetToAnother<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int& n,
-                                                                                  const std::complex<float>* A,
-                                                                                  const int& LDA,
-                                                                                  std::complex<float>* B,
-                                                                                  const int& LDB)
+void matrixCopy<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int& n1,
+                                                                          const int& n2,
+                                                                          const std::complex<float>* A,
+                                                                          const int& LDA,
+                                                                          std::complex<float>* B,
+                                                                          const int& LDB)
 {
-    int thread = 1024;
-    int block = (LDA + thread - 1) / thread;
-    matrix_setTo_another_kernel<thrust::complex<float>> <<<block, thread>>> (n, LDA, LDB, reinterpret_cast<const thrust::complex<float>*>(A), reinterpret_cast<thrust::complex<float>*>(B));
+    const dim3 blockSize(16, 16);
+    const dim3 gridSize((n1 + blockSize.x - 1) / blockSize.x, (n2 + blockSize.y - 1) / blockSize.y);
+    matrix_copy_kernel<thrust::complex<float>> <<<gridSize, blockSize>>> (n1, n2, reinterpret_cast<const thrust::complex<float>*>(A), LDA, reinterpret_cast<thrust::complex<float>*>(B), LDB);
     cudaCheckOnDebug();
+
 }
 
 template <>
-void matrixSetToAnother<std::complex<double>, base_device::DEVICE_GPU>::operator()(const int& n,
-                                                                                   const std::complex<double>* A,
-                                                                                   const int& LDA,
-                                                                                   std::complex<double>* B,
-                                                                                   const int& LDB)
+void matrixCopy<std::complex<double>, base_device::DEVICE_GPU>::operator()(const int& n1,
+                                                                           const int& n2,
+                                                                           const std::complex<double>* A,
+                                                                           const int& LDA,
+                                                                           std::complex<double>* B,
+                                                                           const int& LDB)
 {
-    int thread = 1024;
-    int block = (LDA + thread - 1) / thread;
-    matrix_setTo_another_kernel<thrust::complex<double>> <<<block, thread>>> (n, LDA, LDB, reinterpret_cast<const thrust::complex<double>*>(A), reinterpret_cast<thrust::complex<double>*>(B));
-
+    const dim3 blockSize(16, 16);
+    const dim3 gridSize((n1 + blockSize.x - 1) / blockSize.x, (n2 + blockSize.y - 1) / blockSize.y);
+    matrix_copy_kernel<thrust::complex<double>> <<<gridSize, blockSize>>> (n1, n2, reinterpret_cast<const thrust::complex<double>*>(A), LDA, reinterpret_cast<thrust::complex<double>*>(B), LDB);
     cudaCheckOnDebug();
 }
 
@@ -1027,7 +1023,7 @@ template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GP
 template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<float, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_GPU>;
-template struct matrixSetToAnother<std::complex<float>, base_device::DEVICE_GPU>;
+template struct matrixCopy<std::complex<float>, base_device::DEVICE_GPU>;
 template struct dot_real_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct calc_grad_with_block_op<std::complex<float>, base_device::DEVICE_GPU>;
@@ -1035,15 +1031,15 @@ template struct line_minimize_with_block_op<std::complex<float>, base_device::D
 template struct vector_div_constant_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct vector_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
+template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_GPU>;
-template struct matrixSetToAnother<std::complex<double>, base_device::DEVICE_GPU>;
+template struct matrixCopy<double, base_device::DEVICE_GPU>;
+template struct matrixCopy<std::complex<double>, base_device::DEVICE_GPU>;
 
 #ifdef __LCAO
 template struct dot_real_op<double, base_device::DEVICE_GPU>;
 template struct vector_div_constant_op<double, base_device::DEVICE_GPU>;
 template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<double, base_device::DEVICE_GPU>;
-template struct matrixSetToAnother<double, base_device::DEVICE_GPU>;
-template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>;
 #endif
 } // namespace ModuleBase
diff --git a/source/module_base/kernels/math_kernel_op.cpp b/source/module_base/kernels/math_kernel_op.cpp
index 3f25a1e47b..bb0bb923a5 100644
--- a/source/module_base/kernels/math_kernel_op.cpp
+++ b/source/module_base/kernels/math_kernel_op.cpp
@@ -337,16 +337,16 @@ struct matrixTranspose_op<T, base_device::DEVICE_CPU>
 };
 
 template <typename T>
-struct matrixSetToAnother<T, base_device::DEVICE_CPU>
+struct matrixCopy<T, base_device::DEVICE_CPU>
 {
-    void operator()(const int& n, const T* A, const int& LDA, T* B, const int& LDB)
+    void operator()(const int& n1, const int& n2, const T* A, const int& LDA, T* B, const int& LDB)
     {
 #ifdef _OPENMP
 #pragma omp parallel for collapse(2) schedule(static, 8192 / sizeof(T))
 #endif
-        for (int i = 0; i < n; i++)
+        for (int i = 0; i < n1; i++)
         {
-            for (int j = 0; j < LDA; j++)
+            for (int j = 0; j < n2; j++)
             {
                 B[i * LDB + j] = A[i * LDA + j];
             }
@@ -367,7 +367,7 @@ template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_CP
 template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_CPU>;
 template struct constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_CPU>;
 template struct matrixTranspose_op<std::complex<float>, base_device::DEVICE_CPU>;
-template struct matrixSetToAnother<std::complex<float>, base_device::DEVICE_CPU>;
+template struct matrixCopy<std::complex<float>, base_device::DEVICE_CPU>;
 template struct calc_grad_with_block_op<std::complex<float>, base_device::DEVICE_CPU>;
 template struct line_minimize_with_block_op<std::complex<float>, base_device::DEVICE_CPU>;
 
@@ -385,7 +385,8 @@ template struct vector_mul_vector_op<std::complex<double>, base_device::DEVICE_C
 template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_CPU>;
 template struct constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_CPU>;
 template struct matrixTranspose_op<std::complex<double>, base_device::DEVICE_CPU>;
-template struct matrixSetToAnother<std::complex<double>, base_device::DEVICE_CPU>;
+template struct matrixCopy<double, base_device::DEVICE_CPU>;
+template struct matrixCopy<std::complex<double>, base_device::DEVICE_CPU>;
 template struct calc_grad_with_block_op<std::complex<double>, base_device::DEVICE_CPU>;
 template struct line_minimize_with_block_op<std::complex<double>, base_device::DEVICE_CPU>;
 
@@ -394,7 +395,6 @@ template struct vector_mul_vector_op<double, base_device::DEVICE_CPU>;
 template struct vector_div_constant_op<double, base_device::DEVICE_CPU>;
 template struct vector_div_vector_op<double, base_device::DEVICE_CPU>;
 template struct matrixTranspose_op<double, base_device::DEVICE_CPU>;
-template struct matrixSetToAnother<double, base_device::DEVICE_CPU>;
 template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_CPU>;
 #endif
 #ifdef __DSP
diff --git a/source/module_base/kernels/math_kernel_op.h b/source/module_base/kernels/math_kernel_op.h
index 33ad5a7b4d..d94aeb6806 100644
--- a/source/module_base/kernels/math_kernel_op.h
+++ b/source/module_base/kernels/math_kernel_op.h
@@ -298,19 +298,19 @@ template <typename T, typename Device> struct matrixTranspose_op {
                     const T *input_matrix, T *output_matrix);
 };
 
-template <typename T, typename Device> struct matrixSetToAnother {
-    /// @brief initialize matrix B with A
+template <typename T, typename Device> struct matrixCopy {
+    /// @brief copy matrix A to matrix B; the two may have different leading dimensions
     ///
     /// Input Parameters
-    /// \param n : first dimension of matrix
+    /// \param n1 : first dimension of matrix (number of rows)
+    /// \param n2 : second dimension of matrix (number of columns to copy)
     /// \param A : input matrix A
     /// \param LDA : leading dimension of A
     /// \param LDB : leading dimension of B
     ///
    /// Output Parameters
     /// \param B : output matrix B
-    void operator()(const int &n, const T *A, const int &LDA,
-                    T *B, const int &LDB);
+    void operator()(const int& n1, const int& n2, const T* A, const int& LDA, T* B, const int& LDB);
 };
 
 #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
@@ -370,12 +370,13 @@ struct constantvector_addORsub_constantVector_op {
                     const Real constant2);
 };
 
-template <typename T> struct matrixSetToAnother<T, base_device::DEVICE_GPU> {
-    void operator()(const int &n,
-                    const T *A, // input
-                    const int &LDA,
-                    T *B, // output
-                    const int &LDB);
+template <typename T> struct matrixCopy<T, base_device::DEVICE_GPU> {
+    void operator()(const int& n1,
+                    const int& n2,
+                    const T* A, // input
+                    const int& LDA,
+                    T* B, // output
+                    const int& LDB);
 };
 
 void createGpuBlasHandle();
diff --git a/source/module_base/kernels/rocm/math_kernel_op.hip.cu b/source/module_base/kernels/rocm/math_kernel_op.hip.cu
index ba20d282b7..0279bfee75 100644
--- a/source/module_base/kernels/rocm/math_kernel_op.hip.cu
+++ b/source/module_base/kernels/rocm/math_kernel_op.hip.cu
@@ -307,23 +307,16 @@ __global__ void matrix_transpose_kernel(
     }
 }
 
-
 template <typename T>
-__launch_bounds__(1024)
-__global__ void matrix_setTo_another_kernel(
-    const int n,
-    const int LDA,
-    const int LDB,
-    const T* matrix_A,
-    T* matrix_B)
-{
-    int j = blockIdx.x * blockDim.x + threadIdx.x;
-    if (j < LDA && j < LDB)
+__launch_bounds__(1024) __global__
+    void matrix_copy_kernel(const int n1, const int n2, const T* A, const int LDA, T* B, const int LDB)
+{
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    const int j = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (i < n1 && j < n2)
     {
-        for (int i = 0; i < n; i++)
-        {
-            matrix_B[i * LDB + j] = matrix_A[i * LDA + j];
-        }
+        B[i * LDB + j] = A[i * LDA + j];
     }
 }
@@ -889,39 +882,45 @@ void matrixTranspose_op<std::complex<double>, base_device::DEVICE_GPU>::operator
 }
 
 template <>
-void matrixSetToAnother<double, base_device::DEVICE_GPU>::operator()(const int& n,
-                                                                     const double* A,
-                                                                     const int& LDA,
-                                                                     double* B,
-                                                                     const int& LDB)
+void matrixCopy<double, base_device::DEVICE_GPU>::operator()(const int& n1,
+                                                             const int& n2,
+                                                             const double* A,
+                                                             const int& LDA,
+                                                             double* B,
+                                                             const int& LDB)
 {
-    int thread = 1024;
-    int block = (LDA + thread - 1) / thread;
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_setTo_another_kernel<double>), dim3(block), dim3(thread), 0, 0, n, LDA, LDB, A, B);
+    const dim3 blockSize(16, 16);
+    const dim3 gridSize((n1 + blockSize.x - 1) / blockSize.x, (n2 + blockSize.y - 1) / blockSize.y);
+
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_copy_kernel<double>), gridSize, blockSize, 0, 0, n1, n2, A, LDA, B, LDB);
     hipCheckOnDebug();
 }
 
 template <>
-void matrixSetToAnother<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int& n,
-                                                                                  const std::complex<float>* A,
-                                                                                  const int& LDA,
-                                                                                  std::complex<float>* B,
-                                                                                  const int& LDB)
+void matrixCopy<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int& n1,
+                                                                          const int& n2,
+                                                                          const std::complex<float>* A,
+                                                                          const int& LDA,
+                                                                          std::complex<float>* B,
+                                                                          const int& LDB)
 {
-    int thread = 1024;
-    int block = (LDA + thread - 1) / thread;
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_setTo_another_kernel<thrust::complex<float>>), dim3(block), dim3(thread), 0, 0, n, LDA, LDB, reinterpret_cast<const thrust::complex<float>*>(A), reinterpret_cast<thrust::complex<float>*>(B));
+    const dim3 blockSize(16, 16);
+    const dim3 gridSize((n1 + blockSize.x - 1) / blockSize.x, (n2 + blockSize.y - 1) / blockSize.y);
+
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_copy_kernel<thrust::complex<float>>), gridSize, blockSize, 0, 0, n1, n2, reinterpret_cast<const thrust::complex<float>*>(A), LDA, reinterpret_cast<thrust::complex<float>*>(B), LDB);
     hipCheckOnDebug();
 }
 
 template <>
-void matrixSetToAnother<std::complex<double>, base_device::DEVICE_GPU>::operator()(const int& n,
-                                                                                   const std::complex<double>* A,
-                                                                                   const int& LDA,
-                                                                                   std::complex<double>* B,
-                                                                                   const int& LDB)
+void matrixCopy<std::complex<double>, base_device::DEVICE_GPU>::operator()(const int& n1,
+                                                                           const int& n2,
+                                                                           const std::complex<double>* A,
+                                                                           const int& LDA,
+                                                                           std::complex<double>* B,
+                                                                           const int& LDB)
 {
-    int thread = 1024;
-    int block = (LDA + thread - 1) / thread;
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_setTo_another_kernel<thrust::complex<double>>), dim3(block), dim3(thread), 0, 0, n, LDA, LDB, reinterpret_cast<const thrust::complex<double>*>(A), reinterpret_cast<thrust::complex<double>*>(B));
+    const dim3 blockSize(16, 16);
+    const dim3 gridSize((n1 + blockSize.x - 1) / blockSize.x, (n2 + blockSize.y - 1) / blockSize.y);
+
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_copy_kernel<thrust::complex<double>>), gridSize, blockSize, 0, 0, n1, n2, reinterpret_cast<const thrust::complex<double>*>(A), LDA, reinterpret_cast<thrust::complex<double>*>(B), LDB);
     hipCheckOnDebug();
 }
 
@@ -935,7 +934,7 @@ template struct vector_div_constant_op<std::complex<float>, base_device::DEVICE_
 template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_GPU>;
-template struct matrixSetToAnother<std::complex<float>, base_device::DEVICE_GPU>;
+template struct matrixCopy<std::complex<float>, base_device::DEVICE_GPU>;
 template struct dot_real_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct calc_grad_with_block_op<std::complex<float>, base_device::DEVICE_GPU>;
 
@@ -944,14 +943,14 @@ template struct vector_div_constant_op<std::complex<double>, base_device::DEVICE
 template struct vector_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_GPU>;
-template struct matrixSetToAnother<std::complex<double>, base_device::DEVICE_GPU>;
+template struct matrixCopy<std::complex<double>, base_device::DEVICE_GPU>;
 
 #ifdef __LCAO
 template struct dot_real_op<double, base_device::DEVICE_GPU>;
 template struct vector_div_constant_op<double, base_device::DEVICE_GPU>;
 template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<double, base_device::DEVICE_GPU>;
-template struct matrixSetToAnother<double, base_device::DEVICE_GPU>;
+template struct matrixCopy<double, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>;
 #endif
 } // namespace ModuleBase
diff --git a/source/module_base/kernels/test/math_kernel_test.cpp b/source/module_base/kernels/test/math_kernel_test.cpp
index 02dd7278f3..9490563743 100644
--- a/source/module_base/kernels/test/math_kernel_test.cpp
+++ b/source/module_base/kernels/test/math_kernel_test.cpp
@@ -630,7 +630,7 @@ TEST_F(TestModuleHsolverMathKernel, gemv_op_gpu)
     delete_memory_op()(Y_gemv_dev);
 }
 
-TEST_F(TestModuleHsolverMathKernel, matrixSetToAnother_op_gpu)
+TEST_F(TestModuleHsolverMathKernel, matrixCopy_op_gpu)
 {
     // const std::vector<std::complex<double>> expect_result = {
     //     {-0.11893203,-0.13492526}, {-0.40314756, 0.07734553}, {0.06892412, 0.14837423}, {0.0, 0.0},
@@ -665,25 +665,15 @@ TEST_F(TestModuleHsolverMathKernel, matrixSetToAnother_op_gpu)
                                                                              B.size());
 
     // run
-    ModuleBase::matrixSetToAnother<std::complex<double>, base_device::DEVICE_GPU>()(n,
-                                                                                    device_A,
-                                                                                    LDA,
-                                                                                    device_B,
-                                                                                    LDB);
+    ModuleBase::matrixCopy<std::complex<double>, base_device::DEVICE_GPU>()(n, LDA, device_A, LDA, device_B, LDB);
 
     std::vector<std::complex<double>> B_gpu2cpu(8);
     base_device::memory::synchronize_memory_op<std::complex<double>,
                                                base_device::DEVICE_CPU,
-                                               base_device::DEVICE_GPU>()(B_gpu2cpu.data(),
-                                                                          device_B,
-                                                                          B_gpu2cpu.size());
+                                               base_device::DEVICE_GPU>()(B_gpu2cpu.data(), device_B, B_gpu2cpu.size());
 
     std::vector<std::complex<double>> B_cpu(8);
-    ModuleBase::matrixSetToAnother<std::complex<double>, base_device::DEVICE_CPU>()(n,
-                                                                                    A.data(),
-                                                                                    LDA,
-                                                                                    B_cpu.data(),
-                                                                                    LDB);
+    ModuleBase::matrixCopy<std::complex<double>, base_device::DEVICE_CPU>()(n, LDA, A.data(), LDA, B_cpu.data(), LDB);
 
     // for (int i = 0; i < 4; i++)
     // {
diff --git a/source/module_base/para_gemm.cpp b/source/module_base/para_gemm.cpp
index 4783faa17a..abfacf5154 100644
--- a/source/module_base/para_gemm.cpp
+++ b/source/module_base/para_gemm.cpp
@@ -12,6 +12,11 @@ PGemmCN<T, Device>::PGemmCN()
 template <typename T, typename Device>
 PGemmCN<T, Device>::~PGemmCN()
 {
+#ifdef __MPI
+    delmem_dev_op()(C_local_tmp_);
+    delmem_dev_op()(A_tmp_device_);
+    delmem_dev_op()(B_tmp_device_);
+#endif
 }
 
 template <typename T, typename Device>
@@ -63,46 +68,77 @@ void PGemmCN<T, Device>::set_dimension(
         default:
             break;
     }
-    requests.resize(col_nproc);
-    if (this->divideCrow)
+
+    if (col_nproc > 1)
     {
-        colB_loc.resize(col_nproc);
-        MPI_Allgather(&ncolB, 1, MPI_INT, colB_loc.data(), 1, MPI_INT, col_world);
-        int sum = 0;
-        for (int ip = 0; ip < col_nproc; ip++)
+        requests.resize(col_nproc);
+        if (this->divideCrow)
         {
-            max_colB = std::max(max_colB, colB_loc[ip]);
-            sum += colB_loc[ip];
+            colB_loc.resize(col_nproc);
+            MPI_Allgather(&ncolB, 1, MPI_INT, colB_loc.data(), 1, MPI_INT, col_world);
+            int sum = 0;
+            for (int ip = 0; ip < col_nproc; ip++)
+            {
+                max_colB = std::max(max_colB, colB_loc[ip]);
+                sum += colB_loc[ip];
+            }
+            size_C_local = sum * LDC;
+
+            // allocate temporary memory
+            if (std::is_same<Device, base_device::DEVICE_GPU>::value)
+            {
+                resmem_dev_op()(B_tmp_device_, max_colB * LDB);
+            }
+            B_tmp_.resize(max_colB * LDB);
         }
-        size_C_local = sum * LDC;
-    }
-    else
-    {
-        colA_loc.resize(col_nproc);
-        MPI_Allgather(&ncolA, 1, MPI_INT, colA_loc.data(), 1, MPI_INT, col_world);
-        for (int ip = 0; ip < col_nproc; ip++)
+        else
         {
-            max_colA = std::max(max_colA, colA_loc[ip]);
-        }
-        size_C_local = ncolB * LDC;
-    }
+            colA_loc.resize(col_nproc);
+            MPI_Allgather(&ncolA, 1, MPI_INT, colA_loc.data(), 1, MPI_INT, col_world);
+            for (int ip = 0; ip < col_nproc; ip++)
+            {
+                max_colA = std::max(max_colA, colA_loc[ip]);
+            }
+            size_C_local = ncolB * LDC;
 
-    if (this->gatherC)
-    {
-        colB_loc.resize(col_nproc);
-        recv_counts.resize(col_nproc);
-        displs.resize(col_nproc);
-        MPI_Allgather(&ncolB, 1, MPI_INT, colB_loc.data(), 1, MPI_INT, col_world);
-        for (int ip = 0; ip < col_nproc; ip++)
-        {
-            recv_counts[ip] = LDC * colB_loc[ip];
+            // allocate temporary memory
+            if (std::is_same<Device, base_device::DEVICE_GPU>::value)
+            {
+                resmem_dev_op()(A_tmp_device_, max_colA * LDA);
+#ifndef __CUDA_MPI
+                isend_tmp_.resize(max_colA * LDA);
+#endif
+            }
+            A_tmp_.resize(max_colA * LDA);
         }
-        displs[0] = 0;
-        for (int ip = 1; ip < col_nproc; ip++)
+
+        if (this->gatherC)
         {
-            displs[ip] = displs[ip - 1] + recv_counts[ip - 1];
+            colB_loc.resize(col_nproc);
+            recv_counts.resize(col_nproc);
+            displs.resize(col_nproc);
+            MPI_Allgather(&ncolB, 1, MPI_INT, colB_loc.data(), 1, MPI_INT, col_world);
+            for (int ip = 0; ip < col_nproc; ip++)
+            {
+                recv_counts[ip] = LDC * colB_loc[ip];
+            }
+            displs[0] = 0;
+            for (int ip = 1; ip < col_nproc; ip++)
+            {
+                displs[ip] = displs[ip - 1] + recv_counts[ip - 1];
+            }
+            size_C_global = displs[col_nproc - 1] + recv_counts[col_nproc - 1];
+
+            // allocate temporary memory
+            if (std::is_same<Device, base_device::DEVICE_GPU>::value)
+            {
+                resmem_dev_op()(C_local_tmp_, size_C_local);
+#ifndef __CUDA_MPI
+                C_global_tmp_.resize(size_C_global);
+#endif
+            }
+            C_tmp_.resize(size_C_local);
         }
-        size_C_global = displs[col_nproc - 1] + recv_counts[col_nproc - 1];
     }
 #endif
 }
@@ -144,7 +180,8 @@ void PGemmCN<T, Device>::multiply_single(const T alpha, const T* A, const T* B,
 #ifdef __MPI
     if (this->row_nproc > 1)
     {
-        Parallel_Common::reduce_dev<T, Device>(C, size_C_local, row_world);
+        const int size = ncolB * LDC;
+        Parallel_Common::reduce_dev<T, Device>(C, size, row_world);
     }
 #endif
 }
@@ -155,50 +192,43 @@ void PGemmCN<T, Device>::multiply_col(const T alpha, const T* A, const T* B, con
 {
     const Device* ctx = {};
 
-    std::vector<T> B_tmp(max_colA * LDA);
-    std::vector<T> isend_tmp;
-#ifndef __CUDA_MPI
-    if (std::is_same<Device, base_device::DEVICE_GPU>::value)
-    {
-        isend_tmp.resize(max_colA * LDA);
-    }
-#endif
+    // send A to other procs
+    T* isend_tmp = isend_tmp_.data();
     for (int ip = 0; ip < col_nproc; ip++)
     {
         if (col_rank != ip)
         {
             int size = ncolA * LDA;
-            Parallel_Common::isend_dev<T, Device>(A, size, ip, 0, col_world, &requests[ip], isend_tmp.data());
+            Parallel_Common::isend_dev<T, Device>(A, size, ip, 0, col_world, &requests[ip], isend_tmp);
         }
     }
+
+    // init pointers
     T* C_local = C;
-    std::vector<T> C_tmp;
     if (this->gatherC)
     {
-        C_tmp.resize(size_C_local);
         if (std::is_same<Device, base_device::DEVICE_GPU>::value)
         {
-            C_local = nullptr;
-            resmem_dev_op()(C_local, size_C_local);
+            C_local = C_local_tmp_;
         }
         else
         {
-            C_local = C_tmp.data();
+            C_local = C_tmp_.data();
        }
         syncmem_dev_op()(C_local, C + displs[col_rank], size_C_local);
     }
-
     T* Atmp_device = nullptr;
     if (std::is_same<Device, base_device::DEVICE_GPU>::value)
     {
-        resmem_dev_op()(Atmp_device, max_colA * LDA);
+        Atmp_device = A_tmp_device_;
     }
     else
     {
-        Atmp_device = B_tmp.data();
+        Atmp_device = A_tmp_.data();
     }
+
+    // multiply
     int shift = 0;
     T real_beta = row_rank == 0 ? beta : 0;
     for (int ip = 0; ip < col_nproc; ip++)
@@ -226,7 +256,7 @@ void PGemmCN<T, Device>::multiply_col(const T alpha, const T* A, const T* B, con
             int m = colA_loc[ip];
             int size = m * LDA;
             MPI_Status status;
-            Parallel_Common::recv_dev<T, Device>(Atmp_device, size, ip, 0, col_world, &status, B_tmp.data());
+            Parallel_Common::recv_dev<T, Device>(Atmp_device, size, ip, 0, col_world, &status, A_tmp_.data());
             MPI_Wait(&requests[ip], &status);
             ModuleBase::gemm_op<T, Device>()('C',
                                              'N',
@@ -248,44 +278,35 @@ void PGemmCN<T, Device>::multiply_col(const T alpha, const T* A, const T* B, con
     if (this->gatherC)
     {
 #ifdef __CUDA_MPI
-        if (this->row_nproc > 1)
-        {
-            Parallel_Common::reduce_data(C_local, size_C_local, row_world);
-        }
-        Parallel_Common::gatherv_data(C_local, size_C_local, C, recv_counts.data(), displs.data(), col_world);
+        T* Clocal_mpi = C_local;
+        T* Cglobal_mpi = C;
 #else
-        T* Cglobal_cpu = nullptr;
-        T* Clocal_cpu = C_tmp.data();
-        std::vector<T> cpu_tmp;
-
+        T* Clocal_mpi = C_tmp_.data();
+        T* Cglobal_mpi = nullptr;
         if (std::is_same<Device, base_device::DEVICE_GPU>::value)
         {
-            delmem_dev_op()(Atmp_device);
-
-            syncmem_d2h_op()(Clocal_cpu, C_local, size_C_local);
-            delmem_dev_op()(C_local);
-
-            cpu_tmp.resize(size_C_global);
-            Cglobal_cpu = cpu_tmp.data();
+            syncmem_d2h_op()(Clocal_mpi, C_local, size_C_local);
+            Cglobal_mpi = C_global_tmp_.data();
         }
         else
         {
-            Cglobal_cpu = C;
+            Cglobal_mpi = C;
         }
+#endif
         if (this->row_nproc > 1)
         {
-            Parallel_Common::reduce_data(Clocal_cpu, size_C_local, row_world);
+            Parallel_Common::reduce_data(Clocal_mpi, size_C_local, row_world);
         }
-        Parallel_Common::gatherv_data(Clocal_cpu,
+        Parallel_Common::gatherv_data(Clocal_mpi,
                                       size_C_local,
-                                      Cglobal_cpu,
+                                      Cglobal_mpi,
                                       recv_counts.data(),
                                       displs.data(),
                                       col_world);
-
+#ifndef __CUDA_MPI
         if (std::is_same<Device, base_device::DEVICE_GPU>::value)
         {
-            syncmem_h2d_op()(C, Cglobal_cpu, size_C_global);
+            syncmem_h2d_op()(C, Cglobal_mpi, size_C_global);
         }
 #endif
     }
@@ -303,28 +324,28 @@ void PGemmCN<T, Device>::multiply_row(const T alpha, const T* A, const T* B, con
 {
     const Device* ctx = {};
 
-    std::vector<T> B_tmp(max_colB * LDB);
+    // send B to other procs
     for (int ip = 0; ip < col_nproc; ip++)
     {
         if (col_rank != ip)
         {
             int size = ncolB * LDB;
-            Parallel_Common::isend_dev<T, Device>(B, size, ip, 0, col_world, &requests[ip], B_tmp.data());
+            Parallel_Common::isend_dev<T, Device>(B, size, ip, 0, col_world, &requests[ip], B_tmp_.data());
         }
     }
 
-    std::vector<T> C_tmp;
-
+    // init pointers
     T* Btmp_device = nullptr;
     if (std::is_same<Device, base_device::DEVICE_GPU>::value)
     {
-        resmem_dev_op()(Btmp_device, max_colB * LDB);
+        Btmp_device = B_tmp_device_;
     }
     else
     {
-        Btmp_device = B_tmp.data();
+        Btmp_device = B_tmp_.data();
     }
 
+    // multiply
     int shift = 0;
     T real_beta = row_rank == 0 ? beta : 0;
     for (int ip = 0; ip < col_nproc; ip++)
@@ -352,7 +373,7 @@ void PGemmCN<T, Device>::multiply_row(const T alpha, const T* A, const T* B, con
             int m = colB_loc[ip];
             int size = m * LDB;
             MPI_Status status;
-            Parallel_Common::recv_dev<T, Device>(Btmp_device, size, ip, 0, col_world, &status, B_tmp.data());
+            Parallel_Common::recv_dev<T, Device>(Btmp_device, size, ip, 0, col_world, &status, B_tmp_.data());
             MPI_Wait(&requests[ip], &status);
             ModuleBase::gemm_op<T, Device>()('C',
                                              'N',
diff --git a/source/module_base/para_gemm.h b/source/module_base/para_gemm.h
index f00d3cd975..a0e01d2734 100644
--- a/source/module_base/para_gemm.h
+++ b/source/module_base/para_gemm.h
@@ -98,6 +98,20 @@ class PGemmCN
     using syncmem_dev_op = base_device::memory::synchronize_memory_op<T, Device, Device>;
     using syncmem_d2h_op = base_device::memory::synchronize_memory_op<T, base_device::DEVICE_CPU, Device>;
     using syncmem_h2d_op = base_device::memory::synchronize_memory_op<T, Device, base_device::DEVICE_CPU>;
+
+#ifdef __MPI
+  private:
+    std::vector<T> isend_tmp_;    ///< temporary memory for sending data
+    std::vector<T> A_tmp_;        ///< temporary memory for A
+    std::vector<T> B_tmp_;        ///< temporary memory for B
+    std::vector<T> C_tmp_;        ///< temporary memory for C
+    std::vector<T> C_global_tmp_; ///< temporary memory for the global C
+    T* C_local_tmp_ = nullptr;    ///< temporary device memory for the local C
+    T* A_tmp_device_ = nullptr;   ///< temporary device memory for A
+    T* B_tmp_device_ = nullptr;   ///< temporary device memory for B
+#endif
+
+
 };
 } // namespace ModuleBase
 #endif
\ No newline at end of file
diff --git a/source/module_hsolver/diago_iter_assist.cpp b/source/module_hsolver/diago_iter_assist.cpp
index ce1996db70..e39b8bd44b 100644
--- a/source/module_hsolver/diago_iter_assist.cpp
+++ b/source/module_hsolver/diago_iter_assist.cpp
@@ -136,7 +136,7 @@ void DiagoIterAssist<T, Device>::diagH_subspace(const hamilt::Hamilt<T, Device>*
 
     if (!in_place)
     {
-        ModuleBase::matrixSetToAnother<T, Device>()(n_band, temp, ld_temp, evc.get_pointer(), dmax);
+        ModuleBase::matrixCopy<T, Device>()(n_band, ld_temp, temp, ld_temp, evc.get_pointer(), dmax);
         delmem_complex_op()(temp);
     }
     delmem_complex_op()(hcc);
@@ -347,7 +347,7 @@ void DiagoIterAssist<T, Device>::diagH_subspace_init(hamilt::Hamilt<T, Device>*
                                     evc.get_pointer(),
                                     dmax);
 
-        // matrixSetToAnother<T, Device>()(ctx, n_band, evctemp, dmin, evc.get_pointer(), dmax);
+        // matrixCopy<T, Device>()(ctx, n_band, evctemp, dmin, evc.get_pointer(), dmax);
         // delmem_complex_op()(ctx, evctemp);
     }
 
@@ -561,7 +561,7 @@ void DiagoIterAssist<T, Device>::diag_subspace_psi(const T* hcc,
                                   &zero,
                                   temp,
                                   dmin);
 
-        ModuleBase::matrixSetToAnother<T, Device>()(n_band, temp, dmin, evc.get_pointer(), dmax);
+        ModuleBase::matrixCopy<T, Device>()(n_band, dmin, temp, dmin, evc.get_pointer(), dmax);
 
         delmem_complex_op()(temp);
     }
diff --git a/source/module_hsolver/para_linear_transform.cpp b/source/module_hsolver/para_linear_transform.cpp
index 2c531f3dc9..5a6c8def27 100644
--- a/source/module_hsolver/para_linear_transform.cpp
+++ b/source/module_hsolver/para_linear_transform.cpp
@@ -1,4 +1,5 @@
 #include "para_linear_transform.h"
+
 #include "module_base/timer.h"
 
 #include <algorithm>
@@ -6,6 +7,15 @@ namespace hsolver
 {
 template <typename T, typename Device>
+PLinearTransform<T, Device>::~PLinearTransform()
+{
+#ifdef __MPI
+    delmem_dev_op()(U_tmp_);
+    delmem_dev_op()(B_tmp_);
+    delmem_dev_op()(A_tmp_device_);
+#endif
+}
+template <typename T, typename Device>
 void PLinearTransform<T, Device>::set_dimension(const int nrowA,
                                                 const int ncolA,
                                                 const int ncolB,
@@ -46,6 +56,18 @@ void PLinearTransform<T, Device>::set_dimension(const int nrowA,
         start_colB[ip] = start_colB[ip - 1] + colB_loc[ip - 1];
     }
     this->max_colB = *std::max_element(colB_loc.begin(), colB_loc.end());
+
+    // allocate temporary memory
+    resmem_dev_op()(B_tmp_, ncolB * LDA);
+    resmem_dev_op()(U_tmp_, max_colA * max_colB);
+    if (std::is_same<Device, base_device::DEVICE_GPU>::value)
+    {
+        resmem_dev_op()(A_tmp_device_, max_colA * LDA);
+#ifndef __CUDA_MPI
+        isend_tmp_.resize(max_colA * LDA);
+#endif
+    }
+    A_tmp_.resize(max_colA * LDA);
 }
 #else
     nproc_col = 1;
@@ -56,100 +78,75 @@ template <typename T, typename Device>
 void PLinearTransform<T, Device>::act(const T alpha, const T* A, const T* U, const T beta, T* B)
 {
     ModuleBase::timer::tick("PLinearTransform", "act");
-    const Device* ctx = {};
 #ifdef __MPI
     if (nproc_col > 1)
     {
+        syncmem_dev_op()(B_tmp_, B, ncolB * LDA);
         std::vector<MPI_Request> requests(nproc_col);
-        std::vector<T> A_tmp(max_colA * LDA);
-        std::vector<T> isend_tmp;
-        T* A_tmp_device = A_tmp.data();
-        if (std::is_same<Device, base_device::DEVICE_GPU>::value)
-        {
-            A_tmp_device = nullptr;
-#ifndef __CUDA_MPI
-            isend_tmp.resize(max_colA * LDA);
-#endif
-            resmem_dev_op()(A_tmp_device, max_colA * LDA);
-        }
-        T* B_tmp = nullptr;
-        resmem_dev_op()(B_tmp, ncolB * LDA);
-        syncmem_dev_op()(B_tmp, B, ncolB * LDA);
-        setmem_dev_op()(B, 0.0, ncolB * LDA);
-
-        T* U_tmp = nullptr;
-        resmem_dev_op()(U_tmp, max_colA * max_colB);
-
         // Send
         for (int ip = 0; ip < nproc_col; ++ip)
         {
             if (rank_col != ip)
             {
                 int size = LDA * ncolA;
-                Parallel_Common::isend_dev<T, Device>(A, size, ip, 0, col_world, &requests[ip], isend_tmp.data());
+                Parallel_Common::isend_dev<T, Device>(A, size, ip, 0, col_world, &requests[ip], isend_tmp_.data());
             }
         }
 
-        // Receive
+        // local part
        const int start = this->localU ? 0 : start_colB[rank_col];
+        const T* U_part = U + start_colA[rank_col] + start * ncolA_glo;
+        ModuleBase::matrixCopy<T, Device>()(ncolB, ncolA, U_part, ncolA_glo, U_tmp_, ncolA);
+        ModuleBase::gemm_op<T, Device>()('N', 'N', nrowA, ncolB, ncolA, &alpha, A, LDA, U_tmp_, ncolA, &beta, B, LDA);
+
+        // Receive
+        T* Atmp_device = nullptr;
+        if (std::is_same<Device, base_device::DEVICE_GPU>::value)
+        {
+            Atmp_device = A_tmp_device_;
+        }
+        else
+        {
+            Atmp_device = A_tmp_.data();
+        }
         for (int ip = 0; ip < nproc_col; ++ip)
         {
-            T real_beta = ip == 0 ? beta : 0;
-            const int ncolA_ip = colA_loc[ip];
-            // get U_tmp
-
-            const int start_row = start_colA[ip];
-            for (int i = 0; i < ncolB; ++i)
+            if (ip != rank_col)
             {
-                const T* U_part = U + start_row + (i + start) * ncolA_glo;
-                syncmem_dev_op()(U_tmp + i * ncolA_ip, U_part, ncolA_ip);
-            }
+                T zero = 0.0;
+                const int ncolA_ip = colA_loc[ip];
+                const T* U_part = U + start_colA[ip] + start * ncolA_glo;
+                ModuleBase::matrixCopy<T, Device>()(ncolB, ncolA_ip, U_part, ncolA_glo, U_tmp_, ncolA_ip);
 
-            if (ip == rank_col)
-            {
-                ModuleBase::gemm_op<T, Device>()('N',
-                                                 'N',
-                                                 nrowA,
-                                                 ncolB,
-                                                 ncolA_ip,
-                                                 &alpha,
-                                                 A,
-                                                 LDA,
-                                                 U_tmp,
-                                                 ncolA_ip,
-                                                 &real_beta,
-                                                 B_tmp,
-                                                 LDA);
-            }
-            else
-            {
                 int size = LDA * ncolA_ip;
                 MPI_Status status;
-                Parallel_Common::recv_dev<T, Device>(A_tmp_device, size, ip, 0, col_world, &status, A_tmp.data());
-                MPI_Wait(&requests[ip], &status);
+                Parallel_Common::recv_dev<T, Device>(Atmp_device, size, ip, 0, col_world, &status, A_tmp_.data());
                 ModuleBase::gemm_op<T, Device>()('N',
                                                  'N',
                                                  nrowA,
                                                  ncolB,
                                                  ncolA_ip,
                                                  &alpha,
-                                                 A_tmp_device,
+                                                 Atmp_device,
                                                  LDA,
-                                                 U_tmp,
+                                                 U_tmp_,
                                                  ncolA_ip,
-                                                 &real_beta,
-                                                 B_tmp,
+                                                 &zero,
+                                                 B_tmp_,
                                                  LDA);
+                // sum all the results
+                T one = 1.0;
+                ModuleBase::axpy_op<T, Device>()(ncolB * LDA, &one, B_tmp_, 1, B, 1);
             }
-            // sum all the results
-            T one = 1.0;
-            ModuleBase::axpy_op<T, Device>()(ncolB * LDA, &one, B_tmp, 1, B, 1);
         }
-        delmem_dev_op()(U_tmp);
-        delmem_dev_op()(B_tmp);
-        if (std::is_same<Device, base_device::DEVICE_GPU>::value)
+
+        for (int ip = 0; ip < nproc_col; ++ip)
         {
-            delmem_dev_op()(A_tmp_device);
+            if (rank_col != ip)
+            {
+                MPI_Status status;
+                MPI_Wait(&requests[ip], &status);
+            }
         }
     }
     else
diff --git a/source/module_hsolver/para_linear_transform.h b/source/module_hsolver/para_linear_transform.h
index 42cb02fb47..8e5e69d203 100644
--- a/source/module_hsolver/para_linear_transform.h
+++ b/source/module_hsolver/para_linear_transform.h
@@ -4,6 +4,7 @@
 #include "module_base/module_device/device.h"
 #include "module_base/module_device/memory_op.h"
 #include "module_base/parallel_device.h"
+
 #include <vector>
 #ifdef __MPI
 #include "mpi.h"
@@ -24,6 +25,7 @@ class PLinearTransform
     using resmem_dev_op = base_device::memory::resize_memory_op<T, Device>;
     using setmem_dev_op = base_device::memory::set_memory_op<T, Device>;
     using delmem_dev_op = base_device::memory::delete_memory_op<T, Device>;
+    ~PLinearTransform();
     int nproc_col = 1;
     int rank_col = 0;
     int nrowA = 0;
@@ -71,6 +73,15 @@ class PLinearTransform
      *
      */
     void act(const T alpha, const T* A, const T* U_global, const T beta, T* B);
+
+#ifdef __MPI
+  private:
+    std::vector<T> A_tmp_;      // temporary memory for A
+    std::vector<T> isend_tmp_;  // temporary memory for isend
+    T* U_tmp_ = nullptr;        // temporary memory for U
+    T* B_tmp_ = nullptr;        // temporary memory for B
+    T* A_tmp_device_ = nullptr; // temporary device pointer
+#endif
 };
 } // namespace hsolver
 #endif
\ No newline at end of file
diff --git a/tests/integrate/102_PW_BPCG_BP/result.ref b/tests/integrate/102_PW_BPCG_BP/result.ref
index b97d5ed713..8226c47db4 100755
--- a/tests/integrate/102_PW_BPCG_BP/result.ref
+++ b/tests/integrate/102_PW_BPCG_BP/result.ref
@@ -1,7 +1,7 @@
 etotref -4869.7470520063843651
 etotperatomref -2434.8735260032
-totalforceref 5.194830
-totalstressref 37241.453346
+totalforceref 5.1952120
+totalstressref 37241.490309
 pointgroupref C_1
 spacegroupref C_1
 nksibzref 8
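
A minimal standalone sketch of the copy contract that the renamed `matrixCopy` operator implements, assuming row-major storage where `n1` is the number of rows, `n2` the number of columns copied per row, and `LDA`/`LDB` the leading dimensions of source and destination. The free function `matrix_copy` below is hypothetical; it only mirrors the patched CPU loop and is not the ABACUS implementation.

```cpp
// Hypothetical illustration of the matrixCopy(n1, n2, A, LDA, B, LDB) contract:
// copy an n1 x n2 block out of A (leading dimension LDA) into B (leading
// dimension LDB), allowing the two layouts to differ.
#include <cassert>
#include <iostream>
#include <vector>

template <typename T>
void matrix_copy(const int n1, const int n2, const T* A, const int LDA, T* B, const int LDB)
{
    assert(n2 <= LDA && n2 <= LDB); // the copied width must fit both layouts
    for (int i = 0; i < n1; i++)
    {
        for (int j = 0; j < n2; j++)
        {
            B[i * LDB + j] = A[i * LDA + j];
        }
    }
}

int main()
{
    // Repack a 2 x 3 block stored with LDA = 4 into a tight LDB = 3 layout.
    const std::vector<double> A = {1, 2, 3, 0,
                                   4, 5, 6, 0};
    std::vector<double> B(6, 0.0);
    matrix_copy(2, 3, A.data(), 4, B.data(), 3);
    for (const double v : B)
    {
        std::cout << v << ' '; // prints: 1 2 3 4 5 6
    }
    std::cout << '\n';
    return 0;
}
```

The GPU specializations in the patch launch this same loop as one thread per element: a 16 x 16 block with `gridSize = ((n1 + 15) / 16, (n2 + 15) / 16)`, the usual ceiling division so every index pair (i, j) with i < n1 and j < n2 is covered exactly once, replacing the old one-thread-per-column kernel that looped over rows serially.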