source/module_base/kernels/cuda/math_kernel_op.cu (62 changes: 29 additions & 33 deletions)
@@ -385,22 +385,15 @@ __global__ void matrix_transpose_kernel(
 }
 }
 
-
 template <typename T>
-__global__ void matrix_setTo_another_kernel(
-    const int n,
-    const int LDA,
-    const int LDB,
-    const T* matrix_A,
-    T* matrix_B)
+__global__ void matrix_copy_kernel(const int n1, const int n2, const T* A, const int LDA, T* B, const int LDB)
 {
-    int j = blockIdx.x * blockDim.x + threadIdx.x;
-    if (j < LDA && j < LDB)
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    const int j = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (i < n1 && j < n2)
     {
-        for (int i = 0; i < n; i++)
-        {
-            matrix_B[i * LDB + j] = matrix_A[i * LDA + j];
-        }
+        B[i * LDB + j] = A[i * LDA + j];
     }
 }
 
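The rewritten kernel assigns one thread per element of the n1 x n2 block: blockIdx.x/threadIdx.x index rows and blockIdx.y/threadIdx.y index columns, so no thread loops over rows any more. A minimal standalone sketch of the same indexing and launch geometry, with made-up sizes and a stand-in kernel name (illustrative only, not code from this PR):

```cuda
#include <cstdio>
#include <cuda_runtime.h>

// Illustrative stand-in for the new 2D copy kernel: row i in [0, n1), column j in [0, n2),
// source leading dimension LDA, destination leading dimension LDB.
__global__ void copy_2d(const int n1, const int n2, const double* A, const int LDA, double* B, const int LDB)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    const int j = blockIdx.y * blockDim.y + threadIdx.y;
    if (i < n1 && j < n2)
    {
        B[i * LDB + j] = A[i * LDA + j];
    }
}

int main()
{
    const int n1 = 3, n2 = 5, LDA = 8, LDB = 5;   // copy a 3x5 block out of a padded 3x8 buffer
    double hA[n1 * LDA], hB[n1 * LDB];
    for (int k = 0; k < n1 * LDA; ++k) { hA[k] = k; }

    double *dA = nullptr, *dB = nullptr;
    cudaMalloc(&dA, sizeof(hA));
    cudaMalloc(&dB, sizeof(hB));
    cudaMemcpy(dA, hA, sizeof(hA), cudaMemcpyHostToDevice);

    // Same launch geometry as in the PR: 16x16 threads, grid rounded up so every (i, j) is covered.
    const dim3 blockSize(16, 16);
    const dim3 gridSize((n1 + blockSize.x - 1) / blockSize.x, (n2 + blockSize.y - 1) / blockSize.y);
    copy_2d<<<gridSize, blockSize>>>(n1, n2, dA, LDA, dB, LDB);
    cudaMemcpy(hB, dB, sizeof(hB), cudaMemcpyDeviceToHost);

    printf("B[2][4] = %g (expected %g)\n", hB[2 * LDB + 4], hA[2 * LDA + 4]);
    cudaFree(dA);
    cudaFree(dB);
    return 0;
}
```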
@@ -1012,43 +1005,46 @@ void matrixTranspose_op<std::complex<double>, base_device::DEVICE_GPU>::operator
 }
 
 template <>
-void matrixSetToAnother<double, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* d,
-    const int& n,
+void matrixCopy<double, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* d,
+    const int& n1,
+    const int& n2,
     const double* A,
     const int& LDA,
     double* B,
     const int& LDB)
 {
-    int thread = 1024;
-    int block = (LDA + thread - 1) / thread;
-    matrix_setTo_another_kernel<double> <<<block, thread >>> (n, LDA, LDB, A, B);
+    const dim3 blockSize(16, 16);
+    const dim3 gridSize((n1 + blockSize.x - 1) / blockSize.x, (n2 + blockSize.y - 1) / blockSize.y);
+    matrix_copy_kernel<double> <<<gridSize, blockSize >>> (n1, n2, A, LDA, B, LDB);
     cudaCheckOnDebug();
 }
 template <>
-void matrixSetToAnother<std::complex<float>, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* d,
-    const int& n,
+void matrixCopy<std::complex<float>, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* d,
+    const int& n1,
+    const int& n2,
     const std::complex<float>* A,
     const int& LDA,
     std::complex<float>* B,
     const int& LDB)
 {
-    int thread = 1024;
-    int block = (LDA + thread - 1) / thread;
-    matrix_setTo_another_kernel<thrust::complex<float>> <<<block, thread >>> (n, LDA, LDB, reinterpret_cast<const thrust::complex<float>*>(A), reinterpret_cast<thrust::complex<float>*>(B));
+    const dim3 blockSize(16, 16);
+    const dim3 gridSize((n1 + blockSize.x - 1) / blockSize.x, (n2 + blockSize.y - 1) / blockSize.y);
+    matrix_copy_kernel<thrust::complex<float>> <<<gridSize, blockSize >>> (n1, n2, reinterpret_cast<const thrust::complex<float>*>(A), LDA, reinterpret_cast<thrust::complex<float>*>(B), LDB);
     cudaCheckOnDebug();
 
 }
 template <>
-void matrixSetToAnother<std::complex<double>, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* d,
-    const int& n,
+void matrixCopy<std::complex<double>, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* d,
+    const int& n1,
+    const int& n2,
     const std::complex<double>* A,
     const int& LDA,
     std::complex<double>* B,
     const int& LDB)
 {
-    int thread = 1024;
-    int block = (LDA + thread - 1) / thread;
-    matrix_setTo_another_kernel<thrust::complex<double>> <<<block, thread >>> (n, LDA, LDB, reinterpret_cast<const thrust::complex<double>*>(A), reinterpret_cast<thrust::complex<double>*>(B));
+
+    const dim3 blockSize(16, 16);
+    const dim3 gridSize((n1 + blockSize.x - 1) / blockSize.x, (n2 + blockSize.y - 1) / blockSize.y);
+    matrix_copy_kernel<thrust::complex<double>> <<<gridSize, blockSize >>> (n1, n2, reinterpret_cast<const thrust::complex<double>*>(A), LDA, reinterpret_cast<thrust::complex<double>*>(B), LDB);
     cudaCheckOnDebug();
 }
 
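Compared with the old launch, which spawned one thread per column j < min(LDA, LDB) and had each thread copy all n rows (so padding columns up to the leading dimension were copied as well), the new launch touches exactly the logical n1 x n2 block and leaves any padding in B untouched. The grid is sized by ceil division so the fixed 16 x 16 blocks cover the whole range even when n1 or n2 is not a multiple of 16; a small host-side check of that arithmetic (the sample sizes are arbitrary):

```cpp
#include <cassert>
#include <cstdio>

int main()
{
    // blockSize above is fixed at 16 x 16; gridSize rounds up with (n + 15) / 16.
    const int bx = 16, by = 16;
    const int samples[] = {1, 15, 16, 17, 255, 256, 257};
    for (int n1 : samples)
    {
        for (int n2 : samples)
        {
            const int gx = (n1 + bx - 1) / bx;                 // gridSize.x
            const int gy = (n2 + by - 1) / by;                 // gridSize.y
            assert(gx * bx >= n1 && gy * by >= n2);            // every (i, j) gets a thread
            assert((gx - 1) * bx < n1 && (gy - 1) * by < n2);  // no fully idle trailing block
        }
    }
    std::printf("all sample sizes are covered\n");
    return 0;
}
```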
@@ -1062,23 +1058,23 @@ template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GP
 template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<float, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_GPU>;
-template struct matrixSetToAnother<std::complex<float>, base_device::DEVICE_GPU>;
+template struct matrixCopy<std::complex<float>, base_device::DEVICE_GPU>;
 
 template struct dot_real_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct calc_grad_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct line_minimize_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct vector_div_constant_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct vector_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_GPU>;
-template struct matrixSetToAnother<std::complex<double>, base_device::DEVICE_GPU>;
+template struct matrixCopy<double, base_device::DEVICE_GPU>;
+template struct matrixCopy<std::complex<double>, base_device::DEVICE_GPU>;
 
 #ifdef __LCAO
 template struct dot_real_op<double, base_device::DEVICE_GPU>;
 template struct vector_div_constant_op<double, base_device::DEVICE_GPU>;
 template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<double, base_device::DEVICE_GPU>;
-template struct matrixSetToAnother<double, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>;
 #endif
 } // namespace ModuleBase
source/module_base/kernels/math_kernel_op.cpp (20 changes: 13 additions & 7 deletions)
@@ -346,16 +346,22 @@ struct matrixTranspose_op<T, base_device::DEVICE_CPU>
 };
 
 template <typename T>
-struct matrixSetToAnother<T, base_device::DEVICE_CPU>
+struct matrixCopy<T, base_device::DEVICE_CPU>
 {
-    void operator()(const base_device::DEVICE_CPU* d, const int& n, const T* A, const int& LDA, T* B, const int& LDB)
+    void operator()(const base_device::DEVICE_CPU* d,
+                    const int& n1,
+                    const int& n2,
+                    const T* A,
+                    const int& LDA,
+                    T* B,
+                    const int& LDB)
     {
 #ifdef _OPENMP
 #pragma omp parallel for collapse(2) schedule(static, 8192 / sizeof(T))
 #endif
-        for (int i = 0; i < n; i++)
+        for (int i = 0; i < n1; i++)
         {
-            for (int j = 0; j < LDA; j++)
+            for (int j = 0; j < n2; j++)
             {
                 B[i * LDB + j] = A[i * LDA + j];
             }
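The CPU specialization keeps the same (n1, n2, LDA, LDB) convention: row i of the logical block starts at offset i * LDA in A and i * LDB in B, and the two loops are collapsed for OpenMP. A self-contained sketch of that loop structure with made-up sizes (a standalone copy of the loop, not the ABACUS functor itself):

```cpp
#include <cstdio>
#include <vector>

// Row-major strided copy mirroring the CPU specialization above: copy the n1 x n2 block of A
// (leading dimension LDA) into B (leading dimension LDB). Sizes in main() are illustrative.
template <typename T>
void copy_block(const int n1, const int n2, const T* A, const int LDA, T* B, const int LDB)
{
#ifdef _OPENMP
#pragma omp parallel for collapse(2) schedule(static, 8192 / sizeof(T))
#endif
    for (int i = 0; i < n1; i++)
    {
        for (int j = 0; j < n2; j++)
        {
            B[i * LDB + j] = A[i * LDA + j];
        }
    }
}

int main()
{
    const int n1 = 3, n2 = 5, LDA = 8, LDB = 6;
    std::vector<double> A(n1 * LDA), B(n1 * LDB, -1.0);
    for (int k = 0; k < n1 * LDA; ++k) { A[k] = k; }

    copy_block(n1, n2, A.data(), LDA, B.data(), LDB);

    // Columns j >= n2 of B keep their old value: only the logical n1 x n2 block is overwritten.
    std::printf("B[2][4] = %g, B[2][5] = %g\n", B[2 * LDB + 4], B[2 * LDB + 5]);
    return 0;
}
```

Running this prints B[2][4] = 20 and B[2][5] = -1, i.e. the padding column outside the copied block is left alone.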
@@ -376,7 +382,7 @@ template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_CP
 template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_CPU>;
 template struct constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_CPU>;
 template struct matrixTranspose_op<std::complex<float>, base_device::DEVICE_CPU>;
-template struct matrixSetToAnother<std::complex<float>, base_device::DEVICE_CPU>;
+template struct matrixCopy<std::complex<float>, base_device::DEVICE_CPU>;
 template struct calc_grad_with_block_op<std::complex<float>, base_device::DEVICE_CPU>;
 template struct line_minimize_with_block_op<std::complex<float>, base_device::DEVICE_CPU>;
 
@@ -394,7 +400,8 @@ template struct vector_mul_vector_op<std::complex<double>, base_device::DEVICE_C
 template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_CPU>;
 template struct constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_CPU>;
 template struct matrixTranspose_op<std::complex<double>, base_device::DEVICE_CPU>;
-template struct matrixSetToAnother<std::complex<double>, base_device::DEVICE_CPU>;
+template struct matrixCopy<double, base_device::DEVICE_CPU>;
+template struct matrixCopy<std::complex<double>, base_device::DEVICE_CPU>;
 template struct calc_grad_with_block_op<std::complex<double>, base_device::DEVICE_CPU>;
 template struct line_minimize_with_block_op<std::complex<double>, base_device::DEVICE_CPU>;
 
@@ -403,7 +410,6 @@ template struct vector_mul_vector_op<double, base_device::DEVICE_CPU>;
 template struct vector_div_constant_op<double, base_device::DEVICE_CPU>;
 template struct vector_div_vector_op<double, base_device::DEVICE_CPU>;
 template struct matrixTranspose_op<double, base_device::DEVICE_CPU>;
-template struct matrixSetToAnother<double, base_device::DEVICE_CPU>;
 template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_CPU>;
 #endif
 #ifdef __DSP
source/module_base/kernels/math_kernel_op.h (24 changes: 13 additions & 11 deletions)
@@ -309,20 +309,20 @@ template <typename T, typename Device> struct matrixTranspose_op {
                   const T *input_matrix, T *output_matrix);
 };
 
-template <typename T, typename Device> struct matrixSetToAnother {
-  /// @brief initialize matrix B with A
+template <typename T, typename Device> struct matrixCopy {
+  /// @brief copy matrix A to B, they can have different leading dimensions
   ///
   /// Input Parameters
   /// \param d : the type of computing device
-  /// \param n : first dimension of matrix
+  /// \param n1 : first dimension of matrix
+  /// \param n2 : second dimension of matrix
   /// \param A : input matrix A
   /// \param LDA : leading dimension of A
   /// \param LDB : leading dimension of B
   ///
   /// Output Parameters
   /// \param B : output matrix B
-  void operator()(const Device *d, const int &n, const T *A, const int &LDA,
-                  T *B, const int &LDB);
+  void operator()(const Device* d, const int& n1, const int& n2, const T* A, const int& LDA, T* B, const int& LDB);
 };
 
 #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
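For callers migrating from matrixSetToAnother, the documented argument order is (d, n1, n2, A, LDA, B, LDB) rather than the old (d, n, A, LDA, B, LDB). A hedged sketch of a call site; the buffer names, extents, and surrounding setup are hypothetical and not taken from this PR:

```cpp
// Hypothetical call site: copy an nbands x nbasis block from psi_in (leading dimension ld_in)
// into psi_out (leading dimension ld_out). Assumes math_kernel_op.h is included and the buffers
// are already allocated; the device pointer is only a dispatch tag and is not dereferenced by
// the CPU specialization shown in math_kernel_op.cpp.
const base_device::DEVICE_CPU* cpu_ctx = nullptr;
ModuleBase::matrixCopy<std::complex<double>, base_device::DEVICE_CPU>()(cpu_ctx,
                                                                        nbands,   // n1
                                                                        nbasis,   // n2
                                                                        psi_in,   // A
                                                                        ld_in,    // LDA
                                                                        psi_out,  // B
                                                                        ld_out);  // LDB
```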
@@ -382,12 +382,14 @@ struct constantvector_addORsub_constantVector_op<T, base_device::DEVICE_GPU> {
                   const Real constant2);
 };
 
-template <typename T> struct matrixSetToAnother<T, base_device::DEVICE_GPU> {
-  void operator()(const base_device::DEVICE_GPU *d, const int &n,
-                  const T *A, // input
-                  const int &LDA,
-                  T *B, // output
-                  const int &LDB);
+template <typename T> struct matrixCopy<T, base_device::DEVICE_GPU> {
+  void operator()(const base_device::DEVICE_GPU* d,
+                  const int& n1,
+                  const int& n2,
+                  const T* A, // input
+                  const int& LDA,
+                  T* B, // output
+                  const int& LDB);
 };
 
 void createGpuBlasHandle();
source/module_base/kernels/rocm/math_kernel_op.hip.cu (65 changes: 32 additions & 33 deletions)
@@ -307,23 +307,16 @@ __global__ void matrix_transpose_kernel(
 }
 }
 
-
 template <typename T>
-__launch_bounds__(1024)
-__global__ void matrix_setTo_another_kernel(
-    const int n,
-    const int LDA,
-    const int LDB,
-    const T* matrix_A,
-    T* matrix_B)
-{
-    int j = blockIdx.x * blockDim.x + threadIdx.x;
-    if (j < LDA && j < LDB)
+__launch_bounds__(1024) __global__
+    void matrix_copy_kernel(const int n1, const int n2, const T* A, const int LDA, T* B, const int LDB)
+{
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    const int j = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (i < n1 && j < n2)
     {
-        for (int i = 0; i < n; i++)
-        {
-            matrix_B[i * LDB + j] = matrix_A[i * LDA + j];
-        }
+        B[i * LDB + j] = A[i * LDA + j];
     }
 }
 
@@ -921,42 +914,48 @@ void matrixTranspose_op<std::complex<double>, base_device::DEVICE_GPU>::operator
 }
 
 template <>
-void matrixSetToAnother<double, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* d,
-    const int& n,
+void matrixCopy<double, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* d,
+    const int& n1,
+    const int& n2,
     const double* A,
     const int& LDA,
     double* B,
     const int& LDB)
 {
-    int thread = 1024;
-    int block = (LDA + thread - 1) / thread;
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_setTo_another_kernel<double>), dim3(block), dim3(thread), 0, 0, n, LDA, LDB, A, B);
+    const dim3 blockSize(16, 16);
+    const dim3 gridSize((n1 + blockSize.x - 1) / blockSize.x, (n2 + blockSize.y - 1) / blockSize.y);
+
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_copy_kernel<double>), gridSize, blockSize, 0, 0, n1, n2, A, LDA, B, LDB);
     hipCheckOnDebug();
 }
 template <>
-void matrixSetToAnother<std::complex<float>, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* d,
-    const int& n,
+void matrixCopy<std::complex<float>, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* d,
+    const int& n1,
+    const int& n2,
     const std::complex<float>* A,
     const int& LDA,
     std::complex<float>* B,
     const int& LDB)
 {
-    int thread = 1024;
-    int block = (LDA + thread - 1) / thread;
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_setTo_another_kernel<thrust::complex<float>>), dim3(block), dim3(thread), 0, 0, n, LDA, LDB, reinterpret_cast<const thrust::complex<float>*>(A), reinterpret_cast<thrust::complex<float>*>(B));
+    const dim3 blockSize(16, 16);
+    const dim3 gridSize((n1 + blockSize.x - 1) / blockSize.x, (n2 + blockSize.y - 1) / blockSize.y);
+
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_copy_kernel<thrust::complex<float>>), gridSize, blockSize, 0, 0, n1, n2, reinterpret_cast<const thrust::complex<float>*>(A), LDA, reinterpret_cast<thrust::complex<float>*>(B), LDB);
     hipCheckOnDebug();
 }
 template <>
-void matrixSetToAnother<std::complex<double>, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* d,
-    const int& n,
+void matrixCopy<std::complex<double>, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* d,
+    const int& n1,
+    const int& n2,
     const std::complex<double>* A,
     const int& LDA,
     std::complex<double>* B,
     const int& LDB)
 {
-    int thread = 1024;
-    int block = (LDA + thread - 1) / thread;
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_setTo_another_kernel<thrust::complex<double>>), dim3(block), dim3(thread), 0, 0, n, LDA, LDB, reinterpret_cast<const thrust::complex<double>*>(A), reinterpret_cast<thrust::complex<double>*>(B));
+    const dim3 blockSize(16, 16);
+    const dim3 gridSize((n1 + blockSize.x - 1) / blockSize.x, (n2 + blockSize.y - 1) / blockSize.y);
+
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_copy_kernel<thrust::complex<double>>), gridSize, blockSize, 0, 0, n1, n2, reinterpret_cast<const thrust::complex<double>*>(A), LDA, reinterpret_cast<thrust::complex<double>*>(B), LDB);
     hipCheckOnDebug();
 }
 
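The ROCm port uses the same kernel body and the same 16 x 16 launch geometry; the mechanical difference from the CUDA file is that the launch goes through hipLaunchKernelGGL instead of the triple-chevron syntax, and the kernel keeps __launch_bounds__(1024), which is still a valid upper bound for the 256-thread blocks. A side-by-side of the two launch spellings for the double case, with placeholder device pointers (adapted from the hunks above, not compilable on its own):

```cuda
// CUDA (math_kernel_op.cu): triple-chevron launch.
matrix_copy_kernel<double><<<gridSize, blockSize>>>(n1, n2, d_A, LDA, d_B, LDB);

// ROCm (math_kernel_op.hip.cu): the same launch through the HIP macro; the two extra
// arguments are the dynamic shared-memory size (0) and the stream (0 = default stream).
hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_copy_kernel<double>),
                   gridSize, blockSize, 0, 0,
                   n1, n2, d_A, LDA, d_B, LDB);
```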
@@ -970,7 +969,7 @@ template struct vector_div_constant_op<std::complex<float>, base_device::DEVICE_
 template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_GPU>;
-template struct matrixSetToAnother<std::complex<float>, base_device::DEVICE_GPU>;
+template struct matrixCopy<std::complex<float>, base_device::DEVICE_GPU>;
 
 template struct dot_real_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct calc_grad_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
@@ -979,14 +978,14 @@ template struct vector_div_constant_op<std::complex<double>, base_device::DEVICE
 template struct vector_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_GPU>;
-template struct matrixSetToAnother<std::complex<double>, base_device::DEVICE_GPU>;
+template struct matrixCopy<std::complex<double>, base_device::DEVICE_GPU>;
 
 #ifdef __LCAO
 template struct dot_real_op<double, base_device::DEVICE_GPU>;
 template struct vector_div_constant_op<double, base_device::DEVICE_GPU>;
 template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<double, base_device::DEVICE_GPU>;
-template struct matrixSetToAnother<double, base_device::DEVICE_GPU>;
+template struct matrixCopy<double, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>;
 #endif
 } // namespace ModuleBase