deepmodeling
diff --git a/‎source/module_base/kernels/cuda/math_kernel_op.cu‎
Lines changed: 38 additions & 42 deletions b/‎source/module_base/kernels/cuda/math_kernel_op.cu‎
Lines changed: 38 additions & 42 deletions
diff --git a/‎source/module_base/kernels/math_kernel_op.cpp‎
Lines changed: 7 additions & 7 deletions b/‎source/module_base/kernels/math_kernel_op.cpp‎
Lines changed: 7 additions & 7 deletions
diff --git a/‎source/module_base/kernels/math_kernel_op.h‎
Lines changed: 12 additions & 11 deletions b/‎source/module_base/kernels/math_kernel_op.h‎
Lines changed: 12 additions & 11 deletions
diff --git a/‎source/module_base/kernels/rocm/math_kernel_op.hip.cu‎
Lines changed: 41 additions & 42 deletions b/‎source/module_base/kernels/rocm/math_kernel_op.hip.cu‎
Lines changed: 41 additions & 42 deletions
@@ -385,22 +385,15 @@ __global__ void matrix_transpose_kernel(
     }
 }
 
-
 template <typename T>
-__global__ void matrix_setTo_another_kernel(
-        const int n,
-        const int LDA,
-        const int LDB,
-    const T* matrix_A,
-    T* matrix_B)
+__global__ void matrix_copy_kernel(const int n1, const int n2, const T* A, const int LDA, T* B, const int LDB)
 {
-    int j = blockIdx.x * blockDim.x + threadIdx.x;
-    if (j < LDA && j < LDB)
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    const int j = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (i < n1 && j < n2)
     {
-        for (int i = 0; i < n; i++)
-        {
-            matrix_B[i * LDB + j] = matrix_A[i * LDA + j];
-        }
+        B[i * LDB + j] = A[i * LDA + j];
     }
 }
 
@@ -980,40 +973,43 @@ void matrixTranspose_op<std::complex<double>, base_device::DEVICE_GPU>::operator
 }
 
 template <>
-void matrixSetToAnother<double, base_device::DEVICE_GPU>::operator()(const int& n,
-                                                                     const double* A,
-                                                                     const int& LDA,
-                                                                     double* B,
-                                                                     const int& LDB)
+void matrixCopy<double, base_device::DEVICE_GPU>::operator()(const int& n1,
+                                                             const int& n2,
+                                                             const double* A,
+                                                             const int& LDA,
+                                                             double* B,
+                                                             const int& LDB)
 {
-    int thread = 1024;
-    int block = (LDA + thread - 1) / thread;
-    matrix_setTo_another_kernel<double> <<<block, thread >>> (n, LDA, LDB, A, B);
+    const dim3 blockSize(16, 16);
+    const dim3 gridSize((n1 + blockSize.x - 1) / blockSize.x, (n2 + blockSize.y - 1) / blockSize.y);
+    matrix_copy_kernel<double> <<<gridSize, blockSize >>> (n1, n2, A, LDA, B, LDB);
     cudaCheckOnDebug();
 }
 template <>
-void matrixSetToAnother<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int& n,
-                                                                                  const std::complex<float>* A,
-                                                                                  const int& LDA,
-                                                                                  std::complex<float>* B,
-                                                                                  const int& LDB)
+void matrixCopy<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int& n1,
+                                                                          const int& n2,
+                                                                          const std::complex<float>* A,
+                                                                          const int& LDA,
+                                                                          std::complex<float>* B,
+                                                                          const int& LDB)
 {
-    int thread = 1024;
-    int block = (LDA + thread - 1) / thread;
-    matrix_setTo_another_kernel<thrust::complex<float>> <<<block, thread >>> (n, LDA, LDB, reinterpret_cast<const thrust::complex<float>*>(A), reinterpret_cast<thrust::complex<float>*>(B));
+    const dim3 blockSize(16, 16);
+    const dim3 gridSize((n1 + blockSize.x - 1) / blockSize.x, (n2 + blockSize.y - 1) / blockSize.y);
+    matrix_copy_kernel<thrust::complex<float>> <<<gridSize, blockSize >>> (n1, n2, reinterpret_cast<const thrust::complex<float>*>(A), LDA, reinterpret_cast<thrust::complex<float>*>(B), LDB);
     cudaCheckOnDebug();
+
 }
 template <>
-void matrixSetToAnother<std::complex<double>, base_device::DEVICE_GPU>::operator()(const int& n,
-                                                                                   const std::complex<double>* A,
-                                                                                   const int& LDA,
-                                                                                   std::complex<double>* B,
-                                                                                   const int& LDB)
+void matrixCopy<std::complex<double>, base_device::DEVICE_GPU>::operator()(const int& n1,
+                                                                           const int& n2,
+                                                                           const std::complex<double>* A,
+                                                                           const int& LDA,
+                                                                           std::complex<double>* B,
+                                                                           const int& LDB)
 {
-    int thread = 1024;
-    int block = (LDA + thread - 1) / thread;
-    matrix_setTo_another_kernel<thrust::complex<double>> <<<block, thread >>> (n, LDA, LDB, reinterpret_cast<const thrust::complex<double>*>(A), reinterpret_cast<thrust::complex<double>*>(B));
-
+    const dim3 blockSize(16, 16);
+    const dim3 gridSize((n1 + blockSize.x - 1) / blockSize.x, (n2 + blockSize.y - 1) / blockSize.y);
+    matrix_copy_kernel<thrust::complex<double>> <<<gridSize, blockSize >>> (n1, n2, reinterpret_cast<const thrust::complex<double>*>(A), LDA, reinterpret_cast<thrust::complex<double>*>(B), LDB);
     cudaCheckOnDebug();
 }
 
@@ -1027,23 +1023,23 @@ template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GP
 template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<float, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_GPU>;
-template struct matrixSetToAnother<std::complex<float>, base_device::DEVICE_GPU>;
+template struct matrixCopy<std::complex<float>, base_device::DEVICE_GPU>;
 
 template struct dot_real_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct calc_grad_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct line_minimize_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct vector_div_constant_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct vector_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
+template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_GPU>;
-template struct matrixSetToAnother<std::complex<double>, base_device::DEVICE_GPU>;
+template struct matrixCopy<double, base_device::DEVICE_GPU>;
+template struct matrixCopy<std::complex<double>, base_device::DEVICE_GPU>;
 
 #ifdef __LCAO
 template struct dot_real_op<double, base_device::DEVICE_GPU>;
 template struct vector_div_constant_op<double, base_device::DEVICE_GPU>;
 template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<double, base_device::DEVICE_GPU>;
-template struct matrixSetToAnother<double, base_device::DEVICE_GPU>;
-template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>;
 #endif
 }  // namespace ModuleBase
@@ -337,16 +337,16 @@ struct matrixTranspose_op<T, base_device::DEVICE_CPU>
 };
 
 template <typename T>
-struct matrixSetToAnother<T, base_device::DEVICE_CPU>
+struct matrixCopy<T, base_device::DEVICE_CPU>
 {
-    void operator()(const int& n, const T* A, const int& LDA, T* B, const int& LDB)
+    void operator()(const int& n1, const int& n2, const T* A, const int& LDA, T* B, const int& LDB)
     {
 #ifdef _OPENMP
 #pragma omp parallel for collapse(2) schedule(static, 8192 / sizeof(T))
 #endif
-        for (int i = 0; i < n; i++)
+        for (int i = 0; i < n1; i++)
         {
-            for (int j = 0; j < LDA; j++)
+            for (int j = 0; j < n2; j++)
             {
                 B[i * LDB + j] = A[i * LDA + j];
             }
@@ -367,7 +367,7 @@ template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_CP
 template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_CPU>;
 template struct constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_CPU>;
 template struct matrixTranspose_op<std::complex<float>, base_device::DEVICE_CPU>;
-template struct matrixSetToAnother<std::complex<float>, base_device::DEVICE_CPU>;
+template struct matrixCopy<std::complex<float>, base_device::DEVICE_CPU>;
 template struct calc_grad_with_block_op<std::complex<float>, base_device::DEVICE_CPU>;
 template struct line_minimize_with_block_op<std::complex<float>, base_device::DEVICE_CPU>;
 
@@ -385,7 +385,8 @@ template struct vector_mul_vector_op<std::complex<double>, base_device::DEVICE_C
 template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_CPU>;
 template struct constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_CPU>;
 template struct matrixTranspose_op<std::complex<double>, base_device::DEVICE_CPU>;
-template struct matrixSetToAnother<std::complex<double>, base_device::DEVICE_CPU>;
+template struct matrixCopy<double, base_device::DEVICE_CPU>;
+template struct matrixCopy<std::complex<double>, base_device::DEVICE_CPU>;
 template struct calc_grad_with_block_op<std::complex<double>, base_device::DEVICE_CPU>;
 template struct line_minimize_with_block_op<std::complex<double>, base_device::DEVICE_CPU>;
 
@@ -394,7 +395,6 @@ template struct vector_mul_vector_op<double, base_device::DEVICE_CPU>;
 template struct vector_div_constant_op<double, base_device::DEVICE_CPU>;
 template struct vector_div_vector_op<double, base_device::DEVICE_CPU>;
 template struct matrixTranspose_op<double, base_device::DEVICE_CPU>;
-template struct matrixSetToAnother<double, base_device::DEVICE_CPU>;
 template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_CPU>;
 #endif
 #ifdef __DSP
 
@@ -298,19 +298,19 @@ template <typename T, typename Device> struct matrixTranspose_op {
                   const T *input_matrix, T *output_matrix);
 };
 
-template <typename T, typename Device> struct matrixSetToAnother {
-  /// @brief initialize matrix B with A
+template <typename T, typename Device> struct matrixCopy {
+  /// @brief copy matrix A to B, they can have different leading dimensions
   ///
   /// Input Parameters
-  /// \param n : first dimension of matrix
+  /// \param n1 : first dimension of matrix
+  /// \param n2 : second dimension of matrix
   /// \param A : input matrix A
   /// \param LDA : leading dimension of A
   /// \param LDB : leading dimension of B
   ///
   /// Output Parameters
   /// \param B : output matrix B
-  void operator()(const int &n, const T *A, const int &LDA,
-                  T *B, const int &LDB);
+  void operator()(const int& n1, const int& n2, const T* A, const int& LDA, T* B, const int& LDB);
 };
 
 #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
@@ -370,12 +370,13 @@ struct constantvector_addORsub_constantVector_op<T, base_device::DEVICE_GPU> {
                   const Real constant2);
 };
 
-template <typename T> struct matrixSetToAnother<T, base_device::DEVICE_GPU> {
-  void operator()(const int &n,
-                  const T *A, // input
-                  const int &LDA,
-                  T *B, // output
-                  const int &LDB);
+template <typename T> struct matrixCopy<T, base_device::DEVICE_GPU> {
+    void operator()(const int& n1,
+                    const int& n2,
+                    const T* A, // input
+                    const int& LDA,
+                    T* B, // output
+                    const int& LDB);
 };
 
 void createGpuBlasHandle();
 
@@ -307,23 +307,16 @@ __global__ void matrix_transpose_kernel(
     }
 }
 
-
 template <typename T>
-__launch_bounds__(1024)
-__global__ void matrix_setTo_another_kernel(
-        const int n,
-        const int LDA,
-        const int LDB,
-    const T* matrix_A,
-    T* matrix_B)
-{
-    int j = blockIdx.x * blockDim.x + threadIdx.x;
-    if (j < LDA && j < LDB)
+__launch_bounds__(1024) __global__
+    void matrix_copy_kernel(const int n1, const int n2, const T* A, const int LDA, T* B, const int LDB)
+{
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    const int j = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (i < n1 && j < n2)
     {
-        for (int i = 0; i < n; i++)
-        {
-            matrix_B[i * LDB + j] = matrix_A[i * LDA + j];
-        }
+        B[i * LDB + j] = A[i * LDA + j];
     }
 }
 
@@ -889,39 +882,45 @@ void matrixTranspose_op<std::complex<double>, base_device::DEVICE_GPU>::operator
 }
 
 template <>
-void matrixSetToAnother<double, base_device::DEVICE_GPU>::operator()(const int& n,
-                                                                     const double* A,
-                                                                     const int& LDA,
-                                                                     double* B,
-                                                                     const int& LDB)
+void matrixCopy<double, base_device::DEVICE_GPU>::operator()(const int& n1,
+                                                             const int& n2,
+                                                             const double* A,
+                                                             const int& LDA,
+                                                             double* B,
+                                                             const int& LDB)
 {
-    int thread = 1024;
-    int block = (LDA + thread - 1) / thread;
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_setTo_another_kernel<double>), dim3(block), dim3(thread), 0, 0, n, LDA, LDB, A, B);
+    const dim3 blockSize(16, 16);
+    const dim3 gridSize((n1 + blockSize.x - 1) / blockSize.x, (n2 + blockSize.y - 1) / blockSize.y);
+
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_copy_kernel<double>), gridSize, blockSize, 0, 0, n1, n2, A, LDA, B, LDB);
     hipCheckOnDebug();
 }
 template <>
-void matrixSetToAnother<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int& n,
-                                                                                  const std::complex<float>* A,
-                                                                                  const int& LDA,
-                                                                                  std::complex<float>* B,
-                                                                                  const int& LDB)
+void matrixCopy<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int& n1,
+                                                                          const int& n2,
+                                                                          const std::complex<float>* A,
+                                                                          const int& LDA,
+                                                                          std::complex<float>* B,
+                                                                          const int& LDB)
 {
-    int thread = 1024;
-    int block = (LDA + thread - 1) / thread;
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_setTo_another_kernel<thrust::complex<float>>), dim3(block), dim3(thread), 0, 0, n, LDA, LDB, reinterpret_cast<const thrust::complex<float>*>(A), reinterpret_cast<thrust::complex<float>*>(B));
+    const dim3 blockSize(16, 16);
+    const dim3 gridSize((n1 + blockSize.x - 1) / blockSize.x, (n2 + blockSize.y - 1) / blockSize.y);
+
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_copy_kernel<thrust::complex<float>>), gridSize, blockSize, 0, 0, n1, n2, reinterpret_cast<const thrust::complex<float>*>(A), LDA, reinterpret_cast<thrust::complex<float>*>(B), LDB);
     hipCheckOnDebug();
 }
 template <>
-void matrixSetToAnother<std::complex<double>, base_device::DEVICE_GPU>::operator()(const int& n,
-                                                                                   const std::complex<double>* A,
-                                                                                   const int& LDA,
-                                                                                   std::complex<double>* B,
-                                                                                   const int& LDB)
+void matrixCopy<std::complex<double>, base_device::DEVICE_GPU>::operator()(const int& n1,
+                                                                           const int& n2,
+                                                                           const std::complex<double>* A,
+                                                                           const int& LDA,
+                                                                           std::complex<double>* B,
+                                                                           const int& LDB)
 {
-    int thread = 1024;
-    int block = (LDA + thread - 1) / thread;
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_setTo_another_kernel<thrust::complex<double>>), dim3(block), dim3(thread), 0, 0, n, LDA, LDB, reinterpret_cast<const thrust::complex<double>*>(A), reinterpret_cast<thrust::complex<double>*>(B));
+    const dim3 blockSize(16, 16);
+    const dim3 gridSize((n1 + blockSize.x - 1) / blockSize.x, (n2 + blockSize.y - 1) / blockSize.y);
+
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_copy_kernel<thrust::complex<double>>), gridSize, blockSize, 0, 0, n1, n2, reinterpret_cast<const thrust::complex<double>*>(A), LDA, reinterpret_cast<thrust::complex<double>*>(B), LDB);
     hipCheckOnDebug();
 }
 
@@ -935,7 +934,7 @@ template struct vector_div_constant_op<std::complex<float>, base_device::DEVICE_
 template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_GPU>;
-template struct matrixSetToAnother<std::complex<float>, base_device::DEVICE_GPU>;
+template struct matrixCopy<std::complex<float>, base_device::DEVICE_GPU>;
 
 template struct dot_real_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct calc_grad_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
@@ -944,14 +943,14 @@ template struct vector_div_constant_op<std::complex<double>, base_device::DEVICE
 template struct vector_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_GPU>;
-template struct matrixSetToAnother<std::complex<double>, base_device::DEVICE_GPU>;
+template struct matrixCopy<std::complex<double>, base_device::DEVICE_GPU>;
 
 #ifdef __LCAO
 template struct dot_real_op<double, base_device::DEVICE_GPU>;
 template struct vector_div_constant_op<double, base_device::DEVICE_GPU>;
 template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<double, base_device::DEVICE_GPU>;
-template struct matrixSetToAnother<double, base_device::DEVICE_GPU>;
+template struct matrixCopy<double, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>;
 #endif
 }  // namespace ModuleBase