@@ -307,23 +307,16 @@ __global__ void matrix_transpose_kernel(
307307 }
308308}
309309
310-
311310template <typename T>
312- __launch_bounds__ (1024 )
313- __global__ void matrix_setTo_another_kernel (
314- const int n,
315- const int LDA,
316- const int LDB,
317- const T* matrix_A,
318- T* matrix_B)
319- {
320- int j = blockIdx .x * blockDim .x + threadIdx .x ;
321- if (j < LDA && j < LDB)
311+ __launch_bounds__ (1024 ) __global__
312+ void matrix_copy_kernel (const int n1, const int n2, const T* A, const int LDA, T* B, const int LDB)
313+ {
314+ const int i = blockIdx .x * blockDim .x + threadIdx .x ;
315+ const int j = blockIdx .y * blockDim .y + threadIdx .y ;
316+
317+ if (i < n1 && j < n2)
322318 {
323- for (int i = 0 ; i < n; i++)
324- {
325- matrix_B[i * LDB + j] = matrix_A[i * LDA + j];
326- }
319+ B[i * LDB + j] = A[i * LDA + j];
327320 }
328321}
329322
@@ -889,39 +882,45 @@ void matrixTranspose_op<std::complex<double>, base_device::DEVICE_GPU>::operator
889882}
890883
891884template <>
892- void matrixSetToAnother<double , base_device::DEVICE_GPU>::operator ()(const int & n,
893- const double * A,
894- const int & LDA,
895- double * B,
896- const int & LDB)
885+ void matrixCopy<double , base_device::DEVICE_GPU>::operator ()(const int & n1,
886+ const int & n2,
887+ const double * A,
888+ const int & LDA,
889+ double * B,
890+ const int & LDB)
897891{
898- int thread = 1024 ;
899- int block = (LDA + thread - 1 ) / thread;
900- hipLaunchKernelGGL (HIP_KERNEL_NAME (matrix_setTo_another_kernel<double >), dim3 (block), dim3 (thread), 0 , 0 , n, LDA, LDB, A, B);
892+ const dim3 blockSize (16 , 16 );
893+ const dim3 gridSize ((n1 + blockSize.x - 1 ) / blockSize.x , (n2 + blockSize.y - 1 ) / blockSize.y );
894+
895+ hipLaunchKernelGGL (HIP_KERNEL_NAME (matrix_copy_kernel<double >), gridSize, blockSize, 0 , 0 , n1, n2, A, LDA, B, LDB);
901896 hipCheckOnDebug ();
902897}
903898template <>
904- void matrixSetToAnother<std::complex <float >, base_device::DEVICE_GPU>::operator ()(const int & n,
905- const std::complex <float >* A,
906- const int & LDA,
907- std::complex <float >* B,
908- const int & LDB)
899+ void matrixCopy<std::complex <float >, base_device::DEVICE_GPU>::operator ()(const int & n1,
900+ const int & n2,
901+ const std::complex <float >* A,
902+ const int & LDA,
903+ std::complex <float >* B,
904+ const int & LDB)
909905{
910- int thread = 1024 ;
911- int block = (LDA + thread - 1 ) / thread;
912- hipLaunchKernelGGL (HIP_KERNEL_NAME (matrix_setTo_another_kernel<thrust::complex <float >>), dim3 (block), dim3 (thread), 0 , 0 , n, LDA, LDB, reinterpret_cast <const thrust::complex <float >*>(A), reinterpret_cast <thrust::complex <float >*>(B));
906+ const dim3 blockSize (16 , 16 );
907+ const dim3 gridSize ((n1 + blockSize.x - 1 ) / blockSize.x , (n2 + blockSize.y - 1 ) / blockSize.y );
908+
909+ hipLaunchKernelGGL (HIP_KERNEL_NAME (matrix_copy_kernel<thrust::complex <float >>), gridSize, blockSize, 0 , 0 , n1, n2, reinterpret_cast <const thrust::complex <float >*>(A), LDA, reinterpret_cast <thrust::complex <float >*>(B), LDB);
913910 hipCheckOnDebug ();
914911}
915912template <>
916- void matrixSetToAnother<std::complex <double >, base_device::DEVICE_GPU>::operator ()(const int & n,
917- const std::complex <double >* A,
918- const int & LDA,
919- std::complex <double >* B,
920- const int & LDB)
913+ void matrixCopy<std::complex <double >, base_device::DEVICE_GPU>::operator ()(const int & n1,
914+ const int & n2,
915+ const std::complex <double >* A,
916+ const int & LDA,
917+ std::complex <double >* B,
918+ const int & LDB)
921919{
922- int thread = 1024 ;
923- int block = (LDA + thread - 1 ) / thread;
924- hipLaunchKernelGGL (HIP_KERNEL_NAME (matrix_setTo_another_kernel<thrust::complex <double >>), dim3 (block), dim3 (thread), 0 , 0 , n, LDA, LDB, reinterpret_cast <const thrust::complex <double >*>(A), reinterpret_cast <thrust::complex <double >*>(B));
920+ const dim3 blockSize (16 , 16 );
921+ const dim3 gridSize ((n1 + blockSize.x - 1 ) / blockSize.x , (n2 + blockSize.y - 1 ) / blockSize.y );
922+
923+ hipLaunchKernelGGL (HIP_KERNEL_NAME (matrix_copy_kernel<thrust::complex <double >>), gridSize, blockSize, 0 , 0 , n1, n2, reinterpret_cast <const thrust::complex <double >*>(A), LDA, reinterpret_cast <thrust::complex <double >*>(B), LDB);
925924 hipCheckOnDebug ();
926925}
927926
@@ -935,7 +934,7 @@ template struct vector_div_constant_op<std::complex<float>, base_device::DEVICE_
935934template struct vector_mul_vector_op <std::complex <float >, base_device::DEVICE_GPU>;
936935template struct vector_div_vector_op <std::complex <float >, base_device::DEVICE_GPU>;
937936template struct constantvector_addORsub_constantVector_op <std::complex <float >, base_device::DEVICE_GPU>;
938- template struct matrixSetToAnother <std::complex <float >, base_device::DEVICE_GPU>;
937+ template struct matrixCopy <std::complex <float >, base_device::DEVICE_GPU>;
939938
940939template struct dot_real_op <std::complex <double >, base_device::DEVICE_GPU>;
941940template struct calc_grad_with_block_op <std::complex <double >, base_device::DEVICE_GPU>;
@@ -944,14 +943,14 @@ template struct vector_div_constant_op<std::complex<double>, base_device::DEVICE
944943template struct vector_mul_vector_op <std::complex <double >, base_device::DEVICE_GPU>;
945944template struct vector_div_vector_op <std::complex <double >, base_device::DEVICE_GPU>;
946945template struct constantvector_addORsub_constantVector_op <std::complex <double >, base_device::DEVICE_GPU>;
947- template struct matrixSetToAnother <std::complex <double >, base_device::DEVICE_GPU>;
946+ template struct matrixCopy <std::complex <double >, base_device::DEVICE_GPU>;
948947
949948#ifdef __LCAO
950949template struct dot_real_op <double , base_device::DEVICE_GPU>;
951950template struct vector_div_constant_op <double , base_device::DEVICE_GPU>;
952951template struct vector_mul_vector_op <double , base_device::DEVICE_GPU>;
953952template struct vector_div_vector_op <double , base_device::DEVICE_GPU>;
954- template struct matrixSetToAnother <double , base_device::DEVICE_GPU>;
953+ template struct matrixCopy <double , base_device::DEVICE_GPU>;
955954template struct constantvector_addORsub_constantVector_op <double , base_device::DEVICE_GPU>;
956955#endif
957956} // namespace ModuleBase
0 commit comments