@@ -385,22 +385,15 @@ __global__ void matrix_transpose_kernel(
385385 }
386386}
387387
388-
/**
 * @brief Copy an n1 x n2 block of elements from A into B.
 *
 * For every 0 <= i < n1 and 0 <= j < n2 the kernel performs
 *     B[i * LDB + j] = A[i * LDA + j]
 * i.e. `j` is the unit-stride index and `i` advances by the leading
 * dimension (LDA for the source, LDB for the destination).
 *
 * Launch layout: any 1D or 2D grid/block shape is accepted. The threads are
 * flattened into a single linear id (threadIdx.x varies fastest) and walk the
 * n1 * n2 elements with a grid-stride loop, so
 *   - consecutive threads of a warp touch consecutive `j` positions, which
 *     keeps global loads/stores contiguous, and
 *   - correctness does not depend on the grid exactly covering n1 x n2.
 * The z dimensions of the launch are not used for indexing.
 *
 * All offsets are computed in size_t so that i * LDA / i * LDB cannot
 * overflow a 32-bit int for large matrices.
 *
 * @param n1  number of stride-LD lines to copy
 * @param n2  number of contiguous elements copied from each line
 * @param A   source buffer (device pointer)
 * @param LDA leading dimension (element stride between lines) of A
 * @param B   destination buffer (device pointer)
 * @param LDB leading dimension (element stride between lines) of B
 */
template <typename T>
__global__ void matrix_copy_kernel(const int n1, const int n2, const T* A, const int LDA, T* B, const int LDB)
{
    // Nothing to do for an empty block; also keeps the size_t casts below well defined.
    if (n1 <= 0 || n2 <= 0)
    {
        return;
    }

    const size_t lines = static_cast<size_t>(n1);
    const size_t cols = static_cast<size_t>(n2);
    const size_t total = lines * cols;

    // Flatten the (possibly 2D) launch into one linear thread id, x fastest.
    const size_t threads_per_block = static_cast<size_t>(blockDim.x) * blockDim.y;
    const size_t block_id = static_cast<size_t>(blockIdx.y) * gridDim.x + blockIdx.x;
    const size_t local_id = static_cast<size_t>(threadIdx.y) * blockDim.x + threadIdx.x;
    const size_t first = block_id * threads_per_block + local_id;
    const size_t stride = static_cast<size_t>(gridDim.x) * gridDim.y * threads_per_block;

    for (size_t idx = first; idx < total; idx += stride)
    {
        const size_t i = idx / cols;     // which line
        const size_t j = idx - i * cols; // position inside the line (contiguous)
        B[i * static_cast<size_t>(LDB) + j] = A[i * static_cast<size_t>(LDA) + j];
    }
}
406399
@@ -980,40 +973,43 @@ void matrixTranspose_op<std::complex<double>, base_device::DEVICE_GPU>::operator
980973}
981974
/**
 * @brief GPU copy of an n1 x n2 block of doubles: B[i * LDB + j] = A[i * LDA + j].
 *
 * Launches matrix_copy_kernel<double> on the default stream with 16x16 thread
 * blocks and a 2D grid sized by ceil-division of (n1, n2).
 *
 * @param n1  extent of the index that is multiplied by the leading dimension
 * @param n2  extent of the contiguous index
 * @param A   source device pointer
 * @param LDA leading dimension of A
 * @param B   destination device pointer
 * @param LDB leading dimension of B
 */
template <>
void matrixCopy<double, base_device::DEVICE_GPU>::operator()(const int& n1,
                                                             const int& n2,
                                                             const double* A,
                                                             const int& LDA,
                                                             double* B,
                                                             const int& LDB)
{
    // An empty (or negative) extent would yield a zero-sized or wrapped-around
    // grid dimension; a zero-sized grid is an invalid launch configuration.
    // There is nothing to copy in that case, so return before launching.
    if (n1 <= 0 || n2 <= 0)
    {
        return;
    }

    const dim3 blockSize(16, 16);
    const dim3 gridSize((n1 + blockSize.x - 1) / blockSize.x, (n2 + blockSize.y - 1) / blockSize.y);
    matrix_copy_kernel<double><<<gridSize, blockSize>>>(n1, n2, A, LDA, B, LDB);
    cudaCheckOnDebug();
}
/**
 * @brief GPU copy of an n1 x n2 block of single-precision complex values:
 *        B[i * LDB + j] = A[i * LDA + j].
 *
 * The std::complex<float> pointers are reinterpreted as thrust::complex<float>
 * for use in device code, then matrix_copy_kernel is launched on the default
 * stream with 16x16 thread blocks and a 2D grid sized by ceil-division of
 * (n1, n2).
 *
 * @param n1  extent of the index that is multiplied by the leading dimension
 * @param n2  extent of the contiguous index
 * @param A   source device pointer
 * @param LDA leading dimension of A
 * @param B   destination device pointer
 * @param LDB leading dimension of B
 */
template <>
void matrixCopy<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int& n1,
                                                                          const int& n2,
                                                                          const std::complex<float>* A,
                                                                          const int& LDA,
                                                                          std::complex<float>* B,
                                                                          const int& LDB)
{
    // An empty (or negative) extent would yield a zero-sized or wrapped-around
    // grid dimension; a zero-sized grid is an invalid launch configuration.
    // There is nothing to copy in that case, so return before launching.
    if (n1 <= 0 || n2 <= 0)
    {
        return;
    }

    const dim3 blockSize(16, 16);
    const dim3 gridSize((n1 + blockSize.x - 1) / blockSize.x, (n2 + blockSize.y - 1) / blockSize.y);
    matrix_copy_kernel<thrust::complex<float>><<<gridSize, blockSize>>>(
        n1,
        n2,
        reinterpret_cast<const thrust::complex<float>*>(A),
        LDA,
        reinterpret_cast<thrust::complex<float>*>(B),
        LDB);
    cudaCheckOnDebug();
}
/**
 * @brief GPU copy of an n1 x n2 block of double-precision complex values:
 *        B[i * LDB + j] = A[i * LDA + j].
 *
 * The std::complex<double> pointers are reinterpreted as
 * thrust::complex<double> for use in device code, then matrix_copy_kernel is
 * launched on the default stream with 16x16 thread blocks and a 2D grid sized
 * by ceil-division of (n1, n2).
 *
 * @param n1  extent of the index that is multiplied by the leading dimension
 * @param n2  extent of the contiguous index
 * @param A   source device pointer
 * @param LDA leading dimension of A
 * @param B   destination device pointer
 * @param LDB leading dimension of B
 */
template <>
void matrixCopy<std::complex<double>, base_device::DEVICE_GPU>::operator()(const int& n1,
                                                                           const int& n2,
                                                                           const std::complex<double>* A,
                                                                           const int& LDA,
                                                                           std::complex<double>* B,
                                                                           const int& LDB)
{
    // An empty (or negative) extent would yield a zero-sized or wrapped-around
    // grid dimension; a zero-sized grid is an invalid launch configuration.
    // There is nothing to copy in that case, so return before launching.
    if (n1 <= 0 || n2 <= 0)
    {
        return;
    }

    const dim3 blockSize(16, 16);
    const dim3 gridSize((n1 + blockSize.x - 1) / blockSize.x, (n2 + blockSize.y - 1) / blockSize.y);
    matrix_copy_kernel<thrust::complex<double>><<<gridSize, blockSize>>>(
        n1,
        n2,
        reinterpret_cast<const thrust::complex<double>*>(A),
        LDA,
        reinterpret_cast<thrust::complex<double>*>(B),
        LDB);
    cudaCheckOnDebug();
}
10191015
@@ -1027,23 +1023,23 @@ template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GP
// ---- single precision (continuation of the float / complex<float> group) ----
template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
template struct constantvector_addORsub_constantVector_op<float, base_device::DEVICE_GPU>;
template struct constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_GPU>;
template struct matrixCopy<std::complex<float>, base_device::DEVICE_GPU>;

// ---- double precision: always instantiated ----
// complex<double> operators
template struct dot_real_op<std::complex<double>, base_device::DEVICE_GPU>;
template struct calc_grad_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
template struct line_minimize_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
template struct vector_div_constant_op<std::complex<double>, base_device::DEVICE_GPU>;
template struct vector_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
// real and complex variants that are needed regardless of __LCAO
template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>;
template struct constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_GPU>;
template struct matrixCopy<double, base_device::DEVICE_GPU>;
template struct matrixCopy<std::complex<double>, base_device::DEVICE_GPU>;

// ---- real double operators that are only instantiated when __LCAO is defined ----
#ifdef __LCAO
template struct dot_real_op<double, base_device::DEVICE_GPU>;
template struct vector_div_constant_op<double, base_device::DEVICE_GPU>;
template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
template struct vector_div_vector_op<double, base_device::DEVICE_GPU>;
#endif
10491045} // namespace ModuleBase
0 commit comments