Skip to content

Commit 3b59b29

Browse files
authored
Performance: Optimize para_gemm and para_linear_transform (#5967)
* change to copy matrix * optimize PLinearTransform::act * fix: CUDA compiling error without LCAO * optimize allocate for GPU * fix compile * update results
1 parent f1508aa commit 3b59b29

File tree

11 files changed

+292
-263
lines changed

11 files changed

+292
-263
lines changed

source/module_base/kernels/cuda/math_kernel_op.cu

Lines changed: 38 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -385,22 +385,15 @@ __global__ void matrix_transpose_kernel(
385385
}
386386
}
387387

388-
389388
template <typename T>
390-
__global__ void matrix_setTo_another_kernel(
391-
const int n,
392-
const int LDA,
393-
const int LDB,
394-
const T* matrix_A,
395-
T* matrix_B)
389+
__global__ void matrix_copy_kernel(const int n1, const int n2, const T* A, const int LDA, T* B, const int LDB)
396390
{
397-
int j = blockIdx.x * blockDim.x + threadIdx.x;
398-
if (j < LDA && j < LDB)
391+
const int i = blockIdx.x * blockDim.x + threadIdx.x;
392+
const int j = blockIdx.y * blockDim.y + threadIdx.y;
393+
394+
if (i < n1 && j < n2)
399395
{
400-
for (int i = 0; i < n; i++)
401-
{
402-
matrix_B[i * LDB + j] = matrix_A[i * LDA + j];
403-
}
396+
B[i * LDB + j] = A[i * LDA + j];
404397
}
405398
}
406399

@@ -980,40 +973,43 @@ void matrixTranspose_op<std::complex<double>, base_device::DEVICE_GPU>::operator
980973
}
981974

982975
template <>
983-
void matrixSetToAnother<double, base_device::DEVICE_GPU>::operator()(const int& n,
984-
const double* A,
985-
const int& LDA,
986-
double* B,
987-
const int& LDB)
976+
void matrixCopy<double, base_device::DEVICE_GPU>::operator()(const int& n1,
977+
const int& n2,
978+
const double* A,
979+
const int& LDA,
980+
double* B,
981+
const int& LDB)
988982
{
989-
int thread = 1024;
990-
int block = (LDA + thread - 1) / thread;
991-
matrix_setTo_another_kernel<double> <<<block, thread >>> (n, LDA, LDB, A, B);
983+
const dim3 blockSize(16, 16);
984+
const dim3 gridSize((n1 + blockSize.x - 1) / blockSize.x, (n2 + blockSize.y - 1) / blockSize.y);
985+
matrix_copy_kernel<double> <<<gridSize, blockSize >>> (n1, n2, A, LDA, B, LDB);
992986
cudaCheckOnDebug();
993987
}
994988
template <>
995-
void matrixSetToAnother<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int& n,
996-
const std::complex<float>* A,
997-
const int& LDA,
998-
std::complex<float>* B,
999-
const int& LDB)
989+
void matrixCopy<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int& n1,
990+
const int& n2,
991+
const std::complex<float>* A,
992+
const int& LDA,
993+
std::complex<float>* B,
994+
const int& LDB)
1000995
{
1001-
int thread = 1024;
1002-
int block = (LDA + thread - 1) / thread;
1003-
matrix_setTo_another_kernel<thrust::complex<float>> <<<block, thread >>> (n, LDA, LDB, reinterpret_cast<const thrust::complex<float>*>(A), reinterpret_cast<thrust::complex<float>*>(B));
996+
const dim3 blockSize(16, 16);
997+
const dim3 gridSize((n1 + blockSize.x - 1) / blockSize.x, (n2 + blockSize.y - 1) / blockSize.y);
998+
matrix_copy_kernel<thrust::complex<float>> <<<gridSize, blockSize >>> (n1, n2, reinterpret_cast<const thrust::complex<float>*>(A), LDA, reinterpret_cast<thrust::complex<float>*>(B), LDB);
1004999
cudaCheckOnDebug();
1000+
10051001
}
10061002
template <>
1007-
void matrixSetToAnother<std::complex<double>, base_device::DEVICE_GPU>::operator()(const int& n,
1008-
const std::complex<double>* A,
1009-
const int& LDA,
1010-
std::complex<double>* B,
1011-
const int& LDB)
1003+
void matrixCopy<std::complex<double>, base_device::DEVICE_GPU>::operator()(const int& n1,
1004+
const int& n2,
1005+
const std::complex<double>* A,
1006+
const int& LDA,
1007+
std::complex<double>* B,
1008+
const int& LDB)
10121009
{
1013-
int thread = 1024;
1014-
int block = (LDA + thread - 1) / thread;
1015-
matrix_setTo_another_kernel<thrust::complex<double>> <<<block, thread >>> (n, LDA, LDB, reinterpret_cast<const thrust::complex<double>*>(A), reinterpret_cast<thrust::complex<double>*>(B));
1016-
1010+
const dim3 blockSize(16, 16);
1011+
const dim3 gridSize((n1 + blockSize.x - 1) / blockSize.x, (n2 + blockSize.y - 1) / blockSize.y);
1012+
matrix_copy_kernel<thrust::complex<double>> <<<gridSize, blockSize >>> (n1, n2, reinterpret_cast<const thrust::complex<double>*>(A), LDA, reinterpret_cast<thrust::complex<double>*>(B), LDB);
10171013
cudaCheckOnDebug();
10181014
}
10191015

@@ -1027,23 +1023,23 @@ template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GP
10271023
template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
10281024
template struct constantvector_addORsub_constantVector_op<float, base_device::DEVICE_GPU>;
10291025
template struct constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_GPU>;
1030-
template struct matrixSetToAnother<std::complex<float>, base_device::DEVICE_GPU>;
1026+
template struct matrixCopy<std::complex<float>, base_device::DEVICE_GPU>;
10311027

10321028
template struct dot_real_op<std::complex<double>, base_device::DEVICE_GPU>;
10331029
template struct calc_grad_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
10341030
template struct line_minimize_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
10351031
template struct vector_div_constant_op<std::complex<double>, base_device::DEVICE_GPU>;
10361032
template struct vector_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
10371033
template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
1034+
template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>;
10381035
template struct constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_GPU>;
1039-
template struct matrixSetToAnother<std::complex<double>, base_device::DEVICE_GPU>;
1036+
template struct matrixCopy<double, base_device::DEVICE_GPU>;
1037+
template struct matrixCopy<std::complex<double>, base_device::DEVICE_GPU>;
10401038

10411039
#ifdef __LCAO
10421040
template struct dot_real_op<double, base_device::DEVICE_GPU>;
10431041
template struct vector_div_constant_op<double, base_device::DEVICE_GPU>;
10441042
template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
10451043
template struct vector_div_vector_op<double, base_device::DEVICE_GPU>;
1046-
template struct matrixSetToAnother<double, base_device::DEVICE_GPU>;
1047-
template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>;
10481044
#endif
10491045
} // namespace ModuleBase

source/module_base/kernels/math_kernel_op.cpp

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -337,16 +337,16 @@ struct matrixTranspose_op<T, base_device::DEVICE_CPU>
337337
};
338338

339339
template <typename T>
340-
struct matrixSetToAnother<T, base_device::DEVICE_CPU>
340+
struct matrixCopy<T, base_device::DEVICE_CPU>
341341
{
342-
void operator()(const int& n, const T* A, const int& LDA, T* B, const int& LDB)
342+
void operator()(const int& n1, const int& n2, const T* A, const int& LDA, T* B, const int& LDB)
343343
{
344344
#ifdef _OPENMP
345345
#pragma omp parallel for collapse(2) schedule(static, 8192 / sizeof(T))
346346
#endif
347-
for (int i = 0; i < n; i++)
347+
for (int i = 0; i < n1; i++)
348348
{
349-
for (int j = 0; j < LDA; j++)
349+
for (int j = 0; j < n2; j++)
350350
{
351351
B[i * LDB + j] = A[i * LDA + j];
352352
}
@@ -367,7 +367,7 @@ template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_CP
367367
template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_CPU>;
368368
template struct constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_CPU>;
369369
template struct matrixTranspose_op<std::complex<float>, base_device::DEVICE_CPU>;
370-
template struct matrixSetToAnother<std::complex<float>, base_device::DEVICE_CPU>;
370+
template struct matrixCopy<std::complex<float>, base_device::DEVICE_CPU>;
371371
template struct calc_grad_with_block_op<std::complex<float>, base_device::DEVICE_CPU>;
372372
template struct line_minimize_with_block_op<std::complex<float>, base_device::DEVICE_CPU>;
373373

@@ -385,7 +385,8 @@ template struct vector_mul_vector_op<std::complex<double>, base_device::DEVICE_C
385385
template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_CPU>;
386386
template struct constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_CPU>;
387387
template struct matrixTranspose_op<std::complex<double>, base_device::DEVICE_CPU>;
388-
template struct matrixSetToAnother<std::complex<double>, base_device::DEVICE_CPU>;
388+
template struct matrixCopy<double, base_device::DEVICE_CPU>;
389+
template struct matrixCopy<std::complex<double>, base_device::DEVICE_CPU>;
389390
template struct calc_grad_with_block_op<std::complex<double>, base_device::DEVICE_CPU>;
390391
template struct line_minimize_with_block_op<std::complex<double>, base_device::DEVICE_CPU>;
391392

@@ -394,7 +395,6 @@ template struct vector_mul_vector_op<double, base_device::DEVICE_CPU>;
394395
template struct vector_div_constant_op<double, base_device::DEVICE_CPU>;
395396
template struct vector_div_vector_op<double, base_device::DEVICE_CPU>;
396397
template struct matrixTranspose_op<double, base_device::DEVICE_CPU>;
397-
template struct matrixSetToAnother<double, base_device::DEVICE_CPU>;
398398
template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_CPU>;
399399
#endif
400400
#ifdef __DSP

source/module_base/kernels/math_kernel_op.h

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -298,19 +298,19 @@ template <typename T, typename Device> struct matrixTranspose_op {
298298
const T *input_matrix, T *output_matrix);
299299
};
300300

301-
template <typename T, typename Device> struct matrixSetToAnother {
302-
/// @brief initialize matrix B with A
301+
template <typename T, typename Device> struct matrixCopy {
302+
/// @brief copy matrix A to B, they can have different leading dimensions
303303
///
304304
/// Input Parameters
305-
/// \param n : first dimension of matrix
305+
/// \param n1 : first dimension of matrix
306+
/// \param n2 : second dimension of matrix
306307
/// \param A : input matrix A
307308
/// \param LDA : leading dimension of A
308309
/// \param LDB : leading dimension of B
309310
///
310311
/// Output Parameters
311312
/// \param B : output matrix B
312-
void operator()(const int &n, const T *A, const int &LDA,
313-
T *B, const int &LDB);
313+
void operator()(const int& n1, const int& n2, const T* A, const int& LDA, T* B, const int& LDB);
314314
};
315315

316316
#if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
@@ -370,12 +370,13 @@ struct constantvector_addORsub_constantVector_op<T, base_device::DEVICE_GPU> {
370370
const Real constant2);
371371
};
372372

373-
template <typename T> struct matrixSetToAnother<T, base_device::DEVICE_GPU> {
374-
void operator()(const int &n,
375-
const T *A, // input
376-
const int &LDA,
377-
T *B, // output
378-
const int &LDB);
373+
template <typename T> struct matrixCopy<T, base_device::DEVICE_GPU> {
374+
void operator()(const int& n1,
375+
const int& n2,
376+
const T* A, // input
377+
const int& LDA,
378+
T* B, // output
379+
const int& LDB);
379380
};
380381

381382
void createGpuBlasHandle();

source/module_base/kernels/rocm/math_kernel_op.hip.cu

Lines changed: 41 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -307,23 +307,16 @@ __global__ void matrix_transpose_kernel(
307307
}
308308
}
309309

310-
311310
template <typename T>
312-
__launch_bounds__(1024)
313-
__global__ void matrix_setTo_another_kernel(
314-
const int n,
315-
const int LDA,
316-
const int LDB,
317-
const T* matrix_A,
318-
T* matrix_B)
319-
{
320-
int j = blockIdx.x * blockDim.x + threadIdx.x;
321-
if (j < LDA && j < LDB)
311+
__launch_bounds__(1024) __global__
312+
void matrix_copy_kernel(const int n1, const int n2, const T* A, const int LDA, T* B, const int LDB)
313+
{
314+
const int i = blockIdx.x * blockDim.x + threadIdx.x;
315+
const int j = blockIdx.y * blockDim.y + threadIdx.y;
316+
317+
if (i < n1 && j < n2)
322318
{
323-
for (int i = 0; i < n; i++)
324-
{
325-
matrix_B[i * LDB + j] = matrix_A[i * LDA + j];
326-
}
319+
B[i * LDB + j] = A[i * LDA + j];
327320
}
328321
}
329322

@@ -889,39 +882,45 @@ void matrixTranspose_op<std::complex<double>, base_device::DEVICE_GPU>::operator
889882
}
890883

891884
template <>
892-
void matrixSetToAnother<double, base_device::DEVICE_GPU>::operator()(const int& n,
893-
const double* A,
894-
const int& LDA,
895-
double* B,
896-
const int& LDB)
885+
void matrixCopy<double, base_device::DEVICE_GPU>::operator()(const int& n1,
886+
const int& n2,
887+
const double* A,
888+
const int& LDA,
889+
double* B,
890+
const int& LDB)
897891
{
898-
int thread = 1024;
899-
int block = (LDA + thread - 1) / thread;
900-
hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_setTo_another_kernel<double>), dim3(block), dim3(thread), 0, 0, n, LDA, LDB, A, B);
892+
const dim3 blockSize(16, 16);
893+
const dim3 gridSize((n1 + blockSize.x - 1) / blockSize.x, (n2 + blockSize.y - 1) / blockSize.y);
894+
895+
hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_copy_kernel<double>), gridSize, blockSize, 0, 0, n1, n2, A, LDA, B, LDB);
901896
hipCheckOnDebug();
902897
}
903898
template <>
904-
void matrixSetToAnother<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int& n,
905-
const std::complex<float>* A,
906-
const int& LDA,
907-
std::complex<float>* B,
908-
const int& LDB)
899+
void matrixCopy<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int& n1,
900+
const int& n2,
901+
const std::complex<float>* A,
902+
const int& LDA,
903+
std::complex<float>* B,
904+
const int& LDB)
909905
{
910-
int thread = 1024;
911-
int block = (LDA + thread - 1) / thread;
912-
hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_setTo_another_kernel<thrust::complex<float>>), dim3(block), dim3(thread), 0, 0, n, LDA, LDB, reinterpret_cast<const thrust::complex<float>*>(A), reinterpret_cast<thrust::complex<float>*>(B));
906+
const dim3 blockSize(16, 16);
907+
const dim3 gridSize((n1 + blockSize.x - 1) / blockSize.x, (n2 + blockSize.y - 1) / blockSize.y);
908+
909+
hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_copy_kernel<thrust::complex<float>>), gridSize, blockSize, 0, 0, n1, n2, reinterpret_cast<const thrust::complex<float>*>(A), LDA, reinterpret_cast<thrust::complex<float>*>(B), LDB);
913910
hipCheckOnDebug();
914911
}
915912
template <>
916-
void matrixSetToAnother<std::complex<double>, base_device::DEVICE_GPU>::operator()(const int& n,
917-
const std::complex<double>* A,
918-
const int& LDA,
919-
std::complex<double>* B,
920-
const int& LDB)
913+
void matrixCopy<std::complex<double>, base_device::DEVICE_GPU>::operator()(const int& n1,
914+
const int& n2,
915+
const std::complex<double>* A,
916+
const int& LDA,
917+
std::complex<double>* B,
918+
const int& LDB)
921919
{
922-
int thread = 1024;
923-
int block = (LDA + thread - 1) / thread;
924-
hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_setTo_another_kernel<thrust::complex<double>>), dim3(block), dim3(thread), 0, 0, n, LDA, LDB, reinterpret_cast<const thrust::complex<double>*>(A), reinterpret_cast<thrust::complex<double>*>(B));
920+
const dim3 blockSize(16, 16);
921+
const dim3 gridSize((n1 + blockSize.x - 1) / blockSize.x, (n2 + blockSize.y - 1) / blockSize.y);
922+
923+
hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_copy_kernel<thrust::complex<double>>), gridSize, blockSize, 0, 0, n1, n2, reinterpret_cast<const thrust::complex<double>*>(A), LDA, reinterpret_cast<thrust::complex<double>*>(B), LDB);
925924
hipCheckOnDebug();
926925
}
927926

@@ -935,7 +934,7 @@ template struct vector_div_constant_op<std::complex<float>, base_device::DEVICE_
935934
template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
936935
template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
937936
template struct constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_GPU>;
938-
template struct matrixSetToAnother<std::complex<float>, base_device::DEVICE_GPU>;
937+
template struct matrixCopy<std::complex<float>, base_device::DEVICE_GPU>;
939938

940939
template struct dot_real_op<std::complex<double>, base_device::DEVICE_GPU>;
941940
template struct calc_grad_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
@@ -944,14 +943,14 @@ template struct vector_div_constant_op<std::complex<double>, base_device::DEVICE
944943
template struct vector_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
945944
template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
946945
template struct constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_GPU>;
947-
template struct matrixSetToAnother<std::complex<double>, base_device::DEVICE_GPU>;
946+
template struct matrixCopy<std::complex<double>, base_device::DEVICE_GPU>;
948947

949948
#ifdef __LCAO
950949
template struct dot_real_op<double, base_device::DEVICE_GPU>;
951950
template struct vector_div_constant_op<double, base_device::DEVICE_GPU>;
952951
template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
953952
template struct vector_div_vector_op<double, base_device::DEVICE_GPU>;
954-
template struct matrixSetToAnother<double, base_device::DEVICE_GPU>;
953+
template struct matrixCopy<double, base_device::DEVICE_GPU>;
955954
template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>;
956955
#endif
957956
} // namespace ModuleBase

0 commit comments

Comments
 (0)