diff --git a/source/module_base/kernels/cuda/math_kernel_op.cu b/source/module_base/kernels/cuda/math_kernel_op.cu
index c17f9cd579..39043daff1 100644
--- a/source/module_base/kernels/cuda/math_kernel_op.cu
+++ b/source/module_base/kernels/cuda/math_kernel_op.cu
@@ -385,22 +385,15 @@ __global__ void matrix_transpose_kernel(
     }
 }
 
-
 template <typename T>
-__global__ void matrix_setTo_another_kernel(
-    const int n,
-    const int LDA,
-    const int LDB,
-    const T* matrix_A,
-    T* matrix_B)
+__global__ void matrix_copy_kernel(const int n1, const int n2, const T* A, const int LDA, T* B, const int LDB)
 {
-    int j = blockIdx.x * blockDim.x + threadIdx.x;
-    if (j < LDA && j < LDB)
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    const int j = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (i < n1 && j < n2)
     {
-        for (int i = 0; i < n; i++)
-        {
-            matrix_B[i * LDB + j] = matrix_A[i * LDA + j];
-        }
+        B[i * LDB + j] = A[i * LDA + j];
     }
 }
@@ -980,40 +973,43 @@ void matrixTranspose_op<std::complex<double>, base_device::DEVICE_GPU>::operator
 }
 
 template <>
-void matrixSetToAnother<double, base_device::DEVICE_GPU>::operator()(const int& n,
-                                                                     const double* A,
-                                                                     const int& LDA,
-                                                                     double* B,
-                                                                     const int& LDB)
+void matrixCopy<double, base_device::DEVICE_GPU>::operator()(const int& n1,
+                                                             const int& n2,
+                                                             const double* A,
+                                                             const int& LDA,
+                                                             double* B,
+                                                             const int& LDB)
 {
-    int thread = 1024;
-    int block = (LDA + thread - 1) / thread;
-    matrix_setTo_another_kernel<double> <<<block, thread>>> (n, LDA, LDB, A, B);
+    const dim3 blockSize(16, 16);
+    const dim3 gridSize((n1 + blockSize.x - 1) / blockSize.x, (n2 + blockSize.y - 1) / blockSize.y);
+    matrix_copy_kernel<double> <<<gridSize, blockSize>>> (n1, n2, A, LDA, B, LDB);
     cudaCheckOnDebug();
 }
 
 template <>
-void matrixSetToAnother<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int& n,
-                                                                                  const std::complex<float>* A,
-                                                                                  const int& LDA,
-                                                                                  std::complex<float>* B,
-                                                                                  const int& LDB)
+void matrixCopy<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int& n1,
+                                                                          const int& n2,
+                                                                          const std::complex<float>* A,
+                                                                          const int& LDA,
+                                                                          std::complex<float>* B,
+                                                                          const int& LDB)
 {
-    int thread = 1024;
-    int block = (LDA + thread - 1) / thread;
-    matrix_setTo_another_kernel<thrust::complex<float>> <<<block, thread>>> (n, LDA, LDB, reinterpret_cast<const thrust::complex<float>*>(A), reinterpret_cast<thrust::complex<float>*>(B));
+    const dim3 blockSize(16, 16);
+    const dim3 gridSize((n1 + blockSize.x - 1) / blockSize.x, (n2 + blockSize.y - 1) / blockSize.y);
+    matrix_copy_kernel<thrust::complex<float>> <<<gridSize, blockSize>>> (n1, n2, reinterpret_cast<const thrust::complex<float>*>(A), LDA, reinterpret_cast<thrust::complex<float>*>(B), LDB);
     cudaCheckOnDebug();
+
 }
 
 template <>
-void matrixSetToAnother<std::complex<double>, base_device::DEVICE_GPU>::operator()(const int& n,
-                                                                                   const std::complex<double>* A,
-                                                                                   const int& LDA,
-                                                                                   std::complex<double>* B,
-                                                                                   const int& LDB)
+void matrixCopy<std::complex<double>, base_device::DEVICE_GPU>::operator()(const int& n1,
+                                                                           const int& n2,
+                                                                           const std::complex<double>* A,
+                                                                           const int& LDA,
+                                                                           std::complex<double>* B,
+                                                                           const int& LDB)
 {
-    int thread = 1024;
-    int block = (LDA + thread - 1) / thread;
-    matrix_setTo_another_kernel<thrust::complex<double>> <<<block, thread>>> (n, LDA, LDB, reinterpret_cast<const thrust::complex<double>*>(A), reinterpret_cast<thrust::complex<double>*>(B));
-
+    const dim3 blockSize(16, 16);
+    const dim3 gridSize((n1 + blockSize.x - 1) / blockSize.x, (n2 + blockSize.y - 1) / blockSize.y);
+    matrix_copy_kernel<thrust::complex<double>> <<<gridSize, blockSize>>> (n1, n2, reinterpret_cast<const thrust::complex<double>*>(A), LDA, reinterpret_cast<thrust::complex<double>*>(B), LDB);
     cudaCheckOnDebug();
 }
 
@@ -1027,7 +1023,7 @@ template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GP
 template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<float, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_GPU>;
-template struct matrixSetToAnother<std::complex<float>, base_device::DEVICE_GPU>;
+template struct matrixCopy<std::complex<float>, base_device::DEVICE_GPU>;
 template struct dot_real_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct calc_grad_with_block_op<std::complex<float>, base_device::DEVICE_GPU>;
@@ -1035,15 +1031,15 @@ template struct line_minimize_with_block_op<std::complex<float>, base_device::D
 template struct vector_div_constant_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct vector_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
+template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_GPU>;
-template struct matrixSetToAnother<std::complex<double>, base_device::DEVICE_GPU>;
+template struct matrixCopy<double, base_device::DEVICE_GPU>;
+template struct matrixCopy<std::complex<double>, base_device::DEVICE_GPU>;
 
 #ifdef __LCAO
 template struct dot_real_op<double, base_device::DEVICE_GPU>;
 template struct vector_div_constant_op<double, base_device::DEVICE_GPU>;
 template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<double, base_device::DEVICE_GPU>;
-template struct matrixSetToAnother<double, base_device::DEVICE_GPU>;
-template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>;
 #endif
 } // namespace ModuleBase
diff --git a/source/module_base/kernels/math_kernel_op.cpp b/source/module_base/kernels/math_kernel_op.cpp
index 3f25a1e47b..bb0bb923a5 100644
--- a/source/module_base/kernels/math_kernel_op.cpp
+++ b/source/module_base/kernels/math_kernel_op.cpp
@@ -337,16 +337,16 @@ struct matrixTranspose_op<T, base_device::DEVICE_CPU>
 };
 
 template <typename T>
-struct matrixSetToAnother<T, base_device::DEVICE_CPU>
+struct matrixCopy<T, base_device::DEVICE_CPU>
 {
-    void operator()(const int& n, const T* A, const int& LDA, T* B, const int& LDB)
+    void operator()(const int& n1, const int& n2, const T* A, const int& LDA, T* B, const int& LDB)
     {
 #ifdef _OPENMP
 #pragma omp parallel for collapse(2) schedule(static, 8192 / sizeof(T))
 #endif
-        for (int i = 0; i < n; i++)
+        for (int i = 0; i < n1; i++)
         {
-            for (int j = 0; j < LDA; j++)
+            for (int j = 0; j < n2; j++)
             {
                 B[i * LDB + j] = A[i * LDA + j];
             }
@@ -367,7 +367,7 @@ template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_CP
 template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_CPU>;
 template struct constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_CPU>;
 template struct matrixTranspose_op<std::complex<float>, base_device::DEVICE_CPU>;
-template struct matrixSetToAnother<std::complex<float>, base_device::DEVICE_CPU>;
+template struct matrixCopy<std::complex<float>, base_device::DEVICE_CPU>;
 template struct calc_grad_with_block_op<std::complex<float>, base_device::DEVICE_CPU>;
 template struct line_minimize_with_block_op<std::complex<float>, base_device::DEVICE_CPU>;
 
@@ -385,7 +385,8 @@ template struct vector_mul_vector_op<std::complex<double>, base_device::DEVICE_C
 template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_CPU>;
 template struct constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_CPU>;
 template struct matrixTranspose_op<std::complex<double>, base_device::DEVICE_CPU>;
-template struct matrixSetToAnother<std::complex<double>, base_device::DEVICE_CPU>;
+template struct matrixCopy<double, base_device::DEVICE_CPU>;
+template struct matrixCopy<std::complex<double>, base_device::DEVICE_CPU>;
 template struct calc_grad_with_block_op<std::complex<double>, base_device::DEVICE_CPU>;
 template struct line_minimize_with_block_op<std::complex<double>, base_device::DEVICE_CPU>;
 
@@ -394,7 +395,6 @@ template struct vector_mul_vector_op<double, base_device::DEVICE_CPU>;
 template struct vector_div_constant_op<double, base_device::DEVICE_CPU>;
 template struct vector_div_vector_op<double, base_device::DEVICE_CPU>;
 template struct matrixTranspose_op<double, base_device::DEVICE_CPU>;
-template struct matrixSetToAnother<double, base_device::DEVICE_CPU>;
 template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_CPU>;
 #endif
 #ifdef __DSP
diff --git a/source/module_base/kernels/math_kernel_op.h b/source/module_base/kernels/math_kernel_op.h
index 33ad5a7b4d..d94aeb6806 100644
--- a/source/module_base/kernels/math_kernel_op.h
+++ b/source/module_base/kernels/math_kernel_op.h
@@ -298,19 +298,19 @@ template <typename T, typename Device> struct matrixTranspose_op {
                     const T *input_matrix, T *output_matrix);
 };
 
-template <typename T, typename Device> struct matrixSetToAnother {
-    /// @brief initialize matrix B with A
+template <typename T, typename Device> struct matrixCopy {
+    /// @brief copy matrix A to matrix B; the two may have different leading dimensions
     ///
     /// Input Parameters
-    /// \param n : first dimension of matrix
+    /// \param n1 : first dimension of matrix (number of rows)
+    /// \param n2 : second dimension of matrix (number of columns to copy)
     /// \param A : input matrix A
     /// \param LDA : leading dimension of A
     /// \param LDB : leading dimension of B
     ///
    /// Output Parameters
     /// \param B : output matrix B
-    void operator()(const int &n, const T *A, const int &LDA,
-                    T *B, const int &LDB);
+    void operator()(const int& n1, const int& n2, const T* A, const int& LDA, T* B, const int& LDB);
 };
 
 #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
@@ -370,12 +370,13 @@ struct constantvector_addORsub_constantVector_op {
                     const Real constant2);
 };
 
-template <typename T> struct matrixSetToAnother<T, base_device::DEVICE_GPU> {
-    void operator()(const int &n,
-                    const T *A, // input
-                    const int &LDA,
-                    T *B, // output
-                    const int &LDB);
+template <typename T> struct matrixCopy<T, base_device::DEVICE_GPU> {
+    void operator()(const int& n1,
+                    const int& n2,
+                    const T* A, // input
+                    const int& LDA,
+                    T* B, // output
+                    const int& LDB);
 };
 
 void createGpuBlasHandle();
diff --git a/source/module_base/kernels/rocm/math_kernel_op.hip.cu b/source/module_base/kernels/rocm/math_kernel_op.hip.cu
index ba20d282b7..0279bfee75 100644
--- a/source/module_base/kernels/rocm/math_kernel_op.hip.cu
+++ b/source/module_base/kernels/rocm/math_kernel_op.hip.cu
@@ -307,23 +307,16 @@ __global__ void matrix_transpose_kernel(
     }
 }
 
-
 template <typename T>
-__launch_bounds__(1024)
-__global__ void matrix_setTo_another_kernel(
-    const int n,
-    const int LDA,
-    const int LDB,
-    const T* matrix_A,
-    T* matrix_B)
-{
-    int j = blockIdx.x * blockDim.x + threadIdx.x;
-    if (j < LDA && j < LDB)
+__launch_bounds__(1024) __global__
+    void matrix_copy_kernel(const int n1, const int n2, const T* A, const int LDA, T* B, const int LDB)
+{
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    const int j = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (i < n1 && j < n2)
     {
-        for (int i = 0; i < n; i++)
-        {
-            matrix_B[i * LDB + j] = matrix_A[i * LDA + j];
-        }
+        B[i * LDB + j] = A[i * LDA + j];
     }
 }
@@ -889,39 +882,45 @@ void matrixTranspose_op<std::complex<double>, base_device::DEVICE_GPU>::operator
 }
 
 template <>
-void matrixSetToAnother<double, base_device::DEVICE_GPU>::operator()(const int& n,
-                                                                     const double* A,
-                                                                     const int& LDA,
-                                                                     double* B,
-                                                                     const int& LDB)
+void matrixCopy<double, base_device::DEVICE_GPU>::operator()(const int& n1,
+                                                             const int& n2,
+                                                             const double* A,
+                                                             const int& LDA,
+                                                             double* B,
+                                                             const int& LDB)
 {
-    int thread = 1024;
-    int block = (LDA + thread - 1) / thread;
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_setTo_another_kernel<double>), dim3(block), dim3(thread), 0, 0, n, LDA, LDB, A, B);
+    const dim3 blockSize(16, 16);
+    const dim3 gridSize((n1 + blockSize.x - 1) / blockSize.x, (n2 + blockSize.y - 1) / blockSize.y);
+
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_copy_kernel<double>), gridSize, blockSize, 0, 0, n1, n2, A, LDA, B, LDB);
     hipCheckOnDebug();
 }
 
 template <>
-void matrixSetToAnother<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int& n,
-                                                                                  const std::complex<float>* A,
-                                                                                  const int& LDA,
-                                                                                  std::complex<float>* B,
-                                                                                  const int& LDB)
+void matrixCopy<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int& n1,
+                                                                          const int& n2,
+                                                                          const std::complex<float>* A,
+                                                                          const int& LDA,
+                                                                          std::complex<float>* B,
+                                                                          const int& LDB)
 {
-    int thread = 1024;
-    int block = (LDA + thread - 1) / thread;
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_setTo_another_kernel<thrust::complex<float>>), dim3(block), dim3(thread), 0, 0, n, LDA, LDB, reinterpret_cast<const thrust::complex<float>*>(A), reinterpret_cast<thrust::complex<float>*>(B));
+    const dim3 blockSize(16, 16);
+    const dim3 gridSize((n1 + blockSize.x - 1) / blockSize.x, (n2 + blockSize.y - 1) / blockSize.y);
+
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_copy_kernel<thrust::complex<float>>), gridSize, blockSize, 0, 0, n1, n2, reinterpret_cast<const thrust::complex<float>*>(A), LDA, reinterpret_cast<thrust::complex<float>*>(B), LDB);
     hipCheckOnDebug();
 }
 
 template <>
-void matrixSetToAnother<std::complex<double>, base_device::DEVICE_GPU>::operator()(const int& n,
-                                                                                   const std::complex<double>* A,
-                                                                                   const int& LDA,
-                                                                                   std::complex<double>* B,
-                                                                                   const int& LDB)
+void matrixCopy<std::complex<double>, base_device::DEVICE_GPU>::operator()(const int& n1,
+                                                                           const int& n2,
+                                                                           const std::complex<double>* A,
+                                                                           const int& LDA,
+                                                                           std::complex<double>* B,
+                                                                           const int& LDB)
 {
-    int thread = 1024;
-    int block = (LDA + thread - 1) / thread;
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_setTo_another_kernel<thrust::complex<double>>), dim3(block), dim3(thread), 0, 0, n, LDA, LDB, reinterpret_cast<const thrust::complex<double>*>(A), reinterpret_cast<thrust::complex<double>*>(B));
+    const dim3 blockSize(16, 16);
+    const dim3 gridSize((n1 + blockSize.x - 1) / blockSize.x, (n2 + blockSize.y - 1) / blockSize.y);
+
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_copy_kernel<thrust::complex<double>>), gridSize, blockSize, 0, 0, n1, n2, reinterpret_cast<const thrust::complex<double>*>(A), LDA, reinterpret_cast<thrust::complex<double>*>(B), LDB);
     hipCheckOnDebug();
 }
 
@@ -935,7 +934,7 @@ template struct vector_div_constant_op<std::complex<float>, base_device::DEVICE_
 template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_GPU>;
-template struct matrixSetToAnother<std::complex<float>, base_device::DEVICE_GPU>;
+template struct matrixCopy<std::complex<float>, base_device::DEVICE_GPU>;
 template struct dot_real_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct calc_grad_with_block_op<std::complex<float>, base_device::DEVICE_GPU>;
 
@@ -944,14 +943,14 @@ template struct vector_div_constant_op<std::complex<double>, base_device::DEVICE
 template struct vector_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_GPU>;
-template struct matrixSetToAnother<std::complex<double>, base_device::DEVICE_GPU>;
+template struct matrixCopy<std::complex<double>, base_device::DEVICE_GPU>;
 
 #ifdef __LCAO
 template struct dot_real_op<double, base_device::DEVICE_GPU>;
 template struct vector_div_constant_op<double, base_device::DEVICE_GPU>;
 template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<double, base_device::DEVICE_GPU>;
-template struct matrixSetToAnother<double, base_device::DEVICE_GPU>;
+template struct matrixCopy<double, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>;
 #endif
 } // namespace ModuleBase
diff --git a/source/module_base/kernels/test/math_kernel_test.cpp b/source/module_base/kernels/test/math_kernel_test.cpp
index 02dd7278f3..9490563743 100644
--- a/source/module_base/kernels/test/math_kernel_test.cpp
+++ b/source/module_base/kernels/test/math_kernel_test.cpp
@@ -630,7 +630,7 @@ TEST_F(TestModuleHsolverMathKernel, gemv_op_gpu)
     delete_memory_op()(Y_gemv_dev);
 }
 
-TEST_F(TestModuleHsolverMathKernel, matrixSetToAnother_op_gpu)
+TEST_F(TestModuleHsolverMathKernel, matrixCopy_op_gpu)
 {
     // const std::vector<std::complex<double>> expect_result = {
     //     {-0.11893203,-0.13492526}, {-0.40314756, 0.07734553}, {0.06892412, 0.14837423}, {0.0, 0.0},
@@ -665,25 +665,15 @@ TEST_F(TestModuleHsolverMathKernel, matrixSetToAnother_op_gpu)
                                                                              B.size());
 
     // run
-    ModuleBase::matrixSetToAnother<std::complex<double>, base_device::DEVICE_GPU>()(n,
-                                                                                    device_A,
-                                                                                    LDA,
-                                                                                    device_B,
-                                                                                    LDB);
+    ModuleBase::matrixCopy<std::complex<double>, base_device::DEVICE_GPU>()(n, LDA, device_A, LDA, device_B, LDB);
 
     std::vector<std::complex<double>> B_gpu2cpu(8);
     base_device::memory::synchronize_memory_op<std::complex<double>,
                                                base_device::DEVICE_CPU,
-                                               base_device::DEVICE_GPU>()(B_gpu2cpu.data(),
-                                                                          device_B,
-                                                                          B_gpu2cpu.size());
+                                               base_device::DEVICE_GPU>()(B_gpu2cpu.data(), device_B, B_gpu2cpu.size());
 
     std::vector<std::complex<double>> B_cpu(8);
-    ModuleBase::matrixSetToAnother<std::complex<double>, base_device::DEVICE_CPU>()(n,
-                                                                                    A.data(),
-                                                                                    LDA,
-                                                                                    B_cpu.data(),
-                                                                                    LDB);
+    ModuleBase::matrixCopy<std::complex<double>, base_device::DEVICE_CPU>()(n, LDA, A.data(), LDA, B_cpu.data(), LDB);
 
     // for (int i = 0; i < 4; i++)
     // {
diff --git a/source/module_base/para_gemm.cpp b/source/module_base/para_gemm.cpp
index 4783faa17a..abfacf5154 100644
--- a/source/module_base/para_gemm.cpp
+++ b/source/module_base/para_gemm.cpp
@@ -12,6 +12,11 @@ PGemmCN<T, Device>::PGemmCN()
 template <typename T, typename Device>
 PGemmCN<T, Device>::~PGemmCN()
 {
+#ifdef __MPI
+    delmem_dev_op()(C_local_tmp_);
+    delmem_dev_op()(A_tmp_device_);
+    delmem_dev_op()(B_tmp_device_);
+#endif
 }
 
 template <typename T, typename Device>
@@ -63,46 +68,77 @@ void PGemmCN<T, Device>::set_dimension(
         default:
             break;
     }
-    requests.resize(col_nproc);
-    if (this->divideCrow)
+
+    if (col_nproc > 1)
     {
-        colB_loc.resize(col_nproc);
-        MPI_Allgather(&ncolB, 1, MPI_INT, colB_loc.data(), 1, MPI_INT, col_world);
-        int sum = 0;
-        for (int ip = 0; ip < col_nproc; ip++)
+        requests.resize(col_nproc);
+        if (this->divideCrow)
         {
-            max_colB = std::max(max_colB, colB_loc[ip]);
-            sum += colB_loc[ip];
+            colB_loc.resize(col_nproc);
+            MPI_Allgather(&ncolB, 1, MPI_INT, colB_loc.data(), 1, MPI_INT, col_world);
+            int sum = 0;
+            for (int ip = 0; ip < col_nproc; ip++)
+            {
+                max_colB = std::max(max_colB, colB_loc[ip]);
+                sum += colB_loc[ip];
+            }
+            size_C_local = sum * LDC;
+
+            // allocate temporary memory
+            if (std::is_same<Device, base_device::DEVICE_GPU>::value)
+            {
+                resmem_dev_op()(B_tmp_device_, max_colB * LDB);
+            }
+            B_tmp_.resize(max_colB * LDB);
         }
-        size_C_local = sum * LDC;
-    }
-    else
-    {
-        colA_loc.resize(col_nproc);
-        MPI_Allgather(&ncolA, 1, MPI_INT, colA_loc.data(), 1, MPI_INT, col_world);
-        for (int ip = 0; ip < col_nproc; ip++)
+        else
         {
-            max_colA = std::max(max_colA, colA_loc[ip]);
-        }
-        size_C_local = ncolB * LDC;
-    }
+            colA_loc.resize(col_nproc);
+            MPI_Allgather(&ncolA, 1, MPI_INT, colA_loc.data(), 1, MPI_INT, col_world);
+            for (int ip = 0; ip < col_nproc; ip++)
+            {
+                max_colA = std::max(max_colA, colA_loc[ip]);
+            }
+            size_C_local = ncolB * LDC;
 
-    if (this->gatherC)
-    {
-        colB_loc.resize(col_nproc);
-        recv_counts.resize(col_nproc);
-        displs.resize(col_nproc);
-        MPI_Allgather(&ncolB, 1, MPI_INT, colB_loc.data(), 1, MPI_INT, col_world);
-        for (int ip = 0; ip < col_nproc; ip++)
-        {
-            recv_counts[ip] = LDC * colB_loc[ip];
+            // allocate temporary memory
+            if (std::is_same<Device, base_device::DEVICE_GPU>::value)
+            {
+                resmem_dev_op()(A_tmp_device_, max_colA * LDA);
+#ifndef __CUDA_MPI
+                isend_tmp_.resize(max_colA * LDA);
+#endif
+            }
+            A_tmp_.resize(max_colA * LDA);
         }
-        displs[0] = 0;
-        for (int ip = 1; ip < col_nproc; ip++)
+
+        if (this->gatherC)
         {
-            displs[ip] = displs[ip - 1] + recv_counts[ip - 1];
+            colB_loc.resize(col_nproc);
+            recv_counts.resize(col_nproc);
+            displs.resize(col_nproc);
+            MPI_Allgather(&ncolB, 1, MPI_INT, colB_loc.data(), 1, MPI_INT, col_world);
+            for (int ip = 0; ip < col_nproc; ip++)
+            {
+                recv_counts[ip] = LDC * colB_loc[ip];
+            }
+            displs[0] = 0;
+            for (int ip = 1; ip < col_nproc; ip++)
+            {
+                displs[ip] = displs[ip - 1] + recv_counts[ip - 1];
+            }
+            size_C_global = displs[col_nproc - 1] + recv_counts[col_nproc - 1];
+
+            // allocate temporary memory
+            if (std::is_same<Device, base_device::DEVICE_GPU>::value)
+            {
+                resmem_dev_op()(C_local_tmp_, size_C_local);
+#ifndef __CUDA_MPI
+                C_global_tmp_.resize(size_C_global);
+#endif
+            }
+            C_tmp_.resize(size_C_local);
         }
-        size_C_global = displs[col_nproc - 1] + recv_counts[col_nproc - 1];
     }
 #endif
 }
@@ -144,7 +180,8 @@ void PGemmCN<T, Device>::multiply_single(const T alpha, const T* A, const T* B,
 #ifdef __MPI
     if (this->row_nproc > 1)
     {
-        Parallel_Common::reduce_dev<T, Device>(C, size_C_local, row_world);
+        const int size = ncolB * LDC;
+        Parallel_Common::reduce_dev<T, Device>(C, size, row_world);
     }
 #endif
 }
@@ -155,50 +192,43 @@ void PGemmCN<T, Device>::multiply_col(const T alpha, const T* A, const T* B, con
 {
     const Device* ctx = {};
 
-    std::vector<T> B_tmp(max_colA * LDA);
-    std::vector<T> isend_tmp;
-#ifndef __CUDA_MPI
-    if (std::is_same<Device, base_device::DEVICE_GPU>::value)
-    {
-        isend_tmp.resize(max_colA * LDA);
-    }
-#endif
+    // send A to other procs
+    T* isend_tmp = isend_tmp_.data();
     for (int ip = 0; ip < col_nproc; ip++)
     {
         if (col_rank != ip)
         {
             int size = ncolA * LDA;
-            Parallel_Common::isend_dev<T, Device>(A, size, ip, 0, col_world, &requests[ip], isend_tmp.data());
+            Parallel_Common::isend_dev<T, Device>(A, size, ip, 0, col_world, &requests[ip], isend_tmp);
         }
     }
+
+    // init pointers
     T* C_local = C;
-    std::vector<T> C_tmp;
     if (this->gatherC)
     {
-        C_tmp.resize(size_C_local);
         if (std::is_same<Device, base_device::DEVICE_GPU>::value)
         {
-            C_local = nullptr;
-            resmem_dev_op()(C_local, size_C_local);
+            C_local = C_local_tmp_;
         }
         else
         {
-            C_local = C_tmp.data();
+            C_local = C_tmp_.data();
        }
         syncmem_dev_op()(C_local, C + displs[col_rank], size_C_local);
     }
-
     T* Atmp_device = nullptr;
     if (std::is_same<Device, base_device::DEVICE_GPU>::value)
     {
-        resmem_dev_op()(Atmp_device, max_colA * LDA);
+        Atmp_device = A_tmp_device_;
     }
     else
     {
-        Atmp_device = B_tmp.data();
+        Atmp_device = A_tmp_.data();
     }
+
+    // multiply
     int shift = 0;
     T real_beta = row_rank == 0 ? beta : 0;
     for (int ip = 0; ip < col_nproc; ip++)
@@ -226,7 +256,7 @@ void PGemmCN<T, Device>::multiply_col(const T alpha, const T* A, const T* B, con
             int m = colA_loc[ip];
             int size = m * LDA;
             MPI_Status status;
-            Parallel_Common::recv_dev<T, Device>(Atmp_device, size, ip, 0, col_world, &status, B_tmp.data());
+            Parallel_Common::recv_dev<T, Device>(Atmp_device, size, ip, 0, col_world, &status, A_tmp_.data());
             MPI_Wait(&requests[ip], &status);
             ModuleBase::gemm_op<T, Device>()('C',
                                              'N',
@@ -248,44 +278,35 @@ void PGemmCN<T, Device>::multiply_col(const T alpha, const T* A, const T* B, con
     if (this->gatherC)
     {
 #ifdef __CUDA_MPI
-        if (this->row_nproc > 1)
-        {
-            Parallel_Common::reduce_data(C_local, size_C_local, row_world);
-        }
-        Parallel_Common::gatherv_data(C_local, size_C_local, C, recv_counts.data(), displs.data(), col_world);
+        T* Clocal_mpi = C_local;
+        T* Cglobal_mpi = C;
 #else
-        T* Cglobal_cpu = nullptr;
-        T* Clocal_cpu = C_tmp.data();
-        std::vector<T> cpu_tmp;
-
+        T* Clocal_mpi = C_tmp_.data();
+        T* Cglobal_mpi = nullptr;
         if (std::is_same<Device, base_device::DEVICE_GPU>::value)
         {
-            delmem_dev_op()(Atmp_device);
-
-            syncmem_d2h_op()(Clocal_cpu, C_local, size_C_local);
-            delmem_dev_op()(C_local);
-
-            cpu_tmp.resize(size_C_global);
-            Cglobal_cpu = cpu_tmp.data();
+            syncmem_d2h_op()(Clocal_mpi, C_local, size_C_local);
+            Cglobal_mpi = C_global_tmp_.data();
         }
         else
         {
-            Cglobal_cpu = C;
+            Cglobal_mpi = C;
         }
+#endif
         if (this->row_nproc > 1)
         {
-            Parallel_Common::reduce_data(Clocal_cpu, size_C_local, row_world);
+            Parallel_Common::reduce_data(Clocal_mpi, size_C_local, row_world);
         }
-        Parallel_Common::gatherv_data(Clocal_cpu,
+        Parallel_Common::gatherv_data(Clocal_mpi,
                                       size_C_local,
-                                      Cglobal_cpu,
+                                      Cglobal_mpi,
                                       recv_counts.data(),
                                       displs.data(),
                                       col_world);
-
+#ifndef __CUDA_MPI
         if (std::is_same<Device, base_device::DEVICE_GPU>::value)
         {
-            syncmem_h2d_op()(C, Cglobal_cpu, size_C_global);
+            syncmem_h2d_op()(C, Cglobal_mpi, size_C_global);
         }
 #endif
     }
@@ -303,28 +324,28 @@ void PGemmCN<T, Device>::multiply_row(const T alpha, const T* A, const T* B, con
 {
     const Device* ctx = {};
 
-    std::vector<T> B_tmp(max_colB * LDB);
+    // send B to other procs
     for (int ip = 0; ip < col_nproc; ip++)
     {
         if (col_rank != ip)
         {
             int size = ncolB * LDB;
-            Parallel_Common::isend_dev<T, Device>(B, size, ip, 0, col_world, &requests[ip], B_tmp.data());
+            Parallel_Common::isend_dev<T, Device>(B, size, ip, 0, col_world, &requests[ip], B_tmp_.data());
         }
     }
 
-    std::vector<T> C_tmp;
-
+    // init pointers
     T* Btmp_device = nullptr;
     if (std::is_same<Device, base_device::DEVICE_GPU>::value)
     {
-        resmem_dev_op()(Btmp_device, max_colB * LDB);
+        Btmp_device = B_tmp_device_;
     }
     else
     {
-        Btmp_device = B_tmp.data();
+        Btmp_device = B_tmp_.data();
     }
 
+    // multiply
     int shift = 0;
     T real_beta = row_rank == 0 ? beta : 0;
     for (int ip = 0; ip < col_nproc; ip++)
@@ -352,7 +373,7 @@ void PGemmCN<T, Device>::multiply_row(const T alpha, const T* A, const T* B, con
             int m = colB_loc[ip];
             int size = m * LDB;
             MPI_Status status;
-            Parallel_Common::recv_dev<T, Device>(Btmp_device, size, ip, 0, col_world, &status, B_tmp.data());
+            Parallel_Common::recv_dev<T, Device>(Btmp_device, size, ip, 0, col_world, &status, B_tmp_.data());
             MPI_Wait(&requests[ip], &status);
             ModuleBase::gemm_op<T, Device>()('C',
                                              'N',
diff --git a/source/module_base/para_gemm.h b/source/module_base/para_gemm.h
index f00d3cd975..a0e01d2734 100644
--- a/source/module_base/para_gemm.h
+++ b/source/module_base/para_gemm.h
@@ -98,6 +98,20 @@ class PGemmCN
     using syncmem_dev_op = base_device::memory::synchronize_memory_op<T, Device, Device>;
     using syncmem_d2h_op = base_device::memory::synchronize_memory_op<T, base_device::DEVICE_CPU, Device>;
     using syncmem_h2d_op = base_device::memory::synchronize_memory_op<T, Device, base_device::DEVICE_CPU>;
+
+#ifdef __MPI
+  private:
+    std::vector<T> isend_tmp_;    ///< temporary memory for sending data
+    std::vector<T> A_tmp_;        ///< temporary memory for A
+    std::vector<T> B_tmp_;        ///< temporary memory for B
+    std::vector<T> C_tmp_;        ///< temporary memory for C
+    std::vector<T> C_global_tmp_; ///< temporary memory for the global C
+    T* C_local_tmp_ = nullptr;    ///< temporary device memory for the local C
+    T* A_tmp_device_ = nullptr;   ///< temporary device memory for A
+    T* B_tmp_device_ = nullptr;   ///< temporary device memory for B
+#endif
+
+
 };
 } // namespace ModuleBase
 #endif
\ No newline at end of file
diff --git a/source/module_hsolver/diago_iter_assist.cpp b/source/module_hsolver/diago_iter_assist.cpp
index ce1996db70..e39b8bd44b 100644
--- a/source/module_hsolver/diago_iter_assist.cpp
+++ b/source/module_hsolver/diago_iter_assist.cpp
@@ -136,7 +136,7 @@ void DiagoIterAssist<T, Device>::diagH_subspace(const hamilt::Hamilt<T, Device>*
 
     if (!in_place)
     {
-        ModuleBase::matrixSetToAnother<T, Device>()(n_band, temp, ld_temp, evc.get_pointer(), dmax);
+        ModuleBase::matrixCopy<T, Device>()(n_band, ld_temp, temp, ld_temp, evc.get_pointer(), dmax);
         delmem_complex_op()(temp);
     }
     delmem_complex_op()(hcc);
@@ -347,7 +347,7 @@ void DiagoIterAssist<T, Device>::diagH_subspace_init(hamilt::Hamilt<T, Device>*
                                     evc.get_pointer(),
                                     dmax);
 
-        // matrixSetToAnother<T, Device>()(ctx, n_band, evctemp, dmin, evc.get_pointer(), dmax);
+        // matrixCopy<T, Device>()(ctx, n_band, evctemp, dmin, evc.get_pointer(), dmax);
         // delmem_complex_op()(ctx, evctemp);
     }
 
@@ -561,7 +561,7 @@ void DiagoIterAssist<T, Device>::diag_subspace_psi(const T* hcc,
                                   &zero,
                                   temp,
                                   dmin);
 
-        ModuleBase::matrixSetToAnother<T, Device>()(n_band, temp, dmin, evc.get_pointer(), dmax);
+        ModuleBase::matrixCopy<T, Device>()(n_band, dmin, temp, dmin, evc.get_pointer(), dmax);
 
         delmem_complex_op()(temp);
     }
diff --git a/source/module_hsolver/para_linear_transform.cpp b/source/module_hsolver/para_linear_transform.cpp
index 2c531f3dc9..5a6c8def27 100644
--- a/source/module_hsolver/para_linear_transform.cpp
+++ b/source/module_hsolver/para_linear_transform.cpp
@@ -1,4 +1,5 @@
 #include "para_linear_transform.h"
+
 #include "module_base/timer.h"
 
 #include <algorithm>
@@ -6,6 +7,15 @@ namespace hsolver
 {
 template <typename T, typename Device>
+PLinearTransform<T, Device>::~PLinearTransform()
+{
+#ifdef __MPI
+    delmem_dev_op()(U_tmp_);
+    delmem_dev_op()(B_tmp_);
+    delmem_dev_op()(A_tmp_device_);
+#endif
+}
+template <typename T, typename Device>
 void PLinearTransform<T, Device>::set_dimension(const int nrowA,
                                                 const int ncolA,
                                                 const int ncolB,
@@ -46,6 +56,18 @@ void PLinearTransform<T, Device>::set_dimension(const int nrowA,
         start_colB[ip] = start_colB[ip - 1] + colB_loc[ip - 1];
     }
     this->max_colB = *std::max_element(colB_loc.begin(), colB_loc.end());
+
+    // allocate temporary memory
+    resmem_dev_op()(B_tmp_, ncolB * LDA);
+    resmem_dev_op()(U_tmp_, max_colA * max_colB);
+    if (std::is_same<Device, base_device::DEVICE_GPU>::value)
+    {
+        resmem_dev_op()(A_tmp_device_, max_colA * LDA);
+#ifndef __CUDA_MPI
+        isend_tmp_.resize(max_colA * LDA);
+#endif
+    }
+    A_tmp_.resize(max_colA * LDA);
 }
 #else
     nproc_col = 1;
@@ -56,100 +78,75 @@ template <typename T, typename Device>
 void PLinearTransform<T, Device>::act(const T alpha, const T* A, const T* U, const T beta, T* B)
 {
     ModuleBase::timer::tick("PLinearTransform", "act");
-    const Device* ctx = {};
 #ifdef __MPI
     if (nproc_col > 1)
     {
+        syncmem_dev_op()(B_tmp_, B, ncolB * LDA);
         std::vector<MPI_Request> requests(nproc_col);
-        std::vector<T> A_tmp(max_colA * LDA);
-        std::vector<T> isend_tmp;
-        T* A_tmp_device = A_tmp.data();
-        if (std::is_same<Device, base_device::DEVICE_GPU>::value)
-        {
-            A_tmp_device = nullptr;
-#ifndef __CUDA_MPI
-            isend_tmp.resize(max_colA * LDA);
-#endif
-            resmem_dev_op()(A_tmp_device, max_colA * LDA);
-        }
-        T* B_tmp = nullptr;
-        resmem_dev_op()(B_tmp, ncolB * LDA);
-        syncmem_dev_op()(B_tmp, B, ncolB * LDA);
-        setmem_dev_op()(B, 0.0, ncolB * LDA);
-
-        T* U_tmp = nullptr;
-        resmem_dev_op()(U_tmp, max_colA * max_colB);
-
         // Send
         for (int ip = 0; ip < nproc_col; ++ip)
         {
             if (rank_col != ip)
             {
                 int size = LDA * ncolA;
-                Parallel_Common::isend_dev<T, Device>(A, size, ip, 0, col_world, &requests[ip], isend_tmp.data());
+                Parallel_Common::isend_dev<T, Device>(A, size, ip, 0, col_world, &requests[ip], isend_tmp_.data());
             }
         }
 
-        // Receive
+        // local part
        const int start = this->localU ? 0 : start_colB[rank_col];
+        const T* U_part = U + start_colA[rank_col] + start * ncolA_glo;
+        ModuleBase::matrixCopy<T, Device>()(ncolB, ncolA, U_part, ncolA_glo, U_tmp_, ncolA);
+        ModuleBase::gemm_op<T, Device>()('N', 'N', nrowA, ncolB, ncolA, &alpha, A, LDA, U_tmp_, ncolA, &beta, B, LDA);
+
+        // Receive
+        T* Atmp_device = nullptr;
+        if (std::is_same<Device, base_device::DEVICE_GPU>::value)
+        {
+            Atmp_device = A_tmp_device_;
+        }
+        else
+        {
+            Atmp_device = A_tmp_.data();
+        }
         for (int ip = 0; ip < nproc_col; ++ip)
         {
-            T real_beta = ip == 0 ? beta : 0;
-            const int ncolA_ip = colA_loc[ip];
-            // get U_tmp
-
-            const int start_row = start_colA[ip];
-            for (int i = 0; i < ncolB; ++i)
+            if (ip != rank_col)
             {
-                const T* U_part = U + start_row + (i + start) * ncolA_glo;
-                syncmem_dev_op()(U_tmp + i * ncolA_ip, U_part, ncolA_ip);
-            }
+                T zero = 0.0;
+                const int ncolA_ip = colA_loc[ip];
+                const T* U_part = U + start_colA[ip] + start * ncolA_glo;
+                ModuleBase::matrixCopy<T, Device>()(ncolB, ncolA_ip, U_part, ncolA_glo, U_tmp_, ncolA_ip);
 
-            if (ip == rank_col)
-            {
-                ModuleBase::gemm_op<T, Device>()('N',
-                                                 'N',
-                                                 nrowA,
-                                                 ncolB,
-                                                 ncolA_ip,
-                                                 &alpha,
-                                                 A,
-                                                 LDA,
-                                                 U_tmp,
-                                                 ncolA_ip,
-                                                 &real_beta,
-                                                 B_tmp,
-                                                 LDA);
-            }
-            else
-            {
                 int size = LDA * ncolA_ip;
                 MPI_Status status;
-                Parallel_Common::recv_dev<T, Device>(A_tmp_device, size, ip, 0, col_world, &status, A_tmp.data());
-                MPI_Wait(&requests[ip], &status);
+                Parallel_Common::recv_dev<T, Device>(Atmp_device, size, ip, 0, col_world, &status, A_tmp_.data());
                 ModuleBase::gemm_op<T, Device>()('N',
                                                  'N',
                                                  nrowA,
                                                  ncolB,
                                                  ncolA_ip,
                                                  &alpha,
-                                                 A_tmp_device,
+                                                 Atmp_device,
                                                  LDA,
-                                                 U_tmp,
+                                                 U_tmp_,
                                                  ncolA_ip,
-                                                 &real_beta,
-                                                 B_tmp,
+                                                 &zero,
+                                                 B_tmp_,
                                                  LDA);
+                // sum all the results
+                T one = 1.0;
+                ModuleBase::axpy_op<T, Device>()(ncolB * LDA, &one, B_tmp_, 1, B, 1);
             }
-            // sum all the results
-            T one = 1.0;
-            ModuleBase::axpy_op<T, Device>()(ncolB * LDA, &one, B_tmp, 1, B, 1);
         }
-        delmem_dev_op()(U_tmp);
-        delmem_dev_op()(B_tmp);
-        if (std::is_same<Device, base_device::DEVICE_GPU>::value)
+
+        for (int ip = 0; ip < nproc_col; ++ip)
         {
-            delmem_dev_op()(A_tmp_device);
+            if (rank_col != ip)
+            {
+                MPI_Status status;
+                MPI_Wait(&requests[ip], &status);
+            }
         }
     }
     else
diff --git a/source/module_hsolver/para_linear_transform.h b/source/module_hsolver/para_linear_transform.h
index 42cb02fb47..8e5e69d203 100644
--- a/source/module_hsolver/para_linear_transform.h
+++ b/source/module_hsolver/para_linear_transform.h
@@ -4,6 +4,7 @@
 #include "module_base/module_device/device.h"
 #include "module_base/module_device/memory_op.h"
 #include "module_base/parallel_device.h"
+
 #include <vector>
 #ifdef __MPI
 #include "mpi.h"
@@ -24,6 +25,7 @@ class PLinearTransform
     using resmem_dev_op = base_device::memory::resize_memory_op<T, Device>;
     using setmem_dev_op = base_device::memory::set_memory_op<T, Device>;
     using delmem_dev_op = base_device::memory::delete_memory_op<T, Device>;
+    ~PLinearTransform();
     int nproc_col = 1;
     int rank_col = 0;
     int nrowA = 0;
@@ -71,6 +73,15 @@ class PLinearTransform
      *
      */
     void act(const T alpha, const T* A, const T* U_global, const T beta, T* B);
+
+#ifdef __MPI
+  private:
+    std::vector<T> A_tmp_;      // temporary memory for A
+    std::vector<T> isend_tmp_;  // temporary memory for isend
+    T* U_tmp_ = nullptr;        // temporary memory for U
+    T* B_tmp_ = nullptr;        // temporary memory for B
+    T* A_tmp_device_ = nullptr; // temporary device pointer
+#endif
 };
 } // namespace hsolver
 #endif
\ No newline at end of file
diff --git a/tests/integrate/102_PW_BPCG_BP/result.ref b/tests/integrate/102_PW_BPCG_BP/result.ref
index b97d5ed713..8226c47db4 100755
--- a/tests/integrate/102_PW_BPCG_BP/result.ref
+++ b/tests/integrate/102_PW_BPCG_BP/result.ref
@@ -1,7 +1,7 @@
 etotref -4869.7470520063843651
 etotperatomref -2434.8735260032
-totalforceref 5.194830
-totalstressref 37241.453346
+totalforceref 5.1952120
+totalstressref 37241.490309
 pointgroupref C_1
 spacegroupref C_1
 nksibzref 8
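
A minimal standalone sketch of the copy contract that the renamed `matrixCopy` operator implements, assuming row-major storage where `n1` is the number of rows, `n2` the number of columns copied per row, and `LDA`/`LDB` the leading dimensions of source and destination. The free function `matrix_copy` below is hypothetical; it only mirrors the patched CPU loop and is not the ABACUS implementation.

```cpp
// Hypothetical illustration of the matrixCopy(n1, n2, A, LDA, B, LDB) contract:
// copy an n1 x n2 block out of A (leading dimension LDA) into B (leading
// dimension LDB), allowing the two layouts to differ.
#include <cassert>
#include <iostream>
#include <vector>

template <typename T>
void matrix_copy(const int n1, const int n2, const T* A, const int LDA, T* B, const int LDB)
{
    assert(n2 <= LDA && n2 <= LDB); // the copied width must fit both layouts
    for (int i = 0; i < n1; i++)
    {
        for (int j = 0; j < n2; j++)
        {
            B[i * LDB + j] = A[i * LDA + j];
        }
    }
}

int main()
{
    // Repack a 2 x 3 block stored with LDA = 4 into a tight LDB = 3 layout.
    const std::vector<double> A = {1, 2, 3, 0,
                                   4, 5, 6, 0};
    std::vector<double> B(6, 0.0);
    matrix_copy(2, 3, A.data(), 4, B.data(), 3);
    for (const double v : B)
    {
        std::cout << v << ' '; // prints: 1 2 3 4 5 6
    }
    std::cout << '\n';
    return 0;
}
```

The GPU specializations in the patch launch this same loop as one thread per element: a 16 x 16 block with `gridSize = ((n1 + 15) / 16, (n2 + 15) / 16)`, the usual ceiling division so every index pair (i, j) with i < n1 and j < n2 is covered exactly once, replacing the old one-thread-per-column kernel that looped over rows serially.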