From c76845a62413a3d2fc9ebca0caf3310f49389217 Mon Sep 17 00:00:00 2001 From: Qianruipku Date: Mon, 25 Nov 2024 21:26:18 +0800 Subject: [PATCH 1/2] fix stuck in out_chg --- source/module_hamilt_pw/hamilt_pwdft/parallel_grid.cpp | 3 --- source/module_io/write_cube.cpp | 2 +- tests/integrate/107_PW_OK/INPUT | 1 + 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/source/module_hamilt_pw/hamilt_pwdft/parallel_grid.cpp b/source/module_hamilt_pw/hamilt_pwdft/parallel_grid.cpp index 829029e4bb..5d6fe41764 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/parallel_grid.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/parallel_grid.cpp @@ -355,7 +355,6 @@ void Parallel_Grid::reduce(double* rhotot, const double* const rhoin)const // send the Barrier command. if(GlobalV::MY_POOL!=0) { - MPI_Barrier(MPI_COMM_WORLD); return; } @@ -407,8 +406,6 @@ void Parallel_Grid::reduce(double* rhotot, const double* const rhoin)const delete[] zpiece; - MPI_Barrier(MPI_COMM_WORLD); - return; } #endif diff --git a/source/module_io/write_cube.cpp b/source/module_io/write_cube.cpp index 8fa212c60f..dbb4e469cf 100644 --- a/source/module_io/write_cube.cpp +++ b/source/module_io/write_cube.cpp @@ -35,7 +35,7 @@ void ModuleIO::write_vdata_palgrid( // reduce std::vector data_xyz_full(nxyz); // data to be written #ifdef __MPI // reduce to rank 0 - if (my_pool == 0) + if (my_pool == 0 && GlobalV::MY_STOGROUP == 0) { pgrid.reduce(data_xyz_full.data(), data); } diff --git a/tests/integrate/107_PW_OK/INPUT b/tests/integrate/107_PW_OK/INPUT index 05c3e66e66..764bb78d76 100644 --- a/tests/integrate/107_PW_OK/INPUT +++ b/tests/integrate/107_PW_OK/INPUT @@ -11,6 +11,7 @@ pseudo_dir ../../PP_ORB ecutwfc 20 scf_thr 1e-8 scf_nmax 100 +kpar 2 out_chg 1 dft_functional scan From e85e43b017650b6e67d8a4ba990262faf35d7c24 Mon Sep 17 00:00:00 2001 From: Qianruipku Date: Thu, 28 Nov 2024 13:13:26 +0800 Subject: [PATCH 2/2] fix DCU low efficiency --- .../hamilt_pwdft/kernels/cuda/stress_op.cu | 22 ++++++++++++++++--- .../kernels/rocm/stress_op.hip.cu | 22 ++++++++++++++++--- 2 files changed, 38 insertions(+), 6 deletions(-) diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu index 36e0aac37a..997827d669 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu @@ -185,9 +185,25 @@ __global__ void cal_multi_dot(const int npw, const thrust::complex* psi, FPTYPE* sum) { - int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < npw) { - atomicAdd(sum, fac * gk1[idx] * gk2[idx] * d_kfac[idx] * thrust::norm(psi[idx])); + __shared__ FPTYPE s_sum[THREADS_PER_BLOCK]; + int tid = threadIdx.x + blockIdx.x * blockDim.x; + int cacheid = threadIdx.x; + FPTYPE local_sum = 0; + while (tid < npw) { + local_sum += fac * gk1[tid] * gk2[tid] * d_kfac[tid] * thrust::norm(psi[tid]); + tid += blockDim.x * gridDim.x; + } + s_sum[cacheid] = local_sum; + __syncthreads(); + + for (int s = blockDim.x / 2; s > 0; s >>= 1) { + if (cacheid < s) { + s_sum[cacheid] += s_sum[cacheid + s]; + } + __syncthreads(); + } + if (cacheid == 0) { + atomicAdd(sum, s_sum[0]); } } diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu index cd26e312a5..a5f8e553af 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu @@ -342,9 +342,25 @@ __global__ void cal_multi_dot(const int npw, const thrust::complex* psi, FPTYPE* sum) { - int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < npw) { - atomicAdd(sum, fac * gk1[idx] * gk2[idx] * d_kfac[idx] * thrust::norm(psi[idx])); + __shared__ FPTYPE s_sum[THREADS_PER_BLOCK]; + int tid = threadIdx.x + blockIdx.x * blockDim.x; + int cacheid = threadIdx.x; + FPTYPE local_sum = 0; + while (tid < npw) { + local_sum += fac * gk1[tid] * gk2[tid] * d_kfac[tid] * thrust::norm(psi[tid]); + tid += blockDim.x * gridDim.x; + } + s_sum[cacheid] = local_sum; + __syncthreads(); + + for (int s = blockDim.x / 2; s > 0; s >>= 1) { + if (cacheid < s) { + s_sum[cacheid] += s_sum[cacheid + s]; + } + __syncthreads(); + } + if (cacheid == 0) { + atomicAdd(sum, s_sum[0]); } }