From c76845a62413a3d2fc9ebca0caf3310f49389217 Mon Sep 17 00:00:00 2001
From: Qianruipku <Terry_Liu@pku.edu.cn>
Date: Mon, 25 Nov 2024 21:26:18 +0800
Subject: [PATCH 1/2] fix stuck in out_chg

---
 source/module_hamilt_pw/hamilt_pwdft/parallel_grid.cpp | 3 ---
 source/module_io/write_cube.cpp                        | 2 +-
 tests/integrate/107_PW_OK/INPUT                        | 1 +
 3 files changed, 2 insertions(+), 4 deletions(-)
diff --git a/source/module_hamilt_pw/hamilt_pwdft/parallel_grid.cpp b/source/module_hamilt_pw/hamilt_pwdft/parallel_grid.cpp
index 829029e4bb..5d6fe41764 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/parallel_grid.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/parallel_grid.cpp
@@ -355,7 +355,6 @@ void Parallel_Grid::reduce(double* rhotot, const double* const rhoin)const
 	// send the Barrier command.
 	if(GlobalV::MY_POOL!=0) 
 	{
-		MPI_Barrier(MPI_COMM_WORLD);
 		return;
 	}
 
@@ -407,8 +406,6 @@ void Parallel_Grid::reduce(double* rhotot, const double* const rhoin)const
 
 	delete[] zpiece;	
 
-	MPI_Barrier(MPI_COMM_WORLD);
-
 	return;
 }
 #endif
diff --git a/source/module_io/write_cube.cpp b/source/module_io/write_cube.cpp
index 8fa212c60f..dbb4e469cf 100644
--- a/source/module_io/write_cube.cpp
+++ b/source/module_io/write_cube.cpp
@@ -35,7 +35,7 @@ void ModuleIO::write_vdata_palgrid(
     // reduce
     std::vector<double> data_xyz_full(nxyz);    // data to be written
 #ifdef __MPI    // reduce to rank 0
-    if (my_pool == 0)
+    if (my_pool == 0 && GlobalV::MY_STOGROUP == 0)
     {
         pgrid.reduce(data_xyz_full.data(), data);
     }
diff --git a/tests/integrate/107_PW_OK/INPUT b/tests/integrate/107_PW_OK/INPUT
index 05c3e66e66..764bb78d76 100644
--- a/tests/integrate/107_PW_OK/INPUT
+++ b/tests/integrate/107_PW_OK/INPUT
@@ -11,6 +11,7 @@ pseudo_dir	../../PP_ORB
 ecutwfc			20
 scf_thr			1e-8
 scf_nmax		100
+kpar           2
 
 out_chg         1
 dft_functional  scan	

From e85e43b017650b6e67d8a4ba990262faf35d7c24 Mon Sep 17 00:00:00 2001
From: Qianruipku <Terry_Liu@pku.edu.cn>
Date: Thu, 28 Nov 2024 13:13:26 +0800
Subject: [PATCH 2/2] fix DCU low efficiency

---
 .../hamilt_pwdft/kernels/cuda/stress_op.cu    | 22 ++++++++++++++++---
 .../kernels/rocm/stress_op.hip.cu             | 22 ++++++++++++++++---
 2 files changed, 38 insertions(+), 6 deletions(-)

diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu
index 36e0aac37a..997827d669 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu
@@ -185,9 +185,25 @@ __global__ void cal_multi_dot(const int npw,
                               const thrust::complex<FPTYPE>* psi,
                               FPTYPE* sum)
 {
-    int idx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (idx < npw) {
-        atomicAdd(sum, fac * gk1[idx] * gk2[idx] * d_kfac[idx] * thrust::norm(psi[idx]));
+    __shared__ FPTYPE s_sum[THREADS_PER_BLOCK];
+    int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    int cacheid = threadIdx.x;
+    FPTYPE local_sum = 0;
+    while (tid < npw) {
+        local_sum += fac * gk1[tid] * gk2[tid] * d_kfac[tid] * thrust::norm(psi[tid]);
+        tid += blockDim.x * gridDim.x;
+    }
+    s_sum[cacheid] = local_sum;
+    __syncthreads();
+
+    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
+        if (cacheid < s) {
+            s_sum[cacheid] += s_sum[cacheid + s];
+        }
+        __syncthreads();
+    }
+    if (cacheid == 0) {
+        atomicAdd(sum, s_sum[0]);
     }
 }
 
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu
index cd26e312a5..a5f8e553af 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu
@@ -342,9 +342,25 @@ __global__ void cal_multi_dot(const int npw,
                               const thrust::complex<FPTYPE>* psi,
                               FPTYPE* sum)
 {
-    int idx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (idx < npw) {
-        atomicAdd(sum, fac * gk1[idx] * gk2[idx] * d_kfac[idx] * thrust::norm(psi[idx]));
+    __shared__ FPTYPE s_sum[THREADS_PER_BLOCK];
+    int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    int cacheid = threadIdx.x;
+    FPTYPE local_sum = 0;
+    while (tid < npw) {
+        local_sum += fac * gk1[tid] * gk2[tid] * d_kfac[tid] * thrust::norm(psi[tid]);
+        tid += blockDim.x * gridDim.x;
+    }
+    s_sum[cacheid] = local_sum;
+    __syncthreads();
+
+    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
+        if (cacheid < s) {
+            s_sum[cacheid] += s_sum[cacheid + s];
+        }
+        __syncthreads();
+    }
+    if (cacheid == 0) {
+        atomicAdd(sum, s_sum[0]);
     }
 }