Skip to content

Commit fb83351

Browse files
committed
Fix a bug and make a small optimization
1 parent 294eff0 commit fb83351

File tree

2 files changed

+17
-17
lines changed

2 files changed

+17
-17
lines changed

source/source_pw/module_pwdft/forces_cc.cpp

Lines changed: 7 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -180,26 +180,23 @@ void Forces<FPTYPE, Device>::cal_force_cc(ModuleBase::matrix& forcecc,
180180
syncmem_var_h2d_op()(rhocgigg_vec_d, rhocgigg_vec.data(), rho_basis->npw);
181181
}
182182

183-
double force[3] = {0, 0, 0};
184-
185183
if(this->device == base_device::GpuDevice ) {
186184
hamilt::cal_force_npw_op<FPTYPE, Device>()(
187185
psiv_d, gv_d, rhocgigg_vec_d, force_it_d, tau_it_d,
188186
rho_basis->npw, ucell_in.omega, ucell_in.tpiba, ucell_in.atoms[it].na
189187
);
190-
tau_it_d += 3 * ucell_in.atoms[it].na; // update the start address of each atom type's tau
191-
force_it_d += 3 * ucell_in.atoms[it].na;
192188
} else {
193189
#pragma omp for
194190
for(int ia = 0; ia < ucell_in.atoms[it].na; ia++)
195191
{
196192
double fx = 0.0, fy = 0.0, fz = 0.0;
193+
int iat = ucell_in.itia2iat(it, ia);
197194
for (int ig = 0; ig < rho_basis->npw; ig++)
198195
{
199196
const std::complex<double> psiv_conj = conj(psiv[ig]);
200197

201-
const double arg = ModuleBase::TWO_PI * (gv_h[ig * 3] * tau_h[ia * 3]
202-
+ gv_h[ig * 3 + 1] * tau_h[ia * 3 + 1] + gv_h[ig * 3 + 2] * tau_h[ia * 3 + 2]);
198+
const double arg = ModuleBase::TWO_PI * (gv_h[ig * 3] * tau_h[iat * 3]
199+
+ gv_h[ig * 3 + 1] * tau_h[iat * 3 + 1] + gv_h[ig * 3 + 2] * tau_h[iat * 3 + 2]);
203200
double sinp, cosp;
204201
ModuleBase::libm::sincos(arg, &sinp, &cosp);
205202
const std::complex<double> expiarg = std::complex<double>(sinp, cosp);
@@ -215,20 +212,18 @@ void Forces<FPTYPE, Device>::cal_force_cc(ModuleBase::matrix& forcecc,
215212
const std::complex<double> ipol2 = tmp_var * gv_h[ig * 3 + 2];
216213
fz += ipol2.real();
217214
}
218-
int iat = ucell_in.itia2iat(it, ia);
219215
forcecc(iat, 0) += fx;
220216
forcecc(iat, 1) += fy;
221217
forcecc(iat, 2) += fz;
222218
}
223219
}
224220
}
225-
if(this->device == base_device::GpuDevice)
226-
{
227-
syncmem_var_d2h_op()(forcecc.c, force_d, 3 * nat);
228-
}
221+
tau_it_d += 3 * ucell_in.atoms[it].na; // update the start address of each atom type's tau
222+
force_it_d += 3 * ucell_in.atoms[it].na;
229223
}
230-
if (this->device == base_device::GpuDevice)
224+
if(this->device == base_device::GpuDevice)
231225
{
226+
syncmem_var_d2h_op()(forcecc.c, force_d, 3 * nat);
232227
delmem_var_op()(gv_d);
233228
delmem_var_op()(tau_d);
234229
delmem_var_op()(force_d);

source/source_pw/module_pwdft/kernels/cuda/stress_op.cu

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -721,7 +721,7 @@ __global__ void cal_force_npw(
721721
FPTYPE t_force0 = 0;
722722
FPTYPE t_force1 = 0;
723723
FPTYPE t_force2 = 0;
724-
for(int ig = tid; ig<npw;ig += blockDim.x){ {
724+
for(int ig = tid; ig<npw;ig += blockDim.x) {
725725
const thrust::complex<FPTYPE> psiv_conj = conj(psiv[ig]);
726726

727727
const FPTYPE arg = ModuleBase::TWO_PI * (gv[ig * 3] * pos_x + gv[ig * 3 + 1] * pos_y + gv[ig * 3 + 2] * pos_z);
@@ -740,10 +740,15 @@ __global__ void cal_force_npw(
740740
const thrust::complex<FPTYPE> ipol2 = tmp_var * gv[ig * 3 + 2];
741741
t_force2 += ipol2.real();
742742
}
743-
atomicAdd(&force[ia * 3], t_force0);
744-
atomicAdd(&force[ia * 3 + 1], t_force1);
745-
atomicAdd(&force[ia * 3 + 2], t_force2);
746-
}
743+
__syncwarp();
744+
warp_reduce(t_force0);
745+
warp_reduce(t_force1);
746+
warp_reduce(t_force2);
747+
if (threadIdx.x % WARP_SIZE == 0) {
748+
atomicAdd(&force[ia * 3], t_force0);
749+
atomicAdd(&force[ia * 3 + 1], t_force1);
750+
atomicAdd(&force[ia * 3 + 2], t_force2);
751+
}
747752
}
748753

749754
template <typename FPTYPE>

0 commit comments

Comments
 (0)