Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 65 additions & 53 deletions source/source_pw/module_pwdft/forces_cc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,44 +108,48 @@ void Forces<FPTYPE, Device>::cal_force_cc(ModuleBase::matrix& forcecc,
double* rhocg = new double[rho_basis->ngg];
ModuleBase::GlobalFunc::ZEROS(rhocg, rho_basis->ngg);

std::vector<double> gv_x(rho_basis->npw);
std::vector<double> gv_y(rho_basis->npw);
std::vector<double> gv_z(rho_basis->npw);
std::vector<double> gv_h(3 * rho_basis->npw);
std::vector<double> tau_h(3 * this->nat);
std::vector<double> rhocgigg_vec(rho_basis->npw);
double *gv_x_d = nullptr;
double *gv_y_d = nullptr;
double *gv_z_d = nullptr;
double *gv_d = nullptr;
double *tau_d = nullptr;
double *force_d = nullptr;
double *rhocgigg_vec_d = nullptr;
std::complex<FPTYPE>* psiv_d = nullptr;
this->device = base_device::get_device_type<Device>(this->ctx);


#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int ig = 0; ig < rho_basis->npw; ig++)
{
gv_x[ig] = rho_basis->gcar[ig].x;
gv_y[ig] = rho_basis->gcar[ig].y;
gv_z[ig] = rho_basis->gcar[ig].z;
gv_h[3 * ig] = rho_basis->gcar[ig].x;
gv_h[3 * ig + 1] = rho_basis->gcar[ig].y;
gv_h[3 * ig + 2] = rho_basis->gcar[ig].z;
}

for (int iat = 0; iat < this->nat; iat++)
{
int it = ucell_in.iat2it[iat];
int ia = ucell_in.iat2ia[iat];
tau_h[iat * 3] = ucell_in.atoms[it].tau[ia].x;
tau_h[iat * 3 + 1] = ucell_in.atoms[it].tau[ia].y;
tau_h[iat * 3 + 2] = ucell_in.atoms[it].tau[ia].z;
}

if(this->device == base_device::GpuDevice ) {
resmem_var_op()(gv_x_d, rho_basis->npw);
resmem_var_op()(gv_y_d, rho_basis->npw);
resmem_var_op()(gv_z_d, rho_basis->npw);
resmem_var_op()(gv_d, rho_basis->npw * 3);
resmem_var_op()(tau_d, this->nat * 3);
resmem_var_op()(rhocgigg_vec_d, rho_basis->npw);
resmem_complex_op()(psiv_d, rho_basis->nmaxgr);
resmem_var_op()(force_d, 3);
resmem_var_op()(force_d, 3 * this->nat);

syncmem_var_h2d_op()(gv_x_d, gv_x.data(), rho_basis->npw);
syncmem_var_h2d_op()(gv_y_d, gv_y.data(), rho_basis->npw);
syncmem_var_h2d_op()(gv_z_d, gv_z.data(), rho_basis->npw);
syncmem_var_h2d_op()(gv_d, gv_h.data(), rho_basis->npw * 3);
syncmem_var_h2d_op()(tau_d, tau_h.data(), this->nat * 3);
syncmem_complex_h2d_op()(psiv_d, psiv, rho_basis->nmaxgr);
syncmem_var_h2d_op()(force_d, forcecc.c, 3 * this->nat);
}


double* tau_it_d = tau_d; // the start address of each atom type's tau
double* force_it_d = force_d;
for (int it = 0; it < ucell_in.ntype; ++it)
{
if (ucell_in.atoms[it].ncpp.nlcc)
Expand All @@ -166,10 +170,7 @@ void Forces<FPTYPE, Device>::cal_force_cc(ModuleBase::matrix& forcecc,
rho_basis,
1,
ucell_in);

#ifdef _OPENMP
#pragma omp parallel for
#endif

for (int ig = 0; ig < rho_basis->npw; ig++)
{
rhocgigg_vec[ig] = rhocg[rho_basis->ig2igg[ig]];
Expand All @@ -178,42 +179,53 @@ void Forces<FPTYPE, Device>::cal_force_cc(ModuleBase::matrix& forcecc,
if(this->device == base_device::GpuDevice ) {
syncmem_var_h2d_op()(rhocgigg_vec_d, rhocgigg_vec.data(), rho_basis->npw);
}
for (int ia = 0; ia < ucell_in.atoms[it].na; ++ia)
{
const ModuleBase::Vector3<double> pos = ucell_in.atoms[it].tau[ia];
// get iat form table
int iat = ucell_in.itia2iat(it, ia);
double force[3] = {0, 0, 0};

if(this->device == base_device::GpuDevice ) {
syncmem_var_h2d_op()(force_d, force, 3);
hamilt::cal_force_npw_op<FPTYPE, Device>()(
psiv_d, gv_x_d, gv_y_d, gv_z_d, rhocgigg_vec_d, force_d, pos.x, pos.y, pos.z,
rho_basis->npw, ucell_in.omega, ucell_in.tpiba
);
syncmem_var_d2h_op()(force, force_d, 3);

} else {
hamilt::cal_force_npw_op<FPTYPE, Device>()(
psiv, gv_x.data(), gv_y.data(), gv_z.data(), rhocgigg_vec.data(), force, pos.x, pos.y, pos.z,
rho_basis->npw, ucell_in.omega, ucell_in.tpiba
);
}

if(this->device == base_device::GpuDevice ) {
hamilt::cal_force_npw_op<FPTYPE, Device>()(
psiv_d, gv_d, rhocgigg_vec_d, force_it_d, tau_it_d,
rho_basis->npw, ucell_in.omega, ucell_in.tpiba, ucell_in.atoms[it].na
);
} else {
#pragma omp for
for(int ia = 0; ia < ucell_in.atoms[it].na; ia++)
{
forcecc(iat, 0) += force[0];
forcecc(iat, 1) += force[1];
forcecc(iat, 2) += force[2];
double fx = 0.0, fy = 0.0, fz = 0.0;
int iat = ucell_in.itia2iat(it, ia);
for (int ig = 0; ig < rho_basis->npw; ig++)
{
const std::complex<double> psiv_conj = conj(psiv[ig]);

const double arg = ModuleBase::TWO_PI * (gv_h[ig * 3] * tau_h[iat * 3]
+ gv_h[ig * 3 + 1] * tau_h[iat * 3 + 1] + gv_h[ig * 3 + 2] * tau_h[iat * 3 + 2]);
double sinp, cosp;
ModuleBase::libm::sincos(arg, &sinp, &cosp);
const std::complex<double> expiarg = std::complex<double>(sinp, cosp);

const std::complex<double> tmp_var = psiv_conj * expiarg * ucell_in.tpiba * ucell_in.omega * rhocgigg_vec[ig];

const std::complex<double> ipol0 = tmp_var * gv_h[ig * 3];
fx += ipol0.real();

const std::complex<double> ipol1 = tmp_var * gv_h[ig * 3 + 1];
fy += ipol1.real();

const std::complex<double> ipol2 = tmp_var * gv_h[ig * 3 + 2];
fz += ipol2.real();
}
forcecc(iat, 0) += fx;
forcecc(iat, 1) += fy;
forcecc(iat, 2) += fz;
}
}

}
tau_it_d += 3 * ucell_in.atoms[it].na; // update the start address of each atom type's tau
force_it_d += 3 * ucell_in.atoms[it].na;
}
if (this->device == base_device::GpuDevice)
if(this->device == base_device::GpuDevice)
{
delmem_var_op()(gv_x_d);
delmem_var_op()(gv_y_d);
delmem_var_op()(gv_z_d);
syncmem_var_d2h_op()(forcecc.c, force_d, 3 * nat);
delmem_var_op()(gv_d);
delmem_var_op()(tau_d);
delmem_var_op()(force_d);
delmem_var_op()(rhocgigg_vec_d);
delmem_complex_op()(psiv_d);
Expand Down
58 changes: 31 additions & 27 deletions source/source_pw/module_pwdft/kernels/cuda/stress_op.cu
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
#include "source_pw/module_pwdft/kernels/stress_op.h"
#include "source_base/constants.h"
#include "source_base/module_device/device.h"
#include "vnl_tools_cu.hpp"
#include "source_base/module_device/types.h"

#include <complex>
#include <thrust/complex.h>
#include <base/macros/macros.h>
#include <source_base/module_device/device.h>

#include <cuda_runtime.h>

Expand Down Expand Up @@ -703,43 +704,51 @@ __global__ void cal_stress_drhoc_aux3(
template <typename FPTYPE>
__global__ void cal_force_npw(
const thrust::complex<FPTYPE> *psiv,
const FPTYPE* gv_x, const FPTYPE* gv_y, const FPTYPE* gv_z,
const FPTYPE* gv,
const FPTYPE* rhocgigg_vec,
FPTYPE* force,
const FPTYPE pos_x, const FPTYPE pos_y, const FPTYPE pos_z,
const FPTYPE* tau,
const int npw,
const FPTYPE omega, const FPTYPE tpiba
){
const double TWO_PI = 2.0 * 3.14159265358979323846;
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int begin_idx = tid * 1024;
if(begin_idx > npw) return;
int ia = blockIdx.x;
int tid = threadIdx.x;
if(tid > npw) return;

FPTYPE pos_x = tau[ia * 3];
FPTYPE pos_y = tau[ia * 3 + 1];
FPTYPE pos_z = tau[ia * 3 + 2];
FPTYPE t_force0 = 0;
FPTYPE t_force1 = 0;
FPTYPE t_force2 = 0;
for(int ig = begin_idx; ig<begin_idx+1024 && ig<npw;ig++) {
for(int ig = tid; ig<npw;ig += blockDim.x) {
const thrust::complex<FPTYPE> psiv_conj = conj(psiv[ig]);

const FPTYPE arg = TWO_PI * (gv_x[ig] * pos_x + gv_y[ig] * pos_y + gv_z[ig] * pos_z);
const FPTYPE sinp = sin(arg);
const FPTYPE cosp = cos(arg);
const FPTYPE arg = ModuleBase::TWO_PI * (gv[ig * 3] * pos_x + gv[ig * 3 + 1] * pos_y + gv[ig * 3 + 2] * pos_z);
FPTYPE sinp, cosp;
sincos(arg, &sinp, &cosp);
const thrust::complex<FPTYPE> expiarg = thrust::complex<FPTYPE>(sinp, cosp);

const thrust::complex<FPTYPE> tmp_var = psiv_conj * expiarg * tpiba * omega * rhocgigg_vec[ig];

const thrust::complex<FPTYPE> ipol0 = tmp_var * gv_x[ig];
const thrust::complex<FPTYPE> ipol0 = tmp_var * gv[ig * 3];
t_force0 += ipol0.real();

const thrust::complex<FPTYPE> ipol1 = tmp_var * gv_y[ig];
const thrust::complex<FPTYPE> ipol1 = tmp_var * gv[ig * 3 + 1];
t_force1 += ipol1.real();

const thrust::complex<FPTYPE> ipol2 = tmp_var * gv_z[ig];
const thrust::complex<FPTYPE> ipol2 = tmp_var * gv[ig * 3 + 2];
t_force2 += ipol2.real();
}
atomicAdd(&force[0], t_force0);
atomicAdd(&force[1], t_force1);
atomicAdd(&force[2], t_force2);
__syncwarp();
warp_reduce(t_force0);
warp_reduce(t_force1);
warp_reduce(t_force2);
if (threadIdx.x % WARP_SIZE == 0) {
atomicAdd(&force[ia * 3], t_force0);
atomicAdd(&force[ia * 3 + 1], t_force1);
atomicAdd(&force[ia * 3 + 2], t_force2);
}
}

template <typename FPTYPE>
Expand Down Expand Up @@ -880,22 +889,17 @@ void cal_stress_drhoc_aux_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
template <typename FPTYPE>
void cal_force_npw_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
const std::complex<FPTYPE> *psiv,
const FPTYPE* gv_x, const FPTYPE* gv_y, const FPTYPE* gv_z,
const FPTYPE* gv,
const FPTYPE* rhocgigg_vec,
FPTYPE* force,
const FPTYPE pos_x, const FPTYPE pos_y, const FPTYPE pos_z,
const FPTYPE* tau,
const int npw,
const FPTYPE omega, const FPTYPE tpiba
const FPTYPE omega, const FPTYPE tpiba, const int na
)
{
// Divide the npw size range into blocksize 1024 blocks
int t_size = 1024;
int t_num = (npw%t_size) ? (npw/t_size + 1) : (npw/t_size);
dim3 npwgrid(((t_num%THREADS_PER_BLOCK) ? (t_num/THREADS_PER_BLOCK + 1) : (t_num/THREADS_PER_BLOCK)));

cal_force_npw <<< npwgrid, THREADS_PER_BLOCK >>> (
cal_force_npw <<<na, THREADS_PER_BLOCK >>> (
reinterpret_cast<const thrust::complex<FPTYPE>*>(psiv),
gv_x, gv_y, gv_z, rhocgigg_vec, force, pos_x, pos_y, pos_z,
gv, rhocgigg_vec, force, tau,
npw, omega, tpiba
);
return ;
Expand Down
16 changes: 5 additions & 11 deletions source/source_pw/module_pwdft/kernels/rocm/stress_op.hip.cu
Original file line number Diff line number Diff line change
Expand Up @@ -844,23 +844,17 @@ void cal_stress_drhoc_aux_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
template <typename FPTYPE>
void cal_force_npw_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
const std::complex<FPTYPE> *psiv,
const FPTYPE* gv_x, const FPTYPE* gv_y, const FPTYPE* gv_z,
const FPTYPE* gv,
const FPTYPE* rhocgigg_vec,
FPTYPE* force,
const FPTYPE pos_x, const FPTYPE pos_y, const FPTYPE pos_z,
const FPTYPE* tau,
const int npw,
const FPTYPE omega, const FPTYPE tpiba
const FPTYPE omega, const FPTYPE tpiba, const int na
)
{
int t_size = 1024;
int t_num = (npw%t_size) ? (npw/t_size + 1) : (npw/t_size);

dim3 npwgrid(((t_num%THREADS_PER_BLOCK) ? (t_num/THREADS_PER_BLOCK + 1) : (t_num/THREADS_PER_BLOCK)));


hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_force_npw<FPTYPE>), npwgrid, THREADS_PER_BLOCK,0,0,
hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_force_npw<FPTYPE>), na, THREADS_PER_BLOCK,0,0,
reinterpret_cast<const thrust::complex<FPTYPE>*>(psiv),
gv_x, gv_y, gv_z, rhocgigg_vec, force, pos_x, pos_y, pos_z,
gv, rhocgigg_vec, force, tau,
npw, omega, tpiba
);

Expand Down
46 changes: 0 additions & 46 deletions source/source_pw/module_pwdft/kernels/stress_op.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -629,49 +629,6 @@ struct cal_stress_drhoc_aux_op<FPTYPE, base_device::DEVICE_CPU> {
}
};


template <typename FPTYPE>
struct cal_force_npw_op<FPTYPE, base_device::DEVICE_CPU> {
void operator()(const std::complex<FPTYPE>* psiv,
const FPTYPE* gv_x,
const FPTYPE* gv_y,
const FPTYPE* gv_z,
const FPTYPE* rhocgigg_vec,
FPTYPE* force,
const FPTYPE pos_x,
const FPTYPE pos_y,
const FPTYPE pos_z,
const int npw,
const FPTYPE omega,
const FPTYPE tpiba)
{

#ifdef _OPENMP
#pragma omp for nowait
#endif
for (int ig = 0; ig < npw; ig++)
{
const std::complex<FPTYPE> psiv_conj = conj(psiv[ig]);

const FPTYPE arg = ModuleBase::TWO_PI * (gv_x[ig] * pos_x + gv_y[ig] * pos_y + gv_z[ig] * pos_z);
FPTYPE sinp, cosp;
ModuleBase::libm::sincos(arg, &sinp, &cosp);
const std::complex<FPTYPE> expiarg = std::complex<FPTYPE>(sinp, cosp);

const std::complex<FPTYPE> tmp_var = psiv_conj * expiarg * tpiba * omega * rhocgigg_vec[ig];

const std::complex<FPTYPE> ipol0 = tmp_var * gv_x[ig];
force[0] += ipol0.real();

const std::complex<FPTYPE> ipol1 = tmp_var * gv_y[ig];
force[1] += ipol1.real();

const std::complex<FPTYPE> ipol2 = tmp_var * gv_z[ig];
force[2] += ipol2.real();
}
}
};

template <typename FPTYPE>
struct cal_multi_dot_op<FPTYPE, base_device::DEVICE_CPU> {
FPTYPE operator()(const int& npw,
Expand Down Expand Up @@ -768,9 +725,6 @@ template struct cal_vq_deri_op<double, base_device::DEVICE_CPU>;
template struct cal_stress_drhoc_aux_op<float, base_device::DEVICE_CPU>;
template struct cal_stress_drhoc_aux_op<double, base_device::DEVICE_CPU>;

template struct cal_force_npw_op<float, base_device::DEVICE_CPU>;
template struct cal_force_npw_op<double, base_device::DEVICE_CPU>;

template struct cal_multi_dot_op<float, base_device::DEVICE_CPU>;
template struct cal_multi_dot_op<double, base_device::DEVICE_CPU>;

Expand Down
14 changes: 7 additions & 7 deletions source/source_pw/module_pwdft/kernels/stress_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -241,13 +241,13 @@ struct cal_stress_drhoc_aux_op{
template <typename FPTYPE, typename Device>
struct cal_force_npw_op{
void operator()(const std::complex<FPTYPE> *psiv,
const FPTYPE* gv_x, const FPTYPE* gv_y, const FPTYPE* gv_z,
const FPTYPE* gv,
const FPTYPE* rhocgigg_vec,
FPTYPE* force,
const FPTYPE pos_x, const FPTYPE pos_y, const FPTYPE pos_xz,
const FPTYPE* tau_x,
const int npw,
const FPTYPE omega, const FPTYPE tpiba
);
const FPTYPE omega, const FPTYPE tpiba, const int na
) {}
};

template <typename FPTYPE, typename Device>
Expand Down Expand Up @@ -480,12 +480,12 @@ struct cal_stress_drhoc_aux_op<FPTYPE, base_device::DEVICE_GPU>{
template <typename FPTYPE>
struct cal_force_npw_op<FPTYPE, base_device::DEVICE_GPU>{
void operator()(const std::complex<FPTYPE> *psiv,
const FPTYPE* gv_x, const FPTYPE* gv_y, const FPTYPE* gv_z,
const FPTYPE* gv,
const FPTYPE* rhocgigg_vec,
FPTYPE* force,
const FPTYPE pos_x, const FPTYPE pos_y, const FPTYPE pos_xz,
const FPTYPE* tau,
const int npw,
const FPTYPE omega, const FPTYPE tpiba
const FPTYPE omega, const FPTYPE tpiba, const int na
);
};

Expand Down
Loading