Add convolution for psi #6432

Open · wants to merge 5 commits into develop
43 changes: 41 additions & 2 deletions source/source_basis/module_pw/pw_basis_k.h
@@ -158,7 +158,6 @@ class PW_Basis_K : public PW_Basis
const int ik,
const bool add = false,
const FPTYPE factor = 1.0) const; // in:(nz, ns) ; out(nplane,nx*ny)

#endif

template <typename FPTYPE, typename Device>
@@ -176,7 +175,6 @@ class PW_Basis_K : public PW_Basis
const bool add = false,
const FPTYPE factor = 1.0) const; // in:(nz, ns) ; out(nplane,nx*ny)


template <typename TK,
typename Device,
typename std::enable_if<std::is_same<Device, base_device::DEVICE_CPU>::value, int>::type = 0>
@@ -245,6 +243,47 @@ class PW_Basis_K : public PW_Basis
{
this->recip2real_gpu(in, out, ik, add, factor);
}
template <typename FPTYPE, typename Device,
typename std::enable_if<std::is_same<Device, base_device::DEVICE_GPU>::value, int>::type = 0>
void convolution(const int ik,
const int size,
const FPTYPE* input,
const typename GetTypeReal<FPTYPE>::type* input1,
FPTYPE* output,
const bool add = false,
const typename GetTypeReal<FPTYPE>::type factor = 1.0) const
{
this->convolution_gpu(ik, size, input, input1, output, add, factor);
}
template <typename FPTYPE, typename Device,
typename std::enable_if<std::is_same<Device, base_device::DEVICE_CPU>::value, int>::type = 0>
void convolution(const int ik,
const int size,
const FPTYPE* input,
const typename GetTypeReal<FPTYPE>::type* input1,
FPTYPE* output,
const bool add = false,
const typename GetTypeReal<FPTYPE>::type factor = 1.0) const
{
this->convolution_cpu(ik, size, input, input1, output, add, factor);
}
template <typename FPTYPE>
void convolution_cpu(const int ik,
const int size,
const std::complex<FPTYPE>* input,
const FPTYPE* input1,
std::complex<FPTYPE>* output,
const bool add = false,
const FPTYPE factor = 1.0) const;

template <typename FPTYPE>
void convolution_gpu(const int ik,
const int size,
const std::complex<FPTYPE>* input,
const FPTYPE* input1,
std::complex<FPTYPE>* output,
const bool add = false,
const FPTYPE factor = 1.0) const;

public:
//operator:
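
For orientation, a minimal CPU-side usage sketch of the new `convolution` dispatch (all names here are hypothetical: `pw` is a constructed PW_Basis_K, `psi_g`/`vpsi_g` are plane-wave-ordered buffers, `veff_r` is a real-space multiplier, and `nrxx` is assumed to be the local real-space grid size):

```cpp
// Sketch only, not part of this PR: apply the real-space function veff_r to
// the wavefunction psi_g at k-point ik, writing the result into vpsi_g.
using complex_d = std::complex<double>;
std::vector<complex_d> psi_g(pw.npwk[ik]);
std::vector<complex_d> vpsi_g(pw.npwk[ik]);
std::vector<double> veff_r(pw.nrxx);
// ... fill psi_g and veff_r ...
pw.convolution<complex_d, base_device::DEVICE_CPU>(
    ik, pw.nrxx, psi_g.data(), veff_r.data(), vpsi_g.data(),
    /*add=*/false, /*factor=*/1.0);
```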
5 changes: 2 additions & 3 deletions source/source_basis/module_pw/pw_basis_sup.cpp
@@ -412,8 +412,7 @@ void PW_Basis_Sup::get_ig2isz_is2fftixy(
{
int z = iz;
if (z < 0) {
z += this->nz;
}
if (!found[ixy * this->nz + z])
{
found[ixy * this->nz + z] = true;
@@ -422,7 +421,7 @@
pw_filled++;
if (xprime && ixy / fftny == 0) {
ng_xeq0++;
}
}
}
}
132 changes: 130 additions & 2 deletions source/source_basis/module_pw/pw_transform_k.cpp
@@ -2,7 +2,7 @@
#include "source_basis/module_pw/kernels/pw_op.h"
#include "pw_basis_k.h"
#include "pw_gatherscatter.h"

#include "source_pw/module_pwdft/kernels/veff_op.h"
#include <cassert>
#include <complex>

@@ -337,6 +337,63 @@ void PW_Basis_K::recip_to_real(const base_device::DEVICE_CPU* /*dev*/,
this->recip2real(in, out, ik, add, factor);
#endif
}
template <typename FPTYPE>
void PW_Basis_K::convolution_cpu(const int ik,
const int size,
const std::complex<FPTYPE>* input,
const FPTYPE* input1,
std::complex<FPTYPE>* output,
const bool add,
const FPTYPE factor) const
{
ModuleBase::timer::tick(this->classname, "convolution");
assert(this->gamma_only == false);
// zero the reciprocal-space stick buffer auxg; its length is nst * nz complex values
auto* auxg = this->fft_bundle.get_auxg_data<FPTYPE>();
auto* auxr = this->fft_bundle.get_auxr_data<FPTYPE>();

memset(auxg, 0, sizeof(std::complex<FPTYPE>) * this->nst * this->nz);
const int startig = ik * this->npwk_max;
const int npwk = this->npwk[ik];

// scatter the plane-wave coefficients onto the (nz, ns) sticks via the igl2isz_k map
#ifdef _OPENMP
#pragma omp parallel for schedule(static, 4096 / sizeof(FPTYPE))
#endif
for (int igl = 0; igl < npwk; ++igl)
{
auxg[this->igl2isz_k[igl + startig]] = input[igl];
}

// use 3d fft backward
this->fft_bundle.fftzbac(auxg, auxg);

this->gathers_scatterp(auxg, auxr);

this->fft_bundle.fftxybac(auxr, auxr);
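// pointwise multiply by the real-space factor input1 on the local grid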
for (int ir = 0; ir < size; ir++)
{
auxr[ir] *= input1[ir];
}

// 3d fft
this->fft_bundle.fftxyfor(auxr, auxr);

this->gatherp_scatters(auxr, auxg);

this->fft_bundle.fftzfor(auxg, auxg);
// gather the result from the sticks back to plane-wave order,
// normalizing by nxyz and honouring the add flag
const FPTYPE tmpfac = factor / FPTYPE(this->nxyz);
#ifdef _OPENMP
#pragma omp parallel for schedule(static, 4096 / sizeof(FPTYPE))
#endif
for (int igl = 0; igl < npwk; ++igl)
{
    if (add)
    {
        output[igl] += tmpfac * auxg[this->igl2isz_k[igl + startig]];
    }
    else
    {
        output[igl] = tmpfac * auxg[this->igl2isz_k[igl + startig]];
    }
}
ModuleBase::timer::tick(this->classname, "convolution");
}
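
Semantically, a sketch of what both the CPU and GPU paths compute (ignoring the MPI-distributed stick/plane layout and the `add` branch):

$$
\text{output}(\mathbf{G}) = \frac{f}{N_{xyz}} \sum_{\mathbf{r}} e^{-i\mathbf{G}\cdot\mathbf{r}}\, v(\mathbf{r}) \sum_{\mathbf{G}'} e^{i\mathbf{G}'\cdot\mathbf{r}}\, \text{input}(\mathbf{G}') \;=\; f \sum_{\mathbf{G}'} \tilde{v}(\mathbf{G}-\mathbf{G}')\, \text{input}(\mathbf{G}'),
$$

where $f$ is `factor`, $v$ is `input1` on the real-space grid, and $\tilde{v}$ is its discrete Fourier transform normalized by $1/N_{xyz}$. The routine is thus a reciprocal-space convolution with $\tilde{v}$, which is what the method name refers to.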

#if (defined(__CUDA) || defined(__ROCM))
template <>
@@ -534,6 +591,50 @@ void PW_Basis_K::recip2real_gpu(const std::complex<FPTYPE>* in,
ModuleBase::timer::tick(this->classname, "recip_to_real gpu");
}

template <typename FPTYPE>
void PW_Basis_K::convolution_gpu(const int ik,
const int size,
const std::complex<FPTYPE>* input,
const FPTYPE* input1,
std::complex<FPTYPE>* output,
const bool add,
const FPTYPE factor) const
{
ModuleBase::timer::tick(this->classname, "convolution");

assert(this->gamma_only == false);
const base_device::DEVICE_GPU* gpux = nullptr; // device tag only; not dereferenced by the ops below

// zero the 3-D FFT buffer auxr; its length is nxyz complex values
auto* auxr = this->fft_bundle.get_auxr_3d_data<FPTYPE>();
base_device::memory::set_memory_op<std::complex<FPTYPE>, base_device::DEVICE_GPU>()(auxr, 0, this->nxyz);
const int startig = ik * this->npwk_max;
const int npw_k = this->npwk[ik];

// scatter the plane-wave coefficients into the 3-D FFT box via the ig2ixyz_k map
set_3d_fft_box_op<FPTYPE, base_device::DEVICE_GPU>()(npw_k, this->ig2ixyz_k + startig, input, auxr);

// use 3d fft backward
this->fft_bundle.fft3D_backward(auxr, auxr);

hamilt::veff_pw_op<FPTYPE, base_device::DEVICE_GPU>()(gpux, size, auxr, input1);

// 3d fft
this->fft_bundle.fft3D_forward(auxr, auxr);
// gather the result back to plane-wave order; add and factor are applied inside the op
set_real_to_recip_output_op<FPTYPE, base_device::DEVICE_GPU>()(npw_k,
this->nxyz,
add,
factor,
this->ig2ixyz_k + startig,
auxr,
output);
ModuleBase::timer::tick(this->classname, "convolution");
}

template void PW_Basis_K::real2recip_gpu<float>(const std::complex<float>*,
std::complex<float>*,
const int,
@@ -557,8 +658,35 @@ template void PW_Basis_K::recip2real_gpu<double>(const std::complex<double>*,
const int,
const bool,
const double) const;

template void PW_Basis_K::convolution_gpu<float>(const int ik,
const int size,
const std::complex<float>* input,
const float* input1,
std::complex<float>* output,
const bool add,
const float factor) const;
template void PW_Basis_K::convolution_gpu<double>(const int ik,
const int size,
const std::complex<double>* input,
const double* input1,
std::complex<double>* output,
const bool add,
const double factor) const;
#endif
template void PW_Basis_K::convolution_cpu<float>(const int ik,
const int size,
const std::complex<float>* input,
const float* input1,
std::complex<float>* output,
const bool add,
const float factor) const;
template void PW_Basis_K::convolution_cpu<double>(const int ik,
const int size,
const std::complex<double>* input,
const double* input1,
std::complex<double>* output,
const bool add,
const double factor) const;

template void PW_Basis_K::real2recip<float>(const float* in,
std::complex<float>* out,
6 changes: 5 additions & 1 deletion source/source_io/write_vxc_lip.hpp
@@ -160,7 +160,11 @@ namespace ModuleIO
{
// 2.1 local xc
vxcs_op_pw = new hamilt::Veff<hamilt::OperatorPW<T>>(kv.isk.data(),
potxc->get_veff_smooth_data<FPTYPE>(),
PARAM.inp.nspin,
potxc->get_veff_smooth().nr,
potxc->get_veff_smooth().nc,
&wfc_basis);
vxcs_op_pw->init(ik); // set k-point index
psi_pw.fix_k(ik);
hpsi_localxc.fix_k(ik);
1 change: 1 addition & 0 deletions source/source_pw/module_pwdft/hamilt_pw.cpp
@@ -80,6 +80,7 @@ HamiltPW<T, Device>::HamiltPW(elecstate::Potential* pot_in,
pot_in->pot_register(pot_register_in);
Operator<T, Device>* veff = new Veff<OperatorPW<T, Device>>(isk,
pot_in->get_veff_smooth_data<Real>(),
PARAM.inp.nspin,
pot_in->get_veff_smooth().nr,
pot_in->get_veff_smooth().nc,
wfc_basis);
50 changes: 41 additions & 9 deletions source/source_pw/module_pwdft/kernels/cuda/veff_op.cu
@@ -26,20 +26,50 @@ __global__ void veff_pw(
const int size,
thrust::complex<FPTYPE>* out,
thrust::complex<FPTYPE>* out1,
thrust::complex<FPTYPE>* in)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if(idx >= size) {return;}
const int base = idx * 4;
thrust::complex<FPTYPE> sup = out[idx] * in[base] + out1[idx] * in[base + 1];
thrust::complex<FPTYPE> sdown = out1[idx] * in[base + 2] + out[idx] * in[base + 3];
out[idx] = sup;
out1[idx] = sdown;
}

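// Pack four real arrays, stored as four consecutive planes of length `size`
// (e.g. the nspin = 4 local potential components v0..v3), into the interleaved
// complex layout consumed by veff_pw above:
//   out[4i] = v0 + v3, out[4i+1] = v1 - i*v2,
//   out[4i+2] = v0 - v3, out[4i+3] = v1 + i*v2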
template <typename FPTYPE>
__global__ void rearrange_op(
const int size,
const FPTYPE* in,
thrust::complex<FPTYPE>* out)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if(idx >= size) {return;}
const int base = idx * 4;
const FPTYPE part_1 = in[idx];
const FPTYPE part_2 = in[idx + size];
const FPTYPE part_3 = in[idx + 2 * size];
const FPTYPE part_4 = in[idx + 3 * size];
out[base] = thrust::complex<FPTYPE>(part_1 + part_4, 0.0);
out[base + 1] = thrust::complex<FPTYPE>(part_2, -part_3);
out[base + 2] = thrust::complex<FPTYPE>(part_1 - part_4, 0.0);
out[base + 3] = thrust::complex<FPTYPE>(part_2, part_3);
}
template <typename FPTYPE>
void rearrange<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* device,
const int& size,
const FPTYPE* in,
std::complex<FPTYPE>* out) const
{
const int block = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
rearrange_op<FPTYPE><<<block, THREADS_PER_BLOCK>>>(
size, // control params
in, // array of data
reinterpret_cast<thrust::complex<FPTYPE>*>(out)); // array of data
cudaCheckOnDebug();
}
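
A sketch of how the two functors presumably compose on the GPU (the buffer names `d_veff`, `d_packed`, `d_up`, `d_down` and the context pointer `gpu_ctx` are hypothetical):

```cpp
// Sketch only, not part of this PR: d_veff holds the four potential planes,
// d_packed is a device buffer of 4 * size complex values, and d_up / d_down
// are the two spinor components of the wavefunction on the real-space grid.
rearrange<double, base_device::DEVICE_GPU>()(gpu_ctx, size, d_veff, d_packed);
veff_pw_op<double, base_device::DEVICE_GPU>()(gpu_ctx, size, d_up, d_down, d_packed);
```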

template <typename FPTYPE>
void veff_pw_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* dev,
const int& size,
@@ -60,18 +90,20 @@ void veff_pw_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::
const int& size,
std::complex<FPTYPE>* out,
std::complex<FPTYPE>* out1,
std::complex<FPTYPE>* in)
{
const int block = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
veff_pw<FPTYPE><<<block, THREADS_PER_BLOCK>>>(
size, // control params
reinterpret_cast<thrust::complex<FPTYPE>*>(out), // array of data
reinterpret_cast<thrust::complex<FPTYPE>*>(out1), // array of data
reinterpret_cast<thrust::complex<FPTYPE>*>(in)); // array of data

cudaCheckOnDebug();
}

template struct rearrange<float, base_device::DEVICE_GPU>;
template struct rearrange<double, base_device::DEVICE_GPU>;
template struct veff_pw_op<float, base_device::DEVICE_GPU>;
template struct veff_pw_op<double, base_device::DEVICE_GPU>;
