diff --git a/source/source_basis/module_pw/pw_basis_k.h b/source/source_basis/module_pw/pw_basis_k.h index b87da9ca0f..83b4d7722d 100644 --- a/source/source_basis/module_pw/pw_basis_k.h +++ b/source/source_basis/module_pw/pw_basis_k.h @@ -158,7 +158,6 @@ class PW_Basis_K : public PW_Basis const int ik, const bool add = false, const FPTYPE factor = 1.0) const; // in:(nz, ns) ; out(nplane,nx*ny) - #endif template @@ -176,7 +175,6 @@ class PW_Basis_K : public PW_Basis const bool add = false, const FPTYPE factor = 1.0) const; // in:(nz, ns) ; out(nplane,nx*ny) - template ::value, int>::type = 0> @@ -245,6 +243,47 @@ class PW_Basis_K : public PW_Basis { this->recip2real_gpu(in, out, ik, add, factor); } + template ::value, int>::type = 0> + void convolution(const int ik, + const int size, + const FPTYPE* input, + const typename GetTypeReal::type* input1, + FPTYPE* output, + const bool add = false, + const typename GetTypeReal::type factor =1.0) const + { + this->convolution_gpu(ik, size, input, input1, output, add, factor); + } + template ::value, int>::type = 0> + void convolution(const int ik, + const int size, + const FPTYPE* input, + const typename GetTypeReal::type* input1, + FPTYPE* output, + const bool add = false, + const typename GetTypeReal::type factor =1.0) const + { + this->convolution_cpu(ik, size, input, input1, output, add, factor); + } + template + void convolution_cpu(const int ik, + const int size, + const std::complex* input, + const FPTYPE* input1, + std::complex* output, + const bool add = false, + const FPTYPE factor = 1.0) const; + + template + void convolution_gpu(const int ik, + const int size, + const std::complex* input, + const FPTYPE* input1, + std::complex* output, + const bool add = false, + const FPTYPE factor = 1.0) const; public: //operator: diff --git a/source/source_basis/module_pw/pw_basis_sup.cpp b/source/source_basis/module_pw/pw_basis_sup.cpp index d88345016a..17bdfa594a 100644 --- a/source/source_basis/module_pw/pw_basis_sup.cpp +++ b/source/source_basis/module_pw/pw_basis_sup.cpp @@ -412,8 +412,7 @@ void PW_Basis_Sup::get_ig2isz_is2fftixy( { int z = iz; if (z < 0) { - z += this->nz; -} + z += this->nz;} if (!found[ixy * this->nz + z]) { found[ixy * this->nz + z] = true; @@ -422,7 +421,7 @@ void PW_Basis_Sup::get_ig2isz_is2fftixy( pw_filled++; if (xprime && ixy / fftny == 0) { ng_xeq0++; -} + } } } } diff --git a/source/source_basis/module_pw/pw_transform_k.cpp b/source/source_basis/module_pw/pw_transform_k.cpp index 36290d091a..f5faf70fa9 100644 --- a/source/source_basis/module_pw/pw_transform_k.cpp +++ b/source/source_basis/module_pw/pw_transform_k.cpp @@ -2,7 +2,7 @@ #include "source_basis/module_pw/kernels/pw_op.h" #include "pw_basis_k.h" #include "pw_gatherscatter.h" - +#include "source_pw/module_pwdft/kernels/veff_op.h" #include #include @@ -337,6 +337,63 @@ void PW_Basis_K::recip_to_real(const base_device::DEVICE_CPU* /*dev*/, this->recip2real(in, out, ik, add, factor); #endif } +template +void PW_Basis_K::convolution_cpu(const int ik, + const int size, + const std::complex* input, + const FPTYPE* input1, + std::complex* output, + const bool add, + const FPTYPE factor) const +{ + ModuleBase::timer::tick(this->classname, "convolution"); + assert(this->gamma_only == false); + // ModuleBase::GlobalFunc::ZEROS(fft_bundle.get_auxg_data(), this->nst * this->nz); + // memset the auxr of 0 in the auxr,here the len of the auxr is nxyz + auto* auxg = this->fft_bundle.get_auxg_data(); + auto* auxr=this->fft_bundle.get_auxr_data(); + + memset(auxg, 0, this->nst * this->nz * 2 * 8); + const int startig = ik * this->npwk_max; + const int npwk = this->npwk[ik]; + + // copy the mapping form the type of stick to the 3dfft + #ifdef _OPENMP + #pragma omp parallel for schedule(static, 4096 / sizeof(FPTYPE)) + #endif + for (int igl = 0; igl < npwk; ++igl) + { + auxg[this->igl2isz_k[igl + startig]] = input[igl]; + } + + // use 3d fft backward + this->fft_bundle.fftzbac(auxg, auxg); + + this->gathers_scatterp(auxg, auxr); + + this->fft_bundle.fftxybac(auxr, auxr); + for (int ir = 0; ir < size; ir++) + { + auxr[ir] *= input1[ir]; + } + + // 3d fft + this->fft_bundle.fftxyfor(auxr, auxr); + + this->gatherp_scatters(auxr, auxg); + + this->fft_bundle.fftzfor(auxg, auxg); + // copy the result from the auxr to the out ,while consider the add + FPTYPE tmpfac = factor / FPTYPE(this->nxyz); +#ifdef _OPENMP +#pragma omp parallel for schedule(static, 4096 / sizeof(FPTYPE)) +#endif + for (int igl = 0; igl < npwk; ++igl) + { + output[igl] += tmpfac * auxg[this->igl2isz_k[igl + startig]]; + } + ModuleBase::timer::tick(this->classname, "convolution"); +} #if (defined(__CUDA) || defined(__ROCM)) template <> @@ -534,6 +591,50 @@ void PW_Basis_K::recip2real_gpu(const std::complex* in, ModuleBase::timer::tick(this->classname, "recip_to_real gpu"); } +template +void PW_Basis_K::convolution_gpu(const int ik, + const int size, + const std::complex* input, + const FPTYPE* input1, + std::complex* output, + const bool add, + const FPTYPE factor) const +{ + ModuleBase::timer::tick(this->classname, "convolution"); + + assert(this->gamma_only == false); + const base_device::DEVICE_GPU* gpux; + // memset the auxr of 0 in the auxr,here the len of the auxr is nxyz + + base_device::memory::set_memory_op, base_device::DEVICE_GPU>()( + this->fft_bundle.get_auxr_3d_data(), + 0, + this->nxyz); + auto* auxr = this->fft_bundle.get_auxr_3d_data(); + const int startig = ik * this->npwk_max; + const int npw_k = this->npwk[ik]; + + // copy the mapping form the type of stick to the 3dfft + set_3d_fft_box_op()(npw_k, this->ig2ixyz_k + startig, input, auxr); + + // use 3d fft backward + this->fft_bundle.fft3D_backward(auxr, auxr); + + hamilt::veff_pw_op()(gpux,size,auxr,input1); + + // 3d fft + this->fft_bundle.fft3D_forward(auxr, auxr); + // copy the result from the auxr to the out ,while consider the add + set_real_to_recip_output_op()(npw_k, + this->nxyz, + add, + factor, + this->ig2ixyz_k + startig, + auxr, + output); + ModuleBase::timer::tick(this->classname, "convolution"); + } + template void PW_Basis_K::real2recip_gpu(const std::complex*, std::complex*, const int, @@ -557,8 +658,35 @@ template void PW_Basis_K::recip2real_gpu(const std::complex*, const int, const bool, const double) const; - +template void PW_Basis_K::convolution_gpu(const int ik, + const int size, + const std::complex* input, + const float* input1, + std::complex* output, + const bool add, + const float factor) const; +template void PW_Basis_K::convolution_gpu(const int ik, + const int size, + const std::complex* input, + const double* input1, + std::complex* output, + const bool add, + const double factor) const; #endif +template void PW_Basis_K::convolution_cpu(const int ik, + const int size, + const std::complex* input, + const float* input1, + std::complex* output, + const bool add, + const float factor) const; +template void PW_Basis_K::convolution_cpu(const int ik, + const int size, + const std::complex* input, + const double* input1, + std::complex* output, + const bool add, + const double factor) const; template void PW_Basis_K::real2recip(const float* in, std::complex* out, diff --git a/source/source_io/write_vxc_lip.hpp b/source/source_io/write_vxc_lip.hpp index 3705993022..308dca2b23 100644 --- a/source/source_io/write_vxc_lip.hpp +++ b/source/source_io/write_vxc_lip.hpp @@ -160,7 +160,11 @@ namespace ModuleIO { // 2.1 local xc vxcs_op_pw = new hamilt::Veff>(kv.isk.data(), - potxc->get_veff_smooth_data(), potxc->get_veff_smooth().nr, potxc->get_veff_smooth().nc, &wfc_basis); + potxc->get_veff_smooth_data(), + PARAM.inp.nspin, + potxc->get_veff_smooth().nr, + potxc->get_veff_smooth().nc, + &wfc_basis); vxcs_op_pw->init(ik); // set k-point index psi_pw.fix_k(ik); hpsi_localxc.fix_k(ik); diff --git a/source/source_pw/module_pwdft/hamilt_pw.cpp b/source/source_pw/module_pwdft/hamilt_pw.cpp index 86782ed665..f095557637 100644 --- a/source/source_pw/module_pwdft/hamilt_pw.cpp +++ b/source/source_pw/module_pwdft/hamilt_pw.cpp @@ -80,6 +80,7 @@ HamiltPW::HamiltPW(elecstate::Potential* pot_in, pot_in->pot_register(pot_register_in); Operator* veff = new Veff>(isk, pot_in->get_veff_smooth_data(), + PARAM.inp.nspin, pot_in->get_veff_smooth().nr, pot_in->get_veff_smooth().nc, wfc_basis); diff --git a/source/source_pw/module_pwdft/kernels/cuda/veff_op.cu b/source/source_pw/module_pwdft/kernels/cuda/veff_op.cu index f4eb37e5ac..51c1013698 100644 --- a/source/source_pw/module_pwdft/kernels/cuda/veff_op.cu +++ b/source/source_pw/module_pwdft/kernels/cuda/veff_op.cu @@ -26,20 +26,50 @@ __global__ void veff_pw( const int size, thrust::complex* out, thrust::complex* out1, - const FPTYPE* in) + thrust::complex* in) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if(idx >= size) {return;} - thrust::complex sup = - out[idx] * (in[0 * size + idx] + in[3 * size + idx]) - + out1[idx] * (in[1 * size + idx] - thrust::complex(0.0, 1.0) * in[2 * size + idx]); - thrust::complex sdown = - out1[idx] * (in[0 * size + idx] - in[3 * size + idx]) - + out[idx] * (in[1 * size + idx] + thrust::complex(0.0, 1.0) * in[2 * size + idx]); + const int base = idx * 4; + thrust::complex sup = out[idx] * in[base] + out1[idx] * in[base+1]; + thrust::complex sdown = out1[idx] * in[base+2] + out[idx] * in[base+3]; out[idx] = sup; out1[idx] = sdown; } +template +__global__ void rearrange_op( + const int size, + const FPTYPE* in, + thrust::complex* out) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if(idx >= size) {return;} + const int base = idx * 4; + const FPTYPE part_1 = in[idx]; + const FPTYPE part_2 = in[idx + size]; + const FPTYPE part_3 = in[idx + 2 * size]; + const FPTYPE part_4 = in[idx + 3 * size]; + out[base] = thrust::complex(part_1 + part_4, 0.0); + out[base + 1] = thrust::complex(part_2 , -part_3); + out[base + 2] = thrust::complex(part_1 - part_4, 0.0); + out[base + 3] = thrust::complex(part_2, part_3); + +} +template +void rearrange::operator()(const base_device::DEVICE_GPU* device, + const int& size, + const FPTYPE* in, + std::complex* out) const +{ + const int block = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + rearrange_op<<>>( + size, // control params + in, // array of data + reinterpret_cast*>(out)); // array of data + cudaCheckOnDebug(); +} + template void veff_pw_op::operator()(const base_device::DEVICE_GPU* dev, const int& size, @@ -60,18 +90,20 @@ void veff_pw_op::operator()(const base_device:: const int& size, std::complex* out, std::complex* out1, - const FPTYPE** in) + std::complex* in) { const int block = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; veff_pw<<>>( size, // control params reinterpret_cast*>(out), // array of data reinterpret_cast*>(out1), // array of data - in[0]); // array of data + reinterpret_cast*>(in)); // array of data cudaCheckOnDebug(); } +template struct rearrange; +template struct rearrange; template struct veff_pw_op; template struct veff_pw_op; diff --git a/source/source_pw/module_pwdft/kernels/test/veff_op_test.cpp b/source/source_pw/module_pwdft/kernels/test/veff_op_test.cpp index 9f00625925..f3badce661 100644 --- a/source/source_pw/module_pwdft/kernels/test/veff_op_test.cpp +++ b/source/source_pw/module_pwdft/kernels/test/veff_op_test.cpp @@ -32,6 +32,8 @@ class TestModuleHamiltVeff : public ::testing::Test const base_device::DEVICE_CPU* cpu_ctx = {}; const base_device::DEVICE_GPU* gpu_ctx = {}; + using rearrange_cpu = hamilt::rearrange; + using rearrange_gpu = hamilt::rearrange; using veff_cpu_op = hamilt::veff_pw_op; using veff_gpu_op = hamilt::veff_pw_op; @@ -70,13 +72,10 @@ TEST_F(TestModuleHamiltVeff, veff_pw_spin_op_cpu) std::vector> expected_out1_spin(out_spin.size(), std::complex(0, 0)); std::vector> res = out_spin; std::vector> res1 = out1_spin; + std::vector> in_(4 * in_spin.size(), std::complex(0, 0)); + rearrange_cpu()(cpu_ctx, this->size, in_spin.data(), in_.data()); - const double * in_[4]; - for (int ii = 0; ii < 4; ii++) { - in_[ii] = in_spin.data() + ii * this->size; - } - - veff_cpu_op()(cpu_ctx, this->size, res.data(), res1.data(), in_); + veff_cpu_op()(cpu_ctx, this->size, res.data(), res1.data(), in_.data()); for (int ii = 0; ii < res.size(); ii++) { EXPECT_LT(std::abs(res[ii] - expected_out_spin[ii]), 7.5e-5); EXPECT_LT(std::abs(res1[ii] - expected_out1_spin[ii]), 6e-5); @@ -108,23 +107,22 @@ TEST_F(TestModuleHamiltVeff, veff_pw_spin_op_gpu) { std::vector> out1_spin(out_spin.size(), std::complex(0, 0)); std::vector> expected_out1_spin(out_spin.size(), std::complex(0, 0)); + std::vector> in_(4 * in_spin.size(), std::complex(0, 0)); std::vector> res = out_spin; std::vector> res1 = out1_spin; double* d_in = NULL; + std::complex* d_in_ = NULL; std::complex* d_res = NULL, * d_res1 = NULL; resize_memory_double_op()(d_in, in_spin.size()); resize_memory_complex_op()(d_res, res.size()); resize_memory_complex_op()(d_res1, res1.size()); + resize_memory_complex_op()(d_in_, in_spin.size()*4); syncmem_double_h2d_op()(d_in, in_spin.data(), in_spin.size()); syncmem_complex_h2d_op()(d_res, res.data(), res.size()); syncmem_complex_h2d_op()(d_res1, res1.data(), res1.size()); - - const double * in_[4]; - for (int ii = 0; ii < 4; ii++) { - in_[ii] = d_in + ii * this->size; - } - - veff_gpu_op()(gpu_ctx, this->size, d_res, d_res1, in_); + + rearrange_gpu()(gpu_ctx, this->size, d_in, d_in_); + veff_gpu_op()(gpu_ctx, this->size, d_res, d_res1, d_in_); syncmem_complex_d2h_op()(res.data(), d_res, res.size()); syncmem_complex_d2h_op()(res1.data(), d_res1, res1.size()); @@ -135,5 +133,6 @@ TEST_F(TestModuleHamiltVeff, veff_pw_spin_op_gpu) delete_memory_double_op()(d_in); delete_memory_complex_op()(d_res); delete_memory_complex_op()(d_res1); + delete_memory_complex_op()(d_in_); } #endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM \ No newline at end of file diff --git a/source/source_pw/module_pwdft/kernels/veff_op.cpp b/source/source_pw/module_pwdft/kernels/veff_op.cpp index 23646e9608..b6f458275a 100644 --- a/source/source_pw/module_pwdft/kernels/veff_op.cpp +++ b/source/source_pw/module_pwdft/kernels/veff_op.cpp @@ -20,26 +20,48 @@ struct veff_pw_op const int& size, std::complex* out, std::complex* out1, - const FPTYPE** in) + std::complex* in) { + #ifdef _OPENMP #pragma omp parallel for #endif - for (int ir = 0; ir < size; ir++) { - auto sup = out[ir] * (in[0][ir] + in[3][ir]) - + out1[ir] - * (in[1][ir] - - std::complex(0.0, 1.0) * in[2][ir]); - auto sdown = out1[ir] * (in[0][ir] - in[3][ir]) - + out[ir] - * (in[1][ir] - + std::complex(0.0, 1.0) * in[2][ir]); + for (int ir = 0; ir < size; ir++) + { + const int base = ir * 4; + auto sup = out[ir] * (in[base]) + out1[ir] * (in[base + 1]); + auto sdown = out1[ir] * (in[base + 2]) + out[ir] * (in[base + 3]); out[ir] = sup; out1[ir] = sdown; } } }; +template +struct rearrange +{ + void operator()(const base_device::DEVICE_CPU* dev, + const int& size, + const FPTYPE* in, + std::complex* out) const + { + for (int ir=0; ir < size; ir++) + { + const int base = 4 *ir; + FPTYPE part_1 = in[ir]; + FPTYPE part_2 = in[ir + size]; + FPTYPE part_3 = in[ir + 2*size]; + FPTYPE part_4 = in[ir + 3*size]; + out[base ] = std::complex(part_1 + part_4, 0.0); + out[base + 1] = std::complex(part_2 , -part_3); + out[base + 2] = std::complex(part_1 - part_4, 0.0); + out[base + 3] = std::complex(part_2, part_3); + } + } +}; + +template struct rearrange; +template struct rearrange; template struct veff_pw_op; template struct veff_pw_op; diff --git a/source/source_pw/module_pwdft/kernels/veff_op.h b/source/source_pw/module_pwdft/kernels/veff_op.h index cda82d9a9a..0bfdd6bea0 100644 --- a/source/source_pw/module_pwdft/kernels/veff_op.h +++ b/source/source_pw/module_pwdft/kernels/veff_op.h @@ -48,7 +48,7 @@ struct veff_pw_op { const int& size, std::complex* out, std::complex* out1, - const FPTYPE** in); + std::complex* in); }; #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM @@ -62,7 +62,25 @@ struct veff_pw_op const int& size, std::complex* out, std::complex* out1, - const FPTYPE** in); + std::complex* in); +}; + +#endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM +template +struct rearrange +{ + void operator()(const Device* device,const int& size, const FPTYPE* in, std::complex* out) const; +}; + +#if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM + +template +struct rearrange +{ + void operator()(const base_device::DEVICE_GPU* device, + const int& size, + const FPTYPE* in, + std::complex* out) const; }; #endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM } // namespace hamilt diff --git a/source/source_pw/module_pwdft/operator_pw/veff_pw.cpp b/source/source_pw/module_pwdft/operator_pw/veff_pw.cpp index e2813b3a9a..25497b7897 100644 --- a/source/source_pw/module_pwdft/operator_pw/veff_pw.cpp +++ b/source/source_pw/module_pwdft/operator_pw/veff_pw.cpp @@ -7,10 +7,11 @@ namespace hamilt { template Veff>::Veff(const int* isk_in, - const Real* veff_in, - const int veff_row, - const int veff_col, - const ModulePW::PW_Basis_K* wfcpw_in) + const Real* veff_in, + const int nspin_in, + const int veff_row, + const int veff_col, + const ModulePW::PW_Basis_K* wfcpw_in) { if (isk_in == nullptr || wfcpw_in == nullptr) { @@ -21,20 +22,28 @@ Veff>::Veff(const int* isk_in, this->cal_type = calculation_type::pw_veff; this->isk = isk_in; this->veff = veff_in; + this->nspin = nspin_in; //note: "veff = nullptr" means that this core does not treat potential but still treats wf. this->veff_row = veff_row; this->veff_col = veff_col; this->wfcpw = wfcpw_in; resmem_complex_op()(this->porter, this->wfcpw->nmaxgr, "Veff::porter"); - resmem_complex_op()(this->porter1, this->wfcpw->nmaxgr, "Veff::porter1"); - + if (nspin == 4) + { + resmem_complex_op()(nspin_4_veff, 4*veff_col, "Veff::porter"); + resmem_complex_op()(this->porter1, this->wfcpw->nmaxgr, "Veff::porter1"); + } } template Veff>::~Veff() { delmem_complex_op()(this->porter); - delmem_complex_op()(this->porter1); + if (nspin == 4) + { + delmem_complex_op()(this->porter1); + delmem_complex_op()(this->nspin_4_veff); + } } template @@ -96,29 +105,28 @@ void Veff>::act( { for (int ib = 0; ib < nbands; ib += npol) { - wfcpw->recip_to_real(tmpsi_in, this->porter, this->ik); + wfcpw->convolution(this->ik, + this->veff_col, + tmpsi_in, + this->veff + current_spin * this->veff_col, + tmhpsi, + true); // NOTICE: when MPI threads are larger than the number of Z grids // veff would contain nothing, and nothing should be done in real space // but the 3DFFT can not be skipped, it will cause hanging - veff_op()(this->ctx, this->veff_col, this->porter, this->veff + current_spin * this->veff_col); - wfcpw->real_to_recip(this->porter, tmhpsi, this->ik, true); tmhpsi += psi_offset; tmpsi_in += psi_offset; } } else if (npol == 2) { - const Real* current_veff[4]={nullptr}; - for (int is = 0; is < 4; is++) - { - current_veff[is] = this->veff + is * this->veff_col; - } + rearrange()(this->ctx, this->veff_col, this->veff, this->nspin_4_veff); for (int ib = 0; ib < nbands; ib += npol) { // FFT to real space and do things. wfcpw->recip_to_real(tmpsi_in, this->porter, this->ik); wfcpw->recip_to_real(tmpsi_in + max_npw, this->porter1, this->ik); - veff_op()(this->ctx, this->veff_col, this->porter, this->porter1, current_veff); + veff_op()(this->ctx, this->veff_col, this->porter, this->porter1, nspin_4_veff); // FFT back to G space. wfcpw->real_to_recip(this->porter, tmhpsi, this->ik, true); wfcpw->real_to_recip(this->porter1, tmhpsi + max_npw, this->ik, true); diff --git a/source/source_pw/module_pwdft/operator_pw/veff_pw.h b/source/source_pw/module_pwdft/operator_pw/veff_pw.h index 3c2024edc5..3568189470 100644 --- a/source/source_pw/module_pwdft/operator_pw/veff_pw.h +++ b/source/source_pw/module_pwdft/operator_pw/veff_pw.h @@ -27,6 +27,7 @@ class Veff> : public OperatorPW public: Veff(const int* isk_in, const Real* veff_in, + const int nspin_in, const int veff_row, const int veff_col, const ModulePW::PW_Basis_K* wfcpw_in); @@ -65,9 +66,11 @@ class Veff> : public OperatorPW int veff_col = 0; int veff_row = 0; + int nspin = 1; const Real *veff = nullptr, *h_veff = nullptr, *d_veff = nullptr; T *porter = nullptr; T *porter1 = nullptr; + mutable T* nspin_4_veff=nullptr; base_device::AbacusDevice_t device = {}; using veff_op = veff_pw_op;