From 9a42117334e2b061086ae05b42c77b9dbe166ee0 Mon Sep 17 00:00:00 2001 From: Jie Li Date: Sat, 8 Feb 2025 18:53:24 +0800 Subject: [PATCH 1/6] Feature: implement k continuity initialization strategy & kernel optimization in davidson-subspcae algorithm - add k continuity initialization strategy in planewave basis - implement heterogenous computation branching between CPU and DCU - implement optimized eigenvalue operations for GPU & DCU - implement optimized preconditioner for GPU & DCU - implement optimized normalization op for GPU & DCU --- source/module_base/module_device/device.cpp | 9 +- source/module_esolver/esolver.cpp | 16 +- source/module_esolver/esolver_ks_pw.cpp | 3 +- source/module_hsolver/diago_dav_subspace.cpp | 128 +++--- source/module_hsolver/hsolver_pw.cpp | 210 ++++++++-- source/module_hsolver/hsolver_pw.h | 24 +- .../kernels/cuda/math_kernel_op.cu | 365 ++++++++++++++++++ .../module_hsolver/kernels/math_kernel_op.cpp | 89 +++++ .../module_hsolver/kernels/math_kernel_op.h | 62 +++ .../kernels/rocm/dngvd_op.hip.cu | 262 ++++++++++--- .../kernels/rocm/math_kernel_op.hip.cu | 355 +++++++++++++++++ .../module_hsolver/test/test_hsolver_pw.cpp | 62 +++ .../module_hsolver/test/test_hsolver_sdft.cpp | 60 +++ .../module_io/read_input_item_elec_stru.cpp | 20 + source/module_parameter/input_parameter.h | 1 + source/module_psi/psi_init.cpp | 3 +- 16 files changed, 1519 insertions(+), 150 deletions(-) diff --git a/source/module_base/module_device/device.cpp b/source/module_base/module_device/device.cpp index b20ea9f3ad..9373a5a31a 100644 --- a/source/module_base/module_device/device.cpp +++ b/source/module_base/module_device/device.cpp @@ -5,7 +5,7 @@ #include #include - +#include #ifdef __MPI #include "mpi.h" #endif @@ -166,6 +166,11 @@ int device_count = -1; cudaGetDeviceCount(&device_count); #elif defined(__ROCM) hipGetDeviceCount(&device_count); +/***auto start_time = std::chrono::high_resolution_clock::now(); +std::cout << "Starting hipGetDeviceCount.." << std::endl; +auto end_time = std::chrono::high_resolution_clock::now(); +auto duration = std::chrono::duration_cast>(end_time - start_time); +std::cout << "hipGetDeviceCount took " << duration.count() << "seconds" << std::endl;***/ #endif if (device_count <= 0) { @@ -711,4 +716,4 @@ void record_device_memory( #endif } // end of namespace information -} // end of namespace base_device \ No newline at end of file +} // end of namespace base_device diff --git a/source/module_esolver/esolver.cpp b/source/module_esolver/esolver.cpp index 0352492a3a..f8985387d9 100644 --- a/source/module_esolver/esolver.cpp +++ b/source/module_esolver/esolver.cpp @@ -107,13 +107,21 @@ std::string determine_type() } if (GlobalV::MY_RANK == 0) { - std::cout << " RUNNING WITH DEVICE : " << device_info << " / " + /***auto start_time = std::chrono::high_resolution_clock::now(); + std::cout << "Starting hipGetDeviceInfo..." << std::endl;***/ + std::cout << " RUNNING WITH DEVICE : " << device_info << " / " << base_device::information::get_device_info(PARAM.inp.device) << std::endl; + /***auto end_time = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast>(end_time - start_time); + std::cout << "hipGetDeviceInfo took " << duration.count() << " seconds" << std::endl;***/ } - + /*** auto start_time = std::chrono::high_resolution_clock::now(); + std::cout << "Starting hipGetDeviceInfo..." << std::endl;***/ GlobalV::ofs_running << "\n RUNNING WITH DEVICE : " << device_info << " / " - << base_device::information::get_device_info(PARAM.inp.device) << std::endl; - + << base_device::information::get_device_info(PARAM.inp.device) << std::endl; + /***auto end_time = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast>(end_time - start_time); + std::cout << "hipGetDeviceInfo took " << duration.count() << " seconds" << std::endl;***/ return esolver_type; } diff --git a/source/module_esolver/esolver_ks_pw.cpp b/source/module_esolver/esolver_ks_pw.cpp index 0b303f33e1..664ac186d5 100644 --- a/source/module_esolver/esolver_ks_pw.cpp +++ b/source/module_esolver/esolver_ks_pw.cpp @@ -556,7 +556,8 @@ void ESolver_KS_PW::hamilt2density_single(UnitCell& ucell, hsolver::DiagoIterAssist::SCF_ITER, hsolver::DiagoIterAssist::PW_DIAG_NMAX, hsolver::DiagoIterAssist::PW_DIAG_THR, - hsolver::DiagoIterAssist::need_subspace); + hsolver::DiagoIterAssist::need_subspace, + PARAM.inp.use_k_continuity); hsolver_pw_obj.solve(this->p_hamilt, this->kspw_psi[0], diff --git a/source/module_hsolver/diago_dav_subspace.cpp b/source/module_hsolver/diago_dav_subspace.cpp index b180d72c13..f07f4cfefa 100644 --- a/source/module_hsolver/diago_dav_subspace.cpp +++ b/source/module_hsolver/diago_dav_subspace.cpp @@ -7,7 +7,6 @@ #include "module_hsolver/kernels/dngvd_op.h" #include "module_hsolver/kernels/math_kernel_op.h" #include "module_base/kernels/dsp/dsp_connector.h" - #include using namespace hsolver; @@ -60,7 +59,7 @@ Diago_DavSubspace::Diago_DavSubspace(const std::vector& precond if (this->device == base_device::GpuDevice) { resmem_real_op()(this->ctx, this->d_precondition, nbasis_in); - // syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, this->d_precondition, this->precondition.data(), nbasis_in); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, this->d_precondition, this->precondition.data(), nbasis_in); } #endif } @@ -288,27 +287,28 @@ void Diago_DavSubspace::cal_grad(const HPsiFunc& hpsi_func, psi_iter + (nbase) * this->dim, this->dim); - std::vector e_temp_cpu(nbase, 0); + // Eigenvalues operation section + std::vector e_temp_cpu(this->notconv, 0); Real* e_temp_hd = e_temp_cpu.data(); - if(this->device == base_device::GpuDevice) + if (this->device == base_device::GpuDevice) { e_temp_hd = nullptr; - resmem_real_op()(this->ctx, e_temp_hd, nbase); + resmem_real_op()(this->ctx, e_temp_hd, this->notconv); } - for (int m = 0; m < notconv; m++) + + for (int m = 0; m < this->notconv; m++) { - e_temp_cpu.assign(nbase, (-1.0 * (*eigenvalue_iter)[m])); - if (this->device == base_device::GpuDevice) - { - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, e_temp_hd, e_temp_cpu.data(), nbase); - } - vector_mul_vector_op()(this->ctx, - nbase, - vcc + m * this->nbase_x, - vcc + m * this->nbase_x, - e_temp_hd); + e_temp_cpu[m] = -(*eigenvalue_iter)[m]; } - if(this->device == base_device::GpuDevice) + + if (this->device == base_device::GpuDevice) + { + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, e_temp_hd, e_temp_cpu.data(), this->notconv); + } + + apply_eigenvalues_op()(this->ctx, nbase, this->nbase_x, this->notconv, this->vcc, this->vcc, e_temp_hd); + + if (this->device == base_device::GpuDevice) { delmem_real_op()(this->ctx, e_temp_hd); } @@ -333,54 +333,62 @@ void Diago_DavSubspace::cal_grad(const HPsiFunc& hpsi_func, psi_iter + nbase * this->dim, this->dim); - // "precondition!!!" - std::vector pre(this->dim, 0.0); - for (int m = 0; m < notconv; m++) - { - for (size_t i = 0; i < this->dim; i++) - { - // pre[i] = std::abs(this->precondition[i] - (*eigenvalue_iter)[m]); - double x = std::abs(this->precondition[i] - (*eigenvalue_iter)[m]); - pre[i] = 0.5 * (1.0 + x + sqrt(1 + (x - 1.0) * (x - 1.0))); - } + // Precondition section #if defined(__CUDA) || defined(__ROCM) - if (this->device == base_device::GpuDevice) - { - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, this->d_precondition, pre.data(), this->dim); - vector_div_vector_op()(this->ctx, - this->dim, - psi_iter + (nbase + m) * this->dim, - psi_iter + (nbase + m) * this->dim, - this->d_precondition); - } - else + if (this->device == base_device::GpuDevice) + { + Real* eigenvalues_gpu = nullptr; + resmem_real_op()(this->ctx, eigenvalues_gpu, notconv); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, eigenvalues_gpu,(*eigenvalue_iter).data(), notconv); + + precondition_op()(this->ctx, + this->dim, + psi_iter, + nbase, + notconv, + d_precondition, + eigenvalues_gpu); + delmem_real_op()(this->ctx, eigenvalues_gpu); + } + else #endif - { - vector_div_vector_op()(this->ctx, - this->dim, - psi_iter + (nbase + m) * this->dim, - psi_iter + (nbase + m) * this->dim, - pre.data()); - } + { + precondition_op()(this->ctx, + this->dim, + psi_iter, + nbase, + notconv, + this->precondition.data(), + (*eigenvalue_iter).data()); } - // "normalize!!!" in order to improve numerical stability of subspace diagonalization - std::vector psi_norm(notconv, 0.0); - for (size_t i = 0; i < notconv; i++) + // Normalize section +#if defined(__CUDA) || defined(__ROCM) + if (this->device == base_device::GpuDevice) + { + Real* psi_norm = nullptr; + resmem_real_op()(this->ctx, psi_norm, notconv); + using setmem_real_op = base_device::memory::set_memory_op; + setmem_real_op()(this->ctx, psi_norm, 0.0, notconv); + + normalize_op()(this->ctx, + this->dim, + psi_iter, + nbase, + notconv, + psi_norm); + delmem_real_op()(this->ctx, psi_norm); + } + else +#endif { - psi_norm[i] = dot_real_op()(this->ctx, - this->dim, - psi_iter + (nbase + i) * this->dim, - psi_iter + (nbase + i) * this->dim, - true); - assert(psi_norm[i] > 0.0); - psi_norm[i] = sqrt(psi_norm[i]); - - vector_div_constant_op()(this->ctx, - this->dim, - psi_iter + (nbase + i) * this->dim, - psi_iter + (nbase + i) * this->dim, - psi_norm[i]); + Real* psi_norm = nullptr; + normalize_op()(this->ctx, + this->dim, + psi_iter, + nbase, + notconv, + psi_norm); } // update hpsi[:, nbase:nbase+notconv] diff --git a/source/module_hsolver/hsolver_pw.cpp b/source/module_hsolver/hsolver_pw.cpp index 0c1ad2e8b8..d0648addf5 100644 --- a/source/module_hsolver/hsolver_pw.cpp +++ b/source/module_hsolver/hsolver_pw.cpp @@ -280,47 +280,106 @@ void HSolverPW::solve(hamilt::Hamilt* pHamilt, std::vector eigenvalues(this->wfc_basis->nks * psi.get_nbands(), 0.0); ethr_band.resize(psi.get_nbands(), this->diag_thr); - /// Loop over k points for solve Hamiltonian to charge density - for (int ik = 0; ik < this->wfc_basis->nks; ++ik) - { - /// update H(k) for each k point - pHamilt->updateHk(ik); + // Initialize k-point continuity if enabled + static int count = 0; + if (use_k_continuity) { + build_k_neighbors(); + } + + // Loop over k points for solve Hamiltonian to charge density + if (use_k_continuity) { + // K-point continuity case + for (int i = 0; i < this->wfc_basis->nks; ++i) + { + const int ik = k_order[i]; + + // update H(k) for each k point + pHamilt->updateHk(ik); #ifdef USE_PAW - this->paw_func_in_kloop(ik, tpiba); + this->paw_func_in_kloop(ik, tpiba); #endif - /// update psi pointer for each k point - psi.fix_k(ik); + // update psi pointer for each k point + psi.fix_k(ik); + + // If using k-point continuity and not first k-point, propagate from parent + if (ik > 0 && count == 0 && k_parent.find(ik) != k_parent.end()) { + propagate_psi(psi, k_parent[ik], ik); + } - // template add precondition calculating here - update_precondition(precondition, ik, this->wfc_basis->npwk[ik], Real(pes->pot->get_vl_of_0())); + // template add precondition calculating here + update_precondition(precondition, ik, this->wfc_basis->npwk[ik], Real(pes->pot->get_vl_of_0())); - // use smooth threshold for all iter methods - if (PARAM.inp.diago_smooth_ethr == true) - { - this->cal_smooth_ethr(pes->klist->wk[ik], - &pes->wg(ik, 0), - DiagoIterAssist::PW_DIAG_THR, - ethr_band); - } + // use smooth threshold for all iter methods + if (PARAM.inp.diago_smooth_ethr == true) + { + this->cal_smooth_ethr(pes->klist->wk[ik], + &pes->wg(ik, 0), + DiagoIterAssist::PW_DIAG_THR, + ethr_band); + } #ifdef USE_PAW - this->call_paw_cell_set_currentk(ik); + this->call_paw_cell_set_currentk(ik); #endif - /// solve eigenvector and eigenvalue for H(k) - this->hamiltSolvePsiK(pHamilt, psi, precondition, eigenvalues.data() + ik * psi.get_nbands(), this->wfc_basis->nks); + // solve eigenvector and eigenvalue for H(k) + this->hamiltSolvePsiK(pHamilt, psi, precondition, eigenvalues.data() + ik * psi.get_nbands(), this->wfc_basis->nks); - if (skip_charge) + if (skip_charge) + { + GlobalV::ofs_running << "Average iterative diagonalization steps for k-points " << ik + << " is: " << DiagoIterAssist::avg_iter + << " ; where current threshold is: " << this->diag_thr << " . " << std::endl; + DiagoIterAssist::avg_iter = 0.0; + } + } + } + else { + // Original code without k-point continuity + for (int ik = 0; ik < this->wfc_basis->nks; ++ik) { - GlobalV::ofs_running << "Average iterative diagonalization steps for k-points " << ik - << " is: " << DiagoIterAssist::avg_iter - << " ; where current threshold is: " << this->diag_thr << " . " << std::endl; - DiagoIterAssist::avg_iter = 0.0; + // update H(k) for each k point + pHamilt->updateHk(ik); + +#ifdef USE_PAW + this->paw_func_in_kloop(ik, tpiba); +#endif + + // update psi pointer for each k point + psi.fix_k(ik); + + // template add precondition calculating here + update_precondition(precondition, ik, this->wfc_basis->npwk[ik], Real(pes->pot->get_vl_of_0())); + + // use smooth threshold for all iter methods + if (PARAM.inp.diago_smooth_ethr == true) + { + this->cal_smooth_ethr(pes->klist->wk[ik], + &pes->wg(ik, 0), + DiagoIterAssist::PW_DIAG_THR, + ethr_band); + } + +#ifdef USE_PAW + this->call_paw_cell_set_currentk(ik); +#endif + + // solve eigenvector and eigenvalue for H(k) + this->hamiltSolvePsiK(pHamilt, psi, precondition, eigenvalues.data() + ik * psi.get_nbands(), this->wfc_basis->nks); + + if (skip_charge) + { + GlobalV::ofs_running << "Average iterative diagonalization steps for k-points " << ik + << " is: " << DiagoIterAssist::avg_iter + << " ; where current threshold is: " << this->diag_thr << " . " << std::endl; + DiagoIterAssist::avg_iter = 0.0; + } } - /// calculate the contribution of Psi for charge density rho } + + count++; // END Loop over k points // copy eigenvalues to ekb in ElecState @@ -666,6 +725,101 @@ void HSolverPW::output_iterInfo() } } +template +void HSolverPW::build_k_neighbors() { + const int nk = this->wfc_basis->nks; + kvecs_c.resize(nk); + k_order.clear(); + k_order.reserve(nk); + + // Store k-points and corresponding indices + struct KPoint { + ModuleBase::Vector3 kvec; + int index; + double norm; + + KPoint(const ModuleBase::Vector3& v, int i) : + kvec(v), index(i), norm(v.norm()) {} + }; + + // Build k-point list + std::vector klist; + for (int ik = 0; ik < nk; ++ik) { + kvecs_c[ik] = this->wfc_basis->kvec_c[ik]; + klist.push_back(KPoint(kvecs_c[ik], ik)); + } + + // Sort k-points by distance from origin + std::sort(klist.begin(), klist.end(), + [](const KPoint& a, const KPoint& b) { + return a.norm < b.norm; + }); + + // Build parent-child relationships + k_order.push_back(klist[0].index); + + // Find nearest processed k-point as parent for each k-point + for (int i = 1; i < nk; ++i) { + int current_k = klist[i].index; + double min_dist = 1e10; + int parent = -1; + + // find the nearest k-point as parent + for (int j = 0; j < k_order.size(); ++j) { + int processed_k = k_order[j]; + double dist = (kvecs_c[current_k] - kvecs_c[processed_k]).norm2(); + if (dist < min_dist) { + min_dist = dist; + parent = processed_k; + } + } + + k_parent[current_k] = parent; + k_order.push_back(current_k); + } +} + +template +void HSolverPW::propagate_psi(psi::Psi& psi, const int from_ik, const int to_ik) { + const int nbands = psi.get_nbands(); + const int npwk = this->wfc_basis->npwk[to_ik]; + + // Get k-point difference + ModuleBase::Vector3 dk = kvecs_c[to_ik] - kvecs_c[from_ik]; + + // Allocate porter locally + T* porter = nullptr; + resmem_complex_op()(this->ctx, porter, this->wfc_basis->nmaxgr, "HSolverPW::porter"); + + // Process each band + for (int ib = 0; ib < nbands; ib++) + { + // Fix current k-point and band + // psi.fix_k(from_ik); + + // FFT to real space + // this->wfc_basis->recip_to_real(this->ctx, psi.get_pointer(ib), porter, from_ik); + this->wfc_basis->recip_to_real(this->ctx, &psi(from_ik, ib, 0), porter, from_ik); + + // Apply phase factor + // // TODO: Check how to get the r vector + // ModuleBase::Vector3 r = this->wfc_basis->get_ir2r(ir); + // double phase = this->wfc_basis->tpiba * (dk.x * r.x + dk.y * r.y + dk.z * r.z); + // psi_real[ir] *= std::exp(std::complex(0.0, phase)); + // } + + // Fix k-point for target + // psi.fix_k(to_ik); + + // FFT back to reciprocal space + // this->wfc_basis->real_to_recip(this->ctx, porter, psi.get_pointer(ib), to_ik, true); + this->wfc_basis->real_to_recip(this->ctx, porter, &psi(to_ik, ib, 0), to_ik); + } + + // Clean up porter + delmem_complex_op()(this->ctx, porter); +} + template class HSolverPW, base_device::DEVICE_CPU>; template class HSolverPW, base_device::DEVICE_CPU>; #if ((defined __CUDA) || (defined __ROCM)) @@ -673,4 +827,4 @@ template class HSolverPW, base_device::DEVICE_GPU>; template class HSolverPW, base_device::DEVICE_GPU>; #endif -} // namespace hsolver \ No newline at end of file +} // namespace hsolver diff --git a/source/module_hsolver/hsolver_pw.h b/source/module_hsolver/hsolver_pw.h index 0dee4fbdbf..7f0bfc7c23 100644 --- a/source/module_hsolver/hsolver_pw.h +++ b/source/module_hsolver/hsolver_pw.h @@ -6,6 +6,8 @@ #include "module_base/macros.h" #include "module_basis/module_pw/pw_basis_k.h" #include "module_psi/wavefunc.h" +#include +#include "module_base/memory.h" namespace hsolver { @@ -18,6 +20,9 @@ class HSolverPW // return T if T is real type(float, double), // otherwise return the real type of T(complex, complex) using Real = typename GetTypeReal::type; + using resmem_complex_op = base_device::memory::resize_memory_op; + using delmem_complex_op = base_device::memory::delete_memory_op; + using setmem_complex_op = base_device::memory::set_memory_op; public: HSolverPW(ModulePW::PW_Basis_K* wfc_basis_in, @@ -30,10 +35,12 @@ class HSolverPW const int scf_iter_in, const int diag_iter_max_in, const double diag_thr_in, - const bool need_subspace_in) + const bool need_subspace_in, + const bool use_k_continuity_in = false) : wfc_basis(wfc_basis_in), calculation_type(calculation_type_in), basis_type(basis_type_in), method(method_in), use_paw(use_paw_in), use_uspp(use_uspp_in), nspin(nspin_in), scf_iter(scf_iter_in), - diag_iter_max(diag_iter_max_in), diag_thr(diag_thr_in), need_subspace(need_subspace_in){}; + diag_iter_max(diag_iter_max_in), diag_thr(diag_thr_in), need_subspace(need_subspace_in), + use_k_continuity(use_k_continuity_in) {}; /// @brief solve function for pw /// @param pHamilt interface to hamilt @@ -51,6 +58,7 @@ class HSolverPW const double tpiba, const int nat); + protected: // diago caller void hamiltSolvePsiK(hamilt::Hamilt* hm, @@ -79,6 +87,8 @@ class HSolverPW const bool need_subspace; // for cg or dav_subspace + const bool use_k_continuity; + protected: Device* ctx = {}; @@ -99,8 +109,16 @@ class HSolverPW void paw_func_after_kloop(psi::Psi& psi, elecstate::ElecState* pes,const double tpiba,const int nat); #endif + + // K-point continuity related members + std::vector k_order; + std::unordered_map k_parent; + std::vector> kvecs_c; + + void build_k_neighbors(); + void propagate_psi(psi::Psi& psi, const int from_ik, const int to_ik); }; } // namespace hsolver -#endif \ No newline at end of file +#endif diff --git a/source/module_hsolver/kernels/cuda/math_kernel_op.cu b/source/module_hsolver/kernels/cuda/math_kernel_op.cu index 149b9ce389..8fb7c542eb 100644 --- a/source/module_hsolver/kernels/cuda/math_kernel_op.cu +++ b/source/module_hsolver/kernels/cuda/math_kernel_op.cu @@ -42,6 +42,48 @@ struct GetTypeThrust> { static cublasHandle_t cublas_handle = nullptr; +// Forward declarations for abs2 +template +__device__ typename GetTypeReal::type abs2(const T& x); + +// Specialization for real types (double) +template<> +__device__ double abs2(const double& x) { + return x * x; +} + +// Specialization for real types (float) +template<> +__device__ float abs2(const float& x) { + return x * x; +} + +// Specialization for complex float +template<> +__device__ float abs2(const thrust::complex& x) { + return x.real() * x.real() + x.imag() * x.imag(); +} + +// Specialization for complex double +template<> +__device__ double abs2(const thrust::complex& x) { + return x.real() * x.real() + x.imag() * x.imag(); +} + +// Specialization for std::complex (for interface compatibility) +template<> +__device__ float abs2(const std::complex& x) { + const thrust::complex* tx = reinterpret_cast*>(&x); + return tx->real() * tx->real() + tx->imag() * tx->imag(); +} + +// Specialization for std::complex (for interface compatibility) +template<> +__device__ double abs2(const std::complex& x) { + const thrust::complex* tx = reinterpret_cast*>(&x); + return tx->real() * tx->real() + tx->imag() * tx->imag(); +} + static inline void xdot_wrapper(const int &n, const float * x, const int &incx, const float * y, const int &incy, float &result) { cublasErrcheck(cublasSdot(cublas_handle, n, x, incx, y, incy, &result)); @@ -1035,6 +1077,320 @@ void matrixSetToAnother, base_device::DEVICE_GPU>::operator cudaCheckOnDebug(); } +// Kernel for applying eigenvalues to vectors + +template +__global__ void apply_eigenvalues_kernel(const typename GetTypeReal::type* eigenvalues, const T* vectors, T* result, int nbase, int nbase_x, int notconv) +{ + const int tid = blockIdx.x * blockDim.x + threadIdx.x; // Linear thread ID + const int total_elements = notconv * nbase; + + if (tid < total_elements) + { + const int m = tid / nbase; // Row index (eigenvalue index) + const int idx = tid % nbase; // Column index within the row + result[m * nbase_x + idx] = eigenvalues[m] * vectors[m * nbase_x + idx]; + } +} + +// Wrapper for applying eigenvalues to complex vectors +template +inline void apply_eigenvalues_complex_wrapper(const base_device::DEVICE_GPU* d, + const int& nbase, + const int& nbase_x, + const int& notconv, + const FPTYPE* eigenvalues, + const std::complex* vectors, + std::complex* result) +{ + thrust::complex* result_tmp = reinterpret_cast*>(result); + const thrust::complex* vectors_tmp = reinterpret_cast*>(vectors); + const int total_elements = notconv * nbase; + const int threadsPerBlock = 256; + const int numBlocks = (total_elements + threadsPerBlock - 1) / threadsPerBlock; + + apply_eigenvalues_kernel><<>>( + eigenvalues, vectors_tmp, result_tmp, nbase, nbase_x, notconv); + + cudaCheckOnDebug(); +} + +// Specialization for double +template <> +void apply_eigenvalues_op::operator()(const base_device::DEVICE_GPU *d, + const int &nbase, const int &nbase_x, const int ¬conv, + double *result, const double *vectors, const double *eigenvalues) { + const int total_elements = notconv * nbase; + const int threadsPerBlock = 256; + const int numBlocks = (total_elements + threadsPerBlock - 1) / threadsPerBlock; + + apply_eigenvalues_kernel<<>>( + eigenvalues, vectors, result, nbase, nbase_x, notconv); + cudaCheckOnDebug(); +} + +// Specialization for std::complex +template <> +void apply_eigenvalues_op, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU *d, + const int &nbase, const int &nbase_x, const int ¬conv, + std::complex *result, const std::complex *vectors, const float *eigenvalues) { + apply_eigenvalues_complex_wrapper(d, nbase, nbase_x, notconv, eigenvalues, vectors, result); +} + +// Specialization for std::complex +template <> +void apply_eigenvalues_op, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU *d, + const int &nbase, const int &nbase_x, const int ¬conv, + std::complex *result, const std::complex *vectors, const double *eigenvalues) { + apply_eigenvalues_complex_wrapper(d, nbase, nbase_x, notconv, eigenvalues, vectors, result); +} + +template +__global__ void precondition_kernel( + const int dim, + const int notconv, + T* psi_iter, + const int nbase, + const typename GetTypeReal::type* precondition, // Real type + const typename GetTypeReal::type* eigenvalues) +{ + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < dim * notconv) { + const int i = tid % dim; // Basis index + const int m = tid / dim; // Band index + + using Real = typename GetTypeReal::type; + Real x = abs(precondition[i] - eigenvalues[m]); + Real pre = 0.5 * (1.0 + x + sqrt(1.0 + (x - 1.0) * (x - 1.0))); + psi_iter[(nbase + m) * dim + i] /= pre; + } +} + +// Specialization for double (for LCAO) +template <> +void precondition_op::operator()( + const base_device::DEVICE_GPU* ctx, + const int& dim, + double* psi_iter, + const int& nbase, + const int& notconv, + const double* precondition, + const double* eigenvalues) +{ + const int total_elements = dim * notconv; + const int threadsPerBlock = thread_per_block; + const int numBlocks = (total_elements + threadsPerBlock - 1) / threadsPerBlock; + + precondition_kernel<<>>( + dim, notconv, psi_iter, nbase, precondition, eigenvalues); + + cudaCheckOnDebug(); +} + +// Specialization for complex +template <> +void precondition_op, base_device::DEVICE_GPU>::operator()( + const base_device::DEVICE_GPU* ctx, + const int& dim, + std::complex* psi_iter, + const int& nbase, + const int& notconv, + const float* precondition, + const float* eigenvalues) +{ + const int total_elements = dim * notconv; + const int threadsPerBlock = thread_per_block; + const int numBlocks = (total_elements + threadsPerBlock - 1) / threadsPerBlock; + + precondition_kernel><<>>( + dim, notconv, + reinterpret_cast*>(psi_iter), + nbase, + precondition, // Already float + eigenvalues); + + cudaCheckOnDebug(); +} + +// Specialization for complex +template <> +void precondition_op, base_device::DEVICE_GPU>::operator()( + const base_device::DEVICE_GPU* ctx, + const int& dim, + std::complex* psi_iter, + const int& nbase, + const int& notconv, + const double* precondition, + const double* eigenvalues) +{ + const int total_elements = dim * notconv; + const int threadsPerBlock = thread_per_block; + const int numBlocks = (total_elements + threadsPerBlock - 1) / threadsPerBlock; + + precondition_kernel><<>>( + dim, notconv, + reinterpret_cast*>(psi_iter), + nbase, + precondition, // Already double + eigenvalues); + + cudaCheckOnDebug(); +} + +// First kernel: calculate norms +template +__global__ void calculate_norm_kernel( + const int dim, + const int notconv, + const T* psi_iter, + const int nbase, + typename GetTypeReal::type* psi_norm) +{ + using Real = typename GetTypeReal::type; + __shared__ Real shared_mem[256]; + + const int tid = threadIdx.x; + const int bid = blockIdx.x; + const int m = bid / ((dim + blockDim.x - 1) / blockDim.x); + const int num_blocks_per_band = (dim + blockDim.x - 1) / blockDim.x; + const int block_in_band = bid % num_blocks_per_band; + + if (m >= notconv) return; + + // Initialize shared memory + shared_mem[tid] = 0.0; + + // Each thread processes one element + const int i = tid + block_in_band * blockDim.x; + if (i < dim) { + shared_mem[tid] = abs2(psi_iter[(nbase + m) * dim + i]); + } + __syncthreads(); + + // Parallel reduction in shared memory + for (int stride = blockDim.x/2; stride > 0; stride >>= 1) { + if (tid < stride) { + shared_mem[tid] += shared_mem[tid + stride]; + } + __syncthreads(); + } + + // First thread of first block initializes norm to 0 + // if (tid == 0 && block_in_band == 0) { + // psi_norm[m] = 0.0; + // } + // __syncthreads(); + + // Write result for this block to global memory + if (tid == 0) { + atomicAdd(&psi_norm[m], shared_mem[0]); + } +} + +// Second kernel: normalize vectors +template +__global__ void normalize_vectors_kernel( + const int dim, + const int notconv, + T* psi_iter, + const int nbase, + const typename GetTypeReal::type* psi_norm) +{ + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + const int m = tid / dim; + const int i = tid % dim; + + if (m < notconv && i < dim) { + psi_iter[(nbase + m) * dim + i] = psi_iter[(nbase + m) * dim + i] / sqrt(psi_norm[m]); + } +} + + +// Specialization for complex +template <> +void normalize_op, base_device::DEVICE_GPU>::operator()( + const base_device::DEVICE_GPU* ctx, + const int& dim, + std::complex* psi_iter, + const int& nbase, + const int& notconv, + float* psi_norm) +{ + const int threadsPerBlock = 256; + const int numBlocksForNorm = ((dim + threadsPerBlock - 1) / threadsPerBlock) * notconv; + + // First kernel: calculate norms + calculate_norm_kernel><<>>( + dim, notconv, + reinterpret_cast*>(psi_iter), + nbase, psi_norm); + cudaDeviceSynchronize(); // Ensure all norms are computed + + // Second kernel: normalize vectors + const int total_elements = dim * notconv; + const int numBlocksForNormalize = (total_elements + threadsPerBlock - 1) / threadsPerBlock; + normalize_vectors_kernel><<>>( + dim, notconv, + reinterpret_cast*>(psi_iter), + nbase, psi_norm); + cudaCheckOnDebug(); +} + +// Specialization for complex +template <> +void normalize_op, base_device::DEVICE_GPU>::operator()( + const base_device::DEVICE_GPU* ctx, + const int& dim, + std::complex* psi_iter, + const int& nbase, + const int& notconv, + double* psi_norm) +{ + const int threadsPerBlock = 256; + const int numBlocksForNorm = ((dim + threadsPerBlock - 1) / threadsPerBlock) * notconv; + + // First kernel: calculate norms + calculate_norm_kernel><<>>( + dim, notconv, + reinterpret_cast*>(psi_iter), + nbase, psi_norm); + cudaDeviceSynchronize(); // Ensure all norms are computed + + // Second kernel: normalize vectors + const int total_elements = dim * notconv; + const int numBlocksForNormalize = (total_elements + threadsPerBlock - 1) / threadsPerBlock; + normalize_vectors_kernel><<>>( + dim, notconv, + reinterpret_cast*>(psi_iter), + nbase, psi_norm); + cudaCheckOnDebug(); +} + +// Specialization for double (for LCAO) +template <> +void normalize_op::operator()( + const base_device::DEVICE_GPU* ctx, + const int& dim, + double* psi_iter, + const int& nbase, + const int& notconv, + double* psi_norm) +{ + const int threadsPerBlock = 256; + const int numBlocksForNorm = ((dim + threadsPerBlock - 1) / threadsPerBlock) * notconv; + + // First kernel: calculate norms + calculate_norm_kernel<<>>( + dim, notconv, psi_iter, nbase, psi_norm); + cudaDeviceSynchronize(); // Ensure all norms are computed + + // Second kernel: normalize vectors + const int total_elements = dim * notconv; + const int numBlocksForNormalize = (total_elements + threadsPerBlock - 1) / threadsPerBlock; + normalize_vectors_kernel<<>>( + dim, notconv, psi_iter, nbase, psi_norm); + cudaCheckOnDebug(); +} // Explicitly instantiate functors for the types of functor registered. template struct dot_real_op, base_device::DEVICE_GPU>; @@ -1045,6 +1401,9 @@ template struct vector_mul_vector_op, base_device::DEVICE_GP template struct vector_div_vector_op, base_device::DEVICE_GPU>; template struct constantvector_addORsub_constantVector_op, base_device::DEVICE_GPU>; template struct matrixSetToAnother, base_device::DEVICE_GPU>; +template struct apply_eigenvalues_op, base_device::DEVICE_GPU>; +template struct precondition_op, base_device::DEVICE_GPU>; +template struct normalize_op, base_device::DEVICE_GPU>; template struct dot_real_op, base_device::DEVICE_GPU>; template struct calc_grad_with_block_op, base_device::DEVICE_GPU>; @@ -1054,6 +1413,9 @@ template struct vector_mul_vector_op, base_device::DEVICE_G template struct vector_div_vector_op, base_device::DEVICE_GPU>; template struct constantvector_addORsub_constantVector_op, base_device::DEVICE_GPU>; template struct matrixSetToAnother, base_device::DEVICE_GPU>; +template struct apply_eigenvalues_op, base_device::DEVICE_GPU>; +template struct precondition_op, base_device::DEVICE_GPU>; +template struct normalize_op, base_device::DEVICE_GPU>; #ifdef __LCAO template struct dot_real_op; @@ -1062,5 +1424,8 @@ template struct vector_mul_vector_op; template struct vector_div_vector_op; template struct matrixSetToAnother; template struct constantvector_addORsub_constantVector_op; +template struct apply_eigenvalues_op; +template struct precondition_op; +template struct normalize_op; #endif } // namespace hsolver diff --git a/source/module_hsolver/kernels/math_kernel_op.cpp b/source/module_hsolver/kernels/math_kernel_op.cpp index c3784949ab..87cd549482 100644 --- a/source/module_hsolver/kernels/math_kernel_op.cpp +++ b/source/module_hsolver/kernels/math_kernel_op.cpp @@ -2,6 +2,7 @@ #include #include +#include namespace hsolver { @@ -173,6 +174,51 @@ struct vector_mul_vector_op } }; +template +struct apply_eigenvalues_op +{ + using Real = typename GetTypeReal::type; + void operator()(const base_device::DEVICE_CPU* d, const int& nbase, const int& nbase_x, const int& notconv, T* result, const T* vectors, const Real* eigenvalues) + { + for (int m = 0; m < notconv; m++) + { + for (int idx = 0; idx < nbase; idx++) + { + result[m * nbase_x + idx] = eigenvalues[m] * vectors[m * nbase_x + idx]; + } + } + } +}; + +template +struct precondition_op { + using Real = typename GetTypeReal::type; + void operator()(const base_device::DEVICE_CPU* d, + const int& dim, + T* psi_iter, + const int& nbase, + const int& notconv, + const Real* precondition, + const Real* eigenvalues) + { + using Real = typename GetTypeReal::type; + std::vector pre(dim, 0.0); + for (int m = 0; m < notconv; m++) + { + for (size_t i = 0; i < dim; i++) + { + Real x = std::abs(precondition[i] - eigenvalues[m]); + pre[i] = 0.5 * (1.0 + x + sqrt(1 + (x - 1.0) * (x - 1.0))); + } + vector_div_vector_op()(d, + dim, + psi_iter + (nbase + m) * dim, + psi_iter + (nbase + m) * dim, + pre.data()); + } + } +}; + template struct vector_div_vector_op { @@ -355,6 +401,40 @@ struct matrixSetToAnother } }; +template +struct normalize_op { + void operator()(const base_device::DEVICE_CPU* d, + const int& dim, + T* psi_iter, + const int& nbase, + const int& notconv, + typename GetTypeReal::type* psi_norm = nullptr) + { + using Real = typename GetTypeReal::type; + for (int m = 0; m < notconv; m++) + { + // Calculate norm using dot_real_op + Real psi_m_norm = dot_real_op()(d, + dim, + psi_iter + (nbase + m) * dim, + psi_iter + (nbase + m) * dim, + true); + assert(psi_m_norm > 0.0); + psi_m_norm = sqrt(psi_m_norm); + + // Normalize using vector_div_constant_op + vector_div_constant_op()(d, + dim, + psi_iter + (nbase + m) * dim, + psi_iter + (nbase + m) * dim, + psi_m_norm); + if (psi_norm) { + psi_norm[m] = psi_m_norm; + } + } + } +}; + // Explicitly instantiate functors for the types of functor registered. template struct scal_op; template struct axpy_op, base_device::DEVICE_CPU>; @@ -365,6 +445,9 @@ template struct gemm_op; template struct dot_real_op, base_device::DEVICE_CPU>; template struct vector_div_constant_op, base_device::DEVICE_CPU>; template struct vector_mul_vector_op, base_device::DEVICE_CPU>; +template struct apply_eigenvalues_op, base_device::DEVICE_CPU>; +template struct precondition_op, base_device::DEVICE_CPU>; +template struct normalize_op, base_device::DEVICE_CPU>; template struct vector_div_vector_op, base_device::DEVICE_CPU>; template struct constantvector_addORsub_constantVector_op, base_device::DEVICE_CPU>; template struct matrixTranspose_op, base_device::DEVICE_CPU>; @@ -381,6 +464,9 @@ template struct gemm_op; template struct dot_real_op, base_device::DEVICE_CPU>; template struct vector_div_constant_op, base_device::DEVICE_CPU>; template struct vector_mul_vector_op, base_device::DEVICE_CPU>; +template struct apply_eigenvalues_op, base_device::DEVICE_CPU>; +template struct precondition_op, base_device::DEVICE_CPU>; +template struct normalize_op, base_device::DEVICE_CPU>; template struct vector_div_vector_op, base_device::DEVICE_CPU>; template struct constantvector_addORsub_constantVector_op, base_device::DEVICE_CPU>; template struct matrixTranspose_op, base_device::DEVICE_CPU>; @@ -392,11 +478,14 @@ template struct line_minimize_with_block_op, base_device::D template struct axpy_op; template struct dot_real_op; template struct vector_mul_vector_op; +template struct apply_eigenvalues_op; +template struct precondition_op; template struct vector_div_constant_op; template struct vector_div_vector_op; template struct matrixTranspose_op; template struct matrixSetToAnother; template struct constantvector_addORsub_constantVector_op; +template struct normalize_op; #endif #ifdef __DSP template struct gemm_op_mt, base_device::DEVICE_CPU>; diff --git a/source/module_hsolver/kernels/math_kernel_op.h b/source/module_hsolver/kernels/math_kernel_op.h index 0daf0e5718..8318c4310b 100644 --- a/source/module_hsolver/kernels/math_kernel_op.h +++ b/source/module_hsolver/kernels/math_kernel_op.h @@ -325,6 +325,48 @@ template struct matrixSetToAnother { T *B, const int &LDB); }; +template +struct apply_eigenvalues_op { + using Real = typename GetTypeReal::type; + + void operator()(const Device *d, const int &nbase, const int &nbase_x, const int ¬conv, + T *result, const T *vectors, const Real *eigenvalues); +}; + +template +struct precondition_op { + using Real = typename GetTypeReal::type; + void operator()(const Device* d, + const int& dim, + T* psi_iter, + const int& nbase, + const int& notconv, + const Real* precondition, + const Real* eigenvalues); +}; + +template +struct normalize_op { + using Real = typename GetTypeReal::type; + void operator()(const Device* d, + const int& dim, + T* psi_iter, + const int& nbase, + const int& notconv, + Real* psi_norm = nullptr); +}; + +template +struct normalize_op { + using Real = typename GetTypeReal::type; + void operator()(const base_device::DEVICE_GPU* d, + const int& dim, + T* psi_iter, + const int& nbase, + const int& notconv, + Real* psi_norm); +}; + #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM template @@ -393,6 +435,26 @@ template struct matrixSetToAnother { void createGpuBlasHandle(); void destoryBLAShandle(); +// vector operator: result[i] = -lambda[i] * vector[i] +template struct apply_eigenvalues_op { + using Real = typename GetTypeReal::type; + + void operator()(const base_device::DEVICE_GPU *d, const int &nbase, const int &nbase_x, const int ¬conv, + T *result, const T *vectors, const Real *eigenvalues); +}; + +template +struct precondition_op { + using Real = typename GetTypeReal::type; + void operator()(const base_device::DEVICE_GPU* d, + const int& dim, + T* psi_iter, + const int& nbase, + const int& notconv, + const Real* precondition, + const Real* eigenvalues); +}; + #endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM } // namespace hsolver diff --git a/source/module_hsolver/kernels/rocm/dngvd_op.hip.cu b/source/module_hsolver/kernels/rocm/dngvd_op.hip.cu index 5eeb1697e8..8cba06db1b 100644 --- a/source/module_hsolver/kernels/rocm/dngvd_op.hip.cu +++ b/source/module_hsolver/kernels/rocm/dngvd_op.hip.cu @@ -5,12 +5,25 @@ namespace hsolver { +// NOTE: mimicked from ../cuda/dngvd_op.cu for three dngvd_op + +static hipsolverHandle_t hipsolver_H = nullptr; +// Test on DCU platform. When nstart is greater than 234, code on DCU performs better. +const int N_DCU = 234; + void createGpuSolverHandle() { - return; + if (hipsolver_H == nullptr) + { + hipsolverErrcheck(hipsolverCreate(&hipsolver_H)); + } } void destroyGpuSolverHandle() { - return; + if (hipsolver_H != nullptr) + { + hipsolverErrcheck(hipsolverDestroy(hipsolver_H)); + hipsolver_H = nullptr; + } } #ifdef __LCAO @@ -23,22 +36,68 @@ void dngvd_op::operator()(const base_device::DE double* _eigenvalue, double* _vcc) { - std::vector hcc(nstart * nstart, 0.0); - std::vector scc(nstart * nstart, 0.0); - std::vector vcc(nstart * nstart, 0.0); - std::vector eigenvalue(nstart, 0); - hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(double) * hcc.size(), hipMemcpyDeviceToHost)); - hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(double) * scc.size(), hipMemcpyDeviceToHost)); - base_device::DEVICE_CPU* cpu_ctx = {}; - dngvd_op()(cpu_ctx, - nstart, - ldh, - hcc.data(), - scc.data(), - eigenvalue.data(), - vcc.data()); - hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(double) * vcc.size(), hipMemcpyHostToDevice)); - hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(double) * eigenvalue.size(), hipMemcpyHostToDevice)); + // copied from ../cuda/dngvd_op.cu, "dngvd_op" + assert(nstart == ldh); + + if (nstart > N_DCU){ + hipErrcheck(hipMemcpy(_vcc, _hcc, sizeof(double) * ldh * nstart, hipMemcpyDeviceToDevice)); + // now vcc contains hcc + + // prepare some values for hipsolverDnZhegvd_bufferSize + int * devInfo = nullptr; + int lwork = 0, info_gpu = 0; + double * work = nullptr; + hipErrcheck(hipMalloc((void**)&devInfo, sizeof(int))); + hipsolverFillMode_t uplo = HIPSOLVER_FILL_MODE_UPPER; + + // calculate the sizes needed for pre-allocated buffer. + hipsolverErrcheck(hipsolverDnDsygvd_bufferSize( + hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo, + nstart, + _vcc, ldh, + _scc, ldh, + _eigenvalue, + &lwork)); + + // allocate memery + hipErrcheck(hipMalloc((void**)&work, sizeof(double) * lwork)); + + // compute eigenvalues and eigenvectors. + hipsolverErrcheck(hipsolverDnDsygvd( + hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo, + nstart, + _vcc, ldh, + const_cast(_scc), ldh, + _eigenvalue, + work, lwork, devInfo)); + + hipErrcheck(hipMemcpy(&info_gpu, devInfo, sizeof(int), hipMemcpyDeviceToHost)); + + // free the buffer + hipErrcheck(hipFree(work)); + hipErrcheck(hipFree(devInfo)); + } + // if(fail_info != nullptr) *fail_info = info_gpu; + else{ + std::vector hcc(nstart * nstart, 0.0); + std::vector scc(nstart * nstart, 0.0); + std::vector vcc(nstart * nstart, 0.0); + std::vector eigenvalue(nstart, 0); + hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(double) * hcc.size(), hipMemcpyDeviceToHost)); + hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(double) * scc.size(), hipMemcpyDeviceToHost)); + base_device::DEVICE_CPU* cpu_ctx = {}; + dngvd_op()(cpu_ctx, + nstart, + ldh, + hcc.data(), + scc.data(), + eigenvalue.data(), + vcc.data()); + hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(double) * vcc.size(), hipMemcpyHostToDevice)); + hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(double) * eigenvalue.size(), hipMemcpyHostToDevice)); + } + + } #endif // __LCAO @@ -51,22 +110,67 @@ void dngvd_op, base_device::DEVICE_GPU>::operator()(const ba float* _eigenvalue, std::complex* _vcc) { - std::vector> hcc(nstart * nstart, {0, 0}); - std::vector> scc(nstart * nstart, {0, 0}); - std::vector> vcc(nstart * nstart, {0, 0}); - std::vector eigenvalue(nstart, 0); - hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(std::complex) * hcc.size(), hipMemcpyDeviceToHost)); - hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(std::complex) * scc.size(), hipMemcpyDeviceToHost)); - base_device::DEVICE_CPU* cpu_ctx = {}; - dngvd_op, base_device::DEVICE_CPU>()(cpu_ctx, - nstart, - ldh, - hcc.data(), - scc.data(), - eigenvalue.data(), - vcc.data()); - hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(std::complex) * vcc.size(), hipMemcpyHostToDevice)); - hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(float) * eigenvalue.size(), hipMemcpyHostToDevice)); + // copied from ../cuda/dngvd_op.cu, "dngvd_op" + assert(nstart == ldh); + + if (nstart > N_DCU){ + hipErrcheck(hipMemcpy(_vcc, _hcc, sizeof(std::complex) * ldh * nstart, hipMemcpyDeviceToDevice)); + // now vcc contains hcc + + // prepare some values for hipsolverDnZhegvd_bufferSize + int * devInfo = nullptr; + int lwork = 0, info_gpu = 0; + float2 * work = nullptr; + hipErrcheck(hipMalloc((void**)&devInfo, sizeof(int))); + hipsolverFillMode_t uplo = HIPSOLVER_FILL_MODE_UPPER; + + // calculate the sizes needed for pre-allocated buffer. + hipsolverErrcheck(hipsolverDnChegvd_bufferSize( + hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo, + nstart, + reinterpret_cast(_vcc), ldh, + reinterpret_cast(_scc), ldh, + _eigenvalue, + &lwork)); + + // allocate memery + hipErrcheck(hipMalloc((void**)&work, sizeof(float2) * lwork)); + + // compute eigenvalues and eigenvectors. + hipsolverErrcheck(hipsolverDnChegvd( + hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo, + nstart, + reinterpret_cast(_vcc), ldh, + const_cast(reinterpret_cast(_scc)), ldh, + _eigenvalue, + work, lwork, devInfo)); + + hipErrcheck(hipMemcpy(&info_gpu, devInfo, sizeof(int), hipMemcpyDeviceToHost)); + // free the buffer + hipErrcheck(hipFree(work)); + hipErrcheck(hipFree(devInfo)); + } + // if(fail_info != nullptr) *fail_info = info_gpu; + else{ + std::vector> hcc(nstart * nstart, {0, 0}); + std::vector> scc(nstart * nstart, {0, 0}); + std::vector> vcc(nstart * nstart, {0, 0}); + std::vector eigenvalue(nstart, 0); + hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(std::complex) * hcc.size(), hipMemcpyDeviceToHost)); + hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(std::complex) * scc.size(), hipMemcpyDeviceToHost)); + base_device::DEVICE_CPU* cpu_ctx = {}; + dngvd_op, base_device::DEVICE_CPU>()(cpu_ctx, + nstart, + ldh, + hcc.data(), + scc.data(), + eigenvalue.data(), + vcc.data()); + hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(std::complex) * vcc.size(), hipMemcpyHostToDevice)); + hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(float) * eigenvalue.size(), hipMemcpyHostToDevice)); + } + + } template <> @@ -76,24 +180,80 @@ void dngvd_op, base_device::DEVICE_GPU>::operator()(const b const std::complex* _hcc, const std::complex* _scc, double* _eigenvalue, - std::complex* _vcc) + std::complex* _vcc + ) { - std::vector> hcc(nstart * nstart, {0, 0}); - std::vector> scc(nstart * nstart, {0, 0}); - std::vector> vcc(nstart * nstart, {0, 0}); - std::vector eigenvalue(nstart, 0); - hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(std::complex) * hcc.size(), hipMemcpyDeviceToHost)); - hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(std::complex) * scc.size(), hipMemcpyDeviceToHost)); - base_device::DEVICE_CPU* cpu_ctx = {}; - dngvd_op, base_device::DEVICE_CPU>()(cpu_ctx, - nstart, - ldh, - hcc.data(), - scc.data(), - eigenvalue.data(), - vcc.data()); - hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(std::complex) * vcc.size(), hipMemcpyHostToDevice)); - hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(double) * eigenvalue.size(), hipMemcpyHostToDevice)); + // copied from ../cuda/dngvd_op.cu, "dngvd_op" + assert(nstart == ldh); + + // save a copy of scc in case the diagonalization fails + if (nstart > N_DCU){ + std::vector> scc(nstart * nstart, {0, 0}); + hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(std::complex) * scc.size(), hipMemcpyDeviceToHost)); + + hipErrcheck(hipMemcpy(_vcc, _hcc, sizeof(std::complex) * ldh * nstart, hipMemcpyDeviceToDevice)); + + // now vcc contains hcc + + // prepare some values for hipsolverDnZhegvd_bufferSize + int * devInfo = nullptr; + int lwork = 0, info_gpu = 0; + double2 * work = nullptr; + hipErrcheck(hipMalloc((void**)&devInfo, sizeof(int))); + hipsolverFillMode_t uplo = HIPSOLVER_FILL_MODE_UPPER; + + // calculate the sizes needed for pre-allocated buffer. + hipsolverErrcheck(hipsolverDnZhegvd_bufferSize( + hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo, + nstart, + reinterpret_cast(_vcc), ldh, + reinterpret_cast(_scc), ldh, + _eigenvalue, + &lwork)); + + // allocate memery + hipErrcheck(hipMalloc((void**)&work, sizeof(double2) * lwork)); + + // compute eigenvalues and eigenvectors. + hipsolverErrcheck(hipsolverDnZhegvd( + hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo, + nstart, + reinterpret_cast(_vcc), ldh, + const_cast(reinterpret_cast(_scc)), ldh, + _eigenvalue, + work, lwork, devInfo)); + + hipErrcheck(hipMemcpy(&info_gpu, devInfo, sizeof(int), hipMemcpyDeviceToHost)); + // free the buffer + hipErrcheck(hipFree(work)); + hipErrcheck(hipFree(devInfo)); + } + // if(fail_info != nullptr) *fail_info = info_gpu; + else{ + std::vector> hcc(nstart * nstart, {0, 0}); + std::vector> scc(nstart * nstart, {0, 0}); + std::vector> vcc(nstart * nstart, {0, 0}); + std::vector eigenvalue(nstart, 0); + hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(std::complex) * hcc.size(), hipMemcpyDeviceToHost)); + hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(std::complex) * scc.size(), hipMemcpyDeviceToHost)); + base_device::DEVICE_CPU* cpu_ctx = {}; + dngvd_op, base_device::DEVICE_CPU>()(cpu_ctx, + nstart, + ldh, + hcc.data(), + scc.data(), + eigenvalue.data(), + vcc.data()); + hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(std::complex) * vcc.size(), hipMemcpyHostToDevice)); + hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(double) * eigenvalue.size(), hipMemcpyHostToDevice)); + } + + + + + + + } #ifdef __LCAO diff --git a/source/module_hsolver/kernels/rocm/math_kernel_op.hip.cu b/source/module_hsolver/kernels/rocm/math_kernel_op.hip.cu index ef5a1c1ece..4ca6af509c 100644 --- a/source/module_hsolver/kernels/rocm/math_kernel_op.hip.cu +++ b/source/module_hsolver/kernels/rocm/math_kernel_op.hip.cu @@ -11,6 +11,7 @@ #define WARP_SIZE 32 #define FULL_MASK 0xffffffff #define THREAD_PER_BLOCK 256 + template <> struct GetTypeReal> { using type = float; /**< The return type specialization for std::complex. */ @@ -20,6 +21,49 @@ struct GetTypeReal> { using type = double; /**< The return type specialization for std::complex. */ }; +// Forward declarations for abs2 +template +__device__ typename GetTypeReal::type abs2(const T& x); + +// Specialization for real types (double) +template<> +__device__ double abs2(const double& x) { + return x * x; +} + +// Specialization for real types (float) +template<> +__device__ float abs2(const float& x) { + return x * x; +} + +// Specialization for complex float +template<> +__device__ float abs2(const thrust::complex& x) { + return x.real() * x.real() + x.imag() * x.imag(); +} + +// Specialization for complex double +template<> +__device__ double abs2(const thrust::complex& x) { + return x.real() * x.real() + x.imag() * x.imag(); +} + +// Specialization for std::complex (for interface compatibility) +template<> +__device__ float abs2(const std::complex& x) { + const thrust::complex* tx = reinterpret_cast*>(&x); + return tx->real() * tx->real() + tx->imag() * tx->imag(); +} + +// Specialization for std::complex (for interface compatibility) +template<> +__device__ double abs2(const std::complex& x) { + const thrust::complex* tx = reinterpret_cast*>(&x); + return tx->real() * tx->real() + tx->imag() * tx->imag(); +} + + namespace hsolver { template @@ -943,7 +987,309 @@ void matrixSetToAnother, base_device::DEVICE_GPU>::operator hipCheckOnDebug(); } +template +__launch_bounds__(256) +__global__ void apply_eigenvalues_kernel( + const typename GetTypeReal::type* eigenvalues, + const T* vectors, + T* result, + const int nbase, + const int nbase_x, + const int notconv) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + const int total_elements = notconv * nbase; + if (i < total_elements) + { + const int row = i / nbase; + const int col = i % nbase; + result[row * nbase_x + col] = eigenvalues[row] * vectors[row * nbase_x + col]; + } +} + +template +inline void apply_eigenvalues_complex_wrapper(const base_device::DEVICE_GPU* d, + const int& nbase, + const int& nbase_x, + const int& notconv, + const FPTYPE* eigenvalues, + const std::complex* vectors, + std::complex* result) +{ + thrust::complex* result_tmp = reinterpret_cast*>(result); + const thrust::complex* vectors_tmp = reinterpret_cast*>(vectors); + const int total_elements = notconv * nbase; + int thread = 256; + int block = (total_elements + thread - 1) / thread; + hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_eigenvalues_kernel>), dim3(block), dim3(thread), 0, 0, + eigenvalues, vectors_tmp, result_tmp, nbase, nbase_x, notconv); + + hipCheckOnDebug(); +} + +template <> +void apply_eigenvalues_op::operator()(const base_device::DEVICE_GPU* d, + const int& nbase, + const int& nbase_x, + const int& notconv, + double* result, + const double* vectors, + const double* eigenvalues) +{ + const int total_elements = notconv * nbase; + int thread = 256; + int block = (total_elements + thread - 1) / thread; + hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_eigenvalues_kernel), dim3(block), dim3(thread), 0, 0, + eigenvalues, vectors, result, nbase, nbase_x, notconv); + + hipCheckOnDebug(); +} +template <> +void apply_eigenvalues_op, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* d, + const int& nbase, + const int& nbase_x, + const int& notconv, + std::complex* result, + const std::complex* vectors, + const float* eigenvalues) +{ + apply_eigenvalues_complex_wrapper(d, nbase, nbase_x, notconv, eigenvalues, vectors, result); +} + +template <> +void apply_eigenvalues_op, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* d, + const int& nbase, + const int& nbase_x, + const int& notconv, + std::complex* result, + const std::complex* vectors, + const double* eigenvalues) +{ + apply_eigenvalues_complex_wrapper(d, nbase, nbase_x, notconv, eigenvalues, vectors, result); +} + +template +__launch_bounds__(256) +__global__ void precondition_kernel( + const int dim, + const int notconv, + T* psi_iter, + const int nbase, + const typename GetTypeReal::type* precondition, // Real type + const typename GetTypeReal::type* eigenvalues) +{ + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < dim * notconv) { + const int i = tid % dim; // Basis index + const int m = tid / dim; // Band index + + using Real = typename GetTypeReal::type; + Real x = abs(precondition[i] - eigenvalues[m]); + Real pre = 0.5 * (1.0 + x + sqrt(1.0 + (x - 1.0) * (x - 1.0))); + psi_iter[(nbase + m) * dim + i] /= pre; + } +} + +template <> +void precondition_op::operator()( + const base_device::DEVICE_GPU* ctx, + const int& dim, + double* psi_iter, + const int& nbase, + const int& notconv, + const double* precondition, + const double* eigenvalues) +{ + const int total_elements = dim * notconv; + int thread = 256; + int block = (total_elements + thread - 1) / thread; + + hipLaunchKernelGGL(HIP_KERNEL_NAME(precondition_kernel), dim3(block), dim3(thread), 0, 0, + dim, notconv, psi_iter, nbase, precondition, eigenvalues); + + hipCheckOnDebug(); +} + +template <> +void precondition_op, base_device::DEVICE_GPU>::operator()( + const base_device::DEVICE_GPU* ctx, + const int& dim, + std::complex* psi_iter, + const int& nbase, + const int& notconv, + const float* precondition, + const float* eigenvalues) +{ + const int total_elements = dim * notconv; + int thread = 256; + int block = (total_elements + thread - 1) / thread; + + hipLaunchKernelGGL(HIP_KERNEL_NAME(precondition_kernel>), dim3(block), dim3(thread), 0, 0, + dim, notconv, reinterpret_cast*>(psi_iter), nbase, precondition, eigenvalues); + + hipCheckOnDebug(); +} + +template <> +void precondition_op, base_device::DEVICE_GPU>::operator()( + const base_device::DEVICE_GPU* ctx, + const int& dim, + std::complex* psi_iter, + const int& nbase, + const int& notconv, + const double* precondition, + const double* eigenvalues) +{ + const int total_elements = dim * notconv; + int thread = 256; + int block = (total_elements + thread - 1) / thread; + + hipLaunchKernelGGL(HIP_KERNEL_NAME(precondition_kernel>), dim3(block), dim3(thread), 0, 0, + dim, notconv, reinterpret_cast*>(psi_iter), nbase, precondition, eigenvalues); + + hipCheckOnDebug(); +} + +template +__launch_bounds__(256) +__global__ void calculate_norm_kernel( + const int dim, + const int notconv, + const T* psi_iter, + const int nbase, + typename GetTypeReal::type* psi_norm) +{ + using Real = typename GetTypeReal::type; + __shared__ Real shared_mem[256]; + + const int tid = threadIdx.x; + const int bid = blockIdx.x; + const int m = bid / ((dim + blockDim.x - 1) / blockDim.x); + const int num_blocks_per_band = (dim + blockDim.x - 1) / blockDim.x; + const int block_in_band = bid % num_blocks_per_band; + + if (m >= notconv) return; + + // Initialize shared memory + shared_mem[tid] = 0.0; + + // Each thread processes one element + const int i = tid + block_in_band * blockDim.x; + if (i < dim) { + shared_mem[tid] = abs2(psi_iter[(nbase + m) * dim + i]); + } + __syncthreads(); + + // Parallel reduction in shared memory + for (int stride = blockDim.x/2; stride > 0; stride >>= 1) { + if (tid < stride) { + shared_mem[tid] += shared_mem[tid + stride]; + } + __syncthreads(); + } + + // Write result for this block to global memory + if (tid == 0) { + atomicAdd(&psi_norm[m], shared_mem[0]); + } +} + +template +__launch_bounds__(256) +__global__ void normalize_vectors_kernel( + const int dim, + const int notconv, + T* psi_iter, + const int nbase, + const typename GetTypeReal::type* psi_norm) +{ + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + const int m = tid / dim; + const int i = tid % dim; + + if (m < notconv && i < dim) { + psi_iter[(nbase + m) * dim + i] = psi_iter[(nbase + m) * dim + i] / sqrt(psi_norm[m]); + } +} + +template <> +void normalize_op::operator()( + const base_device::DEVICE_GPU* ctx, + const int& dim, + double* psi_iter, + const int& nbase, + const int& notconv, + double* psi_norm) +{ + const int thread = 256; + const int numBlocksForNorm = ((dim + thread - 1) / thread) * notconv; + + // First kernel: calculate norms + hipLaunchKernelGGL(HIP_KERNEL_NAME(calculate_norm_kernel), dim3(numBlocksForNorm), dim3(thread), 0, 0, + dim, notconv, psi_iter, nbase, psi_norm); + hipDeviceSynchronize(); // Ensure all norms are computed + + // Second kernel: normalize vectors + const int total_elements = dim * notconv; + const int numBlocksForNormalize = (total_elements + thread - 1) / thread; + hipLaunchKernelGGL(HIP_KERNEL_NAME(normalize_vectors_kernel), dim3(numBlocksForNormalize), dim3(thread), 0, 0, + dim, notconv, psi_iter, nbase, psi_norm); + + hipCheckOnDebug(); +} + +template <> +void normalize_op, base_device::DEVICE_GPU>::operator()( + const base_device::DEVICE_GPU* ctx, + const int& dim, + std::complex* psi_iter, + const int& nbase, + const int& notconv, + float* psi_norm) +{ + const int thread = 256; + const int numBlocksForNorm = ((dim + thread - 1) / thread) * notconv; + + // First kernel: calculate norms + hipLaunchKernelGGL(HIP_KERNEL_NAME(calculate_norm_kernel>), dim3(numBlocksForNorm), dim3(thread), 0, 0, + dim, notconv, reinterpret_cast*>(psi_iter), nbase, psi_norm); + hipDeviceSynchronize(); // Ensure all norms are computed + + // Second kernel: normalize vectors + const int total_elements = dim * notconv; + const int numBlocksForNormalize = (total_elements + thread - 1) / thread; + hipLaunchKernelGGL(HIP_KERNEL_NAME(normalize_vectors_kernel>), dim3(numBlocksForNormalize), dim3(thread), 0, 0, + dim, notconv, reinterpret_cast*>(psi_iter), nbase, psi_norm); + + hipCheckOnDebug(); +} + +template <> +void normalize_op, base_device::DEVICE_GPU>::operator()( + const base_device::DEVICE_GPU* ctx, + const int& dim, + std::complex* psi_iter, + const int& nbase, + const int& notconv, + double* psi_norm) +{ + const int thread = 256; + const int numBlocksForNorm = ((dim + thread - 1) / thread) * notconv; + + // First kernel: calculate norms + hipLaunchKernelGGL(HIP_KERNEL_NAME(calculate_norm_kernel>), dim3(numBlocksForNorm), dim3(thread), 0, 0, + dim, notconv, reinterpret_cast*>(psi_iter), nbase, psi_norm); + hipDeviceSynchronize(); // Ensure all norms are computed + + // Second kernel: normalize vectors + const int total_elements = dim * notconv; + const int numBlocksForNormalize = (total_elements + thread - 1) / thread; + hipLaunchKernelGGL(HIP_KERNEL_NAME(normalize_vectors_kernel>), dim3(numBlocksForNormalize), dim3(thread), 0, 0, + dim, notconv, reinterpret_cast*>(psi_iter), nbase, psi_norm); + + hipCheckOnDebug(); +} // Explicitly instantiate functors for the types of functor registered. template struct dot_real_op, base_device::DEVICE_GPU>; @@ -951,6 +1297,9 @@ template struct calc_grad_with_block_op, base_device::DEVICE template struct line_minimize_with_block_op, base_device::DEVICE_GPU>; template struct vector_div_constant_op, base_device::DEVICE_GPU>; template struct vector_mul_vector_op, base_device::DEVICE_GPU>; +template struct apply_eigenvalues_op, base_device::DEVICE_GPU>; +template struct precondition_op, base_device::DEVICE_GPU>; +template struct normalize_op, base_device::DEVICE_GPU>; template struct vector_div_vector_op, base_device::DEVICE_GPU>; template struct constantvector_addORsub_constantVector_op, base_device::DEVICE_GPU>; template struct matrixSetToAnother, base_device::DEVICE_GPU>; @@ -960,6 +1309,9 @@ template struct calc_grad_with_block_op, base_device::DEVIC template struct line_minimize_with_block_op, base_device::DEVICE_GPU>; template struct vector_div_constant_op, base_device::DEVICE_GPU>; template struct vector_mul_vector_op, base_device::DEVICE_GPU>; +template struct apply_eigenvalues_op, base_device::DEVICE_GPU>; +template struct precondition_op, base_device::DEVICE_GPU>; +template struct normalize_op, base_device::DEVICE_GPU>; template struct vector_div_vector_op, base_device::DEVICE_GPU>; template struct constantvector_addORsub_constantVector_op, base_device::DEVICE_GPU>; template struct matrixSetToAnother, base_device::DEVICE_GPU>; @@ -968,8 +1320,11 @@ template struct matrixSetToAnother, base_device::DEVICE_GPU template struct dot_real_op; template struct vector_div_constant_op; template struct vector_mul_vector_op; +template struct apply_eigenvalues_op; +template struct precondition_op; template struct vector_div_vector_op; template struct matrixSetToAnother; template struct constantvector_addORsub_constantVector_op; +template struct normalize_op; #endif } // namespace hsolver diff --git a/source/module_hsolver/test/test_hsolver_pw.cpp b/source/module_hsolver/test/test_hsolver_pw.cpp index 5763f1b7d8..1345dfc99e 100644 --- a/source/module_hsolver/test/test_hsolver_pw.cpp +++ b/source/module_hsolver/test/test_hsolver_pw.cpp @@ -14,6 +14,68 @@ #include "module_hsolver/hsolver_pw.h" #undef private #undef protected + +// Mock implementations for the template functions causing linking errors +namespace ModulePW { + // Mock implementation for recip_to_real + template + void PW_Basis_K::recip_to_real(const Device* ctx, + const std::complex* in, + std::complex* out, + const int ik, + const bool add, + const FPTYPE factor) const + { + // Simple mock implementation that does nothing + // In a real test, you might want to implement behavior that simulates the actual function + } + + // Mock implementation for real_to_recip + template + void PW_Basis_K::real_to_recip(const Device* ctx, + const std::complex* in, + std::complex* out, + const int ik, + const bool add, + const FPTYPE factor) const + { + // Simple mock implementation that does nothing + } + + // Explicit template instantiations + template void PW_Basis_K::recip_to_real( + const base_device::DEVICE_CPU* ctx, + const std::complex* in, + std::complex* out, + const int ik, + const bool add, + const float factor) const; + + template void PW_Basis_K::recip_to_real( + const base_device::DEVICE_CPU* ctx, + const std::complex* in, + std::complex* out, + const int ik, + const bool add, + const double factor) const; + + template void PW_Basis_K::real_to_recip( + const base_device::DEVICE_CPU* ctx, + const std::complex* in, + std::complex* out, + const int ik, + const bool add, + const float factor) const; + + template void PW_Basis_K::real_to_recip( + const base_device::DEVICE_CPU* ctx, + const std::complex* in, + std::complex* out, + const int ik, + const bool add, + const double factor) const; +} + /************************************************ * unit test of HSolverPW class ***********************************************/ diff --git a/source/module_hsolver/test/test_hsolver_sdft.cpp b/source/module_hsolver/test/test_hsolver_sdft.cpp index 642ee9daae..235c664a78 100644 --- a/source/module_hsolver/test/test_hsolver_sdft.cpp +++ b/source/module_hsolver/test/test_hsolver_sdft.cpp @@ -133,6 +133,66 @@ void Stochastic_Iter::cal_storho(const UnitCell& ucell, Charge::Charge(){}; Charge::~Charge(){}; +// Mock implementations for the template functions causing linking errors +namespace ModulePW { + // Mock implementation for recip_to_real + template + void PW_Basis_K::recip_to_real(const Device* ctx, + const std::complex* in, + std::complex* out, + const int ik, + const bool add, + const FPTYPE factor) const + { + // Simple mock implementation that does nothing + // In a real test, you might want to implement behavior that simulates the actual function + } + + // Mock implementation for real_to_recip + template + void PW_Basis_K::real_to_recip(const Device* ctx, + const std::complex* in, + std::complex* out, + const int ik, + const bool add, + const FPTYPE factor) const + { + // Simple mock implementation that does nothing + } + + // Explicit template instantiations + template void PW_Basis_K::recip_to_real( + const base_device::DEVICE_CPU* ctx, + const std::complex* in, + std::complex* out, + const int ik, + const bool add, + const float factor) const; + + template void PW_Basis_K::recip_to_real( + const base_device::DEVICE_CPU* ctx, + const std::complex* in, + std::complex* out, + const int ik, + const bool add, + const double factor) const; + + template void PW_Basis_K::real_to_recip( + const base_device::DEVICE_CPU* ctx, + const std::complex* in, + std::complex* out, + const int ik, + const bool add, + const float factor) const; + + template void PW_Basis_K::real_to_recip( + const base_device::DEVICE_CPU* ctx, + const std::complex* in, + std::complex* out, + const int ik, + const bool add, + const double factor) const; +} /************************************************ * unit test of HSolverPW_SDFT class ***********************************************/ diff --git a/source/module_io/read_input_item_elec_stru.cpp b/source/module_io/read_input_item_elec_stru.cpp index 73c741bb5f..19d5b2f1c2 100644 --- a/source/module_io/read_input_item_elec_stru.cpp +++ b/source/module_io/read_input_item_elec_stru.cpp @@ -346,6 +346,26 @@ void ReadInput::item_elec_stru() read_sync_bool(input.diago_smooth_ethr); this->add_item(item); } + { + Input_Item item("use_k_continuity"); + item.annotation = "whether to use k-point continuity for initializing wave functions"; + read_sync_bool(input.use_k_continuity); + item.check_value = [](const Input_Item& item, const Parameter& para) { + if (para.input.use_k_continuity && para.input.basis_type != "pw") { + ModuleBase::WARNING_QUIT("ReadInput", "use_k_continuity only works for PW basis"); + } + if (para.input.use_k_continuity && para.input.calculation == "nscf") { + ModuleBase::WARNING_QUIT("ReadInput", "use_k_continuity cannot work for NSCF calculation"); + } + if (para.input.use_k_continuity && para.input.nspin == 2) { + ModuleBase::WARNING_QUIT("ReadInput", "use_k_continuity cannot work for spin-polarized calculation"); + } + if (para.input.use_k_continuity && para.input.esolver_type == "sdft") { + ModuleBase::WARNING_QUIT("ReadInput", "use_k_continuity cannot work for SDFT calculation"); + } + }; + this->add_item(item); + } { Input_Item item("pw_diag_ndim"); item.annotation = "dimension of workspace for Davidson diagonalization"; diff --git a/source/module_parameter/input_parameter.h b/source/module_parameter/input_parameter.h index 0bb1979318..aa7d31309c 100644 --- a/source/module_parameter/input_parameter.h +++ b/source/module_parameter/input_parameter.h @@ -90,6 +90,7 @@ struct Input_para bool diago_smooth_ethr = false; ///< smooth ethr for iter methods int pw_diag_ndim = 4; ///< dimension of workspace for Davidson diagonalization int diago_cg_prec = 1; ///< mohan add 2012-03-31 + bool use_k_continuity = false; ///< whether to use k-point continuity for initializing wave functions std::string smearing_method = "gauss"; ///< "gauss", ///< "mp","methfessel-paxton" diff --git a/source/module_psi/psi_init.cpp b/source/module_psi/psi_init.cpp index 201539a54f..a3d59d4e7b 100644 --- a/source/module_psi/psi_init.cpp +++ b/source/module_psi/psi_init.cpp @@ -265,6 +265,7 @@ void PSIInit::initialize_psi(Psi>* psi, { for (int ik = 0; ik < this->pw_wfc->nks; ++ik) { + if(PARAM.inp.use_k_continuity && ik > 0) continue; //! Update Hamiltonian from other kpoint to the given one p_hamilt->updateHk(ik); @@ -316,4 +317,4 @@ template class PSIInit, base_device::DEVICE_CPU>; template class PSIInit, base_device::DEVICE_GPU>; template class PSIInit, base_device::DEVICE_GPU>; #endif -} // namespace psi \ No newline at end of file +} // namespace psi From c605cae89309e29120822443160943f6036bd41c Mon Sep 17 00:00:00 2001 From: Jie Date: Mon, 26 May 2025 15:56:44 +0800 Subject: [PATCH 2/6] implement gpu op for sincos loops --- .../kernels/cuda/force_sincos_op.cu | 204 ++++++++++++++++++ .../hamilt_pwdft/kernels/force_sincos_op.cpp | 123 +++++++++++ .../hamilt_pwdft/kernels/force_sincos_op.h | 106 +++++++++ .../kernels/rocm/force_sincos_op.hip.cu | 203 +++++++++++++++++ 4 files changed, 636 insertions(+) create mode 100644 source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_sincos_op.cu create mode 100644 source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.cpp create mode 100644 source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.h create mode 100644 source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_sincos_op.hip.cu diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_sincos_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_sincos_op.cu new file mode 100644 index 0000000000..f97214893e --- /dev/null +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_sincos_op.cu @@ -0,0 +1,204 @@ +#include "module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.h" +#include "module_base/module_device/types.h" + +#include +#include +#include +#include + +#define THREADS_PER_BLOCK 256 + +namespace hamilt { + +// CUDA kernels for sincos loops +template +__global__ void cal_force_loc_sincos_kernel( + const int nat, + const int npw, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* vloc_factors, + const thrust::complex* aux, + const FPTYPE scale_factor, + FPTYPE* force) +{ + const FPTYPE TWO_PI = 2.0 * M_PI; + + const int iat = blockIdx.y; + const int ig_start = blockIdx.x * blockDim.x + threadIdx.x; + const int ig_stride = gridDim.x * blockDim.x; + + if (iat >= nat) return; + + // Load atom information to registers + const int it = iat2it[iat]; + const FPTYPE tau_x = tau[iat * 3 + 0]; + const FPTYPE tau_y = tau[iat * 3 + 1]; + const FPTYPE tau_z = tau[iat * 3 + 2]; + + // Local accumulation variables + FPTYPE local_force_x = 0.0; + FPTYPE local_force_y = 0.0; + FPTYPE local_force_z = 0.0; + + // Grid-stride loop over G-vectors + for (int ig = ig_start; ig < npw; ig += ig_stride) { + // Calculate phase: 2π * (G · τ) + const FPTYPE phase = TWO_PI * ( + gcar[ig * 3 + 0] * tau_x + + gcar[ig * 3 + 1] * tau_y + + gcar[ig * 3 + 2] * tau_z + ); + + // Use CUDA intrinsic for sincos + FPTYPE sinp, cosp; + __sincos(phase, &sinp, &cosp); + + // Calculate force factor + const FPTYPE factor = vloc_factors[ig] * + (cosp * aux[ig].imag() + sinp * aux[ig].real()); + + // Accumulate force contributions + local_force_x += gcar[ig * 3 + 0] * factor; + local_force_y += gcar[ig * 3 + 1] * factor; + local_force_z += gcar[ig * 3 + 2] * factor; + } + + // Atomic add to global memory + atomicAdd(&force[iat * 3 + 0], local_force_x * scale_factor); + atomicAdd(&force[iat * 3 + 1], local_force_y * scale_factor); + atomicAdd(&force[iat * 3 + 2], local_force_z * scale_factor); +} + +template +__global__ void cal_force_ew_sincos_kernel( + const int nat, + const int npw, + const int ig_gge0, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* it_facts, + const thrust::complex* aux, + FPTYPE* force) +{ + const FPTYPE TWO_PI = 2.0 * M_PI; + + const int iat = blockIdx.y; + const int ig_start = blockIdx.x * blockDim.x + threadIdx.x; + const int ig_stride = gridDim.x * blockDim.x; + + if (iat >= nat) return; + + // Load atom information to registers + const int it = iat2it[iat]; + const FPTYPE tau_x = tau[iat * 3 + 0]; + const FPTYPE tau_y = tau[iat * 3 + 1]; + const FPTYPE tau_z = tau[iat * 3 + 2]; + const FPTYPE it_fact = it_facts[iat]; + + // Local accumulation variables + FPTYPE local_force_x = 0.0; + FPTYPE local_force_y = 0.0; + FPTYPE local_force_z = 0.0; + + // Grid-stride loop over G-vectors + for (int ig = ig_start; ig < npw; ig += ig_stride) { + // Skip G=0 term + if (ig == ig_gge0) continue; + + // Calculate phase: 2π * (G · τ) + const FPTYPE arg = TWO_PI * ( + gcar[ig * 3 + 0] * tau_x + + gcar[ig * 3 + 1] * tau_y + + gcar[ig * 3 + 2] * tau_z + ); + + // Use CUDA intrinsic for sincos + FPTYPE sinp, cosp; + __sincos(arg, &sinp, &cosp); + + // Calculate Ewald sum contribution + const FPTYPE sumnb = -cosp * aux[ig].imag() + sinp * aux[ig].real(); + + // Accumulate force contributions + local_force_x += gcar[ig * 3 + 0] * sumnb; + local_force_y += gcar[ig * 3 + 1] * sumnb; + local_force_z += gcar[ig * 3 + 2] * sumnb; + } + + // Atomic add to global memory + atomicAdd(&force[iat * 3 + 0], local_force_x * it_fact); + atomicAdd(&force[iat * 3 + 1], local_force_y * it_fact); + atomicAdd(&force[iat * 3 + 2], local_force_z * it_fact); +} + +// GPU operator implementations +template +void cal_force_loc_sincos_op::operator()( + const base_device::DEVICE_GPU* ctx, + const int& nat, + const int& npw, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* vloc_factors, + const std::complex* aux, + const FPTYPE& scale_factor, + FPTYPE* force) +{ + // Calculate optimal grid configuration + const int threads_per_block = THREADS_PER_BLOCK; + const int max_blocks_x = min(32, (npw + threads_per_block - 1) / threads_per_block); + + dim3 grid(max_blocks_x, nat); + dim3 block(threads_per_block); + + cal_force_loc_sincos_kernel<<>>( + nat, npw, + gcar, tau, iat2it, vloc_factors, + reinterpret_cast*>(aux), + scale_factor, force + ); + + cudaCheckOnDebug(); +} + +template +void cal_force_ew_sincos_op::operator()( + const base_device::DEVICE_GPU* ctx, + const int& nat, + const int& npw, + const int& ig_gge0, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* it_facts, + const std::complex* aux, + FPTYPE* force) +{ + // Calculate optimal grid configuration + const int threads_per_block = THREADS_PER_BLOCK; + const int max_blocks_x = min(32, (npw + threads_per_block - 1) / threads_per_block); + + dim3 grid(max_blocks_x, nat); + dim3 block(threads_per_block); + + cal_force_ew_sincos_kernel<<>>( + nat, npw, ig_gge0, + gcar, tau, iat2it, it_facts, + reinterpret_cast*>(aux), + force + ); + + cudaCheckOnDebug(); +} + +template struct cal_force_loc_sincos_op; +template struct cal_force_loc_sincos_op; + +template struct cal_force_ew_sincos_op; +template struct cal_force_ew_sincos_op; + +} // namespace hamilt \ No newline at end of file diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.cpp new file mode 100644 index 0000000000..e7ce8554cc --- /dev/null +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.cpp @@ -0,0 +1,123 @@ +#include "force_sincos_op.h" +#include "module_base/libm/libm.h" + +#ifdef _OPENMP +#include +#endif + +namespace hamilt +{ + +template +struct cal_force_loc_sincos_op +{ + void operator()(const base_device::DEVICE_CPU* ctx, + const int& nat, + const int& npw, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* vloc_factors, + const std::complex* aux, + const FPTYPE& scale_factor, + FPTYPE* force) + { + const FPTYPE TWO_PI = 2.0 * M_PI; + +#ifdef _OPENMP +#pragma omp parallel for +#endif + for (int iat = 0; iat < nat; ++iat) + { + const int it = iat2it[iat]; + const FPTYPE tau_x = tau[iat * 3 + 0]; + const FPTYPE tau_y = tau[iat * 3 + 1]; + const FPTYPE tau_z = tau[iat * 3 + 2]; + + FPTYPE local_force[3] = {0.0, 0.0, 0.0}; + + for (int ig = 0; ig < npw; ig++) + { + const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + + gcar[ig * 3 + 1] * tau_y + + gcar[ig * 3 + 2] * tau_z); + FPTYPE sinp, cosp; + ModuleBase::libm::sincos(phase, &sinp, &cosp); + + const FPTYPE factor = vloc_factors[ig] * + (cosp * aux[ig].imag() + sinp * aux[ig].real()); + + local_force[0] += gcar[ig * 3 + 0] * factor; + local_force[1] += gcar[ig * 3 + 1] * factor; + local_force[2] += gcar[ig * 3 + 2] * factor; + } + + force[iat * 3 + 0] += local_force[0] * scale_factor; + force[iat * 3 + 1] += local_force[1] * scale_factor; + force[iat * 3 + 2] += local_force[2] * scale_factor; + } + } +}; + +template +struct cal_force_ew_sincos_op +{ + void operator()(const base_device::DEVICE_CPU* ctx, + const int& nat, + const int& npw, + const int& ig_gge0, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* it_facts, + const std::complex* aux, + FPTYPE* force) + { + const FPTYPE TWO_PI = 2.0 * M_PI; + +#ifdef _OPENMP +#pragma omp parallel for +#endif + for (int iat = 0; iat < nat; ++iat) + { + const int it = iat2it[iat]; + const FPTYPE tau_x = tau[iat * 3 + 0]; + const FPTYPE tau_y = tau[iat * 3 + 1]; + const FPTYPE tau_z = tau[iat * 3 + 2]; + const FPTYPE it_fact = it_facts[iat]; + + FPTYPE local_force[3] = {0.0, 0.0, 0.0}; + + for (int ig = 0; ig < npw; ig++) + { + // Skip G=0 term + if (ig == ig_gge0) continue; + + const FPTYPE arg = TWO_PI * (gcar[ig * 3 + 0] * tau_x + + gcar[ig * 3 + 1] * tau_y + + gcar[ig * 3 + 2] * tau_z); + FPTYPE sinp, cosp; + ModuleBase::libm::sincos(arg, &sinp, &cosp); + + const FPTYPE sumnb = -cosp * aux[ig].imag() + sinp * aux[ig].real(); + + local_force[0] += gcar[ig * 3 + 0] * sumnb; + local_force[1] += gcar[ig * 3 + 1] * sumnb; + local_force[2] += gcar[ig * 3 + 2] * sumnb; + } + + force[iat * 3 + 0] += local_force[0] * it_fact; + force[iat * 3 + 1] += local_force[1] * it_fact; + force[iat * 3 + 2] += local_force[2] * it_fact; + } + } +}; + +// Template instantiations +template struct cal_force_loc_sincos_op; +template struct cal_force_loc_sincos_op; + +template struct cal_force_ew_sincos_op; +template struct cal_force_ew_sincos_op; + +} // namespace hamilt \ No newline at end of file diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.h new file mode 100644 index 0000000000..455b8425a7 --- /dev/null +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.h @@ -0,0 +1,106 @@ +#ifndef FORCE_SINCOS_OP_H +#define FORCE_SINCOS_OP_H + +#include "module_base/module_device/types.h" +#include + +namespace hamilt +{ + +template +struct cal_force_loc_sincos_op +{ + /// @brief Calculate local pseudopotential forces (sincos loop only) + /// + /// Input Parameters + /// @param ctx - which device this function runs on + /// @param nat - number of atoms + /// @param npw - number of plane waves + /// @param gcar - G-vector Cartesian coordinates [npw * 3] + /// @param tau - atomic positions [nat * 3] + /// @param iat2it - atom to type mapping [nat] + /// @param vloc_factors - precomputed vloc factors [npw] + /// @param aux - charge density in G-space [npw] + /// @param scale_factor - tpiba * omega + /// + /// Output Parameters + /// @param force - output forces [nat * 3] + void operator()(const Device* ctx, + const int& nat, + const int& npw, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* vloc_factors, + const std::complex* aux, + const FPTYPE& scale_factor, + FPTYPE* force); +}; + +template +struct cal_force_ew_sincos_op +{ + /// @brief Calculate Ewald forces (sincos loop only) + /// + /// Input Parameters + /// @param ctx - which device this function runs on + /// @param nat - number of atoms + /// @param npw - number of plane waves + /// @param ig_gge0 - index of G=0 vector (-1 if not present) + /// @param gcar - G-vector Cartesian coordinates [npw * 3] + /// @param tau - atomic positions [nat * 3] + /// @param iat2it - atom to type mapping [nat] + /// @param it_facts - precomputed it_fact for each atom [nat] + /// @param aux - structure factor related array [npw] + /// + /// Output Parameters + /// @param force - output forces [nat * 3] + void operator()(const Device* ctx, + const int& nat, + const int& npw, + const int& ig_gge0, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* it_facts, + const std::complex* aux, + FPTYPE* force); +}; + +#if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM + +template +struct cal_force_loc_sincos_op +{ + void operator()(const base_device::DEVICE_GPU* ctx, + const int& nat, + const int& npw, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* vloc_factors, + const std::complex* aux, + const FPTYPE& scale_factor, + FPTYPE* force); +}; + +template +struct cal_force_ew_sincos_op +{ + void operator()(const base_device::DEVICE_GPU* ctx, + const int& nat, + const int& npw, + const int& ig_gge0, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* it_facts, + const std::complex* aux, + FPTYPE* force); +}; + +#endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM + +} // namespace hamilt + +#endif // FORCE_SINCOS_OP_H \ No newline at end of file diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_sincos_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_sincos_op.hip.cu new file mode 100644 index 0000000000..a09cd28bed --- /dev/null +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_sincos_op.hip.cu @@ -0,0 +1,203 @@ +#include "module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.h" + +#include +#include +#include +#include + +#define THREADS_PER_BLOCK 256 + +namespace hamilt { + +// HIP kernels for sincos loops +template +__global__ void cal_force_loc_sincos_kernel( + const int nat, + const int npw, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* vloc_factors, + const thrust::complex* aux, + const FPTYPE scale_factor, + FPTYPE* force) +{ + const FPTYPE TWO_PI = 2.0 * M_PI; + + const int iat = blockIdx.y; + const int ig_start = blockIdx.x * blockDim.x + threadIdx.x; + const int ig_stride = gridDim.x * blockDim.x; + + if (iat >= nat) return; + + // Load atom information to registers + const int it = iat2it[iat]; + const FPTYPE tau_x = tau[iat * 3 + 0]; + const FPTYPE tau_y = tau[iat * 3 + 1]; + const FPTYPE tau_z = tau[iat * 3 + 2]; + + // Local accumulation variables + FPTYPE local_force_x = 0.0; + FPTYPE local_force_y = 0.0; + FPTYPE local_force_z = 0.0; + + // Grid-stride loop over G-vectors + for (int ig = ig_start; ig < npw; ig += ig_stride) { + // Calculate phase: 2π * (G · τ) + const FPTYPE phase = TWO_PI * ( + gcar[ig * 3 + 0] * tau_x + + gcar[ig * 3 + 1] * tau_y + + gcar[ig * 3 + 2] * tau_z + ); + + // Use HIP intrinsic for sincos + FPTYPE sinp, cosp; + __sincos(phase, &sinp, &cosp); + + // Calculate force factor + const FPTYPE factor = vloc_factors[ig] * + (cosp * aux[ig].imag() + sinp * aux[ig].real()); + + // Accumulate force contributions + local_force_x += gcar[ig * 3 + 0] * factor; + local_force_y += gcar[ig * 3 + 1] * factor; + local_force_z += gcar[ig * 3 + 2] * factor; + } + + // Atomic add to global memory + atomicAdd(&force[iat * 3 + 0], local_force_x * scale_factor); + atomicAdd(&force[iat * 3 + 1], local_force_y * scale_factor); + atomicAdd(&force[iat * 3 + 2], local_force_z * scale_factor); +} + +template +__global__ void cal_force_ew_sincos_kernel( + const int nat, + const int npw, + const int ig_gge0, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* it_facts, + const thrust::complex* aux, + FPTYPE* force) +{ + const FPTYPE TWO_PI = 2.0 * M_PI; + + const int iat = blockIdx.y; + const int ig_start = blockIdx.x * blockDim.x + threadIdx.x; + const int ig_stride = gridDim.x * blockDim.x; + + if (iat >= nat) return; + + // Load atom information to registers + const int it = iat2it[iat]; + const FPTYPE tau_x = tau[iat * 3 + 0]; + const FPTYPE tau_y = tau[iat * 3 + 1]; + const FPTYPE tau_z = tau[iat * 3 + 2]; + const FPTYPE it_fact = it_facts[iat]; + + // Local accumulation variables + FPTYPE local_force_x = 0.0; + FPTYPE local_force_y = 0.0; + FPTYPE local_force_z = 0.0; + + // Grid-stride loop over G-vectors + for (int ig = ig_start; ig < npw; ig += ig_stride) { + // Skip G=0 term + if (ig == ig_gge0) continue; + + // Calculate phase: 2π * (G · τ) + const FPTYPE arg = TWO_PI * ( + gcar[ig * 3 + 0] * tau_x + + gcar[ig * 3 + 1] * tau_y + + gcar[ig * 3 + 2] * tau_z + ); + + // Use HIP intrinsic for sincos + FPTYPE sinp, cosp; + __sincos(arg, &sinp, &cosp); + + // Calculate Ewald sum contribution + const FPTYPE sumnb = -cosp * aux[ig].imag() + sinp * aux[ig].real(); + + // Accumulate force contributions + local_force_x += gcar[ig * 3 + 0] * sumnb; + local_force_y += gcar[ig * 3 + 1] * sumnb; + local_force_z += gcar[ig * 3 + 2] * sumnb; + } + + // Atomic add to global memory + atomicAdd(&force[iat * 3 + 0], local_force_x * it_fact); + atomicAdd(&force[iat * 3 + 1], local_force_y * it_fact); + atomicAdd(&force[iat * 3 + 2], local_force_z * it_fact); +} + +// GPU operator implementations +template +void cal_force_loc_sincos_op::operator()( + const base_device::DEVICE_GPU* ctx, + const int& nat, + const int& npw, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* vloc_factors, + const std::complex* aux, + const FPTYPE& scale_factor, + FPTYPE* force) +{ + // Calculate optimal grid configuration + const int threads_per_block = THREADS_PER_BLOCK; + const int max_blocks_x = min(32, (npw + threads_per_block - 1) / threads_per_block); + + dim3 grid(max_blocks_x, nat); + dim3 block(threads_per_block); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_force_loc_sincos_kernel), grid, block, 0, 0, + nat, npw, + gcar, tau, iat2it, vloc_factors, + reinterpret_cast*>(aux), + scale_factor, force + ); + + hipCheckOnDebug(); +} + +template +void cal_force_ew_sincos_op::operator()( + const base_device::DEVICE_GPU* ctx, + const int& nat, + const int& npw, + const int& ig_gge0, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* it_facts, + const std::complex* aux, + FPTYPE* force) +{ + // Calculate optimal grid configuration + const int threads_per_block = THREADS_PER_BLOCK; + const int max_blocks_x = min(32, (npw + threads_per_block - 1) / threads_per_block); + + dim3 grid(max_blocks_x, nat); + dim3 block(threads_per_block); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_force_ew_sincos_kernel), grid, block, 0, 0, + nat, npw, ig_gge0, + gcar, tau, iat2it, it_facts, + reinterpret_cast*>(aux), + force + ); + + hipCheckOnDebug(); +} + +template struct cal_force_loc_sincos_op; +template struct cal_force_loc_sincos_op; + +template struct cal_force_ew_sincos_op; +template struct cal_force_ew_sincos_op; + +} // namespace hamilt \ No newline at end of file From fbfc91a49ff49908a787f719cbbc667459815b13 Mon Sep 17 00:00:00 2001 From: Jie Date: Tue, 27 May 2025 18:35:51 +0800 Subject: [PATCH 3/6] add cpu kernel for cal_force_loc & cal_force_ew --- .../module_hamilt_pw/hamilt_pwdft/forces.cpp | 452 +++++++++++------- .../hamilt_pwdft/kernels/force_op.cpp | 255 ++++++++++ .../hamilt_pwdft/kernels/force_op.h | 107 +++++ 3 files changed, 636 insertions(+), 178 deletions(-) diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp index df29f88989..52ddf7da63 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp @@ -15,6 +15,8 @@ #include "module_hamilt_general/module_ewald/H_Ewald_pw.h" #include "module_hamilt_general/module_surchem/surchem.h" #include "module_hamilt_general/module_vdw/vdw.h" +#include "module_hamilt_pw/hamilt_pwdft/kernels/force_op.h" +#include #ifdef _OPENMP #include @@ -23,6 +25,208 @@ #include "module_cell/module_paw/paw_cell.h" #endif +// Data preparation function for Ewald parallel operator +template +void prepare_ew_parallel_data(const UnitCell& ucell, + ModulePW::PW_Basis* rho_basis, + const std::complex* aux, + const double alpha, + const double fact, + std::vector& gcar_flat, + std::vector& tau_flat, + std::vector& iat2it_array, + std::vector& it_facts, + std::vector& atom_na, + std::vector& zv_values, + std::vector& latvec_flat, + std::vector& G_flat) +{ + const int nat = ucell.nat; + const int npw = rho_basis->npw; + const int ntype = ucell.ntype; + + // Prepare gcar_flat + gcar_flat.resize(npw * 3); + for (int ig = 0; ig < npw; ig++) + { + gcar_flat[ig * 3 + 0] = static_cast(rho_basis->gcar[ig][0]); + gcar_flat[ig * 3 + 1] = static_cast(rho_basis->gcar[ig][1]); + gcar_flat[ig * 3 + 2] = static_cast(rho_basis->gcar[ig][2]); + } + + // Prepare tau_flat and iat2it_array + tau_flat.resize(nat * 3); + iat2it_array.resize(nat); + int iat = 0; + for (int it = 0; it < ntype; it++) + { + for (int ia = 0; ia < ucell.atoms[it].na; ia++) + { + tau_flat[iat * 3 + 0] = static_cast(ucell.atoms[it].tau[ia].x); + tau_flat[iat * 3 + 1] = static_cast(ucell.atoms[it].tau[ia].y); + tau_flat[iat * 3 + 2] = static_cast(ucell.atoms[it].tau[ia].z); + iat2it_array[iat] = it; + iat++; + } + } + + // Prepare it_facts + it_facts.resize(nat); + iat = 0; + for (int it = 0; it < ntype; it++) + { + double zv; + if (PARAM.inp.use_paw) + { +#ifdef USE_PAW + zv = GlobalC::paw_cell.get_val(it); +#else + zv = ucell.atoms[it].ncpp.zv; +#endif + } + else + { + zv = ucell.atoms[it].ncpp.zv; + } + + const FPTYPE it_fact = static_cast(zv * ModuleBase::e2 * ucell.tpiba * ModuleBase::TWO_PI / ucell.omega * fact); + + for (int ia = 0; ia < ucell.atoms[it].na; ia++) + { + it_facts[iat] = it_fact; + iat++; + } + } + + // Prepare atom_na + atom_na.resize(ntype); + for (int it = 0; it < ntype; it++) + { + atom_na[it] = ucell.atoms[it].na; + } + + // Prepare zv_values + zv_values.resize(ntype); + for (int it = 0; it < ntype; it++) + { + if (PARAM.inp.use_paw) + { +#ifdef USE_PAW + zv_values[it] = static_cast(GlobalC::paw_cell.get_val(it)); +#else + zv_values[it] = static_cast(ucell.atoms[it].ncpp.zv); +#endif + } + else + { + zv_values[it] = static_cast(ucell.atoms[it].ncpp.zv); + } + } + + // Prepare latvec_flat (row-major order) + latvec_flat.resize(9); + latvec_flat[0] = static_cast(ucell.latvec.e11); + latvec_flat[1] = static_cast(ucell.latvec.e12); + latvec_flat[2] = static_cast(ucell.latvec.e13); + latvec_flat[3] = static_cast(ucell.latvec.e21); + latvec_flat[4] = static_cast(ucell.latvec.e22); + latvec_flat[5] = static_cast(ucell.latvec.e23); + latvec_flat[6] = static_cast(ucell.latvec.e31); + latvec_flat[7] = static_cast(ucell.latvec.e32); + latvec_flat[8] = static_cast(ucell.latvec.e33); + + // Prepare G_flat (row-major order) + G_flat.resize(9); + G_flat[0] = static_cast(ucell.G.e11); + G_flat[1] = static_cast(ucell.G.e12); + G_flat[2] = static_cast(ucell.G.e13); + G_flat[3] = static_cast(ucell.G.e21); + G_flat[4] = static_cast(ucell.G.e22); + G_flat[5] = static_cast(ucell.G.e23); + G_flat[6] = static_cast(ucell.G.e31); + G_flat[7] = static_cast(ucell.G.e32); + G_flat[8] = static_cast(ucell.G.e33); +} + +// Data preparation function for local force sincos operator +template +void prepare_loc_sincos_data(const UnitCell& ucell, + ModulePW::PW_Basis* rho_basis, + const ModuleBase::matrix& vloc, + const std::complex* aux, + std::vector& gcar_flat, + std::vector& tau_flat, + std::vector& iat2it_array, + std::vector& vloc_factors) +{ + const int nat = ucell.nat; + const int npw = rho_basis->npw; + + // Prepare gcar_flat + gcar_flat.resize(npw * 3); + for (int ig = 0; ig < npw; ig++) + { + gcar_flat[ig * 3 + 0] = static_cast(rho_basis->gcar[ig][0]); + gcar_flat[ig * 3 + 1] = static_cast(rho_basis->gcar[ig][1]); + gcar_flat[ig * 3 + 2] = static_cast(rho_basis->gcar[ig][2]); + } + + // Prepare tau_flat and iat2it_array + tau_flat.resize(nat * 3); + iat2it_array.resize(nat); + int iat = 0; + for (int it = 0; it < ucell.ntype; it++) + { + for (int ia = 0; ia < ucell.atoms[it].na; ia++) + { + tau_flat[iat * 3 + 0] = static_cast(ucell.atoms[it].tau[ia].x); + tau_flat[iat * 3 + 1] = static_cast(ucell.atoms[it].tau[ia].y); + tau_flat[iat * 3 + 2] = static_cast(ucell.atoms[it].tau[ia].z); + iat2it_array[iat] = it; + iat++; + } + } + + // Prepare vloc_factors: vloc(it, ig2igg[ig]) for each G-vector and atom type + vloc_factors.resize(npw * ucell.ntype); + for (int it = 0; it < ucell.ntype; it++) + { + for (int ig = 0; ig < npw; ig++) + { + vloc_factors[it * npw + ig] = static_cast(vloc(it, rho_basis->ig2igg[ig])); + } + } +} + +// Data copy back function for local force sincos operator +template +void copy_back_loc_force_data(const FPTYPE* force, + const int nat, + const FPTYPE& scale_factor, + ModuleBase::matrix& forcelc) +{ + for (int iat = 0; iat < nat; iat++) + { + forcelc(iat, 0) = static_cast(force[iat * 3 + 0] * scale_factor); + forcelc(iat, 1) = static_cast(force[iat * 3 + 1] * scale_factor); + forcelc(iat, 2) = static_cast(force[iat * 3 + 2] * scale_factor); + } +} + +// Data copy back function for Ewald parallel operator +template +void copy_back_ew_force_data(const FPTYPE* force, + const int nat, + ModuleBase::matrix& forceion) +{ + for (int iat = 0; iat < nat; iat++) + { + forceion(iat, 0) = static_cast(force[iat * 3 + 0]); + forceion(iat, 1) = static_cast(force[iat * 3 + 1]); + forceion(iat, 2) = static_cast(force[iat * 3 + 2]); + } +} + template void Forces::cal_force(UnitCell& ucell, ModuleBase::matrix& force, @@ -531,30 +735,40 @@ void Forces::cal_force_loc(const UnitCell& ucell, // to G space. maybe need fftw with OpenMP rho_basis->real2recip(aux, aux); -#ifdef _OPENMP -#pragma omp parallel for -#endif - for (int iat = 0; iat < this->nat; ++iat) + // Prepare data for the sincos operator + std::vector gcar_flat, tau_flat, vloc_factors; + std::vector iat2it_array; + prepare_loc_sincos_data(ucell, rho_basis, vloc, aux, + gcar_flat, tau_flat, iat2it_array, vloc_factors); + + // Prepare force array for the operator + std::vector force_flat(this->nat * 3, 0.0); + + // Convert aux to FPTYPE complex array + std::vector> aux_fptype(rho_basis->npw); + for (int ig = 0; ig < rho_basis->npw; ig++) { - // read `it` `ia` from the table - int it = ucell.iat2it[iat]; - int ia = ucell.iat2ia[iat]; - for (int ig = 0; ig < rho_basis->npw; ig++) - { - const double phase = ModuleBase::TWO_PI * (rho_basis->gcar[ig] * ucell.atoms[it].tau[ia]); - double sinp, cosp; - ModuleBase::libm::sincos(phase, &sinp, &cosp); - const double factor - = vloc(it, rho_basis->ig2igg[ig]) * (cosp * aux[ig].imag() + sinp * aux[ig].real()); - forcelc(iat, 0) += rho_basis->gcar[ig][0] * factor; - forcelc(iat, 1) += rho_basis->gcar[ig][1] * factor; - forcelc(iat, 2) += rho_basis->gcar[ig][2] * factor; - } - forcelc(iat, 0) *= (ucell.tpiba * ucell.omega); - forcelc(iat, 1) *= (ucell.tpiba * ucell.omega); - forcelc(iat, 2) *= (ucell.tpiba * ucell.omega); + aux_fptype[ig] = std::complex(static_cast(aux[ig].real()), + static_cast(aux[ig].imag())); } + // Call the sincos operator + hamilt::cal_force_loc_sincos_op sincos_op; + sincos_op(this->ctx, + this->nat, + rho_basis->npw, + ucell.ntype, + gcar_flat.data(), + tau_flat.data(), + iat2it_array.data(), + vloc_factors.data(), + aux_fptype.data(), + force_flat.data()); + + // Copy back the results with scaling + const FPTYPE scale_factor = static_cast(ucell.tpiba * ucell.omega); + copy_back_loc_force_data(force_flat.data(), this->nat, scale_factor, forcelc); + // this->print(GlobalV::ofs_running, "local forces", forcelc); Parallel_Reduce::reduce_pool(forcelc.c, forcelc.nr * forcelc.nc); delete[] aux; @@ -665,166 +879,48 @@ void Forces::cal_force_ew(const UnitCell& ucell, aux[rho_basis->ig_gge0] = std::complex(0.0, 0.0); } -#ifdef _OPENMP -#pragma omp parallel - { - int num_threads = omp_get_num_threads(); - int thread_id = omp_get_thread_num(); -#else - int num_threads = 1; - int thread_id = 0; -#endif + // Prepare data for the parallel operator + std::vector gcar_flat, tau_flat, it_facts, zv_values, latvec_flat, G_flat; + std::vector iat2it_array, atom_na; + prepare_ew_parallel_data(ucell, rho_basis, aux, alpha, fact, + gcar_flat, tau_flat, iat2it_array, it_facts, + atom_na, zv_values, latvec_flat, G_flat); - /* Here is task distribution for multi-thread, - 0. atom will be iterated both in main nat loop and the loop in `if (rho_basis->ig_gge0 >= 0)`. - To avoid syncing, we must calculate work range of each thread by our self - 1. Calculate the iat range [iat_beg, iat_end) by each thread - a. when it is single thread stage, [iat_beg, iat_end) will be [0, nat) - 2. each thread iterate atoms form `iat_beg` to `iat_end-1` - */ - int iat_beg, iat_end; - int it_beg, ia_beg; - ModuleBase::TASK_DIST_1D(num_threads, thread_id, this->nat, iat_beg, iat_end); - iat_end = iat_beg + iat_end; - ucell.iat2iait(iat_beg, &ia_beg, &it_beg); - - int iat = iat_beg; - int it = it_beg; - int ia = ia_beg; - - // preprocess ig_gap for skipping the ig point - int ig_gap = (rho_basis->ig_gge0 >= 0 && rho_basis->ig_gge0 < rho_basis->npw) ? rho_basis->ig_gge0 : -1; - - double it_fact = 0.; - int last_it = -1; - - // iterating atoms - while (iat < iat_end) - { - if (it != last_it) - { // calculate it_tact when it is changed - double zv; - if (PARAM.inp.use_paw) - { -#ifdef USE_PAW - zv = GlobalC::paw_cell.get_val(it); -#endif - } - else - { - zv = ucell.atoms[it].ncpp.zv; - } - it_fact = zv * ModuleBase::e2 * ucell.tpiba * ModuleBase::TWO_PI / ucell.omega * fact; - last_it = it; - } - - if (ucell.atoms[it].na != 0) - { - const auto ig_loop = [&](int ig_beg, int ig_end) { - for (int ig = ig_beg; ig < ig_end; ig++) - { - const ModuleBase::Vector3 gcar = rho_basis->gcar[ig]; - const double arg = ModuleBase::TWO_PI * (gcar * ucell.atoms[it].tau[ia]); - double sinp, cosp; - ModuleBase::libm::sincos(arg, &sinp, &cosp); - double sumnb = -cosp * aux[ig].imag() + sinp * aux[ig].real(); - forceion(iat, 0) += gcar[0] * sumnb; - forceion(iat, 1) += gcar[1] * sumnb; - forceion(iat, 2) += gcar[2] * sumnb; - } - }; - - // skip ig_gge0 point by separating ig loop into two part - ig_loop(0, ig_gap); - ig_loop(ig_gap + 1, rho_basis->npw); - - forceion(iat, 0) *= it_fact; - forceion(iat, 1) *= it_fact; - forceion(iat, 2) *= it_fact; - - ++iat; - ucell.step_iait(&ia, &it); - } - } - - // means that the processor contains G=0 term. - if (rho_basis->ig_gge0 >= 0) - { - double rmax = 5.0 / (sqrt(alpha) * ucell.lat0); - int nrm = 0; + // Prepare force array for the operator + std::vector force_flat(this->nat * 3, 0.0); - // output of rgen: the number of vectors in the sphere - const int mxr = 200; - // the maximum number of R vectors included in r - ModuleBase::Vector3* r = new ModuleBase::Vector3[mxr]; - double* r2 = new double[mxr]; - ModuleBase::GlobalFunc::ZEROS(r2, mxr); - int* irr = new int[mxr]; - ModuleBase::GlobalFunc::ZEROS(irr, mxr); - // the square modulus of R_j-tau_s-tau_s' - - int iat1 = iat_beg; - int T1 = it_beg; - int I1 = ia_beg; - const double sqa = sqrt(alpha); - const double sq8a_2pi = sqrt(8.0 * alpha / ModuleBase::TWO_PI); - - // iterating atoms. - // do not need to sync threads because task range of each thread is isolated - while (iat1 < iat_end) - { - int iat2 = 0; // mohan fix bug 2011-06-07 - int I2 = 0; - int T2 = 0; - while (iat2 < this->nat) - { - if (iat1 != iat2 && ucell.atoms[T2].na != 0 && ucell.atoms[T1].na != 0) - { - ModuleBase::Vector3 d_tau - = ucell.atoms[T1].tau[I1] - ucell.atoms[T2].tau[I2]; - H_Ewald_pw::rgen(d_tau, rmax, irr, ucell.latvec, ucell.G, r, r2, nrm); - - for (int n = 0; n < nrm; n++) - { - const double rr = sqrt(r2[n]) * ucell.lat0; - - double factor; - if (PARAM.inp.use_paw) - { -#ifdef USE_PAW - factor = GlobalC::paw_cell.get_val(T1) * GlobalC::paw_cell.get_val(T2) * ModuleBase::e2 - / (rr * rr) - * (erfc(sqa * rr) / rr + sq8a_2pi * ModuleBase::libm::exp(-alpha * rr * rr)) - * ucell.lat0; -#endif - } - else - { - factor = ucell.atoms[T1].ncpp.zv * ucell.atoms[T2].ncpp.zv - * ModuleBase::e2 / (rr * rr) - * (erfc(sqa * rr) / rr + sq8a_2pi * ModuleBase::libm::exp(-alpha * rr * rr)) - * ucell.lat0; - } - - forceion(iat1, 0) -= factor * r[n].x; - forceion(iat1, 1) -= factor * r[n].y; - forceion(iat1, 2) -= factor * r[n].z; - } - } - ++iat2; - ucell.step_iait(&I2, &T2); - } // atom b - ++iat1; - ucell.step_iait(&I1, &T1); - } // atom a - - delete[] r; - delete[] r2; - delete[] irr; - } -#ifdef _OPENMP + // Convert aux to FPTYPE complex array + std::vector> aux_fptype(rho_basis->npw); + for (int ig = 0; ig < rho_basis->npw; ig++) + { + aux_fptype[ig] = std::complex(static_cast(aux[ig].real()), + static_cast(aux[ig].imag())); } -#endif + + // Call the parallel operator + hamilt::cal_force_ew_parallel_op parallel_op; + parallel_op(this->ctx, + this->nat, + rho_basis->npw, + ucell.ntype, + rho_basis->ig_gge0, + gcar_flat.data(), + tau_flat.data(), + iat2it_array.data(), + it_facts.data(), + atom_na.data(), + zv_values.data(), + aux_fptype.data(), + static_cast(alpha), + static_cast(fact), + static_cast(ucell.lat0), + latvec_flat.data(), + G_flat.data(), + PARAM.inp.use_paw, + force_flat.data()); + + // Copy back the results + copy_back_ew_force_data(force_flat.data(), this->nat, forceion); Parallel_Reduce::reduce_pool(forceion.c, forceion.nr * forceion.nc); diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp index 6d797e147d..b5745f2ac5 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp @@ -1,4 +1,8 @@ #include "module_hamilt_pw/hamilt_pwdft/kernels/force_op.h" +#include "module_base/libm/libm.h" +#include "module_base/tool_threading.h" +#include "module_hamilt_general/module_ewald/H_Ewald_pw.h" +#include #ifdef _OPENMP #include @@ -424,10 +428,261 @@ struct cal_force_nl_op } }; +// CPU implementation of local force sincos operator +template +struct cal_force_loc_sincos_op +{ + void operator()(const base_device::DEVICE_CPU* ctx, + const int& nat, + const int& npw, + const int& ntype, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* vloc_factors, + const std::complex* aux, + FPTYPE* force) + { + const FPTYPE TWO_PI = 2.0 * M_PI; + +#ifdef _OPENMP +#pragma omp parallel for +#endif + for (int iat = 0; iat < nat; ++iat) + { + const int it = iat2it[iat]; + const FPTYPE tau_x = tau[iat * 3 + 0]; + const FPTYPE tau_y = tau[iat * 3 + 1]; + const FPTYPE tau_z = tau[iat * 3 + 2]; + + FPTYPE local_force[3] = {0.0, 0.0, 0.0}; + + for (int ig = 0; ig < npw; ig++) + { + const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + + gcar[ig * 3 + 1] * tau_y + + gcar[ig * 3 + 2] * tau_z); + FPTYPE sinp, cosp; + ModuleBase::libm::sincos(phase, &sinp, &cosp); + + const FPTYPE vloc_factor = vloc_factors[it * npw + ig]; + const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real()); + + local_force[0] += gcar[ig * 3 + 0] * factor; + local_force[1] += gcar[ig * 3 + 1] * factor; + local_force[2] += gcar[ig * 3 + 2] * factor; + } + + force[iat * 3 + 0] = local_force[0]; + force[iat * 3 + 1] = local_force[1]; + force[iat * 3 + 2] = local_force[2]; + } + } +}; + +// CPU implementation of Ewald parallel operator +template +struct cal_force_ew_parallel_op +{ + void operator()(const base_device::DEVICE_CPU* ctx, + const int& nat, + const int& npw, + const int& ntype, + const int& ig_gge0, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* it_facts, + const int* atom_na, + const FPTYPE* zv_values, + const std::complex* aux, + const FPTYPE& alpha, + const FPTYPE& fact, + const FPTYPE& lat0, + const FPTYPE* latvec, + const FPTYPE* G, + const bool& use_paw, + FPTYPE* force) + { + const FPTYPE TWO_PI = 2.0 * M_PI; + const FPTYPE e2 = 2.0; // ModuleBase::e2 equivalent + +#ifdef _OPENMP +#pragma omp parallel + { + int num_threads = omp_get_num_threads(); + int thread_id = omp_get_thread_num(); +#else + int num_threads = 1; + int thread_id = 0; +#endif + + // Task distribution for multi-thread + int iat_beg, iat_end; + ModuleBase::TASK_DIST_1D(num_threads, thread_id, nat, iat_beg, iat_end); + iat_end = iat_beg + iat_end; + + // Preprocess ig_gap for skipping the ig point + int ig_gap = (ig_gge0 >= 0 && ig_gge0 < npw) ? ig_gge0 : -1; + + // Part 1: sincos calculation (replacing original while loop with for loop) + for (int iat = iat_beg; iat < iat_end; ++iat) + { + const int it = iat2it[iat]; + + // Check if this atom type has atoms + if (atom_na[it] == 0) continue; + + const FPTYPE tau_x = tau[iat * 3 + 0]; + const FPTYPE tau_y = tau[iat * 3 + 1]; + const FPTYPE tau_z = tau[iat * 3 + 2]; + const FPTYPE it_fact = it_facts[iat]; + + FPTYPE local_force[3] = {0.0, 0.0, 0.0}; + + // G-vector loop with ig_gap handling (using for loops for GPU compatibility) + for (int ig = 0; ig < npw; ig++) + { + // Skip G=0 term + if (ig == ig_gap) continue; + + const FPTYPE arg = TWO_PI * (gcar[ig * 3 + 0] * tau_x + + gcar[ig * 3 + 1] * tau_y + + gcar[ig * 3 + 2] * tau_z); + FPTYPE sinp, cosp; + ModuleBase::libm::sincos(arg, &sinp, &cosp); + + const FPTYPE sumnb = -cosp * aux[ig].imag() + sinp * aux[ig].real(); + + local_force[0] += gcar[ig * 3 + 0] * sumnb; + local_force[1] += gcar[ig * 3 + 1] * sumnb; + local_force[2] += gcar[ig * 3 + 2] * sumnb; + } + + // Apply it_fact scaling + force[iat * 3 + 0] += local_force[0] * it_fact; + force[iat * 3 + 1] += local_force[1] * it_fact; + force[iat * 3 + 2] += local_force[2] * it_fact; + } + + // Part 2: Real space Ewald calculation (when processor contains G=0 term) + if (ig_gge0 >= 0) + { + const FPTYPE rmax = 5.0 / (sqrt(alpha) * lat0); + const int mxr = 200; + + // Allocate temporary arrays for each thread + ModuleBase::Vector3* r_flat = new ModuleBase::Vector3[mxr]; + double* r2 = new double[mxr]; + int* irr = new int[mxr]; + + // Initialize arrays + for (int i = 0; i < mxr; i++) { + r2[i] = 0.0; + irr[i] = 0; + r_flat[i].x = 0.0; + r_flat[i].y = 0.0; + r_flat[i].z = 0.0; + } + + const FPTYPE sqa = sqrt(alpha); + const FPTYPE sq8a_2pi = sqrt(8.0 * alpha / TWO_PI); + + // Iterate over atoms in assigned range + for (int iat1 = iat_beg; iat1 < iat_end; ++iat1) + { + const int T1 = iat2it[iat1]; + if (atom_na[T1] == 0) continue; + + const FPTYPE tau1_x = tau[iat1 * 3 + 0]; + const FPTYPE tau1_y = tau[iat1 * 3 + 1]; + const FPTYPE tau1_z = tau[iat1 * 3 + 2]; + + // Iterate over all other atoms + for (int iat2 = 0; iat2 < nat; ++iat2) + { + const int T2 = iat2it[iat2]; + + if (iat1 != iat2 && atom_na[T2] != 0 && atom_na[T1] != 0) + { + const FPTYPE tau2_x = tau[iat2 * 3 + 0]; + const FPTYPE tau2_y = tau[iat2 * 3 + 1]; + const FPTYPE tau2_z = tau[iat2 * 3 + 2]; + + // Calculate d_tau + const FPTYPE d_tau_x = tau1_x - tau2_x; + const FPTYPE d_tau_y = tau1_y - tau2_y; + const FPTYPE d_tau_z = tau1_z - tau2_z; + + // Reconstruct Matrix3 objects from flat arrays for rgen call + ModuleBase::Matrix3 latvec_matrix( + static_cast(latvec[0]), static_cast(latvec[1]), static_cast(latvec[2]), + static_cast(latvec[3]), static_cast(latvec[4]), static_cast(latvec[5]), + static_cast(latvec[6]), static_cast(latvec[7]), static_cast(latvec[8]) + ); + + ModuleBase::Matrix3 G_matrix( + static_cast(G[0]), static_cast(G[1]), static_cast(G[2]), + static_cast(G[3]), static_cast(G[4]), static_cast(G[5]), + static_cast(G[6]), static_cast(G[7]), static_cast(G[8]) + ); + + ModuleBase::Vector3 dtau_vec( + static_cast(d_tau_x), + static_cast(d_tau_y), + static_cast(d_tau_z) + ); + + // Call H_Ewald_pw::rgen to generate neighbor shells + int nrm = 0; + H_Ewald_pw::rgen(dtau_vec, static_cast(rmax), irr, latvec_matrix, G_matrix, r_flat, r2, nrm); + + // Process all generated r vectors + for (int nr = 0; nr < nrm; nr++) + { + const FPTYPE rr = sqrt(r2[nr]) * lat0; + + FPTYPE factor; + if (use_paw) + { + factor = zv_values[T1] * zv_values[T2] * e2 / (rr * rr) + * (erfc(sqa * rr) / rr + sq8a_2pi * exp(-alpha * rr * rr)) + * lat0; + } + else + { + factor = zv_values[T1] * zv_values[T2] * e2 / (rr * rr) + * (erfc(sqa * rr) / rr + sq8a_2pi * exp(-alpha * rr * rr)) + * lat0; + } + + force[iat1 * 3 + 0] -= factor * static_cast(r_flat[nr].x); + force[iat1 * 3 + 1] -= factor * static_cast(r_flat[nr].y); + force[iat1 * 3 + 2] -= factor * static_cast(r_flat[nr].z); + } + } + } + } + + delete[] r_flat; + delete[] r2; + delete[] irr; + } + +#ifdef _OPENMP + } +#endif + } +}; + template struct cal_vkb1_nl_op; template struct cal_force_nl_op; +template struct cal_force_loc_sincos_op; +template struct cal_force_ew_parallel_op; template struct cal_vkb1_nl_op; template struct cal_force_nl_op; +template struct cal_force_loc_sincos_op; +template struct cal_force_ew_parallel_op; } // namespace hamilt diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h index 3aa5d4f87e..152f41e310 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h @@ -146,6 +146,83 @@ struct cal_force_nl_op const FPTYPE* lambda, const std::complex* becp, const std::complex* dbecp, + FPTYPE* force); +}; + +template +struct cal_force_loc_sincos_op +{ + /// @brief Calculate local pseudopotential forces (sincos loop only) + /// + /// Input Parameters + /// @param ctx - which device this function runs on + /// @param nat - number of atoms + /// @param npw - number of plane waves + /// @param ntype - number of atom types + /// @param gcar - G-vector Cartesian coordinates [npw * 3] + /// @param tau - atomic positions [nat * 3] + /// @param iat2it - atom to type mapping [nat] + /// @param vloc_factors - precomputed vloc factors [ntype * npw] + /// @param aux - charge density in G-space [npw] + /// + /// Output Parameters + /// @param force - output forces [nat * 3] + void operator()(const Device* ctx, + const int& nat, + const int& npw, + const int& ntype, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* vloc_factors, + const std::complex* aux, + FPTYPE* force); +}; + +template +struct cal_force_ew_parallel_op +{ + /// @brief Calculate Ewald forces (parallel region only) + /// + /// Input Parameters + /// @param ctx - which device this function runs on + /// @param nat - number of atoms + /// @param npw - number of plane waves + /// @param ntype - number of atom types + /// @param ig_gge0 - index of G=0 vector (-1 if not present) + /// @param gcar - G-vector Cartesian coordinates [npw * 3] + /// @param tau - atomic positions [nat * 3] + /// @param iat2it - atom to type mapping [nat] + /// @param it_facts - precomputed it_fact for each atom [nat] + /// @param atom_na - number of atoms of each type [ntype] + /// @param zv_values - valence charge for each type [ntype] + /// @param aux - structure factor related array [npw] + /// @param alpha - Ewald parameter + /// @param fact - scaling factor + /// @param lat0 - lattice constant + /// @param latvec - lattice vectors [3 * 3] + /// @param G - reciprocal lattice vectors [3 * 3] + /// @param use_pwa - whether PAW is used + /// Output Parameters + /// @param force - output forces [nat * 3] + void operator()(const Device* ctx, + const int& nat, + const int& npw, + const int& ntype, + const int& ig_gge0, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* it_facts, + const int* atom_na, + const FPTYPE* zv_values, + const std::complex* aux, + const FPTYPE& alpha, + const FPTYPE& fact, + const FPTYPE& lat0, + const FPTYPE* latvec, + const FPTYPE* G, + const bool& use_paw, FPTYPE* force); }; @@ -248,6 +325,36 @@ struct cal_force_nl_op FPTYPE* force); }; +template +struct cal_force_loc_sincos_op +{ + void operator()(const base_device::DEVICE_GPU* ctx, + const int& nat, + const int& npw, + const int& ntype, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* vloc_factors, + const std::complex* aux, + FPTYPE* force); +}; + +template +struct cal_force_ew_parallel_op +{ + void operator()(const base_device::DEVICE_GPU* ctx, + const int& nat, + const int& npw, + const int& ig_gge0, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* it_facts, + const std::complex* aux, + FPTYPE* force); +}; + /** * @brief revert the vkb values for force_nl calculation */ From 8a6339c4e0981dfc17e9fba881e9b375d0e64a71 Mon Sep 17 00:00:00 2001 From: Jie Date: Fri, 30 May 2025 15:06:08 +0800 Subject: [PATCH 4/6] fix sincos op for gpu&cpu --- .../module_hamilt_pw/hamilt_pwdft/forces.cpp | 614 ++++++++++-------- .../hamilt_pwdft/kernels/cuda/force_op.cu | 184 ++++++ .../kernels/cuda/force_sincos_op.cu | 204 ------ .../hamilt_pwdft/kernels/force_op.cpp | 204 +----- .../hamilt_pwdft/kernels/force_op.h | 34 +- .../hamilt_pwdft/kernels/force_sincos_op.cpp | 123 ---- .../hamilt_pwdft/kernels/force_sincos_op.h | 106 --- .../hamilt_pwdft/kernels/rocm/force_op.hip.cu | 185 ++++++ .../kernels/rocm/force_sincos_op.hip.cu | 203 ------ 9 files changed, 751 insertions(+), 1106 deletions(-) delete mode 100644 source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_sincos_op.cu delete mode 100644 source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.cpp delete mode 100644 source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.h delete mode 100644 source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_sincos_op.hip.cu diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp index 52ddf7da63..5e1d3cd6a2 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp @@ -15,8 +15,7 @@ #include "module_hamilt_general/module_ewald/H_Ewald_pw.h" #include "module_hamilt_general/module_surchem/surchem.h" #include "module_hamilt_general/module_vdw/vdw.h" -#include "module_hamilt_pw/hamilt_pwdft/kernels/force_op.h" -#include +#include "kernels/force_op.h" #ifdef _OPENMP #include @@ -25,208 +24,6 @@ #include "module_cell/module_paw/paw_cell.h" #endif -// Data preparation function for Ewald parallel operator -template -void prepare_ew_parallel_data(const UnitCell& ucell, - ModulePW::PW_Basis* rho_basis, - const std::complex* aux, - const double alpha, - const double fact, - std::vector& gcar_flat, - std::vector& tau_flat, - std::vector& iat2it_array, - std::vector& it_facts, - std::vector& atom_na, - std::vector& zv_values, - std::vector& latvec_flat, - std::vector& G_flat) -{ - const int nat = ucell.nat; - const int npw = rho_basis->npw; - const int ntype = ucell.ntype; - - // Prepare gcar_flat - gcar_flat.resize(npw * 3); - for (int ig = 0; ig < npw; ig++) - { - gcar_flat[ig * 3 + 0] = static_cast(rho_basis->gcar[ig][0]); - gcar_flat[ig * 3 + 1] = static_cast(rho_basis->gcar[ig][1]); - gcar_flat[ig * 3 + 2] = static_cast(rho_basis->gcar[ig][2]); - } - - // Prepare tau_flat and iat2it_array - tau_flat.resize(nat * 3); - iat2it_array.resize(nat); - int iat = 0; - for (int it = 0; it < ntype; it++) - { - for (int ia = 0; ia < ucell.atoms[it].na; ia++) - { - tau_flat[iat * 3 + 0] = static_cast(ucell.atoms[it].tau[ia].x); - tau_flat[iat * 3 + 1] = static_cast(ucell.atoms[it].tau[ia].y); - tau_flat[iat * 3 + 2] = static_cast(ucell.atoms[it].tau[ia].z); - iat2it_array[iat] = it; - iat++; - } - } - - // Prepare it_facts - it_facts.resize(nat); - iat = 0; - for (int it = 0; it < ntype; it++) - { - double zv; - if (PARAM.inp.use_paw) - { -#ifdef USE_PAW - zv = GlobalC::paw_cell.get_val(it); -#else - zv = ucell.atoms[it].ncpp.zv; -#endif - } - else - { - zv = ucell.atoms[it].ncpp.zv; - } - - const FPTYPE it_fact = static_cast(zv * ModuleBase::e2 * ucell.tpiba * ModuleBase::TWO_PI / ucell.omega * fact); - - for (int ia = 0; ia < ucell.atoms[it].na; ia++) - { - it_facts[iat] = it_fact; - iat++; - } - } - - // Prepare atom_na - atom_na.resize(ntype); - for (int it = 0; it < ntype; it++) - { - atom_na[it] = ucell.atoms[it].na; - } - - // Prepare zv_values - zv_values.resize(ntype); - for (int it = 0; it < ntype; it++) - { - if (PARAM.inp.use_paw) - { -#ifdef USE_PAW - zv_values[it] = static_cast(GlobalC::paw_cell.get_val(it)); -#else - zv_values[it] = static_cast(ucell.atoms[it].ncpp.zv); -#endif - } - else - { - zv_values[it] = static_cast(ucell.atoms[it].ncpp.zv); - } - } - - // Prepare latvec_flat (row-major order) - latvec_flat.resize(9); - latvec_flat[0] = static_cast(ucell.latvec.e11); - latvec_flat[1] = static_cast(ucell.latvec.e12); - latvec_flat[2] = static_cast(ucell.latvec.e13); - latvec_flat[3] = static_cast(ucell.latvec.e21); - latvec_flat[4] = static_cast(ucell.latvec.e22); - latvec_flat[5] = static_cast(ucell.latvec.e23); - latvec_flat[6] = static_cast(ucell.latvec.e31); - latvec_flat[7] = static_cast(ucell.latvec.e32); - latvec_flat[8] = static_cast(ucell.latvec.e33); - - // Prepare G_flat (row-major order) - G_flat.resize(9); - G_flat[0] = static_cast(ucell.G.e11); - G_flat[1] = static_cast(ucell.G.e12); - G_flat[2] = static_cast(ucell.G.e13); - G_flat[3] = static_cast(ucell.G.e21); - G_flat[4] = static_cast(ucell.G.e22); - G_flat[5] = static_cast(ucell.G.e23); - G_flat[6] = static_cast(ucell.G.e31); - G_flat[7] = static_cast(ucell.G.e32); - G_flat[8] = static_cast(ucell.G.e33); -} - -// Data preparation function for local force sincos operator -template -void prepare_loc_sincos_data(const UnitCell& ucell, - ModulePW::PW_Basis* rho_basis, - const ModuleBase::matrix& vloc, - const std::complex* aux, - std::vector& gcar_flat, - std::vector& tau_flat, - std::vector& iat2it_array, - std::vector& vloc_factors) -{ - const int nat = ucell.nat; - const int npw = rho_basis->npw; - - // Prepare gcar_flat - gcar_flat.resize(npw * 3); - for (int ig = 0; ig < npw; ig++) - { - gcar_flat[ig * 3 + 0] = static_cast(rho_basis->gcar[ig][0]); - gcar_flat[ig * 3 + 1] = static_cast(rho_basis->gcar[ig][1]); - gcar_flat[ig * 3 + 2] = static_cast(rho_basis->gcar[ig][2]); - } - - // Prepare tau_flat and iat2it_array - tau_flat.resize(nat * 3); - iat2it_array.resize(nat); - int iat = 0; - for (int it = 0; it < ucell.ntype; it++) - { - for (int ia = 0; ia < ucell.atoms[it].na; ia++) - { - tau_flat[iat * 3 + 0] = static_cast(ucell.atoms[it].tau[ia].x); - tau_flat[iat * 3 + 1] = static_cast(ucell.atoms[it].tau[ia].y); - tau_flat[iat * 3 + 2] = static_cast(ucell.atoms[it].tau[ia].z); - iat2it_array[iat] = it; - iat++; - } - } - - // Prepare vloc_factors: vloc(it, ig2igg[ig]) for each G-vector and atom type - vloc_factors.resize(npw * ucell.ntype); - for (int it = 0; it < ucell.ntype; it++) - { - for (int ig = 0; ig < npw; ig++) - { - vloc_factors[it * npw + ig] = static_cast(vloc(it, rho_basis->ig2igg[ig])); - } - } -} - -// Data copy back function for local force sincos operator -template -void copy_back_loc_force_data(const FPTYPE* force, - const int nat, - const FPTYPE& scale_factor, - ModuleBase::matrix& forcelc) -{ - for (int iat = 0; iat < nat; iat++) - { - forcelc(iat, 0) = static_cast(force[iat * 3 + 0] * scale_factor); - forcelc(iat, 1) = static_cast(force[iat * 3 + 1] * scale_factor); - forcelc(iat, 2) = static_cast(force[iat * 3 + 2] * scale_factor); - } -} - -// Data copy back function for Ewald parallel operator -template -void copy_back_ew_force_data(const FPTYPE* force, - const int nat, - ModuleBase::matrix& forceion) -{ - for (int iat = 0; iat < nat; iat++) - { - forceion(iat, 0) = static_cast(force[iat * 3 + 0]); - forceion(iat, 1) = static_cast(force[iat * 3 + 1]); - forceion(iat, 2) = static_cast(force[iat * 3 + 2]); - } -} - template void Forces::cal_force(UnitCell& ucell, ModuleBase::matrix& force, @@ -735,39 +532,125 @@ void Forces::cal_force_loc(const UnitCell& ucell, // to G space. maybe need fftw with OpenMP rho_basis->real2recip(aux, aux); - // Prepare data for the sincos operator - std::vector gcar_flat, tau_flat, vloc_factors; - std::vector iat2it_array; - prepare_loc_sincos_data(ucell, rho_basis, vloc, aux, - gcar_flat, tau_flat, iat2it_array, vloc_factors); - - // Prepare force array for the operator - std::vector force_flat(this->nat * 3, 0.0); - - // Convert aux to FPTYPE complex array + // =============== GPU/CPU异构优化:使用sincos op替换原循环 =============== + + + // 准备原子相关数据:按照全局原子索引iat顺序 + std::vector tau_flat(this->nat * 3); + std::vector iat2it_host(this->nat); + std::vector gcar_flat(rho_basis->npw * 3); + + // 按照原始代码逻辑:遍历全局原子索引iat,通过查表获取(it,ia) + for (int iat = 0; iat < this->nat; iat++) { + int it = ucell.iat2it[iat]; // 查表获取原子类型 + int ia = ucell.iat2ia[iat]; // 查表获取该类型内的原子索引 + + tau_flat[iat * 3 + 0] = static_cast(ucell.atoms[it].tau[ia][0]); + tau_flat[iat * 3 + 1] = static_cast(ucell.atoms[it].tau[ia][1]); + tau_flat[iat * 3 + 2] = static_cast(ucell.atoms[it].tau[ia][2]); + iat2it_host[iat] = it; + } + + for (int ig = 0; ig < rho_basis->npw; ig++) { + gcar_flat[ig * 3 + 0] = static_cast(rho_basis->gcar[ig][0]); + gcar_flat[ig * 3 + 1] = static_cast(rho_basis->gcar[ig][1]); + gcar_flat[ig * 3 + 2] = static_cast(rho_basis->gcar[ig][2]); + } + + // 重新计算vloc_factors考虑所有原子类型 + std::vector vloc_per_type_host(ucell.ntype * rho_basis->npw); + for (int it = 0; it < ucell.ntype; it++) { + for (int ig = 0; ig < rho_basis->npw; ig++) { + vloc_per_type_host[it * rho_basis->npw + ig] = static_cast(vloc(it, rho_basis->ig2igg[ig])); + } + } + + // 转换aux到FPTYPE类型 std::vector> aux_fptype(rho_basis->npw); - for (int ig = 0; ig < rho_basis->npw; ig++) + for (int ig = 0; ig < rho_basis->npw; ig++) { + aux_fptype[ig] = static_cast>(aux[ig]); + } + + // 设备端内存和指针设置(根据设备类型分支) + FPTYPE* d_gcar = gcar_flat.data(); + FPTYPE* d_tau = tau_flat.data(); + int* d_iat2it = iat2it_host.data(); + FPTYPE* d_vloc_per_type = vloc_per_type_host.data(); + std::complex* d_aux = aux_fptype.data(); + FPTYPE* d_force = nullptr; + std::vector force_host(this->nat * 3); + + if (this->device == base_device::GpuDevice) + { + d_gcar = nullptr; + d_tau = nullptr; + d_iat2it = nullptr; + d_vloc_per_type = nullptr; + d_aux = nullptr; + + resmem_var_op()(this->ctx, d_gcar, rho_basis->npw * 3); + resmem_var_op()(this->ctx, d_tau, this->nat * 3); + resmem_int_op()(this->ctx, d_iat2it, this->nat); + resmem_var_op()(this->ctx, d_vloc_per_type, ucell.ntype * rho_basis->npw); + resmem_complex_op()(this->ctx, d_aux, rho_basis->npw); + resmem_var_op()(this->ctx, d_force, this->nat * 3); + + // 数据传输到设备 + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_gcar, gcar_flat.data(), rho_basis->npw * 3); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_tau, tau_flat.data(), this->nat * 3); + syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, d_iat2it, iat2it_host.data(), this->nat); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_vloc_per_type, vloc_per_type_host.data(), ucell.ntype * rho_basis->npw); + syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_aux, aux_fptype.data(), rho_basis->npw); + + // 初始化force为0 + base_device::memory::set_memory_op()(this->ctx, d_force, 0.0, this->nat * 3); + } + else { - aux_fptype[ig] = std::complex(static_cast(aux[ig].real()), - static_cast(aux[ig].imag())); - } - - // Call the sincos operator - hamilt::cal_force_loc_sincos_op sincos_op; - sincos_op(this->ctx, - this->nat, - rho_basis->npw, - ucell.ntype, - gcar_flat.data(), - tau_flat.data(), - iat2it_array.data(), - vloc_factors.data(), - aux_fptype.data(), - force_flat.data()); - - // Copy back the results with scaling + d_force = force_host.data(); + // 对于CPU,直接初始化为0 + std::fill(force_host.begin(), force_host.end(), static_cast(0.0)); + } + + // 计算缩放因子 const FPTYPE scale_factor = static_cast(ucell.tpiba * ucell.omega); - copy_back_loc_force_data(force_flat.data(), this->nat, scale_factor, forcelc); + + // 调用op进行sincos计算 + hamilt::cal_force_loc_sincos_op()( + this->ctx, + this->nat, + rho_basis->npw, + ucell.ntype, + d_gcar, + d_tau, + d_iat2it, + d_vloc_per_type, + d_aux, + scale_factor, + d_force + ); + + // 根据设备类型处理结果 + if (this->device == base_device::GpuDevice) + { + // 结果传回CPU + syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, force_host.data(), d_force, this->nat * 3); + + // 清理设备内存 + delmem_var_op()(this->ctx, d_gcar); + delmem_var_op()(this->ctx, d_tau); + delmem_int_op()(this->ctx, d_iat2it); + delmem_var_op()(this->ctx, d_vloc_per_type); + delmem_complex_op()(this->ctx, d_aux); + delmem_var_op()(this->ctx, d_force); + } + + // 将结果写入forcelc矩阵 + for (int iat = 0; iat < this->nat; iat++) { + forcelc(iat, 0) = static_cast(force_host[iat * 3 + 0]); + forcelc(iat, 1) = static_cast(force_host[iat * 3 + 1]); + forcelc(iat, 2) = static_cast(force_host[iat * 3 + 2]); + } // this->print(GlobalV::ofs_running, "local forces", forcelc); Parallel_Reduce::reduce_pool(forcelc.c, forcelc.nr * forcelc.nc); @@ -879,48 +762,233 @@ void Forces::cal_force_ew(const UnitCell& ucell, aux[rho_basis->ig_gge0] = std::complex(0.0, 0.0); } - // Prepare data for the parallel operator - std::vector gcar_flat, tau_flat, it_facts, zv_values, latvec_flat, G_flat; - std::vector iat2it_array, atom_na; - prepare_ew_parallel_data(ucell, rho_basis, aux, alpha, fact, - gcar_flat, tau_flat, iat2it_array, it_facts, - atom_na, zv_values, latvec_flat, G_flat); + // =============== 第一步:G空间sincos计算(在OpenMP区域外)=============== + + // 预计算每个原子的it_fact和相关数据:按照全局原子索引iat顺序 + std::vector it_facts_host(this->nat); + std::vector tau_flat(this->nat * 3); + std::vector iat2it_host(this->nat); + + // 按照原始代码逻辑:遍历全局原子索引iat,通过查表获取(it,ia) + for (int iat = 0; iat < this->nat; iat++) { + int it = ucell.iat2it[iat]; // 查表获取原子类型 + int ia = ucell.iat2ia[iat]; // 查表获取该类型内的原子索引 + + double zv; + if (PARAM.inp.use_paw) + { +#ifdef USE_PAW + zv = GlobalC::paw_cell.get_val(it); +#endif + } + else + { + zv = ucell.atoms[it].ncpp.zv; + } + + it_facts_host[iat] = static_cast(zv * ModuleBase::e2 * ucell.tpiba * + ModuleBase::TWO_PI / ucell.omega * fact); + + tau_flat[iat * 3 + 0] = static_cast(ucell.atoms[it].tau[ia][0]); + tau_flat[iat * 3 + 1] = static_cast(ucell.atoms[it].tau[ia][1]); + tau_flat[iat * 3 + 2] = static_cast(ucell.atoms[it].tau[ia][2]); + iat2it_host[iat] = it; + } + + // 准备设备端数据 + std::vector gcar_flat(rho_basis->npw * 3); + for (int ig = 0; ig < rho_basis->npw; ig++) { + gcar_flat[ig * 3 + 0] = static_cast(rho_basis->gcar[ig][0]); + gcar_flat[ig * 3 + 1] = static_cast(rho_basis->gcar[ig][1]); + gcar_flat[ig * 3 + 2] = static_cast(rho_basis->gcar[ig][2]); + } + + // 转换aux到FPTYPE类型 + std::vector> aux_fptype(rho_basis->npw); + for (int ig = 0; ig < rho_basis->npw; ig++) { + aux_fptype[ig] = static_cast>(aux[ig]); + } + + // 设备端内存和指针设置(根据设备类型分支) + FPTYPE* d_gcar = gcar_flat.data(); + FPTYPE* d_tau = tau_flat.data(); + int* d_iat2it = iat2it_host.data(); + FPTYPE* d_it_facts = it_facts_host.data(); + std::complex* d_aux = aux_fptype.data(); + FPTYPE* d_force_g = nullptr; + std::vector force_g_host(this->nat * 3); + + if (this->device == base_device::GpuDevice) + { + d_gcar = nullptr; + d_tau = nullptr; + d_iat2it = nullptr; + d_it_facts = nullptr; + d_aux = nullptr; + + resmem_var_op()(this->ctx, d_gcar, rho_basis->npw * 3); + resmem_var_op()(this->ctx, d_tau, this->nat * 3); + resmem_int_op()(this->ctx, d_iat2it, this->nat); + resmem_var_op()(this->ctx, d_it_facts, this->nat); + resmem_complex_op()(this->ctx, d_aux, rho_basis->npw); + resmem_var_op()(this->ctx, d_force_g, this->nat * 3); + + // 数据传输 + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_gcar, gcar_flat.data(), rho_basis->npw * 3); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_tau, tau_flat.data(), this->nat * 3); + syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, d_iat2it, iat2it_host.data(), this->nat); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_it_facts, it_facts_host.data(), this->nat); + syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_aux, aux_fptype.data(), rho_basis->npw); + + // 初始化force为0 + base_device::memory::set_memory_op()(this->ctx, d_force_g, 0.0, this->nat * 3); + } + else + { + d_force_g = force_g_host.data(); + // 对于CPU,直接初始化为0 + std::fill(force_g_host.begin(), force_g_host.end(), static_cast(0.0)); + } + + // 调用op处理G空间sincos计算(在OpenMP区域外,无冲突) + hamilt::cal_force_ew_sincos_op()( + this->ctx, + this->nat, + rho_basis->npw, + rho_basis->ig_gge0, // G=0项索引,op内部会自动跳过 + d_gcar, + d_tau, + d_iat2it, + d_it_facts, + d_aux, + d_force_g + ); + + // 根据设备类型处理结果 + if (this->device == base_device::GpuDevice) + { + // 将G空间结果传回CPU + syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, force_g_host.data(), d_force_g, this->nat * 3); + + // 清理设备内存 + delmem_var_op()(this->ctx, d_gcar); + delmem_var_op()(this->ctx, d_tau); + delmem_int_op()(this->ctx, d_iat2it); + delmem_var_op()(this->ctx, d_it_facts); + delmem_complex_op()(this->ctx, d_aux); + delmem_var_op()(this->ctx, d_force_g); + } + + // 累加到forceion + for (int iat = 0; iat < this->nat; iat++) { + forceion(iat, 0) += static_cast(force_g_host[iat * 3 + 0]); + forceion(iat, 1) += static_cast(force_g_host[iat * 3 + 1]); + forceion(iat, 2) += static_cast(force_g_host[iat * 3 + 2]); + } - // Prepare force array for the operator - std::vector force_flat(this->nat * 3, 0.0); + // =============== 第二步:实空间计算(保留原OpenMP结构)=============== - // Convert aux to FPTYPE complex array - std::vector> aux_fptype(rho_basis->npw); - for (int ig = 0; ig < rho_basis->npw; ig++) +#ifdef _OPENMP +#pragma omp parallel { - aux_fptype[ig] = std::complex(static_cast(aux[ig].real()), - static_cast(aux[ig].imag())); - } - - // Call the parallel operator - hamilt::cal_force_ew_parallel_op parallel_op; - parallel_op(this->ctx, - this->nat, - rho_basis->npw, - ucell.ntype, - rho_basis->ig_gge0, - gcar_flat.data(), - tau_flat.data(), - iat2it_array.data(), - it_facts.data(), - atom_na.data(), - zv_values.data(), - aux_fptype.data(), - static_cast(alpha), - static_cast(fact), - static_cast(ucell.lat0), - latvec_flat.data(), - G_flat.data(), - PARAM.inp.use_paw, - force_flat.data()); - - // Copy back the results - copy_back_ew_force_data(force_flat.data(), this->nat, forceion); + int num_threads = omp_get_num_threads(); + int thread_id = omp_get_thread_num(); +#else + int num_threads = 1; + int thread_id = 0; +#endif + + /* Here is task distribution for multi-thread, + 0. atom will be iterated both in main nat loop and the loop in `if (rho_basis->ig_gge0 >= 0)`. + To avoid syncing, we must calculate work range of each thread by our self + 1. Calculate the iat range [iat_beg, iat_end) by each thread + a. when it is single thread stage, [iat_beg, iat_end) will be [0, nat) + 2. each thread iterate atoms form `iat_beg` to `iat_end-1` + */ + int iat_beg, iat_end; + int it_beg, ia_beg; + ModuleBase::TASK_DIST_1D(num_threads, thread_id, this->nat, iat_beg, iat_end); + iat_end = iat_beg + iat_end; + ucell.iat2iait(iat_beg, &ia_beg, &it_beg); + + // 只保留实空间相互作用计算(means that the processor contains G=0 term) + if (rho_basis->ig_gge0 >= 0) + { + double rmax = 5.0 / (sqrt(alpha) * ucell.lat0); + int nrm = 0; + + // output of rgen: the number of vectors in the sphere + const int mxr = 200; + // the maximum number of R vectors included in r + ModuleBase::Vector3* r = new ModuleBase::Vector3[mxr]; + double* r2 = new double[mxr]; + ModuleBase::GlobalFunc::ZEROS(r2, mxr); + int* irr = new int[mxr]; + ModuleBase::GlobalFunc::ZEROS(irr, mxr); + // the square modulus of R_j-tau_s-tau_s' + + int iat1 = iat_beg; + int T1 = it_beg; + int I1 = ia_beg; + const double sqa = sqrt(alpha); + const double sq8a_2pi = sqrt(8.0 * alpha / ModuleBase::TWO_PI); + + // iterating atoms. + // do not need to sync threads because task range of each thread is isolated + while (iat1 < iat_end) + { + int iat2 = 0; // mohan fix bug 2011-06-07 + int I2 = 0; + int T2 = 0; + while (iat2 < this->nat) + { + if (iat1 != iat2 && ucell.atoms[T2].na != 0 && ucell.atoms[T1].na != 0) + { + ModuleBase::Vector3 d_tau + = ucell.atoms[T1].tau[I1] - ucell.atoms[T2].tau[I2]; + H_Ewald_pw::rgen(d_tau, rmax, irr, ucell.latvec, ucell.G, r, r2, nrm); + + for (int n = 0; n < nrm; n++) + { + const double rr = sqrt(r2[n]) * ucell.lat0; + + double factor; + if (PARAM.inp.use_paw) + { +#ifdef USE_PAW + factor = GlobalC::paw_cell.get_val(T1) * GlobalC::paw_cell.get_val(T2) * ModuleBase::e2 + / (rr * rr) + * (erfc(sqa * rr) / rr + sq8a_2pi * ModuleBase::libm::exp(-alpha * rr * rr)) + * ucell.lat0; +#endif + } + else + { + factor = ucell.atoms[T1].ncpp.zv * ucell.atoms[T2].ncpp.zv + * ModuleBase::e2 / (rr * rr) + * (erfc(sqa * rr) / rr + sq8a_2pi * ModuleBase::libm::exp(-alpha * rr * rr)) + * ucell.lat0; + } + + forceion(iat1, 0) -= factor * r[n].x; + forceion(iat1, 1) -= factor * r[n].y; + forceion(iat1, 2) -= factor * r[n].z; + } + } + ++iat2; + ucell.step_iait(&I2, &T2); + } // atom b + ++iat1; + ucell.step_iait(&I1, &T1); + } // atom a + + delete[] r; + delete[] r2; + delete[] irr; + } +#ifdef _OPENMP + } +#endif Parallel_Reduce::reduce_pool(forceion.c, forceion.nr * forceion.nc); diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu index 5d0656d105..077864caff 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu @@ -13,6 +13,125 @@ namespace hamilt { +// CUDA kernels for sincos loops +template +__global__ void cal_force_loc_sincos_kernel( + const int nat, + const int npw, + const int ntype, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* vloc_per_type, + const thrust::complex* aux, + const FPTYPE scale_factor, + FPTYPE* force) +{ + const FPTYPE TWO_PI = 2.0 * M_PI; + + const int iat = blockIdx.y; + const int ig_start = blockIdx.x * blockDim.x + threadIdx.x; + const int ig_stride = gridDim.x * blockDim.x; + + if (iat >= nat) return; + + // Load atom information to registers + const int it = iat2it[iat]; + const FPTYPE tau_x = tau[iat * 3 + 0]; + const FPTYPE tau_y = tau[iat * 3 + 1]; + const FPTYPE tau_z = tau[iat * 3 + 2]; + + // Local accumulation variables + FPTYPE local_force_x = 0.0; + FPTYPE local_force_y = 0.0; + FPTYPE local_force_z = 0.0; + + // Grid-stride loop over G-vectors + for (int ig = ig_start; ig < npw; ig += ig_stride) { + // Calculate phase: 2π * (G · τ) + const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + + gcar[ig * 3 + 1] * tau_y + + gcar[ig * 3 + 2] * tau_z); + + // Use CUDA intrinsic for sincos + FPTYPE sinp, cosp; + sincos(phase, &sinp, &cosp); + + // Calculate force factor + const FPTYPE vloc_factor = vloc_per_type[it * npw + ig]; + const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real()); + + // Accumulate force contributions + local_force_x += gcar[ig * 3 + 0] * factor; + local_force_y += gcar[ig * 3 + 1] * factor; + local_force_z += gcar[ig * 3 + 2] * factor; + } + + // Atomic add to global memory + atomicAdd(&force[iat * 3 + 0], local_force_x * scale_factor); + atomicAdd(&force[iat * 3 + 1], local_force_y * scale_factor); + atomicAdd(&force[iat * 3 + 2], local_force_z * scale_factor); +} + +template +__global__ void cal_force_ew_sincos_kernel( + const int nat, + const int npw, + const int ig_gge0, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* it_facts, + const thrust::complex* aux, + FPTYPE* force) +{ + const FPTYPE TWO_PI = 2.0 * M_PI; + + const int iat = blockIdx.y; + const int ig_start = blockIdx.x * blockDim.x + threadIdx.x; + const int ig_stride = gridDim.x * blockDim.x; + + if (iat >= nat) return; + + // Load atom information to registers + const FPTYPE tau_x = tau[iat * 3 + 0]; + const FPTYPE tau_y = tau[iat * 3 + 1]; + const FPTYPE tau_z = tau[iat * 3 + 2]; + const FPTYPE it_fact = it_facts[iat]; + + // Local accumulation variables + FPTYPE local_force_x = 0.0; + FPTYPE local_force_y = 0.0; + FPTYPE local_force_z = 0.0; + + // Grid-stride loop over G-vectors + for (int ig = ig_start; ig < npw; ig += ig_stride) { + // Skip G=0 term + if (ig == ig_gge0) continue; + + // Calculate phase: 2π * (G · τ) + const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + + gcar[ig * 3 + 1] * tau_y + + gcar[ig * 3 + 2] * tau_z); + + // Use CUDA intrinsic for sincos + FPTYPE sinp, cosp; + sincos(phase, &sinp, &cosp); + + // Calculate Ewald sum contribution + const FPTYPE factor = it_fact * (cosp * aux[ig].imag() + sinp * aux[ig].real()); + + // Accumulate force contributions + local_force_x += gcar[ig * 3 + 0] * factor; + local_force_y += gcar[ig * 3 + 1] * factor; + local_force_z += gcar[ig * 3 + 2] * factor; + } + + // Atomic add to global memory + atomicAdd(&force[iat * 3 + 0], local_force_x); + atomicAdd(&force[iat * 3 + 1], local_force_y); + atomicAdd(&force[iat * 3 + 2], local_force_z); +} template __global__ void cal_vkb1_nl( @@ -188,6 +307,67 @@ void cal_force_nl_op::operator()(const base_dev cudaCheckOnDebug(); } +// GPU operators +template +void cal_force_loc_sincos_op::operator()( + const base_device::DEVICE_GPU* ctx, + const int& nat, + const int& npw, + const int& ntype, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* vloc_per_type, + const std::complex* aux, + const FPTYPE& scale_factor, + FPTYPE* force) +{ + // Calculate optimal grid configuration for GPU load balancing + const int threads_per_block = THREADS_PER_BLOCK; + const int max_blocks_per_sm = 32; // Configurable per GPU architecture + const int max_blocks_x = std::min(max_blocks_per_sm, (npw + threads_per_block - 1) / threads_per_block); + + dim3 grid(max_blocks_x, nat); + dim3 block(threads_per_block); + + cal_force_loc_sincos_kernel<<>>( + nat, npw, ntype, gcar, tau, iat2it, vloc_per_type, + reinterpret_cast*>(aux), + scale_factor, force + ); + + cudaCheckOnDebug(); +} + +template +void cal_force_ew_sincos_op::operator()( + const base_device::DEVICE_GPU* ctx, + const int& nat, + const int& npw, + const int& ig_gge0, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* it_facts, + const std::complex* aux, + FPTYPE* force) +{ + // Calculate optimal grid configuration for GPU load balancing + const int threads_per_block = THREADS_PER_BLOCK; + const int max_blocks_per_sm = 32; // Configurable per GPU architecture + const int max_blocks_x = std::min(max_blocks_per_sm, (npw + threads_per_block - 1) / threads_per_block); + + dim3 grid(max_blocks_x, nat); + dim3 block(threads_per_block); + + cal_force_ew_sincos_kernel<<>>( + nat, npw, ig_gge0, gcar, tau, iat2it, it_facts, + reinterpret_cast*>(aux), force + ); + + cudaCheckOnDebug(); +} + template __global__ void cal_force_nl( const int ntype, @@ -613,8 +793,12 @@ template void saveVkbValues(const int *gcar_zero_ptrs, const std::comple template struct cal_vkb1_nl_op; template struct cal_force_nl_op; +template struct cal_force_loc_sincos_op; +template struct cal_force_ew_sincos_op; template struct cal_vkb1_nl_op; template struct cal_force_nl_op; +template struct cal_force_loc_sincos_op; +template struct cal_force_ew_sincos_op; } // namespace hamilt diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_sincos_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_sincos_op.cu deleted file mode 100644 index f97214893e..0000000000 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_sincos_op.cu +++ /dev/null @@ -1,204 +0,0 @@ -#include "module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.h" -#include "module_base/module_device/types.h" - -#include -#include -#include -#include - -#define THREADS_PER_BLOCK 256 - -namespace hamilt { - -// CUDA kernels for sincos loops -template -__global__ void cal_force_loc_sincos_kernel( - const int nat, - const int npw, - const FPTYPE* gcar, - const FPTYPE* tau, - const int* iat2it, - const FPTYPE* vloc_factors, - const thrust::complex* aux, - const FPTYPE scale_factor, - FPTYPE* force) -{ - const FPTYPE TWO_PI = 2.0 * M_PI; - - const int iat = blockIdx.y; - const int ig_start = blockIdx.x * blockDim.x + threadIdx.x; - const int ig_stride = gridDim.x * blockDim.x; - - if (iat >= nat) return; - - // Load atom information to registers - const int it = iat2it[iat]; - const FPTYPE tau_x = tau[iat * 3 + 0]; - const FPTYPE tau_y = tau[iat * 3 + 1]; - const FPTYPE tau_z = tau[iat * 3 + 2]; - - // Local accumulation variables - FPTYPE local_force_x = 0.0; - FPTYPE local_force_y = 0.0; - FPTYPE local_force_z = 0.0; - - // Grid-stride loop over G-vectors - for (int ig = ig_start; ig < npw; ig += ig_stride) { - // Calculate phase: 2π * (G · τ) - const FPTYPE phase = TWO_PI * ( - gcar[ig * 3 + 0] * tau_x + - gcar[ig * 3 + 1] * tau_y + - gcar[ig * 3 + 2] * tau_z - ); - - // Use CUDA intrinsic for sincos - FPTYPE sinp, cosp; - __sincos(phase, &sinp, &cosp); - - // Calculate force factor - const FPTYPE factor = vloc_factors[ig] * - (cosp * aux[ig].imag() + sinp * aux[ig].real()); - - // Accumulate force contributions - local_force_x += gcar[ig * 3 + 0] * factor; - local_force_y += gcar[ig * 3 + 1] * factor; - local_force_z += gcar[ig * 3 + 2] * factor; - } - - // Atomic add to global memory - atomicAdd(&force[iat * 3 + 0], local_force_x * scale_factor); - atomicAdd(&force[iat * 3 + 1], local_force_y * scale_factor); - atomicAdd(&force[iat * 3 + 2], local_force_z * scale_factor); -} - -template -__global__ void cal_force_ew_sincos_kernel( - const int nat, - const int npw, - const int ig_gge0, - const FPTYPE* gcar, - const FPTYPE* tau, - const int* iat2it, - const FPTYPE* it_facts, - const thrust::complex* aux, - FPTYPE* force) -{ - const FPTYPE TWO_PI = 2.0 * M_PI; - - const int iat = blockIdx.y; - const int ig_start = blockIdx.x * blockDim.x + threadIdx.x; - const int ig_stride = gridDim.x * blockDim.x; - - if (iat >= nat) return; - - // Load atom information to registers - const int it = iat2it[iat]; - const FPTYPE tau_x = tau[iat * 3 + 0]; - const FPTYPE tau_y = tau[iat * 3 + 1]; - const FPTYPE tau_z = tau[iat * 3 + 2]; - const FPTYPE it_fact = it_facts[iat]; - - // Local accumulation variables - FPTYPE local_force_x = 0.0; - FPTYPE local_force_y = 0.0; - FPTYPE local_force_z = 0.0; - - // Grid-stride loop over G-vectors - for (int ig = ig_start; ig < npw; ig += ig_stride) { - // Skip G=0 term - if (ig == ig_gge0) continue; - - // Calculate phase: 2π * (G · τ) - const FPTYPE arg = TWO_PI * ( - gcar[ig * 3 + 0] * tau_x + - gcar[ig * 3 + 1] * tau_y + - gcar[ig * 3 + 2] * tau_z - ); - - // Use CUDA intrinsic for sincos - FPTYPE sinp, cosp; - __sincos(arg, &sinp, &cosp); - - // Calculate Ewald sum contribution - const FPTYPE sumnb = -cosp * aux[ig].imag() + sinp * aux[ig].real(); - - // Accumulate force contributions - local_force_x += gcar[ig * 3 + 0] * sumnb; - local_force_y += gcar[ig * 3 + 1] * sumnb; - local_force_z += gcar[ig * 3 + 2] * sumnb; - } - - // Atomic add to global memory - atomicAdd(&force[iat * 3 + 0], local_force_x * it_fact); - atomicAdd(&force[iat * 3 + 1], local_force_y * it_fact); - atomicAdd(&force[iat * 3 + 2], local_force_z * it_fact); -} - -// GPU operator implementations -template -void cal_force_loc_sincos_op::operator()( - const base_device::DEVICE_GPU* ctx, - const int& nat, - const int& npw, - const FPTYPE* gcar, - const FPTYPE* tau, - const int* iat2it, - const FPTYPE* vloc_factors, - const std::complex* aux, - const FPTYPE& scale_factor, - FPTYPE* force) -{ - // Calculate optimal grid configuration - const int threads_per_block = THREADS_PER_BLOCK; - const int max_blocks_x = min(32, (npw + threads_per_block - 1) / threads_per_block); - - dim3 grid(max_blocks_x, nat); - dim3 block(threads_per_block); - - cal_force_loc_sincos_kernel<<>>( - nat, npw, - gcar, tau, iat2it, vloc_factors, - reinterpret_cast*>(aux), - scale_factor, force - ); - - cudaCheckOnDebug(); -} - -template -void cal_force_ew_sincos_op::operator()( - const base_device::DEVICE_GPU* ctx, - const int& nat, - const int& npw, - const int& ig_gge0, - const FPTYPE* gcar, - const FPTYPE* tau, - const int* iat2it, - const FPTYPE* it_facts, - const std::complex* aux, - FPTYPE* force) -{ - // Calculate optimal grid configuration - const int threads_per_block = THREADS_PER_BLOCK; - const int max_blocks_x = min(32, (npw + threads_per_block - 1) / threads_per_block); - - dim3 grid(max_blocks_x, nat); - dim3 block(threads_per_block); - - cal_force_ew_sincos_kernel<<>>( - nat, npw, ig_gge0, - gcar, tau, iat2it, it_facts, - reinterpret_cast*>(aux), - force - ); - - cudaCheckOnDebug(); -} - -template struct cal_force_loc_sincos_op; -template struct cal_force_loc_sincos_op; - -template struct cal_force_ew_sincos_op; -template struct cal_force_ew_sincos_op; - -} // namespace hamilt \ No newline at end of file diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp index b5745f2ac5..e0dc91a70c 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp @@ -439,8 +439,9 @@ struct cal_force_loc_sincos_op const FPTYPE* gcar, const FPTYPE* tau, const int* iat2it, - const FPTYPE* vloc_factors, + const FPTYPE* vloc_per_type, const std::complex* aux, + const FPTYPE& scale_factor, FPTYPE* force) { const FPTYPE TWO_PI = 2.0 * M_PI; @@ -465,8 +466,8 @@ struct cal_force_loc_sincos_op FPTYPE sinp, cosp; ModuleBase::libm::sincos(phase, &sinp, &cosp); - const FPTYPE vloc_factor = vloc_factors[it * npw + ig]; - const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real()); + const FPTYPE vloc_factor = vloc_per_type[it * npw + ig]; + const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real()) * scale_factor; local_force[0] += gcar[ig * 3 + 0] * factor; local_force[1] += gcar[ig * 3 + 1] * factor; @@ -480,209 +481,66 @@ struct cal_force_loc_sincos_op } }; -// CPU implementation of Ewald parallel operator +// CPU implementation of Ewald force sincos operator template -struct cal_force_ew_parallel_op +struct cal_force_ew_sincos_op { void operator()(const base_device::DEVICE_CPU* ctx, const int& nat, const int& npw, - const int& ntype, const int& ig_gge0, const FPTYPE* gcar, const FPTYPE* tau, const int* iat2it, const FPTYPE* it_facts, - const int* atom_na, - const FPTYPE* zv_values, const std::complex* aux, - const FPTYPE& alpha, - const FPTYPE& fact, - const FPTYPE& lat0, - const FPTYPE* latvec, - const FPTYPE* G, - const bool& use_paw, FPTYPE* force) { const FPTYPE TWO_PI = 2.0 * M_PI; - const FPTYPE e2 = 2.0; // ModuleBase::e2 equivalent #ifdef _OPENMP -#pragma omp parallel - { - int num_threads = omp_get_num_threads(); - int thread_id = omp_get_thread_num(); -#else - int num_threads = 1; - int thread_id = 0; +#pragma omp parallel for #endif + for (int iat = 0; iat < nat; ++iat) + { + const FPTYPE tau_x = tau[iat * 3 + 0]; + const FPTYPE tau_y = tau[iat * 3 + 1]; + const FPTYPE tau_z = tau[iat * 3 + 2]; + const FPTYPE it_fact = it_facts[iat]; - // Task distribution for multi-thread - int iat_beg, iat_end; - ModuleBase::TASK_DIST_1D(num_threads, thread_id, nat, iat_beg, iat_end); - iat_end = iat_beg + iat_end; - - // Preprocess ig_gap for skipping the ig point - int ig_gap = (ig_gge0 >= 0 && ig_gge0 < npw) ? ig_gge0 : -1; + FPTYPE local_force[3] = {0.0, 0.0, 0.0}; - // Part 1: sincos calculation (replacing original while loop with for loop) - for (int iat = iat_beg; iat < iat_end; ++iat) + for (int ig = 0; ig < npw; ig++) { - const int it = iat2it[iat]; - - // Check if this atom type has atoms - if (atom_na[it] == 0) continue; - - const FPTYPE tau_x = tau[iat * 3 + 0]; - const FPTYPE tau_y = tau[iat * 3 + 1]; - const FPTYPE tau_z = tau[iat * 3 + 2]; - const FPTYPE it_fact = it_facts[iat]; + // Skip G=0 term + if (ig == ig_gge0) continue; - FPTYPE local_force[3] = {0.0, 0.0, 0.0}; - - // G-vector loop with ig_gap handling (using for loops for GPU compatibility) - for (int ig = 0; ig < npw; ig++) - { - // Skip G=0 term - if (ig == ig_gap) continue; - - const FPTYPE arg = TWO_PI * (gcar[ig * 3 + 0] * tau_x + - gcar[ig * 3 + 1] * tau_y + - gcar[ig * 3 + 2] * tau_z); - FPTYPE sinp, cosp; - ModuleBase::libm::sincos(arg, &sinp, &cosp); - - const FPTYPE sumnb = -cosp * aux[ig].imag() + sinp * aux[ig].real(); - - local_force[0] += gcar[ig * 3 + 0] * sumnb; - local_force[1] += gcar[ig * 3 + 1] * sumnb; - local_force[2] += gcar[ig * 3 + 2] * sumnb; - } - - // Apply it_fact scaling - force[iat * 3 + 0] += local_force[0] * it_fact; - force[iat * 3 + 1] += local_force[1] * it_fact; - force[iat * 3 + 2] += local_force[2] * it_fact; - } - - // Part 2: Real space Ewald calculation (when processor contains G=0 term) - if (ig_gge0 >= 0) - { - const FPTYPE rmax = 5.0 / (sqrt(alpha) * lat0); - const int mxr = 200; + const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + + gcar[ig * 3 + 1] * tau_y + + gcar[ig * 3 + 2] * tau_z); + FPTYPE sinp, cosp; + ModuleBase::libm::sincos(phase, &sinp, &cosp); - // Allocate temporary arrays for each thread - ModuleBase::Vector3* r_flat = new ModuleBase::Vector3[mxr]; - double* r2 = new double[mxr]; - int* irr = new int[mxr]; + const FPTYPE factor = it_fact * (cosp * aux[ig].imag() + sinp * aux[ig].real()); - // Initialize arrays - for (int i = 0; i < mxr; i++) { - r2[i] = 0.0; - irr[i] = 0; - r_flat[i].x = 0.0; - r_flat[i].y = 0.0; - r_flat[i].z = 0.0; - } - - const FPTYPE sqa = sqrt(alpha); - const FPTYPE sq8a_2pi = sqrt(8.0 * alpha / TWO_PI); - - // Iterate over atoms in assigned range - for (int iat1 = iat_beg; iat1 < iat_end; ++iat1) - { - const int T1 = iat2it[iat1]; - if (atom_na[T1] == 0) continue; - - const FPTYPE tau1_x = tau[iat1 * 3 + 0]; - const FPTYPE tau1_y = tau[iat1 * 3 + 1]; - const FPTYPE tau1_z = tau[iat1 * 3 + 2]; - - // Iterate over all other atoms - for (int iat2 = 0; iat2 < nat; ++iat2) - { - const int T2 = iat2it[iat2]; - - if (iat1 != iat2 && atom_na[T2] != 0 && atom_na[T1] != 0) - { - const FPTYPE tau2_x = tau[iat2 * 3 + 0]; - const FPTYPE tau2_y = tau[iat2 * 3 + 1]; - const FPTYPE tau2_z = tau[iat2 * 3 + 2]; - - // Calculate d_tau - const FPTYPE d_tau_x = tau1_x - tau2_x; - const FPTYPE d_tau_y = tau1_y - tau2_y; - const FPTYPE d_tau_z = tau1_z - tau2_z; - - // Reconstruct Matrix3 objects from flat arrays for rgen call - ModuleBase::Matrix3 latvec_matrix( - static_cast(latvec[0]), static_cast(latvec[1]), static_cast(latvec[2]), - static_cast(latvec[3]), static_cast(latvec[4]), static_cast(latvec[5]), - static_cast(latvec[6]), static_cast(latvec[7]), static_cast(latvec[8]) - ); - - ModuleBase::Matrix3 G_matrix( - static_cast(G[0]), static_cast(G[1]), static_cast(G[2]), - static_cast(G[3]), static_cast(G[4]), static_cast(G[5]), - static_cast(G[6]), static_cast(G[7]), static_cast(G[8]) - ); - - ModuleBase::Vector3 dtau_vec( - static_cast(d_tau_x), - static_cast(d_tau_y), - static_cast(d_tau_z) - ); - - // Call H_Ewald_pw::rgen to generate neighbor shells - int nrm = 0; - H_Ewald_pw::rgen(dtau_vec, static_cast(rmax), irr, latvec_matrix, G_matrix, r_flat, r2, nrm); - - // Process all generated r vectors - for (int nr = 0; nr < nrm; nr++) - { - const FPTYPE rr = sqrt(r2[nr]) * lat0; - - FPTYPE factor; - if (use_paw) - { - factor = zv_values[T1] * zv_values[T2] * e2 / (rr * rr) - * (erfc(sqa * rr) / rr + sq8a_2pi * exp(-alpha * rr * rr)) - * lat0; - } - else - { - factor = zv_values[T1] * zv_values[T2] * e2 / (rr * rr) - * (erfc(sqa * rr) / rr + sq8a_2pi * exp(-alpha * rr * rr)) - * lat0; - } - - force[iat1 * 3 + 0] -= factor * static_cast(r_flat[nr].x); - force[iat1 * 3 + 1] -= factor * static_cast(r_flat[nr].y); - force[iat1 * 3 + 2] -= factor * static_cast(r_flat[nr].z); - } - } - } - } - - delete[] r_flat; - delete[] r2; - delete[] irr; + local_force[0] += gcar[ig * 3 + 0] * factor; + local_force[1] += gcar[ig * 3 + 1] * factor; + local_force[2] += gcar[ig * 3 + 2] * factor; } -#ifdef _OPENMP + force[iat * 3 + 0] = local_force[0]; + force[iat * 3 + 1] = local_force[1]; + force[iat * 3 + 2] = local_force[2]; } -#endif } }; template struct cal_vkb1_nl_op; template struct cal_force_nl_op; template struct cal_force_loc_sincos_op; -template struct cal_force_ew_parallel_op; - +template struct cal_force_ew_sincos_op; template struct cal_vkb1_nl_op; template struct cal_force_nl_op; template struct cal_force_loc_sincos_op; -template struct cal_force_ew_parallel_op; - +template struct cal_force_ew_sincos_op; } // namespace hamilt diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h index 152f41e310..3fc5d92212 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h @@ -162,8 +162,9 @@ struct cal_force_loc_sincos_op /// @param gcar - G-vector Cartesian coordinates [npw * 3] /// @param tau - atomic positions [nat * 3] /// @param iat2it - atom to type mapping [nat] - /// @param vloc_factors - precomputed vloc factors [ntype * npw] + /// @param vloc_per_type - precomputed vloc factors per type [ntype * npw] /// @param aux - charge density in G-space [npw] + /// @param scale_factor - tpiba * omega /// /// Output Parameters /// @param force - output forces [nat * 3] @@ -174,55 +175,39 @@ struct cal_force_loc_sincos_op const FPTYPE* gcar, const FPTYPE* tau, const int* iat2it, - const FPTYPE* vloc_factors, + const FPTYPE* vloc_per_type, const std::complex* aux, + const FPTYPE& scale_factor, FPTYPE* force); }; template -struct cal_force_ew_parallel_op +struct cal_force_ew_sincos_op { - /// @brief Calculate Ewald forces (parallel region only) + /// @brief Calculate Ewald forces (sincos loop only) /// /// Input Parameters /// @param ctx - which device this function runs on /// @param nat - number of atoms /// @param npw - number of plane waves - /// @param ntype - number of atom types /// @param ig_gge0 - index of G=0 vector (-1 if not present) /// @param gcar - G-vector Cartesian coordinates [npw * 3] /// @param tau - atomic positions [nat * 3] /// @param iat2it - atom to type mapping [nat] /// @param it_facts - precomputed it_fact for each atom [nat] - /// @param atom_na - number of atoms of each type [ntype] - /// @param zv_values - valence charge for each type [ntype] /// @param aux - structure factor related array [npw] - /// @param alpha - Ewald parameter - /// @param fact - scaling factor - /// @param lat0 - lattice constant - /// @param latvec - lattice vectors [3 * 3] - /// @param G - reciprocal lattice vectors [3 * 3] - /// @param use_pwa - whether PAW is used + /// /// Output Parameters /// @param force - output forces [nat * 3] void operator()(const Device* ctx, const int& nat, const int& npw, - const int& ntype, const int& ig_gge0, const FPTYPE* gcar, const FPTYPE* tau, const int* iat2it, const FPTYPE* it_facts, - const int* atom_na, - const FPTYPE* zv_values, const std::complex* aux, - const FPTYPE& alpha, - const FPTYPE& fact, - const FPTYPE& lat0, - const FPTYPE* latvec, - const FPTYPE* G, - const bool& use_paw, FPTYPE* force); }; @@ -335,13 +320,14 @@ struct cal_force_loc_sincos_op const FPTYPE* gcar, const FPTYPE* tau, const int* iat2it, - const FPTYPE* vloc_factors, + const FPTYPE* vloc_per_type, const std::complex* aux, + const FPTYPE& scale_factor, FPTYPE* force); }; template -struct cal_force_ew_parallel_op +struct cal_force_ew_sincos_op { void operator()(const base_device::DEVICE_GPU* ctx, const int& nat, diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.cpp deleted file mode 100644 index e7ce8554cc..0000000000 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.cpp +++ /dev/null @@ -1,123 +0,0 @@ -#include "force_sincos_op.h" -#include "module_base/libm/libm.h" - -#ifdef _OPENMP -#include -#endif - -namespace hamilt -{ - -template -struct cal_force_loc_sincos_op -{ - void operator()(const base_device::DEVICE_CPU* ctx, - const int& nat, - const int& npw, - const FPTYPE* gcar, - const FPTYPE* tau, - const int* iat2it, - const FPTYPE* vloc_factors, - const std::complex* aux, - const FPTYPE& scale_factor, - FPTYPE* force) - { - const FPTYPE TWO_PI = 2.0 * M_PI; - -#ifdef _OPENMP -#pragma omp parallel for -#endif - for (int iat = 0; iat < nat; ++iat) - { - const int it = iat2it[iat]; - const FPTYPE tau_x = tau[iat * 3 + 0]; - const FPTYPE tau_y = tau[iat * 3 + 1]; - const FPTYPE tau_z = tau[iat * 3 + 2]; - - FPTYPE local_force[3] = {0.0, 0.0, 0.0}; - - for (int ig = 0; ig < npw; ig++) - { - const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + - gcar[ig * 3 + 1] * tau_y + - gcar[ig * 3 + 2] * tau_z); - FPTYPE sinp, cosp; - ModuleBase::libm::sincos(phase, &sinp, &cosp); - - const FPTYPE factor = vloc_factors[ig] * - (cosp * aux[ig].imag() + sinp * aux[ig].real()); - - local_force[0] += gcar[ig * 3 + 0] * factor; - local_force[1] += gcar[ig * 3 + 1] * factor; - local_force[2] += gcar[ig * 3 + 2] * factor; - } - - force[iat * 3 + 0] += local_force[0] * scale_factor; - force[iat * 3 + 1] += local_force[1] * scale_factor; - force[iat * 3 + 2] += local_force[2] * scale_factor; - } - } -}; - -template -struct cal_force_ew_sincos_op -{ - void operator()(const base_device::DEVICE_CPU* ctx, - const int& nat, - const int& npw, - const int& ig_gge0, - const FPTYPE* gcar, - const FPTYPE* tau, - const int* iat2it, - const FPTYPE* it_facts, - const std::complex* aux, - FPTYPE* force) - { - const FPTYPE TWO_PI = 2.0 * M_PI; - -#ifdef _OPENMP -#pragma omp parallel for -#endif - for (int iat = 0; iat < nat; ++iat) - { - const int it = iat2it[iat]; - const FPTYPE tau_x = tau[iat * 3 + 0]; - const FPTYPE tau_y = tau[iat * 3 + 1]; - const FPTYPE tau_z = tau[iat * 3 + 2]; - const FPTYPE it_fact = it_facts[iat]; - - FPTYPE local_force[3] = {0.0, 0.0, 0.0}; - - for (int ig = 0; ig < npw; ig++) - { - // Skip G=0 term - if (ig == ig_gge0) continue; - - const FPTYPE arg = TWO_PI * (gcar[ig * 3 + 0] * tau_x + - gcar[ig * 3 + 1] * tau_y + - gcar[ig * 3 + 2] * tau_z); - FPTYPE sinp, cosp; - ModuleBase::libm::sincos(arg, &sinp, &cosp); - - const FPTYPE sumnb = -cosp * aux[ig].imag() + sinp * aux[ig].real(); - - local_force[0] += gcar[ig * 3 + 0] * sumnb; - local_force[1] += gcar[ig * 3 + 1] * sumnb; - local_force[2] += gcar[ig * 3 + 2] * sumnb; - } - - force[iat * 3 + 0] += local_force[0] * it_fact; - force[iat * 3 + 1] += local_force[1] * it_fact; - force[iat * 3 + 2] += local_force[2] * it_fact; - } - } -}; - -// Template instantiations -template struct cal_force_loc_sincos_op; -template struct cal_force_loc_sincos_op; - -template struct cal_force_ew_sincos_op; -template struct cal_force_ew_sincos_op; - -} // namespace hamilt \ No newline at end of file diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.h deleted file mode 100644 index 455b8425a7..0000000000 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.h +++ /dev/null @@ -1,106 +0,0 @@ -#ifndef FORCE_SINCOS_OP_H -#define FORCE_SINCOS_OP_H - -#include "module_base/module_device/types.h" -#include - -namespace hamilt -{ - -template -struct cal_force_loc_sincos_op -{ - /// @brief Calculate local pseudopotential forces (sincos loop only) - /// - /// Input Parameters - /// @param ctx - which device this function runs on - /// @param nat - number of atoms - /// @param npw - number of plane waves - /// @param gcar - G-vector Cartesian coordinates [npw * 3] - /// @param tau - atomic positions [nat * 3] - /// @param iat2it - atom to type mapping [nat] - /// @param vloc_factors - precomputed vloc factors [npw] - /// @param aux - charge density in G-space [npw] - /// @param scale_factor - tpiba * omega - /// - /// Output Parameters - /// @param force - output forces [nat * 3] - void operator()(const Device* ctx, - const int& nat, - const int& npw, - const FPTYPE* gcar, - const FPTYPE* tau, - const int* iat2it, - const FPTYPE* vloc_factors, - const std::complex* aux, - const FPTYPE& scale_factor, - FPTYPE* force); -}; - -template -struct cal_force_ew_sincos_op -{ - /// @brief Calculate Ewald forces (sincos loop only) - /// - /// Input Parameters - /// @param ctx - which device this function runs on - /// @param nat - number of atoms - /// @param npw - number of plane waves - /// @param ig_gge0 - index of G=0 vector (-1 if not present) - /// @param gcar - G-vector Cartesian coordinates [npw * 3] - /// @param tau - atomic positions [nat * 3] - /// @param iat2it - atom to type mapping [nat] - /// @param it_facts - precomputed it_fact for each atom [nat] - /// @param aux - structure factor related array [npw] - /// - /// Output Parameters - /// @param force - output forces [nat * 3] - void operator()(const Device* ctx, - const int& nat, - const int& npw, - const int& ig_gge0, - const FPTYPE* gcar, - const FPTYPE* tau, - const int* iat2it, - const FPTYPE* it_facts, - const std::complex* aux, - FPTYPE* force); -}; - -#if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM - -template -struct cal_force_loc_sincos_op -{ - void operator()(const base_device::DEVICE_GPU* ctx, - const int& nat, - const int& npw, - const FPTYPE* gcar, - const FPTYPE* tau, - const int* iat2it, - const FPTYPE* vloc_factors, - const std::complex* aux, - const FPTYPE& scale_factor, - FPTYPE* force); -}; - -template -struct cal_force_ew_sincos_op -{ - void operator()(const base_device::DEVICE_GPU* ctx, - const int& nat, - const int& npw, - const int& ig_gge0, - const FPTYPE* gcar, - const FPTYPE* tau, - const int* iat2it, - const FPTYPE* it_facts, - const std::complex* aux, - FPTYPE* force); -}; - -#endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM - -} // namespace hamilt - -#endif // FORCE_SINCOS_OP_H \ No newline at end of file diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu index c78b333b86..9feeb76fed 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu @@ -618,10 +618,195 @@ template void revertVkbValues(const int *gcar_zero_ptrs, std::complex(const int *gcar_zero_ptrs, const std::complex *vkb_ptr, std::complex *vkb_save_ptr, int nkb, int gcar_zero_count, int npw, int ipol, int npwx); +// HIP kernels for sincos loops +template +__global__ void cal_force_loc_sincos_kernel( + const int nat, + const int npw, + const int ntype, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* vloc_per_type, + const thrust::complex* aux, + const FPTYPE scale_factor, + FPTYPE* force) +{ + const FPTYPE TWO_PI = 2.0 * M_PI; + + const int iat = blockIdx.y; + const int ig_start = blockIdx.x * blockDim.x + threadIdx.x; + const int ig_stride = gridDim.x * blockDim.x; + + if (iat >= nat) return; + + // Load atom information to registers + const int it = iat2it[iat]; + const FPTYPE tau_x = tau[iat * 3 + 0]; + const FPTYPE tau_y = tau[iat * 3 + 1]; + const FPTYPE tau_z = tau[iat * 3 + 2]; + + // Local accumulation variables + FPTYPE local_force_x = 0.0; + FPTYPE local_force_y = 0.0; + FPTYPE local_force_z = 0.0; + + // Grid-stride loop over G-vectors + for (int ig = ig_start; ig < npw; ig += ig_stride) { + // Calculate phase: 2π * (G · τ) + const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + + gcar[ig * 3 + 1] * tau_y + + gcar[ig * 3 + 2] * tau_z); + + // Use HIP intrinsic for sincos + FPTYPE sinp, cosp; + sincos(phase, &sinp, &cosp); + + // Calculate force factor + const FPTYPE vloc_factor = vloc_per_type[it * npw + ig]; + const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real()); + + // Accumulate force contributions + local_force_x += gcar[ig * 3 + 0] * factor; + local_force_y += gcar[ig * 3 + 1] * factor; + local_force_z += gcar[ig * 3 + 2] * factor; + } + + // Atomic add to global memory + atomicAdd(&force[iat * 3 + 0], local_force_x * scale_factor); + atomicAdd(&force[iat * 3 + 1], local_force_y * scale_factor); + atomicAdd(&force[iat * 3 + 2], local_force_z * scale_factor); +} + +template +__global__ void cal_force_ew_sincos_kernel( + const int nat, + const int npw, + const int ig_gge0, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* it_facts, + const thrust::complex* aux, + FPTYPE* force) +{ + const FPTYPE TWO_PI = 2.0 * M_PI; + + const int iat = blockIdx.y; + const int ig_start = blockIdx.x * blockDim.x + threadIdx.x; + const int ig_stride = gridDim.x * blockDim.x; + + if (iat >= nat) return; + + // Load atom information to registers + const FPTYPE tau_x = tau[iat * 3 + 0]; + const FPTYPE tau_y = tau[iat * 3 + 1]; + const FPTYPE tau_z = tau[iat * 3 + 2]; + const FPTYPE it_fact = it_facts[iat]; + + // Local accumulation variables + FPTYPE local_force_x = 0.0; + FPTYPE local_force_y = 0.0; + FPTYPE local_force_z = 0.0; + + // Grid-stride loop over G-vectors + for (int ig = ig_start; ig < npw; ig += ig_stride) { + // Skip G=0 term + if (ig == ig_gge0) continue; + + // Calculate phase: 2π * (G · τ) + const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + + gcar[ig * 3 + 1] * tau_y + + gcar[ig * 3 + 2] * tau_z); + + // Use HIP intrinsic for sincos + FPTYPE sinp, cosp; + sincos(phase, &sinp, &cosp); + + // Calculate Ewald sum contribution + const FPTYPE factor = it_fact * (cosp * aux[ig].imag() + sinp * aux[ig].real()); + + // Accumulate force contributions + local_force_x += gcar[ig * 3 + 0] * factor; + local_force_y += gcar[ig * 3 + 1] * factor; + local_force_z += gcar[ig * 3 + 2] * factor; + } + + // Atomic add to global memory + atomicAdd(&force[iat * 3 + 0], local_force_x); + atomicAdd(&force[iat * 3 + 1], local_force_y); + atomicAdd(&force[iat * 3 + 2], local_force_z); +} + +// GPU operators +template +void cal_force_loc_sincos_op::operator()( + const base_device::DEVICE_GPU* ctx, + const int& nat, + const int& npw, + const int& ntype, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* vloc_per_type, + const std::complex* aux, + const FPTYPE& scale_factor, + FPTYPE* force) +{ + // Calculate optimal grid configuration for GPU load balancing + const int threads_per_block = THREADS_PER_BLOCK; + const int max_blocks_per_sm = 32; // Configurable per GPU architecture + const int max_blocks_x = std::min(max_blocks_per_sm, (npw + threads_per_block - 1) / threads_per_block); + + dim3 grid(max_blocks_x, nat); + dim3 block(threads_per_block); + + hipLaunchKernelGGL(cal_force_loc_sincos_kernel, + grid, block, 0, 0, + nat, npw, ntype, gcar, tau, iat2it, vloc_per_type, + reinterpret_cast*>(aux), + scale_factor, force); + + hipCheckOnDebug(); +} + +template +void cal_force_ew_sincos_op::operator()( + const base_device::DEVICE_GPU* ctx, + const int& nat, + const int& npw, + const int& ig_gge0, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* it_facts, + const std::complex* aux, + FPTYPE* force) +{ + // Calculate optimal grid configuration for GPU load balancing + const int threads_per_block = THREADS_PER_BLOCK; + const int max_blocks_per_sm = 32; // Configurable per GPU architecture + const int max_blocks_x = std::min(max_blocks_per_sm, (npw + threads_per_block - 1) / threads_per_block); + + dim3 grid(max_blocks_x, nat); + dim3 block(threads_per_block); + + hipLaunchKernelGGL(cal_force_ew_sincos_kernel, + grid, block, 0, 0, + nat, npw, ig_gge0, gcar, tau, iat2it, it_facts, + reinterpret_cast*>(aux), force); + + hipCheckOnDebug(); +} + template struct cal_vkb1_nl_op; template struct cal_force_nl_op; +template struct cal_force_loc_sincos_op; +template struct cal_force_ew_sincos_op; template struct cal_vkb1_nl_op; template struct cal_force_nl_op; +template struct cal_force_loc_sincos_op; +template struct cal_force_ew_sincos_op; } // namespace hamilt diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_sincos_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_sincos_op.hip.cu deleted file mode 100644 index a09cd28bed..0000000000 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_sincos_op.hip.cu +++ /dev/null @@ -1,203 +0,0 @@ -#include "module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.h" - -#include -#include -#include -#include - -#define THREADS_PER_BLOCK 256 - -namespace hamilt { - -// HIP kernels for sincos loops -template -__global__ void cal_force_loc_sincos_kernel( - const int nat, - const int npw, - const FPTYPE* gcar, - const FPTYPE* tau, - const int* iat2it, - const FPTYPE* vloc_factors, - const thrust::complex* aux, - const FPTYPE scale_factor, - FPTYPE* force) -{ - const FPTYPE TWO_PI = 2.0 * M_PI; - - const int iat = blockIdx.y; - const int ig_start = blockIdx.x * blockDim.x + threadIdx.x; - const int ig_stride = gridDim.x * blockDim.x; - - if (iat >= nat) return; - - // Load atom information to registers - const int it = iat2it[iat]; - const FPTYPE tau_x = tau[iat * 3 + 0]; - const FPTYPE tau_y = tau[iat * 3 + 1]; - const FPTYPE tau_z = tau[iat * 3 + 2]; - - // Local accumulation variables - FPTYPE local_force_x = 0.0; - FPTYPE local_force_y = 0.0; - FPTYPE local_force_z = 0.0; - - // Grid-stride loop over G-vectors - for (int ig = ig_start; ig < npw; ig += ig_stride) { - // Calculate phase: 2π * (G · τ) - const FPTYPE phase = TWO_PI * ( - gcar[ig * 3 + 0] * tau_x + - gcar[ig * 3 + 1] * tau_y + - gcar[ig * 3 + 2] * tau_z - ); - - // Use HIP intrinsic for sincos - FPTYPE sinp, cosp; - __sincos(phase, &sinp, &cosp); - - // Calculate force factor - const FPTYPE factor = vloc_factors[ig] * - (cosp * aux[ig].imag() + sinp * aux[ig].real()); - - // Accumulate force contributions - local_force_x += gcar[ig * 3 + 0] * factor; - local_force_y += gcar[ig * 3 + 1] * factor; - local_force_z += gcar[ig * 3 + 2] * factor; - } - - // Atomic add to global memory - atomicAdd(&force[iat * 3 + 0], local_force_x * scale_factor); - atomicAdd(&force[iat * 3 + 1], local_force_y * scale_factor); - atomicAdd(&force[iat * 3 + 2], local_force_z * scale_factor); -} - -template -__global__ void cal_force_ew_sincos_kernel( - const int nat, - const int npw, - const int ig_gge0, - const FPTYPE* gcar, - const FPTYPE* tau, - const int* iat2it, - const FPTYPE* it_facts, - const thrust::complex* aux, - FPTYPE* force) -{ - const FPTYPE TWO_PI = 2.0 * M_PI; - - const int iat = blockIdx.y; - const int ig_start = blockIdx.x * blockDim.x + threadIdx.x; - const int ig_stride = gridDim.x * blockDim.x; - - if (iat >= nat) return; - - // Load atom information to registers - const int it = iat2it[iat]; - const FPTYPE tau_x = tau[iat * 3 + 0]; - const FPTYPE tau_y = tau[iat * 3 + 1]; - const FPTYPE tau_z = tau[iat * 3 + 2]; - const FPTYPE it_fact = it_facts[iat]; - - // Local accumulation variables - FPTYPE local_force_x = 0.0; - FPTYPE local_force_y = 0.0; - FPTYPE local_force_z = 0.0; - - // Grid-stride loop over G-vectors - for (int ig = ig_start; ig < npw; ig += ig_stride) { - // Skip G=0 term - if (ig == ig_gge0) continue; - - // Calculate phase: 2π * (G · τ) - const FPTYPE arg = TWO_PI * ( - gcar[ig * 3 + 0] * tau_x + - gcar[ig * 3 + 1] * tau_y + - gcar[ig * 3 + 2] * tau_z - ); - - // Use HIP intrinsic for sincos - FPTYPE sinp, cosp; - __sincos(arg, &sinp, &cosp); - - // Calculate Ewald sum contribution - const FPTYPE sumnb = -cosp * aux[ig].imag() + sinp * aux[ig].real(); - - // Accumulate force contributions - local_force_x += gcar[ig * 3 + 0] * sumnb; - local_force_y += gcar[ig * 3 + 1] * sumnb; - local_force_z += gcar[ig * 3 + 2] * sumnb; - } - - // Atomic add to global memory - atomicAdd(&force[iat * 3 + 0], local_force_x * it_fact); - atomicAdd(&force[iat * 3 + 1], local_force_y * it_fact); - atomicAdd(&force[iat * 3 + 2], local_force_z * it_fact); -} - -// GPU operator implementations -template -void cal_force_loc_sincos_op::operator()( - const base_device::DEVICE_GPU* ctx, - const int& nat, - const int& npw, - const FPTYPE* gcar, - const FPTYPE* tau, - const int* iat2it, - const FPTYPE* vloc_factors, - const std::complex* aux, - const FPTYPE& scale_factor, - FPTYPE* force) -{ - // Calculate optimal grid configuration - const int threads_per_block = THREADS_PER_BLOCK; - const int max_blocks_x = min(32, (npw + threads_per_block - 1) / threads_per_block); - - dim3 grid(max_blocks_x, nat); - dim3 block(threads_per_block); - - hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_force_loc_sincos_kernel), grid, block, 0, 0, - nat, npw, - gcar, tau, iat2it, vloc_factors, - reinterpret_cast*>(aux), - scale_factor, force - ); - - hipCheckOnDebug(); -} - -template -void cal_force_ew_sincos_op::operator()( - const base_device::DEVICE_GPU* ctx, - const int& nat, - const int& npw, - const int& ig_gge0, - const FPTYPE* gcar, - const FPTYPE* tau, - const int* iat2it, - const FPTYPE* it_facts, - const std::complex* aux, - FPTYPE* force) -{ - // Calculate optimal grid configuration - const int threads_per_block = THREADS_PER_BLOCK; - const int max_blocks_x = min(32, (npw + threads_per_block - 1) / threads_per_block); - - dim3 grid(max_blocks_x, nat); - dim3 block(threads_per_block); - - hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_force_ew_sincos_kernel), grid, block, 0, 0, - nat, npw, ig_gge0, - gcar, tau, iat2it, it_facts, - reinterpret_cast*>(aux), - force - ); - - hipCheckOnDebug(); -} - -template struct cal_force_loc_sincos_op; -template struct cal_force_loc_sincos_op; - -template struct cal_force_ew_sincos_op; -template struct cal_force_ew_sincos_op; - -} // namespace hamilt \ No newline at end of file From 465d991b7c2b94c385597751815255e32c536502 Mon Sep 17 00:00:00 2001 From: Jie Date: Tue, 3 Jun 2025 11:26:23 +0800 Subject: [PATCH 5/6] fix vloc computation in cal_force_loc_sincos_op --- source/module_hamilt_pw/hamilt_pwdft/forces.cpp | 15 ++++----------- .../hamilt_pwdft/kernels/cuda/force_op.cu | 7 ++----- .../hamilt_pwdft/kernels/force_op.cpp | 4 +--- .../hamilt_pwdft/kernels/force_op.h | 5 +---- .../hamilt_pwdft/kernels/rocm/force_op.hip.cu | 7 ++----- 5 files changed, 10 insertions(+), 28 deletions(-) diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp index 5e1d3cd6a2..d50288eef8 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp @@ -537,7 +537,6 @@ void Forces::cal_force_loc(const UnitCell& ucell, // 准备原子相关数据:按照全局原子索引iat顺序 std::vector tau_flat(this->nat * 3); - std::vector iat2it_host(this->nat); std::vector gcar_flat(rho_basis->npw * 3); // 按照原始代码逻辑:遍历全局原子索引iat,通过查表获取(it,ia) @@ -548,7 +547,6 @@ void Forces::cal_force_loc(const UnitCell& ucell, tau_flat[iat * 3 + 0] = static_cast(ucell.atoms[it].tau[ia][0]); tau_flat[iat * 3 + 1] = static_cast(ucell.atoms[it].tau[ia][1]); tau_flat[iat * 3 + 2] = static_cast(ucell.atoms[it].tau[ia][2]); - iat2it_host[iat] = it; } for (int ig = 0; ig < rho_basis->npw; ig++) { @@ -559,9 +557,10 @@ void Forces::cal_force_loc(const UnitCell& ucell, // 重新计算vloc_factors考虑所有原子类型 std::vector vloc_per_type_host(ucell.ntype * rho_basis->npw); - for (int it = 0; it < ucell.ntype; it++) { + for (int iat = 0; iat < this->nat; iat++) { + int it = ucell.iat2it[iat]; for (int ig = 0; ig < rho_basis->npw; ig++) { - vloc_per_type_host[it * rho_basis->npw + ig] = static_cast(vloc(it, rho_basis->ig2igg[ig])); + vloc_per_type_host[iat * rho_basis->npw + ig] = static_cast(vloc(it, rho_basis->ig2igg[ig])); } } @@ -574,7 +573,6 @@ void Forces::cal_force_loc(const UnitCell& ucell, // 设备端内存和指针设置(根据设备类型分支) FPTYPE* d_gcar = gcar_flat.data(); FPTYPE* d_tau = tau_flat.data(); - int* d_iat2it = iat2it_host.data(); FPTYPE* d_vloc_per_type = vloc_per_type_host.data(); std::complex* d_aux = aux_fptype.data(); FPTYPE* d_force = nullptr; @@ -584,13 +582,11 @@ void Forces::cal_force_loc(const UnitCell& ucell, { d_gcar = nullptr; d_tau = nullptr; - d_iat2it = nullptr; d_vloc_per_type = nullptr; d_aux = nullptr; resmem_var_op()(this->ctx, d_gcar, rho_basis->npw * 3); resmem_var_op()(this->ctx, d_tau, this->nat * 3); - resmem_int_op()(this->ctx, d_iat2it, this->nat); resmem_var_op()(this->ctx, d_vloc_per_type, ucell.ntype * rho_basis->npw); resmem_complex_op()(this->ctx, d_aux, rho_basis->npw); resmem_var_op()(this->ctx, d_force, this->nat * 3); @@ -598,7 +594,6 @@ void Forces::cal_force_loc(const UnitCell& ucell, // 数据传输到设备 syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_gcar, gcar_flat.data(), rho_basis->npw * 3); syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_tau, tau_flat.data(), this->nat * 3); - syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, d_iat2it, iat2it_host.data(), this->nat); syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_vloc_per_type, vloc_per_type_host.data(), ucell.ntype * rho_basis->npw); syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_aux, aux_fptype.data(), rho_basis->npw); @@ -623,7 +618,6 @@ void Forces::cal_force_loc(const UnitCell& ucell, ucell.ntype, d_gcar, d_tau, - d_iat2it, d_vloc_per_type, d_aux, scale_factor, @@ -639,7 +633,6 @@ void Forces::cal_force_loc(const UnitCell& ucell, // 清理设备内存 delmem_var_op()(this->ctx, d_gcar); delmem_var_op()(this->ctx, d_tau); - delmem_int_op()(this->ctx, d_iat2it); delmem_var_op()(this->ctx, d_vloc_per_type); delmem_complex_op()(this->ctx, d_aux); delmem_var_op()(this->ctx, d_force); @@ -652,7 +645,7 @@ void Forces::cal_force_loc(const UnitCell& ucell, forcelc(iat, 2) = static_cast(force_host[iat * 3 + 2]); } - // this->print(GlobalV::ofs_running, "local forces", forcelc); + // this->print(GlobalV: :ofs_running, "local forces", forcelc); Parallel_Reduce::reduce_pool(forcelc.c, forcelc.nr * forcelc.nc); delete[] aux; ModuleBase::timer::tick("Forces", "cal_force_loc"); diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu index 077864caff..a82bd1ee0b 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu @@ -21,7 +21,6 @@ __global__ void cal_force_loc_sincos_kernel( const int ntype, const FPTYPE* gcar, const FPTYPE* tau, - const int* iat2it, const FPTYPE* vloc_per_type, const thrust::complex* aux, const FPTYPE scale_factor, @@ -36,7 +35,6 @@ __global__ void cal_force_loc_sincos_kernel( if (iat >= nat) return; // Load atom information to registers - const int it = iat2it[iat]; const FPTYPE tau_x = tau[iat * 3 + 0]; const FPTYPE tau_y = tau[iat * 3 + 1]; const FPTYPE tau_z = tau[iat * 3 + 2]; @@ -58,7 +56,7 @@ __global__ void cal_force_loc_sincos_kernel( sincos(phase, &sinp, &cosp); // Calculate force factor - const FPTYPE vloc_factor = vloc_per_type[it * npw + ig]; + const FPTYPE vloc_factor = vloc_per_type[iat * npw + ig]; const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real()); // Accumulate force contributions @@ -316,7 +314,6 @@ void cal_force_loc_sincos_op::operator()( const int& ntype, const FPTYPE* gcar, const FPTYPE* tau, - const int* iat2it, const FPTYPE* vloc_per_type, const std::complex* aux, const FPTYPE& scale_factor, @@ -331,7 +328,7 @@ void cal_force_loc_sincos_op::operator()( dim3 block(threads_per_block); cal_force_loc_sincos_kernel<<>>( - nat, npw, ntype, gcar, tau, iat2it, vloc_per_type, + nat, npw, ntype, gcar, tau, vloc_per_type, reinterpret_cast*>(aux), scale_factor, force ); diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp index e0dc91a70c..49576abbfa 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp @@ -438,7 +438,6 @@ struct cal_force_loc_sincos_op const int& ntype, const FPTYPE* gcar, const FPTYPE* tau, - const int* iat2it, const FPTYPE* vloc_per_type, const std::complex* aux, const FPTYPE& scale_factor, @@ -451,7 +450,6 @@ struct cal_force_loc_sincos_op #endif for (int iat = 0; iat < nat; ++iat) { - const int it = iat2it[iat]; const FPTYPE tau_x = tau[iat * 3 + 0]; const FPTYPE tau_y = tau[iat * 3 + 1]; const FPTYPE tau_z = tau[iat * 3 + 2]; @@ -466,7 +464,7 @@ struct cal_force_loc_sincos_op FPTYPE sinp, cosp; ModuleBase::libm::sincos(phase, &sinp, &cosp); - const FPTYPE vloc_factor = vloc_per_type[it * npw + ig]; + const FPTYPE vloc_factor = vloc_per_type[iat * npw + ig]; const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real()) * scale_factor; local_force[0] += gcar[ig * 3 + 0] * factor; diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h index 3fc5d92212..58fcd842ef 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h @@ -161,8 +161,7 @@ struct cal_force_loc_sincos_op /// @param ntype - number of atom types /// @param gcar - G-vector Cartesian coordinates [npw * 3] /// @param tau - atomic positions [nat * 3] - /// @param iat2it - atom to type mapping [nat] - /// @param vloc_per_type - precomputed vloc factors per type [ntype * npw] + /// @param vloc_per_type - precomputed vloc factors per atom [nat * npw] /// @param aux - charge density in G-space [npw] /// @param scale_factor - tpiba * omega /// @@ -174,7 +173,6 @@ struct cal_force_loc_sincos_op const int& ntype, const FPTYPE* gcar, const FPTYPE* tau, - const int* iat2it, const FPTYPE* vloc_per_type, const std::complex* aux, const FPTYPE& scale_factor, @@ -319,7 +317,6 @@ struct cal_force_loc_sincos_op const int& ntype, const FPTYPE* gcar, const FPTYPE* tau, - const int* iat2it, const FPTYPE* vloc_per_type, const std::complex* aux, const FPTYPE& scale_factor, diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu index 9feeb76fed..7f0d4b099e 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu @@ -626,7 +626,6 @@ __global__ void cal_force_loc_sincos_kernel( const int ntype, const FPTYPE* gcar, const FPTYPE* tau, - const int* iat2it, const FPTYPE* vloc_per_type, const thrust::complex* aux, const FPTYPE scale_factor, @@ -641,7 +640,6 @@ __global__ void cal_force_loc_sincos_kernel( if (iat >= nat) return; // Load atom information to registers - const int it = iat2it[iat]; const FPTYPE tau_x = tau[iat * 3 + 0]; const FPTYPE tau_y = tau[iat * 3 + 1]; const FPTYPE tau_z = tau[iat * 3 + 2]; @@ -663,7 +661,7 @@ __global__ void cal_force_loc_sincos_kernel( sincos(phase, &sinp, &cosp); // Calculate force factor - const FPTYPE vloc_factor = vloc_per_type[it * npw + ig]; + const FPTYPE vloc_factor = vloc_per_type[iat * npw + ig]; const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real()); // Accumulate force contributions @@ -747,7 +745,6 @@ void cal_force_loc_sincos_op::operator()( const int& ntype, const FPTYPE* gcar, const FPTYPE* tau, - const int* iat2it, const FPTYPE* vloc_per_type, const std::complex* aux, const FPTYPE& scale_factor, @@ -763,7 +760,7 @@ void cal_force_loc_sincos_op::operator()( hipLaunchKernelGGL(cal_force_loc_sincos_kernel, grid, block, 0, 0, - nat, npw, ntype, gcar, tau, iat2it, vloc_per_type, + nat, npw, ntype, gcar, tau, vloc_per_type, reinterpret_cast*>(aux), scale_factor, force); From a2491171233b14d4db555f88b0f75f932ddc3b5e Mon Sep 17 00:00:00 2001 From: Jie Date: Tue, 3 Jun 2025 20:35:20 +0800 Subject: [PATCH 6/6] fix cal_force_ew --- .../module_hamilt_pw/hamilt_pwdft/forces.cpp | 67 ++++++------------- .../hamilt_pwdft/kernels/cuda/force_op.cu | 8 +-- .../hamilt_pwdft/kernels/force_op.cpp | 3 +- .../hamilt_pwdft/kernels/force_op.h | 3 - .../hamilt_pwdft/kernels/rocm/force_op.hip.cu | 8 +-- 5 files changed, 29 insertions(+), 60 deletions(-) diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp index d50288eef8..4cade5d1ac 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp @@ -532,17 +532,17 @@ void Forces::cal_force_loc(const UnitCell& ucell, // to G space. maybe need fftw with OpenMP rho_basis->real2recip(aux, aux); - // =============== GPU/CPU异构优化:使用sincos op替换原循环 =============== + // sincos op for G space - // 准备原子相关数据:按照全局原子索引iat顺序 + // data preparation std::vector tau_flat(this->nat * 3); std::vector gcar_flat(rho_basis->npw * 3); - // 按照原始代码逻辑:遍历全局原子索引iat,通过查表获取(it,ia) + for (int iat = 0; iat < this->nat; iat++) { - int it = ucell.iat2it[iat]; // 查表获取原子类型 - int ia = ucell.iat2ia[iat]; // 查表获取该类型内的原子索引 + int it = ucell.iat2it[iat]; + int ia = ucell.iat2ia[iat]; tau_flat[iat * 3 + 0] = static_cast(ucell.atoms[it].tau[ia][0]); tau_flat[iat * 3 + 1] = static_cast(ucell.atoms[it].tau[ia][1]); @@ -555,7 +555,7 @@ void Forces::cal_force_loc(const UnitCell& ucell, gcar_flat[ig * 3 + 2] = static_cast(rho_basis->gcar[ig][2]); } - // 重新计算vloc_factors考虑所有原子类型 + // calculate vloc_factors for all atom types std::vector vloc_per_type_host(ucell.ntype * rho_basis->npw); for (int iat = 0; iat < this->nat; iat++) { int it = ucell.iat2it[iat]; @@ -564,13 +564,11 @@ void Forces::cal_force_loc(const UnitCell& ucell, } } - // 转换aux到FPTYPE类型 std::vector> aux_fptype(rho_basis->npw); for (int ig = 0; ig < rho_basis->npw; ig++) { aux_fptype[ig] = static_cast>(aux[ig]); } - // 设备端内存和指针设置(根据设备类型分支) FPTYPE* d_gcar = gcar_flat.data(); FPTYPE* d_tau = tau_flat.data(); FPTYPE* d_vloc_per_type = vloc_per_type_host.data(); @@ -591,26 +589,22 @@ void Forces::cal_force_loc(const UnitCell& ucell, resmem_complex_op()(this->ctx, d_aux, rho_basis->npw); resmem_var_op()(this->ctx, d_force, this->nat * 3); - // 数据传输到设备 syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_gcar, gcar_flat.data(), rho_basis->npw * 3); syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_tau, tau_flat.data(), this->nat * 3); syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_vloc_per_type, vloc_per_type_host.data(), ucell.ntype * rho_basis->npw); syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_aux, aux_fptype.data(), rho_basis->npw); - // 初始化force为0 base_device::memory::set_memory_op()(this->ctx, d_force, 0.0, this->nat * 3); } else { d_force = force_host.data(); - // 对于CPU,直接初始化为0 std::fill(force_host.begin(), force_host.end(), static_cast(0.0)); } - // 计算缩放因子 const FPTYPE scale_factor = static_cast(ucell.tpiba * ucell.omega); - // 调用op进行sincos计算 + // call op for sincos calculation hamilt::cal_force_loc_sincos_op()( this->ctx, this->nat, @@ -624,13 +618,10 @@ void Forces::cal_force_loc(const UnitCell& ucell, d_force ); - // 根据设备类型处理结果 if (this->device == base_device::GpuDevice) { - // 结果传回CPU syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, force_host.data(), d_force, this->nat * 3); - // 清理设备内存 delmem_var_op()(this->ctx, d_gcar); delmem_var_op()(this->ctx, d_tau); delmem_var_op()(this->ctx, d_vloc_per_type); @@ -638,7 +629,6 @@ void Forces::cal_force_loc(const UnitCell& ucell, delmem_var_op()(this->ctx, d_force); } - // 将结果写入forcelc矩阵 for (int iat = 0; iat < this->nat; iat++) { forcelc(iat, 0) = static_cast(force_host[iat * 3 + 0]); forcelc(iat, 1) = static_cast(force_host[iat * 3 + 1]); @@ -755,17 +745,15 @@ void Forces::cal_force_ew(const UnitCell& ucell, aux[rho_basis->ig_gge0] = std::complex(0.0, 0.0); } - // =============== 第一步:G空间sincos计算(在OpenMP区域外)=============== + // sincos op for cal_force_ew - // 预计算每个原子的it_fact和相关数据:按照全局原子索引iat顺序 std::vector it_facts_host(this->nat); std::vector tau_flat(this->nat * 3); - std::vector iat2it_host(this->nat); - // 按照原始代码逻辑:遍历全局原子索引iat,通过查表获取(it,ia) + // iterate over by lookup table for (int iat = 0; iat < this->nat; iat++) { - int it = ucell.iat2it[iat]; // 查表获取原子类型 - int ia = ucell.iat2ia[iat]; // 查表获取该类型内的原子索引 + int it = ucell.iat2it[iat]; + int ia = ucell.iat2ia[iat]; double zv; if (PARAM.inp.use_paw) @@ -785,10 +773,8 @@ void Forces::cal_force_ew(const UnitCell& ucell, tau_flat[iat * 3 + 0] = static_cast(ucell.atoms[it].tau[ia][0]); tau_flat[iat * 3 + 1] = static_cast(ucell.atoms[it].tau[ia][1]); tau_flat[iat * 3 + 2] = static_cast(ucell.atoms[it].tau[ia][2]); - iat2it_host[iat] = it; } - // 准备设备端数据 std::vector gcar_flat(rho_basis->npw * 3); for (int ig = 0; ig < rho_basis->npw; ig++) { gcar_flat[ig * 3 + 0] = static_cast(rho_basis->gcar[ig][0]); @@ -796,16 +782,13 @@ void Forces::cal_force_ew(const UnitCell& ucell, gcar_flat[ig * 3 + 2] = static_cast(rho_basis->gcar[ig][2]); } - // 转换aux到FPTYPE类型 std::vector> aux_fptype(rho_basis->npw); for (int ig = 0; ig < rho_basis->npw; ig++) { aux_fptype[ig] = static_cast>(aux[ig]); } - // 设备端内存和指针设置(根据设备类型分支) FPTYPE* d_gcar = gcar_flat.data(); FPTYPE* d_tau = tau_flat.data(); - int* d_iat2it = iat2it_host.data(); FPTYPE* d_it_facts = it_facts_host.data(); std::complex* d_aux = aux_fptype.data(); FPTYPE* d_force_g = nullptr; @@ -815,72 +798,66 @@ void Forces::cal_force_ew(const UnitCell& ucell, { d_gcar = nullptr; d_tau = nullptr; - d_iat2it = nullptr; d_it_facts = nullptr; d_aux = nullptr; resmem_var_op()(this->ctx, d_gcar, rho_basis->npw * 3); resmem_var_op()(this->ctx, d_tau, this->nat * 3); - resmem_int_op()(this->ctx, d_iat2it, this->nat); resmem_var_op()(this->ctx, d_it_facts, this->nat); resmem_complex_op()(this->ctx, d_aux, rho_basis->npw); resmem_var_op()(this->ctx, d_force_g, this->nat * 3); - // 数据传输 + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_gcar, gcar_flat.data(), rho_basis->npw * 3); syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_tau, tau_flat.data(), this->nat * 3); - syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, d_iat2it, iat2it_host.data(), this->nat); syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_it_facts, it_facts_host.data(), this->nat); syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_aux, aux_fptype.data(), rho_basis->npw); - // 初始化force为0 + base_device::memory::set_memory_op()(this->ctx, d_force_g, 0.0, this->nat * 3); } else { d_force_g = force_g_host.data(); - // 对于CPU,直接初始化为0 std::fill(force_g_host.begin(), force_g_host.end(), static_cast(0.0)); } - // 调用op处理G空间sincos计算(在OpenMP区域外,无冲突) + // call op for sincos calculation hamilt::cal_force_ew_sincos_op()( this->ctx, this->nat, rho_basis->npw, - rho_basis->ig_gge0, // G=0项索引,op内部会自动跳过 + rho_basis->ig_gge0, d_gcar, d_tau, - d_iat2it, d_it_facts, d_aux, d_force_g ); - // 根据设备类型处理结果 + if (this->device == base_device::GpuDevice) { - // 将G空间结果传回CPU + syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, force_g_host.data(), d_force_g, this->nat * 3); - // 清理设备内存 + delmem_var_op()(this->ctx, d_gcar); delmem_var_op()(this->ctx, d_tau); - delmem_int_op()(this->ctx, d_iat2it); delmem_var_op()(this->ctx, d_it_facts); delmem_complex_op()(this->ctx, d_aux); delmem_var_op()(this->ctx, d_force_g); } - // 累加到forceion + for (int iat = 0; iat < this->nat; iat++) { forceion(iat, 0) += static_cast(force_g_host[iat * 3 + 0]); forceion(iat, 1) += static_cast(force_g_host[iat * 3 + 1]); forceion(iat, 2) += static_cast(force_g_host[iat * 3 + 2]); } - // =============== 第二步:实空间计算(保留原OpenMP结构)=============== - + +// calculate real space force #ifdef _OPENMP #pragma omp parallel { @@ -904,7 +881,7 @@ void Forces::cal_force_ew(const UnitCell& ucell, iat_end = iat_beg + iat_end; ucell.iat2iait(iat_beg, &ia_beg, &it_beg); - // 只保留实空间相互作用计算(means that the processor contains G=0 term) + if (rho_basis->ig_gge0 >= 0) { double rmax = 5.0 / (sqrt(alpha) * ucell.lat0); diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu index a82bd1ee0b..5ff3ddf2b1 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu @@ -78,7 +78,6 @@ __global__ void cal_force_ew_sincos_kernel( const int ig_gge0, const FPTYPE* gcar, const FPTYPE* tau, - const int* iat2it, const FPTYPE* it_facts, const thrust::complex* aux, FPTYPE* force) @@ -116,8 +115,8 @@ __global__ void cal_force_ew_sincos_kernel( FPTYPE sinp, cosp; sincos(phase, &sinp, &cosp); - // Calculate Ewald sum contribution - const FPTYPE factor = it_fact * (cosp * aux[ig].imag() + sinp * aux[ig].real()); + // Calculate Ewald sum contribution (fixed sign error) + const FPTYPE factor = it_fact * (-cosp * aux[ig].imag() + sinp * aux[ig].real()); // Accumulate force contributions local_force_x += gcar[ig * 3 + 0] * factor; @@ -344,7 +343,6 @@ void cal_force_ew_sincos_op::operator()( const int& ig_gge0, const FPTYPE* gcar, const FPTYPE* tau, - const int* iat2it, const FPTYPE* it_facts, const std::complex* aux, FPTYPE* force) @@ -358,7 +356,7 @@ void cal_force_ew_sincos_op::operator()( dim3 block(threads_per_block); cal_force_ew_sincos_kernel<<>>( - nat, npw, ig_gge0, gcar, tau, iat2it, it_facts, + nat, npw, ig_gge0, gcar, tau, it_facts, reinterpret_cast*>(aux), force ); diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp index 49576abbfa..109321d6c3 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp @@ -489,7 +489,6 @@ struct cal_force_ew_sincos_op const int& ig_gge0, const FPTYPE* gcar, const FPTYPE* tau, - const int* iat2it, const FPTYPE* it_facts, const std::complex* aux, FPTYPE* force) @@ -519,7 +518,7 @@ struct cal_force_ew_sincos_op FPTYPE sinp, cosp; ModuleBase::libm::sincos(phase, &sinp, &cosp); - const FPTYPE factor = it_fact * (cosp * aux[ig].imag() + sinp * aux[ig].real()); + const FPTYPE factor = it_fact * (-cosp * aux[ig].imag() + sinp * aux[ig].real()); local_force[0] += gcar[ig * 3 + 0] * factor; local_force[1] += gcar[ig * 3 + 1] * factor; diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h index 58fcd842ef..acf490b278 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h @@ -191,7 +191,6 @@ struct cal_force_ew_sincos_op /// @param ig_gge0 - index of G=0 vector (-1 if not present) /// @param gcar - G-vector Cartesian coordinates [npw * 3] /// @param tau - atomic positions [nat * 3] - /// @param iat2it - atom to type mapping [nat] /// @param it_facts - precomputed it_fact for each atom [nat] /// @param aux - structure factor related array [npw] /// @@ -203,7 +202,6 @@ struct cal_force_ew_sincos_op const int& ig_gge0, const FPTYPE* gcar, const FPTYPE* tau, - const int* iat2it, const FPTYPE* it_facts, const std::complex* aux, FPTYPE* force); @@ -332,7 +330,6 @@ struct cal_force_ew_sincos_op const int& ig_gge0, const FPTYPE* gcar, const FPTYPE* tau, - const int* iat2it, const FPTYPE* it_facts, const std::complex* aux, FPTYPE* force); diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu index 7f0d4b099e..6bb3a84e7e 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu @@ -683,7 +683,6 @@ __global__ void cal_force_ew_sincos_kernel( const int ig_gge0, const FPTYPE* gcar, const FPTYPE* tau, - const int* iat2it, const FPTYPE* it_facts, const thrust::complex* aux, FPTYPE* force) @@ -721,8 +720,8 @@ __global__ void cal_force_ew_sincos_kernel( FPTYPE sinp, cosp; sincos(phase, &sinp, &cosp); - // Calculate Ewald sum contribution - const FPTYPE factor = it_fact * (cosp * aux[ig].imag() + sinp * aux[ig].real()); + // Calculate Ewald sum contribution (fixed sign error) + const FPTYPE factor = it_fact * (-cosp * aux[ig].imag() + sinp * aux[ig].real()); // Accumulate force contributions local_force_x += gcar[ig * 3 + 0] * factor; @@ -775,7 +774,6 @@ void cal_force_ew_sincos_op::operator()( const int& ig_gge0, const FPTYPE* gcar, const FPTYPE* tau, - const int* iat2it, const FPTYPE* it_facts, const std::complex* aux, FPTYPE* force) @@ -790,7 +788,7 @@ void cal_force_ew_sincos_op::operator()( hipLaunchKernelGGL(cal_force_ew_sincos_kernel, grid, block, 0, 0, - nat, npw, ig_gge0, gcar, tau, iat2it, it_facts, + nat, npw, ig_gge0, gcar, tau, it_facts, reinterpret_cast*>(aux), force); hipCheckOnDebug();