diff --git a/source/module_base/module_device/device.cpp b/source/module_base/module_device/device.cpp index b20ea9f3ad..9373a5a31a 100644 --- a/source/module_base/module_device/device.cpp +++ b/source/module_base/module_device/device.cpp @@ -5,7 +5,7 @@ #include #include - +#include #ifdef __MPI #include "mpi.h" #endif @@ -166,6 +166,11 @@ int device_count = -1; cudaGetDeviceCount(&device_count); #elif defined(__ROCM) hipGetDeviceCount(&device_count); +/***auto start_time = std::chrono::high_resolution_clock::now(); +std::cout << "Starting hipGetDeviceCount.." << std::endl; +auto end_time = std::chrono::high_resolution_clock::now(); +auto duration = std::chrono::duration_cast>(end_time - start_time); +std::cout << "hipGetDeviceCount took " << duration.count() << "seconds" << std::endl;***/ #endif if (device_count <= 0) { @@ -711,4 +716,4 @@ void record_device_memory( #endif } // end of namespace information -} // end of namespace base_device \ No newline at end of file +} // end of namespace base_device diff --git a/source/module_esolver/esolver.cpp b/source/module_esolver/esolver.cpp index 0352492a3a..f8985387d9 100644 --- a/source/module_esolver/esolver.cpp +++ b/source/module_esolver/esolver.cpp @@ -107,13 +107,21 @@ std::string determine_type() } if (GlobalV::MY_RANK == 0) { - std::cout << " RUNNING WITH DEVICE : " << device_info << " / " + /***auto start_time = std::chrono::high_resolution_clock::now(); + std::cout << "Starting hipGetDeviceInfo..." << std::endl;***/ + std::cout << " RUNNING WITH DEVICE : " << device_info << " / " << base_device::information::get_device_info(PARAM.inp.device) << std::endl; + /***auto end_time = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast>(end_time - start_time); + std::cout << "hipGetDeviceInfo took " << duration.count() << " seconds" << std::endl;***/ } - + /*** auto start_time = std::chrono::high_resolution_clock::now(); + std::cout << "Starting hipGetDeviceInfo..." << std::endl;***/ GlobalV::ofs_running << "\n RUNNING WITH DEVICE : " << device_info << " / " - << base_device::information::get_device_info(PARAM.inp.device) << std::endl; - + << base_device::information::get_device_info(PARAM.inp.device) << std::endl; + /***auto end_time = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast>(end_time - start_time); + std::cout << "hipGetDeviceInfo took " << duration.count() << " seconds" << std::endl;***/ return esolver_type; } diff --git a/source/module_esolver/esolver_ks_pw.cpp b/source/module_esolver/esolver_ks_pw.cpp index 0b303f33e1..664ac186d5 100644 --- a/source/module_esolver/esolver_ks_pw.cpp +++ b/source/module_esolver/esolver_ks_pw.cpp @@ -556,7 +556,8 @@ void ESolver_KS_PW::hamilt2density_single(UnitCell& ucell, hsolver::DiagoIterAssist::SCF_ITER, hsolver::DiagoIterAssist::PW_DIAG_NMAX, hsolver::DiagoIterAssist::PW_DIAG_THR, - hsolver::DiagoIterAssist::need_subspace); + hsolver::DiagoIterAssist::need_subspace, + PARAM.inp.use_k_continuity); hsolver_pw_obj.solve(this->p_hamilt, this->kspw_psi[0], diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp index df29f88989..4cade5d1ac 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp @@ -15,6 +15,7 @@ #include "module_hamilt_general/module_ewald/H_Ewald_pw.h" #include "module_hamilt_general/module_surchem/surchem.h" #include "module_hamilt_general/module_vdw/vdw.h" +#include "kernels/force_op.h" #ifdef _OPENMP #include @@ -531,31 +532,110 @@ void Forces::cal_force_loc(const UnitCell& ucell, // to G space. maybe need fftw with OpenMP rho_basis->real2recip(aux, aux); -#ifdef _OPENMP -#pragma omp parallel for -#endif - for (int iat = 0; iat < this->nat; ++iat) - { - // read `it` `ia` from the table + // sincos op for G space + + + // data preparation + std::vector tau_flat(this->nat * 3); + std::vector gcar_flat(rho_basis->npw * 3); + + + for (int iat = 0; iat < this->nat; iat++) { + int it = ucell.iat2it[iat]; + int ia = ucell.iat2ia[iat]; + + tau_flat[iat * 3 + 0] = static_cast(ucell.atoms[it].tau[ia][0]); + tau_flat[iat * 3 + 1] = static_cast(ucell.atoms[it].tau[ia][1]); + tau_flat[iat * 3 + 2] = static_cast(ucell.atoms[it].tau[ia][2]); + } + + for (int ig = 0; ig < rho_basis->npw; ig++) { + gcar_flat[ig * 3 + 0] = static_cast(rho_basis->gcar[ig][0]); + gcar_flat[ig * 3 + 1] = static_cast(rho_basis->gcar[ig][1]); + gcar_flat[ig * 3 + 2] = static_cast(rho_basis->gcar[ig][2]); + } + + // calculate vloc_factors for all atom types + std::vector vloc_per_type_host(ucell.ntype * rho_basis->npw); + for (int iat = 0; iat < this->nat; iat++) { int it = ucell.iat2it[iat]; - int ia = ucell.iat2ia[iat]; - for (int ig = 0; ig < rho_basis->npw; ig++) - { - const double phase = ModuleBase::TWO_PI * (rho_basis->gcar[ig] * ucell.atoms[it].tau[ia]); - double sinp, cosp; - ModuleBase::libm::sincos(phase, &sinp, &cosp); - const double factor - = vloc(it, rho_basis->ig2igg[ig]) * (cosp * aux[ig].imag() + sinp * aux[ig].real()); - forcelc(iat, 0) += rho_basis->gcar[ig][0] * factor; - forcelc(iat, 1) += rho_basis->gcar[ig][1] * factor; - forcelc(iat, 2) += rho_basis->gcar[ig][2] * factor; + for (int ig = 0; ig < rho_basis->npw; ig++) { + vloc_per_type_host[iat * rho_basis->npw + ig] = static_cast(vloc(it, rho_basis->ig2igg[ig])); } - forcelc(iat, 0) *= (ucell.tpiba * ucell.omega); - forcelc(iat, 1) *= (ucell.tpiba * ucell.omega); - forcelc(iat, 2) *= (ucell.tpiba * ucell.omega); + } + + std::vector> aux_fptype(rho_basis->npw); + for (int ig = 0; ig < rho_basis->npw; ig++) { + aux_fptype[ig] = static_cast>(aux[ig]); + } + + FPTYPE* d_gcar = gcar_flat.data(); + FPTYPE* d_tau = tau_flat.data(); + FPTYPE* d_vloc_per_type = vloc_per_type_host.data(); + std::complex* d_aux = aux_fptype.data(); + FPTYPE* d_force = nullptr; + std::vector force_host(this->nat * 3); + + if (this->device == base_device::GpuDevice) + { + d_gcar = nullptr; + d_tau = nullptr; + d_vloc_per_type = nullptr; + d_aux = nullptr; + + resmem_var_op()(this->ctx, d_gcar, rho_basis->npw * 3); + resmem_var_op()(this->ctx, d_tau, this->nat * 3); + resmem_var_op()(this->ctx, d_vloc_per_type, ucell.ntype * rho_basis->npw); + resmem_complex_op()(this->ctx, d_aux, rho_basis->npw); + resmem_var_op()(this->ctx, d_force, this->nat * 3); + + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_gcar, gcar_flat.data(), rho_basis->npw * 3); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_tau, tau_flat.data(), this->nat * 3); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_vloc_per_type, vloc_per_type_host.data(), ucell.ntype * rho_basis->npw); + syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_aux, aux_fptype.data(), rho_basis->npw); + + base_device::memory::set_memory_op()(this->ctx, d_force, 0.0, this->nat * 3); + } + else + { + d_force = force_host.data(); + std::fill(force_host.begin(), force_host.end(), static_cast(0.0)); + } + + const FPTYPE scale_factor = static_cast(ucell.tpiba * ucell.omega); + + // call op for sincos calculation + hamilt::cal_force_loc_sincos_op()( + this->ctx, + this->nat, + rho_basis->npw, + ucell.ntype, + d_gcar, + d_tau, + d_vloc_per_type, + d_aux, + scale_factor, + d_force + ); + + if (this->device == base_device::GpuDevice) + { + syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, force_host.data(), d_force, this->nat * 3); + + delmem_var_op()(this->ctx, d_gcar); + delmem_var_op()(this->ctx, d_tau); + delmem_var_op()(this->ctx, d_vloc_per_type); + delmem_complex_op()(this->ctx, d_aux); + delmem_var_op()(this->ctx, d_force); + } + + for (int iat = 0; iat < this->nat; iat++) { + forcelc(iat, 0) = static_cast(force_host[iat * 3 + 0]); + forcelc(iat, 1) = static_cast(force_host[iat * 3 + 1]); + forcelc(iat, 2) = static_cast(force_host[iat * 3 + 2]); } - // this->print(GlobalV::ofs_running, "local forces", forcelc); + // this->print(GlobalV: :ofs_running, "local forces", forcelc); Parallel_Reduce::reduce_pool(forcelc.c, forcelc.nr * forcelc.nc); delete[] aux; ModuleBase::timer::tick("Forces", "cal_force_loc"); @@ -665,6 +745,119 @@ void Forces::cal_force_ew(const UnitCell& ucell, aux[rho_basis->ig_gge0] = std::complex(0.0, 0.0); } + // sincos op for cal_force_ew + + std::vector it_facts_host(this->nat); + std::vector tau_flat(this->nat * 3); + + // iterate over by lookup table + for (int iat = 0; iat < this->nat; iat++) { + int it = ucell.iat2it[iat]; + int ia = ucell.iat2ia[iat]; + + double zv; + if (PARAM.inp.use_paw) + { +#ifdef USE_PAW + zv = GlobalC::paw_cell.get_val(it); +#endif + } + else + { + zv = ucell.atoms[it].ncpp.zv; + } + + it_facts_host[iat] = static_cast(zv * ModuleBase::e2 * ucell.tpiba * + ModuleBase::TWO_PI / ucell.omega * fact); + + tau_flat[iat * 3 + 0] = static_cast(ucell.atoms[it].tau[ia][0]); + tau_flat[iat * 3 + 1] = static_cast(ucell.atoms[it].tau[ia][1]); + tau_flat[iat * 3 + 2] = static_cast(ucell.atoms[it].tau[ia][2]); + } + + std::vector gcar_flat(rho_basis->npw * 3); + for (int ig = 0; ig < rho_basis->npw; ig++) { + gcar_flat[ig * 3 + 0] = static_cast(rho_basis->gcar[ig][0]); + gcar_flat[ig * 3 + 1] = static_cast(rho_basis->gcar[ig][1]); + gcar_flat[ig * 3 + 2] = static_cast(rho_basis->gcar[ig][2]); + } + + std::vector> aux_fptype(rho_basis->npw); + for (int ig = 0; ig < rho_basis->npw; ig++) { + aux_fptype[ig] = static_cast>(aux[ig]); + } + + FPTYPE* d_gcar = gcar_flat.data(); + FPTYPE* d_tau = tau_flat.data(); + FPTYPE* d_it_facts = it_facts_host.data(); + std::complex* d_aux = aux_fptype.data(); + FPTYPE* d_force_g = nullptr; + std::vector force_g_host(this->nat * 3); + + if (this->device == base_device::GpuDevice) + { + d_gcar = nullptr; + d_tau = nullptr; + d_it_facts = nullptr; + d_aux = nullptr; + + resmem_var_op()(this->ctx, d_gcar, rho_basis->npw * 3); + resmem_var_op()(this->ctx, d_tau, this->nat * 3); + resmem_var_op()(this->ctx, d_it_facts, this->nat); + resmem_complex_op()(this->ctx, d_aux, rho_basis->npw); + resmem_var_op()(this->ctx, d_force_g, this->nat * 3); + + + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_gcar, gcar_flat.data(), rho_basis->npw * 3); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_tau, tau_flat.data(), this->nat * 3); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_it_facts, it_facts_host.data(), this->nat); + syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_aux, aux_fptype.data(), rho_basis->npw); + + + base_device::memory::set_memory_op()(this->ctx, d_force_g, 0.0, this->nat * 3); + } + else + { + d_force_g = force_g_host.data(); + std::fill(force_g_host.begin(), force_g_host.end(), static_cast(0.0)); + } + + // call op for sincos calculation + hamilt::cal_force_ew_sincos_op()( + this->ctx, + this->nat, + rho_basis->npw, + rho_basis->ig_gge0, + d_gcar, + d_tau, + d_it_facts, + d_aux, + d_force_g + ); + + + if (this->device == base_device::GpuDevice) + { + + syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, force_g_host.data(), d_force_g, this->nat * 3); + + + delmem_var_op()(this->ctx, d_gcar); + delmem_var_op()(this->ctx, d_tau); + delmem_var_op()(this->ctx, d_it_facts); + delmem_complex_op()(this->ctx, d_aux); + delmem_var_op()(this->ctx, d_force_g); + } + + + for (int iat = 0; iat < this->nat; iat++) { + forceion(iat, 0) += static_cast(force_g_host[iat * 3 + 0]); + forceion(iat, 1) += static_cast(force_g_host[iat * 3 + 1]); + forceion(iat, 2) += static_cast(force_g_host[iat * 3 + 2]); + } + + +// calculate real space force #ifdef _OPENMP #pragma omp parallel { @@ -688,66 +881,7 @@ void Forces::cal_force_ew(const UnitCell& ucell, iat_end = iat_beg + iat_end; ucell.iat2iait(iat_beg, &ia_beg, &it_beg); - int iat = iat_beg; - int it = it_beg; - int ia = ia_beg; - - // preprocess ig_gap for skipping the ig point - int ig_gap = (rho_basis->ig_gge0 >= 0 && rho_basis->ig_gge0 < rho_basis->npw) ? rho_basis->ig_gge0 : -1; - - double it_fact = 0.; - int last_it = -1; - - // iterating atoms - while (iat < iat_end) - { - if (it != last_it) - { // calculate it_tact when it is changed - double zv; - if (PARAM.inp.use_paw) - { -#ifdef USE_PAW - zv = GlobalC::paw_cell.get_val(it); -#endif - } - else - { - zv = ucell.atoms[it].ncpp.zv; - } - it_fact = zv * ModuleBase::e2 * ucell.tpiba * ModuleBase::TWO_PI / ucell.omega * fact; - last_it = it; - } - - if (ucell.atoms[it].na != 0) - { - const auto ig_loop = [&](int ig_beg, int ig_end) { - for (int ig = ig_beg; ig < ig_end; ig++) - { - const ModuleBase::Vector3 gcar = rho_basis->gcar[ig]; - const double arg = ModuleBase::TWO_PI * (gcar * ucell.atoms[it].tau[ia]); - double sinp, cosp; - ModuleBase::libm::sincos(arg, &sinp, &cosp); - double sumnb = -cosp * aux[ig].imag() + sinp * aux[ig].real(); - forceion(iat, 0) += gcar[0] * sumnb; - forceion(iat, 1) += gcar[1] * sumnb; - forceion(iat, 2) += gcar[2] * sumnb; - } - }; - - // skip ig_gge0 point by separating ig loop into two part - ig_loop(0, ig_gap); - ig_loop(ig_gap + 1, rho_basis->npw); - - forceion(iat, 0) *= it_fact; - forceion(iat, 1) *= it_fact; - forceion(iat, 2) *= it_fact; - - ++iat; - ucell.step_iait(&ia, &it); - } - } - - // means that the processor contains G=0 term. + if (rho_basis->ig_gge0 >= 0) { double rmax = 5.0 / (sqrt(alpha) * ucell.lat0); diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu index 5d0656d105..5ff3ddf2b1 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu @@ -13,6 +13,122 @@ namespace hamilt { +// CUDA kernels for sincos loops +template +__global__ void cal_force_loc_sincos_kernel( + const int nat, + const int npw, + const int ntype, + const FPTYPE* gcar, + const FPTYPE* tau, + const FPTYPE* vloc_per_type, + const thrust::complex* aux, + const FPTYPE scale_factor, + FPTYPE* force) +{ + const FPTYPE TWO_PI = 2.0 * M_PI; + + const int iat = blockIdx.y; + const int ig_start = blockIdx.x * blockDim.x + threadIdx.x; + const int ig_stride = gridDim.x * blockDim.x; + + if (iat >= nat) return; + + // Load atom information to registers + const FPTYPE tau_x = tau[iat * 3 + 0]; + const FPTYPE tau_y = tau[iat * 3 + 1]; + const FPTYPE tau_z = tau[iat * 3 + 2]; + + // Local accumulation variables + FPTYPE local_force_x = 0.0; + FPTYPE local_force_y = 0.0; + FPTYPE local_force_z = 0.0; + + // Grid-stride loop over G-vectors + for (int ig = ig_start; ig < npw; ig += ig_stride) { + // Calculate phase: 2π * (G · τ) + const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + + gcar[ig * 3 + 1] * tau_y + + gcar[ig * 3 + 2] * tau_z); + + // Use CUDA intrinsic for sincos + FPTYPE sinp, cosp; + sincos(phase, &sinp, &cosp); + + // Calculate force factor + const FPTYPE vloc_factor = vloc_per_type[iat * npw + ig]; + const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real()); + + // Accumulate force contributions + local_force_x += gcar[ig * 3 + 0] * factor; + local_force_y += gcar[ig * 3 + 1] * factor; + local_force_z += gcar[ig * 3 + 2] * factor; + } + + // Atomic add to global memory + atomicAdd(&force[iat * 3 + 0], local_force_x * scale_factor); + atomicAdd(&force[iat * 3 + 1], local_force_y * scale_factor); + atomicAdd(&force[iat * 3 + 2], local_force_z * scale_factor); +} + +template +__global__ void cal_force_ew_sincos_kernel( + const int nat, + const int npw, + const int ig_gge0, + const FPTYPE* gcar, + const FPTYPE* tau, + const FPTYPE* it_facts, + const thrust::complex* aux, + FPTYPE* force) +{ + const FPTYPE TWO_PI = 2.0 * M_PI; + + const int iat = blockIdx.y; + const int ig_start = blockIdx.x * blockDim.x + threadIdx.x; + const int ig_stride = gridDim.x * blockDim.x; + + if (iat >= nat) return; + + // Load atom information to registers + const FPTYPE tau_x = tau[iat * 3 + 0]; + const FPTYPE tau_y = tau[iat * 3 + 1]; + const FPTYPE tau_z = tau[iat * 3 + 2]; + const FPTYPE it_fact = it_facts[iat]; + + // Local accumulation variables + FPTYPE local_force_x = 0.0; + FPTYPE local_force_y = 0.0; + FPTYPE local_force_z = 0.0; + + // Grid-stride loop over G-vectors + for (int ig = ig_start; ig < npw; ig += ig_stride) { + // Skip G=0 term + if (ig == ig_gge0) continue; + + // Calculate phase: 2π * (G · τ) + const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + + gcar[ig * 3 + 1] * tau_y + + gcar[ig * 3 + 2] * tau_z); + + // Use CUDA intrinsic for sincos + FPTYPE sinp, cosp; + sincos(phase, &sinp, &cosp); + + // Calculate Ewald sum contribution (fixed sign error) + const FPTYPE factor = it_fact * (-cosp * aux[ig].imag() + sinp * aux[ig].real()); + + // Accumulate force contributions + local_force_x += gcar[ig * 3 + 0] * factor; + local_force_y += gcar[ig * 3 + 1] * factor; + local_force_z += gcar[ig * 3 + 2] * factor; + } + + // Atomic add to global memory + atomicAdd(&force[iat * 3 + 0], local_force_x); + atomicAdd(&force[iat * 3 + 1], local_force_y); + atomicAdd(&force[iat * 3 + 2], local_force_z); +} template __global__ void cal_vkb1_nl( @@ -188,6 +304,65 @@ void cal_force_nl_op::operator()(const base_dev cudaCheckOnDebug(); } +// GPU operators +template +void cal_force_loc_sincos_op::operator()( + const base_device::DEVICE_GPU* ctx, + const int& nat, + const int& npw, + const int& ntype, + const FPTYPE* gcar, + const FPTYPE* tau, + const FPTYPE* vloc_per_type, + const std::complex* aux, + const FPTYPE& scale_factor, + FPTYPE* force) +{ + // Calculate optimal grid configuration for GPU load balancing + const int threads_per_block = THREADS_PER_BLOCK; + const int max_blocks_per_sm = 32; // Configurable per GPU architecture + const int max_blocks_x = std::min(max_blocks_per_sm, (npw + threads_per_block - 1) / threads_per_block); + + dim3 grid(max_blocks_x, nat); + dim3 block(threads_per_block); + + cal_force_loc_sincos_kernel<<>>( + nat, npw, ntype, gcar, tau, vloc_per_type, + reinterpret_cast*>(aux), + scale_factor, force + ); + + cudaCheckOnDebug(); +} + +template +void cal_force_ew_sincos_op::operator()( + const base_device::DEVICE_GPU* ctx, + const int& nat, + const int& npw, + const int& ig_gge0, + const FPTYPE* gcar, + const FPTYPE* tau, + const FPTYPE* it_facts, + const std::complex* aux, + FPTYPE* force) +{ + // Calculate optimal grid configuration for GPU load balancing + const int threads_per_block = THREADS_PER_BLOCK; + const int max_blocks_per_sm = 32; // Configurable per GPU architecture + const int max_blocks_x = std::min(max_blocks_per_sm, (npw + threads_per_block - 1) / threads_per_block); + + dim3 grid(max_blocks_x, nat); + dim3 block(threads_per_block); + + cal_force_ew_sincos_kernel<<>>( + nat, npw, ig_gge0, gcar, tau, it_facts, + reinterpret_cast*>(aux), force + ); + + cudaCheckOnDebug(); +} + template __global__ void cal_force_nl( const int ntype, @@ -613,8 +788,12 @@ template void saveVkbValues(const int *gcar_zero_ptrs, const std::comple template struct cal_vkb1_nl_op; template struct cal_force_nl_op; +template struct cal_force_loc_sincos_op; +template struct cal_force_ew_sincos_op; template struct cal_vkb1_nl_op; template struct cal_force_nl_op; +template struct cal_force_loc_sincos_op; +template struct cal_force_ew_sincos_op; } // namespace hamilt diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp index 6d797e147d..109321d6c3 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp @@ -1,4 +1,8 @@ #include "module_hamilt_pw/hamilt_pwdft/kernels/force_op.h" +#include "module_base/libm/libm.h" +#include "module_base/tool_threading.h" +#include "module_hamilt_general/module_ewald/H_Ewald_pw.h" +#include #ifdef _OPENMP #include @@ -424,10 +428,116 @@ struct cal_force_nl_op } }; +// CPU implementation of local force sincos operator +template +struct cal_force_loc_sincos_op +{ + void operator()(const base_device::DEVICE_CPU* ctx, + const int& nat, + const int& npw, + const int& ntype, + const FPTYPE* gcar, + const FPTYPE* tau, + const FPTYPE* vloc_per_type, + const std::complex* aux, + const FPTYPE& scale_factor, + FPTYPE* force) + { + const FPTYPE TWO_PI = 2.0 * M_PI; + +#ifdef _OPENMP +#pragma omp parallel for +#endif + for (int iat = 0; iat < nat; ++iat) + { + const FPTYPE tau_x = tau[iat * 3 + 0]; + const FPTYPE tau_y = tau[iat * 3 + 1]; + const FPTYPE tau_z = tau[iat * 3 + 2]; + + FPTYPE local_force[3] = {0.0, 0.0, 0.0}; + + for (int ig = 0; ig < npw; ig++) + { + const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + + gcar[ig * 3 + 1] * tau_y + + gcar[ig * 3 + 2] * tau_z); + FPTYPE sinp, cosp; + ModuleBase::libm::sincos(phase, &sinp, &cosp); + + const FPTYPE vloc_factor = vloc_per_type[iat * npw + ig]; + const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real()) * scale_factor; + + local_force[0] += gcar[ig * 3 + 0] * factor; + local_force[1] += gcar[ig * 3 + 1] * factor; + local_force[2] += gcar[ig * 3 + 2] * factor; + } + + force[iat * 3 + 0] = local_force[0]; + force[iat * 3 + 1] = local_force[1]; + force[iat * 3 + 2] = local_force[2]; + } + } +}; + +// CPU implementation of Ewald force sincos operator +template +struct cal_force_ew_sincos_op +{ + void operator()(const base_device::DEVICE_CPU* ctx, + const int& nat, + const int& npw, + const int& ig_gge0, + const FPTYPE* gcar, + const FPTYPE* tau, + const FPTYPE* it_facts, + const std::complex* aux, + FPTYPE* force) + { + const FPTYPE TWO_PI = 2.0 * M_PI; + +#ifdef _OPENMP +#pragma omp parallel for +#endif + for (int iat = 0; iat < nat; ++iat) + { + const FPTYPE tau_x = tau[iat * 3 + 0]; + const FPTYPE tau_y = tau[iat * 3 + 1]; + const FPTYPE tau_z = tau[iat * 3 + 2]; + const FPTYPE it_fact = it_facts[iat]; + + FPTYPE local_force[3] = {0.0, 0.0, 0.0}; + + for (int ig = 0; ig < npw; ig++) + { + // Skip G=0 term + if (ig == ig_gge0) continue; + + const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + + gcar[ig * 3 + 1] * tau_y + + gcar[ig * 3 + 2] * tau_z); + FPTYPE sinp, cosp; + ModuleBase::libm::sincos(phase, &sinp, &cosp); + + const FPTYPE factor = it_fact * (-cosp * aux[ig].imag() + sinp * aux[ig].real()); + + local_force[0] += gcar[ig * 3 + 0] * factor; + local_force[1] += gcar[ig * 3 + 1] * factor; + local_force[2] += gcar[ig * 3 + 2] * factor; + } + + force[iat * 3 + 0] = local_force[0]; + force[iat * 3 + 1] = local_force[1]; + force[iat * 3 + 2] = local_force[2]; + } + } +}; + template struct cal_vkb1_nl_op; template struct cal_force_nl_op; - +template struct cal_force_loc_sincos_op; +template struct cal_force_ew_sincos_op; template struct cal_vkb1_nl_op; template struct cal_force_nl_op; - +template struct cal_force_loc_sincos_op; +template struct cal_force_ew_sincos_op; } // namespace hamilt diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h index 3aa5d4f87e..acf490b278 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h @@ -146,6 +146,64 @@ struct cal_force_nl_op const FPTYPE* lambda, const std::complex* becp, const std::complex* dbecp, + FPTYPE* force); +}; + +template +struct cal_force_loc_sincos_op +{ + /// @brief Calculate local pseudopotential forces (sincos loop only) + /// + /// Input Parameters + /// @param ctx - which device this function runs on + /// @param nat - number of atoms + /// @param npw - number of plane waves + /// @param ntype - number of atom types + /// @param gcar - G-vector Cartesian coordinates [npw * 3] + /// @param tau - atomic positions [nat * 3] + /// @param vloc_per_type - precomputed vloc factors per atom [nat * npw] + /// @param aux - charge density in G-space [npw] + /// @param scale_factor - tpiba * omega + /// + /// Output Parameters + /// @param force - output forces [nat * 3] + void operator()(const Device* ctx, + const int& nat, + const int& npw, + const int& ntype, + const FPTYPE* gcar, + const FPTYPE* tau, + const FPTYPE* vloc_per_type, + const std::complex* aux, + const FPTYPE& scale_factor, + FPTYPE* force); +}; + +template +struct cal_force_ew_sincos_op +{ + /// @brief Calculate Ewald forces (sincos loop only) + /// + /// Input Parameters + /// @param ctx - which device this function runs on + /// @param nat - number of atoms + /// @param npw - number of plane waves + /// @param ig_gge0 - index of G=0 vector (-1 if not present) + /// @param gcar - G-vector Cartesian coordinates [npw * 3] + /// @param tau - atomic positions [nat * 3] + /// @param it_facts - precomputed it_fact for each atom [nat] + /// @param aux - structure factor related array [npw] + /// + /// Output Parameters + /// @param force - output forces [nat * 3] + void operator()(const Device* ctx, + const int& nat, + const int& npw, + const int& ig_gge0, + const FPTYPE* gcar, + const FPTYPE* tau, + const FPTYPE* it_facts, + const std::complex* aux, FPTYPE* force); }; @@ -248,6 +306,35 @@ struct cal_force_nl_op FPTYPE* force); }; +template +struct cal_force_loc_sincos_op +{ + void operator()(const base_device::DEVICE_GPU* ctx, + const int& nat, + const int& npw, + const int& ntype, + const FPTYPE* gcar, + const FPTYPE* tau, + const FPTYPE* vloc_per_type, + const std::complex* aux, + const FPTYPE& scale_factor, + FPTYPE* force); +}; + +template +struct cal_force_ew_sincos_op +{ + void operator()(const base_device::DEVICE_GPU* ctx, + const int& nat, + const int& npw, + const int& ig_gge0, + const FPTYPE* gcar, + const FPTYPE* tau, + const FPTYPE* it_facts, + const std::complex* aux, + FPTYPE* force); +}; + /** * @brief revert the vkb values for force_nl calculation */ diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu index c78b333b86..6bb3a84e7e 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu @@ -618,10 +618,190 @@ template void revertVkbValues(const int *gcar_zero_ptrs, std::complex(const int *gcar_zero_ptrs, const std::complex *vkb_ptr, std::complex *vkb_save_ptr, int nkb, int gcar_zero_count, int npw, int ipol, int npwx); +// HIP kernels for sincos loops +template +__global__ void cal_force_loc_sincos_kernel( + const int nat, + const int npw, + const int ntype, + const FPTYPE* gcar, + const FPTYPE* tau, + const FPTYPE* vloc_per_type, + const thrust::complex* aux, + const FPTYPE scale_factor, + FPTYPE* force) +{ + const FPTYPE TWO_PI = 2.0 * M_PI; + + const int iat = blockIdx.y; + const int ig_start = blockIdx.x * blockDim.x + threadIdx.x; + const int ig_stride = gridDim.x * blockDim.x; + + if (iat >= nat) return; + + // Load atom information to registers + const FPTYPE tau_x = tau[iat * 3 + 0]; + const FPTYPE tau_y = tau[iat * 3 + 1]; + const FPTYPE tau_z = tau[iat * 3 + 2]; + + // Local accumulation variables + FPTYPE local_force_x = 0.0; + FPTYPE local_force_y = 0.0; + FPTYPE local_force_z = 0.0; + + // Grid-stride loop over G-vectors + for (int ig = ig_start; ig < npw; ig += ig_stride) { + // Calculate phase: 2π * (G · τ) + const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + + gcar[ig * 3 + 1] * tau_y + + gcar[ig * 3 + 2] * tau_z); + + // Use HIP intrinsic for sincos + FPTYPE sinp, cosp; + sincos(phase, &sinp, &cosp); + + // Calculate force factor + const FPTYPE vloc_factor = vloc_per_type[iat * npw + ig]; + const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real()); + + // Accumulate force contributions + local_force_x += gcar[ig * 3 + 0] * factor; + local_force_y += gcar[ig * 3 + 1] * factor; + local_force_z += gcar[ig * 3 + 2] * factor; + } + + // Atomic add to global memory + atomicAdd(&force[iat * 3 + 0], local_force_x * scale_factor); + atomicAdd(&force[iat * 3 + 1], local_force_y * scale_factor); + atomicAdd(&force[iat * 3 + 2], local_force_z * scale_factor); +} + +template +__global__ void cal_force_ew_sincos_kernel( + const int nat, + const int npw, + const int ig_gge0, + const FPTYPE* gcar, + const FPTYPE* tau, + const FPTYPE* it_facts, + const thrust::complex* aux, + FPTYPE* force) +{ + const FPTYPE TWO_PI = 2.0 * M_PI; + + const int iat = blockIdx.y; + const int ig_start = blockIdx.x * blockDim.x + threadIdx.x; + const int ig_stride = gridDim.x * blockDim.x; + + if (iat >= nat) return; + + // Load atom information to registers + const FPTYPE tau_x = tau[iat * 3 + 0]; + const FPTYPE tau_y = tau[iat * 3 + 1]; + const FPTYPE tau_z = tau[iat * 3 + 2]; + const FPTYPE it_fact = it_facts[iat]; + + // Local accumulation variables + FPTYPE local_force_x = 0.0; + FPTYPE local_force_y = 0.0; + FPTYPE local_force_z = 0.0; + + // Grid-stride loop over G-vectors + for (int ig = ig_start; ig < npw; ig += ig_stride) { + // Skip G=0 term + if (ig == ig_gge0) continue; + + // Calculate phase: 2π * (G · τ) + const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + + gcar[ig * 3 + 1] * tau_y + + gcar[ig * 3 + 2] * tau_z); + + // Use HIP intrinsic for sincos + FPTYPE sinp, cosp; + sincos(phase, &sinp, &cosp); + + // Calculate Ewald sum contribution (fixed sign error) + const FPTYPE factor = it_fact * (-cosp * aux[ig].imag() + sinp * aux[ig].real()); + + // Accumulate force contributions + local_force_x += gcar[ig * 3 + 0] * factor; + local_force_y += gcar[ig * 3 + 1] * factor; + local_force_z += gcar[ig * 3 + 2] * factor; + } + + // Atomic add to global memory + atomicAdd(&force[iat * 3 + 0], local_force_x); + atomicAdd(&force[iat * 3 + 1], local_force_y); + atomicAdd(&force[iat * 3 + 2], local_force_z); +} + +// GPU operators +template +void cal_force_loc_sincos_op::operator()( + const base_device::DEVICE_GPU* ctx, + const int& nat, + const int& npw, + const int& ntype, + const FPTYPE* gcar, + const FPTYPE* tau, + const FPTYPE* vloc_per_type, + const std::complex* aux, + const FPTYPE& scale_factor, + FPTYPE* force) +{ + // Calculate optimal grid configuration for GPU load balancing + const int threads_per_block = THREADS_PER_BLOCK; + const int max_blocks_per_sm = 32; // Configurable per GPU architecture + const int max_blocks_x = std::min(max_blocks_per_sm, (npw + threads_per_block - 1) / threads_per_block); + + dim3 grid(max_blocks_x, nat); + dim3 block(threads_per_block); + + hipLaunchKernelGGL(cal_force_loc_sincos_kernel, + grid, block, 0, 0, + nat, npw, ntype, gcar, tau, vloc_per_type, + reinterpret_cast*>(aux), + scale_factor, force); + + hipCheckOnDebug(); +} + +template +void cal_force_ew_sincos_op::operator()( + const base_device::DEVICE_GPU* ctx, + const int& nat, + const int& npw, + const int& ig_gge0, + const FPTYPE* gcar, + const FPTYPE* tau, + const FPTYPE* it_facts, + const std::complex* aux, + FPTYPE* force) +{ + // Calculate optimal grid configuration for GPU load balancing + const int threads_per_block = THREADS_PER_BLOCK; + const int max_blocks_per_sm = 32; // Configurable per GPU architecture + const int max_blocks_x = std::min(max_blocks_per_sm, (npw + threads_per_block - 1) / threads_per_block); + + dim3 grid(max_blocks_x, nat); + dim3 block(threads_per_block); + + hipLaunchKernelGGL(cal_force_ew_sincos_kernel, + grid, block, 0, 0, + nat, npw, ig_gge0, gcar, tau, it_facts, + reinterpret_cast*>(aux), force); + + hipCheckOnDebug(); +} + template struct cal_vkb1_nl_op; template struct cal_force_nl_op; +template struct cal_force_loc_sincos_op; +template struct cal_force_ew_sincos_op; template struct cal_vkb1_nl_op; template struct cal_force_nl_op; +template struct cal_force_loc_sincos_op; +template struct cal_force_ew_sincos_op; } // namespace hamilt diff --git a/source/module_hsolver/diago_dav_subspace.cpp b/source/module_hsolver/diago_dav_subspace.cpp index b180d72c13..f07f4cfefa 100644 --- a/source/module_hsolver/diago_dav_subspace.cpp +++ b/source/module_hsolver/diago_dav_subspace.cpp @@ -7,7 +7,6 @@ #include "module_hsolver/kernels/dngvd_op.h" #include "module_hsolver/kernels/math_kernel_op.h" #include "module_base/kernels/dsp/dsp_connector.h" - #include using namespace hsolver; @@ -60,7 +59,7 @@ Diago_DavSubspace::Diago_DavSubspace(const std::vector& precond if (this->device == base_device::GpuDevice) { resmem_real_op()(this->ctx, this->d_precondition, nbasis_in); - // syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, this->d_precondition, this->precondition.data(), nbasis_in); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, this->d_precondition, this->precondition.data(), nbasis_in); } #endif } @@ -288,27 +287,28 @@ void Diago_DavSubspace::cal_grad(const HPsiFunc& hpsi_func, psi_iter + (nbase) * this->dim, this->dim); - std::vector e_temp_cpu(nbase, 0); + // Eigenvalues operation section + std::vector e_temp_cpu(this->notconv, 0); Real* e_temp_hd = e_temp_cpu.data(); - if(this->device == base_device::GpuDevice) + if (this->device == base_device::GpuDevice) { e_temp_hd = nullptr; - resmem_real_op()(this->ctx, e_temp_hd, nbase); + resmem_real_op()(this->ctx, e_temp_hd, this->notconv); } - for (int m = 0; m < notconv; m++) + + for (int m = 0; m < this->notconv; m++) { - e_temp_cpu.assign(nbase, (-1.0 * (*eigenvalue_iter)[m])); - if (this->device == base_device::GpuDevice) - { - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, e_temp_hd, e_temp_cpu.data(), nbase); - } - vector_mul_vector_op()(this->ctx, - nbase, - vcc + m * this->nbase_x, - vcc + m * this->nbase_x, - e_temp_hd); + e_temp_cpu[m] = -(*eigenvalue_iter)[m]; } - if(this->device == base_device::GpuDevice) + + if (this->device == base_device::GpuDevice) + { + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, e_temp_hd, e_temp_cpu.data(), this->notconv); + } + + apply_eigenvalues_op()(this->ctx, nbase, this->nbase_x, this->notconv, this->vcc, this->vcc, e_temp_hd); + + if (this->device == base_device::GpuDevice) { delmem_real_op()(this->ctx, e_temp_hd); } @@ -333,54 +333,62 @@ void Diago_DavSubspace::cal_grad(const HPsiFunc& hpsi_func, psi_iter + nbase * this->dim, this->dim); - // "precondition!!!" - std::vector pre(this->dim, 0.0); - for (int m = 0; m < notconv; m++) - { - for (size_t i = 0; i < this->dim; i++) - { - // pre[i] = std::abs(this->precondition[i] - (*eigenvalue_iter)[m]); - double x = std::abs(this->precondition[i] - (*eigenvalue_iter)[m]); - pre[i] = 0.5 * (1.0 + x + sqrt(1 + (x - 1.0) * (x - 1.0))); - } + // Precondition section #if defined(__CUDA) || defined(__ROCM) - if (this->device == base_device::GpuDevice) - { - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, this->d_precondition, pre.data(), this->dim); - vector_div_vector_op()(this->ctx, - this->dim, - psi_iter + (nbase + m) * this->dim, - psi_iter + (nbase + m) * this->dim, - this->d_precondition); - } - else + if (this->device == base_device::GpuDevice) + { + Real* eigenvalues_gpu = nullptr; + resmem_real_op()(this->ctx, eigenvalues_gpu, notconv); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, eigenvalues_gpu,(*eigenvalue_iter).data(), notconv); + + precondition_op()(this->ctx, + this->dim, + psi_iter, + nbase, + notconv, + d_precondition, + eigenvalues_gpu); + delmem_real_op()(this->ctx, eigenvalues_gpu); + } + else #endif - { - vector_div_vector_op()(this->ctx, - this->dim, - psi_iter + (nbase + m) * this->dim, - psi_iter + (nbase + m) * this->dim, - pre.data()); - } + { + precondition_op()(this->ctx, + this->dim, + psi_iter, + nbase, + notconv, + this->precondition.data(), + (*eigenvalue_iter).data()); } - // "normalize!!!" in order to improve numerical stability of subspace diagonalization - std::vector psi_norm(notconv, 0.0); - for (size_t i = 0; i < notconv; i++) + // Normalize section +#if defined(__CUDA) || defined(__ROCM) + if (this->device == base_device::GpuDevice) + { + Real* psi_norm = nullptr; + resmem_real_op()(this->ctx, psi_norm, notconv); + using setmem_real_op = base_device::memory::set_memory_op; + setmem_real_op()(this->ctx, psi_norm, 0.0, notconv); + + normalize_op()(this->ctx, + this->dim, + psi_iter, + nbase, + notconv, + psi_norm); + delmem_real_op()(this->ctx, psi_norm); + } + else +#endif { - psi_norm[i] = dot_real_op()(this->ctx, - this->dim, - psi_iter + (nbase + i) * this->dim, - psi_iter + (nbase + i) * this->dim, - true); - assert(psi_norm[i] > 0.0); - psi_norm[i] = sqrt(psi_norm[i]); - - vector_div_constant_op()(this->ctx, - this->dim, - psi_iter + (nbase + i) * this->dim, - psi_iter + (nbase + i) * this->dim, - psi_norm[i]); + Real* psi_norm = nullptr; + normalize_op()(this->ctx, + this->dim, + psi_iter, + nbase, + notconv, + psi_norm); } // update hpsi[:, nbase:nbase+notconv] diff --git a/source/module_hsolver/hsolver_pw.cpp b/source/module_hsolver/hsolver_pw.cpp index 0c1ad2e8b8..d0648addf5 100644 --- a/source/module_hsolver/hsolver_pw.cpp +++ b/source/module_hsolver/hsolver_pw.cpp @@ -280,47 +280,106 @@ void HSolverPW::solve(hamilt::Hamilt* pHamilt, std::vector eigenvalues(this->wfc_basis->nks * psi.get_nbands(), 0.0); ethr_band.resize(psi.get_nbands(), this->diag_thr); - /// Loop over k points for solve Hamiltonian to charge density - for (int ik = 0; ik < this->wfc_basis->nks; ++ik) - { - /// update H(k) for each k point - pHamilt->updateHk(ik); + // Initialize k-point continuity if enabled + static int count = 0; + if (use_k_continuity) { + build_k_neighbors(); + } + + // Loop over k points for solve Hamiltonian to charge density + if (use_k_continuity) { + // K-point continuity case + for (int i = 0; i < this->wfc_basis->nks; ++i) + { + const int ik = k_order[i]; + + // update H(k) for each k point + pHamilt->updateHk(ik); #ifdef USE_PAW - this->paw_func_in_kloop(ik, tpiba); + this->paw_func_in_kloop(ik, tpiba); #endif - /// update psi pointer for each k point - psi.fix_k(ik); + // update psi pointer for each k point + psi.fix_k(ik); + + // If using k-point continuity and not first k-point, propagate from parent + if (ik > 0 && count == 0 && k_parent.find(ik) != k_parent.end()) { + propagate_psi(psi, k_parent[ik], ik); + } - // template add precondition calculating here - update_precondition(precondition, ik, this->wfc_basis->npwk[ik], Real(pes->pot->get_vl_of_0())); + // template add precondition calculating here + update_precondition(precondition, ik, this->wfc_basis->npwk[ik], Real(pes->pot->get_vl_of_0())); - // use smooth threshold for all iter methods - if (PARAM.inp.diago_smooth_ethr == true) - { - this->cal_smooth_ethr(pes->klist->wk[ik], - &pes->wg(ik, 0), - DiagoIterAssist::PW_DIAG_THR, - ethr_band); - } + // use smooth threshold for all iter methods + if (PARAM.inp.diago_smooth_ethr == true) + { + this->cal_smooth_ethr(pes->klist->wk[ik], + &pes->wg(ik, 0), + DiagoIterAssist::PW_DIAG_THR, + ethr_band); + } #ifdef USE_PAW - this->call_paw_cell_set_currentk(ik); + this->call_paw_cell_set_currentk(ik); #endif - /// solve eigenvector and eigenvalue for H(k) - this->hamiltSolvePsiK(pHamilt, psi, precondition, eigenvalues.data() + ik * psi.get_nbands(), this->wfc_basis->nks); + // solve eigenvector and eigenvalue for H(k) + this->hamiltSolvePsiK(pHamilt, psi, precondition, eigenvalues.data() + ik * psi.get_nbands(), this->wfc_basis->nks); - if (skip_charge) + if (skip_charge) + { + GlobalV::ofs_running << "Average iterative diagonalization steps for k-points " << ik + << " is: " << DiagoIterAssist::avg_iter + << " ; where current threshold is: " << this->diag_thr << " . " << std::endl; + DiagoIterAssist::avg_iter = 0.0; + } + } + } + else { + // Original code without k-point continuity + for (int ik = 0; ik < this->wfc_basis->nks; ++ik) { - GlobalV::ofs_running << "Average iterative diagonalization steps for k-points " << ik - << " is: " << DiagoIterAssist::avg_iter - << " ; where current threshold is: " << this->diag_thr << " . " << std::endl; - DiagoIterAssist::avg_iter = 0.0; + // update H(k) for each k point + pHamilt->updateHk(ik); + +#ifdef USE_PAW + this->paw_func_in_kloop(ik, tpiba); +#endif + + // update psi pointer for each k point + psi.fix_k(ik); + + // template add precondition calculating here + update_precondition(precondition, ik, this->wfc_basis->npwk[ik], Real(pes->pot->get_vl_of_0())); + + // use smooth threshold for all iter methods + if (PARAM.inp.diago_smooth_ethr == true) + { + this->cal_smooth_ethr(pes->klist->wk[ik], + &pes->wg(ik, 0), + DiagoIterAssist::PW_DIAG_THR, + ethr_band); + } + +#ifdef USE_PAW + this->call_paw_cell_set_currentk(ik); +#endif + + // solve eigenvector and eigenvalue for H(k) + this->hamiltSolvePsiK(pHamilt, psi, precondition, eigenvalues.data() + ik * psi.get_nbands(), this->wfc_basis->nks); + + if (skip_charge) + { + GlobalV::ofs_running << "Average iterative diagonalization steps for k-points " << ik + << " is: " << DiagoIterAssist::avg_iter + << " ; where current threshold is: " << this->diag_thr << " . " << std::endl; + DiagoIterAssist::avg_iter = 0.0; + } } - /// calculate the contribution of Psi for charge density rho } + + count++; // END Loop over k points // copy eigenvalues to ekb in ElecState @@ -666,6 +725,101 @@ void HSolverPW::output_iterInfo() } } +template +void HSolverPW::build_k_neighbors() { + const int nk = this->wfc_basis->nks; + kvecs_c.resize(nk); + k_order.clear(); + k_order.reserve(nk); + + // Store k-points and corresponding indices + struct KPoint { + ModuleBase::Vector3 kvec; + int index; + double norm; + + KPoint(const ModuleBase::Vector3& v, int i) : + kvec(v), index(i), norm(v.norm()) {} + }; + + // Build k-point list + std::vector klist; + for (int ik = 0; ik < nk; ++ik) { + kvecs_c[ik] = this->wfc_basis->kvec_c[ik]; + klist.push_back(KPoint(kvecs_c[ik], ik)); + } + + // Sort k-points by distance from origin + std::sort(klist.begin(), klist.end(), + [](const KPoint& a, const KPoint& b) { + return a.norm < b.norm; + }); + + // Build parent-child relationships + k_order.push_back(klist[0].index); + + // Find nearest processed k-point as parent for each k-point + for (int i = 1; i < nk; ++i) { + int current_k = klist[i].index; + double min_dist = 1e10; + int parent = -1; + + // find the nearest k-point as parent + for (int j = 0; j < k_order.size(); ++j) { + int processed_k = k_order[j]; + double dist = (kvecs_c[current_k] - kvecs_c[processed_k]).norm2(); + if (dist < min_dist) { + min_dist = dist; + parent = processed_k; + } + } + + k_parent[current_k] = parent; + k_order.push_back(current_k); + } +} + +template +void HSolverPW::propagate_psi(psi::Psi& psi, const int from_ik, const int to_ik) { + const int nbands = psi.get_nbands(); + const int npwk = this->wfc_basis->npwk[to_ik]; + + // Get k-point difference + ModuleBase::Vector3 dk = kvecs_c[to_ik] - kvecs_c[from_ik]; + + // Allocate porter locally + T* porter = nullptr; + resmem_complex_op()(this->ctx, porter, this->wfc_basis->nmaxgr, "HSolverPW::porter"); + + // Process each band + for (int ib = 0; ib < nbands; ib++) + { + // Fix current k-point and band + // psi.fix_k(from_ik); + + // FFT to real space + // this->wfc_basis->recip_to_real(this->ctx, psi.get_pointer(ib), porter, from_ik); + this->wfc_basis->recip_to_real(this->ctx, &psi(from_ik, ib, 0), porter, from_ik); + + // Apply phase factor + // // TODO: Check how to get the r vector + // ModuleBase::Vector3 r = this->wfc_basis->get_ir2r(ir); + // double phase = this->wfc_basis->tpiba * (dk.x * r.x + dk.y * r.y + dk.z * r.z); + // psi_real[ir] *= std::exp(std::complex(0.0, phase)); + // } + + // Fix k-point for target + // psi.fix_k(to_ik); + + // FFT back to reciprocal space + // this->wfc_basis->real_to_recip(this->ctx, porter, psi.get_pointer(ib), to_ik, true); + this->wfc_basis->real_to_recip(this->ctx, porter, &psi(to_ik, ib, 0), to_ik); + } + + // Clean up porter + delmem_complex_op()(this->ctx, porter); +} + template class HSolverPW, base_device::DEVICE_CPU>; template class HSolverPW, base_device::DEVICE_CPU>; #if ((defined __CUDA) || (defined __ROCM)) @@ -673,4 +827,4 @@ template class HSolverPW, base_device::DEVICE_GPU>; template class HSolverPW, base_device::DEVICE_GPU>; #endif -} // namespace hsolver \ No newline at end of file +} // namespace hsolver diff --git a/source/module_hsolver/hsolver_pw.h b/source/module_hsolver/hsolver_pw.h index 0dee4fbdbf..7f0bfc7c23 100644 --- a/source/module_hsolver/hsolver_pw.h +++ b/source/module_hsolver/hsolver_pw.h @@ -6,6 +6,8 @@ #include "module_base/macros.h" #include "module_basis/module_pw/pw_basis_k.h" #include "module_psi/wavefunc.h" +#include +#include "module_base/memory.h" namespace hsolver { @@ -18,6 +20,9 @@ class HSolverPW // return T if T is real type(float, double), // otherwise return the real type of T(complex, complex) using Real = typename GetTypeReal::type; + using resmem_complex_op = base_device::memory::resize_memory_op; + using delmem_complex_op = base_device::memory::delete_memory_op; + using setmem_complex_op = base_device::memory::set_memory_op; public: HSolverPW(ModulePW::PW_Basis_K* wfc_basis_in, @@ -30,10 +35,12 @@ class HSolverPW const int scf_iter_in, const int diag_iter_max_in, const double diag_thr_in, - const bool need_subspace_in) + const bool need_subspace_in, + const bool use_k_continuity_in = false) : wfc_basis(wfc_basis_in), calculation_type(calculation_type_in), basis_type(basis_type_in), method(method_in), use_paw(use_paw_in), use_uspp(use_uspp_in), nspin(nspin_in), scf_iter(scf_iter_in), - diag_iter_max(diag_iter_max_in), diag_thr(diag_thr_in), need_subspace(need_subspace_in){}; + diag_iter_max(diag_iter_max_in), diag_thr(diag_thr_in), need_subspace(need_subspace_in), + use_k_continuity(use_k_continuity_in) {}; /// @brief solve function for pw /// @param pHamilt interface to hamilt @@ -51,6 +58,7 @@ class HSolverPW const double tpiba, const int nat); + protected: // diago caller void hamiltSolvePsiK(hamilt::Hamilt* hm, @@ -79,6 +87,8 @@ class HSolverPW const bool need_subspace; // for cg or dav_subspace + const bool use_k_continuity; + protected: Device* ctx = {}; @@ -99,8 +109,16 @@ class HSolverPW void paw_func_after_kloop(psi::Psi& psi, elecstate::ElecState* pes,const double tpiba,const int nat); #endif + + // K-point continuity related members + std::vector k_order; + std::unordered_map k_parent; + std::vector> kvecs_c; + + void build_k_neighbors(); + void propagate_psi(psi::Psi& psi, const int from_ik, const int to_ik); }; } // namespace hsolver -#endif \ No newline at end of file +#endif diff --git a/source/module_hsolver/kernels/cuda/math_kernel_op.cu b/source/module_hsolver/kernels/cuda/math_kernel_op.cu index 149b9ce389..8fb7c542eb 100644 --- a/source/module_hsolver/kernels/cuda/math_kernel_op.cu +++ b/source/module_hsolver/kernels/cuda/math_kernel_op.cu @@ -42,6 +42,48 @@ struct GetTypeThrust> { static cublasHandle_t cublas_handle = nullptr; +// Forward declarations for abs2 +template +__device__ typename GetTypeReal::type abs2(const T& x); + +// Specialization for real types (double) +template<> +__device__ double abs2(const double& x) { + return x * x; +} + +// Specialization for real types (float) +template<> +__device__ float abs2(const float& x) { + return x * x; +} + +// Specialization for complex float +template<> +__device__ float abs2(const thrust::complex& x) { + return x.real() * x.real() + x.imag() * x.imag(); +} + +// Specialization for complex double +template<> +__device__ double abs2(const thrust::complex& x) { + return x.real() * x.real() + x.imag() * x.imag(); +} + +// Specialization for std::complex (for interface compatibility) +template<> +__device__ float abs2(const std::complex& x) { + const thrust::complex* tx = reinterpret_cast*>(&x); + return tx->real() * tx->real() + tx->imag() * tx->imag(); +} + +// Specialization for std::complex (for interface compatibility) +template<> +__device__ double abs2(const std::complex& x) { + const thrust::complex* tx = reinterpret_cast*>(&x); + return tx->real() * tx->real() + tx->imag() * tx->imag(); +} + static inline void xdot_wrapper(const int &n, const float * x, const int &incx, const float * y, const int &incy, float &result) { cublasErrcheck(cublasSdot(cublas_handle, n, x, incx, y, incy, &result)); @@ -1035,6 +1077,320 @@ void matrixSetToAnother, base_device::DEVICE_GPU>::operator cudaCheckOnDebug(); } +// Kernel for applying eigenvalues to vectors + +template +__global__ void apply_eigenvalues_kernel(const typename GetTypeReal::type* eigenvalues, const T* vectors, T* result, int nbase, int nbase_x, int notconv) +{ + const int tid = blockIdx.x * blockDim.x + threadIdx.x; // Linear thread ID + const int total_elements = notconv * nbase; + + if (tid < total_elements) + { + const int m = tid / nbase; // Row index (eigenvalue index) + const int idx = tid % nbase; // Column index within the row + result[m * nbase_x + idx] = eigenvalues[m] * vectors[m * nbase_x + idx]; + } +} + +// Wrapper for applying eigenvalues to complex vectors +template +inline void apply_eigenvalues_complex_wrapper(const base_device::DEVICE_GPU* d, + const int& nbase, + const int& nbase_x, + const int& notconv, + const FPTYPE* eigenvalues, + const std::complex* vectors, + std::complex* result) +{ + thrust::complex* result_tmp = reinterpret_cast*>(result); + const thrust::complex* vectors_tmp = reinterpret_cast*>(vectors); + const int total_elements = notconv * nbase; + const int threadsPerBlock = 256; + const int numBlocks = (total_elements + threadsPerBlock - 1) / threadsPerBlock; + + apply_eigenvalues_kernel><<>>( + eigenvalues, vectors_tmp, result_tmp, nbase, nbase_x, notconv); + + cudaCheckOnDebug(); +} + +// Specialization for double +template <> +void apply_eigenvalues_op::operator()(const base_device::DEVICE_GPU *d, + const int &nbase, const int &nbase_x, const int ¬conv, + double *result, const double *vectors, const double *eigenvalues) { + const int total_elements = notconv * nbase; + const int threadsPerBlock = 256; + const int numBlocks = (total_elements + threadsPerBlock - 1) / threadsPerBlock; + + apply_eigenvalues_kernel<<>>( + eigenvalues, vectors, result, nbase, nbase_x, notconv); + cudaCheckOnDebug(); +} + +// Specialization for std::complex +template <> +void apply_eigenvalues_op, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU *d, + const int &nbase, const int &nbase_x, const int ¬conv, + std::complex *result, const std::complex *vectors, const float *eigenvalues) { + apply_eigenvalues_complex_wrapper(d, nbase, nbase_x, notconv, eigenvalues, vectors, result); +} + +// Specialization for std::complex +template <> +void apply_eigenvalues_op, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU *d, + const int &nbase, const int &nbase_x, const int ¬conv, + std::complex *result, const std::complex *vectors, const double *eigenvalues) { + apply_eigenvalues_complex_wrapper(d, nbase, nbase_x, notconv, eigenvalues, vectors, result); +} + +template +__global__ void precondition_kernel( + const int dim, + const int notconv, + T* psi_iter, + const int nbase, + const typename GetTypeReal::type* precondition, // Real type + const typename GetTypeReal::type* eigenvalues) +{ + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < dim * notconv) { + const int i = tid % dim; // Basis index + const int m = tid / dim; // Band index + + using Real = typename GetTypeReal::type; + Real x = abs(precondition[i] - eigenvalues[m]); + Real pre = 0.5 * (1.0 + x + sqrt(1.0 + (x - 1.0) * (x - 1.0))); + psi_iter[(nbase + m) * dim + i] /= pre; + } +} + +// Specialization for double (for LCAO) +template <> +void precondition_op::operator()( + const base_device::DEVICE_GPU* ctx, + const int& dim, + double* psi_iter, + const int& nbase, + const int& notconv, + const double* precondition, + const double* eigenvalues) +{ + const int total_elements = dim * notconv; + const int threadsPerBlock = thread_per_block; + const int numBlocks = (total_elements + threadsPerBlock - 1) / threadsPerBlock; + + precondition_kernel<<>>( + dim, notconv, psi_iter, nbase, precondition, eigenvalues); + + cudaCheckOnDebug(); +} + +// Specialization for complex +template <> +void precondition_op, base_device::DEVICE_GPU>::operator()( + const base_device::DEVICE_GPU* ctx, + const int& dim, + std::complex* psi_iter, + const int& nbase, + const int& notconv, + const float* precondition, + const float* eigenvalues) +{ + const int total_elements = dim * notconv; + const int threadsPerBlock = thread_per_block; + const int numBlocks = (total_elements + threadsPerBlock - 1) / threadsPerBlock; + + precondition_kernel><<>>( + dim, notconv, + reinterpret_cast*>(psi_iter), + nbase, + precondition, // Already float + eigenvalues); + + cudaCheckOnDebug(); +} + +// Specialization for complex +template <> +void precondition_op, base_device::DEVICE_GPU>::operator()( + const base_device::DEVICE_GPU* ctx, + const int& dim, + std::complex* psi_iter, + const int& nbase, + const int& notconv, + const double* precondition, + const double* eigenvalues) +{ + const int total_elements = dim * notconv; + const int threadsPerBlock = thread_per_block; + const int numBlocks = (total_elements + threadsPerBlock - 1) / threadsPerBlock; + + precondition_kernel><<>>( + dim, notconv, + reinterpret_cast*>(psi_iter), + nbase, + precondition, // Already double + eigenvalues); + + cudaCheckOnDebug(); +} + +// First kernel: calculate norms +template +__global__ void calculate_norm_kernel( + const int dim, + const int notconv, + const T* psi_iter, + const int nbase, + typename GetTypeReal::type* psi_norm) +{ + using Real = typename GetTypeReal::type; + __shared__ Real shared_mem[256]; + + const int tid = threadIdx.x; + const int bid = blockIdx.x; + const int m = bid / ((dim + blockDim.x - 1) / blockDim.x); + const int num_blocks_per_band = (dim + blockDim.x - 1) / blockDim.x; + const int block_in_band = bid % num_blocks_per_band; + + if (m >= notconv) return; + + // Initialize shared memory + shared_mem[tid] = 0.0; + + // Each thread processes one element + const int i = tid + block_in_band * blockDim.x; + if (i < dim) { + shared_mem[tid] = abs2(psi_iter[(nbase + m) * dim + i]); + } + __syncthreads(); + + // Parallel reduction in shared memory + for (int stride = blockDim.x/2; stride > 0; stride >>= 1) { + if (tid < stride) { + shared_mem[tid] += shared_mem[tid + stride]; + } + __syncthreads(); + } + + // First thread of first block initializes norm to 0 + // if (tid == 0 && block_in_band == 0) { + // psi_norm[m] = 0.0; + // } + // __syncthreads(); + + // Write result for this block to global memory + if (tid == 0) { + atomicAdd(&psi_norm[m], shared_mem[0]); + } +} + +// Second kernel: normalize vectors +template +__global__ void normalize_vectors_kernel( + const int dim, + const int notconv, + T* psi_iter, + const int nbase, + const typename GetTypeReal::type* psi_norm) +{ + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + const int m = tid / dim; + const int i = tid % dim; + + if (m < notconv && i < dim) { + psi_iter[(nbase + m) * dim + i] = psi_iter[(nbase + m) * dim + i] / sqrt(psi_norm[m]); + } +} + + +// Specialization for complex +template <> +void normalize_op, base_device::DEVICE_GPU>::operator()( + const base_device::DEVICE_GPU* ctx, + const int& dim, + std::complex* psi_iter, + const int& nbase, + const int& notconv, + float* psi_norm) +{ + const int threadsPerBlock = 256; + const int numBlocksForNorm = ((dim + threadsPerBlock - 1) / threadsPerBlock) * notconv; + + // First kernel: calculate norms + calculate_norm_kernel><<>>( + dim, notconv, + reinterpret_cast*>(psi_iter), + nbase, psi_norm); + cudaDeviceSynchronize(); // Ensure all norms are computed + + // Second kernel: normalize vectors + const int total_elements = dim * notconv; + const int numBlocksForNormalize = (total_elements + threadsPerBlock - 1) / threadsPerBlock; + normalize_vectors_kernel><<>>( + dim, notconv, + reinterpret_cast*>(psi_iter), + nbase, psi_norm); + cudaCheckOnDebug(); +} + +// Specialization for complex +template <> +void normalize_op, base_device::DEVICE_GPU>::operator()( + const base_device::DEVICE_GPU* ctx, + const int& dim, + std::complex* psi_iter, + const int& nbase, + const int& notconv, + double* psi_norm) +{ + const int threadsPerBlock = 256; + const int numBlocksForNorm = ((dim + threadsPerBlock - 1) / threadsPerBlock) * notconv; + + // First kernel: calculate norms + calculate_norm_kernel><<>>( + dim, notconv, + reinterpret_cast*>(psi_iter), + nbase, psi_norm); + cudaDeviceSynchronize(); // Ensure all norms are computed + + // Second kernel: normalize vectors + const int total_elements = dim * notconv; + const int numBlocksForNormalize = (total_elements + threadsPerBlock - 1) / threadsPerBlock; + normalize_vectors_kernel><<>>( + dim, notconv, + reinterpret_cast*>(psi_iter), + nbase, psi_norm); + cudaCheckOnDebug(); +} + +// Specialization for double (for LCAO) +template <> +void normalize_op::operator()( + const base_device::DEVICE_GPU* ctx, + const int& dim, + double* psi_iter, + const int& nbase, + const int& notconv, + double* psi_norm) +{ + const int threadsPerBlock = 256; + const int numBlocksForNorm = ((dim + threadsPerBlock - 1) / threadsPerBlock) * notconv; + + // First kernel: calculate norms + calculate_norm_kernel<<>>( + dim, notconv, psi_iter, nbase, psi_norm); + cudaDeviceSynchronize(); // Ensure all norms are computed + + // Second kernel: normalize vectors + const int total_elements = dim * notconv; + const int numBlocksForNormalize = (total_elements + threadsPerBlock - 1) / threadsPerBlock; + normalize_vectors_kernel<<>>( + dim, notconv, psi_iter, nbase, psi_norm); + cudaCheckOnDebug(); +} // Explicitly instantiate functors for the types of functor registered. template struct dot_real_op, base_device::DEVICE_GPU>; @@ -1045,6 +1401,9 @@ template struct vector_mul_vector_op, base_device::DEVICE_GP template struct vector_div_vector_op, base_device::DEVICE_GPU>; template struct constantvector_addORsub_constantVector_op, base_device::DEVICE_GPU>; template struct matrixSetToAnother, base_device::DEVICE_GPU>; +template struct apply_eigenvalues_op, base_device::DEVICE_GPU>; +template struct precondition_op, base_device::DEVICE_GPU>; +template struct normalize_op, base_device::DEVICE_GPU>; template struct dot_real_op, base_device::DEVICE_GPU>; template struct calc_grad_with_block_op, base_device::DEVICE_GPU>; @@ -1054,6 +1413,9 @@ template struct vector_mul_vector_op, base_device::DEVICE_G template struct vector_div_vector_op, base_device::DEVICE_GPU>; template struct constantvector_addORsub_constantVector_op, base_device::DEVICE_GPU>; template struct matrixSetToAnother, base_device::DEVICE_GPU>; +template struct apply_eigenvalues_op, base_device::DEVICE_GPU>; +template struct precondition_op, base_device::DEVICE_GPU>; +template struct normalize_op, base_device::DEVICE_GPU>; #ifdef __LCAO template struct dot_real_op; @@ -1062,5 +1424,8 @@ template struct vector_mul_vector_op; template struct vector_div_vector_op; template struct matrixSetToAnother; template struct constantvector_addORsub_constantVector_op; +template struct apply_eigenvalues_op; +template struct precondition_op; +template struct normalize_op; #endif } // namespace hsolver diff --git a/source/module_hsolver/kernels/math_kernel_op.cpp b/source/module_hsolver/kernels/math_kernel_op.cpp index c3784949ab..87cd549482 100644 --- a/source/module_hsolver/kernels/math_kernel_op.cpp +++ b/source/module_hsolver/kernels/math_kernel_op.cpp @@ -2,6 +2,7 @@ #include #include +#include namespace hsolver { @@ -173,6 +174,51 @@ struct vector_mul_vector_op } }; +template +struct apply_eigenvalues_op +{ + using Real = typename GetTypeReal::type; + void operator()(const base_device::DEVICE_CPU* d, const int& nbase, const int& nbase_x, const int& notconv, T* result, const T* vectors, const Real* eigenvalues) + { + for (int m = 0; m < notconv; m++) + { + for (int idx = 0; idx < nbase; idx++) + { + result[m * nbase_x + idx] = eigenvalues[m] * vectors[m * nbase_x + idx]; + } + } + } +}; + +template +struct precondition_op { + using Real = typename GetTypeReal::type; + void operator()(const base_device::DEVICE_CPU* d, + const int& dim, + T* psi_iter, + const int& nbase, + const int& notconv, + const Real* precondition, + const Real* eigenvalues) + { + using Real = typename GetTypeReal::type; + std::vector pre(dim, 0.0); + for (int m = 0; m < notconv; m++) + { + for (size_t i = 0; i < dim; i++) + { + Real x = std::abs(precondition[i] - eigenvalues[m]); + pre[i] = 0.5 * (1.0 + x + sqrt(1 + (x - 1.0) * (x - 1.0))); + } + vector_div_vector_op()(d, + dim, + psi_iter + (nbase + m) * dim, + psi_iter + (nbase + m) * dim, + pre.data()); + } + } +}; + template struct vector_div_vector_op { @@ -355,6 +401,40 @@ struct matrixSetToAnother } }; +template +struct normalize_op { + void operator()(const base_device::DEVICE_CPU* d, + const int& dim, + T* psi_iter, + const int& nbase, + const int& notconv, + typename GetTypeReal::type* psi_norm = nullptr) + { + using Real = typename GetTypeReal::type; + for (int m = 0; m < notconv; m++) + { + // Calculate norm using dot_real_op + Real psi_m_norm = dot_real_op()(d, + dim, + psi_iter + (nbase + m) * dim, + psi_iter + (nbase + m) * dim, + true); + assert(psi_m_norm > 0.0); + psi_m_norm = sqrt(psi_m_norm); + + // Normalize using vector_div_constant_op + vector_div_constant_op()(d, + dim, + psi_iter + (nbase + m) * dim, + psi_iter + (nbase + m) * dim, + psi_m_norm); + if (psi_norm) { + psi_norm[m] = psi_m_norm; + } + } + } +}; + // Explicitly instantiate functors for the types of functor registered. template struct scal_op; template struct axpy_op, base_device::DEVICE_CPU>; @@ -365,6 +445,9 @@ template struct gemm_op; template struct dot_real_op, base_device::DEVICE_CPU>; template struct vector_div_constant_op, base_device::DEVICE_CPU>; template struct vector_mul_vector_op, base_device::DEVICE_CPU>; +template struct apply_eigenvalues_op, base_device::DEVICE_CPU>; +template struct precondition_op, base_device::DEVICE_CPU>; +template struct normalize_op, base_device::DEVICE_CPU>; template struct vector_div_vector_op, base_device::DEVICE_CPU>; template struct constantvector_addORsub_constantVector_op, base_device::DEVICE_CPU>; template struct matrixTranspose_op, base_device::DEVICE_CPU>; @@ -381,6 +464,9 @@ template struct gemm_op; template struct dot_real_op, base_device::DEVICE_CPU>; template struct vector_div_constant_op, base_device::DEVICE_CPU>; template struct vector_mul_vector_op, base_device::DEVICE_CPU>; +template struct apply_eigenvalues_op, base_device::DEVICE_CPU>; +template struct precondition_op, base_device::DEVICE_CPU>; +template struct normalize_op, base_device::DEVICE_CPU>; template struct vector_div_vector_op, base_device::DEVICE_CPU>; template struct constantvector_addORsub_constantVector_op, base_device::DEVICE_CPU>; template struct matrixTranspose_op, base_device::DEVICE_CPU>; @@ -392,11 +478,14 @@ template struct line_minimize_with_block_op, base_device::D template struct axpy_op; template struct dot_real_op; template struct vector_mul_vector_op; +template struct apply_eigenvalues_op; +template struct precondition_op; template struct vector_div_constant_op; template struct vector_div_vector_op; template struct matrixTranspose_op; template struct matrixSetToAnother; template struct constantvector_addORsub_constantVector_op; +template struct normalize_op; #endif #ifdef __DSP template struct gemm_op_mt, base_device::DEVICE_CPU>; diff --git a/source/module_hsolver/kernels/math_kernel_op.h b/source/module_hsolver/kernels/math_kernel_op.h index 0daf0e5718..8318c4310b 100644 --- a/source/module_hsolver/kernels/math_kernel_op.h +++ b/source/module_hsolver/kernels/math_kernel_op.h @@ -325,6 +325,48 @@ template struct matrixSetToAnother { T *B, const int &LDB); }; +template +struct apply_eigenvalues_op { + using Real = typename GetTypeReal::type; + + void operator()(const Device *d, const int &nbase, const int &nbase_x, const int ¬conv, + T *result, const T *vectors, const Real *eigenvalues); +}; + +template +struct precondition_op { + using Real = typename GetTypeReal::type; + void operator()(const Device* d, + const int& dim, + T* psi_iter, + const int& nbase, + const int& notconv, + const Real* precondition, + const Real* eigenvalues); +}; + +template +struct normalize_op { + using Real = typename GetTypeReal::type; + void operator()(const Device* d, + const int& dim, + T* psi_iter, + const int& nbase, + const int& notconv, + Real* psi_norm = nullptr); +}; + +template +struct normalize_op { + using Real = typename GetTypeReal::type; + void operator()(const base_device::DEVICE_GPU* d, + const int& dim, + T* psi_iter, + const int& nbase, + const int& notconv, + Real* psi_norm); +}; + #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM template @@ -393,6 +435,26 @@ template struct matrixSetToAnother { void createGpuBlasHandle(); void destoryBLAShandle(); +// vector operator: result[i] = -lambda[i] * vector[i] +template struct apply_eigenvalues_op { + using Real = typename GetTypeReal::type; + + void operator()(const base_device::DEVICE_GPU *d, const int &nbase, const int &nbase_x, const int ¬conv, + T *result, const T *vectors, const Real *eigenvalues); +}; + +template +struct precondition_op { + using Real = typename GetTypeReal::type; + void operator()(const base_device::DEVICE_GPU* d, + const int& dim, + T* psi_iter, + const int& nbase, + const int& notconv, + const Real* precondition, + const Real* eigenvalues); +}; + #endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM } // namespace hsolver diff --git a/source/module_hsolver/kernels/rocm/dngvd_op.hip.cu b/source/module_hsolver/kernels/rocm/dngvd_op.hip.cu index 5eeb1697e8..8cba06db1b 100644 --- a/source/module_hsolver/kernels/rocm/dngvd_op.hip.cu +++ b/source/module_hsolver/kernels/rocm/dngvd_op.hip.cu @@ -5,12 +5,25 @@ namespace hsolver { +// NOTE: mimicked from ../cuda/dngvd_op.cu for three dngvd_op + +static hipsolverHandle_t hipsolver_H = nullptr; +// Test on DCU platform. When nstart is greater than 234, code on DCU performs better. +const int N_DCU = 234; + void createGpuSolverHandle() { - return; + if (hipsolver_H == nullptr) + { + hipsolverErrcheck(hipsolverCreate(&hipsolver_H)); + } } void destroyGpuSolverHandle() { - return; + if (hipsolver_H != nullptr) + { + hipsolverErrcheck(hipsolverDestroy(hipsolver_H)); + hipsolver_H = nullptr; + } } #ifdef __LCAO @@ -23,22 +36,68 @@ void dngvd_op::operator()(const base_device::DE double* _eigenvalue, double* _vcc) { - std::vector hcc(nstart * nstart, 0.0); - std::vector scc(nstart * nstart, 0.0); - std::vector vcc(nstart * nstart, 0.0); - std::vector eigenvalue(nstart, 0); - hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(double) * hcc.size(), hipMemcpyDeviceToHost)); - hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(double) * scc.size(), hipMemcpyDeviceToHost)); - base_device::DEVICE_CPU* cpu_ctx = {}; - dngvd_op()(cpu_ctx, - nstart, - ldh, - hcc.data(), - scc.data(), - eigenvalue.data(), - vcc.data()); - hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(double) * vcc.size(), hipMemcpyHostToDevice)); - hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(double) * eigenvalue.size(), hipMemcpyHostToDevice)); + // copied from ../cuda/dngvd_op.cu, "dngvd_op" + assert(nstart == ldh); + + if (nstart > N_DCU){ + hipErrcheck(hipMemcpy(_vcc, _hcc, sizeof(double) * ldh * nstart, hipMemcpyDeviceToDevice)); + // now vcc contains hcc + + // prepare some values for hipsolverDnZhegvd_bufferSize + int * devInfo = nullptr; + int lwork = 0, info_gpu = 0; + double * work = nullptr; + hipErrcheck(hipMalloc((void**)&devInfo, sizeof(int))); + hipsolverFillMode_t uplo = HIPSOLVER_FILL_MODE_UPPER; + + // calculate the sizes needed for pre-allocated buffer. + hipsolverErrcheck(hipsolverDnDsygvd_bufferSize( + hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo, + nstart, + _vcc, ldh, + _scc, ldh, + _eigenvalue, + &lwork)); + + // allocate memery + hipErrcheck(hipMalloc((void**)&work, sizeof(double) * lwork)); + + // compute eigenvalues and eigenvectors. + hipsolverErrcheck(hipsolverDnDsygvd( + hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo, + nstart, + _vcc, ldh, + const_cast(_scc), ldh, + _eigenvalue, + work, lwork, devInfo)); + + hipErrcheck(hipMemcpy(&info_gpu, devInfo, sizeof(int), hipMemcpyDeviceToHost)); + + // free the buffer + hipErrcheck(hipFree(work)); + hipErrcheck(hipFree(devInfo)); + } + // if(fail_info != nullptr) *fail_info = info_gpu; + else{ + std::vector hcc(nstart * nstart, 0.0); + std::vector scc(nstart * nstart, 0.0); + std::vector vcc(nstart * nstart, 0.0); + std::vector eigenvalue(nstart, 0); + hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(double) * hcc.size(), hipMemcpyDeviceToHost)); + hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(double) * scc.size(), hipMemcpyDeviceToHost)); + base_device::DEVICE_CPU* cpu_ctx = {}; + dngvd_op()(cpu_ctx, + nstart, + ldh, + hcc.data(), + scc.data(), + eigenvalue.data(), + vcc.data()); + hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(double) * vcc.size(), hipMemcpyHostToDevice)); + hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(double) * eigenvalue.size(), hipMemcpyHostToDevice)); + } + + } #endif // __LCAO @@ -51,22 +110,67 @@ void dngvd_op, base_device::DEVICE_GPU>::operator()(const ba float* _eigenvalue, std::complex* _vcc) { - std::vector> hcc(nstart * nstart, {0, 0}); - std::vector> scc(nstart * nstart, {0, 0}); - std::vector> vcc(nstart * nstart, {0, 0}); - std::vector eigenvalue(nstart, 0); - hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(std::complex) * hcc.size(), hipMemcpyDeviceToHost)); - hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(std::complex) * scc.size(), hipMemcpyDeviceToHost)); - base_device::DEVICE_CPU* cpu_ctx = {}; - dngvd_op, base_device::DEVICE_CPU>()(cpu_ctx, - nstart, - ldh, - hcc.data(), - scc.data(), - eigenvalue.data(), - vcc.data()); - hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(std::complex) * vcc.size(), hipMemcpyHostToDevice)); - hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(float) * eigenvalue.size(), hipMemcpyHostToDevice)); + // copied from ../cuda/dngvd_op.cu, "dngvd_op" + assert(nstart == ldh); + + if (nstart > N_DCU){ + hipErrcheck(hipMemcpy(_vcc, _hcc, sizeof(std::complex) * ldh * nstart, hipMemcpyDeviceToDevice)); + // now vcc contains hcc + + // prepare some values for hipsolverDnZhegvd_bufferSize + int * devInfo = nullptr; + int lwork = 0, info_gpu = 0; + float2 * work = nullptr; + hipErrcheck(hipMalloc((void**)&devInfo, sizeof(int))); + hipsolverFillMode_t uplo = HIPSOLVER_FILL_MODE_UPPER; + + // calculate the sizes needed for pre-allocated buffer. + hipsolverErrcheck(hipsolverDnChegvd_bufferSize( + hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo, + nstart, + reinterpret_cast(_vcc), ldh, + reinterpret_cast(_scc), ldh, + _eigenvalue, + &lwork)); + + // allocate memery + hipErrcheck(hipMalloc((void**)&work, sizeof(float2) * lwork)); + + // compute eigenvalues and eigenvectors. + hipsolverErrcheck(hipsolverDnChegvd( + hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo, + nstart, + reinterpret_cast(_vcc), ldh, + const_cast(reinterpret_cast(_scc)), ldh, + _eigenvalue, + work, lwork, devInfo)); + + hipErrcheck(hipMemcpy(&info_gpu, devInfo, sizeof(int), hipMemcpyDeviceToHost)); + // free the buffer + hipErrcheck(hipFree(work)); + hipErrcheck(hipFree(devInfo)); + } + // if(fail_info != nullptr) *fail_info = info_gpu; + else{ + std::vector> hcc(nstart * nstart, {0, 0}); + std::vector> scc(nstart * nstart, {0, 0}); + std::vector> vcc(nstart * nstart, {0, 0}); + std::vector eigenvalue(nstart, 0); + hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(std::complex) * hcc.size(), hipMemcpyDeviceToHost)); + hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(std::complex) * scc.size(), hipMemcpyDeviceToHost)); + base_device::DEVICE_CPU* cpu_ctx = {}; + dngvd_op, base_device::DEVICE_CPU>()(cpu_ctx, + nstart, + ldh, + hcc.data(), + scc.data(), + eigenvalue.data(), + vcc.data()); + hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(std::complex) * vcc.size(), hipMemcpyHostToDevice)); + hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(float) * eigenvalue.size(), hipMemcpyHostToDevice)); + } + + } template <> @@ -76,24 +180,80 @@ void dngvd_op, base_device::DEVICE_GPU>::operator()(const b const std::complex* _hcc, const std::complex* _scc, double* _eigenvalue, - std::complex* _vcc) + std::complex* _vcc + ) { - std::vector> hcc(nstart * nstart, {0, 0}); - std::vector> scc(nstart * nstart, {0, 0}); - std::vector> vcc(nstart * nstart, {0, 0}); - std::vector eigenvalue(nstart, 0); - hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(std::complex) * hcc.size(), hipMemcpyDeviceToHost)); - hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(std::complex) * scc.size(), hipMemcpyDeviceToHost)); - base_device::DEVICE_CPU* cpu_ctx = {}; - dngvd_op, base_device::DEVICE_CPU>()(cpu_ctx, - nstart, - ldh, - hcc.data(), - scc.data(), - eigenvalue.data(), - vcc.data()); - hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(std::complex) * vcc.size(), hipMemcpyHostToDevice)); - hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(double) * eigenvalue.size(), hipMemcpyHostToDevice)); + // copied from ../cuda/dngvd_op.cu, "dngvd_op" + assert(nstart == ldh); + + // save a copy of scc in case the diagonalization fails + if (nstart > N_DCU){ + std::vector> scc(nstart * nstart, {0, 0}); + hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(std::complex) * scc.size(), hipMemcpyDeviceToHost)); + + hipErrcheck(hipMemcpy(_vcc, _hcc, sizeof(std::complex) * ldh * nstart, hipMemcpyDeviceToDevice)); + + // now vcc contains hcc + + // prepare some values for hipsolverDnZhegvd_bufferSize + int * devInfo = nullptr; + int lwork = 0, info_gpu = 0; + double2 * work = nullptr; + hipErrcheck(hipMalloc((void**)&devInfo, sizeof(int))); + hipsolverFillMode_t uplo = HIPSOLVER_FILL_MODE_UPPER; + + // calculate the sizes needed for pre-allocated buffer. + hipsolverErrcheck(hipsolverDnZhegvd_bufferSize( + hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo, + nstart, + reinterpret_cast(_vcc), ldh, + reinterpret_cast(_scc), ldh, + _eigenvalue, + &lwork)); + + // allocate memery + hipErrcheck(hipMalloc((void**)&work, sizeof(double2) * lwork)); + + // compute eigenvalues and eigenvectors. + hipsolverErrcheck(hipsolverDnZhegvd( + hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo, + nstart, + reinterpret_cast(_vcc), ldh, + const_cast(reinterpret_cast(_scc)), ldh, + _eigenvalue, + work, lwork, devInfo)); + + hipErrcheck(hipMemcpy(&info_gpu, devInfo, sizeof(int), hipMemcpyDeviceToHost)); + // free the buffer + hipErrcheck(hipFree(work)); + hipErrcheck(hipFree(devInfo)); + } + // if(fail_info != nullptr) *fail_info = info_gpu; + else{ + std::vector> hcc(nstart * nstart, {0, 0}); + std::vector> scc(nstart * nstart, {0, 0}); + std::vector> vcc(nstart * nstart, {0, 0}); + std::vector eigenvalue(nstart, 0); + hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(std::complex) * hcc.size(), hipMemcpyDeviceToHost)); + hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(std::complex) * scc.size(), hipMemcpyDeviceToHost)); + base_device::DEVICE_CPU* cpu_ctx = {}; + dngvd_op, base_device::DEVICE_CPU>()(cpu_ctx, + nstart, + ldh, + hcc.data(), + scc.data(), + eigenvalue.data(), + vcc.data()); + hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(std::complex) * vcc.size(), hipMemcpyHostToDevice)); + hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(double) * eigenvalue.size(), hipMemcpyHostToDevice)); + } + + + + + + + } #ifdef __LCAO diff --git a/source/module_hsolver/kernels/rocm/math_kernel_op.hip.cu b/source/module_hsolver/kernels/rocm/math_kernel_op.hip.cu index ef5a1c1ece..4ca6af509c 100644 --- a/source/module_hsolver/kernels/rocm/math_kernel_op.hip.cu +++ b/source/module_hsolver/kernels/rocm/math_kernel_op.hip.cu @@ -11,6 +11,7 @@ #define WARP_SIZE 32 #define FULL_MASK 0xffffffff #define THREAD_PER_BLOCK 256 + template <> struct GetTypeReal> { using type = float; /**< The return type specialization for std::complex. */ @@ -20,6 +21,49 @@ struct GetTypeReal> { using type = double; /**< The return type specialization for std::complex. */ }; +// Forward declarations for abs2 +template +__device__ typename GetTypeReal::type abs2(const T& x); + +// Specialization for real types (double) +template<> +__device__ double abs2(const double& x) { + return x * x; +} + +// Specialization for real types (float) +template<> +__device__ float abs2(const float& x) { + return x * x; +} + +// Specialization for complex float +template<> +__device__ float abs2(const thrust::complex& x) { + return x.real() * x.real() + x.imag() * x.imag(); +} + +// Specialization for complex double +template<> +__device__ double abs2(const thrust::complex& x) { + return x.real() * x.real() + x.imag() * x.imag(); +} + +// Specialization for std::complex (for interface compatibility) +template<> +__device__ float abs2(const std::complex& x) { + const thrust::complex* tx = reinterpret_cast*>(&x); + return tx->real() * tx->real() + tx->imag() * tx->imag(); +} + +// Specialization for std::complex (for interface compatibility) +template<> +__device__ double abs2(const std::complex& x) { + const thrust::complex* tx = reinterpret_cast*>(&x); + return tx->real() * tx->real() + tx->imag() * tx->imag(); +} + + namespace hsolver { template @@ -943,7 +987,309 @@ void matrixSetToAnother, base_device::DEVICE_GPU>::operator hipCheckOnDebug(); } +template +__launch_bounds__(256) +__global__ void apply_eigenvalues_kernel( + const typename GetTypeReal::type* eigenvalues, + const T* vectors, + T* result, + const int nbase, + const int nbase_x, + const int notconv) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + const int total_elements = notconv * nbase; + if (i < total_elements) + { + const int row = i / nbase; + const int col = i % nbase; + result[row * nbase_x + col] = eigenvalues[row] * vectors[row * nbase_x + col]; + } +} + +template +inline void apply_eigenvalues_complex_wrapper(const base_device::DEVICE_GPU* d, + const int& nbase, + const int& nbase_x, + const int& notconv, + const FPTYPE* eigenvalues, + const std::complex* vectors, + std::complex* result) +{ + thrust::complex* result_tmp = reinterpret_cast*>(result); + const thrust::complex* vectors_tmp = reinterpret_cast*>(vectors); + const int total_elements = notconv * nbase; + int thread = 256; + int block = (total_elements + thread - 1) / thread; + hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_eigenvalues_kernel>), dim3(block), dim3(thread), 0, 0, + eigenvalues, vectors_tmp, result_tmp, nbase, nbase_x, notconv); + + hipCheckOnDebug(); +} + +template <> +void apply_eigenvalues_op::operator()(const base_device::DEVICE_GPU* d, + const int& nbase, + const int& nbase_x, + const int& notconv, + double* result, + const double* vectors, + const double* eigenvalues) +{ + const int total_elements = notconv * nbase; + int thread = 256; + int block = (total_elements + thread - 1) / thread; + hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_eigenvalues_kernel), dim3(block), dim3(thread), 0, 0, + eigenvalues, vectors, result, nbase, nbase_x, notconv); + + hipCheckOnDebug(); +} +template <> +void apply_eigenvalues_op, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* d, + const int& nbase, + const int& nbase_x, + const int& notconv, + std::complex* result, + const std::complex* vectors, + const float* eigenvalues) +{ + apply_eigenvalues_complex_wrapper(d, nbase, nbase_x, notconv, eigenvalues, vectors, result); +} + +template <> +void apply_eigenvalues_op, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* d, + const int& nbase, + const int& nbase_x, + const int& notconv, + std::complex* result, + const std::complex* vectors, + const double* eigenvalues) +{ + apply_eigenvalues_complex_wrapper(d, nbase, nbase_x, notconv, eigenvalues, vectors, result); +} + +template +__launch_bounds__(256) +__global__ void precondition_kernel( + const int dim, + const int notconv, + T* psi_iter, + const int nbase, + const typename GetTypeReal::type* precondition, // Real type + const typename GetTypeReal::type* eigenvalues) +{ + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < dim * notconv) { + const int i = tid % dim; // Basis index + const int m = tid / dim; // Band index + + using Real = typename GetTypeReal::type; + Real x = abs(precondition[i] - eigenvalues[m]); + Real pre = 0.5 * (1.0 + x + sqrt(1.0 + (x - 1.0) * (x - 1.0))); + psi_iter[(nbase + m) * dim + i] /= pre; + } +} + +template <> +void precondition_op::operator()( + const base_device::DEVICE_GPU* ctx, + const int& dim, + double* psi_iter, + const int& nbase, + const int& notconv, + const double* precondition, + const double* eigenvalues) +{ + const int total_elements = dim * notconv; + int thread = 256; + int block = (total_elements + thread - 1) / thread; + + hipLaunchKernelGGL(HIP_KERNEL_NAME(precondition_kernel), dim3(block), dim3(thread), 0, 0, + dim, notconv, psi_iter, nbase, precondition, eigenvalues); + + hipCheckOnDebug(); +} + +template <> +void precondition_op, base_device::DEVICE_GPU>::operator()( + const base_device::DEVICE_GPU* ctx, + const int& dim, + std::complex* psi_iter, + const int& nbase, + const int& notconv, + const float* precondition, + const float* eigenvalues) +{ + const int total_elements = dim * notconv; + int thread = 256; + int block = (total_elements + thread - 1) / thread; + + hipLaunchKernelGGL(HIP_KERNEL_NAME(precondition_kernel>), dim3(block), dim3(thread), 0, 0, + dim, notconv, reinterpret_cast*>(psi_iter), nbase, precondition, eigenvalues); + + hipCheckOnDebug(); +} + +template <> +void precondition_op, base_device::DEVICE_GPU>::operator()( + const base_device::DEVICE_GPU* ctx, + const int& dim, + std::complex* psi_iter, + const int& nbase, + const int& notconv, + const double* precondition, + const double* eigenvalues) +{ + const int total_elements = dim * notconv; + int thread = 256; + int block = (total_elements + thread - 1) / thread; + + hipLaunchKernelGGL(HIP_KERNEL_NAME(precondition_kernel>), dim3(block), dim3(thread), 0, 0, + dim, notconv, reinterpret_cast*>(psi_iter), nbase, precondition, eigenvalues); + + hipCheckOnDebug(); +} + +template +__launch_bounds__(256) +__global__ void calculate_norm_kernel( + const int dim, + const int notconv, + const T* psi_iter, + const int nbase, + typename GetTypeReal::type* psi_norm) +{ + using Real = typename GetTypeReal::type; + __shared__ Real shared_mem[256]; + + const int tid = threadIdx.x; + const int bid = blockIdx.x; + const int m = bid / ((dim + blockDim.x - 1) / blockDim.x); + const int num_blocks_per_band = (dim + blockDim.x - 1) / blockDim.x; + const int block_in_band = bid % num_blocks_per_band; + + if (m >= notconv) return; + + // Initialize shared memory + shared_mem[tid] = 0.0; + + // Each thread processes one element + const int i = tid + block_in_band * blockDim.x; + if (i < dim) { + shared_mem[tid] = abs2(psi_iter[(nbase + m) * dim + i]); + } + __syncthreads(); + + // Parallel reduction in shared memory + for (int stride = blockDim.x/2; stride > 0; stride >>= 1) { + if (tid < stride) { + shared_mem[tid] += shared_mem[tid + stride]; + } + __syncthreads(); + } + + // Write result for this block to global memory + if (tid == 0) { + atomicAdd(&psi_norm[m], shared_mem[0]); + } +} + +template +__launch_bounds__(256) +__global__ void normalize_vectors_kernel( + const int dim, + const int notconv, + T* psi_iter, + const int nbase, + const typename GetTypeReal::type* psi_norm) +{ + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + const int m = tid / dim; + const int i = tid % dim; + + if (m < notconv && i < dim) { + psi_iter[(nbase + m) * dim + i] = psi_iter[(nbase + m) * dim + i] / sqrt(psi_norm[m]); + } +} + +template <> +void normalize_op::operator()( + const base_device::DEVICE_GPU* ctx, + const int& dim, + double* psi_iter, + const int& nbase, + const int& notconv, + double* psi_norm) +{ + const int thread = 256; + const int numBlocksForNorm = ((dim + thread - 1) / thread) * notconv; + + // First kernel: calculate norms + hipLaunchKernelGGL(HIP_KERNEL_NAME(calculate_norm_kernel), dim3(numBlocksForNorm), dim3(thread), 0, 0, + dim, notconv, psi_iter, nbase, psi_norm); + hipDeviceSynchronize(); // Ensure all norms are computed + + // Second kernel: normalize vectors + const int total_elements = dim * notconv; + const int numBlocksForNormalize = (total_elements + thread - 1) / thread; + hipLaunchKernelGGL(HIP_KERNEL_NAME(normalize_vectors_kernel), dim3(numBlocksForNormalize), dim3(thread), 0, 0, + dim, notconv, psi_iter, nbase, psi_norm); + + hipCheckOnDebug(); +} + +template <> +void normalize_op, base_device::DEVICE_GPU>::operator()( + const base_device::DEVICE_GPU* ctx, + const int& dim, + std::complex* psi_iter, + const int& nbase, + const int& notconv, + float* psi_norm) +{ + const int thread = 256; + const int numBlocksForNorm = ((dim + thread - 1) / thread) * notconv; + + // First kernel: calculate norms + hipLaunchKernelGGL(HIP_KERNEL_NAME(calculate_norm_kernel>), dim3(numBlocksForNorm), dim3(thread), 0, 0, + dim, notconv, reinterpret_cast*>(psi_iter), nbase, psi_norm); + hipDeviceSynchronize(); // Ensure all norms are computed + + // Second kernel: normalize vectors + const int total_elements = dim * notconv; + const int numBlocksForNormalize = (total_elements + thread - 1) / thread; + hipLaunchKernelGGL(HIP_KERNEL_NAME(normalize_vectors_kernel>), dim3(numBlocksForNormalize), dim3(thread), 0, 0, + dim, notconv, reinterpret_cast*>(psi_iter), nbase, psi_norm); + + hipCheckOnDebug(); +} + +template <> +void normalize_op, base_device::DEVICE_GPU>::operator()( + const base_device::DEVICE_GPU* ctx, + const int& dim, + std::complex* psi_iter, + const int& nbase, + const int& notconv, + double* psi_norm) +{ + const int thread = 256; + const int numBlocksForNorm = ((dim + thread - 1) / thread) * notconv; + + // First kernel: calculate norms + hipLaunchKernelGGL(HIP_KERNEL_NAME(calculate_norm_kernel>), dim3(numBlocksForNorm), dim3(thread), 0, 0, + dim, notconv, reinterpret_cast*>(psi_iter), nbase, psi_norm); + hipDeviceSynchronize(); // Ensure all norms are computed + + // Second kernel: normalize vectors + const int total_elements = dim * notconv; + const int numBlocksForNormalize = (total_elements + thread - 1) / thread; + hipLaunchKernelGGL(HIP_KERNEL_NAME(normalize_vectors_kernel>), dim3(numBlocksForNormalize), dim3(thread), 0, 0, + dim, notconv, reinterpret_cast*>(psi_iter), nbase, psi_norm); + + hipCheckOnDebug(); +} // Explicitly instantiate functors for the types of functor registered. template struct dot_real_op, base_device::DEVICE_GPU>; @@ -951,6 +1297,9 @@ template struct calc_grad_with_block_op, base_device::DEVICE template struct line_minimize_with_block_op, base_device::DEVICE_GPU>; template struct vector_div_constant_op, base_device::DEVICE_GPU>; template struct vector_mul_vector_op, base_device::DEVICE_GPU>; +template struct apply_eigenvalues_op, base_device::DEVICE_GPU>; +template struct precondition_op, base_device::DEVICE_GPU>; +template struct normalize_op, base_device::DEVICE_GPU>; template struct vector_div_vector_op, base_device::DEVICE_GPU>; template struct constantvector_addORsub_constantVector_op, base_device::DEVICE_GPU>; template struct matrixSetToAnother, base_device::DEVICE_GPU>; @@ -960,6 +1309,9 @@ template struct calc_grad_with_block_op, base_device::DEVIC template struct line_minimize_with_block_op, base_device::DEVICE_GPU>; template struct vector_div_constant_op, base_device::DEVICE_GPU>; template struct vector_mul_vector_op, base_device::DEVICE_GPU>; +template struct apply_eigenvalues_op, base_device::DEVICE_GPU>; +template struct precondition_op, base_device::DEVICE_GPU>; +template struct normalize_op, base_device::DEVICE_GPU>; template struct vector_div_vector_op, base_device::DEVICE_GPU>; template struct constantvector_addORsub_constantVector_op, base_device::DEVICE_GPU>; template struct matrixSetToAnother, base_device::DEVICE_GPU>; @@ -968,8 +1320,11 @@ template struct matrixSetToAnother, base_device::DEVICE_GPU template struct dot_real_op; template struct vector_div_constant_op; template struct vector_mul_vector_op; +template struct apply_eigenvalues_op; +template struct precondition_op; template struct vector_div_vector_op; template struct matrixSetToAnother; template struct constantvector_addORsub_constantVector_op; +template struct normalize_op; #endif } // namespace hsolver diff --git a/source/module_hsolver/test/test_hsolver_pw.cpp b/source/module_hsolver/test/test_hsolver_pw.cpp index 5763f1b7d8..1345dfc99e 100644 --- a/source/module_hsolver/test/test_hsolver_pw.cpp +++ b/source/module_hsolver/test/test_hsolver_pw.cpp @@ -14,6 +14,68 @@ #include "module_hsolver/hsolver_pw.h" #undef private #undef protected + +// Mock implementations for the template functions causing linking errors +namespace ModulePW { + // Mock implementation for recip_to_real + template + void PW_Basis_K::recip_to_real(const Device* ctx, + const std::complex* in, + std::complex* out, + const int ik, + const bool add, + const FPTYPE factor) const + { + // Simple mock implementation that does nothing + // In a real test, you might want to implement behavior that simulates the actual function + } + + // Mock implementation for real_to_recip + template + void PW_Basis_K::real_to_recip(const Device* ctx, + const std::complex* in, + std::complex* out, + const int ik, + const bool add, + const FPTYPE factor) const + { + // Simple mock implementation that does nothing + } + + // Explicit template instantiations + template void PW_Basis_K::recip_to_real( + const base_device::DEVICE_CPU* ctx, + const std::complex* in, + std::complex* out, + const int ik, + const bool add, + const float factor) const; + + template void PW_Basis_K::recip_to_real( + const base_device::DEVICE_CPU* ctx, + const std::complex* in, + std::complex* out, + const int ik, + const bool add, + const double factor) const; + + template void PW_Basis_K::real_to_recip( + const base_device::DEVICE_CPU* ctx, + const std::complex* in, + std::complex* out, + const int ik, + const bool add, + const float factor) const; + + template void PW_Basis_K::real_to_recip( + const base_device::DEVICE_CPU* ctx, + const std::complex* in, + std::complex* out, + const int ik, + const bool add, + const double factor) const; +} + /************************************************ * unit test of HSolverPW class ***********************************************/ diff --git a/source/module_hsolver/test/test_hsolver_sdft.cpp b/source/module_hsolver/test/test_hsolver_sdft.cpp index 642ee9daae..235c664a78 100644 --- a/source/module_hsolver/test/test_hsolver_sdft.cpp +++ b/source/module_hsolver/test/test_hsolver_sdft.cpp @@ -133,6 +133,66 @@ void Stochastic_Iter::cal_storho(const UnitCell& ucell, Charge::Charge(){}; Charge::~Charge(){}; +// Mock implementations for the template functions causing linking errors +namespace ModulePW { + // Mock implementation for recip_to_real + template + void PW_Basis_K::recip_to_real(const Device* ctx, + const std::complex* in, + std::complex* out, + const int ik, + const bool add, + const FPTYPE factor) const + { + // Simple mock implementation that does nothing + // In a real test, you might want to implement behavior that simulates the actual function + } + + // Mock implementation for real_to_recip + template + void PW_Basis_K::real_to_recip(const Device* ctx, + const std::complex* in, + std::complex* out, + const int ik, + const bool add, + const FPTYPE factor) const + { + // Simple mock implementation that does nothing + } + + // Explicit template instantiations + template void PW_Basis_K::recip_to_real( + const base_device::DEVICE_CPU* ctx, + const std::complex* in, + std::complex* out, + const int ik, + const bool add, + const float factor) const; + + template void PW_Basis_K::recip_to_real( + const base_device::DEVICE_CPU* ctx, + const std::complex* in, + std::complex* out, + const int ik, + const bool add, + const double factor) const; + + template void PW_Basis_K::real_to_recip( + const base_device::DEVICE_CPU* ctx, + const std::complex* in, + std::complex* out, + const int ik, + const bool add, + const float factor) const; + + template void PW_Basis_K::real_to_recip( + const base_device::DEVICE_CPU* ctx, + const std::complex* in, + std::complex* out, + const int ik, + const bool add, + const double factor) const; +} /************************************************ * unit test of HSolverPW_SDFT class ***********************************************/ diff --git a/source/module_io/read_input_item_elec_stru.cpp b/source/module_io/read_input_item_elec_stru.cpp index 73c741bb5f..19d5b2f1c2 100644 --- a/source/module_io/read_input_item_elec_stru.cpp +++ b/source/module_io/read_input_item_elec_stru.cpp @@ -346,6 +346,26 @@ void ReadInput::item_elec_stru() read_sync_bool(input.diago_smooth_ethr); this->add_item(item); } + { + Input_Item item("use_k_continuity"); + item.annotation = "whether to use k-point continuity for initializing wave functions"; + read_sync_bool(input.use_k_continuity); + item.check_value = [](const Input_Item& item, const Parameter& para) { + if (para.input.use_k_continuity && para.input.basis_type != "pw") { + ModuleBase::WARNING_QUIT("ReadInput", "use_k_continuity only works for PW basis"); + } + if (para.input.use_k_continuity && para.input.calculation == "nscf") { + ModuleBase::WARNING_QUIT("ReadInput", "use_k_continuity cannot work for NSCF calculation"); + } + if (para.input.use_k_continuity && para.input.nspin == 2) { + ModuleBase::WARNING_QUIT("ReadInput", "use_k_continuity cannot work for spin-polarized calculation"); + } + if (para.input.use_k_continuity && para.input.esolver_type == "sdft") { + ModuleBase::WARNING_QUIT("ReadInput", "use_k_continuity cannot work for SDFT calculation"); + } + }; + this->add_item(item); + } { Input_Item item("pw_diag_ndim"); item.annotation = "dimension of workspace for Davidson diagonalization"; diff --git a/source/module_parameter/input_parameter.h b/source/module_parameter/input_parameter.h index 0bb1979318..aa7d31309c 100644 --- a/source/module_parameter/input_parameter.h +++ b/source/module_parameter/input_parameter.h @@ -90,6 +90,7 @@ struct Input_para bool diago_smooth_ethr = false; ///< smooth ethr for iter methods int pw_diag_ndim = 4; ///< dimension of workspace for Davidson diagonalization int diago_cg_prec = 1; ///< mohan add 2012-03-31 + bool use_k_continuity = false; ///< whether to use k-point continuity for initializing wave functions std::string smearing_method = "gauss"; ///< "gauss", ///< "mp","methfessel-paxton" diff --git a/source/module_psi/psi_init.cpp b/source/module_psi/psi_init.cpp index 201539a54f..a3d59d4e7b 100644 --- a/source/module_psi/psi_init.cpp +++ b/source/module_psi/psi_init.cpp @@ -265,6 +265,7 @@ void PSIInit::initialize_psi(Psi>* psi, { for (int ik = 0; ik < this->pw_wfc->nks; ++ik) { + if(PARAM.inp.use_k_continuity && ik > 0) continue; //! Update Hamiltonian from other kpoint to the given one p_hamilt->updateHk(ik); @@ -316,4 +317,4 @@ template class PSIInit, base_device::DEVICE_CPU>; template class PSIInit, base_device::DEVICE_GPU>; template class PSIInit, base_device::DEVICE_GPU>; #endif -} // namespace psi \ No newline at end of file +} // namespace psi