diff --git a/source/module_base/module_device/device.cpp b/source/module_base/module_device/device.cpp
index b20ea9f3ad..9373a5a31a 100644
--- a/source/module_base/module_device/device.cpp
+++ b/source/module_base/module_device/device.cpp
@@ -5,7 +5,7 @@
#include
#include
-
+#include
#ifdef __MPI
#include "mpi.h"
#endif
@@ -166,6 +166,11 @@ int device_count = -1;
cudaGetDeviceCount(&device_count);
#elif defined(__ROCM)
hipGetDeviceCount(&device_count);
+/***auto start_time = std::chrono::high_resolution_clock::now();
+std::cout << "Starting hipGetDeviceCount.." << std::endl;
+auto end_time = std::chrono::high_resolution_clock::now();
+auto duration = std::chrono::duration_cast>(end_time - start_time);
+std::cout << "hipGetDeviceCount took " << duration.count() << "seconds" << std::endl;***/
#endif
if (device_count <= 0)
{
@@ -711,4 +716,4 @@ void record_device_memory(
#endif
} // end of namespace information
-} // end of namespace base_device
\ No newline at end of file
+} // end of namespace base_device
diff --git a/source/module_esolver/esolver.cpp b/source/module_esolver/esolver.cpp
index 0352492a3a..f8985387d9 100644
--- a/source/module_esolver/esolver.cpp
+++ b/source/module_esolver/esolver.cpp
@@ -107,13 +107,21 @@ std::string determine_type()
}
if (GlobalV::MY_RANK == 0)
{
- std::cout << " RUNNING WITH DEVICE : " << device_info << " / "
+ /***auto start_time = std::chrono::high_resolution_clock::now();
+ std::cout << "Starting hipGetDeviceInfo..." << std::endl;***/
+ std::cout << " RUNNING WITH DEVICE : " << device_info << " / "
<< base_device::information::get_device_info(PARAM.inp.device) << std::endl;
+ /***auto end_time = std::chrono::high_resolution_clock::now();
+ auto duration = std::chrono::duration_cast>(end_time - start_time);
+ std::cout << "hipGetDeviceInfo took " << duration.count() << " seconds" << std::endl;***/
}
-
+ /*** auto start_time = std::chrono::high_resolution_clock::now();
+ std::cout << "Starting hipGetDeviceInfo..." << std::endl;***/
GlobalV::ofs_running << "\n RUNNING WITH DEVICE : " << device_info << " / "
- << base_device::information::get_device_info(PARAM.inp.device) << std::endl;
-
+ << base_device::information::get_device_info(PARAM.inp.device) << std::endl;
+ /***auto end_time = std::chrono::high_resolution_clock::now();
+ auto duration = std::chrono::duration_cast>(end_time - start_time);
+ std::cout << "hipGetDeviceInfo took " << duration.count() << " seconds" << std::endl;***/
return esolver_type;
}
diff --git a/source/module_esolver/esolver_ks_pw.cpp b/source/module_esolver/esolver_ks_pw.cpp
index 0b303f33e1..664ac186d5 100644
--- a/source/module_esolver/esolver_ks_pw.cpp
+++ b/source/module_esolver/esolver_ks_pw.cpp
@@ -556,7 +556,8 @@ void ESolver_KS_PW::hamilt2density_single(UnitCell& ucell,
hsolver::DiagoIterAssist::SCF_ITER,
hsolver::DiagoIterAssist::PW_DIAG_NMAX,
hsolver::DiagoIterAssist::PW_DIAG_THR,
- hsolver::DiagoIterAssist::need_subspace);
+ hsolver::DiagoIterAssist::need_subspace,
+ PARAM.inp.use_k_continuity);
hsolver_pw_obj.solve(this->p_hamilt,
this->kspw_psi[0],
diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
index df29f88989..4cade5d1ac 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
@@ -15,6 +15,7 @@
#include "module_hamilt_general/module_ewald/H_Ewald_pw.h"
#include "module_hamilt_general/module_surchem/surchem.h"
#include "module_hamilt_general/module_vdw/vdw.h"
+#include "kernels/force_op.h"
#ifdef _OPENMP
#include
@@ -531,31 +532,110 @@ void Forces::cal_force_loc(const UnitCell& ucell,
// to G space. maybe need fftw with OpenMP
rho_basis->real2recip(aux, aux);
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif
- for (int iat = 0; iat < this->nat; ++iat)
- {
- // read `it` `ia` from the table
+ // sincos op for G space
+
+
+ // data preparation
+ std::vector tau_flat(this->nat * 3);
+ std::vector gcar_flat(rho_basis->npw * 3);
+
+
+ for (int iat = 0; iat < this->nat; iat++) {
+ int it = ucell.iat2it[iat];
+ int ia = ucell.iat2ia[iat];
+
+ tau_flat[iat * 3 + 0] = static_cast(ucell.atoms[it].tau[ia][0]);
+ tau_flat[iat * 3 + 1] = static_cast(ucell.atoms[it].tau[ia][1]);
+ tau_flat[iat * 3 + 2] = static_cast(ucell.atoms[it].tau[ia][2]);
+ }
+
+ for (int ig = 0; ig < rho_basis->npw; ig++) {
+ gcar_flat[ig * 3 + 0] = static_cast(rho_basis->gcar[ig][0]);
+ gcar_flat[ig * 3 + 1] = static_cast(rho_basis->gcar[ig][1]);
+ gcar_flat[ig * 3 + 2] = static_cast(rho_basis->gcar[ig][2]);
+ }
+
+    // Tabulate the local-pseudopotential factor for every (atom, G-vector) pair.
+    // The table is addressed as [iat * npw + ig], so it needs nat * npw entries;
+    // sizing it with ntype overruns the buffer as soon as nat > ntype.
+    // The element count is computed in size_t so that nat * npw cannot overflow int.
+    const size_t vloc_stride = static_cast<size_t>(rho_basis->npw);
+    const size_t vloc_count = static_cast<size_t>(this->nat) * vloc_stride;
+    std::vector<FPTYPE> vloc_per_type_host(vloc_count);
+    for (int iat = 0; iat < this->nat; iat++) {
int it = ucell.iat2it[iat];
-        int ia = ucell.iat2ia[iat];
-        for (int ig = 0; ig < rho_basis->npw; ig++)
-        {
-            const double phase = ModuleBase::TWO_PI * (rho_basis->gcar[ig] * ucell.atoms[it].tau[ia]);
-            double sinp, cosp;
-            ModuleBase::libm::sincos(phase, &sinp, &cosp);
-            const double factor
-                = vloc(it, rho_basis->ig2igg[ig]) * (cosp * aux[ig].imag() + sinp * aux[ig].real());
-            forcelc(iat, 0) += rho_basis->gcar[ig][0] * factor;
-            forcelc(iat, 1) += rho_basis->gcar[ig][1] * factor;
-            forcelc(iat, 2) += rho_basis->gcar[ig][2] * factor;
+        for (int ig = 0; ig < rho_basis->npw; ig++) {
+            vloc_per_type_host[iat * vloc_stride + ig] = static_cast<FPTYPE>(vloc(it, rho_basis->ig2igg[ig]));
}
-        forcelc(iat, 0) *= (ucell.tpiba * ucell.omega);
-        forcelc(iat, 1) *= (ucell.tpiba * ucell.omega);
-        forcelc(iat, 2) *= (ucell.tpiba * ucell.omega);
+    }
+
+    // Copy aux into a buffer that uses the operator's complex type.
+    std::vector<std::complex<FPTYPE>> aux_fptype(rho_basis->npw);
+    for (int ig = 0; ig < rho_basis->npw; ig++) {
+        aux_fptype[ig] = static_cast<std::complex<FPTYPE>>(aux[ig]);
+    }
+
+    // On CPU the operator reads the host vectors directly; on GPU these
+    // pointers are replaced below by device allocations.
+    FPTYPE* d_gcar = gcar_flat.data();
+    FPTYPE* d_tau = tau_flat.data();
+    FPTYPE* d_vloc_per_type = vloc_per_type_host.data();
+    std::complex<FPTYPE>* d_aux = aux_fptype.data();
+    FPTYPE* d_force = nullptr;
+    std::vector<FPTYPE> force_host(this->nat * 3);
+
+    if (this->device == base_device::GpuDevice)
+    {
+        d_gcar = nullptr;
+        d_tau = nullptr;
+        d_vloc_per_type = nullptr;
+        d_aux = nullptr;
+
+        resmem_var_op()(this->ctx, d_gcar, rho_basis->npw * 3);
+        resmem_var_op()(this->ctx, d_tau, this->nat * 3);
+        // Device table must match the host table: nat * npw entries.
+        resmem_var_op()(this->ctx, d_vloc_per_type, vloc_count);
+        resmem_complex_op()(this->ctx, d_aux, rho_basis->npw);
+        resmem_var_op()(this->ctx, d_force, this->nat * 3);
+
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_gcar, gcar_flat.data(), rho_basis->npw * 3);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_tau, tau_flat.data(), this->nat * 3);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_vloc_per_type, vloc_per_type_host.data(), vloc_count);
+        syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_aux, aux_fptype.data(), rho_basis->npw);
+
+        // The GPU kernel accumulates with atomicAdd, so the output must start at zero.
+        base_device::memory::set_memory_op<FPTYPE, Device>()(this->ctx, d_force, 0.0, this->nat * 3);
+    }
+    else
+    {
+        d_force = force_host.data();
+        std::fill(force_host.begin(), force_host.end(), static_cast<FPTYPE>(0.0));
+    }
+
+ const FPTYPE scale_factor = static_cast(ucell.tpiba * ucell.omega);
+
+ // call op for sincos calculation
+ hamilt::cal_force_loc_sincos_op()(
+ this->ctx,
+ this->nat,
+ rho_basis->npw,
+ ucell.ntype,
+ d_gcar,
+ d_tau,
+ d_vloc_per_type,
+ d_aux,
+ scale_factor,
+ d_force
+ );
+
+ if (this->device == base_device::GpuDevice)
+ {
+ syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, force_host.data(), d_force, this->nat * 3);
+
+ delmem_var_op()(this->ctx, d_gcar);
+ delmem_var_op()(this->ctx, d_tau);
+ delmem_var_op()(this->ctx, d_vloc_per_type);
+ delmem_complex_op()(this->ctx, d_aux);
+ delmem_var_op()(this->ctx, d_force);
+ }
+
+ for (int iat = 0; iat < this->nat; iat++) {
+ forcelc(iat, 0) = static_cast(force_host[iat * 3 + 0]);
+ forcelc(iat, 1) = static_cast(force_host[iat * 3 + 1]);
+ forcelc(iat, 2) = static_cast(force_host[iat * 3 + 2]);
}
- // this->print(GlobalV::ofs_running, "local forces", forcelc);
+    // this->print(GlobalV::ofs_running, "local forces", forcelc);
Parallel_Reduce::reduce_pool(forcelc.c, forcelc.nr * forcelc.nc);
delete[] aux;
ModuleBase::timer::tick("Forces", "cal_force_loc");
@@ -665,6 +745,119 @@ void Forces::cal_force_ew(const UnitCell& ucell,
aux[rho_basis->ig_gge0] = std::complex(0.0, 0.0);
}
+ // sincos op for cal_force_ew
+
+ std::vector it_facts_host(this->nat);
+ std::vector tau_flat(this->nat * 3);
+
+    // Build the per-atom prefactor and the flattened atomic positions,
+    // resolving type/index-in-type through the iat lookup tables.
+    for (int iat = 0; iat < this->nat; iat++) {
+        int it = ucell.iat2it[iat];
+        int ia = ucell.iat2ia[iat];
+
+        // Initialised so that a run with use_paw enabled in a build without
+        // USE_PAW does not read an indeterminate value.
+        double zv = 0.0;
+        if (PARAM.inp.use_paw)
+        {
+#ifdef USE_PAW
+            zv = GlobalC::paw_cell.get_val(it);
+#endif
+        }
+        else
+        {
+            zv = ucell.atoms[it].ncpp.zv;
+        }
+
+        it_facts_host[iat] = static_cast<FPTYPE>(zv * ModuleBase::e2 * ucell.tpiba *
+                                                ModuleBase::TWO_PI / ucell.omega * fact);
+
+        tau_flat[iat * 3 + 0] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][0]);
+        tau_flat[iat * 3 + 1] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][1]);
+        tau_flat[iat * 3 + 2] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][2]);
+    }
+
+ std::vector gcar_flat(rho_basis->npw * 3);
+ for (int ig = 0; ig < rho_basis->npw; ig++) {
+ gcar_flat[ig * 3 + 0] = static_cast(rho_basis->gcar[ig][0]);
+ gcar_flat[ig * 3 + 1] = static_cast(rho_basis->gcar[ig][1]);
+ gcar_flat[ig * 3 + 2] = static_cast(rho_basis->gcar[ig][2]);
+ }
+
+ std::vector> aux_fptype(rho_basis->npw);
+ for (int ig = 0; ig < rho_basis->npw; ig++) {
+ aux_fptype[ig] = static_cast>(aux[ig]);
+ }
+
+ FPTYPE* d_gcar = gcar_flat.data();
+ FPTYPE* d_tau = tau_flat.data();
+ FPTYPE* d_it_facts = it_facts_host.data();
+ std::complex* d_aux = aux_fptype.data();
+ FPTYPE* d_force_g = nullptr;
+ std::vector force_g_host(this->nat * 3);
+
+ if (this->device == base_device::GpuDevice)
+ {
+ d_gcar = nullptr;
+ d_tau = nullptr;
+ d_it_facts = nullptr;
+ d_aux = nullptr;
+
+ resmem_var_op()(this->ctx, d_gcar, rho_basis->npw * 3);
+ resmem_var_op()(this->ctx, d_tau, this->nat * 3);
+ resmem_var_op()(this->ctx, d_it_facts, this->nat);
+ resmem_complex_op()(this->ctx, d_aux, rho_basis->npw);
+ resmem_var_op()(this->ctx, d_force_g, this->nat * 3);
+
+
+ syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_gcar, gcar_flat.data(), rho_basis->npw * 3);
+ syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_tau, tau_flat.data(), this->nat * 3);
+ syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_it_facts, it_facts_host.data(), this->nat);
+ syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_aux, aux_fptype.data(), rho_basis->npw);
+
+
+ base_device::memory::set_memory_op()(this->ctx, d_force_g, 0.0, this->nat * 3);
+ }
+ else
+ {
+ d_force_g = force_g_host.data();
+ std::fill(force_g_host.begin(), force_g_host.end(), static_cast(0.0));
+ }
+
+ // call op for sincos calculation
+ hamilt::cal_force_ew_sincos_op()(
+ this->ctx,
+ this->nat,
+ rho_basis->npw,
+ rho_basis->ig_gge0,
+ d_gcar,
+ d_tau,
+ d_it_facts,
+ d_aux,
+ d_force_g
+ );
+
+
+ if (this->device == base_device::GpuDevice)
+ {
+
+ syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, force_g_host.data(), d_force_g, this->nat * 3);
+
+
+ delmem_var_op()(this->ctx, d_gcar);
+ delmem_var_op()(this->ctx, d_tau);
+ delmem_var_op()(this->ctx, d_it_facts);
+ delmem_complex_op()(this->ctx, d_aux);
+ delmem_var_op()(this->ctx, d_force_g);
+ }
+
+
+ for (int iat = 0; iat < this->nat; iat++) {
+ forceion(iat, 0) += static_cast(force_g_host[iat * 3 + 0]);
+ forceion(iat, 1) += static_cast(force_g_host[iat * 3 + 1]);
+ forceion(iat, 2) += static_cast(force_g_host[iat * 3 + 2]);
+ }
+
+
+// calculate real space force
#ifdef _OPENMP
#pragma omp parallel
{
@@ -688,66 +881,7 @@ void Forces::cal_force_ew(const UnitCell& ucell,
iat_end = iat_beg + iat_end;
ucell.iat2iait(iat_beg, &ia_beg, &it_beg);
- int iat = iat_beg;
- int it = it_beg;
- int ia = ia_beg;
-
- // preprocess ig_gap for skipping the ig point
- int ig_gap = (rho_basis->ig_gge0 >= 0 && rho_basis->ig_gge0 < rho_basis->npw) ? rho_basis->ig_gge0 : -1;
-
- double it_fact = 0.;
- int last_it = -1;
-
- // iterating atoms
- while (iat < iat_end)
- {
- if (it != last_it)
- { // calculate it_tact when it is changed
- double zv;
- if (PARAM.inp.use_paw)
- {
-#ifdef USE_PAW
- zv = GlobalC::paw_cell.get_val(it);
-#endif
- }
- else
- {
- zv = ucell.atoms[it].ncpp.zv;
- }
- it_fact = zv * ModuleBase::e2 * ucell.tpiba * ModuleBase::TWO_PI / ucell.omega * fact;
- last_it = it;
- }
-
- if (ucell.atoms[it].na != 0)
- {
- const auto ig_loop = [&](int ig_beg, int ig_end) {
- for (int ig = ig_beg; ig < ig_end; ig++)
- {
- const ModuleBase::Vector3 gcar = rho_basis->gcar[ig];
- const double arg = ModuleBase::TWO_PI * (gcar * ucell.atoms[it].tau[ia]);
- double sinp, cosp;
- ModuleBase::libm::sincos(arg, &sinp, &cosp);
- double sumnb = -cosp * aux[ig].imag() + sinp * aux[ig].real();
- forceion(iat, 0) += gcar[0] * sumnb;
- forceion(iat, 1) += gcar[1] * sumnb;
- forceion(iat, 2) += gcar[2] * sumnb;
- }
- };
-
- // skip ig_gge0 point by separating ig loop into two part
- ig_loop(0, ig_gap);
- ig_loop(ig_gap + 1, rho_basis->npw);
-
- forceion(iat, 0) *= it_fact;
- forceion(iat, 1) *= it_fact;
- forceion(iat, 2) *= it_fact;
-
- ++iat;
- ucell.step_iait(&ia, &it);
- }
- }
-
- // means that the processor contains G=0 term.
+
if (rho_basis->ig_gge0 >= 0)
{
double rmax = 5.0 / (sqrt(alpha) * ucell.lat0);
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
index 5d0656d105..5ff3ddf2b1 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
@@ -13,6 +13,122 @@
namespace hamilt {
+// CUDA kernels for sincos loops
+/// Accumulate the local-pseudopotential force of one atom per blockIdx.y.
+///
+/// Threads stride over the G-vectors, keep a private partial sum and add it
+/// (scaled by scale_factor) into force[iat * 3 + {0,1,2}] with atomicAdd.
+/// The kernel only adds to force, so the caller must zero it beforehand.
+/// vloc_per_type is addressed as [iat * npw + ig]; ntype is not read here.
+template <typename FPTYPE>
+__global__ void cal_force_loc_sincos_kernel(
+    const int nat,
+    const int npw,
+    const int ntype,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* vloc_per_type,
+    const thrust::complex<FPTYPE>* aux,
+    const FPTYPE scale_factor,
+    FPTYPE* force)
+{
+    const FPTYPE TWO_PI = 2.0 * M_PI;
+
+    const int iat = blockIdx.y;
+    const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
+    const int ig_stride = gridDim.x * blockDim.x;
+
+    if (iat >= nat) return;
+
+    // Load atom information to registers
+    const FPTYPE tau_x = tau[iat * 3 + 0];
+    const FPTYPE tau_y = tau[iat * 3 + 1];
+    const FPTYPE tau_z = tau[iat * 3 + 2];
+
+    // Row of the vloc table belonging to this atom. The offset is computed in
+    // size_t because iat * npw can exceed INT_MAX for large systems.
+    const FPTYPE* vloc_row = vloc_per_type + static_cast<size_t>(iat) * static_cast<size_t>(npw);
+
+    // Local accumulation variables
+    FPTYPE local_force_x = 0.0;
+    FPTYPE local_force_y = 0.0;
+    FPTYPE local_force_z = 0.0;
+
+    // Grid-stride loop over G-vectors
+    for (int ig = ig_start; ig < npw; ig += ig_stride) {
+        // Calculate phase: 2π * (G · τ)
+        const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x +
+                                       gcar[ig * 3 + 1] * tau_y +
+                                       gcar[ig * 3 + 2] * tau_z);
+
+        // sin and cos of the phase in a single call
+        FPTYPE sinp, cosp;
+        sincos(phase, &sinp, &cosp);
+
+        // Calculate force factor
+        const FPTYPE vloc_factor = vloc_row[ig];
+        const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real());
+
+        // Accumulate force contributions
+        local_force_x += gcar[ig * 3 + 0] * factor;
+        local_force_y += gcar[ig * 3 + 1] * factor;
+        local_force_z += gcar[ig * 3 + 2] * factor;
+    }
+
+    // Atomic add to global memory; the scale is applied once per thread
+    atomicAdd(&force[iat * 3 + 0], local_force_x * scale_factor);
+    atomicAdd(&force[iat * 3 + 1], local_force_y * scale_factor);
+    atomicAdd(&force[iat * 3 + 2], local_force_z * scale_factor);
+}
+
+// Accumulates the G-space sum for one atom per blockIdx.y: threads stride over
+// the G-vectors, skip the index ig_gge0, keep a private partial sum and add it
+// into force[iat * 3 + {0,1,2}] with atomicAdd. The kernel only adds to force,
+// so the caller is expected to have zeroed it before the launch.
+template 
+__global__ void cal_force_ew_sincos_kernel(
+    const int nat,
+    const int npw,
+    const int ig_gge0,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* it_facts,
+    const thrust::complex* aux,
+    FPTYPE* force)
+{
+    const FPTYPE TWO_PI = 2.0 * M_PI;
+    
+    const int iat = blockIdx.y;
+    const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
+    const int ig_stride = gridDim.x * blockDim.x;
+    
+    if (iat >= nat) return;
+    
+    // Load atom information to registers
+    const FPTYPE tau_x = tau[iat * 3 + 0];
+    const FPTYPE tau_y = tau[iat * 3 + 1];
+    const FPTYPE tau_z = tau[iat * 3 + 2];
+    const FPTYPE it_fact = it_facts[iat];
+    
+    // Local accumulation variables
+    FPTYPE local_force_x = 0.0;
+    FPTYPE local_force_y = 0.0;
+    FPTYPE local_force_z = 0.0;
+    
+    // Grid-stride loop over G-vectors
+    for (int ig = ig_start; ig < npw; ig += ig_stride) {
+        // Skip G=0 term
+        if (ig == ig_gge0) continue;
+        
+        // Calculate phase: 2π * (G · τ)
+        const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + 
+                                       gcar[ig * 3 + 1] * tau_y + 
+                                       gcar[ig * 3 + 2] * tau_z);
+        
+        // sin and cos of the phase in a single call
+        FPTYPE sinp, cosp;
+        sincos(phase, &sinp, &cosp);
+        
+        // Per-G contribution: it_fact * (-cos * Im(aux) + sin * Re(aux))
+        const FPTYPE factor = it_fact * (-cosp * aux[ig].imag() + sinp * aux[ig].real());
+        
+        // Accumulate force contributions
+        local_force_x += gcar[ig * 3 + 0] * factor;
+        local_force_y += gcar[ig * 3 + 1] * factor;
+        local_force_z += gcar[ig * 3 + 2] * factor;
+    }
+    
+    // Atomic add to global memory
+    atomicAdd(&force[iat * 3 + 0], local_force_x);
+    atomicAdd(&force[iat * 3 + 1], local_force_y);
+    atomicAdd(&force[iat * 3 + 2], local_force_z);
+}
template
__global__ void cal_vkb1_nl(
@@ -188,6 +304,65 @@ void cal_force_nl_op::operator()(const base_dev
cudaCheckOnDebug();
}
+// GPU operators
+/// Launch cal_force_loc_sincos_kernel on a (blocks_x, nat) grid.
+/// All pointer arguments must be device pointers and force must already be
+/// zeroed, because the kernel accumulates into it.
+template <typename FPTYPE>
+void cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& nat,
+    const int& npw,
+    const int& ntype,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* vloc_per_type,
+    const std::complex<FPTYPE>* aux,
+    const FPTYPE& scale_factor,
+    FPTYPE* force)
+{
+    // A grid with a zero dimension is an invalid launch configuration, and
+    // there is nothing to accumulate anyway.
+    if (nat <= 0 || npw <= 0)
+    {
+        return;
+    }
+
+    // gridDim.x is capped: the kernel walks the remaining G-vectors with a
+    // grid-stride loop, which limits the number of atomicAdd calls per atom.
+    const int threads_per_block = THREADS_PER_BLOCK;
+    const int max_blocks_x = 32;
+    const int blocks_x = std::min(max_blocks_x, (npw + threads_per_block - 1) / threads_per_block);
+
+    dim3 grid(blocks_x, nat);
+    dim3 block(threads_per_block);
+
+    cal_force_loc_sincos_kernel<FPTYPE><<<grid, block>>>(
+        nat, npw, ntype, gcar, tau, vloc_per_type,
+        reinterpret_cast<const thrust::complex<FPTYPE>*>(aux),
+        scale_factor, force
+    );
+
+    cudaCheckOnDebug();
+}
+
+/// Launch cal_force_ew_sincos_kernel on a (blocks_x, nat) grid.
+/// All pointer arguments must be device pointers and force must already be
+/// zeroed, because the kernel accumulates into it.
+template <typename FPTYPE>
+void cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& nat,
+    const int& npw,
+    const int& ig_gge0,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* it_facts,
+    const std::complex<FPTYPE>* aux,
+    FPTYPE* force)
+{
+    // A grid with a zero dimension is an invalid launch configuration, and
+    // there is nothing to accumulate anyway.
+    if (nat <= 0 || npw <= 0)
+    {
+        return;
+    }
+
+    // gridDim.x is capped: the kernel walks the remaining G-vectors with a
+    // grid-stride loop, which limits the number of atomicAdd calls per atom.
+    const int threads_per_block = THREADS_PER_BLOCK;
+    const int max_blocks_x = 32;
+    const int blocks_x = std::min(max_blocks_x, (npw + threads_per_block - 1) / threads_per_block);
+
+    dim3 grid(blocks_x, nat);
+    dim3 block(threads_per_block);
+
+    cal_force_ew_sincos_kernel<FPTYPE><<<grid, block>>>(
+        nat, npw, ig_gge0, gcar, tau, it_facts,
+        reinterpret_cast<const thrust::complex<FPTYPE>*>(aux), force
+    );
+
+    cudaCheckOnDebug();
+}
+
template
__global__ void cal_force_nl(
const int ntype,
@@ -613,8 +788,12 @@ template void saveVkbValues(const int *gcar_zero_ptrs, const std::comple
template struct cal_vkb1_nl_op;
template struct cal_force_nl_op;
+template struct cal_force_loc_sincos_op;
+template struct cal_force_ew_sincos_op;
template struct cal_vkb1_nl_op;
template struct cal_force_nl_op;
+template struct cal_force_loc_sincos_op;
+template struct cal_force_ew_sincos_op;
} // namespace hamilt
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
index 6d797e147d..109321d6c3 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
@@ -1,4 +1,8 @@
#include "module_hamilt_pw/hamilt_pwdft/kernels/force_op.h"
+#include "module_base/libm/libm.h"
+#include "module_base/tool_threading.h"
+#include "module_hamilt_general/module_ewald/H_Ewald_pw.h"
+#include
#ifdef _OPENMP
#include
@@ -424,10 +428,116 @@ struct cal_force_nl_op
}
};
+// CPU implementation of local force sincos operator
+/// CPU implementation of the local-force sincos operator.
+///
+/// For every atom, sums gcar[ig] * vloc[iat, ig] * (cos * Im(aux) + sin * Re(aux))
+/// over all G-vectors and writes the sum, multiplied once by scale_factor, to
+/// force[iat * 3 + {0,1,2}]. vloc_per_type is addressed as [iat * npw + ig];
+/// ntype is not read here. Atoms are distributed over OpenMP threads.
+template <typename FPTYPE>
+struct cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_CPU>
+{
+    void operator()(const base_device::DEVICE_CPU* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ntype,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const FPTYPE* vloc_per_type,
+                    const std::complex<FPTYPE>* aux,
+                    const FPTYPE& scale_factor,
+                    FPTYPE* force)
+    {
+        const FPTYPE TWO_PI = 2.0 * M_PI;
+        
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+        for (int iat = 0; iat < nat; ++iat)
+        {
+            const FPTYPE tau_x = tau[iat * 3 + 0];
+            const FPTYPE tau_y = tau[iat * 3 + 1];
+            const FPTYPE tau_z = tau[iat * 3 + 2];
+
+            // Row of the vloc table for this atom; the offset is formed in
+            // size_t because iat * npw can exceed INT_MAX for large systems.
+            const FPTYPE* vloc_row = vloc_per_type + static_cast<size_t>(iat) * static_cast<size_t>(npw);
+            
+            FPTYPE local_force[3] = {0.0, 0.0, 0.0};
+            
+            for (int ig = 0; ig < npw; ig++)
+            {
+                const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + 
+                                               gcar[ig * 3 + 1] * tau_y + 
+                                               gcar[ig * 3 + 2] * tau_z);
+                FPTYPE sinp, cosp;
+                ModuleBase::libm::sincos(phase, &sinp, &cosp);
+                
+                const FPTYPE factor = vloc_row[ig] * (cosp * aux[ig].imag() + sinp * aux[ig].real());
+                
+                local_force[0] += gcar[ig * 3 + 0] * factor;
+                local_force[1] += gcar[ig * 3 + 1] * factor;
+                local_force[2] += gcar[ig * 3 + 2] * factor;
+            }
+            
+            // Scale once after the accumulation, as the GPU kernels do,
+            // instead of multiplying every term inside the G loop.
+            force[iat * 3 + 0] = local_force[0] * scale_factor;
+            force[iat * 3 + 1] = local_force[1] * scale_factor;
+            force[iat * 3 + 2] = local_force[2] * scale_factor;
+        }
+    }
+};
+
+// CPU implementation of Ewald force sincos operator
+// For every atom, sums gcar[ig] * it_facts[iat] * (-cos * Im(aux) + sin * Re(aux))
+// over all G-vectors except index ig_gge0 and stores the result in
+// force[iat * 3 + {0,1,2}] (the previous content of force is overwritten).
+// Atoms are distributed over OpenMP threads when _OPENMP is defined.
+template 
+struct cal_force_ew_sincos_op
+{
+    void operator()(const base_device::DEVICE_CPU* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ig_gge0,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const FPTYPE* it_facts,
+                    const std::complex* aux,
+                    FPTYPE* force)
+    {
+        const FPTYPE TWO_PI = 2.0 * M_PI;
+        
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+        for (int iat = 0; iat < nat; ++iat)
+        {
+            // Per-atom position and prefactor
+            const FPTYPE tau_x = tau[iat * 3 + 0];
+            const FPTYPE tau_y = tau[iat * 3 + 1];
+            const FPTYPE tau_z = tau[iat * 3 + 2];
+            const FPTYPE it_fact = it_facts[iat];
+            
+            FPTYPE local_force[3] = {0.0, 0.0, 0.0};
+            
+            for (int ig = 0; ig < npw; ig++)
+            {
+                // Skip G=0 term
+                if (ig == ig_gge0) continue;
+                
+                // phase = 2π * (G · τ)
+                const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + 
+                                               gcar[ig * 3 + 1] * tau_y + 
+                                               gcar[ig * 3 + 2] * tau_z);
+                FPTYPE sinp, cosp;
+                ModuleBase::libm::sincos(phase, &sinp, &cosp);
+                
+                const FPTYPE factor = it_fact * (-cosp * aux[ig].imag() + sinp * aux[ig].real());
+                
+                local_force[0] += gcar[ig * 3 + 0] * factor;
+                local_force[1] += gcar[ig * 3 + 1] * factor;
+                local_force[2] += gcar[ig * 3 + 2] * factor;
+            }
+            
+            force[iat * 3 + 0] = local_force[0];
+            force[iat * 3 + 1] = local_force[1];
+            force[iat * 3 + 2] = local_force[2];
+        }
+    }
+};
+
template struct cal_vkb1_nl_op;
template struct cal_force_nl_op;
-
+template struct cal_force_loc_sincos_op;
+template struct cal_force_ew_sincos_op;
template struct cal_vkb1_nl_op;
template struct cal_force_nl_op;
-
+template struct cal_force_loc_sincos_op;
+template struct cal_force_ew_sincos_op;
} // namespace hamilt
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
index 3aa5d4f87e..acf490b278 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
@@ -146,6 +146,64 @@ struct cal_force_nl_op
const FPTYPE* lambda,
const std::complex* becp,
const std::complex* dbecp,
+ FPTYPE* force);
+};
+
+template 
+struct cal_force_loc_sincos_op
+{
+    /// @brief Calculate local pseudopotential forces (sincos loop only)
+    ///
+    /// Input Parameters
+    /// @param ctx - which device this function runs on
+    /// @param nat - number of atoms
+    /// @param npw - number of plane waves
+    /// @param ntype - number of atom types (not read by the CPU/CUDA/HIP implementations in this change)
+    /// @param gcar - G-vector Cartesian coordinates [npw * 3]
+    /// @param tau - atomic positions [nat * 3]
+    /// @param vloc_per_type - precomputed vloc factors per atom [nat * npw], addressed as [iat * npw + ig]
+    /// @param aux - charge density in G-space [npw]
+    /// @param scale_factor - tpiba * omega
+    ///
+    /// Output Parameters
+    /// @param force - output forces [nat * 3]; the GPU kernels accumulate into it, so it must be zeroed by the caller
+    void operator()(const Device* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ntype,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const FPTYPE* vloc_per_type,
+                    const std::complex* aux,
+                    const FPTYPE& scale_factor,
+                    FPTYPE* force);
+};
+
+template 
+struct cal_force_ew_sincos_op
+{
+    /// @brief Calculate Ewald forces (sincos loop only)
+    ///
+    /// Input Parameters
+    /// @param ctx - which device this function runs on
+    /// @param nat - number of atoms
+    /// @param npw - number of plane waves
+    /// @param ig_gge0 - index of G=0 vector (-1 if not present); this index is excluded from the sum
+    /// @param gcar - G-vector Cartesian coordinates [npw * 3]
+    /// @param tau - atomic positions [nat * 3]
+    /// @param it_facts - precomputed it_fact for each atom [nat]
+    /// @param aux - structure factor related array [npw]
+    ///
+    /// Output Parameters
+    /// @param force - output forces [nat * 3]; the GPU kernels accumulate into it, so it must be zeroed by the caller
+    void operator()(const Device* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ig_gge0,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const FPTYPE* it_facts,
+                    const std::complex* aux,
FPTYPE* force);
};
@@ -248,6 +306,35 @@ struct cal_force_nl_op
FPTYPE* force);
};
+// Declaration of the local-force sincos operator taking a DEVICE_GPU context.
+// The matching definitions in this change launch cal_force_loc_sincos_kernel
+// (CUDA and HIP sources), so every pointer argument is dereferenced on the device.
+template 
+struct cal_force_loc_sincos_op
+{
+    void operator()(const base_device::DEVICE_GPU* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ntype,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const FPTYPE* vloc_per_type,
+                    const std::complex* aux,
+                    const FPTYPE& scale_factor,
+                    FPTYPE* force);
+};
+
+// Declaration of the Ewald-force sincos operator taking a DEVICE_GPU context.
+// The matching definitions in this change launch cal_force_ew_sincos_kernel
+// (CUDA and HIP sources), so every pointer argument is dereferenced on the device.
+template 
+struct cal_force_ew_sincos_op
+{
+    void operator()(const base_device::DEVICE_GPU* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ig_gge0,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const FPTYPE* it_facts,
+                    const std::complex* aux,
+                    FPTYPE* force);
+};
+
/**
* @brief revert the vkb values for force_nl calculation
*/
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu
index c78b333b86..6bb3a84e7e 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu
@@ -618,10 +618,190 @@ template void revertVkbValues(const int *gcar_zero_ptrs, std::complex(const int *gcar_zero_ptrs, const std::complex *vkb_ptr, std::complex *vkb_save_ptr, int nkb, int gcar_zero_count, int npw, int ipol, int npwx);
+// HIP kernels for sincos loops
+/// Accumulate the local-pseudopotential force of one atom per blockIdx.y.
+///
+/// Threads stride over the G-vectors, keep a private partial sum and add it
+/// (scaled by scale_factor) into force[iat * 3 + {0,1,2}] with atomicAdd.
+/// The kernel only adds to force, so the caller must zero it beforehand.
+/// vloc_per_type is addressed as [iat * npw + ig]; ntype is not read here.
+template <typename FPTYPE>
+__global__ void cal_force_loc_sincos_kernel(
+    const int nat,
+    const int npw,
+    const int ntype,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* vloc_per_type,
+    const thrust::complex<FPTYPE>* aux,
+    const FPTYPE scale_factor,
+    FPTYPE* force)
+{
+    const FPTYPE TWO_PI = 2.0 * M_PI;
+
+    const int iat = blockIdx.y;
+    const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
+    const int ig_stride = gridDim.x * blockDim.x;
+
+    if (iat >= nat) return;
+
+    // Load atom information to registers
+    const FPTYPE tau_x = tau[iat * 3 + 0];
+    const FPTYPE tau_y = tau[iat * 3 + 1];
+    const FPTYPE tau_z = tau[iat * 3 + 2];
+
+    // Row of the vloc table belonging to this atom. The offset is computed in
+    // size_t because iat * npw can exceed INT_MAX for large systems.
+    const FPTYPE* vloc_row = vloc_per_type + static_cast<size_t>(iat) * static_cast<size_t>(npw);
+
+    // Local accumulation variables
+    FPTYPE local_force_x = 0.0;
+    FPTYPE local_force_y = 0.0;
+    FPTYPE local_force_z = 0.0;
+
+    // Grid-stride loop over G-vectors
+    for (int ig = ig_start; ig < npw; ig += ig_stride) {
+        // Calculate phase: 2π * (G · τ)
+        const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x +
+                                       gcar[ig * 3 + 1] * tau_y +
+                                       gcar[ig * 3 + 2] * tau_z);
+
+        // sin and cos of the phase in a single call
+        FPTYPE sinp, cosp;
+        sincos(phase, &sinp, &cosp);
+
+        // Calculate force factor
+        const FPTYPE vloc_factor = vloc_row[ig];
+        const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real());
+
+        // Accumulate force contributions
+        local_force_x += gcar[ig * 3 + 0] * factor;
+        local_force_y += gcar[ig * 3 + 1] * factor;
+        local_force_z += gcar[ig * 3 + 2] * factor;
+    }
+
+    // Atomic add to global memory; the scale is applied once per thread
+    atomicAdd(&force[iat * 3 + 0], local_force_x * scale_factor);
+    atomicAdd(&force[iat * 3 + 1], local_force_y * scale_factor);
+    atomicAdd(&force[iat * 3 + 2], local_force_z * scale_factor);
+}
+
+// Accumulates the G-space sum for one atom per blockIdx.y: threads stride over
+// the G-vectors, skip the index ig_gge0, keep a private partial sum and add it
+// into force[iat * 3 + {0,1,2}] with atomicAdd. The kernel only adds to force,
+// so the caller is expected to have zeroed it before the launch.
+template 
+__global__ void cal_force_ew_sincos_kernel(
+    const int nat,
+    const int npw,
+    const int ig_gge0,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* it_facts,
+    const thrust::complex* aux,
+    FPTYPE* force)
+{
+    const FPTYPE TWO_PI = 2.0 * M_PI;
+    
+    const int iat = blockIdx.y;
+    const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
+    const int ig_stride = gridDim.x * blockDim.x;
+    
+    if (iat >= nat) return;
+    
+    // Load atom information to registers
+    const FPTYPE tau_x = tau[iat * 3 + 0];
+    const FPTYPE tau_y = tau[iat * 3 + 1];
+    const FPTYPE tau_z = tau[iat * 3 + 2];
+    const FPTYPE it_fact = it_facts[iat];
+    
+    // Local accumulation variables
+    FPTYPE local_force_x = 0.0;
+    FPTYPE local_force_y = 0.0;
+    FPTYPE local_force_z = 0.0;
+    
+    // Grid-stride loop over G-vectors
+    for (int ig = ig_start; ig < npw; ig += ig_stride) {
+        // Skip G=0 term
+        if (ig == ig_gge0) continue;
+        
+        // Calculate phase: 2π * (G · τ)
+        const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + 
+                                       gcar[ig * 3 + 1] * tau_y + 
+                                       gcar[ig * 3 + 2] * tau_z);
+        
+        // sin and cos of the phase in a single call
+        FPTYPE sinp, cosp;
+        sincos(phase, &sinp, &cosp);
+        
+        // Per-G contribution: it_fact * (-cos * Im(aux) + sin * Re(aux))
+        const FPTYPE factor = it_fact * (-cosp * aux[ig].imag() + sinp * aux[ig].real());
+        
+        // Accumulate force contributions
+        local_force_x += gcar[ig * 3 + 0] * factor;
+        local_force_y += gcar[ig * 3 + 1] * factor;
+        local_force_z += gcar[ig * 3 + 2] * factor;
+    }
+    
+    // Atomic add to global memory
+    atomicAdd(&force[iat * 3 + 0], local_force_x);
+    atomicAdd(&force[iat * 3 + 1], local_force_y);
+    atomicAdd(&force[iat * 3 + 2], local_force_z);
+}
+
+// GPU operators
+/// Launch cal_force_loc_sincos_kernel on a (blocks_x, nat) grid.
+/// All pointer arguments must be device pointers and force must already be
+/// zeroed, because the kernel accumulates into it.
+template <typename FPTYPE>
+void cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& nat,
+    const int& npw,
+    const int& ntype,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* vloc_per_type,
+    const std::complex<FPTYPE>* aux,
+    const FPTYPE& scale_factor,
+    FPTYPE* force)
+{
+    // A grid with a zero dimension is an invalid launch configuration, and
+    // there is nothing to accumulate anyway.
+    if (nat <= 0 || npw <= 0)
+    {
+        return;
+    }
+
+    // gridDim.x is capped: the kernel walks the remaining G-vectors with a
+    // grid-stride loop, which limits the number of atomicAdd calls per atom.
+    const int threads_per_block = THREADS_PER_BLOCK;
+    const int max_blocks_x = 32;
+    const int blocks_x = std::min(max_blocks_x, (npw + threads_per_block - 1) / threads_per_block);
+
+    dim3 grid(blocks_x, nat);
+    dim3 block(threads_per_block);
+
+    hipLaunchKernelGGL(cal_force_loc_sincos_kernel<FPTYPE>,
+        grid, block, 0, 0,
+        nat, npw, ntype, gcar, tau, vloc_per_type,
+        reinterpret_cast<const thrust::complex<FPTYPE>*>(aux),
+        scale_factor, force);
+
+    hipCheckOnDebug();
+}
+
+/// Launch cal_force_ew_sincos_kernel on a (blocks_x, nat) grid.
+/// All pointer arguments must be device pointers and force must already be
+/// zeroed, because the kernel accumulates into it.
+template <typename FPTYPE>
+void cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& nat,
+    const int& npw,
+    const int& ig_gge0,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* it_facts,
+    const std::complex<FPTYPE>* aux,
+    FPTYPE* force)
+{
+    // A grid with a zero dimension is an invalid launch configuration, and
+    // there is nothing to accumulate anyway.
+    if (nat <= 0 || npw <= 0)
+    {
+        return;
+    }
+
+    // gridDim.x is capped: the kernel walks the remaining G-vectors with a
+    // grid-stride loop, which limits the number of atomicAdd calls per atom.
+    const int threads_per_block = THREADS_PER_BLOCK;
+    const int max_blocks_x = 32;
+    const int blocks_x = std::min(max_blocks_x, (npw + threads_per_block - 1) / threads_per_block);
+
+    dim3 grid(blocks_x, nat);
+    dim3 block(threads_per_block);
+
+    hipLaunchKernelGGL(cal_force_ew_sincos_kernel<FPTYPE>,
+        grid, block, 0, 0,
+        nat, npw, ig_gge0, gcar, tau, it_facts,
+        reinterpret_cast<const thrust::complex<FPTYPE>*>(aux), force);
+
+    hipCheckOnDebug();
+}
+
template struct cal_vkb1_nl_op;
template struct cal_force_nl_op;
+template struct cal_force_loc_sincos_op;
+template struct cal_force_ew_sincos_op;
template struct cal_vkb1_nl_op;
template struct cal_force_nl_op;
+template struct cal_force_loc_sincos_op;
+template struct cal_force_ew_sincos_op;
} // namespace hamilt
diff --git a/source/module_hsolver/diago_dav_subspace.cpp b/source/module_hsolver/diago_dav_subspace.cpp
index b180d72c13..f07f4cfefa 100644
--- a/source/module_hsolver/diago_dav_subspace.cpp
+++ b/source/module_hsolver/diago_dav_subspace.cpp
@@ -7,7 +7,6 @@
#include "module_hsolver/kernels/dngvd_op.h"
#include "module_hsolver/kernels/math_kernel_op.h"
#include "module_base/kernels/dsp/dsp_connector.h"
-
#include
using namespace hsolver;
@@ -60,7 +59,7 @@ Diago_DavSubspace::Diago_DavSubspace(const std::vector& precond
if (this->device == base_device::GpuDevice)
{
resmem_real_op()(this->ctx, this->d_precondition, nbasis_in);
- // syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, this->d_precondition, this->precondition.data(), nbasis_in);
+ syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, this->d_precondition, this->precondition.data(), nbasis_in);
}
#endif
}
@@ -288,27 +287,28 @@ void Diago_DavSubspace::cal_grad(const HPsiFunc& hpsi_func,
psi_iter + (nbase) * this->dim,
this->dim);
- std::vector e_temp_cpu(nbase, 0);
+ // Eigenvalues operation section
+ std::vector e_temp_cpu(this->notconv, 0);
Real* e_temp_hd = e_temp_cpu.data();
- if(this->device == base_device::GpuDevice)
+ if (this->device == base_device::GpuDevice)
{
e_temp_hd = nullptr;
- resmem_real_op()(this->ctx, e_temp_hd, nbase);
+ resmem_real_op()(this->ctx, e_temp_hd, this->notconv);
}
- for (int m = 0; m < notconv; m++)
+
+ for (int m = 0; m < this->notconv; m++)
{
- e_temp_cpu.assign(nbase, (-1.0 * (*eigenvalue_iter)[m]));
- if (this->device == base_device::GpuDevice)
- {
- syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, e_temp_hd, e_temp_cpu.data(), nbase);
- }
- vector_mul_vector_op()(this->ctx,
- nbase,
- vcc + m * this->nbase_x,
- vcc + m * this->nbase_x,
- e_temp_hd);
+ e_temp_cpu[m] = -(*eigenvalue_iter)[m];
}
- if(this->device == base_device::GpuDevice)
+
+ if (this->device == base_device::GpuDevice)
+ {
+ syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, e_temp_hd, e_temp_cpu.data(), this->notconv);
+ }
+
+ apply_eigenvalues_op()(this->ctx, nbase, this->nbase_x, this->notconv, this->vcc, this->vcc, e_temp_hd);
+
+ if (this->device == base_device::GpuDevice)
{
delmem_real_op()(this->ctx, e_temp_hd);
}
@@ -333,54 +333,62 @@ void Diago_DavSubspace::cal_grad(const HPsiFunc& hpsi_func,
psi_iter + nbase * this->dim,
this->dim);
- // "precondition!!!"
- std::vector pre(this->dim, 0.0);
- for (int m = 0; m < notconv; m++)
- {
- for (size_t i = 0; i < this->dim; i++)
- {
- // pre[i] = std::abs(this->precondition[i] - (*eigenvalue_iter)[m]);
- double x = std::abs(this->precondition[i] - (*eigenvalue_iter)[m]);
- pre[i] = 0.5 * (1.0 + x + sqrt(1 + (x - 1.0) * (x - 1.0)));
- }
+ // Precondition section
#if defined(__CUDA) || defined(__ROCM)
- if (this->device == base_device::GpuDevice)
- {
- syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, this->d_precondition, pre.data(), this->dim);
- vector_div_vector_op()(this->ctx,
- this->dim,
- psi_iter + (nbase + m) * this->dim,
- psi_iter + (nbase + m) * this->dim,
- this->d_precondition);
- }
- else
+ if (this->device == base_device::GpuDevice)
+ {
+ Real* eigenvalues_gpu = nullptr;
+ resmem_real_op()(this->ctx, eigenvalues_gpu, notconv);
+ syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, eigenvalues_gpu,(*eigenvalue_iter).data(), notconv);
+
+ precondition_op()(this->ctx,
+ this->dim,
+ psi_iter,
+ nbase,
+ notconv,
+ d_precondition,
+ eigenvalues_gpu);
+ delmem_real_op()(this->ctx, eigenvalues_gpu);
+ }
+ else
#endif
- {
- vector_div_vector_op()(this->ctx,
- this->dim,
- psi_iter + (nbase + m) * this->dim,
- psi_iter + (nbase + m) * this->dim,
- pre.data());
- }
+ {
+ precondition_op()(this->ctx,
+ this->dim,
+ psi_iter,
+ nbase,
+ notconv,
+ this->precondition.data(),
+ (*eigenvalue_iter).data());
}
- // "normalize!!!" in order to improve numerical stability of subspace diagonalization
- std::vector psi_norm(notconv, 0.0);
- for (size_t i = 0; i < notconv; i++)
+ // Normalize section
+#if defined(__CUDA) || defined(__ROCM)
+ if (this->device == base_device::GpuDevice)
+ {
+ Real* psi_norm = nullptr;
+ resmem_real_op()(this->ctx, psi_norm, notconv);
+ using setmem_real_op = base_device::memory::set_memory_op;
+ setmem_real_op()(this->ctx, psi_norm, 0.0, notconv);
+
+ normalize_op()(this->ctx,
+ this->dim,
+ psi_iter,
+ nbase,
+ notconv,
+ psi_norm);
+ delmem_real_op()(this->ctx, psi_norm);
+ }
+ else
+#endif
{
- psi_norm[i] = dot_real_op()(this->ctx,
- this->dim,
- psi_iter + (nbase + i) * this->dim,
- psi_iter + (nbase + i) * this->dim,
- true);
- assert(psi_norm[i] > 0.0);
- psi_norm[i] = sqrt(psi_norm[i]);
-
- vector_div_constant_op()(this->ctx,
- this->dim,
- psi_iter + (nbase + i) * this->dim,
- psi_iter + (nbase + i) * this->dim,
- psi_norm[i]);
+ Real* psi_norm = nullptr;
+ normalize_op()(this->ctx,
+ this->dim,
+ psi_iter,
+ nbase,
+ notconv,
+ psi_norm);
}
// update hpsi[:, nbase:nbase+notconv]
diff --git a/source/module_hsolver/hsolver_pw.cpp b/source/module_hsolver/hsolver_pw.cpp
index 0c1ad2e8b8..d0648addf5 100644
--- a/source/module_hsolver/hsolver_pw.cpp
+++ b/source/module_hsolver/hsolver_pw.cpp
@@ -280,47 +280,106 @@ void HSolverPW::solve(hamilt::Hamilt* pHamilt,
std::vector eigenvalues(this->wfc_basis->nks * psi.get_nbands(), 0.0);
ethr_band.resize(psi.get_nbands(), this->diag_thr);
- /// Loop over k points for solve Hamiltonian to charge density
- for (int ik = 0; ik < this->wfc_basis->nks; ++ik)
- {
- /// update H(k) for each k point
- pHamilt->updateHk(ik);
+ // Initialize k-point continuity if enabled
+ static int count = 0;
+ if (use_k_continuity) {
+ build_k_neighbors();
+ }
+
+ // Loop over k points for solve Hamiltonian to charge density
+ if (use_k_continuity) {
+ // K-point continuity case
+ for (int i = 0; i < this->wfc_basis->nks; ++i)
+ {
+ const int ik = k_order[i];
+
+ // update H(k) for each k point
+ pHamilt->updateHk(ik);
#ifdef USE_PAW
- this->paw_func_in_kloop(ik, tpiba);
+ this->paw_func_in_kloop(ik, tpiba);
#endif
- /// update psi pointer for each k point
- psi.fix_k(ik);
+ // update psi pointer for each k point
+ psi.fix_k(ik);
+
+ // If using k-point continuity and not first k-point, propagate from parent
+ if (ik > 0 && count == 0 && k_parent.find(ik) != k_parent.end()) {
+ propagate_psi(psi, k_parent[ik], ik);
+ }
- // template add precondition calculating here
- update_precondition(precondition, ik, this->wfc_basis->npwk[ik], Real(pes->pot->get_vl_of_0()));
+ // template add precondition calculating here
+ update_precondition(precondition, ik, this->wfc_basis->npwk[ik], Real(pes->pot->get_vl_of_0()));
- // use smooth threshold for all iter methods
- if (PARAM.inp.diago_smooth_ethr == true)
- {
- this->cal_smooth_ethr(pes->klist->wk[ik],
- &pes->wg(ik, 0),
- DiagoIterAssist::PW_DIAG_THR,
- ethr_band);
- }
+ // use smooth threshold for all iter methods
+ if (PARAM.inp.diago_smooth_ethr == true)
+ {
+ this->cal_smooth_ethr(pes->klist->wk[ik],
+ &pes->wg(ik, 0),
+ DiagoIterAssist::PW_DIAG_THR,
+ ethr_band);
+ }
#ifdef USE_PAW
- this->call_paw_cell_set_currentk(ik);
+ this->call_paw_cell_set_currentk(ik);
#endif
- /// solve eigenvector and eigenvalue for H(k)
- this->hamiltSolvePsiK(pHamilt, psi, precondition, eigenvalues.data() + ik * psi.get_nbands(), this->wfc_basis->nks);
+ // solve eigenvector and eigenvalue for H(k)
+ this->hamiltSolvePsiK(pHamilt, psi, precondition, eigenvalues.data() + ik * psi.get_nbands(), this->wfc_basis->nks);
- if (skip_charge)
+ if (skip_charge)
+ {
+ GlobalV::ofs_running << "Average iterative diagonalization steps for k-points " << ik
+ << " is: " << DiagoIterAssist::avg_iter
+ << " ; where current threshold is: " << this->diag_thr << " . " << std::endl;
+ DiagoIterAssist::avg_iter = 0.0;
+ }
+ }
+ }
+ else {
+ // Original code without k-point continuity
+ for (int ik = 0; ik < this->wfc_basis->nks; ++ik)
{
- GlobalV::ofs_running << "Average iterative diagonalization steps for k-points " << ik
- << " is: " << DiagoIterAssist::avg_iter
- << " ; where current threshold is: " << this->diag_thr << " . " << std::endl;
- DiagoIterAssist::avg_iter = 0.0;
+ // update H(k) for each k point
+ pHamilt->updateHk(ik);
+
+#ifdef USE_PAW
+ this->paw_func_in_kloop(ik, tpiba);
+#endif
+
+ // update psi pointer for each k point
+ psi.fix_k(ik);
+
+ // template add precondition calculating here
+ update_precondition(precondition, ik, this->wfc_basis->npwk[ik], Real(pes->pot->get_vl_of_0()));
+
+ // use smooth threshold for all iter methods
+ if (PARAM.inp.diago_smooth_ethr == true)
+ {
+ this->cal_smooth_ethr(pes->klist->wk[ik],
+ &pes->wg(ik, 0),
+ DiagoIterAssist::PW_DIAG_THR,
+ ethr_band);
+ }
+
+#ifdef USE_PAW
+ this->call_paw_cell_set_currentk(ik);
+#endif
+
+ // solve eigenvector and eigenvalue for H(k)
+ this->hamiltSolvePsiK(pHamilt, psi, precondition, eigenvalues.data() + ik * psi.get_nbands(), this->wfc_basis->nks);
+
+ if (skip_charge)
+ {
+ GlobalV::ofs_running << "Average iterative diagonalization steps for k-points " << ik
+ << " is: " << DiagoIterAssist::avg_iter
+ << " ; where current threshold is: " << this->diag_thr << " . " << std::endl;
+ DiagoIterAssist::avg_iter = 0.0;
+ }
}
- /// calculate the contribution of Psi for charge density rho
}
+
+ count++;
// END Loop over k points
// copy eigenvalues to ekb in ElecState
@@ -666,6 +725,101 @@ void HSolverPW::output_iterInfo()
}
}
+/**
+ * @brief Build the k-point visiting order (k_order) and parent map (k_parent).
+ *
+ * Copies wfc_basis->kvec_c into kvecs_c, sorts the k-points by norm(), and
+ * walks them in that order. Every k-point except the first gets as parent the
+ * already-ordered k-point with the smallest norm2() of the k-vector
+ * difference; the first k-point in k_order has no entry in k_parent.
+ */
+template <typename T, typename Device>
+void HSolverPW<T, Device>::build_k_neighbors() {
+    const int nk = this->wfc_basis->nks;
+    kvecs_c.resize(nk);
+    k_order.clear();
+    k_order.reserve(nk);
+    k_parent.clear(); // do not keep parents from an earlier call
+
+    // Nothing to order; this also keeps klist[0] below from touching an empty list.
+    if (nk <= 0) {
+        return;
+    }
+
+    // k-vector bundled with its original index and cached norm
+    struct KPoint {
+        ModuleBase::Vector3<double> kvec;
+        int index;
+        double norm;
+
+        KPoint(const ModuleBase::Vector3<double>& v, int i) :
+            kvec(v), index(i), norm(v.norm()) {}
+    };
+
+    // Build k-point list
+    std::vector<KPoint> klist;
+    klist.reserve(nk);
+    for (int ik = 0; ik < nk; ++ik) {
+        kvecs_c[ik] = this->wfc_basis->kvec_c[ik];
+        klist.emplace_back(kvecs_c[ik], ik);
+    }
+
+    // Sort k-points by distance from origin
+    std::sort(klist.begin(), klist.end(),
+        [](const KPoint& a, const KPoint& b) {
+            return a.norm < b.norm;
+        });
+
+    // The k-point with the smallest norm starts the chain
+    k_order.push_back(klist[0].index);
+
+    for (int i = 1; i < nk; ++i) {
+        const int current_k = klist[i].index;
+
+        // Seed the search with the first ordered k-point so that parent is
+        // always a valid index (no magic distance sentinel needed).
+        int parent = k_order[0];
+        double min_dist = (kvecs_c[current_k] - kvecs_c[parent]).norm2();
+
+        // Strict '<' keeps the earliest-ordered candidate on ties
+        for (size_t j = 1; j < k_order.size(); ++j) {
+            const int processed_k = k_order[j];
+            const double dist = (kvecs_c[current_k] - kvecs_c[processed_k]).norm2();
+            if (dist < min_dist) {
+                min_dist = dist;
+                parent = processed_k;
+            }
+        }
+
+        k_parent[current_k] = parent;
+        k_order.push_back(current_k);
+    }
+}
+
+/**
+ * @brief Seed the wavefunctions stored for k-point to_ik from those of from_ik.
+ *
+ * For every band, the coefficients at from_ik are passed through
+ * recip_to_real into a temporary buffer and then through real_to_recip into
+ * the storage of to_ik.
+ *
+ * TODO: the phase factor exp(i * tpiba * dk . r), with
+ * dk = kvecs_c[to_ik] - kvecs_c[from_ik], is not applied in real space yet;
+ * it needs the real-space position r of every grid point.
+ *
+ * @param psi      wavefunction container, addressed as psi(ik, ib, 0)
+ * @param from_ik  k-point index whose bands are read
+ * @param to_ik    k-point index whose bands are overwritten
+ */
+template <typename T, typename Device>
+void HSolverPW<T, Device>::propagate_psi(psi::Psi<T, Device>& psi, const int from_ik, const int to_ik) {
+    const int nbands = psi.get_nbands();
+
+    // Scratch buffer of nmaxgr elements; allocated and released within this call
+    T* porter = nullptr;
+    resmem_complex_op()(this->ctx, porter, this->wfc_basis->nmaxgr, "HSolverPW::porter");
+
+    for (int ib = 0; ib < nbands; ib++)
+    {
+        // from_ik coefficients -> scratch buffer
+        this->wfc_basis->recip_to_real(this->ctx, &psi(from_ik, ib, 0), porter, from_ik);
+
+        // scratch buffer -> to_ik coefficients
+        this->wfc_basis->real_to_recip(this->ctx, porter, &psi(to_ik, ib, 0), to_ik);
+    }
+
+    delmem_complex_op()(this->ctx, porter);
+}
+
template class HSolverPW, base_device::DEVICE_CPU>;
template class HSolverPW, base_device::DEVICE_CPU>;
#if ((defined __CUDA) || (defined __ROCM))
@@ -673,4 +827,4 @@ template class HSolverPW, base_device::DEVICE_GPU>;
template class HSolverPW, base_device::DEVICE_GPU>;
#endif
-} // namespace hsolver
\ No newline at end of file
+} // namespace hsolver
diff --git a/source/module_hsolver/hsolver_pw.h b/source/module_hsolver/hsolver_pw.h
index 0dee4fbdbf..7f0bfc7c23 100644
--- a/source/module_hsolver/hsolver_pw.h
+++ b/source/module_hsolver/hsolver_pw.h
@@ -6,6 +6,8 @@
#include "module_base/macros.h"
#include "module_basis/module_pw/pw_basis_k.h"
#include "module_psi/wavefunc.h"
+#include
+#include "module_base/memory.h"
namespace hsolver
{
@@ -18,6 +20,9 @@ class HSolverPW
// return T if T is real type(float, double),
// otherwise return the real type of T(complex, complex)
using Real = typename GetTypeReal::type;
+ using resmem_complex_op = base_device::memory::resize_memory_op;
+ using delmem_complex_op = base_device::memory::delete_memory_op;
+ using setmem_complex_op = base_device::memory::set_memory_op;
public:
HSolverPW(ModulePW::PW_Basis_K* wfc_basis_in,
@@ -30,10 +35,12 @@ class HSolverPW
const int scf_iter_in,
const int diag_iter_max_in,
const double diag_thr_in,
- const bool need_subspace_in)
+ const bool need_subspace_in,
+ const bool use_k_continuity_in = false)
: wfc_basis(wfc_basis_in), calculation_type(calculation_type_in), basis_type(basis_type_in), method(method_in),
use_paw(use_paw_in), use_uspp(use_uspp_in), nspin(nspin_in), scf_iter(scf_iter_in),
- diag_iter_max(diag_iter_max_in), diag_thr(diag_thr_in), need_subspace(need_subspace_in){};
+ diag_iter_max(diag_iter_max_in), diag_thr(diag_thr_in), need_subspace(need_subspace_in),
+ use_k_continuity(use_k_continuity_in) {};
/// @brief solve function for pw
/// @param pHamilt interface to hamilt
@@ -51,6 +58,7 @@ class HSolverPW
const double tpiba,
const int nat);
+
protected:
// diago caller
void hamiltSolvePsiK(hamilt::Hamilt* hm,
@@ -79,6 +87,8 @@ class HSolverPW
const bool need_subspace; // for cg or dav_subspace
+ const bool use_k_continuity;
+
protected:
Device* ctx = {};
@@ -99,8 +109,16 @@ class HSolverPW
void paw_func_after_kloop(psi::Psi& psi, elecstate::ElecState* pes,const double tpiba,const int nat);
#endif
+
+ // K-point continuity related members
+ std::vector k_order;
+ std::unordered_map k_parent;
+ std::vector> kvecs_c;
+
+ void build_k_neighbors();
+ void propagate_psi(psi::Psi& psi, const int from_ik, const int to_ik);
};
} // namespace hsolver
-#endif
\ No newline at end of file
+#endif
diff --git a/source/module_hsolver/kernels/cuda/math_kernel_op.cu b/source/module_hsolver/kernels/cuda/math_kernel_op.cu
index 149b9ce389..8fb7c542eb 100644
--- a/source/module_hsolver/kernels/cuda/math_kernel_op.cu
+++ b/source/module_hsolver/kernels/cuda/math_kernel_op.cu
@@ -42,6 +42,48 @@ struct GetTypeThrust> {
static cublasHandle_t cublas_handle = nullptr;
+// abs2(x): squared magnitude of x for use in device code.
+// Only the declaration of the primary template is given here; every supported
+// scalar type is provided as an explicit specialization below.
+template <typename T>
+__device__ typename GetTypeReal<T>::type abs2(const T& x);
+
+// Specialization for real types (double)
+template <>
+__device__ double abs2<double>(const double& x) {
+    return x * x;
+}
+
+// Specialization for real types (float)
+template <>
+__device__ float abs2<float>(const float& x) {
+    return x * x;
+}
+
+// Specialization for thrust::complex<float>
+template <>
+__device__ float abs2<thrust::complex<float>>(const thrust::complex<float>& x) {
+    return x.real() * x.real() + x.imag() * x.imag();
+}
+
+// Specialization for thrust::complex<double>
+template <>
+__device__ double abs2<thrust::complex<double>>(const thrust::complex<double>& x) {
+    return x.real() * x.real() + x.imag() * x.imag();
+}
+
+// Specialization for std::complex<float> (for interface compatibility).
+// The value is read through a thrust::complex<float> pointer.
+template <>
+__device__ float abs2<std::complex<float>>(const std::complex<float>& x) {
+    const thrust::complex<float>* tx = reinterpret_cast<const thrust::complex<float>*>(&x);
+    return tx->real() * tx->real() + tx->imag() * tx->imag();
+}
+
+// Specialization for std::complex<double> (for interface compatibility).
+// The value is read through a thrust::complex<double> pointer.
+template <>
+__device__ double abs2<std::complex<double>>(const std::complex<double>& x) {
+    const thrust::complex<double>* tx = reinterpret_cast<const thrust::complex<double>*>(&x);
+    return tx->real() * tx->real() + tx->imag() * tx->imag();
+}
+
static inline
void xdot_wrapper(const int &n, const float * x, const int &incx, const float * y, const int &incy, float &result) {
cublasErrcheck(cublasSdot(cublas_handle, n, x, incx, y, incy, &result));
@@ -1035,6 +1077,320 @@ void matrixSetToAnother, base_device::DEVICE_GPU>::operator
cudaCheckOnDebug();
}
+// Kernel for applying eigenvalues to vectors:
+//   result[m * nbase_x + idx] = eigenvalues[m] * vectors[m * nbase_x + idx]
+// for 0 <= m < notconv and 0 <= idx < nbase.
+// Each thread handles at most one (m, idx) pair; consecutive rows are
+// nbase_x elements apart in both vectors and result.
+template <typename T>
+__global__ void apply_eigenvalues_kernel(const typename GetTypeReal<T>::type* eigenvalues,
+                                         const T* vectors,
+                                         T* result,
+                                         int nbase,
+                                         int nbase_x,
+                                         int notconv)
+{
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x; // Linear thread ID
+    const int total_elements = notconv * nbase;
+
+    // Threads past the last element do nothing
+    if (tid < total_elements)
+    {
+        const int m = tid / nbase;   // Row index (eigenvalue index)
+        const int idx = tid % nbase; // Column index within the row
+        result[m * nbase_x + idx] = eigenvalues[m] * vectors[m * nbase_x + idx];
+    }
+}
+
+// Wrapper for applying eigenvalues to complex vectors.
+// Reinterprets the std::complex buffers as thrust::complex and launches
+// apply_eigenvalues_kernel with one thread per (row, column) element, i.e.
+// notconv * nbase threads rounded up to whole blocks of 256.
+//   d            GPU device tag (not used inside this wrapper)
+//   nbase        number of elements processed per row
+//   nbase_x      distance in elements between consecutive rows
+//   notconv      number of rows, one eigenvalue per row
+//   eigenvalues  per-row real factors
+//   vectors      input rows
+//   result       output rows (the caller may pass the same buffer as vectors)
+template <typename FPTYPE>
+inline void apply_eigenvalues_complex_wrapper(const base_device::DEVICE_GPU* d,
+                                              const int& nbase,
+                                              const int& nbase_x,
+                                              const int& notconv,
+                                              const FPTYPE* eigenvalues,
+                                              const std::complex<FPTYPE>* vectors,
+                                              std::complex<FPTYPE>* result)
+{
+    const int total_elements = notconv * nbase;
+    // Launching zero blocks is an invalid configuration; nothing to do anyway.
+    if (total_elements <= 0)
+    {
+        return;
+    }
+
+    thrust::complex<FPTYPE>* result_tmp = reinterpret_cast<thrust::complex<FPTYPE>*>(result);
+    const thrust::complex<FPTYPE>* vectors_tmp = reinterpret_cast<const thrust::complex<FPTYPE>*>(vectors);
+    const int threadsPerBlock = 256;
+    const int numBlocks = (total_elements + threadsPerBlock - 1) / threadsPerBlock;
+
+    apply_eigenvalues_kernel<thrust::complex<FPTYPE>><<<numBlocks, threadsPerBlock>>>(
+        eigenvalues, vectors_tmp, result_tmp, nbase, nbase_x, notconv);
+
+    cudaCheckOnDebug();
+}
+
+// Specialization for double.
+// Scales each of the notconv rows of `vectors` (nbase elements per row, rows
+// nbase_x apart) by its entry in `eigenvalues` and stores it in `result`.
+// Note the argument order: result comes before vectors and eigenvalues.
+template <>
+void apply_eigenvalues_op<double, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU *d,
+                        const int &nbase, const int &nbase_x, const int &notconv,
+                        double *result, const double *vectors, const double *eigenvalues) {
+    const int total_elements = notconv * nbase;
+    // Launching zero blocks is an invalid configuration; nothing to do anyway.
+    if (total_elements <= 0) {
+        return;
+    }
+    const int threadsPerBlock = 256;
+    const int numBlocks = (total_elements + threadsPerBlock - 1) / threadsPerBlock;
+
+    apply_eigenvalues_kernel<double><<<numBlocks, threadsPerBlock>>>(
+        eigenvalues, vectors, result, nbase, nbase_x, notconv);
+    cudaCheckOnDebug();
+}
+
+// Specialization for std::complex<float>.
+// Forwards to apply_eigenvalues_complex_wrapper; note that the wrapper takes
+// (eigenvalues, vectors, result) while this operator takes
+// (result, vectors, eigenvalues).
+template <>
+void apply_eigenvalues_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU *d,
+                        const int &nbase, const int &nbase_x, const int &notconv,
+                        std::complex<float> *result, const std::complex<float> *vectors, const float *eigenvalues) {
+    apply_eigenvalues_complex_wrapper(d, nbase, nbase_x, notconv, eigenvalues, vectors, result);
+}
+
+// Specialization for std::complex<double>.
+// Forwards to apply_eigenvalues_complex_wrapper; note that the wrapper takes
+// (eigenvalues, vectors, result) while this operator takes
+// (result, vectors, eigenvalues).
+template <>
+void apply_eigenvalues_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU *d,
+                        const int &nbase, const int &nbase_x, const int &notconv,
+                        std::complex<double> *result, const std::complex<double> *vectors, const double *eigenvalues) {
+    apply_eigenvalues_complex_wrapper(d, nbase, nbase_x, notconv, eigenvalues, vectors, result);
+}
+
+// Divides psi_iter[(nbase + m) * dim + i] by the preconditioner
+//   pre = 0.5 * (1 + x + sqrt(1 + (x - 1)^2)),  x = |precondition[i] - eigenvalues[m]|
+// for 0 <= i < dim and 0 <= m < notconv. Each thread handles at most one
+// (m, i) pair; the first nbase rows of psi_iter are left untouched.
+template <typename T>
+__global__ void precondition_kernel(
+    const int dim,
+    const int notconv,
+    T* psi_iter,
+    const int nbase,
+    const typename GetTypeReal<T>::type* precondition, // Real type
+    const typename GetTypeReal<T>::type* eigenvalues)
+{
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid < dim * notconv) {
+        const int i = tid % dim; // Basis index
+        const int m = tid / dim; // Band index
+
+        using Real = typename GetTypeReal<T>::type;
+        Real x = abs(precondition[i] - eigenvalues[m]);
+        Real pre = 0.5 * (1.0 + x + sqrt(1.0 + (x - 1.0) * (x - 1.0)));
+        psi_iter[(nbase + m) * dim + i] /= pre;
+    }
+}
+
+// Specialization for double (for LCAO).
+// Launches precondition_kernel with one thread per (band, basis) element,
+// i.e. dim * notconv threads rounded up to whole blocks of thread_per_block.
+//   ctx           GPU device tag (not used inside this wrapper)
+//   dim           number of elements per row of psi_iter
+//   psi_iter      rows nbase .. nbase + notconv - 1 are modified in place
+//   nbase         index of the first row to modify
+//   notconv       number of rows to modify, one eigenvalue per row
+//   precondition  per-element values, indexed 0 .. dim - 1
+//   eigenvalues   per-row values, indexed 0 .. notconv - 1
+template <>
+void precondition_op<double, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& dim,
+    double* psi_iter,
+    const int& nbase,
+    const int& notconv,
+    const double* precondition,
+    const double* eigenvalues)
+{
+    const int total_elements = dim * notconv;
+    // Launching zero blocks is an invalid configuration; nothing to do anyway.
+    if (total_elements <= 0)
+    {
+        return;
+    }
+    const int threadsPerBlock = thread_per_block;
+    const int numBlocks = (total_elements + threadsPerBlock - 1) / threadsPerBlock;
+
+    precondition_kernel<double><<<numBlocks, threadsPerBlock>>>(
+        dim, notconv, psi_iter, nbase, precondition, eigenvalues);
+
+    cudaCheckOnDebug();
+}
+
+// Specialization for std::complex<float>.
+// Reinterprets psi_iter as thrust::complex<float> and launches
+// precondition_kernel with one thread per (band, basis) element, i.e.
+// dim * notconv threads rounded up to whole blocks of thread_per_block.
+//   ctx           GPU device tag (not used inside this wrapper)
+//   dim           number of elements per row of psi_iter
+//   psi_iter      rows nbase .. nbase + notconv - 1 are modified in place
+//   nbase         index of the first row to modify
+//   notconv       number of rows to modify, one eigenvalue per row
+//   precondition  per-element values, indexed 0 .. dim - 1
+//   eigenvalues   per-row values, indexed 0 .. notconv - 1
+template <>
+void precondition_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& dim,
+    std::complex<float>* psi_iter,
+    const int& nbase,
+    const int& notconv,
+    const float* precondition,
+    const float* eigenvalues)
+{
+    const int total_elements = dim * notconv;
+    // Launching zero blocks is an invalid configuration; nothing to do anyway.
+    if (total_elements <= 0)
+    {
+        return;
+    }
+    const int threadsPerBlock = thread_per_block;
+    const int numBlocks = (total_elements + threadsPerBlock - 1) / threadsPerBlock;
+
+    precondition_kernel<thrust::complex<float>><<<numBlocks, threadsPerBlock>>>(
+        dim, notconv,
+        reinterpret_cast<thrust::complex<float>*>(psi_iter),
+        nbase,
+        precondition, // Already float
+        eigenvalues);
+
+    cudaCheckOnDebug();
+}
+
+// Specialization for complex
+template <>
+void precondition_op