From 9a42117334e2b061086ae05b42c77b9dbe166ee0 Mon Sep 17 00:00:00 2001
From: Jie Li <li_j20@fudan.edu.cn>
Date: Sat, 8 Feb 2025 18:53:24 +0800
Subject: [PATCH 1/6] Feature: implement k continuity initialization strategy &
 kernel optimization in davidson-subspcae algorithm

- add k continuity initialization strategy in planewave basis
- implement heterogenous computation branching between CPU and DCU
- implement optimized eigenvalue operations for GPU & DCU
- implement optimized preconditioner for GPU & DCU
- implement optimized normalization op for GPU & DCU
---
 source/module_base/module_device/device.cpp   |   9 +-
 source/module_esolver/esolver.cpp             |  16 +-
 source/module_esolver/esolver_ks_pw.cpp       |   3 +-
 source/module_hsolver/diago_dav_subspace.cpp  | 128 +++---
 source/module_hsolver/hsolver_pw.cpp          | 210 ++++++++--
 source/module_hsolver/hsolver_pw.h            |  24 +-
 .../kernels/cuda/math_kernel_op.cu            | 365 ++++++++++++++++++
 .../module_hsolver/kernels/math_kernel_op.cpp |  89 +++++
 .../module_hsolver/kernels/math_kernel_op.h   |  62 +++
 .../kernels/rocm/dngvd_op.hip.cu              | 262 ++++++++++---
 .../kernels/rocm/math_kernel_op.hip.cu        | 355 +++++++++++++++++
 .../module_hsolver/test/test_hsolver_pw.cpp   |  62 +++
 .../module_hsolver/test/test_hsolver_sdft.cpp |  60 +++
 .../module_io/read_input_item_elec_stru.cpp   |  20 +
 source/module_parameter/input_parameter.h     |   1 +
 source/module_psi/psi_init.cpp                |   3 +-
 16 files changed, 1519 insertions(+), 150 deletions(-)
diff --git a/source/module_base/module_device/device.cpp b/source/module_base/module_device/device.cpp
index b20ea9f3ad..9373a5a31a 100644
--- a/source/module_base/module_device/device.cpp
+++ b/source/module_base/module_device/device.cpp
@@ -5,7 +5,7 @@
 
 #include <base/macros/macros.h>
 #include <cstring>
-
+#include <iostream>
 #ifdef __MPI
 #include "mpi.h"
 #endif
@@ -166,6 +166,11 @@ int device_count = -1;
 cudaGetDeviceCount(&device_count);
 #elif defined(__ROCM)
 hipGetDeviceCount(&device_count);
+/***auto start_time = std::chrono::high_resolution_clock::now();
+std::cout << "Starting hipGetDeviceCount.." << std::endl;
+auto end_time = std::chrono::high_resolution_clock::now();
+auto duration = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time);
+std::cout << "hipGetDeviceCount took " << duration.count() << "seconds" << std::endl;***/
 #endif
 if (device_count <= 0)
 {
@@ -711,4 +716,4 @@ void record_device_memory<base_device::DEVICE_GPU>(
 #endif
 
 } // end of namespace information
-} // end of namespace base_device
\ No newline at end of file
+} // end of namespace base_device
diff --git a/source/module_esolver/esolver.cpp b/source/module_esolver/esolver.cpp
index 0352492a3a..f8985387d9 100644
--- a/source/module_esolver/esolver.cpp
+++ b/source/module_esolver/esolver.cpp
@@ -107,13 +107,21 @@ std::string determine_type()
 	}
 	if (GlobalV::MY_RANK == 0)
 	{
-		std::cout << " RUNNING WITH DEVICE  : " << device_info << " / "
+		/***auto start_time = std::chrono::high_resolution_clock::now();
+        std::cout << "Starting hipGetDeviceInfo..." << std::endl;***/
+        std::cout << " RUNNING WITH DEVICE  : " << device_info << " / "
 			<< base_device::information::get_device_info(PARAM.inp.device) << std::endl;
+        /***auto end_time = std::chrono::high_resolution_clock::now();
+        auto duration = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time);
+        std::cout << "hipGetDeviceInfo took " << duration.count() << " seconds" << std::endl;***/
 	}
-
+   /*** auto start_time = std::chrono::high_resolution_clock::now();
+    std::cout << "Starting hipGetDeviceInfo..." << std::endl;***/
     GlobalV::ofs_running << "\n RUNNING WITH DEVICE  : " << device_info << " / "
-                         << base_device::information::get_device_info(PARAM.inp.device) << std::endl;
-
+                         << base_device::information::get_device_info(PARAM.inp.device) << std::endl; 
+    /***auto end_time = std::chrono::high_resolution_clock::now();
+    auto duration = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time);
+    std::cout << "hipGetDeviceInfo took " << duration.count() << " seconds" << std::endl;***/
     return esolver_type;
 }
 
diff --git a/source/module_esolver/esolver_ks_pw.cpp b/source/module_esolver/esolver_ks_pw.cpp
index 0b303f33e1..664ac186d5 100644
--- a/source/module_esolver/esolver_ks_pw.cpp
+++ b/source/module_esolver/esolver_ks_pw.cpp
@@ -556,7 +556,8 @@ void ESolver_KS_PW<T, Device>::hamilt2density_single(UnitCell& ucell,
                                                      hsolver::DiagoIterAssist<T, Device>::SCF_ITER,
                                                      hsolver::DiagoIterAssist<T, Device>::PW_DIAG_NMAX,
                                                      hsolver::DiagoIterAssist<T, Device>::PW_DIAG_THR,
-                                                     hsolver::DiagoIterAssist<T, Device>::need_subspace);
+                                                     hsolver::DiagoIterAssist<T, Device>::need_subspace,
+                                                     PARAM.inp.use_k_continuity);
 
         hsolver_pw_obj.solve(this->p_hamilt,
                              this->kspw_psi[0],
diff --git a/source/module_hsolver/diago_dav_subspace.cpp b/source/module_hsolver/diago_dav_subspace.cpp
index b180d72c13..f07f4cfefa 100644
--- a/source/module_hsolver/diago_dav_subspace.cpp
+++ b/source/module_hsolver/diago_dav_subspace.cpp
@@ -7,7 +7,6 @@
 #include "module_hsolver/kernels/dngvd_op.h"
 #include "module_hsolver/kernels/math_kernel_op.h"
 #include "module_base/kernels/dsp/dsp_connector.h"
-
 #include <vector>
 
 using namespace hsolver;
@@ -60,7 +59,7 @@ Diago_DavSubspace<T, Device>::Diago_DavSubspace(const std::vector<Real>& precond
     if (this->device == base_device::GpuDevice)
     {
         resmem_real_op()(this->ctx, this->d_precondition, nbasis_in);
-        // syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, this->d_precondition, this->precondition.data(), nbasis_in);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, this->d_precondition, this->precondition.data(), nbasis_in);
     }
 #endif
 }
@@ -288,27 +287,28 @@ void Diago_DavSubspace<T, Device>::cal_grad(const HPsiFunc& hpsi_func,
                          psi_iter + (nbase) * this->dim,
                          this->dim);
 
-    std::vector<Real> e_temp_cpu(nbase, 0);
+    // Eigenvalues operation section
+    std::vector<Real> e_temp_cpu(this->notconv, 0);
     Real* e_temp_hd = e_temp_cpu.data();
-    if(this->device == base_device::GpuDevice)
+    if (this->device == base_device::GpuDevice)
     {
         e_temp_hd = nullptr;
-        resmem_real_op()(this->ctx, e_temp_hd, nbase);
+        resmem_real_op()(this->ctx, e_temp_hd, this->notconv);
     }
-    for (int m = 0; m < notconv; m++)
+
+    for (int m = 0; m < this->notconv; m++)
     {
-        e_temp_cpu.assign(nbase, (-1.0 * (*eigenvalue_iter)[m]));
-        if (this->device == base_device::GpuDevice)
-        {
-            syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, e_temp_hd, e_temp_cpu.data(), nbase);
-        }
-        vector_mul_vector_op<T, Device>()(this->ctx,
-                                                nbase,
-                                                vcc + m * this->nbase_x,
-                                                vcc + m * this->nbase_x,
-                                                e_temp_hd);
+        e_temp_cpu[m] = -(*eigenvalue_iter)[m];
     }
-    if(this->device == base_device::GpuDevice)
+
+    if (this->device == base_device::GpuDevice)
+    {
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, e_temp_hd, e_temp_cpu.data(), this->notconv);
+    }
+    
+    apply_eigenvalues_op<T, Device>()(this->ctx, nbase, this->nbase_x, this->notconv, this->vcc, this->vcc, e_temp_hd);
+
+    if (this->device == base_device::GpuDevice)
     {
         delmem_real_op()(this->ctx, e_temp_hd);
     }
@@ -333,54 +333,62 @@ void Diago_DavSubspace<T, Device>::cal_grad(const HPsiFunc& hpsi_func,
                          psi_iter + nbase * this->dim,
                          this->dim);
 
-    // "precondition!!!"
-    std::vector<Real> pre(this->dim, 0.0);
-    for (int m = 0; m < notconv; m++)
-    {
-        for (size_t i = 0; i < this->dim; i++)
-        {
-            // pre[i] = std::abs(this->precondition[i] - (*eigenvalue_iter)[m]);
-            double x = std::abs(this->precondition[i] - (*eigenvalue_iter)[m]);
-            pre[i] = 0.5 * (1.0 + x + sqrt(1 + (x - 1.0) * (x - 1.0)));
-        }
+    // Precondition section
 #if defined(__CUDA) || defined(__ROCM)
-        if (this->device == base_device::GpuDevice)
-        {
-            syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, this->d_precondition, pre.data(), this->dim);
-            vector_div_vector_op<T, Device>()(this->ctx,
-                                              this->dim,
-                                              psi_iter + (nbase + m) * this->dim,
-                                              psi_iter + (nbase + m) * this->dim,
-                                              this->d_precondition);
-        }
-        else
+    if (this->device == base_device::GpuDevice)
+    {
+        Real* eigenvalues_gpu = nullptr;
+        resmem_real_op()(this->ctx, eigenvalues_gpu, notconv);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, eigenvalues_gpu,(*eigenvalue_iter).data(), notconv);
+        
+        precondition_op<T, Device>()(this->ctx,
+                                    this->dim,
+                                    psi_iter,
+                                    nbase,
+                                    notconv,
+                                    d_precondition,
+                                    eigenvalues_gpu);
+        delmem_real_op()(this->ctx, eigenvalues_gpu);
+    }
+    else
 #endif
-        {
-            vector_div_vector_op<T, Device>()(this->ctx,
-                                              this->dim,
-                                              psi_iter + (nbase + m) * this->dim,
-                                              psi_iter + (nbase + m) * this->dim,
-                                              pre.data());
-        }
+    {
+        precondition_op<T, Device>()(this->ctx,
+                                    this->dim,
+                                    psi_iter,
+                                    nbase,
+                                    notconv,
+                                    this->precondition.data(),
+                                    (*eigenvalue_iter).data());
     }
 
-    // "normalize!!!" in order to improve numerical stability of subspace diagonalization
-    std::vector<Real> psi_norm(notconv, 0.0);
-    for (size_t i = 0; i < notconv; i++)
+    // Normalize section
+#if defined(__CUDA) || defined(__ROCM)
+    if (this->device == base_device::GpuDevice)
+    {
+        Real* psi_norm = nullptr;
+        resmem_real_op()(this->ctx, psi_norm, notconv);
+        using setmem_real_op = base_device::memory::set_memory_op<Real, Device>;
+        setmem_real_op()(this->ctx, psi_norm, 0.0, notconv);
+        
+        normalize_op<T, Device>()(this->ctx,
+                                this->dim,
+                                psi_iter,
+                                nbase,
+                                notconv,
+                                psi_norm);
+        delmem_real_op()(this->ctx, psi_norm);
+    }
+    else
+#endif
     {
-        psi_norm[i] = dot_real_op<T, Device>()(this->ctx,
-                                               this->dim,
-                                               psi_iter + (nbase + i) * this->dim,
-                                               psi_iter + (nbase + i) * this->dim,
-                                               true);
-        assert(psi_norm[i] > 0.0);
-        psi_norm[i] = sqrt(psi_norm[i]);
-
-        vector_div_constant_op<T, Device>()(this->ctx,
-                                            this->dim,
-                                            psi_iter + (nbase + i) * this->dim,
-                                            psi_iter + (nbase + i) * this->dim,
-                                            psi_norm[i]);
+        Real* psi_norm = nullptr;
+        normalize_op<T, Device>()(this->ctx,
+                                this->dim,
+                                psi_iter,
+                                nbase,
+                                notconv,
+                                psi_norm);
     }
 
     // update hpsi[:, nbase:nbase+notconv]
diff --git a/source/module_hsolver/hsolver_pw.cpp b/source/module_hsolver/hsolver_pw.cpp
index 0c1ad2e8b8..d0648addf5 100644
--- a/source/module_hsolver/hsolver_pw.cpp
+++ b/source/module_hsolver/hsolver_pw.cpp
@@ -280,47 +280,106 @@ void HSolverPW<T, Device>::solve(hamilt::Hamilt<T, Device>* pHamilt,
     std::vector<Real> eigenvalues(this->wfc_basis->nks * psi.get_nbands(), 0.0);
     ethr_band.resize(psi.get_nbands(), this->diag_thr);
 
-    /// Loop over k points for solve Hamiltonian to charge density
-    for (int ik = 0; ik < this->wfc_basis->nks; ++ik)
-    {
-        /// update H(k) for each k point
-        pHamilt->updateHk(ik);
+    // Initialize k-point continuity if enabled
+    static int count = 0;
+    if (use_k_continuity) {
+        build_k_neighbors();
+    }
+
+    // Loop over k points for solve Hamiltonian to charge density
+    if (use_k_continuity) {
+        // K-point continuity case
+        for (int i = 0; i < this->wfc_basis->nks; ++i)
+        {
+            const int ik = k_order[i];
+            
+            // update H(k) for each k point
+            pHamilt->updateHk(ik);
 
 #ifdef USE_PAW
-        this->paw_func_in_kloop(ik, tpiba);
+            this->paw_func_in_kloop(ik, tpiba);
 #endif
 
-        /// update psi pointer for each k point
-        psi.fix_k(ik);
+            // update psi pointer for each k point
+            psi.fix_k(ik);
+
+            // If using k-point continuity and not first k-point, propagate from parent
+            if (ik > 0 && count == 0 && k_parent.find(ik) != k_parent.end()) {
+                propagate_psi(psi, k_parent[ik], ik);
+            }
 
-        // template add precondition calculating here
-        update_precondition(precondition, ik, this->wfc_basis->npwk[ik], Real(pes->pot->get_vl_of_0()));
+            // template add precondition calculating here
+            update_precondition(precondition, ik, this->wfc_basis->npwk[ik], Real(pes->pot->get_vl_of_0()));
 
-        // use smooth threshold for all iter methods
-        if (PARAM.inp.diago_smooth_ethr == true)
-        {
-            this->cal_smooth_ethr(pes->klist->wk[ik],
-                                  &pes->wg(ik, 0),
-                                  DiagoIterAssist<T, Device>::PW_DIAG_THR,
-                                  ethr_band);
-        }
+            // use smooth threshold for all iter methods
+            if (PARAM.inp.diago_smooth_ethr == true)
+            {
+                this->cal_smooth_ethr(pes->klist->wk[ik],
+                                    &pes->wg(ik, 0),
+                                    DiagoIterAssist<T, Device>::PW_DIAG_THR,
+                                    ethr_band);
+            }
 
 #ifdef USE_PAW
-        this->call_paw_cell_set_currentk(ik);
+            this->call_paw_cell_set_currentk(ik);
 #endif
 
-        /// solve eigenvector and eigenvalue for H(k)
-        this->hamiltSolvePsiK(pHamilt, psi, precondition, eigenvalues.data() + ik * psi.get_nbands(), this->wfc_basis->nks);
+            // solve eigenvector and eigenvalue for H(k)
+            this->hamiltSolvePsiK(pHamilt, psi, precondition, eigenvalues.data() + ik * psi.get_nbands(), this->wfc_basis->nks);
 
-        if (skip_charge)
+            if (skip_charge)
+            {
+                GlobalV::ofs_running << "Average iterative diagonalization steps for k-points " << ik
+                                    << " is: " << DiagoIterAssist<T, Device>::avg_iter
+                                    << " ; where current threshold is: " << this->diag_thr << " . " << std::endl;
+                DiagoIterAssist<T, Device>::avg_iter = 0.0;
+            }
+        }
+    }
+    else {
+        // Original code without k-point continuity
+        for (int ik = 0; ik < this->wfc_basis->nks; ++ik)
         {
-            GlobalV::ofs_running << "Average iterative diagonalization steps for k-points " << ik
-                                 << " is: " << DiagoIterAssist<T, Device>::avg_iter
-                                 << " ; where current threshold is: " << this->diag_thr << " . " << std::endl;
-            DiagoIterAssist<T, Device>::avg_iter = 0.0;
+            // update H(k) for each k point
+            pHamilt->updateHk(ik);
+
+#ifdef USE_PAW
+            this->paw_func_in_kloop(ik, tpiba);
+#endif
+
+            // update psi pointer for each k point
+            psi.fix_k(ik);
+
+            // template add precondition calculating here
+            update_precondition(precondition, ik, this->wfc_basis->npwk[ik], Real(pes->pot->get_vl_of_0()));
+
+            // use smooth threshold for all iter methods
+            if (PARAM.inp.diago_smooth_ethr == true)
+            {
+                this->cal_smooth_ethr(pes->klist->wk[ik],
+                                    &pes->wg(ik, 0),
+                                    DiagoIterAssist<T, Device>::PW_DIAG_THR,
+                                    ethr_band);
+            }
+
+#ifdef USE_PAW
+            this->call_paw_cell_set_currentk(ik);
+#endif
+
+            // solve eigenvector and eigenvalue for H(k)
+            this->hamiltSolvePsiK(pHamilt, psi, precondition, eigenvalues.data() + ik * psi.get_nbands(), this->wfc_basis->nks);
+
+            if (skip_charge)
+            {
+                GlobalV::ofs_running << "Average iterative diagonalization steps for k-points " << ik
+                                    << " is: " << DiagoIterAssist<T, Device>::avg_iter
+                                    << " ; where current threshold is: " << this->diag_thr << " . " << std::endl;
+                DiagoIterAssist<T, Device>::avg_iter = 0.0;
+            }
         }
-        /// calculate the contribution of Psi for charge density rho
     }
+    
+    count++;
     // END Loop over k points
 
     // copy eigenvalues to ekb in ElecState
@@ -666,6 +725,101 @@ void HSolverPW<T, Device>::output_iterInfo()
     }
 }
 
+template <typename T, typename Device>
+void HSolverPW<T, Device>::build_k_neighbors() {
+    const int nk = this->wfc_basis->nks;
+    kvecs_c.resize(nk);
+    k_order.clear();
+    k_order.reserve(nk);
+    
+    // Store k-points and corresponding indices
+    struct KPoint {
+        ModuleBase::Vector3<double> kvec;
+        int index;
+        double norm;
+        
+        KPoint(const ModuleBase::Vector3<double>& v, int i) : 
+            kvec(v), index(i), norm(v.norm()) {}
+    };
+    
+    // Build k-point list
+    std::vector<KPoint> klist;
+    for (int ik = 0; ik < nk; ++ik) {
+        kvecs_c[ik] = this->wfc_basis->kvec_c[ik];
+        klist.push_back(KPoint(kvecs_c[ik], ik));
+    }
+    
+    // Sort k-points by distance from origin
+    std::sort(klist.begin(), klist.end(),
+        [](const KPoint& a, const KPoint& b) {
+            return a.norm < b.norm;
+        });
+    
+    // Build parent-child relationships
+    k_order.push_back(klist[0].index);
+    
+    // Find nearest processed k-point as parent for each k-point
+    for (int i = 1; i < nk; ++i) {
+        int current_k = klist[i].index;
+        double min_dist = 1e10;
+        int parent = -1;
+        
+        // find the nearest k-point as parent
+        for (int j = 0; j < k_order.size(); ++j) {
+            int processed_k = k_order[j];
+            double dist = (kvecs_c[current_k] - kvecs_c[processed_k]).norm2();
+            if (dist < min_dist) {
+                min_dist = dist;
+                parent = processed_k;
+            }
+        }
+        
+        k_parent[current_k] = parent;
+        k_order.push_back(current_k);
+    }
+}
+
+template <typename T, typename Device>
+void HSolverPW<T, Device>::propagate_psi(psi::Psi<T, Device>& psi, const int from_ik, const int to_ik) {
+    const int nbands = psi.get_nbands();
+    const int npwk = this->wfc_basis->npwk[to_ik];
+    
+    // Get k-point difference
+    ModuleBase::Vector3<double> dk = kvecs_c[to_ik] - kvecs_c[from_ik];
+    
+    // Allocate porter locally
+    T* porter = nullptr;
+    resmem_complex_op()(this->ctx, porter, this->wfc_basis->nmaxgr, "HSolverPW::porter");
+    
+    // Process each band
+    for (int ib = 0; ib < nbands; ib++)
+    {
+        // Fix current k-point and band
+        // psi.fix_k(from_ik);
+        
+        // FFT to real space
+        // this->wfc_basis->recip_to_real(this->ctx, psi.get_pointer(ib), porter, from_ik);
+        this->wfc_basis->recip_to_real(this->ctx, &psi(from_ik, ib, 0), porter, from_ik);
+        
+        // Apply phase factor
+        //     // TODO: Check how to get the r vector
+        //     ModuleBase::Vector3<double> r = this->wfc_basis->get_ir2r(ir);
+        //     double phase = this->wfc_basis->tpiba * (dk.x * r.x + dk.y * r.y + dk.z * r.z);
+        //     psi_real[ir] *= std::exp(std::complex<double>(0.0, phase));
+        // }
+        
+        // Fix k-point for target
+        // psi.fix_k(to_ik);
+        
+        // FFT back to reciprocal space
+        // this->wfc_basis->real_to_recip(this->ctx, porter, psi.get_pointer(ib), to_ik, true);
+        this->wfc_basis->real_to_recip(this->ctx, porter, &psi(to_ik, ib, 0), to_ik);
+    }
+
+    // Clean up porter
+    delmem_complex_op()(this->ctx, porter);
+}
+
 template class HSolverPW<std::complex<float>, base_device::DEVICE_CPU>;
 template class HSolverPW<std::complex<double>, base_device::DEVICE_CPU>;
 #if ((defined __CUDA) || (defined __ROCM))
@@ -673,4 +827,4 @@ template class HSolverPW<std::complex<float>, base_device::DEVICE_GPU>;
 template class HSolverPW<std::complex<double>, base_device::DEVICE_GPU>;
 #endif
 
-} // namespace hsolver
\ No newline at end of file
+} // namespace hsolver
diff --git a/source/module_hsolver/hsolver_pw.h b/source/module_hsolver/hsolver_pw.h
index 0dee4fbdbf..7f0bfc7c23 100644
--- a/source/module_hsolver/hsolver_pw.h
+++ b/source/module_hsolver/hsolver_pw.h
@@ -6,6 +6,8 @@
 #include "module_base/macros.h"
 #include "module_basis/module_pw/pw_basis_k.h"
 #include "module_psi/wavefunc.h"
+#include <unordered_map>
+#include "module_base/memory.h"
 
 namespace hsolver
 {
@@ -18,6 +20,9 @@ class HSolverPW
     // return T if T is real type(float, double),
     // otherwise return the real type of T(complex<float>, complex<double>)
     using Real = typename GetTypeReal<T>::type;
+    using resmem_complex_op = base_device::memory::resize_memory_op<T, Device>;
+    using delmem_complex_op = base_device::memory::delete_memory_op<T, Device>;
+    using setmem_complex_op = base_device::memory::set_memory_op<T, Device>;
 
   public:
     HSolverPW(ModulePW::PW_Basis_K* wfc_basis_in,
@@ -30,10 +35,12 @@ class HSolverPW
               const int scf_iter_in,
               const int diag_iter_max_in,
               const double diag_thr_in,
-              const bool need_subspace_in)
+              const bool need_subspace_in,
+              const bool use_k_continuity_in = false)
         : wfc_basis(wfc_basis_in), calculation_type(calculation_type_in), basis_type(basis_type_in), method(method_in),
           use_paw(use_paw_in), use_uspp(use_uspp_in), nspin(nspin_in), scf_iter(scf_iter_in),
-          diag_iter_max(diag_iter_max_in), diag_thr(diag_thr_in), need_subspace(need_subspace_in){};
+          diag_iter_max(diag_iter_max_in), diag_thr(diag_thr_in), need_subspace(need_subspace_in),
+          use_k_continuity(use_k_continuity_in) {};
 
     /// @brief solve function for pw
     /// @param pHamilt interface to hamilt
@@ -51,6 +58,7 @@ class HSolverPW
                const double tpiba,
                const int nat);
 
+
   protected:
     // diago caller
     void hamiltSolvePsiK(hamilt::Hamilt<T, Device>* hm,
@@ -79,6 +87,8 @@ class HSolverPW
 
     const bool need_subspace; // for cg or dav_subspace
 
+    const bool use_k_continuity;
+
   protected:
     Device* ctx = {};
 
@@ -99,8 +109,16 @@ class HSolverPW
 
     void paw_func_after_kloop(psi::Psi<T, Device>& psi, elecstate::ElecState* pes,const double tpiba,const int nat);
 #endif
+
+    // K-point continuity related members
+    std::vector<int> k_order;
+    std::unordered_map<int, int> k_parent;
+    std::vector<ModuleBase::Vector3<double>> kvecs_c;
+    
+    void build_k_neighbors();
+    void propagate_psi(psi::Psi<T, Device>& psi, const int from_ik, const int to_ik);
 };
 
 } // namespace hsolver
 
-#endif
\ No newline at end of file
+#endif
diff --git a/source/module_hsolver/kernels/cuda/math_kernel_op.cu b/source/module_hsolver/kernels/cuda/math_kernel_op.cu
index 149b9ce389..8fb7c542eb 100644
--- a/source/module_hsolver/kernels/cuda/math_kernel_op.cu
+++ b/source/module_hsolver/kernels/cuda/math_kernel_op.cu
@@ -42,6 +42,48 @@ struct GetTypeThrust<std::complex<double>> {
 
 static cublasHandle_t cublas_handle = nullptr;
 
+// Forward declarations for abs2
+template<typename T>
+__device__ typename GetTypeReal<T>::type abs2(const T& x);
+
+// Specialization for real types (double)
+template<>
+__device__ double abs2(const double& x) {
+    return x * x;
+}
+
+// Specialization for real types (float)
+template<>
+__device__ float abs2(const float& x) {
+    return x * x;
+}
+
+// Specialization for complex float
+template<>
+__device__ float abs2(const thrust::complex<float>& x) {
+    return x.real() * x.real() + x.imag() * x.imag();
+}
+
+// Specialization for complex double
+template<>
+__device__ double abs2(const thrust::complex<double>& x) {
+    return x.real() * x.real() + x.imag() * x.imag();
+}
+
+// Specialization for std::complex<float> (for interface compatibility)
+template<>
+__device__ float abs2(const std::complex<float>& x) {
+    const thrust::complex<float>* tx = reinterpret_cast<const thrust::complex<float>*>(&x);
+    return tx->real() * tx->real() + tx->imag() * tx->imag();
+}
+
+// Specialization for std::complex<double> (for interface compatibility)
+template<>
+__device__ double abs2(const std::complex<double>& x) {
+    const thrust::complex<double>* tx = reinterpret_cast<const thrust::complex<double>*>(&x);
+    return tx->real() * tx->real() + tx->imag() * tx->imag();
+}
+
 static inline
 void xdot_wrapper(const int &n, const float * x, const int &incx, const float * y, const int &incy, float &result) {
     cublasErrcheck(cublasSdot(cublas_handle, n, x, incx, y, incy, &result));
@@ -1035,6 +1077,320 @@ void matrixSetToAnother<std::complex<double>, base_device::DEVICE_GPU>::operator
     cudaCheckOnDebug();
 }
 
+// Kernel for applying eigenvalues to vectors
+
+template <typename T>
+__global__ void apply_eigenvalues_kernel(const typename GetTypeReal<T>::type* eigenvalues, const T* vectors, T* result, int nbase, int nbase_x, int notconv)
+{
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;  // Linear thread ID
+    const int total_elements = notconv * nbase;
+    
+    if (tid < total_elements)
+    {
+        const int m = tid / nbase;        // Row index (eigenvalue index)
+        const int idx = tid % nbase;      // Column index within the row
+        result[m * nbase_x + idx] = eigenvalues[m] * vectors[m * nbase_x + idx];
+    }
+}
+
+// Wrapper for applying eigenvalues to complex vectors
+template <typename FPTYPE>
+inline void apply_eigenvalues_complex_wrapper(const base_device::DEVICE_GPU* d,
+                                              const int& nbase,
+                                              const int& nbase_x,
+                                              const int& notconv,
+                                              const FPTYPE* eigenvalues,
+                                              const std::complex<FPTYPE>* vectors,
+                                              std::complex<FPTYPE>* result)
+{
+    thrust::complex<FPTYPE>* result_tmp = reinterpret_cast<thrust::complex<FPTYPE>*>(result);
+    const thrust::complex<FPTYPE>* vectors_tmp = reinterpret_cast<const thrust::complex<FPTYPE>*>(vectors);
+    const int total_elements = notconv * nbase;
+    const int threadsPerBlock = 256;
+    const int numBlocks = (total_elements + threadsPerBlock - 1) / threadsPerBlock;
+    
+    apply_eigenvalues_kernel<thrust::complex<FPTYPE>><<<numBlocks, threadsPerBlock>>>(
+        eigenvalues, vectors_tmp, result_tmp, nbase, nbase_x, notconv);
+
+    cudaCheckOnDebug();
+}
+
+// Specialization for double
+template <>
+void apply_eigenvalues_op<double, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU *d,
+                                                                       const int &nbase, const int &nbase_x, const int &notconv,
+                                                                       double *result, const double *vectors, const double *eigenvalues) {
+    const int total_elements = notconv * nbase;
+    const int threadsPerBlock = 256;
+    const int numBlocks = (total_elements + threadsPerBlock - 1) / threadsPerBlock;
+    
+    apply_eigenvalues_kernel<double><<<numBlocks, threadsPerBlock>>>(
+        eigenvalues, vectors, result, nbase, nbase_x, notconv);
+    cudaCheckOnDebug();
+}
+
+// Specialization for std::complex<float>
+template <>
+void apply_eigenvalues_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU *d,
+                                                                                      const int &nbase, const int &nbase_x, const int &notconv,
+                                                                                      std::complex<float> *result, const std::complex<float> *vectors, const float *eigenvalues) {
+    apply_eigenvalues_complex_wrapper(d, nbase, nbase_x, notconv, eigenvalues, vectors, result);
+}
+
+// Specialization for std::complex<double>
+template <>
+void apply_eigenvalues_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU *d,
+                                                                                       const int &nbase, const int &nbase_x, const int &notconv,
+                                                                                       std::complex<double> *result, const std::complex<double> *vectors, const double *eigenvalues) {
+    apply_eigenvalues_complex_wrapper(d, nbase, nbase_x, notconv, eigenvalues, vectors, result);
+}
+
+template <typename T>
+__global__ void precondition_kernel(
+    const int dim,
+    const int notconv,
+    T* psi_iter,
+    const int nbase,
+    const typename GetTypeReal<T>::type* precondition,  // Real type
+    const typename GetTypeReal<T>::type* eigenvalues)
+{
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid < dim * notconv) {
+        const int i = tid % dim;    // Basis index
+        const int m = tid / dim;    // Band index
+        
+        using Real = typename GetTypeReal<T>::type;
+        Real x = abs(precondition[i] - eigenvalues[m]);
+        Real pre = 0.5 * (1.0 + x + sqrt(1.0 + (x - 1.0) * (x - 1.0)));
+        psi_iter[(nbase + m) * dim + i] /= pre;
+    }
+}
+
+// Specialization for double (for LCAO)
+template <>
+void precondition_op<double, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& dim,
+    double* psi_iter,
+    const int& nbase,
+    const int& notconv,
+    const double* precondition,
+    const double* eigenvalues)
+{
+    const int total_elements = dim * notconv;
+    const int threadsPerBlock = thread_per_block;
+    const int numBlocks = (total_elements + threadsPerBlock - 1) / threadsPerBlock;
+    
+    precondition_kernel<double><<<numBlocks, threadsPerBlock>>>(
+        dim, notconv, psi_iter, nbase, precondition, eigenvalues);
+
+    cudaCheckOnDebug();
+}
+
+// Specialization for complex<float>
+template <>
+void precondition_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& dim,
+    std::complex<float>* psi_iter,
+    const int& nbase,
+    const int& notconv,
+    const float* precondition,  
+    const float* eigenvalues)
+{
+    const int total_elements = dim * notconv;
+    const int threadsPerBlock = thread_per_block;
+    const int numBlocks = (total_elements + threadsPerBlock - 1) / threadsPerBlock;
+    
+    precondition_kernel<thrust::complex<float>><<<numBlocks, threadsPerBlock>>>(
+        dim, notconv,
+        reinterpret_cast<thrust::complex<float>*>(psi_iter),
+        nbase,
+        precondition,  // Already float
+        eigenvalues);
+
+    cudaCheckOnDebug();
+}
+
+// Specialization for complex<double>
+template <>
+void precondition_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& dim,
+    std::complex<double>* psi_iter,
+    const int& nbase,
+    const int& notconv,
+    const double* precondition,  
+    const double* eigenvalues)
+{
+    const int total_elements = dim * notconv;
+    const int threadsPerBlock = thread_per_block;
+    const int numBlocks = (total_elements + threadsPerBlock - 1) / threadsPerBlock;
+    
+    precondition_kernel<thrust::complex<double>><<<numBlocks, threadsPerBlock>>>(
+        dim, notconv,
+        reinterpret_cast<thrust::complex<double>*>(psi_iter),
+        nbase,
+        precondition,  // Already double
+        eigenvalues);
+
+    cudaCheckOnDebug();
+}
+
+// First kernel: calculate norms
+template <typename T>
+__global__ void calculate_norm_kernel(
+    const int dim,
+    const int notconv,
+    const T* psi_iter,
+    const int nbase,
+    typename GetTypeReal<T>::type* psi_norm)
+{
+    using Real = typename GetTypeReal<T>::type;
+    __shared__ Real shared_mem[256];
+
+    const int tid = threadIdx.x;
+    const int bid = blockIdx.x;
+    const int m = bid / ((dim + blockDim.x - 1) / blockDim.x);
+    const int num_blocks_per_band = (dim + blockDim.x - 1) / blockDim.x;
+    const int block_in_band = bid % num_blocks_per_band;
+
+    if (m >= notconv) return;
+
+    // Initialize shared memory
+    shared_mem[tid] = 0.0;
+
+    // Each thread processes one element
+    const int i = tid + block_in_band * blockDim.x;
+    if (i < dim) {
+        shared_mem[tid] = abs2(psi_iter[(nbase + m) * dim + i]);
+    }
+    __syncthreads();
+
+    // Parallel reduction in shared memory
+    for (int stride = blockDim.x/2; stride > 0; stride >>= 1) {
+        if (tid < stride) {
+            shared_mem[tid] += shared_mem[tid + stride];
+        }
+        __syncthreads();
+    }
+
+    // First thread of first block initializes norm to 0
+    // if (tid == 0 && block_in_band == 0) {
+    //     psi_norm[m] = 0.0;
+    // }
+    // __syncthreads();
+
+    // Write result for this block to global memory
+    if (tid == 0) {
+        atomicAdd(&psi_norm[m], shared_mem[0]);
+    }
+}
+
+// Second kernel: normalize vectors
+template <typename T>
+__global__ void normalize_vectors_kernel(
+    const int dim,
+    const int notconv,
+    T* psi_iter,
+    const int nbase,
+    const typename GetTypeReal<T>::type* psi_norm)
+{
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    const int m = tid / dim;
+    const int i = tid % dim;
+    
+    if (m < notconv && i < dim) {
+        psi_iter[(nbase + m) * dim + i] = psi_iter[(nbase + m) * dim + i] / sqrt(psi_norm[m]);
+    }
+}
+
+
+// Specialization for complex<float>
+template <>
+void normalize_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& dim,
+    std::complex<float>* psi_iter,
+    const int& nbase,
+    const int& notconv,
+    float* psi_norm)
+{
+    const int threadsPerBlock = 256;
+    const int numBlocksForNorm = ((dim + threadsPerBlock - 1) / threadsPerBlock) * notconv;
+    
+    // First kernel: calculate norms
+    calculate_norm_kernel<thrust::complex<float>><<<numBlocksForNorm, threadsPerBlock>>>(
+        dim, notconv,
+        reinterpret_cast<const thrust::complex<float>*>(psi_iter),
+        nbase, psi_norm);
+    cudaDeviceSynchronize(); // Ensure all norms are computed
+    
+    // Second kernel: normalize vectors
+    const int total_elements = dim * notconv;
+    const int numBlocksForNormalize = (total_elements + threadsPerBlock - 1) / threadsPerBlock;
+    normalize_vectors_kernel<thrust::complex<float>><<<numBlocksForNormalize, threadsPerBlock>>>(
+        dim, notconv,
+        reinterpret_cast<thrust::complex<float>*>(psi_iter),
+        nbase, psi_norm);
+    cudaCheckOnDebug();
+}
+
+// Specialization for complex<double>
+template <>
+void normalize_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& dim,
+    std::complex<double>* psi_iter,
+    const int& nbase,
+    const int& notconv,
+    double* psi_norm)
+{
+    const int threadsPerBlock = 256;
+    const int numBlocksForNorm = ((dim + threadsPerBlock - 1) / threadsPerBlock) * notconv;
+    
+    // First kernel: calculate norms
+    calculate_norm_kernel<thrust::complex<double>><<<numBlocksForNorm, threadsPerBlock>>>(
+        dim, notconv,
+        reinterpret_cast<const thrust::complex<double>*>(psi_iter),
+        nbase, psi_norm);
+    cudaDeviceSynchronize(); // Ensure all norms are computed
+    
+    // Second kernel: normalize vectors
+    const int total_elements = dim * notconv;
+    const int numBlocksForNormalize = (total_elements + threadsPerBlock - 1) / threadsPerBlock;
+    normalize_vectors_kernel<thrust::complex<double>><<<numBlocksForNormalize, threadsPerBlock>>>(
+        dim, notconv,
+        reinterpret_cast<thrust::complex<double>*>(psi_iter),
+        nbase, psi_norm);
+    cudaCheckOnDebug();
+}
+
+// Specialization for double (for LCAO)
+template <>
+void normalize_op<double, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& dim,
+    double* psi_iter,
+    const int& nbase,
+    const int& notconv,
+    double* psi_norm)
+{
+    const int threadsPerBlock = 256;
+    const int numBlocksForNorm = ((dim + threadsPerBlock - 1) / threadsPerBlock) * notconv;
+    
+    // First kernel: calculate norms
+    calculate_norm_kernel<double><<<numBlocksForNorm, threadsPerBlock>>>(
+        dim, notconv, psi_iter, nbase, psi_norm);
+    cudaDeviceSynchronize(); // Ensure all norms are computed
+    
+    // Second kernel: normalize vectors
+    const int total_elements = dim * notconv;
+    const int numBlocksForNormalize = (total_elements + threadsPerBlock - 1) / threadsPerBlock;
+    normalize_vectors_kernel<double><<<numBlocksForNormalize, threadsPerBlock>>>(
+        dim, notconv, psi_iter, nbase, psi_norm);
+    cudaCheckOnDebug();
+}
 
 // Explicitly instantiate functors for the types of functor registered.
 template struct dot_real_op<std::complex<float>, base_device::DEVICE_GPU>;
@@ -1045,6 +1401,9 @@ template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GP
 template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct matrixSetToAnother<std::complex<float>, base_device::DEVICE_GPU>;
+template struct apply_eigenvalues_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct precondition_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct normalize_op<std::complex<float>, base_device::DEVICE_GPU>;
 
 template struct dot_real_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct calc_grad_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
@@ -1054,6 +1413,9 @@ template struct vector_mul_vector_op<std::complex<double>, base_device::DEVICE_G
 template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct matrixSetToAnother<std::complex<double>, base_device::DEVICE_GPU>;
+template struct apply_eigenvalues_op<std::complex<double>, base_device::DEVICE_GPU>;
+template struct precondition_op<std::complex<double>, base_device::DEVICE_GPU>;
+template struct normalize_op<std::complex<double>, base_device::DEVICE_GPU>;
 
 #ifdef __LCAO
 template struct dot_real_op<double, base_device::DEVICE_GPU>;
@@ -1062,5 +1424,8 @@ template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<double, base_device::DEVICE_GPU>;
 template struct matrixSetToAnother<double, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>;
+template struct apply_eigenvalues_op<double, base_device::DEVICE_GPU>;
+template struct precondition_op<double, base_device::DEVICE_GPU>;
+template struct normalize_op<double, base_device::DEVICE_GPU>;
 #endif
 }  // namespace hsolver
diff --git a/source/module_hsolver/kernels/math_kernel_op.cpp b/source/module_hsolver/kernels/math_kernel_op.cpp
index c3784949ab..87cd549482 100644
--- a/source/module_hsolver/kernels/math_kernel_op.cpp
+++ b/source/module_hsolver/kernels/math_kernel_op.cpp
@@ -2,6 +2,7 @@
 
 #include <iomanip>
 #include <iostream>
+#include <vector>
 
 namespace hsolver
 {
@@ -173,6 +174,51 @@ struct vector_mul_vector_op<T, base_device::DEVICE_CPU>
     }
 };
 
+template <typename T>
+struct apply_eigenvalues_op<T, base_device::DEVICE_CPU>
+{
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(const base_device::DEVICE_CPU* d, const int& nbase, const int& nbase_x, const int& notconv, T* result, const T* vectors, const Real* eigenvalues)
+    {
+        for (int m = 0; m < notconv; m++)
+        {
+            for (int idx = 0; idx < nbase; idx++)
+            {
+                result[m * nbase_x + idx] = eigenvalues[m] * vectors[m * nbase_x + idx];
+            }
+        }
+    }
+};
+
+template <typename T>
+struct precondition_op<T, base_device::DEVICE_CPU> {
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(const base_device::DEVICE_CPU* d,
+                   const int& dim,
+                   T* psi_iter,
+                   const int& nbase,
+                   const int& notconv,
+                   const Real* precondition,
+                   const Real* eigenvalues)
+    {
+        using Real = typename GetTypeReal<T>::type;
+        std::vector<Real> pre(dim, 0.0);
+        for (int m = 0; m < notconv; m++)
+        {
+            for (size_t i = 0; i < dim; i++)
+            {
+                Real x = std::abs(precondition[i] - eigenvalues[m]);
+                pre[i] = 0.5 * (1.0 + x + sqrt(1 + (x - 1.0) * (x - 1.0)));
+            }
+            vector_div_vector_op<T, base_device::DEVICE_CPU>()(d,
+                                                             dim,
+                                                             psi_iter + (nbase + m) * dim,
+                                                             psi_iter + (nbase + m) * dim,
+                                                             pre.data());
+        }
+    }
+};
+
 template <typename T>
 struct vector_div_vector_op<T, base_device::DEVICE_CPU>
 {
@@ -355,6 +401,40 @@ struct matrixSetToAnother<T, base_device::DEVICE_CPU>
     }
 };
 
+template <typename T>
+struct normalize_op<T, base_device::DEVICE_CPU> {
+    void operator()(const base_device::DEVICE_CPU* d,
+                   const int& dim,
+                   T* psi_iter,
+                   const int& nbase,
+                   const int& notconv,
+                   typename GetTypeReal<T>::type* psi_norm = nullptr)
+    {
+        using Real = typename GetTypeReal<T>::type;
+        for (int m = 0; m < notconv; m++)
+        {
+            // Calculate norm using dot_real_op
+            Real psi_m_norm = dot_real_op<T, base_device::DEVICE_CPU>()(d,
+                                                                dim,
+                                                                psi_iter + (nbase + m) * dim,
+                                                                psi_iter + (nbase + m) * dim,
+                                                                true);
+            assert(psi_m_norm > 0.0);
+            psi_m_norm = sqrt(psi_m_norm);
+
+            // Normalize using vector_div_constant_op
+            vector_div_constant_op<T, base_device::DEVICE_CPU>()(d,
+                                                              dim,
+                                                              psi_iter + (nbase + m) * dim,
+                                                              psi_iter + (nbase + m) * dim,
+                                                              psi_m_norm);
+            if (psi_norm) {
+                psi_norm[m] = psi_m_norm;
+            }
+        }
+    }
+};
+
 // Explicitly instantiate functors for the types of functor registered.
 template struct scal_op<float, base_device::DEVICE_CPU>;
 template struct axpy_op<std::complex<float>, base_device::DEVICE_CPU>;
@@ -365,6 +445,9 @@ template struct gemm_op<float, base_device::DEVICE_CPU>;
 template struct dot_real_op<std::complex<float>, base_device::DEVICE_CPU>;
 template struct vector_div_constant_op<std::complex<float>, base_device::DEVICE_CPU>;
 template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_CPU>;
+template struct apply_eigenvalues_op<std::complex<float>, base_device::DEVICE_CPU>;
+template struct precondition_op<std::complex<float>, base_device::DEVICE_CPU>;
+template struct normalize_op<std::complex<float>, base_device::DEVICE_CPU>;
 template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_CPU>;
 template struct constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_CPU>;
 template struct matrixTranspose_op<std::complex<float>, base_device::DEVICE_CPU>;
@@ -381,6 +464,9 @@ template struct gemm_op<double, base_device::DEVICE_CPU>;
 template struct dot_real_op<std::complex<double>, base_device::DEVICE_CPU>;
 template struct vector_div_constant_op<std::complex<double>, base_device::DEVICE_CPU>;
 template struct vector_mul_vector_op<std::complex<double>, base_device::DEVICE_CPU>;
+template struct apply_eigenvalues_op<std::complex<double>, base_device::DEVICE_CPU>;
+template struct precondition_op<std::complex<double>, base_device::DEVICE_CPU>;
+template struct normalize_op<std::complex<double>, base_device::DEVICE_CPU>;
 template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_CPU>;
 template struct constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_CPU>;
 template struct matrixTranspose_op<std::complex<double>, base_device::DEVICE_CPU>;
@@ -392,11 +478,14 @@ template struct line_minimize_with_block_op<std::complex<double>, base_device::D
 template struct axpy_op<double, base_device::DEVICE_CPU>;
 template struct dot_real_op<double, base_device::DEVICE_CPU>;
 template struct vector_mul_vector_op<double, base_device::DEVICE_CPU>;
+template struct apply_eigenvalues_op<double, base_device::DEVICE_CPU>;
+template struct precondition_op<double, base_device::DEVICE_CPU>;
 template struct vector_div_constant_op<double, base_device::DEVICE_CPU>;
 template struct vector_div_vector_op<double, base_device::DEVICE_CPU>;
 template struct matrixTranspose_op<double, base_device::DEVICE_CPU>;
 template struct matrixSetToAnother<double, base_device::DEVICE_CPU>;
 template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_CPU>;
+template struct normalize_op<double, base_device::DEVICE_CPU>;
 #endif
 #ifdef __DSP
 template struct gemm_op_mt<std::complex<float>, base_device::DEVICE_CPU>;
diff --git a/source/module_hsolver/kernels/math_kernel_op.h b/source/module_hsolver/kernels/math_kernel_op.h
index 0daf0e5718..8318c4310b 100644
--- a/source/module_hsolver/kernels/math_kernel_op.h
+++ b/source/module_hsolver/kernels/math_kernel_op.h
@@ -325,6 +325,48 @@ template <typename T, typename Device> struct matrixSetToAnother {
                   T *B, const int &LDB);
 };
 
+template <typename T, typename Device>
+struct apply_eigenvalues_op {
+    using Real = typename GetTypeReal<T>::type;
+
+    void operator()(const Device *d, const int &nbase, const int &nbase_x, const int &notconv,
+                    T *result, const T *vectors, const Real *eigenvalues);
+};
+
+template <typename T, typename Device>
+struct precondition_op {
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(const Device* d,
+                   const int& dim,
+                   T* psi_iter,
+                   const int& nbase,
+                   const int& notconv,
+                   const Real* precondition,  
+                   const Real* eigenvalues);
+};
+
+template <typename T, typename Device>
+struct normalize_op {
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(const Device* d,
+                   const int& dim,
+                   T* psi_iter,
+                   const int& nbase,
+                   const int& notconv,
+                   Real* psi_norm = nullptr);
+};
+
+template <typename T>
+struct normalize_op<T, base_device::DEVICE_GPU> {
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(const base_device::DEVICE_GPU* d,
+                   const int& dim,
+                   T* psi_iter,
+                   const int& nbase,
+                   const int& notconv,
+                   Real* psi_norm);
+};
+
 #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
 
 template <typename T>
@@ -393,6 +435,26 @@ template <typename T> struct matrixSetToAnother<T, base_device::DEVICE_GPU> {
 void createGpuBlasHandle();
 void destoryBLAShandle();
 
+// vector operator: result[i] = -lambda[i] * vector[i]
+template <typename T> struct apply_eigenvalues_op<T, base_device::DEVICE_GPU> {
+    using Real = typename GetTypeReal<T>::type;
+
+    void operator()(const base_device::DEVICE_GPU *d, const int &nbase, const int &nbase_x, const int &notconv,
+                    T *result, const T *vectors, const Real *eigenvalues);
+};
+
+template <typename T>
+struct precondition_op<T, base_device::DEVICE_GPU> {
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(const base_device::DEVICE_GPU* d,
+                   const int& dim,
+                   T* psi_iter,
+                   const int& nbase,
+                   const int& notconv,
+                   const Real* precondition,
+                   const Real* eigenvalues);
+};
+
 #endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
 } // namespace hsolver
 
diff --git a/source/module_hsolver/kernels/rocm/dngvd_op.hip.cu b/source/module_hsolver/kernels/rocm/dngvd_op.hip.cu
index 5eeb1697e8..8cba06db1b 100644
--- a/source/module_hsolver/kernels/rocm/dngvd_op.hip.cu
+++ b/source/module_hsolver/kernels/rocm/dngvd_op.hip.cu
@@ -5,12 +5,25 @@
 
 namespace hsolver {
 
+// NOTE: mimicked from ../cuda/dngvd_op.cu for three dngvd_op
+
+static hipsolverHandle_t hipsolver_H = nullptr;
+// Test on DCU platform. When nstart is greater than 234, code on DCU performs better. 
+const int N_DCU = 234;
+
 void createGpuSolverHandle() {
-    return;
+    if (hipsolver_H == nullptr)
+    {
+        hipsolverErrcheck(hipsolverCreate(&hipsolver_H));
+    }
 }
 
 void destroyGpuSolverHandle() {
-    return;
+    if (hipsolver_H != nullptr)
+    {
+        hipsolverErrcheck(hipsolverDestroy(hipsolver_H));
+        hipsolver_H = nullptr;
+    }
 }
 
 #ifdef __LCAO
@@ -23,22 +36,68 @@ void dngvd_op<double, base_device::DEVICE_GPU>::operator()(const base_device::DE
                                                            double* _eigenvalue,
                                                            double* _vcc)
 {
-    std::vector<double> hcc(nstart * nstart, 0.0);
-    std::vector<double> scc(nstart * nstart, 0.0);
-    std::vector<double> vcc(nstart * nstart, 0.0);
-    std::vector<double> eigenvalue(nstart, 0);
-    hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(double) * hcc.size(), hipMemcpyDeviceToHost));
-    hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(double) * scc.size(), hipMemcpyDeviceToHost));
-    base_device::DEVICE_CPU* cpu_ctx = {};
-    dngvd_op<double, base_device::DEVICE_CPU>()(cpu_ctx,
-                                                nstart,
-                                                ldh,
-                                                hcc.data(),
-                                                scc.data(),
-                                                eigenvalue.data(),
-                                                vcc.data());
-    hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(double) * vcc.size(), hipMemcpyHostToDevice));
-    hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(double) * eigenvalue.size(), hipMemcpyHostToDevice));
+    // copied from ../cuda/dngvd_op.cu, "dngvd_op"
+    assert(nstart == ldh);
+
+    if (nstart > N_DCU){
+        hipErrcheck(hipMemcpy(_vcc, _hcc, sizeof(double) * ldh * nstart, hipMemcpyDeviceToDevice));
+        // now vcc contains hcc
+
+        // prepare some values for hipsolverDnZhegvd_bufferSize
+        int * devInfo = nullptr;
+        int lwork = 0, info_gpu = 0;
+        double * work = nullptr;
+        hipErrcheck(hipMalloc((void**)&devInfo, sizeof(int)));
+        hipsolverFillMode_t uplo = HIPSOLVER_FILL_MODE_UPPER;
+
+        // calculate the sizes needed for pre-allocated buffer.
+        hipsolverErrcheck(hipsolverDnDsygvd_bufferSize(
+            hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
+            nstart,
+            _vcc, ldh,
+            _scc, ldh,
+            _eigenvalue,
+            &lwork));
+
+        // allocate memery
+        hipErrcheck(hipMalloc((void**)&work, sizeof(double) * lwork));
+
+        // compute eigenvalues and eigenvectors.
+        hipsolverErrcheck(hipsolverDnDsygvd(
+            hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
+            nstart,
+            _vcc, ldh,
+            const_cast<double *>(_scc), ldh,
+            _eigenvalue,
+            work, lwork, devInfo));
+
+        hipErrcheck(hipMemcpy(&info_gpu, devInfo, sizeof(int), hipMemcpyDeviceToHost));
+
+        // free the buffer
+        hipErrcheck(hipFree(work));
+        hipErrcheck(hipFree(devInfo));
+    }
+    // if(fail_info != nullptr) *fail_info = info_gpu;
+    else{
+        std::vector<double> hcc(nstart * nstart, 0.0);
+        std::vector<double> scc(nstart * nstart, 0.0);
+        std::vector<double> vcc(nstart * nstart, 0.0);
+        std::vector<double> eigenvalue(nstart, 0);
+        hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(double) * hcc.size(), hipMemcpyDeviceToHost));
+        hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(double) * scc.size(), hipMemcpyDeviceToHost));
+        base_device::DEVICE_CPU* cpu_ctx = {};
+        dngvd_op<double, base_device::DEVICE_CPU>()(cpu_ctx,
+                                                   nstart,
+                                                   ldh,
+                                                   hcc.data(),
+                                                   scc.data(),
+                                                   eigenvalue.data(),
+                                                   vcc.data());
+        hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(double) * vcc.size(), hipMemcpyHostToDevice));
+        hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(double) * eigenvalue.size(), hipMemcpyHostToDevice));
+    }
+
+    
 }
 #endif // __LCAO
 
@@ -51,22 +110,67 @@ void dngvd_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const ba
                                                                         float* _eigenvalue,
                                                                         std::complex<float>* _vcc)
 {
-    std::vector<std::complex<float>> hcc(nstart * nstart, {0, 0});
-    std::vector<std::complex<float>> scc(nstart * nstart, {0, 0});
-    std::vector<std::complex<float>> vcc(nstart * nstart, {0, 0});
-    std::vector<float> eigenvalue(nstart, 0);
-    hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(std::complex<float>) * hcc.size(), hipMemcpyDeviceToHost));
-    hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(std::complex<float>) * scc.size(), hipMemcpyDeviceToHost));
-    base_device::DEVICE_CPU* cpu_ctx = {};
-    dngvd_op<std::complex<float>, base_device::DEVICE_CPU>()(cpu_ctx,
-                                                             nstart,
-                                                             ldh,
-                                                             hcc.data(),
-                                                             scc.data(),
-                                                             eigenvalue.data(),
-                                                             vcc.data());
-    hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(std::complex<float>) * vcc.size(), hipMemcpyHostToDevice));
-    hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(float) * eigenvalue.size(), hipMemcpyHostToDevice));
+    // copied from ../cuda/dngvd_op.cu, "dngvd_op"
+    assert(nstart == ldh);
+    
+    if (nstart > N_DCU){
+        hipErrcheck(hipMemcpy(_vcc, _hcc, sizeof(std::complex<float>) * ldh * nstart, hipMemcpyDeviceToDevice));
+        // now vcc contains hcc
+
+        // prepare some values for hipsolverDnZhegvd_bufferSize
+        int * devInfo = nullptr;
+        int lwork = 0, info_gpu = 0;
+        float2 * work = nullptr;
+        hipErrcheck(hipMalloc((void**)&devInfo, sizeof(int)));
+        hipsolverFillMode_t uplo = HIPSOLVER_FILL_MODE_UPPER;
+
+        // calculate the sizes needed for pre-allocated buffer.
+        hipsolverErrcheck(hipsolverDnChegvd_bufferSize(
+            hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
+            nstart,
+            reinterpret_cast<const float2 *>(_vcc), ldh,
+            reinterpret_cast<const float2 *>(_scc), ldh,
+            _eigenvalue,
+            &lwork));
+
+        // allocate memery
+        hipErrcheck(hipMalloc((void**)&work, sizeof(float2) * lwork));
+
+        // compute eigenvalues and eigenvectors.
+        hipsolverErrcheck(hipsolverDnChegvd(
+            hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
+            nstart,
+            reinterpret_cast<float2 *>(_vcc), ldh,
+            const_cast<float2 *>(reinterpret_cast<const float2 *>(_scc)), ldh,
+            _eigenvalue,
+            work, lwork, devInfo));
+
+        hipErrcheck(hipMemcpy(&info_gpu, devInfo, sizeof(int), hipMemcpyDeviceToHost));
+        // free the buffer
+        hipErrcheck(hipFree(work));
+        hipErrcheck(hipFree(devInfo));
+    }
+    // if(fail_info != nullptr) *fail_info = info_gpu;
+    else{
+        std::vector<std::complex<float>> hcc(nstart * nstart, {0, 0});
+        std::vector<std::complex<float>> scc(nstart * nstart, {0, 0});
+        std::vector<std::complex<float>> vcc(nstart * nstart, {0, 0});
+        std::vector<float> eigenvalue(nstart, 0);
+        hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(std::complex<float>) * hcc.size(), hipMemcpyDeviceToHost));
+        hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(std::complex<float>) * scc.size(), hipMemcpyDeviceToHost));
+        base_device::DEVICE_CPU* cpu_ctx = {};
+        dngvd_op<std::complex<float>, base_device::DEVICE_CPU>()(cpu_ctx,
+                                                                nstart,
+                                                                ldh,
+                                                                hcc.data(),
+                                                                scc.data(),
+                                                                eigenvalue.data(),
+                                                                vcc.data());
+        hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(std::complex<float>) * vcc.size(), hipMemcpyHostToDevice));
+        hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(float) * eigenvalue.size(), hipMemcpyHostToDevice));
+    }
+
+    
 }
 
 template <>
@@ -76,24 +180,80 @@ void dngvd_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(const b
                                                                          const std::complex<double>* _hcc,
                                                                          const std::complex<double>* _scc,
                                                                          double* _eigenvalue,
-                                                                         std::complex<double>* _vcc)
+                                                                         std::complex<double>* _vcc
+                                                                        )
 {
-    std::vector<std::complex<double>> hcc(nstart * nstart, {0, 0});
-    std::vector<std::complex<double>> scc(nstart * nstart, {0, 0});
-    std::vector<std::complex<double>> vcc(nstart * nstart, {0, 0});
-    std::vector<double> eigenvalue(nstart, 0);
-    hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(std::complex<double>) * hcc.size(), hipMemcpyDeviceToHost));
-    hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(std::complex<double>) * scc.size(), hipMemcpyDeviceToHost));
-    base_device::DEVICE_CPU* cpu_ctx = {};
-    dngvd_op<std::complex<double>, base_device::DEVICE_CPU>()(cpu_ctx,
-                                                              nstart,
-                                                              ldh,
-                                                              hcc.data(),
-                                                              scc.data(),
-                                                              eigenvalue.data(),
-                                                              vcc.data());
-    hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(std::complex<double>) * vcc.size(), hipMemcpyHostToDevice));
-    hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(double) * eigenvalue.size(), hipMemcpyHostToDevice));
+    // copied from ../cuda/dngvd_op.cu, "dngvd_op"
+    assert(nstart == ldh);
+
+    // save a copy of scc in case the diagonalization fails
+    if (nstart > N_DCU){
+        std::vector<std::complex<double>> scc(nstart * nstart, {0, 0});
+        hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(std::complex<double>) * scc.size(), hipMemcpyDeviceToHost));
+
+        hipErrcheck(hipMemcpy(_vcc, _hcc, sizeof(std::complex<double>) * ldh * nstart, hipMemcpyDeviceToDevice));
+
+        // now vcc contains hcc
+
+        // prepare some values for hipsolverDnZhegvd_bufferSize
+        int * devInfo = nullptr;
+        int lwork = 0, info_gpu = 0;
+        double2 * work = nullptr;
+        hipErrcheck(hipMalloc((void**)&devInfo, sizeof(int)));
+        hipsolverFillMode_t uplo = HIPSOLVER_FILL_MODE_UPPER;
+
+        // calculate the sizes needed for pre-allocated buffer.
+        hipsolverErrcheck(hipsolverDnZhegvd_bufferSize(
+            hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
+            nstart,
+            reinterpret_cast<const double2 *>(_vcc), ldh,
+            reinterpret_cast<const double2 *>(_scc), ldh,
+            _eigenvalue,
+            &lwork));
+
+        // allocate memery
+        hipErrcheck(hipMalloc((void**)&work, sizeof(double2) * lwork));
+
+        // compute eigenvalues and eigenvectors.
+        hipsolverErrcheck(hipsolverDnZhegvd(
+            hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
+            nstart,
+            reinterpret_cast<double2 *>(_vcc), ldh,
+            const_cast<double2 *>(reinterpret_cast<const double2 *>(_scc)), ldh,
+            _eigenvalue,
+            work, lwork, devInfo));
+
+        hipErrcheck(hipMemcpy(&info_gpu, devInfo, sizeof(int), hipMemcpyDeviceToHost));
+        // free the buffer
+        hipErrcheck(hipFree(work));
+        hipErrcheck(hipFree(devInfo));
+    }
+    // if(fail_info != nullptr) *fail_info = info_gpu;
+    else{
+        std::vector<std::complex<double>> hcc(nstart * nstart, {0, 0});
+        std::vector<std::complex<double>> scc(nstart * nstart, {0, 0});
+        std::vector<std::complex<double>> vcc(nstart * nstart, {0, 0});
+        std::vector<double> eigenvalue(nstart, 0);
+        hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(std::complex<double>) * hcc.size(), hipMemcpyDeviceToHost));
+        hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(std::complex<double>) * scc.size(), hipMemcpyDeviceToHost));
+        base_device::DEVICE_CPU* cpu_ctx = {};
+        dngvd_op<std::complex<double>, base_device::DEVICE_CPU>()(cpu_ctx,
+                                                                nstart,
+                                                                ldh,
+                                                                hcc.data(),
+                                                                scc.data(),
+                                                                eigenvalue.data(),
+                                                                vcc.data());
+        hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(std::complex<double>) * vcc.size(), hipMemcpyHostToDevice));
+        hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(double) * eigenvalue.size(), hipMemcpyHostToDevice));
+    }
+
+
+
+
+
+
+    
 }
 
 #ifdef __LCAO
diff --git a/source/module_hsolver/kernels/rocm/math_kernel_op.hip.cu b/source/module_hsolver/kernels/rocm/math_kernel_op.hip.cu
index ef5a1c1ece..4ca6af509c 100644
--- a/source/module_hsolver/kernels/rocm/math_kernel_op.hip.cu
+++ b/source/module_hsolver/kernels/rocm/math_kernel_op.hip.cu
@@ -11,6 +11,7 @@
 #define WARP_SIZE 32
 #define FULL_MASK 0xffffffff
 #define THREAD_PER_BLOCK 256
+
 template <>
 struct GetTypeReal<thrust::complex<float>> {
     using type = float; /**< The return type specialization for std::complex<double>. */
@@ -20,6 +21,49 @@ struct GetTypeReal<thrust::complex<double>> {
     using type = double; /**< The return type specialization for std::complex<double>. */
 };
 
+// Forward declarations for abs2
+template<typename T>
+__device__ typename GetTypeReal<T>::type abs2(const T& x);
+
+// Specialization for real types (double)
+template<>
+__device__ double abs2(const double& x) {
+    return x * x;
+}
+
+// Specialization for real types (float)
+template<>
+__device__ float abs2(const float& x) {
+    return x * x;
+}
+
+// Specialization for complex float
+template<>
+__device__ float abs2(const thrust::complex<float>& x) {
+    return x.real() * x.real() + x.imag() * x.imag();
+}
+
+// Specialization for complex double
+template<>
+__device__ double abs2(const thrust::complex<double>& x) {
+    return x.real() * x.real() + x.imag() * x.imag();
+}
+
+// Specialization for std::complex<float> (for interface compatibility)
+template<>
+__device__ float abs2(const std::complex<float>& x) {
+    const thrust::complex<float>* tx = reinterpret_cast<const thrust::complex<float>*>(&x);
+    return tx->real() * tx->real() + tx->imag() * tx->imag();
+}
+
+// Specialization for std::complex<double> (for interface compatibility)
+template<>
+__device__ double abs2(const std::complex<double>& x) {
+    const thrust::complex<double>* tx = reinterpret_cast<const thrust::complex<double>*>(&x);
+    return tx->real() * tx->real() + tx->imag() * tx->imag();
+}
+
+
 namespace hsolver {
 
 template <typename T>
@@ -943,7 +987,309 @@ void matrixSetToAnother<std::complex<double>, base_device::DEVICE_GPU>::operator
     hipCheckOnDebug();
 }
 
+template <typename T>
+__launch_bounds__(256)
+__global__ void apply_eigenvalues_kernel(
+    const typename GetTypeReal<T>::type* eigenvalues,
+    const T* vectors,
+    T* result,
+    const int nbase,
+    const int nbase_x,
+    const int notconv)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    const int total_elements = notconv * nbase;
+    if (i < total_elements)
+    {
+        const int row = i / nbase;
+        const int col = i % nbase;
+        result[row * nbase_x + col] = eigenvalues[row] * vectors[row * nbase_x + col];
+    }
+}
+
+template <typename FPTYPE>
+inline void apply_eigenvalues_complex_wrapper(const base_device::DEVICE_GPU* d,
+                                            const int& nbase,
+                                            const int& nbase_x,
+                                            const int& notconv,
+                                            const FPTYPE* eigenvalues,
+                                            const std::complex<FPTYPE>* vectors,
+                                            std::complex<FPTYPE>* result)
+{
+    thrust::complex<FPTYPE>* result_tmp = reinterpret_cast<thrust::complex<FPTYPE>*>(result);
+    const thrust::complex<FPTYPE>* vectors_tmp = reinterpret_cast<const thrust::complex<FPTYPE>*>(vectors);
+    const int total_elements = notconv * nbase;
+    int thread = 256;
+    int block = (total_elements + thread - 1) / thread;
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_eigenvalues_kernel<thrust::complex<FPTYPE>>), dim3(block), dim3(thread), 0, 0,
+        eigenvalues, vectors_tmp, result_tmp, nbase, nbase_x, notconv);
+
+    hipCheckOnDebug();
+}
+
+template <>
+void apply_eigenvalues_op<double, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* d,
+                                                                      const int& nbase,
+                                                                      const int& nbase_x,
+                                                                      const int& notconv,
+                                                                      double* result,
+                                                                      const double* vectors,
+                                                                      const double* eigenvalues)
+{
+    const int total_elements = notconv * nbase;
+    int thread = 256;
+    int block = (total_elements + thread - 1) / thread;
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_eigenvalues_kernel<double>), dim3(block), dim3(thread), 0, 0,
+        eigenvalues, vectors, result, nbase, nbase_x, notconv);
+
+    hipCheckOnDebug();
+}
 
+template <>
+void apply_eigenvalues_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* d,
+                                                                                   const int& nbase,
+                                                                                   const int& nbase_x,
+                                                                                   const int& notconv,
+                                                                                   std::complex<float>* result,
+                                                                                   const std::complex<float>* vectors,
+                                                                                   const float* eigenvalues)
+{
+    apply_eigenvalues_complex_wrapper(d, nbase, nbase_x, notconv, eigenvalues, vectors, result);
+}
+
+template <>
+void apply_eigenvalues_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* d,
+                                                                                    const int& nbase,
+                                                                                    const int& nbase_x,
+                                                                                    const int& notconv,
+                                                                                    std::complex<double>* result,
+                                                                                    const std::complex<double>* vectors,
+                                                                                    const double* eigenvalues)
+{
+    apply_eigenvalues_complex_wrapper(d, nbase, nbase_x, notconv, eigenvalues, vectors, result);
+}
+
+template <typename T>
+__launch_bounds__(256)
+__global__ void precondition_kernel(
+    const int dim,
+    const int notconv,
+    T* psi_iter,
+    const int nbase,
+    const typename GetTypeReal<T>::type* precondition,  // Real type
+    const typename GetTypeReal<T>::type* eigenvalues)
+{
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid < dim * notconv) {
+        const int i = tid % dim;    // Basis index
+        const int m = tid / dim;    // Band index
+        
+        using Real = typename GetTypeReal<T>::type;
+        Real x = abs(precondition[i] - eigenvalues[m]);
+        Real pre = 0.5 * (1.0 + x + sqrt(1.0 + (x - 1.0) * (x - 1.0)));
+        psi_iter[(nbase + m) * dim + i] /= pre;
+    }
+}
+
+template <>
+void precondition_op<double, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& dim,
+    double* psi_iter,
+    const int& nbase,
+    const int& notconv,
+    const double* precondition,
+    const double* eigenvalues)
+{
+    const int total_elements = dim * notconv;
+    int thread = 256;
+    int block = (total_elements + thread - 1) / thread;
+    
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(precondition_kernel<double>), dim3(block), dim3(thread), 0, 0,
+        dim, notconv, psi_iter, nbase, precondition, eigenvalues);
+
+    hipCheckOnDebug();
+}
+
+template <>
+void precondition_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& dim,
+    std::complex<float>* psi_iter,
+    const int& nbase,
+    const int& notconv,
+    const float* precondition,
+    const float* eigenvalues)
+{
+    const int total_elements = dim * notconv;
+    int thread = 256;
+    int block = (total_elements + thread - 1) / thread;
+    
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(precondition_kernel<thrust::complex<float>>), dim3(block), dim3(thread), 0, 0,
+        dim, notconv, reinterpret_cast<thrust::complex<float>*>(psi_iter), nbase, precondition, eigenvalues);
+
+    hipCheckOnDebug();
+}
+
+template <>
+void precondition_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& dim,
+    std::complex<double>* psi_iter,
+    const int& nbase,
+    const int& notconv,
+    const double* precondition,
+    const double* eigenvalues)
+{
+    const int total_elements = dim * notconv;
+    int thread = 256;
+    int block = (total_elements + thread - 1) / thread;
+    
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(precondition_kernel<thrust::complex<double>>), dim3(block), dim3(thread), 0, 0,
+        dim, notconv, reinterpret_cast<thrust::complex<double>*>(psi_iter), nbase, precondition, eigenvalues);
+
+    hipCheckOnDebug();
+}
+
+template <typename T>
+__launch_bounds__(256)
+__global__ void calculate_norm_kernel(
+    const int dim,
+    const int notconv,
+    const T* psi_iter,
+    const int nbase,
+    typename GetTypeReal<T>::type* psi_norm)
+{
+    using Real = typename GetTypeReal<T>::type;
+    __shared__ Real shared_mem[256];
+
+    const int tid = threadIdx.x;
+    const int bid = blockIdx.x;
+    const int m = bid / ((dim + blockDim.x - 1) / blockDim.x);
+    const int num_blocks_per_band = (dim + blockDim.x - 1) / blockDim.x;
+    const int block_in_band = bid % num_blocks_per_band;
+
+    if (m >= notconv) return;
+
+    // Initialize shared memory
+    shared_mem[tid] = 0.0;
+
+    // Each thread processes one element
+    const int i = tid + block_in_band * blockDim.x;
+    if (i < dim) {
+        shared_mem[tid] = abs2(psi_iter[(nbase + m) * dim + i]);
+    }
+    __syncthreads();
+
+    // Parallel reduction in shared memory
+    for (int stride = blockDim.x/2; stride > 0; stride >>= 1) {
+        if (tid < stride) {
+            shared_mem[tid] += shared_mem[tid + stride];
+        }
+        __syncthreads();
+    }
+
+    // Write result for this block to global memory
+    if (tid == 0) {
+        atomicAdd(&psi_norm[m], shared_mem[0]);
+    }
+}
+
+template <typename T>
+__launch_bounds__(256)
+__global__ void normalize_vectors_kernel(
+    const int dim,
+    const int notconv,
+    T* psi_iter,
+    const int nbase,
+    const typename GetTypeReal<T>::type* psi_norm)
+{
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    const int m = tid / dim;
+    const int i = tid % dim;
+    
+    if (m < notconv && i < dim) {
+        psi_iter[(nbase + m) * dim + i] = psi_iter[(nbase + m) * dim + i] / sqrt(psi_norm[m]);
+    }
+}
+
+template <>
+void normalize_op<double, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& dim,
+    double* psi_iter,
+    const int& nbase,
+    const int& notconv,
+    double* psi_norm)
+{
+    const int thread = 256;
+    const int numBlocksForNorm = ((dim + thread - 1) / thread) * notconv;
+    
+    // First kernel: calculate norms
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(calculate_norm_kernel<double>), dim3(numBlocksForNorm), dim3(thread), 0, 0,
+        dim, notconv, psi_iter, nbase, psi_norm);
+    hipDeviceSynchronize(); // Ensure all norms are computed
+    
+    // Second kernel: normalize vectors
+    const int total_elements = dim * notconv;
+    const int numBlocksForNormalize = (total_elements + thread - 1) / thread;
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(normalize_vectors_kernel<double>), dim3(numBlocksForNormalize), dim3(thread), 0, 0,
+        dim, notconv, psi_iter, nbase, psi_norm);
+
+    hipCheckOnDebug();
+}
+
+template <>
+void normalize_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& dim,
+    std::complex<float>* psi_iter,
+    const int& nbase,
+    const int& notconv,
+    float* psi_norm)
+{
+    const int thread = 256;
+    const int numBlocksForNorm = ((dim + thread - 1) / thread) * notconv;
+    
+    // First kernel: calculate norms
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(calculate_norm_kernel<thrust::complex<float>>), dim3(numBlocksForNorm), dim3(thread), 0, 0,
+        dim, notconv, reinterpret_cast<const thrust::complex<float>*>(psi_iter), nbase, psi_norm);
+    hipDeviceSynchronize(); // Ensure all norms are computed
+    
+    // Second kernel: normalize vectors
+    const int total_elements = dim * notconv;
+    const int numBlocksForNormalize = (total_elements + thread - 1) / thread;
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(normalize_vectors_kernel<thrust::complex<float>>), dim3(numBlocksForNormalize), dim3(thread), 0, 0,
+        dim, notconv, reinterpret_cast<thrust::complex<float>*>(psi_iter), nbase, psi_norm);
+
+    hipCheckOnDebug();
+}
+
+template <>
+void normalize_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& dim,
+    std::complex<double>* psi_iter,
+    const int& nbase,
+    const int& notconv,
+    double* psi_norm)
+{
+    const int thread = 256;
+    const int numBlocksForNorm = ((dim + thread - 1) / thread) * notconv;
+    
+    // First kernel: calculate norms
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(calculate_norm_kernel<thrust::complex<double>>), dim3(numBlocksForNorm), dim3(thread), 0, 0,
+        dim, notconv, reinterpret_cast<const thrust::complex<double>*>(psi_iter), nbase, psi_norm);
+    hipDeviceSynchronize(); // Ensure all norms are computed
+    
+    // Second kernel: normalize vectors
+    const int total_elements = dim * notconv;
+    const int numBlocksForNormalize = (total_elements + thread - 1) / thread;
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(normalize_vectors_kernel<thrust::complex<double>>), dim3(numBlocksForNormalize), dim3(thread), 0, 0,
+        dim, notconv, reinterpret_cast<thrust::complex<double>*>(psi_iter), nbase, psi_norm);
+
+    hipCheckOnDebug();
+}
 
 // Explicitly instantiate functors for the types of functor registered.
 template struct dot_real_op<std::complex<float>, base_device::DEVICE_GPU>;
@@ -951,6 +1297,9 @@ template struct calc_grad_with_block_op<std::complex<float>, base_device::DEVICE
 template struct line_minimize_with_block_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct vector_div_constant_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct apply_eigenvalues_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct precondition_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct normalize_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct matrixSetToAnother<std::complex<float>, base_device::DEVICE_GPU>;
@@ -960,6 +1309,9 @@ template struct calc_grad_with_block_op<std::complex<double>, base_device::DEVIC
 template struct line_minimize_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct vector_div_constant_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct vector_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
+template struct apply_eigenvalues_op<std::complex<double>, base_device::DEVICE_GPU>;
+template struct precondition_op<std::complex<double>, base_device::DEVICE_GPU>;
+template struct normalize_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct matrixSetToAnother<std::complex<double>, base_device::DEVICE_GPU>;
@@ -968,8 +1320,11 @@ template struct matrixSetToAnother<std::complex<double>, base_device::DEVICE_GPU
 template struct dot_real_op<double, base_device::DEVICE_GPU>;
 template struct vector_div_constant_op<double, base_device::DEVICE_GPU>;
 template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
+template struct apply_eigenvalues_op<double, base_device::DEVICE_GPU>;
+template struct precondition_op<double, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<double, base_device::DEVICE_GPU>;
 template struct matrixSetToAnother<double, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>;
+template struct normalize_op<double, base_device::DEVICE_GPU>;
 #endif
 }  // namespace hsolver
diff --git a/source/module_hsolver/test/test_hsolver_pw.cpp b/source/module_hsolver/test/test_hsolver_pw.cpp
index 5763f1b7d8..1345dfc99e 100644
--- a/source/module_hsolver/test/test_hsolver_pw.cpp
+++ b/source/module_hsolver/test/test_hsolver_pw.cpp
@@ -14,6 +14,68 @@
 #include "module_hsolver/hsolver_pw.h"
 #undef private
 #undef protected
+
+// Mock implementations for the template functions causing linking errors
+namespace ModulePW {
+    // Mock implementation for recip_to_real
+    template<typename FPTYPE, typename Device>
+    void PW_Basis_K::recip_to_real(const Device* ctx,
+                                  const std::complex<FPTYPE>* in,
+                                  std::complex<FPTYPE>* out,
+                                  const int ik,
+                                  const bool add,
+                                  const FPTYPE factor) const
+    {
+        // Simple mock implementation that does nothing
+        // In a real test, you might want to implement behavior that simulates the actual function
+    }
+
+    // Mock implementation for real_to_recip
+    template<typename FPTYPE, typename Device>
+    void PW_Basis_K::real_to_recip(const Device* ctx,
+                                  const std::complex<FPTYPE>* in,
+                                  std::complex<FPTYPE>* out,
+                                  const int ik,
+                                  const bool add,
+                                  const FPTYPE factor) const
+    {
+        // Simple mock implementation that does nothing
+    }
+
+    // Explicit template instantiations
+    template void PW_Basis_K::recip_to_real<float, base_device::DEVICE_CPU>(
+        const base_device::DEVICE_CPU* ctx,
+        const std::complex<float>* in,
+        std::complex<float>* out,
+        const int ik,
+        const bool add,
+        const float factor) const;
+
+    template void PW_Basis_K::recip_to_real<double, base_device::DEVICE_CPU>(
+        const base_device::DEVICE_CPU* ctx,
+        const std::complex<double>* in,
+        std::complex<double>* out,
+        const int ik,
+        const bool add,
+        const double factor) const;
+
+    template void PW_Basis_K::real_to_recip<float, base_device::DEVICE_CPU>(
+        const base_device::DEVICE_CPU* ctx,
+        const std::complex<float>* in,
+        std::complex<float>* out,
+        const int ik,
+        const bool add,
+        const float factor) const;
+
+    template void PW_Basis_K::real_to_recip<double, base_device::DEVICE_CPU>(
+        const base_device::DEVICE_CPU* ctx,
+        const std::complex<double>* in,
+        std::complex<double>* out,
+        const int ik,
+        const bool add,
+        const double factor) const;
+}
+
 /************************************************
  *  unit test of HSolverPW class
  ***********************************************/
diff --git a/source/module_hsolver/test/test_hsolver_sdft.cpp b/source/module_hsolver/test/test_hsolver_sdft.cpp
index 642ee9daae..235c664a78 100644
--- a/source/module_hsolver/test/test_hsolver_sdft.cpp
+++ b/source/module_hsolver/test/test_hsolver_sdft.cpp
@@ -133,6 +133,66 @@ void Stochastic_Iter<T, Device>::cal_storho(const UnitCell& ucell,
 Charge::Charge(){};
 Charge::~Charge(){};
 
+// Mock implementations for the template functions causing linking errors
+namespace ModulePW {
+    // Mock implementation for recip_to_real
+    template<typename FPTYPE, typename Device>
+    void PW_Basis_K::recip_to_real(const Device* ctx,
+                                  const std::complex<FPTYPE>* in,
+                                  std::complex<FPTYPE>* out,
+                                  const int ik,
+                                  const bool add,
+                                  const FPTYPE factor) const
+    {
+        // Simple mock implementation that does nothing
+        // In a real test, you might want to implement behavior that simulates the actual function
+    }
+
+    // Mock implementation for real_to_recip
+    template<typename FPTYPE, typename Device>
+    void PW_Basis_K::real_to_recip(const Device* ctx,
+                                  const std::complex<FPTYPE>* in,
+                                  std::complex<FPTYPE>* out,
+                                  const int ik,
+                                  const bool add,
+                                  const FPTYPE factor) const
+    {
+        // Simple mock implementation that does nothing
+    }
+
+    // Explicit template instantiations
+    template void PW_Basis_K::recip_to_real<float, base_device::DEVICE_CPU>(
+        const base_device::DEVICE_CPU* ctx,
+        const std::complex<float>* in,
+        std::complex<float>* out,
+        const int ik,
+        const bool add,
+        const float factor) const;
+
+    template void PW_Basis_K::recip_to_real<double, base_device::DEVICE_CPU>(
+        const base_device::DEVICE_CPU* ctx,
+        const std::complex<double>* in,
+        std::complex<double>* out,
+        const int ik,
+        const bool add,
+        const double factor) const;
+
+    template void PW_Basis_K::real_to_recip<float, base_device::DEVICE_CPU>(
+        const base_device::DEVICE_CPU* ctx,
+        const std::complex<float>* in,
+        std::complex<float>* out,
+        const int ik,
+        const bool add,
+        const float factor) const;
+
+    template void PW_Basis_K::real_to_recip<double, base_device::DEVICE_CPU>(
+        const base_device::DEVICE_CPU* ctx,
+        const std::complex<double>* in,
+        std::complex<double>* out,
+        const int ik,
+        const bool add,
+        const double factor) const;
+}
 /************************************************
  *  unit test of HSolverPW_SDFT class
  ***********************************************/
diff --git a/source/module_io/read_input_item_elec_stru.cpp b/source/module_io/read_input_item_elec_stru.cpp
index 73c741bb5f..19d5b2f1c2 100644
--- a/source/module_io/read_input_item_elec_stru.cpp
+++ b/source/module_io/read_input_item_elec_stru.cpp
@@ -346,6 +346,26 @@ void ReadInput::item_elec_stru()
         read_sync_bool(input.diago_smooth_ethr);
         this->add_item(item);
     }
+    {
+        Input_Item item("use_k_continuity");
+        item.annotation = "whether to use k-point continuity for initializing wave functions";
+        read_sync_bool(input.use_k_continuity);
+        item.check_value = [](const Input_Item& item, const Parameter& para) {
+            if (para.input.use_k_continuity && para.input.basis_type != "pw") {
+                ModuleBase::WARNING_QUIT("ReadInput", "use_k_continuity only works for PW basis");
+            }
+            if (para.input.use_k_continuity && para.input.calculation == "nscf") {
+                ModuleBase::WARNING_QUIT("ReadInput", "use_k_continuity cannot work for NSCF calculation");
+            }
+            if (para.input.use_k_continuity && para.input.nspin == 2) {
+                ModuleBase::WARNING_QUIT("ReadInput", "use_k_continuity cannot work for spin-polarized calculation");
+            }
+            if (para.input.use_k_continuity && para.input.esolver_type == "sdft") {
+                ModuleBase::WARNING_QUIT("ReadInput", "use_k_continuity cannot work for SDFT calculation");
+            }
+        };
+        this->add_item(item);
+    }
     {
         Input_Item item("pw_diag_ndim");
         item.annotation = "dimension of workspace for Davidson diagonalization";
diff --git a/source/module_parameter/input_parameter.h b/source/module_parameter/input_parameter.h
index 0bb1979318..aa7d31309c 100644
--- a/source/module_parameter/input_parameter.h
+++ b/source/module_parameter/input_parameter.h
@@ -90,6 +90,7 @@ struct Input_para
     bool diago_smooth_ethr = false; ///< smooth ethr for iter methods
     int pw_diag_ndim = 4;      ///< dimension of workspace for Davidson diagonalization
     int diago_cg_prec = 1;     ///< mohan add 2012-03-31
+    bool use_k_continuity = false;   ///< whether to use k-point continuity for initializing wave functions
 
     std::string smearing_method = "gauss"; ///< "gauss",
                                            ///< "mp","methfessel-paxton"
diff --git a/source/module_psi/psi_init.cpp b/source/module_psi/psi_init.cpp
index 201539a54f..a3d59d4e7b 100644
--- a/source/module_psi/psi_init.cpp
+++ b/source/module_psi/psi_init.cpp
@@ -265,6 +265,7 @@ void PSIInit<T, Device>::initialize_psi(Psi<std::complex<double>>* psi,
         {
             for (int ik = 0; ik < this->pw_wfc->nks; ++ik)
             {
+                if(PARAM.inp.use_k_continuity && ik > 0) continue;
                 //! Update Hamiltonian from other kpoint to the given one
                 p_hamilt->updateHk(ik);
 
@@ -316,4 +317,4 @@ template class PSIInit<std::complex<double>, base_device::DEVICE_CPU>;
 template class PSIInit<std::complex<float>, base_device::DEVICE_GPU>;
 template class PSIInit<std::complex<double>, base_device::DEVICE_GPU>;
 #endif
-} // namespace psi
\ No newline at end of file
+} // namespace psi

From c605cae89309e29120822443160943f6036bd41c Mon Sep 17 00:00:00 2001
From: Jie <li_j20@fudan.edu.cn>
Date: Mon, 26 May 2025 15:56:44 +0800
Subject: [PATCH 2/6] implement gpu op for sincos loops

---
 .../kernels/cuda/force_sincos_op.cu           | 204 ++++++++++++++++++
 .../hamilt_pwdft/kernels/force_sincos_op.cpp  | 123 +++++++++++
 .../hamilt_pwdft/kernels/force_sincos_op.h    | 106 +++++++++
 .../kernels/rocm/force_sincos_op.hip.cu       | 203 +++++++++++++++++
 4 files changed, 636 insertions(+)
 create mode 100644 source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_sincos_op.cu
 create mode 100644 source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.cpp
 create mode 100644 source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.h
 create mode 100644 source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_sincos_op.hip.cu

diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_sincos_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_sincos_op.cu
new file mode 100644
index 0000000000..f97214893e
--- /dev/null
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_sincos_op.cu
@@ -0,0 +1,204 @@
+#include "module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.h"
+#include "module_base/module_device/types.h"
+
+#include <complex>
+#include <thrust/complex.h>
+#include <cuda_runtime.h>
+#include <base/macros/macros.h>
+
+#define THREADS_PER_BLOCK 256
+
+namespace hamilt {
+
+// CUDA kernels for sincos loops
+template <typename FPTYPE>
+__global__ void cal_force_loc_sincos_kernel(
+    const int nat,
+    const int npw,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const int* iat2it,
+    const FPTYPE* vloc_factors,
+    const thrust::complex<FPTYPE>* aux,
+    const FPTYPE scale_factor,
+    FPTYPE* force)
+{
+    const FPTYPE TWO_PI = 2.0 * M_PI;
+    
+    const int iat = blockIdx.y;
+    const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
+    const int ig_stride = gridDim.x * blockDim.x;
+    
+    if (iat >= nat) return;
+    
+    // Load atom information to registers
+    const int it = iat2it[iat];
+    const FPTYPE tau_x = tau[iat * 3 + 0];
+    const FPTYPE tau_y = tau[iat * 3 + 1];
+    const FPTYPE tau_z = tau[iat * 3 + 2];
+    
+    // Local accumulation variables
+    FPTYPE local_force_x = 0.0;
+    FPTYPE local_force_y = 0.0;
+    FPTYPE local_force_z = 0.0;
+    
+    // Grid-stride loop over G-vectors
+    for (int ig = ig_start; ig < npw; ig += ig_stride) {
+        // Calculate phase: 2π * (G · τ)
+        const FPTYPE phase = TWO_PI * (
+            gcar[ig * 3 + 0] * tau_x + 
+            gcar[ig * 3 + 1] * tau_y + 
+            gcar[ig * 3 + 2] * tau_z
+        );
+        
+        // Use CUDA intrinsic for sincos
+        FPTYPE sinp, cosp;
+        __sincos(phase, &sinp, &cosp);
+        
+        // Calculate force factor
+        const FPTYPE factor = vloc_factors[ig] * 
+                              (cosp * aux[ig].imag() + sinp * aux[ig].real());
+        
+        // Accumulate force contributions
+        local_force_x += gcar[ig * 3 + 0] * factor;
+        local_force_y += gcar[ig * 3 + 1] * factor;
+        local_force_z += gcar[ig * 3 + 2] * factor;
+    }
+    
+    // Atomic add to global memory
+    atomicAdd(&force[iat * 3 + 0], local_force_x * scale_factor);
+    atomicAdd(&force[iat * 3 + 1], local_force_y * scale_factor);
+    atomicAdd(&force[iat * 3 + 2], local_force_z * scale_factor);
+}
+
+template <typename FPTYPE>
+__global__ void cal_force_ew_sincos_kernel(
+    const int nat,
+    const int npw,
+    const int ig_gge0,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const int* iat2it,
+    const FPTYPE* it_facts,
+    const thrust::complex<FPTYPE>* aux,
+    FPTYPE* force)
+{
+    const FPTYPE TWO_PI = 2.0 * M_PI;
+    
+    const int iat = blockIdx.y;
+    const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
+    const int ig_stride = gridDim.x * blockDim.x;
+    
+    if (iat >= nat) return;
+    
+    // Load atom information to registers
+    const int it = iat2it[iat];
+    const FPTYPE tau_x = tau[iat * 3 + 0];
+    const FPTYPE tau_y = tau[iat * 3 + 1];
+    const FPTYPE tau_z = tau[iat * 3 + 2];
+    const FPTYPE it_fact = it_facts[iat];
+    
+    // Local accumulation variables
+    FPTYPE local_force_x = 0.0;
+    FPTYPE local_force_y = 0.0;
+    FPTYPE local_force_z = 0.0;
+    
+    // Grid-stride loop over G-vectors
+    for (int ig = ig_start; ig < npw; ig += ig_stride) {
+        // Skip G=0 term
+        if (ig == ig_gge0) continue;
+        
+        // Calculate phase: 2π * (G · τ)
+        const FPTYPE arg = TWO_PI * (
+            gcar[ig * 3 + 0] * tau_x + 
+            gcar[ig * 3 + 1] * tau_y + 
+            gcar[ig * 3 + 2] * tau_z
+        );
+        
+        // Use CUDA intrinsic for sincos
+        FPTYPE sinp, cosp;
+        __sincos(arg, &sinp, &cosp);
+        
+        // Calculate Ewald sum contribution
+        const FPTYPE sumnb = -cosp * aux[ig].imag() + sinp * aux[ig].real();
+        
+        // Accumulate force contributions
+        local_force_x += gcar[ig * 3 + 0] * sumnb;
+        local_force_y += gcar[ig * 3 + 1] * sumnb;
+        local_force_z += gcar[ig * 3 + 2] * sumnb;
+    }
+    
+    // Atomic add to global memory
+    atomicAdd(&force[iat * 3 + 0], local_force_x * it_fact);
+    atomicAdd(&force[iat * 3 + 1], local_force_y * it_fact);
+    atomicAdd(&force[iat * 3 + 2], local_force_z * it_fact);
+}
+
+// GPU operator implementations
+template <typename FPTYPE>
+void cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& nat,
+    const int& npw,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const int* iat2it,
+    const FPTYPE* vloc_factors,
+    const std::complex<FPTYPE>* aux,
+    const FPTYPE& scale_factor,
+    FPTYPE* force)
+{
+    // Calculate optimal grid configuration
+    const int threads_per_block = THREADS_PER_BLOCK;
+    const int max_blocks_x = min(32, (npw + threads_per_block - 1) / threads_per_block);
+    
+    dim3 grid(max_blocks_x, nat);
+    dim3 block(threads_per_block);
+    
+    cal_force_loc_sincos_kernel<FPTYPE><<<grid, block>>>(
+        nat, npw,
+        gcar, tau, iat2it, vloc_factors,
+        reinterpret_cast<const thrust::complex<FPTYPE>*>(aux),
+        scale_factor, force
+    );
+    
+    cudaCheckOnDebug();
+}
+
+template <typename FPTYPE>
+void cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& nat,
+    const int& npw,
+    const int& ig_gge0,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const int* iat2it,
+    const FPTYPE* it_facts,
+    const std::complex<FPTYPE>* aux,
+    FPTYPE* force)
+{
+    // Calculate optimal grid configuration
+    const int threads_per_block = THREADS_PER_BLOCK;
+    const int max_blocks_x = min(32, (npw + threads_per_block - 1) / threads_per_block);
+    
+    dim3 grid(max_blocks_x, nat);
+    dim3 block(threads_per_block);
+    
+    cal_force_ew_sincos_kernel<FPTYPE><<<grid, block>>>(
+        nat, npw, ig_gge0,
+        gcar, tau, iat2it, it_facts,
+        reinterpret_cast<const thrust::complex<FPTYPE>*>(aux),
+        force
+    );
+    
+    cudaCheckOnDebug();
+}
+
+template struct cal_force_loc_sincos_op<float, base_device::DEVICE_GPU>;
+template struct cal_force_loc_sincos_op<double, base_device::DEVICE_GPU>;
+
+template struct cal_force_ew_sincos_op<float, base_device::DEVICE_GPU>;
+template struct cal_force_ew_sincos_op<double, base_device::DEVICE_GPU>;
+
+}  // namespace hamilt 
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.cpp
new file mode 100644
index 0000000000..e7ce8554cc
--- /dev/null
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.cpp
@@ -0,0 +1,123 @@
+#include "force_sincos_op.h"
+#include "module_base/libm/libm.h"
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+namespace hamilt
+{
+
+template <typename FPTYPE>
+struct cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_CPU>
+{
+    void operator()(const base_device::DEVICE_CPU* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const int* iat2it,
+                    const FPTYPE* vloc_factors,
+                    const std::complex<FPTYPE>* aux,
+                    const FPTYPE& scale_factor,
+                    FPTYPE* force)
+    {
+        const FPTYPE TWO_PI = 2.0 * M_PI;
+
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+        for (int iat = 0; iat < nat; ++iat)
+        {
+            const int it = iat2it[iat];
+            const FPTYPE tau_x = tau[iat * 3 + 0];
+            const FPTYPE tau_y = tau[iat * 3 + 1];
+            const FPTYPE tau_z = tau[iat * 3 + 2];
+
+            FPTYPE local_force[3] = {0.0, 0.0, 0.0};
+
+            for (int ig = 0; ig < npw; ig++)
+            {
+                const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + 
+                                               gcar[ig * 3 + 1] * tau_y + 
+                                               gcar[ig * 3 + 2] * tau_z);
+                FPTYPE sinp, cosp;
+                ModuleBase::libm::sincos(phase, &sinp, &cosp);
+                
+                const FPTYPE factor = vloc_factors[ig] * 
+                                     (cosp * aux[ig].imag() + sinp * aux[ig].real());
+                
+                local_force[0] += gcar[ig * 3 + 0] * factor;
+                local_force[1] += gcar[ig * 3 + 1] * factor;
+                local_force[2] += gcar[ig * 3 + 2] * factor;
+            }
+
+            force[iat * 3 + 0] += local_force[0] * scale_factor;
+            force[iat * 3 + 1] += local_force[1] * scale_factor;
+            force[iat * 3 + 2] += local_force[2] * scale_factor;
+        }
+    }
+};
+
+template <typename FPTYPE>
+struct cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_CPU>
+{
+    void operator()(const base_device::DEVICE_CPU* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ig_gge0,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const int* iat2it,
+                    const FPTYPE* it_facts,
+                    const std::complex<FPTYPE>* aux,
+                    FPTYPE* force)
+    {
+        const FPTYPE TWO_PI = 2.0 * M_PI;
+
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+        for (int iat = 0; iat < nat; ++iat)
+        {
+            const int it = iat2it[iat];
+            const FPTYPE tau_x = tau[iat * 3 + 0];
+            const FPTYPE tau_y = tau[iat * 3 + 1];
+            const FPTYPE tau_z = tau[iat * 3 + 2];
+            const FPTYPE it_fact = it_facts[iat];
+
+            FPTYPE local_force[3] = {0.0, 0.0, 0.0};
+
+            for (int ig = 0; ig < npw; ig++)
+            {
+                // Skip G=0 term
+                if (ig == ig_gge0) continue;
+
+                const FPTYPE arg = TWO_PI * (gcar[ig * 3 + 0] * tau_x + 
+                                             gcar[ig * 3 + 1] * tau_y + 
+                                             gcar[ig * 3 + 2] * tau_z);
+                FPTYPE sinp, cosp;
+                ModuleBase::libm::sincos(arg, &sinp, &cosp);
+                
+                const FPTYPE sumnb = -cosp * aux[ig].imag() + sinp * aux[ig].real();
+                
+                local_force[0] += gcar[ig * 3 + 0] * sumnb;
+                local_force[1] += gcar[ig * 3 + 1] * sumnb;
+                local_force[2] += gcar[ig * 3 + 2] * sumnb;
+            }
+
+            force[iat * 3 + 0] += local_force[0] * it_fact;
+            force[iat * 3 + 1] += local_force[1] * it_fact;
+            force[iat * 3 + 2] += local_force[2] * it_fact;
+        }
+    }
+};
+
+// Template instantiations
+template struct cal_force_loc_sincos_op<float, base_device::DEVICE_CPU>;
+template struct cal_force_loc_sincos_op<double, base_device::DEVICE_CPU>;
+
+template struct cal_force_ew_sincos_op<float, base_device::DEVICE_CPU>;
+template struct cal_force_ew_sincos_op<double, base_device::DEVICE_CPU>;
+
+} // namespace hamilt 
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.h
new file mode 100644
index 0000000000..455b8425a7
--- /dev/null
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.h
@@ -0,0 +1,106 @@
+#ifndef FORCE_SINCOS_OP_H
+#define FORCE_SINCOS_OP_H
+
+#include "module_base/module_device/types.h"
+#include <complex>
+
+namespace hamilt
+{
+
+template <typename FPTYPE, typename Device>
+struct cal_force_loc_sincos_op
+{
+    /// @brief Calculate local pseudopotential forces (sincos loop only)
+    ///
+    /// Input Parameters
+    /// @param ctx - which device this function runs on
+    /// @param nat - number of atoms
+    /// @param npw - number of plane waves
+    /// @param gcar - G-vector Cartesian coordinates [npw * 3]
+    /// @param tau - atomic positions [nat * 3]
+    /// @param iat2it - atom to type mapping [nat]
+    /// @param vloc_factors - precomputed vloc factors [npw]
+    /// @param aux - charge density in G-space [npw]
+    /// @param scale_factor - tpiba * omega
+    ///
+    /// Output Parameters
+    /// @param force - output forces [nat * 3]
+    void operator()(const Device* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const int* iat2it,
+                    const FPTYPE* vloc_factors,
+                    const std::complex<FPTYPE>* aux,
+                    const FPTYPE& scale_factor,
+                    FPTYPE* force);
+};
+
+template <typename FPTYPE, typename Device>
+struct cal_force_ew_sincos_op
+{
+    /// @brief Calculate Ewald forces (sincos loop only)
+    ///
+    /// Input Parameters
+    /// @param ctx - which device this function runs on
+    /// @param nat - number of atoms
+    /// @param npw - number of plane waves
+    /// @param ig_gge0 - index of G=0 vector (-1 if not present)
+    /// @param gcar - G-vector Cartesian coordinates [npw * 3]
+    /// @param tau - atomic positions [nat * 3]
+    /// @param iat2it - atom to type mapping [nat]
+    /// @param it_facts - precomputed it_fact for each atom [nat]
+    /// @param aux - structure factor related array [npw]
+    ///
+    /// Output Parameters
+    /// @param force - output forces [nat * 3]
+    void operator()(const Device* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ig_gge0,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const int* iat2it,
+                    const FPTYPE* it_facts,
+                    const std::complex<FPTYPE>* aux,
+                    FPTYPE* force);
+};
+
+#if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
+
+template <typename FPTYPE>
+struct cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_GPU>
+{
+    void operator()(const base_device::DEVICE_GPU* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const int* iat2it,
+                    const FPTYPE* vloc_factors,
+                    const std::complex<FPTYPE>* aux,
+                    const FPTYPE& scale_factor,
+                    FPTYPE* force);
+};
+
+template <typename FPTYPE>
+struct cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_GPU>
+{
+    void operator()(const base_device::DEVICE_GPU* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ig_gge0,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const int* iat2it,
+                    const FPTYPE* it_facts,
+                    const std::complex<FPTYPE>* aux,
+                    FPTYPE* force);
+};
+
+#endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
+
+} // namespace hamilt
+
+#endif // FORCE_SINCOS_OP_H 
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_sincos_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_sincos_op.hip.cu
new file mode 100644
index 0000000000..a09cd28bed
--- /dev/null
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_sincos_op.hip.cu
@@ -0,0 +1,203 @@
+#include "module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.h"
+
+#include <complex>
+#include <thrust/complex.h>
+#include <hip/hip_runtime.h>
+#include <base/macros/macros.h>
+
+#define THREADS_PER_BLOCK 256
+
+namespace hamilt {
+
+// HIP kernels for sincos loops
+template <typename FPTYPE>
+__global__ void cal_force_loc_sincos_kernel(
+    const int nat,
+    const int npw,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const int* iat2it,
+    const FPTYPE* vloc_factors,
+    const thrust::complex<FPTYPE>* aux,
+    const FPTYPE scale_factor,
+    FPTYPE* force)
+{
+    const FPTYPE TWO_PI = 2.0 * M_PI;
+    
+    const int iat = blockIdx.y;
+    const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
+    const int ig_stride = gridDim.x * blockDim.x;
+    
+    if (iat >= nat) return;
+    
+    // Load atom information to registers
+    const int it = iat2it[iat];
+    const FPTYPE tau_x = tau[iat * 3 + 0];
+    const FPTYPE tau_y = tau[iat * 3 + 1];
+    const FPTYPE tau_z = tau[iat * 3 + 2];
+    
+    // Local accumulation variables
+    FPTYPE local_force_x = 0.0;
+    FPTYPE local_force_y = 0.0;
+    FPTYPE local_force_z = 0.0;
+    
+    // Grid-stride loop over G-vectors
+    for (int ig = ig_start; ig < npw; ig += ig_stride) {
+        // Calculate phase: 2π * (G · τ)
+        const FPTYPE phase = TWO_PI * (
+            gcar[ig * 3 + 0] * tau_x + 
+            gcar[ig * 3 + 1] * tau_y + 
+            gcar[ig * 3 + 2] * tau_z
+        );
+        
+        // Use HIP intrinsic for sincos
+        FPTYPE sinp, cosp;
+        __sincos(phase, &sinp, &cosp);
+        
+        // Calculate force factor
+        const FPTYPE factor = vloc_factors[ig] * 
+                              (cosp * aux[ig].imag() + sinp * aux[ig].real());
+        
+        // Accumulate force contributions
+        local_force_x += gcar[ig * 3 + 0] * factor;
+        local_force_y += gcar[ig * 3 + 1] * factor;
+        local_force_z += gcar[ig * 3 + 2] * factor;
+    }
+    
+    // Atomic add to global memory
+    atomicAdd(&force[iat * 3 + 0], local_force_x * scale_factor);
+    atomicAdd(&force[iat * 3 + 1], local_force_y * scale_factor);
+    atomicAdd(&force[iat * 3 + 2], local_force_z * scale_factor);
+}
+
+template <typename FPTYPE>
+__global__ void cal_force_ew_sincos_kernel(
+    const int nat,
+    const int npw,
+    const int ig_gge0,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const int* iat2it,
+    const FPTYPE* it_facts,
+    const thrust::complex<FPTYPE>* aux,
+    FPTYPE* force)
+{
+    const FPTYPE TWO_PI = 2.0 * M_PI;
+    
+    const int iat = blockIdx.y;
+    const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
+    const int ig_stride = gridDim.x * blockDim.x;
+    
+    if (iat >= nat) return;
+    
+    // Load atom information to registers
+    const int it = iat2it[iat];
+    const FPTYPE tau_x = tau[iat * 3 + 0];
+    const FPTYPE tau_y = tau[iat * 3 + 1];
+    const FPTYPE tau_z = tau[iat * 3 + 2];
+    const FPTYPE it_fact = it_facts[iat];
+    
+    // Local accumulation variables
+    FPTYPE local_force_x = 0.0;
+    FPTYPE local_force_y = 0.0;
+    FPTYPE local_force_z = 0.0;
+    
+    // Grid-stride loop over G-vectors
+    for (int ig = ig_start; ig < npw; ig += ig_stride) {
+        // Skip G=0 term
+        if (ig == ig_gge0) continue;
+        
+        // Calculate phase: 2π * (G · τ)
+        const FPTYPE arg = TWO_PI * (
+            gcar[ig * 3 + 0] * tau_x + 
+            gcar[ig * 3 + 1] * tau_y + 
+            gcar[ig * 3 + 2] * tau_z
+        );
+        
+        // Use HIP intrinsic for sincos
+        FPTYPE sinp, cosp;
+        __sincos(arg, &sinp, &cosp);
+        
+        // Calculate Ewald sum contribution
+        const FPTYPE sumnb = -cosp * aux[ig].imag() + sinp * aux[ig].real();
+        
+        // Accumulate force contributions
+        local_force_x += gcar[ig * 3 + 0] * sumnb;
+        local_force_y += gcar[ig * 3 + 1] * sumnb;
+        local_force_z += gcar[ig * 3 + 2] * sumnb;
+    }
+    
+    // Atomic add to global memory
+    atomicAdd(&force[iat * 3 + 0], local_force_x * it_fact);
+    atomicAdd(&force[iat * 3 + 1], local_force_y * it_fact);
+    atomicAdd(&force[iat * 3 + 2], local_force_z * it_fact);
+}
+
+// GPU operator implementations
+template <typename FPTYPE>
+void cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& nat,
+    const int& npw,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const int* iat2it,
+    const FPTYPE* vloc_factors,
+    const std::complex<FPTYPE>* aux,
+    const FPTYPE& scale_factor,
+    FPTYPE* force)
+{
+    // Calculate optimal grid configuration
+    const int threads_per_block = THREADS_PER_BLOCK;
+    const int max_blocks_x = min(32, (npw + threads_per_block - 1) / threads_per_block);
+    
+    dim3 grid(max_blocks_x, nat);
+    dim3 block(threads_per_block);
+    
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_force_loc_sincos_kernel<FPTYPE>), grid, block, 0, 0,
+        nat, npw,
+        gcar, tau, iat2it, vloc_factors,
+        reinterpret_cast<const thrust::complex<FPTYPE>*>(aux),
+        scale_factor, force
+    );
+    
+    hipCheckOnDebug();
+}
+
+template <typename FPTYPE>
+void cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& nat,
+    const int& npw,
+    const int& ig_gge0,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const int* iat2it,
+    const FPTYPE* it_facts,
+    const std::complex<FPTYPE>* aux,
+    FPTYPE* force)
+{
+    // Calculate optimal grid configuration
+    const int threads_per_block = THREADS_PER_BLOCK;
+    const int max_blocks_x = min(32, (npw + threads_per_block - 1) / threads_per_block);
+    
+    dim3 grid(max_blocks_x, nat);
+    dim3 block(threads_per_block);
+    
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_force_ew_sincos_kernel<FPTYPE>), grid, block, 0, 0,
+        nat, npw, ig_gge0,
+        gcar, tau, iat2it, it_facts,
+        reinterpret_cast<const thrust::complex<FPTYPE>*>(aux),
+        force
+    );
+    
+    hipCheckOnDebug();
+}
+
+template struct cal_force_loc_sincos_op<float, base_device::DEVICE_GPU>;
+template struct cal_force_loc_sincos_op<double, base_device::DEVICE_GPU>;
+
+template struct cal_force_ew_sincos_op<float, base_device::DEVICE_GPU>;
+template struct cal_force_ew_sincos_op<double, base_device::DEVICE_GPU>;
+
+}  // namespace hamilt 
\ No newline at end of file

From fbfc91a49ff49908a787f719cbbc667459815b13 Mon Sep 17 00:00:00 2001
From: Jie <li_j20@fudan.edu.cn>
Date: Tue, 27 May 2025 18:35:51 +0800
Subject: [PATCH 3/6] add cpu kernel for cal_force_loc & cal_force_ew

---
 .../module_hamilt_pw/hamilt_pwdft/forces.cpp  | 452 +++++++++++-------
 .../hamilt_pwdft/kernels/force_op.cpp         | 255 ++++++++++
 .../hamilt_pwdft/kernels/force_op.h           | 107 +++++
 3 files changed, 636 insertions(+), 178 deletions(-)

diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
index df29f88989..52ddf7da63 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
@@ -15,6 +15,8 @@
 #include "module_hamilt_general/module_ewald/H_Ewald_pw.h"
 #include "module_hamilt_general/module_surchem/surchem.h"
 #include "module_hamilt_general/module_vdw/vdw.h"
+#include "module_hamilt_pw/hamilt_pwdft/kernels/force_op.h"
+#include <vector>
 
 #ifdef _OPENMP
 #include <omp.h>
@@ -23,6 +25,208 @@
 #include "module_cell/module_paw/paw_cell.h"
 #endif
 
+// Data preparation function for Ewald parallel operator
+template <typename FPTYPE>
+void prepare_ew_parallel_data(const UnitCell& ucell,
+                              ModulePW::PW_Basis* rho_basis,
+                              const std::complex<double>* aux,
+                              const double alpha,
+                              const double fact,
+                              std::vector<FPTYPE>& gcar_flat,
+                              std::vector<FPTYPE>& tau_flat,
+                              std::vector<int>& iat2it_array,
+                              std::vector<FPTYPE>& it_facts,
+                              std::vector<int>& atom_na,
+                              std::vector<FPTYPE>& zv_values,
+                              std::vector<FPTYPE>& latvec_flat,
+                              std::vector<FPTYPE>& G_flat)
+{
+    const int nat = ucell.nat;
+    const int npw = rho_basis->npw;
+    const int ntype = ucell.ntype;
+    
+    // Prepare gcar_flat
+    gcar_flat.resize(npw * 3);
+    for (int ig = 0; ig < npw; ig++)
+    {
+        gcar_flat[ig * 3 + 0] = static_cast<FPTYPE>(rho_basis->gcar[ig][0]);
+        gcar_flat[ig * 3 + 1] = static_cast<FPTYPE>(rho_basis->gcar[ig][1]);
+        gcar_flat[ig * 3 + 2] = static_cast<FPTYPE>(rho_basis->gcar[ig][2]);
+    }
+    
+    // Prepare tau_flat and iat2it_array
+    tau_flat.resize(nat * 3);
+    iat2it_array.resize(nat);
+    int iat = 0;
+    for (int it = 0; it < ntype; it++)
+    {
+        for (int ia = 0; ia < ucell.atoms[it].na; ia++)
+        {
+            tau_flat[iat * 3 + 0] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia].x);
+            tau_flat[iat * 3 + 1] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia].y);
+            tau_flat[iat * 3 + 2] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia].z);
+            iat2it_array[iat] = it;
+            iat++;
+        }
+    }
+    
+    // Prepare it_facts
+    it_facts.resize(nat);
+    iat = 0;
+    for (int it = 0; it < ntype; it++)
+    {
+        double zv;
+        if (PARAM.inp.use_paw)
+        {
+#ifdef USE_PAW
+            zv = GlobalC::paw_cell.get_val(it);
+#else
+            zv = ucell.atoms[it].ncpp.zv;
+#endif
+        }
+        else
+        {
+            zv = ucell.atoms[it].ncpp.zv;
+        }
+        
+        const FPTYPE it_fact = static_cast<FPTYPE>(zv * ModuleBase::e2 * ucell.tpiba * ModuleBase::TWO_PI / ucell.omega * fact);
+        
+        for (int ia = 0; ia < ucell.atoms[it].na; ia++)
+        {
+            it_facts[iat] = it_fact;
+            iat++;
+        }
+    }
+    
+    // Prepare atom_na
+    atom_na.resize(ntype);
+    for (int it = 0; it < ntype; it++)
+    {
+        atom_na[it] = ucell.atoms[it].na;
+    }
+    
+    // Prepare zv_values
+    zv_values.resize(ntype);
+    for (int it = 0; it < ntype; it++)
+    {
+        if (PARAM.inp.use_paw)
+        {
+#ifdef USE_PAW
+            zv_values[it] = static_cast<FPTYPE>(GlobalC::paw_cell.get_val(it));
+#else
+            zv_values[it] = static_cast<FPTYPE>(ucell.atoms[it].ncpp.zv);
+#endif
+        }
+        else
+        {
+            zv_values[it] = static_cast<FPTYPE>(ucell.atoms[it].ncpp.zv);
+        }
+    }
+    
+    // Prepare latvec_flat (row-major order)
+    latvec_flat.resize(9);
+    latvec_flat[0] = static_cast<FPTYPE>(ucell.latvec.e11);
+    latvec_flat[1] = static_cast<FPTYPE>(ucell.latvec.e12);
+    latvec_flat[2] = static_cast<FPTYPE>(ucell.latvec.e13);
+    latvec_flat[3] = static_cast<FPTYPE>(ucell.latvec.e21);
+    latvec_flat[4] = static_cast<FPTYPE>(ucell.latvec.e22);
+    latvec_flat[5] = static_cast<FPTYPE>(ucell.latvec.e23);
+    latvec_flat[6] = static_cast<FPTYPE>(ucell.latvec.e31);
+    latvec_flat[7] = static_cast<FPTYPE>(ucell.latvec.e32);
+    latvec_flat[8] = static_cast<FPTYPE>(ucell.latvec.e33);
+    
+    // Prepare G_flat (row-major order)
+    G_flat.resize(9);
+    G_flat[0] = static_cast<FPTYPE>(ucell.G.e11);
+    G_flat[1] = static_cast<FPTYPE>(ucell.G.e12);
+    G_flat[2] = static_cast<FPTYPE>(ucell.G.e13);
+    G_flat[3] = static_cast<FPTYPE>(ucell.G.e21);
+    G_flat[4] = static_cast<FPTYPE>(ucell.G.e22);
+    G_flat[5] = static_cast<FPTYPE>(ucell.G.e23);
+    G_flat[6] = static_cast<FPTYPE>(ucell.G.e31);
+    G_flat[7] = static_cast<FPTYPE>(ucell.G.e32);
+    G_flat[8] = static_cast<FPTYPE>(ucell.G.e33);
+}
+
+// Data preparation function for local force sincos operator
+template <typename FPTYPE>
+void prepare_loc_sincos_data(const UnitCell& ucell,
+                             ModulePW::PW_Basis* rho_basis,
+                             const ModuleBase::matrix& vloc,
+                             const std::complex<double>* aux,
+                             std::vector<FPTYPE>& gcar_flat,
+                             std::vector<FPTYPE>& tau_flat,
+                             std::vector<int>& iat2it_array,
+                             std::vector<FPTYPE>& vloc_factors)
+{
+    const int nat = ucell.nat;
+    const int npw = rho_basis->npw;
+    
+    // Prepare gcar_flat
+    gcar_flat.resize(npw * 3);
+    for (int ig = 0; ig < npw; ig++)
+    {
+        gcar_flat[ig * 3 + 0] = static_cast<FPTYPE>(rho_basis->gcar[ig][0]);
+        gcar_flat[ig * 3 + 1] = static_cast<FPTYPE>(rho_basis->gcar[ig][1]);
+        gcar_flat[ig * 3 + 2] = static_cast<FPTYPE>(rho_basis->gcar[ig][2]);
+    }
+    
+    // Prepare tau_flat and iat2it_array
+    tau_flat.resize(nat * 3);
+    iat2it_array.resize(nat);
+    int iat = 0;
+    for (int it = 0; it < ucell.ntype; it++)
+    {
+        for (int ia = 0; ia < ucell.atoms[it].na; ia++)
+        {
+            tau_flat[iat * 3 + 0] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia].x);
+            tau_flat[iat * 3 + 1] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia].y);
+            tau_flat[iat * 3 + 2] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia].z);
+            iat2it_array[iat] = it;
+            iat++;
+        }
+    }
+    
+    // Prepare vloc_factors: vloc(it, ig2igg[ig]) for each G-vector and atom type
+    vloc_factors.resize(npw * ucell.ntype);
+    for (int it = 0; it < ucell.ntype; it++)
+    {
+        for (int ig = 0; ig < npw; ig++)
+        {
+            vloc_factors[it * npw + ig] = static_cast<FPTYPE>(vloc(it, rho_basis->ig2igg[ig]));
+        }
+    }
+}
+
+// Data copy back function for local force sincos operator
+template <typename FPTYPE>
+void copy_back_loc_force_data(const FPTYPE* force,
+                              const int nat,
+                              const FPTYPE& scale_factor,
+                              ModuleBase::matrix& forcelc)
+{
+    for (int iat = 0; iat < nat; iat++)
+    {
+        forcelc(iat, 0) = static_cast<double>(force[iat * 3 + 0] * scale_factor);
+        forcelc(iat, 1) = static_cast<double>(force[iat * 3 + 1] * scale_factor);
+        forcelc(iat, 2) = static_cast<double>(force[iat * 3 + 2] * scale_factor);
+    }
+}
+
+// Data copy back function for Ewald parallel operator
+template <typename FPTYPE>
+void copy_back_ew_force_data(const FPTYPE* force,
+                             const int nat,
+                             ModuleBase::matrix& forceion)
+{
+    for (int iat = 0; iat < nat; iat++)
+    {
+        forceion(iat, 0) = static_cast<double>(force[iat * 3 + 0]);
+        forceion(iat, 1) = static_cast<double>(force[iat * 3 + 1]);
+        forceion(iat, 2) = static_cast<double>(force[iat * 3 + 2]);
+    }
+}
+
 template <typename FPTYPE, typename Device>
 void Forces<FPTYPE, Device>::cal_force(UnitCell& ucell,
                                        ModuleBase::matrix& force,
@@ -531,30 +735,40 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
     // to G space. maybe need fftw with OpenMP
     rho_basis->real2recip(aux, aux);
 
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif
-    for (int iat = 0; iat < this->nat; ++iat)
+    // Prepare data for the sincos operator
+    std::vector<FPTYPE> gcar_flat, tau_flat, vloc_factors;
+    std::vector<int> iat2it_array;
+    prepare_loc_sincos_data<FPTYPE>(ucell, rho_basis, vloc, aux,
+                                    gcar_flat, tau_flat, iat2it_array, vloc_factors);
+
+    // Prepare force array for the operator
+    std::vector<FPTYPE> force_flat(this->nat * 3, 0.0);
+
+    // Convert aux to FPTYPE complex array
+    std::vector<std::complex<FPTYPE>> aux_fptype(rho_basis->npw);
+    for (int ig = 0; ig < rho_basis->npw; ig++)
     {
-        // read `it` `ia` from the table
-        int it = ucell.iat2it[iat];
-        int ia = ucell.iat2ia[iat];
-        for (int ig = 0; ig < rho_basis->npw; ig++)
-        {
-            const double phase = ModuleBase::TWO_PI * (rho_basis->gcar[ig] * ucell.atoms[it].tau[ia]);
-            double sinp, cosp;
-            ModuleBase::libm::sincos(phase, &sinp, &cosp);
-            const double factor
-                = vloc(it, rho_basis->ig2igg[ig]) * (cosp * aux[ig].imag() + sinp * aux[ig].real());
-            forcelc(iat, 0) += rho_basis->gcar[ig][0] * factor;
-            forcelc(iat, 1) += rho_basis->gcar[ig][1] * factor;
-            forcelc(iat, 2) += rho_basis->gcar[ig][2] * factor;
-        }
-        forcelc(iat, 0) *= (ucell.tpiba * ucell.omega);
-        forcelc(iat, 1) *= (ucell.tpiba * ucell.omega);
-        forcelc(iat, 2) *= (ucell.tpiba * ucell.omega);
+        aux_fptype[ig] = std::complex<FPTYPE>(static_cast<FPTYPE>(aux[ig].real()),
+                                              static_cast<FPTYPE>(aux[ig].imag()));
     }
 
+    // Call the sincos operator
+    hamilt::cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_CPU> sincos_op;
+    sincos_op(this->ctx,
+              this->nat,
+              rho_basis->npw,
+              ucell.ntype,
+              gcar_flat.data(),
+              tau_flat.data(),
+              iat2it_array.data(),
+              vloc_factors.data(),
+              aux_fptype.data(),
+              force_flat.data());
+
+    // Copy back the results with scaling
+    const FPTYPE scale_factor = static_cast<FPTYPE>(ucell.tpiba * ucell.omega);
+    copy_back_loc_force_data<FPTYPE>(force_flat.data(), this->nat, scale_factor, forcelc);
+
     // this->print(GlobalV::ofs_running, "local forces", forcelc);
     Parallel_Reduce::reduce_pool(forcelc.c, forcelc.nr * forcelc.nc);
     delete[] aux;
@@ -665,166 +879,48 @@ void Forces<FPTYPE, Device>::cal_force_ew(const UnitCell& ucell,
         aux[rho_basis->ig_gge0] = std::complex<double>(0.0, 0.0);
     }
 
-#ifdef _OPENMP
-#pragma omp parallel
-    {
-        int num_threads = omp_get_num_threads();
-        int thread_id = omp_get_thread_num();
-#else
-    int num_threads = 1;
-    int thread_id = 0;
-#endif
+    // Prepare data for the parallel operator
+    std::vector<FPTYPE> gcar_flat, tau_flat, it_facts, zv_values, latvec_flat, G_flat;
+    std::vector<int> iat2it_array, atom_na;
+    prepare_ew_parallel_data<FPTYPE>(ucell, rho_basis, aux, alpha, fact,
+                                     gcar_flat, tau_flat, iat2it_array, it_facts,
+                                     atom_na, zv_values, latvec_flat, G_flat);
 
-        /* Here is task distribution for multi-thread,
-            0. atom will be iterated both in main nat loop and the loop in `if (rho_basis->ig_gge0 >= 0)`.
-                To avoid syncing, we must calculate work range of each thread by our self
-            1. Calculate the iat range [iat_beg, iat_end) by each thread
-                a. when it is single thread stage, [iat_beg, iat_end) will be [0, nat)
-            2. each thread iterate atoms form `iat_beg` to `iat_end-1`
-        */
-        int iat_beg, iat_end;
-        int it_beg, ia_beg;
-        ModuleBase::TASK_DIST_1D(num_threads, thread_id, this->nat, iat_beg, iat_end);
-        iat_end = iat_beg + iat_end;
-        ucell.iat2iait(iat_beg, &ia_beg, &it_beg);
-
-        int iat = iat_beg;
-        int it = it_beg;
-        int ia = ia_beg;
-
-        // preprocess ig_gap for skipping the ig point
-        int ig_gap = (rho_basis->ig_gge0 >= 0 && rho_basis->ig_gge0 < rho_basis->npw) ? rho_basis->ig_gge0 : -1;
-
-        double it_fact = 0.;
-        int last_it = -1;
-
-        // iterating atoms
-        while (iat < iat_end)
-        {
-            if (it != last_it)
-            { // calculate it_tact when it is changed
-                double zv;
-                if (PARAM.inp.use_paw)
-                {
-#ifdef USE_PAW
-                    zv = GlobalC::paw_cell.get_val(it);
-#endif
-                }
-                else
-                {
-                    zv = ucell.atoms[it].ncpp.zv;
-                }
-                it_fact = zv * ModuleBase::e2 * ucell.tpiba * ModuleBase::TWO_PI / ucell.omega * fact;
-                last_it = it;
-            }
-
-            if (ucell.atoms[it].na != 0)
-            {
-                const auto ig_loop = [&](int ig_beg, int ig_end) {
-                    for (int ig = ig_beg; ig < ig_end; ig++)
-                    {
-                        const ModuleBase::Vector3<double> gcar = rho_basis->gcar[ig];
-                        const double arg = ModuleBase::TWO_PI * (gcar * ucell.atoms[it].tau[ia]);
-                        double sinp, cosp;
-                        ModuleBase::libm::sincos(arg, &sinp, &cosp);
-                        double sumnb = -cosp * aux[ig].imag() + sinp * aux[ig].real();
-                        forceion(iat, 0) += gcar[0] * sumnb;
-                        forceion(iat, 1) += gcar[1] * sumnb;
-                        forceion(iat, 2) += gcar[2] * sumnb;
-                    }
-                };
-
-                // skip ig_gge0 point by separating ig loop into two part
-                ig_loop(0, ig_gap);
-                ig_loop(ig_gap + 1, rho_basis->npw);
-
-                forceion(iat, 0) *= it_fact;
-                forceion(iat, 1) *= it_fact;
-                forceion(iat, 2) *= it_fact;
-
-                ++iat;
-                ucell.step_iait(&ia, &it);
-            }
-        }
-
-        // means that the processor contains G=0 term.
-        if (rho_basis->ig_gge0 >= 0)
-        {
-            double rmax = 5.0 / (sqrt(alpha) * ucell.lat0);
-            int nrm = 0;
+    // Prepare force array for the operator
+    std::vector<FPTYPE> force_flat(this->nat * 3, 0.0);
 
-            // output of rgen: the number of vectors in the sphere
-            const int mxr = 200;
-            // the maximum number of R vectors included in r
-            ModuleBase::Vector3<double>* r = new ModuleBase::Vector3<double>[mxr];
-            double* r2 = new double[mxr];
-            ModuleBase::GlobalFunc::ZEROS(r2, mxr);
-            int* irr = new int[mxr];
-            ModuleBase::GlobalFunc::ZEROS(irr, mxr);
-            // the square modulus of R_j-tau_s-tau_s'
-
-            int iat1 = iat_beg;
-            int T1 = it_beg;
-            int I1 = ia_beg;
-            const double sqa = sqrt(alpha);
-            const double sq8a_2pi = sqrt(8.0 * alpha / ModuleBase::TWO_PI);
-
-            // iterating atoms.
-            // do not need to sync threads because task range of each thread is isolated
-            while (iat1 < iat_end)
-            {
-                int iat2 = 0; // mohan fix bug 2011-06-07
-                int I2 = 0;
-                int T2 = 0;
-                while (iat2 < this->nat)
-                {
-                    if (iat1 != iat2 && ucell.atoms[T2].na != 0 && ucell.atoms[T1].na != 0)
-                    {
-                        ModuleBase::Vector3<double> d_tau
-                            = ucell.atoms[T1].tau[I1] - ucell.atoms[T2].tau[I2];
-                        H_Ewald_pw::rgen(d_tau, rmax, irr, ucell.latvec, ucell.G, r, r2, nrm);
-
-                        for (int n = 0; n < nrm; n++)
-                        {
-                            const double rr = sqrt(r2[n]) * ucell.lat0;
-
-                            double factor;
-                            if (PARAM.inp.use_paw)
-                            {
-#ifdef USE_PAW
-                                factor = GlobalC::paw_cell.get_val(T1) * GlobalC::paw_cell.get_val(T2) * ModuleBase::e2
-                                         / (rr * rr)
-                                         * (erfc(sqa * rr) / rr + sq8a_2pi * ModuleBase::libm::exp(-alpha * rr * rr))
-                                         * ucell.lat0;
-#endif
-                            }
-                            else
-                            {
-                                factor = ucell.atoms[T1].ncpp.zv * ucell.atoms[T2].ncpp.zv
-                                         * ModuleBase::e2 / (rr * rr)
-                                         * (erfc(sqa * rr) / rr + sq8a_2pi * ModuleBase::libm::exp(-alpha * rr * rr))
-                                         * ucell.lat0;
-                            }
-
-                            forceion(iat1, 0) -= factor * r[n].x;
-                            forceion(iat1, 1) -= factor * r[n].y;
-                            forceion(iat1, 2) -= factor * r[n].z;
-                        }
-                    }
-                    ++iat2;
-                    ucell.step_iait(&I2, &T2);
-                } // atom b
-                ++iat1;
-                ucell.step_iait(&I1, &T1);
-            } // atom a
-
-            delete[] r;
-            delete[] r2;
-            delete[] irr;
-        }
-#ifdef _OPENMP
+    // Convert aux to FPTYPE complex array
+    std::vector<std::complex<FPTYPE>> aux_fptype(rho_basis->npw);
+    for (int ig = 0; ig < rho_basis->npw; ig++)
+    {
+        aux_fptype[ig] = std::complex<FPTYPE>(static_cast<FPTYPE>(aux[ig].real()),
+                                              static_cast<FPTYPE>(aux[ig].imag()));
     }
-#endif
+
+    // Call the parallel operator
+    hamilt::cal_force_ew_parallel_op<FPTYPE, base_device::DEVICE_CPU> parallel_op;
+    parallel_op(this->ctx,
+                this->nat,
+                rho_basis->npw,
+                ucell.ntype,
+                rho_basis->ig_gge0,
+                gcar_flat.data(),
+                tau_flat.data(),
+                iat2it_array.data(),
+                it_facts.data(),
+                atom_na.data(),
+                zv_values.data(),
+                aux_fptype.data(),
+                static_cast<FPTYPE>(alpha),
+                static_cast<FPTYPE>(fact),
+                static_cast<FPTYPE>(ucell.lat0),
+                latvec_flat.data(),
+                G_flat.data(),
+                PARAM.inp.use_paw,
+                force_flat.data());
+
+    // Copy back the results
+    copy_back_ew_force_data<FPTYPE>(force_flat.data(), this->nat, forceion);
 
     Parallel_Reduce::reduce_pool(forceion.c, forceion.nr * forceion.nc);
 
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
index 6d797e147d..b5745f2ac5 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
@@ -1,4 +1,8 @@
 #include "module_hamilt_pw/hamilt_pwdft/kernels/force_op.h"
+#include "module_base/libm/libm.h"
+#include "module_base/tool_threading.h"
+#include "module_hamilt_general/module_ewald/H_Ewald_pw.h"
+#include <cmath>
 
 #ifdef _OPENMP
 #include <omp.h>
@@ -424,10 +428,261 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_CPU>
     }
 };
 
+// CPU implementation of local force sincos operator
+template <typename FPTYPE>
+struct cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_CPU>
+{
+    void operator()(const base_device::DEVICE_CPU* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ntype,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const int* iat2it,
+                    const FPTYPE* vloc_factors,
+                    const std::complex<FPTYPE>* aux,
+                    FPTYPE* force)
+    {
+        const FPTYPE TWO_PI = 2.0 * M_PI;
+
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+        for (int iat = 0; iat < nat; ++iat)
+        {
+            const int it = iat2it[iat];
+            const FPTYPE tau_x = tau[iat * 3 + 0];
+            const FPTYPE tau_y = tau[iat * 3 + 1];
+            const FPTYPE tau_z = tau[iat * 3 + 2];
+
+            FPTYPE local_force[3] = {0.0, 0.0, 0.0};
+
+            for (int ig = 0; ig < npw; ig++)
+            {
+                const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + 
+                                               gcar[ig * 3 + 1] * tau_y + 
+                                               gcar[ig * 3 + 2] * tau_z);
+                FPTYPE sinp, cosp;
+                ModuleBase::libm::sincos(phase, &sinp, &cosp);
+                
+                const FPTYPE vloc_factor = vloc_factors[it * npw + ig];
+                const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real());
+                
+                local_force[0] += gcar[ig * 3 + 0] * factor;
+                local_force[1] += gcar[ig * 3 + 1] * factor;
+                local_force[2] += gcar[ig * 3 + 2] * factor;
+            }
+
+            force[iat * 3 + 0] = local_force[0];
+            force[iat * 3 + 1] = local_force[1];
+            force[iat * 3 + 2] = local_force[2];
+        }
+    }
+};
+
+// CPU implementation of Ewald parallel operator
+template <typename FPTYPE>
+struct cal_force_ew_parallel_op<FPTYPE, base_device::DEVICE_CPU>
+{
+    void operator()(const base_device::DEVICE_CPU* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ntype,
+                    const int& ig_gge0,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const int* iat2it,
+                    const FPTYPE* it_facts,
+                    const int* atom_na,
+                    const FPTYPE* zv_values,
+                    const std::complex<FPTYPE>* aux,
+                    const FPTYPE& alpha,
+                    const FPTYPE& fact,
+                    const FPTYPE& lat0,
+                    const FPTYPE* latvec,
+                    const FPTYPE* G,
+                    const bool& use_paw,
+                    FPTYPE* force)
+    {
+        const FPTYPE TWO_PI = 2.0 * M_PI;
+        const FPTYPE e2 = 2.0; // ModuleBase::e2 equivalent
+
+#ifdef _OPENMP
+#pragma omp parallel
+        {
+            int num_threads = omp_get_num_threads();
+            int thread_id = omp_get_thread_num();
+#else
+        int num_threads = 1;
+        int thread_id = 0;
+#endif
+
+            // Task distribution for multi-thread
+            int iat_beg, iat_end;
+            ModuleBase::TASK_DIST_1D(num_threads, thread_id, nat, iat_beg, iat_end);
+            iat_end = iat_beg + iat_end;
+
+            // Preprocess ig_gap for skipping the ig point
+            int ig_gap = (ig_gge0 >= 0 && ig_gge0 < npw) ? ig_gge0 : -1;
+
+            // Part 1: sincos calculation (replacing original while loop with for loop)
+            for (int iat = iat_beg; iat < iat_end; ++iat)
+            {
+                const int it = iat2it[iat];
+                
+                // Check if this atom type has atoms
+                if (atom_na[it] == 0) continue;
+
+                const FPTYPE tau_x = tau[iat * 3 + 0];
+                const FPTYPE tau_y = tau[iat * 3 + 1];
+                const FPTYPE tau_z = tau[iat * 3 + 2];
+                const FPTYPE it_fact = it_facts[iat];
+
+                FPTYPE local_force[3] = {0.0, 0.0, 0.0};
+
+                // G-vector loop with ig_gap handling (using for loops for GPU compatibility)
+                for (int ig = 0; ig < npw; ig++)
+                {
+                    // Skip G=0 term
+                    if (ig == ig_gap) continue;
+
+                    const FPTYPE arg = TWO_PI * (gcar[ig * 3 + 0] * tau_x + 
+                                                 gcar[ig * 3 + 1] * tau_y + 
+                                                 gcar[ig * 3 + 2] * tau_z);
+                    FPTYPE sinp, cosp;
+                    ModuleBase::libm::sincos(arg, &sinp, &cosp);
+                    
+                    const FPTYPE sumnb = -cosp * aux[ig].imag() + sinp * aux[ig].real();
+                    
+                    local_force[0] += gcar[ig * 3 + 0] * sumnb;
+                    local_force[1] += gcar[ig * 3 + 1] * sumnb;
+                    local_force[2] += gcar[ig * 3 + 2] * sumnb;
+                }
+
+                // Apply it_fact scaling
+                force[iat * 3 + 0] += local_force[0] * it_fact;
+                force[iat * 3 + 1] += local_force[1] * it_fact;
+                force[iat * 3 + 2] += local_force[2] * it_fact;
+            }
+
+            // Part 2: Real space Ewald calculation (when processor contains G=0 term)
+            if (ig_gge0 >= 0)
+            {
+                const FPTYPE rmax = 5.0 / (sqrt(alpha) * lat0);
+                const int mxr = 200;
+                
+                // Allocate temporary arrays for each thread
+                ModuleBase::Vector3<double>* r_flat = new ModuleBase::Vector3<double>[mxr];
+                double* r2 = new double[mxr];
+                int* irr = new int[mxr];
+                
+                // Initialize arrays
+                for (int i = 0; i < mxr; i++) {
+                    r2[i] = 0.0;
+                    irr[i] = 0;
+                    r_flat[i].x = 0.0;
+                    r_flat[i].y = 0.0;
+                    r_flat[i].z = 0.0;
+                }
+
+                const FPTYPE sqa = sqrt(alpha);
+                const FPTYPE sq8a_2pi = sqrt(8.0 * alpha / TWO_PI);
+
+                // Iterate over atoms in assigned range
+                for (int iat1 = iat_beg; iat1 < iat_end; ++iat1)
+                {
+                    const int T1 = iat2it[iat1];
+                    if (atom_na[T1] == 0) continue;
+
+                    const FPTYPE tau1_x = tau[iat1 * 3 + 0];
+                    const FPTYPE tau1_y = tau[iat1 * 3 + 1];
+                    const FPTYPE tau1_z = tau[iat1 * 3 + 2];
+
+                    // Iterate over all other atoms
+                    for (int iat2 = 0; iat2 < nat; ++iat2)
+                    {
+                        const int T2 = iat2it[iat2];
+                        
+                        if (iat1 != iat2 && atom_na[T2] != 0 && atom_na[T1] != 0)
+                        {
+                            const FPTYPE tau2_x = tau[iat2 * 3 + 0];
+                            const FPTYPE tau2_y = tau[iat2 * 3 + 1];
+                            const FPTYPE tau2_z = tau[iat2 * 3 + 2];
+                            
+                            // Calculate d_tau
+                            const FPTYPE d_tau_x = tau1_x - tau2_x;
+                            const FPTYPE d_tau_y = tau1_y - tau2_y;
+                            const FPTYPE d_tau_z = tau1_z - tau2_z;
+
+                            // Reconstruct Matrix3 objects from flat arrays for rgen call
+                            ModuleBase::Matrix3 latvec_matrix(
+                                static_cast<double>(latvec[0]), static_cast<double>(latvec[1]), static_cast<double>(latvec[2]),
+                                static_cast<double>(latvec[3]), static_cast<double>(latvec[4]), static_cast<double>(latvec[5]),
+                                static_cast<double>(latvec[6]), static_cast<double>(latvec[7]), static_cast<double>(latvec[8])
+                            );
+                            
+                            ModuleBase::Matrix3 G_matrix(
+                                static_cast<double>(G[0]), static_cast<double>(G[1]), static_cast<double>(G[2]),
+                                static_cast<double>(G[3]), static_cast<double>(G[4]), static_cast<double>(G[5]),
+                                static_cast<double>(G[6]), static_cast<double>(G[7]), static_cast<double>(G[8])
+                            );
+                            
+                            ModuleBase::Vector3<double> dtau_vec(
+                                static_cast<double>(d_tau_x),
+                                static_cast<double>(d_tau_y),
+                                static_cast<double>(d_tau_z)
+                            );
+
+                            // Call H_Ewald_pw::rgen to generate neighbor shells
+                            int nrm = 0;
+                            H_Ewald_pw::rgen(dtau_vec, static_cast<double>(rmax), irr, latvec_matrix, G_matrix, r_flat, r2, nrm);
+                            
+                            // Process all generated r vectors
+                            for (int nr = 0; nr < nrm; nr++)
+                            {
+                                const FPTYPE rr = sqrt(r2[nr]) * lat0;
+                                
+                                FPTYPE factor;
+                                if (use_paw)
+                                {
+                                    factor = zv_values[T1] * zv_values[T2] * e2 / (rr * rr)
+                                             * (erfc(sqa * rr) / rr + sq8a_2pi * exp(-alpha * rr * rr))
+                                             * lat0;
+                                }
+                                else
+                                {
+                                    factor = zv_values[T1] * zv_values[T2] * e2 / (rr * rr)
+                                             * (erfc(sqa * rr) / rr + sq8a_2pi * exp(-alpha * rr * rr))
+                                             * lat0;
+                                }
+
+                                force[iat1 * 3 + 0] -= factor * static_cast<FPTYPE>(r_flat[nr].x);
+                                force[iat1 * 3 + 1] -= factor * static_cast<FPTYPE>(r_flat[nr].y);
+                                force[iat1 * 3 + 2] -= factor * static_cast<FPTYPE>(r_flat[nr].z);
+                            }
+                        }
+                    }
+                }
+
+                delete[] r_flat;
+                delete[] r2;
+                delete[] irr;
+            }
+
+#ifdef _OPENMP
+        }
+#endif
+    }
+};
+
 template struct cal_vkb1_nl_op<float, base_device::DEVICE_CPU>;
 template struct cal_force_nl_op<float, base_device::DEVICE_CPU>;
+template struct cal_force_loc_sincos_op<float, base_device::DEVICE_CPU>;
+template struct cal_force_ew_parallel_op<float, base_device::DEVICE_CPU>;
 
 template struct cal_vkb1_nl_op<double, base_device::DEVICE_CPU>;
 template struct cal_force_nl_op<double, base_device::DEVICE_CPU>;
+template struct cal_force_loc_sincos_op<double, base_device::DEVICE_CPU>;
+template struct cal_force_ew_parallel_op<double, base_device::DEVICE_CPU>;
 
 } // namespace hamilt
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
index 3aa5d4f87e..152f41e310 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
@@ -146,6 +146,83 @@ struct cal_force_nl_op
                     const FPTYPE* lambda,
                     const std::complex<FPTYPE>* becp,
                     const std::complex<FPTYPE>* dbecp,
+                                          FPTYPE* force);
+};
+
+template <typename FPTYPE, typename Device>
+struct cal_force_loc_sincos_op
+{
+    /// @brief Calculate local pseudopotential forces (sincos loop only)
+    ///
+    /// Input Parameters
+    /// @param ctx - which device this function runs on
+    /// @param nat - number of atoms
+    /// @param npw - number of plane waves
+    /// @param ntype - number of atom types
+    /// @param gcar - G-vector Cartesian coordinates [npw * 3]
+    /// @param tau - atomic positions [nat * 3]
+    /// @param iat2it - atom to type mapping [nat]
+    /// @param vloc_factors - precomputed vloc factors [ntype * npw]
+    /// @param aux - charge density in G-space [npw]
+    ///
+    /// Output Parameters
+    /// @param force - output forces [nat * 3]
+    void operator()(const Device* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ntype,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const int* iat2it,
+                    const FPTYPE* vloc_factors,
+                    const std::complex<FPTYPE>* aux,
+                    FPTYPE* force);
+};
+
+template <typename FPTYPE, typename Device>
+struct cal_force_ew_parallel_op
+{
+    /// @brief Calculate Ewald forces (parallel region only)
+    ///
+    /// Input Parameters
+    /// @param ctx - which device this function runs on
+    /// @param nat - number of atoms
+    /// @param npw - number of plane waves
+    /// @param ntype - number of atom types
+    /// @param ig_gge0 - index of G=0 vector (-1 if not present)
+    /// @param gcar - G-vector Cartesian coordinates [npw * 3]
+    /// @param tau - atomic positions [nat * 3]
+    /// @param iat2it - atom to type mapping [nat]
+    /// @param it_facts - precomputed it_fact for each atom [nat]
+    /// @param atom_na - number of atoms of each type [ntype]
+    /// @param zv_values - valence charge for each type [ntype]
+    /// @param aux - structure factor related array [npw]
+    /// @param alpha - Ewald parameter
+    /// @param fact - scaling factor
+    /// @param lat0 - lattice constant
+    /// @param latvec - lattice vectors [3 * 3]
+    /// @param G - reciprocal lattice vectors [3 * 3]
+    /// @param use_pwa - whether PAW is used
+    /// Output Parameters
+    /// @param force - output forces [nat * 3]
+    void operator()(const Device* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ntype,
+                    const int& ig_gge0,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const int* iat2it,
+                    const FPTYPE* it_facts,
+                    const int* atom_na,
+                    const FPTYPE* zv_values,
+                    const std::complex<FPTYPE>* aux,
+                    const FPTYPE& alpha,
+                    const FPTYPE& fact,
+                    const FPTYPE& lat0,
+                    const FPTYPE* latvec,
+                    const FPTYPE* G,
+                    const bool& use_paw,
                     FPTYPE* force);
 };
 
@@ -248,6 +325,36 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>
                     FPTYPE* force);
 };
 
+template <typename FPTYPE>
+struct cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_GPU>
+{
+    void operator()(const base_device::DEVICE_GPU* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ntype,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const int* iat2it,
+                    const FPTYPE* vloc_factors,
+                    const std::complex<FPTYPE>* aux,
+                    FPTYPE* force);
+};
+
+template <typename FPTYPE>
+struct cal_force_ew_parallel_op<FPTYPE, base_device::DEVICE_GPU>
+{
+    void operator()(const base_device::DEVICE_GPU* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ig_gge0,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const int* iat2it,
+                    const FPTYPE* it_facts,
+                    const std::complex<FPTYPE>* aux,
+                    FPTYPE* force);
+};
+
 /**
  * @brief revert the vkb values for force_nl calculation
  */

From 8a6339c4e0981dfc17e9fba881e9b375d0e64a71 Mon Sep 17 00:00:00 2001
From: Jie <li_j20@fudan.edu.cn>
Date: Fri, 30 May 2025 15:06:08 +0800
Subject: [PATCH 4/6] fix sincos op for gpu&cpu

---
 .../module_hamilt_pw/hamilt_pwdft/forces.cpp  | 614 ++++++++++--------
 .../hamilt_pwdft/kernels/cuda/force_op.cu     | 184 ++++++
 .../kernels/cuda/force_sincos_op.cu           | 204 ------
 .../hamilt_pwdft/kernels/force_op.cpp         | 204 +-----
 .../hamilt_pwdft/kernels/force_op.h           |  34 +-
 .../hamilt_pwdft/kernels/force_sincos_op.cpp  | 123 ----
 .../hamilt_pwdft/kernels/force_sincos_op.h    | 106 ---
 .../hamilt_pwdft/kernels/rocm/force_op.hip.cu | 185 ++++++
 .../kernels/rocm/force_sincos_op.hip.cu       | 203 ------
 9 files changed, 751 insertions(+), 1106 deletions(-)
 delete mode 100644 source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_sincos_op.cu
 delete mode 100644 source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.cpp
 delete mode 100644 source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.h
 delete mode 100644 source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_sincos_op.hip.cu

diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
index 52ddf7da63..5e1d3cd6a2 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
@@ -15,8 +15,7 @@
 #include "module_hamilt_general/module_ewald/H_Ewald_pw.h"
 #include "module_hamilt_general/module_surchem/surchem.h"
 #include "module_hamilt_general/module_vdw/vdw.h"
-#include "module_hamilt_pw/hamilt_pwdft/kernels/force_op.h"
-#include <vector>
+#include "kernels/force_op.h"
 
 #ifdef _OPENMP
 #include <omp.h>
@@ -25,208 +24,6 @@
 #include "module_cell/module_paw/paw_cell.h"
 #endif
 
-// Data preparation function for Ewald parallel operator
-template <typename FPTYPE>
-void prepare_ew_parallel_data(const UnitCell& ucell,
-                              ModulePW::PW_Basis* rho_basis,
-                              const std::complex<double>* aux,
-                              const double alpha,
-                              const double fact,
-                              std::vector<FPTYPE>& gcar_flat,
-                              std::vector<FPTYPE>& tau_flat,
-                              std::vector<int>& iat2it_array,
-                              std::vector<FPTYPE>& it_facts,
-                              std::vector<int>& atom_na,
-                              std::vector<FPTYPE>& zv_values,
-                              std::vector<FPTYPE>& latvec_flat,
-                              std::vector<FPTYPE>& G_flat)
-{
-    const int nat = ucell.nat;
-    const int npw = rho_basis->npw;
-    const int ntype = ucell.ntype;
-    
-    // Prepare gcar_flat
-    gcar_flat.resize(npw * 3);
-    for (int ig = 0; ig < npw; ig++)
-    {
-        gcar_flat[ig * 3 + 0] = static_cast<FPTYPE>(rho_basis->gcar[ig][0]);
-        gcar_flat[ig * 3 + 1] = static_cast<FPTYPE>(rho_basis->gcar[ig][1]);
-        gcar_flat[ig * 3 + 2] = static_cast<FPTYPE>(rho_basis->gcar[ig][2]);
-    }
-    
-    // Prepare tau_flat and iat2it_array
-    tau_flat.resize(nat * 3);
-    iat2it_array.resize(nat);
-    int iat = 0;
-    for (int it = 0; it < ntype; it++)
-    {
-        for (int ia = 0; ia < ucell.atoms[it].na; ia++)
-        {
-            tau_flat[iat * 3 + 0] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia].x);
-            tau_flat[iat * 3 + 1] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia].y);
-            tau_flat[iat * 3 + 2] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia].z);
-            iat2it_array[iat] = it;
-            iat++;
-        }
-    }
-    
-    // Prepare it_facts
-    it_facts.resize(nat);
-    iat = 0;
-    for (int it = 0; it < ntype; it++)
-    {
-        double zv;
-        if (PARAM.inp.use_paw)
-        {
-#ifdef USE_PAW
-            zv = GlobalC::paw_cell.get_val(it);
-#else
-            zv = ucell.atoms[it].ncpp.zv;
-#endif
-        }
-        else
-        {
-            zv = ucell.atoms[it].ncpp.zv;
-        }
-        
-        const FPTYPE it_fact = static_cast<FPTYPE>(zv * ModuleBase::e2 * ucell.tpiba * ModuleBase::TWO_PI / ucell.omega * fact);
-        
-        for (int ia = 0; ia < ucell.atoms[it].na; ia++)
-        {
-            it_facts[iat] = it_fact;
-            iat++;
-        }
-    }
-    
-    // Prepare atom_na
-    atom_na.resize(ntype);
-    for (int it = 0; it < ntype; it++)
-    {
-        atom_na[it] = ucell.atoms[it].na;
-    }
-    
-    // Prepare zv_values
-    zv_values.resize(ntype);
-    for (int it = 0; it < ntype; it++)
-    {
-        if (PARAM.inp.use_paw)
-        {
-#ifdef USE_PAW
-            zv_values[it] = static_cast<FPTYPE>(GlobalC::paw_cell.get_val(it));
-#else
-            zv_values[it] = static_cast<FPTYPE>(ucell.atoms[it].ncpp.zv);
-#endif
-        }
-        else
-        {
-            zv_values[it] = static_cast<FPTYPE>(ucell.atoms[it].ncpp.zv);
-        }
-    }
-    
-    // Prepare latvec_flat (row-major order)
-    latvec_flat.resize(9);
-    latvec_flat[0] = static_cast<FPTYPE>(ucell.latvec.e11);
-    latvec_flat[1] = static_cast<FPTYPE>(ucell.latvec.e12);
-    latvec_flat[2] = static_cast<FPTYPE>(ucell.latvec.e13);
-    latvec_flat[3] = static_cast<FPTYPE>(ucell.latvec.e21);
-    latvec_flat[4] = static_cast<FPTYPE>(ucell.latvec.e22);
-    latvec_flat[5] = static_cast<FPTYPE>(ucell.latvec.e23);
-    latvec_flat[6] = static_cast<FPTYPE>(ucell.latvec.e31);
-    latvec_flat[7] = static_cast<FPTYPE>(ucell.latvec.e32);
-    latvec_flat[8] = static_cast<FPTYPE>(ucell.latvec.e33);
-    
-    // Prepare G_flat (row-major order)
-    G_flat.resize(9);
-    G_flat[0] = static_cast<FPTYPE>(ucell.G.e11);
-    G_flat[1] = static_cast<FPTYPE>(ucell.G.e12);
-    G_flat[2] = static_cast<FPTYPE>(ucell.G.e13);
-    G_flat[3] = static_cast<FPTYPE>(ucell.G.e21);
-    G_flat[4] = static_cast<FPTYPE>(ucell.G.e22);
-    G_flat[5] = static_cast<FPTYPE>(ucell.G.e23);
-    G_flat[6] = static_cast<FPTYPE>(ucell.G.e31);
-    G_flat[7] = static_cast<FPTYPE>(ucell.G.e32);
-    G_flat[8] = static_cast<FPTYPE>(ucell.G.e33);
-}
-
-// Data preparation function for local force sincos operator
-template <typename FPTYPE>
-void prepare_loc_sincos_data(const UnitCell& ucell,
-                             ModulePW::PW_Basis* rho_basis,
-                             const ModuleBase::matrix& vloc,
-                             const std::complex<double>* aux,
-                             std::vector<FPTYPE>& gcar_flat,
-                             std::vector<FPTYPE>& tau_flat,
-                             std::vector<int>& iat2it_array,
-                             std::vector<FPTYPE>& vloc_factors)
-{
-    const int nat = ucell.nat;
-    const int npw = rho_basis->npw;
-    
-    // Prepare gcar_flat
-    gcar_flat.resize(npw * 3);
-    for (int ig = 0; ig < npw; ig++)
-    {
-        gcar_flat[ig * 3 + 0] = static_cast<FPTYPE>(rho_basis->gcar[ig][0]);
-        gcar_flat[ig * 3 + 1] = static_cast<FPTYPE>(rho_basis->gcar[ig][1]);
-        gcar_flat[ig * 3 + 2] = static_cast<FPTYPE>(rho_basis->gcar[ig][2]);
-    }
-    
-    // Prepare tau_flat and iat2it_array
-    tau_flat.resize(nat * 3);
-    iat2it_array.resize(nat);
-    int iat = 0;
-    for (int it = 0; it < ucell.ntype; it++)
-    {
-        for (int ia = 0; ia < ucell.atoms[it].na; ia++)
-        {
-            tau_flat[iat * 3 + 0] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia].x);
-            tau_flat[iat * 3 + 1] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia].y);
-            tau_flat[iat * 3 + 2] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia].z);
-            iat2it_array[iat] = it;
-            iat++;
-        }
-    }
-    
-    // Prepare vloc_factors: vloc(it, ig2igg[ig]) for each G-vector and atom type
-    vloc_factors.resize(npw * ucell.ntype);
-    for (int it = 0; it < ucell.ntype; it++)
-    {
-        for (int ig = 0; ig < npw; ig++)
-        {
-            vloc_factors[it * npw + ig] = static_cast<FPTYPE>(vloc(it, rho_basis->ig2igg[ig]));
-        }
-    }
-}
-
-// Data copy back function for local force sincos operator
-template <typename FPTYPE>
-void copy_back_loc_force_data(const FPTYPE* force,
-                              const int nat,
-                              const FPTYPE& scale_factor,
-                              ModuleBase::matrix& forcelc)
-{
-    for (int iat = 0; iat < nat; iat++)
-    {
-        forcelc(iat, 0) = static_cast<double>(force[iat * 3 + 0] * scale_factor);
-        forcelc(iat, 1) = static_cast<double>(force[iat * 3 + 1] * scale_factor);
-        forcelc(iat, 2) = static_cast<double>(force[iat * 3 + 2] * scale_factor);
-    }
-}
-
-// Data copy back function for Ewald parallel operator
-template <typename FPTYPE>
-void copy_back_ew_force_data(const FPTYPE* force,
-                             const int nat,
-                             ModuleBase::matrix& forceion)
-{
-    for (int iat = 0; iat < nat; iat++)
-    {
-        forceion(iat, 0) = static_cast<double>(force[iat * 3 + 0]);
-        forceion(iat, 1) = static_cast<double>(force[iat * 3 + 1]);
-        forceion(iat, 2) = static_cast<double>(force[iat * 3 + 2]);
-    }
-}
-
 template <typename FPTYPE, typename Device>
 void Forces<FPTYPE, Device>::cal_force(UnitCell& ucell,
                                        ModuleBase::matrix& force,
@@ -735,39 +532,125 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
     // to G space. maybe need fftw with OpenMP
     rho_basis->real2recip(aux, aux);
 
-    // Prepare data for the sincos operator
-    std::vector<FPTYPE> gcar_flat, tau_flat, vloc_factors;
-    std::vector<int> iat2it_array;
-    prepare_loc_sincos_data<FPTYPE>(ucell, rho_basis, vloc, aux,
-                                    gcar_flat, tau_flat, iat2it_array, vloc_factors);
-
-    // Prepare force array for the operator
-    std::vector<FPTYPE> force_flat(this->nat * 3, 0.0);
-
-    // Convert aux to FPTYPE complex array
+    // =============== GPU/CPU异构优化：使用sincos op替换原循环 ===============
+    
+    
+    // 准备原子相关数据：按照全局原子索引iat顺序
+    std::vector<FPTYPE> tau_flat(this->nat * 3);
+    std::vector<int> iat2it_host(this->nat);
+    std::vector<FPTYPE> gcar_flat(rho_basis->npw * 3);
+    
+    // 按照原始代码逻辑：遍历全局原子索引iat，通过查表获取(it,ia)
+    for (int iat = 0; iat < this->nat; iat++) {
+        int it = ucell.iat2it[iat];  // 查表获取原子类型
+        int ia = ucell.iat2ia[iat];  // 查表获取该类型内的原子索引
+        
+        tau_flat[iat * 3 + 0] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][0]);
+        tau_flat[iat * 3 + 1] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][1]);
+        tau_flat[iat * 3 + 2] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][2]);
+        iat2it_host[iat] = it;
+    }
+    
+    for (int ig = 0; ig < rho_basis->npw; ig++) {
+        gcar_flat[ig * 3 + 0] = static_cast<FPTYPE>(rho_basis->gcar[ig][0]);
+        gcar_flat[ig * 3 + 1] = static_cast<FPTYPE>(rho_basis->gcar[ig][1]);
+        gcar_flat[ig * 3 + 2] = static_cast<FPTYPE>(rho_basis->gcar[ig][2]);
+    }
+    
+    // 重新计算vloc_factors考虑所有原子类型
+    std::vector<FPTYPE> vloc_per_type_host(ucell.ntype * rho_basis->npw);
+    for (int it = 0; it < ucell.ntype; it++) {
+        for (int ig = 0; ig < rho_basis->npw; ig++) {
+            vloc_per_type_host[it * rho_basis->npw + ig] = static_cast<FPTYPE>(vloc(it, rho_basis->ig2igg[ig]));
+        }
+    }
+    
+    // 转换aux到FPTYPE类型
     std::vector<std::complex<FPTYPE>> aux_fptype(rho_basis->npw);
-    for (int ig = 0; ig < rho_basis->npw; ig++)
+    for (int ig = 0; ig < rho_basis->npw; ig++) {
+        aux_fptype[ig] = static_cast<std::complex<FPTYPE>>(aux[ig]);
+    }
+    
+    // 设备端内存和指针设置（根据设备类型分支）
+    FPTYPE* d_gcar = gcar_flat.data();
+    FPTYPE* d_tau = tau_flat.data();
+    int* d_iat2it = iat2it_host.data();
+    FPTYPE* d_vloc_per_type = vloc_per_type_host.data();
+    std::complex<FPTYPE>* d_aux = aux_fptype.data();
+    FPTYPE* d_force = nullptr;
+    std::vector<FPTYPE> force_host(this->nat * 3);
+    
+    if (this->device == base_device::GpuDevice)
+    {
+        d_gcar = nullptr;
+        d_tau = nullptr;
+        d_iat2it = nullptr;
+        d_vloc_per_type = nullptr;
+        d_aux = nullptr;
+        
+        resmem_var_op()(this->ctx, d_gcar, rho_basis->npw * 3);
+        resmem_var_op()(this->ctx, d_tau, this->nat * 3);
+        resmem_int_op()(this->ctx, d_iat2it, this->nat);
+        resmem_var_op()(this->ctx, d_vloc_per_type, ucell.ntype * rho_basis->npw);
+        resmem_complex_op()(this->ctx, d_aux, rho_basis->npw);
+        resmem_var_op()(this->ctx, d_force, this->nat * 3);
+        
+        // 数据传输到设备
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_gcar, gcar_flat.data(), rho_basis->npw * 3);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_tau, tau_flat.data(), this->nat * 3);
+        syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, d_iat2it, iat2it_host.data(), this->nat);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_vloc_per_type, vloc_per_type_host.data(), ucell.ntype * rho_basis->npw);
+        syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_aux, aux_fptype.data(), rho_basis->npw);
+        
+        // 初始化force为0
+        base_device::memory::set_memory_op<FPTYPE, Device>()(this->ctx, d_force, 0.0, this->nat * 3);
+    }
+    else
     {
-        aux_fptype[ig] = std::complex<FPTYPE>(static_cast<FPTYPE>(aux[ig].real()),
-                                              static_cast<FPTYPE>(aux[ig].imag()));
-    }
-
-    // Call the sincos operator
-    hamilt::cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_CPU> sincos_op;
-    sincos_op(this->ctx,
-              this->nat,
-              rho_basis->npw,
-              ucell.ntype,
-              gcar_flat.data(),
-              tau_flat.data(),
-              iat2it_array.data(),
-              vloc_factors.data(),
-              aux_fptype.data(),
-              force_flat.data());
-
-    // Copy back the results with scaling
+        d_force = force_host.data();
+        // 对于CPU，直接初始化为0
+        std::fill(force_host.begin(), force_host.end(), static_cast<FPTYPE>(0.0));
+    }
+    
+    // 计算缩放因子
     const FPTYPE scale_factor = static_cast<FPTYPE>(ucell.tpiba * ucell.omega);
-    copy_back_loc_force_data<FPTYPE>(force_flat.data(), this->nat, scale_factor, forcelc);
+    
+    // 调用op进行sincos计算
+    hamilt::cal_force_loc_sincos_op<FPTYPE, Device>()(
+        this->ctx,
+        this->nat,
+        rho_basis->npw,
+        ucell.ntype,
+        d_gcar,
+        d_tau,
+        d_iat2it,
+        d_vloc_per_type,
+        d_aux,
+        scale_factor,
+        d_force
+    );
+    
+    // 根据设备类型处理结果
+    if (this->device == base_device::GpuDevice)
+    {
+        // 结果传回CPU
+        syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, force_host.data(), d_force, this->nat * 3);
+        
+        // 清理设备内存
+        delmem_var_op()(this->ctx, d_gcar);
+        delmem_var_op()(this->ctx, d_tau);
+        delmem_int_op()(this->ctx, d_iat2it);
+        delmem_var_op()(this->ctx, d_vloc_per_type);
+        delmem_complex_op()(this->ctx, d_aux);
+        delmem_var_op()(this->ctx, d_force);
+    }
+    
+    // 将结果写入forcelc矩阵
+    for (int iat = 0; iat < this->nat; iat++) {
+        forcelc(iat, 0) = static_cast<double>(force_host[iat * 3 + 0]);
+        forcelc(iat, 1) = static_cast<double>(force_host[iat * 3 + 1]);
+        forcelc(iat, 2) = static_cast<double>(force_host[iat * 3 + 2]);
+    }
 
     // this->print(GlobalV::ofs_running, "local forces", forcelc);
     Parallel_Reduce::reduce_pool(forcelc.c, forcelc.nr * forcelc.nc);
@@ -879,48 +762,233 @@ void Forces<FPTYPE, Device>::cal_force_ew(const UnitCell& ucell,
         aux[rho_basis->ig_gge0] = std::complex<double>(0.0, 0.0);
     }
 
-    // Prepare data for the parallel operator
-    std::vector<FPTYPE> gcar_flat, tau_flat, it_facts, zv_values, latvec_flat, G_flat;
-    std::vector<int> iat2it_array, atom_na;
-    prepare_ew_parallel_data<FPTYPE>(ucell, rho_basis, aux, alpha, fact,
-                                     gcar_flat, tau_flat, iat2it_array, it_facts,
-                                     atom_na, zv_values, latvec_flat, G_flat);
+    // =============== 第一步：G空间sincos计算（在OpenMP区域外）===============
+    
+    // 预计算每个原子的it_fact和相关数据：按照全局原子索引iat顺序
+    std::vector<FPTYPE> it_facts_host(this->nat);
+    std::vector<FPTYPE> tau_flat(this->nat * 3);
+    std::vector<int> iat2it_host(this->nat);
+    
+    // 按照原始代码逻辑：遍历全局原子索引iat，通过查表获取(it,ia)
+    for (int iat = 0; iat < this->nat; iat++) {
+        int it = ucell.iat2it[iat];  // 查表获取原子类型
+        int ia = ucell.iat2ia[iat];  // 查表获取该类型内的原子索引
+        
+        double zv;
+        if (PARAM.inp.use_paw)
+        {
+#ifdef USE_PAW
+            zv = GlobalC::paw_cell.get_val(it);
+#endif
+        }
+        else
+        {
+            zv = ucell.atoms[it].ncpp.zv;
+        }
+        
+        it_facts_host[iat] = static_cast<FPTYPE>(zv * ModuleBase::e2 * ucell.tpiba * 
+                                                ModuleBase::TWO_PI / ucell.omega * fact);
+        
+        tau_flat[iat * 3 + 0] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][0]);
+        tau_flat[iat * 3 + 1] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][1]); 
+        tau_flat[iat * 3 + 2] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][2]);
+        iat2it_host[iat] = it;
+    }
+    
+    // 准备设备端数据
+    std::vector<FPTYPE> gcar_flat(rho_basis->npw * 3);
+    for (int ig = 0; ig < rho_basis->npw; ig++) {
+        gcar_flat[ig * 3 + 0] = static_cast<FPTYPE>(rho_basis->gcar[ig][0]);
+        gcar_flat[ig * 3 + 1] = static_cast<FPTYPE>(rho_basis->gcar[ig][1]);
+        gcar_flat[ig * 3 + 2] = static_cast<FPTYPE>(rho_basis->gcar[ig][2]);
+    }
+    
+    // 转换aux到FPTYPE类型
+    std::vector<std::complex<FPTYPE>> aux_fptype(rho_basis->npw);
+    for (int ig = 0; ig < rho_basis->npw; ig++) {
+        aux_fptype[ig] = static_cast<std::complex<FPTYPE>>(aux[ig]);
+    }
+    
+    // 设备端内存和指针设置（根据设备类型分支）
+    FPTYPE* d_gcar = gcar_flat.data();
+    FPTYPE* d_tau = tau_flat.data();
+    int* d_iat2it = iat2it_host.data();
+    FPTYPE* d_it_facts = it_facts_host.data();
+    std::complex<FPTYPE>* d_aux = aux_fptype.data();
+    FPTYPE* d_force_g = nullptr;
+    std::vector<FPTYPE> force_g_host(this->nat * 3);
+    
+    if (this->device == base_device::GpuDevice)
+    {
+        d_gcar = nullptr;
+        d_tau = nullptr;
+        d_iat2it = nullptr;
+        d_it_facts = nullptr;
+        d_aux = nullptr;
+        
+        resmem_var_op()(this->ctx, d_gcar, rho_basis->npw * 3);
+        resmem_var_op()(this->ctx, d_tau, this->nat * 3);
+        resmem_int_op()(this->ctx, d_iat2it, this->nat);
+        resmem_var_op()(this->ctx, d_it_facts, this->nat);
+        resmem_complex_op()(this->ctx, d_aux, rho_basis->npw);
+        resmem_var_op()(this->ctx, d_force_g, this->nat * 3);
+        
+        // 数据传输
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_gcar, gcar_flat.data(), rho_basis->npw * 3);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_tau, tau_flat.data(), this->nat * 3);
+        syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, d_iat2it, iat2it_host.data(), this->nat);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_it_facts, it_facts_host.data(), this->nat);
+        syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_aux, aux_fptype.data(), rho_basis->npw);
+        
+        // 初始化force为0
+        base_device::memory::set_memory_op<FPTYPE, Device>()(this->ctx, d_force_g, 0.0, this->nat * 3);
+    }
+    else
+    {
+        d_force_g = force_g_host.data();
+        // 对于CPU，直接初始化为0
+        std::fill(force_g_host.begin(), force_g_host.end(), static_cast<FPTYPE>(0.0));
+    }
+    
+    // 调用op处理G空间sincos计算（在OpenMP区域外，无冲突）
+    hamilt::cal_force_ew_sincos_op<FPTYPE, Device>()(
+        this->ctx,
+        this->nat,
+        rho_basis->npw,
+        rho_basis->ig_gge0,  // G=0项索引，op内部会自动跳过
+        d_gcar,
+        d_tau,
+        d_iat2it,
+        d_it_facts,
+        d_aux,
+        d_force_g
+    );
+    
+    // 根据设备类型处理结果
+    if (this->device == base_device::GpuDevice)
+    {
+        // 将G空间结果传回CPU
+        syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, force_g_host.data(), d_force_g, this->nat * 3);
+        
+        // 清理设备内存
+        delmem_var_op()(this->ctx, d_gcar);
+        delmem_var_op()(this->ctx, d_tau);
+        delmem_int_op()(this->ctx, d_iat2it);
+        delmem_var_op()(this->ctx, d_it_facts);
+        delmem_complex_op()(this->ctx, d_aux);
+        delmem_var_op()(this->ctx, d_force_g);
+    }
+    
+    // 累加到forceion
+    for (int iat = 0; iat < this->nat; iat++) {
+        forceion(iat, 0) += static_cast<double>(force_g_host[iat * 3 + 0]);
+        forceion(iat, 1) += static_cast<double>(force_g_host[iat * 3 + 1]); 
+        forceion(iat, 2) += static_cast<double>(force_g_host[iat * 3 + 2]);
+    }
 
-    // Prepare force array for the operator
-    std::vector<FPTYPE> force_flat(this->nat * 3, 0.0);
+    // =============== 第二步：实空间计算（保留原OpenMP结构）===============
 
-    // Convert aux to FPTYPE complex array
-    std::vector<std::complex<FPTYPE>> aux_fptype(rho_basis->npw);
-    for (int ig = 0; ig < rho_basis->npw; ig++)
+#ifdef _OPENMP
+#pragma omp parallel
     {
-        aux_fptype[ig] = std::complex<FPTYPE>(static_cast<FPTYPE>(aux[ig].real()),
-                                              static_cast<FPTYPE>(aux[ig].imag()));
-    }
-
-    // Call the parallel operator
-    hamilt::cal_force_ew_parallel_op<FPTYPE, base_device::DEVICE_CPU> parallel_op;
-    parallel_op(this->ctx,
-                this->nat,
-                rho_basis->npw,
-                ucell.ntype,
-                rho_basis->ig_gge0,
-                gcar_flat.data(),
-                tau_flat.data(),
-                iat2it_array.data(),
-                it_facts.data(),
-                atom_na.data(),
-                zv_values.data(),
-                aux_fptype.data(),
-                static_cast<FPTYPE>(alpha),
-                static_cast<FPTYPE>(fact),
-                static_cast<FPTYPE>(ucell.lat0),
-                latvec_flat.data(),
-                G_flat.data(),
-                PARAM.inp.use_paw,
-                force_flat.data());
-
-    // Copy back the results
-    copy_back_ew_force_data<FPTYPE>(force_flat.data(), this->nat, forceion);
+        int num_threads = omp_get_num_threads();
+        int thread_id = omp_get_thread_num();
+#else
+    int num_threads = 1;
+    int thread_id = 0;
+#endif
+
+        /* Here is task distribution for multi-thread,
+            0. atom will be iterated both in main nat loop and the loop in `if (rho_basis->ig_gge0 >= 0)`.
+                To avoid syncing, we must calculate work range of each thread by our self
+            1. Calculate the iat range [iat_beg, iat_end) by each thread
+                a. when it is single thread stage, [iat_beg, iat_end) will be [0, nat)
+            2. each thread iterate atoms form `iat_beg` to `iat_end-1`
+        */
+        int iat_beg, iat_end;
+        int it_beg, ia_beg;
+        ModuleBase::TASK_DIST_1D(num_threads, thread_id, this->nat, iat_beg, iat_end);
+        iat_end = iat_beg + iat_end;
+        ucell.iat2iait(iat_beg, &ia_beg, &it_beg);
+
+        // 只保留实空间相互作用计算（means that the processor contains G=0 term）
+        if (rho_basis->ig_gge0 >= 0)
+        {
+            double rmax = 5.0 / (sqrt(alpha) * ucell.lat0);
+            int nrm = 0;
+
+            // output of rgen: the number of vectors in the sphere
+            const int mxr = 200;
+            // the maximum number of R vectors included in r
+            ModuleBase::Vector3<double>* r = new ModuleBase::Vector3<double>[mxr];
+            double* r2 = new double[mxr];
+            ModuleBase::GlobalFunc::ZEROS(r2, mxr);
+            int* irr = new int[mxr];
+            ModuleBase::GlobalFunc::ZEROS(irr, mxr);
+            // the square modulus of R_j-tau_s-tau_s'
+
+            int iat1 = iat_beg;
+            int T1 = it_beg;
+            int I1 = ia_beg;
+            const double sqa = sqrt(alpha);
+            const double sq8a_2pi = sqrt(8.0 * alpha / ModuleBase::TWO_PI);
+
+            // iterating atoms.
+            // do not need to sync threads because task range of each thread is isolated
+            while (iat1 < iat_end)
+            {
+                int iat2 = 0; // mohan fix bug 2011-06-07
+                int I2 = 0;
+                int T2 = 0;
+                while (iat2 < this->nat)
+                {
+                    if (iat1 != iat2 && ucell.atoms[T2].na != 0 && ucell.atoms[T1].na != 0)
+                    {
+                        ModuleBase::Vector3<double> d_tau
+                            = ucell.atoms[T1].tau[I1] - ucell.atoms[T2].tau[I2];
+                        H_Ewald_pw::rgen(d_tau, rmax, irr, ucell.latvec, ucell.G, r, r2, nrm);
+
+                        for (int n = 0; n < nrm; n++)
+                        {
+                            const double rr = sqrt(r2[n]) * ucell.lat0;
+
+                            double factor;
+                            if (PARAM.inp.use_paw)
+                            {
+#ifdef USE_PAW
+                                factor = GlobalC::paw_cell.get_val(T1) * GlobalC::paw_cell.get_val(T2) * ModuleBase::e2
+                                         / (rr * rr)
+                                         * (erfc(sqa * rr) / rr + sq8a_2pi * ModuleBase::libm::exp(-alpha * rr * rr))
+                                         * ucell.lat0;
+#endif
+                            }
+                            else
+                            {
+                                factor = ucell.atoms[T1].ncpp.zv * ucell.atoms[T2].ncpp.zv
+                                         * ModuleBase::e2 / (rr * rr)
+                                         * (erfc(sqa * rr) / rr + sq8a_2pi * ModuleBase::libm::exp(-alpha * rr * rr))
+                                         * ucell.lat0;
+                            }
+
+                            forceion(iat1, 0) -= factor * r[n].x;
+                            forceion(iat1, 1) -= factor * r[n].y;
+                            forceion(iat1, 2) -= factor * r[n].z;
+                        }
+                    }
+                    ++iat2;
+                    ucell.step_iait(&I2, &T2);
+                } // atom b
+                ++iat1;
+                ucell.step_iait(&I1, &T1);
+            } // atom a
+
+            delete[] r;
+            delete[] r2;
+            delete[] irr;
+        }
+#ifdef _OPENMP
+    }
+#endif
 
     Parallel_Reduce::reduce_pool(forceion.c, forceion.nr * forceion.nc);
 
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
index 5d0656d105..077864caff 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
@@ -13,6 +13,125 @@
 
 namespace hamilt {
 
+// CUDA kernels for sincos loops
+template <typename FPTYPE>
+__global__ void cal_force_loc_sincos_kernel(
+    const int nat,
+    const int npw,
+    const int ntype,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const int* iat2it,
+    const FPTYPE* vloc_per_type,
+    const thrust::complex<FPTYPE>* aux,
+    const FPTYPE scale_factor,
+    FPTYPE* force)
+{
+    const FPTYPE TWO_PI = 2.0 * M_PI;
+    
+    const int iat = blockIdx.y;
+    const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
+    const int ig_stride = gridDim.x * blockDim.x;
+    
+    if (iat >= nat) return;
+    
+    // Load atom information to registers
+    const int it = iat2it[iat];
+    const FPTYPE tau_x = tau[iat * 3 + 0];
+    const FPTYPE tau_y = tau[iat * 3 + 1];
+    const FPTYPE tau_z = tau[iat * 3 + 2];
+    
+    // Local accumulation variables
+    FPTYPE local_force_x = 0.0;
+    FPTYPE local_force_y = 0.0;
+    FPTYPE local_force_z = 0.0;
+    
+    // Grid-stride loop over G-vectors
+    for (int ig = ig_start; ig < npw; ig += ig_stride) {
+        // Calculate phase: 2π * (G · τ)
+        const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + 
+                                       gcar[ig * 3 + 1] * tau_y + 
+                                       gcar[ig * 3 + 2] * tau_z);
+        
+        // Use CUDA intrinsic for sincos
+        FPTYPE sinp, cosp;
+        sincos(phase, &sinp, &cosp);
+        
+        // Calculate force factor
+        const FPTYPE vloc_factor = vloc_per_type[it * npw + ig];
+        const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real());
+        
+        // Accumulate force contributions
+        local_force_x += gcar[ig * 3 + 0] * factor;
+        local_force_y += gcar[ig * 3 + 1] * factor;
+        local_force_z += gcar[ig * 3 + 2] * factor;
+    }
+    
+    // Atomic add to global memory
+    atomicAdd(&force[iat * 3 + 0], local_force_x * scale_factor);
+    atomicAdd(&force[iat * 3 + 1], local_force_y * scale_factor);
+    atomicAdd(&force[iat * 3 + 2], local_force_z * scale_factor);
+}
+
+template <typename FPTYPE>
+__global__ void cal_force_ew_sincos_kernel(
+    const int nat,
+    const int npw,
+    const int ig_gge0,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const int* iat2it,
+    const FPTYPE* it_facts,
+    const thrust::complex<FPTYPE>* aux,
+    FPTYPE* force)
+{
+    const FPTYPE TWO_PI = 2.0 * M_PI;
+    
+    const int iat = blockIdx.y;
+    const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
+    const int ig_stride = gridDim.x * blockDim.x;
+    
+    if (iat >= nat) return;
+    
+    // Load atom information to registers
+    const FPTYPE tau_x = tau[iat * 3 + 0];
+    const FPTYPE tau_y = tau[iat * 3 + 1];
+    const FPTYPE tau_z = tau[iat * 3 + 2];
+    const FPTYPE it_fact = it_facts[iat];
+    
+    // Local accumulation variables
+    FPTYPE local_force_x = 0.0;
+    FPTYPE local_force_y = 0.0;
+    FPTYPE local_force_z = 0.0;
+    
+    // Grid-stride loop over G-vectors
+    for (int ig = ig_start; ig < npw; ig += ig_stride) {
+        // Skip G=0 term
+        if (ig == ig_gge0) continue;
+        
+        // Calculate phase: 2π * (G · τ)
+        const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + 
+                                       gcar[ig * 3 + 1] * tau_y + 
+                                       gcar[ig * 3 + 2] * tau_z);
+        
+        // Use CUDA intrinsic for sincos
+        FPTYPE sinp, cosp;
+        sincos(phase, &sinp, &cosp);
+        
+        // Calculate Ewald sum contribution
+        const FPTYPE factor = it_fact * (cosp * aux[ig].imag() + sinp * aux[ig].real());
+        
+        // Accumulate force contributions
+        local_force_x += gcar[ig * 3 + 0] * factor;
+        local_force_y += gcar[ig * 3 + 1] * factor;
+        local_force_z += gcar[ig * 3 + 2] * factor;
+    }
+    
+    // Atomic add to global memory
+    atomicAdd(&force[iat * 3 + 0], local_force_x);
+    atomicAdd(&force[iat * 3 + 1], local_force_y);
+    atomicAdd(&force[iat * 3 + 2], local_force_z);
+}
 
 template <typename FPTYPE>
 __global__ void cal_vkb1_nl(
@@ -188,6 +307,67 @@ void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_dev
     cudaCheckOnDebug();
 }
 
+// GPU operators
+template <typename FPTYPE>
+void cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& nat,
+    const int& npw,
+    const int& ntype,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const int* iat2it,
+    const FPTYPE* vloc_per_type,
+    const std::complex<FPTYPE>* aux,
+    const FPTYPE& scale_factor,
+    FPTYPE* force)
+{
+    // Calculate optimal grid configuration for GPU load balancing
+    const int threads_per_block = THREADS_PER_BLOCK;
+    const int max_blocks_per_sm = 32; // Configurable per GPU architecture
+    const int max_blocks_x = std::min(max_blocks_per_sm, (npw + threads_per_block - 1) / threads_per_block);
+    
+    dim3 grid(max_blocks_x, nat);
+    dim3 block(threads_per_block);
+    
+    cal_force_loc_sincos_kernel<FPTYPE><<<grid, block>>>(
+        nat, npw, ntype, gcar, tau, iat2it, vloc_per_type,
+        reinterpret_cast<const thrust::complex<FPTYPE>*>(aux),
+        scale_factor, force
+    );
+    
+    cudaCheckOnDebug();
+}
+
+template <typename FPTYPE>
+void cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& nat,
+    const int& npw,
+    const int& ig_gge0,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const int* iat2it,
+    const FPTYPE* it_facts,
+    const std::complex<FPTYPE>* aux,
+    FPTYPE* force)
+{
+    // Calculate optimal grid configuration for GPU load balancing
+    const int threads_per_block = THREADS_PER_BLOCK;
+    const int max_blocks_per_sm = 32; // Configurable per GPU architecture
+    const int max_blocks_x = std::min(max_blocks_per_sm, (npw + threads_per_block - 1) / threads_per_block);
+    
+    dim3 grid(max_blocks_x, nat);
+    dim3 block(threads_per_block);
+    
+    cal_force_ew_sincos_kernel<FPTYPE><<<grid, block>>>(
+        nat, npw, ig_gge0, gcar, tau, iat2it, it_facts,
+        reinterpret_cast<const thrust::complex<FPTYPE>*>(aux), force
+    );
+    
+    cudaCheckOnDebug();
+}
+
 template <typename FPTYPE>
 __global__ void cal_force_nl(
         const int ntype,
@@ -613,8 +793,12 @@ template void saveVkbValues<double>(const int *gcar_zero_ptrs, const std::comple
 
 template struct cal_vkb1_nl_op<float, base_device::DEVICE_GPU>;
 template struct cal_force_nl_op<float, base_device::DEVICE_GPU>;
+template struct cal_force_loc_sincos_op<float, base_device::DEVICE_GPU>;
+template struct cal_force_ew_sincos_op<float, base_device::DEVICE_GPU>;
 
 template struct cal_vkb1_nl_op<double, base_device::DEVICE_GPU>;
 template struct cal_force_nl_op<double, base_device::DEVICE_GPU>;
+template struct cal_force_loc_sincos_op<double, base_device::DEVICE_GPU>;
+template struct cal_force_ew_sincos_op<double, base_device::DEVICE_GPU>;
 
 }  // namespace hamilt
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_sincos_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_sincos_op.cu
deleted file mode 100644
index f97214893e..0000000000
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_sincos_op.cu
+++ /dev/null
@@ -1,204 +0,0 @@
-#include "module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.h"
-#include "module_base/module_device/types.h"
-
-#include <complex>
-#include <thrust/complex.h>
-#include <cuda_runtime.h>
-#include <base/macros/macros.h>
-
-#define THREADS_PER_BLOCK 256
-
-namespace hamilt {
-
-// CUDA kernels for sincos loops
-template <typename FPTYPE>
-__global__ void cal_force_loc_sincos_kernel(
-    const int nat,
-    const int npw,
-    const FPTYPE* gcar,
-    const FPTYPE* tau,
-    const int* iat2it,
-    const FPTYPE* vloc_factors,
-    const thrust::complex<FPTYPE>* aux,
-    const FPTYPE scale_factor,
-    FPTYPE* force)
-{
-    const FPTYPE TWO_PI = 2.0 * M_PI;
-    
-    const int iat = blockIdx.y;
-    const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
-    const int ig_stride = gridDim.x * blockDim.x;
-    
-    if (iat >= nat) return;
-    
-    // Load atom information to registers
-    const int it = iat2it[iat];
-    const FPTYPE tau_x = tau[iat * 3 + 0];
-    const FPTYPE tau_y = tau[iat * 3 + 1];
-    const FPTYPE tau_z = tau[iat * 3 + 2];
-    
-    // Local accumulation variables
-    FPTYPE local_force_x = 0.0;
-    FPTYPE local_force_y = 0.0;
-    FPTYPE local_force_z = 0.0;
-    
-    // Grid-stride loop over G-vectors
-    for (int ig = ig_start; ig < npw; ig += ig_stride) {
-        // Calculate phase: 2π * (G · τ)
-        const FPTYPE phase = TWO_PI * (
-            gcar[ig * 3 + 0] * tau_x + 
-            gcar[ig * 3 + 1] * tau_y + 
-            gcar[ig * 3 + 2] * tau_z
-        );
-        
-        // Use CUDA intrinsic for sincos
-        FPTYPE sinp, cosp;
-        __sincos(phase, &sinp, &cosp);
-        
-        // Calculate force factor
-        const FPTYPE factor = vloc_factors[ig] * 
-                              (cosp * aux[ig].imag() + sinp * aux[ig].real());
-        
-        // Accumulate force contributions
-        local_force_x += gcar[ig * 3 + 0] * factor;
-        local_force_y += gcar[ig * 3 + 1] * factor;
-        local_force_z += gcar[ig * 3 + 2] * factor;
-    }
-    
-    // Atomic add to global memory
-    atomicAdd(&force[iat * 3 + 0], local_force_x * scale_factor);
-    atomicAdd(&force[iat * 3 + 1], local_force_y * scale_factor);
-    atomicAdd(&force[iat * 3 + 2], local_force_z * scale_factor);
-}
-
-template <typename FPTYPE>
-__global__ void cal_force_ew_sincos_kernel(
-    const int nat,
-    const int npw,
-    const int ig_gge0,
-    const FPTYPE* gcar,
-    const FPTYPE* tau,
-    const int* iat2it,
-    const FPTYPE* it_facts,
-    const thrust::complex<FPTYPE>* aux,
-    FPTYPE* force)
-{
-    const FPTYPE TWO_PI = 2.0 * M_PI;
-    
-    const int iat = blockIdx.y;
-    const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
-    const int ig_stride = gridDim.x * blockDim.x;
-    
-    if (iat >= nat) return;
-    
-    // Load atom information to registers
-    const int it = iat2it[iat];
-    const FPTYPE tau_x = tau[iat * 3 + 0];
-    const FPTYPE tau_y = tau[iat * 3 + 1];
-    const FPTYPE tau_z = tau[iat * 3 + 2];
-    const FPTYPE it_fact = it_facts[iat];
-    
-    // Local accumulation variables
-    FPTYPE local_force_x = 0.0;
-    FPTYPE local_force_y = 0.0;
-    FPTYPE local_force_z = 0.0;
-    
-    // Grid-stride loop over G-vectors
-    for (int ig = ig_start; ig < npw; ig += ig_stride) {
-        // Skip G=0 term
-        if (ig == ig_gge0) continue;
-        
-        // Calculate phase: 2π * (G · τ)
-        const FPTYPE arg = TWO_PI * (
-            gcar[ig * 3 + 0] * tau_x + 
-            gcar[ig * 3 + 1] * tau_y + 
-            gcar[ig * 3 + 2] * tau_z
-        );
-        
-        // Use CUDA intrinsic for sincos
-        FPTYPE sinp, cosp;
-        __sincos(arg, &sinp, &cosp);
-        
-        // Calculate Ewald sum contribution
-        const FPTYPE sumnb = -cosp * aux[ig].imag() + sinp * aux[ig].real();
-        
-        // Accumulate force contributions
-        local_force_x += gcar[ig * 3 + 0] * sumnb;
-        local_force_y += gcar[ig * 3 + 1] * sumnb;
-        local_force_z += gcar[ig * 3 + 2] * sumnb;
-    }
-    
-    // Atomic add to global memory
-    atomicAdd(&force[iat * 3 + 0], local_force_x * it_fact);
-    atomicAdd(&force[iat * 3 + 1], local_force_y * it_fact);
-    atomicAdd(&force[iat * 3 + 2], local_force_z * it_fact);
-}
-
-// GPU operator implementations
-template <typename FPTYPE>
-void cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
-    const base_device::DEVICE_GPU* ctx,
-    const int& nat,
-    const int& npw,
-    const FPTYPE* gcar,
-    const FPTYPE* tau,
-    const int* iat2it,
-    const FPTYPE* vloc_factors,
-    const std::complex<FPTYPE>* aux,
-    const FPTYPE& scale_factor,
-    FPTYPE* force)
-{
-    // Calculate optimal grid configuration
-    const int threads_per_block = THREADS_PER_BLOCK;
-    const int max_blocks_x = min(32, (npw + threads_per_block - 1) / threads_per_block);
-    
-    dim3 grid(max_blocks_x, nat);
-    dim3 block(threads_per_block);
-    
-    cal_force_loc_sincos_kernel<FPTYPE><<<grid, block>>>(
-        nat, npw,
-        gcar, tau, iat2it, vloc_factors,
-        reinterpret_cast<const thrust::complex<FPTYPE>*>(aux),
-        scale_factor, force
-    );
-    
-    cudaCheckOnDebug();
-}
-
-template <typename FPTYPE>
-void cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
-    const base_device::DEVICE_GPU* ctx,
-    const int& nat,
-    const int& npw,
-    const int& ig_gge0,
-    const FPTYPE* gcar,
-    const FPTYPE* tau,
-    const int* iat2it,
-    const FPTYPE* it_facts,
-    const std::complex<FPTYPE>* aux,
-    FPTYPE* force)
-{
-    // Calculate optimal grid configuration
-    const int threads_per_block = THREADS_PER_BLOCK;
-    const int max_blocks_x = min(32, (npw + threads_per_block - 1) / threads_per_block);
-    
-    dim3 grid(max_blocks_x, nat);
-    dim3 block(threads_per_block);
-    
-    cal_force_ew_sincos_kernel<FPTYPE><<<grid, block>>>(
-        nat, npw, ig_gge0,
-        gcar, tau, iat2it, it_facts,
-        reinterpret_cast<const thrust::complex<FPTYPE>*>(aux),
-        force
-    );
-    
-    cudaCheckOnDebug();
-}
-
-template struct cal_force_loc_sincos_op<float, base_device::DEVICE_GPU>;
-template struct cal_force_loc_sincos_op<double, base_device::DEVICE_GPU>;
-
-template struct cal_force_ew_sincos_op<float, base_device::DEVICE_GPU>;
-template struct cal_force_ew_sincos_op<double, base_device::DEVICE_GPU>;
-
-}  // namespace hamilt 
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
index b5745f2ac5..e0dc91a70c 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
@@ -439,8 +439,9 @@ struct cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_CPU>
                     const FPTYPE* gcar,
                     const FPTYPE* tau,
                     const int* iat2it,
-                    const FPTYPE* vloc_factors,
+                    const FPTYPE* vloc_per_type,
                     const std::complex<FPTYPE>* aux,
+                    const FPTYPE& scale_factor,
                     FPTYPE* force)
     {
         const FPTYPE TWO_PI = 2.0 * M_PI;
@@ -465,8 +466,8 @@ struct cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_CPU>
                 FPTYPE sinp, cosp;
                 ModuleBase::libm::sincos(phase, &sinp, &cosp);
                 
-                const FPTYPE vloc_factor = vloc_factors[it * npw + ig];
-                const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real());
+                const FPTYPE vloc_factor = vloc_per_type[it * npw + ig];
+                const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real()) * scale_factor;
                 
                 local_force[0] += gcar[ig * 3 + 0] * factor;
                 local_force[1] += gcar[ig * 3 + 1] * factor;
@@ -480,209 +481,66 @@ struct cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_CPU>
     }
 };
 
-// CPU implementation of Ewald parallel operator
+// CPU implementation of Ewald force sincos operator
 template <typename FPTYPE>
-struct cal_force_ew_parallel_op<FPTYPE, base_device::DEVICE_CPU>
+struct cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_CPU>
 {
     void operator()(const base_device::DEVICE_CPU* ctx,
                     const int& nat,
                     const int& npw,
-                    const int& ntype,
                     const int& ig_gge0,
                     const FPTYPE* gcar,
                     const FPTYPE* tau,
                     const int* iat2it,
                     const FPTYPE* it_facts,
-                    const int* atom_na,
-                    const FPTYPE* zv_values,
                     const std::complex<FPTYPE>* aux,
-                    const FPTYPE& alpha,
-                    const FPTYPE& fact,
-                    const FPTYPE& lat0,
-                    const FPTYPE* latvec,
-                    const FPTYPE* G,
-                    const bool& use_paw,
                     FPTYPE* force)
     {
         const FPTYPE TWO_PI = 2.0 * M_PI;
-        const FPTYPE e2 = 2.0; // ModuleBase::e2 equivalent
 
 #ifdef _OPENMP
-#pragma omp parallel
-        {
-            int num_threads = omp_get_num_threads();
-            int thread_id = omp_get_thread_num();
-#else
-        int num_threads = 1;
-        int thread_id = 0;
+#pragma omp parallel for
 #endif
+        for (int iat = 0; iat < nat; ++iat)
+        {
+            const FPTYPE tau_x = tau[iat * 3 + 0];
+            const FPTYPE tau_y = tau[iat * 3 + 1];
+            const FPTYPE tau_z = tau[iat * 3 + 2];
+            const FPTYPE it_fact = it_facts[iat];
 
-            // Task distribution for multi-thread
-            int iat_beg, iat_end;
-            ModuleBase::TASK_DIST_1D(num_threads, thread_id, nat, iat_beg, iat_end);
-            iat_end = iat_beg + iat_end;
-
-            // Preprocess ig_gap for skipping the ig point
-            int ig_gap = (ig_gge0 >= 0 && ig_gge0 < npw) ? ig_gge0 : -1;
+            FPTYPE local_force[3] = {0.0, 0.0, 0.0};
 
-            // Part 1: sincos calculation (replacing original while loop with for loop)
-            for (int iat = iat_beg; iat < iat_end; ++iat)
+            for (int ig = 0; ig < npw; ig++)
             {
-                const int it = iat2it[iat];
-                
-                // Check if this atom type has atoms
-                if (atom_na[it] == 0) continue;
-
-                const FPTYPE tau_x = tau[iat * 3 + 0];
-                const FPTYPE tau_y = tau[iat * 3 + 1];
-                const FPTYPE tau_z = tau[iat * 3 + 2];
-                const FPTYPE it_fact = it_facts[iat];
+                // Skip G=0 term
+                if (ig == ig_gge0) continue;
 
-                FPTYPE local_force[3] = {0.0, 0.0, 0.0};
-
-                // G-vector loop with ig_gap handling (using for loops for GPU compatibility)
-                for (int ig = 0; ig < npw; ig++)
-                {
-                    // Skip G=0 term
-                    if (ig == ig_gap) continue;
-
-                    const FPTYPE arg = TWO_PI * (gcar[ig * 3 + 0] * tau_x + 
-                                                 gcar[ig * 3 + 1] * tau_y + 
-                                                 gcar[ig * 3 + 2] * tau_z);
-                    FPTYPE sinp, cosp;
-                    ModuleBase::libm::sincos(arg, &sinp, &cosp);
-                    
-                    const FPTYPE sumnb = -cosp * aux[ig].imag() + sinp * aux[ig].real();
-                    
-                    local_force[0] += gcar[ig * 3 + 0] * sumnb;
-                    local_force[1] += gcar[ig * 3 + 1] * sumnb;
-                    local_force[2] += gcar[ig * 3 + 2] * sumnb;
-                }
-
-                // Apply it_fact scaling
-                force[iat * 3 + 0] += local_force[0] * it_fact;
-                force[iat * 3 + 1] += local_force[1] * it_fact;
-                force[iat * 3 + 2] += local_force[2] * it_fact;
-            }
-
-            // Part 2: Real space Ewald calculation (when processor contains G=0 term)
-            if (ig_gge0 >= 0)
-            {
-                const FPTYPE rmax = 5.0 / (sqrt(alpha) * lat0);
-                const int mxr = 200;
+                const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + 
+                                               gcar[ig * 3 + 1] * tau_y + 
+                                               gcar[ig * 3 + 2] * tau_z);
+                FPTYPE sinp, cosp;
+                ModuleBase::libm::sincos(phase, &sinp, &cosp);
                 
-                // Allocate temporary arrays for each thread
-                ModuleBase::Vector3<double>* r_flat = new ModuleBase::Vector3<double>[mxr];
-                double* r2 = new double[mxr];
-                int* irr = new int[mxr];
+                const FPTYPE factor = it_fact * (cosp * aux[ig].imag() + sinp * aux[ig].real());
                 
-                // Initialize arrays
-                for (int i = 0; i < mxr; i++) {
-                    r2[i] = 0.0;
-                    irr[i] = 0;
-                    r_flat[i].x = 0.0;
-                    r_flat[i].y = 0.0;
-                    r_flat[i].z = 0.0;
-                }
-
-                const FPTYPE sqa = sqrt(alpha);
-                const FPTYPE sq8a_2pi = sqrt(8.0 * alpha / TWO_PI);
-
-                // Iterate over atoms in assigned range
-                for (int iat1 = iat_beg; iat1 < iat_end; ++iat1)
-                {
-                    const int T1 = iat2it[iat1];
-                    if (atom_na[T1] == 0) continue;
-
-                    const FPTYPE tau1_x = tau[iat1 * 3 + 0];
-                    const FPTYPE tau1_y = tau[iat1 * 3 + 1];
-                    const FPTYPE tau1_z = tau[iat1 * 3 + 2];
-
-                    // Iterate over all other atoms
-                    for (int iat2 = 0; iat2 < nat; ++iat2)
-                    {
-                        const int T2 = iat2it[iat2];
-                        
-                        if (iat1 != iat2 && atom_na[T2] != 0 && atom_na[T1] != 0)
-                        {
-                            const FPTYPE tau2_x = tau[iat2 * 3 + 0];
-                            const FPTYPE tau2_y = tau[iat2 * 3 + 1];
-                            const FPTYPE tau2_z = tau[iat2 * 3 + 2];
-                            
-                            // Calculate d_tau
-                            const FPTYPE d_tau_x = tau1_x - tau2_x;
-                            const FPTYPE d_tau_y = tau1_y - tau2_y;
-                            const FPTYPE d_tau_z = tau1_z - tau2_z;
-
-                            // Reconstruct Matrix3 objects from flat arrays for rgen call
-                            ModuleBase::Matrix3 latvec_matrix(
-                                static_cast<double>(latvec[0]), static_cast<double>(latvec[1]), static_cast<double>(latvec[2]),
-                                static_cast<double>(latvec[3]), static_cast<double>(latvec[4]), static_cast<double>(latvec[5]),
-                                static_cast<double>(latvec[6]), static_cast<double>(latvec[7]), static_cast<double>(latvec[8])
-                            );
-                            
-                            ModuleBase::Matrix3 G_matrix(
-                                static_cast<double>(G[0]), static_cast<double>(G[1]), static_cast<double>(G[2]),
-                                static_cast<double>(G[3]), static_cast<double>(G[4]), static_cast<double>(G[5]),
-                                static_cast<double>(G[6]), static_cast<double>(G[7]), static_cast<double>(G[8])
-                            );
-                            
-                            ModuleBase::Vector3<double> dtau_vec(
-                                static_cast<double>(d_tau_x),
-                                static_cast<double>(d_tau_y),
-                                static_cast<double>(d_tau_z)
-                            );
-
-                            // Call H_Ewald_pw::rgen to generate neighbor shells
-                            int nrm = 0;
-                            H_Ewald_pw::rgen(dtau_vec, static_cast<double>(rmax), irr, latvec_matrix, G_matrix, r_flat, r2, nrm);
-                            
-                            // Process all generated r vectors
-                            for (int nr = 0; nr < nrm; nr++)
-                            {
-                                const FPTYPE rr = sqrt(r2[nr]) * lat0;
-                                
-                                FPTYPE factor;
-                                if (use_paw)
-                                {
-                                    factor = zv_values[T1] * zv_values[T2] * e2 / (rr * rr)
-                                             * (erfc(sqa * rr) / rr + sq8a_2pi * exp(-alpha * rr * rr))
-                                             * lat0;
-                                }
-                                else
-                                {
-                                    factor = zv_values[T1] * zv_values[T2] * e2 / (rr * rr)
-                                             * (erfc(sqa * rr) / rr + sq8a_2pi * exp(-alpha * rr * rr))
-                                             * lat0;
-                                }
-
-                                force[iat1 * 3 + 0] -= factor * static_cast<FPTYPE>(r_flat[nr].x);
-                                force[iat1 * 3 + 1] -= factor * static_cast<FPTYPE>(r_flat[nr].y);
-                                force[iat1 * 3 + 2] -= factor * static_cast<FPTYPE>(r_flat[nr].z);
-                            }
-                        }
-                    }
-                }
-
-                delete[] r_flat;
-                delete[] r2;
-                delete[] irr;
+                local_force[0] += gcar[ig * 3 + 0] * factor;
+                local_force[1] += gcar[ig * 3 + 1] * factor;
+                local_force[2] += gcar[ig * 3 + 2] * factor;
             }
 
-#ifdef _OPENMP
+            force[iat * 3 + 0] = local_force[0];
+            force[iat * 3 + 1] = local_force[1];
+            force[iat * 3 + 2] = local_force[2];
         }
-#endif
     }
 };
 
 template struct cal_vkb1_nl_op<float, base_device::DEVICE_CPU>;
 template struct cal_force_nl_op<float, base_device::DEVICE_CPU>;
 template struct cal_force_loc_sincos_op<float, base_device::DEVICE_CPU>;
-template struct cal_force_ew_parallel_op<float, base_device::DEVICE_CPU>;
-
+template struct cal_force_ew_sincos_op<float, base_device::DEVICE_CPU>;
 template struct cal_vkb1_nl_op<double, base_device::DEVICE_CPU>;
 template struct cal_force_nl_op<double, base_device::DEVICE_CPU>;
 template struct cal_force_loc_sincos_op<double, base_device::DEVICE_CPU>;
-template struct cal_force_ew_parallel_op<double, base_device::DEVICE_CPU>;
-
+template struct cal_force_ew_sincos_op<double, base_device::DEVICE_CPU>;
 } // namespace hamilt
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
index 152f41e310..3fc5d92212 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
@@ -162,8 +162,9 @@ struct cal_force_loc_sincos_op
     /// @param gcar - G-vector Cartesian coordinates [npw * 3]
     /// @param tau - atomic positions [nat * 3]
     /// @param iat2it - atom to type mapping [nat]
-    /// @param vloc_factors - precomputed vloc factors [ntype * npw]
+    /// @param vloc_per_type - precomputed vloc factors per type [ntype * npw]
     /// @param aux - charge density in G-space [npw]
+    /// @param scale_factor - tpiba * omega
     ///
     /// Output Parameters
     /// @param force - output forces [nat * 3]
@@ -174,55 +175,39 @@ struct cal_force_loc_sincos_op
                     const FPTYPE* gcar,
                     const FPTYPE* tau,
                     const int* iat2it,
-                    const FPTYPE* vloc_factors,
+                    const FPTYPE* vloc_per_type,
                     const std::complex<FPTYPE>* aux,
+                    const FPTYPE& scale_factor,
                     FPTYPE* force);
 };
 
 template <typename FPTYPE, typename Device>
-struct cal_force_ew_parallel_op
+struct cal_force_ew_sincos_op
 {
-    /// @brief Calculate Ewald forces (parallel region only)
+    /// @brief Calculate Ewald forces (sincos loop only)
     ///
     /// Input Parameters
     /// @param ctx - which device this function runs on
     /// @param nat - number of atoms
     /// @param npw - number of plane waves
-    /// @param ntype - number of atom types
     /// @param ig_gge0 - index of G=0 vector (-1 if not present)
     /// @param gcar - G-vector Cartesian coordinates [npw * 3]
     /// @param tau - atomic positions [nat * 3]
     /// @param iat2it - atom to type mapping [nat]
     /// @param it_facts - precomputed it_fact for each atom [nat]
-    /// @param atom_na - number of atoms of each type [ntype]
-    /// @param zv_values - valence charge for each type [ntype]
     /// @param aux - structure factor related array [npw]
-    /// @param alpha - Ewald parameter
-    /// @param fact - scaling factor
-    /// @param lat0 - lattice constant
-    /// @param latvec - lattice vectors [3 * 3]
-    /// @param G - reciprocal lattice vectors [3 * 3]
-    /// @param use_pwa - whether PAW is used
+    ///
     /// Output Parameters
     /// @param force - output forces [nat * 3]
     void operator()(const Device* ctx,
                     const int& nat,
                     const int& npw,
-                    const int& ntype,
                     const int& ig_gge0,
                     const FPTYPE* gcar,
                     const FPTYPE* tau,
                     const int* iat2it,
                     const FPTYPE* it_facts,
-                    const int* atom_na,
-                    const FPTYPE* zv_values,
                     const std::complex<FPTYPE>* aux,
-                    const FPTYPE& alpha,
-                    const FPTYPE& fact,
-                    const FPTYPE& lat0,
-                    const FPTYPE* latvec,
-                    const FPTYPE* G,
-                    const bool& use_paw,
                     FPTYPE* force);
 };
 
@@ -335,13 +320,14 @@ struct cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_GPU>
                     const FPTYPE* gcar,
                     const FPTYPE* tau,
                     const int* iat2it,
-                    const FPTYPE* vloc_factors,
+                    const FPTYPE* vloc_per_type,
                     const std::complex<FPTYPE>* aux,
+                    const FPTYPE& scale_factor,
                     FPTYPE* force);
 };
 
 template <typename FPTYPE>
-struct cal_force_ew_parallel_op<FPTYPE, base_device::DEVICE_GPU>
+struct cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_GPU>
 {
     void operator()(const base_device::DEVICE_GPU* ctx,
                     const int& nat,
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.cpp
deleted file mode 100644
index e7ce8554cc..0000000000
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.cpp
+++ /dev/null
@@ -1,123 +0,0 @@
-#include "force_sincos_op.h"
-#include "module_base/libm/libm.h"
-
-#ifdef _OPENMP
-#include <omp.h>
-#endif
-
-namespace hamilt
-{
-
-template <typename FPTYPE>
-struct cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_CPU>
-{
-    void operator()(const base_device::DEVICE_CPU* ctx,
-                    const int& nat,
-                    const int& npw,
-                    const FPTYPE* gcar,
-                    const FPTYPE* tau,
-                    const int* iat2it,
-                    const FPTYPE* vloc_factors,
-                    const std::complex<FPTYPE>* aux,
-                    const FPTYPE& scale_factor,
-                    FPTYPE* force)
-    {
-        const FPTYPE TWO_PI = 2.0 * M_PI;
-
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif
-        for (int iat = 0; iat < nat; ++iat)
-        {
-            const int it = iat2it[iat];
-            const FPTYPE tau_x = tau[iat * 3 + 0];
-            const FPTYPE tau_y = tau[iat * 3 + 1];
-            const FPTYPE tau_z = tau[iat * 3 + 2];
-
-            FPTYPE local_force[3] = {0.0, 0.0, 0.0};
-
-            for (int ig = 0; ig < npw; ig++)
-            {
-                const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + 
-                                               gcar[ig * 3 + 1] * tau_y + 
-                                               gcar[ig * 3 + 2] * tau_z);
-                FPTYPE sinp, cosp;
-                ModuleBase::libm::sincos(phase, &sinp, &cosp);
-                
-                const FPTYPE factor = vloc_factors[ig] * 
-                                     (cosp * aux[ig].imag() + sinp * aux[ig].real());
-                
-                local_force[0] += gcar[ig * 3 + 0] * factor;
-                local_force[1] += gcar[ig * 3 + 1] * factor;
-                local_force[2] += gcar[ig * 3 + 2] * factor;
-            }
-
-            force[iat * 3 + 0] += local_force[0] * scale_factor;
-            force[iat * 3 + 1] += local_force[1] * scale_factor;
-            force[iat * 3 + 2] += local_force[2] * scale_factor;
-        }
-    }
-};
-
-template <typename FPTYPE>
-struct cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_CPU>
-{
-    void operator()(const base_device::DEVICE_CPU* ctx,
-                    const int& nat,
-                    const int& npw,
-                    const int& ig_gge0,
-                    const FPTYPE* gcar,
-                    const FPTYPE* tau,
-                    const int* iat2it,
-                    const FPTYPE* it_facts,
-                    const std::complex<FPTYPE>* aux,
-                    FPTYPE* force)
-    {
-        const FPTYPE TWO_PI = 2.0 * M_PI;
-
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif
-        for (int iat = 0; iat < nat; ++iat)
-        {
-            const int it = iat2it[iat];
-            const FPTYPE tau_x = tau[iat * 3 + 0];
-            const FPTYPE tau_y = tau[iat * 3 + 1];
-            const FPTYPE tau_z = tau[iat * 3 + 2];
-            const FPTYPE it_fact = it_facts[iat];
-
-            FPTYPE local_force[3] = {0.0, 0.0, 0.0};
-
-            for (int ig = 0; ig < npw; ig++)
-            {
-                // Skip G=0 term
-                if (ig == ig_gge0) continue;
-
-                const FPTYPE arg = TWO_PI * (gcar[ig * 3 + 0] * tau_x + 
-                                             gcar[ig * 3 + 1] * tau_y + 
-                                             gcar[ig * 3 + 2] * tau_z);
-                FPTYPE sinp, cosp;
-                ModuleBase::libm::sincos(arg, &sinp, &cosp);
-                
-                const FPTYPE sumnb = -cosp * aux[ig].imag() + sinp * aux[ig].real();
-                
-                local_force[0] += gcar[ig * 3 + 0] * sumnb;
-                local_force[1] += gcar[ig * 3 + 1] * sumnb;
-                local_force[2] += gcar[ig * 3 + 2] * sumnb;
-            }
-
-            force[iat * 3 + 0] += local_force[0] * it_fact;
-            force[iat * 3 + 1] += local_force[1] * it_fact;
-            force[iat * 3 + 2] += local_force[2] * it_fact;
-        }
-    }
-};
-
-// Template instantiations
-template struct cal_force_loc_sincos_op<float, base_device::DEVICE_CPU>;
-template struct cal_force_loc_sincos_op<double, base_device::DEVICE_CPU>;
-
-template struct cal_force_ew_sincos_op<float, base_device::DEVICE_CPU>;
-template struct cal_force_ew_sincos_op<double, base_device::DEVICE_CPU>;
-
-} // namespace hamilt 
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.h
deleted file mode 100644
index 455b8425a7..0000000000
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.h
+++ /dev/null
@@ -1,106 +0,0 @@
-#ifndef FORCE_SINCOS_OP_H
-#define FORCE_SINCOS_OP_H
-
-#include "module_base/module_device/types.h"
-#include <complex>
-
-namespace hamilt
-{
-
-template <typename FPTYPE, typename Device>
-struct cal_force_loc_sincos_op
-{
-    /// @brief Calculate local pseudopotential forces (sincos loop only)
-    ///
-    /// Input Parameters
-    /// @param ctx - which device this function runs on
-    /// @param nat - number of atoms
-    /// @param npw - number of plane waves
-    /// @param gcar - G-vector Cartesian coordinates [npw * 3]
-    /// @param tau - atomic positions [nat * 3]
-    /// @param iat2it - atom to type mapping [nat]
-    /// @param vloc_factors - precomputed vloc factors [npw]
-    /// @param aux - charge density in G-space [npw]
-    /// @param scale_factor - tpiba * omega
-    ///
-    /// Output Parameters
-    /// @param force - output forces [nat * 3]
-    void operator()(const Device* ctx,
-                    const int& nat,
-                    const int& npw,
-                    const FPTYPE* gcar,
-                    const FPTYPE* tau,
-                    const int* iat2it,
-                    const FPTYPE* vloc_factors,
-                    const std::complex<FPTYPE>* aux,
-                    const FPTYPE& scale_factor,
-                    FPTYPE* force);
-};
-
-template <typename FPTYPE, typename Device>
-struct cal_force_ew_sincos_op
-{
-    /// @brief Calculate Ewald forces (sincos loop only)
-    ///
-    /// Input Parameters
-    /// @param ctx - which device this function runs on
-    /// @param nat - number of atoms
-    /// @param npw - number of plane waves
-    /// @param ig_gge0 - index of G=0 vector (-1 if not present)
-    /// @param gcar - G-vector Cartesian coordinates [npw * 3]
-    /// @param tau - atomic positions [nat * 3]
-    /// @param iat2it - atom to type mapping [nat]
-    /// @param it_facts - precomputed it_fact for each atom [nat]
-    /// @param aux - structure factor related array [npw]
-    ///
-    /// Output Parameters
-    /// @param force - output forces [nat * 3]
-    void operator()(const Device* ctx,
-                    const int& nat,
-                    const int& npw,
-                    const int& ig_gge0,
-                    const FPTYPE* gcar,
-                    const FPTYPE* tau,
-                    const int* iat2it,
-                    const FPTYPE* it_facts,
-                    const std::complex<FPTYPE>* aux,
-                    FPTYPE* force);
-};
-
-#if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
-
-template <typename FPTYPE>
-struct cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_GPU>
-{
-    void operator()(const base_device::DEVICE_GPU* ctx,
-                    const int& nat,
-                    const int& npw,
-                    const FPTYPE* gcar,
-                    const FPTYPE* tau,
-                    const int* iat2it,
-                    const FPTYPE* vloc_factors,
-                    const std::complex<FPTYPE>* aux,
-                    const FPTYPE& scale_factor,
-                    FPTYPE* force);
-};
-
-template <typename FPTYPE>
-struct cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_GPU>
-{
-    void operator()(const base_device::DEVICE_GPU* ctx,
-                    const int& nat,
-                    const int& npw,
-                    const int& ig_gge0,
-                    const FPTYPE* gcar,
-                    const FPTYPE* tau,
-                    const int* iat2it,
-                    const FPTYPE* it_facts,
-                    const std::complex<FPTYPE>* aux,
-                    FPTYPE* force);
-};
-
-#endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
-
-} // namespace hamilt
-
-#endif // FORCE_SINCOS_OP_H 
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu
index c78b333b86..9feeb76fed 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu
@@ -618,10 +618,195 @@ template void revertVkbValues<double>(const int *gcar_zero_ptrs, std::complex<do
 // for saveVkbValues functions instantiation
 template void saveVkbValues<double>(const int *gcar_zero_ptrs, const std::complex<double> *vkb_ptr, std::complex<double> *vkb_save_ptr, int nkb, int gcar_zero_count, int npw, int ipol, int npwx);
 
+// HIP kernels for sincos loops
+template <typename FPTYPE>
+__global__ void cal_force_loc_sincos_kernel(
+    const int nat,
+    const int npw,
+    const int ntype,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const int* iat2it,
+    const FPTYPE* vloc_per_type,
+    const thrust::complex<FPTYPE>* aux,
+    const FPTYPE scale_factor,
+    FPTYPE* force)
+{
+    const FPTYPE TWO_PI = 2.0 * M_PI;
+    
+    const int iat = blockIdx.y;
+    const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
+    const int ig_stride = gridDim.x * blockDim.x;
+    
+    if (iat >= nat) return;
+    
+    // Load atom information to registers
+    const int it = iat2it[iat];
+    const FPTYPE tau_x = tau[iat * 3 + 0];
+    const FPTYPE tau_y = tau[iat * 3 + 1];
+    const FPTYPE tau_z = tau[iat * 3 + 2];
+    
+    // Local accumulation variables
+    FPTYPE local_force_x = 0.0;
+    FPTYPE local_force_y = 0.0;
+    FPTYPE local_force_z = 0.0;
+    
+    // Grid-stride loop over G-vectors
+    for (int ig = ig_start; ig < npw; ig += ig_stride) {
+        // Calculate phase: 2π * (G · τ)
+        const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + 
+                                       gcar[ig * 3 + 1] * tau_y + 
+                                       gcar[ig * 3 + 2] * tau_z);
+        
+        // Use HIP intrinsic for sincos
+        FPTYPE sinp, cosp;
+        sincos(phase, &sinp, &cosp);
+        
+        // Calculate force factor
+        const FPTYPE vloc_factor = vloc_per_type[it * npw + ig];
+        const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real());
+        
+        // Accumulate force contributions
+        local_force_x += gcar[ig * 3 + 0] * factor;
+        local_force_y += gcar[ig * 3 + 1] * factor;
+        local_force_z += gcar[ig * 3 + 2] * factor;
+    }
+    
+    // Atomic add to global memory
+    atomicAdd(&force[iat * 3 + 0], local_force_x * scale_factor);
+    atomicAdd(&force[iat * 3 + 1], local_force_y * scale_factor);
+    atomicAdd(&force[iat * 3 + 2], local_force_z * scale_factor);
+}
+
+template <typename FPTYPE>
+__global__ void cal_force_ew_sincos_kernel(
+    const int nat,
+    const int npw,
+    const int ig_gge0,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const int* iat2it,
+    const FPTYPE* it_facts,
+    const thrust::complex<FPTYPE>* aux,
+    FPTYPE* force)
+{
+    const FPTYPE TWO_PI = 2.0 * M_PI;
+    
+    const int iat = blockIdx.y;
+    const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
+    const int ig_stride = gridDim.x * blockDim.x;
+    
+    if (iat >= nat) return;
+    
+    // Load atom information to registers
+    const FPTYPE tau_x = tau[iat * 3 + 0];
+    const FPTYPE tau_y = tau[iat * 3 + 1];
+    const FPTYPE tau_z = tau[iat * 3 + 2];
+    const FPTYPE it_fact = it_facts[iat];
+    
+    // Local accumulation variables
+    FPTYPE local_force_x = 0.0;
+    FPTYPE local_force_y = 0.0;
+    FPTYPE local_force_z = 0.0;
+    
+    // Grid-stride loop over G-vectors
+    for (int ig = ig_start; ig < npw; ig += ig_stride) {
+        // Skip G=0 term
+        if (ig == ig_gge0) continue;
+        
+        // Calculate phase: 2π * (G · τ)
+        const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + 
+                                       gcar[ig * 3 + 1] * tau_y + 
+                                       gcar[ig * 3 + 2] * tau_z);
+        
+        // Use HIP intrinsic for sincos
+        FPTYPE sinp, cosp;
+        sincos(phase, &sinp, &cosp);
+        
+        // Calculate Ewald sum contribution
+        const FPTYPE factor = it_fact * (cosp * aux[ig].imag() + sinp * aux[ig].real());
+        
+        // Accumulate force contributions
+        local_force_x += gcar[ig * 3 + 0] * factor;
+        local_force_y += gcar[ig * 3 + 1] * factor;
+        local_force_z += gcar[ig * 3 + 2] * factor;
+    }
+    
+    // Atomic add to global memory
+    atomicAdd(&force[iat * 3 + 0], local_force_x);
+    atomicAdd(&force[iat * 3 + 1], local_force_y);
+    atomicAdd(&force[iat * 3 + 2], local_force_z);
+}
+
+// GPU operators
+template <typename FPTYPE>
+void cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& nat,
+    const int& npw,
+    const int& ntype,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const int* iat2it,
+    const FPTYPE* vloc_per_type,
+    const std::complex<FPTYPE>* aux,
+    const FPTYPE& scale_factor,
+    FPTYPE* force)
+{
+    // Calculate optimal grid configuration for GPU load balancing
+    const int threads_per_block = THREADS_PER_BLOCK;
+    const int max_blocks_per_sm = 32; // Configurable per GPU architecture
+    const int max_blocks_x = std::min(max_blocks_per_sm, (npw + threads_per_block - 1) / threads_per_block);
+    
+    dim3 grid(max_blocks_x, nat);
+    dim3 block(threads_per_block);
+    
+    hipLaunchKernelGGL(cal_force_loc_sincos_kernel<FPTYPE>,
+                       grid, block, 0, 0,
+                       nat, npw, ntype, gcar, tau, iat2it, vloc_per_type,
+                       reinterpret_cast<const thrust::complex<FPTYPE>*>(aux),
+                       scale_factor, force);
+    
+    hipCheckOnDebug();
+}
+
+template <typename FPTYPE>
+void cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& nat,
+    const int& npw,
+    const int& ig_gge0,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const int* iat2it,
+    const FPTYPE* it_facts,
+    const std::complex<FPTYPE>* aux,
+    FPTYPE* force)
+{
+    // Calculate optimal grid configuration for GPU load balancing
+    const int threads_per_block = THREADS_PER_BLOCK;
+    const int max_blocks_per_sm = 32; // Configurable per GPU architecture
+    const int max_blocks_x = std::min(max_blocks_per_sm, (npw + threads_per_block - 1) / threads_per_block);
+    
+    dim3 grid(max_blocks_x, nat);
+    dim3 block(threads_per_block);
+    
+    hipLaunchKernelGGL(cal_force_ew_sincos_kernel<FPTYPE>,
+                       grid, block, 0, 0,
+                       nat, npw, ig_gge0, gcar, tau, iat2it, it_facts,
+                       reinterpret_cast<const thrust::complex<FPTYPE>*>(aux), force);
+    
+    hipCheckOnDebug();
+}
+
 template struct cal_vkb1_nl_op<float, base_device::DEVICE_GPU>;
 template struct cal_force_nl_op<float, base_device::DEVICE_GPU>;
+template struct cal_force_loc_sincos_op<float, base_device::DEVICE_GPU>;
+template struct cal_force_ew_sincos_op<float, base_device::DEVICE_GPU>;
 
 template struct cal_vkb1_nl_op<double, base_device::DEVICE_GPU>;
 template struct cal_force_nl_op<double, base_device::DEVICE_GPU>;
+template struct cal_force_loc_sincos_op<double, base_device::DEVICE_GPU>;
+template struct cal_force_ew_sincos_op<double, base_device::DEVICE_GPU>;
 
 }  // namespace hamilt
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_sincos_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_sincos_op.hip.cu
deleted file mode 100644
index a09cd28bed..0000000000
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_sincos_op.hip.cu
+++ /dev/null
@@ -1,203 +0,0 @@
-#include "module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.h"
-
-#include <complex>
-#include <thrust/complex.h>
-#include <hip/hip_runtime.h>
-#include <base/macros/macros.h>
-
-#define THREADS_PER_BLOCK 256
-
-namespace hamilt {
-
-// HIP kernels for sincos loops
-template <typename FPTYPE>
-__global__ void cal_force_loc_sincos_kernel(
-    const int nat,
-    const int npw,
-    const FPTYPE* gcar,
-    const FPTYPE* tau,
-    const int* iat2it,
-    const FPTYPE* vloc_factors,
-    const thrust::complex<FPTYPE>* aux,
-    const FPTYPE scale_factor,
-    FPTYPE* force)
-{
-    const FPTYPE TWO_PI = 2.0 * M_PI;
-    
-    const int iat = blockIdx.y;
-    const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
-    const int ig_stride = gridDim.x * blockDim.x;
-    
-    if (iat >= nat) return;
-    
-    // Load atom information to registers
-    const int it = iat2it[iat];
-    const FPTYPE tau_x = tau[iat * 3 + 0];
-    const FPTYPE tau_y = tau[iat * 3 + 1];
-    const FPTYPE tau_z = tau[iat * 3 + 2];
-    
-    // Local accumulation variables
-    FPTYPE local_force_x = 0.0;
-    FPTYPE local_force_y = 0.0;
-    FPTYPE local_force_z = 0.0;
-    
-    // Grid-stride loop over G-vectors
-    for (int ig = ig_start; ig < npw; ig += ig_stride) {
-        // Calculate phase: 2π * (G · τ)
-        const FPTYPE phase = TWO_PI * (
-            gcar[ig * 3 + 0] * tau_x + 
-            gcar[ig * 3 + 1] * tau_y + 
-            gcar[ig * 3 + 2] * tau_z
-        );
-        
-        // Use HIP intrinsic for sincos
-        FPTYPE sinp, cosp;
-        __sincos(phase, &sinp, &cosp);
-        
-        // Calculate force factor
-        const FPTYPE factor = vloc_factors[ig] * 
-                              (cosp * aux[ig].imag() + sinp * aux[ig].real());
-        
-        // Accumulate force contributions
-        local_force_x += gcar[ig * 3 + 0] * factor;
-        local_force_y += gcar[ig * 3 + 1] * factor;
-        local_force_z += gcar[ig * 3 + 2] * factor;
-    }
-    
-    // Atomic add to global memory
-    atomicAdd(&force[iat * 3 + 0], local_force_x * scale_factor);
-    atomicAdd(&force[iat * 3 + 1], local_force_y * scale_factor);
-    atomicAdd(&force[iat * 3 + 2], local_force_z * scale_factor);
-}
-
-template <typename FPTYPE>
-__global__ void cal_force_ew_sincos_kernel(
-    const int nat,
-    const int npw,
-    const int ig_gge0,
-    const FPTYPE* gcar,
-    const FPTYPE* tau,
-    const int* iat2it,
-    const FPTYPE* it_facts,
-    const thrust::complex<FPTYPE>* aux,
-    FPTYPE* force)
-{
-    const FPTYPE TWO_PI = 2.0 * M_PI;
-    
-    const int iat = blockIdx.y;
-    const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
-    const int ig_stride = gridDim.x * blockDim.x;
-    
-    if (iat >= nat) return;
-    
-    // Load atom information to registers
-    const int it = iat2it[iat];
-    const FPTYPE tau_x = tau[iat * 3 + 0];
-    const FPTYPE tau_y = tau[iat * 3 + 1];
-    const FPTYPE tau_z = tau[iat * 3 + 2];
-    const FPTYPE it_fact = it_facts[iat];
-    
-    // Local accumulation variables
-    FPTYPE local_force_x = 0.0;
-    FPTYPE local_force_y = 0.0;
-    FPTYPE local_force_z = 0.0;
-    
-    // Grid-stride loop over G-vectors
-    for (int ig = ig_start; ig < npw; ig += ig_stride) {
-        // Skip G=0 term
-        if (ig == ig_gge0) continue;
-        
-        // Calculate phase: 2π * (G · τ)
-        const FPTYPE arg = TWO_PI * (
-            gcar[ig * 3 + 0] * tau_x + 
-            gcar[ig * 3 + 1] * tau_y + 
-            gcar[ig * 3 + 2] * tau_z
-        );
-        
-        // Use HIP intrinsic for sincos
-        FPTYPE sinp, cosp;
-        __sincos(arg, &sinp, &cosp);
-        
-        // Calculate Ewald sum contribution
-        const FPTYPE sumnb = -cosp * aux[ig].imag() + sinp * aux[ig].real();
-        
-        // Accumulate force contributions
-        local_force_x += gcar[ig * 3 + 0] * sumnb;
-        local_force_y += gcar[ig * 3 + 1] * sumnb;
-        local_force_z += gcar[ig * 3 + 2] * sumnb;
-    }
-    
-    // Atomic add to global memory
-    atomicAdd(&force[iat * 3 + 0], local_force_x * it_fact);
-    atomicAdd(&force[iat * 3 + 1], local_force_y * it_fact);
-    atomicAdd(&force[iat * 3 + 2], local_force_z * it_fact);
-}
-
-// GPU operator implementations
-template <typename FPTYPE>
-void cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
-    const base_device::DEVICE_GPU* ctx,
-    const int& nat,
-    const int& npw,
-    const FPTYPE* gcar,
-    const FPTYPE* tau,
-    const int* iat2it,
-    const FPTYPE* vloc_factors,
-    const std::complex<FPTYPE>* aux,
-    const FPTYPE& scale_factor,
-    FPTYPE* force)
-{
-    // Calculate optimal grid configuration
-    const int threads_per_block = THREADS_PER_BLOCK;
-    const int max_blocks_x = min(32, (npw + threads_per_block - 1) / threads_per_block);
-    
-    dim3 grid(max_blocks_x, nat);
-    dim3 block(threads_per_block);
-    
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_force_loc_sincos_kernel<FPTYPE>), grid, block, 0, 0,
-        nat, npw,
-        gcar, tau, iat2it, vloc_factors,
-        reinterpret_cast<const thrust::complex<FPTYPE>*>(aux),
-        scale_factor, force
-    );
-    
-    hipCheckOnDebug();
-}
-
-template <typename FPTYPE>
-void cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
-    const base_device::DEVICE_GPU* ctx,
-    const int& nat,
-    const int& npw,
-    const int& ig_gge0,
-    const FPTYPE* gcar,
-    const FPTYPE* tau,
-    const int* iat2it,
-    const FPTYPE* it_facts,
-    const std::complex<FPTYPE>* aux,
-    FPTYPE* force)
-{
-    // Calculate optimal grid configuration
-    const int threads_per_block = THREADS_PER_BLOCK;
-    const int max_blocks_x = min(32, (npw + threads_per_block - 1) / threads_per_block);
-    
-    dim3 grid(max_blocks_x, nat);
-    dim3 block(threads_per_block);
-    
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_force_ew_sincos_kernel<FPTYPE>), grid, block, 0, 0,
-        nat, npw, ig_gge0,
-        gcar, tau, iat2it, it_facts,
-        reinterpret_cast<const thrust::complex<FPTYPE>*>(aux),
-        force
-    );
-    
-    hipCheckOnDebug();
-}
-
-template struct cal_force_loc_sincos_op<float, base_device::DEVICE_GPU>;
-template struct cal_force_loc_sincos_op<double, base_device::DEVICE_GPU>;
-
-template struct cal_force_ew_sincos_op<float, base_device::DEVICE_GPU>;
-template struct cal_force_ew_sincos_op<double, base_device::DEVICE_GPU>;
-
-}  // namespace hamilt 
\ No newline at end of file

From 465d991b7c2b94c385597751815255e32c536502 Mon Sep 17 00:00:00 2001
From: Jie <li_j20@fudan.edu.cn>
Date: Tue, 3 Jun 2025 11:26:23 +0800
Subject: [PATCH 5/6] fix vloc computation in cal_force_loc_sincos_op

---
 source/module_hamilt_pw/hamilt_pwdft/forces.cpp   | 15 ++++-----------
 .../hamilt_pwdft/kernels/cuda/force_op.cu         |  7 ++-----
 .../hamilt_pwdft/kernels/force_op.cpp             |  4 +---
 .../hamilt_pwdft/kernels/force_op.h               |  5 +----
 .../hamilt_pwdft/kernels/rocm/force_op.hip.cu     |  7 ++-----
 5 files changed, 10 insertions(+), 28 deletions(-)

diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
index 5e1d3cd6a2..d50288eef8 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
@@ -537,7 +537,6 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
     
     // 准备原子相关数据：按照全局原子索引iat顺序
     std::vector<FPTYPE> tau_flat(this->nat * 3);
-    std::vector<int> iat2it_host(this->nat);
     std::vector<FPTYPE> gcar_flat(rho_basis->npw * 3);
     
     // 按照原始代码逻辑：遍历全局原子索引iat，通过查表获取(it,ia)
@@ -548,7 +547,6 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
         tau_flat[iat * 3 + 0] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][0]);
         tau_flat[iat * 3 + 1] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][1]);
         tau_flat[iat * 3 + 2] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][2]);
-        iat2it_host[iat] = it;
     }
     
     for (int ig = 0; ig < rho_basis->npw; ig++) {
@@ -559,9 +557,10 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
     
     // 重新计算vloc_factors考虑所有原子类型
     std::vector<FPTYPE> vloc_per_type_host(ucell.ntype * rho_basis->npw);
-    for (int it = 0; it < ucell.ntype; it++) {
+    for (int iat = 0; iat < this->nat; iat++) {
+        int it = ucell.iat2it[iat];
         for (int ig = 0; ig < rho_basis->npw; ig++) {
-            vloc_per_type_host[it * rho_basis->npw + ig] = static_cast<FPTYPE>(vloc(it, rho_basis->ig2igg[ig]));
+            vloc_per_type_host[iat * rho_basis->npw + ig] = static_cast<FPTYPE>(vloc(it, rho_basis->ig2igg[ig]));
         }
     }
     
@@ -574,7 +573,6 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
     // 设备端内存和指针设置（根据设备类型分支）
     FPTYPE* d_gcar = gcar_flat.data();
     FPTYPE* d_tau = tau_flat.data();
-    int* d_iat2it = iat2it_host.data();
     FPTYPE* d_vloc_per_type = vloc_per_type_host.data();
     std::complex<FPTYPE>* d_aux = aux_fptype.data();
     FPTYPE* d_force = nullptr;
@@ -584,13 +582,11 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
     {
         d_gcar = nullptr;
         d_tau = nullptr;
-        d_iat2it = nullptr;
         d_vloc_per_type = nullptr;
         d_aux = nullptr;
         
         resmem_var_op()(this->ctx, d_gcar, rho_basis->npw * 3);
         resmem_var_op()(this->ctx, d_tau, this->nat * 3);
-        resmem_int_op()(this->ctx, d_iat2it, this->nat);
         resmem_var_op()(this->ctx, d_vloc_per_type, ucell.ntype * rho_basis->npw);
         resmem_complex_op()(this->ctx, d_aux, rho_basis->npw);
         resmem_var_op()(this->ctx, d_force, this->nat * 3);
@@ -598,7 +594,6 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
         // 数据传输到设备
         syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_gcar, gcar_flat.data(), rho_basis->npw * 3);
         syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_tau, tau_flat.data(), this->nat * 3);
-        syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, d_iat2it, iat2it_host.data(), this->nat);
         syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_vloc_per_type, vloc_per_type_host.data(), ucell.ntype * rho_basis->npw);
         syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_aux, aux_fptype.data(), rho_basis->npw);
         
@@ -623,7 +618,6 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
         ucell.ntype,
         d_gcar,
         d_tau,
-        d_iat2it,
         d_vloc_per_type,
         d_aux,
         scale_factor,
@@ -639,7 +633,6 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
         // 清理设备内存
         delmem_var_op()(this->ctx, d_gcar);
         delmem_var_op()(this->ctx, d_tau);
-        delmem_int_op()(this->ctx, d_iat2it);
         delmem_var_op()(this->ctx, d_vloc_per_type);
         delmem_complex_op()(this->ctx, d_aux);
         delmem_var_op()(this->ctx, d_force);
@@ -652,7 +645,7 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
         forcelc(iat, 2) = static_cast<double>(force_host[iat * 3 + 2]);
     }
 
-    // this->print(GlobalV::ofs_running, "local forces", forcelc);
+    // this->print(GlobalV: :ofs_running, "local forces", forcelc);
     Parallel_Reduce::reduce_pool(forcelc.c, forcelc.nr * forcelc.nc);
     delete[] aux;
     ModuleBase::timer::tick("Forces", "cal_force_loc");
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
index 077864caff..a82bd1ee0b 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
@@ -21,7 +21,6 @@ __global__ void cal_force_loc_sincos_kernel(
     const int ntype,
     const FPTYPE* gcar,
     const FPTYPE* tau,
-    const int* iat2it,
     const FPTYPE* vloc_per_type,
     const thrust::complex<FPTYPE>* aux,
     const FPTYPE scale_factor,
@@ -36,7 +35,6 @@ __global__ void cal_force_loc_sincos_kernel(
     if (iat >= nat) return;
     
     // Load atom information to registers
-    const int it = iat2it[iat];
     const FPTYPE tau_x = tau[iat * 3 + 0];
     const FPTYPE tau_y = tau[iat * 3 + 1];
     const FPTYPE tau_z = tau[iat * 3 + 2];
@@ -58,7 +56,7 @@ __global__ void cal_force_loc_sincos_kernel(
         sincos(phase, &sinp, &cosp);
         
         // Calculate force factor
-        const FPTYPE vloc_factor = vloc_per_type[it * npw + ig];
+        const FPTYPE vloc_factor = vloc_per_type[iat * npw + ig];
         const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real());
         
         // Accumulate force contributions
@@ -316,7 +314,6 @@ void cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
     const int& ntype,
     const FPTYPE* gcar,
     const FPTYPE* tau,
-    const int* iat2it,
     const FPTYPE* vloc_per_type,
     const std::complex<FPTYPE>* aux,
     const FPTYPE& scale_factor,
@@ -331,7 +328,7 @@ void cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
     dim3 block(threads_per_block);
     
     cal_force_loc_sincos_kernel<FPTYPE><<<grid, block>>>(
-        nat, npw, ntype, gcar, tau, iat2it, vloc_per_type,
+        nat, npw, ntype, gcar, tau, vloc_per_type,
         reinterpret_cast<const thrust::complex<FPTYPE>*>(aux),
         scale_factor, force
     );
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
index e0dc91a70c..49576abbfa 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
@@ -438,7 +438,6 @@ struct cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_CPU>
                     const int& ntype,
                     const FPTYPE* gcar,
                     const FPTYPE* tau,
-                    const int* iat2it,
                     const FPTYPE* vloc_per_type,
                     const std::complex<FPTYPE>* aux,
                     const FPTYPE& scale_factor,
@@ -451,7 +450,6 @@ struct cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_CPU>
 #endif
         for (int iat = 0; iat < nat; ++iat)
         {
-            const int it = iat2it[iat];
             const FPTYPE tau_x = tau[iat * 3 + 0];
             const FPTYPE tau_y = tau[iat * 3 + 1];
             const FPTYPE tau_z = tau[iat * 3 + 2];
@@ -466,7 +464,7 @@ struct cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_CPU>
                 FPTYPE sinp, cosp;
                 ModuleBase::libm::sincos(phase, &sinp, &cosp);
                 
-                const FPTYPE vloc_factor = vloc_per_type[it * npw + ig];
+                const FPTYPE vloc_factor = vloc_per_type[iat * npw + ig];
                 const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real()) * scale_factor;
                 
                 local_force[0] += gcar[ig * 3 + 0] * factor;
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
index 3fc5d92212..58fcd842ef 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
@@ -161,8 +161,7 @@ struct cal_force_loc_sincos_op
     /// @param ntype - number of atom types
     /// @param gcar - G-vector Cartesian coordinates [npw * 3]
     /// @param tau - atomic positions [nat * 3]
-    /// @param iat2it - atom to type mapping [nat]
-    /// @param vloc_per_type - precomputed vloc factors per type [ntype * npw]
+    /// @param vloc_per_type - precomputed vloc factors per atom [nat * npw]
     /// @param aux - charge density in G-space [npw]
     /// @param scale_factor - tpiba * omega
     ///
@@ -174,7 +173,6 @@ struct cal_force_loc_sincos_op
                     const int& ntype,
                     const FPTYPE* gcar,
                     const FPTYPE* tau,
-                    const int* iat2it,
                     const FPTYPE* vloc_per_type,
                     const std::complex<FPTYPE>* aux,
                     const FPTYPE& scale_factor,
@@ -319,7 +317,6 @@ struct cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_GPU>
                     const int& ntype,
                     const FPTYPE* gcar,
                     const FPTYPE* tau,
-                    const int* iat2it,
                     const FPTYPE* vloc_per_type,
                     const std::complex<FPTYPE>* aux,
                     const FPTYPE& scale_factor,
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu
index 9feeb76fed..7f0d4b099e 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu
@@ -626,7 +626,6 @@ __global__ void cal_force_loc_sincos_kernel(
     const int ntype,
     const FPTYPE* gcar,
     const FPTYPE* tau,
-    const int* iat2it,
     const FPTYPE* vloc_per_type,
     const thrust::complex<FPTYPE>* aux,
     const FPTYPE scale_factor,
@@ -641,7 +640,6 @@ __global__ void cal_force_loc_sincos_kernel(
     if (iat >= nat) return;
     
     // Load atom information to registers
-    const int it = iat2it[iat];
     const FPTYPE tau_x = tau[iat * 3 + 0];
     const FPTYPE tau_y = tau[iat * 3 + 1];
     const FPTYPE tau_z = tau[iat * 3 + 2];
@@ -663,7 +661,7 @@ __global__ void cal_force_loc_sincos_kernel(
         sincos(phase, &sinp, &cosp);
         
         // Calculate force factor
-        const FPTYPE vloc_factor = vloc_per_type[it * npw + ig];
+        const FPTYPE vloc_factor = vloc_per_type[iat * npw + ig];
         const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real());
         
         // Accumulate force contributions
@@ -747,7 +745,6 @@ void cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
     const int& ntype,
     const FPTYPE* gcar,
     const FPTYPE* tau,
-    const int* iat2it,
     const FPTYPE* vloc_per_type,
     const std::complex<FPTYPE>* aux,
     const FPTYPE& scale_factor,
@@ -763,7 +760,7 @@ void cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
     
     hipLaunchKernelGGL(cal_force_loc_sincos_kernel<FPTYPE>,
                        grid, block, 0, 0,
-                       nat, npw, ntype, gcar, tau, iat2it, vloc_per_type,
+                       nat, npw, ntype, gcar, tau, vloc_per_type,
                        reinterpret_cast<const thrust::complex<FPTYPE>*>(aux),
                        scale_factor, force);
     

From a2491171233b14d4db555f88b0f75f932ddc3b5e Mon Sep 17 00:00:00 2001
From: Jie <li_j20@fudan.edu.cn>
Date: Tue, 3 Jun 2025 20:35:20 +0800
Subject: [PATCH 6/6] fix cal_force_ew

---
 .../module_hamilt_pw/hamilt_pwdft/forces.cpp  | 67 ++++++-------------
 .../hamilt_pwdft/kernels/cuda/force_op.cu     |  8 +--
 .../hamilt_pwdft/kernels/force_op.cpp         |  3 +-
 .../hamilt_pwdft/kernels/force_op.h           |  3 -
 .../hamilt_pwdft/kernels/rocm/force_op.hip.cu |  8 +--
 5 files changed, 29 insertions(+), 60 deletions(-)

diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
index d50288eef8..4cade5d1ac 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
@@ -532,17 +532,17 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
     // to G space. maybe need fftw with OpenMP
     rho_basis->real2recip(aux, aux);
 
-    // =============== GPU/CPU异构优化：使用sincos op替换原循环 ===============
+    // sincos op for G space
     
     
-    // 准备原子相关数据：按照全局原子索引iat顺序
+    // data preparation
     std::vector<FPTYPE> tau_flat(this->nat * 3);
     std::vector<FPTYPE> gcar_flat(rho_basis->npw * 3);
     
-    // 按照原始代码逻辑：遍历全局原子索引iat，通过查表获取(it,ia)
+    
     for (int iat = 0; iat < this->nat; iat++) {
-        int it = ucell.iat2it[iat];  // 查表获取原子类型
-        int ia = ucell.iat2ia[iat];  // 查表获取该类型内的原子索引
+        int it = ucell.iat2it[iat];  
+        int ia = ucell.iat2ia[iat];  
         
         tau_flat[iat * 3 + 0] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][0]);
         tau_flat[iat * 3 + 1] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][1]);
@@ -555,7 +555,7 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
         gcar_flat[ig * 3 + 2] = static_cast<FPTYPE>(rho_basis->gcar[ig][2]);
     }
     
-    // 重新计算vloc_factors考虑所有原子类型
+    // calculate vloc_factors for all atom types
     std::vector<FPTYPE> vloc_per_type_host(ucell.ntype * rho_basis->npw);
     for (int iat = 0; iat < this->nat; iat++) {
         int it = ucell.iat2it[iat];
@@ -564,13 +564,11 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
         }
     }
     
-    // 转换aux到FPTYPE类型
     std::vector<std::complex<FPTYPE>> aux_fptype(rho_basis->npw);
     for (int ig = 0; ig < rho_basis->npw; ig++) {
         aux_fptype[ig] = static_cast<std::complex<FPTYPE>>(aux[ig]);
     }
     
-    // 设备端内存和指针设置（根据设备类型分支）
     FPTYPE* d_gcar = gcar_flat.data();
     FPTYPE* d_tau = tau_flat.data();
     FPTYPE* d_vloc_per_type = vloc_per_type_host.data();
@@ -591,26 +589,22 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
         resmem_complex_op()(this->ctx, d_aux, rho_basis->npw);
         resmem_var_op()(this->ctx, d_force, this->nat * 3);
         
-        // 数据传输到设备
         syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_gcar, gcar_flat.data(), rho_basis->npw * 3);
         syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_tau, tau_flat.data(), this->nat * 3);
         syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_vloc_per_type, vloc_per_type_host.data(), ucell.ntype * rho_basis->npw);
         syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_aux, aux_fptype.data(), rho_basis->npw);
         
-        // 初始化force为0
         base_device::memory::set_memory_op<FPTYPE, Device>()(this->ctx, d_force, 0.0, this->nat * 3);
     }
     else
     {
         d_force = force_host.data();
-        // 对于CPU，直接初始化为0
         std::fill(force_host.begin(), force_host.end(), static_cast<FPTYPE>(0.0));
     }
     
-    // 计算缩放因子
     const FPTYPE scale_factor = static_cast<FPTYPE>(ucell.tpiba * ucell.omega);
     
-    // 调用op进行sincos计算
+    // call op for sincos calculation
     hamilt::cal_force_loc_sincos_op<FPTYPE, Device>()(
         this->ctx,
         this->nat,
@@ -624,13 +618,10 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
         d_force
     );
     
-    // 根据设备类型处理结果
     if (this->device == base_device::GpuDevice)
     {
-        // 结果传回CPU
         syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, force_host.data(), d_force, this->nat * 3);
         
-        // 清理设备内存
         delmem_var_op()(this->ctx, d_gcar);
         delmem_var_op()(this->ctx, d_tau);
         delmem_var_op()(this->ctx, d_vloc_per_type);
@@ -638,7 +629,6 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
         delmem_var_op()(this->ctx, d_force);
     }
     
-    // 将结果写入forcelc矩阵
     for (int iat = 0; iat < this->nat; iat++) {
         forcelc(iat, 0) = static_cast<double>(force_host[iat * 3 + 0]);
         forcelc(iat, 1) = static_cast<double>(force_host[iat * 3 + 1]);
@@ -755,17 +745,15 @@ void Forces<FPTYPE, Device>::cal_force_ew(const UnitCell& ucell,
         aux[rho_basis->ig_gge0] = std::complex<double>(0.0, 0.0);
     }
 
-    // =============== 第一步：G空间sincos计算（在OpenMP区域外）===============
+    // sincos op for cal_force_ew
     
-    // 预计算每个原子的it_fact和相关数据：按照全局原子索引iat顺序
     std::vector<FPTYPE> it_facts_host(this->nat);
     std::vector<FPTYPE> tau_flat(this->nat * 3);
-    std::vector<int> iat2it_host(this->nat);
     
-    // 按照原始代码逻辑：遍历全局原子索引iat，通过查表获取(it,ia)
+    // iterate over by lookup table
     for (int iat = 0; iat < this->nat; iat++) {
-        int it = ucell.iat2it[iat];  // 查表获取原子类型
-        int ia = ucell.iat2ia[iat];  // 查表获取该类型内的原子索引
+        int it = ucell.iat2it[iat];  
+        int ia = ucell.iat2ia[iat];  
         
         double zv;
         if (PARAM.inp.use_paw)
@@ -785,10 +773,8 @@ void Forces<FPTYPE, Device>::cal_force_ew(const UnitCell& ucell,
         tau_flat[iat * 3 + 0] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][0]);
         tau_flat[iat * 3 + 1] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][1]); 
         tau_flat[iat * 3 + 2] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][2]);
-        iat2it_host[iat] = it;
     }
     
-    // 准备设备端数据
     std::vector<FPTYPE> gcar_flat(rho_basis->npw * 3);
     for (int ig = 0; ig < rho_basis->npw; ig++) {
         gcar_flat[ig * 3 + 0] = static_cast<FPTYPE>(rho_basis->gcar[ig][0]);
@@ -796,16 +782,13 @@ void Forces<FPTYPE, Device>::cal_force_ew(const UnitCell& ucell,
         gcar_flat[ig * 3 + 2] = static_cast<FPTYPE>(rho_basis->gcar[ig][2]);
     }
     
-    // 转换aux到FPTYPE类型
     std::vector<std::complex<FPTYPE>> aux_fptype(rho_basis->npw);
     for (int ig = 0; ig < rho_basis->npw; ig++) {
         aux_fptype[ig] = static_cast<std::complex<FPTYPE>>(aux[ig]);
     }
     
-    // 设备端内存和指针设置（根据设备类型分支）
     FPTYPE* d_gcar = gcar_flat.data();
     FPTYPE* d_tau = tau_flat.data();
-    int* d_iat2it = iat2it_host.data();
     FPTYPE* d_it_facts = it_facts_host.data();
     std::complex<FPTYPE>* d_aux = aux_fptype.data();
     FPTYPE* d_force_g = nullptr;
@@ -815,72 +798,66 @@ void Forces<FPTYPE, Device>::cal_force_ew(const UnitCell& ucell,
     {
         d_gcar = nullptr;
         d_tau = nullptr;
-        d_iat2it = nullptr;
         d_it_facts = nullptr;
         d_aux = nullptr;
         
         resmem_var_op()(this->ctx, d_gcar, rho_basis->npw * 3);
         resmem_var_op()(this->ctx, d_tau, this->nat * 3);
-        resmem_int_op()(this->ctx, d_iat2it, this->nat);
         resmem_var_op()(this->ctx, d_it_facts, this->nat);
         resmem_complex_op()(this->ctx, d_aux, rho_basis->npw);
         resmem_var_op()(this->ctx, d_force_g, this->nat * 3);
         
-        // 数据传输
+        
         syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_gcar, gcar_flat.data(), rho_basis->npw * 3);
         syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_tau, tau_flat.data(), this->nat * 3);
-        syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, d_iat2it, iat2it_host.data(), this->nat);
         syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_it_facts, it_facts_host.data(), this->nat);
         syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_aux, aux_fptype.data(), rho_basis->npw);
         
-        // 初始化force为0
+     
         base_device::memory::set_memory_op<FPTYPE, Device>()(this->ctx, d_force_g, 0.0, this->nat * 3);
     }
     else
     {
         d_force_g = force_g_host.data();
-        // 对于CPU，直接初始化为0
         std::fill(force_g_host.begin(), force_g_host.end(), static_cast<FPTYPE>(0.0));
     }
     
-    // 调用op处理G空间sincos计算（在OpenMP区域外，无冲突）
+    // call op for sincos calculation
     hamilt::cal_force_ew_sincos_op<FPTYPE, Device>()(
         this->ctx,
         this->nat,
         rho_basis->npw,
-        rho_basis->ig_gge0,  // G=0项索引，op内部会自动跳过
+        rho_basis->ig_gge0,  
         d_gcar,
         d_tau,
-        d_iat2it,
         d_it_facts,
         d_aux,
         d_force_g
     );
     
-    // 根据设备类型处理结果
+   
     if (this->device == base_device::GpuDevice)
     {
-        // 将G空间结果传回CPU
+        
         syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, force_g_host.data(), d_force_g, this->nat * 3);
         
-        // 清理设备内存
+        
         delmem_var_op()(this->ctx, d_gcar);
         delmem_var_op()(this->ctx, d_tau);
-        delmem_int_op()(this->ctx, d_iat2it);
         delmem_var_op()(this->ctx, d_it_facts);
         delmem_complex_op()(this->ctx, d_aux);
         delmem_var_op()(this->ctx, d_force_g);
     }
     
-    // 累加到forceion
+
     for (int iat = 0; iat < this->nat; iat++) {
         forceion(iat, 0) += static_cast<double>(force_g_host[iat * 3 + 0]);
         forceion(iat, 1) += static_cast<double>(force_g_host[iat * 3 + 1]); 
         forceion(iat, 2) += static_cast<double>(force_g_host[iat * 3 + 2]);
     }
 
-    // =============== 第二步：实空间计算（保留原OpenMP结构）===============
-
+   
+// calculate real space force
 #ifdef _OPENMP
 #pragma omp parallel
     {
@@ -904,7 +881,7 @@ void Forces<FPTYPE, Device>::cal_force_ew(const UnitCell& ucell,
         iat_end = iat_beg + iat_end;
         ucell.iat2iait(iat_beg, &ia_beg, &it_beg);
 
-        // 只保留实空间相互作用计算（means that the processor contains G=0 term）
+        
         if (rho_basis->ig_gge0 >= 0)
         {
             double rmax = 5.0 / (sqrt(alpha) * ucell.lat0);
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
index a82bd1ee0b..5ff3ddf2b1 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
@@ -78,7 +78,6 @@ __global__ void cal_force_ew_sincos_kernel(
     const int ig_gge0,
     const FPTYPE* gcar,
     const FPTYPE* tau,
-    const int* iat2it,
     const FPTYPE* it_facts,
     const thrust::complex<FPTYPE>* aux,
     FPTYPE* force)
@@ -116,8 +115,8 @@ __global__ void cal_force_ew_sincos_kernel(
         FPTYPE sinp, cosp;
         sincos(phase, &sinp, &cosp);
         
-        // Calculate Ewald sum contribution
-        const FPTYPE factor = it_fact * (cosp * aux[ig].imag() + sinp * aux[ig].real());
+        // Calculate Ewald sum contribution (fixed sign error)
+        const FPTYPE factor = it_fact * (-cosp * aux[ig].imag() + sinp * aux[ig].real());
         
         // Accumulate force contributions
         local_force_x += gcar[ig * 3 + 0] * factor;
@@ -344,7 +343,6 @@ void cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
     const int& ig_gge0,
     const FPTYPE* gcar,
     const FPTYPE* tau,
-    const int* iat2it,
     const FPTYPE* it_facts,
     const std::complex<FPTYPE>* aux,
     FPTYPE* force)
@@ -358,7 +356,7 @@ void cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
     dim3 block(threads_per_block);
     
     cal_force_ew_sincos_kernel<FPTYPE><<<grid, block>>>(
-        nat, npw, ig_gge0, gcar, tau, iat2it, it_facts,
+        nat, npw, ig_gge0, gcar, tau, it_facts,
         reinterpret_cast<const thrust::complex<FPTYPE>*>(aux), force
     );
     
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
index 49576abbfa..109321d6c3 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
@@ -489,7 +489,6 @@ struct cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_CPU>
                     const int& ig_gge0,
                     const FPTYPE* gcar,
                     const FPTYPE* tau,
-                    const int* iat2it,
                     const FPTYPE* it_facts,
                     const std::complex<FPTYPE>* aux,
                     FPTYPE* force)
@@ -519,7 +518,7 @@ struct cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_CPU>
                 FPTYPE sinp, cosp;
                 ModuleBase::libm::sincos(phase, &sinp, &cosp);
                 
-                const FPTYPE factor = it_fact * (cosp * aux[ig].imag() + sinp * aux[ig].real());
+                const FPTYPE factor = it_fact * (-cosp * aux[ig].imag() + sinp * aux[ig].real());
                 
                 local_force[0] += gcar[ig * 3 + 0] * factor;
                 local_force[1] += gcar[ig * 3 + 1] * factor;
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
index 58fcd842ef..acf490b278 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
@@ -191,7 +191,6 @@ struct cal_force_ew_sincos_op
     /// @param ig_gge0 - index of G=0 vector (-1 if not present)
     /// @param gcar - G-vector Cartesian coordinates [npw * 3]
     /// @param tau - atomic positions [nat * 3]
-    /// @param iat2it - atom to type mapping [nat]
     /// @param it_facts - precomputed it_fact for each atom [nat]
     /// @param aux - structure factor related array [npw]
     ///
@@ -203,7 +202,6 @@ struct cal_force_ew_sincos_op
                     const int& ig_gge0,
                     const FPTYPE* gcar,
                     const FPTYPE* tau,
-                    const int* iat2it,
                     const FPTYPE* it_facts,
                     const std::complex<FPTYPE>* aux,
                     FPTYPE* force);
@@ -332,7 +330,6 @@ struct cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_GPU>
                     const int& ig_gge0,
                     const FPTYPE* gcar,
                     const FPTYPE* tau,
-                    const int* iat2it,
                     const FPTYPE* it_facts,
                     const std::complex<FPTYPE>* aux,
                     FPTYPE* force);
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu
index 7f0d4b099e..6bb3a84e7e 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu
@@ -683,7 +683,6 @@ __global__ void cal_force_ew_sincos_kernel(
     const int ig_gge0,
     const FPTYPE* gcar,
     const FPTYPE* tau,
-    const int* iat2it,
     const FPTYPE* it_facts,
     const thrust::complex<FPTYPE>* aux,
     FPTYPE* force)
@@ -721,8 +720,8 @@ __global__ void cal_force_ew_sincos_kernel(
         FPTYPE sinp, cosp;
         sincos(phase, &sinp, &cosp);
         
-        // Calculate Ewald sum contribution
-        const FPTYPE factor = it_fact * (cosp * aux[ig].imag() + sinp * aux[ig].real());
+        // Calculate Ewald sum contribution (fixed sign error)
+        const FPTYPE factor = it_fact * (-cosp * aux[ig].imag() + sinp * aux[ig].real());
         
         // Accumulate force contributions
         local_force_x += gcar[ig * 3 + 0] * factor;
@@ -775,7 +774,6 @@ void cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
     const int& ig_gge0,
     const FPTYPE* gcar,
     const FPTYPE* tau,
-    const int* iat2it,
     const FPTYPE* it_facts,
     const std::complex<FPTYPE>* aux,
     FPTYPE* force)
@@ -790,7 +788,7 @@ void cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
     
     hipLaunchKernelGGL(cal_force_ew_sincos_kernel<FPTYPE>,
                        grid, block, 0, 0,
-                       nat, npw, ig_gge0, gcar, tau, iat2it, it_facts,
+                       nat, npw, ig_gge0, gcar, tau, it_facts,
                        reinterpret_cast<const thrust::complex<FPTYPE>*>(aux), force);
     
     hipCheckOnDebug();