diff --git a/source/module_base/module_device/device.cpp b/source/module_base/module_device/device.cpp
index b20ea9f3ad..9373a5a31a 100644
--- a/source/module_base/module_device/device.cpp
+++ b/source/module_base/module_device/device.cpp
@@ -5,7 +5,7 @@
 
 #include <base/macros/macros.h>
 #include <cstring>
-
+#include <iostream>
 #ifdef __MPI
 #include "mpi.h"
 #endif
@@ -166,6 +166,11 @@ int device_count = -1;
 cudaGetDeviceCount(&device_count);
 #elif defined(__ROCM)
 hipGetDeviceCount(&device_count);
+/***auto start_time = std::chrono::high_resolution_clock::now();
+std::cout << "Starting hipGetDeviceCount.." << std::endl;
+auto end_time = std::chrono::high_resolution_clock::now();
+auto duration = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time);
+std::cout << "hipGetDeviceCount took " << duration.count() << "seconds" << std::endl;***/
 #endif
 if (device_count <= 0)
 {
@@ -711,4 +716,4 @@ void record_device_memory<base_device::DEVICE_GPU>(
 #endif
 
 } // end of namespace information
-} // end of namespace base_device
\ No newline at end of file
+} // end of namespace base_device
diff --git a/source/module_esolver/esolver.cpp b/source/module_esolver/esolver.cpp
index 0352492a3a..f8985387d9 100644
--- a/source/module_esolver/esolver.cpp
+++ b/source/module_esolver/esolver.cpp
@@ -107,13 +107,21 @@ std::string determine_type()
 	}
 	if (GlobalV::MY_RANK == 0)
 	{
-		std::cout << " RUNNING WITH DEVICE  : " << device_info << " / "
+		/***auto start_time = std::chrono::high_resolution_clock::now();
+        std::cout << "Starting hipGetDeviceInfo..." << std::endl;***/
+        std::cout << " RUNNING WITH DEVICE  : " << device_info << " / "
 			<< base_device::information::get_device_info(PARAM.inp.device) << std::endl;
+        /***auto end_time = std::chrono::high_resolution_clock::now();
+        auto duration = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time);
+        std::cout << "hipGetDeviceInfo took " << duration.count() << " seconds" << std::endl;***/
 	}
-
+   /*** auto start_time = std::chrono::high_resolution_clock::now();
+    std::cout << "Starting hipGetDeviceInfo..." << std::endl;***/
     GlobalV::ofs_running << "\n RUNNING WITH DEVICE  : " << device_info << " / "
-                         << base_device::information::get_device_info(PARAM.inp.device) << std::endl;
-
+                         << base_device::information::get_device_info(PARAM.inp.device) << std::endl; 
+    /***auto end_time = std::chrono::high_resolution_clock::now();
+    auto duration = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time);
+    std::cout << "hipGetDeviceInfo took " << duration.count() << " seconds" << std::endl;***/
     return esolver_type;
 }
 
diff --git a/source/module_esolver/esolver_ks_pw.cpp b/source/module_esolver/esolver_ks_pw.cpp
index 0b303f33e1..664ac186d5 100644
--- a/source/module_esolver/esolver_ks_pw.cpp
+++ b/source/module_esolver/esolver_ks_pw.cpp
@@ -556,7 +556,8 @@ void ESolver_KS_PW<T, Device>::hamilt2density_single(UnitCell& ucell,
                                                      hsolver::DiagoIterAssist<T, Device>::SCF_ITER,
                                                      hsolver::DiagoIterAssist<T, Device>::PW_DIAG_NMAX,
                                                      hsolver::DiagoIterAssist<T, Device>::PW_DIAG_THR,
-                                                     hsolver::DiagoIterAssist<T, Device>::need_subspace);
+                                                     hsolver::DiagoIterAssist<T, Device>::need_subspace,
+                                                     PARAM.inp.use_k_continuity);
 
         hsolver_pw_obj.solve(this->p_hamilt,
                              this->kspw_psi[0],
diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
index df29f88989..4cade5d1ac 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
@@ -15,6 +15,7 @@
 #include "module_hamilt_general/module_ewald/H_Ewald_pw.h"
 #include "module_hamilt_general/module_surchem/surchem.h"
 #include "module_hamilt_general/module_vdw/vdw.h"
+#include "kernels/force_op.h"
 
 #ifdef _OPENMP
 #include <omp.h>
@@ -531,31 +532,110 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
     // to G space. maybe need fftw with OpenMP
     rho_basis->real2recip(aux, aux);
 
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif
-    for (int iat = 0; iat < this->nat; ++iat)
-    {
-        // read `it` `ia` from the table
+    // sincos op for G space
+    
+    
+    // data preparation
+    std::vector<FPTYPE> tau_flat(this->nat * 3);
+    std::vector<FPTYPE> gcar_flat(rho_basis->npw * 3);
+    
+    
+    for (int iat = 0; iat < this->nat; iat++) {
+        int it = ucell.iat2it[iat];  
+        int ia = ucell.iat2ia[iat];  
+        
+        tau_flat[iat * 3 + 0] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][0]);
+        tau_flat[iat * 3 + 1] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][1]);
+        tau_flat[iat * 3 + 2] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][2]);
+    }
+    
+    for (int ig = 0; ig < rho_basis->npw; ig++) {
+        gcar_flat[ig * 3 + 0] = static_cast<FPTYPE>(rho_basis->gcar[ig][0]);
+        gcar_flat[ig * 3 + 1] = static_cast<FPTYPE>(rho_basis->gcar[ig][1]);
+        gcar_flat[ig * 3 + 2] = static_cast<FPTYPE>(rho_basis->gcar[ig][2]);
+    }
+    
+    // calculate vloc_factors for all atom types
+    std::vector<FPTYPE> vloc_per_type_host(ucell.ntype * rho_basis->npw);
+    for (int iat = 0; iat < this->nat; iat++) {
         int it = ucell.iat2it[iat];
-        int ia = ucell.iat2ia[iat];
-        for (int ig = 0; ig < rho_basis->npw; ig++)
-        {
-            const double phase = ModuleBase::TWO_PI * (rho_basis->gcar[ig] * ucell.atoms[it].tau[ia]);
-            double sinp, cosp;
-            ModuleBase::libm::sincos(phase, &sinp, &cosp);
-            const double factor
-                = vloc(it, rho_basis->ig2igg[ig]) * (cosp * aux[ig].imag() + sinp * aux[ig].real());
-            forcelc(iat, 0) += rho_basis->gcar[ig][0] * factor;
-            forcelc(iat, 1) += rho_basis->gcar[ig][1] * factor;
-            forcelc(iat, 2) += rho_basis->gcar[ig][2] * factor;
+        for (int ig = 0; ig < rho_basis->npw; ig++) {
+            vloc_per_type_host[iat * rho_basis->npw + ig] = static_cast<FPTYPE>(vloc(it, rho_basis->ig2igg[ig]));
         }
-        forcelc(iat, 0) *= (ucell.tpiba * ucell.omega);
-        forcelc(iat, 1) *= (ucell.tpiba * ucell.omega);
-        forcelc(iat, 2) *= (ucell.tpiba * ucell.omega);
+    }
+    
+    std::vector<std::complex<FPTYPE>> aux_fptype(rho_basis->npw);
+    for (int ig = 0; ig < rho_basis->npw; ig++) {
+        aux_fptype[ig] = static_cast<std::complex<FPTYPE>>(aux[ig]);
+    }
+    
+    FPTYPE* d_gcar = gcar_flat.data();
+    FPTYPE* d_tau = tau_flat.data();
+    FPTYPE* d_vloc_per_type = vloc_per_type_host.data();
+    std::complex<FPTYPE>* d_aux = aux_fptype.data();
+    FPTYPE* d_force = nullptr;
+    std::vector<FPTYPE> force_host(this->nat * 3);
+    
+    if (this->device == base_device::GpuDevice)
+    {
+        d_gcar = nullptr;
+        d_tau = nullptr;
+        d_vloc_per_type = nullptr;
+        d_aux = nullptr;
+        
+        resmem_var_op()(this->ctx, d_gcar, rho_basis->npw * 3);
+        resmem_var_op()(this->ctx, d_tau, this->nat * 3);
+        resmem_var_op()(this->ctx, d_vloc_per_type, ucell.ntype * rho_basis->npw);
+        resmem_complex_op()(this->ctx, d_aux, rho_basis->npw);
+        resmem_var_op()(this->ctx, d_force, this->nat * 3);
+        
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_gcar, gcar_flat.data(), rho_basis->npw * 3);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_tau, tau_flat.data(), this->nat * 3);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_vloc_per_type, vloc_per_type_host.data(), ucell.ntype * rho_basis->npw);
+        syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_aux, aux_fptype.data(), rho_basis->npw);
+        
+        base_device::memory::set_memory_op<FPTYPE, Device>()(this->ctx, d_force, 0.0, this->nat * 3);
+    }
+    else
+    {
+        d_force = force_host.data();
+        std::fill(force_host.begin(), force_host.end(), static_cast<FPTYPE>(0.0));
+    }
+    
+    const FPTYPE scale_factor = static_cast<FPTYPE>(ucell.tpiba * ucell.omega);
+    
+    // call op for sincos calculation
+    hamilt::cal_force_loc_sincos_op<FPTYPE, Device>()(
+        this->ctx,
+        this->nat,
+        rho_basis->npw,
+        ucell.ntype,
+        d_gcar,
+        d_tau,
+        d_vloc_per_type,
+        d_aux,
+        scale_factor,
+        d_force
+    );
+    
+    if (this->device == base_device::GpuDevice)
+    {
+        syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, force_host.data(), d_force, this->nat * 3);
+        
+        delmem_var_op()(this->ctx, d_gcar);
+        delmem_var_op()(this->ctx, d_tau);
+        delmem_var_op()(this->ctx, d_vloc_per_type);
+        delmem_complex_op()(this->ctx, d_aux);
+        delmem_var_op()(this->ctx, d_force);
+    }
+    
+    for (int iat = 0; iat < this->nat; iat++) {
+        forcelc(iat, 0) = static_cast<double>(force_host[iat * 3 + 0]);
+        forcelc(iat, 1) = static_cast<double>(force_host[iat * 3 + 1]);
+        forcelc(iat, 2) = static_cast<double>(force_host[iat * 3 + 2]);
     }
 
-    // this->print(GlobalV::ofs_running, "local forces", forcelc);
+    // this->print(GlobalV: :ofs_running, "local forces", forcelc);
     Parallel_Reduce::reduce_pool(forcelc.c, forcelc.nr * forcelc.nc);
     delete[] aux;
     ModuleBase::timer::tick("Forces", "cal_force_loc");
@@ -665,6 +745,119 @@ void Forces<FPTYPE, Device>::cal_force_ew(const UnitCell& ucell,
         aux[rho_basis->ig_gge0] = std::complex<double>(0.0, 0.0);
     }
 
+    // sincos op for cal_force_ew
+    
+    std::vector<FPTYPE> it_facts_host(this->nat);
+    std::vector<FPTYPE> tau_flat(this->nat * 3);
+    
+    // iterate over by lookup table
+    for (int iat = 0; iat < this->nat; iat++) {
+        int it = ucell.iat2it[iat];  
+        int ia = ucell.iat2ia[iat];  
+        
+        double zv;
+        if (PARAM.inp.use_paw)
+        {
+#ifdef USE_PAW
+            zv = GlobalC::paw_cell.get_val(it);
+#endif
+        }
+        else
+        {
+            zv = ucell.atoms[it].ncpp.zv;
+        }
+        
+        it_facts_host[iat] = static_cast<FPTYPE>(zv * ModuleBase::e2 * ucell.tpiba * 
+                                                ModuleBase::TWO_PI / ucell.omega * fact);
+        
+        tau_flat[iat * 3 + 0] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][0]);
+        tau_flat[iat * 3 + 1] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][1]); 
+        tau_flat[iat * 3 + 2] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][2]);
+    }
+    
+    std::vector<FPTYPE> gcar_flat(rho_basis->npw * 3);
+    for (int ig = 0; ig < rho_basis->npw; ig++) {
+        gcar_flat[ig * 3 + 0] = static_cast<FPTYPE>(rho_basis->gcar[ig][0]);
+        gcar_flat[ig * 3 + 1] = static_cast<FPTYPE>(rho_basis->gcar[ig][1]);
+        gcar_flat[ig * 3 + 2] = static_cast<FPTYPE>(rho_basis->gcar[ig][2]);
+    }
+    
+    std::vector<std::complex<FPTYPE>> aux_fptype(rho_basis->npw);
+    for (int ig = 0; ig < rho_basis->npw; ig++) {
+        aux_fptype[ig] = static_cast<std::complex<FPTYPE>>(aux[ig]);
+    }
+    
+    FPTYPE* d_gcar = gcar_flat.data();
+    FPTYPE* d_tau = tau_flat.data();
+    FPTYPE* d_it_facts = it_facts_host.data();
+    std::complex<FPTYPE>* d_aux = aux_fptype.data();
+    FPTYPE* d_force_g = nullptr;
+    std::vector<FPTYPE> force_g_host(this->nat * 3);
+    
+    if (this->device == base_device::GpuDevice)
+    {
+        d_gcar = nullptr;
+        d_tau = nullptr;
+        d_it_facts = nullptr;
+        d_aux = nullptr;
+        
+        resmem_var_op()(this->ctx, d_gcar, rho_basis->npw * 3);
+        resmem_var_op()(this->ctx, d_tau, this->nat * 3);
+        resmem_var_op()(this->ctx, d_it_facts, this->nat);
+        resmem_complex_op()(this->ctx, d_aux, rho_basis->npw);
+        resmem_var_op()(this->ctx, d_force_g, this->nat * 3);
+        
+        
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_gcar, gcar_flat.data(), rho_basis->npw * 3);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_tau, tau_flat.data(), this->nat * 3);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_it_facts, it_facts_host.data(), this->nat);
+        syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_aux, aux_fptype.data(), rho_basis->npw);
+        
+     
+        base_device::memory::set_memory_op<FPTYPE, Device>()(this->ctx, d_force_g, 0.0, this->nat * 3);
+    }
+    else
+    {
+        d_force_g = force_g_host.data();
+        std::fill(force_g_host.begin(), force_g_host.end(), static_cast<FPTYPE>(0.0));
+    }
+    
+    // call op for sincos calculation
+    hamilt::cal_force_ew_sincos_op<FPTYPE, Device>()(
+        this->ctx,
+        this->nat,
+        rho_basis->npw,
+        rho_basis->ig_gge0,  
+        d_gcar,
+        d_tau,
+        d_it_facts,
+        d_aux,
+        d_force_g
+    );
+    
+   
+    if (this->device == base_device::GpuDevice)
+    {
+        
+        syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, force_g_host.data(), d_force_g, this->nat * 3);
+        
+        
+        delmem_var_op()(this->ctx, d_gcar);
+        delmem_var_op()(this->ctx, d_tau);
+        delmem_var_op()(this->ctx, d_it_facts);
+        delmem_complex_op()(this->ctx, d_aux);
+        delmem_var_op()(this->ctx, d_force_g);
+    }
+    
+
+    for (int iat = 0; iat < this->nat; iat++) {
+        forceion(iat, 0) += static_cast<double>(force_g_host[iat * 3 + 0]);
+        forceion(iat, 1) += static_cast<double>(force_g_host[iat * 3 + 1]); 
+        forceion(iat, 2) += static_cast<double>(force_g_host[iat * 3 + 2]);
+    }
+
+   
+// calculate real space force
 #ifdef _OPENMP
 #pragma omp parallel
     {
@@ -688,66 +881,7 @@ void Forces<FPTYPE, Device>::cal_force_ew(const UnitCell& ucell,
         iat_end = iat_beg + iat_end;
         ucell.iat2iait(iat_beg, &ia_beg, &it_beg);
 
-        int iat = iat_beg;
-        int it = it_beg;
-        int ia = ia_beg;
-
-        // preprocess ig_gap for skipping the ig point
-        int ig_gap = (rho_basis->ig_gge0 >= 0 && rho_basis->ig_gge0 < rho_basis->npw) ? rho_basis->ig_gge0 : -1;
-
-        double it_fact = 0.;
-        int last_it = -1;
-
-        // iterating atoms
-        while (iat < iat_end)
-        {
-            if (it != last_it)
-            { // calculate it_tact when it is changed
-                double zv;
-                if (PARAM.inp.use_paw)
-                {
-#ifdef USE_PAW
-                    zv = GlobalC::paw_cell.get_val(it);
-#endif
-                }
-                else
-                {
-                    zv = ucell.atoms[it].ncpp.zv;
-                }
-                it_fact = zv * ModuleBase::e2 * ucell.tpiba * ModuleBase::TWO_PI / ucell.omega * fact;
-                last_it = it;
-            }
-
-            if (ucell.atoms[it].na != 0)
-            {
-                const auto ig_loop = [&](int ig_beg, int ig_end) {
-                    for (int ig = ig_beg; ig < ig_end; ig++)
-                    {
-                        const ModuleBase::Vector3<double> gcar = rho_basis->gcar[ig];
-                        const double arg = ModuleBase::TWO_PI * (gcar * ucell.atoms[it].tau[ia]);
-                        double sinp, cosp;
-                        ModuleBase::libm::sincos(arg, &sinp, &cosp);
-                        double sumnb = -cosp * aux[ig].imag() + sinp * aux[ig].real();
-                        forceion(iat, 0) += gcar[0] * sumnb;
-                        forceion(iat, 1) += gcar[1] * sumnb;
-                        forceion(iat, 2) += gcar[2] * sumnb;
-                    }
-                };
-
-                // skip ig_gge0 point by separating ig loop into two part
-                ig_loop(0, ig_gap);
-                ig_loop(ig_gap + 1, rho_basis->npw);
-
-                forceion(iat, 0) *= it_fact;
-                forceion(iat, 1) *= it_fact;
-                forceion(iat, 2) *= it_fact;
-
-                ++iat;
-                ucell.step_iait(&ia, &it);
-            }
-        }
-
-        // means that the processor contains G=0 term.
+        
         if (rho_basis->ig_gge0 >= 0)
         {
             double rmax = 5.0 / (sqrt(alpha) * ucell.lat0);
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
index 5d0656d105..5ff3ddf2b1 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
@@ -13,6 +13,122 @@
 
 namespace hamilt {
 
+// CUDA kernels for sincos loops
+template <typename FPTYPE>
+__global__ void cal_force_loc_sincos_kernel(
+    const int nat,
+    const int npw,
+    const int ntype,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* vloc_per_type,
+    const thrust::complex<FPTYPE>* aux,
+    const FPTYPE scale_factor,
+    FPTYPE* force)
+{
+    const FPTYPE TWO_PI = 2.0 * M_PI;
+    
+    const int iat = blockIdx.y;
+    const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
+    const int ig_stride = gridDim.x * blockDim.x;
+    
+    if (iat >= nat) return;
+    
+    // Load atom information to registers
+    const FPTYPE tau_x = tau[iat * 3 + 0];
+    const FPTYPE tau_y = tau[iat * 3 + 1];
+    const FPTYPE tau_z = tau[iat * 3 + 2];
+    
+    // Local accumulation variables
+    FPTYPE local_force_x = 0.0;
+    FPTYPE local_force_y = 0.0;
+    FPTYPE local_force_z = 0.0;
+    
+    // Grid-stride loop over G-vectors
+    for (int ig = ig_start; ig < npw; ig += ig_stride) {
+        // Calculate phase: 2π * (G · τ)
+        const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + 
+                                       gcar[ig * 3 + 1] * tau_y + 
+                                       gcar[ig * 3 + 2] * tau_z);
+        
+        // Use CUDA intrinsic for sincos
+        FPTYPE sinp, cosp;
+        sincos(phase, &sinp, &cosp);
+        
+        // Calculate force factor
+        const FPTYPE vloc_factor = vloc_per_type[iat * npw + ig];
+        const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real());
+        
+        // Accumulate force contributions
+        local_force_x += gcar[ig * 3 + 0] * factor;
+        local_force_y += gcar[ig * 3 + 1] * factor;
+        local_force_z += gcar[ig * 3 + 2] * factor;
+    }
+    
+    // Atomic add to global memory
+    atomicAdd(&force[iat * 3 + 0], local_force_x * scale_factor);
+    atomicAdd(&force[iat * 3 + 1], local_force_y * scale_factor);
+    atomicAdd(&force[iat * 3 + 2], local_force_z * scale_factor);
+}
+
+template <typename FPTYPE>
+__global__ void cal_force_ew_sincos_kernel(
+    const int nat,
+    const int npw,
+    const int ig_gge0,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* it_facts,
+    const thrust::complex<FPTYPE>* aux,
+    FPTYPE* force)
+{
+    const FPTYPE TWO_PI = 2.0 * M_PI;
+    
+    const int iat = blockIdx.y;
+    const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
+    const int ig_stride = gridDim.x * blockDim.x;
+    
+    if (iat >= nat) return;
+    
+    // Load atom information to registers
+    const FPTYPE tau_x = tau[iat * 3 + 0];
+    const FPTYPE tau_y = tau[iat * 3 + 1];
+    const FPTYPE tau_z = tau[iat * 3 + 2];
+    const FPTYPE it_fact = it_facts[iat];
+    
+    // Local accumulation variables
+    FPTYPE local_force_x = 0.0;
+    FPTYPE local_force_y = 0.0;
+    FPTYPE local_force_z = 0.0;
+    
+    // Grid-stride loop over G-vectors
+    for (int ig = ig_start; ig < npw; ig += ig_stride) {
+        // Skip G=0 term
+        if (ig == ig_gge0) continue;
+        
+        // Calculate phase: 2π * (G · τ)
+        const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + 
+                                       gcar[ig * 3 + 1] * tau_y + 
+                                       gcar[ig * 3 + 2] * tau_z);
+        
+        // Use CUDA intrinsic for sincos
+        FPTYPE sinp, cosp;
+        sincos(phase, &sinp, &cosp);
+        
+        // Calculate Ewald sum contribution (fixed sign error)
+        const FPTYPE factor = it_fact * (-cosp * aux[ig].imag() + sinp * aux[ig].real());
+        
+        // Accumulate force contributions
+        local_force_x += gcar[ig * 3 + 0] * factor;
+        local_force_y += gcar[ig * 3 + 1] * factor;
+        local_force_z += gcar[ig * 3 + 2] * factor;
+    }
+    
+    // Atomic add to global memory
+    atomicAdd(&force[iat * 3 + 0], local_force_x);
+    atomicAdd(&force[iat * 3 + 1], local_force_y);
+    atomicAdd(&force[iat * 3 + 2], local_force_z);
+}
 
 template <typename FPTYPE>
 __global__ void cal_vkb1_nl(
@@ -188,6 +304,65 @@ void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_dev
     cudaCheckOnDebug();
 }
 
+// GPU operators
+template <typename FPTYPE>
+void cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& nat,
+    const int& npw,
+    const int& ntype,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* vloc_per_type,
+    const std::complex<FPTYPE>* aux,
+    const FPTYPE& scale_factor,
+    FPTYPE* force)
+{
+    // Calculate optimal grid configuration for GPU load balancing
+    const int threads_per_block = THREADS_PER_BLOCK;
+    const int max_blocks_per_sm = 32; // Configurable per GPU architecture
+    const int max_blocks_x = std::min(max_blocks_per_sm, (npw + threads_per_block - 1) / threads_per_block);
+    
+    dim3 grid(max_blocks_x, nat);
+    dim3 block(threads_per_block);
+    
+    cal_force_loc_sincos_kernel<FPTYPE><<<grid, block>>>(
+        nat, npw, ntype, gcar, tau, vloc_per_type,
+        reinterpret_cast<const thrust::complex<FPTYPE>*>(aux),
+        scale_factor, force
+    );
+    
+    cudaCheckOnDebug();
+}
+
+template <typename FPTYPE>
+void cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& nat,
+    const int& npw,
+    const int& ig_gge0,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* it_facts,
+    const std::complex<FPTYPE>* aux,
+    FPTYPE* force)
+{
+    // Calculate optimal grid configuration for GPU load balancing
+    const int threads_per_block = THREADS_PER_BLOCK;
+    const int max_blocks_per_sm = 32; // Configurable per GPU architecture
+    const int max_blocks_x = std::min(max_blocks_per_sm, (npw + threads_per_block - 1) / threads_per_block);
+    
+    dim3 grid(max_blocks_x, nat);
+    dim3 block(threads_per_block);
+    
+    cal_force_ew_sincos_kernel<FPTYPE><<<grid, block>>>(
+        nat, npw, ig_gge0, gcar, tau, it_facts,
+        reinterpret_cast<const thrust::complex<FPTYPE>*>(aux), force
+    );
+    
+    cudaCheckOnDebug();
+}
+
 template <typename FPTYPE>
 __global__ void cal_force_nl(
         const int ntype,
@@ -613,8 +788,12 @@ template void saveVkbValues<double>(const int *gcar_zero_ptrs, const std::comple
 
 template struct cal_vkb1_nl_op<float, base_device::DEVICE_GPU>;
 template struct cal_force_nl_op<float, base_device::DEVICE_GPU>;
+template struct cal_force_loc_sincos_op<float, base_device::DEVICE_GPU>;
+template struct cal_force_ew_sincos_op<float, base_device::DEVICE_GPU>;
 
 template struct cal_vkb1_nl_op<double, base_device::DEVICE_GPU>;
 template struct cal_force_nl_op<double, base_device::DEVICE_GPU>;
+template struct cal_force_loc_sincos_op<double, base_device::DEVICE_GPU>;
+template struct cal_force_ew_sincos_op<double, base_device::DEVICE_GPU>;
 
 }  // namespace hamilt
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
index 6d797e147d..109321d6c3 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
@@ -1,4 +1,8 @@
 #include "module_hamilt_pw/hamilt_pwdft/kernels/force_op.h"
+#include "module_base/libm/libm.h"
+#include "module_base/tool_threading.h"
+#include "module_hamilt_general/module_ewald/H_Ewald_pw.h"
+#include <cmath>
 
 #ifdef _OPENMP
 #include <omp.h>
@@ -424,10 +428,116 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_CPU>
     }
 };
 
+// CPU implementation of local force sincos operator
+template <typename FPTYPE>
+struct cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_CPU>
+{
+    void operator()(const base_device::DEVICE_CPU* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ntype,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const FPTYPE* vloc_per_type,
+                    const std::complex<FPTYPE>* aux,
+                    const FPTYPE& scale_factor,
+                    FPTYPE* force)
+    {
+        const FPTYPE TWO_PI = 2.0 * M_PI;
+
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+        for (int iat = 0; iat < nat; ++iat)
+        {
+            const FPTYPE tau_x = tau[iat * 3 + 0];
+            const FPTYPE tau_y = tau[iat * 3 + 1];
+            const FPTYPE tau_z = tau[iat * 3 + 2];
+
+            FPTYPE local_force[3] = {0.0, 0.0, 0.0};
+
+            for (int ig = 0; ig < npw; ig++)
+            {
+                const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + 
+                                               gcar[ig * 3 + 1] * tau_y + 
+                                               gcar[ig * 3 + 2] * tau_z);
+                FPTYPE sinp, cosp;
+                ModuleBase::libm::sincos(phase, &sinp, &cosp);
+                
+                const FPTYPE vloc_factor = vloc_per_type[iat * npw + ig];
+                const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real()) * scale_factor;
+                
+                local_force[0] += gcar[ig * 3 + 0] * factor;
+                local_force[1] += gcar[ig * 3 + 1] * factor;
+                local_force[2] += gcar[ig * 3 + 2] * factor;
+            }
+
+            force[iat * 3 + 0] = local_force[0];
+            force[iat * 3 + 1] = local_force[1];
+            force[iat * 3 + 2] = local_force[2];
+        }
+    }
+};
+
+// CPU implementation of Ewald force sincos operator
+template <typename FPTYPE>
+struct cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_CPU>
+{
+    void operator()(const base_device::DEVICE_CPU* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ig_gge0,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const FPTYPE* it_facts,
+                    const std::complex<FPTYPE>* aux,
+                    FPTYPE* force)
+    {
+        const FPTYPE TWO_PI = 2.0 * M_PI;
+
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+        for (int iat = 0; iat < nat; ++iat)
+        {
+            const FPTYPE tau_x = tau[iat * 3 + 0];
+            const FPTYPE tau_y = tau[iat * 3 + 1];
+            const FPTYPE tau_z = tau[iat * 3 + 2];
+            const FPTYPE it_fact = it_facts[iat];
+
+            FPTYPE local_force[3] = {0.0, 0.0, 0.0};
+
+            for (int ig = 0; ig < npw; ig++)
+            {
+                // Skip G=0 term
+                if (ig == ig_gge0) continue;
+
+                const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + 
+                                               gcar[ig * 3 + 1] * tau_y + 
+                                               gcar[ig * 3 + 2] * tau_z);
+                FPTYPE sinp, cosp;
+                ModuleBase::libm::sincos(phase, &sinp, &cosp);
+                
+                const FPTYPE factor = it_fact * (-cosp * aux[ig].imag() + sinp * aux[ig].real());
+                
+                local_force[0] += gcar[ig * 3 + 0] * factor;
+                local_force[1] += gcar[ig * 3 + 1] * factor;
+                local_force[2] += gcar[ig * 3 + 2] * factor;
+            }
+
+            force[iat * 3 + 0] = local_force[0];
+            force[iat * 3 + 1] = local_force[1];
+            force[iat * 3 + 2] = local_force[2];
+        }
+    }
+};
+
 template struct cal_vkb1_nl_op<float, base_device::DEVICE_CPU>;
 template struct cal_force_nl_op<float, base_device::DEVICE_CPU>;
-
+template struct cal_force_loc_sincos_op<float, base_device::DEVICE_CPU>;
+template struct cal_force_ew_sincos_op<float, base_device::DEVICE_CPU>;
 template struct cal_vkb1_nl_op<double, base_device::DEVICE_CPU>;
 template struct cal_force_nl_op<double, base_device::DEVICE_CPU>;
-
+template struct cal_force_loc_sincos_op<double, base_device::DEVICE_CPU>;
+template struct cal_force_ew_sincos_op<double, base_device::DEVICE_CPU>;
 } // namespace hamilt
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
index 3aa5d4f87e..acf490b278 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
@@ -146,6 +146,64 @@ struct cal_force_nl_op
                     const FPTYPE* lambda,
                     const std::complex<FPTYPE>* becp,
                     const std::complex<FPTYPE>* dbecp,
+                                          FPTYPE* force);
+};
+
+template <typename FPTYPE, typename Device>
+struct cal_force_loc_sincos_op
+{
+    /// @brief Calculate local pseudopotential forces (sincos loop only)
+    ///
+    /// Input Parameters
+    /// @param ctx - which device this function runs on
+    /// @param nat - number of atoms
+    /// @param npw - number of plane waves
+    /// @param ntype - number of atom types
+    /// @param gcar - G-vector Cartesian coordinates [npw * 3]
+    /// @param tau - atomic positions [nat * 3]
+    /// @param vloc_per_type - precomputed vloc factors per atom [nat * npw]
+    /// @param aux - charge density in G-space [npw]
+    /// @param scale_factor - tpiba * omega
+    ///
+    /// Output Parameters
+    /// @param force - output forces [nat * 3]
+    void operator()(const Device* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ntype,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const FPTYPE* vloc_per_type,
+                    const std::complex<FPTYPE>* aux,
+                    const FPTYPE& scale_factor,
+                    FPTYPE* force);
+};
+
+template <typename FPTYPE, typename Device>
+struct cal_force_ew_sincos_op
+{
+    /// @brief Calculate Ewald forces (sincos loop only)
+    ///
+    /// Input Parameters
+    /// @param ctx - which device this function runs on
+    /// @param nat - number of atoms
+    /// @param npw - number of plane waves
+    /// @param ig_gge0 - index of G=0 vector (-1 if not present)
+    /// @param gcar - G-vector Cartesian coordinates [npw * 3]
+    /// @param tau - atomic positions [nat * 3]
+    /// @param it_facts - precomputed it_fact for each atom [nat]
+    /// @param aux - structure factor related array [npw]
+    ///
+    /// Output Parameters
+    /// @param force - output forces [nat * 3]
+    void operator()(const Device* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ig_gge0,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const FPTYPE* it_facts,
+                    const std::complex<FPTYPE>* aux,
                     FPTYPE* force);
 };
 
@@ -248,6 +306,35 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>
                     FPTYPE* force);
 };
 
+template <typename FPTYPE>
+struct cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_GPU>
+{
+    void operator()(const base_device::DEVICE_GPU* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ntype,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const FPTYPE* vloc_per_type,
+                    const std::complex<FPTYPE>* aux,
+                    const FPTYPE& scale_factor,
+                    FPTYPE* force);
+};
+
+template <typename FPTYPE>
+struct cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_GPU>
+{
+    void operator()(const base_device::DEVICE_GPU* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ig_gge0,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const FPTYPE* it_facts,
+                    const std::complex<FPTYPE>* aux,
+                    FPTYPE* force);
+};
+
 /**
  * @brief revert the vkb values for force_nl calculation
  */
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu
index c78b333b86..6bb3a84e7e 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu
@@ -618,10 +618,190 @@ template void revertVkbValues<double>(const int *gcar_zero_ptrs, std::complex<do
 // for saveVkbValues functions instantiation
 template void saveVkbValues<double>(const int *gcar_zero_ptrs, const std::complex<double> *vkb_ptr, std::complex<double> *vkb_save_ptr, int nkb, int gcar_zero_count, int npw, int ipol, int npwx);
 
+// HIP kernels for sincos loops
+template <typename FPTYPE>
+__global__ void cal_force_loc_sincos_kernel(
+    const int nat,
+    const int npw,
+    const int ntype,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* vloc_per_type,
+    const thrust::complex<FPTYPE>* aux,
+    const FPTYPE scale_factor,
+    FPTYPE* force)
+{
+    const FPTYPE TWO_PI = 2.0 * M_PI;
+    
+    const int iat = blockIdx.y;
+    const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
+    const int ig_stride = gridDim.x * blockDim.x;
+    
+    if (iat >= nat) return;
+    
+    // Load atom information to registers
+    const FPTYPE tau_x = tau[iat * 3 + 0];
+    const FPTYPE tau_y = tau[iat * 3 + 1];
+    const FPTYPE tau_z = tau[iat * 3 + 2];
+    
+    // Local accumulation variables
+    FPTYPE local_force_x = 0.0;
+    FPTYPE local_force_y = 0.0;
+    FPTYPE local_force_z = 0.0;
+    
+    // Grid-stride loop over G-vectors
+    for (int ig = ig_start; ig < npw; ig += ig_stride) {
+        // Calculate phase: 2π * (G · τ)
+        const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + 
+                                       gcar[ig * 3 + 1] * tau_y + 
+                                       gcar[ig * 3 + 2] * tau_z);
+        
+        // Use HIP intrinsic for sincos
+        FPTYPE sinp, cosp;
+        sincos(phase, &sinp, &cosp);
+        
+        // Calculate force factor
+        const FPTYPE vloc_factor = vloc_per_type[iat * npw + ig];
+        const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real());
+        
+        // Accumulate force contributions
+        local_force_x += gcar[ig * 3 + 0] * factor;
+        local_force_y += gcar[ig * 3 + 1] * factor;
+        local_force_z += gcar[ig * 3 + 2] * factor;
+    }
+    
+    // Atomic add to global memory
+    atomicAdd(&force[iat * 3 + 0], local_force_x * scale_factor);
+    atomicAdd(&force[iat * 3 + 1], local_force_y * scale_factor);
+    atomicAdd(&force[iat * 3 + 2], local_force_z * scale_factor);
+}
+
+template <typename FPTYPE>
+__global__ void cal_force_ew_sincos_kernel(
+    const int nat,
+    const int npw,
+    const int ig_gge0,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* it_facts,
+    const thrust::complex<FPTYPE>* aux,
+    FPTYPE* force)
+{
+    const FPTYPE TWO_PI = 2.0 * M_PI;
+    
+    const int iat = blockIdx.y;
+    const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
+    const int ig_stride = gridDim.x * blockDim.x;
+    
+    if (iat >= nat) return;
+    
+    // Load atom information to registers
+    const FPTYPE tau_x = tau[iat * 3 + 0];
+    const FPTYPE tau_y = tau[iat * 3 + 1];
+    const FPTYPE tau_z = tau[iat * 3 + 2];
+    const FPTYPE it_fact = it_facts[iat];
+    
+    // Local accumulation variables
+    FPTYPE local_force_x = 0.0;
+    FPTYPE local_force_y = 0.0;
+    FPTYPE local_force_z = 0.0;
+    
+    // Grid-stride loop over G-vectors
+    for (int ig = ig_start; ig < npw; ig += ig_stride) {
+        // Skip G=0 term
+        if (ig == ig_gge0) continue;
+        
+        // Calculate phase: 2π * (G · τ)
+        const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + 
+                                       gcar[ig * 3 + 1] * tau_y + 
+                                       gcar[ig * 3 + 2] * tau_z);
+        
+        // Use HIP intrinsic for sincos
+        FPTYPE sinp, cosp;
+        sincos(phase, &sinp, &cosp);
+        
+        // Calculate Ewald sum contribution (fixed sign error)
+        const FPTYPE factor = it_fact * (-cosp * aux[ig].imag() + sinp * aux[ig].real());
+        
+        // Accumulate force contributions
+        local_force_x += gcar[ig * 3 + 0] * factor;
+        local_force_y += gcar[ig * 3 + 1] * factor;
+        local_force_z += gcar[ig * 3 + 2] * factor;
+    }
+    
+    // Atomic add to global memory
+    atomicAdd(&force[iat * 3 + 0], local_force_x);
+    atomicAdd(&force[iat * 3 + 1], local_force_y);
+    atomicAdd(&force[iat * 3 + 2], local_force_z);
+}
+
+// GPU operators
+template <typename FPTYPE>
+void cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& nat,
+    const int& npw,
+    const int& ntype,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* vloc_per_type,
+    const std::complex<FPTYPE>* aux,
+    const FPTYPE& scale_factor,
+    FPTYPE* force)
+{
+    // Calculate optimal grid configuration for GPU load balancing
+    const int threads_per_block = THREADS_PER_BLOCK;
+    const int max_blocks_per_sm = 32; // Configurable per GPU architecture
+    const int max_blocks_x = std::min(max_blocks_per_sm, (npw + threads_per_block - 1) / threads_per_block);
+    
+    dim3 grid(max_blocks_x, nat);
+    dim3 block(threads_per_block);
+    
+    hipLaunchKernelGGL(cal_force_loc_sincos_kernel<FPTYPE>,
+                       grid, block, 0, 0,
+                       nat, npw, ntype, gcar, tau, vloc_per_type,
+                       reinterpret_cast<const thrust::complex<FPTYPE>*>(aux),
+                       scale_factor, force);
+    
+    hipCheckOnDebug();
+}
+
+template <typename FPTYPE>
+void cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& nat,
+    const int& npw,
+    const int& ig_gge0,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* it_facts,
+    const std::complex<FPTYPE>* aux,
+    FPTYPE* force)
+{
+    // Calculate optimal grid configuration for GPU load balancing
+    const int threads_per_block = THREADS_PER_BLOCK;
+    const int max_blocks_per_sm = 32; // Configurable per GPU architecture
+    const int max_blocks_x = std::min(max_blocks_per_sm, (npw + threads_per_block - 1) / threads_per_block);
+    
+    dim3 grid(max_blocks_x, nat);
+    dim3 block(threads_per_block);
+    
+    hipLaunchKernelGGL(cal_force_ew_sincos_kernel<FPTYPE>,
+                       grid, block, 0, 0,
+                       nat, npw, ig_gge0, gcar, tau, it_facts,
+                       reinterpret_cast<const thrust::complex<FPTYPE>*>(aux), force);
+    
+    hipCheckOnDebug();
+}
+
 template struct cal_vkb1_nl_op<float, base_device::DEVICE_GPU>;
 template struct cal_force_nl_op<float, base_device::DEVICE_GPU>;
+template struct cal_force_loc_sincos_op<float, base_device::DEVICE_GPU>;
+template struct cal_force_ew_sincos_op<float, base_device::DEVICE_GPU>;
 
 template struct cal_vkb1_nl_op<double, base_device::DEVICE_GPU>;
 template struct cal_force_nl_op<double, base_device::DEVICE_GPU>;
+template struct cal_force_loc_sincos_op<double, base_device::DEVICE_GPU>;
+template struct cal_force_ew_sincos_op<double, base_device::DEVICE_GPU>;
 
 }  // namespace hamilt
diff --git a/source/module_hsolver/diago_dav_subspace.cpp b/source/module_hsolver/diago_dav_subspace.cpp
index b180d72c13..f07f4cfefa 100644
--- a/source/module_hsolver/diago_dav_subspace.cpp
+++ b/source/module_hsolver/diago_dav_subspace.cpp
@@ -7,7 +7,6 @@
 #include "module_hsolver/kernels/dngvd_op.h"
 #include "module_hsolver/kernels/math_kernel_op.h"
 #include "module_base/kernels/dsp/dsp_connector.h"
-
 #include <vector>
 
 using namespace hsolver;
@@ -60,7 +59,7 @@ Diago_DavSubspace<T, Device>::Diago_DavSubspace(const std::vector<Real>& precond
     if (this->device == base_device::GpuDevice)
     {
         resmem_real_op()(this->ctx, this->d_precondition, nbasis_in);
-        // syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, this->d_precondition, this->precondition.data(), nbasis_in);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, this->d_precondition, this->precondition.data(), nbasis_in);
     }
 #endif
 }
@@ -288,27 +287,28 @@ void Diago_DavSubspace<T, Device>::cal_grad(const HPsiFunc& hpsi_func,
                          psi_iter + (nbase) * this->dim,
                          this->dim);
 
-    std::vector<Real> e_temp_cpu(nbase, 0);
+    // Eigenvalues operation section
+    std::vector<Real> e_temp_cpu(this->notconv, 0);
     Real* e_temp_hd = e_temp_cpu.data();
-    if(this->device == base_device::GpuDevice)
+    if (this->device == base_device::GpuDevice)
     {
         e_temp_hd = nullptr;
-        resmem_real_op()(this->ctx, e_temp_hd, nbase);
+        resmem_real_op()(this->ctx, e_temp_hd, this->notconv);
     }
-    for (int m = 0; m < notconv; m++)
+
+    for (int m = 0; m < this->notconv; m++)
     {
-        e_temp_cpu.assign(nbase, (-1.0 * (*eigenvalue_iter)[m]));
-        if (this->device == base_device::GpuDevice)
-        {
-            syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, e_temp_hd, e_temp_cpu.data(), nbase);
-        }
-        vector_mul_vector_op<T, Device>()(this->ctx,
-                                                nbase,
-                                                vcc + m * this->nbase_x,
-                                                vcc + m * this->nbase_x,
-                                                e_temp_hd);
+        e_temp_cpu[m] = -(*eigenvalue_iter)[m];
     }
-    if(this->device == base_device::GpuDevice)
+
+    if (this->device == base_device::GpuDevice)
+    {
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, e_temp_hd, e_temp_cpu.data(), this->notconv);
+    }
+    
+    apply_eigenvalues_op<T, Device>()(this->ctx, nbase, this->nbase_x, this->notconv, this->vcc, this->vcc, e_temp_hd);
+
+    if (this->device == base_device::GpuDevice)
     {
         delmem_real_op()(this->ctx, e_temp_hd);
     }
@@ -333,54 +333,62 @@ void Diago_DavSubspace<T, Device>::cal_grad(const HPsiFunc& hpsi_func,
                          psi_iter + nbase * this->dim,
                          this->dim);
 
-    // "precondition!!!"
-    std::vector<Real> pre(this->dim, 0.0);
-    for (int m = 0; m < notconv; m++)
-    {
-        for (size_t i = 0; i < this->dim; i++)
-        {
-            // pre[i] = std::abs(this->precondition[i] - (*eigenvalue_iter)[m]);
-            double x = std::abs(this->precondition[i] - (*eigenvalue_iter)[m]);
-            pre[i] = 0.5 * (1.0 + x + sqrt(1 + (x - 1.0) * (x - 1.0)));
-        }
+    // Precondition section
 #if defined(__CUDA) || defined(__ROCM)
-        if (this->device == base_device::GpuDevice)
-        {
-            syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, this->d_precondition, pre.data(), this->dim);
-            vector_div_vector_op<T, Device>()(this->ctx,
-                                              this->dim,
-                                              psi_iter + (nbase + m) * this->dim,
-                                              psi_iter + (nbase + m) * this->dim,
-                                              this->d_precondition);
-        }
-        else
+    if (this->device == base_device::GpuDevice)
+    {
+        Real* eigenvalues_gpu = nullptr;
+        resmem_real_op()(this->ctx, eigenvalues_gpu, notconv);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, eigenvalues_gpu,(*eigenvalue_iter).data(), notconv);
+        
+        precondition_op<T, Device>()(this->ctx,
+                                    this->dim,
+                                    psi_iter,
+                                    nbase,
+                                    notconv,
+                                    d_precondition,
+                                    eigenvalues_gpu);
+        delmem_real_op()(this->ctx, eigenvalues_gpu);
+    }
+    else
 #endif
-        {
-            vector_div_vector_op<T, Device>()(this->ctx,
-                                              this->dim,
-                                              psi_iter + (nbase + m) * this->dim,
-                                              psi_iter + (nbase + m) * this->dim,
-                                              pre.data());
-        }
+    {
+        precondition_op<T, Device>()(this->ctx,
+                                    this->dim,
+                                    psi_iter,
+                                    nbase,
+                                    notconv,
+                                    this->precondition.data(),
+                                    (*eigenvalue_iter).data());
     }
 
-    // "normalize!!!" in order to improve numerical stability of subspace diagonalization
-    std::vector<Real> psi_norm(notconv, 0.0);
-    for (size_t i = 0; i < notconv; i++)
+    // Normalize section
+#if defined(__CUDA) || defined(__ROCM)
+    if (this->device == base_device::GpuDevice)
+    {
+        Real* psi_norm = nullptr;
+        resmem_real_op()(this->ctx, psi_norm, notconv);
+        using setmem_real_op = base_device::memory::set_memory_op<Real, Device>;
+        setmem_real_op()(this->ctx, psi_norm, 0.0, notconv);
+        
+        normalize_op<T, Device>()(this->ctx,
+                                this->dim,
+                                psi_iter,
+                                nbase,
+                                notconv,
+                                psi_norm);
+        delmem_real_op()(this->ctx, psi_norm);
+    }
+    else
+#endif
     {
-        psi_norm[i] = dot_real_op<T, Device>()(this->ctx,
-                                               this->dim,
-                                               psi_iter + (nbase + i) * this->dim,
-                                               psi_iter + (nbase + i) * this->dim,
-                                               true);
-        assert(psi_norm[i] > 0.0);
-        psi_norm[i] = sqrt(psi_norm[i]);
-
-        vector_div_constant_op<T, Device>()(this->ctx,
-                                            this->dim,
-                                            psi_iter + (nbase + i) * this->dim,
-                                            psi_iter + (nbase + i) * this->dim,
-                                            psi_norm[i]);
+        Real* psi_norm = nullptr;
+        normalize_op<T, Device>()(this->ctx,
+                                this->dim,
+                                psi_iter,
+                                nbase,
+                                notconv,
+                                psi_norm);
     }
 
     // update hpsi[:, nbase:nbase+notconv]
diff --git a/source/module_hsolver/hsolver_pw.cpp b/source/module_hsolver/hsolver_pw.cpp
index 0c1ad2e8b8..d0648addf5 100644
--- a/source/module_hsolver/hsolver_pw.cpp
+++ b/source/module_hsolver/hsolver_pw.cpp
@@ -280,47 +280,106 @@ void HSolverPW<T, Device>::solve(hamilt::Hamilt<T, Device>* pHamilt,
     std::vector<Real> eigenvalues(this->wfc_basis->nks * psi.get_nbands(), 0.0);
     ethr_band.resize(psi.get_nbands(), this->diag_thr);
 
-    /// Loop over k points for solve Hamiltonian to charge density
-    for (int ik = 0; ik < this->wfc_basis->nks; ++ik)
-    {
-        /// update H(k) for each k point
-        pHamilt->updateHk(ik);
+    // Initialize k-point continuity if enabled
+    static int count = 0;
+    if (use_k_continuity) {
+        build_k_neighbors();
+    }
+
+    // Loop over k points for solve Hamiltonian to charge density
+    if (use_k_continuity) {
+        // K-point continuity case
+        for (int i = 0; i < this->wfc_basis->nks; ++i)
+        {
+            const int ik = k_order[i];
+            
+            // update H(k) for each k point
+            pHamilt->updateHk(ik);
 
 #ifdef USE_PAW
-        this->paw_func_in_kloop(ik, tpiba);
+            this->paw_func_in_kloop(ik, tpiba);
 #endif
 
-        /// update psi pointer for each k point
-        psi.fix_k(ik);
+            // update psi pointer for each k point
+            psi.fix_k(ik);
+
+            // If using k-point continuity and not first k-point, propagate from parent
+            if (ik > 0 && count == 0 && k_parent.find(ik) != k_parent.end()) {
+                propagate_psi(psi, k_parent[ik], ik);
+            }
 
-        // template add precondition calculating here
-        update_precondition(precondition, ik, this->wfc_basis->npwk[ik], Real(pes->pot->get_vl_of_0()));
+            // template add precondition calculating here
+            update_precondition(precondition, ik, this->wfc_basis->npwk[ik], Real(pes->pot->get_vl_of_0()));
 
-        // use smooth threshold for all iter methods
-        if (PARAM.inp.diago_smooth_ethr == true)
-        {
-            this->cal_smooth_ethr(pes->klist->wk[ik],
-                                  &pes->wg(ik, 0),
-                                  DiagoIterAssist<T, Device>::PW_DIAG_THR,
-                                  ethr_band);
-        }
+            // use smooth threshold for all iter methods
+            if (PARAM.inp.diago_smooth_ethr == true)
+            {
+                this->cal_smooth_ethr(pes->klist->wk[ik],
+                                    &pes->wg(ik, 0),
+                                    DiagoIterAssist<T, Device>::PW_DIAG_THR,
+                                    ethr_band);
+            }
 
 #ifdef USE_PAW
-        this->call_paw_cell_set_currentk(ik);
+            this->call_paw_cell_set_currentk(ik);
 #endif
 
-        /// solve eigenvector and eigenvalue for H(k)
-        this->hamiltSolvePsiK(pHamilt, psi, precondition, eigenvalues.data() + ik * psi.get_nbands(), this->wfc_basis->nks);
+            // solve eigenvector and eigenvalue for H(k)
+            this->hamiltSolvePsiK(pHamilt, psi, precondition, eigenvalues.data() + ik * psi.get_nbands(), this->wfc_basis->nks);
 
-        if (skip_charge)
+            if (skip_charge)
+            {
+                GlobalV::ofs_running << "Average iterative diagonalization steps for k-points " << ik
+                                    << " is: " << DiagoIterAssist<T, Device>::avg_iter
+                                    << " ; where current threshold is: " << this->diag_thr << " . " << std::endl;
+                DiagoIterAssist<T, Device>::avg_iter = 0.0;
+            }
+        }
+    }
+    else {
+        // Original code without k-point continuity
+        for (int ik = 0; ik < this->wfc_basis->nks; ++ik)
         {
-            GlobalV::ofs_running << "Average iterative diagonalization steps for k-points " << ik
-                                 << " is: " << DiagoIterAssist<T, Device>::avg_iter
-                                 << " ; where current threshold is: " << this->diag_thr << " . " << std::endl;
-            DiagoIterAssist<T, Device>::avg_iter = 0.0;
+            // update H(k) for each k point
+            pHamilt->updateHk(ik);
+
+#ifdef USE_PAW
+            this->paw_func_in_kloop(ik, tpiba);
+#endif
+
+            // update psi pointer for each k point
+            psi.fix_k(ik);
+
+            // template add precondition calculating here
+            update_precondition(precondition, ik, this->wfc_basis->npwk[ik], Real(pes->pot->get_vl_of_0()));
+
+            // use smooth threshold for all iter methods
+            if (PARAM.inp.diago_smooth_ethr == true)
+            {
+                this->cal_smooth_ethr(pes->klist->wk[ik],
+                                    &pes->wg(ik, 0),
+                                    DiagoIterAssist<T, Device>::PW_DIAG_THR,
+                                    ethr_band);
+            }
+
+#ifdef USE_PAW
+            this->call_paw_cell_set_currentk(ik);
+#endif
+
+            // solve eigenvector and eigenvalue for H(k)
+            this->hamiltSolvePsiK(pHamilt, psi, precondition, eigenvalues.data() + ik * psi.get_nbands(), this->wfc_basis->nks);
+
+            if (skip_charge)
+            {
+                GlobalV::ofs_running << "Average iterative diagonalization steps for k-points " << ik
+                                    << " is: " << DiagoIterAssist<T, Device>::avg_iter
+                                    << " ; where current threshold is: " << this->diag_thr << " . " << std::endl;
+                DiagoIterAssist<T, Device>::avg_iter = 0.0;
+            }
         }
-        /// calculate the contribution of Psi for charge density rho
     }
+    
+    count++;
     // END Loop over k points
 
     // copy eigenvalues to ekb in ElecState
@@ -666,6 +725,101 @@ void HSolverPW<T, Device>::output_iterInfo()
     }
 }
 
+template <typename T, typename Device>
+void HSolverPW<T, Device>::build_k_neighbors() {
+    const int nk = this->wfc_basis->nks;
+    kvecs_c.resize(nk);
+    k_order.clear();
+    k_order.reserve(nk);
+    
+    // Store k-points and corresponding indices
+    struct KPoint {
+        ModuleBase::Vector3<double> kvec;
+        int index;
+        double norm;
+        
+        KPoint(const ModuleBase::Vector3<double>& v, int i) : 
+            kvec(v), index(i), norm(v.norm()) {}
+    };
+    
+    // Build k-point list
+    std::vector<KPoint> klist;
+    for (int ik = 0; ik < nk; ++ik) {
+        kvecs_c[ik] = this->wfc_basis->kvec_c[ik];
+        klist.push_back(KPoint(kvecs_c[ik], ik));
+    }
+    
+    // Sort k-points by distance from origin
+    std::sort(klist.begin(), klist.end(),
+        [](const KPoint& a, const KPoint& b) {
+            return a.norm < b.norm;
+        });
+    
+    // Build parent-child relationships
+    k_order.push_back(klist[0].index);
+    
+    // Find nearest processed k-point as parent for each k-point
+    for (int i = 1; i < nk; ++i) {
+        int current_k = klist[i].index;
+        double min_dist = 1e10;
+        int parent = -1;
+        
+        // find the nearest k-point as parent
+        for (int j = 0; j < k_order.size(); ++j) {
+            int processed_k = k_order[j];
+            double dist = (kvecs_c[current_k] - kvecs_c[processed_k]).norm2();
+            if (dist < min_dist) {
+                min_dist = dist;
+                parent = processed_k;
+            }
+        }
+        
+        k_parent[current_k] = parent;
+        k_order.push_back(current_k);
+    }
+}
+
+template <typename T, typename Device>
+void HSolverPW<T, Device>::propagate_psi(psi::Psi<T, Device>& psi, const int from_ik, const int to_ik) {
+    const int nbands = psi.get_nbands();
+    const int npwk = this->wfc_basis->npwk[to_ik];
+    
+    // Get k-point difference
+    ModuleBase::Vector3<double> dk = kvecs_c[to_ik] - kvecs_c[from_ik];
+    
+    // Allocate porter locally
+    T* porter = nullptr;
+    resmem_complex_op()(this->ctx, porter, this->wfc_basis->nmaxgr, "HSolverPW::porter");
+    
+    // Process each band
+    for (int ib = 0; ib < nbands; ib++)
+    {
+        // Fix current k-point and band
+        // psi.fix_k(from_ik);
+        
+        // FFT to real space
+        // this->wfc_basis->recip_to_real(this->ctx, psi.get_pointer(ib), porter, from_ik);
+        this->wfc_basis->recip_to_real(this->ctx, &psi(from_ik, ib, 0), porter, from_ik);
+        
+        // Apply phase factor
+        //     // TODO: Check how to get the r vector
+        //     ModuleBase::Vector3<double> r = this->wfc_basis->get_ir2r(ir);
+        //     double phase = this->wfc_basis->tpiba * (dk.x * r.x + dk.y * r.y + dk.z * r.z);
+        //     psi_real[ir] *= std::exp(std::complex<double>(0.0, phase));
+        // }
+        
+        // Fix k-point for target
+        // psi.fix_k(to_ik);
+        
+        // FFT back to reciprocal space
+        // this->wfc_basis->real_to_recip(this->ctx, porter, psi.get_pointer(ib), to_ik, true);
+        this->wfc_basis->real_to_recip(this->ctx, porter, &psi(to_ik, ib, 0), to_ik);
+    }
+
+    // Clean up porter
+    delmem_complex_op()(this->ctx, porter);
+}
+
 template class HSolverPW<std::complex<float>, base_device::DEVICE_CPU>;
 template class HSolverPW<std::complex<double>, base_device::DEVICE_CPU>;
 #if ((defined __CUDA) || (defined __ROCM))
@@ -673,4 +827,4 @@ template class HSolverPW<std::complex<float>, base_device::DEVICE_GPU>;
 template class HSolverPW<std::complex<double>, base_device::DEVICE_GPU>;
 #endif
 
-} // namespace hsolver
\ No newline at end of file
+} // namespace hsolver
diff --git a/source/module_hsolver/hsolver_pw.h b/source/module_hsolver/hsolver_pw.h
index 0dee4fbdbf..7f0bfc7c23 100644
--- a/source/module_hsolver/hsolver_pw.h
+++ b/source/module_hsolver/hsolver_pw.h
@@ -6,6 +6,8 @@
 #include "module_base/macros.h"
 #include "module_basis/module_pw/pw_basis_k.h"
 #include "module_psi/wavefunc.h"
+#include <unordered_map>
+#include "module_base/memory.h"
 
 namespace hsolver
 {
@@ -18,6 +20,9 @@ class HSolverPW
     // return T if T is real type(float, double),
     // otherwise return the real type of T(complex<float>, complex<double>)
     using Real = typename GetTypeReal<T>::type;
+    using resmem_complex_op = base_device::memory::resize_memory_op<T, Device>;
+    using delmem_complex_op = base_device::memory::delete_memory_op<T, Device>;
+    using setmem_complex_op = base_device::memory::set_memory_op<T, Device>;
 
   public:
     HSolverPW(ModulePW::PW_Basis_K* wfc_basis_in,
@@ -30,10 +35,12 @@ class HSolverPW
               const int scf_iter_in,
               const int diag_iter_max_in,
               const double diag_thr_in,
-              const bool need_subspace_in)
+              const bool need_subspace_in,
+              const bool use_k_continuity_in = false)
         : wfc_basis(wfc_basis_in), calculation_type(calculation_type_in), basis_type(basis_type_in), method(method_in),
           use_paw(use_paw_in), use_uspp(use_uspp_in), nspin(nspin_in), scf_iter(scf_iter_in),
-          diag_iter_max(diag_iter_max_in), diag_thr(diag_thr_in), need_subspace(need_subspace_in){};
+          diag_iter_max(diag_iter_max_in), diag_thr(diag_thr_in), need_subspace(need_subspace_in),
+          use_k_continuity(use_k_continuity_in) {};
 
     /// @brief solve function for pw
     /// @param pHamilt interface to hamilt
@@ -51,6 +58,7 @@ class HSolverPW
                const double tpiba,
                const int nat);
 
+
   protected:
     // diago caller
     void hamiltSolvePsiK(hamilt::Hamilt<T, Device>* hm,
@@ -79,6 +87,8 @@ class HSolverPW
 
     const bool need_subspace; // for cg or dav_subspace
 
+    const bool use_k_continuity;
+
   protected:
     Device* ctx = {};
 
@@ -99,8 +109,16 @@ class HSolverPW
 
     void paw_func_after_kloop(psi::Psi<T, Device>& psi, elecstate::ElecState* pes,const double tpiba,const int nat);
 #endif
+
+    // K-point continuity related members
+    std::vector<int> k_order;
+    std::unordered_map<int, int> k_parent;
+    std::vector<ModuleBase::Vector3<double>> kvecs_c;
+    
+    void build_k_neighbors();
+    void propagate_psi(psi::Psi<T, Device>& psi, const int from_ik, const int to_ik);
 };
 
 } // namespace hsolver
 
-#endif
\ No newline at end of file
+#endif
diff --git a/source/module_hsolver/kernels/cuda/math_kernel_op.cu b/source/module_hsolver/kernels/cuda/math_kernel_op.cu
index 149b9ce389..8fb7c542eb 100644
--- a/source/module_hsolver/kernels/cuda/math_kernel_op.cu
+++ b/source/module_hsolver/kernels/cuda/math_kernel_op.cu
@@ -42,6 +42,48 @@ struct GetTypeThrust<std::complex<double>> {
 
 static cublasHandle_t cublas_handle = nullptr;
 
+// Forward declarations for abs2
+template<typename T>
+__device__ typename GetTypeReal<T>::type abs2(const T& x);
+
+// Specialization for real types (double)
+template<>
+__device__ double abs2(const double& x) {
+    return x * x;
+}
+
+// Specialization for real types (float)
+template<>
+__device__ float abs2(const float& x) {
+    return x * x;
+}
+
+// Specialization for complex float
+template<>
+__device__ float abs2(const thrust::complex<float>& x) {
+    return x.real() * x.real() + x.imag() * x.imag();
+}
+
+// Specialization for complex double
+template<>
+__device__ double abs2(const thrust::complex<double>& x) {
+    return x.real() * x.real() + x.imag() * x.imag();
+}
+
+// Specialization for std::complex<float> (for interface compatibility)
+template<>
+__device__ float abs2(const std::complex<float>& x) {
+    const thrust::complex<float>* tx = reinterpret_cast<const thrust::complex<float>*>(&x);
+    return tx->real() * tx->real() + tx->imag() * tx->imag();
+}
+
+// Specialization for std::complex<double> (for interface compatibility)
+template<>
+__device__ double abs2(const std::complex<double>& x) {
+    const thrust::complex<double>* tx = reinterpret_cast<const thrust::complex<double>*>(&x);
+    return tx->real() * tx->real() + tx->imag() * tx->imag();
+}
+
 static inline
 void xdot_wrapper(const int &n, const float * x, const int &incx, const float * y, const int &incy, float &result) {
     cublasErrcheck(cublasSdot(cublas_handle, n, x, incx, y, incy, &result));
@@ -1035,6 +1077,320 @@ void matrixSetToAnother<std::complex<double>, base_device::DEVICE_GPU>::operator
     cudaCheckOnDebug();
 }
 
+// Kernel for applying eigenvalues to vectors
+
+template <typename T>
+__global__ void apply_eigenvalues_kernel(const typename GetTypeReal<T>::type* eigenvalues, const T* vectors, T* result, int nbase, int nbase_x, int notconv)
+{
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;  // Linear thread ID
+    const int total_elements = notconv * nbase;
+    
+    if (tid < total_elements)
+    {
+        const int m = tid / nbase;        // Row index (eigenvalue index)
+        const int idx = tid % nbase;      // Column index within the row
+        result[m * nbase_x + idx] = eigenvalues[m] * vectors[m * nbase_x + idx];
+    }
+}
+
+// Wrapper for applying eigenvalues to complex vectors
+template <typename FPTYPE>
+inline void apply_eigenvalues_complex_wrapper(const base_device::DEVICE_GPU* d,
+                                              const int& nbase,
+                                              const int& nbase_x,
+                                              const int& notconv,
+                                              const FPTYPE* eigenvalues,
+                                              const std::complex<FPTYPE>* vectors,
+                                              std::complex<FPTYPE>* result)
+{
+    thrust::complex<FPTYPE>* result_tmp = reinterpret_cast<thrust::complex<FPTYPE>*>(result);
+    const thrust::complex<FPTYPE>* vectors_tmp = reinterpret_cast<const thrust::complex<FPTYPE>*>(vectors);
+    const int total_elements = notconv * nbase;
+    const int threadsPerBlock = 256;
+    const int numBlocks = (total_elements + threadsPerBlock - 1) / threadsPerBlock;
+    
+    apply_eigenvalues_kernel<thrust::complex<FPTYPE>><<<numBlocks, threadsPerBlock>>>(
+        eigenvalues, vectors_tmp, result_tmp, nbase, nbase_x, notconv);
+
+    cudaCheckOnDebug();
+}
+
+// Specialization for double
+template <>
+void apply_eigenvalues_op<double, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU *d,
+                                                                       const int &nbase, const int &nbase_x, const int &notconv,
+                                                                       double *result, const double *vectors, const double *eigenvalues) {
+    const int total_elements = notconv * nbase;
+    const int threadsPerBlock = 256;
+    const int numBlocks = (total_elements + threadsPerBlock - 1) / threadsPerBlock;
+    
+    apply_eigenvalues_kernel<double><<<numBlocks, threadsPerBlock>>>(
+        eigenvalues, vectors, result, nbase, nbase_x, notconv);
+    cudaCheckOnDebug();
+}
+
+// Specialization for std::complex<float>
+template <>
+void apply_eigenvalues_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU *d,
+                                                                                      const int &nbase, const int &nbase_x, const int &notconv,
+                                                                                      std::complex<float> *result, const std::complex<float> *vectors, const float *eigenvalues) {
+    apply_eigenvalues_complex_wrapper(d, nbase, nbase_x, notconv, eigenvalues, vectors, result);
+}
+
+// Specialization for std::complex<double>
+template <>
+void apply_eigenvalues_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU *d,
+                                                                                       const int &nbase, const int &nbase_x, const int &notconv,
+                                                                                       std::complex<double> *result, const std::complex<double> *vectors, const double *eigenvalues) {
+    apply_eigenvalues_complex_wrapper(d, nbase, nbase_x, notconv, eigenvalues, vectors, result);
+}
+
+template <typename T>
+__global__ void precondition_kernel(
+    const int dim,
+    const int notconv,
+    T* psi_iter,
+    const int nbase,
+    const typename GetTypeReal<T>::type* precondition,  // Real type
+    const typename GetTypeReal<T>::type* eigenvalues)
+{
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid < dim * notconv) {
+        const int i = tid % dim;    // Basis index
+        const int m = tid / dim;    // Band index
+        
+        using Real = typename GetTypeReal<T>::type;
+        Real x = abs(precondition[i] - eigenvalues[m]);
+        Real pre = 0.5 * (1.0 + x + sqrt(1.0 + (x - 1.0) * (x - 1.0)));
+        psi_iter[(nbase + m) * dim + i] /= pre;
+    }
+}
+
+// Specialization for double (for LCAO)
+template <>
+void precondition_op<double, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& dim,
+    double* psi_iter,
+    const int& nbase,
+    const int& notconv,
+    const double* precondition,
+    const double* eigenvalues)
+{
+    const int total_elements = dim * notconv;
+    const int threadsPerBlock = thread_per_block;
+    const int numBlocks = (total_elements + threadsPerBlock - 1) / threadsPerBlock;
+    
+    precondition_kernel<double><<<numBlocks, threadsPerBlock>>>(
+        dim, notconv, psi_iter, nbase, precondition, eigenvalues);
+
+    cudaCheckOnDebug();
+}
+
+// Specialization for complex<float>
+template <>
+void precondition_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& dim,
+    std::complex<float>* psi_iter,
+    const int& nbase,
+    const int& notconv,
+    const float* precondition,  
+    const float* eigenvalues)
+{
+    const int total_elements = dim * notconv;
+    const int threadsPerBlock = thread_per_block;
+    const int numBlocks = (total_elements + threadsPerBlock - 1) / threadsPerBlock;
+    
+    precondition_kernel<thrust::complex<float>><<<numBlocks, threadsPerBlock>>>(
+        dim, notconv,
+        reinterpret_cast<thrust::complex<float>*>(psi_iter),
+        nbase,
+        precondition,  // Already float
+        eigenvalues);
+
+    cudaCheckOnDebug();
+}
+
+// Specialization for complex<double>
+template <>
+void precondition_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& dim,
+    std::complex<double>* psi_iter,
+    const int& nbase,
+    const int& notconv,
+    const double* precondition,  
+    const double* eigenvalues)
+{
+    const int total_elements = dim * notconv;
+    const int threadsPerBlock = thread_per_block;
+    const int numBlocks = (total_elements + threadsPerBlock - 1) / threadsPerBlock;
+    
+    precondition_kernel<thrust::complex<double>><<<numBlocks, threadsPerBlock>>>(
+        dim, notconv,
+        reinterpret_cast<thrust::complex<double>*>(psi_iter),
+        nbase,
+        precondition,  // Already double
+        eigenvalues);
+
+    cudaCheckOnDebug();
+}
+
+// First kernel: calculate norms
+template <typename T>
+__global__ void calculate_norm_kernel(
+    const int dim,
+    const int notconv,
+    const T* psi_iter,
+    const int nbase,
+    typename GetTypeReal<T>::type* psi_norm)
+{
+    using Real = typename GetTypeReal<T>::type;
+    __shared__ Real shared_mem[256];
+
+    const int tid = threadIdx.x;
+    const int bid = blockIdx.x;
+    const int m = bid / ((dim + blockDim.x - 1) / blockDim.x);
+    const int num_blocks_per_band = (dim + blockDim.x - 1) / blockDim.x;
+    const int block_in_band = bid % num_blocks_per_band;
+
+    if (m >= notconv) return;
+
+    // Initialize shared memory
+    shared_mem[tid] = 0.0;
+
+    // Each thread processes one element
+    const int i = tid + block_in_band * blockDim.x;
+    if (i < dim) {
+        shared_mem[tid] = abs2(psi_iter[(nbase + m) * dim + i]);
+    }
+    __syncthreads();
+
+    // Parallel reduction in shared memory
+    for (int stride = blockDim.x/2; stride > 0; stride >>= 1) {
+        if (tid < stride) {
+            shared_mem[tid] += shared_mem[tid + stride];
+        }
+        __syncthreads();
+    }
+
+    // First thread of first block initializes norm to 0
+    // if (tid == 0 && block_in_band == 0) {
+    //     psi_norm[m] = 0.0;
+    // }
+    // __syncthreads();
+
+    // Write result for this block to global memory
+    if (tid == 0) {
+        atomicAdd(&psi_norm[m], shared_mem[0]);
+    }
+}
+
+// Second kernel: normalize vectors
+template <typename T>
+__global__ void normalize_vectors_kernel(
+    const int dim,
+    const int notconv,
+    T* psi_iter,
+    const int nbase,
+    const typename GetTypeReal<T>::type* psi_norm)
+{
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    const int m = tid / dim;
+    const int i = tid % dim;
+    
+    if (m < notconv && i < dim) {
+        psi_iter[(nbase + m) * dim + i] = psi_iter[(nbase + m) * dim + i] / sqrt(psi_norm[m]);
+    }
+}
+
+
+// Specialization for complex<float>
+template <>
+void normalize_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& dim,
+    std::complex<float>* psi_iter,
+    const int& nbase,
+    const int& notconv,
+    float* psi_norm)
+{
+    const int threadsPerBlock = 256;
+    const int numBlocksForNorm = ((dim + threadsPerBlock - 1) / threadsPerBlock) * notconv;
+    
+    // First kernel: calculate norms
+    calculate_norm_kernel<thrust::complex<float>><<<numBlocksForNorm, threadsPerBlock>>>(
+        dim, notconv,
+        reinterpret_cast<const thrust::complex<float>*>(psi_iter),
+        nbase, psi_norm);
+    cudaDeviceSynchronize(); // Ensure all norms are computed
+    
+    // Second kernel: normalize vectors
+    const int total_elements = dim * notconv;
+    const int numBlocksForNormalize = (total_elements + threadsPerBlock - 1) / threadsPerBlock;
+    normalize_vectors_kernel<thrust::complex<float>><<<numBlocksForNormalize, threadsPerBlock>>>(
+        dim, notconv,
+        reinterpret_cast<thrust::complex<float>*>(psi_iter),
+        nbase, psi_norm);
+    cudaCheckOnDebug();
+}
+
+// Specialization for complex<double>
+template <>
+void normalize_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& dim,
+    std::complex<double>* psi_iter,
+    const int& nbase,
+    const int& notconv,
+    double* psi_norm)
+{
+    const int threadsPerBlock = 256;
+    const int numBlocksForNorm = ((dim + threadsPerBlock - 1) / threadsPerBlock) * notconv;
+    
+    // First kernel: calculate norms
+    calculate_norm_kernel<thrust::complex<double>><<<numBlocksForNorm, threadsPerBlock>>>(
+        dim, notconv,
+        reinterpret_cast<const thrust::complex<double>*>(psi_iter),
+        nbase, psi_norm);
+    cudaDeviceSynchronize(); // Ensure all norms are computed
+    
+    // Second kernel: normalize vectors
+    const int total_elements = dim * notconv;
+    const int numBlocksForNormalize = (total_elements + threadsPerBlock - 1) / threadsPerBlock;
+    normalize_vectors_kernel<thrust::complex<double>><<<numBlocksForNormalize, threadsPerBlock>>>(
+        dim, notconv,
+        reinterpret_cast<thrust::complex<double>*>(psi_iter),
+        nbase, psi_norm);
+    cudaCheckOnDebug();
+}
+
+// Specialization for double (for LCAO)
+template <>
+void normalize_op<double, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& dim,
+    double* psi_iter,
+    const int& nbase,
+    const int& notconv,
+    double* psi_norm)
+{
+    const int threadsPerBlock = 256;
+    const int numBlocksForNorm = ((dim + threadsPerBlock - 1) / threadsPerBlock) * notconv;
+    
+    // First kernel: calculate norms
+    calculate_norm_kernel<double><<<numBlocksForNorm, threadsPerBlock>>>(
+        dim, notconv, psi_iter, nbase, psi_norm);
+    cudaDeviceSynchronize(); // Ensure all norms are computed
+    
+    // Second kernel: normalize vectors
+    const int total_elements = dim * notconv;
+    const int numBlocksForNormalize = (total_elements + threadsPerBlock - 1) / threadsPerBlock;
+    normalize_vectors_kernel<double><<<numBlocksForNormalize, threadsPerBlock>>>(
+        dim, notconv, psi_iter, nbase, psi_norm);
+    cudaCheckOnDebug();
+}
 
 // Explicitly instantiate functors for the types of functor registered.
 template struct dot_real_op<std::complex<float>, base_device::DEVICE_GPU>;
@@ -1045,6 +1401,9 @@ template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GP
 template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct matrixSetToAnother<std::complex<float>, base_device::DEVICE_GPU>;
+template struct apply_eigenvalues_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct precondition_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct normalize_op<std::complex<float>, base_device::DEVICE_GPU>;
 
 template struct dot_real_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct calc_grad_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
@@ -1054,6 +1413,9 @@ template struct vector_mul_vector_op<std::complex<double>, base_device::DEVICE_G
 template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct matrixSetToAnother<std::complex<double>, base_device::DEVICE_GPU>;
+template struct apply_eigenvalues_op<std::complex<double>, base_device::DEVICE_GPU>;
+template struct precondition_op<std::complex<double>, base_device::DEVICE_GPU>;
+template struct normalize_op<std::complex<double>, base_device::DEVICE_GPU>;
 
 #ifdef __LCAO
 template struct dot_real_op<double, base_device::DEVICE_GPU>;
@@ -1062,5 +1424,8 @@ template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<double, base_device::DEVICE_GPU>;
 template struct matrixSetToAnother<double, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>;
+template struct apply_eigenvalues_op<double, base_device::DEVICE_GPU>;
+template struct precondition_op<double, base_device::DEVICE_GPU>;
+template struct normalize_op<double, base_device::DEVICE_GPU>;
 #endif
 }  // namespace hsolver
diff --git a/source/module_hsolver/kernels/math_kernel_op.cpp b/source/module_hsolver/kernels/math_kernel_op.cpp
index c3784949ab..87cd549482 100644
--- a/source/module_hsolver/kernels/math_kernel_op.cpp
+++ b/source/module_hsolver/kernels/math_kernel_op.cpp
@@ -2,6 +2,7 @@
 
 #include <iomanip>
 #include <iostream>
+#include <vector>
 
 namespace hsolver
 {
@@ -173,6 +174,51 @@ struct vector_mul_vector_op<T, base_device::DEVICE_CPU>
     }
 };
 
+template <typename T>
+struct apply_eigenvalues_op<T, base_device::DEVICE_CPU>
+{
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(const base_device::DEVICE_CPU* d, const int& nbase, const int& nbase_x, const int& notconv, T* result, const T* vectors, const Real* eigenvalues)
+    {
+        for (int m = 0; m < notconv; m++)
+        {
+            for (int idx = 0; idx < nbase; idx++)
+            {
+                result[m * nbase_x + idx] = eigenvalues[m] * vectors[m * nbase_x + idx];
+            }
+        }
+    }
+};
+
+template <typename T>
+struct precondition_op<T, base_device::DEVICE_CPU> {
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(const base_device::DEVICE_CPU* d,
+                   const int& dim,
+                   T* psi_iter,
+                   const int& nbase,
+                   const int& notconv,
+                   const Real* precondition,
+                   const Real* eigenvalues)
+    {
+        using Real = typename GetTypeReal<T>::type;
+        std::vector<Real> pre(dim, 0.0);
+        for (int m = 0; m < notconv; m++)
+        {
+            for (size_t i = 0; i < dim; i++)
+            {
+                Real x = std::abs(precondition[i] - eigenvalues[m]);
+                pre[i] = 0.5 * (1.0 + x + sqrt(1 + (x - 1.0) * (x - 1.0)));
+            }
+            vector_div_vector_op<T, base_device::DEVICE_CPU>()(d,
+                                                             dim,
+                                                             psi_iter + (nbase + m) * dim,
+                                                             psi_iter + (nbase + m) * dim,
+                                                             pre.data());
+        }
+    }
+};
+
 template <typename T>
 struct vector_div_vector_op<T, base_device::DEVICE_CPU>
 {
@@ -355,6 +401,40 @@ struct matrixSetToAnother<T, base_device::DEVICE_CPU>
     }
 };
 
+template <typename T>
+struct normalize_op<T, base_device::DEVICE_CPU> {
+    void operator()(const base_device::DEVICE_CPU* d,
+                   const int& dim,
+                   T* psi_iter,
+                   const int& nbase,
+                   const int& notconv,
+                   typename GetTypeReal<T>::type* psi_norm = nullptr)
+    {
+        using Real = typename GetTypeReal<T>::type;
+        for (int m = 0; m < notconv; m++)
+        {
+            // Calculate norm using dot_real_op
+            Real psi_m_norm = dot_real_op<T, base_device::DEVICE_CPU>()(d,
+                                                                dim,
+                                                                psi_iter + (nbase + m) * dim,
+                                                                psi_iter + (nbase + m) * dim,
+                                                                true);
+            assert(psi_m_norm > 0.0);
+            psi_m_norm = sqrt(psi_m_norm);
+
+            // Normalize using vector_div_constant_op
+            vector_div_constant_op<T, base_device::DEVICE_CPU>()(d,
+                                                              dim,
+                                                              psi_iter + (nbase + m) * dim,
+                                                              psi_iter + (nbase + m) * dim,
+                                                              psi_m_norm);
+            if (psi_norm) {
+                psi_norm[m] = psi_m_norm;
+            }
+        }
+    }
+};
+
 // Explicitly instantiate functors for the types of functor registered.
 template struct scal_op<float, base_device::DEVICE_CPU>;
 template struct axpy_op<std::complex<float>, base_device::DEVICE_CPU>;
@@ -365,6 +445,9 @@ template struct gemm_op<float, base_device::DEVICE_CPU>;
 template struct dot_real_op<std::complex<float>, base_device::DEVICE_CPU>;
 template struct vector_div_constant_op<std::complex<float>, base_device::DEVICE_CPU>;
 template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_CPU>;
+template struct apply_eigenvalues_op<std::complex<float>, base_device::DEVICE_CPU>;
+template struct precondition_op<std::complex<float>, base_device::DEVICE_CPU>;
+template struct normalize_op<std::complex<float>, base_device::DEVICE_CPU>;
 template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_CPU>;
 template struct constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_CPU>;
 template struct matrixTranspose_op<std::complex<float>, base_device::DEVICE_CPU>;
@@ -381,6 +464,9 @@ template struct gemm_op<double, base_device::DEVICE_CPU>;
 template struct dot_real_op<std::complex<double>, base_device::DEVICE_CPU>;
 template struct vector_div_constant_op<std::complex<double>, base_device::DEVICE_CPU>;
 template struct vector_mul_vector_op<std::complex<double>, base_device::DEVICE_CPU>;
+template struct apply_eigenvalues_op<std::complex<double>, base_device::DEVICE_CPU>;
+template struct precondition_op<std::complex<double>, base_device::DEVICE_CPU>;
+template struct normalize_op<std::complex<double>, base_device::DEVICE_CPU>;
 template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_CPU>;
 template struct constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_CPU>;
 template struct matrixTranspose_op<std::complex<double>, base_device::DEVICE_CPU>;
@@ -392,11 +478,14 @@ template struct line_minimize_with_block_op<std::complex<double>, base_device::D
 template struct axpy_op<double, base_device::DEVICE_CPU>;
 template struct dot_real_op<double, base_device::DEVICE_CPU>;
 template struct vector_mul_vector_op<double, base_device::DEVICE_CPU>;
+template struct apply_eigenvalues_op<double, base_device::DEVICE_CPU>;
+template struct precondition_op<double, base_device::DEVICE_CPU>;
 template struct vector_div_constant_op<double, base_device::DEVICE_CPU>;
 template struct vector_div_vector_op<double, base_device::DEVICE_CPU>;
 template struct matrixTranspose_op<double, base_device::DEVICE_CPU>;
 template struct matrixSetToAnother<double, base_device::DEVICE_CPU>;
 template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_CPU>;
+template struct normalize_op<double, base_device::DEVICE_CPU>;
 #endif
 #ifdef __DSP
 template struct gemm_op_mt<std::complex<float>, base_device::DEVICE_CPU>;
diff --git a/source/module_hsolver/kernels/math_kernel_op.h b/source/module_hsolver/kernels/math_kernel_op.h
index 0daf0e5718..8318c4310b 100644
--- a/source/module_hsolver/kernels/math_kernel_op.h
+++ b/source/module_hsolver/kernels/math_kernel_op.h
@@ -325,6 +325,48 @@ template <typename T, typename Device> struct matrixSetToAnother {
                   T *B, const int &LDB);
 };
 
+template <typename T, typename Device>
+struct apply_eigenvalues_op {
+    using Real = typename GetTypeReal<T>::type;
+
+    void operator()(const Device *d, const int &nbase, const int &nbase_x, const int &notconv,
+                    T *result, const T *vectors, const Real *eigenvalues);
+};
+
+template <typename T, typename Device>
+struct precondition_op {
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(const Device* d,
+                   const int& dim,
+                   T* psi_iter,
+                   const int& nbase,
+                   const int& notconv,
+                   const Real* precondition,  
+                   const Real* eigenvalues);
+};
+
+template <typename T, typename Device>
+struct normalize_op {
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(const Device* d,
+                   const int& dim,
+                   T* psi_iter,
+                   const int& nbase,
+                   const int& notconv,
+                   Real* psi_norm = nullptr);
+};
+
+template <typename T>
+struct normalize_op<T, base_device::DEVICE_GPU> {
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(const base_device::DEVICE_GPU* d,
+                   const int& dim,
+                   T* psi_iter,
+                   const int& nbase,
+                   const int& notconv,
+                   Real* psi_norm);
+};
+
 #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
 
 template <typename T>
@@ -393,6 +435,26 @@ template <typename T> struct matrixSetToAnother<T, base_device::DEVICE_GPU> {
 void createGpuBlasHandle();
 void destoryBLAShandle();
 
+// vector operator: result[i] = -lambda[i] * vector[i]
+template <typename T> struct apply_eigenvalues_op<T, base_device::DEVICE_GPU> {
+    using Real = typename GetTypeReal<T>::type;
+
+    void operator()(const base_device::DEVICE_GPU *d, const int &nbase, const int &nbase_x, const int &notconv,
+                    T *result, const T *vectors, const Real *eigenvalues);
+};
+
+template <typename T>
+struct precondition_op<T, base_device::DEVICE_GPU> {
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(const base_device::DEVICE_GPU* d,
+                   const int& dim,
+                   T* psi_iter,
+                   const int& nbase,
+                   const int& notconv,
+                   const Real* precondition,
+                   const Real* eigenvalues);
+};
+
 #endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
 } // namespace hsolver
 
diff --git a/source/module_hsolver/kernels/rocm/dngvd_op.hip.cu b/source/module_hsolver/kernels/rocm/dngvd_op.hip.cu
index 5eeb1697e8..8cba06db1b 100644
--- a/source/module_hsolver/kernels/rocm/dngvd_op.hip.cu
+++ b/source/module_hsolver/kernels/rocm/dngvd_op.hip.cu
@@ -5,12 +5,25 @@
 
 namespace hsolver {
 
+// NOTE: mimicked from ../cuda/dngvd_op.cu for three dngvd_op
+
+static hipsolverHandle_t hipsolver_H = nullptr;
+// Test on DCU platform. When nstart is greater than 234, code on DCU performs better. 
+const int N_DCU = 234;
+
 void createGpuSolverHandle() {
-    return;
+    if (hipsolver_H == nullptr)
+    {
+        hipsolverErrcheck(hipsolverCreate(&hipsolver_H));
+    }
 }
 
 void destroyGpuSolverHandle() {
-    return;
+    if (hipsolver_H != nullptr)
+    {
+        hipsolverErrcheck(hipsolverDestroy(hipsolver_H));
+        hipsolver_H = nullptr;
+    }
 }
 
 #ifdef __LCAO
@@ -23,22 +36,68 @@ void dngvd_op<double, base_device::DEVICE_GPU>::operator()(const base_device::DE
                                                            double* _eigenvalue,
                                                            double* _vcc)
 {
-    std::vector<double> hcc(nstart * nstart, 0.0);
-    std::vector<double> scc(nstart * nstart, 0.0);
-    std::vector<double> vcc(nstart * nstart, 0.0);
-    std::vector<double> eigenvalue(nstart, 0);
-    hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(double) * hcc.size(), hipMemcpyDeviceToHost));
-    hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(double) * scc.size(), hipMemcpyDeviceToHost));
-    base_device::DEVICE_CPU* cpu_ctx = {};
-    dngvd_op<double, base_device::DEVICE_CPU>()(cpu_ctx,
-                                                nstart,
-                                                ldh,
-                                                hcc.data(),
-                                                scc.data(),
-                                                eigenvalue.data(),
-                                                vcc.data());
-    hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(double) * vcc.size(), hipMemcpyHostToDevice));
-    hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(double) * eigenvalue.size(), hipMemcpyHostToDevice));
+    // copied from ../cuda/dngvd_op.cu, "dngvd_op"
+    assert(nstart == ldh);
+
+    if (nstart > N_DCU){
+        hipErrcheck(hipMemcpy(_vcc, _hcc, sizeof(double) * ldh * nstart, hipMemcpyDeviceToDevice));
+        // now vcc contains hcc
+
+        // prepare some values for hipsolverDnZhegvd_bufferSize
+        int * devInfo = nullptr;
+        int lwork = 0, info_gpu = 0;
+        double * work = nullptr;
+        hipErrcheck(hipMalloc((void**)&devInfo, sizeof(int)));
+        hipsolverFillMode_t uplo = HIPSOLVER_FILL_MODE_UPPER;
+
+        // calculate the sizes needed for pre-allocated buffer.
+        hipsolverErrcheck(hipsolverDnDsygvd_bufferSize(
+            hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
+            nstart,
+            _vcc, ldh,
+            _scc, ldh,
+            _eigenvalue,
+            &lwork));
+
+        // allocate memery
+        hipErrcheck(hipMalloc((void**)&work, sizeof(double) * lwork));
+
+        // compute eigenvalues and eigenvectors.
+        hipsolverErrcheck(hipsolverDnDsygvd(
+            hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
+            nstart,
+            _vcc, ldh,
+            const_cast<double *>(_scc), ldh,
+            _eigenvalue,
+            work, lwork, devInfo));
+
+        hipErrcheck(hipMemcpy(&info_gpu, devInfo, sizeof(int), hipMemcpyDeviceToHost));
+
+        // free the buffer
+        hipErrcheck(hipFree(work));
+        hipErrcheck(hipFree(devInfo));
+    }
+    // if(fail_info != nullptr) *fail_info = info_gpu;
+    else{
+        std::vector<double> hcc(nstart * nstart, 0.0);
+        std::vector<double> scc(nstart * nstart, 0.0);
+        std::vector<double> vcc(nstart * nstart, 0.0);
+        std::vector<double> eigenvalue(nstart, 0);
+        hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(double) * hcc.size(), hipMemcpyDeviceToHost));
+        hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(double) * scc.size(), hipMemcpyDeviceToHost));
+        base_device::DEVICE_CPU* cpu_ctx = {};
+        dngvd_op<double, base_device::DEVICE_CPU>()(cpu_ctx,
+                                                   nstart,
+                                                   ldh,
+                                                   hcc.data(),
+                                                   scc.data(),
+                                                   eigenvalue.data(),
+                                                   vcc.data());
+        hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(double) * vcc.size(), hipMemcpyHostToDevice));
+        hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(double) * eigenvalue.size(), hipMemcpyHostToDevice));
+    }
+
+    
 }
 #endif // __LCAO
 
@@ -51,22 +110,67 @@ void dngvd_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const ba
                                                                         float* _eigenvalue,
                                                                         std::complex<float>* _vcc)
 {
-    std::vector<std::complex<float>> hcc(nstart * nstart, {0, 0});
-    std::vector<std::complex<float>> scc(nstart * nstart, {0, 0});
-    std::vector<std::complex<float>> vcc(nstart * nstart, {0, 0});
-    std::vector<float> eigenvalue(nstart, 0);
-    hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(std::complex<float>) * hcc.size(), hipMemcpyDeviceToHost));
-    hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(std::complex<float>) * scc.size(), hipMemcpyDeviceToHost));
-    base_device::DEVICE_CPU* cpu_ctx = {};
-    dngvd_op<std::complex<float>, base_device::DEVICE_CPU>()(cpu_ctx,
-                                                             nstart,
-                                                             ldh,
-                                                             hcc.data(),
-                                                             scc.data(),
-                                                             eigenvalue.data(),
-                                                             vcc.data());
-    hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(std::complex<float>) * vcc.size(), hipMemcpyHostToDevice));
-    hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(float) * eigenvalue.size(), hipMemcpyHostToDevice));
+    // copied from ../cuda/dngvd_op.cu, "dngvd_op"
+    assert(nstart == ldh);
+    
+    if (nstart > N_DCU){
+        hipErrcheck(hipMemcpy(_vcc, _hcc, sizeof(std::complex<float>) * ldh * nstart, hipMemcpyDeviceToDevice));
+        // now vcc contains hcc
+
+        // prepare some values for hipsolverDnZhegvd_bufferSize
+        int * devInfo = nullptr;
+        int lwork = 0, info_gpu = 0;
+        float2 * work = nullptr;
+        hipErrcheck(hipMalloc((void**)&devInfo, sizeof(int)));
+        hipsolverFillMode_t uplo = HIPSOLVER_FILL_MODE_UPPER;
+
+        // calculate the sizes needed for pre-allocated buffer.
+        hipsolverErrcheck(hipsolverDnChegvd_bufferSize(
+            hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
+            nstart,
+            reinterpret_cast<const float2 *>(_vcc), ldh,
+            reinterpret_cast<const float2 *>(_scc), ldh,
+            _eigenvalue,
+            &lwork));
+
+        // allocate memery
+        hipErrcheck(hipMalloc((void**)&work, sizeof(float2) * lwork));
+
+        // compute eigenvalues and eigenvectors.
+        hipsolverErrcheck(hipsolverDnChegvd(
+            hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
+            nstart,
+            reinterpret_cast<float2 *>(_vcc), ldh,
+            const_cast<float2 *>(reinterpret_cast<const float2 *>(_scc)), ldh,
+            _eigenvalue,
+            work, lwork, devInfo));
+
+        hipErrcheck(hipMemcpy(&info_gpu, devInfo, sizeof(int), hipMemcpyDeviceToHost));
+        // free the buffer
+        hipErrcheck(hipFree(work));
+        hipErrcheck(hipFree(devInfo));
+    }
+    // if(fail_info != nullptr) *fail_info = info_gpu;
+    else{
+        std::vector<std::complex<float>> hcc(nstart * nstart, {0, 0});
+        std::vector<std::complex<float>> scc(nstart * nstart, {0, 0});
+        std::vector<std::complex<float>> vcc(nstart * nstart, {0, 0});
+        std::vector<float> eigenvalue(nstart, 0);
+        hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(std::complex<float>) * hcc.size(), hipMemcpyDeviceToHost));
+        hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(std::complex<float>) * scc.size(), hipMemcpyDeviceToHost));
+        base_device::DEVICE_CPU* cpu_ctx = {};
+        dngvd_op<std::complex<float>, base_device::DEVICE_CPU>()(cpu_ctx,
+                                                                nstart,
+                                                                ldh,
+                                                                hcc.data(),
+                                                                scc.data(),
+                                                                eigenvalue.data(),
+                                                                vcc.data());
+        hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(std::complex<float>) * vcc.size(), hipMemcpyHostToDevice));
+        hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(float) * eigenvalue.size(), hipMemcpyHostToDevice));
+    }
+
+    
 }
 
 template <>
@@ -76,24 +180,80 @@ void dngvd_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(const b
                                                                          const std::complex<double>* _hcc,
                                                                          const std::complex<double>* _scc,
                                                                          double* _eigenvalue,
-                                                                         std::complex<double>* _vcc)
+                                                                         std::complex<double>* _vcc
+                                                                        )
 {
-    std::vector<std::complex<double>> hcc(nstart * nstart, {0, 0});
-    std::vector<std::complex<double>> scc(nstart * nstart, {0, 0});
-    std::vector<std::complex<double>> vcc(nstart * nstart, {0, 0});
-    std::vector<double> eigenvalue(nstart, 0);
-    hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(std::complex<double>) * hcc.size(), hipMemcpyDeviceToHost));
-    hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(std::complex<double>) * scc.size(), hipMemcpyDeviceToHost));
-    base_device::DEVICE_CPU* cpu_ctx = {};
-    dngvd_op<std::complex<double>, base_device::DEVICE_CPU>()(cpu_ctx,
-                                                              nstart,
-                                                              ldh,
-                                                              hcc.data(),
-                                                              scc.data(),
-                                                              eigenvalue.data(),
-                                                              vcc.data());
-    hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(std::complex<double>) * vcc.size(), hipMemcpyHostToDevice));
-    hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(double) * eigenvalue.size(), hipMemcpyHostToDevice));
+    // copied from ../cuda/dngvd_op.cu, "dngvd_op"
+    assert(nstart == ldh);
+
+    // save a copy of scc in case the diagonalization fails
+    if (nstart > N_DCU){
+        std::vector<std::complex<double>> scc(nstart * nstart, {0, 0});
+        hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(std::complex<double>) * scc.size(), hipMemcpyDeviceToHost));
+
+        hipErrcheck(hipMemcpy(_vcc, _hcc, sizeof(std::complex<double>) * ldh * nstart, hipMemcpyDeviceToDevice));
+
+        // now vcc contains hcc
+
+        // prepare some values for hipsolverDnZhegvd_bufferSize
+        int * devInfo = nullptr;
+        int lwork = 0, info_gpu = 0;
+        double2 * work = nullptr;
+        hipErrcheck(hipMalloc((void**)&devInfo, sizeof(int)));
+        hipsolverFillMode_t uplo = HIPSOLVER_FILL_MODE_UPPER;
+
+        // calculate the sizes needed for pre-allocated buffer.
+        hipsolverErrcheck(hipsolverDnZhegvd_bufferSize(
+            hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
+            nstart,
+            reinterpret_cast<const double2 *>(_vcc), ldh,
+            reinterpret_cast<const double2 *>(_scc), ldh,
+            _eigenvalue,
+            &lwork));
+
+        // allocate memery
+        hipErrcheck(hipMalloc((void**)&work, sizeof(double2) * lwork));
+
+        // compute eigenvalues and eigenvectors.
+        hipsolverErrcheck(hipsolverDnZhegvd(
+            hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
+            nstart,
+            reinterpret_cast<double2 *>(_vcc), ldh,
+            const_cast<double2 *>(reinterpret_cast<const double2 *>(_scc)), ldh,
+            _eigenvalue,
+            work, lwork, devInfo));
+
+        hipErrcheck(hipMemcpy(&info_gpu, devInfo, sizeof(int), hipMemcpyDeviceToHost));
+        // free the buffer
+        hipErrcheck(hipFree(work));
+        hipErrcheck(hipFree(devInfo));
+    }
+    // if(fail_info != nullptr) *fail_info = info_gpu;
+    else{
+        std::vector<std::complex<double>> hcc(nstart * nstart, {0, 0});
+        std::vector<std::complex<double>> scc(nstart * nstart, {0, 0});
+        std::vector<std::complex<double>> vcc(nstart * nstart, {0, 0});
+        std::vector<double> eigenvalue(nstart, 0);
+        hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(std::complex<double>) * hcc.size(), hipMemcpyDeviceToHost));
+        hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(std::complex<double>) * scc.size(), hipMemcpyDeviceToHost));
+        base_device::DEVICE_CPU* cpu_ctx = {};
+        dngvd_op<std::complex<double>, base_device::DEVICE_CPU>()(cpu_ctx,
+                                                                nstart,
+                                                                ldh,
+                                                                hcc.data(),
+                                                                scc.data(),
+                                                                eigenvalue.data(),
+                                                                vcc.data());
+        hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(std::complex<double>) * vcc.size(), hipMemcpyHostToDevice));
+        hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(double) * eigenvalue.size(), hipMemcpyHostToDevice));
+    }
+
+
+
+
+
+
+    
 }
 
 #ifdef __LCAO
diff --git a/source/module_hsolver/kernels/rocm/math_kernel_op.hip.cu b/source/module_hsolver/kernels/rocm/math_kernel_op.hip.cu
index ef5a1c1ece..4ca6af509c 100644
--- a/source/module_hsolver/kernels/rocm/math_kernel_op.hip.cu
+++ b/source/module_hsolver/kernels/rocm/math_kernel_op.hip.cu
@@ -11,6 +11,7 @@
 #define WARP_SIZE 32
 #define FULL_MASK 0xffffffff
 #define THREAD_PER_BLOCK 256
+
 template <>
 struct GetTypeReal<thrust::complex<float>> {
     using type = float; /**< The return type specialization for std::complex<double>. */
@@ -20,6 +21,49 @@ struct GetTypeReal<thrust::complex<double>> {
     using type = double; /**< The return type specialization for std::complex<double>. */
 };
 
+// Forward declarations for abs2
+template<typename T>
+__device__ typename GetTypeReal<T>::type abs2(const T& x);
+
+// Specialization for real types (double)
+template<>
+__device__ double abs2(const double& x) {
+    return x * x;
+}
+
+// Specialization for real types (float)
+template<>
+__device__ float abs2(const float& x) {
+    return x * x;
+}
+
+// Specialization for complex float
+template<>
+__device__ float abs2(const thrust::complex<float>& x) {
+    return x.real() * x.real() + x.imag() * x.imag();
+}
+
+// Specialization for complex double
+template<>
+__device__ double abs2(const thrust::complex<double>& x) {
+    return x.real() * x.real() + x.imag() * x.imag();
+}
+
+// Specialization for std::complex<float> (for interface compatibility)
+template<>
+__device__ float abs2(const std::complex<float>& x) {
+    const thrust::complex<float>* tx = reinterpret_cast<const thrust::complex<float>*>(&x);
+    return tx->real() * tx->real() + tx->imag() * tx->imag();
+}
+
+// Specialization for std::complex<double> (for interface compatibility)
+template<>
+__device__ double abs2(const std::complex<double>& x) {
+    const thrust::complex<double>* tx = reinterpret_cast<const thrust::complex<double>*>(&x);
+    return tx->real() * tx->real() + tx->imag() * tx->imag();
+}
+
+
 namespace hsolver {
 
 template <typename T>
@@ -943,7 +987,309 @@ void matrixSetToAnother<std::complex<double>, base_device::DEVICE_GPU>::operator
     hipCheckOnDebug();
 }
 
+template <typename T>
+__launch_bounds__(256)
+__global__ void apply_eigenvalues_kernel(
+    const typename GetTypeReal<T>::type* eigenvalues,
+    const T* vectors,
+    T* result,
+    const int nbase,
+    const int nbase_x,
+    const int notconv)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    const int total_elements = notconv * nbase;
+    if (i < total_elements)
+    {
+        const int row = i / nbase;
+        const int col = i % nbase;
+        result[row * nbase_x + col] = eigenvalues[row] * vectors[row * nbase_x + col];
+    }
+}
+
+template <typename FPTYPE>
+inline void apply_eigenvalues_complex_wrapper(const base_device::DEVICE_GPU* d,
+                                            const int& nbase,
+                                            const int& nbase_x,
+                                            const int& notconv,
+                                            const FPTYPE* eigenvalues,
+                                            const std::complex<FPTYPE>* vectors,
+                                            std::complex<FPTYPE>* result)
+{
+    thrust::complex<FPTYPE>* result_tmp = reinterpret_cast<thrust::complex<FPTYPE>*>(result);
+    const thrust::complex<FPTYPE>* vectors_tmp = reinterpret_cast<const thrust::complex<FPTYPE>*>(vectors);
+    const int total_elements = notconv * nbase;
+    int thread = 256;
+    int block = (total_elements + thread - 1) / thread;
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_eigenvalues_kernel<thrust::complex<FPTYPE>>), dim3(block), dim3(thread), 0, 0,
+        eigenvalues, vectors_tmp, result_tmp, nbase, nbase_x, notconv);
+
+    hipCheckOnDebug();
+}
+
+template <>
+void apply_eigenvalues_op<double, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* d,
+                                                                      const int& nbase,
+                                                                      const int& nbase_x,
+                                                                      const int& notconv,
+                                                                      double* result,
+                                                                      const double* vectors,
+                                                                      const double* eigenvalues)
+{
+    const int total_elements = notconv * nbase;
+    int thread = 256;
+    int block = (total_elements + thread - 1) / thread;
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_eigenvalues_kernel<double>), dim3(block), dim3(thread), 0, 0,
+        eigenvalues, vectors, result, nbase, nbase_x, notconv);
+
+    hipCheckOnDebug();
+}
 
+template <>
+void apply_eigenvalues_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* d,
+                                                                                   const int& nbase,
+                                                                                   const int& nbase_x,
+                                                                                   const int& notconv,
+                                                                                   std::complex<float>* result,
+                                                                                   const std::complex<float>* vectors,
+                                                                                   const float* eigenvalues)
+{
+    apply_eigenvalues_complex_wrapper(d, nbase, nbase_x, notconv, eigenvalues, vectors, result);
+}
+
+template <>
+void apply_eigenvalues_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* d,
+                                                                                    const int& nbase,
+                                                                                    const int& nbase_x,
+                                                                                    const int& notconv,
+                                                                                    std::complex<double>* result,
+                                                                                    const std::complex<double>* vectors,
+                                                                                    const double* eigenvalues)
+{
+    apply_eigenvalues_complex_wrapper(d, nbase, nbase_x, notconv, eigenvalues, vectors, result);
+}
+
+template <typename T>
+__launch_bounds__(256)
+__global__ void precondition_kernel(
+    const int dim,
+    const int notconv,
+    T* psi_iter,
+    const int nbase,
+    const typename GetTypeReal<T>::type* precondition,  // Real type
+    const typename GetTypeReal<T>::type* eigenvalues)
+{
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid < dim * notconv) {
+        const int i = tid % dim;    // Basis index
+        const int m = tid / dim;    // Band index
+        
+        using Real = typename GetTypeReal<T>::type;
+        Real x = abs(precondition[i] - eigenvalues[m]);
+        Real pre = 0.5 * (1.0 + x + sqrt(1.0 + (x - 1.0) * (x - 1.0)));
+        psi_iter[(nbase + m) * dim + i] /= pre;
+    }
+}
+
+template <>
+void precondition_op<double, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& dim,
+    double* psi_iter,
+    const int& nbase,
+    const int& notconv,
+    const double* precondition,
+    const double* eigenvalues)
+{
+    const int total_elements = dim * notconv;
+    int thread = 256;
+    int block = (total_elements + thread - 1) / thread;
+    
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(precondition_kernel<double>), dim3(block), dim3(thread), 0, 0,
+        dim, notconv, psi_iter, nbase, precondition, eigenvalues);
+
+    hipCheckOnDebug();
+}
+
+template <>
+void precondition_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& dim,
+    std::complex<float>* psi_iter,
+    const int& nbase,
+    const int& notconv,
+    const float* precondition,
+    const float* eigenvalues)
+{
+    const int total_elements = dim * notconv;
+    int thread = 256;
+    int block = (total_elements + thread - 1) / thread;
+    
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(precondition_kernel<thrust::complex<float>>), dim3(block), dim3(thread), 0, 0,
+        dim, notconv, reinterpret_cast<thrust::complex<float>*>(psi_iter), nbase, precondition, eigenvalues);
+
+    hipCheckOnDebug();
+}
+
+template <>
+void precondition_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& dim,
+    std::complex<double>* psi_iter,
+    const int& nbase,
+    const int& notconv,
+    const double* precondition,
+    const double* eigenvalues)
+{
+    const int total_elements = dim * notconv;
+    int thread = 256;
+    int block = (total_elements + thread - 1) / thread;
+    
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(precondition_kernel<thrust::complex<double>>), dim3(block), dim3(thread), 0, 0,
+        dim, notconv, reinterpret_cast<thrust::complex<double>*>(psi_iter), nbase, precondition, eigenvalues);
+
+    hipCheckOnDebug();
+}
+
+template <typename T>
+__launch_bounds__(256)
+__global__ void calculate_norm_kernel(
+    const int dim,
+    const int notconv,
+    const T* psi_iter,
+    const int nbase,
+    typename GetTypeReal<T>::type* psi_norm)
+{
+    using Real = typename GetTypeReal<T>::type;
+    __shared__ Real shared_mem[256];
+
+    const int tid = threadIdx.x;
+    const int bid = blockIdx.x;
+    const int m = bid / ((dim + blockDim.x - 1) / blockDim.x);
+    const int num_blocks_per_band = (dim + blockDim.x - 1) / blockDim.x;
+    const int block_in_band = bid % num_blocks_per_band;
+
+    if (m >= notconv) return;
+
+    // Initialize shared memory
+    shared_mem[tid] = 0.0;
+
+    // Each thread processes one element
+    const int i = tid + block_in_band * blockDim.x;
+    if (i < dim) {
+        shared_mem[tid] = abs2(psi_iter[(nbase + m) * dim + i]);
+    }
+    __syncthreads();
+
+    // Parallel reduction in shared memory
+    for (int stride = blockDim.x/2; stride > 0; stride >>= 1) {
+        if (tid < stride) {
+            shared_mem[tid] += shared_mem[tid + stride];
+        }
+        __syncthreads();
+    }
+
+    // Write result for this block to global memory
+    if (tid == 0) {
+        atomicAdd(&psi_norm[m], shared_mem[0]);
+    }
+}
+
+template <typename T>
+__launch_bounds__(256)
+__global__ void normalize_vectors_kernel(
+    const int dim,
+    const int notconv,
+    T* psi_iter,
+    const int nbase,
+    const typename GetTypeReal<T>::type* psi_norm)
+{
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    const int m = tid / dim;
+    const int i = tid % dim;
+    
+    if (m < notconv && i < dim) {
+        psi_iter[(nbase + m) * dim + i] = psi_iter[(nbase + m) * dim + i] / sqrt(psi_norm[m]);
+    }
+}
+
+template <>
+void normalize_op<double, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& dim,
+    double* psi_iter,
+    const int& nbase,
+    const int& notconv,
+    double* psi_norm)
+{
+    const int thread = 256;
+    const int numBlocksForNorm = ((dim + thread - 1) / thread) * notconv;
+    
+    // First kernel: calculate norms
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(calculate_norm_kernel<double>), dim3(numBlocksForNorm), dim3(thread), 0, 0,
+        dim, notconv, psi_iter, nbase, psi_norm);
+    hipDeviceSynchronize(); // Ensure all norms are computed
+    
+    // Second kernel: normalize vectors
+    const int total_elements = dim * notconv;
+    const int numBlocksForNormalize = (total_elements + thread - 1) / thread;
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(normalize_vectors_kernel<double>), dim3(numBlocksForNormalize), dim3(thread), 0, 0,
+        dim, notconv, psi_iter, nbase, psi_norm);
+
+    hipCheckOnDebug();
+}
+
+template <>
+void normalize_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& dim,
+    std::complex<float>* psi_iter,
+    const int& nbase,
+    const int& notconv,
+    float* psi_norm)
+{
+    const int thread = 256;
+    const int numBlocksForNorm = ((dim + thread - 1) / thread) * notconv;
+    
+    // First kernel: calculate norms
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(calculate_norm_kernel<thrust::complex<float>>), dim3(numBlocksForNorm), dim3(thread), 0, 0,
+        dim, notconv, reinterpret_cast<const thrust::complex<float>*>(psi_iter), nbase, psi_norm);
+    hipDeviceSynchronize(); // Ensure all norms are computed
+    
+    // Second kernel: normalize vectors
+    const int total_elements = dim * notconv;
+    const int numBlocksForNormalize = (total_elements + thread - 1) / thread;
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(normalize_vectors_kernel<thrust::complex<float>>), dim3(numBlocksForNormalize), dim3(thread), 0, 0,
+        dim, notconv, reinterpret_cast<thrust::complex<float>*>(psi_iter), nbase, psi_norm);
+
+    hipCheckOnDebug();
+}
+
+template <>
+void normalize_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& dim,
+    std::complex<double>* psi_iter,
+    const int& nbase,
+    const int& notconv,
+    double* psi_norm)
+{
+    const int thread = 256;
+    const int numBlocksForNorm = ((dim + thread - 1) / thread) * notconv;
+    
+    // First kernel: calculate norms
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(calculate_norm_kernel<thrust::complex<double>>), dim3(numBlocksForNorm), dim3(thread), 0, 0,
+        dim, notconv, reinterpret_cast<const thrust::complex<double>*>(psi_iter), nbase, psi_norm);
+    hipDeviceSynchronize(); // Ensure all norms are computed
+    
+    // Second kernel: normalize vectors
+    const int total_elements = dim * notconv;
+    const int numBlocksForNormalize = (total_elements + thread - 1) / thread;
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(normalize_vectors_kernel<thrust::complex<double>>), dim3(numBlocksForNormalize), dim3(thread), 0, 0,
+        dim, notconv, reinterpret_cast<thrust::complex<double>*>(psi_iter), nbase, psi_norm);
+
+    hipCheckOnDebug();
+}
 
 // Explicitly instantiate functors for the types of functor registered.
 template struct dot_real_op<std::complex<float>, base_device::DEVICE_GPU>;
@@ -951,6 +1297,9 @@ template struct calc_grad_with_block_op<std::complex<float>, base_device::DEVICE
 template struct line_minimize_with_block_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct vector_div_constant_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct apply_eigenvalues_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct precondition_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct normalize_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct matrixSetToAnother<std::complex<float>, base_device::DEVICE_GPU>;
@@ -960,6 +1309,9 @@ template struct calc_grad_with_block_op<std::complex<double>, base_device::DEVIC
 template struct line_minimize_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct vector_div_constant_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct vector_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
+template struct apply_eigenvalues_op<std::complex<double>, base_device::DEVICE_GPU>;
+template struct precondition_op<std::complex<double>, base_device::DEVICE_GPU>;
+template struct normalize_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct matrixSetToAnother<std::complex<double>, base_device::DEVICE_GPU>;
@@ -968,8 +1320,11 @@ template struct matrixSetToAnother<std::complex<double>, base_device::DEVICE_GPU
 template struct dot_real_op<double, base_device::DEVICE_GPU>;
 template struct vector_div_constant_op<double, base_device::DEVICE_GPU>;
 template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
+template struct apply_eigenvalues_op<double, base_device::DEVICE_GPU>;
+template struct precondition_op<double, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<double, base_device::DEVICE_GPU>;
 template struct matrixSetToAnother<double, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>;
+template struct normalize_op<double, base_device::DEVICE_GPU>;
 #endif
 }  // namespace hsolver
diff --git a/source/module_hsolver/test/test_hsolver_pw.cpp b/source/module_hsolver/test/test_hsolver_pw.cpp
index 5763f1b7d8..1345dfc99e 100644
--- a/source/module_hsolver/test/test_hsolver_pw.cpp
+++ b/source/module_hsolver/test/test_hsolver_pw.cpp
@@ -14,6 +14,68 @@
 #include "module_hsolver/hsolver_pw.h"
 #undef private
 #undef protected
+
+// Mock implementations for the template functions causing linking errors
+namespace ModulePW {
+    // Mock implementation for recip_to_real
+    template<typename FPTYPE, typename Device>
+    void PW_Basis_K::recip_to_real(const Device* ctx,
+                                  const std::complex<FPTYPE>* in,
+                                  std::complex<FPTYPE>* out,
+                                  const int ik,
+                                  const bool add,
+                                  const FPTYPE factor) const
+    {
+        // Simple mock implementation that does nothing
+        // In a real test, you might want to implement behavior that simulates the actual function
+    }
+
+    // Mock implementation for real_to_recip
+    template<typename FPTYPE, typename Device>
+    void PW_Basis_K::real_to_recip(const Device* ctx,
+                                  const std::complex<FPTYPE>* in,
+                                  std::complex<FPTYPE>* out,
+                                  const int ik,
+                                  const bool add,
+                                  const FPTYPE factor) const
+    {
+        // Simple mock implementation that does nothing
+    }
+
+    // Explicit template instantiations
+    template void PW_Basis_K::recip_to_real<float, base_device::DEVICE_CPU>(
+        const base_device::DEVICE_CPU* ctx,
+        const std::complex<float>* in,
+        std::complex<float>* out,
+        const int ik,
+        const bool add,
+        const float factor) const;
+
+    template void PW_Basis_K::recip_to_real<double, base_device::DEVICE_CPU>(
+        const base_device::DEVICE_CPU* ctx,
+        const std::complex<double>* in,
+        std::complex<double>* out,
+        const int ik,
+        const bool add,
+        const double factor) const;
+
+    template void PW_Basis_K::real_to_recip<float, base_device::DEVICE_CPU>(
+        const base_device::DEVICE_CPU* ctx,
+        const std::complex<float>* in,
+        std::complex<float>* out,
+        const int ik,
+        const bool add,
+        const float factor) const;
+
+    template void PW_Basis_K::real_to_recip<double, base_device::DEVICE_CPU>(
+        const base_device::DEVICE_CPU* ctx,
+        const std::complex<double>* in,
+        std::complex<double>* out,
+        const int ik,
+        const bool add,
+        const double factor) const;
+}
+
 /************************************************
  *  unit test of HSolverPW class
  ***********************************************/
diff --git a/source/module_hsolver/test/test_hsolver_sdft.cpp b/source/module_hsolver/test/test_hsolver_sdft.cpp
index 642ee9daae..235c664a78 100644
--- a/source/module_hsolver/test/test_hsolver_sdft.cpp
+++ b/source/module_hsolver/test/test_hsolver_sdft.cpp
@@ -133,6 +133,66 @@ void Stochastic_Iter<T, Device>::cal_storho(const UnitCell& ucell,
 Charge::Charge(){};
 Charge::~Charge(){};
 
+// Mock implementations for the template functions causing linking errors
+namespace ModulePW {
+    // Mock implementation for recip_to_real
+    template<typename FPTYPE, typename Device>
+    void PW_Basis_K::recip_to_real(const Device* ctx,
+                                  const std::complex<FPTYPE>* in,
+                                  std::complex<FPTYPE>* out,
+                                  const int ik,
+                                  const bool add,
+                                  const FPTYPE factor) const
+    {
+        // Simple mock implementation that does nothing
+        // In a real test, you might want to implement behavior that simulates the actual function
+    }
+
+    // Mock implementation for real_to_recip
+    template<typename FPTYPE, typename Device>
+    void PW_Basis_K::real_to_recip(const Device* ctx,
+                                  const std::complex<FPTYPE>* in,
+                                  std::complex<FPTYPE>* out,
+                                  const int ik,
+                                  const bool add,
+                                  const FPTYPE factor) const
+    {
+        // Simple mock implementation that does nothing
+    }
+
+    // Explicit template instantiations
+    template void PW_Basis_K::recip_to_real<float, base_device::DEVICE_CPU>(
+        const base_device::DEVICE_CPU* ctx,
+        const std::complex<float>* in,
+        std::complex<float>* out,
+        const int ik,
+        const bool add,
+        const float factor) const;
+
+    template void PW_Basis_K::recip_to_real<double, base_device::DEVICE_CPU>(
+        const base_device::DEVICE_CPU* ctx,
+        const std::complex<double>* in,
+        std::complex<double>* out,
+        const int ik,
+        const bool add,
+        const double factor) const;
+
+    template void PW_Basis_K::real_to_recip<float, base_device::DEVICE_CPU>(
+        const base_device::DEVICE_CPU* ctx,
+        const std::complex<float>* in,
+        std::complex<float>* out,
+        const int ik,
+        const bool add,
+        const float factor) const;
+
+    template void PW_Basis_K::real_to_recip<double, base_device::DEVICE_CPU>(
+        const base_device::DEVICE_CPU* ctx,
+        const std::complex<double>* in,
+        std::complex<double>* out,
+        const int ik,
+        const bool add,
+        const double factor) const;
+}
 /************************************************
  *  unit test of HSolverPW_SDFT class
  ***********************************************/
diff --git a/source/module_io/read_input_item_elec_stru.cpp b/source/module_io/read_input_item_elec_stru.cpp
index 73c741bb5f..19d5b2f1c2 100644
--- a/source/module_io/read_input_item_elec_stru.cpp
+++ b/source/module_io/read_input_item_elec_stru.cpp
@@ -346,6 +346,26 @@ void ReadInput::item_elec_stru()
         read_sync_bool(input.diago_smooth_ethr);
         this->add_item(item);
     }
+    {
+        Input_Item item("use_k_continuity");
+        item.annotation = "whether to use k-point continuity for initializing wave functions";
+        read_sync_bool(input.use_k_continuity);
+        item.check_value = [](const Input_Item& item, const Parameter& para) {
+            if (para.input.use_k_continuity && para.input.basis_type != "pw") {
+                ModuleBase::WARNING_QUIT("ReadInput", "use_k_continuity only works for PW basis");
+            }
+            if (para.input.use_k_continuity && para.input.calculation == "nscf") {
+                ModuleBase::WARNING_QUIT("ReadInput", "use_k_continuity cannot work for NSCF calculation");
+            }
+            if (para.input.use_k_continuity && para.input.nspin == 2) {
+                ModuleBase::WARNING_QUIT("ReadInput", "use_k_continuity cannot work for spin-polarized calculation");
+            }
+            if (para.input.use_k_continuity && para.input.esolver_type == "sdft") {
+                ModuleBase::WARNING_QUIT("ReadInput", "use_k_continuity cannot work for SDFT calculation");
+            }
+        };
+        this->add_item(item);
+    }
     {
         Input_Item item("pw_diag_ndim");
         item.annotation = "dimension of workspace for Davidson diagonalization";
diff --git a/source/module_parameter/input_parameter.h b/source/module_parameter/input_parameter.h
index 0bb1979318..aa7d31309c 100644
--- a/source/module_parameter/input_parameter.h
+++ b/source/module_parameter/input_parameter.h
@@ -90,6 +90,7 @@ struct Input_para
     bool diago_smooth_ethr = false; ///< smooth ethr for iter methods
     int pw_diag_ndim = 4;      ///< dimension of workspace for Davidson diagonalization
     int diago_cg_prec = 1;     ///< mohan add 2012-03-31
+    bool use_k_continuity = false;   ///< whether to use k-point continuity for initializing wave functions
 
     std::string smearing_method = "gauss"; ///< "gauss",
                                            ///< "mp","methfessel-paxton"
diff --git a/source/module_psi/psi_init.cpp b/source/module_psi/psi_init.cpp
index 201539a54f..a3d59d4e7b 100644
--- a/source/module_psi/psi_init.cpp
+++ b/source/module_psi/psi_init.cpp
@@ -265,6 +265,7 @@ void PSIInit<T, Device>::initialize_psi(Psi<std::complex<double>>* psi,
         {
             for (int ik = 0; ik < this->pw_wfc->nks; ++ik)
             {
+                if(PARAM.inp.use_k_continuity && ik > 0) continue;
                 //! Update Hamiltonian from other kpoint to the given one
                 p_hamilt->updateHk(ik);
 
@@ -316,4 +317,4 @@ template class PSIInit<std::complex<double>, base_device::DEVICE_CPU>;
 template class PSIInit<std::complex<float>, base_device::DEVICE_GPU>;
 template class PSIInit<std::complex<double>, base_device::DEVICE_GPU>;
 #endif
-} // namespace psi
\ No newline at end of file
+} // namespace psi