diff --git a/source/module_base/kernels/cuda/math_kernel_op.cu b/source/module_base/kernels/cuda/math_kernel_op.cu
index 39043daff1..7227df630d 100644
--- a/source/module_base/kernels/cuda/math_kernel_op.cu
+++ b/source/module_base/kernels/cuda/math_kernel_op.cu
@@ -325,16 +325,23 @@ __global__ void vector_div_constant_kernel(
 }
 
 template <typename T>
-__global__ void vector_mul_vector_kernel(
-    const int size,
-    T* result,
-    const T* vector1,
-    const typename GetTypeReal<T>::type* vector2)
+__global__ void vector_mul_vector_kernel(const int size,
+                                         T* result,
+                                         const T* vector1,
+                                         const typename GetTypeReal<T>::type* vector2,
+                                         const bool add)
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     if (i < size)
     {
-        result[i] = vector1[i] * vector2[i];
+        if (add)
+        {
+            result[i] += vector1[i] * vector2[i];
+        }
+        else
+        {
+            result[i] = vector1[i] * vector2[i];
+        }
     }
 }
 
@@ -548,11 +555,12 @@ template <>
 void vector_mul_vector_op<double, base_device::DEVICE_GPU>::operator()(const int& dim,
                                                                        double* result,
                                                                        const double* vector1,
-                                                                       const double* vector2)
+                                                                       const double* vector2,
+                                                                       const bool& add)
 {
     int thread = thread_per_block;
     int block = (dim + thread - 1) / thread;
-    vector_mul_vector_kernel<double> <<<block, thread >>> (dim, result, vector1, vector2);
+    vector_mul_vector_kernel<double> <<<block, thread >>> (dim, result, vector1, vector2, add);
 
     cudaCheckOnDebug();
 }
@@ -561,13 +569,14 @@ template <typename FPTYPE>
 inline void vector_mul_vector_complex_wrapper(const int& dim,
                                               std::complex<FPTYPE>* result,
                                               const std::complex<FPTYPE>* vector1,
-                                              const FPTYPE* vector2)
+                                              const FPTYPE* vector2,
+                                              const bool& add)
 {
     thrust::complex<FPTYPE>* result_tmp = reinterpret_cast<thrust::complex<FPTYPE>*>(result);
     const thrust::complex<FPTYPE>* vector1_tmp = reinterpret_cast<const thrust::complex<FPTYPE>*>(vector1);
     int thread = thread_per_block;
     int block = (dim + thread - 1) / thread;
-    vector_mul_vector_kernel<thrust::complex<FPTYPE>> <<<block, thread >>> (dim, result_tmp, vector1_tmp, vector2);
+    vector_mul_vector_kernel<thrust::complex<FPTYPE>> <<<block, thread >>> (dim, result_tmp, vector1_tmp, vector2, add);
 
     cudaCheckOnDebug();
 }
@@ -575,18 +584,20 @@ template <>
 void vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int& dim,
                                                                                     std::complex<float>* result,
                                                                                     const std::complex<float>* vector1,
-                                                                                    const float* vector2)
+                                                                                    const float* vector2,
+                                                                                    const bool& add)
 {
-    vector_mul_vector_complex_wrapper(dim, result, vector1, vector2);
+    vector_mul_vector_complex_wrapper(dim, result, vector1, vector2, add);
 }
 template <>
 void vector_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(
     const int& dim,
     std::complex<double>* result,
     const std::complex<double>* vector1,
-    const double* vector2)
+    const double* vector2,
+    const bool& add)
 {
-    vector_mul_vector_complex_wrapper(dim, result, vector1, vector2);
+    vector_mul_vector_complex_wrapper(dim, result, vector1, vector2, add);
 }
 
 // vector operator: result[i] = vector1[i](not complex) / vector2[i](not complex)
@@ -1019,6 +1030,7 @@ template struct dot_real_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct calc_grad_with_block_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct line_minimize_with_block_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct vector_div_constant_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct vector_mul_vector_op<float, base_device::DEVICE_GPU>;
 template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<float, base_device::DEVICE_GPU>;
@@ -1029,6 +1041,7 @@ template struct dot_real_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct calc_grad_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct line_minimize_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct vector_div_constant_op<std::complex<double>, base_device::DEVICE_GPU>;
+template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
 template struct vector_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>;
@@ -1039,7 +1052,6 @@ template struct matrixCopy<std::complex<double>, base_device::DEVICE_GPU>;
 #ifdef __LCAO
 template struct dot_real_op<double, base_device::DEVICE_GPU>;
 template struct vector_div_constant_op<double, base_device::DEVICE_GPU>;
-template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<double, base_device::DEVICE_GPU>;
 #endif
 }  // namespace ModuleBase
diff --git a/source/module_base/kernels/math_kernel_op.cpp b/source/module_base/kernels/math_kernel_op.cpp
index bb0bb923a5..9420d7c4b7 100644
--- a/source/module_base/kernels/math_kernel_op.cpp
+++ b/source/module_base/kernels/math_kernel_op.cpp
@@ -167,14 +167,27 @@ template <typename T>
 struct vector_mul_vector_op<T, base_device::DEVICE_CPU>
 {
     using Real = typename GetTypeReal<T>::type;
-    void operator()(const int& dim, T* result, const T* vector1, const Real* vector2)
+    void operator()(const int& dim, T* result, const T* vector1, const Real* vector2, const bool& add)
     {
+        if (add)
+        {
 #ifdef _OPENMP
 #pragma omp parallel for schedule(static, 4096 / sizeof(Real))
 #endif
-        for (int i = 0; i < dim; i++)
+            for (int i = 0; i < dim; i++)
+            {
+                result[i] += vector1[i] * vector2[i];
+            }
+        }
+        else
         {
-            result[i] = vector1[i] * vector2[i];
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static, 4096 / sizeof(Real))
+#endif
+            for (int i = 0; i < dim; i++)
+            {
+                result[i] = vector1[i] * vector2[i];
+            }
         }
     }
 };
diff --git a/source/module_base/kernels/math_kernel_op.h b/source/module_base/kernels/math_kernel_op.h
index d94aeb6806..daca2ac548 100644
--- a/source/module_base/kernels/math_kernel_op.h
+++ b/source/module_base/kernels/math_kernel_op.h
@@ -143,11 +143,11 @@ template <typename T, typename Device> struct vector_mul_vector_op {
   /// \param dim : array size
   /// \param vector1 : input array A
   /// \param vector2 : input array B
+  /// \param add : flag to control whether to add the result to the output array
   ///
   /// Output Parameters
   /// \param result : output array
-  void operator()(const int &dim, T *result, const T *vector1,
-                  const Real *vector2);
+  void operator()(const int& dim, T* result, const T* vector1, const Real* vector2, const bool& add = false);
 };
 
 // vector operator: result[i] = vector1[i](complex) / vector2[i](not complex)
@@ -350,8 +350,7 @@ struct vector_div_constant_op<T, base_device::DEVICE_GPU> {
 // vector operator: result[i] = vector1[i](complex) * vector2[i](not complex)
 template <typename T> struct vector_mul_vector_op<T, base_device::DEVICE_GPU> {
   using Real = typename GetTypeReal<T>::type;
-  void operator()(const int &dim, T *result,
-                  const T *vector1, const Real *vector2);
+  void operator()(const int& dim, T* result, const T* vector1, const Real* vector2, const bool& add = false);
 };
 
 // vector operator: result[i] = vector1[i](complex) / vector2[i](not complex)
diff --git a/source/module_base/kernels/rocm/math_kernel_op.hip.cu b/source/module_base/kernels/rocm/math_kernel_op.hip.cu
index 0279bfee75..a6a787faf8 100644
--- a/source/module_base/kernels/rocm/math_kernel_op.hip.cu
+++ b/source/module_base/kernels/rocm/math_kernel_op.hip.cu
@@ -248,12 +248,20 @@ __global__ void vector_mul_vector_kernel(
     const int size,
     T* result,
     const T* vector1,
-    const typename GetTypeReal<T>::type* vector2)
+    const typename GetTypeReal<T>::type* vector2,
+    const bool add)
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     if (i < size)
     {
-        result[i] = vector1[i] * vector2[i];
+        if (add)
+        {
+            result[i] += vector1[i] * vector2[i];
+        }
+        else
+        {
+            result[i] = vector1[i] * vector2[i];
+        }
     }
 }
 
@@ -471,11 +479,12 @@ template <>
 void vector_mul_vector_op<double, base_device::DEVICE_GPU>::operator()(const int& dim,
                                                                        double* result,
                                                                        const double* vector1,
-                                                                       const double* vector2)
+                                                                       const double* vector2,
+                                                                       const bool& add)
 {
     int thread = 1024;
     int block = (dim + thread - 1) / thread;
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_mul_vector_kernel<double>), dim3(block), dim3(thread), 0, 0, dim, result, vector1, vector2);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_mul_vector_kernel<double>), dim3(block), dim3(thread), 0, 0, dim, result, vector1, vector2, add);
 
     hipCheckOnDebug();
 }
@@ -485,13 +494,14 @@ template <typename FPTYPE>
 inline void vector_mul_vector_complex_wrapper(const int& dim,
                                               std::complex<FPTYPE>* result,
                                               const std::complex<FPTYPE>* vector1,
-                                              const FPTYPE* vector2)
+                                              const FPTYPE* vector2,
+                                              const bool& add)
 {
     thrust::complex<FPTYPE>* result_tmp = reinterpret_cast<thrust::complex<FPTYPE>*>(result);
     const thrust::complex<FPTYPE>* vector1_tmp = reinterpret_cast<const thrust::complex<FPTYPE>*>(vector1);
     int thread = 1024;
     int block = (dim + thread - 1) / thread;
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_mul_vector_kernel<thrust::complex<FPTYPE>>), dim3(block), dim3(thread), 0, 0, dim, result_tmp, vector1_tmp, vector2);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_mul_vector_kernel<thrust::complex<FPTYPE>>), dim3(block), dim3(thread), 0, 0, dim, result_tmp, vector1_tmp, vector2, add);
 
     hipCheckOnDebug();
 }
@@ -499,18 +509,20 @@ template <>
 void vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int& dim,
                                                                                     std::complex<float>* result,
                                                                                     const std::complex<float>* vector1,
-                                                                                    const float* vector2)
+                                                                                    const float* vector2,
+                                                                                    const bool& add)
 {
-    vector_mul_vector_complex_wrapper(dim, result, vector1, vector2);
+    vector_mul_vector_complex_wrapper(dim, result, vector1, vector2, add);
 }
 template <>
 void vector_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(
     const int& dim,
     std::complex<double>* result,
     const std::complex<double>* vector1,
-    const double* vector2)
+    const double* vector2,
+    const bool& add)
 {
-    vector_mul_vector_complex_wrapper(dim, result, vector1, vector2);
+    vector_mul_vector_complex_wrapper(dim, result, vector1, vector2, add);
 }
 // vector operator: result[i] = vector1[i](complex) / vector2[i](not complex)
 template <>
@@ -931,6 +943,7 @@ template struct dot_real_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct calc_grad_with_block_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct line_minimize_with_block_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct vector_div_constant_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct vector_mul_vector_op<float, base_device::DEVICE_GPU>;
 template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_GPU>;
@@ -940,6 +953,7 @@ template struct dot_real_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct calc_grad_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct line_minimize_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct vector_div_constant_op<std::complex<double>, base_device::DEVICE_GPU>;
+template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
 template struct vector_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_GPU>;
@@ -948,7 +962,6 @@ template struct matrixCopy<std::complex<double>, base_device::DEVICE_GPU>;
 #ifdef __LCAO
 template struct dot_real_op<double, base_device::DEVICE_GPU>;
 template struct vector_div_constant_op<double, base_device::DEVICE_GPU>;
-template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<double, base_device::DEVICE_GPU>;
 template struct matrixCopy<double, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>;
diff --git a/source/module_esolver/esolver_ks_pw.cpp b/source/module_esolver/esolver_ks_pw.cpp
index 3bddfc2dfa..4626286801 100644
--- a/source/module_esolver/esolver_ks_pw.cpp
+++ b/source/module_esolver/esolver_ks_pw.cpp
@@ -943,7 +943,7 @@ void ESolver_KS_PW<T, Device>::after_all_runners(UnitCell& ucell)
     //! 7) Use Kubo-Greenwood method to compute conductivities
     if (PARAM.inp.cal_cond)
     {
-        EleCond elec_cond(&ucell, &this->kv, this->pelec, this->pw_wfc, this->psi, &this->ppcell);
+        EleCond<Real, Device> elec_cond(&ucell, &this->kv, this->pelec, this->pw_wfc, this->kspw_psi, &this->ppcell);
         elec_cond.KG(PARAM.inp.cond_smear,
                      PARAM.inp.cond_fwhm,
                      PARAM.inp.cond_wcut,
diff --git a/source/module_hamilt_pw/hamilt_pwdft/elecond.cpp b/source/module_hamilt_pw/hamilt_pwdft/elecond.cpp
index 5f59858132..5997a139cf 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/elecond.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/elecond.cpp
@@ -1,10 +1,12 @@
 #include "elecond.h"
 
-#include "module_parameter/parameter.h"
 #include "module_base/global_function.h"
 #include "module_base/global_variable.h"
+#include "module_base/kernels/math_kernel_op.h"
+#include "module_base/parallel_device.h"
 #include "module_elecstate/occupy.h"
 #include "module_io/binstream.h"
+#include "module_parameter/parameter.h"
 
 #include <vector>
 
@@ -24,9 +26,13 @@
 #define TWOSQRT2LN2 2.354820045030949 // FWHM = 2sqrt(2ln2) * \sigma
 #define FACTOR 1.839939223835727e7
 
-EleCond::EleCond(UnitCell* p_ucell_in, K_Vectors* p_kv_in, elecstate::ElecState* p_elec_in,
-                 ModulePW::PW_Basis_K* p_wfcpw_in, psi::Psi<std::complex<double>>* p_psi_in,
-                 pseudopot_cell_vnl* p_ppcell_in)
+template <typename FPTYPE, typename Device>
+EleCond<FPTYPE, Device>::EleCond(UnitCell* p_ucell_in,
+                                 K_Vectors* p_kv_in,
+                                 elecstate::ElecState* p_elec_in,
+                                 ModulePW::PW_Basis_K* p_wfcpw_in,
+                                 psi::Psi<std::complex<FPTYPE>, Device>* p_psi_in,
+                                 pseudopot_cell_vnl* p_ppcell_in)
 {
     this->p_ppcell = p_ppcell_in;
     this->p_ucell = p_ucell_in;
@@ -36,8 +42,14 @@ EleCond::EleCond(UnitCell* p_ucell_in, K_Vectors* p_kv_in, elecstate::ElecState*
     this->p_psi = p_psi_in;
 }
 
-void EleCond::KG(const int& smear_type, const double& fwhmin, const double& wcut, const double& dw_in,
-                 const double& dt_in, const bool& nonlocal, ModuleBase::matrix& wg)
+template <typename FPTYPE, typename Device>
+void EleCond<FPTYPE, Device>::KG(const int& smear_type,
+                                 const double& fwhmin,
+                                 const double& wcut,
+                                 const double& dw_in,
+                                 const double& dt_in,
+                                 const bool& nonlocal,
+                                 ModuleBase::matrix& wg)
 {
     //-----------------------------------------------------------
     //               KS conductivity
@@ -72,7 +84,7 @@ void EleCond::KG(const int& smear_type, const double& fwhmin, const double& wcut
     std::vector<double> ct12(nt, 0);
     std::vector<double> ct22(nt, 0);
 
-    hamilt::Velocity velop(this->p_wfcpw, this->p_kv->isk.data(), this->p_ppcell, this->p_ucell, nonlocal);
+    hamilt::Velocity<FPTYPE, Device> velop(this->p_wfcpw, this->p_kv->isk.data(), this->p_ppcell, this->p_ucell, nonlocal);
     double decut = (wcut + fwhmin) / ModuleBase::Ry_to_eV;
     std::cout << "Recommended dt: " << 0.25 * M_PI / decut << " a.u." << std::endl;
     for (int ik = 0; ik < nk; ++ik)
@@ -94,44 +106,79 @@ void EleCond::KG(const int& smear_type, const double& fwhmin, const double& wcut
     }
 }
 
-void EleCond::jjresponse_ks(const int ik, const int nt, const double dt, const double decut, ModuleBase::matrix& wg,
-                            hamilt::Velocity& velop, double* ct11, double* ct12, double* ct22)
+template <typename FPTYPE, typename Device>
+void EleCond<FPTYPE, Device>::jjresponse_ks(const int ik,
+                                            const int nt,
+                                            const double dt,
+                                            const double decut,
+                                            ModuleBase::matrix& wg,
+                                            hamilt::Velocity<FPTYPE, Device>& velop,
+                                            double* ct11,
+                                            double* ct12,
+                                            double* ct22)
 {
     const int nbands = PARAM.inp.nbands;
-    if (wg(ik, 0) - wg(ik, nbands - 1) < 1e-8 || nbands == 0) {
+    if (wg(ik, 0) - wg(ik, nbands - 1) < 1e-8 || nbands == 0)
+    {
         return;
-}
-    const char transn = 'N';
-    const char transc = 'C';
+    }
     const int ndim = 3;
-    const int npwx = this->p_wfcpw->npwk_max;
+    const int npwk_max = this->p_wfcpw->npwk_max;
     const double ef = this->p_elec->eferm.ef;
     const int npw = this->p_kv->ngk[ik];
     const int reducenb2 = (nbands - 1) * nbands / 2;
     const bool gamma_only = false; // ABACUS do not support gamma_only yet.
-    std::complex<double>* levc = &(this->p_psi[0](ik, 0, 0));
-    std::vector<std::complex<double>> prevc(ndim * npwx * nbands);
-    std::vector<std::complex<double>> pij(nbands * nbands);
+    std::complex<FPTYPE>* levc = &(this->p_psi[0](ik, 0, 0));
+    psi::Psi<std::complex<FPTYPE>, Device> v_psi(1, ndim * nbands, npwk_max, npw, true);
+    std::complex<FPTYPE>* pij_d = nullptr;
+    resmem_complex_op()(pij_d, nbands * nbands);
+
     std::vector<double> pij2(reducenb2, 0);
     // px|right>
-    velop.act(this->p_psi, nbands * PARAM.globalv.npol, levc, prevc.data());
+    velop.act(this->p_psi, nbands * PARAM.globalv.npol, levc, v_psi.get_pointer());
+    std::complex<FPTYPE> one = 1.0;
+    std::complex<FPTYPE> zero = 0.0;
     for (int id = 0; id < ndim; ++id)
     {
+        ModuleBase::gemm_op<std::complex<FPTYPE>, Device>()('C',
+                                                            'N',
+                                                            nbands,
+                                                            nbands,
+                                                            npw,
+                                                            &one,
+                                                            levc,
+                                                            npwk_max,
+                                                            v_psi.get_pointer() + id * npwk_max * nbands,
+                                                            npwk_max,
+                                                            &zero,
+                                                            pij_d,
+                                                            nbands);
 
-        zgemm_(&transc, &transn, &nbands, &nbands, &npw, &ModuleBase::ONE, levc, &npwx,
-               prevc.data() + id * npwx * nbands, &npwx, &ModuleBase::ZERO, pij.data(), &nbands);
+        std::complex<FPTYPE>* pij_c = nullptr;
+        if(std::is_same<Device, base_device::DEVICE_CPU>::value)
+        {
+            pij_c = pij_d;
+        }
+        else
+        {
+            std::vector<std::complex<FPTYPE>> pij_h(nbands * nbands);
+            syncmem_complex_d2h_op()(pij_h.data(), pij_d, nbands * nbands);
+            pij_c = pij_h.data();
+        }
+        
 #ifdef __MPI
-        MPI_Allreduce(MPI_IN_PLACE, pij.data(), nbands * nbands, MPI_DOUBLE_COMPLEX, MPI_SUM, POOL_WORLD);
+        Parallel_Common::reduce_data(pij_c, nbands * nbands, POOL_WORLD);
 #endif
-        if (!gamma_only) {
+        if (!gamma_only)
+        {
             for (int ib = 0, ijb = 0; ib < nbands; ++ib)
             {
                 for (int jb = ib + 1; jb < nbands; ++jb, ++ijb)
                 {
-                    pij2[ijb] += norm(pij[ib * nbands + jb]);
+                    pij2[ijb] += static_cast<double>(std::norm(pij_c[ib * nbands + jb]));
                 }
             }
-}
+        }
     }
 
     if (GlobalV::RANK_IN_POOL == 0)
@@ -185,11 +232,20 @@ void EleCond::jjresponse_ks(const int ik, const int nt, const double dt, const d
         ct12[it] += tmct12 / 2.0;
         ct22[it] += tmct22 / 2.0;
     }
+    delmem_complex_op()(pij_d);
     return;
 }
 
-void EleCond::calcondw(const int nt, const double dt, const int& smear_type, const double fwhmin, const double wcut,
-                       const double dw_in, double* ct11, double* ct12, double* ct22)
+template <typename FPTYPE, typename Device>
+void EleCond<FPTYPE, Device>::calcondw(const int nt,
+                                       const double dt,
+                                       const int& smear_type,
+                                       const double fwhmin,
+                                       const double wcut,
+                                       const double dw_in,
+                                       double* ct11,
+                                       double* ct12,
+                                       double* ct22)
 {
     double factor = FACTOR;
     const int ndim = 3;
@@ -259,4 +315,11 @@ void EleCond::calcondw(const int nt, const double dt, const int& smear_type, con
     std::cout << std::setprecision(6) << "Thermal conductivity: " << kappa0 << " W(mK)^-1" << std::endl;
     std::cout << std::setprecision(6) << "Lorenz number: " << Lorent0 << " k_B^2/e^2" << std::endl;
     ofscond.close();
-}
\ No newline at end of file
+}
+
+template class EleCond<double, base_device::DEVICE_CPU>;
+template class EleCond<float, base_device::DEVICE_CPU>;
+#if ((defined __CUDA) || (defined __ROCM))
+template class EleCond<double, base_device::DEVICE_GPU>;
+template class EleCond<float, base_device::DEVICE_GPU>;
+#endif
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_pwdft/elecond.h b/source/module_hamilt_pw/hamilt_pwdft/elecond.h
index a8c99b22dc..8ccc0a1408 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/elecond.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/elecond.h
@@ -9,11 +9,20 @@
 #include "module_hamilt_pw/hamilt_pwdft/VNL_in_pw.h"
 #include "module_hamilt_pw/hamilt_pwdft/operator_pw/velocity_pw.h"
 
+template<typename FPTYPE, typename Device>
 class EleCond
 {
   public:
-    EleCond(UnitCell* p_ucell_in, K_Vectors* p_kv_in, elecstate::ElecState* p_elec_in, ModulePW::PW_Basis_K* p_wfcpw_in,
-            psi::Psi<std::complex<double>>* p_psi_in, pseudopot_cell_vnl* p_ppcell_in);
+    using resmem_complex_op = base_device::memory::resize_memory_op<std::complex<FPTYPE>, Device>;
+    using delmem_complex_op = base_device::memory::delete_memory_op<std::complex<FPTYPE>, Device>;
+    using syncmem_complex_d2h_op = base_device::memory::synchronize_memory_op<std::complex<FPTYPE>, base_device::DEVICE_CPU, Device>;
+  public:
+    EleCond(UnitCell* p_ucell_in,
+            K_Vectors* p_kv_in,
+            elecstate::ElecState* p_elec_in,
+            ModulePW::PW_Basis_K* p_wfcpw_in,
+            psi::Psi<std::complex<FPTYPE>, Device>* p_psi_in,
+            pseudopot_cell_vnl* p_ppcell_in);
     ~EleCond(){};
 
     /**
@@ -27,16 +36,21 @@ class EleCond
      * @param nonlocal whether to include the nonlocal potential corrections for velocity operator
      * @param wg wg(ik,ib) occupation for the ib-th band in the ik-th kpoint
      */
-    void KG(const int& smear_type, const double& fwhmin, const double& wcut, const double& dw_in, const double& dt_in,
-            const bool& nonlocal, ModuleBase::matrix& wg);
+    void KG(const int& smear_type,
+            const double& fwhmin,
+            const double& wcut,
+            const double& dw_in,
+            const double& dt_in,
+            const bool& nonlocal,
+            ModuleBase::matrix& wg);
 
   protected:
-    pseudopot_cell_vnl* p_ppcell = nullptr;          ///< pointer to the pseudopotential
-    UnitCell* p_ucell = nullptr;                     ///< pointer to the unit cell
-    ModulePW::PW_Basis_K* p_wfcpw = nullptr;         ///< pointer to the plane wave basis
-    K_Vectors* p_kv = nullptr;                       ///< pointer to the k vectors
-    elecstate::ElecState* p_elec = nullptr;          ///< pointer to the electronic state
-    psi::Psi<std::complex<double>>* p_psi = nullptr; ///< pointer to the wavefunction
+    pseudopot_cell_vnl* p_ppcell = nullptr;                  ///< pointer to the pseudopotential
+    UnitCell* p_ucell = nullptr;                             ///< pointer to the unit cell
+    ModulePW::PW_Basis_K* p_wfcpw = nullptr;                 ///< pointer to the plane wave basis
+    K_Vectors* p_kv = nullptr;                               ///< pointer to the k vectors
+    elecstate::ElecState* p_elec = nullptr;                  ///< pointer to the electronic state
+    psi::Psi<std::complex<FPTYPE>, Device>* p_psi = nullptr; ///< pointer to the wavefunction
 
   protected:
     /**
@@ -52,8 +66,15 @@ class EleCond
      * @param ct12 C12(t)
      * @param ct22 C22(t)
      */
-    void jjresponse_ks(const int ik, const int nt, const double dt, const double decut, ModuleBase::matrix& wg,
-                       hamilt::Velocity& velop, double* ct11, double* ct12, double* ct22);
+    void jjresponse_ks(const int ik,
+                       const int nt,
+                       const double dt,
+                       const double decut,
+                       ModuleBase::matrix& wg,
+                       hamilt::Velocity<FPTYPE, Device>& velop,
+                       double* ct11,
+                       double* ct12,
+                       double* ct22);
     /**
      * @brief Calculate the conductivity using the response function
      *
@@ -67,8 +88,15 @@ class EleCond
      * @param ct12 C12 component of the response function
      * @param ct22 C22 component of the response function
      */
-    void calcondw(const int nt, const double dt, const int& smear_type, const double fwhmin, const double wcut,
-                  const double dw_in, double* ct11, double* ct12, double* ct22);
+    void calcondw(const int nt,
+                  const double dt,
+                  const int& smear_type,
+                  const double fwhmin,
+                  const double wcut,
+                  const double dw_in,
+                  double* ct11,
+                  double* ct12,
+                  double* ct22);
 };
 
 #endif // ELECOND_H
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_pwdft/operator_pw/velocity_pw.cpp b/source/module_hamilt_pw/hamilt_pwdft/operator_pw/velocity_pw.cpp
index bc2ef072ea..3998ef9b85 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/operator_pw/velocity_pw.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/operator_pw/velocity_pw.cpp
@@ -1,19 +1,19 @@
 #include "velocity_pw.h"
-#include "module_base/timer.h"
+
+#include "module_base/kernels/math_kernel_op.h"
 #include "module_base/parallel_reduce.h"
+#include "module_base/timer.h"
 namespace hamilt
 {
 
-Velocity::Velocity
-(
-    const ModulePW::PW_Basis_K* wfcpw_in,
-    const int* isk_in,
-    pseudopot_cell_vnl* ppcell_in,
-    const UnitCell* ucell_in,
-    const bool nonlocal_in
-)
+template <typename FPTYPE, typename Device>
+Velocity<FPTYPE, Device>::Velocity(const ModulePW::PW_Basis_K* wfcpw_in,
+                                   const int* isk_in,
+                                   pseudopot_cell_vnl* ppcell_in,
+                                   const UnitCell* ucell_in,
+                                   const bool nonlocal_in)
 {
-    if( wfcpw_in == nullptr || isk_in == nullptr || ppcell_in == nullptr || ucell_in == nullptr)
+    if (wfcpw_in == nullptr || isk_in == nullptr || ppcell_in == nullptr || ucell_in == nullptr)
     {
         ModuleBase::WARNING_QUIT("Velocity", "Constuctor of Operator::Velocity is failed, please check your code!");
     }
@@ -22,114 +22,199 @@ Velocity::Velocity
     this->ppcell = ppcell_in;
     this->ucell = ucell_in;
     this->nonlocal = nonlocal_in;
-    this->tpiba = ucell_in -> tpiba;
-    if(this->nonlocal) {      this->ppcell->initgradq_vnl(*this->ucell);
+    this->tpiba = ucell_in->tpiba;
+    if (this->nonlocal)
+    {
+        this->ppcell->initgradq_vnl(*this->ucell);
+    }
 }
+
+template <typename FPTYPE, typename Device>
+Velocity<FPTYPE, Device>::~Velocity()
+{
+    delmem_var_op()(this->gx_);
+    delmem_var_op()(this->gy_);
+    delmem_var_op()(this->gz_);
+    delmem_complex_op()(vkb_);
+    delmem_complex_op()(gradvkb_);
 }
 
-void Velocity::init(const int ik_in)
+template <typename FPTYPE, typename Device>
+void Velocity<FPTYPE, Device>::init(const int ik_in)
 {
     this->ik = ik_in;
+    // init G+K
+    const int npw = this->wfcpw->npwk[ik_in];
+    const int npwk_max = this->wfcpw->npwk_max;
+    std::vector<FPTYPE> gtmp(npw);
+    resmem_var_op()(gx_, npw);
+    resmem_var_op()(gy_, npw);
+    resmem_var_op()(gz_, npw);
+    std::vector<FPTYPE*> gtmp_ptr = {this->gx_, this->gy_, this->gz_};
+    for(int i=0; i<3; ++i)
+    {
+        for (int ig = 0; ig < npw; ++ig)
+        {
+            const ModuleBase::Vector3<double> tmpg = wfcpw->getgpluskcar(this->ik, ig);
+            gtmp[ig] = static_cast<FPTYPE>(tmpg[i] * tpiba);
+        }
+        syncmem_var_h2d_op()(gtmp_ptr[i], gtmp.data(), npw);
+    }
+
     // Calculate nonlocal pseudopotential vkb
-	if(this->ppcell->nkb > 0 && this->nonlocal) 
-	{
-        this->ppcell->getgradq_vnl(*this->ucell,ik_in);
-	}
+    if (this->ppcell->nkb > 0 && this->nonlocal)
+    {
+        this->ppcell->getgradq_vnl(*this->ucell, ik_in);
+
+        // sync to device
+        if (std::is_same<Device, base_device::DEVICE_GPU>::value || std::is_same<FPTYPE, float>::value)
+        {
+            const int nkb = this->ppcell->nkb;
+            // vkb
+            resmem_complex_op()(vkb_, nkb * npwk_max);
+            castmem_complex_h2d_op()(vkb_, this->ppcell->vkb.c, nkb * npwk_max);
 
+            // gradvkb
+            resmem_complex_op()(gradvkb_, 3 * nkb * npwk_max);
+            castmem_complex_h2d_op()(gradvkb_, this->ppcell->gradvkb.ptr, 3 * nkb * npwk_max);
+        }
+    }
 }
 
-void Velocity::act
-(
-    const psi::Psi<std::complex<double>> *psi_in, 
-    const int n_npwx, //nbands * NPOL
-    const std::complex<double>* psi0, 
-    std::complex<double>* vpsi,
-    const bool add
-) const
+template <typename FPTYPE, typename Device>
+void Velocity<FPTYPE, Device>::act(const psi::Psi<std::complex<FPTYPE>, Device>* psi_in,
+                                   const int n_npwx,
+                                   const std::complex<FPTYPE>* psi0,
+                                   std::complex<FPTYPE>* vpsi,
+                                   const bool add) const
 {
     ModuleBase::timer::tick("Operator", "Velocity");
 
-    const int npw = psi_in->get_current_nbas();
-
-    const int max_npw = psi_in->get_nbasis() / psi_in->get_npol();
+    const int npw = this->wfcpw->npwk[this->ik];
+    const int max_npw = this->wfcpw->npwk_max;
     const int npol = psi_in->get_npol();
-    const std::complex<double>* tmpsi_in = psi0;
-    std::complex<double>* tmhpsi = vpsi;
+    
+    std::vector<FPTYPE*> gtmp_ptr = {this->gx_, this->gy_, this->gz_};
     // -------------
     //       p
     // -------------
-    for (int ib = 0; ib < n_npwx; ++ib)
+    for (int id = 0; id < 3; ++id)
     {
-        for (int ig = 0; ig < npw; ++ig)
+        const Complex* tmpsi_in = psi0;
+        Complex* tmpvpsi = vpsi + id * n_npwx * max_npw;
+        for (int ib = 0; ib < n_npwx; ++ib)
         {
-            const ModuleBase::Vector3<double>& tmpg = wfcpw->getgpluskcar(this->ik, ig);
-            if(add)
-            {
-                tmhpsi[ig]                       += tmpsi_in[ig] * tmpg.x * tpiba;
-                tmhpsi[ig + n_npwx * max_npw]    += tmpsi_in[ig] * tmpg.y * tpiba;
-                tmhpsi[ig + 2 * n_npwx * max_npw]+= tmpsi_in[ig] * tmpg.z * tpiba;
-            }
-            else
-            {
-                tmhpsi[ig]                        = tmpsi_in[ig] * tmpg.x * tpiba;
-                tmhpsi[ig + n_npwx * max_npw]     = tmpsi_in[ig] * tmpg.y * tpiba;
-                tmhpsi[ig + 2 * n_npwx * max_npw] = tmpsi_in[ig] * tmpg.z * tpiba;
-            }
+            ModuleBase::vector_mul_vector_op<Complex, Device>()(npw, tmpvpsi, tmpsi_in, gtmp_ptr[id], add);
+            tmpvpsi += max_npw;
+            tmpsi_in += max_npw;
         }
-        tmhpsi += max_npw;
-        tmpsi_in += max_npw;
     }
 
     // ---------------------------------------------
-    // i[V_NL, r] = (\nabla_q+\nabla_q')V_{NL}(q,q') 
+    // i[V_NL, r] = (\nabla_q+\nabla_q')V_{NL}(q,q')
     // |\beta><\beta|\psi>
     // ---------------------------------------------
-    if (this->ppcell->nkb <= 0 || !this->nonlocal) 
+    if (this->ppcell->nkb <= 0 || !this->nonlocal)
     {
         ModuleBase::timer::tick("Operator", "Velocity");
         return;
     }
 
-    //1. <\beta|\psi>
+    // 1. <\beta|\psi>
+    Complex* becp1_ = nullptr; ///<[Device, n_npwx * nkb] <\beta|\psi>
+    Complex* becp2_ = nullptr; ///<[Device, n_npwx * 3*nkb] <\nabla\beta|\psi>
+    Complex* ps1_ = nullptr; ///<[Device, nkb * n_npwx] sum of becp1
+    Complex* ps2_ = nullptr; ///<[Device, 3*nkb * n_npwx] sum of becp2
+    resmem_complex_op()(ps1_, this->ppcell->nkb * n_npwx);
+    resmem_complex_op()(ps2_, 3 * this->ppcell->nkb * n_npwx);
+    resmem_complex_op()(becp1_, this->ppcell->nkb * n_npwx);
+    resmem_complex_op()(becp2_, 3 * this->ppcell->nkb * n_npwx);
+
     const int nkb = this->ppcell->nkb;
     const int nkb3 = 3 * nkb;
-    ModuleBase::ComplexMatrix becp1(n_npwx, nkb, false);
-    ModuleBase::ComplexMatrix becp2(n_npwx, nkb3, false);
-    char transC = 'C';
-    char transN = 'N';
-    char transT = 'T';
-    const int npm = n_npwx;
+    Complex one = 1.0;
+    Complex zero = 0.0;
+
+    Complex* vkb_d = reinterpret_cast<Complex*>(this->ppcell->vkb.c);
+    Complex* gradvkb_d = reinterpret_cast<Complex*>(this->ppcell->gradvkb.ptr);
+    if (std::is_same<Device, base_device::DEVICE_GPU>::value || std::is_same<FPTYPE, float>::value)
+    {
+        vkb_d = vkb_;
+        gradvkb_d = gradvkb_;
+    }
+
     if (n_npwx == 1)
     {
         int inc = 1;
-        zgemv_(&transC, &npw, &nkb, 
-               &ModuleBase::ONE, this->ppcell->vkb.c, &max_npw, psi0, &inc, 
-               &ModuleBase::ZERO, becp1.c, &inc);
-        zgemv_(&transC, &npw, &nkb3, 
-               &ModuleBase::ONE, this->ppcell->gradvkb.ptr, &max_npw, psi0, &inc, 
-               &ModuleBase::ZERO, becp2.c, &inc);
+        ModuleBase::gemv_op<Complex, Device>()('C', npw, nkb, &one, vkb_d, max_npw, psi0, inc, &zero, becp1_, inc);
+        ModuleBase::gemv_op<Complex, Device>()('C', npw, nkb3, &one, gradvkb_d, max_npw, psi0, inc, &zero, becp2_, inc);
     }
     else
     {
-        zgemm_(&transC, &transN, &nkb, &n_npwx, &npw,
-               &ModuleBase::ONE, this->ppcell->vkb.c, &max_npw, psi0, &max_npw,
-               &ModuleBase::ZERO, becp1.c, &nkb);
-        zgemm_(&transC, &transN, &nkb3, &n_npwx, &npw,
-               &ModuleBase::ONE, this->ppcell->gradvkb.ptr, &max_npw, psi0, &max_npw,
-               &ModuleBase::ZERO, becp2.c, &nkb3);
+        ModuleBase::gemm_op<Complex, Device>()('C',
+                                               'N',
+                                               nkb,
+                                               n_npwx,
+                                               npw,
+                                               &one,
+                                               vkb_d,
+                                               max_npw,
+                                               psi0,
+                                               max_npw,
+                                               &zero,
+                                               becp1_,
+                                               nkb);
+        ModuleBase::gemm_op<Complex, Device>()('C',
+                                               'N',
+                                               nkb3,
+                                               n_npwx,
+                                               npw,
+                                               &one,
+                                               gradvkb_d,
+                                               max_npw,
+                                               psi0,
+                                               max_npw,
+                                               &zero,
+                                               becp2_,
+                                               nkb3);
     }
-    Parallel_Reduce::reduce_pool(becp1.c, nkb * n_npwx);
-    Parallel_Reduce::reduce_pool(becp2.c, nkb3 * n_npwx);
 
-    //2. <\beta \psi><psi|
-    ModuleBase::ComplexMatrix ps1(nkb, n_npwx, true);
-    ModuleBase::ComplexMatrix ps2(nkb3, n_npwx, true);
+    Complex* becp1_cpu = nullptr;
+    Complex* becp2_cpu = nullptr;
+    Complex* ps1_cpu = nullptr;
+    Complex* ps2_cpu = nullptr;
+    std::vector<Complex> tmp_space1, tmp_space2;
+    if(std::is_same<Device, base_device::DEVICE_GPU>::value)
+    {
+        tmp_space1.resize(nkb * n_npwx + nkb3 * n_npwx);
+        becp1_cpu = tmp_space1.data();
+        becp2_cpu = becp1_cpu + nkb * n_npwx;
+        syncmem_complex_d2h_op()(becp1_cpu, becp1_, nkb * n_npwx);
+        syncmem_complex_d2h_op()(becp2_cpu, becp2_, nkb3 * n_npwx);
+        
+        tmp_space2.resize(nkb * n_npwx + nkb3 * n_npwx, 0.0);
+        ps1_cpu = tmp_space2.data();
+        ps2_cpu = ps1_cpu + nkb * n_npwx;
+    }
+    else
+    {
+        Parallel_Reduce::reduce_pool(becp1_, nkb * n_npwx);
+        Parallel_Reduce::reduce_pool(becp2_, nkb3 * n_npwx);
+        becp1_cpu = becp1_;
+        becp2_cpu = becp2_;
+
+        setmem_complex_op()(ps1_, 0.0, nkb * n_npwx);
+        setmem_complex_op()(ps2_, 0.0, nkb3 * n_npwx);
+        ps1_cpu = ps1_;
+        ps2_cpu = ps2_;
+    }
 
+    // 2. <\beta \psi><psi|
     int sum = 0;
     int iat = 0;
     if (npol == 1)
     {
-        const int current_spin = this->isk[ik];
+        const int current_spin = 0;
         for (int it = 0; it < this->ucell->ntype; it++)
         {
             const int nproj = this->ucell->atoms[it].ncpp.nh;
@@ -141,13 +226,13 @@ void Velocity::act
                     {
                         for (int ib = 0; ib < n_npwx; ++ib)
                         {
-                            double dij = this->ppcell->deeq(current_spin, iat, ip, ip2);
+                            FPTYPE dij = static_cast<FPTYPE>(this->ppcell->deeq(current_spin, iat, ip, ip2));
                             int sumip2 = sum + ip2;
                             int sumip = sum + ip;
-                            ps1(sumip2, ib)  += dij * becp1(ib, sumip);
-                            ps2(sumip2, ib)  += dij * becp2(ib, sumip);
-                            ps2(sumip2 + nkb, ib)  += dij * becp2(ib, sumip + nkb);
-                            ps2(sumip2 + 2*nkb, ib)  += dij * becp2(ib , sumip + 2*nkb);
+                            ps1_cpu[sumip2 * n_npwx + ib] += dij * becp1_cpu[ib * nkb + sumip];
+                            ps2_cpu[sumip2 * n_npwx + ib] += dij * becp2_cpu[ib * nkb3 + sumip];
+                            ps2_cpu[(sumip2 + nkb) * n_npwx + ib] += dij * becp2_cpu[ib * nkb3 + sumip + nkb];
+                            ps2_cpu[(sumip2 + 2 * nkb) * n_npwx + ib] += dij * becp2_cpu[ib * nkb3 + sumip + 2 * nkb];
                         }
                     }
                 }
@@ -158,86 +243,95 @@ void Velocity::act
     }
     else
     {
-        for (int it = 0; it < this->ucell->ntype; it++)
-        {
-            const int nproj = this->ucell->atoms[it].ncpp.nh;
-            for (int ia = 0; ia < this->ucell->atoms[it].na; ia++)
-            {
-                for (int ip = 0; ip < nproj; ip++)
-                {
-                    for (int ip2 = 0; ip2 < nproj; ip2++)
-                    {
-                        for (int ib = 0; ib < n_npwx; ib+=2)
-                        {
-                            int sumip2 = sum + ip2;
-                            int sumip = sum + ip;
-                            std::complex<double> pol1becp1 = becp1(ib, sumip);
-                            std::complex<double> pol2becp1 = becp1(ib+1, sumip);
-                            std::complex<double> pol1becp2x = becp2(ib, sumip);
-                            std::complex<double> pol2becp2x = becp2(ib+1, sumip);
-                            std::complex<double> pol1becp2y = becp2(ib, sumip + nkb);
-                            std::complex<double> pol2becp2y = becp2(ib+1, sumip + nkb);
-                            std::complex<double> pol1becp2z = becp2(ib, sumip + 2 * nkb);
-                            std::complex<double> pol2becp2z = becp2(ib+1, sumip + 2 * nkb);
-                            std::complex<double> dij0 = this->ppcell->deeq_nc(0, iat, ip2, ip);
-                            std::complex<double> dij1 = this->ppcell->deeq_nc(1, iat, ip2, ip);
-                            std::complex<double> dij2 = this->ppcell->deeq_nc(2, iat, ip2, ip);
-                            std::complex<double> dij3 = this->ppcell->deeq_nc(3, iat, ip2, ip);
-
-                            ps1(sumip2, ib)             += dij0 * pol1becp1  + dij1 * pol2becp1;
-                            ps1(sumip2, ib+1)           += dij2 * pol1becp1  + dij3 * pol2becp1;
-                            ps2(sumip2, ib)             += dij0 * pol1becp2x + dij1 * pol2becp2x;
-                            ps2(sumip2, ib+1)           += dij2 * pol1becp2x + dij3 * pol2becp2x;
-                            ps2(sumip2 + nkb, ib)       += dij0 * pol1becp2y + dij1 * pol2becp2y;
-                            ps2(sumip2 + nkb, ib+1)     += dij2 * pol1becp2y + dij3 * pol2becp2y;
-                            ps2(sumip2 + 2*nkb, ib)     += dij0 * pol1becp2z + dij1 * pol2becp2z;
-                            ps2(sumip2 + 2*nkb, ib+1)   += dij2 * pol1becp2z + dij3 * pol2becp2z;
-                        }
-                    }
-                }
-                sum += nproj;
-                ++iat;
-            }
-        }
+        ModuleBase::WARNING_QUIT("Velocity", "Velocity operator does not support the non-collinear case yet!");
+    }
+
+    if(std::is_same<Device, base_device::DEVICE_GPU>::value)
+    {
+        syncmem_complex_h2d_op()(ps1_, ps1_cpu, nkb * n_npwx);
+        syncmem_complex_h2d_op()(ps2_, ps2_cpu, nkb3 * n_npwx);
     }
 
-    
     if (n_npwx == 1)
     {
         int inc = 1;
-        for(int id = 0 ; id < 3 ; ++id)
+        for (int id = 0; id < 3; ++id)
         {
             int vkbshift = id * max_npw * nkb;
             int ps2shift = id * nkb;
-            int npwshift = id * max_npw ;
-            zgemv_(&transN, &npw, &nkb,
-                   &ModuleBase::ONE, this->ppcell->gradvkb.ptr + vkbshift, &max_npw, ps1.c, &inc,
-                   &ModuleBase::ONE, vpsi + npwshift, &inc);
-            zgemv_(&transN, &npw, &nkb,
-                   &ModuleBase::ONE, this->ppcell->vkb.c, &max_npw, ps2.c + ps2shift, &inc,
-                   &ModuleBase::ONE, vpsi + npwshift, &inc);
+            int npwshift = id * max_npw;
+            ModuleBase::gemv_op<Complex, Device>()('N',
+                                                   npw,
+                                                   nkb,
+                                                   &one,
+                                                   gradvkb_d + vkbshift,
+                                                   max_npw,
+                                                   ps1_,
+                                                   inc,
+                                                   &one,
+                                                   vpsi + npwshift,
+                                                   inc);
+            ModuleBase::gemv_op<Complex, Device>()('N',
+                                                   npw,
+                                                   nkb,
+                                                   &one,
+                                                   vkb_d,
+                                                   max_npw,
+                                                   ps2_ + ps2shift,
+                                                   inc,
+                                                   &one,
+                                                   vpsi + npwshift,
+                                                   inc);
         }
     }
     else
     {
-        for(int id = 0 ; id < 3 ; ++id)
+        for (int id = 0; id < 3; ++id)
         {
             int vkbshift = id * max_npw * nkb;
             int ps2shift = id * n_npwx * nkb;
             int npwshift = id * max_npw * n_npwx;
-            zgemm_(&transN, &transT, &npw, &npm, &nkb,
-               &ModuleBase::ONE, this->ppcell->gradvkb.ptr + vkbshift, &max_npw, ps1.c, &n_npwx,
-               &ModuleBase::ONE, vpsi + npwshift, &max_npw);
-            zgemm_(&transN, &transT, &npw, &npm, &nkb,
-               &ModuleBase::ONE, this->ppcell->vkb.c, &max_npw, ps2.c + ps2shift, &n_npwx,
-               &ModuleBase::ONE, vpsi + npwshift, &max_npw);
+            ModuleBase::gemm_op<Complex, Device>()('N',
+                                                   'T',
+                                                   npw,
+                                                   n_npwx,
+                                                   nkb,
+                                                   &one,
+                                                   gradvkb_d + vkbshift,
+                                                   max_npw,
+                                                   ps1_,
+                                                   n_npwx,
+                                                   &one,
+                                                   vpsi + npwshift,
+                                                   max_npw);
+            ModuleBase::gemm_op<Complex, Device>()('N',
+                                                   'T',
+                                                   npw,
+                                                   n_npwx,
+                                                   nkb,
+                                                   &one,
+                                                   vkb_d,
+                                                   max_npw,
+                                                   ps2_ + ps2shift,
+                                                   n_npwx,
+                                                   &one,
+                                                   vpsi + npwshift,
+                                                   max_npw);
         }
     }
-
-
+    delmem_complex_op()(ps1_);
+    delmem_complex_op()(ps2_);
+    delmem_complex_op()(becp1_);
+    delmem_complex_op()(becp2_);
     ModuleBase::timer::tick("Operator", "Velocity");
     return;
 }
 
+template class Velocity<double, base_device::DEVICE_CPU>;
+template class Velocity<float, base_device::DEVICE_CPU>;
+#if ((defined __CUDA) || (defined __ROCM))
+template class Velocity<double, base_device::DEVICE_GPU>;
+template class Velocity<float, base_device::DEVICE_GPU>;
+#endif
 
-}
\ No newline at end of file
+} // namespace hamilt
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_pwdft/operator_pw/velocity_pw.h b/source/module_hamilt_pw/hamilt_pwdft/operator_pw/velocity_pw.h
index a6752065b0..14ef0799e9 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/operator_pw/velocity_pw.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/operator_pw/velocity_pw.h
@@ -8,6 +8,7 @@ namespace hamilt
 {
 
 //velocity operator mv = im/\hbar * [H,r] =  p + im/\hbar [V_NL, r] 
+template <typename FPTYPE, typename Device = base_device::DEVICE_CPU>
 class Velocity
 {
     public:
@@ -19,7 +20,7 @@ class Velocity
         const bool nonlocal_in = true
     );
 
-    ~Velocity(){};
+    ~Velocity();
 
     void init(const int ik_in);
 
@@ -27,16 +28,16 @@ class Velocity
      * @brief calculate \hat{v}|\psi>
      * 
      * @param psi_in Psi class which contains some information
-     * @param n_npwx i = 1 : n_npwx
+     * @param n_npwx nbands * NPOL
      * @param tmpsi_in |\psi_i>    size: n_npwx*npwx 
-     * @param tmhpsi \hat{v}|\psi> size: 3*n_npwx*npwx
-     * @param add true : tmhpsi = tmhpsi + v|\psi>  false: tmhpsi = v|\psi>
+     * @param tmvpsi \hat{v}|\psi> size: 3*n_npwx*npwx
+     * @param add true : tmvpsi = tmvpsi + v|\psi>  false: tmvpsi = v|\psi>
      * 
      */
-    void act(const psi::Psi<std::complex<double>>* psi_in,
+    void act(const psi::Psi<std::complex<FPTYPE>, Device>* psi_in,
              const int n_npwx,
-             const std::complex<double>* tmpsi_in,
-             std::complex<double>* tmhpsi,
+             const std::complex<FPTYPE>* tmpsi_in,
+             std::complex<FPTYPE>* tmvpsi,
              const bool add = false) const;
     
     bool nonlocal = true;
@@ -53,6 +54,26 @@ class Velocity
     int ik=0;
 
     double tpiba=0.0;
+  
+  private:
+    FPTYPE* gx_ = nullptr; ///<[Device, npwx] x component of G+K
+    FPTYPE* gy_ = nullptr; ///<[Device, npwx] y component of G+K
+    FPTYPE* gz_ = nullptr; ///<[Device, npwx] z component of G+K
+    std::complex<FPTYPE>* vkb_ = nullptr;     ///<[Device, nkb * npwk_max] nonlocal pseudopotential vkb
+    std::complex<FPTYPE>* gradvkb_ = nullptr; ///<[Device, 3*nkb * npwk_max] gradient of nonlocal pseudopotential gradvkb
+    FPTYPE* deeq_ = nullptr;                  ///<[Device] D matrix for nonlocal pseudopotential
+    
+    using Complex = std::complex<FPTYPE>;
+    using resmem_var_op = base_device::memory::resize_memory_op<FPTYPE, Device>;
+    using delmem_var_op = base_device::memory::delete_memory_op<FPTYPE, Device>;
+    using syncmem_var_h2d_op = base_device::memory::synchronize_memory_op<FPTYPE, Device, base_device::DEVICE_CPU>;
+    using castmem_var_h2d_op = base_device::memory::cast_memory_op<FPTYPE, double, Device, base_device::DEVICE_CPU>;
+    using resmem_complex_op = base_device::memory::resize_memory_op<std::complex<FPTYPE>, Device>;
+    using setmem_complex_op = base_device::memory::set_memory_op<std::complex<FPTYPE>, Device>;
+    using delmem_complex_op = base_device::memory::delete_memory_op<std::complex<FPTYPE>, Device>;
+    using castmem_complex_h2d_op = base_device::memory::cast_memory_op<std::complex<FPTYPE>, std::complex<double>, Device, base_device::DEVICE_CPU>;
+    using syncmem_complex_d2h_op = base_device::memory::synchronize_memory_op<std::complex<FPTYPE>, base_device::DEVICE_CPU, Device>;
+    using syncmem_complex_h2d_op = base_device::memory::synchronize_memory_op<std::complex<FPTYPE>, Device, base_device::DEVICE_CPU>;
 };
 }
 #endif
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_stodft/sto_elecond.cpp b/source/module_hamilt_pw/hamilt_stodft/sto_elecond.cpp
index fec8130f77..597c84041b 100644
--- a/source/module_hamilt_pw/hamilt_stodft/sto_elecond.cpp
+++ b/source/module_hamilt_pw/hamilt_stodft/sto_elecond.cpp
@@ -22,7 +22,7 @@ Sto_EleCond::Sto_EleCond(UnitCell* p_ucell_in,
                          hamilt::Hamilt<std::complex<double>>* p_hamilt_in,
                          StoChe<double>& stoche,
                          Stochastic_WF<std::complex<double>, base_device::DEVICE_CPU>* p_stowf_in)
-    : EleCond(p_ucell_in, p_kv_in, p_elec_in, p_wfcpw_in, p_psi_in, p_ppcell_in)
+    : EleCond<double, base_device::DEVICE_CPU>(p_ucell_in, p_kv_in, p_elec_in, p_wfcpw_in, p_psi_in, p_ppcell_in)
 {
     this->p_hamilt = p_hamilt_in;
     this->p_hamilt_sto = static_cast<hamilt::HamiltSdftPW<std::complex<double>>*>(p_hamilt_in);
@@ -149,7 +149,7 @@ void Sto_EleCond::cal_jmatrix(const psi::Psi<std::complex<float>>& kspsi_all,
                               const int& bsize_psi,
                               std::vector<std::complex<float>>& j1,
                               std::vector<std::complex<float>>& j2,
-                              hamilt::Velocity& velop,
+                              hamilt::Velocity<double>& velop,
                               const int& ik,
                               const std::complex<double>& factor,
                               const int bandinfo[6])
@@ -580,7 +580,7 @@ void Sto_EleCond::sKG(const int& smear_type,
 
     // ik loop
     ModuleBase::timer::tick("Sto_EleCond", "kloop");
-    hamilt::Velocity velop(p_wfcpw, p_kv->isk.data(), p_ppcell, p_ucell, nonlocal);
+    hamilt::Velocity<double> velop(p_wfcpw, p_kv->isk.data(), p_ppcell, p_ucell, nonlocal);
     for (int ik = 0; ik < nk; ++ik)
     {
         velop.init(ik);
diff --git a/source/module_hamilt_pw/hamilt_stodft/sto_elecond.h b/source/module_hamilt_pw/hamilt_stodft/sto_elecond.h
index 77dccff792..4cb55ae328 100644
--- a/source/module_hamilt_pw/hamilt_stodft/sto_elecond.h
+++ b/source/module_hamilt_pw/hamilt_stodft/sto_elecond.h
@@ -6,7 +6,7 @@
 #include "module_hamilt_pw/hamilt_stodft/sto_wf.h"
 #include "module_hsolver/hsolver_pw_sdft.h"
 
-class Sto_EleCond : protected EleCond
+class Sto_EleCond : protected EleCond<double, base_device::DEVICE_CPU>
 {
   public:
     Sto_EleCond(UnitCell* p_ucell_in,
@@ -90,7 +90,7 @@ class Sto_EleCond : protected EleCond
                      const int& bsize_psi,
                      std::vector<std::complex<float>>& j1,
                      std::vector<std::complex<float>>& j2,
-                     hamilt::Velocity& velop,
+                     hamilt::Velocity<double, base_device::DEVICE_CPU>& velop,
                      const int& ik,
                      const std::complex<double>& factor,
                      const int bandinfo[6]);
diff --git a/tests/integrate/186_PW_KG_100_GPU/INPUT b/tests/integrate/186_PW_KG_100_GPU/INPUT
new file mode 100644
index 0000000000..1d34d5335d
--- /dev/null
+++ b/tests/integrate/186_PW_KG_100_GPU/INPUT
@@ -0,0 +1,37 @@
+INPUT_PARAMETERS
+#Parameters (1.General)
+suffix			autotest
+calculation     scf
+esolver_type    ksdft
+
+nbands			100
+pseudo_dir	../../PP_ORB
+symmetry	    1
+kpar            2
+device          gpu
+
+#Parameters (2.Iteration)
+ecutwfc			20
+scf_thr			1e-6
+scf_nmax	    50
+
+
+#Parameters (3.Basis)
+basis_type		pw
+
+#Parameters (4.Smearing)
+smearing_method		fd
+smearing_sigma			0.6
+
+#Parameters (5.Mixing)
+mixing_type		broyden
+mixing_beta		0.4
+mixing_gg0      0.0
+
+cal_cond      1
+cond_fwhm   8
+cond_wcut        10
+cond_dw          1
+cond_dt          0.237464
+cond_dtbatch     1
+cond_nonlocal    1
diff --git a/tests/integrate/186_PW_KG_100_GPU/KPT b/tests/integrate/186_PW_KG_100_GPU/KPT
new file mode 100644
index 0000000000..4fd38968a0
--- /dev/null
+++ b/tests/integrate/186_PW_KG_100_GPU/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Gamma
+2 2 1 0 0 0
diff --git a/tests/integrate/186_PW_KG_100_GPU/README b/tests/integrate/186_PW_KG_100_GPU/README
new file mode 100644
index 0000000000..033d794f7f
--- /dev/null
+++ b/tests/integrate/186_PW_KG_100_GPU/README
@@ -0,0 +1,9 @@
+This test for:
+*KSDFT
+*Al
+*kpoints 2*2*1
+*100 KS 
+*kpar 2
+*bndpar 1
+*device gpu
+*cal_cond 1
diff --git a/tests/integrate/186_PW_KG_100_GPU/STRU b/tests/integrate/186_PW_KG_100_GPU/STRU
new file mode 100644
index 0000000000..0dbc74b343
--- /dev/null
+++ b/tests/integrate/186_PW_KG_100_GPU/STRU
@@ -0,0 +1,18 @@
+ATOMIC_SPECIES
+Si 14 Si.pz-vbc.UPF
+
+LATTICE_CONSTANT
+5  // add lattice constant
+
+LATTICE_VECTORS
+1 0 0
+0 1 0
+0 0 1
+
+ATOMIC_POSITIONS
+Direct 
+
+Si	// Element type	
+0.0	// magnetism
+1	
+0.00 0.00 0.00 1 1 1
diff --git a/tests/integrate/186_PW_KG_100_GPU/refOnsager.txt b/tests/integrate/186_PW_KG_100_GPU/refOnsager.txt
new file mode 100644
index 0000000000..4f850b65d2
--- /dev/null
+++ b/tests/integrate/186_PW_KG_100_GPU/refOnsager.txt
@@ -0,0 +1,11 @@
+## w(eV)         sigma(Sm^-1)     kappa(W(mK)^-1)        L12/e(Am^-1)      L22/e^2(Wm^-1)
+     0.5              356691             626.131        -7.13544e+06         2.02056e+08
+     1.5              348656             606.514        -7.03014e+06         1.99209e+08
+     2.5              333032             569.396        -6.81899e+06         1.93563e+08
+     3.5              310656             518.557        -6.50176e+06         1.85201e+08
+     4.5              282683             458.689        -6.08031e+06         1.74236e+08
+     5.5              250512             394.607        -5.56125e+06         1.60839e+08
+     6.5              215751             330.615        -4.95931e+06         1.45316e+08
+     7.5              180190             270.122         -4.3001e+06         1.28208e+08
+     8.5              145738             215.497        -3.62025e+06         1.10345e+08
+     9.5              114241             168.083        -2.96321e+06         9.27832e+07
diff --git a/tests/integrate/186_PW_KG_100_GPU/result.ref b/tests/integrate/186_PW_KG_100_GPU/result.ref
new file mode 100644
index 0000000000..13c9a5365f
--- /dev/null
+++ b/tests/integrate/186_PW_KG_100_GPU/result.ref
@@ -0,0 +1,7 @@
+etotref -150.4802165653093
+etotperatomref -150.4802165653
+CompareH_Failed 0
+pointgroupref O_h
+spacegroupref O_h
+nksibzref 3
+totaltimeref 5.10
diff --git a/tests/integrate/CASES_GPU.txt b/tests/integrate/CASES_GPU.txt
index c41f06d83b..b034e955e2 100644
--- a/tests/integrate/CASES_GPU.txt
+++ b/tests/integrate/CASES_GPU.txt
@@ -2,6 +2,7 @@
 102_PW_DA_davidson_GPU
 102_PW_BPCG_GPU
 107_PW_outWfcPw_GPU
+186_PW_KG_100_GPU
 187_PW_SDFT_ALL_GPU
 187_PW_SDFT_MALL_GPU
 187_PW_SDFT_MALL_BPCG_GPU